def item_iterator(name, cmd_config=None):
    """Yield the rows of every target table in every parsed database file.

    The input directory is taken from the "output_data_directory" entry of
    the "parse" configuration.  Every path found in that directory is
    opened with ``sqlite3.connect`` and paired with each entry of
    ``config["target_columns"]``; the files are visited in random order.

    Args:
        name: Not used by this function.
        cmd_config: Optional mapping.  If it has a non-empty
            "command_whitelist" entry, only files whose path contains at
            least one of the whitelist tokens are kept.  The entry may be
            a single string or a sequence of strings.

    Yields:
        list: the fields ``database_iterator`` produced for one row,
        followed by the path of the database file the row was read from.
    """
    score_config = simple_config.load("parse")
    input_data_dir = score_config["output_data_directory"]
    F_SQL = glob.glob(os.path.join(input_data_dir, '*'))

    # If there is a whitelist only keep the matching filenames.
    try:
        whitelist = cmd_config["command_whitelist"]
    except (KeyError, TypeError):
        # cmd_config is None (TypeError) or has no whitelist (KeyError).
        whitelist = None

    if whitelist:
        # A lone string counts as a single token; iterating over it
        # directly would match on individual characters instead.
        if hasattr(whitelist, "strip"):
            whitelist = [whitelist]

        # Blank tokens are dropped: "" is a substring of every path and
        # would make the filter a no-op.
        tokens = [token.strip() for token in whitelist]
        tokens = [token for token in tokens if token]

        if tokens:
            F_SQL = [
                f_sql for f_sql in F_SQL
                if any(token in f_sql for token in tokens)
            ]

    # Randomize the order of the input files.  Sorting first makes the
    # result depend only on the random state, not on glob's ordering.
    F_SQL = random.sample(sorted(F_SQL), len(F_SQL))

    # NOTE(review): `config` is not defined in this function (only
    # `score_config` is); it is resolved from the enclosing scope --
    # confirm that is the intended configuration object.
    DB_ITR = itertools.product(F_SQL, config["target_columns"])

    for f_sql, target_col in DB_ITR:
        conn = sqlite3.connect(f_sql, check_same_thread=False)
        try:
            args = {
                "column_name": "text",
                "table_name": target_col,
                "conn": conn,
                "limit": _global_limit,
                "shuffle": False,
                "include_table_name": True,
            }
            for item in database_iterator(**args):
                yield list(item) + [f_sql]
        finally:
            # Runs when the table is exhausted and also when the caller
            # abandons the generator early, so no connection is leaked.
            conn.close()
def __iter__(self):
    """Iterate over the rows of every (database file, target table) pair.

    The pairs are the product of ``self.F_SQL`` and
    ``self.config["target_columns"]``.  Each database file is opened with
    ``sqlite3.connect`` and read through ``database_iterator``.

    Yields:
        When ``self.yield_single`` is true: one list per row, made of the
        fields produced by ``database_iterator`` followed by the path of
        the database file.
        Otherwise: one list per (file, table) pair holding all of that
        pair's rows, each row in the same format as above.
    """
    # Setup the progress bar
    progress_bar = tqdm.tqdm(total=self.total_items)

    try:
        # Rebuild the iterator
        DB_ITR = itertools.product(self.F_SQL, self.config["target_columns"])

        for f_sql, target_col in DB_ITR:
            conn = sqlite3.connect(f_sql, check_same_thread=False)
            try:
                args = {
                    "column_name": "text",
                    "table_name": target_col,
                    "conn": conn,
                    "limit": _global_limit,
                    "shuffle": False,
                    "include_table_name": True,
                }
                INPUT_ITR = database_iterator(**args)

                if self.yield_single:
                    for item in INPUT_ITR:
                        yield list(item) + [f_sql]
                        progress_bar.update()
                else:
                    # NOTE(review): the progress bar is not advanced in
                    # this batch mode -- confirm whether that is intended.
                    yield [list(item) + [f_sql] for item in INPUT_ITR]
            finally:
                # Also reached when the consumer stops iterating early.
                conn.close()
    finally:
        progress_bar.close()
# Remove the table if it exists print "Removing table {}:{}".format(f_sql,target_col) conn_out.execute("DROP TABLE {}".format(target_col)) print "Parsing {}:{}".format(f_sql, target_col) args = { "column_name":target_col, "table_name":import_config["output_table"], "conn":conn, "limit":global_limit, "progress_bar":True, } INPUT_ITR = database_iterator(**args) ITR = jobmap(dispatcher, INPUT_ITR, _PARALLEL) cmd_create = ''' DROP TABLE IF EXISTS {table_name}; CREATE TABLE IF NOT EXISTS {table_name} ( _ref INTEGER PRIMARY KEY, text STRING, meta STRING ); '''.format(table_name=target_col) conn_out.executescript(cmd_create) cmd_insert = '''