Exemplo n.º 1
0
def item_iterator(name,cmd_config=None):
    """Yield rows from every matching SQLite file in the parse output dir.

    Globs all files under the "parse" stage's output directory,
    optionally keeps only filenames matching a whitelist from
    ``cmd_config``, shuffles the file order, and for every
    (file, target_column) pair yields ``list(row) + [f_sql]`` for each
    row produced by ``database_iterator``.

    Parameters
    ----------
    name : unused in this function; kept for caller compatibility.
    cmd_config : optional mapping; may supply "command_whitelist", a
        list of substrings used to filter filenames.

    Yields
    ------
    list : one database row plus the source filename appended.
    """
    score_config = simple_config.load("parse")
    input_data_dir = score_config["output_data_directory"]

    F_SQL = glob.glob(os.path.join(input_data_dir, '*'))

    # If there is a whitelist only keep the matching filenames.
    # Catch only the errors the lookup can actually raise (missing key,
    # cmd_config is None, or the value has no .strip) instead of a bare
    # except that would hide unrelated bugs.
    try:
        whitelist = cmd_config["command_whitelist"].strip()
    except (TypeError, KeyError, AttributeError):
        whitelist = None

    if whitelist:
        # Explicit raise instead of assert: asserts are stripped when
        # Python runs with -O, silently skipping the validation.
        if not isinstance(whitelist, list):
            raise TypeError("command_whitelist must be a list")

        # Keep any file whose path contains at least one whitelist token.
        F_SQL = {f_sql for f_sql in F_SQL
                 if any(token in f_sql for token in whitelist)}

    # Randomize the order of the input files (sorted first so the
    # shuffle is reproducible given a seeded RNG).
    F_SQL = random.sample(sorted(F_SQL), len(F_SQL))
    DB_ITR = itertools.product(F_SQL, config["target_columns"])

    for f_sql, target_col in DB_ITR:

        conn = sqlite3.connect(f_sql, check_same_thread=False)

        args = {
            "column_name": "text",
            "table_name": target_col,
            "conn": conn,
            "limit": _global_limit,
            "shuffle": False,
            "include_table_name": True,
        }

        INPUT_ITR = database_iterator(**args)
        for item in INPUT_ITR:
            yield list(item) + [f_sql,]
Exemplo n.º 2
0
    def __iter__(self):
        """Iterate over rows of every (file, target_column) pair.

        Rebuilds the product of ``self.F_SQL`` and the configured target
        columns, opens each SQLite file, and either yields one row at a
        time (``self.yield_single`` true, advancing a tqdm progress bar
        per row) or yields the complete list of rows per pair.

        Yields
        ------
        list : a single row plus the source filename when
            ``self.yield_single`` is true; otherwise a list of such rows
            for each (file, column) pair.
        """
        # Setup the progress bar
        progress_bar = tqdm.tqdm(total=self.total_items)

        try:
            # Rebuild the iterator
            DB_ITR = itertools.product(self.F_SQL,
                                       self.config["target_columns"])

            for f_sql, target_col in DB_ITR:

                conn = sqlite3.connect(f_sql, check_same_thread=False)

                args = {
                    "column_name": "text",
                    "table_name": target_col,
                    "conn": conn,
                    "limit": _global_limit,
                    "shuffle": False,
                    "include_table_name": True,
                }

                # NOTE(review): `name` is not defined in this method's
                # scope. Since requires_meta is empty the branch can never
                # be taken, but evaluating `name` will raise NameError
                # unless it exists as a module-level global — confirm the
                # intended source of `name` (self.name?).
                requires_meta = []
                requires_ref = ["document_scores",]

                if name in requires_meta:
                    args["include_meta"] = True

                INPUT_ITR = database_iterator(**args)

                if self.yield_single:
                    for item in INPUT_ITR:
                        yield list(item) + [f_sql,]
                        progress_bar.update()
                else:
                    # Materialize all rows for this pair and yield them
                    # as one batch.
                    data = [list(item) + [f_sql,] for item in INPUT_ITR]
                    yield data
        finally:
            # Close the bar even if the consumer abandons the generator,
            # so it does not leak or corrupt later terminal output.
            progress_bar.close()
Exemplo n.º 3
0
            # Remove the table if it exists
            print "Removing table {}:{}".format(f_sql,target_col)
            conn_out.execute("DROP TABLE {}".format(target_col))
        
        
        print "Parsing {}:{}".format(f_sql, target_col)       

        args = {
            "column_name":target_col,
            "table_name":import_config["output_table"],
            "conn":conn,
            "limit":global_limit,
            "progress_bar":True,
        }
            
        INPUT_ITR = database_iterator(**args)

        ITR = jobmap(dispatcher, INPUT_ITR, _PARALLEL)

        cmd_create = '''
        DROP TABLE IF EXISTS {table_name};
        CREATE TABLE IF NOT EXISTS {table_name} (
        _ref INTEGER PRIMARY KEY,
        text STRING,
        meta STRING
        );
        '''.format(table_name=target_col)
        
        conn_out.executescript(cmd_create)

        cmd_insert = '''