示例#1
0
def shuffle_table(table=None, engine=None, shuffle=''):
    """Shuffle the rows of *table* by rebuilding it in random order.

    Parameters
    ----------
    table : str, optional
        Table to shuffle; defaults to the configured queue table.
    engine : optional
        SQLAlchemy engine; created on demand when omitted.
    shuffle : bool or str, optional
        True -> shuffle without asking, False -> skip,
        anything else (default '') -> ask the user interactively.
    """
    if table is None:
        table = config['TABLES']['QUEUE']

    print()
    if shuffle is True:
        answer = 'y'
    elif shuffle is False:
        answer = 'n'
    else:
        answer = input(f'Shuffle {table}? Recomended to prevent IP being banned (y/n): ')
    print()

    if answer == 'y':
        if engine is None:
            engine = mysql_engine()

        # Copy into a date-stamped table in random order, then swap names.
        temp = f'{table}_backup_'+datetime.now().strftime('%d_%m')

        tprint(f'[·] Shuffling table {table} (can take up to 5 mins).')

        engine.execute(f'create table {temp} like {table}')
        engine.execute(f'insert into {temp} (original_link) select original_link from {table} order by rand()')

        engine.execute(f'drop table {table}')
        engine.execute(f'rename table {temp} to {table}')
        tprint('[+] Done shuffling.')
示例#2
0
def recover_discarded(con=None, table=None):
    """Re-insert links discarded by `insert ignore` into the error table.

    Parameters
    ----------
    con : optional
        Open DB connection; created (and closed afterwards) when omitted.
    table : str, optional
        Destination table; defaults to the configured error table.
    """
    if table is None:
        table = config['TABLES']['ERROR']

    close = False
    engine = None

    if con is None:
        engine = mysql_engine()
        con = engine.connect()
        close = True

    discarded = list_discarded(con=con)
    df = pd.DataFrame({'original_link': discarded,
                       'info': ['Discarded by insert ignore.']*len(discarded),
                       'borrar': [0]*len(discarded)})
    df.to_sql(table,
              con=con,
              if_exists='append',
              index=False,
              chunksize=50000)

    # Only dispose resources we created ourselves.
    if close:
        con.close()
        engine.dispose()
示例#3
0
def flush_temp_tables(engine=None):
    """Drop every table reported by get_temp_tables().

    Parameters
    ----------
    engine : optional
        SQLAlchemy engine; created when omitted.
    """
    tables = get_temp_tables()
    if engine is None:
        engine = mysql_engine()

    for table in tables:
        engine.execute('drop table '+table)
示例#4
0
def get_full_df(n_pools=15,
                n=150,
                queue_table=None,
                processed_table=None,
                delete=False,
                engine=None,
                con=None,
                rand=False):
    """Fetch a chunk of queued links and populate it into a full DataFrame.

    Pulls up to *n* links from the queue via get_chunk_from_db(), then
    returns the DataFrame produced by populate_df() with *n_pools* workers.
    Table names default to the configured ones; *engine* is created when
    omitted.
    """
    if engine is None:
        engine = mysql_engine()
    if queue_table is None:
        queue_table = config['TABLES']['QUEUE']
    if processed_table is None:
        processed_table = config['TABLES']['PROCESSED']

    tprint('[·] Getting chunk...')
    chunk = get_chunk_from_db(n=n,
                              queue_table=queue_table,
                              processed_table=processed_table,
                              delete=delete,
                              engine=engine,
                              con=con,
                              rand=rand)

    tprint('[·] Populating chunk...')
    df = populate_df(chunk, n_pools=n_pools)

    return df
示例#5
0
    def __init__(self,
                 result_table=None,
                 df=None,
                 debug=True,
                 n_pools=15,
                 n=150,
                 queue_table=None,
                 processed_table=None,
                 error_table=None,
                 delete=False,
                 engine=None,
                 con=None,
                 rand=False):
        """Set up table names, DB handles and the working DataFrame.

        Table names default to the configured ones; *engine*/*con* are
        created when not supplied.  When *df* is empty it is fetched via
        get_full_df(); a non-empty result is then split with divide_df().
        """
        if df is None:
            df = pd.DataFrame()
        if result_table is None:
            result_table = config['TABLES']['RESULT']
        if queue_table is None:
            queue_table = config['TABLES']['QUEUE']
        if processed_table is None:
            processed_table = config['TABLES']['PROCESSED']
        if error_table is None:
            error_table = config['TABLES']['ERROR']

        self.result_table = result_table
        self.queue_table = queue_table
        self.processed_table = processed_table
        self.error_table = error_table

        # Working frames: successfully scraped rows and rows that errored.
        self.press = pd.DataFrame()
        self.error = pd.DataFrame()
        self.table_name = None
        self.df = df
        self.debug = debug
        self.engine = engine
        self.con = con

        if self.engine is None:
            self.engine = mysql_engine()
        if self.con is None:
            self.con = self.engine.connect()

        if self.df.empty:
            self.df = get_full_df(n_pools=n_pools,
                                  n=n,
                                  queue_table=queue_table,
                                  processed_table=processed_table,
                                  delete=delete,
                                  engine=self.engine,
                                  con=self.con,
                                  rand=rand)

        if not self.df.empty:
            self.divide_df()
示例#6
0
def get_temp_tables(engine=None):
    """Return the names of all tables whose name contains 'temp'.

    Parameters
    ----------
    engine : optional
        SQLAlchemy engine; created when omitted.
    """
    if engine is None:
        engine = mysql_engine()

    tables = []
    # Each row of `show tables` holds one table name.
    for row in engine.execute('show tables'):
        tables += row.values()

    return [t for t in tables if 'temp' in t]
示例#7
0
def mysql_query_as_set(query, con=None):
    '''Gets the first column of a MySQL query as a set.

    Parameters
    ----------
    query : str
        SQL query; only its first result column is collected.
    con : optional
        Open DB connection; created (and closed afterwards) when omitted.
    '''
    engine = None
    if con is None:
        engine = mysql_engine()
        con = engine.connect()

    try:
        query_set = set()
        # Stream in chunks so huge result sets don't exhaust memory.
        for chunk in pd.read_sql(query, con=con, chunksize=50000):
            query_set.update(chunk.iloc[:, 0].tolist())
    finally:
        # Close only what we opened ourselves (the original leaked the
        # connection and never disposed the engine).
        if engine is not None:
            con.close()
            engine.dispose()

    return query_set
示例#8
0
def reinsert_from_error_to_queue(engine=None, con=None, where=''):
    """Move error-table links matching *where* back into the queue table.

    Prompts for a where clause and a confirmation when not supplied.
    NOTE: *where* is interpolated directly into SQL — trusted input only.

    Parameters
    ----------
    engine, con : optional
        DB engine/connection; created (and cleaned up) when omitted.
    where : str, optional
        SQL where clause selecting the error rows to reinsert.
    """
    queue_table = config['TABLES']['QUEUE']
    error_table = config['TABLES']['ERROR']

    close = False
    close_ = False

    if engine is None:
        engine = mysql_engine()
        close = True

    if con is None:
        # Previously a caller-supplied `con` was clobbered when engine was None.
        con = engine.connect()
        close_ = True

    # Where clause
    if where == '':
        where= input('Where clause for mysql query:\n\t- ')
        print()

    # Count and confirm
    tprint('[·] Counting links...')
    count = engine.execute(f'select count(*) from {error_table} where {where}').scalar()
    y = input(f'\nAre you sure you want to reinsert {count} links? (y/n): ')
    print()

    if y == 'y':
        # Get links to be reinserted
        tprint('[·] Getting Links...')
        to_be_reinserted = mysql_query_as_set(f'select original_link from {error_table} where {where};', con=con)

        # Reinserting into queue
        tprint('[·] Reinserting into queue table...')
        insert_set(to_be_reinserted, queue_table, 'original_link', engine=engine, con=con)

        # Delete from error
        tprint('[·] Deleting from error table...')
        engine.execute(f'delete from {error_table} where {where}')

        count_error = engine.execute(f'select count(*) from {error_table}').scalar()
        tprint(f'[+] Done! {count_error} links left in {error_table} table')

    # Close only what we created; never double-close.
    if close_:
        con.close()
    if close:
        engine.dispose()
示例#9
0
def update_db(df, backup=None, queue=None, engine=None, con=None, shuffle=False):
    """Append df's links to the backup and queue tables.

    Loads the DataFrame into a scratch table 'erase', moves the links with
    `insert ignore` (so duplicates are skipped), drops the scratch table
    and finally offers to shuffle the queue.
    """
    backup = config['TABLES']['BACKUP'] if backup is None else backup
    queue = config['TABLES']['QUEUE'] if queue is None else queue
    if engine is None and con is None:
        engine = mysql_engine()
    if con is None:
        con = engine.connect()

    print()
    df.to_sql('erase', con=con, index=False, if_exists='append', chunksize=50000)

    # Backup first, then queue.
    for target in (backup, queue):
        engine.execute(f'insert ignore into {target} (original_link) select original_link from erase')

    engine.execute('drop table if exists erase')

    tprint('Successfully added urls to database.')

    shuffle_table(engine=engine, shuffle=shuffle)
示例#10
0
def get_press_rows(n=1, engine=None, con=None,
                   result=None,
                   ids=None, source=None):
    """Fetch rows from the result table, by explicit ids or at random.

    Parameters
    ----------
    n : int
        Number of random rows to sample when *ids* is not given.
    engine, con : optional
        DB engine/connection; created (and cleaned up) when omitted.
    result : str, optional
        Result table name; defaults to the configured one.
    ids : list[int], optional
        Explicit row ids to fetch; random ids are sampled when omitted.
    source : optional
        When given (together with *ids*), skips the random-id sampling.
    """
    # Honor a caller-supplied table name (it was silently overwritten before).
    if result is None:
        result = config['TABLES']['RESULT']
    db = config['MYSQL']['DB']

    close_engine = False
    close_con = False

    if engine is None:
        engine = mysql_engine()
        close_engine = True

    if con is None:
        con = engine.connect()
        close_con = True

    if source is None and ids is None:
        # AUTO_INCREMENT is an upper bound on existing ids.
        n_rows = engine.execute(f'SELECT Auto_increment FROM information_schema.tables WHERE table_name = "{result}" and table_schema = "{db}";').scalar()
        ids = random.sample(range(1, n_rows), n)

    # Build "id=1 or id=2 ..." directly instead of append-then-strip.
    query = f'select * from {result} where ' + ' or '.join('id='+str(i) for i in ids)

    rows = pd.read_sql(query, con)

    if close_con:
        con.close()
    if close_engine:
        engine.dispose()

    return rows
示例#11
0
def insert_set(iterable, table, column, engine=None, con=None):
    """Bulk-insert *iterable* into ``table.column``, skipping duplicates.

    Loads the values into a scratch table 'erase' via pandas, then moves
    them with `insert ignore` and drops the scratch table.

    Parameters
    ----------
    iterable : iterable
        Values to insert (materialized into a list).
    table, column : str
        Destination table and column.
    engine, con : optional
        DB engine/connection; created when both are omitted.
    """
    if engine is None and con is None:
        engine = mysql_engine()

    if con is None:
        con = engine.connect()

    values = list(iterable)

    df = pd.DataFrame({'original_link': values})

    df.to_sql('erase',
              con=con,
              if_exists='replace',
              index=False,
              chunksize=50000)

    # Fall back to the connection when only `con` was supplied
    # (previously `engine.execute` crashed on engine=None).
    executor = engine if engine is not None else con
    executor.execute(f'insert ignore into {table} ({column}) select {column} from erase')
    executor.execute('drop table erase')
示例#12
0
def get_chunk_from_db(n=150,
                      queue_table=None,
                      processed_table=None,
                      delete=False,
                      engine=None,
                      con=None,
                      rand=False):
    """Read up to *n* links from the queue table as a DataFrame.

    With delete=True the links are also copied into the processed table
    (through a temp table) and removed from the queue.  Returns None when
    the read or the move fails.

    Parameters
    ----------
    n : int
        Maximum number of links to read.
    queue_table, processed_table : str, optional
        Table names; default to the configured ones.
    delete : bool
        Move the links to the processed table and delete them from queue.
    engine, con : optional
        DB engine/connection; created lazily when needed.
    rand : bool
        Read in random order instead of id order.
    """
    if queue_table is None:
        queue_table = config['TABLES']['QUEUE']
    if processed_table is None:
        processed_table = config['TABLES']['PROCESSED']

    # The old signature used `engine=mysql_engine()`, which opened an engine
    # at import time and shared it across all calls; create it lazily and
    # only when it is actually needed.
    if engine is None and (con is None or delete):
        engine = mysql_engine()
    if con is None:
        con = engine.connect()

    order = 'order by id'
    if rand:
        order = 'order by rand()'

    query = f'select original_link from {queue_table} {order} limit {str(n)}'

    try:
        # Reading rows
        df = pd.read_sql(query, con)

        # Backup and delete rows
        if delete:
            temp_table = temp_name(engine)
            df.to_sql(temp_table, con=con, if_exists='append', index=False)

            insert_query = f'INSERT  IGNORE INTO {processed_table} (original_link) SELECT original_link FROM {temp_table}'

            engine.execute(insert_query)
            engine.execute(f'delete from {queue_table} limit {str(n)}')
            engine.execute(f'drop table if exists {temp_table}')

    except Exception as exc:
        tprint('[-] Error en get_chunk_from_db()', exc)
        df = None

    return df
示例#13
0
def list_discarded(con=None):
    """Return links present in PROCESSED but missing from every other table.

    Parameters
    ----------
    con : optional
        Open DB connection; created (and closed afterwards) when omitted.
    """
    close = False
    engine = None

    if con is None:
        engine = mysql_engine()
        con = engine.connect()
        close = True

    tables = ['PROCESSED', 'RESULT', 'ERROR', 'QUEUE']

    sets = {}
    for table in tables:
        # Reuse our connection (each query previously opened — and leaked —
        # its own engine inside mysql_query_as_set).
        sets[table] = mysql_query_as_set(f"select original_link from {config['TABLES'][table]}", con=con)

    discarded = sets['PROCESSED'] - sets['RESULT'] - sets['ERROR'] - sets['QUEUE']

    if close:
        con.close()
        engine.dispose()

    return list(discarded)
示例#14
0
def init_mysql_db(engine=None):
    """Create the project's MySQL tables and uniqueness constraints.

    Creates the processed/queue/result/error tables if missing, then
    best-effort adds the `no_duplicates` unique keys (the alters fail
    harmlessly when a constraint already exists).

    Parameters
    ----------
    engine : optional
        SQLAlchemy engine; created (and disposed afterwards) when omitted.
    """
    close = False

    if engine is None:
        engine = mysql_engine()
        close = True

    queue = config['TABLES']['QUEUE']
    processed = config['TABLES']['PROCESSED']
    result = config['TABLES']['RESULT']
    error = config['TABLES']['ERROR']
    backup = config['TABLES']['BACKUP']

    processed_query = f'''
    CREATE TABLE IF NOT EXISTS {processed} (
        original_link VARCHAR(300),
        UNIQUE KEY no_duplicates (original_link)
    ) CHARSET=utf8mb4; '''

    queue_query = f'''
    CREATE TABLE IF NOT EXISTS {queue} (
      original_link varchar(300) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci DEFAULT NULL,
      id int(11) NOT NULL AUTO_INCREMENT,
      PRIMARY KEY (id),
      UNIQUE KEY no_duplicates (original_link)
    ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4; '''

    result_query = f'''
    CREATE TABLE IF NOT EXISTS {result} (
      id int(11) NOT NULL AUTO_INCREMENT,
      title varchar(255) DEFAULT NULL,
      description text,
      body text,
      authors varchar(300) DEFAULT NULL,
      date datetime DEFAULT NULL,
      section varchar(120) DEFAULT NULL,
      original_link varchar(300) DEFAULT NULL,
      source varchar(120) DEFAULT NULL,
      year smallint(6) DEFAULT NULL,
      image varchar(300) DEFAULT NULL,
      tags varchar(300) DEFAULT NULL,
      link varchar(300) DEFAULT NULL,
      PRIMARY KEY (id),
      UNIQUE KEY no_duplicates (link)
    ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4'''

    error_query = f'''
    CREATE TABLE IF NOT EXISTS {error} (
        id int(11) NOT NULL AUTO_INCREMENT,
        original_link VARCHAR(300),
        info VARCHAR(300),
        borrar tinyint(1) DEFAULT 0,
        PRIMARY KEY (id)
    ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4'''

    engine.execute(processed_query)
    engine.execute(queue_query)
    engine.execute(result_query)
    engine.execute(error_query)

    # No duplicates — best-effort: these raise when the constraint already
    # exists, which is fine.  Narrowed from bare `except:` so that
    # KeyboardInterrupt/SystemExit are not swallowed.

    result_no_dup = f'alter table {result} add constraint no_duplicates unique key(link)'
    queue_no_dup = f"alter table {queue} add constraint no_duplicates unique key(original_link)"
    backup_no_dup = f"alter table {backup} add constraint no_duplicates unique key(original_link)"

    try:
        engine.execute(result_no_dup)
    except Exception: pass
    try:
        engine.execute(queue_no_dup)
    except Exception: pass
    try:
        engine.execute(backup_no_dup)
    except Exception: pass

    if close:
        engine.dispose()
示例#15
0
def delete_error_where(engine=None, con=None, where=''):
    """Delete error-table rows matching *where*, after confirmation.

    The matching links are also removed from the processed table by
    rebuilding it (through a date-stamped temp table) without them.
    NOTE: *where* is interpolated directly into SQL — trusted input only.

    Parameters
    ----------
    engine, con : optional
        DB engine/connection; created (and cleaned up) when omitted.
    where : str, optional
        SQL where clause selecting the error rows to remove.
    """
    processed_table = config['TABLES']['PROCESSED']
    error_table = config['TABLES']['ERROR']

    close = False
    close_ = False

    if engine is None:
        engine = mysql_engine()
        close = True

    if con is None:
        # Previously a caller-supplied `con` was clobbered when engine was None.
        con = engine.connect()
        close_ = True

    # Where clause
    if where == '':
        where= input('Where clause for mysql query:\n\t- ')
        print()

    # Count and confirm
    tprint('[·] Counting links...')
    count = engine.execute(f'select count(*) from {error_table} where {where}').scalar()
    y = input(f'\nAre you sure you want to remove {count} links? (y/n): ')
    print()

    if y == 'y':
        # Get links to be removed
        tprint('[·] Getting Links...')
        to_be_removed = mysql_query_as_set(f'select original_link from {error_table} where {where};', con=con)

        # Filtering Processed
        tprint('[·] Filtering processed table...')
        processed = mysql_query_as_set(f'select original_link from {processed_table};', con=con)
        processed = processed - to_be_removed

        # Reinserting into processed
        tprint('[·] Reinserting into processed table...')
        temp = f'{processed_table}_backup_'+datetime.now().strftime('%d_%m')
        engine.execute(f'create table {temp} like {processed_table}')
        insert_set(processed, temp, 'original_link', engine=engine, con=con)

        engine.execute(f'drop table {processed_table}')
        engine.execute(f'rename table {temp} to {processed_table}')

        # Delete from error
        tprint('[·] Deleting from processed and error table...')
        engine.execute(f'delete from {error_table} where {where}')

        # Done.
        count_error = engine.execute(f'select count(*) from {error_table}').scalar()
        tprint(f'[+] Done! {count_error} links left in {error_table} table')

    # Close only what we created; never double-close.
    if close_:
        con.close()
    if close:
        engine.dispose()
示例#16
0
def program(result_table=None,
            df=None,
            debug=True,
            n_pools=15,
            n=150,
            queue_table=None,
            processed_table=None,
            error_table=None,
            delete=False,
            engine=None,
            con=None,
            rand=False):
    """Main scraping loop: initialize the DB, then call work() until the
    queue is empty, recycling the engine/connection every iteration.

    Table names default to the configured ones.  The queue count is
    refreshed every 100 iterations; exits via sys.exit() when work()
    reports 'done' or on Ctrl-C.
    """
    if df is None:
        # Bug fix: was `pd.DataFrame` (the class itself, not an instance).
        df = pd.DataFrame()

    if result_table is None:
        result_table = config['TABLES']['RESULT']
    if queue_table is None:
        queue_table = config['TABLES']['QUEUE']
    if processed_table is None:
        processed_table = config['TABLES']['PROCESSED']
    if error_table is None:
        error_table = config['TABLES']['ERROR']

    # Initialiazing...
    if engine is None:
        engine = mysql_engine()
    if con is None:
        con = engine.connect()

    print()
    tprint('[·] Initializing...')
    status = 'working'
    init_mysql_db(engine=engine)
    recover_discarded(con=con)
    queue = len_tables('queue')['queue']
    tprint('[+] Done.')
    print()

    con.close()
    engine.dispose()

    i = 1
    while queue != 0:
        # Fresh engine/connection each round; long-lived ones time out.
        engine = mysql_engine()
        con = engine.connect()

        try:
            result, status = work(result_table=result_table,
                                  df=df,
                                  debug=debug,
                                  n_pools=n_pools,
                                  n=n,
                                  queue_table=queue_table,
                                  processed_table=processed_table,
                                  error_table=error_table,
                                  delete=delete,
                                  engine=engine,
                                  con=con,
                                  rand=rand)

            if status == 'done':
                tprint('[+] DONE!')
                input('\n(ENTER)')
                sys.exit()

        except KeyboardInterrupt:
            sys.exit()

        except Exception as exc:
            tprint('[-] Error General - ', exc)
        print()

        con.close()
        engine.dispose()

        # Refresh the queue count every 100 rounds.
        if i % 100 == 0:
            queue = len_tables('queue')['queue']
            tprint('[+] {} left in queue.'.format(queue))
            print()

        i += 1

    tprint('[+] DONE!')