Пример #1
0
def get_page():
    """Visit every URL in ``data`` and record the URL the browser ends up on.

    ``data``, ``num`` and ``table`` are not parameters; they are read from the
    enclosing (module) scope. Each item of ``data`` is unpacked as
    ``(url, pid, mid)``. For every item the URL is loaded in the driver
    returned by ``phantomjs.start()``, the function sleeps 20 seconds, and the
    driver's ``current_url`` is passed to ``update_db`` together with the open
    connection and cursor.

    A URL whose visit raises is logged and skipped. Nothing is done (no
    connection, no driver) when ``data`` is empty.

    The driver and the database connection are always released, even when
    ``update_db`` or the driver itself raises part-way through the run.

    Returns:
        None.
    """
    if not data:
        return

    conn, cur = mysql_database.connect()
    try:
        driver = phantomjs.start()
        try:
            ## counts successful visits only; failed URLs are not numbered
            counter = 0

            for url, pid, mid in data:
                try:
                    driver.get(url)
                    counter += 1
                    print('{} visiting {}'.format(counter, url))
                    ## allow time for page to load (or possibly redirect)
                    time.sleep(20)
                except Exception as e:
                    logging.info(
                        'An error occured trying to visit {}. \n{}'.format(
                            url, e))
                    continue

                ## errors from here on are not swallowed: they abort the run,
                ## and the finally blocks below still clean up
                current_url = driver.current_url
                update_db(driver, url, pid, mid, current_url, conn, cur, num,
                          table)
        finally:
            driver.quit()
    finally:
        mysql_database.disconnect(conn, cur)
def check():
    """Visit every URL in ``data`` and record the URL the browser ends up on.

    ``data`` is not a parameter; it is read from the enclosing (module) scope
    and each item is unpacked as ``(url, pid)``. A PhantomJS driver is started
    from ``local_path.phantomjs_path`` with a 1124x850 window. For every item
    the URL is loaded, the function sleeps 20 seconds, and the driver's
    ``current_url`` is passed to ``update_db`` along with ``sys.argv[1]``.

    A URL whose visit raises is reported on stdout and skipped. Nothing is
    done (no connection, no driver) when ``data`` is empty.

    The driver and the database connection are always released, even when
    ``update_db``, the ``sys.argv[1]`` lookup or the driver raises part-way
    through the run.

    Returns:
        None.
    """
    if not data:
        return

    conn, cur = mysql_database.connect()
    try:
        print('starting the driver...')
        driver = webdriver.PhantomJS(
            executable_path=local_path.phantomjs_path)
        try:
            driver.set_window_size(1124, 850)

            ## counts successful visits only; failed URLs are not numbered
            counter = 0

            for url, pid in data:
                try:
                    driver.get(url)
                    counter += 1
                    print('{} visiting {}'.format(counter, url))
                    ## allow time for page to load (or possibly redirect)
                    time.sleep(20)
                except Exception as e:
                    print(str(e))
                    print('An error occured trying to visit {}.'.format(url))
                    continue

                ## errors from here on are not swallowed: they abort the run,
                ## and the finally blocks below still clean up
                current_url = driver.current_url
                update_db(url, pid, current_url, conn, cur, sys.argv[1])
        finally:
            driver.quit()
    finally:
        mysql_database.disconnect(conn, cur)
def select_posts(table):
    """Return the ``(url, pid)`` rows of ``table`` that still need testing.

    A row qualifies when its ``pubdate`` is earlier than two days before the
    current date and its ``tested`` column equals ``"not yet"``. The number of
    rows found is printed to stdout.

    The connection is closed even if the query or the fetch raises.

    Args:
        table: Name of the table to query. It is interpolated directly into
            the SQL text (identifiers cannot be bound as parameters), so it
            must come from a trusted source.

    Returns:
        A list of the rows returned by the cursor.
    """
    conn, cur = mysql_database.connect()
    try:
        cur.execute(
        '''SELECT url, pid from {}
    WHERE pubdate < CURDATE() - INTERVAL 2 DAY
    AND tested = "not yet" '''.format(table))

        data = list(cur.fetchall())
    finally:
        mysql_database.disconnect(conn, cur)

    print('no. of posts to be tested: {}'.format(len(data)))

    return data
Пример #4
0
def select(num, table):
    """Fetch the posts that are due for the given test round.

    The round is chosen by ``num`` and forwarded to ``selectData`` (defined
    elsewhere) as a day count and a column name:

    * ``'1'`` -- 1st test after 2 days: ``selectData(cur, '2', table, 'tested')``
    * ``'2'`` -- 2nd test after 14 days: ``selectData(cur, '14', table, 'retested')``

    ``num`` is validated before a database connection is opened, so an
    unknown round exits without leaving a connection behind; once opened, the
    connection is closed even if the query or the fetch raises.

    Args:
        num: Test round as a string, ``'1'`` or ``'2'``.
        table: Table name, passed through to ``selectData``.

    Returns:
        A list of the rows left on the cursor by ``selectData``.

    Raises:
        SystemExit: If ``num`` is neither ``'1'`` nor ``'2'``.
    """
    ## 1st test after 2 days
    if num == '1':
        days, column = '2', 'tested'
    ## 2nd test after 14 days
    elif num == '2':
        days, column = '14', 'retested'
    else:
        sys.exit('unable to select appropriate data to test')

    conn, cur = mysql_database.connect()
    try:
        selectData(cur, days, table, column)
        data = list(cur.fetchall())
    finally:
        mysql_database.disconnect(conn, cur)

    logging.info('\nno. of posts to be tested: {}'.format(len(data)))

    return data
Пример #5
0
def save_to_db(table, queue):
    """Drain ``queue`` and insert every post whose URL is not yet in ``table``.

    Each queue item is a mapping with the keys ``"content"``, ``"url"``,
    ``"uid"``, ``"pid"`` and ``"mid"``, each holding a sequence indexed in
    parallel (row ``i`` is made of the ``i``-th element of every sequence).
    The number of rows in a batch is taken from ``"content"``.

    A row is skipped when its URL is already stored in ``table`` or was saved
    earlier in the same batch. Every insert is committed individually; a row
    whose insert or commit raises is logged and neither counted as saved nor
    as skipped. Totals and the final row count of ``table`` are logged at the
    end.

    The connection is closed even if an unexpected error aborts the run.

    Args:
        table: Name of the target table. It is interpolated directly into the
            SQL text (identifiers cannot be bound as parameters), so it must
            come from a trusted source.
        queue: Object providing ``empty()`` and ``get()`` that yields the
            batches described above.

    Returns:
        None.
    """
    saved = 0
    skipped = 0

    conn, cur = mysql_database.connect()
    try:
        ## the statement text is the same for every row, so build it once
        insert_sql = '''INSERT INTO {} (content, url, uid, pid, mid, pubdate, tested, testdate, status)
                        VALUES (%s, %s, %s, %s, %s, CURDATE(), DEFAULT, DEFAULT, DEFAULT)'''.format(table)

        while not queue.empty():
            ## refreshed for every batch, as before; a set gives O(1) lookups
            cur.execute('SELECT url from {}'.format(table))
            known_urls = {row[0] for row in cur.fetchall()}
            data = queue.get()

            for i, content in enumerate(data["content"]):
                url = data["url"][i]
                if url in known_urls:
                    skipped += 1
                    continue

                try:
                    cur.execute(insert_sql,
                                (content, url, data["uid"][i],
                                 data["pid"][i], data["mid"][i]))
                    cur.connection.commit()
                except Exception as e:
                    logging.info(
                        '\nunable to insert pid {} into table. {}'.format(
                            data["pid"][i], e))
                    continue

                ## only reached once the row is committed, so the counter and
                ## the log never report a row that failed to persist
                logging.info('saved pid: {}'.format(data["pid"][i]))
                saved += 1
                ## remember it so a repeat later in this batch is skipped
                known_urls.add(url)

        logging.info('\nsaved: {}; skipped: {}'.format(saved, skipped))

        cur.execute('SELECT COUNT(*) FROM {}'.format(table))
        no_of_rows = str(cur.fetchone()[0])
        logging.info('\nno. of rows in database: {}'.format(no_of_rows))
    finally:
        mysql_database.disconnect(conn, cur)