Example #1
File: webapp.py Project: zhiwehu/scraper
def do_upload():
    csvfile = request.files.get('csvfile', None)
    fblogin()
    s = Scraper()
    try:
        if csvfile is None or csvfile.file is None:
            raise Exception('The file is None')
        company_list = s.read_csv(csvfile.file)
        if len(company_list) > 0:
            # Save the uploaded file under the data directory
            with open('%s/data/%s' % (WEB_ROOT, csvfile.filename), 'wb') as f:
                f.write(csvfile.value)

            # Record the CSV path and its result database path (CSV_DB table)
            csv_file_path, db_file_path = save_csv_db(csvfile)
            # Run the scrape process in the background
            # TODO just upload, not scrape
            do_scrape_async(s, csv_file_path, db_file_path)
        else:
            raise Exception('The file is not formatted as a company list')
    except Exception as e:
        log.error(e)
        return csv_upload(error_message='Error: %s' % e.message)
    return csv_upload(
        success_message='The file was uploaded successfully and scraping '
        'will run in the background. Please refresh the page later to '
        'view the new data.')
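The helpers save_csv_db and do_scrape_async are defined elsewhere in webapp.py and are not included in these excerpts. Judging from re_scrape_schedule and doJob below, do_scrape_async most likely just starts a ScrapeThread in the background; a minimal sketch under that assumption (not the project's actual implementation):

# Hypothetical do_scrape_async, assuming it only needs to kick off a
# background ScrapeThread; the real function in webapp.py is not shown here.
def do_scrape_async(scraper, csv_file_path, db_file_path):
    thread = ScrapeThread(scraper, csv_file_path, db_file_path)
    thread.start()
    return thread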
Example #2
File: webapp.py Project: zhiwehu/scraper
def re_scrape_schedule():
    # Read the schedule interval and the CSV/DB file pairs from setting.db
    conn = sqlite3.connect('data/setting.db')
    c = conn.cursor()
    schedule_interval = c.execute(
        'SELECT SCHEDULE_INTERVAL FROM SETTINGS').fetchone()[0]
    csv_db_file_list = c.execute(
        'SELECT CSV_FILE_PATH, DB_FILE_PATH FROM CSV_DB').fetchall()
    c.close()
    conn.close()

    fblogin()

    # Start one background scrape thread per CSV/DB pair
    for csv_path, db_path in csv_db_file_list:
        s = Scraper()
        thread = ScrapeThread(s, csv_path, db_path)
        thread.start()

    # Reschedule the cron job with the new interval (in seconds)
    cron.reSchedule(seconds=schedule_interval)
    return settings(
        success_message='The scrape threads have been started in the '
        'background and the cron job has been rescheduled.')
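ScrapeThread itself is also not part of these excerpts. Given how it is constructed and started above, and the Scraper API used in the other examples (read_csv, get_social_media, write_db), it is presumably a thin threading.Thread wrapper; a sketch under those assumptions:

import threading

# Hypothetical ScrapeThread: scrape one CSV/DB pair in a worker thread.
# This is an assumption based on how the class is used above, not the
# actual class from the zhiwehu/scraper project.
class ScrapeThread(threading.Thread):
    def __init__(self, scraper, csv_path, db_path):
        threading.Thread.__init__(self)
        self.scraper = scraper
        self.csv_path = csv_path
        self.db_path = db_path

    def run(self):
        # Read the company list, scrape its social media data, and save it
        with open(self.csv_path, 'rb') as f:
            company_list = self.scraper.read_csv(f)
        result = self.scraper.get_social_media(company_list, self.db_path)
        self.scraper.write_db(result, self.db_path)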
Example #3
File: cron.py Project: zhiwehu/scraper
def doJob():
    '''
        Run the scrape job at every scheduled interval.
    '''
    # Get all CSV and DB file paths from setting.db
    conn = sqlite3.connect('data/setting.db')
    c = conn.cursor()
    csv_db_file_list = c.execute(
        'SELECT CSV_FILE_PATH, DB_FILE_PATH FROM CSV_DB').fetchall()
    c.close()
    conn.close()

    fblogin()

    for csv_path, db_path in csv_db_file_list:
        s = Scraper()
        thread = ScrapeThread(s, csv_path, db_path)
        # Calling run() directly executes the scrape in the current thread,
        # so the jobs run one by one; use start()/join() to run them
        # concurrently instead.
        thread.run()

    log.info('all scraper threads finished in doJob()')
    return
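As the comment above notes, the same loop can run the scrapes concurrently by starting each thread and then joining them all; a sketch of that variant, reusing csv_db_file_list, Scraper and ScrapeThread from doJob():

# Concurrent variant of the loop in doJob(): start every ScrapeThread
# first, then wait for all of them to finish.
threads = []
for csv_path, db_path in csv_db_file_list:
    thread = ScrapeThread(Scraper(), csv_path, db_path)
    thread.start()
    threads.append(thread)
for t in threads:
    t.join()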
Example #4
File: tests.py Project: zhiwehu/scraper
    def testMain(self):
        fblogin()
        s = main.Scraper()

        # Test None file
        file = None
        try:
            s.read_csv(file)
        except Exception as e:
            self.assertEqual('The file is none.', e.message)

        # Test a well-formatted CSV
        file = open('testdata/good_format.csv', 'rb')
        company_list = s.read_csv(file)
        self.assertTrue(len(company_list) > 0)
        result = s.get_social_media(company_list[0:1], 'testdata/data.db')
        self.assertEqual(1, len(result))
        c = result[0]
        self.assertEqual('Wal-Mart Stores', c.company_name)

        # Test another well-formatted CSV
        file = open('data/NRN_RestaurantList.csv', 'rb')
        company_list = s.read_csv(file)
        self.assertTrue(len(company_list) > 0)
        result = s.get_social_media(company_list[0:1], 'testdata/data.db')
        self.assertEqual(1, len(result))
        c = result[0]

        # Test a badly formatted CSV
        try:
            file = open('testdata/error_format.csv', 'rb')
            s.read_csv(file)
        except Exception as e:
            self.assertTrue(e)

        # Test writing to the database, then clean up the COMPANY table
        s.write_db(result, 'testdata/data.db')
        conn = sqlite3.connect('testdata/data.db')
        c = conn.cursor()
        c.execute('DELETE FROM COMPANY')
        conn.commit()
        c.close()
        conn.close()
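Note that the None-file check at the top of testMain passes silently if read_csv unexpectedly succeeds. With unittest in Python 2.7+, the same check can be written with assertRaises as a context manager; a sketch, where the expected message matches the assertion used in testMain:

    # Sketch: the None-file case rewritten with assertRaises. The expected
    # message is taken from the assertEqual in testMain above.
    def test_read_csv_none(self):
        s = main.Scraper()
        with self.assertRaises(Exception) as ctx:
            s.read_csv(None)
        self.assertEqual('The file is none.', ctx.exception.message)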
Example #5
File: main.py Project: zhiwehu/scraper
(excerpt: the end of Scraper.write_db, followed by the command-line entry point)
                     company.micro_metrics['tw_percent'],
                     company.micro_metrics['yt_percent'],
                     company.micro_metrics['fb_abs'],
                     company.micro_metrics['tw_abs'],
                     company.micro_metrics['yt_abs'],
                     company.time_taken
                        ))
                count += 1
            except Exception as e:
                log.error(e)
                pass

        conn.commit()
        c.close()
        conn.close()
        return count

if __name__ == '__main__':
    log.info('begin')
    args = sys.argv
    if len(args) >= 2:
        file = open(args[1], 'r')
        fblogin()
        s = Scraper()
        count = s.write_db(s.get_social_media(s.read_csv(file), 'data/data.db'), 'data/data.db')
        print '\n'
        print '%d records have been saved to database %s' % (count, 'data/data.db')
    else:
        print 'Please input the file name as the first parameter.'
    log.info('end')
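main.py can therefore be run directly against a company CSV file, with the results written to data/data.db; for example, using the restaurant list referenced in the tests above:

python main.py data/NRN_RestaurantList.csv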