from sys import argv

from pymongo import MongoClient

# get_file_name, load_urls, and add_to_mongo are assumed to be imported from
# the project's shared scraping helpers (module not shown in this section).


if __name__ == '__main__':
    '''
    This script should be called in the following way:
        $ python fox_scraper.py 'startdate' 'enddate' 'table (optional)'
    '''
    # Create MongoClient
    client = MongoClient()
    # Initialize the Database
    db = client['election_analysis']
    # Initialize table
    # If a table name has been provided use that, otherwise initialize the
    # 'articles' table
    if len(argv) > 3:
        tab = db[argv[3]]
    else:
        tab = db['articles']
    start_date, end_date = argv[1], argv[2]

    print 'Scraping FOX URLs from {0} to {1}'.format(start_date, end_date)

    file_path = '../url_files/{0}'.format(
        get_file_name('fox', start_date, end_date))
    urls = load_urls(file_path)

    # Track URLs whose articles could not be extracted and inserted.
    bad_urls = []
    for url in urls:
        result = add_to_mongo(tab, url)
        if result:
            bad_urls.append(result)

    print 'FOX Scraping Done...'
    print 'Number of Bad Extractions = {0}'.format(len(bad_urls))
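# The helpers used above (get_file_name, load_urls, add_to_mongo) live
# elsewhere in the repo and are not shown in this section. The sketch below is
# only a guess at the contract they appear to satisfy, inferred from the call
# sites: the file-name pattern, the JSON storage format, and the convention of
# returning the failing URL are all assumptions, not the project's actual code.
import json


def get_file_name(source, start_date, end_date):
    # Assumed naming convention for the URL files kept under ../url_files/.
    return '{0}_{1}_{2}.json'.format(source, start_date, end_date)


def load_urls(file_path):
    # Assumed to return the list of article URLs collected by an earlier step.
    with open(file_path) as f:
        return json.load(f)


def add_to_mongo(tab, url):
    # Assumed contract: extract the article at `url`, insert it into the
    # MongoDB collection `tab`, and return None on success or the failing URL
    # so the caller can count bad extractions.
    try:
        article = {'url': url}  # placeholder for the real extraction logic
        tab.insert_one(article)
        return None
    except Exception:
        return url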
import random

import lorem
import requests

from load_data import load, load_urls
from search import *

# Endpoint of the local projects API to be seeded with sample data.
site = 'http://localhost/api/projects/'

data = load()
websites = load_urls()
# Column index 4 of each row holds the project name; de-duplicate the names.
project_names = list(set(list(zip(*data))[4]))


def st(integer):
    # Truncate generated lorem text to `integer` characters.
    return lorem.text()[:integer]


print(random.choice(websites))

# POST one project per unique name, filling the remaining fields with
# placeholder lorem text and a random website from the loaded list.
for name in project_names:
    print(requests.post(site, data={
        'name': name,
        'description': lorem.paragraph()[:1000],
        'link': 'http://www.' + random.choice(websites),
        'open_positions': lorem.sentence()[:200],
    }).json())
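# The seeding script above relies on a local load_data module that is not
# shown. The sketch below only illustrates the shapes the script appears to
# expect (a row layout whose index 4 is the project name, and bare domain
# strings); the values are placeholders, not real project data.
def load():
    # Assumed: rows of project records where column index 4 holds the name.
    return [
        (1, 'owner', '2020-01-01', 'open', 'Example Project'),
        (2, 'owner', '2020-01-02', 'open', 'Another Project'),
    ]


def load_urls():
    # Assumed: bare domains, since the script prepends 'http://www.'.
    return ['example.com', 'example.org']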
import json
from sys import argv

from pymongo import MongoClient

# get_file_name, load_urls, log_in_wsj, and scrape_wsj are assumed to be
# imported from the project's shared scraping helpers (module not shown here).


# Create MongoClient
client = MongoClient()
# Initialize the Database
db = client['election_analysis']
# Initialize table
# If a table name has been provided use that, otherwise initialize the
# 'articles' table
if len(argv) > 3:
    tab = db[argv[3]]
else:
    tab = db['articles']
start_date, end_date = argv[1], argv[2]

print 'Scraping WSJ URLs from {0} to {1}'.format(start_date, end_date)

file_path = '../url_files/{0}'.format(
    get_file_name('wsj', start_date, end_date))
urls = load_urls(file_path)

good_urls, bad_urls = [], []
# Log in to WSJ and scrape each URL, collecting successes and failures.
driver = log_in_wsj()
inserts, good_urls, bad_urls = scrape_wsj(tab, driver, urls, good_urls, bad_urls)
driver.close()

print 'WSJ Scraping Done...'
print 'Number of Bad URLs = {0}'.format(len(bad_urls))

# Save any URLs that failed so they can be re-scraped later.
if len(bad_urls):
    file_path = '../url_files/{0}'.format(
        get_file_name('wsj', start_date, end_date, bad=True))
    with open(file_path, 'w') as f:
        f.write(json.dumps(list(bad_urls)))
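# The bad=True call above suggests get_file_name can also name a retry file of
# failed URLs, and driver.close() suggests log_in_wsj returns a browser driver
# (e.g. a Selenium WebDriver) holding an authenticated WSJ session. Neither
# helper is shown here; the sketch below is one plausible naming convention,
# consistent with the earlier sketch but not confirmed by the source.
def get_file_name(source, start_date, end_date, bad=False):
    # Assumed: same pattern as the regular URL files, with a '_bad' marker so
    # failed URLs can be re-run later.
    suffix = '_bad' if bad else ''
    return '{0}_{1}_{2}{3}.json'.format(source, start_date, end_date, suffix)


# A retry pass could then reload the failed URLs the same way, e.g.:
#     urls = load_urls('../url_files/{0}'.format(
#         get_file_name('wsj', start_date, end_date, bad=True)))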