import datetime
import json
from sys import argv

if __name__ == '__main__':
    '''
    This script should be called in the following way:

        $ python wsj_article_urls.py 'startdate' 'enddate'
    '''
    start_date, end_date = argv[1], argv[2]
    start_datetime = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    end_datetime = datetime.datetime.strptime(end_date, '%Y-%m-%d')
    # Scrape the current window, then step the start of the window back a
    # week and rescrape; the loop runs until interrupted
    while True:
        print('Scraping WSJ URLs from {0} to {1}'.format(start_date, end_date))
        # Get dates to search over
        dates = get_dates(start_date, end_date)
        urls = set()
        for date in dates:
            print(date)
            urls = get_urls(date, urls)
        # Convert the urls set to a list and write it to a txt file;
        # the with block closes the file, so no explicit f.close() is needed
        file_path = '../url_files/{0}'.format(
            get_file_name('wsj', start_date, end_date))
        with open(file_path, 'w') as f:
            f.write(json.dumps(list(urls)))
        # Step back one week and refresh the start_date string so the next
        # pass actually covers the earlier window
        start_datetime = start_datetime - datetime.timedelta(days=7)
        start_date = start_datetime.strftime('%Y-%m-%d')
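The __main__ block above leans on helpers (get_dates, get_urls, get_file_name) whose definitions are not shown. Here is a minimal sketch of two of them, assuming get_dates returns 'YYYY-MM-DD' strings for every day in the range and get_file_name simply builds a descriptive file name; both signatures are inferred from the call sites, not confirmed by the source:

import datetime

def get_dates(start_date, end_date):
    # Return every day from start_date to end_date (inclusive) as
    # 'YYYY-MM-DD' strings for the scraper to iterate over.
    start = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.datetime.strptime(end_date, '%Y-%m-%d')
    return [(start + datetime.timedelta(days=i)).strftime('%Y-%m-%d')
            for i in range((end - start).days + 1)]

def get_file_name(source, start_date, end_date):
    # Build a file name such as 'wsj_2016-01-01_2016-01-08.txt'.
    return '{0}_{1}_{2}.txt'.format(source, start_date, end_date)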
        num_bad_extractions += thread.result
    return num_bad_extractions


if __name__ == '__main__':
    '''
    This script should be called in the following way:

        $ python npr_scraper.py 'startdate' 'enddate' 'table (optional)'
    '''
    # Create the MongoClient
    client = MongoClient()
    # Initialize the database
    db = client['election_analysis']
    # Initialize the table: use the provided name if one was given,
    # otherwise fall back to the 'articles' table
    if len(argv) > 3:
        tab = db[argv[3]]
    else:
        tab = db['articles']
    start_date, end_date = argv[1], argv[2]
    print('Scraping NPR from {0} to {1}'.format(start_date, end_date))
    dates = get_dates(start_date, end_date)
    keywords = get_keywords_2016()
    num_bad_extractions = concurrent_scrape_npr(tab, keywords, dates)
    print('NPR Scraping Done...')
    print('Number of Bad Extractions = {0}'.format(num_bad_extractions))
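The thread.result attribute read at the top of this fragment suggests the workers are Thread objects that stash their return value, rather than concurrent.futures futures (which expose a result() method). Here is a minimal sketch of such a wrapper under that assumption, with scrape_npr as a hypothetical per-date worker that returns its count of failed extractions:

import threading

class ResultThread(threading.Thread):
    # Thread subclass that stores its target's return value in
    # self.result so callers can read it after join().
    def __init__(self, func, *args):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args
        self.result = 0

    def run(self):
        self.result = self.func(*self.args)

# Hypothetical usage inside concurrent_scrape_npr: start one ResultThread
# per date, join them all, then sum thread.result as the fragment above does.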