Example #1
from sys import argv

from pymongo import MongoClient

# get_file_name, load_urls and add_to_mongo are assumed to be importable
# from the project's local scraping helpers; they are not shown here.

if __name__ == '__main__':
    ''' This script should be called in the following way:
    $ python fox_scraper.py 'startdate' 'enddate' 'table (optional)'
    '''
    # Create MongoClient
    client = MongoClient()
    # Initialize the Database
    db = client['election_analysis']
    # Initialize table
    # If a table name has been provided use that, otherwise initialize 'articles' table
    if len(argv) > 3:
        tab = db[argv[3]]
    else:
        tab = db['articles']

    start_date, end_date = argv[1], argv[2]
    print('Scraping FOX URLs from {0} to {1}'.format(start_date, end_date))

    file_path = '../url_files/{0}'.format(
        get_file_name('fox', start_date, end_date))
    urls = load_urls(file_path)

    bad_urls = []
    for url in urls:
        result = add_to_mongo(tab, url)
        if result:
            bad_urls.append(result)

    print('FOX Scraping Done...')
    print('Number of Bad Extractions = {0}'.format(len(bad_urls)))
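The script depends on local helpers (get_file_name, load_urls, add_to_mongo) that are not part of the snippet. As a rough illustration only, add_to_mongo might look like the sketch below, assuming it downloads the page, stores the raw HTML in the collection, and hands the URL back on failure so the caller can count it as a bad extraction; none of this is the original implementation.

import requests


def add_to_mongo(tab, url):
    """Hypothetical helper: fetch `url` and store it in collection `tab`.

    Returns None on success, or the URL itself so the caller can record it
    as a bad extraction.
    """
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
    except requests.RequestException:
        return url
    tab.insert_one({'url': url, 'html': resp.text})
    return None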
Example #2
import requests
import lorem
import random

from load_data import load, load_urls
from search import *


# Target endpoint of the local projects API.
site = 'http://localhost/api/projects/'

data = load()
websites = load_urls()

# Index 4 of each loaded row is the project name; de-duplicate with a set.
project_names = list(set(list(zip(*data))[4]))


def st(integer):
    """Return the first `integer` characters of a lorem-ipsum text."""
    return lorem.text()[:integer]


# Sanity check: show one of the loaded website domains.
print(random.choice(websites))

# Seed the API with one project per unique name, using placeholder
# lorem-ipsum text for the description and open positions.
for name in project_names:
    response = requests.post(site, data={
        'name': name,
        'description': lorem.paragraph()[:1000],
        'link': 'http://www.' + random.choice(websites),
        'open_positions': lorem.sentence()[:200],
    })
    print(response.json())
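Here load and load_urls come from a local load_data module that is not shown. A minimal sketch of what load_urls could look like, assuming it simply reads one website domain per line from a text file (the file name websites.txt is a placeholder, not taken from the source):

def load_urls(path='websites.txt'):
    # Hypothetical helper: return the non-empty lines of a domains file.
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]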
Example #3
import json
from sys import argv

from pymongo import MongoClient

# get_file_name, load_urls, log_in_wsj and scrape_wsj are assumed to be
# importable from the project's local scraping helpers; they are not shown.

if __name__ == '__main__':
    # Create MongoClient
    client = MongoClient()
    # Initialize the Database
    db = client['election_analysis']
    # Initialize table
    # If a table name has been provided use that, otherwise initialize 'articles' table
    if len(argv) > 3:
        tab = db[argv[3]]
    else:
        tab = db['articles']

    start_date, end_date = argv[1], argv[2]
    print('Scraping WSJ URLs from {0} to {1}'.format(start_date, end_date))

    file_path = '../url_files/{0}'.format(get_file_name('wsj', start_date, end_date))
    urls = load_urls(file_path)
    good_urls, bad_urls = [], []

    driver = log_in_wsj()

    inserts, good_urls, bad_urls = scrape_wsj(tab, driver, urls, good_urls, bad_urls)
    driver.close()

    print('WSJ Scraping Done...')
    print('Number of Bad URLs = {0}'.format(len(bad_urls)))
    if bad_urls:
        file_path = '../url_files/{0}'.format(get_file_name('wsj', start_date, end_date, bad=True))
        with open(file_path, 'w') as f:
            f.write(json.dumps(list(bad_urls)))
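Both scrapers build their input and output paths with get_file_name('fox' or 'wsj', start_date, end_date, bad=...). Only that call signature is visible in the snippets; as an assumption, the helper could be as simple as the sketch below, and the exact naming scheme is hypothetical.

def get_file_name(source, start_date, end_date, bad=False):
    # Hypothetical naming scheme, e.g. wsj_20160101_20160131_bad.json
    suffix = '_bad' if bad else ''
    return '{0}_{1}_{2}{3}.json'.format(source, start_date, end_date, suffix)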