def test_threaded():
    import thready
    results = set()

    def f(i):
        results.add(i)

    thready.threaded(range(10), f)
    assert results == set(range(10))
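# The test above exercises thready.threaded(items, func). As a point of
# reference only -- this is NOT thready's actual implementation -- a minimal
# queue-and-workers helper with a similar shape could look like the sketch
# below. The num_threads/max_queue/join keywords mirror how the snippets in
# this collection call threaded(); their exact semantics here are an assumption.
import threading
try:
    from queue import Queue      # Python 3
except ImportError:
    from Queue import Queue      # Python 2


def threaded_sketch(items, func, num_threads=10, max_queue=200, join=True):
    """Run func(item) for every item using a pool of worker threads."""
    q = Queue(maxsize=max_queue)  # maxsize=0 gives an unbounded queue

    def worker():
        while True:
            item = q.get()
            try:
                func(item)
            finally:
                q.task_done()

    for _ in range(num_threads):
        t = threading.Thread(target=worker)
        t.daemon = True           # workers die with the main thread
        t.start()

    for item in items:            # feed the queue from the (possibly lazy) iterable
        q.put(item)

    if join:
        q.join()                  # block until every queued item is processed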
def get_new_data_for_page(page_arg_set):
    """ get all new posts on a page """
    api, page_id, config = page_arg_set
    print "INFO\tFACEBOOK\tgetting new data for facebook.com/%s" % page_id

    # fetch account data so we can associate the number of likes with the account AT THAT TIME
    try:
        acct_data = api.get(page_id)
    except Exception as e:
        print e
        return None
    else:
        # determine limit
        if page_id in config['facebook']['insights_pages']:
            limit = config['facebook']['insights_limit']
        else:
            limit = config['facebook']['page_limit']

        # get last 100 articles for this page
        page = api.get(page_id + "/posts", page=False, retry=5, limit=limit)
        if DEBUG:
            for post_data in page['data']:
                post_arg_set = (api, post_data, acct_data, page_id, config)
                insert_new_post(post_arg_set)
        else:
            post_arg_sets = [(api, post_data, acct_data, page_id, config) for post_data in page['data']]
            threaded(post_arg_sets, insert_new_post, 20, 150)
def download_fixies(start_date, end_date=None):
    if not end_date:
        end_date = start_date
    all_dates = generate_date_range(start_date, end_date)
    print '\nINFO: Downloading FMS MTS from', all_dates[0], 'to', all_dates[-1], "!\n"
    fnames = [''.join(['mts', datetime.datetime.strftime(date, '%m%y'), '.txt'])
              for date in all_dates]
    threaded(fnames, request_and_test_fixie, 20, 100)
def generate_list(api, slug, list_dict):
    # parse handles
    if isinstance(list_dict['screen_names'], basestring):
        screen_names = [
            sn.strip() for sn in open(list_dict['screen_names']).read().split("\n")
            if sn != '' and sn is not None
        ]
    else:
        screen_names = list_dict['screen_names']

    owner_screen_name = list_dict['owner']
    try:
        api.create_list(slug)
    except tweepy.error.TweepError as e:
        print "ERROR\tTWT\t%s Already Exists for user %s" % (slug, owner_screen_name)
        print e
        return None
    else:
        list_member_arg_sets = [
            (screen_name, slug, owner_screen_name, api)
            for screen_name in screen_names
        ]
        threaded(list_member_arg_sets, add_list_member, 30, 200)
def main():
    import sys, csv
    writer = csv.writer(sys.stdout)
    l = listings()

    first_listing = next(l)
    first_listing['tags'] = ', '.join(first_listing['tags'])
    first_listing['downloads'] = ', '.join(first_listing['downloads'])
    del(first_listing['messages'])
    writer.writerow(list(first_listing.keys()))
    writer.writerow(list(first_listing.values()))

    for listing in l:
        # foia_files
        filenames = (dl.split('/')[-1] for dl in listing['downloads'])
        threaded(filenames, get_foia_file, num_threads=20, daemon=False, join=False)

        # Remove lists
        listing['tags'] = ', '.join(listing['tags'])
        listing['downloads'] = ', '.join(listing['downloads'])
        # listing['messages'] = ('\n\n' + ('-' * 60) + '\n\n').join(listing['messages'])
        del(listing['messages'])
        writer.writerow(list(listing.values()))
def scrape():
    csv = MultiCSV()
    threaded(gdocs_persons(),
             lambda data: scrape_image(data['Full Name'], data['Image URL'], csv, data['Image Credit']),
             num_threads=THREAD_COUNT)
    csv.close()
def scrape_deputies():
    """ Scrape all the deputies from the list """
    # Create a generator of all the deputies on the list.
    generator = list_deputies()

    # Start 10 threads, begin processing list items through ``scrape_deputy``.
    threaded(generator, scrape_deputy, num_threads=10)
def run():
    annual_urls = [get_url_year(year) for year in years]
    url_list = [cook_soup(u) for u in annual_urls]

    # Unnest
    items = [i for item in url_list for i in item]

    # Thread the data collection
    threaded(items, get_data, 20, 200)
def run(config):
    pages = config['promopages']
    if DEBUG:
        for slug, url in pages.iteritems():
            page_arg_set = (url, slug, config)
            scrape_promo_page(page_arg_set)
    else:
        page_arg_sets = [(url, slug, config) for slug, url in pages.iteritems()]
        threaded(page_arg_sets, scrape_promo_page, 2, 10)
def scrape_states():
    def states():
        for state in STATES:
            q = QUERY.copy()
            q['bundesland' + state] = "on"
            yield (q, state)

    # for s in states():
    #     scrape_state(s)
    threaded(states(), scrape_state, num_threads=5)
def scrape_missed_connections():
    response = requests.get(BASE_URL + "mis/")
    soup = BeautifulSoup(response.content)
    missed_connections = soup.find_all('span', {'class': 'pl'})
    urls = []
    for missed_connection in missed_connections:
        link = missed_connection.find('a').attrs['href']
        url = urljoin(BASE_URL, link)
        urls.append(url)
    threaded(urls, scrape_missed_connection, num_threads=10)
def parse_one_feed(newspaper_datum):
    """ parse all the items in an rss feed """
    rss = newspaper_datum['rss']
    print "parsing %s\n" % rss
    feed_data = feedparser.parse(rss)
    feed_items = zip_entries(feed_data['entries'], newspaper_datum)

    # thread that shit!
    threaded(feed_items, parse_one_entry, 3, 1000)
def run():
    if os.path.exists('frb_releases/federalreserve.db'):
        os.remove('frb_releases/federalreserve.db')
    annual_urls = [get_url_year(year) for year in years]
    url_list = [cook_soup(u) for u in annual_urls]

    # unnest
    items = [i for item in url_list for i in item]

    # thread that shit
    threaded(items, get_data, 20, 200)
def scrape_missed_connections():
    """ Scrape all the missed connections from a list """
    # Download the list of missed connections.
    # here we're using requests,
    # a python library for accessing the web
    # we add "mis/" to the url to tell requests
    # to get the missed connections
    # on newyork.craigslist.org
    response = requests.get(BASE_URL + "mis/")

    # parse HTML using Beautiful Soup
    # this returns a `soup` object which
    # gives us convenience methods for parsing html
    soup = BeautifulSoup(response.content)

    # find all the posts in the page.
    # here we're telling BeautifulSoup to get us every
    # span tag that has a class that equals pl
    # these tags might look something like this:
    # <span class='pl'> {content} </span>
    missed_connections = soup.find_all('span', {'class': 'pl'})

    # create an empty list of urls to scrape
    urls = []
    for missed_connection in missed_connections:
        # for each span tag, find the "a" tag which
        # represents the link to the missed connection page.
        link = missed_connection.find('a').attrs['href']

        # join this relative link with the
        # BASE_URL to create an absolute link
        url = urljoin(BASE_URL, link)

        # iteratively populate this list
        urls.append(url)

    # download and parse these missed connections using
    # multiple threads
    threaded(urls, scrape_missed_connection, num_threads=10)
def run(config):
    """ get all new posts on all pages """
    page_ids = config['facebook']['pages']
    api = fb.connect(config)
    if DEBUG:
        for page_id in page_ids:
            page_arg_set = (api, page_id, config)
            get_new_data_for_page(page_arg_set)
            # fetch account data so we can associate the number of likes with the account AT THAT TIME
    else:
        page_arg_sets = [(api, page_id, config) for page_id in page_ids]
        threaded(page_arg_sets, get_new_data_for_page, 5, 20)
def scrapeMissedConnections():
    """ Scrape all the missed connections from a list """
    soup = BeautifulSoup(getContent(BASE_URL + "mis/"))
    missed_connections = soup.find_all('span', {'class': 'pl'})
    urls = []
    for missed_connection in missed_connections:
        link = missed_connection.find('a').attrs['href']
        url = urljoin(BASE_URL, link)
        urls.append(url)
    threaded(urls, scrapeMissedConnection, num_threads=10)
def scrape_cl_ads():
    count = 0
    response = requests.get(BASE_URL + "apa/")
    soup = BeautifulSoup(response.content)
    ads = soup.find_all('span', {'class': 'pl'})
    urls = []
    for ad in ads:
        link = ad.find('a').attrs['href']
        url = urljoin(BASE_URL, link)
        urls.append(url)
    threaded(urls, scrape_cl_ad, num_threads=2)
    print str(count) + " Ads inserted in CL"
def run(config):
    """ parse all teh feedz """
    # generate args from config
    feed_arg_sets = []
    for data_source, v in config["rssfeeds"].iteritems():
        feed_arg = (v["feed_url"], data_source, v["full_text"], config)
        feed_arg_sets.append(feed_arg)

    if DEBUG:
        for feed_arg_set in feed_arg_sets:
            parse_one_feed(feed_arg_set)
    else:
        # thread that shit!
        threaded(feed_arg_sets, parse_one_feed, 5, 25)
def parse_one_feed(feed_arg_set):
    """ parse all the items in an rss feed """
    feed_url, data_source, full_text, config = feed_arg_set
    feed_data = feedparser.parse(feed_url)
    entries = feed_data["entries"]
    entry_arg_sets = [(entry, data_source, full_text, config) for entry in entries]
    if DEBUG:
        for entry_arg_set in entry_arg_sets:
            parse_one_entry(entry_arg_set)
    else:
        # thread that shit!
        threaded(entry_arg_sets, parse_one_entry, 30, 100)
def main():
    fieldnames = [
        'project.name', 'contract.uri', 'contract.number',
        'bidder.name', 'bid.status', 'bidder.country',
        'original.company.name', 'original.company.country',
        'opencorporates.company.name', 'opencorporates.company.uri',
        'address', 'country', 'duration', 'date.signature',
        'score.financial', 'score.technical', 'score.final',
        'original.price.opening', 'original.price.evaluated', 'original.price.contract',
        'price.opening.currency', 'price.opening.amount',
        'price.evaluated.currency', 'price.evaluated.amount',
        'price.contract.currency', 'price.contract.amount',
        'method.procurement', 'method.selection', 'scope', 'name',
        'reason.rejection', 'small.contract.notice', 'ranking.final',
    ]
    reader = csv.DictReader(open(os.path.join('pagedata', 'company.csv')))
    writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames)
    writer.writeheader()
    threaded(args(reader), partial(ask, writer), num_threads=30)
def get_profile(lid, fname, lname, comp, tcount, ccount):
    ## Grab the target profile
    global rID, targetname
    profile = base_url + "/profile/view?id=%s" % lid
    r = requests.get(profile, cookies=cookies)

    #regex1 = re.compile("fmt__skill_name\":\"(.*?)\"")      # Skill Names
    regex1 = re.compile("endorse-item-name-text\">(.*?)<")   # Skill Names
    regex2 = re.compile("recipientId:(.*?),")                # Recipient ID
    regex3 = re.compile("<title[^>]*>(.*?)\s\|")             # Target Name

    print "\r[Info] Gathering data for %s %s (%d/%d)" % (fname, lname, ccount, tcount)
    try:
        ## Get RequestorID and Name of target
        rID = regex2.search(r.text).groups()[0]
        targetname = regex3.search(r.text).groups()[0]
    except:
        ## Target is outside of your network. Grabbing authToken value
        print "[Info] Target is outside your network. Obtaining auth token."
        authToken = get_authToken(fname, lname, comp)
        targetname = fname + " " + lname

        ## Resend request with authToken
        profile2 = profile + "&authType=NAME_SEARCH&authToken=%s" % authToken
        r = requests.get(profile2, cookies=cookies)
        try:
            rID = regex2.search(r.text).groups()[0]
        except:
            print "\n[Error] Skipping %s %s" % (fname, lname)
            print "[Error] Unable to view the targets full profile\n"

    print targetname

    ## Get the skills of the target
    rID = rID.replace("'", "")
    skills = set(regex1.findall(r.text))
    if skills:
        for s in skills:
            skillset.append(s)  #get_endorsements(s)
    else:
        print "\n[Error] Could not find skill names. Check that the session is still valid"

    ## Send requests to the threading engine
    threaded(skillset, get_endorsements, num_threads=10)
def delaware_river():
    links = []
    for i in range(1000, 100000):
        url = "https://dpronline.delaware.gov/mylicense weblookup/Details.aspx?agency_id=1&license_id=%d&" % i
        links.append(url)
    threaded(links, george_washington, num_threads=20)
def gen_urls(categories):
    return [buzzfeed % (c % p) for c in categories for p in pages]


# extract links and headlines
def dump_urls_and_headlines(content, category):
    soup = BeautifulSoup(content)
    for article in soup.findAll("article"):
        for link in article.findAll("a"):
            if link.text != "":
                datum = {
                    "url": buzzfeed % link['href'],
                    "headline": link.text,
                    "category": category
                }
                table.upsert(datum, ["url"])


# generate urls by hitting page up until we get a 404
def fetch_data(url):
    category = url.split("/")[3].lower()
    print "scraping %s" % url
    r = requests.get(url)
    status_code = r.status_code
    if status_code == 200:
        dump_urls_and_headlines(r.content, category)


if __name__ == '__main__':
    urls = gen_urls(categories)
    threaded(urls, fetch_data, num_threads=10, max_queue=200)
def scrape_persons():
    csv = MultiCSV()
    threaded(make_urls(), lambda i: scrape_person(csv, i), num_threads=25)
    csv.close()
import logging

from lxml import html
from urlparse import urljoin
import requests
from thready import threaded

from spon.scrape.articles import scrape_article

log = logging.getLogger(__name__)
BASE_URL = 'http://www.spiegel.de/schlagzeilen/index-siebentage.html'


def get_latest():
    res = requests.get(BASE_URL)
    doc = html.fromstring(res.content)
    seen = set()
    for a in doc.cssselect('.schlagzeilen-content a'):
        url = urljoin(BASE_URL, a.get('href'))
        if url not in seen:
            yield url
        seen.add(url)


if __name__ == "__main__":
    get_latest()
    threaded(get_latest(), scrape_article, num_threads=10)
def scrape():
    csv = MultiCSV()
    threaded(scrape_index(), lambda i: scrape_record(csv, i), num_threads=30)
    csv.close()
def scrape():
    csv = MultiCSV()
    threaded(make_names_from_gdocs(), lambda name: scrape_image(name, csv),
             num_threads=THREAD_COUNT)
    csv.close()
def scrape_npos():
    csv = MultiCSV()
    threaded(make_urls(), lambda i: scrape_npo(csv, i), num_threads=30)
    csv.close()
def scrape_contracts():
    csv = MultiCSV()
    threaded(make_urls2(), lambda i: scrape_contract(csv, i), num_threads=30)
    csv.close()
import requests
from urlparse import urljoin
from thready import threaded

from spon.scrape.articles import scrape_article, url_to_number
from spon.scrape.latest import get_latest


def resolve_forward(num):
    redir_url = "http://www.spiegel.de/artikel/a-%s.html" % num
    redir_response = requests.head(redir_url)
    if redir_response.status_code >= 400:
        return
    article_url = urljoin(redir_url, redir_response.headers.get('location'))
    scrape_article(article_url, number=num, force=False)


def article_gen(num):
    while num > 0:
        yield num
        num -= 1


if __name__ == "__main__":
    max_id = max([url_to_number(u) for u in get_latest()])
    threaded(article_gen(max_id), resolve_forward, num_threads=20)
def run(self):
    # infinite queue to prevent deadlock when generator can't yield
    threaded(self.consume_urls(), self.thread_func,
             num_threads=self.thread_count, max_queue=0)
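# Note on max_queue=0 above: with the standard-library Queue, a maxsize of
# zero (or any non-positive number) means the queue is unbounded, so a
# producer never blocks while feeding items -- presumably why the comment
# calls it an "infinite queue". A quick stdlib illustration:
try:
    from queue import Queue
except ImportError:
    from Queue import Queue

q = Queue(maxsize=0)   # unbounded: q.put() never blocks on a full queue
q.put("item")
assert q.qsize() == 1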
def scrape_companies():
    csv = MultiCSV()
    threaded(make_urls(), lambda url: scrape_company(csv, url), num_threads=5)
    csv.close()
def parse_all_feeds(newspaper_data):
    """ parse all teh feedz """
    # thread that shit!
    threaded(newspaper_data, parse_one_feed, 2, 25)
def scrape_ngos():
    csv = MultiCSV()
    threaded(make_urls(), lambda url: scrape_ngo(csv, url), num_threads=3)
    csv.close()
            os._exit(1)
        except RowException, re:
            msg = '\nRow %s: %s' % (row['__row_id__'], re.message)
            click.secho(msg, fg='red', bold=True)
            if not force:
                os._exit(1)

    def generate():
        with click.progressbar(DictReader(data), label=data.name, length=lines) as bar:
            for i, row in enumerate(bar):
                row['__row_id__'] = i
                yield row

    threaded(generate(), process_row, num_threads=threads, max_queue=1)


@app.command()
@click.argument('schema', type=click.File('rb'))
@click.pass_context
def schema(ctx, schema):
    """ Load schema definitions from a YAML file. """
    data = yaml.load(schema)
    if not isinstance(data, (list, tuple)):
        data = [data]
    with click.progressbar(data, label=schema.name) as bar:
        for schema in bar:
            ctx.obj['grano'].schemata.upsert(schema)
def test_error():
    import thready

    def f(i):
        raise ValueError

    thready.threaded(range(10), f)
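# test_error feeds a function that always raises. If a caller wants one bad
# item not to abort the whole run, a common pattern is to wrap the worker in
# a try/except before handing it to threaded(). This is only a usage sketch;
# process_item and the items are hypothetical names, not part of thready:
import logging
import thready


def process_item(item):
    if item % 2:
        raise ValueError("odd items are rejected in this toy example")


def safe_process_item(item):
    try:
        process_item(item)
    except Exception:
        logging.exception("failed on %r, continuing", item)


thready.threaded(range(10), safe_process_item, num_threads=4)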
def _pluplusch(get, catalogs=[], standardize=True, force_colnames=False):
    '''
    pluplusch downloads data from open data websites. Here are its inputs.

    catalogs
        List of catalogs to download, each item being either a full URL
        string, including the scheme, or a tuple of the full URL string
        and the software, in case pluplusch doesn't know about the catalog
    standardize
        Should the metadata schema be standardized across softwares?
    force_colnames
        Should the full data file be downloaded if needed?
        (This is only relevant if standardize is True.)

    It returns a generator of datasets.
    '''
    # Use all catalogs by default.
    if catalogs == []:
        catalogs = list(i.all_catalogs())

    # Detect catalog softwares if they're not specified.
    catalog_names = [(catalog[0] if len(catalog) == 2 else catalog)
                     for catalog in catalogs]
    catalog_softwares = [(catalog[1] if len(catalog) == 2 else i.catalog_to_software(catalog))
                         for catalog in catalogs]
    catalog_names_softwares = list(zip(catalog_names, catalog_softwares))

    # Threading
    queue = []
    running = set(catalog_names)
    submodules = i.submodules()

    def enqueue_datasets(catalog_name_software):
        catalog_name, catalog_software = catalog_name_software
        generator = submodules[catalog_software].metadata(get, catalog_name)
        while True:
            try:
                dataset = next(generator)
            except StopIteration:
                break
            except:
                not_file = StringIO()
                print_exc(file=not_file)
                logger.error('Error at %s:\n%s\n' % (catalog_name, not_file.getvalue()))
            else:
                if standardize:
                    out = submodules[catalog_software].standardize(dataset)
                    if out['download_url'] != None:
                        out['download_url'] = urljoin(out['url'], out['download_url'])
                    if force_colnames:
                        out['colnames'] = submodules[catalog_software].colnames(get, dataset)
                else:
                    out = dataset
                out['_catalog'] = catalog_name
                out['_software'] = catalog_software
                queue.append(out)
        running.remove(catalog_name)

    threaded(catalog_names_softwares, enqueue_datasets, join=False)

    from time import sleep
    while len(running) > 0 or queue != []:
        sleep(0.0001)
        if queue != []:
            yield queue.pop(0)
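# _pluplusch above hands results from the worker threads back to a generator
# through a plain list plus a polling loop. A sketch of the same hand-off
# using a thread-safe Queue instead is shown below; fetch_one and sources are
# hypothetical stand-ins for the per-catalog metadata calls, and this is not
# pluplusch's actual code:
import threading
try:
    from queue import Queue, Empty
except ImportError:
    from Queue import Queue, Empty

from thready import threaded


def stream_results(sources, fetch_one, num_threads=5):
    """Yield items produced by fetch_one(source) across worker threads."""
    results = Queue()
    remaining = [len(sources)]          # single-slot counter shared with workers
    lock = threading.Lock()

    def worker(source):
        try:
            for item in fetch_one(source):
                results.put(item)
        finally:
            with lock:
                remaining[0] -= 1       # this source is done (or failed)

    threaded(sources, worker, num_threads=num_threads, join=False)

    while True:
        try:
            yield results.get(timeout=0.1)
        except Empty:
            with lock:
                if remaining[0] == 0:   # all workers finished and queue drained
                    break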
def threaded_or_serial(tasks, func, num_threads, max_queue):
    if DEBUG:
        for t in tasks:
            func(t)
    else:
        threaded(tasks, func, num_threads, max_queue)
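# threaded_or_serial lets the same call site run serially under DEBUG and in
# parallel otherwise, which makes stack traces easier to read while debugging.
# A hypothetical call (urls and fetch are made-up names for illustration):
DEBUG = True


def fetch(url):
    print("fetching %s" % url)


urls = ["http://example.com/%d" % n for n in range(5)]
threaded_or_serial(urls, fetch, num_threads=5, max_queue=50)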
    #for idx, val in enumerate(tds):
    #    print idx, ":", val.text_content().encode('utf-8')
    names = {}
    for nameidx, nameval in enumerate(namestds):
        names["Name" + str(nameidx)] = nameval.text_content()[10:]
        names["Name" + str(nameidx) + "date"] = nameval.text_content()[:10]
    print "got", tds[1].text_content()
    data = {
        'cr': tds[1].text_content(),
        'English Company Name': tds[2].text_content().rsplit('\r')[1].lstrip('\n\t'),
        'Chinese Company Name': tds[2].text_content().rpartition('\r')[2].lstrip('\r\n\t'),
        'Company Type': tds[4].text_content()[:-1],
        'Date of incorporation': tds[6].text_content(),
        # 'Company status': tds[8].text_content()[:-1],
        'Active status': tds[8].text_content()[:-1],
        'Remarks': tds[9].text_content().replace(u"備註:", ""),
        'Winding up mode': tds[11].text_content()[:-1],
        'Date of Dissolution': tds[13].text_content(),
        'Register of Charges': tds[15].text_content()[:-1],
        'Important Note': tds[16].text_content().replace(u"重要事項:", "").lstrip('\r\n\t')
    }
    data.update(names)
    db['swdata'].upsert(data, ['cr'])
    print "wrote", tds[1].text_content()


threaded(range(mincr, maxcr), scrape, num_threads=20)
    red.sadd('urls', url)

    # scrape post
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print "scraping data from %s @ %s" % (url, now)
    key, rank, value = scrape_page(url, city)

    # add post data to redis
    red.zadd(key, rank, value)


if __name__ == '__main__':
    # generate input
    cities = []
    feeds = []
    for line in open('feeds/all_rss_feeds.csv').read().split('\r')[1:]:
        row = line.split(",")
        feeds.append(row[0].strip())
        cities.append(parse_city_to_slug(row[2].strip()))
    items = zip(cities, feeds)

    # go forth young scraper
    threaded(items, crawl, num_threads=10, max_queue=200)

    # # # debug mode
    # # [crawl(i) for i in items]

    # print md for readme
    # for i in items:
    #     print "[%s](%s)<br/>" % i
        return
    with open(file_path, 'rb') as fh:
        data = fh.read()
    key, text = get_cache(data)
    if text is not None:
        return
    text = extract_image_data(data)
    counter_lock.acquire()
    try:
        processed += 1
        time_taken = time.time() - START_TIME
        img_per_sec = time_taken / processed
    finally:
        counter_lock.release()
    log.info('Extracted: %s (%d characters of text), %.3fs/img',
             file_path, len(text), img_per_sec)


def crawl_directory(base_path):
    for (dirpath, _, files) in os.walk(base_path):
        for file_name in files:
            file_path = os.path.abspath(os.path.join(dirpath, file_name))
            file_path = os.path.normpath(file_path)
            yield file_path


if __name__ == '__main__':
    if not os.environ.get('EXTRACTORS_CACHE_DIR'):
        print 'No cache dir, this makes no sense'
    else:
        threaded(crawl_directory(DATA_PATH), crawl_file, num_threads=5)