Example #1
def test_threaded():
    import thready
    results = set()
    def f(i):
        results.add(i)
    thready.threaded(range(10), f)
    assert results == set(range(10))
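
Most of the examples below drive `thready.threaded` the same way: an iterable of work items plus a worker function, with the pool size and queue bound passed either as keywords (`num_threads`, `max_queue`) or as the third and fourth positional arguments (compare Example #48). A minimal sketch of both call forms, with a hypothetical `double` worker:

from thready import threaded

results = []

def double(item):
    # hypothetical worker; the real examples do I/O-bound work here
    results.append(item * 2)

items = range(100)

# keyword form
threaded(items, double, num_threads=10, max_queue=200)

# equivalent positional form, as used in several examples below
threaded(items, double, 10, 200)
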
Example #2
def get_new_data_for_page(page_arg_set):
  """
  get all new posts on a page
  """
  api, page_id, config = page_arg_set

  print "INFO\tFACEBOOK\tgetting new data for facebook.com/%s" % page_id
  
  # fetch account data so we can associate the number of likes with the account AT THAT TIME
  try:
    acct_data = api.get(page_id)
  except Exception as e:
    print e
    return None
  else:
    # determine limit
    if page_id in config['facebook']['insights_pages']:
      limit = config['facebook']['insights_limit']
    else:
      limit = config['facebook']['page_limit']

    # get last 100 articles for this page
    page = api.get(page_id + "/posts", page=False, retry=5, limit=limit)
    if DEBUG:
      for post_data in page['data']:
        post_arg_set = (api, post_data, acct_data, page_id, config)
        insert_new_post(post_arg_set)
    else:
      post_arg_sets = [(api, post_data, acct_data, page_id, config) for post_data in page['data']]
      threaded(post_arg_sets, insert_new_post, 20, 150)
Example #3
def download_fixies(start_date, end_date=None):
	if not end_date:
		end_date = start_date
	all_dates = generate_date_range(start_date, end_date)
	print '\nINFO: Downloading FMS MTS from', all_dates[0], 'to', all_dates[-1], "!\n"
	fnames = [''.join(['mts',datetime.datetime.strftime(date, '%m%y'), '.txt']) for date in all_dates]
	threaded(fnames, request_and_test_fixie, 20, 100)
Example #4
def generate_list(api, slug, list_dict):
	
	# parse handles
	if isinstance(list_dict['screen_names'], basestring):
		screen_names = [
			sn.strip() 
			for sn in open(list_dict['screen_names']).read().split("\n") 
			if sn != '' and sn is not None 
		]

	else:
		screen_names = list_dict['screen_names']
	
	owner_screen_name = list_dict['owner']
	
	try:
		api.create_list(slug)

	except tweepy.error.TweepError as e:
		
		print "ERROR\tTWT\t%s Already Exists for user %s" % (slug, owner_screen_name)
		print e
		return None

	else:
		
		list_member_arg_sets = [
			(screen_name, slug, owner_screen_name, api) 
			for screen_name in screen_names
		]
		threaded(list_member_arg_sets, add_list_member, 30, 200)
Example #5
def main():
    import sys, csv

    writer = csv.writer(sys.stdout)
    l = listings()
    first_listing = next(l)
    first_listing['tags'] = ', '.join(first_listing['tags'])
    first_listing['downloads'] = ', '.join(first_listing['downloads'])
    del(first_listing['messages'])

    writer.writerow(list(first_listing.keys()))
    writer.writerow(list(first_listing.values()))
    for listing in l:
        # foia_files
        filenames = (dl.split('/')[-1] for dl in listing['downloads'])
        threaded(filenames, get_foia_file,
                 num_threads = 20, daemon = False, join = False)

        # Remove lists
        listing['tags'] = ', '.join(listing['tags'])
        listing['downloads'] = ', '.join(listing['downloads'])
        # listing['messages'] = ('\n\n' + ('-' * 60) + '\n\n').join(listing['messages'])
        del(listing['messages'])

        writer.writerow(list(listing.values()))
Example #6
def scrape():
    csv = MultiCSV()
    threaded(gdocs_persons(),
             lambda data: scrape_image(data['Full Name'], data['Image URL'],
                                       csv, data['Image Credit']),
             num_threads=THREAD_COUNT)
    csv.close()
Example #7
def scrape_deputies():
    """ Scrape all the deputies from the list """

    # Create a generator of all the deputies on the list.
    generator = list_deputies()

    # Start 10 threads, begin processing list items through ``scrape_deputy``.
    threaded(generator, scrape_deputy, num_threads=10)
Example #8
def scrape_deputies():
    """ Scrape all the deputies from the list """
    
    # Create a generator of all the deputies on the list.
    generator = list_deputies()

    # Start 10 threads, begin processing list items through ``scrape_deputy``.
    threaded(generator, scrape_deputy, num_threads=10)
Example #9
def run():
    annual_urls = [get_url_year(year) for year in years]
    url_list = [cook_soup(u) for u in annual_urls]

    # Unnest
    items = [i for item in url_list for i in item]

    # Thread the data collection
    threaded(items, get_data, 20, 200)
Example #10
def run(config):
  pages = config['promopages']
  if DEBUG:
    for slug, url in pages.iteritems():
      page_arg_set = (url, slug, config)
      scrape_promo_page(page_arg_set)
  else:
    page_arg_sets = [(url, slug, config) for slug, url in pages.iteritems()]
    threaded(page_arg_sets, scrape_promo_page, 2, 10)
Example #11
def scrape_states():
    def states():
        for state in STATES:
            q = QUERY.copy()
            q['bundesland' + state] = "on"
            yield (q, state)
    # for s in states():
    #     scrape_state(s)
    threaded(states(), scrape_state, num_threads=5)
Example #12
def run():
	annual_urls = [get_url_year(year) for year in years]
	url_list = [cook_soup(u) for u in annual_urls]
	
	# Unnest 
	items = [i for item in url_list for i in item ]
	
	# Thread the data collection
	threaded(items, get_data,  20, 200)
Example #13
def scrape():
    csv = MultiCSV()
    threaded(
        gdocs_persons(),
        lambda data: scrape_image(data['Full Name'], data['Image URL'], csv,
                                  data['Image Credit']),
        num_threads=THREAD_COUNT
    )
    csv.close()
Example #14
def scrape_missed_connections():
    response = requests.get(BASE_URL + "mis/")
    soup = BeautifulSoup(response.content)
    missed_connections = soup.find_all('span', {'class': 'pl'})
    urls = []
    for missed_connection in missed_connections:
        link = missed_connection.find('a').attrs['href']
        url = urljoin(BASE_URL, link)
        urls.append(url)
    threaded(urls, scrape_missed_connection, num_threads=10)
Example #15
def scrape_states():
    def states():
        for state in STATES:
            q = QUERY.copy()
            q['bundesland' + state] = "on"
            yield (q, state)

    # for s in states():
    #     scrape_state(s)
    threaded(states(), scrape_state, num_threads=5)
Example #16
def parse_one_feed(newspaper_datum):
  """
  parse all the items in an rss feed
  """
  rss = newspaper_datum['rss']
  print "parsing %s\n" % rss
  feed_data = feedparser.parse(rss)
  feed_items = zip_entries(feed_data['entries'], newspaper_datum)

  # thread that shit!
  threaded(feed_items, parse_one_entry, 3, 1000)
Example #17
def parse_one_feed(newspaper_datum):
    """
  parse all the items in an rss feed
  """
    rss = newspaper_datum['rss']
    print "parsing %s\n" % rss
    feed_data = feedparser.parse(rss)
    feed_items = zip_entries(feed_data['entries'], newspaper_datum)

    # thread that shit!
    threaded(feed_items, parse_one_entry, 3, 1000)
Example #18
def run():
	if os.path.exists('frb_releases/federalreserve.db'):
		os.remove('frb_releases/federalreserve.db')

	annual_urls = [get_url_year(year) for year in years]
	url_list = [cook_soup(u) for u in annual_urls]
	
	# unnest
	items = [i for item in url_list for i in item ]
	
	# thread that shit
	threaded(items, get_data,  20, 200)
Example #19
def run():
    if os.path.exists('frb_releases/federalreserve.db'):
        os.remove('frb_releases/federalreserve.db')

    annual_urls = [get_url_year(year) for year in years]
    url_list = [cook_soup(u) for u in annual_urls]

    # unnest
    items = [i for item in url_list for i in item]

    # thread that shit
    threaded(items, get_data, 20, 200)
Example #20
def scrape_missed_connections():
    """ Scrape all the missed connections from a list """
    
    # Download the list of missed connections

    # here were using requests, 
    # a python library for accessing the web

    # we add "mis/" to the url to tell requests
    # to get the missed connections 
    # on newyork.craigslist.org

    response = requests.get(BASE_URL + "mis/")

    # parse HTML using Beautiful Soup
    # this returns a `soup` object which
    # gives us convenience methods for parsing html

    soup = BeautifulSoup(response.content)

    # find all the posts in the page.

    # here we're telling BeautifulSoup to get us every
    # span tag that has a class that equals pl

    # these tags might look something like this:
    # <span class='pl'> {content} </span>

    missed_connections = soup.find_all('span', {'class':'pl'})

    # create an empty list of urls to scrape 
    urls = []
    for missed_connection in missed_connections:
        
        # for each span list, find the "a" tag which 
        # represents the link to the missed connection page.

        link = missed_connection.find('a').attrs['href']
        
        # join this relative link with the 
        # BASE_URL to create an absolute link

        url = urljoin(BASE_URL, link)
        
        # iteratively populate this list 
        urls.append(url)


    # download and parse these missed connections using
    # multiple threads
    threaded(urls, scrape_missed_connection, num_threads=10)
Example #21
def run(config):
  """
  get all new posts on all pages
  """
  page_ids = config['facebook']['pages']
  api = fb.connect(config)
  if DEBUG:
    for page_id in page_ids:
      page_arg_set = (api, page_id, config)
      get_new_data_for_page(page_arg_set)
  # fetch account data so we can associate the number of likes with the account AT THAT TIME
  else:
    page_arg_sets = [(api, page_id, config) for page_id in page_ids]
    threaded(page_arg_sets, get_new_data_for_page, 5, 20)
Example #22
def scrapeMissedConnections():
    """ Scrape all the missed connections from a list """

    soup = BeautifulSoup(getContent(BASE_URL + "mis/"))

    missed_connections = soup.find_all('span', {'class': 'pl'})

    urls = []
    for missed_connection in missed_connections:

        link = missed_connection.find('a').attrs['href']
        url = urljoin(BASE_URL, link)
        urls.append(url)

    threaded(urls, scrapeMissedConnection, num_threads=10)
Example #23
def scrapeMissedConnections():
    """ Scrape all the missed connections from a list """

    soup = BeautifulSoup(getContent(BASE_URL + "mis/"))

    missed_connections = soup.find_all('span', {'class':'pl'})

    urls = []
    for missed_connection in missed_connections:

        link = missed_connection.find('a').attrs['href']
        url = urljoin(BASE_URL, link)
        urls.append(url)


    threaded(urls, scrapeMissedConnection, num_threads=10)
Example #24
def scrape_cl_ads():

    count = 0
    response = requests.get(BASE_URL + "apa/")
    soup = BeautifulSoup(response.content)
    ads = soup.find_all('span', {'class':'pl'})
    
    urls = []
    for ad in ads:
        
        link = ad.find('a').attrs['href']
        url = urljoin(BASE_URL, link)
        urls.append(url)

    threaded(urls, scrape_cl_ad, num_threads=2)
    print str(count) + " Ads inserted in CL"
Example #25
def run(config):
    """
  parse all teh feedz
  """
    # generate args from config
    feed_arg_sets = []
    for data_source, v in config["rssfeeds"].iteritems():
        feed_arg = (v["feed_url"], data_source, v["full_text"], config)
        feed_arg_sets.append(feed_arg)

    if DEBUG:
        for feed_arg_set in feed_arg_sets:
            parse_one_feed(feed_arg_set)
    else:
        # thread that shit!
        threaded(feed_arg_sets, parse_one_feed, 5, 25)
Example #26
def parse_one_feed(feed_arg_set):
    """
    parse all the items in an rss feed
    """
    feed_url, data_source, full_text, config = feed_arg_set

    feed_data = feedparser.parse(feed_url)

    entries = feed_data["entries"]
    entry_arg_sets = [(entry, data_source, full_text, config) for entry in entries]

    if DEBUG:
        for entry_arg_set in entry_arg_sets:
            parse_one_entry(entry_arg_set)
    else:
        # thread that shit!
        threaded(entry_arg_sets, parse_one_entry, 30, 100)
Example #27
def main():
    fieldnames = [
        'project.name',
        'contract.uri',
        'contract.number',
        'bidder.name',
        'bid.status',
        'bidder.country',
        'original.company.name',
        'original.company.country',
        'opencorporates.company.name',
        'opencorporates.company.uri',
        'address',
        'country',
        'duration',
        'date.signature',
        'score.financial',
        'score.technical',
        'score.final',
        'original.price.opening',
        'original.price.evaluated',
        'original.price.contract',
        'price.opening.currency',
        'price.opening.amount',
        'price.evaluated.currency',
        'price.evaluated.amount',
        'price.contract.currency',
        'price.contract.amount',
        'method.procurement',
        'method.selection',
        'scope',
        'name',
        'reason.rejection',
        'small.contract.notice',
        'ranking.final',
    ]
    reader = csv.DictReader(open(os.path.join('pagedata', 'company.csv')))
    writer = csv.DictWriter(sys.stdout, fieldnames = fieldnames)
    writer.writeheader()
    threaded(args(reader), partial(ask, writer), num_threads = 30)
Example #28
def main():
    fieldnames = [
        'project.name',
        'contract.uri',
        'contract.number',
        'bidder.name',
        'bid.status',
        'bidder.country',
        'original.company.name',
        'original.company.country',
        'opencorporates.company.name',
        'opencorporates.company.uri',
        'address',
        'country',
        'duration',
        'date.signature',
        'score.financial',
        'score.technical',
        'score.final',
        'original.price.opening',
        'original.price.evaluated',
        'original.price.contract',
        'price.opening.currency',
        'price.opening.amount',
        'price.evaluated.currency',
        'price.evaluated.amount',
        'price.contract.currency',
        'price.contract.amount',
        'method.procurement',
        'method.selection',
        'scope',
        'name',
        'reason.rejection',
        'small.contract.notice',
        'ranking.final',
    ]
    reader = csv.DictReader(open(os.path.join('pagedata', 'company.csv')))
    writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames)
    writer.writeheader()
    threaded(args(reader), partial(ask, writer), num_threads=30)
Example #29
def get_profile(lid,fname,lname,comp,tcount,ccount):
    ## Grab the target profile
    global rID, targetname
    profile = base_url+"/profile/view?id=%s" % lid
    r = requests.get(profile, cookies=cookies)
    #regex1 = re.compile("fmt__skill_name\":\"(.*?)\"") # Skill Names
    regex1 = re.compile("endorse-item-name-text\">(.*?)<") # Skill Names
    regex2 = re.compile("recipientId:(.*?),") # Recipient ID
    regex3 = re.compile("<title[^>]*>(.*?)\s\|") # Target Name
    print "\r[Info] Gathering data for %s %s (%d/%d)" % (fname,lname,ccount,tcount)
    try:
        ## Get RequestorID and Name of target
        rID = regex2.search(r.text).groups()[0]
        targetname = regex3.search(r.text).groups()[0]
    except:
        ## Target is outside of your network. Grabbing authToken value
        print "[Info] Target is outside your network. Optaining auth token."
        authToken = get_authToken(fname,lname,comp)
        targetname = fname + " " + lname
        ## Resend request with authToken
        profile2 = profile + "&authType=NAME_SEARCH&authToken=%s" % authToken
        r = requests.get(profile2, cookies=cookies)
        try:
            rID = regex2.search(r.text).groups()[0]
        except:
            print "\n[Error] Skipping %s %s" % (fname,lname)
            print "[Error] Unable to view the targets full profile\n"
    print targetname
    ## Get the skills of the target
    rID = rID.replace("'","")
    skills = set(regex1.findall(r.text))
    if skills:
        for s in skills:
            skillset.append(s)
            #get_endorsements(s)
    else:
        print "\n[Error] Could not find skill names. Check that the session is still valid"

    ## Send requests to the threading engine
    threaded(skillset, get_endorsements, num_threads=10)
Example #30
def delaware_river():
	links = []
	for i in range(1000,100000):
		url = "https://dpronline.delaware.gov/mylicense weblookup/Details.aspx?agency_id=1&license_id=%d&"%i
		links.append(url)
	threaded(links, george_washington, num_threads=20)
Example #31
    return [buzzfeed % (c % p) for c in categories for p in pages]


# extract links and headlines
def dump_urls_and_headlines(content, category):
    soup = BeautifulSoup(content)
    for article in soup.findAll("article"):
        for link in article.findAll("a"):
            if link.text != "":
                datum = {
                    "url": buzzfeed % link['href'],
                    "headline": link.text,
                    "category": category
                }
                table.upsert(datum, ["url"])


# generate urls by hitting page up until we get a 404
def fetch_data(url):
    category = url.split("/")[3].lower()
    print "scraping %s" % url
    r = requests.get(url)
    status_code = r.status_code
    if status_code == 200:
        dump_urls_and_headlines(r.content, category)


if __name__ == '__main__':
    urls = gen_urls(categories)
    threaded(urls, fetch_data, num_threads=10, max_queue=200)
Example #32
def scrape_persons():
    csv = MultiCSV()
    threaded(make_urls(), lambda i: scrape_person(csv, i), num_threads=25)
    csv.close()
Example #33
import logging
from lxml import html
from urlparse import urljoin
import requests
from thready import threaded
from spon.scrape.articles import scrape_article

log = logging.getLogger(__name__)
BASE_URL = 'http://www.spiegel.de/schlagzeilen/index-siebentage.html'


def get_latest():
    res = requests.get(BASE_URL)
    doc = html.fromstring(res.content)
    seen = set()
    for a in doc.cssselect('.schlagzeilen-content a'):
        url = urljoin(BASE_URL, a.get('href'))
        if url not in seen:
            yield url
        seen.add(url)


if __name__ == "__main__":
    get_latest()
    threaded(get_latest(), scrape_article, num_threads=10)
Example #34
def scrape():
    csv = MultiCSV()
    threaded(scrape_index(), lambda i: scrape_record(csv, i), num_threads=30)
    csv.close()
Example #35
def scrape():
    csv = MultiCSV()
    threaded(make_names_from_gdocs(),
             lambda name: scrape_image(name, csv),
             num_threads=THREAD_COUNT)
    csv.close()
Example #36
def scrape_npos():
    csv = MultiCSV()
    threaded(make_urls(), lambda i: scrape_npo(csv, i), num_threads=30)
    csv.close()
Example #37
def scrape_contracts():
    csv = MultiCSV()
    threaded(make_urls2(), lambda i: scrape_contract(csv, i), num_threads=30)
    csv.close()
Example #38
import requests
from urlparse import urljoin
from thready import threaded
from spon.scrape.articles import scrape_article, url_to_number
from spon.scrape.latest import get_latest


def resolve_forward(num):
    redir_url = "http://www.spiegel.de/artikel/a-%s.html" % num
    redir_response = requests.head(redir_url)
    if redir_response.status_code >= 400:
        return
    article_url = urljoin(redir_url, redir_response.headers.get('location'))
    scrape_article(article_url, number=num, force=False)


def article_gen(num):
    while num > 0:
        yield num
        num -= 1


if __name__ == "__main__":
    max_id = max([url_to_number(u) for u in get_latest()])
    threaded(article_gen(max_id), resolve_forward, num_threads=20)
Example #39
        tds[1].text_content(),
        'English Company Name':
        tds[2].text_content().rsplit('\r')[1].lstrip('\n\t'),
        'Chinese Company Name':
        tds[2].text_content().rpartition('\r')[2].lstrip('\r\n\t'),
        'Company Type':
        tds[4].text_content()[:-1],
        'Date of incorporation':
        tds[6].text_content(),
        # 'Company status' : tds[8].text_content()[:-1],
        'Active status':
        tds[8].text_content()[:-1],
        'Remarks':
        tds[9].text_content().replace(u"備註:", ""),
        'Winding up mode':
        tds[11].text_content()[:-1],
        'Date of Dissolution':
        tds[13].text_content(),
        'Register of Charges':
        tds[15].text_content()[:-1],
        'Important Note':
        tds[16].text_content().replace(u"重要事項:", "").lstrip('\r\n\t')
    }
    data.update(names)

    db['swdata'].upsert(data, ['cr'])
    print "wrote", tds[1].text_content()


threaded(range(mincr, maxcr), scrape, num_threads=20)
Example #40
def run(self):
    # infinite queue to prevent deadlock when generator can't yield
    threaded(self.consume_urls(), self.thread_func,
             num_threads=self.thread_count,
             max_queue=0)
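
A minimal self-contained sketch of the pattern above; the `Crawler` class, the `consume_urls` generator, and the `thread_func` body are assumptions reconstructed from the attribute names:

from thready import threaded

class Crawler(object):

    def __init__(self, urls, thread_count=10):
        self.urls = urls
        self.thread_count = thread_count
        self.seen = set()

    def consume_urls(self):
        # generator feeding the work queue
        for url in self.urls:
            yield url

    def thread_func(self, url):
        # placeholder for real fetching/parsing
        self.seen.add(url)

    def run(self):
        # max_queue=0 keeps the work queue unbounded, so the generator
        # is never blocked waiting for worker threads to drain it
        threaded(self.consume_urls(), self.thread_func,
                 num_threads=self.thread_count,
                 max_queue=0)

Crawler(['http://example.com/a', 'http://example.com/b'], thread_count=2).run()
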
Example #41
def scrape_companies():
    csv = MultiCSV()
    threaded(make_urls(), lambda url: scrape_company(csv, url), num_threads=5)
    csv.close()
Example #42
def parse_all_feeds(newspaper_data):
    """
  parse all teh feedz
  """
    # thread that shit!
    threaded(newspaper_data, parse_one_feed, 2, 25)
Example #43
def scrape_ngos():
    csv = MultiCSV()
    threaded(make_urls(), lambda url: scrape_ngo(csv, url), num_threads=3)
    csv.close()
Example #44
def scrape():
    csv = MultiCSV()
    threaded(make_names_from_gdocs(),
             lambda name: scrape_image(name, csv),
             num_threads=THREAD_COUNT)
    csv.close()
Example #45
                os._exit(1)
        except RowException, re:
            msg = '\nRow %s: %s' % (row['__row_id__'], re.message)
            click.secho(msg, fg='red', bold=True)
            if not force:
                os._exit(1)

    def generate():
        with click.progressbar(DictReader(data),
                               label=data.name,
                               length=lines) as bar:
            for i, row in enumerate(bar):
                row['__row_id__'] = i
                yield row

    threaded(generate(), process_row, num_threads=threads,
             max_queue=1)


@app.command()
@click.argument('schema', type=click.File('rb'))
@click.pass_context
def schema(ctx, schema):
    """ Load schema definitions from a YAML file. """
    data = yaml.load(schema)
    if not isinstance(data, (list, tuple)):
        data = [data]
    with click.progressbar(data, label=schema.name) as bar:
        for schema in bar:
            ctx.obj['grano'].schemata.upsert(schema)
Example #46
def test_error():
    def f(i):
        raise ValueError
    thready.threaded(range(10), f)
Example #47
def _pluplusch(get, catalogs = [], standardize = True, force_colnames = False):
    '''
    pluplusch downloads data from open data websites. Here are
    its inputs.

    catalogs
        List of catalogs to download, each item being either
        a full URL string, including the scheme, or a tuple
        of the full URL string and the software, in case pluplusch
        doesn't know about the catalog
    standardize
        Should the metadata schema be standardized across softwares?
    force_colnames
        Should the full data file be downloaded if needed?
        (This is only relevant if standardize is True.)

    It returns a generator of datasets.
    '''

    # Use all catalogs by default.
    if catalogs == []:
        catalogs = list(i.all_catalogs())

    # Detect catalog softwares if they're not specified.
    catalog_names = [(catalog[0] if len(catalog) == 2 else catalog) for catalog in catalogs]
    catalog_softwares = [(catalog[1] if len(catalog) == 2 else i.catalog_to_software(catalog)) for catalog in catalogs]
    catalog_names_softwares = list(zip(catalog_names, catalog_softwares))

    # Threading
    queue = []
    running = set(catalog_names)
    submodules = i.submodules()

    def enqueue_datasets(catalog_name_software):
        catalog_name, catalog_software = catalog_name_software
        generator = submodules[catalog_software].metadata(get, catalog_name)
        while True:
            try:
                dataset = next(generator)
            except StopIteration:
                break
            except:
                not_file = StringIO()
                print_exc(file = not_file)
                logger.error('Error at %s:\n%s\n' % (catalog_name, not_file.getvalue()))
            else:
                if standardize:
                    out = submodules[catalog_software].standardize(dataset)
                    if out['download_url'] != None:
                        out['download_url'] = urljoin(out['url'], out['download_url'])
                    if force_colnames:
                        out['colnames'] = submodules[catalog_software].colnames(get, dataset)
                else:
                    out = dataset
                    out['_catalog'] = catalog_name
                    out['_software'] = catalog_software
                queue.append(out)
        running.remove(catalog_name)

    threaded(catalog_names_softwares, enqueue_datasets, join = False)

    from time import sleep
    while len(running) > 0 or queue != []:
        sleep(0.0001)
        if queue != []:
            yield queue.pop(0)
Example #48
def threaded_or_serial(tasks, func, num_threads, max_queue):
  if DEBUG:
    for t in tasks:
      func(t)
  else:
    threaded(tasks, func, num_threads, max_queue)
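
A hypothetical call of this wrapper; the `DEBUG` flag is assumed to live in the same module as `threaded_or_serial`, and `record` is an invented worker:

DEBUG = True  # flip to False to fan the tasks out over threads

processed = []

def record(task):
    processed.append(task)

threaded_or_serial(range(50), record, num_threads=5, max_queue=25)
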
Example #49
        #for idx, val in enumerate(tds):
        #    print idx, ":", val.text_content().encode('utf-8')
    names = {}
    for nameidx, nameval in enumerate(namestds):
        names["Name" + str(nameidx)] = nameval.text_content()[10:]
        names["Name" + str(nameidx) + "date"] = nameval.text_content()[:10]

    print "got", tds[1].text_content() 

    data = {
        'cr' : tds[1].text_content(),
        'English Company Name' : tds[2].text_content().rsplit('\r')[1].lstrip('\n\t'),
        'Chinese Company Name' : tds[2].text_content().rpartition('\r')[2].lstrip('\r\n\t'),
        'Company Type' : tds[4].text_content()[:-1],
        'Date of incorporation' : tds[6].text_content(),
        # 'Company status' : tds[8].text_content()[:-1],
        'Active status' : tds[8].text_content()[:-1],
        'Remarks' : tds[9].text_content().replace(u"備註:",""),
        'Winding up mode' : tds[11].text_content()[:-1],
        'Date of Dissolution' : tds[13].text_content(),
        'Register of Charges' : tds[15].text_content()[:-1],
        'Important Note' : tds[16].text_content().replace(u"重要事項:","").lstrip('\r\n\t')
    }
    data.update(names)
    
    db['swdata'].upsert(data, ['cr'])
    print "wrote", tds[1].text_content()

threaded(range(mincr, maxcr), scrape, num_threads = 20)
Example #50
          red.sadd('urls', url)

          # scrape post
          now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
          print "scraping data from %s @ %s" % (url, now)
          key, rank, value = scrape_page(url, city)
          # add post data to redis
          red.zadd(key, rank, value)

if __name__ == '__main__':

  # generate input
  cities = []
  feeds = []
  for line in open('feeds/all_rss_feeds.csv').read().split('\r')[1:]:
    row = line.split(",")
    feeds.append(row[0].strip())
    cities.append(parse_city_to_slug(row[2].strip()))
  items = zip(cities, feeds)

  # go forth young scraper
  threaded(items, crawl, num_threads=10, max_queue=200)

  # # # debug mode #
  # [crawl(i) for i in items]

  # print md for readme
  # for i in items:
  #   print "[%s](%s)<br/>" % i

Example #51
        return
    with open(file_path, 'rb') as fh:
        data = fh.read()
    key, text = get_cache(data)
    if text is not None:
        return
    text = extract_image_data(data)
    counter_lock.acquire()
    try:
        processed += 1
        time_taken = time.time() - START_TIME
        img_per_sec = time_taken / processed
    finally:
        counter_lock.release()
    log.info('Extracted: %s (%d characters of text), %.3fs/img', file_path,
             len(text), img_per_sec)


def crawl_directory(base_path):
    for (dirpath, _, files) in os.walk(base_path):
        for file_name in files:
            file_path = os.path.abspath(os.path.join(dirpath, file_name))
            file_path = os.path.normpath(file_path)
            yield file_path

if __name__ == '__main__':
    if not os.environ.get('EXTRACTORS_CACHE_DIR'):
        print 'No cache dir, this makes no sense'
    else:
        threaded(crawl_directory(DATA_PATH), crawl_file, num_threads=5)