def get_results_content(fetch_all, fetch_indexes, share_content):

    # We order search results by URL so that we can visit search results that share the
    # same URL one after the other.  This way we can associate the same fetched contents
    # with all search results that share a URL at the same time.
    results = (
        SearchResult
        .select()
        .order_by(SearchResult.url)
    )
    if fetch_all:
        results = results
    elif fetch_indexes:
        results = (
            results
            .join(Search)
            .where(Search.fetch_index << fetch_indexes)
        )
    else:
        results = (
            results
            .join(SearchResultContent, JOIN_LEFT_OUTER)
            .where(SearchResultContent.content >> None)
        )

    previous_url = None
    previous_content = None

    for search_result in results:

        # If the caller has specified that we should share fetched contents between
        # search results with the same URL, then check to see if the URL has stayed the same.
        if share_content and search_result.url == previous_url:
            logger.debug("Already called URL %s. Reusing its response.", search_result.url)
            if previous_content is not None:
                SearchResultContent.create(search_result=search_result, content=previous_content)
            continue

        # Fetch content for the search result
        resp = make_request(default_requests_session.get, search_result.url)

        # Associate the scraped content to a URL
        if hasattr(resp, 'content'):
            # To avoid redundant storage, we create a record for web page
            # contents that can be shared across multiple URLs.
            # As it turns out, we want "response.text" (Unicode) and not "response.content" (bytes),
            # if we want to successfully store the responses from all URLs.
            web_page_content = WebPageContent.create(url=search_result.url, content=resp.text)
            SearchResultContent.create(search_result=search_result, content=web_page_content)
            previous_content = web_page_content
        else:
            logger.warn("Error fetching content from URL: %s", search_result.url)
            previous_content = None

        # With either a successful or failed response, save that we queried this URL
        previous_url = search_result.url

        # Even though most of the pages will be from different domains, we pause between
        # fetching the content for each result to avoid spamming any specific domain with requests.
        time.sleep(DELAY_TIME)

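# The fetchers in this listing all call a shared `make_request` helper that is defined
# elsewhere and not shown here.  The sketch below is only an illustration of what such a
# wrapper might look like, assuming it is meant to swallow request errors and return None
# on failure (every caller here checks for a missing response); the real helper may retry,
# back off, or log differently.

import logging

import requests

logger = logging.getLogger(__name__)


def make_request(method, *args, **kwargs):
    # Hypothetical sketch: call the bound session method (e.g. session.get) with the
    # caller's arguments and return None instead of raising, so callers can simply
    # test the response for None.
    try:
        response = method(*args, **kwargs)
        response.raise_for_status()
        return response
    except requests.exceptions.RequestException as error:
        logger.warning("Request failed: %s", error)
        return None
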
def fetch_questions_for_tag(tag, fetch_index):

    # Prepare initial API query parameters
    params = DEFAULT_PARAMS.copy()
    params['tagged'] = tag
    params['page'] = 1  # paging for Stack Exchange API starts at 1

    # We intentionally choose to iterate until the results tell us there are 'no more'.
    # The Stack Exchange API documentation tells us that requesting a 'total' from the API
    # will double the request time, so we don't fetch the total.
    more_results = True
    while more_results:

        response = make_request(default_requests_session.get, API_URL, params=params)
        if response is not None:
            response_data = response.json()
            for question in response_data['items']:
                _save_question(question, fetch_index)

        # Advance the page if there are more results coming
        more_results = response_data['has_more'] if response is not None else True
        time.sleep(REQUEST_DELAY)
        params['page'] += 1

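# Usage sketch (hypothetical tag and fetch index): fetch_questions_for_tag pages through the
# tagged questions one page at a time until the API reports 'has_more' is false, e.g.
#
#     fetch_questions_for_tag('python', fetch_index=1)
#
# DEFAULT_PARAMS, API_URL, REQUEST_DELAY, and _save_question are assumed to be defined
# elsewhere in the module this function comes from.
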
def get_history(url, fetch_index):

    params = DEFAULT_PARAMS.copy()
    params["url"] = url

    # Flags for controlling paging and scanning results
    more_results = True
    watch_for_resume_key = False

    while more_results:

        more_results = False
        response = make_request(default_requests_session.get, ARCHIVE_URL, params=params)
        time.sleep(REQUEST_DELAY)  # Pause so that we don't bombard the server with requests
        if response is None:
            break

        results = response.json()
        for result_index, result in enumerate(results):

            # Read the field names from the first result
            if result_index == 0:
                field_names = result
                continue

            # Resumption key appears after one blank record after the rest of the records.
            # These two lines keep watch for the resumption key and exit the loop once
            # it has been found.
            if result == []:
                watch_for_resume_key = True
                continue
            elif watch_for_resume_key:
                # Setting this parameter advances the page of results for the next query
                params["resumeKey"] = result[0]
                more_results = True
                watch_for_resume_key = False
                break

            # If the code has made it this far, this record is a web
            # page version, and we want to save it.
            data = dict(zip(field_names, result))
            _save_record(url, data, fetch_index)

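# Illustration (hypothetical field names and values): with JSON output, the archive endpoint
# queried above appears to return a list of rows in which the first row holds the field names,
# each later row describes one captured page version, and, when more pages remain, an empty
# row followed by a single-element resumption-key row closes the payload, e.g.
#
#     [["urlkey", "timestamp", "original"],
#      ["com,example)/", "20150101000000", "http://example.com/"],
#      [],
#      ["com,example)/ 20150102000000"]]
#
# which is the shape the parsing loop in get_history walks through.
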
def get_citation_count(query, fetch_index, api_key):

    # Request for citation counts for the publication
    params = DEFAULT_PARAMS.copy()
    params['expr'] = (
        "AND(" +                # we will search based on two criteria:
        "Ti=\'{title}\'...," +  # the title prefix
        "Y={year})"             # the publication year
    ).format(title=query['title'], year=int(query['year']))
    response = make_request(
        default_requests_session.get,
        URL,
        params=params,
        headers={'Ocp-Apim-Subscription-Key': api_key},
    )
    time.sleep(REQUEST_DELAY)  # enforce a pause between each fetch to be respectful to API

    # Go no further if the call failed
    if not response:
        return

    publications = response.json()['entities']
    if len(publications) == 0:
        logger.warn("No publications found for title: %s", query['title'])
        return

    # Store data from the fetched publications
    first_publication = publications[0]
    authors = ','.join([author['AuN'] for author in first_publication['AA']])
    Publication.create(
        fetch_index=fetch_index,
        citation_count=first_publication['CC'],
        author=authors,
        year=first_publication['Y'],
        title=first_publication['Ti'],
    )

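# Worked example (made-up title and year): the expression template in get_citation_count
# produces an 'expr' value such as
#
#     >>> ("AND(" + "Ti=\'{title}\'...," + "Y={year})").format(
#     ...     title='scalable web architecture', year=2014)
#     "AND(Ti='scalable web architecture'...,Y=2014)"
#
# where, as the inline comments note, the trailing '...' asks for a prefix match on the
# title and Y restricts results to the publication year.
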
def main(show_progress, *args, **kwargs):

    # Create a new fetch index for the records fetched.
    last_fetch_index = Dataset.select(fn.Max(Dataset.fetch_index)).scalar() or 0
    fetch_index = last_fetch_index + 1

    # Set up progress bar
    if show_progress:
        progress_bar = ProgressBar(widgets=[
            'Progress: ', Percentage(),
            ' ', Bar(marker=RotatingMarker()),
            ' ', ETA(),
            ' Fetched metadata for ', Counter(), ' datasets.'
        ])
        progress_bar.start()

    # Fetch all pages of datasets
    datasets_fetched = 0
    last_page = False
    while not last_page:

        params = DEFAULT_PARAMS.copy()
        params['start'] = datasets_fetched
        resp = make_request(default_requests_session.get, URL, params=params).json()
        if not resp['success']:
            logging.error("Request to URL %s was unsuccessful", URL)

        result = resp['result']
        num_datasets = len(result['results'])
        datasets_fetched += num_datasets

        if show_progress:
            # We can finally initialize the total number of datasets expected
            # only after we get the first round of results.
            progress_bar.maxval = result['count']
            progress_bar.update(datasets_fetched)

        for dataset in result['results']:

            dataset_record = Dataset.create(
                dataset_id=dataset['id'],
                title=trim_char_data(dataset['title']),
                license_title=trim_char_data(dataset['license_title']),
                fetch_index=fetch_index,
            )

            for resource in dataset['resources']:
                if resource['format'] == DATA_FORMAT:
                    Resource.create(
                        resource_id=resource['id'],
                        dataset=dataset_record,
                        format=resource['format'],
                        url=resource['url'],
                    )

        time.sleep(REQUEST_DELAY)  # enforce a pause between each fetch to be respectful to API
        last_page = datasets_fetched >= result['count']

    if show_progress:
        progress_bar.finish()

def get_slant_pros_and_cons(show_progress):

    # Create a new fetch index
    last_fetch_index = ViewpointSection.select(fn.Max(ViewpointSection.fetch_index)).scalar() or 0
    fetch_index = last_fetch_index + 1

    # Get the index of the latest fetch of topics and viewpoints.
    # We will only collect pros and cons for this set of topics.
    viewpoint_fetch_index = Viewpoint.select(fn.Max(Viewpoint.fetch_index)).scalar() or 0
    latest_viewpoint_batch = (
        Viewpoint
        .select()
        .where(Viewpoint.fetch_index == viewpoint_fetch_index)
    )

    # Initialize the progress bar if requested
    if show_progress:
        viewpoint_count = latest_viewpoint_batch.count()
        progress_bar = ProgressBar(maxval=viewpoint_count, widgets=[
            'Progress: ', Percentage(),
            ' ', Bar(marker=RotatingMarker()),
            ' ', ETA(),
            ' Collected pros and cons for viewpoint ', Counter(), ' / ' + str(viewpoint_count) + '.'
        ])
        progress_bar.start()

    # For every viewpoint, fetch and save all pros and cons
    for viewpoint_index, viewpoint in enumerate(latest_viewpoint_batch, start=1):

        # Without the format=json parameter, the Slant server will return
        # HTML for the viewpoint.  We get something resembling a JSON API
        # response if we ask for JSON format.
        response = make_request(
            default_requests_session.get,
            SLANT_URL + viewpoint.url_path,
            params={'format': 'json'},
        )

        # Skip all missing responses
        if response is None:
            continue

        results = response.json()

        # If we have somehow ended up on an entry that has an error field
        # with the 404 code, something was probably wrong with the request.
        # Just skip this entry and move on.
        if 'error' in results and results['error'] == 404:
            logger.warn("Got 404 when retrieving viewpoint with path %s.", viewpoint.url_path)
            continue

        # Each 'section' for a viewpoint is a pro or a con.  Save a record for each one.
        for section in results['sections']['children']:
            ViewpointSection.create(
                fetch_index=fetch_index,
                viewpoint=viewpoint,
                section_index=section['id'],
                title=section['revision']['title'],
                text=section['revision']['text'],
                is_con=section['isCon'],
                upvotes=section['votes']['upvotes'],
                downvotes=section['votes']['downvotes'],
            )

        if show_progress:
            progress_bar.update(viewpoint_index)

        # Pause so that we don't bombard the server with requests
        time.sleep(REQUEST_DELAY)

    if show_progress:
        progress_bar.finish()

def get_results(query, package, include_stack_overflow, fetch_index, search_id, api_key):

    # Make request for search results
    params = DEFAULT_PARAMS.copy()
    params['key'] = api_key
    params['cx'] = search_id
    params['q'] = query
    if not include_stack_overflow:
        params['siteSearch'] = 'stackoverflow.com'
        params['siteSearchFilter'] = 'e'  # 'e' for 'exclude'
    response = make_request(default_requests_session.get, SEARCH_URL, params=params)

    # Pause so that we don't bombard the server with requests
    time.sleep(REQUEST_DELAY)

    # If the request resulted in an error, the response is None.  Skip over this query.
    if response is None:
        return

    # Parse search results
    soup = BeautifulSoup(response.content, 'html.parser')
    url = soup.find('opensearch:Url')
    entry_count = len(soup.find_all('entry'))

    # The Atom spec for the search API
    # (https://developers.google.com/custom-search/json-api/v1/reference/cse/list#response)
    # mentions that the estimated results count may be a long integer.
    # To my knowledge, peewee (our ORM) doesn't support long integer fields.
    # So, I cast this to an integer instead and cross my fingers there is no overflow.
    search = Search.create(
        fetch_index=fetch_index,
        query=query,
        page_index=0,
        requested_count=REQUESTED_RESULT_COUNT,
        result_count_on_page=entry_count,
        estimated_results_count=int(
            soup.find('cse:searchinformation').find('cse:totalresults').text),
        package=package,
    )

    # Fetch the first "entry" or search result
    entry = soup.entry

    # Save all of the search results from first to last.
    # Maintaining consistency with our query scraping, ranking starts at 1.
    for rank in range(1, entry_count + 1):

        # Extract fields from the entry
        updated_datetime_without_milliseconds = re.sub(r'\.\d\d\dZ', 'Z', entry.updated.text)
        updated_datetime = datetime.datetime.strptime(
            updated_datetime_without_milliseconds,
            "%Y-%m-%dT%H:%M:%SZ"
        )
        link = entry.link['href']
        snippet = entry.summary.string
        title = entry.title.text
        url = entry.id.text

        # Create a record for this search result
        SearchResult.create(
            search=search,
            title=title,
            snippet=snippet,
            link=link,
            url=url,
            updated_date=updated_datetime,
            rank=rank,
        )

        # To my knowledge, this is the only method for which it is strongly implied in
        # the BeautifulSoup documentation that you are fetching the next result
        # in the sequence.  I also assume that the search API is returning results
        # in the order of decreasing relevance, such that rank increases (gets bigger)
        # with each successive entry visited.
        entry = entry.find_next('entry')

def get_results(seed, max_depth):

    fetch_index = seed.fetch_index

    # Request for autocomplete results
    params = DEFAULT_PARAMS.copy()
    params['q'] = seed.seed
    response = make_request(default_requests_session.get, URL, params=params)
    time.sleep(REQUEST_DELAY)  # enforce a pause between each fetch to be respectful to API

    # Go no further if the call failed
    if not response:
        return []

    # Store data from the fetched queries
    doc = ElementTree.fromstring(response.text.encode('utf-8'))
    num_results = 0
    rank = 1
    for comp_sugg in doc.iterfind('CompleteSuggestion'):
        for suggestion in comp_sugg.iterfind('suggestion'):

            # Create a new query and add it to the database
            data = suggestion.attrib['data']
            # In Fourney et al.'s implementation of CUTS, the returned queries were checked so that
            # they started with exactly the seed.  We relax this restriction here.
            # We note that some autocomplete entries use valuable synonyms for our
            # queries, such as converting node -> js or rearranging the terms.  These modified
            # prefixes yield interesting queries that we don't want to miss.
            Query.create(
                fetch_index=fetch_index,
                seed=seed,
                query=data,
                rank=rank,
                depth=seed.depth,
            )
            num_results += 1
            rank += 1

    # Only expand this seed into new seeds if we got a full set of results and
    # we have not yet descended to the maximum depth.
    if num_results == MAX_RESULTS and seed.depth < max_depth:
        for char in ALPHABET:

            # The initial query should be followed by a space.
            if seed.depth == 0 and char != ' ':
                continue

            # There shouldn't be any sequence of two spaces.
            if char == ' ' and seed.seed.endswith(' '):
                continue

            # Create and store a new seed
            new_seed_text = seed.seed + char
            new_seed = Seed.create(
                fetch_index=fetch_index,
                parent=seed,
                seed=new_seed_text,
                depth=seed.depth + 1,
            )

            # Fetch results for the new seed.
            get_results(new_seed, max_depth)

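# Illustration (hypothetical seed text, and assuming ALPHABET covers the lowercase letters
# plus the space character): a depth-0 seed such as 'node' only ever expands by appending a
# space, giving 'node '; from depth 1 onward that seed expands to 'node a', 'node b', ...,
# with doubled spaces skipped, and get_results recurses into each new seed until a request
# returns something other than a full set of MAX_RESULTS suggestions or the seed reaches
# max_depth.
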
def get_slant_topics(show_progress):

    # Create a new fetch index
    last_fetch_index = SlantTopic.select(fn.Max(SlantTopic.fetch_index)).scalar() or 0
    fetch_index = last_fetch_index + 1

    params = DEFAULT_PARAMS.copy()
    first_request = True
    next_url = None
    count_of_processed_topics = 0

    # Loop through requests to the Slant server until we reach an empty
    # response or the end of the pages.
    while True:

        # All requests after our first one are made to a URL returned by
        # the previous request.  So there's a little logic here to use verbose
        # parameters for the first request.  They should be included by
        # default in all requests after that.
        if first_request:
            response = make_request(
                default_requests_session.get,
                SLANT_TOPICS_URL,
                params=params,
            )
        else:
            # We found that for some reason, the next page path is missing a parameter
            # to specify that we still want the results of the next page as JSON.
            # So we explicitly specify the format here.
            response = make_request(
                default_requests_session.get,
                next_url,
                params={'format': 'json'},
            )

        # Leave this loop if the fetch failed
        if response is None:
            break

        results = response.json()

        # If we have somehow ended up on an entry that has an error field
        # with the 404 code, we have probably seen all results.  Break out of the loop.
        if 'error' in results and results['error'] == 404:
            break

        # If this is the first request, initialize the progress bar with
        # the number of results retrieved from the results
        if first_request and show_progress:
            progress_bar = ProgressBar(maxval=results['count'], widgets=[
                'Progress: ', Percentage(),
                ' ', Bar(marker=RotatingMarker()),
                ' ', ETA(),
                ' Fetched ', Counter(), ' / ' + str(results['count']) + ' topics.'
            ])
            progress_bar.start()

        for topic in results['children']:

            # Each child in the list is a topic.  Save each of these as a new topic.
            topic_record = SlantTopic.create(
                fetch_index=fetch_index,
                topic_id=topic['uuid'],
                title=topic['revision']['title'],
                url_path=topic['URL'],
                owner_username=topic['createdEvent']['user']['username'],
            )

            # A topic on Slant has a number of "viewpoints" or alternatives.
            # Save each one and a URL to the site where we can visit each one.
            for viewpoint in topic['viewpoints']['children']:
                Viewpoint.create(
                    fetch_index=fetch_index,
                    viewpoint_index=viewpoint['id'],
                    title=viewpoint['revision']['title'],
                    topic=topic_record,
                    url_path=viewpoint['URL'],
                )

            count_of_processed_topics += 1
            if show_progress:
                progress_bar.update(count_of_processed_topics)

        # We are also finished looping through results when there is no longer a 'next'
        # page in the page properties.  It's just a guess on our part that this endpoint
        # will always report a next page when there is one, as there isn't an official
        # API and there isn't any documentation for it.
        if 'next' not in results['properties']['page']:
            if show_progress:
                progress_bar.finish()
            break
        next_page_path = results['properties']['page']['next']
        next_url = SLANT_URL + next_page_path

        # Pause so that we don't bombard the server with requests
        time.sleep(REQUEST_DELAY)

        # Reset the flag that cues us to take actions for the first request
        first_request = False