def get_results_content(fetch_all, fetch_indexes, share_content):

    # We order search results by URL so that we can visit search results that share the
    # same URL one after the other.  This way we can associate the same fetched contents
    # with all search results that share a URL at the same time.
    results = (
        SearchResult
        .select()
        .order_by(SearchResult.url)
    )
    if fetch_all:
        results = results
    elif fetch_indexes:
        results = (
            results
            .join(Search)
            .where(Search.fetch_index << fetch_indexes)
        )
    else:
        results = (
            results
            .join(SearchResultContent, JOIN_LEFT_OUTER)
            .where(SearchResultContent.content >> None)
        )

    previous_url = None
    previous_content = None

    for search_result in results:

        # If the caller has specified that we should share fetched contents between
        # search results with the same URL, check whether the URL has stayed the same.
        if share_content and search_result.url == previous_url:
            logger.debug("Already fetched URL %s. Reusing its response.", search_result.url)
            if previous_content is not None:
                SearchResultContent.create(search_result=search_result, content=previous_content)
            continue

        # Fetch content for the search result
        resp = make_request(default_requests_session.get, search_result.url)

        # Associate the fetched content with the URL
        if hasattr(resp, 'content'):
            # To avoid redundant storage, we create a record for web page contents
            # that can be shared across multiple search results.
            # Note that we want "resp.text" (Unicode) and not "resp.content" (bytes)
            # if we want to successfully store the responses from all URLs.
            web_page_content = WebPageContent.create(url=search_result.url, content=resp.text)
            SearchResultContent.create(search_result=search_result, content=web_page_content)
            previous_content = web_page_content
        else:
            logger.warning("Error fetching content from URL: %s", search_result.url)
            previous_content = None

        # Whether the response succeeded or failed, record that we queried this URL
        previous_url = search_result.url

        # Even though most of the pages will be from different domains, we pause between
        # fetching the content for each result to avoid spamming any one domain with requests.
        time.sleep(DELAY_TIME)
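
# --------------------------------------------------------------------------------------
# A minimal, hypothetical sketch of the peewee models that get_results_content() assumes.
# The real definitions live elsewhere in this project and may differ; the field types,
# the in-memory SQLite database, and the default values here are assumptions made purely
# for illustration of how the queries above fit together.
# --------------------------------------------------------------------------------------
import datetime
from peewee import (
    SqliteDatabase, Model, CharField, TextField, DateTimeField, IntegerField, ForeignKeyField,
)

_sketch_db = SqliteDatabase(':memory:')  # hypothetical database, for illustration only


class _SketchBaseModel(Model):
    class Meta:
        database = _sketch_db


class WebPageContent(_SketchBaseModel):
    # Fetched page contents, shared by all search results that point to the same URL.
    url = CharField()
    content = TextField()
    date = DateTimeField(default=datetime.datetime.now)


class Search(_SketchBaseModel):
    # One record per query submitted to the search engine.
    fetch_index = IntegerField()


class SearchResult(_SketchBaseModel):
    # One record per result returned for a search.
    search = ForeignKeyField(Search)
    url = CharField()


class SearchResultContent(_SketchBaseModel):
    # Join model linking a search result to the contents fetched from its URL.
    search_result = ForeignKeyField(SearchResult)
    content = ForeignKeyField(WebPageContent, null=True)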
def forward(migrator):

    # Add a placeholder field for storing a link to a WebPageContent object
    migrate(
        migrator.add_column(
            "searchresultcontent",
            "webpagecontent_id",
            ForeignKeyField(WebPageContent, null=True, to_field=WebPageContent.id),
        )
    )

    # Move the data previously in the SearchResultContent model into WebPageContent,
    # and link each WebPageContent to its SearchResultContent.
    # Note that because the model for SearchResultContent has already been updated beyond the
    # state of the table, we have to access the 'content' and 'date' fields through the "SQL"
    # class instead of a field on the model.  This is also the reason that we mix both
    # Query object methods and raw queries below.  The models access the future field names,
    # and the raw queries access the past field names.
    content_records = (
        SearchResultContent
        .select(SQL("content"), SQL("date"), SearchResult.url, SearchResultContent.id)
        .join(SearchResult)
        .dicts()
    )
    for record in content_records:
        web_page_content = WebPageContent.create(
            content=record["content"],
            date=record["date"],
            url=record["url"],
        )
        # Normally it's not recommended to interpolate values directly into a query string.
        # We do it here because Postgres and SQLite use different parameter placeholder
        # styles ("%s" vs. "?"), so building the statement ourselves keeps the migration
        # portable.  There should also be no injection risk, since both interpolated values
        # are integer IDs generated by the database.
        SearchResultContent.raw(
            "UPDATE searchresultcontent SET webpagecontent_id = " +
            str(web_page_content.id) + " WHERE id = " + str(record["id"])
        ).execute()

    # Drop columns that are no longer needed from the SearchResultContent table
    migrate(
        migrator.drop_column("searchresultcontent", "date"),
        migrator.drop_column("searchresultcontent", "content"),
        migrator.rename_column("searchresultcontent", "webpagecontent_id", "content_id"),
        migrator.drop_not_null("searchresultcontent", "content_id"),
    )
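
# --------------------------------------------------------------------------------------
# A hedged usage sketch (not part of the migration itself) showing one way forward()
# might be invoked.  The backend, database path, and wrapping transaction are assumptions;
# the project may use PostgresqlMigrator or its own migration runner instead.
# --------------------------------------------------------------------------------------
from peewee import SqliteDatabase
from playhouse.migrate import SqliteMigrator


def run_forward_migration():
    db = SqliteDatabase('fetcher.db')  # hypothetical database file
    migrator = SqliteMigrator(db)
    # Run the schema changes and the data copy in a single transaction so that a failure
    # part-way through does not leave the tables half-migrated.
    with db.transaction():
        forward(migrator)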