Пример #1
0
def _fetchContentMap(binary_record):
    """Map function that fetches one target URL and stores it in blobstore.

    The record is decoded as a ``file_service_pb.KeyValue``. The matching
    ``CrawlDbDatum`` is looked up asynchronously by page url while the
    target is fetched. On a successful fetch the content is written to
    blobstore and a ``ContentDbDatum`` child entity is stored under the
    ``CrawlDbDatum``.

    Args:
      binary_record: serialized key/value record. The key is the url of
        the page that referenced the target, the value is the url to
        fetch. A value starting with "/" is treated as relative and is
        prefixed with the datum's ``extract_domain_url``.

    Yields:
      A single string ``"<target_url>:<stored_url>"``. ``stored_url`` is
      rendered as ``None`` when the content could not be fetched or
      stored.
    """
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    page_url = proto.key()
    target_url = proto.value()

    # Start the CrawlDbDatum lookup asynchronously. The future is
    # pre-initialised so a failure here cannot surface later as a
    # NameError when the future is consumed.
    crawl_db_datum_future = None
    try:
        query = CrawlDbDatum.query(CrawlDbDatum.url == page_url)
        crawl_db_datum_future = query.fetch_async()
    except Exception as e:
        # Log the exception object rather than e.message, which is
        # deprecated and not guaranteed to be a string.
        logging.warning("Failed create key, caused by invalid url:%s:%s",
                        page_url, e)

    fetcher = fetchers.SimpleHttpFetcher(1, fetcher_policy_yaml.fetcher_policy)
    stored_url = None
    fetch_result = None
    crawl_db_datum = None
    datum_loaded = False

    if target_url.startswith("/"):
        # A relative target needs the datum's domain before fetching.
        # NOTE(review): assumes _getCrawlDatum is a plain lookup on the
        # future, so its result can be reused below - confirm.
        if crawl_db_datum_future is not None:
            crawl_db_datum = _getCrawlDatum(crawl_db_datum_future)
        datum_loaded = True
        if not crawl_db_datum:
            # Without a datum the relative url cannot be resolved; report
            # it as not stored instead of failing on a missing attribute.
            logging.warning(
                "Can not resolve relative url:%s, no CrawlDbDatum for:%s",
                target_url, page_url)
            yield "%s:%s" % (target_url, stored_url)
            return
        target_url = "%s%s" % (crawl_db_datum.extract_domain_url, target_url)

    try:
        fetch_result = fetcher.get(target_url)
        if fetch_result:
            # Store the fetched content in blobstore.
            blob_io = files.blobstore.create(
                mime_type=fetch_result.get("mime_type"),
                _blobinfo_uploaded_filename=fetch_result.get("fetched_url"))
            with files.open(blob_io, 'a') as f:
                f.write(fetch_result.get("content"))
            files.finalize(blob_io)
            blob_key = files.blobstore.get_blob_key(blob_io)
            stored_url = images.get_serving_url(str(blob_key))
    except Exception as e:
        # Best effort: a failed fetch or store leaves stored_url as None.
        logging.warning("Fetch Error Occurs:%s", e)

    # Resolve the datum now unless the relative-url branch already did.
    if not datum_loaded and crawl_db_datum_future is not None:
        crawl_db_datum = _getCrawlDatum(crawl_db_datum_future)

    # Record the stored content; stored_url is only set after a
    # successful fetch, so fetch_result is populated here.
    if crawl_db_datum and stored_url is not None:
        entity = ContentDbDatum(
            parent=crawl_db_datum.key,
            fetched_url=fetch_result.get("fetched_url"),
            stored_url=stored_url,
            content_type=fetch_result.get("mime_type"),
            content_size=fetch_result.get("content_length"),
            http_headers=str(fetch_result.get("headers")))
        entity.put()

    yield "%s:%s" % (target_url, stored_url)