Пример #1
0
def _fetchContentMap(binary_record):
    """Map function that fetches one target url and stores its content.

    The fetched content is written to blobstore. When the CrawlDbDatum of
    the owning page can be resolved, a child ContentDbDatum describing the
    fetch result is put to the datastore.

    Args:
      binary_record: serialized file_service_pb.KeyValue whose key is the
        url of the target page and whose value is the url to fetch. A
        value starting with "/" is treated as relative and is prefixed
        with the page datum's extract_domain_url.

    Yields:
      One "<target_url>:<stored_url>" string. The stored_url part is the
      text "None" when the content could not be fetched or stored.
    """
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    page_url = proto.key()
    target_url = proto.value()

    # Start the datastore lookup asynchronously. The future stays None when
    # the query cannot be issued, so later code never reads an unbound name.
    crawl_db_datum_future = None
    try:
        query = CrawlDbDatum.query(CrawlDbDatum.url == page_url)
        crawl_db_datum_future = query.fetch_async()
    except Exception as e:
        # Lazy %-args: e.message is deprecated and is not always a string.
        logging.warning("Failed create key, caused by invalid url:%s:%s",
                        page_url, e)

    crawl_db_datum = None
    stored_url = None
    fetch_result = None

    if target_url.startswith("/"):
        # A relative url needs the page's domain, which lives on the datum.
        if crawl_db_datum_future is not None:
            crawl_db_datum = _getCrawlDatum(crawl_db_datum_future)
        if not crawl_db_datum:
            # Without the datum the url cannot be made absolute; report the
            # record as not stored instead of raising AttributeError.
            logging.warning(
                "Cannot resolve relative url %s: no CrawlDbDatum for %s",
                target_url, page_url)
            yield "%s:%s" % (target_url, stored_url)
            return
        target_url = "%s%s" % (crawl_db_datum.extract_domain_url, target_url)

    fetcher = fetchers.SimpleHttpFetcher(1, fetcher_policy_yaml.fetcher_policy)
    try:
        fetch_result = fetcher.get(target_url)
        if fetch_result:
            # Store the fetched content in blobstore.
            blob_io = files.blobstore.create(
                mime_type=fetch_result.get("mime_type"),
                _blobinfo_uploaded_filename=fetch_result.get("fetched_url"))
            with files.open(blob_io, 'a') as f:
                f.write(fetch_result.get("content"))
            files.finalize(blob_io)
            blob_key = files.blobstore.get_blob_key(blob_io)
            stored_url = images.get_serving_url(str(blob_key))
    except Exception as e:
        # Best effort: a failed fetch/store is logged and reported through
        # stored_url staying None.
        logging.warning("Fetch Error Occurs:%s", e)

    # Resolve the datum now unless the relative-url branch already did.
    if crawl_db_datum is None and crawl_db_datum_future is not None:
        crawl_db_datum = _getCrawlDatum(crawl_db_datum_future)

    # stored_url is only set after a successful fetch, so fetch_result is
    # guaranteed to be populated inside this branch.
    if crawl_db_datum and stored_url is not None:
        entity = ContentDbDatum(
            parent=crawl_db_datum.key,
            fetched_url=fetch_result.get("fetched_url"),
            stored_url=stored_url,
            content_type=fetch_result.get("mime_type"),
            content_size=fetch_result.get("content_length"),
            http_headers=str(fetch_result.get("headers")))
        entity.put()

    yield "%s:%s" % (target_url, stored_url)
Пример #2
0
def _fetchContentMap(binary_record):
  """Map function that fetches one target url and stores its content.

  The fetched content is written to blobstore. When the CrawlDbDatum of
  the owning page can be resolved, a child ContentDbDatum describing the
  fetch result is put to the datastore.

  Args:
    binary_record: serialized file_service_pb.KeyValue whose key is the
      url of the target page and whose value is the url to fetch. A
      value starting with "/" is treated as relative and is prefixed
      with the page datum's extract_domain_url.

  Yields:
    One "<target_url>:<stored_url>" string. The stored_url part is the
    text "None" when the content could not be fetched or stored.
  """
  proto = file_service_pb.KeyValue()
  proto.ParseFromString(binary_record)
  page_url = proto.key()
  target_url = proto.value()

  # Start the datastore lookup asynchronously. The future stays None when
  # the query cannot be issued, so later code never reads an unbound name.
  crawl_db_datum_future = None
  try:
    query = CrawlDbDatum.query(CrawlDbDatum.url == page_url)
    crawl_db_datum_future = query.fetch_async()
  except Exception as e:
    # Lazy %-args: e.message is deprecated and is not always a string.
    logging.warning("Failed create key, caused by invalid url:%s:%s",
                    page_url, e)

  crawl_db_datum = None
  stored_url = None
  fetch_result = None

  if target_url.startswith("/"):
    # A relative url needs the page's domain, which lives on the datum.
    if crawl_db_datum_future is not None:
      crawl_db_datum = _getCrawlDatum(crawl_db_datum_future)
    if not crawl_db_datum:
      # Without the datum the url cannot be made absolute; report the
      # record as not stored instead of raising AttributeError.
      logging.warning(
          "Cannot resolve relative url %s: no CrawlDbDatum for %s",
          target_url, page_url)
      yield "%s:%s" % (target_url, stored_url)
      return
    target_url = "%s%s" % (crawl_db_datum.extract_domain_url, target_url)

  fetcher = fetchers.SimpleHttpFetcher(1, fetcher_policy_yaml.fetcher_policy)
  try:
    fetch_result = fetcher.get(target_url)
    if fetch_result:
      # Store the fetched content in blobstore.
      blob_io = files.blobstore.create(
          mime_type=fetch_result.get("mime_type"),
          _blobinfo_uploaded_filename=fetch_result.get("fetched_url"))
      with files.open(blob_io, 'a') as f:
        f.write(fetch_result.get("content"))
      files.finalize(blob_io)
      blob_key = files.blobstore.get_blob_key(blob_io)
      stored_url = images.get_serving_url(str(blob_key))
  except Exception as e:
    # Best effort: a failed fetch/store is logged and reported through
    # stored_url staying None.
    logging.warning("Fetch Error Occurs:%s", e)

  # Resolve the datum now unless the relative-url branch already did.
  if crawl_db_datum is None and crawl_db_datum_future is not None:
    crawl_db_datum = _getCrawlDatum(crawl_db_datum_future)

  # stored_url is only set after a successful fetch, so fetch_result is
  # guaranteed to be populated inside this branch.
  if crawl_db_datum and stored_url is not None:
    entity = ContentDbDatum(
        parent=crawl_db_datum.key,
        fetched_url=fetch_result.get("fetched_url"),
        stored_url=stored_url,
        content_type=fetch_result.get("mime_type"),
        content_size=fetch_result.get("content_length"),
        http_headers=str(fetch_result.get("headers")))
    entity.put()

  yield "%s:%s" % (target_url, stored_url)