示例#1
0
def createMockFetchedDatum(url, html_text, status):
  """Create FetchedDatum mock data.

  Stores a CrawlDbDatum keyed by *url* and, unless *status* is
  UNFETCHED, a child FetchedDbDatum holding *html_text* as text/html.
  """
  parent_key = ndb.Key(CrawlDbDatum, url)
  crawl_datum = CrawlDbDatum.get_or_insert(
      url, parent=parent_key, url=url, last_status=status)
  if status == pipelines.UNFETCHED:
    return
  FetchedDbDatum(parent=crawl_datum.key,
                 url=url,
                 fetched_url=url,
                 fetched_content=html_text,
                 content_type="text/html").put()
示例#2
0
def createMockFetchedDatum(url, html_text, status):
    """Create FetchedDatum mock data.

    Inserts a CrawlDbDatum for *url*; when *status* is anything other
    than UNFETCHED, also writes a child FetchedDbDatum whose content is
    *html_text* with content type "text/html".
    """
    crawl_entity = CrawlDbDatum.get_or_insert(
        url,
        parent=ndb.Key(CrawlDbDatum, url),
        url=url,
        last_status=status)
    if status != pipelines.UNFETCHED:
        datum = FetchedDbDatum(
            parent=crawl_entity.key,
            url=url,
            fetched_url=url,
            fetched_content=html_text,
            content_type="text/html")
        datum.put()
示例#3
0
def _fetchMap(binary_record):
  """Map function of create fetch result,
  that create FetchResulDatum entity, will be store to datastore.

  Args:
    binary_record: key value data, that key is url to fetch,
      value is boolean value of can be fetch.

  Yields:
    fetched_url: "%s\n" % url on a successful fetch, otherwise "".
  """
  proto = file_service_pb.KeyValue()
  proto.ParseFromString(binary_record)
  url = proto.key()
  could_fetch = _str2bool(proto.value())
  result = UNFETCHED
  fetched_url = ""
  fetch_date = None
  # Bug fix: pre-bind the future so the status-update step below cannot
  # hit UnboundLocalError when the query itself raises.
  crawl_db_datum_future = None
  #Fetch to CrawlDbDatum
  try:
    query = CrawlDbDatum.query(CrawlDbDatum.url == url)
    crawl_db_datum_future = query.fetch_async()
  except Exception as e:
    # str(e) instead of e.message: not every exception defines .message.
    logging.warning("Failed create key, caused by invalid url:" + url +
                    ":" + str(e))
    could_fetch = False

  if could_fetch:
    #start fetch
    fetcher = fetchers.SimpleHttpFetcher(1, fetcher_policy_yaml.fetcher_policy)
    try:
      fetch_result = fetcher.get(url)
      if fetch_result:
        #Storing to datastore
        crawl_db_datums = crawl_db_datum_future.get_result()
        fetched_datum = FetchedDbDatum(
            parent=crawl_db_datums[0].key,
            url=url,
            fetched_url=fetch_result.get("fetched_url"),
            fetch_time=fetch_result.get("time"),
            fetched_content=fetch_result.get("content"),
            content_type=fetch_result.get("mime_type"),
            # NOTE(review): content_size reads "read_rate", same key as
            # response_rate — looks like a copy-paste slip; confirm the
            # intended fetch_result key before changing.
            content_size=fetch_result.get("read_rate"),
            response_rate=fetch_result.get("read_rate"),
            http_headers=str(fetch_result.get("headers")))
        fetched_datum.put()
        #update time of last fetched
        result = FETCHED
        fetch_date = datetime.datetime.now()
        fetched_url = ("%s\n" % url)
    except Exception as e:
      logging.warning("Fetch Page Error Occurs:" + str(e))
      result = FAILED
  else:
    result = FAILED

  #Update status to all datums (skipped when the query never started).
  if crawl_db_datum_future is not None:
    crawl_db_datums = crawl_db_datum_future.get_result()
    for datum in crawl_db_datums:
      datum.last_status = result
      datum.last_fetched = fetch_date
    ndb.put_multi(crawl_db_datums)

  yield fetched_url
示例#4
0
def _fetchMap(binary_record):
    """Map function of create fetch result,
    that create FetchResulDatum entity, will be store to datastore.

    Args:
      binary_record: key value data, that key is url to fetch,
        value is boolean value of can be fetch.

    Yields:
      fetched_url: "%s\n" % url on a successful fetch, otherwise "".
    """
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    url = proto.key()
    could_fetch = _str2bool(proto.value())
    result = UNFETCHED
    fetched_url = ""
    fetch_date = None
    # Bug fix: pre-bind the future so the status-update step below cannot
    # hit UnboundLocalError when the query itself raises.
    crawl_db_datum_future = None
    #Fetch to CrawlDbDatum
    try:
        query = CrawlDbDatum.query(CrawlDbDatum.url == url)
        crawl_db_datum_future = query.fetch_async()
    except Exception as e:
        # str(e) instead of e.message: not every exception defines .message.
        logging.warning("Failed create key, caused by invalid url:" + url +
                        ":" + str(e))
        could_fetch = False

    if could_fetch:
        #start fetch
        fetcher = fetchers.SimpleHttpFetcher(
            1, fetcher_policy_yaml.fetcher_policy)
        try:
            fetch_result = fetcher.get(url)
            if fetch_result:
                #Storing to datastore
                crawl_db_datums = crawl_db_datum_future.get_result()
                fetched_datum = FetchedDbDatum(
                    parent=crawl_db_datums[0].key,
                    url=url,
                    fetched_url=fetch_result.get("fetched_url"),
                    fetch_time=fetch_result.get("time"),
                    fetched_content=fetch_result.get("content"),
                    content_type=fetch_result.get("mime_type"),
                    # NOTE(review): content_size reads "read_rate", same key
                    # as response_rate — looks like a copy-paste slip;
                    # confirm the intended fetch_result key before changing.
                    content_size=fetch_result.get("read_rate"),
                    response_rate=fetch_result.get("read_rate"),
                    http_headers=str(fetch_result.get("headers")))
                fetched_datum.put()
                #update time of last fetched
                result = FETCHED
                fetch_date = datetime.datetime.now()
                fetched_url = ("%s\n" % url)
        except Exception as e:
            logging.warning("Fetch Page Error Occurs:" + str(e))
            result = FAILED
    else:
        result = FAILED

    #Update status to all datums (skipped when the query never started).
    if crawl_db_datum_future is not None:
        crawl_db_datums = crawl_db_datum_future.get_result()
        for datum in crawl_db_datums:
            datum.last_status = result
            datum.last_fetched = fetch_date
        ndb.put_multi(crawl_db_datums)

    yield fetched_url