def _fetchContentMap(binary_record):
    """Map function of fetch content.

    Fetched content is stored to the blobstore, and a ContentDbDatum
    child entity is recorded under the page's CrawlDbDatum.

    Args:
        binary_record: serialized KeyValue proto; key is the url of the
            target page, value is the url of the content to fetch.

    Yields:
        A "<target_url>:<stored_url>" string for the fetched content.
    """
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    page_url = proto.key()
    target_url = proto.value()
    # Kick off the CrawlDbDatum lookup asynchronously so it overlaps with
    # the HTTP fetch below.
    crawl_db_datum_future = None
    try:
        query = CrawlDbDatum.query(CrawlDbDatum.url == page_url)
        crawl_db_datum_future = query.fetch_async()
    except Exception as e:
        # str(e) rather than e.message: not every exception type defines
        # a .message attribute, and it is deprecated anyway.
        logging.warning("Failed create key, caused by invalid url:%s:%s",
                        page_url, str(e))
        # Without the datum there is nothing to record the content under;
        # previously execution fell through and later raised NameError on
        # the undefined future.
        return

    # Start fetch.
    fetcher = fetchers.SimpleHttpFetcher(1, fetcher_policy_yaml.fetcher_policy)
    stored_url = None
    fetch_result = None
    if re.match("^/", target_url):
        # Relative url: resolve it against the page's domain.
        crawl_db_datum = _getCrawlDatum(crawl_db_datum_future)
        target_url = "%s%s" % (crawl_db_datum.extract_domain_url, target_url)
    try:
        fetch_result = fetcher.get(target_url)
        if fetch_result:
            # Store the fetched content to the blobstore.
            blob_io = files.blobstore.create(
                mime_type=fetch_result.get("mime_type"),
                _blobinfo_uploaded_filename=fetch_result.get("fetched_url"))
            with files.open(blob_io, 'a') as f:
                f.write(fetch_result.get("content"))
            files.finalize(blob_io)
            blob_key = files.blobstore.get_blob_key(blob_io)
            stored_url = images.get_serving_url(str(blob_key))
    except Exception as e:
        logging.warning("Fetch Error Occurs:%s", str(e))

    # Put content to datastore; only when both the parent datum exists and
    # the content was successfully stored.
    crawl_db_datum = _getCrawlDatum(crawl_db_datum_future)
    if crawl_db_datum and stored_url is not None:
        entity = ContentDbDatum(
            parent=crawl_db_datum.key,
            fetched_url=fetch_result.get("fetched_url"),
            stored_url=stored_url,
            content_type=fetch_result.get("mime_type"),
            content_size=fetch_result.get("content_length"),
            http_headers=str(fetch_result.get("headers")))
        entity.put()
    yield "%s:%s" % (target_url, stored_url)
def _fetchContentMap(binary_record):
    """Map function of fetch content.

    Fetched content will store to blobstore.

    Arg:
        binary_record: key value data, that key is url of target page,
            value is url of target of fetch.

    Returns:
        url: fetched url.
    """
    record = file_service_pb.KeyValue()
    record.ParseFromString(binary_record)
    page_url = record.key()
    target_url = record.value()
    # Start the async datastore lookup for the page's CrawlDbDatum.
    try:
        datum_future = CrawlDbDatum.query(
            CrawlDbDatum.url == page_url).fetch_async()
    except Exception as e:
        logging.warning("Failed create key, caused by invalid url:" +
                        page_url + ":" + e.message)
    # Fetch the target content.
    fetcher = fetchers.SimpleHttpFetcher(1, fetcher_policy_yaml.fetcher_policy)
    stored_url = None
    if target_url.startswith("/"):
        # Relative url: prepend the page's domain.
        target_url = "%s%s" % (
            _getCrawlDatum(datum_future).extract_domain_url, target_url)
    try:
        fetch_result = fetcher.get(target_url)
        if fetch_result:
            # Save the fetched bytes into the blobstore and obtain a
            # serving url for the resulting blob.
            blob_io = files.blobstore.create(
                mime_type=fetch_result.get("mime_type"),
                _blobinfo_uploaded_filename=fetch_result.get("fetched_url"))
            with files.open(blob_io, 'a') as blob_file:
                blob_file.write(fetch_result.get("content"))
            files.finalize(blob_io)
            stored_url = images.get_serving_url(
                str(files.blobstore.get_blob_key(blob_io)))
    except Exception as e:
        logging.warning("Fetch Error Occurs:" + e.message)
    # Record the stored content in the datastore.
    crawl_db_datum = _getCrawlDatum(datum_future)
    if crawl_db_datum and stored_url is not None:
        ContentDbDatum(
            parent=crawl_db_datum.key,
            fetched_url=fetch_result.get("fetched_url"),
            stored_url=stored_url,
            content_type=fetch_result.get("mime_type"),
            content_size=fetch_result.get("content_length"),
            http_headers=str(fetch_result.get("headers"))).put()
    yield "%s:%s" % (target_url, stored_url)