Exemplo n.º 1
0
    def testTruncationWithKeepAlive(self):
        """Fetching the same size-limited resource twice over a keep-alive
        connection must truncate both results identically.
        """
        fetcher_policy_yaml = self.getCustomFetcherPolicy(
            "fetcher_policy_sizes.yaml")
        resource = self.getResource("cloudysunny14.html")
        static_content = resource.read()
        static_content_length = len(static_content)
        self.setReturnValue(content=static_content,
                            headers={
                                "Content-Length": static_content_length,
                                "Content-Type": "text/html"
                            })
        simple_http_fetcher = fetchers.SimpleHttpFetcher(
            1, fetcher_policy_yaml.fetcher_policy)
        url = "http://static_resource/cloudysunny14.html"
        result_left = simple_http_fetcher.get(url)
        result_right = simple_http_fetcher.get(url)
        self.assertEqual(1000, result_left.get("content_length"))
        self.assertEqual(1000, result_right.get("content_length"))
        # BUG FIX: the original used map(self.assertLR, ...), which is lazy
        # in Python 3 and would silently skip every assertion; iterate
        # explicitly so the per-element checks actually execute.
        for left_elem, right_elem in zip(result_left.get("content"),
                                         result_right.get("content")):
            self.assertLR(left_elem, right_elem)

        # A second resource (an image) must NOT be truncated to 1000 bytes,
        # since the size policy is keyed by content type.
        resource = self.getResource("mining.png")
        static_content = resource.read()
        static_content_length = len(static_content)
        self.setReturnValue(content=static_content,
                            headers={
                                "Content-Length": static_content_length,
                                "Content-Type": "image/png"
                            })
        url = "http://static_resource/mining.png"
        result = simple_http_fetcher.get(url)
        self.assertTrue(result.get("content_length") > 1000)
Exemplo n.º 2
0
def _robots_fetch_map(data):
    """Map function that fetches robots.txt for the given url.

    The fetched content is stored to Blobstore downstream, where it is
    parsed to set the score for urls.

    Args:
        data: (key, value) pair where key is the position and value is
            the url.

    Yields:
        (url, content) tuple: the domain url and the content of its
        robots.txt — a deny-all policy when the fetch fails.
    """
    fetcher = fetchers.SimpleHttpFetcher(1, fetcher_policy_yaml.fetcher_policy)
    k, url = data
    # Lazy %-style args avoid building the message when DEBUG is off.
    logging.debug("data%s:%s", k, url)
    content = ""
    try:
        result = fetcher.get("%s/robots.txt" % str(url))
        content = result.get("content")
    except Exception as e:
        # BUG FIX: Exception.message does not exist in Python 3; let the
        # logging module stringify the exception instead.
        logging.warning("Robots.txt Fetch Error Occurs:%s", e)
        # Fall back to the most conservative policy: disallow everything.
        content = "User-agent: *\nDisallow: /"

    yield (url, content)
Exemplo n.º 3
0
def _fetchContentMap(binary_record):
    """Map function that fetches content and stores it to blobstore.

    Args:
        binary_record: serialized KeyValue proto where key is the url of
            the target page and value is the url of the fetch target.

    Yields:
        "<target_url>:<stored_url>" for the fetched content.
    """
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    page_url = proto.key()
    target_url = proto.value()
    # Kick off the CrawlDbDatum lookup asynchronously; resolved below.
    # BUG FIX: initialize the future so a failed query setup no longer
    # leaves it unbound (the original raised NameError further down).
    crawl_db_datum_future = None
    try:
        query = CrawlDbDatum.query(CrawlDbDatum.url == page_url)
        crawl_db_datum_future = query.fetch_async()
    except Exception as e:
        # BUG FIX: Exception.message does not exist in Python 3.
        logging.warning("Failed create key, caused by invalid url:%s:%s",
                        page_url, e)

    #start fetch
    fetcher = fetchers.SimpleHttpFetcher(1, fetcher_policy_yaml.fetcher_policy)
    stored_url = None
    if re.match("^/", target_url):
        # Relative target url: resolve it against the page's domain url.
        crawl_db_datum = _getCrawlDatum(crawl_db_datum_future)
        target_url = "%s%s" % (crawl_db_datum.extract_domain_url, target_url)

    try:
        fetch_result = fetcher.get(target_url)
        if fetch_result:
            #Storing to blobstore
            blob_io = files.blobstore.create(
                mime_type=fetch_result.get("mime_type"),
                _blobinfo_uploaded_filename=fetch_result.get("fetched_url"))
            with files.open(blob_io, 'a') as f:
                f.write(fetch_result.get("content"))
            files.finalize(blob_io)
            blob_key = files.blobstore.get_blob_key(blob_io)
            stored_url = images.get_serving_url(str(blob_key))
    except Exception as e:
        logging.warning("Fetch Error Occurs:%s", e)

    #Put content to datastore.
    crawl_db_datum = _getCrawlDatum(crawl_db_datum_future)
    if crawl_db_datum and stored_url is not None:
        entity = ContentDbDatum(
            parent=crawl_db_datum.key,
            fetched_url=fetch_result.get("fetched_url"),
            stored_url=stored_url,
            content_type=fetch_result.get("mime_type"),
            content_size=fetch_result.get("content_length"),
            http_headers=str(fetch_result.get("headers")))
        entity.put()

    yield "%s:%s" % (target_url, stored_url)
Exemplo n.º 4
0
 def testRealFetch(self):
     """Smoke test: fetch a live page with the default fetcher policy
     and check that a result comes back.
     """
     # NOTE: the original comment claimed min_response_rate was set to
     # 20KByte/Sec, but no such assignment existed — removed as stale.
     fetcher_policy_yaml = configuration.FetcherPolicyYaml.create_default_policy(
     )
     simple_http_fetcher = fetchers.SimpleHttpFetcher(
         1, fetcher_policy_yaml.fetcher_policy)
     url = "http://cloudysunny14.blogspot.jp/"
     result = simple_http_fetcher.get(url)
     # assertIsNotNone gives a clearer failure message than
     # assertTrue(result is not None).
     self.assertIsNotNone(result)
Exemplo n.º 5
0
 def testMimeTypeFilteringNoContentType(self):
     """A response without a Content-Type header must pass the
     mime-type filter without raising.
     """
     policy_yaml = self.getCustomFetcherPolicy("fetcher_policy.yaml")
     self.setReturnValue(headers={"Content-Length": 20000},
                         status_code=200,
                         final_url=self.redirectUrl)
     fetcher = fetchers.SimpleHttpFetcher(1, policy_yaml.fetcher_policy)
     fetcher.get("http://static_resource/simple-page.html")
Exemplo n.º 6
0
 def testMimeTypeFilteringWithCharset(self):
     """A Content-Type carrying a charset parameter must still pass
     the mime-type filter without raising.
     """
     policy_yaml = self.getCustomFetcherPolicy("fetcher_policy.yaml")
     response_headers = {
         "Content-Length": 20000,
         "Content-Type": "text/html; charset=UTF-8",
     }
     self.setReturnValue(headers=response_headers,
                         status_code=200,
                         final_url=self.redirectUrl)
     fetcher = fetchers.SimpleHttpFetcher(1, policy_yaml.fetcher_policy)
     fetcher.get("http://cloudysunny14.blogspot.jp/")
Exemplo n.º 7
0
 def testRedirectPolicy(self):
     """With a redirect policy of 'none', a 301 response must raise
     RedirectError instead of being followed.
     """
     policy_yaml = self.getCustomFetcherPolicy(
         "fetcher_policy_redirect_none.yaml")
     response_headers = {
         "Content-Length": 20000,
         "Content-Type": "text/html",
     }
     self.setReturnValue(headers=response_headers,
                         status_code=301,
                         final_url=self.redirectUrl)
     fetcher = fetchers.SimpleHttpFetcher(1, policy_yaml.fetcher_policy)
     self.assertRaises(errors.RedirectError, fetcher.get,
                       "http://static_resource/base")
Exemplo n.º 8
0
 def testNotTerminatingSlowServer(self):
     """With min_response_rate disabled, a slow server must not cause
     the fetch to abort.
     """
     # Stub serves 5000 bytes over 0.25 seconds (~20KByte/sec).
     response_headers = {
         "Content-Length": 5000,
         "Content-Type": "text/html",
     }
     self.setReturnValue(headers=response_headers, duration=0.25)
     policy_yaml = configuration.FetcherPolicyYaml.create_default_policy()
     # Disable the minimum response rate entirely.
     policy_yaml.fetcher_policy.min_response_rate = \
         configuration.NO_MIN_RESPONSE_RATE
     fetcher = fetchers.SimpleHttpFetcher(1, policy_yaml.fetcher_policy)
     fetcher.get("http://static_resource/simple-page.html")
Exemplo n.º 9
0
 def testMimeTypeFiltering(self):
     """A mime type outside the policy's accepted set must abort the
     fetch with AbortedFetchError.
     """
     policy_yaml = self.getCustomFetcherPolicy("fetcher_policy.yaml")
     response_headers = {
         "Content-Length": 20000,
         "Content-Type": "text/xml",
     }
     self.setReturnValue(headers=response_headers,
                         status_code=200,
                         final_url=self.redirectUrl)
     fetcher = fetchers.SimpleHttpFetcher(1, policy_yaml.fetcher_policy)
     self.assertRaises(errors.AbortedFetchError, fetcher.get,
                       "http://static_resource/simple-page.html")
Exemplo n.º 10
0
 def testRedirectHandling(self):
     """A redirected fetch must report the final (redirected) url as
     its fetched_url.
     """
     fetcher_policy_yaml = configuration.FetcherPolicyYaml.create_default_policy(
     )
     self.setReturnValue(headers={
         "Content-Length": 20000,
         "Content-Type": "text/html"
     },
                         final_url=self.redirectUrl)
     url = "http://static_resource/base"
     simple_http_fetcher = fetchers.SimpleHttpFetcher(
         1, fetcher_policy_yaml.fetcher_policy)
     result = simple_http_fetcher.get(url)
     # BUG FIX: assertTrue(a, b) treats b as the failure message, so the
     # original assertion always passed. assertEqual performs the intended
     # comparison (assumes self.redirectUrl is this literal — confirm).
     self.assertEqual("http://static_resource/redirect",
                      result.get("fetched_url"))
Exemplo n.º 11
0
 def testSlowServerTermination(self):
     """A server slower than min_response_rate must abort the fetch."""
     # Stub serves 20000 bytes over 2 seconds => 10KBytes/Sec, which is
     # below the 20KByte/Sec minimum configured below.
     response_headers = {
         "Content-Length": 20000,
         "Content-Type": "text/html",
     }
     self.setReturnValue(headers=response_headers, duration=2)
     policy_yaml = configuration.FetcherPolicyYaml.create_default_policy()
     # Require at least 20KByte/Sec.
     policy_yaml.fetcher_policy.min_response_rate = 20000
     fetcher = fetchers.SimpleHttpFetcher(1, policy_yaml.fetcher_policy)
     self.assertRaises(errors.AbortedFetchError, fetcher.get,
                       "http://static_resource/simple-page.html")
Exemplo n.º 12
0
 def testLargeContent(self):
     """Content larger than max_content_size must be truncated."""
     policy_yaml = configuration.FetcherPolicyYaml.create_default_policy()
     size_entry = policy_yaml.fetcher_policy.max_content_size[0]
     max_content_size = int(size_entry.size)
     # Advertise twice the allowed size so truncation has to kick in.
     response_headers = {
         "Content-Length": max_content_size * 2,
         "Content-Type": "text/html",
     }
     self.setReturnValue(headers=response_headers)
     fetcher = fetchers.SimpleHttpFetcher(1, policy_yaml.fetcher_policy)
     result = fetcher.get("http://static_resource/simple-page.html")
     self.assertTrue(
         result.get("content_length") <= max_content_size,
         "Should be truncate")
Exemplo n.º 13
0
 def testAcceptLanguage(self):
     """The fetcher's Accept-Language header should select the English
     variant of the stubbed content.
     """
     fetcher_policy_yaml = self.getCustomFetcherPolicy(
         "fetcher_policy.yaml")
     self.setReturnValue(headers={
         "Content-Length": 20000,
         "Content-Type": "text/html"
     },
                         status_code=200,
                         language_content={
                             "en": "English",
                             "ja": "Japanese"
                         },
                         final_url=self.redirectUrl)
     url = "http://static_resource/simple-page.html"
     simple_http_fetcher = fetchers.SimpleHttpFetcher(
         1, fetcher_policy_yaml.fetcher_policy)
     result = simple_http_fetcher.get(url)
     # BUG FIX: assertTrue("English", x) treats x as the failure message
     # and always passed. assertEqual performs the intended comparison
     # (assumes the stub serves the "en" variant — confirm against the
     # test harness).
     self.assertEqual("English", result.get("content"))
Exemplo n.º 14
0
 def testContentTypeHeader(self):
     """The fetch result must expose the response Content-Type header."""
     fetcher_policy_yaml = configuration.FetcherPolicyYaml.create_default_policy(
     )
     resource = self.getResource("cloudysunny14.html")
     static_content = resource.read()
     static_content_length = len(static_content)
     self.setReturnValue(content=static_content,
                         headers={
                             "Content-Length": static_content_length,
                             "Content-Type": "text/html"
                         })
     simple_http_fetcher = fetchers.SimpleHttpFetcher(
         1, fetcher_policy_yaml.fetcher_policy)
     url = "http://static_resource/cloudysunny14.html"
     result = simple_http_fetcher.get(url)
     header = result.get("headers")
     content_type = header["Content-Type"]
     # assertIsNotNone / assertEqual replace the `!= None` comparison and
     # the deprecated assertEquals alias with the modern unittest idioms.
     self.assertIsNotNone(content_type)
     self.assertEqual("text/html", content_type)
Exemplo n.º 15
0
def _fetchMap(binary_record):
    """Map function that fetches a url and records the result.

    Creates a FetchedDbDatum entity for the fetched page and updates the
    fetch status on every matching CrawlDbDatum entity.

    Args:
        binary_record: serialized KeyValue proto where key is the url to
            fetch and value is a boolean string telling whether it may
            be fetched.

    Yields:
        The fetched url followed by a newline, or "" when nothing was
        fetched.
    """
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    url = proto.key()
    could_fetch = _str2bool(proto.value())
    result = UNFETCHED
    fetched_url = ""
    fetch_date = None
    # Kick off the CrawlDbDatum lookup asynchronously; resolved below.
    # BUG FIX: initialize the future so a failed query setup no longer
    # leaves it unbound (the original raised NameError in the status
    # update at the bottom of this function).
    crawl_db_datum_future = None
    try:
        query = CrawlDbDatum.query(CrawlDbDatum.url == url)
        crawl_db_datum_future = query.fetch_async()
    except Exception as e:
        # BUG FIX: Exception.message does not exist in Python 3.
        logging.warning("Failed create key, caused by invalid url:%s:%s",
                        url, e)
        could_fetch = False

    if could_fetch:
        #start fetch
        fetcher = fetchers.SimpleHttpFetcher(
            1, fetcher_policy_yaml.fetcher_policy)
        try:
            fetch_result = fetcher.get(url)
            if fetch_result:
                #Storing to datastore
                crawl_db_datums = crawl_db_datum_future.get_result()
                fetche_datum = FetchedDbDatum(
                    parent=crawl_db_datums[0].key,
                    url=url,
                    fetched_url=fetch_result.get("fetched_url"),
                    fetch_time=fetch_result.get("time"),
                    fetched_content=fetch_result.get("content"),
                    content_type=fetch_result.get("mime_type"),
                    # NOTE(review): content_size is populated from
                    # "read_rate", same as response_rate below — looks
                    # like a copy-paste slip ("content_length"?). Kept
                    # as-is to preserve behavior; confirm upstream.
                    content_size=fetch_result.get("read_rate"),
                    response_rate=fetch_result.get("read_rate"),
                    http_headers=str(fetch_result.get("headers")))
                fetche_datum.put()
                #update time of last fetched
                result = FETCHED
                fetch_date = datetime.datetime.now()
                fetched_url = ("%s\n" % url)
        except Exception as e:
            logging.warning("Fetch Page Error Occurs:%s", e)
            result = FAILED
    else:
        result = FAILED

    # Update status on all matching datums; skipped when the lookup
    # itself failed and no future exists.
    if crawl_db_datum_future is not None:
        crawl_db_datums = crawl_db_datum_future.get_result()
        for datum in crawl_db_datums:
            datum.last_status = result
            datum.last_fetched = fetch_date
        ndb.put_multi(crawl_db_datums)

    yield fetched_url