Example #1
def _fetchContentMap(binary_record):
    """Map function for fetching content.

    The fetched content is stored to the blobstore.

    Args:
      binary_record: serialized KeyValue data; the key is the URL of the
        target page and the value is the URL of the target to fetch.

    Yields:
      A string of the form "<target_url>:<stored_url>".
    """
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    page_url = proto.key()
    target_url = proto.value()
    # Asynchronously look up the CrawlDbDatum for this page.
    crawl_db_datum_future = None
    try:
        query = CrawlDbDatum.query(CrawlDbDatum.url == page_url)
        crawl_db_datum_future = query.fetch_async()
    except Exception as e:
        logging.warning("Failed to create key, caused by invalid url:" +
                        page_url + ":" + str(e))

    # Start the fetch.
    fetcher = fetchers.SimpleHttpFetcher(1, fetcher_policy_yaml.fetcher_policy)
    stored_url = None
    if re.match("^/", target_url):
        # The target URL is relative: prepend the source page's domain.
        crawl_db_datum = _getCrawlDatum(crawl_db_datum_future)
        target_url = "%s%s" % (crawl_db_datum.extract_domain_url, target_url)

    try:
        fetch_result = fetcher.get(target_url)
        if fetch_result:
            # Store the fetched content to the blobstore.
            blob_io = files.blobstore.create(
                mime_type=fetch_result.get("mime_type"),
                _blobinfo_uploaded_filename=fetch_result.get("fetched_url"))
            with files.open(blob_io, 'a') as f:
                f.write(fetch_result.get("content"))
            files.finalize(blob_io)
            blob_key = files.blobstore.get_blob_key(blob_io)
            stored_url = images.get_serving_url(str(blob_key))
    except Exception as e:
        logging.warning("Fetch Error Occurs:" + e.message)

    # Put the content entity to the datastore.
    crawl_db_datum = _getCrawlDatum(crawl_db_datum_future)
    if crawl_db_datum and stored_url is not None:
        entity = ContentDbDatum(
            parent=crawl_db_datum.key,
            fetched_url=fetch_result.get("fetched_url"),
            stored_url=stored_url,
            content_type=fetch_result.get("mime_type"),
            content_size=fetch_result.get("content_length"),
            http_headers=str(fetch_result.get("headers")))
        entity.put()

    yield "%s:%s" % (target_url, stored_url)
Example #2
    def finalized(self):
        """Sends an email indicating that this Pipeline has completed.

        For developer convenience. Automatically called for root Pipelines
        that do not override the default action.
        """
        status = "successful"
        if self.was_aborted:
            status = "aborted"
        url = memcache.get("url")
        email = memcache.get("email")
        base_dir = os.path.realpath(os.path.dirname(__file__))
        # Configure jinja for internal templates
        env = Environment(
            autoescape=True,
            extensions=["jinja2.ext.i18n"],
            loader=FileSystemLoader(os.path.join(base_dir, "templates")),
        )
        subject = "Your Fetcher Job is " + status
        crawl_db_datums = CrawlDbDatum.query(CrawlDbDatum.url == url).fetch()
        crawl_db_datum = crawl_db_datums[0]
        content_db_datums = ContentDbDatum.query(ancestor=crawl_db_datum.key).fetch_async()
        fetched_db_datums = FetchedDbDatum.query(ancestor=crawl_db_datum.key).fetch()
        attachments = []
        if len(fetched_db_datums) > 0:
            fetched_db_datum = fetched_db_datums[0]
            attachments.append(("fetched_content.html", fetched_db_datum.fetched_content))
        link_db_datums = LinkDbDatum.query(ancestor=crawl_db_datum.key).fetch_async()
        html = env.get_template("mail_template.html").render(
            url=url, contents=content_db_datums.get_result(),
            links=link_db_datums.get_result())
        attachments.append(("sendmail.html", html))
        sender = "*****@*****.**"
        mail.send_mail(
            sender=sender, to=email, subject=subject, body="FetchResults", html=html, attachments=attachments
        )
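
The "url" and "email" values read from memcache above must have been stored earlier in the job's lifecycle; that code is not shown here. A hypothetical sketch of the setup, assuming the handler that starts the job seeds both keys:

from google.appengine.api import memcache

# Hypothetical setup: the handler that kicks off the crawl stores the
# target URL and the notification address, which finalized() reads back
# once the pipeline completes.
memcache.set("url", "http://foo.com/bar.html")
memcache.set("email", "crawl-admin@example.com")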
Example #3
    def testSuccessfulRun(self):
        """Runs _FetchContentPipeline over two mock records and verifies outputs."""
        file_name1 = self.createMockData(
            ("https://developers.google.com/appengine/",
             "http://k.yimg.jp/images/top/sp/logo.gif"))
        file_name2 = self.createMockData(
            ("https://developers.google.com/appengine/",
             "/appengine/images/slide1.png"))
        datum = CrawlDbDatum(
            parent=ndb.Key(CrawlDbDatum,
                           "https://developers.google.com/appengine/"),
            url="https://developers.google.com/appengine/",
            extract_domain_url="https://developers.google.com",
            last_status=pipelines.UNFETCHED)
        datum.put()
        resource = self.getResource("slide1.png")
        static_content = resource.read()
        self.setReturnValue(content=static_content,
                            headers={
                                "Content-Length": len(static_content),
                                "Content-Type": "image/png"
                            })
        p = pipelines._FetchContentPipeline("FetchContentPipeline",
                                            [file_name1, file_name2])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        finished_map = pipelines._FetchSetsBufferPipeline.from_id(
            p.pipeline_id)

        # The output files should be readable blobstore paths.
        file_paths = finished_map.outputs.default.value
        self.assertTrue(len(file_paths) > 0)
        self.assertTrue(file_paths[0].startswith("/blobstore/"))

        reader = input_readers.RecordsReader(file_paths, 0)
        for binary_record in reader:
            proto = file_service_pb.KeyValue()
            proto.ParseFromString(binary_record)
            key = proto.key()
            value = proto.value()
            self.assertTrue(key is not None)
            self.assertTrue(value is not None)

        query = CrawlDbDatum.query(
            CrawlDbDatum.url == "https://developers.google.com/appengine/")
        crawl_db_datums = query.fetch()
        self.assertTrue(len(crawl_db_datums) > 0)
        key = crawl_db_datums[0].key
        content_datums = ContentDbDatum.query(ancestor=key).fetch()
        self.assertEqual(2, len(content_datums))
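
The createMockData helper used at the top of this test is not shown. A hypothetical sketch of it as a test-class method, assuming it encodes one (page_url, fetch_url) pair as a KeyValue record in a blobstore file (the records writer is assumed to come from the App Engine files API, alongside the files and file_service_pb modules already used above):

    def createMockData(self, data):
        # Hypothetical helper: write one encoded KeyValue record so that
        # RecordsReader can iterate it later in the test.
        file_name = files.blobstore.create(
            mime_type="application/octet-stream")
        with files.open(file_name, "a") as f:
            with records.RecordsWriter(f) as w:
                proto = file_service_pb.KeyValue()
                proto.set_key(data[0])
                proto.set_value(data[1])
                w.write(proto.Encode())
        files.finalize(file_name)
        return files.blobstore.get_file_name(
            files.blobstore.get_blob_key(file_name))

Under this assumption, files.blobstore.get_file_name returns a path of the form "/blobstore/<key>", which is the shape RecordsReader expects.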
Example #4

    def testFetchEndToEnd(self):
        """End-to-end test of the fetcher job."""
        createMockCrawlDbDatum("http://foo.com/bar.html")
        static_robots = "User-agent: test\nDisallow: /content_0\nDisallow: /content_1\nDisallow: /content_3"
        self.setReturnValue(url="http://foo.com/robots.txt",
                            content=static_robots,
                            headers={"Content-Length": len(static_robots)})
        # The static content is read from a test resource file.
        resource = self.getResource("sample_content.html")
        static_content = resource.read()
        static_content_length = len(static_content)
        self.setReturnValue(url="http://foo.com/bar.html",
                            content=static_content,
                            headers={
                                "Content-Length": static_content_length,
                                "Content-Type": "text/html"
                            })
        resource_image = self.getResource("slide1.png")
        static_content_image = resource_image.read()
        static_content_length = len(static_content_image)
        self.setReturnValue(url="http://foo.com/images/slide1.png",
                            content=static_content_image,
                            headers={
                                "Content-Length": static_content_length,
                                "Content-Type": "image/png"
                            })
        p = pipelines.FetcherPipeline(
            "FetcherPipeline",
            params={"entity_kind": "lakshmi.datum.CrawlDbDatum"},
            parser_params={"text/html": __name__ + ".htmlParser"},
            shards=2)
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        crawl_db_datums = CrawlDbDatum.query(
            CrawlDbDatum.url == "http://foo.com/bar.html").fetch()
        crawl_db_datum = crawl_db_datums[0]
        self.assertEqual(pipelines.FETCHED, crawl_db_datum.last_status)
        fetched_db_datums = FetchedDbDatum.query(
            ancestor=crawl_db_datum.key).fetch()
        fetched_db_datum = fetched_db_datums[0]
        self.assertTrue(fetched_db_datum is not None)
        self.assertTrue("http://foo.com/bar.html",
                        fetched_db_datum.fetched_url)
        link_db_datums = LinkDbDatum.query(ancestor=crawl_db_datum.key).fetch()
        self.assertTrue(len(link_db_datums) > 0)
        contents_db_datums = ContentDbDatum.query(
            ancestor=crawl_db_datum.key).fetch()
        self.assertTrue(len(contents_db_datums) > 0)
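
The createMockCrawlDbDatum helper is likewise not shown. A minimal sketch, assuming it seeds one unfetched CrawlDbDatum and derives extract_domain_url from the URL's scheme and host, mirroring the datum construction in Example #3:

import urlparse  # Python 2, matching the App Engine runtime of these examples.

def createMockCrawlDbDatum(url):
    # Hypothetical helper: seed an unfetched entity for the given URL.
    parts = urlparse.urlsplit(url)
    datum = CrawlDbDatum(
        parent=ndb.Key(CrawlDbDatum, url),
        url=url,
        extract_domain_url="%s://%s" % (parts.scheme, parts.netloc),
        last_status=pipelines.UNFETCHED)
    datum.put()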