Пример #1
0
    def finalized(self):
        """Email the fetch results to the address stored in memcache.

        Reads the crawled URL and the recipient address from the memcache
        keys "url" and "email", renders ``templates/mail_template.html``
        (looked up next to this module) and sends it with ``mail.send_mail``.
        The rendered page is attached as "sendmail.html"; when a
        ``FetchedDbDatum`` exists under the crawl datum, its
        ``fetched_content`` is attached as "fetched_content.html".

        If the URL, the recipient or the matching ``CrawlDbDatum`` is
        missing, a warning is logged and no mail is sent.
        """
        import logging

        status = "aborted" if self.was_aborted else "successful"
        url = memcache.get("url")
        email = memcache.get("email")
        if not url or not email:
            # Without both values there is nothing to report and nobody to
            # report to; bail out instead of failing further down.
            logging.warning(
                "Skipping fetcher result mail: url=%r, email=%r", url, email)
            return

        crawl_db_datums = CrawlDbDatum.query(CrawlDbDatum.url == url).fetch()
        if not crawl_db_datums:
            # Indexing an empty result list would raise IndexError.
            logging.warning(
                "Skipping fetcher result mail: no CrawlDbDatum for %r", url)
            return
        crawl_db_datum = crawl_db_datums[0]

        # Configure jinja for internal templates
        base_dir = os.path.realpath(os.path.dirname(__file__))
        env = Environment(
            autoescape=True,
            extensions=["jinja2.ext.i18n"],
            loader=FileSystemLoader(os.path.join(base_dir, "templates")),
        )

        # NOTE(review): the two fetch_async() results are handed to the
        # template as-is (presumably futures, not entity lists) -- confirm
        # that mail_template.html resolves them itself.
        content_db_datums = ContentDbDatum.query(ancestor=crawl_db_datum.key).fetch_async()
        fetched_db_datums = FetchedDbDatum.query(ancestor=crawl_db_datum.key).fetch()
        attachments = []
        if fetched_db_datums:
            attachments.append(("fetched_content.html", fetched_db_datums[0].fetched_content))
        link_db_datums = LinkDbDatum.query(ancestor=crawl_db_datum.key).fetch_async()

        html = env.get_template("mail_template.html").render(
            url=url, contents=content_db_datums, links=link_db_datums)
        attachments.append(("sendmail.html", html))

        mail.send_mail(
            sender="*****@*****.**",
            to=email,
            subject="Your Fetcher Job is " + status,
            body="FetchResults",
            html=html,
            attachments=attachments)
Пример #2
0
    def testSuccessfulRun(self):
        """Run _FetchContentPipeline on two mock files and check its output.

        Verifies that the pipeline reports at least one blobstore output
        file, that every record in the output parses as a key/value pair,
        and that two ContentDbDatum entities are stored under the crawl
        datum of the page URL.
        """
        page_url = "https://developers.google.com/appengine/"
        targets = (
            "http://k.yimg.jp/images/top/sp/logo.gif",
            "/appengine/images/slide1.png",
        )
        # One mock input file per (page, target) pair, in the given order.
        input_files = [
            self.createMockData((page_url, target)) for target in targets
        ]

        CrawlDbDatum(
            parent=ndb.Key(CrawlDbDatum, page_url),
            url=page_url,
            extract_domain_url="https://developers.google.com",
            last_status=pipelines.UNFETCHED).put()

        # Every stubbed fetch answers with the PNG fixture.
        image_bytes = self.getResource("slide1.png").read()
        response_headers = {
            "Content-Length": len(image_bytes),
            "Content-Type": "image/png",
        }
        self.setReturnValue(content=image_bytes, headers=response_headers)

        pipeline = pipelines._FetchContentPipeline(
            "FetchContentPipeline", input_files)
        pipeline.start()
        test_support.execute_until_empty(self.taskqueue)
        finished = pipelines._FetchSetsBufferPipeline.from_id(
            pipeline.pipeline_id)

        # Can open files
        output_paths = finished.outputs.default.value
        self.assertTrue(len(output_paths) > 0)
        self.assertTrue(output_paths[0].startswith("/blobstore/"))

        for raw_record in input_readers.RecordsReader(output_paths, 0):
            pair = file_service_pb.KeyValue()
            pair.ParseFromString(raw_record)
            record_key, record_value = pair.key(), pair.value()
            self.assertTrue(record_key is not None)
            self.assertTrue(record_value is not None)

        matches = CrawlDbDatum.query(CrawlDbDatum.url == page_url).fetch()
        self.assertTrue(len(matches) > 0)
        stored_contents = ContentDbDatum.query(
            ancestor=matches[0].key).fetch()
        self.assertEqual(2, len(stored_contents))
    def testFetchEndToEnd(self):
        """End-to-end test of the fetcher job.

        Stubs the responses for robots.txt, an HTML page and a PNG image,
        runs FetcherPipeline over the mock CrawlDbDatum, and checks that the
        crawl datum is marked as fetched and that fetched, link and content
        datums were stored under it.
        """
        createMockCrawlDbDatum("http://foo.com/bar.html")
        static_robots = "User-agent: test\nDisallow: /content_0\nDisallow: /content_1\nDisallow: /content_3"
        self.setReturnValue(url="http://foo.com/robots.txt",
                            content=static_robots,
                            headers={"Content-Length": len(static_robots)})
        # static resource is read from resource
        resource = self.getResource("sample_content.html")
        static_content = resource.read()
        static_content_length = len(static_content)
        self.setReturnValue(url="http://foo.com/bar.html",
                            content=static_content,
                            headers={
                                "Content-Length": static_content_length,
                                "Content-Type": "text/html"
                            })
        resource_image = self.getResource("slide1.png")
        static_content_image = resource_image.read()
        static_content_length = len(static_content_image)
        self.setReturnValue(url="http://foo.com/images/slide1.png",
                            content=static_content_image,
                            headers={
                                "Content-Length": static_content_length,
                                "Content-Type": "image/png"
                            })
        p = pipelines.FetcherPipeline(
            "FetcherPipeline",
            params={"entity_kind": "lakshmi.datum.CrawlDbDatum"},
            parser_params={"text/html": __name__ + ".htmlParser"},
            shards=2)
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        crawl_db_datums = CrawlDbDatum.query(
            CrawlDbDatum.url == "http://foo.com/bar.html").fetch()
        # Check for results before indexing so a missing datum is reported
        # as an assertion failure rather than an IndexError.
        self.assertTrue(len(crawl_db_datums) > 0)
        crawl_db_datum = crawl_db_datums[0]
        # assertTrue(a, b) uses b only as the failure message and passes for
        # any truthy a; assertEqual performs the intended comparison.
        self.assertEqual(pipelines.FETCHED, crawl_db_datum.last_status)
        fetched_db_datums = FetchedDbDatum.query(
            ancestor=crawl_db_datum.key).fetch()
        self.assertTrue(len(fetched_db_datums) > 0)
        fetched_db_datum = fetched_db_datums[0]
        self.assertTrue(fetched_db_datum is not None)
        self.assertEqual("http://foo.com/bar.html",
                         fetched_db_datum.fetched_url)
        link_db_datums = LinkDbDatum.query(ancestor=crawl_db_datum.key).fetch()
        self.assertTrue(len(link_db_datums) > 0)
        contents_db_datums = ContentDbDatum.query(
            ancestor=crawl_db_datum.key).fetch()
        self.assertTrue(len(contents_db_datums) > 0)
 def testFetchEndToEnd(self):
   """End-to-end test of the fetcher job.

   Stubs the responses for robots.txt, an HTML page and a PNG image,
   runs FetcherPipeline over the mock CrawlDbDatum, and checks that the
   crawl datum is marked as fetched and that fetched, link and content
   datums were stored under it.
   """
   createMockCrawlDbDatum("http://foo.com/bar.html")
   static_robots = "User-agent: test\nDisallow: /content_0\nDisallow: /content_1\nDisallow: /content_3"
   self.setReturnValue(url="http://foo.com/robots.txt",
       content=static_robots,
       headers={"Content-Length": len(static_robots)})
   # static resource is read from resource
   resource = self.getResource("sample_content.html")
   static_content = resource.read()
   static_content_length = len(static_content)
   self.setReturnValue(url="http://foo.com/bar.html",
       content=static_content,
       headers={"Content-Length": static_content_length,
           "Content-Type": "text/html"})
   resource_image = self.getResource("slide1.png")
   static_content_image = resource_image.read()
   static_content_length = len(static_content_image)
   self.setReturnValue(url="http://foo.com/images/slide1.png",
       content=static_content_image,
       headers={"Content-Length": static_content_length,
           "Content-Type": "image/png"})
   p = pipelines.FetcherPipeline("FetcherPipeline",
       params={
         "entity_kind": "lakshmi.datum.CrawlDbDatum"
       },
       parser_params={
         "text/html": __name__ + ".htmlParser"
       },
       shards=2)
   p.start()
   test_support.execute_until_empty(self.taskqueue)

   crawl_db_datums = CrawlDbDatum.query(CrawlDbDatum.url == "http://foo.com/bar.html").fetch()
   # Check for results before indexing so a missing datum is reported
   # as an assertion failure rather than an IndexError.
   self.assertTrue(len(crawl_db_datums) > 0)
   crawl_db_datum = crawl_db_datums[0]
   # assertTrue(a, b) uses b only as the failure message and passes for
   # any truthy a; assertEqual performs the intended comparison.
   self.assertEqual(pipelines.FETCHED, crawl_db_datum.last_status)
   fetched_db_datums = FetchedDbDatum.query(ancestor=crawl_db_datum.key).fetch()
   self.assertTrue(len(fetched_db_datums) > 0)
   fetched_db_datum = fetched_db_datums[0]
   self.assertTrue(fetched_db_datum is not None)
   self.assertEqual("http://foo.com/bar.html", fetched_db_datum.fetched_url)
   link_db_datums = LinkDbDatum.query(ancestor=crawl_db_datum.key).fetch()
   self.assertTrue(len(link_db_datums) > 0)
   contents_db_datums = ContentDbDatum.query(ancestor=crawl_db_datum.key).fetch()
   self.assertTrue(len(contents_db_datums) > 0)
Пример #5
0
    def finalized(self):
        """Email the fetch results to the address stored in memcache.

        Reads the crawled URL and the recipient address from the memcache
        keys "url" and "email", renders ``templates/mail_template.html``
        (looked up next to this module) and sends it with ``mail.send_mail``.
        The rendered page is attached as "sendmail.html"; when a
        ``FetchedDbDatum`` exists under the crawl datum, its
        ``fetched_content`` is attached as "fetched_content.html".

        If the URL, the recipient or the matching ``CrawlDbDatum`` is
        missing, a warning is logged and no mail is sent.
        """
        import logging

        status = 'aborted' if self.was_aborted else 'successful'
        url = memcache.get("url")
        email = memcache.get("email")
        if not url or not email:
            # Without both values there is nothing to report and nobody to
            # report to; bail out instead of failing further down.
            logging.warning(
                "Skipping fetcher result mail: url=%r, email=%r", url, email)
            return

        crawl_db_datums = CrawlDbDatum.query(
            CrawlDbDatum.url == url).fetch()
        if not crawl_db_datums:
            # Indexing an empty result list would raise IndexError.
            logging.warning(
                "Skipping fetcher result mail: no CrawlDbDatum for %r", url)
            return
        crawl_db_datum = crawl_db_datums[0]

        # Configure jinja for internal templates
        base_dir = os.path.realpath(os.path.dirname(__file__))
        env = Environment(autoescape=True,
                          extensions=['jinja2.ext.i18n'],
                          loader=FileSystemLoader(
                              os.path.join(base_dir, 'templates')))

        # NOTE(review): the two fetch_async() results are handed to the
        # template as-is (presumably futures, not entity lists) -- confirm
        # that mail_template.html resolves them itself.
        content_db_datums = ContentDbDatum.query(
            ancestor=crawl_db_datum.key).fetch_async()
        fetched_db_datums = FetchedDbDatum.query(
            ancestor=crawl_db_datum.key).fetch()
        attachments = []
        if fetched_db_datums:
            attachments.append(
                ("fetched_content.html", fetched_db_datums[0].fetched_content))
        link_db_datums = LinkDbDatum.query(
            ancestor=crawl_db_datum.key).fetch_async()

        html = env.get_template("mail_template.html").render(
            url=url, contents=content_db_datums, links=link_db_datums)
        attachments.append(("sendmail.html", html))

        mail.send_mail(sender="*****@*****.**",
                       to=email,
                       subject="Your Fetcher Job is " + status,
                       body="FetchResults",
                       html=html,
                       attachments=attachments)
Пример #6
0
  def testSuccessfulRun(self):
    """Run _FetchContentPipeline on two mock files and check its output.

    Verifies that the pipeline reports at least one blobstore output
    file, that every record in the output parses as a key/value pair,
    and that two ContentDbDatum entities are stored under the crawl
    datum of the page URL.
    """
    page_url = "https://developers.google.com/appengine/"
    targets = (
        "http://k.yimg.jp/images/top/sp/logo.gif",
        "/appengine/images/slide1.png",
    )
    # One mock input file per (page, target) pair, in the given order.
    input_files = [
        self.createMockData((page_url, target)) for target in targets
    ]

    CrawlDbDatum(
        parent=ndb.Key(CrawlDbDatum, page_url),
        url=page_url,
        extract_domain_url="https://developers.google.com",
        last_status=pipelines.UNFETCHED).put()

    # Every stubbed fetch answers with the PNG fixture.
    image_bytes = self.getResource("slide1.png").read()
    response_headers = {
        "Content-Length": len(image_bytes),
        "Content-Type": "image/png",
    }
    self.setReturnValue(content=image_bytes, headers=response_headers)

    pipeline = pipelines._FetchContentPipeline(
        "FetchContentPipeline", input_files)
    pipeline.start()
    test_support.execute_until_empty(self.taskqueue)
    finished = pipelines._FetchSetsBufferPipeline.from_id(
        pipeline.pipeline_id)

    # Can open files
    output_paths = finished.outputs.default.value
    self.assertTrue(len(output_paths) > 0)
    self.assertTrue(output_paths[0].startswith("/blobstore/"))

    for raw_record in input_readers.RecordsReader(output_paths, 0):
      pair = file_service_pb.KeyValue()
      pair.ParseFromString(raw_record)
      record_key, record_value = pair.key(), pair.value()
      self.assertTrue(record_key is not None)
      self.assertTrue(record_value is not None)

    matches = CrawlDbDatum.query(CrawlDbDatum.url == page_url).fetch()
    self.assertTrue(len(matches) > 0)
    stored_contents = ContentDbDatum.query(
        ancestor=matches[0].key).fetch()
    self.assertEqual(2, len(stored_contents))