Example #1
    def testSuccessfulRun(self):
        createMockCrawlDbDatum(2, 2, True)
        file_name1 = self.createMockData(("http://hoge_0.com/content_0", True))
        file_name2 = self.createMockData(
            ("http://hoge_1.com/content_0", False))
        static_content = "<html><body>TestContent</body></html>"
        self.setReturnValue(content=static_content,
                            headers={
                                "Content-Length": len(static_content),
                                "Content-Type": "text/html"
                            })
        p = pipelines._FetchPagePipeline("FetchPipeline",
                                         [file_name1, file_name2], 2)
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        finished_map = pipelines._FetchPagePipeline.from_id(p.pipeline_id)

        # Can open files
        file_paths = finished_map.outputs.default.value
        self.assertTrue(len(file_paths) > 0)
        self.assertTrue(file_paths[0].startswith("/blobstore/"))

        entities = CrawlDbDatum.query(
            CrawlDbDatum.url == "http://hoge_0.com/content_0").fetch()
        entity = entities[0]
        fetched_datum = FetchedDbDatum.query(ancestor=entity.key).fetch()
        self.assertTrue(len(fetched_datum) > 0)
Example #2
    def finalized(self):
        """Sends an email to admins indicating this Pipeline has completed.

    For developer convenience. Automatically called from finalized for root
    Pipelines that do not override the default action.
    """
        status = "successful"
        if self.was_aborted:
            status = "aborted"
        url = memcache.get("url")
        email = memcache.get("email")
        base_dir = os.path.realpath(os.path.dirname(__file__))
        # Configure jinja for internal templates
        env = Environment(
            autoescape=True,
            extensions=["jinja2.ext.i18n"],
            loader=FileSystemLoader(os.path.join(base_dir, "templates")),
        )
        subject = "Your Fetcher Job is " + status
        crawl_db_datums = CrawlDbDatum.query(CrawlDbDatum.url == url).fetch()
        crawl_db_datum = crawl_db_datums[0]
        # Issue the ancestor queries asynchronously; the results are resolved before rendering.
        content_db_datums = ContentDbDatum.query(ancestor=crawl_db_datum.key).fetch_async()
        fetched_db_datums = FetchedDbDatum.query(ancestor=crawl_db_datum.key).fetch()
        attachments = []
        if len(fetched_db_datums) > 0:
            fetched_db_datum = fetched_db_datums[0]
            attachments.append(("fetched_content.html", fetched_db_datum.fetched_content))
        link_db_datums = LinkDbDatum.query(ancestor=crawl_db_datum.key).fetch_async()
        # Resolve the async queries before handing the results to the template.
        html = env.get_template("mail_template.html").render(
            url=url,
            contents=content_db_datums.get_result(),
            links=link_db_datums.get_result())
        attachments.append(("sendmail.html", html))
        sender = "*****@*****.**"
        mail.send_mail(
            sender=sender, to=email, subject=subject, body="FetchResults", html=html, attachments=attachments
        )
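The templates/mail_template.html used by the render() call is not part of this listing. A minimal stand-in, assuming the template only needs the url, contents, and links variables passed above (the HTML body and the DictLoader setup below are purely illustrative), could look like:

# Hypothetical stand-in for templates/mail_template.html; only the variable
# names (url, contents, links) come from the render() call above.
from jinja2 import Environment, DictLoader

_MAIL_TEMPLATE = """\
<html><body>
<h3>Fetch results for {{ url }}</h3>
<ul>{% for link in links %}<li>{{ link.link_url }}</li>{% endfor %}</ul>
<ul>{% for content in contents %}<li>{{ content }}</li>{% endfor %}</ul>
</body></html>"""

env = Environment(autoescape=True,
                  loader=DictLoader({"mail_template.html": _MAIL_TEMPLATE}))
html = env.get_template("mail_template.html").render(
    url="http://example.com/", contents=[], links=[])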
Example #3
def _extract_content_urls_map(data):
    """Map function of extract outlinks from content.

  Function to be extracted and parsed to extract contents url with UDF.
  For example, You specified parser UDF for HTML, would like to
  fetch content from target page, and storing outlinks.
  implement default like this::

    def htmlParser(key, content):
      outlinks = re.findall(r'href=[\'"]?([^\'" >]+)', content)
      link_datums = []
      for link in outlinks:
        link_datum = LinkDbDatum(parent=key, link_url=link)
        link_datums.append(link_datum)
      ndb.put_multi_async(link_datums) 
      content_links = re.findall(r'src=[\'"]?([^\'" >]+)', content) 
      return content_links

  Note:Note:The above function to return the URL of the target of 
    url that will fetch in the next job(FetchContentPipeline)

  Args:
    data: key value data, that key is position, value is url.

  Returns:
    url: The page url.
  """
    k, url = data
    query = CrawlDbDatum.query(CrawlDbDatum.url == url)
    crawl_db_datum = query.fetch()
    key = crawl_db_datum[0].key
    fetched_datums = FetchedDbDatum.query(ancestor=key).fetch()
    fetched_datum = fetched_datums[0]
    content = None
    if fetched_datum is not None:
        content = fetched_datum.fetched_content
        mime_type = fetched_datum.content_type
        if content is not None:
            parsed_obj = None
            try:
                params = _get_parser_param(_PARSER_PARAM_KEY)
                parsed_obj = util.handler_for_name(params[mime_type])(key,
                                                                      content)
            except Exception as e:
                logging.warning("Can not handle for %s[params:%s]:%s" %
                                (mime_type, params, e.message))
            if parsed_obj is not None:
                for content_urls in parsed_obj:
                    yield (url, content_urls)
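The parser dispatch above uses util.handler_for_name from the mapreduce library, which turns the dotted UDF name stored in parser_params into a callable. A simplified illustration of that resolution step (not the library's actual implementation) is:

# Rough sketch of resolving a parser UDF such as "mymodule.htmlParser" from
# parser_params; mapreduce's util.handler_for_name also handles classes and
# bound methods, which this sketch ignores.
import importlib

def resolve_handler(dotted_name):
    module_name, attr_name = dotted_name.rsplit(".", 1)
    return getattr(importlib.import_module(module_name), attr_name)

# The resolved callable is invoked with the entity key and fetched content:
#   parser = resolve_handler(params[mime_type])
#   content_urls = parser(key, content)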
Example #4
    def testFetchEndToEnd(self):
        """Test for through of fetcher job"""
        createMockCrawlDbDatum("http://foo.com/bar.html")
        static_robots = "User-agent: test\nDisallow: /content_0\nDisallow: /content_1\nDisallow: /content_3"
        self.setReturnValue(url="http://foo.com/robots.txt",
                            content=static_robots,
                            headers={"Content-Length": len(static_robots)})
        # Static content is read from a resource file
        resource = self.getResource("sample_content.html")
        static_content = resource.read()
        static_content_length = len(static_content)
        self.setReturnValue(url="http://foo.com/bar.html",
                            content=static_content,
                            headers={
                                "Content-Length": static_content_length,
                                "Content-Type": "text/html"
                            })
        resource_image = self.getResource("slide1.png")
        static_content_image = resource_image.read()
        static_content_length = len(static_content_image)
        self.setReturnValue(url="http://foo.com/images/slide1.png",
                            content=static_content_image,
                            headers={
                                "Content-Length": static_content_length,
                                "Content-Type": "image/png"
                            })
        p = pipelines.FetcherPipeline(
            "FetcherPipeline",
            params={"entity_kind": "lakshmi.datum.CrawlDbDatum"},
            parser_params={"text/html": __name__ + ".htmlParser"},
            shards=2)
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        crawl_db_datums = CrawlDbDatum.query(
            CrawlDbDatum.url == "http://foo.com/bar.html").fetch()
        crawl_db_datum = crawl_db_datums[0]
        self.assertEqual(pipelines.FETCHED, crawl_db_datum.last_status)
        fetched_db_datums = FetchedDbDatum.query(
            ancestor=crawl_db_datum.key).fetch()
        self.assertTrue(len(fetched_db_datums) > 0)
        fetched_db_datum = fetched_db_datums[0]
        self.assertEqual("http://foo.com/bar.html",
                         fetched_db_datum.fetched_url)
        link_db_datums = LinkDbDatum.query(ancestor=crawl_db_datum.key).fetch()
        self.assertTrue(len(link_db_datums) > 0)
        contents_db_datums = ContentDbDatum.query(
            ancestor=crawl_db_datum.key).fetch()
        self.assertTrue(len(contents_db_datums) > 0)
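This test registers a module-level htmlParser UDF through parser_params, but the UDF itself is not shown in the listing. A minimal version consistent with the example in the _extract_content_urls_map docstring above (the lakshmi.datum import path for LinkDbDatum is an assumption) would be:

# Hypothetical htmlParser UDF referenced by parser_params in the test above;
# it mirrors the sketch in the _extract_content_urls_map docstring.
import re

from google.appengine.ext import ndb
from lakshmi.datum import LinkDbDatum  # assumed import path

def htmlParser(key, content):
    # Store each href target as a LinkDbDatum under the crawled page's key.
    outlinks = re.findall(r'href=[\'"]?([^\'" >]+)', content)
    ndb.put_multi_async([LinkDbDatum(parent=key, link_url=link)
                         for link in outlinks])
    # Return embedded resource URLs (src attributes) to fetch in the next job.
    return re.findall(r'src=[\'"]?([^\'" >]+)', content)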
Example #5
def _extract_content_urls_map(data):
  """Map function of extract outlinks from content.

  Function to be extracted and parsed to extract contents url with UDF.
  For example, You specified parser UDF for HTML, would like to
  fetch content from target page, and storing outlinks.
  implement default like this::

    def htmlParser(key, content):
      outlinks = re.findall(r'href=[\'"]?([^\'" >]+)', content)
      link_datums = []
      for link in outlinks:
        link_datum = LinkDbDatum(parent=key, link_url=link)
        link_datums.append(link_datum)
      ndb.put_multi_async(link_datums) 
      content_links = re.findall(r'src=[\'"]?([^\'" >]+)', content) 
      return content_links

  Note:Note:The above function to return the URL of the target of 
    url that will fetch in the next job(FetchContentPipeline)

  Args:
    data: key value data, that key is position, value is url.

  Returns:
    url: The page url.
  """
  k, url = data
  query = CrawlDbDatum.query(CrawlDbDatum.url==url)
  crawl_db_datum = query.fetch()
  key = crawl_db_datum[0].key
  fetched_datums = FetchedDbDatum.query(ancestor=key).fetch()
  fetched_datum = fetched_datums[0]
  content = None
  if fetched_datum is not None:
    content = fetched_datum.fetched_content
    mime_type = fetched_datum.content_type
    if content is not None:
      parsed_obj = None
      try:
        params = _get_parser_param(_PARSER_PARAM_KEY)
        parsed_obj = util.handler_for_name(params[mime_type])(key, content)
      except Exception as e:
        logging.warning("Can not handle for %s[params:%s]:%s"%(mime_type, params, e.message))
      if parsed_obj is not None:
        for content_urls in parsed_obj:
          yield (url, content_urls)
Example #6
 def testFetchEndToEnd(self):
   """Test for through of fetcher job"""
   createMockCrawlDbDatum("http://foo.com/bar.html")
   static_robots = "User-agent: test\nDisallow: /content_0\nDisallow: /content_1\nDisallow: /content_3"
   self.setReturnValue(url="http://foo.com/robots.txt",
       content=static_robots,
       headers={"Content-Length": len(static_robots)})
   # Static content is read from a resource file
   resource = self.getResource("sample_content.html")
   static_content = resource.read()
   static_content_length = len(static_content)
   self.setReturnValue(url="http://foo.com/bar.html",
       content=static_content,
       headers={"Content-Length": static_content_length,
           "Content-Type": "text/html"})
   resource_image = self.getResource("slide1.png")
   static_content_image = resource_image.read()
   static_content_length = len(static_content_image)
   self.setReturnValue(url="http://foo.com/images/slide1.png",
       content=static_content_image,
       headers={"Content-Length": static_content_length,
           "Content-Type": "image/png"})
   p = pipelines.FetcherPipeline("FetcherPipeline",
       params={
         "entity_kind": "lakshmi.datum.CrawlDbDatum"
       },
       parser_params={
         "text/html": __name__ + ".htmlParser"
       },
       shards=2)
   p.start()
   test_support.execute_until_empty(self.taskqueue)
   
   crawl_db_datums = CrawlDbDatum.query(CrawlDbDatum.url=="http://foo.com/bar.html").fetch()
   crawl_db_datum = crawl_db_datums[0]
   self.assertEqual(pipelines.FETCHED, crawl_db_datum.last_status)
   fetched_db_datums = FetchedDbDatum.query(ancestor=crawl_db_datum.key).fetch()
   self.assertTrue(len(fetched_db_datums) > 0)
   fetched_db_datum = fetched_db_datums[0]
   self.assertEqual("http://foo.com/bar.html", fetched_db_datum.fetched_url)
   link_db_datums = LinkDbDatum.query(ancestor=crawl_db_datum.key).fetch()
   self.assertTrue(len(link_db_datums)>0)
   contents_db_datums = ContentDbDatum.query(ancestor=crawl_db_datum.key).fetch()
   self.assertTrue(len(contents_db_datums)>0)
Example #7
    def finalized(self):
        """Sends an email to admins indicating this Pipeline has completed.

    For developer convenience. Automatically called from finalized for root
    Pipelines that do not override the default action.
    """
        status = 'successful'
        if self.was_aborted:
            status = 'aborted'
        url = memcache.get("url")
        email = memcache.get("email")
        base_dir = os.path.realpath(os.path.dirname(__file__))
        # Configure jinja for internal templates
        env = Environment(autoescape=True,
                          extensions=['jinja2.ext.i18n'],
                          loader=FileSystemLoader(
                              os.path.join(base_dir, 'templates')))
        subject = "Your Fetcher Job is " + status
        crawl_db_datums = CrawlDbDatum.query(
            CrawlDbDatum.url == url).fetch()
        crawl_db_datum = crawl_db_datums[0]
        content_db_datums = ContentDbDatum.query(
            ancestor=crawl_db_datum.key).fetch_async()
        fetched_db_datums = FetchedDbDatum.query(
            ancestor=crawl_db_datum.key).fetch()
        attachments = []
        if len(fetched_db_datums) > 0:
            fetched_db_datum = fetched_db_datums[0]
            attachments.append(
                ("fetched_content.html", fetched_db_datum.fetched_content))
        link_db_datums = LinkDbDatum.query(
            ancestor=crawl_db_datum.key).fetch_async()
        # Resolve the async queries before handing the results to the template.
        html = env.get_template("mail_template.html").render(
            url=url,
            contents=content_db_datums.get_result(),
            links=link_db_datums.get_result())
        attachments.append(("sendmail.html", html))
        sender = "*****@*****.**"
        mail.send_mail(sender=sender,
                       to=email,
                       subject=subject,
                       body="FetchResults",
                       html=html,
                       attachments=attachments)
Example #8
  def testSuccessfulRun(self):
    """Test extract outlinks by UDF."""
    resource_neg = self.getResource("cloudysunny14.html")
    static_content = resource_neg.read()
    createMockFetchedDatum("http://cloudysunny14.html", static_content, pipelines.FETCHED)
    file_name = self.createMockDataLine("http://cloudysunny14.html\n")
    p = pipelines._ExtractOutlinksPipeline("ExtractOutlinksPipeline",
        file_names=[file_name],
        parser_params={
          "text/html": __name__+"._htmlOutlinkParser"
        }) 
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    entities = CrawlDbDatum.query(CrawlDbDatum.url=="http://cloudysunny14.html").fetch()
    entity = entities[0]
    fetched_datum = FetchedDbDatum.query(ancestor=entity.key).fetch()
    self.assertTrue(len(fetched_datum) > 0)
    qry = CrawlDbDatum.query(CrawlDbDatum.last_status == pipelines.UNFETCHED)
    crawl_db_datums = qry.fetch()
    self.assertTrue(len(crawl_db_datums)==0)
Example #9
    def testSuccessfulRun(self):
        """Test extract outlinks by UDF."""
        resource_neg = self.getResource("cloudysunny14.html")
        static_content = resource_neg.read()
        createMockFetchedDatum("http://cloudysunny14.html", static_content,
                               pipelines.FETCHED)
        file_name = self.createMockDataLine("http://cloudysunny14.html\n")
        p = pipelines._ExtractOutlinksPipeline(
            "ExtractOutlinksPipeline",
            file_names=[file_name],
            parser_params={"text/html": __name__ + "._htmlOutlinkParser"})
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        entities = CrawlDbDatum.query(
            CrawlDbDatum.url == "http://cloudysunny14.html").fetch()
        entity = entities[0]
        fetched_datum = FetchedDbDatum.query(ancestor=entity.key).fetch()
        self.assertTrue(len(fetched_datum) > 0)
        qry = CrawlDbDatum.query(
            CrawlDbDatum.last_status == pipelines.UNFETCHED)
        crawl_db_datums = qry.fetch()
        self.assertTrue(len(crawl_db_datums) == 0)
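Both outlink tests point parser_params at a module-level _htmlOutlinkParser that is not included in this listing. A plausible minimal implementation, following the same UDF signature as htmlParser (hypothetical, not the project's actual helper), is:

# Hypothetical _htmlOutlinkParser UDF used by the tests above; it simply
# returns the href targets found in the fetched HTML.
import re

def _htmlOutlinkParser(key, content):
    return re.findall(r'href=[\'"]?([^\'" >]+)', content)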
Example #10
 def testSuccessfulRun(self):
   createMockCrawlDbDatum(2, 2, True)
   file_name1 = self.createMockData(("http://hoge_0.com/content_0", True))
   file_name2 = self.createMockData(("http://hoge_1.com/content_0", False))
   static_content = "<html><body>TestContent</body></html>"
   self.setReturnValue(content=static_content,
                       headers={"Content-Length": len(static_content),
                                "Content-Type": "text/html"})
   p = pipelines._FetchPagePipeline("FetchPipeline", [file_name1, file_name2], 2)
   p.start()
   test_support.execute_until_empty(self.taskqueue)
   finished_map = pipelines._FetchPagePipeline.from_id(p.pipeline_id)
   
   # Can open files
   file_paths = finished_map.outputs.default.value
   self.assertTrue(len(file_paths) > 0)
   self.assertTrue(file_paths[0].startswith("/blobstore/"))
   
   
   entities = CrawlDbDatum.query(CrawlDbDatum.url=="http://hoge_0.com/content_0").fetch()
   entity = entities[0]
   fetched_datum = FetchedDbDatum.query(ancestor=entity.key).fetch()
   self.assertTrue(len(fetched_datum) > 0)