示例#1
0
def createLinkDatum(parent_url, url):
    """Ensure a mock link CrawlDbDatum for ``url`` exists under ``parent_url``.

    The entity is fetched or created with ``get_or_insert`` using ``url`` as
    its id, a CrawlDbDatum key built from ``parent_url`` as its parent, and
    an UNFETCHED status.  Nothing is returned.
    """
    CrawlDbDatum.get_or_insert(
        url,
        parent=ndb.Key(CrawlDbDatum, parent_url),
        url=url,
        last_status=pipelines.UNFETCHED,
    )
示例#2
0
def createMockCrawlDbDatum(domain_count, url_count, isExtracted):
    """Store UNFETCHED CrawlDbDatum fixtures for a grid of fake URLs.

    Args:
      domain_count: number of ``http://hoge_<d>.com`` domains to generate.
      url_count: number of ``/content_<n>`` pages stored per domain.
      isExtracted: when truthy, ``extract_domain_url`` is set to the domain
        root of each page; otherwise it is stored as None.
    """
    for domain_no in range(domain_count):
        # The domain root only depends on the domain index.
        domain_root = "http://hoge_%d.com" % (domain_no) if isExtracted else None
        for page_no in range(url_count):
            page_url = "http://hoge_%d.com/content_%d" % (domain_no, page_no)
            CrawlDbDatum(parent=ndb.Key(CrawlDbDatum, page_url),
                         url=page_url,
                         extract_domain_url=domain_root,
                         last_status=pipelines.UNFETCHED).put()
示例#3
0
def createMockCrawlDbDatum(domain_count, url_count, isExtracted):
    """Write one UNFETCHED CrawlDbDatum per generated (domain, page) pair.

    Args:
      domain_count: how many ``http://hoge_<d>.com`` domains to cover.
      url_count: how many ``/content_<n>`` pages to store for each domain.
      isExtracted: if truthy the entity's ``extract_domain_url`` holds the
        domain root, otherwise None.
    """
    for domain_index in range(domain_count):
        for content_index in range(url_count):
            target = "http://hoge_%d.com/content_%d" % (domain_index,
                                                        content_index)
            if isExtracted:
                domain = "http://hoge_%d.com" % (domain_index)
            else:
                domain = None
            entity = CrawlDbDatum(parent=ndb.Key(CrawlDbDatum, target),
                                  url=target,
                                  extract_domain_url=domain,
                                  last_status=pipelines.UNFETCHED)
            entity.put()
示例#4
0
    def testSuccessfulRun(self):
        """Run _FetchContentPipeline over two mock inputs and check results.

        One input carries an absolute image URL, the other a path starting
        with "/".  After the task queue drains the test checks that the
        output paths live under "/blobstore/", that every output record has
        a key and a value, and that the page's CrawlDbDatum has exactly two
        ContentDbDatum descendants.
        """
        page_url = "https://developers.google.com/appengine/"
        absolute_input = self.createMockData(
            (page_url, "http://k.yimg.jp/images/top/sp/logo.gif"))
        relative_input = self.createMockData(
            (page_url, "/appengine/images/slide1.png"))
        CrawlDbDatum(parent=ndb.Key(CrawlDbDatum, page_url),
                     url=page_url,
                     extract_domain_url="https://developers.google.com",
                     last_status=pipelines.UNFETCHED).put()

        # Every mocked fetch answers with the bytes of the PNG resource.
        image_bytes = self.getResource("slide1.png").read()
        self.setReturnValue(content=image_bytes,
                            headers={"Content-Length": len(image_bytes),
                                     "Content-Type": "image/png"})

        pipeline = pipelines._FetchContentPipeline(
            "FetchContentPipeline", [absolute_input, relative_input])
        pipeline.start()
        test_support.execute_until_empty(self.taskqueue)
        finished = pipelines._FetchSetsBufferPipeline.from_id(
            pipeline.pipeline_id)

        # Can open files
        output_paths = finished.outputs.default.value
        self.assertTrue(len(output_paths) > 0)
        self.assertTrue(output_paths[0].startswith("/blobstore/"))

        for record in input_readers.RecordsReader(output_paths, 0):
            pair = file_service_pb.KeyValue()
            pair.ParseFromString(record)
            record_key = pair.key()
            record_value = pair.value()
            self.assertTrue(record_key is not None)
            self.assertTrue(record_value is not None)

        matches = CrawlDbDatum.query(CrawlDbDatum.url == page_url).fetch()
        self.assertTrue(len(matches) > 0)
        stored = ContentDbDatum.query(ancestor=matches[0].key).fetch()
        self.assertEqual(2, len(stored))
示例#5
0
    def get(self):
        """Start a fetcher job for the requested URL and redirect to its status.

        Reads ``target`` and ``email`` from the request.  A supplied target
        is stored in memcache under "url"; a missing one is read back from
        there.  Without an ``email`` the handler returns before creating
        anything.  Otherwise an UNFETCHED CrawlDbDatum is stored, a
        FecherJobPipeline is started and the client is redirected to the
        pipeline status page.
        """
        request = self.request
        url = request.get("target", default_value=None)
        email = request.get("email", default_value=None)
        if url is not None:
            memcache.set(key="url", value=url)
        else:
            url = memcache.get("url")
        if email is None:
            return

        CrawlDbDatum(parent=ndb.Key(CrawlDbDatum, url),
                     url=url,
                     last_status=pipelines.UNFETCHED).put()
        job = FecherJobPipeline(email)
        job.start()
        self.redirect(job.base_path + "/status?root=" + job.pipeline_id)
示例#6
0
    def testSuccessfulRun(self):
        """Run _ExactDomainMapreducePipeline on mock data and check its output.

        Seeds data with ``createMockCrawlDbDatum(2, 6, False)``, drains the
        task queue, then checks that the output files live under
        "/blobstore/", that each line read from them has a non-None value,
        and that entities queried by domain carry that domain in
        ``extract_domain_url``.
        """
        createMockCrawlDbDatum(2, 6, False)

        p = pipelines._ExactDomainMapreducePipeline(
            "ExactDomainMapreducePipeline",
            params={
                "entity_kind": "lakshmi.datum.CrawlDbDatum",
            },
            shard_count=3)
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        finished_map = pipelines._RobotsFetchPipeline.from_id(p.pipeline_id)

        # Can open files
        file_paths = finished_map.outputs.default.value
        self.assertTrue(len(file_paths) > 0)
        self.assertTrue(file_paths[0].startswith("/blobstore/"))

        for file_path in file_paths:
            blob_key = files.blobstore.get_blob_key(file_path)
            reader = input_readers.BlobstoreLineInputReader(blob_key, 0, 100)
            # NOTE(review): the counter restarts for every file, so the
            # assertion after the loop only sees the last file's line count.
            u = 0
            for content in reader:
                # Identity check instead of "!= None".
                self.assertIsNotNone(content[1])
                u += 1

        self.assertEqual(2, u)

        query = CrawlDbDatum.query(
            CrawlDbDatum.extract_domain_url == "http://hoge_0.com")
        entities = query.fetch()
        for entity in entities:
            # assertEquals is a deprecated alias of assertEqual.
            self.assertEqual("http://hoge_0.com", entity.extract_domain_url)
示例#7
0
    def testSuccessfulRun(self):
        """Run _FetchPagePipeline over two mock inputs and check its output.

        Every mocked fetch returns a small HTML document.  After the task
        queue drains the test checks that the output paths live under
        "/blobstore/" and that a FetchedDbDatum query below the first page's
        CrawlDbDatum returns a result object.
        """
        createMockCrawlDbDatum(2, 2, True)
        input_files = [
            self.createMockData(("http://hoge_0.com/content_0", True)),
            self.createMockData(("http://hoge_1.com/content_0", False)),
        ]
        page_html = "<html><body>TestContent</body></html>"
        self.setReturnValue(content=page_html,
                            headers={"Content-Length": len(page_html),
                                     "Content-Type": "text/html"})

        pipeline = pipelines._FetchPagePipeline("FetchPipeline",
                                                input_files, 2)
        pipeline.start()
        test_support.execute_until_empty(self.taskqueue)
        finished = pipelines._FetchPagePipeline.from_id(pipeline.pipeline_id)

        # Can open files
        output_paths = finished.outputs.default.value
        self.assertTrue(len(output_paths) > 0)
        self.assertTrue(output_paths[0].startswith("/blobstore/"))

        crawl_datum = CrawlDbDatum.query(
            CrawlDbDatum.url == "http://hoge_0.com/content_0").fetch()[0]
        fetched = FetchedDbDatum.query(ancestor=crawl_datum.key).fetch()
        self.assertTrue(fetched is not None)
示例#8
0
  def testSuccessfulRun(self):
    """Run _ExactDomainMapreducePipeline on mock data and check its output.

    Seeds data with ``createMockCrawlDbDatum(2, 6, False)``, drains the task
    queue, then checks that the output files live under "/blobstore/", that
    each line read from them has a non-None value, and that entities queried
    by domain carry that domain in ``extract_domain_url``.
    """
    createMockCrawlDbDatum(2, 6, False)

    p = pipelines._ExactDomainMapreducePipeline(
        "ExactDomainMapreducePipeline",
        params={
            "entity_kind": "lakshmi.datum.CrawlDbDatum",
        },
        shard_count=3)
    p.start()
    test_support.execute_until_empty(self.taskqueue)
    finished_map = pipelines._RobotsFetchPipeline.from_id(p.pipeline_id)

    # Can open files
    file_paths = finished_map.outputs.default.value
    self.assertTrue(len(file_paths) > 0)
    self.assertTrue(file_paths[0].startswith("/blobstore/"))

    for file_path in file_paths:
      blob_key = files.blobstore.get_blob_key(file_path)
      reader = input_readers.BlobstoreLineInputReader(blob_key, 0, 100)
      # NOTE(review): the counter restarts for every file, so the assertion
      # after the loop only sees the last file's line count.
      u = 0
      for content in reader:
        # Identity check instead of "!= None".
        self.assertIsNotNone(content[1])
        u += 1

    self.assertEqual(2, u)

    query = CrawlDbDatum.query(
        CrawlDbDatum.extract_domain_url == "http://hoge_0.com")
    entities = query.fetch()
    for entity in entities:
      # assertEquals is a deprecated alias of assertEqual.
      self.assertEqual("http://hoge_0.com", entity.extract_domain_url)
示例#9
0
    def finalized(self):
        """Mail the fetch results for the memcached URL when the job ends.

        The target URL and the recipient address are read from memcache
        ("url" and "email").  The HTML body is rendered from
        ``templates/mail_template.html`` next to this module and is also
        attached as "sendmail.html"; when a FetchedDbDatum exists for the
        URL its content is attached as "fetched_content.html".
        """
        status = "aborted" if self.was_aborted else "successful"
        url = memcache.get("url")
        email = memcache.get("email")

        # Configure jinja for internal templates
        template_dir = os.path.join(
            os.path.realpath(os.path.dirname(__file__)), "templates")
        env = Environment(
            autoescape=True,
            extensions=["jinja2.ext.i18n"],
            loader=FileSystemLoader(template_dir),
        )
        subject = "Your Fetcher Job is " + status

        crawl_db_datum = CrawlDbDatum.query(CrawlDbDatum.url == url).fetch()[0]
        parent_key = crawl_db_datum.key
        contents = ContentDbDatum.query(ancestor=parent_key).fetch_async()
        fetched = FetchedDbDatum.query(ancestor=parent_key).fetch()
        attachments = []
        if fetched:
            attachments.append(
                ("fetched_content.html", fetched[0].fetched_content))
        links = LinkDbDatum.query(ancestor=parent_key).fetch_async()

        # NOTE(review): contents and links are the objects returned by
        # fetch_async(), not fetched lists -- confirm the template expects
        # that.
        html = env.get_template("mail_template.html").render(
            url=url, contents=contents, links=links)
        attachments.append(("sendmail.html", html))
        mail.send_mail(sender="*****@*****.**",
                       to=email,
                       subject=subject,
                       body="FetchResults",
                       html=html,
                       attachments=attachments)
示例#10
0
def _fetchContentMap(binary_record):
    """Map function that fetches one content URL and stores it in blobstore.

    Args:
      binary_record: serialized ``file_service_pb.KeyValue`` whose key is the
        URL of the page and whose value is the URL of the content to fetch.
        A value starting with "/" is prefixed with the page's
        ``extract_domain_url``.

    Yields:
      A single "<target_url>:<stored_url>" string; ``stored_url`` is None
      when nothing was stored.
    """
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    page_url = proto.key()
    target_url = proto.value()

    # The CrawlDbDatum lookup is issued asynchronously; its result is only
    # collected where it is needed below.  The future stays None when the
    # query could not be created, so later code never touches an unbound name.
    crawl_db_datum_future = None
    try:
        query = CrawlDbDatum.query(CrawlDbDatum.url == page_url)
        crawl_db_datum_future = query.fetch_async()
    except Exception as e:
        # Format the exception itself: ``e.message`` is deprecated and not
        # available on every exception type.
        logging.warning("Failed create key, caused by invalid url:%s:%s",
                        page_url, e)

    stored_url = None
    fetch_result = None
    can_fetch = True
    if target_url.startswith("/"):
        # A site-relative path needs the page's domain to become fetchable.
        crawl_db_datum = None
        if crawl_db_datum_future is not None:
            crawl_db_datum = _getCrawlDatum(crawl_db_datum_future)
        if crawl_db_datum:
            target_url = "%s%s" % (crawl_db_datum.extract_domain_url,
                                   target_url)
        else:
            logging.warning("No CrawlDbDatum for %s, can not resolve %s",
                            page_url, target_url)
            can_fetch = False

    if can_fetch:
        fetcher = fetchers.SimpleHttpFetcher(
            1, fetcher_policy_yaml.fetcher_policy)
        try:
            fetch_result = fetcher.get(target_url)
            if fetch_result:
                # Storing to blobstore
                blob_io = files.blobstore.create(
                    mime_type=fetch_result.get("mime_type"),
                    _blobinfo_uploaded_filename=fetch_result.get(
                        "fetched_url"))
                with files.open(blob_io, 'a') as f:
                    f.write(fetch_result.get("content"))
                files.finalize(blob_io)
                blob_key = files.blobstore.get_blob_key(blob_io)
                stored_url = images.get_serving_url(str(blob_key))
        except Exception as e:
            logging.warning("Fetch Error Occurs:%s", e)

    # Put content to datastore.
    crawl_db_datum = None
    if crawl_db_datum_future is not None:
        crawl_db_datum = _getCrawlDatum(crawl_db_datum_future)
    if crawl_db_datum and stored_url is not None:
        entity = ContentDbDatum(
            parent=crawl_db_datum.key,
            fetched_url=fetch_result.get("fetched_url"),
            stored_url=stored_url,
            content_type=fetch_result.get("mime_type"),
            content_size=fetch_result.get("content_length"),
            http_headers=str(fetch_result.get("headers")))
        entity.put()

    yield "%s:%s" % (target_url, stored_url)
示例#11
0
    def get(self):
        """Store the requested URL, launch a fetcher job and redirect.

        ``target`` and ``email`` come from the request.  When ``target`` is
        absent the URL is read from memcache key "url"; otherwise it is
        written there.  The handler returns early when ``email`` is absent.
        Otherwise it stores an UNFETCHED CrawlDbDatum, starts a
        FecherJobPipeline and redirects to the pipeline's status page.
        """
        target = self.request.get("target", default_value=None)
        email = self.request.get("email", default_value=None)
        if target is None:
            target = memcache.get("url")
        else:
            memcache.set(key="url", value=target)
        if email is None:
            return

        parent_key = ndb.Key(CrawlDbDatum, target)
        datum = CrawlDbDatum(parent=parent_key,
                             url=target,
                             last_status=pipelines.UNFETCHED)
        datum.put()

        job = FecherJobPipeline(email)
        job.start()
        status_path = job.base_path + "/status?root=" + job.pipeline_id
        self.redirect(status_path)
示例#12
0
def createMockFetchedDatum(url, html_text, status):
  """Create a mock CrawlDbDatum and, unless UNFETCHED, its FetchedDbDatum.

  Args:
    url: URL used as entity id, parent key id and ``url`` property.
    html_text: stored as ``fetched_content`` of the FetchedDbDatum.
    status: ``last_status`` of the CrawlDbDatum; no FetchedDbDatum is stored
      when it equals ``pipelines.UNFETCHED``.
  """
  crawl = CrawlDbDatum.get_or_insert(
      url,
      parent=ndb.Key(CrawlDbDatum, url),
      url=url,
      last_status=status)
  if status == pipelines.UNFETCHED:
    return
  FetchedDbDatum(parent=crawl.key,
                 url=url,
                 fetched_url=url,
                 fetched_content=html_text,
                 content_type="text/html").put()
示例#13
0
  def testSuccessfulRun(self):
    """Test clean pipeline.

    After CleanDatumPipeline has run, the FETCHED and SKIPPED mock datums
    must no longer be found by URL while the UNFETCHED one must remain.
    """
    createMockFetchedDatum("http://foo.html", "Content", pipelines.FETCHED)
    createMockFetchedDatum("http://bar.html", "Content", pipelines.SKIPPED)
    createMockFetchedDatum("http://baz.html", "Content", pipelines.UNFETCHED)
    p = pipelines.CleanDatumPipeline("CleanDatumPipeline",
        params={
          "entity_kind": "lakshmi.datum.CrawlDbDatum",
        },
        shards=3)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    # (url, number of CrawlDbDatum entities expected to remain)
    expected_counts = (
        ("http://foo.html", 0),
        ("http://bar.html", 0),
        ("http://baz.html", 1),
    )
    for url, expected in expected_counts:
      entities = CrawlDbDatum.query(CrawlDbDatum.url == url).fetch()
      # assertEquals is a deprecated alias of assertEqual.
      self.assertEqual(expected, len(entities))
示例#14
0
def _fetchContentMap(binary_record):
  """Map function that fetches one content URL and stores it in blobstore.

  Args:
    binary_record: serialized ``file_service_pb.KeyValue`` whose key is the
      URL of the page and whose value is the URL of the content to fetch.
      A value starting with "/" is prefixed with the page's
      ``extract_domain_url``.

  Yields:
    A single "<target_url>:<stored_url>" string; ``stored_url`` is None when
    nothing was stored.
  """
  proto = file_service_pb.KeyValue()
  proto.ParseFromString(binary_record)
  page_url = proto.key()
  target_url = proto.value()

  # The CrawlDbDatum lookup is issued asynchronously; its result is only
  # collected where it is needed below.  The future stays None when the query
  # could not be created, so later code never touches an unbound name.
  crawl_db_datum_future = None
  try:
    query = CrawlDbDatum.query(CrawlDbDatum.url == page_url)
    crawl_db_datum_future = query.fetch_async()
  except Exception as e:
    # Format the exception itself: ``e.message`` is deprecated and not
    # available on every exception type.
    logging.warning("Failed create key, caused by invalid url:%s:%s",
                    page_url, e)

  stored_url = None
  fetch_result = None
  can_fetch = True
  if target_url.startswith("/"):
    # A site-relative path needs the page's domain to become fetchable.
    crawl_db_datum = None
    if crawl_db_datum_future is not None:
      crawl_db_datum = _getCrawlDatum(crawl_db_datum_future)
    if crawl_db_datum:
      target_url = "%s%s" % (crawl_db_datum.extract_domain_url, target_url)
    else:
      logging.warning("No CrawlDbDatum for %s, can not resolve %s",
                      page_url, target_url)
      can_fetch = False

  if can_fetch:
    fetcher = fetchers.SimpleHttpFetcher(1, fetcher_policy_yaml.fetcher_policy)
    try:
      fetch_result = fetcher.get(target_url)
      if fetch_result:
        # Storing to blobstore
        blob_io = files.blobstore.create(
            mime_type=fetch_result.get("mime_type"),
            _blobinfo_uploaded_filename=fetch_result.get("fetched_url"))
        with files.open(blob_io, 'a') as f:
          f.write(fetch_result.get("content"))
        files.finalize(blob_io)
        blob_key = files.blobstore.get_blob_key(blob_io)
        stored_url = images.get_serving_url(str(blob_key))
    except Exception as e:
      logging.warning("Fetch Error Occurs:%s", e)

  # Put content to datastore.
  crawl_db_datum = None
  if crawl_db_datum_future is not None:
    crawl_db_datum = _getCrawlDatum(crawl_db_datum_future)
  if crawl_db_datum and stored_url is not None:
    entity = ContentDbDatum(
        parent=crawl_db_datum.key,
        fetched_url=fetch_result.get("fetched_url"),
        stored_url=stored_url,
        content_type=fetch_result.get("mime_type"),
        content_size=fetch_result.get("content_length"),
        http_headers=str(fetch_result.get("headers")))
    entity.put()

  yield "%s:%s" % (target_url, stored_url)
示例#15
0
  def testSuccessfulRun(self):
    """Test extract outlinks by UDF.

    Runs _ExtractOutlinksPipeline over one fetched mock page, then checks
    that a FetchedDbDatum query below the page's CrawlDbDatum returns a
    result object and that no CrawlDbDatum is left in UNFETCHED status.
    """
    resource_neg = self.getResource("cloudysunny14.html")
    static_content = resource_neg.read()
    createMockFetchedDatum("http://cloudysunny14.html", static_content,
                           pipelines.FETCHED)
    file_name = self.createMockDataLine("http://cloudysunny14.html\n")
    p = pipelines._ExtractOutlinksPipeline("ExtractOutlinksPipeline",
        file_names=[file_name],
        parser_params={
          "text/html": __name__+"._htmlOutlinkParser"
        })
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    entities = CrawlDbDatum.query(
        CrawlDbDatum.url == "http://cloudysunny14.html").fetch()
    entity = entities[0]
    fetched_datum = FetchedDbDatum.query(ancestor=entity.key).fetch()
    # Identity check instead of "!= None".
    self.assertIsNotNone(fetched_datum)
    qry = CrawlDbDatum.query(CrawlDbDatum.last_status == pipelines.UNFETCHED)
    crawl_db_datums = qry.fetch()
    # assertEqual reports the actual count when the check fails.
    self.assertEqual(0, len(crawl_db_datums))
示例#16
0
  def testSuccessfulRun(self):
    """Run _FetchContentPipeline over two mock inputs and check results.

    One input carries an absolute image URL, the other a path starting with
    "/".  After the task queue drains the test checks that the output paths
    live under "/blobstore/", that every output record has a key and a
    value, and that the page's CrawlDbDatum has exactly two ContentDbDatum
    descendants.
    """
    page_url = "https://developers.google.com/appengine/"
    absolute_input = self.createMockData(
        (page_url, "http://k.yimg.jp/images/top/sp/logo.gif"))
    relative_input = self.createMockData(
        (page_url, "/appengine/images/slide1.png"))
    CrawlDbDatum(parent=ndb.Key(CrawlDbDatum, page_url),
                 url=page_url,
                 extract_domain_url="https://developers.google.com",
                 last_status=pipelines.UNFETCHED).put()

    # Every mocked fetch answers with the bytes of the PNG resource.
    image_bytes = self.getResource("slide1.png").read()
    response_headers = {"Content-Length": len(image_bytes),
                        "Content-Type": "image/png"}
    self.setReturnValue(content=image_bytes, headers=response_headers)

    pipeline = pipelines._FetchContentPipeline(
        "FetchContentPipeline", [absolute_input, relative_input])
    pipeline.start()
    test_support.execute_until_empty(self.taskqueue)
    finished = pipelines._FetchSetsBufferPipeline.from_id(
        pipeline.pipeline_id)

    # Can open files
    output_paths = finished.outputs.default.value
    self.assertTrue(len(output_paths) > 0)
    self.assertTrue(output_paths[0].startswith("/blobstore/"))

    for record in input_readers.RecordsReader(output_paths, 0):
      pair = file_service_pb.KeyValue()
      pair.ParseFromString(record)
      record_key = pair.key()
      record_value = pair.value()
      self.assertTrue(record_key is not None)
      self.assertTrue(record_value is not None)

    matches = CrawlDbDatum.query(CrawlDbDatum.url == page_url).fetch()
    self.assertTrue(len(matches) > 0)
    stored = ContentDbDatum.query(ancestor=matches[0].key).fetch()
    self.assertEqual(2, len(stored))
示例#17
0
def createMockFetchedDatum(url, html_text, status):
    """Create a mock CrawlDbDatum and, unless UNFETCHED, its FetchedDbDatum.

    Args:
      url: URL used as entity id, parent key id and ``url`` property.
      html_text: stored as ``fetched_content`` of the FetchedDbDatum.
      status: ``last_status`` of the CrawlDbDatum; no FetchedDbDatum is
        stored when it equals ``pipelines.UNFETCHED``.
    """
    crawl = CrawlDbDatum.get_or_insert(
        url,
        parent=ndb.Key(CrawlDbDatum, url),
        url=url,
        last_status=status,
    )
    if status == pipelines.UNFETCHED:
        return
    fetched_fields = dict(url=url,
                          fetched_url=url,
                          fetched_content=html_text,
                          content_type="text/html")
    FetchedDbDatum(parent=crawl.key, **fetched_fields).put()
示例#18
0
    def testSuccessfulRun(self):
        """Test extract outlinks by UDF.

        Runs _ExtractOutlinksPipeline over one fetched mock page, then checks
        that a FetchedDbDatum query below the page's CrawlDbDatum returns a
        result object and that no CrawlDbDatum is left in UNFETCHED status.
        """
        resource_neg = self.getResource("cloudysunny14.html")
        static_content = resource_neg.read()
        createMockFetchedDatum("http://cloudysunny14.html", static_content,
                               pipelines.FETCHED)
        file_name = self.createMockDataLine("http://cloudysunny14.html\n")
        p = pipelines._ExtractOutlinksPipeline(
            "ExtractOutlinksPipeline",
            file_names=[file_name],
            parser_params={"text/html": __name__ + "._htmlOutlinkParser"})
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        entities = CrawlDbDatum.query(
            CrawlDbDatum.url == "http://cloudysunny14.html").fetch()
        entity = entities[0]
        fetched_datum = FetchedDbDatum.query(ancestor=entity.key).fetch()
        # Identity check instead of "!= None".
        self.assertIsNotNone(fetched_datum)
        qry = CrawlDbDatum.query(
            CrawlDbDatum.last_status == pipelines.UNFETCHED)
        crawl_db_datums = qry.fetch()
        # assertEqual reports the actual count when the check fails.
        self.assertEqual(0, len(crawl_db_datums))
    def testFetchEndToEnd(self):
        """Test for through of fetcher job.

        Registers canned responses for robots.txt, the HTML page and one
        image, runs FetcherPipeline until the task queue is empty, then
        checks that the crawl datum is FETCHED and has fetched, link and
        content entities below it.
        """
        # NOTE(review): called with a single URL argument here; confirm the
        # helper in scope accepts that signature.
        createMockCrawlDbDatum("http://foo.com/bar.html")
        static_robots = "User-agent: test\nDisallow: /content_0\nDisallow: /content_1\nDisallow: /content_3"
        self.setReturnValue(url="http://foo.com/robots.txt",
                            content=static_robots,
                            headers={"Content-Length": len(static_robots)})
        #static resource is read from resource
        resource = self.getResource("sample_content.html")
        static_content = resource.read()
        static_content_length = len(static_content)
        self.setReturnValue(url="http://foo.com/bar.html",
                            content=static_content,
                            headers={
                                "Content-Length": static_content_length,
                                "Content-Type": "text/html"
                            })
        resource_image = self.getResource("slide1.png")
        static_content_image = resource_image.read()
        static_content_length = len(static_content_image)
        self.setReturnValue(url="http://foo.com/images/slide1.png",
                            content=static_content_image,
                            headers={
                                "Content-Length": static_content_length,
                                "Content-Type": "image/png"
                            })
        p = pipelines.FetcherPipeline(
            "FetcherPipeline",
            params={"entity_kind": "lakshmi.datum.CrawlDbDatum"},
            parser_params={"text/html": __name__ + ".htmlParser"},
            shards=2)
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        crawl_db_datums = CrawlDbDatum.query(
            CrawlDbDatum.url == "http://foo.com/bar.html").fetch()
        crawl_db_datum = crawl_db_datums[0]
        # assertTrue(a, b) treats b as the failure message and never compares
        # the two values; assertEqual performs the intended comparison.
        self.assertEqual(pipelines.FETCHED, crawl_db_datum.last_status)
        fetched_db_datums = FetchedDbDatum.query(
            ancestor=crawl_db_datum.key).fetch()
        fetched_db_datum = fetched_db_datums[0]
        self.assertTrue(fetched_db_datum is not None)
        self.assertEqual("http://foo.com/bar.html",
                         fetched_db_datum.fetched_url)
        link_db_datums = LinkDbDatum.query(ancestor=crawl_db_datum.key).fetch()
        self.assertTrue(len(link_db_datums) > 0)
        contents_db_datums = ContentDbDatum.query(
            ancestor=crawl_db_datum.key).fetch()
        self.assertTrue(len(contents_db_datums) > 0)
示例#20
0
def _extract_content_urls_map(data):
    r"""Map function that extracts content URLs from a fetched page via a UDF.

    The parser UDF is looked up by the fetched content's MIME type in the
    parser params and called as ``parser(key, content)``.  For example, a
    parser for HTML that stores outlinks and returns the content URLs could
    look like this::

        def htmlParser(key, content):
          outlinks = re.findall(r'href=[\'"]?([^\'" >]+)', content)
          link_datums = []
          for link in outlinks:
            link_datum = LinkDbDatum(parent=key, link_url=link)
            link_datums.append(link_datum)
          ndb.put_multi_async(link_datums)
          content_links = re.findall(r'src=[\'"]?([^\'" >]+)', content)
          return content_links

    Note: the URLs returned by the parser are the targets that will be
    fetched in the next job (FetchContentPipeline).

    Args:
      data: key value data, that key is position, value is url.

    Yields:
      (url, content_url) tuples, one per item returned by the parser.
      Nothing is yielded when the page has no CrawlDbDatum, no
      FetchedDbDatum, no content, or when the parser fails.
    """
    _position, url = data
    crawl_db_datums = CrawlDbDatum.query(CrawlDbDatum.url == url).fetch()
    if not crawl_db_datums:
        # Indexing an empty result would raise IndexError.
        logging.warning("No CrawlDbDatum found for %s", url)
        return
    key = crawl_db_datums[0].key

    fetched_datums = FetchedDbDatum.query(ancestor=key).fetch()
    if not fetched_datums:
        logging.warning("No FetchedDbDatum found for %s", url)
        return
    fetched_datum = fetched_datums[0]

    content = fetched_datum.fetched_content
    if content is None:
        return
    mime_type = fetched_datum.content_type

    # Bound before the try so the except handler can always format it.
    params = None
    parsed_obj = None
    try:
        params = _get_parser_param(_PARSER_PARAM_KEY)
        parsed_obj = util.handler_for_name(params[mime_type])(key, content)
    except Exception as e:
        logging.warning("Can not handle for %s[params:%s]:%s",
                        mime_type, params, e)

    if parsed_obj is not None:
        for content_url in parsed_obj:
            yield (url, content_url)
示例#21
0
    def testSuccessfulRun(self):
        """Test clean pipeline.

        After CleanDatumPipeline has run, the FETCHED and SKIPPED mock datums
        must no longer be found by URL while the UNFETCHED one must remain.
        """
        createMockFetchedDatum("http://foo.html", "Content", pipelines.FETCHED)
        createMockFetchedDatum("http://bar.html", "Content", pipelines.SKIPPED)
        createMockFetchedDatum("http://baz.html", "Content",
                               pipelines.UNFETCHED)
        p = pipelines.CleanDatumPipeline("CleanDatumPipeline",
                                         params={
                                             "entity_kind":
                                             "lakshmi.datum.CrawlDbDatum",
                                         },
                                         shards=3)
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        # (url, number of CrawlDbDatum entities expected to remain)
        expected_counts = (
            ("http://foo.html", 0),
            ("http://bar.html", 0),
            ("http://baz.html", 1),
        )
        for url, expected in expected_counts:
            entities = CrawlDbDatum.query(CrawlDbDatum.url == url).fetch()
            # assertEquals is a deprecated alias of assertEqual.
            self.assertEqual(expected, len(entities))
示例#22
0
def _extract_content_urls_map(data):
  r"""Map function that extracts content URLs from a fetched page via a UDF.

  The parser UDF is looked up by the fetched content's MIME type in the
  parser params and called as ``parser(key, content)``.  For example, a
  parser for HTML that stores outlinks and returns the content URLs could
  look like this::

    def htmlParser(key, content):
      outlinks = re.findall(r'href=[\'"]?([^\'" >]+)', content)
      link_datums = []
      for link in outlinks:
        link_datum = LinkDbDatum(parent=key, link_url=link)
        link_datums.append(link_datum)
      ndb.put_multi_async(link_datums)
      content_links = re.findall(r'src=[\'"]?([^\'" >]+)', content)
      return content_links

  Note: the URLs returned by the parser are the targets that will be fetched
  in the next job (FetchContentPipeline).

  Args:
    data: key value data, that key is position, value is url.

  Yields:
    (url, content_url) tuples, one per item returned by the parser.  Nothing
    is yielded when the page has no CrawlDbDatum, no FetchedDbDatum, no
    content, or when the parser fails.
  """
  _position, url = data
  crawl_db_datums = CrawlDbDatum.query(CrawlDbDatum.url == url).fetch()
  if not crawl_db_datums:
    # Indexing an empty result would raise IndexError.
    logging.warning("No CrawlDbDatum found for %s", url)
    return
  key = crawl_db_datums[0].key

  fetched_datums = FetchedDbDatum.query(ancestor=key).fetch()
  if not fetched_datums:
    logging.warning("No FetchedDbDatum found for %s", url)
    return
  fetched_datum = fetched_datums[0]

  content = fetched_datum.fetched_content
  if content is None:
    return
  mime_type = fetched_datum.content_type

  # Bound before the try so the except handler can always format it.
  params = None
  parsed_obj = None
  try:
    params = _get_parser_param(_PARSER_PARAM_KEY)
    parsed_obj = util.handler_for_name(params[mime_type])(key, content)
  except Exception as e:
    logging.warning("Can not handle for %s[params:%s]:%s",
                    mime_type, params, e)

  if parsed_obj is not None:
    for content_url in parsed_obj:
      yield (url, content_url)
 def testFetchEndToEnd(self):
   """Test for through of fetcher job.

   Registers canned responses for robots.txt, the HTML page and one image,
   runs FetcherPipeline until the task queue is empty, then checks that the
   crawl datum is FETCHED and has fetched, link and content entities below
   it.
   """
   # NOTE(review): called with a single URL argument here; confirm the
   # helper in scope accepts that signature.
   createMockCrawlDbDatum("http://foo.com/bar.html")
   static_robots = "User-agent: test\nDisallow: /content_0\nDisallow: /content_1\nDisallow: /content_3"
   self.setReturnValue(url="http://foo.com/robots.txt",
       content=static_robots,
       headers={"Content-Length": len(static_robots)})
   #static resource is read from resource
   resource = self.getResource("sample_content.html")
   static_content = resource.read()
   static_content_length = len(static_content)
   self.setReturnValue(url="http://foo.com/bar.html",
       content=static_content,
       headers={"Content-Length": static_content_length,
           "Content-Type": "text/html"})
   resource_image = self.getResource("slide1.png")
   static_content_image = resource_image.read()
   static_content_length = len(static_content_image)
   self.setReturnValue(url="http://foo.com/images/slide1.png",
       content=static_content_image,
       headers={"Content-Length": static_content_length,
           "Content-Type": "image/png"})
   p = pipelines.FetcherPipeline("FetcherPipeline",
       params={
         "entity_kind": "lakshmi.datum.CrawlDbDatum"
       },
       parser_params={
         "text/html": __name__ + ".htmlParser"
       },
       shards=2)
   p.start()
   test_support.execute_until_empty(self.taskqueue)

   crawl_db_datums = CrawlDbDatum.query(
       CrawlDbDatum.url == "http://foo.com/bar.html").fetch()
   crawl_db_datum = crawl_db_datums[0]
   # assertTrue(a, b) treats b as the failure message and never compares the
   # two values; assertEqual performs the intended comparison.
   self.assertEqual(pipelines.FETCHED, crawl_db_datum.last_status)
   fetched_db_datums = FetchedDbDatum.query(
       ancestor=crawl_db_datum.key).fetch()
   fetched_db_datum = fetched_db_datums[0]
   self.assertTrue(fetched_db_datum is not None)
   self.assertEqual("http://foo.com/bar.html", fetched_db_datum.fetched_url)
   link_db_datums = LinkDbDatum.query(ancestor=crawl_db_datum.key).fetch()
   self.assertTrue(len(link_db_datums) > 0)
   contents_db_datums = ContentDbDatum.query(
       ancestor=crawl_db_datum.key).fetch()
   self.assertTrue(len(contents_db_datums) > 0)
示例#24
0
def _makeFetchSetBufferMap(binary_record):
    """Map function that decides, per URL of a domain, whether to fetch it.

    Arg:
      binary_record: serialized ``file_service_pb.KeyValue`` whose key is the
        extract domain url and whose value is the content of robots.txt.

    Yields:
      (url, can_fetch) tuples, one per CrawlDbDatum whose
      ``extract_domain_url`` equals the record key.  ``can_fetch`` is the
      robots.txt verdict for the configured agent name.  When the check
      itself raises, ("", False) is yielded for that datum.  Nothing is
      yielded when the CrawlDbDatum query cannot be started.
    """
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    extract_domain_url = proto.key()
    content = proto.value()

    # Extract urls from CrawlDbDatum.
    try:
        query = CrawlDbDatum.query(
            CrawlDbDatum.extract_domain_url == extract_domain_url)
        crawl_datum_future = query.fetch_async()
    except Exception as e:
        # Without a future there is nothing to iterate below; stop here
        # instead of failing later on an unbound name.
        logging.warning("Fetch error occurs from CrawlDbDatum%s", e)
        return

    # Get the fetcher policy from resource.
    user_agent = fetcher_policy_yaml.fetcher_policy.agent_name
    rp = robotparser.RobotFileParser()
    try:
        rp.parse(content.split("\n"))
    except Exception as e:
        logging.warning("RobotFileParser raises exception:%s", e)

    for crawl_datum in crawl_datum_future.get_result():
        url = crawl_datum.url
        # Reset per URL so a failed check never reuses the previous verdict.
        can_fetch = False
        try:
            can_fetch = rp.can_fetch(user_agent, url)
        except Exception as e:
            logging.warning("RobotFileParser raises exception:%s", e)
            url = ""

        yield (url, can_fetch)
示例#25
0
def _makeFetchSetBufferMap(binary_record):
  """Map function that decides, for each url of a domain, fetch or skip.

  Parses a serialized KeyValue record, queries every CrawlDbDatum whose
  ``extract_domain_url`` equals the record key, and checks each datum's
  url against the robots.txt content carried in the record value.

  Arg:
    binary_record: key value data, that key is extract domain url,
      value is content from robots.txt.

  Yields:
    (url, can_fetch) tuples, one per matching CrawlDbDatum. ``can_fetch``
    is the RobotFileParser verdict for the configured agent name. When
    the check itself raises, ("", False) is yielded for that datum.
    Nothing is yielded when the CrawlDbDatum query could not be started.
  """
  proto = file_service_pb.KeyValue()
  proto.ParseFromString(binary_record)
  extract_domain_url = proto.key()
  content = proto.value()

  # Start the datastore query asynchronously; its result is collected
  # only after robots.txt has been parsed.
  try:
    query = CrawlDbDatum.query(CrawlDbDatum.extract_domain_url==extract_domain_url)
    crawl_datum_future = query.fetch_async()
  except Exception as e:
    logging.warning("Fetch error occurs from CrawlDbDatum%s", e)
    # Without a query there is nothing to iterate over; stop here
    # instead of failing later on an unbound future.
    return

  # Get the fetcher policy from resource.
  user_agent = fetcher_policy_yaml.fetcher_policy.agent_name
  rp = robotparser.RobotFileParser()
  try:
    rp.parse(content.split("\n"))
  except Exception as e:
    logging.warning("RobotFileParser raises exception:%s", e)

  for crawl_datum in crawl_datum_future.get_result():
    url = crawl_datum.url
    try:
      can_fetch = rp.can_fetch(user_agent, url)
    except Exception as e:
      logging.warning("RobotFileParser raises exception:%s", e)
      url = ""
      # Never reuse the verdict of a previous url for a failed check.
      can_fetch = False

    yield (url, can_fetch)
示例#26
0
    def finalized(self):
        """Mail the fetch results of the job's url once the pipeline ends.

        Reads the target url and the recipient address from memcache,
        renders ``mail_template.html`` with the content and link queries of
        the matching CrawlDbDatum, and sends the rendered html both as the
        mail's html part and as an attachment. When a FetchedDbDatum exists
        under that CrawlDbDatum, its fetched content is attached as well.
        """
        job_state = 'aborted' if self.was_aborted else 'successful'
        url = memcache.get("url")
        email = memcache.get("email")

        # Jinja environment for the internal templates next to this module.
        module_dir = os.path.realpath(os.path.dirname(__file__))
        jinja_env = Environment(
            autoescape=True,
            extensions=['jinja2.ext.i18n'],
            loader=FileSystemLoader(os.path.join(module_dir, 'templates')))

        # The first CrawlDbDatum returned for the url is the ancestor of
        # every entity reported below.
        root_key = CrawlDbDatum.query(CrawlDbDatum.url == url).fetch()[0].key

        # NOTE(review): contents/links are handed to the template as the
        # objects returned by fetch_async(), not as resolved lists --
        # confirm the template copes with that.
        contents_future = ContentDbDatum.query(ancestor=root_key).fetch_async()
        fetched_entities = FetchedDbDatum.query(ancestor=root_key).fetch()
        attachments = []
        if fetched_entities:
            attachments.append(
                ("fetched_content.html", fetched_entities[0].fetched_content))
        links_future = LinkDbDatum.query(ancestor=root_key).fetch_async()

        template = jinja_env.get_template("mail_template.html")
        html = template.render(
            url=url, contents=contents_future, links=links_future)
        attachments.append(("sendmail.html", html))

        mail.send_mail(sender="*****@*****.**",
                       to=email,
                       subject="Your Fetcher Job is " + job_state,
                       body="FetchResults",
                       html=html,
                       attachments=attachments)
示例#27
0
 def testSuccessfulRun(self):
   """Run _FetchPagePipeline over two mock inputs and check what it stored."""
   createMockCrawlDbDatum(2, 2, True)
   # Two input records; the second tuple element is presumably the
   # fetch/skip flag consumed by the map step -- confirm against
   # createMockData.
   file_name1 = self.createMockData(("http://hoge_0.com/content_0", True))
   file_name2 = self.createMockData(("http://hoge_1.com/content_0", False))
   static_content = "<html><body>TestContent</body></html>"
   self.setReturnValue(content=static_content,
                       headers={"Content-Length": len(static_content),
                                "Content-Type": "text/html"})
   p = pipelines._FetchPagePipeline("FetchPipeline", [file_name1, file_name2], 2)
   p.start()
   test_support.execute_until_empty(self.taskqueue)
   finished_map = pipelines._FetchPagePipeline.from_id(p.pipeline_id)

   # Can open files
   file_paths = finished_map.outputs.default.value
   self.assertTrue(len(file_paths) > 0)
   self.assertTrue(file_paths[0].startswith("/blobstore/"))

   entities = CrawlDbDatum.query(CrawlDbDatum.url=="http://hoge_0.com/content_0").fetch()
   entity = entities[0]
   fetched_datums = FetchedDbDatum.query(ancestor=entity.key).fetch()
   # fetch() always returns a list, so an "is not None" check can never
   # fail; require that a fetch result was actually stored instead.
   self.assertTrue(len(fetched_datums) > 0)
def createMockCrawlDbDatum(url):
    """Get or insert a mock CrawlDbDatum that uses ``url`` as its key name.

    The entity's ``url`` property is set to the same value and its
    ``last_status`` to ``pipelines.UNFETCHED``.
    """
    properties = dict(url=url, last_status=pipelines.UNFETCHED)
    CrawlDbDatum.get_or_insert(url, **properties)
示例#29
0
def createLinkDatum(parent_url, url):
  """Get or insert a mock CrawlDbDatum for ``url`` under a parent key.

  The parent is the CrawlDbDatum key built from ``parent_url``; the entity
  uses ``url`` as its key name and starts with ``pipelines.UNFETCHED``.
  """
  CrawlDbDatum.get_or_insert(
      url,
      parent=ndb.Key(CrawlDbDatum, parent_url),
      url=url,
      last_status=pipelines.UNFETCHED)
示例#30
0
def _fetchMap(binary_record):
    """Map function that fetches one url and records the outcome.

    On a successful fetch a FetchedDbDatum is stored under the first
    CrawlDbDatum found for the url. Afterwards every CrawlDbDatum found
    for the url gets its ``last_status`` and ``last_fetched`` updated.

    Arg:
      binary_record: key value data, that key is url to fetch,
        value is boolean value of can be fetch.

    Yields:
      The url followed by a newline when the page was fetched and stored,
      otherwise an empty string.
    """
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    url = proto.key()
    could_fetch = _str2bool(proto.value())
    result = UNFETCHED
    fetched_url = ""
    fetch_date = None

    # Start the CrawlDbDatum query asynchronously. The future stays None
    # when the query cannot be built, so later steps can tell and skip it.
    crawl_db_datum_future = None
    try:
        query = CrawlDbDatum.query(CrawlDbDatum.url == url)
        crawl_db_datum_future = query.fetch_async()
    except Exception as e:
        logging.warning("Failed create key, caused by invalid url:%s:%s",
                        url, e)
        could_fetch = False

    if could_fetch:
        # start fetch
        fetcher = fetchers.SimpleHttpFetcher(
            1, fetcher_policy_yaml.fetcher_policy)
        try:
            fetch_result = fetcher.get(url)
            if fetch_result:
                # Storing to datastore
                crawl_db_datums = crawl_db_datum_future.get_result()
                # NOTE(review): content_size is filled from "read_rate",
                # the same key as response_rate -- looks like a copy/paste
                # slip; confirm the intended key of the fetch result.
                fetched_datum = FetchedDbDatum(
                    parent=crawl_db_datums[0].key,
                    url=url,
                    fetched_url=fetch_result.get("fetched_url"),
                    fetch_time=fetch_result.get("time"),
                    fetched_content=fetch_result.get("content"),
                    content_type=fetch_result.get("mime_type"),
                    content_size=fetch_result.get("read_rate"),
                    response_rate=fetch_result.get("read_rate"),
                    http_headers=str(fetch_result.get("headers")))
                fetched_datum.put()
                # update time of last fetched
                result = FETCHED
                fetch_date = datetime.datetime.now()
                fetched_url = ("%s\n" % url)
        except Exception as e:
            logging.warning("Fetch Page Error Occurs:%s", e)
            result = FAILED
    else:
        result = FAILED

    # Update status to all datums (only possible when the query started).
    if crawl_db_datum_future is not None:
        crawl_db_datums = crawl_db_datum_future.get_result()
        for datum in crawl_db_datums:
            datum.last_status = result
            datum.last_fetched = fetch_date
        ndb.put_multi(crawl_db_datums)

    yield fetched_url
def createMockCrawlDbDatum(url):
    """Get or insert a mock CrawlDbDatum whose key name and url are ``url``.

    The entity's ``last_status`` is set to ``pipelines.UNFETCHED``.
    """
    initial_status = pipelines.UNFETCHED
    CrawlDbDatum.get_or_insert(
        url,
        url=url,
        last_status=initial_status)
示例#32
0
def _fetchMap(binary_record):
  """Map function that fetches one url and records the outcome.

  On a successful fetch a FetchedDbDatum is stored under the first
  CrawlDbDatum found for the url. Afterwards every CrawlDbDatum found
  for the url gets its ``last_status`` and ``last_fetched`` updated.

  Arg:
    binary_record: key value data, that key is url to fetch,
      value is boolean value of can be fetch.

  Yields:
    The url followed by a newline when the page was fetched and stored,
    otherwise an empty string.
  """
  proto = file_service_pb.KeyValue()
  proto.ParseFromString(binary_record)
  url = proto.key()
  could_fetch = _str2bool(proto.value())
  result = UNFETCHED
  fetched_url = ""
  fetch_date = None

  # Start the CrawlDbDatum query asynchronously. The future stays None
  # when the query cannot be built, so later steps can tell and skip it.
  crawl_db_datum_future = None
  try:
    query = CrawlDbDatum.query(CrawlDbDatum.url==url)
    crawl_db_datum_future = query.fetch_async()
  except Exception as e:
    logging.warning("Failed create key, caused by invalid url:%s:%s", url, e)
    could_fetch = False

  if could_fetch:
    # start fetch
    fetcher = fetchers.SimpleHttpFetcher(1, fetcher_policy_yaml.fetcher_policy)
    try:
      fetch_result = fetcher.get(url)
      if fetch_result:
        # Storing to datastore
        crawl_db_datums = crawl_db_datum_future.get_result()
        # NOTE(review): content_size is filled from "read_rate", the same
        # key as response_rate -- looks like a copy/paste slip; confirm
        # the intended key of the fetch result.
        fetched_datum = FetchedDbDatum(
            parent=crawl_db_datums[0].key,
            url=url,
            fetched_url=fetch_result.get("fetched_url"),
            fetch_time=fetch_result.get("time"),
            fetched_content=fetch_result.get("content"),
            content_type=fetch_result.get("mime_type"),
            content_size=fetch_result.get("read_rate"),
            response_rate=fetch_result.get("read_rate"),
            http_headers=str(fetch_result.get("headers")))
        fetched_datum.put()
        # update time of last fetched
        result = FETCHED
        fetch_date = datetime.datetime.now()
        fetched_url = ("%s\n"%url)
    except Exception as e:
      logging.warning("Fetch Page Error Occurs:%s", e)
      result = FAILED
  else:
    result = FAILED

  # Update status to all datums (only possible when the query started).
  if crawl_db_datum_future is not None:
    crawl_db_datums = crawl_db_datum_future.get_result()
    for datum in crawl_db_datums:
      datum.last_status = result
      datum.last_fetched = fetch_date
    ndb.put_multi(crawl_db_datums)

  yield fetched_url