# Example 1
def createMockCrawlDbDatum(domain_count, url_count, isExtracted):
    """Store mock CrawlDbDatum entities, one per (domain, url) pair.

    Creates ``domain_count * url_count`` entities keyed by their own URL,
    all marked UNFETCHED.  When ``isExtracted`` is true, each entity also
    carries its domain root as ``extract_domain_url``.
    """
    for domain_index in range(domain_count):
        # The extracted domain URL is the same for every page of a domain,
        # so compute it once per outer iteration.
        domain_url = "http://hoge_%d.com" % (domain_index) if isExtracted else None
        for page_index in range(url_count):
            page_url = "http://hoge_%d.com/content_%d" % (domain_index, page_index)
            CrawlDbDatum(
                parent=ndb.Key(CrawlDbDatum, page_url),
                url=page_url,
                extract_domain_url=domain_url,
                last_status=pipelines.UNFETCHED).put()
# Example 2
def createMockCrawlDbDatum(domain_count, url_count, isExtracted):
    """Persist domain_count * url_count mock CrawlDbDatum records.

    Each record is keyed by its URL and created with UNFETCHED status;
    ``extract_domain_url`` is filled only when ``isExtracted`` is true.
    """
    for d in range(domain_count):
        for n in range(url_count):
            target_url = "http://hoge_%d.com/content_%d" % (d, n)
            if isExtracted:
                extracted = "http://hoge_%d.com" % (d)
            else:
                extracted = None
            record = CrawlDbDatum(
                parent=ndb.Key(CrawlDbDatum, target_url),
                url=target_url,
                extract_domain_url=extracted,
                last_status=pipelines.UNFETCHED)
            record.put()
# Example 3
    def testSuccessfulRun(self):
        """Run _FetchContentPipeline end to end over two mock input files.

        Seeds one CrawlDbDatum, serves a canned PNG for every fetch, then
        verifies the pipeline's output records parse and that two
        ContentDbDatum children were stored under the seeded datum.
        """
        source_a = self.createMockData(
            ("https://developers.google.com/appengine/",
             "http://k.yimg.jp/images/top/sp/logo.gif"))
        source_b = self.createMockData(
            ("https://developers.google.com/appengine/",
             "/appengine/images/slide1.png"))
        seed = CrawlDbDatum(
            parent=ndb.Key(CrawlDbDatum,
                           "https://developers.google.com/appengine/"),
            url="https://developers.google.com/appengine/",
            extract_domain_url="https://developers.google.com",
            last_status=pipelines.UNFETCHED)
        seed.put()
        # Every URL fetch during the run returns this static PNG.
        png_body = self.getResource("slide1.png").read()
        self.setReturnValue(
            content=png_body,
            headers={"Content-Length": len(png_body),
                     "Content-Type": "image/png"})
        pipeline = pipelines._FetchContentPipeline(
            "FetchContentPipeline", [source_a, source_b])
        pipeline.start()
        test_support.execute_until_empty(self.taskqueue)
        finished = pipelines._FetchSetsBufferPipeline.from_id(
            pipeline.pipeline_id)

        # The run must emit at least one blobstore-backed output file.
        paths = finished.outputs.default.value
        self.assertTrue(len(paths) > 0)
        self.assertTrue(paths[0].startswith("/blobstore/"))

        # Each output record decodes to a KeyValue proto with both fields set.
        for raw_record in input_readers.RecordsReader(paths, 0):
            pair = file_service_pb.KeyValue()
            pair.ParseFromString(raw_record)
            self.assertTrue(pair.key() is not None)
            self.assertTrue(pair.value() is not None)

        matches = CrawlDbDatum.query(
            CrawlDbDatum.url == "https://developers.google.com/appengine/").fetch()
        self.assertTrue(len(matches) > 0)
        fetched_contents = ContentDbDatum.query(
            ancestor=matches[0].key).fetch()
        self.assertEqual(2, len(fetched_contents))
# Example 4
    def get(self):
        """Kick off a fetch job for the requested URL and redirect to status.

        Reads ``target`` (URL to crawl) and ``email`` (job owner) from the
        request.  The URL is cached in memcache so that a later request may
        omit ``target`` and reuse the previous one.  On success, redirects
        to the pipeline's status page.
        """
        url = self.request.get("target", default_value=None)
        email = self.request.get("email", default_value=None)
        if url is None:
            # Fall back to the URL cached by an earlier request.
            url = memcache.get("url")
        else:
            memcache.set(key="url", value=url)
        # Bail out when there is nothing to crawl or nobody to notify.
        # Without the url check, ndb.Key(CrawlDbDatum, None) would raise
        # when neither the request nor memcache supplies a target.
        if url is None or email is None:
            return

        data = CrawlDbDatum(parent=ndb.Key(CrawlDbDatum, url),
                            url=url,
                            last_status=pipelines.UNFETCHED)
        data.put()
        pipeline = FecherJobPipeline(email)
        pipeline.start()
        path = pipeline.base_path + "/status?root=" + pipeline.pipeline_id
        self.redirect(path)
# Example 5
    def get(self):
        """Start a crawl job for the given URL and show the pipeline status.

        ``target`` is the URL to crawl and ``email`` identifies the job
        owner.  A supplied URL is cached in memcache; when ``target`` is
        absent the cached value is reused.  Redirects to the pipeline
        status page once the job is started.
        """
        url = self.request.get("target", default_value=None)
        email = self.request.get("email", default_value=None)
        if url is None:
            # Reuse the most recently cached target URL, if any.
            url = memcache.get("url")
        else:
            memcache.set(key="url", value=url)
        # Guard both inputs: a None url would make ndb.Key(CrawlDbDatum, None)
        # raise below, and a missing email leaves nobody to notify.
        if url is None or email is None:
            return

        data = CrawlDbDatum(parent=ndb.Key(CrawlDbDatum, url),
                            url=url,
                            last_status=pipelines.UNFETCHED)
        data.put()
        pipeline = FecherJobPipeline(email)
        pipeline.start()
        path = pipeline.base_path + "/status?root=" + pipeline.pipeline_id
        self.redirect(path)
# Example 6
  def testSuccessfulRun(self):
    """Exercise _FetchContentPipeline end to end and verify its outputs."""
    mock_file_a = self.createMockData(
        ("https://developers.google.com/appengine/",
         "http://k.yimg.jp/images/top/sp/logo.gif"))
    mock_file_b = self.createMockData(
        ("https://developers.google.com/appengine/",
         "/appengine/images/slide1.png"))
    CrawlDbDatum(
        parent=ndb.Key(CrawlDbDatum,
                       "https://developers.google.com/appengine/"),
        url="https://developers.google.com/appengine/",
        extract_domain_url="https://developers.google.com",
        last_status=pipelines.UNFETCHED).put()
    # Every fetch the pipeline issues receives this canned PNG response.
    static_content = self.getResource("slide1.png").read()
    self.setReturnValue(
        content=static_content,
        headers={"Content-Length": len(static_content),
                 "Content-Type": "image/png"})
    job = pipelines._FetchContentPipeline(
        "FetchContentPipeline", [mock_file_a, mock_file_b])
    job.start()
    test_support.execute_until_empty(self.taskqueue)
    finished_map = pipelines._FetchSetsBufferPipeline.from_id(job.pipeline_id)

    # At least one blobstore-backed output file must have been produced.
    output_paths = finished_map.outputs.default.value
    self.assertTrue(len(output_paths) > 0)
    self.assertTrue(output_paths[0].startswith("/blobstore/"))

    # Each record must decode to a KeyValue proto with key and value set.
    for raw in input_readers.RecordsReader(output_paths, 0):
      kv = file_service_pb.KeyValue()
      kv.ParseFromString(raw)
      self.assertTrue(kv.key() is not None)
      self.assertTrue(kv.value() is not None)

    crawl_db_datums = CrawlDbDatum.query(
        CrawlDbDatum.url == "https://developers.google.com/appengine/").fetch()
    self.assertTrue(len(crawl_db_datums) > 0)
    content_datums = ContentDbDatum.query(
        ancestor=crawl_db_datums[0].key).fetch()
    self.assertEqual(2, len(content_datums))