示例#1
0
def createMockFetchedDatum(url, html_text, status):
  """Store a mock CrawlDbDatum and, unless unfetched, a FetchedDbDatum.

  Args:
    url: Url used as the CrawlDbDatum id, as its url property and as the
      url/fetched_url of the FetchedDbDatum.
    html_text: Content stored as fetched_content of the FetchedDbDatum.
    status: Value stored as last_status of the CrawlDbDatum.
  """
  parent_key = ndb.Key(CrawlDbDatum, url)
  crawl_entity = CrawlDbDatum.get_or_insert(
      url, parent=parent_key, url=url, last_status=status)
  has_fetched_record = status != pipelines.UNFETCHED
  if not has_fetched_record:
    # Unfetched urls get no FetchedDbDatum child.
    return
  FetchedDbDatum(
      parent=crawl_entity.key,
      url=url,
      fetched_url=url,
      fetched_content=html_text,
      content_type="text/html").put()
示例#2
0
def createMockFetchedDatum(url, html_text, status):
    """Store a mock CrawlDbDatum and, unless unfetched, a FetchedDbDatum.

    Args:
        url: Url used as the CrawlDbDatum id, as its url property and as
            the url/fetched_url of the FetchedDbDatum.
        html_text: Content stored as fetched_content of the FetchedDbDatum.
        status: Value stored as last_status of the CrawlDbDatum.
    """
    parent_key = ndb.Key(CrawlDbDatum, url)
    crawl_entity = CrawlDbDatum.get_or_insert(
        url, parent=parent_key, url=url, last_status=status)
    has_fetched_record = status != pipelines.UNFETCHED
    if not has_fetched_record:
        # Unfetched urls get no FetchedDbDatum child.
        return
    FetchedDbDatum(
        parent=crawl_entity.key,
        url=url,
        fetched_url=url,
        fetched_content=html_text,
        content_type="text/html").put()
示例#3
0
def _clean_map(crawl_db_datum):
  """Map function that deletes crawl entities from the datastore.

  A FetchedDbDatum is looked up using the crawl datum's url as its id and is
  deleted when found. The CrawlDbDatum itself is deleted when the memcache
  flag stored under CLEAN_ALL_KEY is truthy, or when its last_status is one
  of FETCHED, SKIPPED or FAILED.

  Args:
    crawl_db_datum: The CrawlDbDatum entity to inspect.

  Yields:
    A single "\\n" string. NOTE(review): the earlier docstring promised the
    deleted urls, but no url was ever written to the output; confirm whether
    the url should be emitted before changing the output format.
  """
  delete_keys = []
  fetched_datum = FetchedDbDatum.get_by_id(crawl_db_datum.url)
  if fetched_datum is not None:
    delete_keys.append(fetched_datum.key)

  # Falls back to 2 when last_status is absent.
  # NOTE(review): confirm which status constant 2 corresponds to.
  fetch_status = ndb.Model.to_dict(crawl_db_datum).get("last_status", 2)

  # The flag is read once; the previous duplicate lookup was discarded.
  clean_all = memcache.get(CLEAN_ALL_KEY)
  if clean_all or fetch_status in (FETCHED, SKIPPED, FAILED):
    delete_keys.append(crawl_db_datum.key)

  ndb.delete_multi(delete_keys)

  yield "\n"
示例#4
0
    def finalized(self):
        """Mail the fetch results once this pipeline has finished.

        The target url and the recipient address are read from memcache
        (keys "url" and "email"). The mail body is rendered from
        ``templates/mail_template.html`` next to this module, and the
        rendered html plus the first fetched content (when one exists) are
        attached.
        """
        outcome = "aborted" if self.was_aborted else "successful"
        url = memcache.get("url")
        email = memcache.get("email")
        # Configure jinja for internal templates
        template_dir = os.path.join(
            os.path.realpath(os.path.dirname(__file__)), "templates")
        env = Environment(
            autoescape=True,
            extensions=["jinja2.ext.i18n"],
            loader=FileSystemLoader(template_dir),
        )
        subject = "Your Fetcher Job is " + outcome
        matching_datums = CrawlDbDatum.query(CrawlDbDatum.url == url).fetch()
        crawl_key = matching_datums[0].key
        # NOTE(review): the fetch_async() results are handed to the template
        # without being resolved here; confirm the template copes with that.
        contents_future = ContentDbDatum.query(ancestor=crawl_key).fetch_async()
        fetched_pages = FetchedDbDatum.query(ancestor=crawl_key).fetch()
        attachments = []
        if fetched_pages:
            attachments.append(
                ("fetched_content.html", fetched_pages[0].fetched_content))
        links_future = LinkDbDatum.query(ancestor=crawl_key).fetch_async()
        template = env.get_template("mail_template.html")
        html = template.render(
            url=url, contents=contents_future, links=links_future)
        attachments.append(("sendmail.html", html))
        mail.send_mail(
            sender="*****@*****.**",
            to=email,
            subject=subject,
            body="FetchResults",
            html=html,
            attachments=attachments,
        )
示例#5
0
def _clean_map(crawl_db_datum):
    """Map function that deletes crawl entities from the datastore.

    A FetchedDbDatum is looked up using the crawl datum's url as its id and
    is deleted when found. The CrawlDbDatum itself is deleted when the
    memcache flag stored under CLEAN_ALL_KEY is truthy, or when its
    last_status is one of FETCHED, SKIPPED or FAILED.

    Args:
        crawl_db_datum: The CrawlDbDatum entity to inspect.

    Yields:
        A single "\\n" string. NOTE(review): the earlier docstring promised
        the deleted urls, but no url was ever written to the output; confirm
        whether the url should be emitted before changing the output format.
    """
    delete_keys = []
    fetched_datum = FetchedDbDatum.get_by_id(crawl_db_datum.url)
    if fetched_datum is not None:
        delete_keys.append(fetched_datum.key)

    # Falls back to 2 when last_status is absent.
    # NOTE(review): confirm which status constant 2 corresponds to.
    fetch_status = ndb.Model.to_dict(crawl_db_datum).get("last_status", 2)

    # The flag is read once; the previous duplicate lookup was discarded.
    clean_all = memcache.get(CLEAN_ALL_KEY)
    if clean_all or fetch_status in (FETCHED, SKIPPED, FAILED):
        delete_keys.append(crawl_db_datum.key)

    ndb.delete_multi(delete_keys)

    yield "\n"
示例#6
0
    def testSuccessfulRun(self):
        """Run _FetchPagePipeline over two mock inputs and check its results.

        Verifies that the pipeline outputs blobstore file paths and that a
        FetchedDbDatum was stored under the CrawlDbDatum of the fetchable
        url.
        """
        createMockCrawlDbDatum(2, 2, True)
        file_name1 = self.createMockData(("http://hoge_0.com/content_0", True))
        file_name2 = self.createMockData(
            ("http://hoge_1.com/content_0", False))
        static_content = "<html><body>TestContent</body></html>"
        self.setReturnValue(content=static_content,
                            headers={
                                "Content-Length": len(static_content),
                                "Content-Type": "text/html"
                            })
        p = pipelines._FetchPagePipeline("FetchPipeline",
                                         [file_name1, file_name2], 2)
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        finished_map = pipelines._FetchPagePipeline.from_id(p.pipeline_id)

        # Can open files
        file_paths = finished_map.outputs.default.value
        self.assertTrue(len(file_paths) > 0)
        self.assertTrue(file_paths[0].startswith("/blobstore/"))

        entities = CrawlDbDatum.query(
            CrawlDbDatum.url == "http://hoge_0.com/content_0").fetch()
        entity = entities[0]
        fetched_datums = FetchedDbDatum.query(ancestor=entity.key).fetch()
        # fetch() returns a list, so an "is not None" check can never fail;
        # assert that at least one entity was actually stored.
        self.assertTrue(len(fetched_datums) > 0)
示例#7
0
def _extract_content_urls_map(data):
    r"""Map function that extracts content urls from a fetched page via a UDF.

    The parser UDF registered for the page's content type is called with the
    CrawlDbDatum key and the fetched content; every item it returns is
    yielded together with the page url. For example, a parser UDF for HTML
    that stores the outlinks and returns the content urls could look like
    this::

        def htmlParser(key, content):
          outlinks = re.findall(r'href=[\'"]?([^\'" >]+)', content)
          link_datums = []
          for link in outlinks:
            link_datum = LinkDbDatum(parent=key, link_url=link)
            link_datums.append(link_datum)
          ndb.put_multi_async(link_datums)
          content_links = re.findall(r'src=[\'"]?([^\'" >]+)', content)
          return content_links

    Note: the urls returned by the parser are the targets that will be
    fetched in the next job (FetchContentPipeline).

    Args:
        data: key value data, that key is position, value is url.

    Yields:
        (url, content_url) tuples, one per item returned by the parser.
        Nothing is yielded when no crawl datum, no fetched datum or no
        content exists for the url, or when the parser fails.
    """
    _, url = data
    crawl_db_datums = CrawlDbDatum.query(CrawlDbDatum.url == url).fetch()
    if not crawl_db_datums:
        # Previously this raised IndexError and failed the whole shard.
        logging.warning("No CrawlDbDatum found for %s", url)
        return
    key = crawl_db_datums[0].key

    fetched_datums = FetchedDbDatum.query(ancestor=key).fetch()
    if not fetched_datums:
        # Indexing an empty result raised before the old None check ran.
        return
    fetched_datum = fetched_datums[0]

    content = fetched_datum.fetched_content
    if content is None:
        return
    mime_type = fetched_datum.content_type

    # Bound before the try so the handler can log it even when the
    # parameter lookup itself is what failed.
    params = None
    parsed_obj = None
    try:
        params = _get_parser_param(_PARSER_PARAM_KEY)
        parsed_obj = util.handler_for_name(params[mime_type])(key, content)
    except Exception as e:
        # Deliberately broad: the parser is user supplied code and a failing
        # page must not abort the map job.
        logging.warning("Can not handle for %s[params:%s]:%s",
                        mime_type, params, e)
    if parsed_obj is not None:
        for content_url in parsed_obj:
            yield (url, content_url)
    def testFetchEndToEnd(self):
        """Run the whole FetcherPipeline against mocked http responses.

        Mocks robots.txt, an html page and an image, runs the pipeline and
        checks the status of the CrawlDbDatum and the FetchedDbDatum,
        LinkDbDatum and ContentDbDatum entities stored under it.
        """
        createMockCrawlDbDatum("http://foo.com/bar.html")
        static_robots = "User-agent: test\nDisallow: /content_0\nDisallow: /content_1\nDisallow: /content_3"
        self.setReturnValue(url="http://foo.com/robots.txt",
                            content=static_robots,
                            headers={"Content-Length": len(static_robots)})
        # Static resource is read from resource
        resource = self.getResource("sample_content.html")
        static_content = resource.read()
        static_content_length = len(static_content)
        self.setReturnValue(url="http://foo.com/bar.html",
                            content=static_content,
                            headers={
                                "Content-Length": static_content_length,
                                "Content-Type": "text/html"
                            })
        resource_image = self.getResource("slide1.png")
        static_content_image = resource_image.read()
        static_content_length = len(static_content_image)
        self.setReturnValue(url="http://foo.com/images/slide1.png",
                            content=static_content_image,
                            headers={
                                "Content-Length": static_content_length,
                                "Content-Type": "image/png"
                            })
        p = pipelines.FetcherPipeline(
            "FetcherPipeline",
            params={"entity_kind": "lakshmi.datum.CrawlDbDatum"},
            parser_params={"text/html": __name__ + ".htmlParser"},
            shards=2)
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        crawl_db_datums = CrawlDbDatum.query(
            CrawlDbDatum.url == "http://foo.com/bar.html").fetch()
        crawl_db_datum = crawl_db_datums[0]
        # assertTrue(a, b) treats b as the failure message and compares
        # nothing; assertEqual performs the intended comparison.
        self.assertEqual(pipelines.FETCHED, crawl_db_datum.last_status)
        fetched_db_datums = FetchedDbDatum.query(
            ancestor=crawl_db_datum.key).fetch()
        fetched_db_datum = fetched_db_datums[0]
        self.assertTrue(fetched_db_datum is not None)
        self.assertEqual("http://foo.com/bar.html",
                         fetched_db_datum.fetched_url)
        link_db_datums = LinkDbDatum.query(ancestor=crawl_db_datum.key).fetch()
        self.assertTrue(len(link_db_datums) > 0)
        contents_db_datums = ContentDbDatum.query(
            ancestor=crawl_db_datum.key).fetch()
        self.assertTrue(len(contents_db_datums) > 0)
示例#9
0
def _extract_content_urls_map(data):
  r"""Map function that extracts content urls from a fetched page via a UDF.

  The parser UDF registered for the page's content type is called with the
  CrawlDbDatum key and the fetched content; every item it returns is yielded
  together with the page url. For example, a parser UDF for HTML that stores
  the outlinks and returns the content urls could look like this::

    def htmlParser(key, content):
      outlinks = re.findall(r'href=[\'"]?([^\'" >]+)', content)
      link_datums = []
      for link in outlinks:
        link_datum = LinkDbDatum(parent=key, link_url=link)
        link_datums.append(link_datum)
      ndb.put_multi_async(link_datums)
      content_links = re.findall(r'src=[\'"]?([^\'" >]+)', content)
      return content_links

  Note: the urls returned by the parser are the targets that will be
  fetched in the next job (FetchContentPipeline).

  Args:
    data: key value data, that key is position, value is url.

  Yields:
    (url, content_url) tuples, one per item returned by the parser. Nothing
    is yielded when no crawl datum, no fetched datum or no content exists
    for the url, or when the parser fails.
  """
  _, url = data
  crawl_db_datums = CrawlDbDatum.query(CrawlDbDatum.url == url).fetch()
  if not crawl_db_datums:
    # Previously this raised IndexError and failed the whole shard.
    logging.warning("No CrawlDbDatum found for %s", url)
    return
  key = crawl_db_datums[0].key

  fetched_datums = FetchedDbDatum.query(ancestor=key).fetch()
  if not fetched_datums:
    # Indexing an empty result raised before the old None check ran.
    return
  fetched_datum = fetched_datums[0]

  content = fetched_datum.fetched_content
  if content is None:
    return
  mime_type = fetched_datum.content_type

  # Bound before the try so the handler can log it even when the parameter
  # lookup itself is what failed.
  params = None
  parsed_obj = None
  try:
    params = _get_parser_param(_PARSER_PARAM_KEY)
    parsed_obj = util.handler_for_name(params[mime_type])(key, content)
  except Exception as e:
    # Deliberately broad: the parser is user supplied code and a failing
    # page must not abort the map job.
    logging.warning("Can not handle for %s[params:%s]:%s",
                    mime_type, params, e)
  if parsed_obj is not None:
    for content_url in parsed_obj:
      yield (url, content_url)
 def testFetchEndToEnd(self):
   """Run the whole FetcherPipeline against mocked http responses.

   Mocks robots.txt, an html page and an image, runs the pipeline and
   checks the status of the CrawlDbDatum and the FetchedDbDatum,
   LinkDbDatum and ContentDbDatum entities stored under it.
   """
   createMockCrawlDbDatum("http://foo.com/bar.html")
   static_robots = "User-agent: test\nDisallow: /content_0\nDisallow: /content_1\nDisallow: /content_3"
   self.setReturnValue(url="http://foo.com/robots.txt",
       content=static_robots,
       headers={"Content-Length": len(static_robots)})
   # Static resource is read from resource
   resource = self.getResource("sample_content.html")
   static_content = resource.read()
   static_content_length = len(static_content)
   self.setReturnValue(url="http://foo.com/bar.html",
       content=static_content,
       headers={"Content-Length": static_content_length,
           "Content-Type": "text/html"})
   resource_image = self.getResource("slide1.png")
   static_content_image = resource_image.read()
   static_content_length = len(static_content_image)
   self.setReturnValue(url="http://foo.com/images/slide1.png",
       content=static_content_image,
       headers={"Content-Length": static_content_length,
           "Content-Type": "image/png"})
   p = pipelines.FetcherPipeline("FetcherPipeline",
       params={
         "entity_kind": "lakshmi.datum.CrawlDbDatum"
       },
       parser_params={
         "text/html": __name__ + ".htmlParser"
       },
       shards=2)
   p.start()
   test_support.execute_until_empty(self.taskqueue)

   crawl_db_datums = CrawlDbDatum.query(CrawlDbDatum.url=="http://foo.com/bar.html").fetch()
   crawl_db_datum = crawl_db_datums[0]
   # assertTrue(a, b) treats b as the failure message and compares nothing;
   # assertEqual performs the intended comparison.
   self.assertEqual(pipelines.FETCHED, crawl_db_datum.last_status)
   fetched_db_datums = FetchedDbDatum.query(ancestor=crawl_db_datum.key).fetch()
   fetched_db_datum = fetched_db_datums[0]
   self.assertTrue(fetched_db_datum is not None)
   self.assertEqual("http://foo.com/bar.html", fetched_db_datum.fetched_url)
   link_db_datums = LinkDbDatum.query(ancestor=crawl_db_datum.key).fetch()
   self.assertTrue(len(link_db_datums)>0)
   contents_db_datums = ContentDbDatum.query(ancestor=crawl_db_datum.key).fetch()
   self.assertTrue(len(contents_db_datums)>0)
示例#11
0
    def finalized(self):
        """Mail the fetch results once this pipeline has finished.

        The target url and the recipient address are read from memcache
        (keys "url" and "email"). The mail body is rendered from
        ``templates/mail_template.html`` next to this module, and the
        rendered html plus the first fetched content (when one exists) are
        attached.
        """
        outcome = 'aborted' if self.was_aborted else 'successful'
        url = memcache.get("url")
        email = memcache.get("email")
        # Configure jinja for internal templates
        module_dir = os.path.realpath(os.path.dirname(__file__))
        template_loader = FileSystemLoader(
            os.path.join(module_dir, 'templates'))
        env = Environment(autoescape=True,
                          extensions=['jinja2.ext.i18n'],
                          loader=template_loader)
        subject = "Your Fetcher Job is " + outcome
        matching_datums = CrawlDbDatum.query(CrawlDbDatum.url == url).fetch()
        crawl_key = matching_datums[0].key
        # NOTE(review): the fetch_async() results are handed to the template
        # without being resolved here; confirm the template copes with that.
        contents_future = ContentDbDatum.query(
            ancestor=crawl_key).fetch_async()
        fetched_pages = FetchedDbDatum.query(ancestor=crawl_key).fetch()
        attachments = []
        if fetched_pages:
            first_page = fetched_pages[0]
            attachments.append(
                ("fetched_content.html", first_page.fetched_content))
        links_future = LinkDbDatum.query(ancestor=crawl_key).fetch_async()
        template = env.get_template("mail_template.html")
        html = template.render(
            url=url, contents=contents_future, links=links_future)
        attachments.append(("sendmail.html", html))
        mail.send_mail(sender="*****@*****.**",
                       to=email,
                       subject=subject,
                       body="FetchResults",
                       html=html,
                       attachments=attachments)
示例#12
0
  def testSuccessfulRun(self):
    """Test extract outlinks by UDF.

    Stores a fetched mock page, runs _ExtractOutlinksPipeline over it and
    checks that the FetchedDbDatum is still stored under the CrawlDbDatum
    and that no CrawlDbDatum is left in UNFETCHED status.
    """
    resource_neg = self.getResource("cloudysunny14.html")
    static_content = resource_neg.read()
    createMockFetchedDatum("http://cloudysunny14.html", static_content, pipelines.FETCHED)
    file_name = self.createMockDataLine("http://cloudysunny14.html\n")
    p = pipelines._ExtractOutlinksPipeline("ExtractOutlinksPipeline",
        file_names=[file_name],
        parser_params={
          "text/html": __name__+"._htmlOutlinkParser"
        })
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    entities = CrawlDbDatum.query(CrawlDbDatum.url=="http://cloudysunny14.html").fetch()
    entity = entities[0]
    fetched_datums = FetchedDbDatum.query(ancestor=entity.key).fetch()
    # fetch() returns a list, so comparing it with None can never fail;
    # assert that at least one entity is actually stored.
    self.assertTrue(len(fetched_datums) > 0)
    qry = CrawlDbDatum.query(CrawlDbDatum.last_status == pipelines.UNFETCHED)
    crawl_db_datums = qry.fetch()
    # assertEqual reports the actual count when the check fails.
    self.assertEqual(0, len(crawl_db_datums))
示例#13
0
    def testSuccessfulRun(self):
        """Test extract outlinks by UDF.

        Stores a fetched mock page, runs _ExtractOutlinksPipeline over it
        and checks that the FetchedDbDatum is still stored under the
        CrawlDbDatum and that no CrawlDbDatum is left in UNFETCHED status.
        """
        resource_neg = self.getResource("cloudysunny14.html")
        static_content = resource_neg.read()
        createMockFetchedDatum("http://cloudysunny14.html", static_content,
                               pipelines.FETCHED)
        file_name = self.createMockDataLine("http://cloudysunny14.html\n")
        p = pipelines._ExtractOutlinksPipeline(
            "ExtractOutlinksPipeline",
            file_names=[file_name],
            parser_params={"text/html": __name__ + "._htmlOutlinkParser"})
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        entities = CrawlDbDatum.query(
            CrawlDbDatum.url == "http://cloudysunny14.html").fetch()
        entity = entities[0]
        fetched_datums = FetchedDbDatum.query(ancestor=entity.key).fetch()
        # fetch() returns a list, so comparing it with None can never fail;
        # assert that at least one entity is actually stored.
        self.assertTrue(len(fetched_datums) > 0)
        qry = CrawlDbDatum.query(
            CrawlDbDatum.last_status == pipelines.UNFETCHED)
        crawl_db_datums = qry.fetch()
        # assertEqual reports the actual count when the check fails.
        self.assertEqual(0, len(crawl_db_datums))
示例#14
0
 def testSuccessfulRun(self):
   """Run _FetchPagePipeline over two mock inputs and check its results.

   Verifies that the pipeline outputs blobstore file paths and that a
   FetchedDbDatum was stored under the CrawlDbDatum of the fetchable url.
   """
   createMockCrawlDbDatum(2, 2, True)
   file_name1 = self.createMockData(("http://hoge_0.com/content_0", True))
   file_name2 = self.createMockData(("http://hoge_1.com/content_0", False))
   static_content = "<html><body>TestContent</body></html>"
   self.setReturnValue(content=static_content,
                       headers={"Content-Length": len(static_content),
                                "Content-Type": "text/html"})
   p = pipelines._FetchPagePipeline("FetchPipeline", [file_name1, file_name2], 2)
   p.start()
   test_support.execute_until_empty(self.taskqueue)
   finished_map = pipelines._FetchPagePipeline.from_id(p.pipeline_id)

   # Can open files
   file_paths = finished_map.outputs.default.value
   self.assertTrue(len(file_paths) > 0)
   self.assertTrue(file_paths[0].startswith("/blobstore/"))

   entities = CrawlDbDatum.query(CrawlDbDatum.url=="http://hoge_0.com/content_0").fetch()
   entity = entities[0]
   fetched_datums = FetchedDbDatum.query(ancestor=entity.key).fetch()
   # fetch() returns a list, so an "is not None" check can never fail;
   # assert that at least one entity was actually stored.
   self.assertTrue(len(fetched_datums) > 0)
示例#15
0
def _fetchMap(binary_record):
  """Map function that fetches one url and stores the result.

  The record is parsed as a KeyValue proto whose key is the url and whose
  value says whether the url may be fetched. On a successful fetch a
  FetchedDbDatum is stored under the first CrawlDbDatum matching the url.
  Afterwards last_status and last_fetched are updated on every matching
  CrawlDbDatum.

  Args:
    binary_record: key value data, that key is url to fetch,
      value is boolean value of can be fetch.

  Yields:
    The url followed by a newline when the page was fetched and stored,
    otherwise an empty string.
  """
  proto = file_service_pb.KeyValue()
  proto.ParseFromString(binary_record)
  url = proto.key()
  could_fetch = _str2bool(proto.value())
  result = UNFETCHED
  fetched_url = ""
  fetch_date = None
  # Stays None when the query cannot be built, so the status update below
  # can be skipped instead of raising NameError.
  crawl_db_datum_future = None
  # Start the CrawlDbDatum lookup early so it overlaps with the http fetch.
  try:
    query = CrawlDbDatum.query(CrawlDbDatum.url == url)
    crawl_db_datum_future = query.fetch_async()
  except Exception as e:
    logging.warning("Failed create key, caused by invalid url:%s:%s", url, e)
    could_fetch = False

  if could_fetch:
    # Start fetch
    fetcher = fetchers.SimpleHttpFetcher(1, fetcher_policy_yaml.fetcher_policy)
    try:
      fetch_result = fetcher.get(url)
      if fetch_result:
        # Storing to datastore
        crawl_db_datums = crawl_db_datum_future.get_result()
        # NOTE(review): content_size is filled from "read_rate", the same
        # key as response_rate; confirm whether another key was intended.
        fetched_datum = FetchedDbDatum(
            parent=crawl_db_datums[0].key,
            url=url,
            fetched_url=fetch_result.get("fetched_url"),
            fetch_time=fetch_result.get("time"),
            fetched_content=fetch_result.get("content"),
            content_type=fetch_result.get("mime_type"),
            content_size=fetch_result.get("read_rate"),
            response_rate=fetch_result.get("read_rate"),
            http_headers=str(fetch_result.get("headers")))
        fetched_datum.put()
        # Update time of last fetched
        result = FETCHED
        fetch_date = datetime.datetime.now()
        fetched_url = "%s\n" % url
    except Exception as e:
      # Deliberately broad: any fetch or store failure marks the url FAILED
      # instead of aborting the map job.
      logging.warning("Fetch Page Error Occurs:%s", e)
      result = FAILED
  else:
    result = FAILED

  # Update status to all datums.
  if crawl_db_datum_future is not None:
    crawl_db_datums = crawl_db_datum_future.get_result()
    for datum in crawl_db_datums:
      datum.last_status = result
      datum.last_fetched = fetch_date
    ndb.put_multi(crawl_db_datums)

  yield fetched_url
示例#16
0
def _fetchMap(binary_record):
    """Map function that fetches one url and stores the result.

    The record is parsed as a KeyValue proto whose key is the url and whose
    value says whether the url may be fetched. On a successful fetch a
    FetchedDbDatum is stored under the first CrawlDbDatum matching the url.
    Afterwards last_status and last_fetched are updated on every matching
    CrawlDbDatum.

    Args:
        binary_record: key value data, that key is url to fetch,
            value is boolean value of can be fetch.

    Yields:
        The url followed by a newline when the page was fetched and stored,
        otherwise an empty string.
    """
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    url = proto.key()
    could_fetch = _str2bool(proto.value())
    result = UNFETCHED
    fetched_url = ""
    fetch_date = None
    # Stays None when the query cannot be built, so the status update below
    # can be skipped instead of raising NameError.
    crawl_db_datum_future = None
    # Start the CrawlDbDatum lookup early so it overlaps with the http fetch.
    try:
        query = CrawlDbDatum.query(CrawlDbDatum.url == url)
        crawl_db_datum_future = query.fetch_async()
    except Exception as e:
        logging.warning("Failed create key, caused by invalid url:%s:%s",
                        url, e)
        could_fetch = False

    if could_fetch:
        # Start fetch
        fetcher = fetchers.SimpleHttpFetcher(
            1, fetcher_policy_yaml.fetcher_policy)
        try:
            fetch_result = fetcher.get(url)
            if fetch_result:
                # Storing to datastore
                crawl_db_datums = crawl_db_datum_future.get_result()
                # NOTE(review): content_size is filled from "read_rate", the
                # same key as response_rate; confirm whether another key was
                # intended.
                fetched_datum = FetchedDbDatum(
                    parent=crawl_db_datums[0].key,
                    url=url,
                    fetched_url=fetch_result.get("fetched_url"),
                    fetch_time=fetch_result.get("time"),
                    fetched_content=fetch_result.get("content"),
                    content_type=fetch_result.get("mime_type"),
                    content_size=fetch_result.get("read_rate"),
                    response_rate=fetch_result.get("read_rate"),
                    http_headers=str(fetch_result.get("headers")))
                fetched_datum.put()
                # Update time of last fetched
                result = FETCHED
                fetch_date = datetime.datetime.now()
                fetched_url = "%s\n" % url
        except Exception as e:
            # Deliberately broad: any fetch or store failure marks the url
            # FAILED instead of aborting the map job.
            logging.warning("Fetch Page Error Occurs:%s", e)
            result = FAILED
    else:
        result = FAILED

    # Update status to all datums.
    if crawl_db_datum_future is not None:
        crawl_db_datums = crawl_db_datum_future.get_result()
        for datum in crawl_db_datums:
            datum.last_status = result
            datum.last_fetched = fetch_date
        ndb.put_multi(crawl_db_datums)

    yield fetched_url