示例#1
0
 def add_url(self, url, rank=0, method="url"):
     """Insert ``url`` into the ``seeds`` collection as a depth-0 seed.

     The raw URL is wrapped in ``Url`` and stored together with its id,
     its exported info, the given ``rank`` and ``method``, and the current
     ``self.date`` (as a one-element list). Title, description and source
     URL are stored as ``None``.

     A ``pymongo.errors.DuplicateKeyError`` raised by the insert is
     ignored (presumably the seed is already registered - confirm against
     the collection's indexes).

     Returns:
         ``self``, so calls can be chained.
     """
     seed = Url(url)
     try:
         seeds = self.db.seeds
         record = {
             "date": [self.date],
             "url": seed.url,
             "url_id": seed.url_id,
             "url_info": seed.export(),
             "title": None,
             "description": None,
             "rank": rank,
             "source_url": None,
             "depth": 0,
             "method": method,
         }
         seeds.insert_one(record)
     except pymongo.errors.DuplicateKeyError:
         # Best effort: a duplicate seed is not an error for the caller.
         pass
     return self
示例#2
0
 def extract(self, response, depth=0, filters=True):
     """Build an article record from a fetched response.

     Args:
         response: Fetched page; must expose ``text``, ``url``,
             ``headers`` and ``encoding``.
         depth: Crawl depth stored on the article and forwarded to
             ``extract_outlinks``.
         filters: When false, the page is always kept. When true, the
             page is kept if
             * the language check passes on the body text and the query
               check passes on the body text or, failing that, on the
               title; or
             * the language check fails on the body text but passes on
               the title (no query check is done in that case).

     Returns:
         dict: the article. Kept pages have ``status`` ``True`` plus
         ``html_file``, ``txt_file``, ``cited_urls``, ``cited_url_ids``,
         ``outlinks`` and ``lang``. Rejected pages have ``status``
         ``False`` and a ``status_code``: 800 (invalid url, with
         ``error``), 900 (search expression not found, with ``msg``) or
         1000 (invalid language, with ``msg`` and ``lang``).
     """
     html = response.text
     url = Url(response.url)
     # NOTE(review): if ``response`` is a requests.Response, ``encoding``
     # can be None, in which case ``.lower()`` raises - confirm with callers.
     article = {
         "url": url.url,
         "url_info": url.export(),
         "url_id": url.url_id,
         "depth": depth,
         "type": response.headers['content-type'],
         "date": self.date,
         "encoding": response.encoding.lower(),
         "status": True,
     }

     if not url.status:
         article["status"] = False
         article["error"] = "Invalid url"
         article["status_code"] = 800
         return article

     article_txt = lxml_extractor(html, url)
     article["title"] = self.extract_title(html)
     article["meta"] = self.extract_meta(html)
     article["keywords"] = self.extract_keywords(article["meta"])

     def keep_page():
         """Store the page files and record its outlinks and language."""
         article["html_file"] = self.store_file(article["url_id"], html, fmt="html")
         article["txt_file"] = self.store_file(article["url_id"], article_txt, fmt="txt")
         outlinks = self.extract_outlinks(html, url, depth)
         # Same key in every branch; one branch used to write "citeds_url".
         article["cited_urls"] = [n["url"] for n in outlinks]
         article["cited_url_ids"] = [n["url_id"] for n in outlinks]
         article["outlinks"] = outlinks
         article["lang"] = self.page_lang
         return article

     if not filters:
         # Result is ignored; the call is kept because ``self.page_lang``
         # is read right after (presumably set by check_lang - confirm).
         self.check_lang(article_txt)
         return keep_page()

     if self.check_lang(article_txt):
         if self.check_query(article_txt):
             return keep_page()
         if self.check_query(article["title"]):
             keep_page()
             # Only the title-match branch runs extract_page, as before.
             page = self.extract_page(article, article_txt, html)
             # Re-apply the language on whatever extract_page returned.
             page["lang"] = self.page_lang
             return page
         article["status"] = False
         article["status_code"] = 900
         article["msg"] = "Search expression not found"
         return article

     # Body text failed the language check: fall back to the title.
     if self.check_lang(article["title"]):
         return keep_page()

     article["status"] = False
     article["status_code"] = 1000
     article["msg"] = "Lang is invalid"
     article["lang"] = self.page_lang
     return article