def add_url(self, url, rank=0, method="url"):
    """Insert *url* as a seed document into ``self.db.seeds``.

    The raw ``url`` is wrapped in ``Url`` and stored together with its
    id and exported info. ``title``, ``description`` and ``source_url``
    are stored as ``None`` and ``depth`` as ``0``.

    Args:
        url: Raw URL, passed to ``Url``.
        rank: Value stored under ``"rank"``.
        method: Value stored under ``"method"``.

    Returns:
        ``self``, so calls can be chained.
    """
    parsed = Url(url)
    seed = {
        # "date" is stored as a one-element list, not a scalar.
        "date": [self.date],
        "url": parsed.url,
        "url_id" : parsed.url_id,
        "url_info": parsed.export(),
        "title": None,
        "description": None,
        "rank": rank,
        "source_url": None,
        "depth": 0,
        "method": method,
    }
    try:
        self.db.seeds.insert_one(seed)
    except pymongo.errors.DuplicateKeyError:
        # A duplicate-key rejection is ignored on purpose: the insert is
        # simply skipped and the call still succeeds.
        pass
    return self
def extract(self, response, depth=0, filters=True):
    """Build an article dict from a fetched HTTP response.

    The response must expose ``text``, ``url``, ``headers`` and
    ``encoding`` (presumably a ``requests.Response`` -- TODO confirm).

    With ``filters`` enabled the page is kept when:
      * ``check_lang`` accepts the extracted text and ``check_query``
        accepts either the text or, failing that, the title (in the
        title case the article is also passed through
        ``extract_page``); or
      * ``check_lang`` rejects the text but accepts the title.
        NOTE(review): this path does not run ``check_query`` at all --
        confirm that is intended.

    With ``filters`` disabled the page is always kept; ``check_lang``
    is still called but its result is ignored.

    Args:
        response: Fetched page.
        depth: Crawl depth, stored under ``"depth"`` and forwarded to
            ``extract_outlinks``.
        filters: Whether to apply the language/query checks.

    Returns:
        dict: The article. ``"status"`` is ``True`` when the page was
        kept. Otherwise it is ``False`` and ``"status_code"`` is one of
        800 (invalid url, text under ``"error"``), 900 (search
        expression not found, text under ``"msg"``) or 1000 (language
        rejected, text under ``"msg"``).

    Raises:
        KeyError: If ``response.headers`` has no ``'content-type'``
            entry.
    """
    html = response.text
    url = Url(response.url)
    # The encoding may be None when the server declares no charset;
    # calling .lower() on it unconditionally would raise AttributeError.
    encoding = response.encoding
    article = {
        "url": url.url,
        "url_info": url.export(),
        "url_id": url.url_id,
        "depth": depth,
        "type": response.headers['content-type'],
        "date": self.date,
        "encoding": encoding.lower() if encoding else None,
        "status": True,
    }

    if not url.status:
        article["status"] = False
        article["error"] = "Invalid url"
        article["status_code"] = 800
        return article

    article_txt = lxml_extractor(html, url)
    article["title"] = self.extract_title(html)
    article["meta"] = self.extract_meta(html)
    article["keywords"] = self.extract_keywords(article["meta"])

    def _attach_content(target):
        """Store the html/txt files and record outlinks and language."""
        target["html_file"] = self.store_file(target["url_id"], html, fmt="html")
        target["txt_file"] = self.store_file(target["url_id"], article_txt, fmt="txt")
        outlinks = self.extract_outlinks(html, url, depth)
        # Every accepted article uses the key "cited_urls"; one branch
        # used to write the misspelled "citeds_url" instead.
        target["cited_urls"] = [n["url"] for n in outlinks]
        target["cited_url_ids"] = [n["url_id"] for n in outlinks]
        target["outlinks"] = outlinks
        # presumably check_lang() sets self.page_lang -- TODO confirm
        target["lang"] = self.page_lang

    if not filters:
        # Called for its side effects only; the verdict is ignored.
        self.check_lang(article_txt)
        _attach_content(article)
        return article

    if self.check_lang(article_txt):
        if self.check_query(article_txt):
            _attach_content(article)
            return article
        if self.check_query(article["title"]):
            _attach_content(article)
            article = self.extract_page(article, article_txt, html)
            # Set again because extract_page may hand back another dict.
            article["lang"] = self.page_lang
            return article
        article["status"] = False
        article["status_code"] = 900
        article["msg"] = "Search expression not found"
        return article

    # Text language rejected: fall back to the title's language.
    if self.check_lang(article["title"]):
        _attach_content(article)
        return article

    article["status"] = False
    article["status_code"] = 1000
    article["msg"] = "Lang is invalid"
    article["lang"] = self.page_lang
    return article