def mine(self, text, time_created, link_url):
    """Extract terms from one post and forward them to the parent category.

    The post is built with ``RssMiner.dict_of_post``, wrapped into a
    single-element batch with ``RssMiner.package_batch_to_json`` and handed
    to ``self.send_to_parent``. After a successful send, the SHA-1 hex
    digest of ``text`` is appended to ``self.mined_posts_hashes``.

    Args:
        text: Post text passed to ``extract.extract_terms``.
        time_created: Object with ``strftime``; formatted as YYYYMMDDHHMM.
        link_url: URL passed through to ``RssMiner.dict_of_post``.

    Any exception is caught and printed; nothing is re-raised, and the
    digest is not recorded when a step fails.
    """
    # Function-scope import: the rest of this module's imports are not
    # visible from here, so this keeps the block self-contained.
    import hashlib

    try:
        terms_dict = extract.extract_terms(text)
        now = datetime.now().strftime('%Y%m%d%H%M')
        created = time_created.strftime('%Y%m%d%H%M')
        post = RssMiner.dict_of_post(link_url, terms_dict, created, now)
        batch = RssMiner.package_batch_to_json(self.category.id, [post])
        self.send_to_parent(self.category.parent_id, batch)
        # The original appended the builtin ``hash`` function itself, which
        # is useless for recognising a post later. Store a content digest.
        # NOTE(review): SHA-1 over the UTF-8 text mirrors the hashing used
        # by ``run``; confirm that whatever reads ``mined_posts_hashes`` for
        # RSS posts expects this form.
        raw = text if isinstance(text, bytes) else text.encode('utf-8')
        self.mined_posts_hashes.append(hashlib.sha1(raw).hexdigest())
    except Exception as e:
        # ``e.message`` is deprecated (PEP 352) and missing on Python 3;
        # fall back to the exception itself. The single-argument print form
        # produces the same output on Python 2 and Python 3.
        print("%s %s" % (getattr(e, "message", e), e.args))
def run(self):
    """Download every configured URL and forward unseen pages to the parent.

    ``self.category.urls`` is split on commas. For each URL the page is
    fetched with ``self.download_page``, which must return
    ``(visible_text, last_modified)``. A page whose SHA-1 hex digest is
    already in ``self.mined_posts_hashes`` is skipped with a message.
    Otherwise its terms are extracted, packaged with
    ``WebsiteMiner.package_batch_to_json`` as a single-element batch, sent
    via ``self.send_to_parent``, and the digest is recorded.

    A failure on one URL is printed and does not stop the remaining URLs.
    """
    # NOTE(review): an identically named ``run`` is defined twice in the
    # visible source; confirm whether one of the copies should be removed.
    self.log("Starting mining.")
    for url in self.category.urls.split(','):
        try:
            visible_text, last_modified = self.download_page(url)
            # Compare hex digests, not hash objects: ``hashlib.sha1(...)``
            # returns a fresh object that never equals a stored one, so the
            # original membership test could never find a match.
            text_hash = hashlib.sha1(visible_text.encode('utf-8')).hexdigest()
            if text_hash in self.mined_posts_hashes:
                print("Post already mined.")
                continue

            terms_dict = extract.extract_terms(visible_text)
            now = datetime.now().strftime('%Y%m%d%H%M')
            # Named ``modified`` rather than ``time`` so the stdlib module
            # name is not shadowed.
            modified = last_modified.strftime('%Y%m%d%H%M')
            post = WebsiteMiner.dict_of_post(url, terms_dict, modified, now)
            batch = WebsiteMiner.package_batch_to_json(self.category.id, [post])
            self.send_to_parent(self.category.parent_id, batch)
            # Record the digest only after a successful send. The original
            # appended the builtin ``hash`` function instead.
            self.mined_posts_hashes.append(text_hash)
        except Exception as e:
            # ``e.message`` is deprecated (PEP 352) and missing on Python 3;
            # fall back to the exception itself. The single-argument print
            # form produces the same output on Python 2 and Python 3.
            print("%s %s" % (getattr(e, "message", e), e.args))
def run(self):
    """Download every configured URL and forward unseen pages to the parent.

    ``self.category.urls`` is split on commas. For each URL the page is
    fetched with ``self.download_page``, which must return
    ``(visible_text, last_modified)``. A page whose SHA-1 hex digest is
    already in ``self.mined_posts_hashes`` is skipped with a message.
    Otherwise its terms are extracted, packaged with
    ``WebsiteMiner.package_batch_to_json`` as a single-element batch, sent
    via ``self.send_to_parent``, and the digest is recorded.

    A failure on one URL is printed and does not stop the remaining URLs.
    """
    # NOTE(review): an identically named ``run`` is defined twice in the
    # visible source; confirm whether one of the copies should be removed.
    self.log("Starting mining.")
    for url in self.category.urls.split(','):
        try:
            visible_text, last_modified = self.download_page(url)
            # Compare hex digests, not hash objects: ``hashlib.sha1(...)``
            # returns a fresh object that never equals a stored one, so the
            # original membership test could never find a match.
            text_hash = hashlib.sha1(visible_text.encode('utf-8')).hexdigest()
            if text_hash in self.mined_posts_hashes:
                print("Post already mined.")
                continue

            terms_dict = extract.extract_terms(visible_text)
            now = datetime.now().strftime('%Y%m%d%H%M')
            # Named ``modified`` rather than ``time`` so the stdlib module
            # name is not shadowed.
            modified = last_modified.strftime('%Y%m%d%H%M')
            post = WebsiteMiner.dict_of_post(url, terms_dict, modified, now)
            batch = WebsiteMiner.package_batch_to_json(
                self.category.id, [post])
            self.send_to_parent(self.category.parent_id, batch)
            # Record the digest only after a successful send. The original
            # appended the builtin ``hash`` function instead.
            self.mined_posts_hashes.append(text_hash)
        except Exception as e:
            # ``e.message`` is deprecated (PEP 352) and missing on Python 3;
            # fall back to the exception itself. The single-argument print
            # form produces the same output on Python 2 and Python 3.
            print("%s %s" % (getattr(e, "message", e), e.args))