from database import DBObject
# NOTE: d_range is a project helper defined elsewhere in this repo; it expands
# a date-range string into (month, year) pairs used by the date filters below.


def searchPostHtml(request: dict):
    # Per-site URL patterns for each supported post type.
    site_type_re = {
        "nha.chotot.com": {
            "land": r"^.*/mua-ban-dat/.*$",
            "house": r"^.*/mua-ban-nha-dat/.*$",
            "apartment": r"^.*/mua-ban-can-ho-chung-cu/.*$"
        },
        "nhadat247.com.vn": {
            "land": r"^.*nhadat247\.com\.vn/ban-dat.*$",
            "apartment": r"^.*nhadat247\.com\.vn/ban-can-ho-chung-cu.*$",
            "house": r"^.*nhadat247\.com\.vn/ban-nha.*$"
        },
        "batdongsan.com.vn": {
            "land": r"^.*batdongsan\.com\.vn/ban-dat.*$",
            "apartment": r"^.*batdongsan\.com\.vn/ban-can-ho-chung-cu.*$",
            "house": r"^.*batdongsan\.com\.vn/ban-nha.*$"
        }
    }
    try:
        db = DBObject()
        _site = request.get("site")
        _crawl_date = request.get("crawl_date")
        _post_date = request.get("post_date")
        _type = request.get("type", "all")
        _limit = int(request["limit"]) if request.get("limit") else 0

        list_filter = []
        if _site in site_type_re:
            list_filter.append({"url": {"$regex": "^https://%s/.*$" % _site}})
            if _type in site_type_re[_site]:
                list_filter.append(
                    {"url": {"$regex": site_type_re[_site][_type]}})
            else:
                # "all" (or an unknown type): match any of the site's types.
                list_filter.append({
                    "$or": [{"url": {"$regex": site_type_re[_site][_t]}}
                            for _t in site_type_re[_site]]
                })

        # Crawl-date filter: any day ("[0-9]{2}") within each (month, year)
        # pair returned by d_range.
        _d_range = d_range(_crawl_date)
        if len(_d_range) > 0:
            list_filter.append({
                "$or": [{"date": {"$regex": "^[0-9]{2}/%s/%s$" % (m, y)}}
                        for m, y in _d_range]
            })

        # Same filter applied to the post date.
        _d_range = d_range(_post_date)
        if len(_d_range) > 0:
            list_filter.append({
                "$or": [{"post_date": {"$regex": "^[0-9]{2}/%s/%s$" % (m, y)}}
                        for m, y in _d_range]
            })

        # Guard against an empty $and, which MongoDB rejects.
        query = {"$and": list_filter} if list_filter else {}
        query_return = []
        for post in db.query_html_db(query_dict=query, limit=_limit):
            # Replace the raw HTML with a placeholder and drop the Mongo id
            # so the response stays small and JSON-serializable.
            post.pop("_id", None)
            post["html"] = "content is eliminated"
            query_return.append(post)
        return {"code": 200, "message": "successful", "content": query_return}
    except Exception:
        # traceback.print_exc()
        return {"code": 404, "message": "failed", "content": []}
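
# Usage sketch (assumption: searchPostHtml is called by the API layer with a
# plain dict of query parameters; the crawl_date value below only illustrates
# a range string for d_range, whose exact input format is defined elsewhere).
if __name__ == "__main__":
    demo_request = {
        "site": "batdongsan.com.vn",
        "type": "apartment",            # land / house / apartment, or "all"
        "crawl_date": "01/2021-03/2021",
        "limit": "50",
    }
    result = searchPostHtml(demo_request)
    print(result["code"], result["message"], len(result["content"]))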
import re
import traceback
from itertools import chain
from time import time
from datetime import date

import pandas as pd

from database import DBObject

db = DBObject()


def strip_text(text):
    # Collapse tabs and newlines, then trim surrounding whitespace.
    return text.replace("\t", "").replace("\n", "").strip()


def stringify_children(node):
    # Recursively concatenate a node's text, its children's text (appending a
    # newline after each <div>), and its tail; None parts are filtered out.
    parts = ([node.text] +
             list(chain(*((stringify_children(c) +
                           ("\n" if str(c.tag) == "div" else ""))
                          for c in node.getchildren()))) +
             [node.tail])
    return ''.join(filter(None, parts))


def clean_trash(html):
    # Assumed minimal implementation (the original body is not shown in this
    # excerpt): strip <script>/<style> blocks and HTML comments from raw HTML.
    html = re.sub(r"<(script|style)[\s\S]*?</\1\s*>", "", html)
    return re.sub(r"<!--[\s\S]*?-->", "", html)
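
# Usage sketch for stringify_children/strip_text. Assumption: nodes come from
# lxml (which provides the .text, .tail, .tag and .getchildren() members used
# above) and lxml is installed; the HTML fragment is illustrative only.
if __name__ == "__main__":
    import lxml.html
    node = lxml.html.fromstring(
        "<div>Gia ban: 2 ty <div>Dien tich: 80 m2</div> lien he ngay</div>")
    print(strip_text(stringify_children(node)))
    # -> Gia ban: 2 ty Dien tich: 80 m2 lien he ngay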
import time
import hashlib
from datetime import datetime, date

import pandas as pd

from ParserObject import ParserObject
from ParserModelSelector import ParserModelSelector
from LibFunc import clean_trash
from database import DBObject
from Settings import Settings

# =============================================================================
# =============================================================================

database = DBObject()


def parse(posts_data,
          site=None,
          type=None,
          num=None,
          many: bool = False,
          model_name=None,
          resume=False):
    print("Go to Parsing Data")
    the_status = "parsing"
    __failed_urls = []
    __saved_post = []
    # Use the start timestamp as the task id.
    task_id = int(time.time())
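
# Usage sketch (assumption): parse() is normally launched by worker.py; the
# call below fetches a few raw posts through the same query_html_db API used
# by the search endpoint and hands them to the parser. The shape of
# posts_data (a list of raw post documents) is an assumption.
if __name__ == "__main__":
    sample_posts = list(database.query_html_db(
        query_dict={"url": {"$regex": r"^https://nha\.chotot\.com/.*$"}},
        limit=5))
    parse(sample_posts, site="nha.chotot.com", type="apartment",
          model_name="auto", many=True)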
def __init__(self,
             date_from=None,
             date_to=None,
             post_type=None,
             all_date: bool = False,
             resume=False,
             limit=-1):
    self.limit = int(limit)
    self.db_object = DBObject()
    the_status = "crawling"
    worker_info = self.db_object.query_wokers_info(Settings.worker_id)
    self.resume = resume
    if self.resume:
        # Resume a paused crawling task: restore its parameters from the
        # worker info saved in the database.
        try:
            info_ = worker_info
            status_ = info_["status"]
            task_id = info_["task_id"]
            info_str_ = info_["str_info"]
            if not ("(pause)" in status_ and "crawling" in status_):
                print(">>", status_)
                return
            info_dict_ = {
                _i_.split(": ")[0]: _i_.split(": ")[1]
                for _i_ in info_str_.lower().split(", ")
            }
            if info_dict_["site"] != "nhadat247.com.vn":
                return
            date_from = info_dict_["date"].split("-")[0]
            date_to = info_dict_["date"].split("-")[1]
            try:
                self.limit = int(info_dict_["limit"])
            except ValueError:
                self.limit = -1
            post_type = info_dict_["type"]
            the_status = status_.replace("(pause)", "")
            print("Internal loading data to resume")
        except Exception:
            traceback.print_exc()
            return

    self.__str_info = ("Site: nhadat247.com.vn, Type: %s, Date: %s-%s, "
                       "Limit: %s, ") % (
                           post_type, date_from, date_to,
                           str(self.limit) if isinstance(self.limit, int)
                           and self.limit > 0 else "No")
    self.__str_info += "Numpost: %d, Error: %d"
    self.post_type = post_type
    self.buffer = []
    self.seed_url = NhaDat247.get_seed_url(post_type)
    self.__current_url = ""
    self.__failed_urls = []
    self.__saved_post = []
    self.file_log_visited_url = ("visited_post_log_nhadat247_%s.txt" %
                                 self.post_type)
    self.file_log_new_url = "local_urls_log_nhadat247_%s.txt" % self.post_type
    # URL shapes: listing pages vs. single post pages (...pr<id>.html).
    self.regex_sub_url = re.compile(
        r"([a-z][-a-z]*)?ban-[-a-z]+((\.html)|(/[0-9]+))?")
    self.regex_post = re.compile(
        r"([a-z][-a-z]*)?ban-[-a-z0-9]+/[-a-z0-9]+pr[0-9]+\.html")
    self.key_type = NhaDat247.get_key_from_type(self.post_type)
    try:
        # date_from/date_to are "mm/yyyy"; extend date_to to the last day of
        # its month (calendar.monthrange returns (weekday, number_of_days)).
        last_day_to = calendar.monthrange(int(date_to.split("/")[1]),
                                          int(date_to.split("/")[0]))[1]
        self.post_date_range = {
            "from": datetime.strptime("1/" + date_from, '%d/%m/%Y').date(),
            "to": datetime.strptime(str(last_day_to) + "/" + date_to,
                                    '%d/%m/%Y').date()
        }
        print("-" * 200, "\n", self.post_date_range)
    except Exception:
        traceback.print_exc()
        self.post_date_range = None

    self.browser = Browser(headless=False)
    if not self.resume:
        task_id = int(time.time())
    self.__crawling_info = {
        "task_id": task_id,
        "status": the_status,
        "str_info": ""
    }
    self.__crawling_log = {
        "worker_id": Settings.worker_id,
        "task_id": task_id,
        "task_info": self.__str_info % (0, 0),
        "saved_posts": [],
        "error_posts": []
    }
    if not self.resume:
        print("Create log")
        self.db_object.create_wokers_log(self.__crawling_log)
        self.update_crawling_status_info(0, 0)
    else:
        log = self.db_object.query_wokers_logs(Settings.worker_id, task_id)
        print("Get log: ", log if log else "null")
        if log is not None:
            self.__saved_post = log["saved_posts"]
            self.__failed_urls = log["error_posts"]
    print("Init crawler")
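
# Usage sketch (assumption: this __init__ belongs to the NhaDat247 crawler
# class referenced via NhaDat247.get_seed_url above; dates use the mm/yyyy
# form that __init__ parses, and the start method name is hypothetical).
if __name__ == "__main__":
    crawler = NhaDat247(date_from="01/2021", date_to="03/2021",
                        post_type="house", limit=100)
    # crawler.run()  # hypothetical entry point; not shown in this excerpt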
def callback(ch, method, properties, body):
    command = "nothing"
    try:
        body = body.decode('ascii')
        message = message_loads(body)
        command = message["command"]
        if command == "crawl":
            # Only start a new worker process if none is already running
            # (data.lock holds the pid of the current worker).
            pid = int(open("data.lock", "r").read())
            if not psutil.pid_exists(pid):
                Popen(['python', 'worker.py', body])
            else:
                command = "is running"
        elif command == "parse":
            pid = int(open("data.lock", "r").read())
            if not psutil.pid_exists(pid):
                # Hand the post list to the worker through a data file.
                file = open("parse_posts.data", "w")
                file.write(message["posts"])
                file.close()
                model = message.get("model", "auto")
                type = message.get("type", "all")
                site = message.get("site", "all")
                Popen([
                    'python', 'worker.py',
                    "command:parse site:%s type:%s model:%s" %
                    (site, type, model)
                ])
            else:
                command = "is running"
        elif command == "stop":
            # Cancel the task in the DB, then kill the worker and the
            # Chrome/chromedriver processes it spawned.
            db = DBObject()
            db.cancel_task(Settings.worker_id)
            try:
                pid = int(open("data.lock", "r").read())
                os.kill(pid, signal.SIGTERM)
            except OSError:
                pass
            subprocess.call("TASKKILL /f /IM CHROMEDRIVER.EXE")
            subprocess.call("TASKKILL /f /IM CHROME.EXE")
        elif command == "pause":
            db = DBObject()
            pid = int(open("data.lock", "r").read())
            _working, _as = db.workAs(Settings.worker_id)
            if _working:
                # A task is running: pause it and stop its processes.
                db.pause_task(Settings.worker_id)
                try:
                    os.kill(pid, signal.SIGTERM)
                except OSError:
                    pass
                subprocess.call("TASKKILL /f /IM CHROME.EXE")
                subprocess.call("TASKKILL /f /IM CHROMEDRIVER.EXE")
            else:
                # No task running: "pause" acts as resume for the paused task.
                if not psutil.pid_exists(pid):
                    Popen(['python', 'worker.py',
                           "command:%s resume:1" % _as])
                else:
                    command = "is running"
        elif command == "shield":
            # Toggle: an explicit shield=1 enables it; otherwise flip state.
            shield_on = (("shield" in message and int(message["shield"]) == 1)
                         or not Settings.isShieldEnable())
            Settings.enableShield(shield_on)
        else:
            command = "nothing"
    except Exception:
        traceback.print_exc()
    print(" [x] Received \n -> Do %s" % command)
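
# Wiring sketch (assumption): callback() matches pika's on_message_callback
# signature (ch, method, properties, body), so it can be attached to a
# blocking consumer like this; the queue name and host are illustrative,
# not taken from the source.
if __name__ == "__main__":
    import pika
    connection = pika.BlockingConnection(
        pika.ConnectionParameters("localhost"))
    channel = connection.channel()
    channel.queue_declare(queue="worker_commands")
    channel.basic_consume(queue="worker_commands",
                          on_message_callback=callback,
                          auto_ack=True)
    print(" [*] Waiting for commands. To exit press CTRL+C")
    channel.start_consuming()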