def __init__(self):
    DBObject.__init__(self)

    self.server = None
    self.localization = hlib.i18n.Localization(languages = ['cz'])
    self.trumpet = hlib.database.SimpleMapping()
    self.users = hlib.database.StringMapping()
def __init__(self, name, password, email):
    DBObject.__init__(self)

    self.name = unicode(name)
    self.password = unicode(password)
    self.admin = False
    self.date_format = '%d/%m/%Y %H:%M:%S'
    self.email = unicode(email)
    self.maintenance_access = False

    self.cookies = hlib.database.SimpleMapping()
    self.events = hlib.database.IndexedMapping()
    self.api_tokens = hlib.database.SimpleList()
def __getattr__(self, name):
    if name == 'is_admin':
        return self.admin == True

    if name == 'is_online':
        return self.name in hruntime.app.sessions.online_users

    return DBObject.__getattr__(self, name)
def searchPostHtml(request: dict):
    # print(request)
    # URL patterns per site and post type, used to build the MongoDB-style filter.
    site_type_re = {
        "nha.chotot.com": {
            "land": r"^.*/mua-ban-dat/.*$",
            "house": r"^.*/mua-ban-nha-dat/.*$",
            "apartment": r"^.*/mua-ban-can-ho-chung-cu/.*$"
        },
        "nhadat247.com.vn": {
            "land": r"^.*nhadat247.com.vn/ban-dat.*$",
            "apartment": r"^.*nhadat247.com.vn/ban-can-ho-chung-cu.*$",
            "house": r"^.*nhadat247.com.vn/ban-nha.*$"
        },
        "batdongsan.com.vn": {
            "land": r"^.*batdongsan.com.vn/ban-dat.*$",
            "apartment": r"^.*batdongsan.com.vn/ban-can-ho-chung-cu.*$",
            "house": r"^.*batdongsan.com.vn/ban-nha.*$"
        }
    }
    try:
        db = DBObject()
        _site = request["site"] if "site" in request else None
        _crawl_date = request["crawl_date"] if "crawl_date" in request else None
        _post_date = request["post_date"] if "post_date" in request else None
        _type = request["type"] if "type" in request else "all"
        _limit = int(request["limit"]) if ("limit" in request
                                           and len(request["limit"]) > 0) else 0

        list_filter = []
        if _site in site_type_re:
            list_filter.append({"url": {"$regex": "^https://%s/.*$" % (_site)}})
            if _type in site_type_re[_site]:
                list_filter.append({"url": {"$regex": site_type_re[_site][_type]}})
            else:
                list_filter.append({
                    "$or": [{"url": {"$regex": site_type_re[_site][_t]}}
                            for _t in site_type_re[_site]]
                })

        # Filter on the crawl date ("date" field, stored as dd/mm/yyyy).
        _d_range = d_range(_crawl_date)
        if len(_d_range) > 0:
            list_filter.append({
                "$or": [{"date": {"$regex": "^[0-9]{2}/%s/%s$" % (m, y)}}
                        for m, y in _d_range]
            })

        # Filter on the post date ("post_date" field, stored as dd/mm/yyyy).
        _d_range = d_range(_post_date)
        if len(_d_range) > 0:
            list_filter.append({
                "$or": [{"post_date": {"$regex": "^[0-9]{2}/%s/%s$" % (m, y)}}
                        for m, y in _d_range]
            })

        query_return = []
        for post in db.query_html_db(query_dict={"$and": list_filter}, limit=_limit):
            # Strip the heavy/internal fields before returning the document.
            post.pop("html")
            post.pop("_id")
            post["html"] = "content is eliminated"
            query_return.append(post)
        # print(query_return[0])
        return {"code": 200, "message": "successful", "content": query_return}
    except Exception:
        # traceback.print_exc()
        return {"code": 404, "message": "failed", "content": []}
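# Hypothetical usage sketch for searchPostHtml. The request keys (site, type,
# crawl_date, post_date, limit) come from the lookups above; the "mm/yyyy-mm/yyyy"
# date-range format is an assumption, since the d_range() helper is not shown here.
request = {
    "site": "nha.chotot.com",         # one of the keys in site_type_re
    "type": "apartment",              # "land", "house", "apartment", or anything else for all
    "crawl_date": "01/2021-03/2021",  # assumed month-range format consumed by d_range()
    "limit": "50",                    # kept as a string, as an HTTP query parameter would be
}
response = searchPostHtml(request)
print(response["code"], len(response["content"]))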
import re
import traceback
from itertools import chain
from time import time
from datetime import date

import pandas as pd

from database import DBObject

db = DBObject()


def strip_text(text):
    return text.replace("\t", "").replace("\n", "").strip()


def stringify_children(node):
    # print(str(node.tag))
    # Concatenate the node's text, its children's text (adding a newline after
    # each <div>) and its tail into a single string.
    parts = ([node.text] + list(
        chain(*((stringify_children(c) + ("\n" if str(c.tag) == "div" else ""))
                for c in node.getchildren()))) + [node.tail])
    return ''.join(filter(None, parts))


def clean_trash(html):
class NhaDat247(CrawlerObject):
    BASE_URL = "https://nhadat247.com.vn/"
    SAVE_CHECK_POINT = 5

    def __init__(self,
                 date_from=None,
                 date_to=None,
                 post_type=None,
                 all_date: bool = False,
                 resume=False,
                 limit=-1):
        self.limit = int(limit)
        self.db_object = DBObject()
        the_status = "crawling"
        worker_info = self.db_object.query_wokers_info(Settings.worker_id)
        self.resume = resume
        if self.resume:
            # Restore the previous crawl parameters from the worker info record.
            try:
                info_ = worker_info
                status_ = info_["status"]
                task_id = info_["task_id"]
                info_str_ = info_["str_info"]
                if not ("(pause)" in status_ and "crawling" in status_):
                    print(">>", status_)
                    return
                info_dict_ = {
                    _i_.split(": ")[0]: _i_.split(": ")[1]
                    for _i_ in info_str_.lower().split(", ")
                }
                if info_dict_["site"] != "nhadat247.com.vn":
                    return
                date_from = info_dict_["date"].split("-")[0]
                date_to = info_dict_["date"].split("-")[1]
                try:
                    self.limit = int(info_dict_["limit"])
                except Exception:
                    self.limit = -1
                post_type = info_dict_["type"]
                the_status = status_.replace("(pause)", "")
                print("Internal loading data to resume")
            except Exception:
                traceback.print_exc()
                return

        self.__str_info = "Site: nhadat247.com.vn, Type: %s, Date: %s-%s, Limit: %s, " % (
            post_type, date_from, date_to,
            str(self.limit) if isinstance(self.limit, int) and self.limit > 0 else "No")
        self.__str_info += "Numpost: %d, Error: %d"

        self.post_type = post_type
        self.buffer = []
        self.seed_url = NhaDat247.get_seed_url(post_type)
        self.__current_url = ""
        self.__failed_urls = []
        self.__saved_post = []
        self.file_log_visited_url = "visited_post_log_nhadat247_%s.txt" % (self.post_type)
        self.file_log_new_url = "local_urls_log_nhadat247_%s.txt" % (self.post_type)
        self.regex_sub_url = re.compile(
            "([a-z][-a-z]*)?ban-[-a-z]+((.html)|(/[0-9]+))?")
        self.regex_post = re.compile(
            "([a-z][-a-z]*)?ban-[-a-z0-9]+/[-a-z0-9]+pr[0-9]+.html")
        self.key_type = NhaDat247.get_key_from_type(self.post_type)

        try:
            # date_from / date_to are "mm/yyyy" strings; expand them to the first
            # and last day of the respective months.
            last_day_to = calendar.monthrange(int(date_to.split("/")[1]),
                                              int(date_to.split("/")[0]))[1]
            self.post_date_range = {
                "from": datetime.strptime("1/" + date_from, '%d/%m/%Y').date(),
                "to": datetime.strptime(str(last_day_to) + "/" + date_to,
                                        '%d/%m/%Y').date()
            }
            print("-" * 200, "\n", self.post_date_range)
        except Exception:
            traceback.print_exc()
            self.post_date_range = None

        self.browser = Browser(headless=False)
        if not self.resume:
            task_id = int(time.time())
        self.__crawling_info = {
            "task_id": task_id,
            "status": the_status,
            "str_info": ""
        }
        self.__crawling_log = {
            "worker_id": Settings.worker_id,
            "task_id": task_id,
            "task_info": self.__str_info % (0, 0),
            "saved_posts": [],
            "error_posts": []
        }
        if not self.resume:
            print("Create log")
            self.db_object.create_wokers_log(self.__crawling_log)
            self.update_crawling_status_info(0, 0)
        else:
            log = self.db_object.query_wokers_logs(Settings.worker_id, task_id)
            print("Get log: ", log if log else "null")
            if log is not None:
                self.__saved_post = log["saved_posts"]
                self.__failed_urls = log["error_posts"]
        print("Init crawler")

    def update_crawling_status_info(self, num_post, num_error):
        self.__crawling_info["str_info"] = self.__str_info % (num_post, num_error)
        self.db_object.update_wokers_info(Settings.worker_id, self.__crawling_info)

    def update_crawling_log(self):
        self.db_object.update_wokers_log(Settings.worker_id,
                                         self.__crawling_log["task_id"],
                                         self.__saved_post, self.__failed_urls)

    def get_html_and_soup_from_url(self, url):
        """Return (html, soup) for url, or (None, None) after repeated failures."""
        _soup = None
        _html = None
        for i in range(5):
            try:
                element_present = EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "body > div.footer"))
                _html = self.browser.get_html(url, until_ec=element_present)
                _soup = BeautifulSoup(_html, 'html.parser')
                if _soup is not None:
                    return _html, _soup
            except Exception:
                traceback.print_exc()
                continue
        self.__failed_urls.append(self.__current_url)
        return None, None

    @staticmethod
    def get_key_from_type(key) -> list:
        if key == "land":
            return ["ban-dat"]
        elif key == "apartment":
            return ["ban-can-ho-chung-cu"]
        elif key == "house":
            return ["ban-nha-mat-pho", "ban-nha-biet-thu", "ban-nha-rieng"]
        return [
            "ban-dat", "ban-can-ho-chung-cu", "ban-nha-rieng",
            "ban-nha-mat-pho", "ban-nha-biet-thu"
        ]

    def check_type(self, url) -> bool:
        for key in self.key_type:
            if key in url:
                return True
        return False

    def append_data(self, _url, _type, _status, _crawl_date, _post_date, _html):
        post = {}
        url_hash = hashlib.md5(_url.encode()).hexdigest()
        post["url_hash"] = url_hash
        post["url"] = _url
        post["type"] = _type
        post["status"] = _status
        post["html"] = _html
        post["date"] = _crawl_date
        post["post_date"] = _post_date
        self.__saved_post.append(url_hash)
        self.buffer.append(post)
        # post["html"] = "<html>"
        # print("-"*10,"\n",post)

    def load_init_url(self) -> tuple:
        local_urls = self.seed_url
        visited_post = []
        if self.resume:
            try:
                local_urls = list(open(self.file_log_new_url, "r").readlines())
            except Exception:
                pass
            try:
                visited_post = list(open(self.file_log_visited_url, "r").readlines())
            except Exception:
                pass
        return local_urls, visited_post

    def get_date(self, page_soup: BeautifulSoup) -> date:
        post_date = None
        try:
            str_date = page_soup.select_one(
                "#ContentPlaceHolder1_ProductDetail1_divprice > div"
            ).get_text().split("|")[1]
            str_date = slugify(str_date.strip().lower())
            if "hom-kia" in str_date:
                post_date = date.today() - timedelta(days=2)
            elif "hom-qua" in str_date:
                post_date = date.today() - timedelta(days=1)
            elif "hom-nay" in str_date:
                post_date = date.today()
            else:
                post_date = datetime.strptime(str_date, '%d-%m-%Y').date()
        except Exception:
            self.__failed_urls.append(self.__current_url)
            traceback.print_exc()
        return post_date

    def visit(self, current_url) -> tuple:
        local_urls = []
        post_date = None
        page_source, page_soup = self.get_html_and_soup_from_url(current_url)
        if page_soup:
            is_post = re.search(self.regex_post, current_url)
            if is_post:
                print("Is a post")
                post_date = self.get_date(page_soup)
                if not self.post_date_range or \
                        (isinstance(post_date, date) and
                         (self.post_date_range["from"] <= post_date <= self.post_date_range["to"])):
                    post_date = post_date.strftime('%d/%m/%Y')
                else:
                    page_source = None
            else:
                page_source = None

            # Collect further URLs of the right type from the page.
            list_href = page_soup.find_all('a')
            for link in list_href:
                anchor = str(link.get('href'))
                if not bool(urlparse(anchor).netloc):
                    anchor = urljoin(self.BASE_URL, anchor)
                if validators.url(anchor) and self.check_type(anchor) and (
                        self.regex_post.search(anchor)
                        or self.regex_sub_url.search(anchor)):
                    local_urls.append(anchor)
        print("<html>" if page_source else "None")
        return page_source, post_date, local_urls

    def obtain_data(self):
        print("START...")
        num_visited = 0
        local_urls, visited_post = self.load_init_url()
        post_count = len(self.__saved_post)
        while local_urls:
            self.__current_url = local_urls.pop(0)
            # Skip malformed, already-visited, or off-type URLs
            # (the original used "and" here, which disabled the visited check).
            if len(self.__current_url) < 10 or (
                    self.__current_url in visited_post
                    or not self.check_type(self.__current_url)):
                continue
            print(" > ", self.__current_url)
            page_source, post_date, new_urls_to_visit = self.visit(self.__current_url)
            visited_post.append(self.__current_url)
            local_urls += new_urls_to_visit
            if page_source:
                post_count += 1
                self.append_data(_url=self.__current_url,
                                 _type="post",
                                 _status="0",
                                 _html=page_source,
                                 _crawl_date=str(date.today().strftime("%d/%m/%Y")),
                                 _post_date=post_date)
            # check-point to save buffer data
            if num_visited % self.SAVE_CHECK_POINT == 0:
                self.save_data()
                self.update_crawling_status_info(post_count, len(self.__failed_urls))
                self.update_crawling_log()
                NhaDat247.save_list(local_urls, self.file_log_new_url)
                NhaDat247.save_list(visited_post, self.file_log_visited_url)
            num_visited += 1
            print(" >> num: ", post_count)
            if self.limit > 0 and post_count >= self.limit:
                break
        # finishing
        self.save_data()
        self.update_crawling_status_info(post_count, len(self.__failed_urls))
        self.update_crawling_log()
        self.browser.close()
        print('CRAWLING DONE')

    def rotate_ip(self, enable=False):
        self.browser.set_rotate_ip(enable)

    def save_data(self):
        self.db_object.insert_html_data(self.buffer, many=True)
        # clear buffer
        self.buffer = []

    @staticmethod
    def get_seed_url(post_type):
        data = {
            "apartment": ["https://nhadat247.com.vn/ban-can-ho-chung-cu.html"],
            "house": [
                "https://nhadat247.com.vn/ban-nha-rieng.html",
                "https://nhadat247.com.vn/ban-nha-biet-thu-lien-ke.html",
                "https://nhadat247.com.vn/ban-nha-mat-pho.html"
            ],
            "land": [
                "https://nhadat247.com.vn/ban-dat-nen-du-an.html",
                "https://nhadat247.com.vn/ban-dat.html"
            ]
        }
        return data[post_type] if post_type in data else [
            url for e in data for url in data[e]
        ]

    @staticmethod
    def save_list(data: list, file_name):
        print("Checkpoint: ", file_name)
        with open(file_name, 'w') as file:
            file.write("\n".join(set(data)))
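# Hypothetical driver sketch showing how a worker process might run this crawler.
# The "mm/yyyy" date format and the post types come from __init__ above;
# Settings.worker_id, the MongoDB backend behind DBObject, and the Selenium-based
# Browser wrapper are assumed to be configured elsewhere in the project.
if __name__ == "__main__":
    crawler = NhaDat247(date_from="01/2021",
                        date_to="03/2021",
                        post_type="house",
                        limit=100)
    # Crawls seed pages, checkpoints every SAVE_CHECK_POINT visits,
    # and flushes buffered posts through DBObject.insert_html_data().
    crawler.obtain_data()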
class ChoTotCrawler(CrawlerObject):
    BASE_URL = "https://nha.chotot.com/"
    SAVE_CHECK_POINT = 5

    def __init__(self,
                 date_from=None,
                 date_to=None,
                 post_type=None,
                 all_date: bool = False,
                 resume=False,
                 limit=-1):
        self.limit = int(limit)
        self.db_object = DBObject()
        the_status = "crawling"
        worker_info = self.db_object.query_wokers_info(Settings.worker_id)
        self.resume = resume
        if self.resume:
            # Restore the previous crawl parameters from the worker info record.
            try:
                info_ = worker_info
                status_ = info_["status"]
                task_id = info_["task_id"]
                info_str_ = info_["str_info"]
                if not ("(pause)" in status_ and "crawling" in status_):
                    print(">>", status_)
                    return
                info_dict_ = {
                    _i_.split(": ")[0]: _i_.split(": ")[1]
                    for _i_ in info_str_.lower().split(", ")
                }
                if info_dict_["site"] != "nha.chotot.com":
                    return
                date_from = info_dict_["date"].split("-")[0]
                date_to = info_dict_["date"].split("-")[1]
                try:
                    self.limit = int(info_dict_["limit"])
                except Exception:
                    self.limit = -1
                post_type = info_dict_["type"]
                the_status = status_.replace("(pause)", "")
                print("Internal loading data to resume")
            except Exception:
                traceback.print_exc()
                return

        self.__str_info = "Site: nha.chotot.com, Type: %s, Date: %s-%s, Limit: %s, " % (
            post_type, date_from, date_to,
            str(self.limit) if isinstance(self.limit, int) and self.limit > 0 else "No")
        self.__str_info += "Numpost: %d, Error: %d"

        self.post_type = post_type
        self.buffer = []
        self.seed_url = ChoTotCrawler.get_seed_url(post_type)
        self.__current_url = ""
        self.__failed_urls = []
        self.__saved_post = []
        self.file_log_visited_url = "visited_post_log_chotot_%s.txt" % (self.post_type)
        self.file_log_new_url = "local_urls_log_chotot_%s.txt" % (self.post_type)
        self.regex_sub_url = re.compile(
            "([a-z][-a-z]*)?ban-[-a-z]+((.htm)|(/[0-9]+))?")
        self.regex_post = re.compile(
            "([a-z][-a-z]+)?[/][a-z][-a-z0-9]+/[-a-z0-9]+.htm")
        self.key_type = ChoTotCrawler.get_key_from_type(self.post_type)

        try:
            # date_from / date_to are "mm/yyyy" strings; expand them to the first
            # and last day of the respective months.
            last_day_to = calendar.monthrange(int(date_to.split("/")[1]),
                                              int(date_to.split("/")[0]))[1]
            self.post_date_range = {
                "from": datetime.strptime("1/" + date_from, '%d/%m/%Y').date(),
                "to": datetime.strptime(str(last_day_to) + "/" + date_to,
                                        '%d/%m/%Y').date()
            }
            print("-" * 200, "\n", self.post_date_range)
        except Exception:
            traceback.print_exc()
            self.post_date_range = None

        self.browser = Browser(headless=False)
        if not self.resume:
            task_id = int(time.time())
        self.__crawling_info = {
            "task_id": task_id,
            "status": the_status,
            "str_info": ""
        }
        self.__crawling_log = {
            "worker_id": Settings.worker_id,
            "task_id": task_id,
            "task_info": self.__str_info % (0, 0),
            "saved_posts": [],
            "error_posts": []
        }
        if not self.resume:
            print("Create log")
            self.db_object.create_wokers_log(self.__crawling_log)
            self.update_crawling_status_info(0, 0)
        else:
            log = self.db_object.query_wokers_logs(Settings.worker_id, task_id)
            print("Get log: ", log if log else "null")
            if log is not None:
                self.__saved_post = log["saved_posts"]
                self.__failed_urls = log["error_posts"]
        print("Init crawler")

    def update_crawling_status_info(self, num_post, num_error):
        self.__crawling_info["str_info"] = self.__str_info % (num_post, num_error)
        self.db_object.update_wokers_info(Settings.worker_id, self.__crawling_info)

    def update_crawling_log(self):
        self.db_object.update_wokers_log(Settings.worker_id,
                                         self.__crawling_log["task_id"],
                                         self.__saved_post, self.__failed_urls)

    def get_html_and_soup_from_url(self, url):
        """Return (html, soup) for url, or (None, None) after repeated failures."""
        _soup = None
        _html = None
        # Script clicked on post pages to reveal the contact phone number.
        click_phone_script = """
            function getElementByXpath(path) {
                return document.evaluate(path, document, null,
                    XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
            }
            var phone = getElementByXpath("//*[@id='__next']/div[3]/div[1]/div/div[4]/div[3]/div/linkcontact");
            if (phone != null) {
                phone.click();
            }
        """
        for i in range(5):
            try:
                is_post = re.search(self.regex_post, url)
                element_present = EC.presence_of_element_located(
                    (By.XPATH, """//html/body/div[1]/footer"""))
                _html = self.browser.get_html(
                    url=url,
                    until_ec=element_present,
                    run_script=click_phone_script if is_post else None)
                _soup = BeautifulSoup(_html, 'html.parser')
                if _soup is not None:
                    return _html, _soup
            except Exception:
                traceback.print_exc()
                continue
        self.__failed_urls.append(self.__current_url)
        return None, None

    @staticmethod
    def get_key_from_type(key) -> list:
        if key == "land":
            return ["mua-ban-dat"]
        elif key == "apartment":
            return ["mua-ban-can-ho-chung-cu"]
        elif key == "house":
            return ["mua-ban-nha-dat"]
        return ["mua-ban-dat", "mua-ban-nha-dat", "mua-ban-can-ho-chung-cu"]

    def check_type(self, url) -> bool:
        for key in self.key_type:
            if key in url:
                return True
        return False

    def append_data(self, _url, _type, _status, _crawl_date, _post_date, _html):
        post = {}
        url_hash = hashlib.md5(_url.encode()).hexdigest()
        post["url_hash"] = url_hash
        post["url"] = _url
        post["type"] = _type
        post["status"] = _status
        post["html"] = _html
        post["date"] = _crawl_date
        post["post_date"] = _post_date
        self.__saved_post.append(url_hash)
        self.buffer.append(post)
        # post["html"] = "<html>"
        # print("-"*10,"\n",post)

    def load_init_url(self) -> tuple:
        local_urls = self.seed_url
        visited_post = []
        if self.resume:
            try:
                local_urls = list(open(self.file_log_new_url, "r").readlines())
            except Exception:
                pass
            try:
                visited_post = list(open(self.file_log_visited_url, "r").readlines())
            except Exception:
                pass
        return local_urls, visited_post

    @staticmethod
    def convert_str2date(date_str):
        # Chotot shows relative dates ("hôm nay", "hôm qua", "N ngày/tuần/tháng trước");
        # slugify them and map the tokens to an absolute date.
        _date = None
        date_str = slugify(date_str.lower())
        _l = date_str.split("-")
        if "hom-qua" in date_str:
            _date = date.today() - timedelta(days=1)
        elif "thang" in _l:
            _n = int(_l[_l.index("thang") - 1][0])
            _date = date.today() - timedelta(days=30 * _n)
        elif "tuan" in _l:
            _n = int(_l[_l.index("tuan") - 1][0])
            _date = date.today() - timedelta(days=7 * _n)
        elif "ngay" in _l:
            _n = int(_l[_l.index("ngay") - 1][0])
            # "N ngày trước" means N days ago (the original hard-coded 1 day).
            _date = date.today() - timedelta(days=_n)
        elif "hom-nay" in date_str or "gio" in _l or "phut" in _l:
            _date = date.today()
        else:
            _date = datetime.strptime(date_str, '%d/%m/%Y').date()
        return _date

    def get_date(self, page_soup: BeautifulSoup) -> date:
        post_date = None
        try:
            str_date = page_soup.select_one(
                "#__next > div > div.ct-detail.adview > div > div.col-md-8 > div.adImageWrapper___KTd-h > div.imageCaption___cMU2J > span"
            ).get_text()
            str_date = str_date.strip()
            post_date = ChoTotCrawler.convert_str2date(str_date)
        except Exception:
            self.__failed_urls.append(self.__current_url)
            traceback.print_exc()
        return post_date

    def visit(self, current_url) -> tuple:
        local_urls = []
        post_date = None
        page_source, page_soup = self.get_html_and_soup_from_url(current_url)
        if page_soup:
            is_post = re.search(self.regex_post, current_url)
            if is_post:
                print("Is a post")
                post_date = self.get_date(page_soup)
                if not self.post_date_range or \
                        (isinstance(post_date, date) and
                         (self.post_date_range["from"] <= post_date <= self.post_date_range["to"])):
                    post_date = post_date.strftime('%d/%m/%Y')
                else:
                    page_source = None
            else:
                page_source = None

            # Collect further URLs of the right type from the page.
            list_href = page_soup.find_all('a')
            for link in list_href:
                anchor = str(link.get('href'))
                if not bool(urlparse(anchor).netloc):
                    anchor = urljoin(self.BASE_URL, anchor)
                if validators.url(anchor) and self.check_type(anchor) and (
                        self.regex_post.search(anchor)
                        or self.regex_sub_url.search(anchor)):
                    local_urls.append(anchor)
        print("<html>" if page_source else "None")
        return page_source, post_date, local_urls

    def obtain_data(self):
        print("START...")
        num_visited = 0
        local_urls, visited_post = self.load_init_url()
        post_count = len(self.__saved_post)
        while local_urls:
            self.__current_url = local_urls.pop(0)
            # Skip malformed, already-visited, or off-type URLs
            # (the original used "and" here, which disabled the visited check).
            if len(self.__current_url) < 10 or (
                    self.__current_url in visited_post
                    or not self.check_type(self.__current_url)):
                continue
            print(" > ", self.__current_url)
            page_source, post_date, new_urls_to_visit = self.visit(self.__current_url)
            visited_post.append(self.__current_url)
            local_urls += new_urls_to_visit
            if page_source:
                post_count += 1
                self.append_data(_url=self.__current_url,
                                 _type="post",
                                 _status="0",
                                 _html=page_source,
                                 _crawl_date=str(date.today().strftime("%d/%m/%Y")),
                                 _post_date=post_date)
            # check-point to save buffer data
            if num_visited % self.SAVE_CHECK_POINT == 0:
                self.save_data()
                self.update_crawling_status_info(post_count, len(self.__failed_urls))
                self.update_crawling_log()
                ChoTotCrawler.save_list(local_urls, self.file_log_new_url)
                ChoTotCrawler.save_list(visited_post, self.file_log_visited_url)
            num_visited += 1
            print(" >> num: ", post_count)
            if self.limit > 0 and post_count >= self.limit:
                break
        # finishing
        self.save_data()
        self.update_crawling_status_info(post_count, len(self.__failed_urls))
        self.update_crawling_log()
        self.browser.close()
        print('CRAWLING DONE')

    def rotate_ip(self, enable=False):
        self.browser.set_rotate_ip(enable)

    def save_data(self):
        self.db_object.insert_html_data(self.buffer, many=True)
        # clear buffer
        self.buffer = []

    @staticmethod
    def get_seed_url(post_type):
        data = {
            "apartment": ["https://nha.chotot.com/toan-quoc/mua-ban-can-ho-chung-cu"],
            "house": ["https://nha.chotot.com/toan-quoc/mua-ban-nha-dat"],
            "land": ["https://nha.chotot.com/toan-quoc/mua-ban-dat"]
        }
        return data[post_type] if post_type in data else [
            url for e in data for url in data[e]
        ]

    @staticmethod
    def save_list(data: list, file_name):
        print("Checkpoint: ", file_name)
        with open(file_name, 'w') as file:
            file.write("\n".join(set(data)))
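# Illustrative (hypothetical) inputs for ChoTotCrawler.convert_str2date, based on the
# branches above: chotot shows relative Vietnamese timestamps which slugify() reduces
# to the matched tokens. The sample strings below are assumptions, not captured data.
print(ChoTotCrawler.convert_str2date("hôm qua"))        # yesterday -> date.today() - 1 day
print(ChoTotCrawler.convert_str2date("2 tuần trước"))   # "2-tuan-truoc" -> today - 14 days
print(ChoTotCrawler.convert_str2date("3 tháng trước"))  # "3-thang-truoc" -> today - 90 days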
import time
import hashlib
from datetime import datetime, date

import pandas as pd

from ParserObject import ParserObject
from ParserModelSelector import ParserModelSelector
from LibFunc import clean_trash
from database import DBObject
from Settings import Settings

#=============================================================================================
#=============================================================================================
database = DBObject()


def parse(posts_data,
          site=None,
          type=None,
          num=None,
          many: bool = False,
          model_name=None,
          resume=False):
    print("Go to Parsing Data")
    the_status = "parsing"
    __failed_urls = []
    __saved_post = []
    task_id = int(time.time())
    try:
        row = file.readline()
        if row is None or len(row) < 10:
            continue
        row = json.loads(row)
        row["url"] = row["url"].strip()
        row["url_hash"] = hashlib.md5(row["url"].encode()).hexdigest()
        soup = BeautifulSoup(row["html"], 'html.parser')
        _date = soup.select_one(
            "#product-detail-web > div.detail-product > div.product-config.pad-16 > ul > li:nth-child(1) > span.sp3"
        ).get_text()
        _date = _date.strip()
        _date = datetime.strptime(_date, '%d/%m/%Y').date()
        row["post_date"] = _date.strftime("%d/%m/%Y")
        row.pop('parser', None)
        data.append(row)
        print(i, ". ", _date)
    except Exception:
        print("-" * 20)
        print("ERROR", i, ":")
        traceback.print_exc()
        print("-" * 20)

db = DBObject()
db.insert_html_data(json_row=data, many=True)
def __init__(self, stamp, hidden):
    DBObject.__init__(self)

    self.id = None
    self.stamp = stamp
    self.hidden = hidden
def __getattr__(self, name):
    if name == 'online_users':
        return hruntime.app.sessions.online_users

    return DBObject.__getattr__(self, name)
def __init__(self):
    DBObject.__init__(self)

    self.events = hlib.database.IndexedMapping()
    self.maintenance_mode = False
def callback(ch, method, properties, body):
    command = "nothing"
    try:
        body = body.decode('ascii')
        message = message_loads(body)
        command = message["command"]
        if command == "crawl":
            # Only start a worker if no crawl/parse process is already running.
            pid = int(open("data.lock", "r").read())
            if not psutil.pid_exists(pid):
                Popen(['python', 'worker.py', body])
            else:
                command = "is running"
        elif command == "parse":
            pid = int(open("data.lock", "r").read())
            if not psutil.pid_exists(pid):
                file = open("parse_posts.data", "w")
                file.write(message["posts"])
                file.close()
                model = message["model"] if "model" in message else "auto"
                type = message["type"] if "type" in message else "all"
                site = message["site"] if "site" in message else "all"
                Popen([
                    'python', 'worker.py',
                    "command:parse site:%s type:%s model:%s" % (site, type, model)
                ])
            else:
                command = "is running"
        elif command == "stop":
            db = DBObject()
            db.cancel_task(Settings.worker_id)
            try:
                pid = int(open("data.lock", "r").read())
                os.kill(pid, signal.SIGTERM)
            except Exception:
                pass
            subprocess.call("TASKKILL /f /IM CHROMEDRIVER.EXE")
            subprocess.call("TASKKILL /f /IM CHROME.EXE")
        elif command == "pause":
            db = DBObject()
            pid = int(open("data.lock", "r").read())
            _working, _as = db.workAs(Settings.worker_id)
            if _working:
                # Pause: mark the task paused and kill the running worker and browser.
                db.pause_task(Settings.worker_id)
                try:
                    os.kill(pid, signal.SIGTERM)
                except Exception:
                    pass
                subprocess.call("TASKKILL /f /IM CHROME.EXE")
                subprocess.call("TASKKILL /f /IM CHROMEDRIVER.EXE")
            else:
                # Unpause: restart the worker in resume mode if it is not running.
                if not psutil.pid_exists(pid):
                    Popen(['python', 'worker.py', "command:%s resume:1" % (_as)])
                else:
                    command = "is running"
        elif command == "shield":
            shield_on = (("shield" in message and int(message["shield"]) == 1)
                         or (not Settings.isShieldEnable()))
            Settings.enableShield(shield_on)
        else:
            command = "nothing"
    except Exception:
        traceback.print_exc()
    print(" [x] Received \n -> Do %s" % (command))
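# Hypothetical consumer wiring for the callback above. Its (ch, method, properties, body)
# signature matches a pika on_message_callback; the broker host and the queue name
# "worker_commands" are assumptions, not taken from the project.
import pika

connection = pika.BlockingConnection(pika.ConnectionParameters(host="localhost"))
channel = connection.channel()
channel.queue_declare(queue="worker_commands")
channel.basic_consume(queue="worker_commands",
                      on_message_callback=callback,
                      auto_ack=True)
print(" [*] Waiting for commands. To exit press CTRL+C")
channel.start_consuming()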