def main(): args = parse_args() logger = setup_logging("join_office_and_websites") offices = TOfficeTableInMemory(use_office_types=False) offices.read_from_local_file() web_sites_db = TDeclarationWebSiteList( logger, TDeclarationWebSiteList.default_input_task_list_path).load_from_disk() url_info: TDeclarationWebSiteObsolete for url, url_info in web_sites_db.web_sites.items(): office_id = url_info.calculated_office_id office: TOfficeInMemory office = offices.offices.get(int(office_id)) if office is None: logger.debug( "cannot find office_id={}, url={} no valid urls, deleted office?" .format(office_id, url)) continue p = url_info.http_protocol if url_info.http_protocol is not None else "http" i = TDeclarationWebSite() i.url = p + "://" + url i.reach_status = url_info.reach_status i.comments = url_info.comments i.redirect_to = url_info.redirect_to i.title = url_info.title office.office_web_sites.append(i) for o in offices.offices.values(): o.office_web_sites.sort(key=lambda x: 1 if x.reach_status == TWebSiteReachStatus.normal else 0) logger.info("write to {}".format(args.output_file)) offices.write_to_local_file(args.output_file)
def __init__(self, logger, web_sites=None):
    self.logger = logger
    if web_sites is not None:
        self.web_sites = web_sites
    else:
        self.web_sites = TDeclarationWebSiteList(logger, RUSSIA.offices_in_memory)
def print_predicted_as_external(self):
    web_sites = TDeclarationWebSiteList(logger=self.logger, offices=RUSSIA.offices_in_memory)
    for key, src_doc in self.dlrobot_human.get_all_documents():
        if src_doc.calculated_office_id is None:
            continue
        urls = set(r.get_site_url() for r in src_doc.web_references)
        if len(urls) != 1:
            continue
        src_doc_url = list(urls)[0]
        if src_doc_url == "service.nalog.ru":
            continue
        office = RUSSIA.offices_in_memory.get_office_by_id(src_doc.calculated_office_id)
        u: TDeclarationWebSite
        found = False
        origin_hostname = urlsplit_pro(src_doc_url).hostname
        if web_sites.is_a_special_domain(origin_hostname):
            continue
        for u in office.office_web_sites:
            if urlsplit_pro(u.url).hostname == origin_hostname:
                found = True
                break
        if found:
            continue
        ww = web_sites.search_url(src_doc_url)
        if ww is None:
            self.logger.error("cannot find url {} by web domain in offices.txt".format(src_doc_url))
            continue
        r = {
            "sha256": key,
            "predicted_office": {
                "id": office.office_id,
                "name": office.name
            },
            "url_host_office": {
                "id": ww.parent_office.office_id,
                "name": ww.parent_office.name
            },
            "url": src_doc_url,
            "title": src_doc.get_doc_title()
        }
        print(json.dumps(r, indent=4, ensure_ascii=False))
def __init__(self, args): self.logger = setup_logging(log_file_name="predict_office.log") self.dlrobot_human_path = args.dlrobot_human_path self.dlrobot_human = TDlrobotHumanFileDBM(self.dlrobot_human_path) self.dlrobot_human.open_write_mode() self.enable_ml = args.enable_ml sp_args = TSmartParserCacheClient.parse_args([]) self.smart_parser_server_client = TSmartParserCacheClient(sp_args, self.logger) model_path = args.office_model_path self.max_failures_count = args.max_failures_count assert (os.path.exists(model_path)) bigrams_path = os.path.join(model_path, "office_ngrams.txt") ml_model_path = os.path.join(model_path, "model") self.office_ml_model = TTensorFlowOfficeModel(self.logger, bigrams_path, ml_model_path, create_model=False) self.regional_tax_offices = self.build_regional_tax_offices() self.web_sites = TDeclarationWebSiteList(self.logger, RUSSIA.offices_in_memory) self.title_parser = TOfficeFromTitle(self.logger, web_sites=self.web_sites) self.src_doc_to_rule_results = dict()
def __init__(self, args):
    self.args = args
    self.logger = setup_logging(log_file_name="join_human_and_dlrobot.log", append_mode=True)
    self.output_dlrobot_human = TDlrobotHumanFileDBM(args.output_json)
    self.output_dlrobot_human.create_db()
    self.old_files_with_office_count = 0
    self.web_sites_db = TDeclarationWebSiteList(self.logger)
    self.offices = self.web_sites_db.offices
    self.dlrobot_config = TRobotConfig.read_by_config_type("prod")
def main(): args = parse_args() logger = setup_logging("calc_region_from_wd") regions = TRussianRegions() offices = TOfficeTableInMemory(use_office_types=False) offices.read_from_local_file() wd = TWikidataRecords(regions) wd.read_from_file(args.wikidata_info) web_sites_db = TDeclarationWebSiteList(logger, TDeclarationWebSiteList.default_input_task_list_path).load_from_disk() office_to_urls = web_sites_db.build_office_to_main_website(take_abandoned=True) with open(args.input_file) as inp: for l in inp: office_id, name = l.strip().split("\t") office = offices.offices.get(int(office_id)) if office is None: logger.debug("cannot find office_id={}, name={} no valid urls, deleted office?") continue wikidata_id, region = wd.get_region_by_name(name) if wikidata_id is not None: cause = "name" else: urls = office_to_urls.get(int(office_id), []) if len(urls) == 0: logger.debug("office_id={}, name={} no valid urls, delete office?") continue for url in urls: wikidata_id, region = wd.get_region_by_url(name, url) if wikidata_id is not None: cause = "url" break if region is None: logger.error( "office_id={}, name={} cannot recognize region".format(office_id, name)) else: logger.debug("set region {} to {} {} by {} ".format(region.name, office_id, name, cause)) office.region_id = region.id office.wikidata_id = wikidata_id logger.info("write to {}".format(args.output_file)) offices.write_to_local_file(args.output_file)
def get_weak_office_uniq_website(self):
    strong_offices = set()
    for _, _, office_id in self.get_predict_train_entries():
        strong_offices.add(office_id)
    web_sites = TDeclarationWebSiteList(logger=self.logger, offices=RUSSIA.offices_in_memory)
    processed_websites = set()
    for sha256, src_doc in self.dlrobot_human.get_all_documents():
        web_site = src_doc.get_web_site()
        if web_site in processed_websites or web_site is None or web_site == "":
            continue
        processed_websites.add(web_site)
        site_info = web_sites.search_url(web_site)
        if site_info is None:
            self.logger.error("cannot find {} in offices.txt".format(web_site))
            continue
        office_id = site_info.parent_office.office_id
        if office_id not in strong_offices:
            yield sha256, src_doc, office_id
class TOfficePredictIndex:
    def __init__(self, logger, file_path):
        self.index_file_path = file_path
        self.logger = logger
        self.office_name_bigrams = None
        self.office_name_unigrams = None
        self.office_squeezes = None
        self.web_domains = None
        self.office_id_2_ml_office_id = None
        self.ml_office_id_2_office_id = None
        self.web_sites = TDeclarationWebSiteList(self.logger)
        self.regions = TRussianRegions()

    def get_bigrams_count(self):
        return len(self.office_name_bigrams)

    def get_unigrams_count(self):
        return len(self.office_name_unigrams)

    def get_max_region_id(self):
        return self.regions.max_region_id

    def get_web_domain_index(self, web_domain):
        s = self.web_domains.get(web_domain)
        if s is None:
            return 0
        return s.web_domain_id

    def is_office_child(self, child_id, parent_id):
        return child_id is not None and self.office_squeezes[child_id]['parent_id'] == parent_id

    def is_office_child_or_grandchild(self, child_id, parent_id):
        if self.is_office_child(child_id, parent_id):
            return True
        p = self.office_squeezes[child_id]['parent_id']
        return self.is_office_child(p, parent_id)

    def get_web_domains_count(self):
        return len(self.web_domains)

    def get_web_domain_by_url(self, document_url, site_url):
        # first take the web domain from which the document was downloaded
        web_domain = urlsplit_pro(document_url).hostname
        if self.web_sites.get_first_site_by_web_domain(web_domain) is not None:
            return web_domain
        # if this web domain is unknown, take the web domain from site_url
        web_domain = urlsplit_pro(site_url).hostname
        if self.web_sites.get_first_site_by_web_domain(web_domain) is None:
            if not self.web_sites.is_a_special_domain(web_domain):
                self.logger.error("web domain {} is missing in office.txt".format(site_url))
        return web_domain

    def get_ml_office_id(self, office_id: int):
        return self.office_id_2_ml_office_id.get(office_id)

    def get_office_id_by_ml_office_id(self, ml_office_id: int):
        return self.ml_office_id_2_office_id.get(ml_office_id)

    def get_bigram_id(self, bigram):
        b = self.office_name_bigrams.get(bigram)
        if b is None:
            return None
        return b.ngram_id

    def get_unigram_id(self, gram):
        b = self.office_name_unigrams.get(gram)
        if b is None:
            return None
        return b.ngram_id

    def get_offices_by_bigram(self, bigram):
        b = self.office_name_bigrams.get(bigram)
        if b is None:
            return list()
        return b.office_squeezes

    @staticmethod
    def get_word_stems(text, stem_size=4, add_starter_and_enders=True):
        if add_starter_and_enders:
            yield "^"
        text = text.lower().replace('ё', 'е')
        for word in re.split(r"[\s,\.;:_\"* ()«»]", text):
            if len(word) == 0:
                continue
            # ignore a year like "2021"
            if word.startswith("20") and len(word) == 4:
                continue
            hyphen_index = word.find('-')
            if hyphen_index > 0:
                if word[hyphen_index - 1] == 'о':
                    # do not split words like "ямало-ненецкий"
                    yield word[:stem_size * 2]
                else:
                    w1, w2 = word.split('-', 1)
                    yield w1[:stem_size]  # split words like "каменск-уральский"
                    yield w2[:stem_size]
            else:
                yield word[:stem_size]
        if add_starter_and_enders:
            yield "$"

    @staticmethod
    def get_bigrams(text):
        words = list(TOfficePredictIndex.get_word_stems(text))
        for w1, w2 in zip(words[:-1], words[1:]):
            yield "_".join((w1, w2))

    @staticmethod
    def get_trigrams(text):
        words = list(TOfficePredictIndex.get_word_stems(text))
        for w1, w2, w3 in zip(words[:-2], words[1:-1], words[2:]):
            yield "_".join((w1, w2, w3))

    @staticmethod
    def split_web_domain(web_domain):
        for x in web_domain.split('.'):
            yield x

    def read(self):
        with open(self.index_file_path) as inp:
            js = json.load(inp)
            self.office_name_bigrams = dict((k, TOfficeNgram.from_json(v)) for k, v in js['bigrams'].items())
            self.office_name_unigrams = dict((k, TOfficeNgram.from_json(v)) for k, v in js['unigrams'].items())
            self.office_squeezes = dict((int(k), v) for k, v in js['offices'].items())
            self.web_domains = dict((k, TOfficeWebDomain.from_json(v)) for k, v in js['web_domains'].items())
            self.office_id_2_ml_office_id = dict((int(k), v) for k, v in js['office_id_2_ml_office_id'].items())
            self.ml_office_id_2_office_id = dict((int(k), v) for k, v in js['ml_office_id_2_office_id'].items())
        self.logger.info("bigrams count = {}".format(self.get_bigrams_count()))

    def write(self):
        self.logger.info("write to {}".format(self.index_file_path))
        with open(self.index_file_path, "w") as outp:
            assert self.office_squeezes is not None
            assert len(self.office_squeezes) > 0
            rec = {
                'bigrams': dict((k, v.to_json()) for k, v in self.office_name_bigrams.items()),
                'unigrams': dict((k, v.to_json()) for k, v in self.office_name_unigrams.items()),
                'offices': self.office_squeezes,
                'web_domains': dict((k, v.to_json()) for k, v in self.web_domains.items()),
                'office_id_2_ml_office_id': self.office_id_2_ml_office_id,
                'ml_office_id_2_office_id': self.ml_office_id_2_office_id,
            }
            json.dump(rec, outp, ensure_ascii=False, indent=4)

    def get_office_name(self, office_id: int):
        return self.office_squeezes[office_id]['name']

    def has_office_squeeze(self, office_id: int):
        return office_id in self.office_squeezes

    def get_office_region(self, office_id: int):
        return self.office_squeezes[office_id]['region']

    def get_region_from_web_site_title(self, site_url: str):
        site_info = self.web_sites.get_web_site(site_url)
        if site_info is not None and site_info.title is not None:
            return self.regions.get_region_all_forms(site_info.title, 0)
        else:
            return 0

    def get_parent_office_from_web_site(self, site_url: str):
        site_info = self.web_sites.get_web_site(site_url)
        if site_info is None:
            self.logger.error("site_url = {} cannot be found in offices.txt".format(site_url))
            return None
        return self.get_ml_office_id(site_info.parent_office.office_id)
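
# Illustrative only (not from the original sources): a tiny demo of the static n-gram
# helpers above. With the default stem_size=4 and the "^"/"$" sentinels, an office
# name is reduced to 4-letter word stems and then joined into bigrams.
def _demo_office_ngrams():
    text = "Министерство финансов"
    assert list(TOfficePredictIndex.get_word_stems(text)) == ['^', 'мини', 'фина', '$']
    assert list(TOfficePredictIndex.get_bigrams(text)) == ['^_мини', 'мини_фина', 'фина_$']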
        if r.result_files_count > 0:
            good.add(url)
        else:
            bad.add(url)
    cnt = 0
    for url in bad:
        if url in good:
            continue
        cnt += 1
        if web_sites.has_web_site(url) and TWebSiteReachStatus.can_communicate(web_sites.get_web_site(url).reach_status):
            logger.info("browse {} ...".format(url))
            title = get_html_title_from_url(url)
            output_file.write("{}\t{}\t{}\n".format(url, ",".join(statuses.get(url, ["unk"])), title))
        #if cnt > 10:
        #    break


if __name__ == "__main__":
    args = parse_args()
    logger = setup_logging("analyze_remote_calls")
    web_sites = TDeclarationWebSiteList(logger)
    remote_calls = TRemoteDlrobotCall.read_remote_calls_from_file(args.input_file)
    with open(args.output_file, "w") as outp:
        if args.action == "print_sites_wo_results":
            print_sites_wo_results(logger, remote_calls, web_sites, outp)
        else:
            raise Exception('unknown action')
class TWebSitesManager:
    def __init__(self):
        self.args = parse_args()
        self.logger = setup_logging(log_file_name=self.args.logfile)
        if self.args.input_offices is not None:
            offices = TOfficeTableInMemory()
            offices.read_from_local_file(self.args.input_offices)
            self.web_sites = TDeclarationWebSiteList(self.logger, offices=offices)
        else:
            self.web_sites = TDeclarationWebSiteList(self.logger)
        self.temp_dlrobot_project: TRobotProject
        self.temp_dlrobot_project = None
        THttpRequester.initialize(self.logger)

    def check_web_site_filters(self, site_url):
        if site_url.strip() == "":
            return False
        if self.args.filter_regex is not None:
            if re.search(self.args.filter_regex, site_url) is None:
                return False
        site_info = self.web_sites.get_web_site(site_url)
        if site_info is None:
            self.logger.error("skip {}, cannot find this site".format(site_url))
            return False
        else:
            if self.args.take_without_titles:
                return TWebSiteReachStatus.can_communicate(site_info.reach_status) and site_info.title is None
            elif self.args.take_all_web_sites or TWebSiteReachStatus.can_communicate(site_info.reach_status):
                return True
            else:
                self.logger.debug("skip abandoned {}".format(site_url))
                return False

    def read_web_domains_from_file(self):
        self.logger.info("read url list from {}".format(self.args.url_list))
        web_domains = list()
        with open(self.args.url_list) as inp:
            for url in inp:
                url = url.strip(" \r\n")
                if url.startswith('http'):
                    web_domains.append(strip_scheme_and_query(url))
                else:
                    web_domains.append(url)
        return web_domains

    def get_url_list(self, start_selenium=False):
        web_domains = list()
        if self.args.filter_by_source is not None:
            web_domains = list()
            for k in self.web_sites.web_sites.values():
                if k.parent_office.source_id == self.args.filter_by_source:
                    web_domains.append(get_site_url(k.url))
        elif self.args.url_list is not None:
            web_domains = self.read_web_domains_from_file()
        else:
            # take all web domains
            web_domains = list(self.web_sites.web_sites.keys())

        domains_filtered = list(w for w in web_domains if self.check_web_site_filters(w))
        self.logger.info("we are going to process {} web sites".format(len(domains_filtered)))

        if start_selenium:
            TDownloadEnv.FILE_CACHE_FOLDER = TDownloadEnv.FILE_CACHE_FOLDER + "_{}_{}".format(time.time(), os.getpid())
            self.logger.info("rm {}".format(TDownloadEnv.FILE_CACHE_FOLDER))
            TDownloadEnv.clear_cache_folder()
            project_path = "project.txt"
            TRobotProject.create_project("dummy.ru", project_path)
            with TRobotProject(self.logger, project_path, export_folder="result") as self.temp_dlrobot_project:
                for w in domains_filtered:
                    yield w
            os.unlink(project_path)
        else:
            for w in domains_filtered:
                yield w

    def ban_sites(self):
        cnt = 0
        for url in self.get_url_list(start_selenium=True):
            self.logger.debug("ban {}".format(url))
            self.web_sites.get_web_site(url).ban()
            cnt += 1
        self.logger.info("ban {} web sites".format(cnt))

    def to_utf8(self):
        cnt = 0
        for site_url in self.get_url_list():
            site_info = self.web_sites.get_web_site(site_url)
            if site_info.redirect_to is not None and TUrlUtf8Encode.is_idna_string(site_info.redirect_to):
                site_info.redirect_to = TUrlUtf8Encode.convert_url_from_idna(site_info.redirect_to)
                if site_info.redirect_to == site_url and site_info.reach_status == TWebSiteReachStatus.abandoned:
                    site_info.redirect_to = None
                    site_info.reach_status = TWebSiteReachStatus.normal
                cnt += 1
            if TUrlUtf8Encode.is_idna_string(site_url):
                site_info.url = TUrlUtf8Encode.convert_url_from_idna(site_url)
                cnt += 1
        self.logger.info("{} conversions made".format(cnt))

    def browse_one_url(self, url):
        self.logger.info("check {}".format(url))
        web_site = TWebSiteCrawlSnapshot(self.temp_dlrobot_project, morda_url=url, enable_step_init=False)
        web_site.fetch_the_main_page(enable_search_engine=False)
        if TWebSiteReachStatus.can_communicate(web_site.reach_status):
            return web_site
        else:
            self.logger.info("restart selenium, and try again")
            self.temp_dlrobot_project.selenium_driver.restart()
            web_site = TWebSiteCrawlSnapshot(self.temp_dlrobot_project, morda_url=url, enable_step_init=False)
            web_site.fetch_the_main_page(enable_search_engine=False)
            if TWebSiteReachStatus.can_communicate(web_site.reach_status):
                return web_site
            else:
                return None

    def get_external_file_name_by_site_url(self, site_url):
        return site_url.strip('/').replace('/', '_') + ".page_source.html"

    def check_alive_one_url(self, site_url, complete_bans, site_info=None):
        site_info: TDeclarationWebSite
        if site_info is None:
            site_info = self.web_sites.get_web_site(site_url)
        web_site = self.browse_one_url(site_url)
        #office = self.web_sites.get_office(site_url)
        office = site_info.parent_office
        if web_site is None:
            self.logger.info("   {} is dead".format(site_url))
            site_info.ban()
            complete_bans.append(site_url)
        else:
            new_site_url = web_site.get_main_url_protocol() + "://" + strip_scheme_and_query(web_site.main_page_url)
            title = web_site.get_title(web_site.main_page_url)
            if strip_scheme_and_query(web_site.main_page_url).strip('/') != site_url.strip('/'):
                self.logger.info('   {} is alive, but is redirected to {}'.format(site_url, new_site_url))
                new_site_info = None
                for u in office.office_web_sites:
                    if u.url == site_url:
                        u.set_redirect(new_site_url)
                    if u.url == new_site_url:
                        new_site_info = u
                if new_site_info is None:
                    new_site_info = TDeclarationWebSite(url=new_site_url)
                    office.office_web_sites.append(new_site_info)
                new_site_info.set_title(title)
            else:
                self.logger.info("   {} is alive, main_page_url = {}".format(site_url, web_site.main_page_url))
                site_info.set_title(title)
            if web_site.main_page_source.lower().find('коррупц') != -1:
                self.logger.info("site contains corruption keyword {}".format(site_url))
                site_info.corruption_keyword_in_html = True
            if self.args.main_page_path:
                try:
                    with open(self.get_external_file_name_by_site_url(site_url), "w") as outp:
                        outp.write(web_site.main_page_source)
                except Exception as exp:
                    self.logger.error("cannot save page html to file: {}".format(site_url))

    def check_alive(self):
        complete_bans = list()
        checked_count = 0
        for site_url in self.get_url_list(start_selenium=True):
            self.check_alive_one_url(site_url, complete_bans)
            checked_count += 1
        self.logger.info("ban {} web sites out of {} sites".format(len(complete_bans), checked_count))

    def print_keys(self):
        for web_domain in self.get_url_list():
            print(web_domain)

    def split(self):
        parts_count = self.args.split_parts
        chunk_size = int(len(self.web_sites.offices.offices) / parts_count)
        offices = list(self.web_sites.offices.offices.values())
        chunk_id = 0
        cnt = 0
        for l in range(0, len(offices), chunk_size):
            chunk_id += 1
            o = TOfficeTableInMemory()
            for i in offices[l:l + chunk_size]:
                o.add_office(i)
            file_path = "chunk_offices_{}.txt".format(chunk_id)
            o.write_to_local_file(file_path)
            cnt += len(o.offices)
        assert cnt == len(offices)

    def check(self):
        self.web_sites.check_valid(self.logger, fail_fast=False)

    def redirect_subdomain(self):
        for web_domain in self.get_url_list(start_selenium=True):
            site_info = self.web_sites.get_web_site(web_domain)
            if site_info.redirect_to is None or not web_domain.endswith(site_info.redirect_to):
                continue
            self.browse_one_url(web_domain)

    def create_departments(self):
        o: TOfficeInMemory
        TDownloadEnv.clear_cache_folder()
        project_path = "project.txt"
        TRobotProject.create_project("dummy.ru", project_path, web_sites_db=self.web_sites)
        with TRobotProject(self.logger, project_path, export_folder="result") as self.temp_dlrobot_project:
            for o in self.web_sites.offices.values():
                if o.parent_id == self.args.parent_office_id:
                    self.logger.info("office id = {}, {}".format(o.office_id, o.name))
                    query = self.args.query_template.format(o.name)
                    engine = random.choice([SearchEngineEnum.GOOGLE, SearchEngineEnum.YANDEX])
                    results = SearchEngine.send_request(engine, query, self.temp_dlrobot_project.selenium_driver)
                    if len(results) == 0:
                        msg = "cannot find results for query {}".format(query)
                        self.logger.error(msg)
                    else:
                        new_web_site = TDeclarationWebSite(url=results[0])
                        found = False
                        for u in o.office_web_sites:
                            if u.url == new_web_site.url:
                                found = True
                                self.logger.error("{} already exists".format(new_web_site.url))
                        if not found:
                            o.office_web_sites.append(new_web_site)
                            self.check_alive_one_url(new_web_site.url, complete_bans=list())
                    time.sleep(20)

    def select(self):
        out = TOfficeTableInMemory()
        for web_domain in self.get_url_list():
            site_info: TDeclarationWebSite
            site_info = self.web_sites.get_web_site(web_domain)
            out.add_office(site_info.parent_office)
        self.web_sites.offices = out

    def select_adhoc(self):
        good_web_domains = set(self.read_web_domains_from_file())
        office: TOfficeInMemory
        ban_cnt = 0
        sp_left = 0
        for office in self.web_sites.offices.offices.values():
            if office.is_from_spravochnik():
                w: TDeclarationWebSite
                for w in office.office_web_sites:
                    if not w.can_communicate():
                        continue
                    u = strip_scheme_and_query(w.url)
                    if u in good_web_domains or "{}/".format(u) in good_web_domains:
                        sp_left += 1
                        continue
                    ban_cnt += 1
                    self.logger.debug("ban office_id={}".format(office.office_id))
                    w.ban(TWebSiteReachStatus.unpromising)
        self.logger.info("ban {} sites, left in spravochnik {}".format(ban_cnt, sp_left))

    def make_redirects(self):
        with open(self.args.redirect_mapping_path) as inp:
            for l in inp:
                old, new_site_url = l.strip().split()
                if not new_site_url.startswith('http'):
                    raise Exception("unknown http prefix in {}".format(new_site_url))
                web_site = self.web_sites.search_url(old)
                if web_site is None:
                    raise Exception("cannot find website {}".format(old))
                web_site.set_redirect(new_site_url)
                new_site_info = TDeclarationWebSite(url=new_site_url)
                web_site.parent_office.office_web_sites.append(new_site_info)

    def get_title_from_local_files(self):
        for site_url in self.get_url_list(start_selenium=False):
            site_info = self.web_sites.get_web_site(site_url)
            file_path = os.path.join("page_source", self.get_external_file_name_by_site_url(site_url))
            if os.path.exists(file_path):
                self.logger.info("read {}".format(file_path))
                with open(file_path, "rb") as inp:
                    title = get_html_title(inp.read())
                site_info.set_title(title)

    def print_web_sites(self):
        site_infos = list()
        for site_url in self.get_url_list(start_selenium=False):
            site_info = self.web_sites.get_web_site(site_url)
            site_info.title = TDeclarationWebSite.clean_title(site_info.title)
            d = site_info.write_to_json()
            d['office_id'] = site_info.parent_office.office_id
            site_infos.append(d)
        print(json.dumps(site_infos, ensure_ascii=False, indent=4))

    def check_mirrors(self):
        offices = set()
        complete_bans = list()
        for site_url in self.get_url_list(start_selenium=True):
            office_info: TOfficeInMemory
            office_info = self.web_sites.get_web_site(site_url).parent_office
            not_abandoned_cnt = 0
            for u in office_info.office_web_sites:
                if u.can_communicate():
                    not_abandoned_cnt += 1
            if not_abandoned_cnt > 1 and office_info.office_web_sites[-1].can_communicate() and office_info not in offices:
                offices.add(office_info)
                for i in range(len(office_info.office_web_sites) - 1):
                    site_info = office_info.office_web_sites[i]
                    if site_info.can_communicate():
                        self.check_alive_one_url(site_info.url, complete_bans, site_info=site_info)

    def main(self):
        if self.args.action == "ban":
            self.ban_sites()
        elif self.args.action == "to_utf8":
            self.to_utf8()
        elif self.args.action == "check_alive":
            self.check_alive()
        elif self.args.action == "print_keys":
            self.print_keys()
        elif self.args.action == "check":
            self.check()
        elif self.args.action == "redirect_subdomain":
            self.redirect_subdomain()
        elif self.args.action == "create_departments":
            self.create_departments()
        elif self.args.action == "select":
            self.select()
        elif self.args.action == "split":
            self.split()
            return
        elif self.args.action == "make_redirects":
            self.make_redirects()
        elif self.args.action == "get_title_from_local_files":
            self.get_title_from_local_files()
        elif self.args.action == "check_mirrors":
            self.check_mirrors()
        elif self.args.action == "select_adhoc":
            self.select_adhoc()
        elif self.args.action == "print_web_sites":
            self.print_web_sites()
            return
        else:
            raise Exception("unknown action")
        self.logger.info("write to {}".format(self.args.output_file))
        self.web_sites.offices.write_to_local_file(self.args.output_file)
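
# Illustrative only (not from the original sources): TWebSitesManager is a CLI tool
# driven by parse_args()/main(), so a plausible entry point looks like this.
if __name__ == "__main__":
    TWebSitesManager().main()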
def test_office_website_valid(self):
    logger = setup_logging("test_office_website_valid")
    web_sites = TDeclarationWebSiteList(logger)
    self.assertEqual(True, web_sites.check_valid(logger, fail_fast=True))
class TOfficePredictor:
    default_ml_model_path = os.path.join(os.path.dirname(__file__), "../model")

    @staticmethod
    def parse_args(args):
        parser = argparse.ArgumentParser()
        parser.add_argument("--dlrobot-human-path", dest='dlrobot_human_path', required=True)
        parser.add_argument("--office-model-path", dest='office_model_path', required=False,
                            default=TOfficePredictor.default_ml_model_path)
        parser.add_argument("--disable-ml", dest='enable_ml', required=False, default=True, action="store_false")
        parser.add_argument("--max-failures-count", dest='max_failures_count', required=False, default=100, type=int)
        return parser.parse_args(args=args)

    def __init__(self, args):
        self.logger = setup_logging(log_file_name="predict_office.log")
        self.dlrobot_human_path = args.dlrobot_human_path
        self.dlrobot_human = TDlrobotHumanFileDBM(self.dlrobot_human_path)
        self.dlrobot_human.open_write_mode()
        self.enable_ml = args.enable_ml
        sp_args = TSmartParserCacheClient.parse_args([])
        self.smart_parser_server_client = TSmartParserCacheClient(sp_args, self.logger)
        model_path = args.office_model_path
        self.max_failures_count = args.max_failures_count
        assert os.path.exists(model_path)
        bigrams_path = os.path.join(model_path, "office_ngrams.txt")
        ml_model_path = os.path.join(model_path, "model")
        self.office_ml_model = TTensorFlowOfficeModel(self.logger, bigrams_path, ml_model_path, create_model=False)
        self.regional_tax_offices = self.build_regional_tax_offices()
        self.web_sites = TDeclarationWebSiteList(self.logger, RUSSIA.offices_in_memory)
        self.title_parser = TOfficeFromTitle(self.logger, web_sites=self.web_sites)
        self.src_doc_to_rule_results = dict()

    def build_regional_tax_offices(self):
        o: TOfficeInMemory
        tax_offices = dict()
        for o in RUSSIA.iterate_offices():
            if o.rubric_id == TOfficeRubrics.Tax:
                tax_offices[o.region_id] = o.office_id
        assert len(tax_offices) > 0
        return tax_offices

    def set_office_id(self, sha256, src_doc: TSourceDocument, office_id, method_name: str):
        old_office_id = src_doc.calculated_office_id
        if old_office_id is None or office_id == old_office_id:
            self.logger.debug("set file {} office_id={} ({})".format(sha256, office_id, method_name))
        else:
            self.logger.info("change office_id from {} to {} for file {} ({})".format(
                old_office_id, office_id, sha256, method_name))
        src_doc.calculated_office_id = office_id
        self.dlrobot_human.update_source_document(sha256, src_doc)

    def predict_tax_office(self, sha256, src_doc: TSourceDocument):
        web_ref: TWebReference
        for web_ref in src_doc.web_references:
            if web_ref._site_url.endswith("service.nalog.ru"):
                if src_doc.region_id is None:
                    smart_parser_json = self.smart_parser_server_client.retrieve_json_by_sha256(sha256)
                    if smart_parser_json is None:
                        return False
                    props = smart_parser_json.get('document_sheet_props')
                    if props is None or len(props) == 0 or 'url' not in props[0]:
                        return False
                    url = props[0]['url']
                    region_str = url[:url.find('.')]
                    if not region_str.isdigit():
                        return False
                    src_doc.region_id = int(region_str)
                office_id = self.regional_tax_offices.get(src_doc.region_id)
                if office_id is not None:
                    self.set_office_id(sha256, src_doc, office_id, "regional tax office")
                    return True
        return False

    # all sites are ascribed to the same office
    def single_web_site(self, src_doc):
        r: TWebReference
        offices = set()
        for r in src_doc.web_references:
            if r.get_site_url():
                site_info = self.web_sites.search_url(r.get_site_url())
                if site_info is not None:
                    offices.add(site_info.parent_office.office_id)
        if len(offices) == 1:
            return list(offices)[0]
        return None

    # Take the first office; that is a very bad solution, done only to make the whole thing work.
    # In the future we hope to get rid of it by adding anchor text analysis or more sophisticated title parsing.
    def predict_by_first_web_site(self, case: TPredictionCase, src_doc):
        r: TWebReference
        min_crawl_epoch = time.time()
        office_id = None
        for r in src_doc.web_references:
            if 0 < r.crawl_epoch < min_crawl_epoch:
                site_info = self.web_sites.search_url(r.get_site_url())
                if site_info is not None:
                    min_crawl_epoch = r.crawl_epoch
                    office_id = site_info.parent_office.office_id
        return office_id
class TDlrobotHTTPServer(http.server.HTTPServer):
    max_continuous_failures_count = 7
    PITSTOP_FILE = ".dlrobot_pit_stop"

    @staticmethod
    def parse_args(arg_list):
        parser = argparse.ArgumentParser()
        parser.add_argument("--server-address", dest='server_address', default=None,
                            help="by default read it from environment variable DLROBOT_CENTRAL_SERVER_ADDRESS")
        parser.add_argument("--dlrobot-config-type", dest='dlrobot_config_type', required=False, default="prod",
                            help="can be prod, preliminary or test")
        parser.add_argument("--custom-offices-file", dest='offices_file', required=False)
        parser.add_argument("--log-file-name", dest='log_file_name', required=False, default="dlrobot_central.log")
        parser.add_argument("--remote-calls-file", dest='remote_calls_file', default=None)
        parser.add_argument("--result-folder", dest='result_folder', required=True)
        parser.add_argument("--tries-count", dest='tries_count', required=False, default=2, type=int)
        parser.add_argument("--central-heart-rate", dest='central_heart_rate', required=False, default='60s')
        parser.add_argument("--check-yandex-cloud", dest='check_yandex_cloud', default=False, action='store_true',
                            required=False, help="check yandex cloud health and restart workstations")
        parser.add_argument("--skip-worker-check", dest='skip_worker_check', default=False, action='store_true',
                            required=False, help="skip checking that this task was given to this worker")
        parser.add_argument("--enable-ip-checking", dest='enable_ip_checking', default=False, action='store_true',
                            required=False)
        parser.add_argument("--disable-smart-parser-server", dest="enable_smart_parser", default=True,
                            action="store_false", required=False)
        parser.add_argument("--disable-source-doc-server", dest="enable_source_doc_server", default=True,
                            action="store_false", required=False)
        parser.add_argument("--disable-search-engines", dest="enable_search_engines", default=True,
                            action="store_false", required=False)
        parser.add_argument("--disable-telegram", dest="enable_telegram", default=True, required=False,
                            action="store_false")
        parser.add_argument("--disable-pdf-conversion-server-checking", dest="pdf_conversion_server_checking",
                            default=True, required=False, action="store_false")
        parser.add_argument("--web-site-regexp", dest="web_site_regexp", required=False)
        parser.add_argument("--office-source-id", dest="office_source_id", required=False)
        parser.add_argument("--round-file", dest="round_file", default=TDeclarationRounds.default_dlrobot_round_path)
        args = parser.parse_args(arg_list)
        args.central_heart_rate = convert_timeout_to_seconds(args.central_heart_rate)
        if args.server_address is None:
            args.server_address = os.environ['DLROBOT_CENTRAL_SERVER_ADDRESS']
        if args.check_yandex_cloud:
            assert TYandexCloud.get_yc() is not None
        return args

    def __init__(self, args):
        self.register_task_result_error_count = 0
        self.logger = setup_logging(log_file_name=args.log_file_name, append_mode=True)
        self.conversion_client = TDocConversionClient(TDocConversionClient.parse_args([]), self.logger)
        self.args = args
        rounds = TDeclarationRounds(args.round_file)
        self.dlrobot_remote_calls = TRemoteDlrobotCallList(logger=self.logger, file_name=args.remote_calls_file,
                                                           min_start_time_stamp=rounds.start_time_stamp)
        self.worker_2_running_tasks = defaultdict(list)
        self.worker_2_continuous_failures_count = defaultdict(int)
        offices = TOfficeTableInMemory()
        offices.read_from_local_file(self.args.offices_file)
        self.web_sites_db = TDeclarationWebSiteList(self.logger, offices=offices)
        if not os.path.exists(self.args.result_folder):
            os.makedirs(self.args.result_folder)
        self.web_sites_to_process = self.find_projects_to_process()
        self.cloud_id_to_worker_ip = dict()
        self.config = TRobotConfig.read_by_config_type(self.args.dlrobot_config_type)
        self.last_remote_call = None  # for testing
        host, port = self.args.server_address.split(":")
        self.logger.debug("start server on {}:{}".format(host, port))
        super().__init__((host, int(port)), TDlrobotRequestHandler)
        self.last_service_action_time_stamp = time.time()
        self.service_action_count = 0
        self.decl_sender = TDeclarationSender(self.logger, self.args.enable_smart_parser,
                                              self.args.enable_source_doc_server)
        self.stop_process = False
        if self.args.enable_ip_checking:
            self.permitted_hosts = set(str(x) for x in ipaddress.ip_network('192.168.100.0/24').hosts())
            self.permitted_hosts.add('127.0.0.1')
            self.permitted_hosts.add('95.165.96.61')  # disclosures.ru
        self.logger.debug("init complete")
        self.send_to_telegram("start dlrobot central with {} tasks".format(len(self.web_sites_to_process)))

    def send_to_telegram(self, message):
        if self.args.enable_telegram:
            self.logger.debug("send to telegram: {}".format(message))
            telegram_send.send(messages=[message])

    def stop_server(self):
        self.server_close()
        self.shutdown()

    def verify_request(self, request, client_address):
        if self.args.enable_ip_checking:
            (ip, dummy) = client_address
            if ip not in self.permitted_hosts:
                return False
        return True

    def log_process_result(self, process_result):
        s = process_result.stdout.strip("\n\r ")
        if len(s) > 0:
            for line in s.split("\n"):
                self.logger.error("task stdout: {}".format(line))
        s = process_result.stderr.strip("\n\r ")
        if len(s) > 0:
            for line in s.split("\n"):
                self.logger.error("task stderr: {}".format(line))

    def have_tasks(self):
        return len(self.web_sites_to_process) > 0 and not self.stop_process

    def project_is_to_process(self, project_file):
        interactions = self.dlrobot_remote_calls.get_interactions(project_file)
        if sum(1 for i in interactions if i.task_was_successful()) > 0:
            return False
        tries_count = self.args.tries_count
        if sum(1 for i in interactions if not i.task_ended()) > 0:
            # if the last result was not obtained, maybe the worker is down,
            # so the problem is not in the task but in the worker;
            # give this task one more chance
            tries_count += 1
            self.logger.debug("increase max_tries_count for {} to {}".format(project_file, tries_count))
        return len(interactions) < tries_count

    def save_dlrobot_remote_call(self, remote_call: TRemoteDlrobotCall):
        self.dlrobot_remote_calls.add_dlrobot_remote_call(remote_call)
        if not remote_call.task_was_successful():
            if self.project_is_to_process(remote_call.project_file):
                self.web_sites_to_process.append(remote_call.web_site)
                self.logger.debug("register retry for {}".format(remote_call.web_site))

    def find_projects_to_process(self):
        web_sites_to_process = list()
        self.logger.info("filter web sites")
        web_site_info: TDeclarationWebSite
        for web_site, web_site_info in self.web_sites_db.web_sites.items():
            if self.args.web_site_regexp is not None:
                if re.match(self.args.web_site_regexp, web_site) is None:
                    continue
            if self.args.office_source_id is not None:
                if web_site_info.get_parent_source_id() != self.args.office_source_id:
                    continue
            if TWebSiteReachStatus.can_communicate(web_site_info.reach_status):
                project_file = TRemoteDlrobotCall.web_site_to_project_file(web_site)
                if self.project_is_to_process(project_file):
                    web_sites_to_process.append(web_site)
        self.logger.info("there are {} sites in the input queue".format(len(web_sites_to_process)))
        web_sites_to_process.sort(key=(lambda x: self.dlrobot_remote_calls.last_interaction[x]))
        with open("web_sites_to_process_debug.txt", "w") as out:
            for w in web_sites_to_process:
                out.write(w + "\n")
        return web_sites_to_process

    def get_running_jobs_count(self):
        return sum(len(w) for w in self.worker_2_running_tasks.values())

    def get_processed_jobs_count(self):
        return len(list(self.dlrobot_remote_calls.get_all_calls()))

    def get_new_project_to_process(self, worker_host_name, worker_ip):
        site_url = self.web_sites_to_process.pop(0)
        project_file = TRemoteDlrobotCall.web_site_to_project_file(site_url)
        self.logger.info("start job: {} on {} (host name={}), left jobs: {}, running jobs: {}".format(
            project_file, worker_ip, worker_host_name, len(self.web_sites_to_process),
            self.get_running_jobs_count()))
        remote_call = TRemoteDlrobotCall(worker_ip=worker_ip, project_file=project_file, web_site=site_url)
        remote_call.worker_host_name = worker_host_name
        web_site_passport = self.web_sites_db.get_web_site(site_url)
        regional_main_pages = list()
        if web_site_passport is None:
            self.logger.error(
                "{} is not registered in the web site db, no office information is available for the site".format(
                    site_url))
        project_content_str = TRobotProject.create_project_str(
            site_url, regional_main_pages, disable_search_engine=not self.args.enable_search_engines)
        self.worker_2_running_tasks[worker_ip].append(remote_call)
        return remote_call, project_content_str.encode("utf8")

    def untar_file(self, project_file, result_archive):
        base_folder, _ = os.path.splitext(project_file)
        output_folder = os.path.join(self.args.result_folder, base_folder) + ".{}".format(int(time.time()))
        compressed_file = io.BytesIO(result_archive)
        decompressed_file = gzip.GzipFile(fileobj=compressed_file)
        tar = tarfile.open(fileobj=decompressed_file)
        tar.extractall(output_folder)
        return output_folder

    def pop_project_from_running_tasks(self, worker_ip, project_file):
        if worker_ip not in self.worker_2_running_tasks:
            raise Exception("{} is missing in the worker table".format(worker_ip))
        worker_running_tasks = self.worker_2_running_tasks[worker_ip]
        for i in range(len(worker_running_tasks)):
            if worker_running_tasks[i].project_file == project_file:
                return worker_running_tasks.pop(i)
        raise Exception("{} is missing in the worker {} task table".format(project_file, worker_ip))

    def worker_is_banned(self, worker_ip, host_name):
        return self.worker_2_continuous_failures_count[(worker_ip, host_name)] > \
               TDlrobotHTTPServer.max_continuous_failures_count

    def update_worker_info(self, worker_host_name, worker_ip, exit_code):
        key = (worker_ip, worker_host_name)
        if exit_code == 0:
            self.worker_2_continuous_failures_count[key] = 0
        else:
            self.worker_2_continuous_failures_count[key] += 1
            if self.worker_is_banned(worker_ip, worker_host_name):
                self.send_to_telegram(
                    "too many dlrobot errors from ip {}, hostname={}, the host is banned, "
                    "you have to restart dlrobot_central to unban it".format(worker_ip, worker_host_name))

    def register_task_result(self, worker_host_name, worker_ip, project_file, exit_code, result_archive):
        if self.args.skip_worker_check:
            remote_call = TRemoteDlrobotCall(worker_ip, project_file)
        else:
            try:
                remote_call = self.pop_project_from_running_tasks(worker_ip, project_file)
            except:
                if ipaddress.ip_address(worker_ip).is_private:
                    self.logger.debug(
                        "try to get a result {} from a local ip {}, though this task was not dispatched".format(
                            project_file, worker_ip))
                    remote_call = TRemoteDlrobotCall(worker_ip, project_file)
                else:
                    raise
        self.update_worker_info(worker_host_name, worker_ip, exit_code)
        remote_call.worker_host_name = worker_host_name
        remote_call.exit_code = exit_code
        remote_call.end_time = int(time.time())
        project_folder = self.untar_file(project_file, result_archive)
        remote_call.calc_project_stats(self.logger, self.web_sites_db, project_folder, self.config)
        if not TWebSiteReachStatus.can_communicate(remote_call.reach_status):
            remote_call.exit_code = -1
        self.decl_sender.send_declaraion_files_to_other_servers(project_folder)
        self.save_dlrobot_remote_call(remote_call)
        self.last_remote_call = remote_call
        self.logger.debug("got exitcode {} for task result {} from worker {} (host_name = {})".format(
            exit_code, project_file, worker_ip, worker_host_name))

    def forget_old_remote_processes(self, current_time):
        for running_procs in self.worker_2_running_tasks.values():
            for i in range(len(running_procs) - 1, -1, -1):
                remote_call = running_procs[i]
                elapsed_seconds = current_time - remote_call.start_time
                if elapsed_seconds > self.config.get_kill_timeout_in_central():
                    self.logger.debug(
                        "task {} on worker {} (host={}) takes {} seconds, probably it failed, stop waiting for a result".format(
                            remote_call.web_site, remote_call.worker_ip, remote_call.worker_host_name,
                            elapsed_seconds))
                    running_procs.pop(i)
                    remote_call.exit_code = 126
                    self.save_dlrobot_remote_call(remote_call)

    def forget_remote_processes_for_yandex_worker(self, cloud_id):
        worker_ip = self.cloud_id_to_worker_ip.get(cloud_id)
        if worker_ip is None and len(self.cloud_id_to_worker_ip) > 0:
            self.logger.info("I do not remember ip for cloud_id {}, cannot delete processes".format(cloud_id))
            return
        running_procs = self.worker_2_running_tasks.get(worker_ip, list())
        for i in range(len(running_procs) - 1, -1, -1):
            rc = running_procs[i]
            self.logger.debug("forget task {} on worker {} since the workstation was stopped".format(
                rc.project_file, rc.worker_ip))
            running_procs.pop(i)
            rc.exit_code = 125
            self.save_dlrobot_remote_call(rc)
        if cloud_id in self.cloud_id_to_worker_ip:
            del self.cloud_id_to_worker_ip[cloud_id]

    def check_yandex_cloud(self):
        if not self.args.check_yandex_cloud:
            return None
        try:
            if not check_internet():
                self.logger.error("cannot connect to google dns, probably internet is down")
                return None
            for m in TYandexCloud.list_instances():
                cloud_id = m['id']
                if m['status'] == 'STOPPED':
                    self.forget_remote_processes_for_yandex_worker(cloud_id)
                    self.logger.info("start yandex cloud worker {}".format(cloud_id))
                    TYandexCloud.start_yandex_cloud_worker(cloud_id)
                elif m['status'] == "RUNNING":
                    worker_ip = TYandexCloud.get_worker_ip(m)
                    if self.args.enable_ip_checking:
                        self.permitted_hosts.add(worker_ip)
                    self.cloud_id_to_worker_ip[cloud_id] = worker_ip
        except Exception as exp:
            self.logger.error(exp)

    def check_pdf_conversion_server(self):
        if not self.args.pdf_conversion_server_checking:
            return True
        return not self.conversion_client.server_is_too_busy()

    def service_actions(self):
        current_time = time.time()
        if current_time - self.last_service_action_time_stamp >= self.args.central_heart_rate:
            self.service_action_count += 1
            if self.service_action_count % 10 == 0:
                self.logger.debug('alive')
            self.last_service_action_time_stamp = current_time
            if os.path.exists(self.PITSTOP_FILE):
                self.stop_process = True
                self.logger.debug("stop sending tasks, exit for a pit stop after all tasks complete")
                os.unlink(self.PITSTOP_FILE)
            if self.stop_process and self.get_running_jobs_count() == 0:
                self.logger.debug("exit via exception")
                raise Exception("exit for pit stop")
            try:
                self.forget_old_remote_processes(current_time)
            except Exception as exp:
                self.logger.error(exp)
            self.check_yandex_cloud()
            if not self.check_pdf_conversion_server():
                self.logger.debug("stop sending tasks, because conversion pdf queue length is {}".format(
                    self.conversion_client.last_pdf_conversion_queue_length))

    def get_stats(self):
        workers = dict((k, list(r.write_to_json() for r in v)) for (k, v) in self.worker_2_running_tasks.items())
        stats = {
            'running_count': self.get_running_jobs_count(),
            'input_tasks': len(self.web_sites_to_process),
            'processed_tasks': self.get_processed_jobs_count(),
            'worker_2_running_tasks': workers,
            'last_service_action_time_stamp': self.last_service_action_time_stamp,
            'central_heart_rate': self.args.central_heart_rate,
            'register_task_result_error_count': self.register_task_result_error_count
        }
        if self.stop_process:
            stats['stop_process'] = True
        return stats
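
# Illustrative only (not from the original sources): TDlrobotHTTPServer subclasses
# http.server.HTTPServer, so a plausible launcher parses the command line, constructs
# the server and calls the standard serve_forever(); the real dlrobot_central entry
# point may differ.
if __name__ == "__main__":
    import sys
    dlrobot_server = TDlrobotHTTPServer(TDlrobotHTTPServer.parse_args(sys.argv[1:]))
    try:
        dlrobot_server.serve_forever()
    except KeyboardInterrupt:
        dlrobot_server.stop_server()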