def __init__(self, original_domain: str, link: str, external_stop_event: multiprocessing.Event,
             max_thread: int = 1, download_content=True, download_base_dir=None,
             max_level=2, max_page=200):
    """Prepare an explorer that walks an archived copy of *original_domain*.

    :param original_domain: domain whose archived snapshot is explored.
    :param link: entry-point archive URL; seeds the internal download list.
    :param external_stop_event: event a parent process sets to abort the crawl.
    :param max_thread: worker-pool size; values below 1 are clamped to 1.
    :param download_content: when True, fetched resources are written to disk.
    :param download_base_dir: must be non-None when *download_content* is True.
    :param max_level: maximum crawl depth from the entry page.
    :param max_page: upper bound on the number of pages visited.
    :raises ValueError: if *download_content* is True but *download_base_dir* is None.
    """
    self._original_domain = original_domain
    self._archive_link = link
    self._external_stop_event = external_stop_event

    # Bookkeeping containers: PageAttrs for page comparison, LinkAttrs for the
    # pending-download queue, plus resources that failed to resolve.
    self._internal_pages = []      # PageAttrs of pages inside the target domain
    self._external_ref_page = []   # PageAttrs of externally referenced pages
    self._internal_list = []       # LinkAttrs still to be downloaded
    self._broken_res_list = []

    # Seed the download queue with the entry page itself (level 0).
    _link, _domain, res_path, _cls, _ext, frag = LinkUtility.get_link_detail(link)
    save_path, relative_ref = LinkUtility.make_valid_web_res_path(res_path, frag)
    seed = LinkAttrs(link=link, path=save_path, ref_link=relative_ref,
                     shadow_ref_link=relative_ref, source=save_path,
                     res_type=LinkUtility.EXT_WEBPAGE, level=0)
    self._internal_list.append(seed)

    # Crawl limits; a non-positive thread count falls back to a single worker.
    self._max_thread = max_thread if max_thread >= 1 else 1
    self._max_level = max_level
    self._current_level = 0
    self._max_page = max_page

    self._download_content = download_content
    if self._download_content and download_base_dir is None:
        raise ValueError("ArchiveExplorer.__init__: download_base_dir cannot be None.")
    # NOTE(review): download_base_dir is validated above but the file manager
    # always uses the default archive dir — confirm whether that is intentional.
    self._file_manager = SiteFileManager(base_dir_path=FilePath.get_default_archive_dir(),
                                         file_name=original_domain)
    self._file_manager.write_to_error_log(LinkAttrs.get_titles())

    # HTTP behaviour knobs and shared state.
    self._max_redirect = 10
    self._max_retries = 2
    self._pool = None
    self._sync_lock = threading.RLock()

    # Per-resource-type broken/total counters, all starting at zero.
    for counter in ("_broken_webpage_count", "_broken_image_count", "_broken_css_count",
                    "_broken_js_count", "_broken_others_count", "_total_webpage_count",
                    "_total_image_count", "_total_css_count", "_total_js_count",
                    "_total_others_count", "_total_res_done"):
        setattr(self, counter, 0)
    self._timeout = 10
from DomainFinderSrc.Utilities import FilePath from DomainFinderSrc.MiniServer.DatabaseServer.SiteDB import CategoryDomainSiteDB, CatagoryDomainSiteDataStruct from DomainFinderSrc.MiniServer.DatabaseServer.CategoryDB import * from DomainFinderSrc.MiniServer.DatabaseServer.DBManager import DBManagerInterface from threading import Event, RLock from DomainFinderSrc.Utilities.Logging import ErrorLogger from DomainFinderSrc.MiniServer.Common.SocketCommands import ServerState, MiningList, ServerCommand, CommandStruct from DomainFinderSrc.Utilities.Serializable import Serializable, NamedMutableSequence market_place_db_addr = FilePath.get_marketplace_db_path("MarketplaceSites.db") market_place_skeleton_db_addr = FilePath.get_marketplace_db_path( "MarketplaceSkeletonSites.db") def db_update_process(skeleton_db_addr: str = "", market_db_addr: str = ""): if len(skeleton_db_addr) == 0: skeleton_db_addr = market_place_skeleton_db_addr if len(market_db_addr) == 0: market_db_addr = market_place_db_addr skeleton_db_manager = CategoryDBManager(skeleton_db_addr) skeleton_db_manager.reset_category_count() len_per_patch = 20000 db = CategoryDomainSiteDB(market_db_addr) site_count = db.site_count(False) current_count = 0 while current_count < site_count: sites = db.get_next_patch_no_rollover(current_count, len_per_patch) for site in sites: if isinstance(site, CatagoryDomainSiteDataStruct): for topic in site.get_categories(): sub_category = skeleton_db_manager.get_sub_category(
def process_data(self, data: FilteredDomainData, **kwargs):
    """Run one domain through the Majestic spam / trust-flow filters.

    :param data: candidate domain record; its ``exception`` field is filled
        in when a filter rejects it.
    :param kwargs: must carry an ``Account`` entry (a SiteAccount) used to
        query the Majestic API.
    :return: *data* when it passes the tf/cf/ref-domain thresholds (and, in
        throughput-debug mode, even when it does not); otherwise None.
    """
    account = kwargs.get("Account")
    is_spammed = False
    try:
        if isinstance(data, FilteredDomainData) and isinstance(account, SiteAccount):
            majestic = MajesticCom(account)
            if self._en_spam_check:
                self._filter_domain_name(domain=data.domain)
            if self._en_tf_check:
                data = self._filter_tf_cf_backlink_ratio(majestic, data)
                if not (data.tf >= self._min_tf and data.ref_domains >= self._min_ref_domains):
                    raise ValueError("tf or cf doesn't match. tf:" + str(data.tf) + " cf: " + str(data.cf) + " ref domain: " + str(data.ref_domains))
            if self._en_spam_check:
                # Anchor/ref-domain checks run after the tf/cf gate so cheap
                # rejections skip the extra Majestic queries.
                self._filter_anchor_text(majestic, data.domain)
                self._filter_ref_domains(majestic, data.domain)
        else:
            raise ValueError("account is none in process_data")
    except MajesticSpamException as mjx_ex:
        is_spammed = True
        data.exception = str(mjx_ex)
    except Exception as ex:
        data.exception = str(ex)
    finally:
        # BUGFIX: the original dereferenced account.userID unconditionally,
        # raising AttributeError from finally whenever Account was missing —
        # the very case the try-block turns into a handled ValueError.
        user_id = account.userID if account is not None else "<no account>"
        PrintLogger.print("Majestic processed: '" + str(data) + "' with: " + user_id)
        if isinstance(data, FilteredDomainData):
            with self._sync_lock:
                self._job_done += 1
            if account is not None:
                account.Available = True
            if data.tf >= self._min_tf and data.cf >= self._min_cf and data.ref_domains >= self._min_ref_domains:
                if not self._is_throughput_debug:
                    # Log good and spammed domains to separate CSV files.
                    log_file = self._bad_log_file if is_spammed else self._log_file
                    CsvLogger.log_to_file(log_file, [data.to_tuple()],
                                          dir_path=FilePath.get_temp_db_dir())
                self._output_queue.put(data)
                return data
            elif self._is_throughput_debug:
                # In throughput-debug mode every record is forwarded downstream.
                self._output_queue.put(data)
def __init__(self, *args, TF=15, CF=15, CF_TF_Deviation=0.80, Ref_Domains=10,
             manager: AccountManager, accounts=None, en_tf_check=True,
             en_spam_check=True, **kwargs):
    """Configure the Majestic filter thresholds, keyword lists and accounts.

    :param TF: minimum trust flow a domain must have.
    :param CF: minimum citation flow a domain must have.
    :param CF_TF_Deviation: allowed cf/tf deviation ratio.
    :param Ref_Domains: minimum number of referring domains.
    :param manager: account manager used when *accounts* is not supplied.
    :param accounts: optional explicit SiteAccount list; defaults to the
        manager's Majestic accounts. (Fixed: the original used a mutable
        default argument ``accounts=[]``.)
    :param en_tf_check: enable the tf/cf threshold checks.
    :param en_spam_check: enable the spam keyword/anchor/ref-domain checks.
    :raises KeyError: if ``worker_number`` is absent from *kwargs* — it is a
        required keyword for FilterInterface sizing.
    """
    accounts = [] if accounts is None else accounts
    self._min_tf = TF
    self._min_cf = CF
    self._min_ref_domains = Ref_Domains
    acc_manager = manager
    self._cf_tf_deviation = CF_TF_Deviation
    # Majestic API paging limits and spam-heuristic ratios.
    self._majestic_result_anchor_limit = 50
    self._majestic_result_ref_domain_limit = 50
    self._max_backlink_to_ref_domain_ratio = 6.0
    self._max_percentage_for_anchor_text_ratio = 0.1
    self._en_spam_check = en_spam_check
    self._en_tf_check = en_tf_check
    self._log_file = "Majestic_filtering_good.csv"
    self._bad_log_file = "Majestic_filtering_bad.csv"
    # Keyword/anchor/country lists are normalised for case-insensitive matching.
    self._spam_keyword = [x.lower() for x in FileIO.FileHandler.read_lines_from_file(FilePath.get_spam_filter_keywords_file_path())]
    self._spam_anchor = [x.lower() for x in FileIO.FileHandler.read_lines_from_file(FilePath.get_spam_filter_anchors_file_path())]
    self._white_keyword_list = [x.lower() for x in FileIO.FileHandler.read_lines_from_file(FilePath.get_spam_filter_white_list_file_path())]
    self._bad_country = [x.upper() for x in FileIO.FileHandler.read_lines_from_file(FilePath.get_spam_filter_bad_country_path())]
    if len(accounts) == 0:
        self._account_list = acc_manager.get_accounts(AccountType.Majestic)
    else:
        self._account_list = [x for x in accounts if isinstance(x, SiteAccount)]
    # worker_number <= 0 means "one worker per available account".
    worker_number = kwargs["worker_number"]
    if worker_number <= 0:
        worker_number = len(self._account_list)
    kwargs.update({"worker_number": worker_number})
    FilterInterface.__init__(self, *args, **kwargs)
from DomainFinderSrc.Utilities import FilePath
from DomainFinderSrc.MiniServer.DatabaseServer.SiteDB import CategoryDomainSiteDB, CatagoryDomainSiteDataStruct
from DomainFinderSrc.MiniServer.DatabaseServer.CategoryDB import *
from DomainFinderSrc.MiniServer.DatabaseServer.DBManager import DBManagerInterface
from threading import Event, RLock
from DomainFinderSrc.Utilities.Logging import ErrorLogger
from DomainFinderSrc.MiniServer.Common.SocketCommands import ServerState, MiningList, ServerCommand, CommandStruct
from DomainFinderSrc.Utilities.Serializable import Serializable, NamedMutableSequence

market_place_db_addr = FilePath.get_marketplace_db_path("MarketplaceSites.db")
market_place_skeleton_db_addr = FilePath.get_marketplace_db_path("MarketplaceSkeletonSites.db")


def db_update_process(skeleton_db_addr: str="", market_db_addr: str=""):
    """Recount per-category site totals from the marketplace DB into the skeleton DB.

    Walks every site row of the marketplace database in fixed-size batches and
    increments the matching sub-category counter of the skeleton database.

    :param skeleton_db_addr: skeleton DB path; empty string means the default
        marketplace skeleton DB.
    :param market_db_addr: marketplace DB path; empty string means the default
        marketplace DB.
    """
    if len(skeleton_db_addr) == 0:
        skeleton_db_addr = market_place_skeleton_db_addr
    if len(market_db_addr) == 0:
        market_db_addr = market_place_db_addr
    skeleton_db_manager = CategoryDBManager(skeleton_db_addr)
    # Start all category counts from zero before the recount.
    skeleton_db_manager.reset_category_count()
    len_per_patch = 20000
    db = CategoryDomainSiteDB(market_db_addr)
    site_count = db.site_count(False)
    current_count = 0
    while current_count < site_count:
        sites = db.get_next_patch_no_rollover(current_count, len_per_patch)
        if not sites:
            # Defensive: an empty batch would otherwise loop forever.
            break
        for site in sites:
            if isinstance(site, CatagoryDomainSiteDataStruct):
                for topic in site.get_categories():
                    sub_category = skeleton_db_manager.get_sub_category(
                        CategoryManager.decode_sub_category(topic, False))
                    sub_category.count += 1
            # BUGFIX: advance the cursor for every row returned, not only for
            # rows of the expected struct type — otherwise a batch with no
            # matching rows never moves current_count and the loop never ends.
            current_count += 1
def __init__(self):
    """Resolve and remember the proxy-list file path for later use."""
    self._file_path = FilePath.get_proxy_file_path()
def __init__(self, original_domain: str, link: str, external_stop_event: multiprocessing.Event,
             max_thread: int = 1, download_content=True, download_base_dir=None,
             max_level=2, max_page=200):
    """Prepare an explorer that walks an archived copy of *original_domain*.

    :param original_domain: domain whose archived snapshot is explored.
    :param link: entry-point archive URL; seeds the internal download list.
    :param external_stop_event: event a parent process sets to abort the crawl.
    :param max_thread: worker-pool size; values below 1 are clamped to 1.
    :param download_content: when True, fetched resources are written to disk.
    :param download_base_dir: directory downloads are written under; required
        (non-None) when *download_content* is True.  Fixed: the original
        validated this argument but then ignored it, always using the default
        archive directory.
    :param max_level: maximum crawl depth from the entry page.
    :param max_page: upper bound on the number of pages visited.
    :raises ValueError: if *download_content* is True but *download_base_dir* is None.
    """
    self._original_domain = original_domain
    self._archive_link = link
    self._external_stop_event = external_stop_event
    self._internal_pages = []      # PageAttrs of pages inside the target domain
    self._external_ref_page = []   # PageAttrs of externally referenced pages
    self._internal_list = []       # LinkAttrs still to be downloaded
    self._broken_res_list = []
    # Seed the download queue with the entry page itself (level 0).
    inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(link)
    file_path, ref_path = LinkUtility.make_valid_web_res_path(path, fragment)
    self._internal_list.append(LinkAttrs(link=link, path=file_path, ref_link=ref_path,
                                         shadow_ref_link=ref_path, source=file_path,
                                         res_type=LinkUtility.EXT_WEBPAGE, level=0))
    self._max_thread = max_thread
    self._max_level = max_level
    self._current_level = 0
    self._max_page = max_page
    if max_thread < 1:
        self._max_thread = 1
    self._download_content = download_content
    if self._download_content and download_base_dir is None:
        raise ValueError("ArchiveExplorer.__init__: download_base_dir cannot be None.")
    # BUGFIX: honour the caller-supplied download_base_dir (previously it was
    # validated above but never used); fall back to the default archive dir.
    base_dir = download_base_dir if download_base_dir is not None else FilePath.get_default_archive_dir()
    self._file_manager = SiteFileManager(base_dir_path=base_dir, file_name=original_domain)
    self._file_manager.write_to_error_log(LinkAttrs.get_titles())
    # HTTP behaviour knobs and shared state.
    self._max_redirect = 10
    self._max_retries = 2
    self._pool = None
    self._sync_lock = threading.RLock()
    # Per-resource-type broken/total counters.
    self._broken_webpage_count = 0
    self._broken_image_count = 0
    self._broken_css_count = 0
    self._broken_js_count = 0
    self._broken_others_count = 0
    self._total_webpage_count = 0
    self._total_image_count = 0
    self._total_css_count = 0
    self._total_js_count = 0
    self._total_others_count = 0
    self._total_res_done = 0
    self._timeout = 10