Example #1
def extract_domain_of_url(url=None):
    """
    url中提取出domain
    :param url:网址
    :return
    正常:返回domain
    异常:返回None
    """
    if url is None:
        return
    no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=None)
    url = no_fetch_extract(url)
    if url.domain == "" or url.suffix == "":
        return
    else:
        return url.domain + '.' + url.suffix
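A minimal usage sketch (made-up inputs; assumes the function above is in scope and tldextract is installed) showing the behaviour described in the docstring:

# hypothetical calls; the bundled suffix snapshot is used because suffix_list_urls=None
print(extract_domain_of_url("https://www.example.co.uk/page"))  # 'example.co.uk'
print(extract_domain_of_url("localhost"))                       # None (no public suffix)
print(extract_domain_of_url())                                  # None (no url given)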
Example #2
    def setup(self, settings):
        '''
        Setup redis and tldextract
        '''
        self.extract = tldextract.TLDExtract()
        self.redis_conn = redis.Redis(host=settings['REDIS_HOST'],
                                      port=settings['REDIS_PORT'],
                                      db=settings.get('REDIS_DB'))

        try:
            self.redis_conn.info()
            self.logger.debug("Connected to Redis in ScraperHandler")
        except ConnectionError:
            self.logger.error("Failed to connect to Redis in ScraperHandler")
            # plugin is essential to functionality
            sys.exit(1)
Example #3
def read_domains():
    """
    Read the domains to probe from the database and parse out their registrable domains and top-level domains (first level).
    Note: malformed domains are discarded.
    """

    domains = []
    main_domains = []
    tlds = []
    no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=None)

    try:
        db = MySQL(SOURCE_CONFIG)
    except Exception as e:
        logger.logger.error(e)
        return False
Example #4
def get_site_type_from_url(url):
    """
    Gets a site type (as defined in profiles.models) from the given URL

    Args:
        url (str): A URL

    Returns:
        str: A string indicating the site type
    """
    no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=False)
    extract_result = no_fetch_extract(url)
    domain = extract_result.domain.lower()
    if domain in SITE_TYPE_OPTIONS:
        return domain
    return PERSONAL_SITE_TYPE
Example #5
    def __init__(self, config=None):
        if not config:
            # If there is not config specified, we load a non-interactive configuration.
            self.config = helper_config.non_interactive_config_resolver()
        elif not isinstance(config, helper_config.ConfigResolver):
            # If config is not a ConfigResolver, we are in a legacy situation.
            # We protect this part of the Client API.
            self.config = helper_config.legacy_config_resolver(config)
        else:
            self.config = config

        # Validate configuration
        self._validate_config()

        runtime_config = {}

        # Process domain, strip subdomain
        domain_extractor = tldextract.TLDExtract(
            cache_dir=_get_tldextract_cache_path(),
            include_psl_private_domains=True)
        domain_parts = domain_extractor(self.config.resolve("lexicon:domain"))
        runtime_config[
            "domain"] = f"{domain_parts.domain}.{domain_parts.suffix}"

        if self.config.resolve("lexicon:delegated"):
            # handle delegated domain
            delegated = self.config.resolve("lexicon:delegated").rstrip(".")
            if delegated != runtime_config.get("domain"):
                # convert to relative name
                if delegated.endswith(runtime_config.get("domain")):
                    delegated = delegated[:-len(runtime_config.get("domain"))]
                    delegated = delegated.rstrip(".")
                # update domain
                runtime_config[
                    "domain"] = f"{delegated}.{runtime_config.get('domain')}"

        self.action = self.config.resolve("lexicon:action")
        self.provider_name = self.config.resolve(
            "lexicon:provider_name") or self.config.resolve("lexicon:provider")

        self.config.add_config_source(
            helper_config.DictConfigSource(runtime_config), 0)

        provider_module = importlib.import_module("lexicon.providers." +
                                                  self.provider_name)
        provider_class = getattr(provider_module, "Provider")
        self.provider = provider_class(self.config)
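A standalone trace (made-up domain values, outside the Client class) of the subdomain stripping and delegated-domain handling above; the keyword arguments mirror the snippet, except suffix_list_urls=None, which keeps the sketch offline:

import tldextract

extractor = tldextract.TLDExtract(include_psl_private_domains=True, suffix_list_urls=None)
parts = extractor("www.example.com")
domain = f"{parts.domain}.{parts.suffix}"             # 'example.com' -- subdomain stripped

delegated = "dyn.example.com.".rstrip(".")            # as if lexicon:delegated were set
if delegated != domain and delegated.endswith(domain):
    delegated = delegated[:-len(domain)].rstrip(".")  # relative name: 'dyn'
domain = f"{delegated}.{domain}"                      # 'dyn.example.com'
print(domain)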
Example #6
def split_domain_into_subdomains(domain, split_tld=False):
    """
    Walks up a domain by subdomain.

    >>> split_domain_into_subdomains('this.is.a.test.skywww.net')
    ['this.is.a.test.skywww.net', 'is.a.test.skywww.net', 'a.test.skywww.net', 'test.skywww.net', 'skywww.net']

    """
    import tldextract

    # Requires unicode
    domain = ensure_decoded_text(domain)

    # Do not request latest TLD list on init == suffix_list_urls=False
    global _tldex
    if _tldex is None:
        _tldex = tldextract.TLDExtract(suffix_list_urls=False)

    tx = _tldex(domain)

    domains = []
    if tx.subdomain:
        domains.extend(tx.subdomain.split('.'))

    # tx.registered_domain returns only if domain AND suffix are not none
    # There are cases where we have domain and not suffix; ie short hostnames
    registered_domain = [tx.domain]
    if tx.suffix:
        registered_domain.append(tx.suffix)

    if split_tld:
        domains.extend(registered_domain)
    else:
        domains.append('.'.join(registered_domain))

    # Musical chairs. Change places!
    domains.reverse()

    def join_dom(a, b):
        return '.'.join([b, a])

    # Take each part and add it to the previous part, returning all results
    domains = list(accumulate(domains, func=join_dom))
    # Change places!
    domains.reverse()

    return domains
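For comparison, a sketch (derived by tracing the code above, assuming its helpers such as ensure_decoded_text and accumulate are in scope) of what the same call yields with split_tld=True, which also walks into the suffix:

print(split_domain_into_subdomains('this.is.a.test.skywww.net', split_tld=True))
# ['this.is.a.test.skywww.net', 'is.a.test.skywww.net', 'a.test.skywww.net',
#  'test.skywww.net', 'skywww.net', 'net']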
Example #7
    def __init__(self, cookiefile):
        with open(cookiefile, 'rb') as db, \
            tempfile.NamedTemporaryFile(delete=False, prefix='fxcookie-') as tmp:
            shutil.copyfileobj(db, tmp)
            tmp.flush()

        self._file_to_delete = tmp.name
        self._db = sqlite3.connect(tmp.name)
        # self._db = sqlite3.connect(cookiefile)
        self._extract = tldextract.TLDExtract(include_psl_private_domains=True)
        try:
            self._db.execute(
                "select 1 from moz_cookies where originAttributes = '' limit 1"
            ).fetchall()
            self._origin = True
        except sqlite3.OperationalError:
            self._origin = False
Example #8
    def __init__(self, config=None):
        if not config:
            # If there is not config specified, we load a non-interactive configuration.
            self.config = non_interactive_config_resolver()
        elif not isinstance(config, ConfigResolver):
            # If config is not a ConfigResolver, we are in a legacy situation.
            # We protect this part of the Client API.
            self.config = legacy_config_resolver(config)
        else:
            self.config = config

        # Validate configuration
        self._validate_config()

        runtime_config = {}

        # Process domain, strip subdomain
        domain_extractor = tldextract.TLDExtract(cache_file=TLDEXTRACT_CACHE_FILE,
                                                 include_psl_private_domains=True)
        domain_parts = domain_extractor(
            self.config.resolve('lexicon:domain'))
        runtime_config['domain'] = '{0}.{1}'.format(
            domain_parts.domain, domain_parts.suffix)

        if self.config.resolve('lexicon:delegated'):
            # handle delegated domain
            delegated = self.config.resolve('lexicon:delegated').rstrip('.')
            if delegated != runtime_config.get('domain'):
                # convert to relative name
                if delegated.endswith(runtime_config.get('domain')):
                    delegated = delegated[:-len(runtime_config.get('domain'))]
                    delegated = delegated.rstrip('.')
                # update domain
                runtime_config['domain'] = '{0}.{1}'.format(
                    delegated, runtime_config.get('domain'))

        self.action = self.config.resolve('lexicon:action')
        self.provider_name = (self.config.resolve('lexicon:provider_name')
                              or self.config.resolve('lexicon:provider'))

        self.config.add_config_source(DictConfigSource(runtime_config), 0)

        provider_module = importlib.import_module(
            'lexicon.providers.' + self.provider_name)
        provider_class = getattr(provider_module, 'Provider')
        self.provider = provider_class(self.config)
Example #9
def is_valid_redirect(next_param):
    # add local domain suffix because it is non-standard
    extract_with_extra_suffix = tldextract.TLDExtract(extra_suffixes=["great"])
    extracted_domain = extract_with_extra_suffix(next_param)
    # Allow internal redirects
    is_domain = bool(extracted_domain.domain) and bool(extracted_domain.suffix)
    # NOTE: The extra is_domain check is necessary because otherwise
    # for example ?next=//satan.com would redirect even if
    # satan.com is not an allowed redirect domain
    if next_param.startswith('/') and not is_domain:
        return True

    # Otherwise check we allow that domain/suffix
    domain = '.'.join([extracted_domain.domain, extracted_domain.suffix])
    return (domain in settings.ALLOWED_REDIRECT_DOMAINS) or (
        extracted_domain.suffix in settings.ALLOWED_REDIRECT_DOMAINS)
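A small sketch (sample URLs are made up) of how extra_suffixes changes the parse that is_valid_redirect relies on:

import tldextract

extractor = tldextract.TLDExtract(extra_suffixes=["great"])
parts = extractor("//satan.com/phish")
print(parts.domain, parts.suffix)   # 'satan' 'com' -> is_domain is True, so a leading '/' alone is not trusted
parts = extractor("sso.service.great")
print(parts.domain, parts.suffix)   # 'service' 'great' -> the non-standard local suffix is recognised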
Example #10
    def initial_seeds(self):
        """初始化调度器"""

        while True:
            initial_len = self.server.llen('seeds')
            if initial_len:
                break
            time.sleep(180)
            continue

        self.logger.debug('Fetching the initial seed list.........')
        while True:
            tasks = self.server.lrange('seeds', 0, -1)
            self.server.ltrim('seeds', -1, 0)
            self.tasks.extend(tasks)
            if self.tasks:
                break

        self.logger.debug('Fetching the initial number of spider processes.........')
        self.spiders = self.server.keys('stats:spider:*:*')  # list of spiders
        self.spider_count = len(self.spiders)

        if self.spider_count:
            self.logger.debug('Placing spider nodes with the consistent-hashing algorithm.......')
            job_ids = []
            for spider in self.spiders:
                job_ids.append(spider.split(':')[3])
            self.chose = ketama.Continuum(job_ids)

            self.logger.debug('Distributing the initial seed URL queues........')
            for task_json in self.tasks:
                task = pickle.loads(task_json)
                if 'url' in task and 'spider_type' in task:
                    extract = tldextract.TLDExtract()
                    url = task['url']
                    spider_type = task['spider_type']
                    domain = extract(url).domain
                    job_id = self.chose[url.encode('utf-8')]
                    queue_key = '{spider_type}:{job_id}:{domain}:queue'.format(spider_type=spider_type,
                                                                               job_id=job_id,
                                                                               domain=domain)
                    priority = task['priority']
                    self.server.zadd(queue_key, pickle.dumps(task), priority)
                else:
                    self.logger.error("please input url and spider_type that you want to crawl!")
Example #11
    def extract(self):
        """
        extract domain

        >>> d = Domain('www.example.com')
        <domain.Domain object>
        >>> d.extract()
        ExtractResult(subdomain='www', domain='example', suffix='com')

        :return: extracted domain results
        """
        data_storage_dir = settings.data_storage_dir
        extract_cache_file = data_storage_dir.joinpath('public_suffix_list.dat')
        ext = tldextract.TLDExtract(extract_cache_file, None)
        result = self.match()
        if result:
            return ext(result)
        return None
Example #12
def makeData(black="./data/dga.txt", white="./data/top-1m.csv"):
    X = []
    Y = []
    no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=None)
    with open(black, 'r') as f:
        data = f.readlines()
        for i in data:
            X.append(domain2list(no_fetch_extract(i.strip()).domain))
            Y.append([0])
    with open("./data/top-1m.csv", 'r') as f:
        data = f.readlines()
        for i in data:
            X.append(
                domain2list(no_fetch_extract(i.strip().split(',')[1]).domain))
            Y.append([1])
    X = np.mat(X)
    Y = np.mat(Y)
    return X, Y
Example #13
    def __init__(self):
        self.ad_objects = []
        self.tld_extract = tldextract.TLDExtract(suffix_list_urls=None)
        # Stores the set of unique domain.hash  to reduce load on the clustering algo
        self.domain_hashes = set()

        # domain_ids... maps a domain to an integer ID
        self.domain_ids = {}

        # ad_objects indices of samples being considered for clustering 
        self.input_indices = []
        self.feature_matrix_hash = []
        self.feature_matrix_domain = []

        # label --> [index in ad_objects] 
        self.clusters = {}
        self.fetch_data()
        self.cluster_images()
Example #14
def test_list_source():
    extractor = tldextract.TLDExtract(cache_dir=None)
    extractor._extractor = _SuffixListTLDExtractor({  # pylint: disable=locally-disabled, protected-access
        SOURCE_PUBLICSUFFIX_ICANN: set(['com', 'net']),
        SOURCE_PUBLICSUFFIX_PRIVATE: set(['blogspot.com']),
    })

    result = extractor('hi.blogspot.com', include_psl_private_domains=False)
    assert result.subdomain == 'hi'
    assert result.domain == 'blogspot'
    assert result.suffix == 'com'
    assert result.source == SOURCE_PUBLICSUFFIX_ICANN

    result = extractor('hi.blogspot.com', include_psl_private_domains=True)
    assert result.subdomain == ''
    assert result.domain == 'hi'
    assert result.suffix == 'blogspot.com'
    assert result.source == SOURCE_PUBLICSUFFIX_PRIVATE
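The mocked suffix sets above make the test deterministic; the same ICANN-versus-private split can be sketched with the real extractor (assuming a tldextract version whose __call__ accepts include_psl_private_domains, as in the test):

import tldextract

extractor = tldextract.TLDExtract(suffix_list_urls=None)  # bundled snapshot, no network
icann = extractor('hi.blogspot.com', include_psl_private_domains=False)
private = extractor('hi.blogspot.com', include_psl_private_domains=True)
print(icann.subdomain, icann.domain, icann.suffix)        # expected: 'hi' 'blogspot' 'com'
print(private.subdomain, private.domain, private.suffix)  # expected: '' 'hi' 'blogspot.com'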
Example #15
def parse_url(url, isupdate=False):
    """
    Parse a URL into its scheme, subdomain, suffix, registered domain, netloc and path.

    :param url: the URL to parse
    :param isupdate: if True, refresh the public suffix list before extracting
    :return: [scheme, subdomain, suffix, domain.suffix, netloc, path, url]
    """
    o = urlparse(url)
    scheme = o.scheme
    netloc = o.netloc
    path = o.path
    extract = tldextract.TLDExtract()
    if isupdate:
        extract.update()
    ext = extract(url)
    return [
        scheme, ext.subdomain, ext.suffix, ext.domain + "." + ext.suffix,
        netloc, path, url
    ]
Example #16
def detect_type(indicator):
    """Infer the type of the indicator.
    Args:
        indicator(str): The indicator whose type we want to check.
    Returns:
        str. The type of the indicator.
    """
    if re.match(sha256Regex, indicator) or re.match(
            md5Regex, indicator) or re.match(sha1Regex, indicator):
        return FeedIndicatorType.File

    if re.match(ipv4cidrRegex, indicator):
        return FeedIndicatorType.CIDR

    if re.match(ipv6cidrRegex, indicator):
        return FeedIndicatorType.IPv6CIDR

    if re.match(ipv4Regex, indicator):
        return FeedIndicatorType.IP

    if re.match(ipv6Regex, indicator):
        return FeedIndicatorType.IPv6

    if re.match(urlRegex, indicator):
        return FeedIndicatorType.URL

    if re.match(emailRegex, indicator):
        return FeedIndicatorType.Email

    try:
        # we use the TLDExtract class to fetch all existing domain suffixes from the file mentioned below:
        # https://raw.githubusercontent.com/publicsuffix/list/master/public_suffix_list.dat
        # suffix_list_urls=None is used so the extraction makes no HTTP calls, avoiding SSL errors
        if tldextract.TLDExtract(
                cache_file='https://raw.githubusercontent.com/publicsuffix'
                '/list/master/public_suffix_list.dat',
                suffix_list_urls=None)(indicator).suffix:
            if '*' in indicator:
                return FeedIndicatorType.DomainGlob
            return FeedIndicatorType.Domain
    except Exception:
        pass

    return None
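A standalone sketch (made-up indicators) of the suffix check that drives the Domain/DomainGlob branches; suffix_list_urls=None keeps it offline, as the comment above explains:

import tldextract

offline_extract = tldextract.TLDExtract(suffix_list_urls=None)
print(bool(offline_extract('demisto.com').suffix))       # True  -> FeedIndicatorType.Domain
print(bool(offline_extract('*.demisto.com').suffix))     # True, and the '*' makes it a DomainGlob
print(bool(offline_extract('not-an-indicator').suffix))  # False -> detect_type would return None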
Example #17
    def __init__(self, log_id, start_url=None, agent_name="edge_win"):
        self.start_time = time.time()
        self.agent_name = agent_name
        self.log_id = log_id
        self.logger = logging.getLogger(log_id)

        self.screenshots_dir_path = os.path.join(config.MAIN_LOG_PATH,
                                                 config.SCREENSHOTS_DIR,
                                                 self.log_id)
        os.mkdir(self.screenshots_dir_path)

        self.html_logs_dir_path = os.path.join(config.MAIN_LOG_PATH,
                                               config.HTML_LOGS_DIR,
                                               self.log_id)
        os.mkdir(self.html_logs_dir_path)

        # Every time the browser is restarted the browser_counter increases
        # this counter is used to name the JSGraph log file uniquely.
        self.browser_counter = 1

        tab = self.open_browser(start_url)
        self.start_url = start_url

        self._save_html(tab)

        # Useful when milking
        sshot_path = self._take_screenshot(tab)
        self.logger.info("The screenshot of loaded home page: %s", sshot_path)
        # Sometimes, websites might redirect to a different site. We would like to use the
        # name of this redirected site instead of the original one
        self.url = self.devtools_client.get_tab_url(tab)

        self.tabs_opened = 0
        self.overall_tabs_opened = 0

        self.logger.info("Home URL: %s", self.url)
        # current state.
        self.state = []
        self.restart = False
        # VERY STRANGELY, some ad networks fail to display ads when there is a live TLD lookup from Python code!
        # I have no idea why this request is interfering with that at all.
        # However, we are disabling the live lookup of suffixes. Only the stored list is used.
        self.tld_extract = tldextract.TLDExtract(suffix_list_urls=None)
        self.home_domain = self.tld_extract(self.url).registered_domain
Example #18
    def extract(self):
        """
        Domain extraction

        >>> d = Domain('www.example.com')
        <domain.Domain object>
        >>> d.extract()
        ExtractResult(subdomain='www', domain='example', suffix='com')

        :return: extraction result
        """
        extract_cache_file = config.data_storage_dir.joinpath(
            'public_suffix_list.dat')
        tldext = tldextract.TLDExtract(extract_cache_file)
        result = self.match()
        if result:
            return tldext(result)
        else:
            return None
Example #19
    def handle_noargs(self, **options):
        self.setup_logging(verbosity=options.get('verbosity', 1))

        filename = getattr(
            settings, 'MULTISITE_PUBLIC_SUFFIX_LIST_CACHE',
            os.path.join(tempfile.gettempdir(), 'multisite_tld.dat'))
        self.log("Updating {filename}".format(filename=filename))

        with tempfile.NamedTemporaryFile(dir=os.path.dirname(filename)) as f:
            tmpname = f.name

            extract = tldextract.TLDExtract(fetch=True, cache_file=tmpname)
            extract._get_tld_extractor()
            self.log(
                "Downloaded new data to {filename}".format(filename=tmpname))
            os.rename(tmpname, filename)
            f.delete = False  # No need to delete f any more.

        self.log("Done.")
Example #20
def read_domains(file_name):
    """
    Read the domain storage file, collect the domains to probe, and extract their registrable domains.
    Note: malformed domains are discarded.
    """
    domains = []
    main_domains = []
    no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=None)
    file_path = './unverified_domain_data/'
    with open(file_path + file_name, 'r') as fp:
        for d in fp.readlines():
            domain_tld = no_fetch_extract(d.strip())
            tld, reg_domain = domain_tld.suffix, domain_tld.domain  # extract the top-level domain and registrable-domain parts
            if tld and reg_domain:
                main_domains.append(reg_domain + '.' + tld)
                domains.append(d.strip())
            else:
                logger.logger.warning('Domain %s is malformed; skipping probe' % d.strip())

    return domains, main_domains
Example #21
    def __init__(self, server, persist, update_int, timeout, retries, logger,
                 hits, window, mod, add_type, backlog_blacklist,
                 queue_timeout):
        self.redis_conn = server
        self.persist = persist
        self.queue_dict = {}
        self.update_interval = update_int
        self.hits = hits
        self.window = window
        self.moderated = mod
        self.rfp_timeout = timeout
        self.add_type = add_type
        self.item_retires = retries
        self.logger = logger
        self.backlog_blacklist = backlog_blacklist
        self.queue_timeout = queue_timeout
        self.extract = tldextract.TLDExtract()

        self.job_id = None  # identifies the spider process
        self.task = None
        self.paused = False  # flags whether the spider is paused
Example #22
def extract(url, *, include_psl_private_domains=False):
  cache_dir = save_cache_path('tldextract')
  last_updated = os.path.join(cache_dir, 'last_updated')
  extractor = tldextract.TLDExtract(
    cache_dir = cache_dir,
    include_psl_private_domains = include_psl_private_domains,
  )

  update = False
  try:
    t = os.path.getmtime(last_updated)
    if time.time() - t > 86400 * 7:
      update = True
  except FileNotFoundError:
    update = True

  if update:
    extractor.update()
    with open(last_updated, 'w'): pass

  return extractor(url)
Example #23
    def setup(self, settings):
        '''
        Setup redis and tldextract
        '''
        self.extract = tldextract.TLDExtract()
        self.redis_conn = redis.Redis(
            host=settings['REDIS_HOST'],
            port=settings['REDIS_PORT'],
            db=settings.get('REDIS_DB'),
            password=settings['REDIS_PASSWORD'],
            decode_responses=True,
            socket_timeout=settings.get('REDIS_SOCKET_TIMEOUT'),
            socket_connect_timeout=settings.get('REDIS_SOCKET_TIMEOUT'))

        try:
            self.redis_conn.info()
            self.logger.debug("Connected to Redis in ScraperHandler")
        except ConnectionError:
            self.logger.error("Failed to connect to Redis in ScraperHandler")
            # plugin is essential to functionality
            sys.exit(1)
Example #24
def split_domain_into_subdomains(domain, split_tld=False):
    if not hasattr(split_domain_into_subdomains, '_tldex'):
        import tldextract
        # Do not request latest TLD list on init == suffix_list_url=False
        split_domain_into_subdomains._tldex = tldextract.TLDExtract(
            suffix_list_url=False)
    _tldex = split_domain_into_subdomains._tldex

    # Requires unicode
    domain = ensure_decoded_text(domain)

    tx = _tldex(domain)

    domains = []
    if tx.subdomain:
        domains.extend(tx.subdomain.split('.'))

    # tx.registered_domain returns only if domain AND suffix are not none
    # There are cases where we have domain and not suffix; ie short hostnames
    registered_domain = [tx.domain]
    if tx.suffix:
        registered_domain.append(tx.suffix)

    if split_tld:
        domains.extend(registered_domain)
    else:
        domains.append('.'.join(registered_domain))

    # Musical chairs. Change places!
    domains.reverse()

    def join_dom(a, b):
        return '.'.join([b, a])

    # Take each part and add it to the previous part, returning all results
    domains = list(accumulate(domains, func=join_dom))
    # Change places!
    domains.reverse()

    return domains
Example #25
def auto(page_url, **kwargs):
    default_options = {"index": 0, "album": False, "username": False}

    for k, v in default_options.items():
        if k not in kwargs:
            kwargs[k] = v

    domains = {
        "artstation": artstation,
        "pixiv": pixiv,
        "hentai-foundry": hentai_foundry,
        "deviantart": deviantart,
        "furaffinity": furaffinity,
    }

    no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=None)

    domain = no_fetch_extract(page_url).domain

    if domain in domains:
        return domains[domain](page_url, kwargs)

    raise exc.UnsupportedSite(page_url)
Example #26
def test_cache_permission(mocker, monkeypatch, tmpdir):
    """Emit a warning once that this can't cache the latest PSL."""

    warning = mocker.patch.object(logging.getLogger("tldextract.cache"),
                                  "warning")

    def no_permission_makedirs(*args, **kwargs):
        raise PermissionError("""[Errno 13] Permission denied:
            '/usr/local/lib/python3.7/site-packages/tldextract/.suffix_cache"""
                              )

    monkeypatch.setattr(os, "makedirs", no_permission_makedirs)

    for _ in range(0, 2):
        my_extract = tldextract.TLDExtract(cache_dir=tmpdir)
        assert_extract(
            "http://www.google.com",
            ("www.google.com", "www", "google", "com"),
            funs=(my_extract, ),
        )

    assert warning.call_count == 1
    assert warning.call_args[0][0].startswith("unable to cache")
Example #27
    def __init__(self):
        self.ad_objects = []
        self.tld_extract = tldextract.TLDExtract(suffix_list_urls=None)
        # Stores the set of unique domain.hash  to reduce load on the clustering algo
        self.domain_hashes = set()

        # domain_ids... maps a domain to an integer ID
        self.domain_ids = {}

        # ad_objects indices of samples being considered for clustering
        self.input_indices = []
        self.feature_matrix_hash = []
        self.feature_matrix_domain = []

        # label --> [index in ad_objects]
        self.clusters = {}
        # self.fetch_data()
        self.fetch_clustered_data()
        print(len(self.clusters))
        self.filter_clusters()
        print("Number of clusters after filtering:", len(self.clusters))
        print("# of SE clusters:",
              sum([len(x) for x in se_categories.values()]))
        se_clusters_1 = set([int(x) for x in self.clusters.keys()])
        se_clusters_2 = set()
        for cluster_list in se_categories.values():
            se_clusters_2 = se_clusters_2.union(set(cluster_list))
        print(se_clusters_1)
        print("***" * 10)
        print(se_clusters_2)
        print("***" * 10)
        print("Missing clusters:", se_clusters_1.difference(se_clusters_2))

        print("Different clusters:")
        self.fetch_mal_ad_hashes()
        self.fetch_ad_network_counts()
Example #28
def main():
    tld_extractor = tldextract.TLDExtract(suffix_list_urls=None)
    ad_objs = ad_object.parse_ad_objects(CLUSTERED_ADOBJECTS_PATH)
    ad_domains = set()
    image_hashes = set()
    milking_url_domains = set()
    upstream_domains = set()
    milking_urls = []
    for ad in ad_objs:
        ad_domain, milking_url, milking_domain, curr_upstream_domains = get_milking_url(
            ad, tld_extractor)
        if milking_domain and milking_domain not in milking_url_domains:
            # print "Milking url: ", milking_url
            # import ipdb; ipdb.set_trace()
            milking_url_domains.add(milking_domain)
            milking_urls.append(milking_url)
        image_hashes.add(ad.screenshot_hash)
        ad_domains.add(ad_domain)
        upstream_domains = upstream_domains.union(curr_upstream_domains)
        # home_domain = self.tld_extract(ad).registered_domain
    print(len(ad_objs))
    print("# Ad domains: ", len(ad_domains))
    print("# Image hashes: ", len(image_hashes))
    print("# Milking URLS: ", len(milking_urls))
    print("# Upstream domains: ", len(upstream_domains))
    print("# All domains:", len(
        ad_domains.union(milking_url_domains).union(upstream_domains)))
    dump_object = {
        "ad_domains": list(ad_domains),
        "image_hashes": list(image_hashes),
        "milking_urls": milking_urls,
        "upstream_domains": list(upstream_domains)
    }
    with open(MILKING_URLS_PATH, "wb") as f:
        dump_str = json.dumps(dump_object)
        f.write(dump_str)
Example #29
    def __init__(self, server, persist, update_int, timeout, retries, logger,
                 hits, window, mod, ip_refresh, add_type, add_ip, ip_regex,
                 backlog_blacklist, queue_timeout, chose):
        self.redis_conn = server
        self.persist = persist
        self.queue_dict = {}
        self.update_interval = update_int
        self.hits = hits
        self.window = window
        self.moderated = mod
        self.rfp_timeout = timeout
        self.ip_update_interval = ip_refresh
        self.add_type = add_type
        self.add_ip = add_ip
        self.item_retires = retries
        self.logger = logger
        self.ip_regex = re.compile(ip_regex)
        self.backlog_blacklist = backlog_blacklist
        self.queue_timeout = queue_timeout
        self.chose = chose
        self.extract = tldextract.TLDExtract()

        self.job_id = None  # identifies the spider process
        self.paused = False  # flags whether the spider is paused
Example #30
def link_is_blocked(blocked_links, url):

    # TLDExtract extracts the top-level domain from the
    # registered domain and subdomains of a URL. For example,
    # to get just the 'google' part of 'http://www.google.com'
    # or 'http://google.com.sg'.
    #
    # By default, when the module is first run, it updates
    # its TLD list with a live HTTP request. This updated TLD
    # list is cached indefinitely in /path/to/tldextract/.tld_set
    #
    # Here the extractor is constructed so that it does not use the cache.
    no_cache_extract = tldextract.TLDExtract(cache_file=False)

    # How to use TLDExtract
    # Usage: no_cache_extract('http://forums.news.cnn.com/')
    # Result: ExtractResult(subdomain='forums.news', domain='cnn', suffix='com')
    extracted = no_cache_extract(url)
    url_domain = extracted.domain + "." + extracted.suffix

    for site in blocked_links:
        # if the url's registered domain matches a blocked pattern
        if re.match(site, url_domain):
            return True

    return False