def urlscan_screenshot(url):
    # First, let's see if it's been indexed already.
    domain = tldextract.extract(url).fqdn
    print("\t\t[-] Looking up domain screenshot {}".format(domain))
    if lookup_screenshot(url):
        print("\t\t[-] Already screenshotted...")
        return
    else:
        print("\t\t[-] No screenshot yet, sending to urlscan...")
        urlscanio = "https://urlscan.io/api/v1/scan/"
        key = urlscanapikey
        headers = {
            "Content-Type": "application/json",
            "API-Key": key
        }
        data = {
            "url": url,
            "public": "on"
        }
        try:
            r = requests.post(urlscanio, headers=headers, json=data)
            save_screenshot(url, r.json()['result'])
        except Exception as e:
            print(e)
        time.sleep(2)
Example #2
 def get_title_for_classifier(self):
     title = self.soup.find("title")
     domain = tldextract.extract(self.current_link)
     return_string = domain.domain + ", " + re.sub(
         r'[^\w]', " ",
         title.get_text().strip().lower().replace(domain.domain, ""))
     return return_string
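
The method above depends on instance state (`self.soup`, `self.current_link`). A minimal standalone sketch of the same idea, assuming BeautifulSoup and tldextract are available (the helper name and sample HTML are made up):

import re

import tldextract
from bs4 import BeautifulSoup


def title_for_classifier(url, html):
    # Lowercase the <title>, strip the domain label out of it, and keep only
    # word characters, mirroring the method above.
    soup = BeautifulSoup(html, "html.parser")
    title = soup.find("title")
    domain = tldextract.extract(url)
    cleaned = re.sub(
        r'[^\w]', " ",
        title.get_text().strip().lower().replace(domain.domain, ""))
    return domain.domain + ", " + cleaned


print(title_for_classifier("https://example.com/post",
                           "<html><title>Example Domain News</title></html>"))
# example,  domain news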
Example #3
    def __init__(self, url, config=None, **kwargs):
        """The config object for this source will be passed into all of this
        source's children articles unless specified otherwise or re-set.
        """
        if (url is None) or ('://' not in url) or (url[:4] != 'http'):
            raise Exception('Input url is bad!')

        self.config = config or Configuration()
        self.config = utils.extend_config(self.config, kwargs)

        self.extractor = self.config.extractor(self.config)

        self.url = url
        self.url = urls.prepare_url(url)

        self.domain = urls.get_domain(self.url)
        self.scheme = urls.get_scheme(self.url)

        self.categories = []
        self.feeds = []
        self.articles = []

        self.html = ''
        self.doc = None

        self.logo_url = ''
        self.favicon = ''
        self.brand = tldextract.extract(self.url).domain
        self.description = ''

        self.is_parsed = False
        self.is_downloaded = False
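
The `brand` attribute set above is just the registrable domain's label, which tldextract returns even when the public suffix has more than one part; a quick illustration:

import tldextract

print(tldextract.extract("http://news.bbc.co.uk/article").domain)  # bbc
print(tldextract.extract("https://www.example.com").domain)        # example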
Example #4
def enviar_noticias(resultado, id_chat, Nombre_Grupo, provincias, tema):

    try:
        url_api = "bot1477154971:AAHz2Ok9QD8bwzkAxIqqZc64GNPeqGjuRTI/sendMessage"
        temas = ''.join(tema)

        for m in resultado:
            titulo = m[1]
            linkNoticia = m[0]
            descripcion = m[2]
            contenido = m[3]
            extracted = tldextract.extract(linkNoticia)
            medio = "{}.{}".format(extracted.domain, extracted.suffix)
            medio = medio.replace(".com", "").replace(".ar", "")

            mensaje = "Medio: " + medio + "\n\n" + "    Última Noticia: " + titulo + "\n\n" + "    Ver más en ->" + linkNoticia
            response = requests.post('https://api.telegram.org/' + url_api,
                                     data={
                                         'chat_id': id_chat,
                                         'text': mensaje
                                     })
            print(response.status_code)
            try:
                mycursor = mydb.cursor()
                sql = "INSERT INTO noticias_enviadas (link,tema,id_grupo) " \
                      "VALUES (%s, %s, %s) "
                val = (linkNoticia, temas, id_chat)
                mycursor.execute(sql, val)
                mydb.commit()
                print("insertó correctamente el link: " + linkNoticia + "")
            except Exception as e:
                print("El Link ya fue guardado: " + linkNoticia + "")

    except Exception as e:
        print(" 279 - enviar ", e)
Example #5
    def compare_to(self, compared_url):
        """ Get other website url and a word
            Finds the number of occurrences in the compared website
            Prints data in a table
        """
        # find number of letters & most common word in current website
        num_of_letters, most_common_word = self.get_text_info()

        # find number of occurrences in the compared website
        text = self.parse(compared_url)
        compared_url_occur = text.count(most_common_word[0])

        # create a table and print data
        table = PrettyTable(['Description', 'Value'])
        table.add_row(['Number of Letters in ' + self.name, num_of_letters])
        table.add_row([
            'Most Common Word in ' + self.name,
            "The word '" + str(most_common_word[0]) + "'" + " occurs " +
            str(most_common_word[1]) + " times"
        ])
        table.add_row([
            'Number of Occurrences in ' +
            tldextract.extract(compared_url).domain,
            str(compared_url_occur)
        ])
        print(table)
Example #6
    async def select_proxy(self, url: str) -> str:
        if not self._redis_client:
            await self.setup()
        await self.cleanup()

        domain = tldextract.extract(url).domain

        while True:
            free_slots = await asyncio.gather(*[
                proxy.get_number_of_free_slots(domain, self._redis_client)
                for proxy in self._proxies
            ])

            slots_and_proxies = list(zip(free_slots, self._proxies))
            shuffle(slots_and_proxies)

            proxies = sorted(slots_and_proxies,
                             key=lambda d: d[0],
                             reverse=True)

            if proxies[0][0] > 0:
                return await proxies[0][1].get_url(domain, self._redis_client)
            else:
                # No proxy available right now. Wait.
                await asyncio.sleep(5)
Example #7
 async def resolve(self, domain):
     if not self.bootstraped:
         await self.bootstrap()
     tld = tldextract.extract(domain).suffix
     apex_domain = tldextract.extract(domain).registered_domain
     try:
         authorative_server = self.server_list[tld]
         async with aiohttp.ClientSession() as session:
             res = await self._fetch_json(session, "%sdomain/%s" %(authorative_server, apex_domain ))
             if res:
                 entry = RdapDomainEntry.from_rdap_response(res)
                 return entry
             else:
                 raise LookupError('no valid rdap data received')
     except KeyError:
         raise LookupError("No rdap server found for domain: %s" % domain)
Example #8
    def update_json_information(self, site):
        time.sleep(3)
        print("Updating json with recording information")

        with open(site["json_path"], "r") as f:
            json_data = json.loads(f.read())

        # Generate the eTLD+1 list
        # Bug 1585598 - Validate list of sites used for testing Fission
        etdl = []
        for item in json_data["http_protocol"].keys():
            base_url = ".".join(item.split(".")[-2:])
            if base_url not in etdl:
                etdl.append(base_url)
        self.information["etdl"] = etdl

        self.information["proxy"] = self.proxy

        self.information["url"] = site["url"]
        self.information["domain"] = tldextract.extract(site["url"]).domain

        self.information["label"] = site.get("label")

        json_data["info"] = self.information
        with open(site["json_path"], "w") as f:
            f.write(json.dumps(json_data, sort_keys=True, indent=2))
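
Note that the eTLD+1 list above is approximated by keeping the last two dot-separated labels, which over-trims hosts under multi-label public suffixes; a small comparison against tldextract (hostnames are made up):

import tldextract

for host in ["cdn.example.com", "news.example.co.uk"]:
    naive = ".".join(host.split(".")[-2:])
    psl_based = tldextract.extract(host).registered_domain
    print(host, "->", naive, "vs", psl_based)
# cdn.example.com -> example.com vs example.com
# news.example.co.uk -> co.uk vs example.co.uk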
Example #9
    def getTargetValidityProblems(self):
        """Verify that each target has a valid TLD in order to prevent problematic
        rewrites, as stated in EFForg/https-everywhere/issues/10877. In particular,
        right-wildcard targets are ignored by this test.

        Returns a list of strings reporting any coverage problems if they exist,
        or an empty list if coverage is sufficient.
        """
        problems = self._determineTestApplication()

        # Next, make sure each target has a valid TLD and doesn't overlap with others
        for target in self.targets:
            # If it's a wildcard, check which other targets it covers
            if '*' in target:
                target_re = regex.escape(target)

                if target_re.startswith(r'\*'):
                    target_re = target_re[2:]
                else:
                    target_re = r'\A' + target_re

                target_re = regex.compile(
                    target_re.replace(r'\*', r'[^.]*') + r'\Z')

                others = [other for other in self.targets if other !=
                          target and target_re.search(other)]

                if others:
                    problems.append("%s: Target '%s' also covers %s" %
                                    (self.filename, target, others))

            # Ignore right-wildcard targets for TLD checks
            if target.endswith(".*"):
                continue

            # Ignore if target is an ipv4 address
            try:
                socket.inet_aton(target)
                continue
            except:
                pass

            # Ignore if target is an ipv6 address
            try:
                socket.inet_pton(socket.AF_INET6, target)
                continue
            except:
                pass

            # Extract TLD from target if possible
            res = tldextract.extract(target)
            if res.suffix == "":
                problems.append("%s: Target '%s' missing eTLD" %
                                (self.filename, target))
            elif res.domain == "":
                problems.append("%s: Target '%s' containing entire eTLD" % (
                    self.filename, target))

        return problems
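
The two tldextract checks at the end distinguish an unknown TLD (empty `suffix`) from a target that is nothing but a public suffix (empty `domain`); for example:

import tldextract

res = tldextract.extract("example.invalid-tld")  # made-up hostname with an unknown TLD
print(res.suffix == "")   # True  -> reported as "missing eTLD"

res = tldextract.extract("co.uk")                # the whole host is a public suffix
print(res.domain == "")   # True  -> reported as "containing entire eTLD"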
Example #10
 def is_polite(self, url):
     ext = tldextract.extract(url)
     # Join only the non-empty parts so a missing subdomain doesn't leave a leading dot.
     URL = ".".join(part for part in (ext.subdomain, ext.domain, ext.suffix) if part)
     rp = urllib.robotparser.RobotFileParser()
     URL = "https://" + URL + "/robots.txt"
     rp.set_url(URL)
     rp.read()
     flag = rp.can_fetch("*", url)
     return flag
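
A standalone sketch of the same politeness check, assuming only tldextract and the standard library (it fetches the live robots.txt, so it needs network access):

import urllib.robotparser

import tldextract


def can_fetch(url, user_agent="*"):
    # Rebuild the host from the tldextract parts, skipping empty pieces so a
    # missing subdomain does not leave a leading dot, then ask robots.txt.
    ext = tldextract.extract(url)
    host = ".".join(part for part in (ext.subdomain, ext.domain, ext.suffix) if part)
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url("https://" + host + "/robots.txt")
    rp.read()  # performs the network request for robots.txt
    return rp.can_fetch(user_agent, url)


# Example (requires network access):
# print(can_fetch("https://www.python.org/downloads/"))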
Example #11
    async def register_status_code(self, url: str, status_code: int,
                                   proxy_url: str):
        domain = tldextract.extract(url).domain

        for proxy in self._proxies:
            if proxy._url == proxy_url:
                await proxy.register_status_code(status_code, domain,
                                                 self._redis_client)
                break
Example #12
def get_hostname(hostname):
    h_data = tldextract.extract(hostname)

    if h_data.subdomain:
        hostname = h_data.subdomain
    else:
        if h_data.registered_domain:
            hostname = h_data.registered_domain

    return hostname
Example #13
def lookup_screenshot(url):
    domain = tldextract.extract(url).fqdn
    if os.path.exists("screenshots/mapping.json"):
        with open("screenshots/mapping.json", 'r') as f:
            current = json.loads(f.read())
        return current.get(domain, False)
    else:
        return False
Example #14
def get_domain(uri):
    """
    Given an uri returns it's domain ("without http://")
    :param uri:
    :return:
    """
    parsed_uri = urlparse(uri)
    netloc = '{uri.netloc}'.format(uri=parsed_uri)
    extract_result = tldextract.extract(netloc)
    domain = extract_result.domain + DOT + extract_result.suffix
    return domain.strip().replace("\"\"", "")
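
Assuming the module-level `DOT` constant is simply the literal dot separator, the function above behaves like this self-contained sketch:

from urllib.parse import urlparse

import tldextract

DOT = "."  # assumption: the separator constant referenced above


def get_domain(uri):
    parsed_uri = urlparse(uri)
    extract_result = tldextract.extract(parsed_uri.netloc)
    domain = extract_result.domain + DOT + extract_result.suffix
    return domain.strip().replace("\"\"", "")


print(get_domain("https://blog.example.co.uk/post"))  # example.co.uk
print(get_domain("http://files.example.com"))         # example.com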
Example #15
def get_hosts_set():
    for num in range(len(URLS)):
        with open(f'ads_{num}.txt') as f:
            for line in f:
                if line.startswith('#') or ('0.0.0.0' not in line
                                            and '127.0.0.1' not in line):
                    continue
                domain = line.strip().split(' ')[1]
                extracted = tldextract.extract(domain)
                if extracted.suffix:
                    yield extracted
Example #16
    def classify_links(self):
        self.same_domain_links = []
        self.sub_directory_links = []
        self.same_directory_links = []
        self.super_directory_links = []
        self.sub_domain_links = []
        self.super_domain_links = []
        self.sister_domain_links = []
        self.external_links = []

        dir_url = '/'.join(self.url.split('://')[1].split('/')[:-1])
        url_tld = extract(self.url)
        for link in self.all_links:
            link_tld = extract(link['href'])
            if link_tld.tld == url_tld.tld and link_tld.domain == url_tld.domain:
                # internal link within the domain name e.g. bbc.co.uk
                if link_tld.subdomain == url_tld.subdomain:
                    # we are at the same subdomain
                    self.same_domain_links.append(link)
                    if link['href'].startswith(dir_url):
                        remainder = link['href'].replace(dir_url, '')
                        r_segs = remainder.split('/')
                        if len(r_segs) == 2:
                            self.same_directory_links.append(link)
                        elif len(r_segs) > 2:
                            self.sub_directory_links.append(link)
                    else:
                        self.super_directory_links.append(link)

                elif link_tld.subdomain.endswith(url_tld.subdomain):
                    # this is a subdomain of the current_domain
                    self.sub_domain_links.append(link)
                elif url_tld.subdomain.endswith(link_tld.subdomain):
                    # this is a superdomain of the current domain
                    self.super_domain_links.append(link)
                else:
                    # this must be a sister domain of some kind
                    self.sister_domain_links.append(link)
            else:
                # external link
                self.external_links.append(link)
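
`link_tld.tld` here (and `res.tld` in the last example on this page) appears to rely on the attribute name used by older tldextract releases; current releases expose the public suffix as `suffix`. A version-tolerant sketch:

import tldextract

ext = tldextract.extract("http://news.bbc.co.uk/sport")
# Prefer the current attribute name and fall back to the legacy one.
suffix = getattr(ext, "suffix", None) or getattr(ext, "tld", None)
print(ext.domain, suffix)  # bbc co.uk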
Example #17
    def identify_short_url_by_suffix(self):
        """
        Check whether each URL's suffix is a common URL-shortener suffix.
        :param url_to_tld_dict: dict of URL -> TLD extracted from the text content
        :return: dict of URL -> TLD that passed the suffix check
        """
        # URLs (with their TLDs) that may be short links
        short_url_to_tld_dict = {}
        # Path to the project root directory
        rootPath = os.path.split(curPath)[0]
        # Absolute path to the list of URL-shortener provider domains,
        # used to collect the common short-link suffixes
        short_url_service_domain_list_path_absolute = rootPath + "/data/short_url_services_list.txt"
        short_url_service_domain_list = loads_file_from_txt_to_list(
            fpath=short_url_service_domain_list_path_absolute)
        # Set of domain suffixes used by common URL-shortener providers
        suffix_set = set()
        for url in short_url_service_domain_list:
            suffix = tldextract.extract(url)[2]
            suffix_set.add(suffix)
        # Generic suffixes are not treated as suspicious short-link suffixes
        common_suffix_list = ['com', 'net', 'info', 'org', '']
        for common_suffix in common_suffix_list:
            # discard() avoids a KeyError when a suffix is not in the set
            suffix_set.discard(common_suffix)

        # print("Set of common URL-shortener suffixes: {0}".format(suffix_set))

        for url in self.url_to_tld_dict.keys():
            flag = False
            input_url_suffix = tldextract.extract(self.url_to_tld_dict[url])[2]
            # TLD of the input URL
            tld = self.url_to_tld_dict[url]
            for suffix in suffix_set:
                if input_url_suffix == suffix:
                    flag = True
                    short_url_to_tld_dict[url] = tld
            if flag:
                self.judge_dict[url].append(True)
            else:
                self.judge_dict[url].append(False)

        return short_url_to_tld_dict
Example #18
def get_base_url(url):
    """
    Takes as input a url and returns the subdomain (if any), domain and suffix
    concatenated to form the base url of the website. Uses the tldextract library.
    """
    tld = tldextract.extract(url)
    print(tld.subdomain, ' - ', tld.domain, ' - ', tld.suffix)
    if tld.subdomain != "":
        base_url = '.'.join([tld.subdomain, tld.domain, tld.suffix])
    else:
        base_url = '.'.join([tld.domain, tld.suffix])
    return base_url
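
A quick usage sketch, assuming the function above is in scope (it also prints the extracted parts as a side effect; the URLs are illustrative):

print(get_base_url("https://docs.python.org/3/library/"))  # docs.python.org
print(get_base_url("https://example.com/index.html"))      # example.com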
Example #19
def get_domain(url):
    """
    Return the domain without any subdomain

    >>> get_domain("http://blog.example.com")
    'example.com'
    >>> get_domain("http://www.example.com")
    'example.com'
    >>> get_domain("http://deeper.blog.example.co.uk")
    'example.co.uk'
    """
    return ".".join(tldextract.extract(url)[-2:])
Example #20
 def get_open_page_rank(self, url):
     try:
         o = tldextract.extract(url)
         domain = ('%s.%s' % (o.domain, o.suffix))
         try:
             pginfo = self.page_rank.pg[domain]
         except KeyError:
             config.logger.warn('page rank information for domain [' +
                                domain + '] not found')
             return MISSING_FEATURE * 2, True
         return [pginfo['page_rank_decimal'], pginfo['rank']], False
     except Exception as e:
         config.logger.error(repr(e))
         return MISSING_FEATURE * 2, True
Example #21
def post_reformat(post):
    return [
        post.id,                                                                      # 0  post id
        Markup((post.title[:120] + '...') if len(post.title) > 120 else post.title),  # 1  post title
        Markup(post.url),                                                             # 2  full url
        tldextract.extract(post.url).registered_domain,                               # 3  domain only url
        Markup(post.username),                                                        # 4  user name
        datetime.datetime.fromtimestamp(post.datetime).strftime('%c'),                # 5  tooltip time
        human_datetime(post.datetime),                                                # 6  time since post
        post.tags,                                                                    # 7  tags
        len(post.votes.split(';')),                                                   # 8  votes
        len(post.reports.split(';')),                                                 # 9  reports
        int(time.time() - post.datetime)                                              # 10 epoch seconds
    ]
Example #22
def save_screenshot(url, urlscanurl):
    domain = tldextract.extract(url).fqdn
    if os.path.exists("screenshots/mapping.json"):
        with open("screenshots/mapping.json", 'r') as f:
            current = json.loads(f.read())
        current[domain] = urlscanurl
    else:
        current = {domain: urlscanurl}
    with open("screenshots/mapping.json", 'w') as f:
        f.write(json.dumps(current))
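
Together with `lookup_screenshot` and `urlscan_screenshot` earlier on this page, `screenshots/mapping.json` acts as a simple fqdn-to-urlscan-result cache. A hedged usage sketch, assuming a valid urlscan.io API key is available in `urlscanapikey`:

import json
import os
import time

import requests
import tldextract

urlscanapikey = "YOUR-URLSCAN-API-KEY"  # assumption: replace with a real key
os.makedirs("screenshots", exist_ok=True)

for target in ["https://example.com", "https://example.org"]:
    urlscan_screenshot(target)  # submits only when the fqdn is not cached yet

with open("screenshots/mapping.json") as f:
    print(json.dumps(json.loads(f.read()), indent=2))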
Example #23
def get_url_info(url, guess_and_check, max_depth):
    # Create a new Link instance
    link = Link()
    # Save the extracted URL
    link.url = url
    # Remove markdown and other artifacts from the URL
    link.url_clean = remove_markdown(url)
    # Check if the clean URL is valid, if so continue with the next steps
    link.url_clean_is_valid = check_if_valid_url(link.url_clean)
    if link.url_clean_is_valid:
        link.is_amp = check_if_amp(link.url_clean)
        if link.is_amp:
            link.is_cached = check_if_cached(link.url_clean)
            link.domain = tldextract.extract(link.url_clean).domain
            link = get_canonical(link, guess_and_check, max_depth)

    return link
Example #24
def is_valid_url(url):
    extract = tldextract.extract(url)
    if not extract.suffix:
        return False

    # Validators catches 'most' invalid urls, but there are some issues and exceptions that are not really likely
    # to cause any major issues in our software. The other alternative is another library with other quirks.
    # see: https://github.com/kvesteri/validators/
    # Note that this library does not account for 'idna' / punycode encoded domains, so you have to convert
    # them yourself. Luckily:
    # 'аренда.орг' -> 'xn--80aald4bq.xn--c1avg'
    # 'google.com' -> 'google.com'
    valid_domain = domain(url.encode('idna').decode())
    if valid_domain is not True:
        return False

    return True
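
The punycode conversion mentioned in the comment uses Python's built-in 'idna' codec (and `domain` here is presumably `validators.domain` from the library linked above); the comment's own examples:

print('аренда.орг'.encode('idna').decode())  # xn--80aald4bq.xn--c1avg
print('google.com'.encode('idna').decode())  # google.com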
Example #25
    def handle(self, *args, **options):

        imported_domains = 0
        already_existing_domains = 0

        file_path = options.get('file', None)

        with open(file_path, newline='') as file:
            line = file.readline()

            while line:
                url = line.strip()
                line = file.readline()

                if not urlparse(url).scheme:
                    url = 'http://' + url

                # This uses a list of public suffixes
                tld_extract_result = tldextract.extract(url)

                if tld_extract_result.subdomain:
                    url_full_domain = '.'.join([
                        tld_extract_result.subdomain,
                        tld_extract_result.domain, tld_extract_result.suffix
                    ])
                elif tld_extract_result.suffix:
                    url_full_domain = '.'.join(
                        [tld_extract_result.domain, tld_extract_result.suffix])
                else:
                    url_full_domain = tld_extract_result.domain

                if ProxyBlacklistedDomain.objects.filter(
                        domain=url_full_domain).exists():
                    logger.info('Domain %s already exists, not importing.' %
                                url_full_domain)
                    already_existing_domains += 1
                    continue

                ProxyBlacklistedDomain.objects.create(domain=url_full_domain)
                logger.info('Imported domain %s successfully.' %
                            url_full_domain)
                imported_domains += 1

            logger.info(
                'Finished. Imported %d domains and skipped %d as they already existed'
                % (imported_domains, already_existing_domains))
Example #26
def get_url_info(url, use_gac, max_depth) -> Link:
    link = Link(canonicals=[])

    origin = UrlMeta(url=remove_markdown(url))
    origin.is_valid = check_if_valid_url(origin.url)
    origin.is_amp = check_if_amp(origin.url) and not any(
        map(origin.url.__contains__, static.DENYLISTED_DOMAINS))

    if origin.is_valid:
        if origin.is_amp:
            origin.is_cached = check_if_cached(origin.url)
            origin.domain = tldextract.extract(origin.url).domain
            link.origin = origin
            link = get_canonicals(link=link,
                                  max_depth=max_depth,
                                  use_gac=use_gac)

    return link
Example #27
def valid_image_url(url):

    url = clean_url(url)
    if not url.startswith(('http://', 'https://')):
        return False
    path = urlparse(url).path

    # input url is not in valid form (scheme, netloc, tld)
    if not path.startswith('/'):
        return False

    # the '/' which may exist at the end of the url provides us no information
    if path.endswith('/'):
        path = path[:-1]

    # '/story/cnn/blahblah/index.html' --> ['story', 'cnn', 'blahblah', 'index.html']
    path_chunks = [x for x in path.split('/') if len(x) > 0]

    # siphon out the file type. eg: jpeg, png
    if len(path_chunks) > 0:
        file_type = urls.url_to_filetype(url)

        # if the file type is a media type, reject instantly
        if file_type and file_type not in ALLOWED_TYPES:
            return False

        last_chunk = path_chunks[-1].split('.')
        # the file type is not of use to us anymore, remove it from the url
        if len(last_chunk) > 1:
            path_chunks[-1] = last_chunk[-2]

    # extract the tld (top level domain)
    tld_dat = tldextract.extract(url)
    tld = tld_dat.domain.lower()

    if tld in urls.BAD_DOMAINS:
        return False

    for d in BAD_URLS:
        if d in url:
            return False

    return True
Example #28
    def create_tenant(domain='', **extra_fields):
        extra_fields.setdefault('name', '')
        extra_fields.setdefault('country', '')
        extra_fields.setdefault('currency', '')

        if domain and not freemail.is_free(domain):
            # We can guess some field values based on the domain.
            tld = tldextract.extract(domain)
            geo_ip = GeoIP2()

            if not extra_fields['name']:
                # Use the domain of the email address as tenant name.
                extra_fields['name'] = tld.domain.title()

            if not extra_fields['country']:
                try:
                    country_code = geo_ip.country(
                        tld.registered_domain).get('country_code')
                except (gaierror, AddressNotFoundError):
                    pass
                else:
                    if country_code in [c[0] for c in COUNTRIES]:
                        extra_fields['country'] = country_code

            if extra_fields['country'] and not extra_fields['currency']:
                currency = get_territory_currencies(
                    extra_fields['country'])[-1]
                if currency in [c[0] for c in CURRENCIES]:
                    extra_fields['currency'] = currency

        if settings.BILLING_ENABLED:
            # Chargebee needs extra info on who to bill, so for now only create the plans without activating the trial.
            plan, created = Plan.objects.get_or_create(
                name=settings.CHARGEBEE_PRO_TRIAL_PLAN_NAME)
            billing = Billing.objects.create(plan=plan)
        else:
            billing = Billing.objects.create()

        tenant = Tenant.objects.create(billing=billing, **extra_fields)

        create_defaults_for_tenant(tenant)

        return tenant
Example #29
def enviar_noticias(arr, id_chat, Nombre_Grupo, provincias, tema):

    try:
        url_api = "bot1477154971:AAHz2Ok9QD8bwzkAxIqqZc64GNPeqGjuRTI/sendMessage"
        temas = ''.join(tema)

        for m in arr:
            linkPortal = m[1]
            linkNoticia = m[0]
            mycursor = mydb.cursor()
            #sql = "SELECT * FROM noticias_enviadas WHERE link = '"+str(linkNoticia)+"'" +""+""
            #mycursor.execute(sql)
            #records = cursor.fetchall()
            #if  records == []:
            extracted = tldextract.extract(linkPortal)
            medio = "{}.{}".format(extracted.domain, extracted.suffix)
            medio = medio.replace(".com", "").replace(".ar", "")
            linktitulo = requests.get(m[0]).text
            titulo = funcion_BuscaTitulo(linktitulo)
            if isinstance(titulo, str):

                mensaje = "Medio: " + medio + "\n\n" + "    Última Noticia: " + titulo + "\n\n" + "    Ver más en ->" + linkNoticia
                response = requests.post('https://api.telegram.org/' + url_api,
                                         data={
                                             'chat_id': id_chat,
                                             'text': mensaje
                                         })
                print(response.status_code)
                try:
                    mycursor = mydb.cursor()
                    sql = "INSERT INTO noticias_enviadas (link,tema,id_grupo) " \
                          "VALUES (%s, %s, %s) "
                    val = (linkNoticia, temas, id_chat)
                    mycursor.execute(sql, val)
                    mydb.commit()
                    print("insertó correctamente el link: " + linkNoticia + "")
                except Exception as e:
                    print("El Link ya fue guardado: " + linkNoticia + "")

    except Exception as e:
        print(" 279 - enviar ", e)
Example #30
    def set_url(self, url):
        url = url.strip()
        res = extract(url)
        segs = []
        for seg in [res.subdomain, res.domain, res.tld]:
            if seg:
                segs.append(seg)
        domain_str = '.'.join(segs)
        domain = self.get_domain_entry(domain_str)
        if not domain:
            domain = Domain(
                url=domain_str)  # create the domain guessing the values
            domain.save()

        setattr(self, 'domain_name', domain_str)
        setattr(self, 'domain_data', domain)
        setattr(self, 'link', url.split(domain_str, 1)[1])

        self.get_soup()
        self.set_title()
        self.set_date()
        self.set_content()