Example #1
File: sublert.py Project: mmg1/sublert
def posting_to_slack(result, dns_resolve, dns_output): # send results to the Slack workspace
    global domain_to_monitor
    global new_subdomains
    if dns_resolve:
        dns_result = dns_output
        if dns_result:
            rev_url = []
            print(colored("\n[!] Exporting result to Slack. Please don't interrupt!", "red"))
            for url in dns_result:
                url = url.replace('*.', '')
                url = "https://" + url.replace('+ ', '')
                rev_url.append(get_fld(url))
            for subdomain in new_subdomains:
                subdomain = subdomain.replace('*.','')
                subdomain = subdomain.replace('+ ','')
                data = "<!channel> :new: {}".format(subdomain)
                slack(data)
                try:
                    if dns_result[subdomain]["A"]:
                        for i in dns_result[subdomain]["A"]:
                            data = "```A : {}```".format(i)
                            slack(data)
                except: pass
                try:
                    if dns_result[subdomain]['CNAME']:
                        for i in dns_result[subdomain]['CNAME']:
                            data = "```CNAME : {}```".format(i)
                            slack(data)
                except: pass
            print(colored("\n[!] Done. ", "green"))
            rev_url = list(set(rev_url))
            for url in rev_url:
                os.system("rm -f ./output/" + url.lower() + ".txt")
                os.system("mv -f ./output/" + url.lower() + "_tmp.txt " + "./output/" + url.lower() + ".txt") #save the temporary one
            os.system("rm -f ./output/*_tmp.txt") #remove the remaining tmp files

    elif result:
        rev_url = []
        print(colored("\n[!] Exporting the result to Slack. Please don't interrupt!", "red"))
        for url in result:
            url = "https://" + url.replace('+ ', '')
            rev_url.append(get_fld(url))
            data = "<!channel> :new: {}".format(url)
            slack(data)
        print(colored("\n[!] Done. ", "green"))
        rev_url = list(set(rev_url))

        for url in rev_url:
            os.system("rm -f ./output/" + url.lower() + ".txt")
            os.system("mv -f ./output/" + url.lower() + "_tmp.txt " + "./output/" + url.lower() + ".txt") #save the temporary one
        os.system("rm -f ./output/*_tmp.txt") #remove the remaining tmp files

    else:
        if not domain_to_monitor:
            data = "<!channel> :-1: We couldn't find any new subdomains."
            slack(data)
            print(colored("\n[!] Done. ", "green"))
            os.system("rm -f ./output/*_tmp.txt")
        else: pass
Example #2
def get_http_requests(conn, domain, id):
    query="select url from http_requests \
                where visit_id="+str(id)

    try:
        cur = conn.cursor()
        cur.execute(query)

        rows = cur.fetchall()
        
        # a request is first-party when its get_fld() matches the get_fld() of the site being analysed; the rest are third-party
        first_requests=[ele for ele in rows if domain==tld.get_fld(ele[0], fail_silently=True)]
        third_requests=[ele for ele in rows if domain!=tld.get_fld(ele[0], fail_silently=True)]
    except Error as e:
        return 0

    return [len(first_requests), len(third_requests)]
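For reference, a minimal sketch (with hypothetical URLs) of the first-party/third-party split used above: get_fld reduces each request URL to its registrable domain, which is then compared against the domain of the site being analysed.

import tld

site_domain = "example.com"  # hypothetical site under analysis
rows = [
    ("https://cdn.example.com/app.js",),       # same fld as the site -> first party
    ("https://tracker.adnet.net/pixel.gif",),  # different fld -> third party
]
first_requests = [ele for ele in rows if site_domain == tld.get_fld(ele[0], fail_silently=True)]
third_requests = [ele for ele in rows if site_domain != tld.get_fld(ele[0], fail_silently=True)]
print(len(first_requests), len(third_requests))  # 1 1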
Example #3
 def get_website(self, soup):
     try:
         url = soup.find('span', attrs={'itemprop': 'url'}).get_text().strip()
         if not url.startswith('http'):
             url = 'http://' + url
         return get_fld(url)
     except Exception:
         return None
Example #4
 def get_website(self, soup):
     try:
         s = soup.find('a', class_='vendor-backlink').get('href')
         if not s.startswith('http'):
             s = 'http://' + s
         return get_fld(s)
     except Exception:
         return None
Example #5
File: sublert.py Project: mmg1/sublert
def domain_sanity_check(domain): # verify domain name sanity
    if domain:
        if ("http://" or "https://") not in domain:
            try:
                domain = get_fld("https://" + domain)
                return domain
            except:
                print(colored("[!] Incorrect domain format. Please follow this format: example.com, https://example.com, www.example.com", "red"))
                sys.exit(1)
        else:
            try:
                domain = get_fld(domain)
                return domain
            except:
                print(colored("[!] Incorrect domain name. Please follow this format: example.com, https://example.com, www.example.com", "red"))
                sys.exit(1)
    else:
        pass
Example #6
 def get_website(self, soup):
     try:
         site = soup.find('a', {'itemprop': 'url'}).get('href')
         if not site.startswith('http'):
             site = 'http://' + site
         return get_fld(site)
     except Exception:
         return None
Example #7
 def get_website(self, soup):
     try:
         return get_fld(
             soup.find(
                 'a',
                 text=re.compile('^Visit.+Website$')).get('href').strip())
     except Exception:
         return None
Example #8
def get_website(site):
    '''
    Extract the registrable (first level) domain from a site URL.
    '''

    temp_value = get_fld(site, fix_protocol=True)
    temp_value = temp_value.replace(".{username}", "").replace("{username}.", "")
    return temp_value
Example #9
def get_format_url(url, a_doc, host):
    a_href = a_doc.get('href')
    try:
        if a_href is not None and len(a_href) > 0:
            a_href = str(a_href).strip()
            a_href = a_href[:a_href.index('#')] if '#' in a_href else a_href
            # a_href = a_href.encode('utf8')
            # a_href = urllib.quote(a_href,safe='.:/?&=')
            if a_href.startswith('//'):
                url = 'https:' + a_href if url.startswith(
                    'https:') else 'http:' + a_href
                url = mx.URL.URL(str(url))
                a_href = url.url
            elif a_href.startswith('/'):
                url = 'https://' + host + a_href if url.startswith(
                    'https:') else 'http://' + host + a_href
                url = mx.URL.URL(str(url))
                a_href = url.url
            elif a_href.startswith('./') or a_href.startswith('../'):
                url = mx.URL.URL(str(url) + '/' + a_href)
                a_href = url.url
            elif not a_href.startswith('javascript') and not a_href.startswith(
                    'mailto') and not a_href.startswith(
                        'http') and a_href != '':
                url = 'https://' + host + '/' + a_href if url.startswith(
                    'https:') else 'http://' + host + '/' + a_href
                url = mx.URL.URL(str(url))
                a_href = url.url
            a_href = a_href[:-1] if a_href.endswith('/') else a_href
            #a_href = a_href.lower()
        get_fld(a_href)
    except:
        return ''

    if not a_href.startswith('http'):
        return ''

    if '?' in a_href:
        a_params_str = a_href[a_href.index('?') + 1:]
        a_params = a_params_str.split('&')
        a_params.sort()
        a_params_str = '&'.join(a_params)
        a_href = a_href[:a_href.index('?') + 1] + a_params_str

    return a_href
Example #10
def wirte_sorl(head, body):
    with open("arrange.sorl", "w") as fp:
        for line in head:
            print(line, file=fp)
        url = (x.lstrip("*.") for x in body)
        fld = (get_fld(x, fix_protocol=True) for x in url)
        for line in sorted(set(fld)):
            print(f"*.{line}", file=fp)
Example #11
def use_cdn(cname):
    try:
        fld = get_fld(cname.rstrip('.'), fix_protocol=True)
        answers = dns.resolver.query(fld, 'NS')
    except Exception:
        # debug: report the cname, since fld may be unset if get_fld itself failed
        print('[DNS] {} cannot find NS record'.format(cname))
    else:
        cdn_nss = json.load(open('cdn-ns.json'))
        for answer in answers:
            for cdn_vendor, cdn_ns_list in cdn_nss.items():
                if get_fld(answer.to_text().rstrip('.'),
                           fix_protocol=True) in cdn_ns_list:
                    ### debug
                    print('[CDN] Vendor: {}, NS: {}'.format(
                        cdn_vendor, answer.to_text()))
                    return True
    return False
Example #12
 def get_website(self, soup):
     try:
         s = soup.find('li', text=re.compile(r'www')).get_text().strip()
         if not s.startswith('http'):
             s = 'http://' + s
         return get_fld(s)
     except Exception:
         try:
             for h2 in soup.select('h2'):
                 txt = h2.get_text().strip()
                 if txt == 'Vendor Details':
                     s = h2.find_next('ul', \
                         class_='check-list').select('li')[1].get_text().strip()
                     if not s.startswith('http'):
                         s = 'http://' + s
                     return get_fld(s)
         except Exception:
             return None
Example #13
 def get_website(self, soup):
     try:
         s = soup.find('span', {'itemprop': 'author'}).find_next().find_next().get_text().strip()
         if not s.startswith('http'):
             s = 'http://' + s
         return get_fld(s)
     except Exception:
         try:
             s = MAIN_URL + soup.find('a', text=re.compile(r'Visit Website')).get('href')
             resp = request(s)
             external_url = re.findall(r'location\.replace.+?"(.+?)"', resp.text)[0]
             if 'external_click_ga' in external_url:
                 r = requests.head(external_url)
                 url = r.headers['Location']
                 return get_fld(url)
             return get_fld(external_url)
         except Exception:
             return None
Example #14
 def get(self, web):
     try:
         res = get_fld(web, fix_protocol=True)
         save = open('domain.txt', 'a')
         save.write('http://'+res+'\n')
         save.close()
         print(Fore.LIGHTBLUE_EX, '[+] http://{}'.format(res))
     except:
         pass
Example #15
 def get_website(self, soup):
     try:
         s = soup.find('span',
                       class_='website').find('a').get('href').strip()
         if not s.startswith('http'):
             s = 'http://' + s
         return get_fld(s)
     except Exception:
         return None
Example #16
    def get_root_domain(value, zone=None):
        """
        Get the root domain (FLD) for the provided value
        """
        res = get_fld(value, fix_protocol=True, fail_silently=True)
        if res is None:
            return zone

        return res
Example #17
def get_fld_from_value(value, zone):
    """
    Get the First Level Domain (FLD) for the provided value
    """
    res = get_fld(value, fix_protocol=True, fail_silently=True)
    if res is None:
        return zone

    return res
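A short sketch of the fail_silently behaviour this helper relies on: get_fld returns None instead of raising, so the caller can fall back to the supplied zone (the inputs below are illustrative).

from tld import get_fld

print(get_fld("sub.example.com", fix_protocol=True, fail_silently=True))  # example.com
print(get_fld("localhost", fix_protocol=True, fail_silently=True))        # None (no public suffix)
print(get_fld_from_value("localhost", "fallback.zone"))                   # fallback.zone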
Example #18
def extract_domain(url):
    if url is None:
        return None
    url = url if url.startswith("http") else "http://" + url
    try:
        # return the top level domain for the given url
        return get_fld(url)
    except Exception:
        return urlparse.urlparse(url).netloc
Example #19
    def get_data(self, url):
        resp = request(url)
        if resp:
            soup = BeautifulSoup(resp.text, 'lxml')

            name = soup.find('div', id='main').find('h1').find('span', class_='title').get_text().strip()
            website = None
            try:
                website = get_fld(soup.find('div', id='node-sidebar').find('div', \
                                class_='node-links').find('li', class_=re.compile('link-related-www')).find('a').get('href').strip())
            except Exception:
                pass
            desc = None
            try:
                desc = soup.find('strong', \
                            text=re.compile(name.strip(' County').upper())).previous('p').previous_element.get_text().replace(';', ',').strip()
            except Exception:
                pass
            addr = None
            try:
                addr = soup.find('div', class_='field-address').get_text().strip()
            except Exception:
                pass
            population = None
            try:
                population = soup.find('strong', text=re.compile(r'OPULATION')).next.next.strip()
            except Exception:
                pass
            inc_date = None
            try:
                inc_date = soup.find('strong', text=re.compile(r'INCORPORATION DATE')).next.next.strip()
            except Exception:
                pass
            boards = None
            try:
                boards = soup.find('strong', text=re.compile(r'BOARD')).find_parent().find_parent().find_next('ul').get_text().strip()
            except Exception:
                pass
            form_of_gov = None
            try:
                form_of_gov = soup.find('strong', text=re.compile(r'FORM OF GOVERNMENT')).next.next.strip()
            except Exception:
                pass

            data = [[
                name,
                website,
                desc,
                addr,
                population,
                inc_date,
                boards,
                form_of_gov,
                url
            ]]
            write_data(data, name)
Example #20
 def list_all_websites(self):
     '''
     list all the available websites' entries
     '''
     if len(self.websites_entries) > 0:
         for site in self.websites_entries:
             temp_value = get_fld(site["url"], fix_protocol=True)
             temp_value = temp_value.replace(".{username}", "").replace("{username}.", "")
             if not self.silent:
                 self.log.info(temp_value)
Example #21
async def blacklist_check(url: str,
                          transaction: Hub.current.scope.transaction):
    with transaction.start_child(op="task", description="Blacklist check"):
        blacklist = await open_blacklist()
        if not validate_ip(url):
            url = tld.get_fld(url, fix_protocol=True)
        if url in blacklist["blacklist"]:
            return blacklist["blacklist"][url]
        else:
            return False
Example #22
 def get_domain(url):
     domain = None
     try:
         domain = get_fld(url)
     except TldDomainNotFound:
         # Not yet known TLD or IP address or local hostname
         domain = urlparse(url).netloc
     except TldBadUrl:
         domain = None
     return domain
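For context, the exceptions handled above are provided by the tld package; a minimal sketch (illustrative hostnames) of how they map onto the fallbacks:

from urllib.parse import urlparse
from tld import get_fld
from tld.exceptions import TldBadUrl, TldDomainNotFound

print(get_domain("https://www.example.com/page"))     # example.com
print(get_domain("http://host.internal-lan/status"))  # TldDomainNotFound -> falls back to urlparse netloc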
Example #23
 def __init__(self,domain,port,page,filename):
     intclass=GetAsset(domain,port,page,filename)
     if not intclass.JudgeIP(domain):
         domain="http://"+domain
         self.domain=get_fld(domain)
     else:
         self.domain=domain
     self.port=port
     self.page=page
     self.filename=filename
Example #24
 def get_url_domain(self, url):
     """
     获取url的domain
     """
     # 加锁
     self.lock.acquire()
     domain = get_fld(url)
     #释放锁
     self.lock.release()
     return domain
Example #25
 def get_website(self, soup):
     try:
         s = soup.find('li', attrs={
             'style': re.compile('link_grey.png')
         }).get_text().strip()
         if not s.startswith('http'):
             s = 'http://' + s
         return get_fld(s)
     except Exception:
         return None
Example #26
def list_all_websites():
    '''
    list all the available websites' entries
    '''

    if len(WEBSITES_ENTRIES) > 0:
        for site in WEBSITES_ENTRIES:
            temp_value = get_fld(site["url"], fix_protocol=True)
            temp_value = temp_value.replace(".{username}", "").replace("{username}.", "")
            LOG.info(temp_value)
Example #27
File: html.py Project: openzim/sotoki
 def rewrite_user_link(self, link):
     try:
         if self.conf.without_users_links and (
                 link["href"].startswith("mailto:")
                 or get_fld(link["href"]) in SOCIAL_DOMAINS):
             self.redact_link(link)
             return 1
     except Exception as exc:
         logger.warning(f"Failed to get fld for {link.get('href')}: {exc}")
         return 0
Example #28
def get_ps1_or_ipaddress(url):
    try:
        return get_fld(url, fail_silently=False)
    except Exception:
        hostname = urlparse(url).hostname
        try:
            ipaddress.ip_address(hostname)
            return hostname
        except Exception:
            return None
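A hedged usage sketch: get_fld raises on a bare IP host (fail_silently=False), so the helper above returns the IP itself; a regular URL yields its registrable domain, and unparseable input yields None.

print(get_ps1_or_ipaddress("https://www.wikipedia.org/"))       # wikipedia.org
print(get_ps1_or_ipaddress("http://93.184.216.34/index.html"))  # 93.184.216.34
print(get_ps1_or_ipaddress("not a url"))                        # None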
Example #29
def domain_sanity_check(domain): # verify domain name sanity
    if domain:
        try:
            domain = get_fld(domain, fix_protocol = True)
            return domain
        except:
            print(colored("[!] Incorrect domain format. Please follow this format: example.com, http(s)://example.com, www.example.com", "red"))
            sys.exit(1)
    else:
        pass
Example #30
def main(market_name,ts):
    ''' Main function where the Bing search is run and the search term is built. The results are stored in a new dataframe,
    which is then exported to the BING_SEARCH_NIELSEN collection.
    :param market_name: Market (country) name
    :type market_name: str
    :param ts: Timestamp
    :type ts: int64'''
    connection = connect_mongo()
    marketmap = pd.DataFrame((connection[Settings.MARKET_MAP].find({"country":market_name.capitalize()},{"code":1,"country":1,"_id":0})))
    print(marketmap)
    reader = connection[Settings.CLEAN_NIELSEN].find({"UPPER_COUNTRY":market_name.upper(), "ts":ts},{"_id": 0})
    print(reader)
    for records in reader:
        try:
            count = connection[Settings.BRAND_SOURCE].count({"brand_low": str(records['UPPER_BRAND']).lower(), "country": str(records['UPPER_COUNTRY']).lower()})
            if count == 0:
                term = str(records['UPPER_BRAND']).lower()+" "+str(records['MANUFACTURER']).lower()
                print("fetching Records : ", term)
                brand=str(records['UPPER_BRAND']).lower()
                company=str(records['MANUFACTURER']).lower()
                market=str(records['UPPER_COUNTRY']).lower()
                if len(Settings.bing_subscription_key) == 32 and company != "private label":
                    print('Searching the Web for: ', term)
                    offset = 0
                    totalEstimatedMatches = 100
                    count = 0
                    while (offset < totalEstimatedMatches):
                        count = count + 1
                        headers, result = BingWebSearch(term, offset,marketmap, market)
                        data = json.dumps(json.loads(result), indent=4)
                        d_data = json.loads(data)
                        print("count : ", str(count))
                        print("offset : ", str(offset))
                        if 'webPages' in d_data:
                            totalEstimatedMatches = d_data['webPages']['totalEstimatedMatches']
                            news_dict = d_data['webPages']['value']
                            for news in news_dict:
                                news['hitCount'] = count
                                news['totalEstimateMatches'] = totalEstimatedMatches
                                news['fetchdate'] = datetime.datetime.utcnow()
                                news['offset'] = offset
                                news['query'] = term
                                news['count'] = 1
                                news['brand'] = brand
                                news['company'] = company
                                news['country'] = market
                                news['ts'] = ts
                                news['domain'] = get_fld(news['url'])
                                exists = connection[Settings.BING_SEARCH_NIELSEN].count({'brand': news['brand'], 'domain': news['domain'],'country':market_name})
                                if exists == 0:
                                    connection[Settings.BING_SEARCH_NIELSEN].insert(news)
                                else:
                                    connection[Settings.BING_SEARCH_NIELSEN].update_one({'brand': news['brand'], 'domain': news['domain'],'country':market_name})
                        offset = offset + len(news_dict)
                        totalEstimatedMatches = 0
        except Exception as e:
            print(str(e))
Example #31
def website_url_not_in_db(url):
    domain = db.session.query(DomainArchived).filter_by(
        name=get_fld(url)).first()
    if not domain:
        raise InvalidAPIRequest('主域名未收录,请先添加主域名')
    for model in [
            WebsiteArchived, WebsiteNews, WebsiteRecycler, WebsiteBanned,
            WebsiteDuplicated
    ]:
        if db.session.query(model).filter_by(url=url.strip('/')).first():
            raise RecordAlreadyExists('已有此网站')
Example #32
def _get_zone_id(domain):
    tld = get_fld('http://' + domain)
    url = "https://api.cloudflare.com/client/v4/zones?name={0}".format(tld)
    for auth in CF_HEADERS:
        r = requests.get(url, headers=auth)
        r.raise_for_status()
        r = r.json().get('result',())
        if r:
            return auth, r[0]['id']
    logger.error(" + Domain {0} not found in any Cloudflare account".format(tld))
    sys.exit(1)
Example #33
def extract_tld_from_url(
        url: str) -> str:
    """
    Identify the top level domain of the url.
    
    Parameters
    ----------
    url: str.
        The URL of the target website.

    Returns
    -------
    out: str.
        The TLD according to Mozilla's tables.
    """
    return get_fld(url)
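A worked call of the helper above, assuming the public suffix list bundled with the tld package: get_fld keeps multi-label suffixes intact, so the result is the registrable domain rather than only the last label.

print(extract_tld_from_url("https://www.example.co.uk/some/page"))  # example.co.uk
print(extract_tld_from_url("https://blog.example.com"))             # example.com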
Example #34
File: sublert.py Project: mmg1/sublert
    def lookup(self, domain, wildcard = True):
        base_url = "https://crt.sh/?q={}&output=json"
        if wildcard:
            domain = "%25.{}".format(domain)
        url = base_url.format(domain)
        subdomains = []
        user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:64.0) Gecko/20100101 Firefox/64.0'

        try:
            req = requests.get(url, headers={'User-Agent': user_agent}, timeout=20, verify=False) # times out after 20 seconds
            if req.status_code == 200:
                try:
                    content = req.content.decode('utf-8')
                    data = json.loads(content)
                    for subdomain in data:
                        subdomains.append(subdomain["name_value"])
                    return subdomains
                except:
                    error = "Error retrieving information for {}.".format(domain.replace('%25.', ''))
                    errorlog(error, enable_logging)
        except:
            try: #connecting to crt.sh postgres database to retrieve subdomains in case API fails
                unique_domains = []
                domain = domain.replace('%25.', '')
                conn = psycopg2.connect("dbname={0} user={1} host={2}".format(DB_NAME, DB_USER, DB_HOST))
                conn.autocommit = True
                cursor = conn.cursor()
                cursor.execute("SELECT ci.NAME_VALUE NAME_VALUE FROM certificate_identity ci WHERE ci.NAME_TYPE = 'dNSName' AND reverse(lower(ci.NAME_VALUE)) LIKE reverse(lower('%{}'));".format(domain))
                for result in cursor.fetchall():
                    matches = re.findall(r"\'(.+?)\'", str(result))
                    for subdomain in matches:
                        try:
                            if get_fld("https://" + subdomain) == domain:
                                unique_domains.append(subdomain)
                        except: pass
                return unique_domains
            except:
                print(colored("[!] Unable to connect to the database.".format(domain), "red"))
                error = "Unable to connect to the database."
                errorlog(error, enable_logging)