def posting_to_slack(result, dns_resolve, dns_output):  # sending result to slack workspace
    global domain_to_monitor
    global new_subdomains
    if dns_resolve:
        dns_result = dns_output
        if dns_result:
            rev_url = []
            print(colored("\n[!] Exporting result to Slack. Please don't interrupt!", "red"))
            for url in dns_result:
                url = url.replace('*.', '')
                url = "https://" + url.replace('+ ', '')
                rev_url.append(get_fld(url))
            for subdomain in new_subdomains:
                subdomain = subdomain.replace('*.', '')
                subdomain = subdomain.replace('+ ', '')
                data = "<!channel> :new: {}".format(subdomain)
                slack(data)
                try:
                    if dns_result[subdomain]["A"]:
                        for i in dns_result[subdomain]["A"]:
                            data = "```A : {}```".format(i)
                            slack(data)
                except:
                    pass
                try:
                    if dns_result[subdomain]['CNAME']:
                        for i in dns_result[subdomain]['CNAME']:
                            data = "```CNAME : {}```".format(i)
                            slack(data)
                except:
                    pass
            print(colored("\n[!] Done. ", "green"))
            rev_url = list(set(rev_url))
            for url in rev_url:
                os.system("rm -f ./output/" + url.lower() + ".txt")
                os.system("mv -f ./output/" + url.lower() + "_tmp.txt " + "./output/" + url.lower() + ".txt")  # save the temporary one
            os.system("rm -f ./output/*_tmp.txt")  # remove the remaining tmp files
    elif result:
        rev_url = []
        print(colored("\n[!] Exporting the result to Slack. Please don't interrupt!", "red"))
        for url in result:
            url = "https://" + url.replace('+ ', '')
            rev_url.append(get_fld(url))
            data = "<!channel> :new: {}".format(url)
            slack(data)
        print(colored("\n[!] Done. ", "green"))
        rev_url = list(set(rev_url))
        for url in rev_url:
            os.system("rm -f ./output/" + url.lower() + ".txt")
            os.system("mv -f ./output/" + url.lower() + "_tmp.txt " + "./output/" + url.lower() + ".txt")  # save the temporary one
        os.system("rm -f ./output/*_tmp.txt")  # remove the remaining tmp files
    else:
        if not domain_to_monitor:
            data = "<!channel> :-1: We couldn't find any new subdomains."
            slack(data)
            print(colored("\n[!] Done. ", "green"))
            os.system("rm -f ./output/*_tmp.txt")
        else:
            pass

def get_http_requests(conn, domain, id):
    query = "select url from http_requests where visit_id=" + str(id)
    try:
        cur = conn.cursor()
        cur.execute(query)
        rows = cur.fetchall()
        # only requests whose get_fld() matches the get_fld() of the original site analysed are first party; the rest are third-party
        first_requests = [ele for ele in rows if domain == tld.get_fld(ele[0], fail_silently=True)]
        third_requests = [ele for ele in rows if domain != tld.get_fld(ele[0], fail_silently=True)]
    except Error as e:
        return 0
    return [len(first_requests), len(third_requests)]

def get_website(self, soup):
    try:
        url = soup.find('span', attrs={'itemprop': 'url'}).get_text().strip()
        if not url.startswith('http'):
            url = 'http://' + url
        return get_fld(url)
    except Exception:
        return None

def get_website(self, soup):
    try:
        s = soup.find('a', class_='vendor-backlink').get('href')
        if not s.startswith('http'):
            s = 'http://' + s
        return get_fld(s)
    except Exception:
        return None

def domain_sanity_check(domain):  # Verify the domain name sanity
    if domain:
        # ("http://" or "https://") always evaluates to "http://", so check both schemes explicitly
        if "http://" not in domain and "https://" not in domain:
            try:
                domain = get_fld("https://" + domain)
                return domain
            except:
                print(colored("[!] Incorrect domain format. Please follow this format: example.com, https://example.com, www.example.com", "red"))
                sys.exit(1)
        else:
            try:
                domain = get_fld(domain)
                return domain
            except:
                print(colored("[!] Incorrect domain name. Please follow this format: example.com, https://example.com, www.example.com", "red"))
                sys.exit(1)
    else:
        pass

def get_website(self, soup):
    try:
        site = soup.find('a', {'itemprop': 'url'}).get('href')
        if not site.startswith('http'):
            site = 'http://' + site
        return get_fld(site)
    except Exception:
        return None

def get_website(self, soup):
    try:
        return get_fld(
            soup.find('a', text=re.compile('^Visit.+Website$')).get('href').strip())
    except Exception:
        return None

def get_website(site):
    ''' extract domain from website '''
    temp_value = get_fld(site, fix_protocol=True)
    temp_value = temp_value.replace(".{username}", "").replace("{username}.", "")
    return temp_value

def get_format_url(url, a_doc, host):
    a_href = a_doc.get('href')
    try:
        if a_href is not None and len(a_href) > 0:
            a_href = str(a_href).strip()
            a_href = a_href[:a_href.index('#')] if '#' in a_href else a_href
            # a_href = a_href.encode('utf8')
            # a_href = urllib.quote(a_href,safe='.:/?&=')
            if a_href.startswith('//'):
                url = 'https:' + a_href if url.startswith('https:') else 'http:' + a_href
                url = mx.URL.URL(str(url))
                a_href = url.url
            elif a_href.startswith('/'):
                url = 'https://' + host + a_href if url.startswith('https:') else 'http://' + host + a_href
                url = mx.URL.URL(str(url))
                a_href = url.url
            elif a_href.startswith('./') or a_href.startswith('../'):
                url = mx.URL.URL(str(url) + '/' + a_href)
                a_href = url.url
            elif not a_href.startswith('javascript') and not a_href.startswith('mailto') \
                    and not a_href.startswith('http') and a_href != '':
                url = 'https://' + host + '/' + a_href if url.startswith('https:') else 'http://' + host + '/' + a_href
                url = mx.URL.URL(str(url))
                a_href = url.url
            a_href = a_href[:-1] if a_href.endswith('/') else a_href
            # a_href = a_href.lower()
            # get_fld() is called only for validation: it raises on malformed links,
            # which is caught below and turned into an empty return value
            get_fld(a_href)
    except:
        return ''
    if not a_href.startswith('http'):
        return ''
    if '?' in a_href:
        a_params_str = a_href[a_href.index('?') + 1:]
        a_params = a_params_str.split('&')
        a_params.sort()
        a_params_str = '&'.join(a_params)
        a_href = a_href[:a_href.index('?') + 1] + a_params_str
    return a_href

def wirte_sorl(head, body):
    with open("arrange.sorl", "w") as fp:
        for line in head:
            print(line, file=fp)
        url = (x.lstrip("*.") for x in body)
        fld = (get_fld(x, fix_protocol=True) for x in url)
        for line in sorted(set(fld)):
            print(f"*.{line}", file=fp)

def use_cdn(cname):
    try:
        fld = get_fld(cname.rstrip('.'), fix_protocol=True)
        answers = dns.resolver.query(fld, 'NS')
    except:
        ### debug
        print('[DNS] {} can not find NS record'.format(fld))
    else:
        cdn_nss = json.load(open('cdn-ns.json'))
        for answer in answers:
            for cdn_vendor, cdn_ns_list in cdn_nss.items():
                if get_fld(answer.to_text().rstrip('.'), fix_protocol=True) in cdn_ns_list:
                    ### debug
                    print('[CDN] Vendor: {}, NS: {}'.format(cdn_vendor, answer.to_text()))
                    return True
    return False

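# Hedged note (an assumption, not taken from the original project): use_cdn() above appears
# to expect cdn-ns.json to map each CDN vendor name to a list of nameserver first-level
# domains, e.g. {"cloudflare": ["cloudflare.com"], "fastly": ["fastly.net"]}. A hostname is
# then treated as CDN-hosted when the FLD of any of its NS records appears in one of the lists.
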
def get_website(self, soup):
    try:
        s = soup.find('li', text=re.compile(r'www')).get_text().strip()
        if not s.startswith('http'):
            s = 'http://' + s
        return get_fld(s)
    except Exception:
        try:
            for h2 in soup.select('h2'):
                txt = h2.get_text().strip()
                if txt == 'Vendor Details':
                    s = h2.find_next('ul', class_='check-list').select('li')[1].get_text().strip()
                    if not s.startswith('http'):
                        s = 'http://' + s
                    return get_fld(s)
        except Exception:
            return None

def get_website(self, soup):
    try:
        s = soup.find('span', {'itemprop': 'author'}).find_next().find_next().get_text().strip()
        if not s.startswith('http'):
            s = 'http://' + s
        return get_fld(s)
    except Exception:
        try:
            s = MAIN_URL + soup.find('a', text=re.compile(r'Visit Website')).get('href')
            resp = request(s)
            external_url = re.findall(r'location\.replace.+?"(.+?)"', resp.text)[0]
            if 'external_click_ga' in external_url:
                r = requests.head(external_url)
                url = r.headers['Location']
                return get_fld(url)
            return get_fld(external_url)
        except Exception:
            return None

def get(self, web):
    try:
        res = get_fld(web, fix_protocol=True)
        with open('domain.txt', 'a') as save:
            save.write('http://' + res + '\n')
        print(Fore.LIGHTBLUE_EX, '[+] http://{}'.format(res))
    except:
        pass

def get_website(self, soup):
    try:
        s = soup.find('span', class_='website').find('a').get('href').strip()
        if not s.startswith('http'):
            s = 'http://' + s
        return get_fld(s)
    except Exception:
        return None

def get_root_domain(value, zone=None):
    """ Get the root domain (FLD) for the provided value """
    res = get_fld(value, fix_protocol=True, fail_silently=True)
    if res is None:
        return zone
    return res

def get_fld_from_value(value, zone):
    """ Get the First Level Domain (FLD) for the provided value """
    res = get_fld(value, fix_protocol=True, fail_silently=True)
    if res is None:
        return zone
    return res

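# A minimal, hedged usage sketch (not part of the original sources) for the two fallback
# helpers above: tld.get_fld() with fix_protocol=True accepts bare hostnames, and with
# fail_silently=True it returns None (instead of raising) for values such as IP addresses,
# in which case the caller-supplied zone is returned. The zone string below is made up.
from tld import get_fld

def _demo_fld_fallback():
    print(get_fld("sub.host.example.co.uk", fix_protocol=True, fail_silently=True))  # example.co.uk
    print(get_fld("192.0.2.10", fix_protocol=True, fail_silently=True))              # None -> caller falls back to zone
    print(get_fld_from_value("192.0.2.10", zone="example.org"))                      # example.org
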
def extract_domain(url):
    if url is None:
        return None
    url = url if url.startswith("http") else "http://" + url
    try:
        # returns the top level domain for a given url
        return get_fld(url)
    except:
        return urlparse.urlparse(url).netloc

def get_data(self, url):
    resp = request(url)
    if resp:
        soup = BeautifulSoup(resp.text, 'lxml')
        name = soup.find('div', id='main').find('h1').find('span', class_='title').get_text().strip()
        website = None
        try:
            website = get_fld(soup.find('div', id='node-sidebar').find('div', class_='node-links')
                              .find('li', class_=re.compile('link-related-www')).find('a').get('href').strip())
        except Exception:
            pass
        desc = None
        try:
            desc = soup.find('strong', text=re.compile(name.strip(' County').upper())) \
                .previous('p').previous_element.get_text().replace(';', ',').strip()
        except Exception:
            pass
        addr = None
        try:
            addr = soup.find('div', class_='field-address').get_text().strip()
        except Exception:
            pass
        population = None
        try:
            population = soup.find('strong', text=re.compile(r'OPULATION')).next.next.strip()
        except Exception:
            pass
        inc_date = None
        try:
            inc_date = soup.find('strong', text=re.compile(r'INCORPORATION DATE')).next.next.strip()
        except Exception:
            pass
        boards = None
        try:
            boards = soup.find('strong', text=re.compile(r'BOARD')).find_parent().find_parent().find_next('ul').get_text().strip()
        except Exception:
            pass
        form_of_gov = None
        try:
            form_of_gov = soup.find('strong', text=re.compile(r'FORM OF GOVERNMENT')).next.next.strip()
        except Exception:
            pass
        data = [[
            name, website, desc, addr, population, inc_date, boards, form_of_gov, url
        ]]
        write_data(data, name)

def list_all_websites(self):
    ''' list all the available websites' entries '''
    if len(self.websites_entries) > 0:
        for site in self.websites_entries:
            temp_value = get_fld(site["url"], fix_protocol=True)
            temp_value = temp_value.replace(".{username}", "").replace("{username}.", "")
            if not self.silent:
                self.log.info(temp_value)

async def blacklist_check(url: str, transaction: Hub.current.scope.transaction):
    with transaction.start_child(op="task", description="Blacklist check"):
        blacklist = await open_blacklist()
        if not validate_ip(url):
            url = tld.get_fld(url, fix_protocol=True)
        if url in blacklist["blacklist"]:
            return blacklist["blacklist"][url]
        else:
            return False

def get_domain(url):
    domain = None
    try:
        domain = get_fld(url)
    except TldDomainNotFound:
        # Not yet known TLD or IP address or local hostname
        domain = urlparse(url).netloc
    except TldBadUrl:
        domain = None
    return domain

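# Hedged usage sketch (an illustration, not from the original source) for get_domain() above:
# known registered domains come back as their FLD, while hostnames the tld package does not
# recognise (e.g. single-label intranet hosts) raise TldDomainNotFound and fall back to
# urlparse().netloc. The hostnames used here are illustrative only.
#
#   get_domain("https://news.example.co.uk/article")  -> "example.co.uk"
#   get_domain("http://intranet-host:8080/dashboard") -> "intranet-host:8080"
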
def __init__(self, domain, port, page, filename):
    intclass = GetAsset(domain, port, page, filename)
    if not intclass.JudgeIP(domain):
        domain = "http://" + domain
        self.domain = get_fld(domain)
    else:
        self.domain = domain
    self.port = port
    self.page = page
    self.filename = filename

def get_url_domain(self, url):
    """ Get the domain (FLD) of a url """
    # acquire the lock; release it in a finally block so an error in get_fld() cannot leave it held
    self.lock.acquire()
    try:
        domain = get_fld(url)
    finally:
        # release the lock
        self.lock.release()
    return domain

def get_website(self, soup):
    try:
        s = soup.find('li', attrs={'style': re.compile('link_grey.png')}).get_text().strip()
        if not s.startswith('http'):
            s = 'http://' + s
        return get_fld(s)
    except Exception:
        return None

def list_all_websites():
    ''' list all the available websites' entries '''
    if len(WEBSITES_ENTRIES) > 0:
        for site in WEBSITES_ENTRIES:
            temp_value = get_fld(site["url"], fix_protocol=True)
            temp_value = temp_value.replace(".{username}", "").replace("{username}.", "")
            LOG.info(temp_value)

def rewrite_user_link(self, link):
    try:
        if self.conf.without_users_links and (
                link["href"].startswith("mailto:")
                or get_fld(link["href"]) in SOCIAL_DOMAINS):
            self.redact_link(link)
            return 1
    except Exception as exc:
        logger.warning(f"Failed to get fld for {link.get('href')}: {exc}")
    return 0

def get_ps1_or_ipaddress(url):
    try:
        return get_fld(url, fail_silently=False)
    except Exception:
        hostname = urlparse(url).hostname
        try:
            ipaddress.ip_address(hostname)
            return hostname
        except Exception:
            return None

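# Hedged usage sketch (an illustration, not part of the original source) for
# get_ps1_or_ipaddress() above: URLs with a registrable domain return the
# public-suffix-plus-one, IP-based URLs fall through to the ipaddress check,
# and anything else yields None. The example hosts are illustrative.
#
#   get_ps1_or_ipaddress("https://www.example.com/page")  -> "example.com"
#   get_ps1_or_ipaddress("http://192.0.2.7/index.html")   -> "192.0.2.7"
#   get_ps1_or_ipaddress("not a url")                      -> None
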
def domain_sanity_check(domain):  # Verify the domain name sanity
    if domain:
        try:
            domain = get_fld(domain, fix_protocol=True)
            return domain
        except:
            print(colored("[!] Incorrect domain format. Please follow this format: example.com, http(s)://example.com, www.example.com", "red"))
            sys.exit(1)
    else:
        pass

def main(market_name, ts):
    '''
    Main function where the Bing Search is implemented and the search term is defined.
    The results are stored in a new dataframe, which is then exported to the
    BING_SEARCH_NIELSEN collection.
    @:param ts: Timestamp
    @:type: int64
    '''
    connection = connect_mongo()
    marketmap = pd.DataFrame((connection[Settings.MARKET_MAP].find({"country": market_name.capitalize()}, {"code": 1, "country": 1, "_id": 0})))
    print(marketmap)
    reader = connection[Settings.CLEAN_NIELSEN].find({"UPPER_COUNTRY": market_name.upper(), "ts": ts}, {"_id": 0})
    print(reader)
    for records in reader:
        try:
            count = connection[Settings.BRAND_SOURCE].count({"brand_low": str(records['UPPER_BRAND']).lower(),
                                                             "country": str(records['UPPER_COUNTRY']).lower()})
            if count == 0:
                term = str(records['UPPER_BRAND']).lower() + " " + str(records['MANUFACTURER']).lower()
                print("fetching Records : ", term)
                brand = str(records['UPPER_BRAND']).lower()
                company = str(records['MANUFACTURER']).lower()
                market = str(records['UPPER_COUNTRY']).lower()
                if len(Settings.bing_subscription_key) == 32 and company != "PRIVATE LABEL":
                    print('Searching the Web for: ', term)
                    offset = 0
                    totalEstimatedMatches = 100
                    count = 0
                    while offset < totalEstimatedMatches:
                        count = count + 1
                        headers, result = BingWebSearch(term, offset, marketmap, market)
                        data = json.dumps(json.loads(result), indent=4)
                        d_data = json.loads(data)
                        print("count : ", str(count))
                        print("offset : ", str(offset))
                        if 'webPages' in d_data:
                            totalEstimatedMatches = d_data['webPages']['totalEstimatedMatches']
                            news_dict = d_data['webPages']['value']
                            for news in news_dict:
                                news['hitCount'] = count
                                news['totalEstimateMatches'] = totalEstimatedMatches
                                news['fetchdate'] = datetime.datetime.utcnow()
                                news['offset'] = offset
                                news['query'] = term
                                news['count'] = 1
                                news['brand'] = brand
                                news['company'] = company
                                news['country'] = market
                                news['ts'] = ts
                                news['domain'] = get_fld(news['url'])
                                exists = connection[Settings.BING_SEARCH_NIELSEN].count({'brand': news['brand'],
                                                                                         'domain': news['domain'],
                                                                                         'country': market_name})
                                if exists == 0:
                                    connection[Settings.BING_SEARCH_NIELSEN].insert(news)
                                else:
                                    connection[Settings.BING_SEARCH_NIELSEN].update_one({'brand': news['brand'],
                                                                                         'domain': news['domain'],
                                                                                         'country': market_name})
                            offset = offset + len(news_dict)
                        totalEstimatedMatches = 0
        except Exception as e:
            print(str(e))

def website_url_not_in_db(url):
    domain = db.session.query(DomainArchived).filter_by(
        name=get_fld(url)).first()
    if not domain:
        # "The main domain has not been recorded yet; please add the main domain first"
        raise InvalidAPIRequest('主域名未收录,请先添加主域名')
    for model in [
            WebsiteArchived, WebsiteNews, WebsiteRecycler, WebsiteBanned,
            WebsiteDuplicated
    ]:
        if db.session.query(model).filter_by(url=url.strip('/')).first():
            # "This website already exists"
            raise RecordAlreadyExists('已有此网站')

def _get_zone_id(domain):
    tld = get_fld('http://' + domain)
    url = "https://api.cloudflare.com/client/v4/zones?name={0}".format(tld)
    for auth in CF_HEADERS:
        r = requests.get(url, headers=auth)
        r.raise_for_status()
        r = r.json().get('result', ())
        if r:
            return auth, r[0]['id']
    logger.error(" + Domain {0} not found in any Cloudflare account".format(tld))
    sys.exit(1)

def extract_tld_from_url(url: str) -> str:
    """
    Identify the top level domain of the url.

    Parameters
    ----------
    url: str.
        The URL of the target website.

    Returns
    -------
    out: str.
        The TLD according to Mozilla's tables.
    """
    return get_fld(url)

def lookup(self, domain, wildcard=True):
    base_url = "https://crt.sh/?q={}&output=json"
    if wildcard:
        domain = "%25.{}".format(domain)
    url = base_url.format(domain)
    subdomains = []
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:64.0) Gecko/20100101 Firefox/64.0'
    try:
        req = requests.get(url, headers={'User-Agent': user_agent}, timeout=20, verify=False)  # times out after 20 seconds of waiting
        if req.status_code == 200:
            try:
                content = req.content.decode('utf-8')
                data = json.loads(content)
                for subdomain in data:
                    subdomains.append(subdomain["name_value"])
                return subdomains
            except:
                error = "Error retrieving information for {}.".format(domain.replace('%25.', ''))
                errorlog(error, enable_logging)
    except:
        try:
            # connecting to the crt.sh postgres database to retrieve subdomains in case the API fails
            unique_domains = []
            domain = domain.replace('%25.', '')
            conn = psycopg2.connect("dbname={0} user={1} host={2}".format(DB_NAME, DB_USER, DB_HOST))
            conn.autocommit = True
            cursor = conn.cursor()
            cursor.execute("SELECT ci.NAME_VALUE NAME_VALUE FROM certificate_identity ci WHERE ci.NAME_TYPE = 'dNSName' AND reverse(lower(ci.NAME_VALUE)) LIKE reverse(lower('%{}'));".format(domain))
            for result in cursor.fetchall():
                matches = re.findall(r"\'(.+?)\'", str(result))
                for subdomain in matches:
                    try:
                        if get_fld("https://" + subdomain) == domain:
                            unique_domains.append(subdomain)
                    except:
                        pass
            return unique_domains
        except:
            print(colored("[!] Unable to connect to the database.", "red"))
            error = "Unable to connect to the database."
            errorlog(error, enable_logging)