def extract(self, url):
    headers = {
        "Cookie": "VT_PREFERRED_LANGUAGE=en; __utma=194538546.1669411933.1482685628.1482685628.1482685628.1;"
                  " __utmb=194538546.1.10.1482685628; __utmc=194538546;"
                  " __utmz=194538546.1482685628.1.1."
                  "utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided);"
                  " __utmt=1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0",
        "Host": "www.virustotal.com"
    }
    content = self.requester.get(url, headers).text
    if not self.has_error(content):
        try:
            soup = BeautifulSoup(content, "html5lib")
            # Result rows are plain external links with no class attribute.
            search_tags = soup.find_all("a", attrs={"target": "_blank", "class": None}, href=True)
            for tag in search_tags:
                domain = tag.string
                if domain is None:
                    # Past the end of the result list; stop scraping.
                    break
                self.add(domain.strip())
        except Exception:
            logger.exception("VirusTotal plugin: failed to parse the result page")
    else:
        logger.error("Captcha detected while running the VirusTotal plugin")
def query(host_name, query_type='ANY', name_server='8.8.8.8', tcp=True):
    """
    A thin wrapper around dnslib's query machinery.
    :return: a list of dicts, one per resource record, with host, type, and data
    """
    # logger.info("Querying %s with type %s and name server %s" % (host_name, query_type, name_server))
    results = []
    try:
        _query = dnslib.DNSRecord.question(host_name, query_type.upper().strip())
        response_raw = _query.send(name_server, tcp=tcp, timeout=2)
        response_parsed = dnslib.DNSRecord.parse(response_raw)
        for r in response_parsed.rr:
            try:
                _type = str(dnslib.QTYPE[r.rtype])
            except dnslib.dns.DNSError:
                # The server sent an unknown record type; fall back to the numeric value.
                _type = str(r.rtype)
            _host = str(r.rname).rstrip(".")
            _data = str(r.rdata)
            results.append({'host': _host, 'type': _type, 'data': _data})
    except socket.error:
        logger.error("The DNS query timed out, so it was aborted")
    except struct.error:
        logger.error("Could not decode the response of the DNS query")
    except Exception:
        raise DnsQueryException
    return results
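# A minimal usage sketch for query() above; the host name, record type, and
# name server are illustrative assumptions, not values from the source. It
# assumes dnslib, socket, struct, logger, and DnsQueryException are already
# set up at module level as in the original file.
if __name__ == "__main__":
    for record in query("example.com", query_type="A", name_server="8.8.8.8"):
        print("%(host)s %(type)s %(data)s" % record)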
def create(self):
    """ Create an instance of a plugin """
    try:
        # Plugins live under extensions.plugins.<plugin_type>.<file_name>.
        module = "extensions.plugins.%s.%s" % (self.plugin_type, self._get_plugin_file_name())
        plugin_class = getattr(importlib.import_module(module), self.plugin_name)
        instance = plugin_class()
        instance.set_requester(self.requester)
        return instance
    except ImportError as e:
        logger.exception(str(e))
        logger.error("Could not load plugin %s.%s" % (self.plugin_type, self._get_plugin_file_name()))
        return None
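# A hedged sketch of how create() above is driven: the factory imports
# extensions.plugins.<plugin_type>.<file_name> and instantiates the class named
# by plugin_name. "PluginFactory" and its constructor arguments are assumptions
# for illustration, not names confirmed by the source.
factory = PluginFactory(plugin_type="search_engine", plugin_name="GooglePlugin")
plugin = factory.create()
if plugin is None:
    logger.error("Skipping search_engine.GooglePlugin: it could not be imported")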
def post(self, url, data=None, headers=None):
    if headers is None:
        headers = self._headers
    # Give each URL up to three attempts before giving up.
    for _ in range(3):
        try:
            return requests.post(url, headers=headers, proxies=self._proxies, data=data, timeout=60)
        except requests.exceptions.Timeout:
            logger.error("The request took too long, so it was killed")
            logger.info("Trying to reconnect...")
        except Exception:
            logger.error("An unexpected error occurred during the request")
            logger.info("Trying to reconnect...")
    return None
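# A small sketch of the retry behaviour of post() above: after three failed
# attempts the method returns None, so callers must check for it. "Requester"
# and the URL are assumptions for illustration, not confirmed by the source.
requester = Requester()
response = requester.post("https://www.example.com/search", data={"q": "example.com"})
if response is None:
    logger.error("All three attempts failed; giving up on this URL")
else:
    print(response.status_code)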
def extract(self, url):
    content = self.requester.get(url).text
    if self.has_error(content):
        logger.error("Cannot find any result for: %s" % self.base_domain.domain_name)
        return None
    _soup = BeautifulSoup(content, "html5lib")
    if not _soup.find_all("em"):
        logger.error("This site seems to have blocked your requests")
        return
    _last = ""
    _from = ""
    total = int(_soup.find_all("em")[0].string.split()[1])
    logger.info("Total of results: %d for: %s" % (total, self.base_domain.domain_name))
    if total > 20:
        # Results come 20 per page; walk them with the "last" and "from" parameters.
        count = total // 20
        for tem in range(count + 1):
            url_temp = ""
            r = self.requester.get(url + _last + _from)
            soup = BeautifulSoup(r.text, "html5lib")
            search_region = BeautifulSoup(
                str(soup.find_all("table", attrs={"class": "TBtable"})), "lxml")
            for item in search_region.find_all('a', attrs={"rel": True}):
                url_temp = self.parse_domain_name(item['href'])
                self.add(url_temp)
            _last = "&last=" + url_temp
            _from = "&from=" + str((tem + 1) * 20 + 1)
    else:
        search_region = BeautifulSoup(
            str(_soup.find_all("table", attrs={"class": "TBtable"})), "lxml")
        for item in search_region.find_all('a', attrs={"rel": True}):
            url_temp = self.parse_domain_name(item['href'])
            self.add(url_temp)
def get_total_page(self):
    url = self.base_url.format(query=self.get_query(), page=self.max_page)
    content = self.requester.get(url).text
    try:
        if not self.has_error(content):
            soup = BeautifulSoup(content, "html5lib")
            # Pagination links carry the class "fl"; the last one names the last page.
            tag_a = soup.find_all('a', attrs={'class': 'fl'})
            try:
                num_page = tag_a[-1]['aria-label']
                return int(num_page.split()[1])
            except (IndexError, KeyError, ValueError):
                logger.error("Cannot get total_page, so returning 0")
                return 0
        else:
            logger.error("Google seems to have blocked the request")
            return 0
    except Exception:
        return 0
def extract(self, url):
    try:
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"
        }
        content = self.requester.get(url, header).text
        soup = BeautifulSoup(content, "html5lib")
        search = soup.find_all("a", attrs={"class": " ac-algo ac-21th lh-24"})
        for line in search:
            try:
                # The real target is buried inside Yahoo's redirect URL.
                raw_url = line['href'].split("/")[7].split("=")[1]
                self.add(self.parse_domain_name(raw_url))
            except (IndexError, KeyError):
                logger.error("Yahoo plugin: cannot extract the domain")
    except Exception:
        logger.exception("Yahoo plugin: failed to parse the result page")
def get_total_page(self):
    max_page_temp = self.max_page
    while max_page_temp >= 0:
        url = self.base_url.format(query=self.get_query(), page=max_page_temp)
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"
        }
        content = self.requester.get(url, header).text
        if self.has_error(content):
            logger.error("Too many requests; Yahoo has detected the bot")
            return 0
        if (("We did not find results for" not in content) or
                ("Check spelling or type a new query" not in content)):
            list_seed = []
            soup = BeautifulSoup(content, "html5lib")
            # Pagination links have a title and no class; their text is a page number.
            search_page = soup.find_all("a", href=True, title=True, attrs={'class': None})
            for i in search_page:
                list_seed.append(int(i.string))
            current = soup.find("strong")
            try:
                list_seed.append(int(current.string))
            except (AttributeError, TypeError, ValueError):
                logger.error(
                    "Yahoo plugin: failed to get the current page number. This usually means there "
                    "are no more sub-domains for this base domain, but Yahoo may also have blocked "
                    "the requests")
            if not list_seed:
                return 0
            return max(list_seed)
        else:
            logger.info(
                "Yahoo plugin: max_page down to %d since the bot cannot get any info about total_page"
                % max_page_temp)
            max_page_temp -= 10
    return 0
def get_total_page(self):
    max_page_temp = self.max_page
    while max_page_temp >= 0:
        url = self.base_url.format(query=self.get_query(), page=max_page_temp)
        r = self.requester.get(url)
        if r is None:
            return 0
        content = r.text
        if self.was_blocked(content):
            logger.error("Ask blocked the request")
            return 0
        elif self.was_not_found(content):
            logger.info("Ask plugin: max_page down to %d for domain: %s"
                        % (max_page_temp, self.base_domain.domain_name))
            max_page_temp -= 5
        else:
            soup = BeautifulSoup(content, "html5lib")
            search_pages = soup.find_all("a", attrs={"ul-attr-accn": "pagination"})
            list_no_page = []
            for tag in search_pages:
                try:
                    list_no_page.append(int(tag.string))
                except (TypeError, ValueError):
                    # Not every pagination link is a page number; skip the rest.
                    continue
            if not list_no_page:
                logger.debug(soup)
                return 1
            return max(list_no_page)
    return 0