def search(self):
    dl = Download.Download(verbose=self.verbose)
    while self.Counter <= self.Depth and self.Counter <= 100:
        helpers.modsleep(5)
        if self.verbose:
            p = ' [*] GitHubUser Search on page: ' + str(self.Counter)
            print helpers.color(p, firewall=True)
        try:
            url = 'https://github.com/search?p=' + str(self.Counter) + '&q=' + \
                str(self.domain) + '&ref=searchresults&type=Users&utf8='
        except Exception as e:
            error = " [!] Major issue with GitHubUser Search:" + str(e)
            print helpers.color(error, warning=True)
        try:
            r = dl.requesturl(url, useragent=self.UserAgent, raw=True, timeout=10)
        except Exception as e:
            error = " [!] Fail during Request to GitHubUser (Check Connection):" + \
                str(e)
            print helpers.color(error, warning=True)
        results = r.content
        self.Html += results
        self.Counter += 1

def search(self):
    dl = Download.Download(self.verbose)
    while self.Counter <= self.Limit and self.Counter <= 1000:
        time.sleep(1)
        if self.verbose:
            p = " [*] Google Search on page: " + str(self.Counter)
            print helpers.color(p, firewall=True)
        try:
            url = (
                "http://www.google.com/search?num=" + str(self.Quanity) +
                "&start=" + str(self.Counter) +
                '&hl=en&meta=&q=%40"' + self.Domain + '"'
            )
        except Exception as e:
            error = " [!] Major issue with Google Search:" + str(e)
            print helpers.color(error, warning=True)
        try:
            results = dl.requesturl(url, useragent=self.UserAgent)
        except Exception as e:
            error = " [!] Fail during Request to Google (Check Connection):" + str(e)
            print helpers.color(error, warning=True)
        try:
            dl.GoogleCaptchaDetection(results)
        except Exception as e:
            print e
        self.Html += results
        self.Counter += 100
        helpers.modsleep(self.Sleep, jitter=self.Jitter)

def process(self):
    dl = Download.Download(self.verbose)
    while self.Counter <= self.PageLimit:
        if self.verbose:
            p = ' [*] AskSearch on page: ' + str(self.Counter)
            print helpers.color(p, firewall=True)
        self.logger.info('AskSearch on page: ' + str(self.Counter))
        try:
            url = 'http://www.ask.com/web?q=@' + str(self.Domain) + \
                '&pu=10&page=' + str(self.Counter)
        except Exception as e:
            error = " [!] Major issue with Ask Search:" + str(e)
            self.logger.error('Major issue with Ask Search: ' + str(e))
            print helpers.color(error, warning=True)
        try:
            rawhtml = dl.requesturl(url, useragent=self.UserAgent)
        except Exception as e:
            error = " [!] Fail during Request to Ask (Check Connection):" + \
                str(e)
            self.logger.error(
                'Fail during Request to Ask (Check Connection): ' + str(e))
            print helpers.color(error, warning=True)
        self.Html += rawhtml
        self.Counter += 1
        helpers.modsleep(self.Sleep, jitter=self.Jitter)

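# All of these scrapers pace themselves through helpers.modsleep(seconds,
# jitter=...). The helper's implementation is not part of this section; the
# sketch below shows what such a jittered sleep could look like, under the
# assumption that `jitter` is a +/- percentage skew. Treat the exact
# semantics as hypothetical, not the project's confirmed behavior.
import random
import time


def modsleep(seconds, jitter=0):
    # sleep for roughly `seconds`, randomly skewed by up to +/- `jitter`
    # percent so repeated requests do not fire on a fixed cadence
    delay = float(seconds)
    if jitter:
        skew = delay * (jitter / 100.0)
        delay = random.uniform(delay - skew, delay + skew)
    time.sleep(max(delay, 0))
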
def search(self):
    dl = Download.Download(self.verbose)
    convert = Converter.Converter(verbose=self.verbose)
    while self.Counter <= self.Limit and self.Counter <= 100:
        time.sleep(1)
        if self.verbose:
            p = ' [*] Google DOC Search on page: ' + str(self.Counter)
            print helpers.color(p, firewall=True)
        try:
            urly = "https://www.google.com/search?q=site:" + \
                self.Domain + "+filetype:doc&start=" + str(self.Counter)
        except Exception as e:
            error = " [!] Major issue with Google Search:" + str(e)
            print helpers.color(error, warning=True)
        try:
            r = requests.get(urly)
        except Exception as e:
            error = " [!] Fail during Request to Google (Check Connection):" + \
                str(e)
            print helpers.color(error, warning=True)
        RawHtml = r.content
        # check for captcha
        try:
            dl.GoogleCaptchaDetection(RawHtml)
        except Exception as e:
            print e
        soup = BeautifulSoup(RawHtml)
        # parse the result anchors for URLs to follow
        for a in soup.findAll('a'):
            try:
                # https://stackoverflow.com/questions/21934004/not-getting-proper-links-
                # from-google-search-results-using-mechanize-and-beautifu/22155412#22155412?
                # newreg=01f0ed80771f4dfaa269b15268b3f9a9
                l = urlparse.parse_qs(urlparse.urlparse(
                    a['href']).query)['q'][0]
                if l.startswith('http') or l.startswith('www'):
                    if "webcache.googleusercontent.com" not in l:
                        self.urlList.append(l)
            except:
                pass
        self.Counter += 10
        helpers.modsleep(self.Sleep, jitter=self.Jitter)
    # now download the required files
    try:
        for url in self.urlList:
            if self.verbose:
                p = ' [*] Google DOC search downloading: ' + str(url)
                print helpers.color(p, firewall=True)
            try:
                filetype = ".doc"
                FileName, FileDownload = dl.download_file(url, filetype)
                if FileDownload:
                    if self.verbose:
                        p = ' [*] Google DOC file was downloaded: ' + \
                            str(url)
                        print helpers.color(p, firewall=True)
                    self.Text += convert.convert_doc_to_txt(FileName)
            except Exception as e:
                print helpers.color(" [!] Issue with opening Doc Files\n", firewall=True)
            try:
                dl.delete_file(FileName)
            except Exception as e:
                print e
    except:
        print helpers.color(" [*] No DOC's to download from Google!\n", firewall=True)

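# The loop above recovers target links from Google's redirect anchors
# (hrefs of the form /url?q=<target>&sa=...) with urlparse.parse_qs, per
# the StackOverflow thread cited in the comments. A standalone
# illustration of that extraction; the sample href is made up:
import urlparse

href = '/url?q=http://example.com/files/report.doc&sa=U&ved=0ahUKE'
target = urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
print target  # -> http://example.com/files/report.doc
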
def search(self):
    dl = Download.Download(self.verbose)
    while self.Counter <= self.Limit and self.Counter <= 100:
        time.sleep(1)
        if self.verbose:
            p = ' [*] Google CSV Search on page: ' + str(self.Counter)
            print helpers.color(p, firewall=True)
        try:
            url = "https://www.google.com/search?q=site:" + \
                self.Domain + "+filetype:csv&start=" + str(self.Counter)
        except Exception as e:
            error = " [!] Major issue with Google Search:" + str(e)
            print helpers.color(error, warning=True)
        try:
            RawHtml = dl.requesturl(url, useragent=self.UserAgent)
        except Exception as e:
            error = " [!] Fail during Request to Google (Check Connection):" + \
                str(e)
            print helpers.color(error, warning=True)
        # check for captcha
        try:
            dl.GoogleCaptchaDetection(RawHtml)
        except Exception as e:
            print e
        soup = BeautifulSoup(RawHtml)
        # parse the result anchors for URLs to follow
        for a in soup.findAll('a'):
            try:
                # https://stackoverflow.com/questions/21934004/not-getting-proper-links-
                # from-google-search-results-using-mechanize-and-beautifu/22155412#22155412?
                # newreg=01f0ed80771f4dfaa269b15268b3f9a9
                l = urlparse.parse_qs(
                    urlparse.urlparse(a['href']).query)['q'][0]
                if l.startswith('http') or l.startswith('www'):
                    if "webcache.googleusercontent.com" not in l:
                        self.urlList.append(l)
            except:
                pass
        self.Counter += 10
        helpers.modsleep(self.Sleep, jitter=self.Jitter)
    # now download the required files
    try:
        for url in self.urlList:
            if self.verbose:
                p = ' [*] Google CSV search downloading: ' + str(url)
                print helpers.color(p, firewall=True)
            try:
                filetype = ".csv"
                FileName, FileDownload = dl.download_file2(url, filetype)
                if FileDownload:
                    if self.verbose:
                        p = ' [*] Google CSV file was downloaded: ' + \
                            str(url)
                        print helpers.color(p, firewall=True)
                    with open(FileName) as f:
                        self.Text += f.read()
            except Exception as e:
                print helpers.color(" [!] Issue with opening CSV Files\n", firewall=True)
            try:
                dl.delete_file(FileName)
            except Exception as e:
                print e
    except:
        print helpers.color(" [*] No CSV to download from Google!\n", firewall=True)

def search(self):
    dl = Download.Download(self.verbose)
    convert = Converter.Converter(verbose=self.verbose)
    while self.Counter <= self.Limit and self.Counter <= 10:
        helpers.modsleep(1)
        if self.verbose:
            p = ' [*] Exalead PPTX Search on page: ' + str(self.Counter)
            self.logger.info('ExaleadPPTXSearch on page: ' + str(self.Counter))
            print helpers.color(p, firewall=True)
        try:
            url = 'http://www.exalead.com/search/web/results/?q="%40' + self.Domain + \
                '"+filetype:pptx&elements_per_page=' + \
                str(self.Quanity) + '&start_index=' + str(self.Counter)
        except Exception as e:
            self.logger.error('ExaleadPPTXSearch could not build URL')
            error = " [!] Major issue with Exalead PPTX Search: " + str(e)
            print helpers.color(error, warning=True)
        try:
            RawHtml = dl.requesturl(url, useragent=self.UserAgent)
            # sometimes the result URL is broken, but the Exalead results
            # page itself still contains e-mail addresses
            self.Text += RawHtml
            soup = BeautifulSoup(RawHtml, "lxml")
            self.urlList = [h4.a["href"]
                            for h4 in soup.findAll('h4', class_='media-heading')]
        except Exception as e:
            self.logger.error('ExaleadPPTXSearch could not request / parse HTML')
            error = " [!] Fail during parsing result: " + str(e)
            print helpers.color(error, warning=True)
        self.Counter += 30
    # now download the required files
    try:
        for url in self.urlList:
            if self.verbose:
                p = ' [*] Exalead PPTX search downloading: ' + str(url)
                self.logger.info('ExaleadPPTXSearch downloading: ' + str(url))
                print helpers.color(p, firewall=True)
            try:
                filetype = ".pptx"
                dl = Download.Download(self.verbose)
                FileName, FileDownload = dl.download_file(url, filetype)
                if FileDownload:
                    if self.verbose:
                        p = ' [*] Exalead PPTX file was downloaded: ' + \
                            str(url)
                        self.logger.info('ExaleadPPTXSearch downloaded: ' + str(url))
                        print helpers.color(p, firewall=True)
                    ft = helpers.filetype(FileName).lower()
                    if 'powerpoint' in ft:
                        self.Text += convert.convert_zip_to_text(FileName)
                    else:
                        self.logger.warning('Downloaded file is not a PPTX: ' + ft)
            except Exception as e:
                error = " [!] Issue with opening PPTX Files: %s" % (str(e))
                print helpers.color(error, warning=True)
            try:
                dl.delete_file(FileName)
            except Exception as e:
                print e
    except Exception as e:
        self.logger.error("ExaleadPPTXSearch no PPTX's to download")
        print helpers.color(" [*] No PPTX's to download from Exalead!\n", firewall=True)
    if self.verbose:
        p = ' [*] Searching PPTX from Exalead Complete'
        print helpers.color(p, status=True)

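# The Exalead and Google PPTX paths both gate conversion on
# helpers.filetype(FileName), which evidently returns a human-readable
# description of the file's contents. Its implementation is not shown in
# this section; below is a plausible sketch backed by the python-magic
# package. The library choice is an assumption -- the project could just
# as well shell out to the `file` utility.
import magic


def filetype(path):
    # returns e.g. "Microsoft PowerPoint 2007+" for a genuine .pptx
    return magic.from_file(path)
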
def search(self):
    # set up the helpers in the converter and download classes
    convert = Converter.Converter(verbose=self.verbose)
    dl = Download.Download(self.verbose)
    while self.Counter <= self.Limit and self.Counter <= 100:
        time.sleep(1)
        if self.verbose:
            p = " [*] Google PDF Search on page: " + str(self.Counter)
            print helpers.color(p, firewall=True)
        try:
            urly = ("https://www.google.com/search?q=" + self.Domain +
                    "+filetype:pdf&start=" + str(self.Counter))
        except Exception as e:
            error = " [!] Major issue with Google Search:" + str(e)
            print helpers.color(error, warning=True)
        try:
            r = requests.get(urly)
        except Exception as e:
            error = " [!] Fail during Request to Google (Check Connection):" + str(e)
            print helpers.color(error, warning=True)
        RawHtml = r.content
        # check for captcha
        dl.GoogleCaptchaDetection(RawHtml)
        soup = BeautifulSoup(RawHtml)
        for a in soup.findAll("a"):
            try:
                # https://stackoverflow.com/questions/21934004/not-getting-proper-links-
                # from-google-search-results-using-mechanize-and-beautifu/22155412#22155412?
                # newreg=01f0ed80771f4dfaa269b15268b3f9a9
                l = urlparse.parse_qs(urlparse.urlparse(
                    a["href"]).query)["q"][0]
                if l.startswith("http") or l.startswith("www"):
                    if "webcache.googleusercontent.com" not in l:
                        self.urlList.append(l)
            except:
                pass
        self.Counter += 10
        helpers.modsleep(self.Sleep, jitter=self.Jitter)
    # now download the required files
    try:
        for url in self.urlList:
            if self.verbose:
                p = " [*] Google PDF search downloading: " + str(url)
                print helpers.color(p, firewall=True)
            try:
                filetype = ".pdf"
                # use the download helper class to fetch the file
                FileName, FileDownload = dl.download_file(url, filetype)
                # check whether the file was actually downloaded
                if FileDownload:
                    if self.verbose:
                        p = " [*] Google PDF file was downloaded: " + str(url)
                        print helpers.color(p, firewall=True)
                    self.Text += convert.convert_pdf_to_txt(FileName)
            except Exception as e:
                print e
            try:
                # remove any files left behind
                dl.delete_file(FileName)
            except Exception as e:
                print e
    except:
        print helpers.color(" [*] No PDF's to download from Google!\n", firewall=True)

def search(self):
    convert = Converter.Converter(self.verbose)
    dl = Download.Download(self.verbose)
    while self.Counter <= self.Limit and self.Counter <= 100:
        time.sleep(1)
        if self.verbose:
            p = " [*] Google PPTX Search on page: " + str(self.Counter)
            print helpers.color(p, firewall=True)
        try:
            url = ("https://www.google.com/search?q=" + self.Domain +
                   "+filetype:pptx&start=" + str(self.Counter))
        except Exception as e:
            error = " [!] Major issue with Google Search:" + str(e)
            print helpers.color(error, warning=True)
        try:
            RawHtml = dl.requesturl(url, useragent=self.UserAgent)
        except Exception as e:
            error = " [!] Fail during Request to Google (Check Connection):" + str(e)
            print helpers.color(error, warning=True)
        # check for captcha
        try:
            dl.GoogleCaptchaDetection(RawHtml)
        except Exception as e:
            print e
        soup = BeautifulSoup(RawHtml)
        # parse the result anchors for URLs to follow
        for a in soup.findAll("a"):
            try:
                # https://stackoverflow.com/questions/21934004/not-getting-proper-links-
                # from-google-search-results-using-mechanize-and-beautifu/22155412#22155412?
                # newreg=01f0ed80771f4dfaa269b15268b3f9a9
                l = urlparse.parse_qs(urlparse.urlparse(a["href"]).query)["q"][0]
                if (l.startswith("http") or l.startswith("www")
                        or l.startswith("https")):
                    if "webcache.googleusercontent.com" not in l:
                        self.urlList.append(l)
                # for some reason PPTX results can come back as Google
                # cache links of the form "cache:<hash>:<url>"
                l = urlparse.parse_qs(urlparse.urlparse(a["href"]).query)["q"][0]
                l = l.split(":", 2)
                if "webcache.googleusercontent.com" not in l[2]:
                    self.urlList.append(l[2])
            except:
                pass
        self.Counter += 10
        helpers.modsleep(self.Sleep, jitter=self.Jitter)
    # now download the required files
    try:
        for url in self.urlList:
            if self.verbose:
                p = " [*] Google PPTX search downloading: " + str(url)
                print helpers.color(p, firewall=True)
            try:
                filetype = ".pptx"
                FileName, FileDownload = dl.download_file2(url, filetype)
                if FileDownload:
                    if self.verbose:
                        p = " [*] Google PPTX file was downloaded: " + str(url)
                        print helpers.color(p, firewall=True)
                    ft = helpers.filetype(FileName).lower()
                    if "powerpoint" in ft:
                        self.Text += convert.convert_zip_to_text(FileName)
                    else:
                        self.logger.warning("Downloaded file is not a PPTX: " + ft)
            except Exception as e:
                print helpers.color(" [!] Issue with opening PPTX Files\n",
                                    firewall=True)
            try:
                if FileDownload:
                    dl.delete_file(FileName)
            except Exception as e:
                self.logger.warning("Issue deleting file: " + str(e))
    except:
        print helpers.color(" [*] No PPTX's to download from Google!\n",
                            firewall=True)

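# The second branch above handles Google cache hrefs whose q-parameter has
# the form "cache:<hash>:<url>". The maxsplit of 2 in split(":", 2) caps
# the split at two colons, so any later colon (e.g. in an "http://" scheme)
# survives inside the third element. Illustration with a made-up value:
l = "cache:sOmEhAsHvAlUe:example.com/decks/q3.pptx".split(":", 2)
print l[2]  # -> example.com/decks/q3.pptx
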
def search(self):
    convert = Converter.Converter(verbose=self.verbose)
    while self.Counter <= self.Limit and self.Counter <= 100:
        time.sleep(1)
        if self.verbose:
            p = " [*] Google XLSX Search on page: " + str(self.Counter)
            self.logger.info("Google XLSX Search on page: " + str(self.Counter))
            print helpers.color(p, firewall=True)
        try:
            urly = ("https://www.google.com/search?q=site:" + self.Domain +
                    "+filetype:xlsx&start=" + str(self.Counter))
        except Exception as e:
            error = " [!] Major issue with Google XLSX Search:" + str(e)
            self.logger.error("GoogleXlsxSearch failed to build url: " + str(e))
            print helpers.color(error, warning=True)
        try:
            r = requests.get(urly)
        except Exception as e:
            error = " [!] Fail during Request to Google (Check Connection):" + str(e)
            self.logger.error(
                "GoogleXlsxSearch failed to request url (Check Connection): " + str(e))
            print helpers.color(error, warning=True)
        RawHtml = r.content
        soup = BeautifulSoup(RawHtml)
        # parse the result anchors for URLs to follow
        for a in soup.findAll("a"):
            try:
                # https://stackoverflow.com/questions/21934004/not-getting-proper-links-
                # from-google-search-results-using-mechanize-and-beautifu/22155412#22155412?
                # newreg=01f0ed80771f4dfaa269b15268b3f9a9
                l = urlparse.parse_qs(urlparse.urlparse(
                    a["href"]).query)["q"][0]
                if l.startswith("http") or l.startswith("www"):
                    if "webcache.googleusercontent.com" not in l:
                        self.urlList.append(l)
            except:
                pass
        self.Counter += 10
        helpers.modsleep(self.Sleep, jitter=self.Jitter)
    # now download the required files
    self.logger.debug(
        "GoogleXlsxSearch completed HTML result query, starting downloads")
    try:
        for url in self.urlList:
            if self.verbose:
                p = " [*] Google XLSX search downloading: " + str(url)
                self.logger.info("Google XLSX search downloading: " + str(url))
                print helpers.color(p, firewall=True)
            try:
                filetype = ".xlsx"
                dl = Download.Download(self.verbose)
                FileName, FileDownload = dl.download_file(url, filetype)
                if FileDownload:
                    if self.verbose:
                        p = " [*] Google XLSX file was downloaded: " + str(url)
                        self.logger.info("Google XLSX file was downloaded: " + str(url))
                        print helpers.color(p, firewall=True)
                    self.Text += convert.convert_Xlsx_to_Csv(FileName)
            except Exception as e:
                print helpers.color(" [!] Issue with opening Xlsx Files\n",
                                    firewall=True)
                self.logger.error("Google XLSX had issue opening file")
            try:
                dl.delete_file(FileName)
            except Exception as e:
                self.logger.error("Google XLSX failed to delete file: " + str(e))
    except Exception as e:
        print helpers.color(" [*] No XLSX's to download from Google!\n",
                            firewall=True)
        self.logger.error("No XLSX's to download from Google! " + str(e))

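# convert_Xlsx_to_Csv's internals are not part of this section. Below is a
# minimal sketch of one way to flatten a workbook into CSV-style text with
# openpyxl -- the library choice and the function's shape are assumptions,
# not the project's confirmed implementation.
import openpyxl


def convert_xlsx_to_csv_text(path):
    # read-only mode streams rows instead of loading the whole workbook
    wb = openpyxl.load_workbook(path, read_only=True)
    lines = []
    for ws in wb.worksheets:
        for row in ws.iter_rows():
            lines.append(",".join(unicode(c.value)
                                  for c in row if c.value is not None))
    return "\n".join(lines) + "\n"
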