def search(self):
    dl = Download.Download(self.verbose)
    convert = Converter.Converter(verbose=self.verbose)
    while self.Counter <= self.Limit and self.Counter <= 100:
        time.sleep(1)
        if self.verbose:
            p = ' [*] Google DOCX Search on page: ' + str(self.Counter)
            print helpers.color(p, firewall=True)
        try:
            urly = "https://www.google.com/search?q=site:" + \
                self.Domain + "+filetype:docx&start=" + str(self.Counter)
        except Exception as e:
            error = " [!] Major issue with Google Search:" + str(e)
            print helpers.color(error, warning=True)
        try:
            r = requests.get(urly)
        except Exception as e:
            error = " [!] Fail during Request to Google (Check Connection):" + \
                str(e)
            print helpers.color(error, warning=True)
        RawHtml = r.content
        soup = BeautifulSoup(RawHtml)
        # parse the results for URLs to follow
        for a in soup.findAll('a'):
            try:
                # https://stackoverflow.com/questions/21934004/not-getting-proper-links-
                # from-google-search-results-using-mechanize-and-beautifu/22155412#22155412?
                # newreg=01f0ed80771f4dfaa269b15268b3f9a9
                l = urlparse.parse_qs(
                    urlparse.urlparse(a['href']).query)['q'][0]
                if l.startswith('http') or l.startswith('www'):
                    if "webcache.googleusercontent.com" not in l:
                        self.urlList.append(l)
            except:
                pass
        self.Counter += 10
    # now download the required files
    try:
        for url in self.urlList:
            if self.verbose:
                p = ' [*] Google DOCX search downloading: ' + str(url)
                print helpers.color(p, firewall=True)
            try:
                filetype = ".docx"
                FileName, FileDownload = dl.download_file(url, filetype)
                if FileDownload:
                    if self.verbose:
                        p = ' [*] Google DOCX file was downloaded: ' + \
                            str(url)
                        print helpers.color(p, firewall=True)
                    self.Text += convert.convert_docx_to_txt(FileName)
                # print self.Text
            except Exception as e:
                print helpers.color(" [!] Issue with converting DOCX files\n", firewall=True)
            try:
                dl.delete_file(FileName)
            except Exception as e:
                print e
    except:
        print helpers.color(" [*] No DOCX's to download from Google!\n", firewall=True)
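# A minimal, standalone sketch of the link-extraction trick used in the Google
# modules above and below (per the Stack Overflow reference in their comments):
# Google result anchors point at "/url?q=<target>&...", so the real destination
# is recovered from the "q" query parameter. The sample href and the helper
# name extract_google_target are illustrative only, not part of these modules.
import urlparse

def extract_google_target(href):
    # Return the destination URL from a Google "/url?q=..." redirect,
    # or None when the anchor carries no "q" parameter (nav/cache links).
    try:
        return urlparse.parse_qs(urlparse.urlparse(href).query)['q'][0]
    except (KeyError, IndexError):
        return None

if __name__ == '__main__':
    sample = '/url?q=http://example.com/report.docx&sa=U'
    print extract_google_target(sample)  # -> http://example.com/report.docx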
def test_converter():
    # test the converter for all supported formats
    p = os.path.dirname(os.path.realpath('.')) + '/SimplyEmail/tests/'
    c = Converter.Converter(verbose=True)
    print p
    text = c.convert_docx_to_txt(p + 'Test-DOCX.docx')
    assert text
    assert 'How to Design and Test' in text
    text = c.convert_doc_to_txt(p + 'Test-DOC.doc')
    assert text
    assert 'How to Design and Test' in text
    text = c.convert_pdf_to_txt(p + 'Test-PDF.pdf')
    assert text
    assert 'How to Design and Test' in text
    text = c.convert_zip_to_text(p + 'Test-PPTX.pptx')
    assert text
    assert 'Test SLIDE' in text
    assert 'Test SLIDE 2' in text
    assert 'Test SLIDE 3' in text
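# A possible companion test for the XLSX path exercised by the Google XLSX
# module below -- a sketch only: it assumes a 'Test-XLSX.xlsx' fixture exists
# in the tests directory and that Converter exposes convert_Xlsx_to_Csv (the
# method that module calls); neither is confirmed by the tests above.
def test_converter_xlsx():
    p = os.path.dirname(os.path.realpath('.')) + '/SimplyEmail/tests/'
    c = Converter.Converter(verbose=True)
    text = c.convert_Xlsx_to_Csv(p + 'Test-XLSX.xlsx')  # hypothetical fixture
    assert text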
def search(self):
    convert = Converter.Converter(self.verbose)
    dl = Download.Download(self.verbose)
    while self.Counter <= self.Limit and self.Counter <= 100:
        time.sleep(1)
        if self.verbose:
            p = ' [*] Google PPTX Search on page: ' + str(self.Counter)
            print helpers.color(p, firewall=True)
        try:
            url = "https://www.google.com/search?q=" + \
                self.Domain + "+filetype:pptx&start=" + str(self.Counter)
        except Exception as e:
            error = " [!] Major issue with Google Search:" + str(e)
            print helpers.color(error, warning=True)
        try:
            RawHtml = dl.requesturl(url, useragent=self.UserAgent)
        except Exception as e:
            error = " [!] Fail during Request to Google (Check Connection):" + \
                str(e)
            print helpers.color(error, warning=True)
        # check for captcha
        try:
            # Url = r.url
            dl.GoogleCaptchaDetection(RawHtml)
        except Exception as e:
            print e
        soup = BeautifulSoup(RawHtml)
        # parse the results for URLs to follow
        for a in soup.findAll('a'):
            try:
                # https://stackoverflow.com/questions/21934004/not-getting-proper-links-
                # from-google-search-results-using-mechanize-and-beautifu/22155412#22155412?
                # newreg=01f0ed80771f4dfaa269b15268b3f9a9
                l = urlparse.parse_qs(urlparse.urlparse(
                    a['href']).query)['q'][0]
                if l.startswith('http') or l.startswith(
                        'www') or l.startswith('https'):
                    if "webcache.googleusercontent.com" not in l:
                        self.urlList.append(l)
                # for some reason PPTX results often point at cached data:
                l = urlparse.parse_qs(urlparse.urlparse(
                    a['href']).query)['q'][0]
                l = l.split(':', 2)
                if "webcache.googleusercontent.com" not in l[2]:
                    self.urlList.append(l[2])
            except:
                pass
        self.Counter += 10
    # now download the required files
    try:
        for url in self.urlList:
            if self.verbose:
                p = ' [*] Google PPTX search downloading: ' + str(url)
                print helpers.color(p, firewall=True)
            try:
                filetype = ".pptx"
                FileName, FileDownload = dl.download_file2(url, filetype)
                if FileDownload:
                    if self.verbose:
                        p = ' [*] Google PPTX file was downloaded: ' + \
                            str(url)
                        print helpers.color(p, firewall=True)
                    ft = helpers.filetype(FileName).lower()
                    if 'powerpoint' in ft:
                        self.Text += convert.convert_pptx_to_txt(FileName)
                    else:
                        self.logger.warning(
                            'Downloaded file is not a PPTX: ' + ft)
                # print self.Text
            except Exception as e:
                print helpers.color(" [!] Issue with opening PPTX Files\n", firewall=True)
            try:
                dl.delete_file(FileName)
            except Exception as e:
                print e
    except:
        print helpers.color(" [*] No PPTX to download from Google!\n", firewall=True)
def search(self):
    dl = Download.Download(self.verbose)
    convert = Converter.Converter(verbose=self.verbose)
    while self.Counter <= self.Limit and self.Counter <= 10:
        helpers.modsleep(1)
        if self.verbose:
            p = ' [*] Exalead PPTX Search on page: ' + str(self.Counter)
            self.logger.info('ExaleadPPTXSearch on page: ' + str(self.Counter))
            print helpers.color(p, firewall=True)
        try:
            url = 'http://www.exalead.com/search/web/results/?q="%40' + self.Domain + \
                '"+filetype:pptx&elements_per_page=' + \
                str(self.Quanity) + '&start_index=' + str(self.Counter)
        except Exception as e:
            self.logger.error('ExaleadPPTXSearch could not build URL')
            error = " [!] Major issue with Exalead PPTX Search: " + str(e)
            print helpers.color(error, warning=True)
        try:
            RawHtml = dl.requesturl(url, useragent=self.UserAgent)
            # sometimes the url is broken but exalead search results contain
            # e-mail
            self.Text += RawHtml
            soup = BeautifulSoup(RawHtml, "lxml")
            self.urlList = [
                h2.a["href"]
                for h2 in soup.findAll('h4', class_='media-heading')
            ]
        except Exception as e:
            self.logger.error(
                'ExaleadPPTXSearch could not request / parse HTML')
            error = " [!] Fail during parsing result: " + str(e)
            print helpers.color(error, warning=True)
        self.Counter += 30
    # now download the required files
    try:
        for url in self.urlList:
            if self.verbose:
                p = ' [*] Exalead PPTX search downloading: ' + str(url)
                self.logger.info('ExaleadPPTXSearch downloading: ' + str(url))
                print helpers.color(p, firewall=True)
            try:
                filetype = ".pptx"
                dl = Download.Download(self.verbose)
                FileName, FileDownload = dl.download_file(url, filetype)
                if FileDownload:
                    if self.verbose:
                        p = ' [*] Exalead PPTX file was downloaded: ' + \
                            str(url)
                        self.logger.info('ExaleadPPTXSearch downloaded: ' + str(p))
                        print helpers.color(p, firewall=True)
                    ft = helpers.filetype(FileName).lower()
                    if 'powerpoint' in ft:
                        self.Text += convert.convert_zip_to_text(FileName)
                    else:
                        self.logger.warning(
                            'Downloaded file is not a PPTX: ' + ft)
            except Exception as e:
                error = " [!] Issue with opening PPTX Files: %s" % (str(e))
                print helpers.color(error, warning=True)
            try:
                dl.delete_file(FileName)
            except Exception as e:
                print e
    except Exception as e:
        self.logger.error("ExaleadPPTXSearch no PPTX's to download")
        print helpers.color(" [*] No PPTX's to download from Exalead!\n", firewall=True)
    if self.verbose:
        p = ' [*] Searching PPTX from Exalead Complete'
        print helpers.color(p, status=True)
def search(self):
    convert = Converter.Converter(verbose=self.verbose)
    while self.Counter <= self.Limit and self.Counter <= 10:
        time.sleep(1)
        if self.verbose:
            p = ' [*] Exalead Search on page: ' + str(self.Counter)
            print helpers.color(p, firewall=True)
        try:
            url = 'http://www.exalead.com/search/web/results/?q="%40' + self.Domain + \
                '"+filetype:pdf&elements_per_page=' + \
                str(self.Quanity) + '&start_index=' + str(self.Counter)
        except Exception as e:
            error = " [!] Major issue with Exalead PDF Search: " + str(e)
            print helpers.color(error, warning=True)
        try:
            r = requests.get(url, headers=self.UserAgent)
        except Exception as e:
            error = " [!] Fail during Request to Exalead (Check Connection):" + str(e)
            print helpers.color(error, warning=True)
        try:
            RawHtml = r.content
            # sometimes the url is broken but exalead search results contain
            # e-mail
            self.Text += RawHtml
            soup = BeautifulSoup(RawHtml, "lxml")
            self.urlList = [h2.a["href"]
                            for h2 in soup.findAll('h4', class_='media-heading')]
        except Exception as e:
            error = " [!] Fail during parsing result: " + str(e)
            print helpers.color(error, warning=True)
        self.Counter += 30
    # now download the required files
    try:
        for url in self.urlList:
            if self.verbose:
                p = ' [*] Exalead PDF search downloading: ' + str(url)
                print helpers.color(p, firewall=True)
            try:
                filetype = ".pdf"
                dl = Download.Download(self.verbose)
                FileName, FileDownload = dl.download_file(url, filetype)
                if FileDownload:
                    if self.verbose:
                        p = ' [*] Exalead PDF file was downloaded: ' + \
                            str(url)
                        print helpers.color(p, firewall=True)
                    self.Text += convert.convert_pdf_to_txt(FileName)
            except Exception as e:
                pass
            try:
                dl.delete_file(FileName)
            except Exception as e:
                print e
    except:
        print helpers.color(" [*] No PDF's to download from Exalead!\n", firewall=True)
    if self.verbose:
        p = ' [*] Searching PDF from Exalead Complete'
        print helpers.color(p, status=True)
def search(self):
    convert = Converter.Converter(verbose=self.verbose)
    while self.Counter <= self.Limit:
        time.sleep(1)
        if self.verbose:
            p = ' [*] Exalead Search on page: ' + str(self.Counter)
            self.logger.info("ExaleadDOCXSearch on page: " + str(self.Counter))
            print helpers.color(p, firewall=True)
        try:
            url = 'http://www.exalead.com/search/web/results/?q="%40' + self.Domain + \
                '"+filetype:docx&elements_per_page=' + \
                str(self.Quanity) + '&start_index=' + str(self.Counter)
        except Exception as e:
            self.logger.error("Issue building URL to search")
            error = " [!] Major issue with Exalead DOCX Search: " + str(e)
            print helpers.color(error, warning=True)
        try:
            r = requests.get(url, headers=self.UserAgent)
        except Exception as e:
            error = " [!] Fail during Request to Exalead (Check Connection):" + str(e)
            print helpers.color(error, warning=True)
        try:
            RawHtml = r.content
            # sometimes the url is broken but exalead search results contain
            # e-mail
            self.Text += RawHtml
            soup = BeautifulSoup(RawHtml, "lxml")
            self.urlList = [
                h2.a["href"]
                for h2 in soup.findAll('h4', class_='media-heading')
            ]
        except Exception as e:
            self.logger.error("Fail during parsing result: " + str(e))
            error = " [!] Fail during parsing result: " + str(e)
            print helpers.color(error, warning=True)
        self.Counter += 30
    # now download the required files
    try:
        for url in self.urlList:
            if self.verbose:
                p = ' [*] Exalead DOCX search downloading: ' + str(url)
                self.logger.info("Starting download of DOCX: " + str(url))
                print helpers.color(p, firewall=True)
            try:
                filetype = ".docx"
                dl = Download.Download(self.verbose)
                FileName, FileDownload = dl.download_file(url, filetype)
                if FileDownload:
                    if self.verbose:
                        self.logger.info("File was downloaded: " + str(url))
                        p = ' [*] Exalead DOCX file was downloaded: ' + \
                            str(url)
                        print helpers.color(p, firewall=True)
                    self.Text += convert.convert_docx_to_txt(FileName)
            except Exception as e:
                self.logger.error("Issue with opening DOCX Files: " + str(e))
                error = " [!] Issue with opening DOCX Files: %s\n" % (str(e))
                print helpers.color(error, warning=True)
            try:
                dl.delete_file(FileName)
            except Exception as e:
                print e
    except Exception as e:
        p = " [*] No DOCX's to download from Exalead: " + str(e)
        self.logger.info("No DOCX's to download from Exalead: " + str(e))
        print helpers.color(p, firewall=True)
    if self.verbose:
        p = ' [*] Searching DOCX from Exalead Complete'
        self.logger.info("Searching DOCX from Exalead Complete")
        print helpers.color(p, status=True)
def search(self):
    # set up the helpers from the download and converter classes
    convert = Converter.Converter(verbose=self.verbose)
    dl = Download.Download(self.verbose)
    while self.Counter <= self.Limit and self.Counter <= 100:
        time.sleep(1)
        if self.verbose:
            p = ' [*] Google PDF Search on page: ' + str(self.Counter)
            print helpers.color(p, firewall=True)
        try:
            urly = "https://www.google.com/search?q=site:" + \
                self.Domain + "+filetype:pdf&start=" + str(self.Counter)
        except Exception as e:
            error = " [!] Major issue with Google Search:" + str(e)
            print helpers.color(error, warning=True)
        try:
            r = requests.get(urly)
        except Exception as e:
            error = " [!] Fail during Request to Google (Check Connection):" + \
                str(e)
            print helpers.color(error, warning=True)
        RawHtml = r.content
        # get redirect URL
        # Url = r.url
        dl.GoogleCaptchaDetection(RawHtml)
        soup = BeautifulSoup(RawHtml)
        for a in soup.findAll('a'):
            try:
                # https://stackoverflow.com/questions/21934004/not-getting-proper-links-
                # from-google-search-results-using-mechanize-and-beautifu/22155412#22155412?
                # newreg=01f0ed80771f4dfaa269b15268b3f9a9
                l = urlparse.parse_qs(
                    urlparse.urlparse(a['href']).query)['q'][0]
                if l.startswith('http') or l.startswith('www'):
                    if "webcache.googleusercontent.com" not in l:
                        self.urlList.append(l)
            except:
                pass
        self.Counter += 10
    # now download the required files
    try:
        for url in self.urlList:
            if self.verbose:
                p = ' [*] Google PDF search downloading: ' + str(url)
                print helpers.color(p, firewall=True)
            try:
                filetype = ".pdf"
                # use the download helper class to fetch the file
                FileName, FileDownload = dl.download_file(url, filetype)
                # check if the file was downloaded
                if FileDownload:
                    if self.verbose:
                        p = ' [*] Google PDF file was downloaded: ' + \
                            str(url)
                        print helpers.color(p, firewall=True)
                    self.Text += convert.convert_pdf_to_txt(FileName)
            except Exception as e:
                print e
            try:
                # now remove any files left behind
                dl.delete_file(FileName)
            except Exception as e:
                print e
    except:
        print helpers.color(" [*] No PDF's to download from Google!\n", firewall=True)
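# The download -> convert -> delete sequence repeated in each module above
# could be factored into a single helper. This is a sketch only, assuming
# nothing beyond the calls these modules already make (dl.download_file,
# dl.delete_file, helpers.color and the convert_*_to_txt methods); the helper
# name _fetch_and_convert is hypothetical.
def _fetch_and_convert(dl, convert_func, url, filetype, verbose=False):
    FileName = None
    text = ''
    try:
        FileName, FileDownload = dl.download_file(url, filetype)
        if FileDownload:
            if verbose:
                print helpers.color(' [*] Downloaded: ' + str(url), firewall=True)
            text = convert_func(FileName)
    except Exception as e:
        print helpers.color(' [!] Issue downloading/converting: ' + str(e), warning=True)
    finally:
        # always clean up the temporary file, even on failure
        if FileName:
            try:
                dl.delete_file(FileName)
            except Exception as e:
                print e
    return text

# Hypothetical usage inside a module:
#   self.Text += _fetch_and_convert(dl, convert.convert_pdf_to_txt, url, '.pdf', self.verbose)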
def search(self):
    convert = Converter.Converter(verbose=self.verbose)
    while self.Counter <= self.Limit and self.Counter <= 100:
        time.sleep(1)
        if self.verbose:
            p = " [*] Google XLSX Search on page: " + str(self.Counter)
            self.logger.info("Google XLSX Search on page: " + str(self.Counter))
            print helpers.color(p, firewall=True)
        try:
            urly = ("https://www.google.com/search?q=site:" + self.Domain +
                    "+filetype:xlsx&start=" + str(self.Counter))
        except Exception as e:
            error = " [!] Major issue with Google XLSX Search:" + str(e)
            self.logger.error("GoogleXlsxSearch failed to build url: " + str(e))
            print helpers.color(error, warning=True)
        try:
            r = requests.get(urly)
        except Exception as e:
            error = " [!] Fail during Request to Google (Check Connection):" + str(e)
            self.logger.error(
                "GoogleXlsxSearch failed to request url (Check Connection): " + str(e))
            print helpers.color(error, warning=True)
        RawHtml = r.content
        soup = BeautifulSoup(RawHtml)
        # parse the results for URLs to follow
        for a in soup.findAll("a"):
            try:
                # https://stackoverflow.com/questions/21934004/not-getting-proper-links-
                # from-google-search-results-using-mechanize-and-beautifu/22155412#22155412?
                # newreg=01f0ed80771f4dfaa269b15268b3f9a9
                l = urlparse.parse_qs(urlparse.urlparse(
                    a["href"]).query)["q"][0]
                if l.startswith("http") or l.startswith("www"):
                    if "webcache.googleusercontent.com" not in l:
                        self.urlList.append(l)
            except:
                pass
        self.Counter += 10
        helpers.modsleep(self.Sleep, jitter=self.Jitter)
    # now download the required files
    self.logger.debug(
        "GoogleXlsxSearch completed HTML result query, starting downloads")
    try:
        for url in self.urlList:
            if self.verbose:
                p = " [*] Google XLSX search downloading: " + str(url)
                self.logger.info("Google XLSX search downloading: " + str(url))
                print helpers.color(p, firewall=True)
            try:
                filetype = ".xlsx"
                dl = Download.Download(self.verbose)
                FileName, FileDownload = dl.download_file(url, filetype)
                if FileDownload:
                    if self.verbose:
                        p = " [*] Google XLSX file was downloaded: " + str(url)
                        self.logger.info(
                            "Google XLSX file was downloaded: " + str(url))
                        print helpers.color(p, firewall=True)
                    self.Text += convert.convert_Xlsx_to_Csv(FileName)
                # print self.Text
            except Exception as e:
                print helpers.color(" [!] Issue with opening XLSX Files\n", firewall=True)
                self.logger.error("Google XLSX had issue opening file")
            try:
                dl.delete_file(FileName)
            except Exception as e:
                self.logger.error("Google XLSX failed to delete file: " + str(e))
    except Exception as e:
        print helpers.color(" [*] No XLSX's to download from Google!\n", firewall=True)
        self.logger.error("No XLSX's to download from Google! " + str(e))