Пример #1
0
 def search(self):
     """Collect GitHub user-search result pages into ``self.Html``.

     Requests result pages starting at ``self.Counter`` while the counter
     is within ``self.Depth`` (and never past 100), calling
     ``helpers.modsleep(5)`` before each request.

     A page whose URL cannot be built or whose request fails is reported
     and skipped.  Previously execution fell through to ``r.content``,
     which crashed on an unbound ``r`` for the first page or silently
     re-appended the previous page's body on later ones.
     """
     dl = Download.Download(verbose=self.verbose)
     while self.Counter <= self.Depth and self.Counter <= 100:
         helpers.modsleep(5)
         page = self.Counter
         # Advance up front so every early ``continue`` below still makes
         # progress and the loop cannot spin on a failing page.
         self.Counter += 1
         if self.verbose:
             p = ' [*] GitHubUser Search on page: ' + str(page)
             print(helpers.color(p, firewall=True))
         try:
             # The '&' before 'ref=' keeps the domain from being glued to
             # the following parameter inside the 'q' value.
             url = 'https://github.com/search?p=' + str(page) + '&q=' + \
                 str(self.domain) + '&ref=searchresults&type=Users&utf8='
         except Exception as e:
             error = " [!] Major issue with GitHubUser Search:" + str(e)
             print(helpers.color(error, warning=True))
             continue
         try:
             r = dl.requesturl(url,
                               useragent=self.UserAgent,
                               raw=True,
                               timeout=10)
         except Exception as e:
             error = " [!] Fail during Request to GitHubUser (Check Connection):" + \
                 str(e)
             print(helpers.color(error, warning=True))
             continue
         self.Html += r.content
Пример #2
0
 def search(self):
     """Fetch Google result pages for ``"@<Domain>"`` into ``self.Html``.

     Loops while ``self.Counter`` is within ``self.Limit`` (and at most
     1000), advancing the ``start`` offset by 100 per request and sleeping
     via ``helpers.modsleep(self.Sleep, jitter=self.Jitter)`` between
     pages.

     When building the URL or performing the request fails, the error is
     printed and the page is skipped.  Previously the code went on to use
     ``results`` anyway, which was unbound on the first page (crash) or
     still held the previous page (duplicate data).
     """
     dl = Download.Download(self.verbose)
     while self.Counter <= self.Limit and self.Counter <= 1000:
         time.sleep(1)
         if self.verbose:
             p = " [*] Google Search on page: " + str(self.Counter)
             print(helpers.color(p, firewall=True))
         # Reset per page so a failure can never reuse an earlier value.
         url = None
         results = None
         try:
             url = (
                 "http://www.google.com/search?num="
                 + str(self.Quanity)
                 + "&start="
                 + str(self.Counter)
                 + '&hl=en&meta=&q=%40"'
                 + self.Domain
                 + '"'
             )
         except Exception as e:
             error = " [!] Major issue with Google Search:" + str(e)
             print(helpers.color(error, warning=True))
         if url is not None:
             try:
                 results = dl.requesturl(url, useragent=self.UserAgent)
             except Exception as e:
                 error = " [!] Fail during Request to Google (Check Connection):" + str(e)
                 print(helpers.color(error, warning=True))
         if results is not None:
             try:
                 dl.GoogleCaptchaDetection(results)
             except Exception as e:
                 print(e)
             self.Html += results
         # Always advance and pause, even after a failed page.
         self.Counter += 100
         helpers.modsleep(self.Sleep, jitter=self.Jitter)
Пример #3
0
 def search(self):
     """Accumulate Google search result HTML for ``"@<Domain>"`` in ``self.Html``.

     One request is made per loop pass while ``self.Counter`` stays within
     ``self.Limit`` and 1000; the counter (used as Google's ``start``
     offset) grows by 100 each pass and the loop pauses with
     ``helpers.modsleep(self.Sleep, jitter=self.Jitter)``.

     A failed URL build or request is reported and that page contributes
     nothing.  The earlier version used ``results`` regardless, raising on
     an unbound name for the first page or appending the previous page a
     second time.
     """
     dl = Download.Download(self.verbose)
     while self.Counter <= self.Limit and self.Counter <= 1000:
         time.sleep(1)
         if self.verbose:
             p = ' [*] Google Search on page: ' + str(self.Counter)
             print(helpers.color(p, firewall=True))
         # Fresh placeholders each pass: nothing stale survives a failure.
         url = None
         results = None
         try:
             url = "http://www.google.com/search?num=" + str(self.Quanity) + "&start=" + \
                 str(self.Counter) + "&hl=en&meta=&q=%40\"" + \
                 self.Domain + "\""
         except Exception as e:
             error = " [!] Major issue with Google Search:" + str(e)
             print(helpers.color(error, warning=True))
         if url is not None:
             try:
                 results = dl.requesturl(url, useragent=self.UserAgent)
             except Exception as e:
                 error = " [!] Fail during Request to Google (Check Connection):" + \
                     str(e)
                 print(helpers.color(error, warning=True))
         if results is not None:
             try:
                 dl.GoogleCaptchaDetection(results)
             except Exception as e:
                 print(e)
             self.Html += results
         # The offset and the pause apply to failed pages as well.
         self.Counter += 100
         helpers.modsleep(self.Sleep, jitter=self.Jitter)
Пример #4
0
 def process(self):
     """Fetch Ask.com result pages for ``@<Domain>`` into ``self.Html``.

     Requests one page per loop pass while ``self.Counter`` is within
     ``self.PageLimit``, then sleeps via
     ``helpers.modsleep(self.Sleep, jitter=self.Jitter)``.

     Failures while building the URL or requesting it are printed and
     logged, and the page is skipped.  Previously ``rawhtml`` was appended
     regardless, which raised on an unbound name for the first page or
     duplicated the previous page's HTML.
     """
     dl = Download.Download(self.verbose)
     while self.Counter <= self.PageLimit:
         if self.verbose:
             p = ' [*] AskSearch on page: ' + str(self.Counter)
             print(helpers.color(p, firewall=True))
             self.logger.info('AskSearch on page: ' + str(self.Counter))
         # Reset per page so a failure cannot reuse an earlier value.
         url = None
         rawhtml = None
         try:
             url = 'http://www.ask.com/web?q=@' + str(self.Domain) + \
                 '&pu=10&page=' + str(self.Counter)
         except Exception as e:
             error = " [!] Major issue with Ask Search:" + str(e)
             self.logger.error('Major issue with Ask Search: ' + str(e))
             print(helpers.color(error, warning=True))
         if url is not None:
             try:
                 rawhtml = dl.requesturl(url, useragent=self.UserAgent)
             except Exception as e:
                 error = " [!] Fail during Request to Ask (Check Connection):" + \
                     str(e)
                 self.logger.error(
                     'Fail during Request to Ask (Check Connection): ' + str(e))
                 print(helpers.color(error, warning=True))
         if rawhtml is not None:
             self.Html += rawhtml
         self.Counter += 1
         helpers.modsleep(self.Sleep, jitter=self.Jitter)
Пример #5
0
 def search(self):
     """Append GitHub user-search result pages to ``self.Html``.

     Iterates result pages from ``self.Counter`` while it is within
     ``self.Depth`` and 100, with a ``helpers.modsleep(5)`` pause ahead of
     every request.

     Pages that cannot be requested are reported and skipped; the old
     code read ``r.content`` unconditionally, so a failed first request
     raised on the unbound ``r`` and a later failure appended the
     previous response again.
     """
     dl = Download.Download(verbose=self.verbose)
     while self.Counter <= self.Depth and self.Counter <= 100:
         helpers.modsleep(5)
         if self.verbose:
             p = ' [*] GitHubUser Search on page: ' + str(self.Counter)
             print(helpers.color(p, firewall=True))
         r = None
         try:
             # '&ref=' (with the ampersand) so the domain is not fused
             # with the next query parameter.
             url = 'https://github.com/search?p=' + str(self.Counter) + '&q=' + \
                 str(self.domain) + '&ref=searchresults&type=Users&utf8='
         except Exception as e:
             url = None
             error = " [!] Major issue with GitHubUser Search:" + str(e)
             print(helpers.color(error, warning=True))
         if url is not None:
             try:
                 r = dl.requesturl(
                     url, useragent=self.UserAgent, raw=True, timeout=10)
             except Exception as e:
                 error = " [!] Fail during Request to GitHubUser (Check Connection):" + \
                     str(e)
                 print(helpers.color(error, warning=True))
         if r is not None:
             self.Html += r.content
         # Move on even after a failure so the loop always terminates.
         self.Counter += 1
Пример #6
0
 def search(self):
     """Find ``.doc`` files for ``self.Domain`` via Google and harvest their text.

     Phase one pages through Google results (``start`` advancing by 10
     while ``self.Counter`` is within ``self.Limit`` and 100) and collects
     result links into ``self.urlList``.  Phase two downloads each
     collected URL, appends the converted text to ``self.Text`` and
     deletes the downloaded file.

     Fixes over the previous version:
     * a failed request no longer falls through to an unbound or stale
       response object; the page is skipped instead;
     * ``FileName`` is reset for every URL so cleanup never touches a name
       left over from an earlier iteration;
     * bare ``except:`` clauses became ``except Exception`` so Ctrl-C and
       interpreter exit are not swallowed.
     """
     dl = Download.Download(self.verbose)
     convert = Converter.Converter(verbose=self.verbose)
     while self.Counter <= self.Limit and self.Counter <= 100:
         time.sleep(1)
         if self.verbose:
             p = ' [*] Google DOC Search on page: ' + str(self.Counter)
             print(helpers.color(p, firewall=True))
         urly = None
         RawHtml = None
         try:
             urly = "https://www.google.com/search?q=site:" + \
                 self.Domain + "+filetype:doc&start=" + str(self.Counter)
         except Exception as e:
             error = " [!] Major issue with Google Search:" + str(e)
             print(helpers.color(error, warning=True))
         if urly is not None:
             try:
                 RawHtml = requests.get(urly).content
             except Exception as e:
                 error = " [!] Fail during Request to Google (Check Connection):" + \
                     str(e)
                 print(helpers.color(error, warning=True))
         if RawHtml is not None:
             # check for captcha
             try:
                 dl.GoogleCaptchaDetection(RawHtml)
             except Exception as e:
                 print(e)
             soup = BeautifulSoup(RawHtml)
             # Google wraps result targets in a redirect whose 'q' query
             # parameter carries the real URL, see
             # https://stackoverflow.com/questions/21934004/
             for a in soup.findAll('a'):
                 try:
                     link = urlparse.parse_qs(
                         urlparse.urlparse(a['href']).query)['q'][0]
                 except Exception:
                     # anchor without href / 'q' parameter: not a result
                     continue
                 if link.startswith(('http', 'www')) and \
                         "webcache.googleusercontent.com" not in link:
                     self.urlList.append(link)
         self.Counter += 10
         helpers.modsleep(self.Sleep, jitter=self.Jitter)
     # now download the required files
     try:
         for url in self.urlList:
             if self.verbose:
                 p = ' [*] Google DOC search downloading: ' + str(url)
                 print(helpers.color(p, firewall=True))
             # Reset per URL: a failed download must not leave a stale
             # (or unbound) name for the cleanup step.
             FileName = None
             try:
                 filetype = ".doc"
                 FileName, FileDownload = dl.download_file(url, filetype)
                 if FileDownload:
                     if self.verbose:
                         p = ' [*] Google DOC file was downloaded: ' + \
                             str(url)
                         print(helpers.color(p, firewall=True))
                     self.Text += convert.convert_doc_to_txt(FileName)
             except Exception:
                 print(helpers.color(" [!] Issue with opening Doc Files\n",
                                     firewall=True))
             if FileName:
                 try:
                     dl.delete_file(FileName)
                 except Exception as e:
                     print(e)
     except Exception:
         print(helpers.color(" [*] No DOC's to download from Google!\n",
                             firewall=True))
Пример #7
0
 def search(self):
     """Find ``.csv`` files for ``self.Domain`` via Google and read their text.

     First pages through Google results (``start`` grows by 10 while
     ``self.Counter`` is within ``self.Limit`` and 100), gathering result
     links in ``self.urlList``.  Then downloads each link with
     ``dl.download_file2``, appends the file's contents to ``self.Text``
     and deletes the file.

     Fixes over the previous version:
     * a failed request no longer leads to parsing an unbound or stale
       ``RawHtml``; the page is skipped;
     * ``FileName`` is reset for every URL before cleanup;
     * bare ``except:`` clauses became ``except Exception`` so Ctrl-C and
       interpreter exit are not swallowed.
     """
     dl = Download.Download(self.verbose)
     while self.Counter <= self.Limit and self.Counter <= 100:
         time.sleep(1)
         if self.verbose:
             p = ' [*] Google CSV Search on page: ' + str(self.Counter)
             print(helpers.color(p, firewall=True))
         url = None
         RawHtml = None
         try:
             url = "https://www.google.com/search?q=site:" + \
                 self.Domain + "+filetype:csv&start=" + str(self.Counter)
         except Exception as e:
             error = " [!] Major issue with Google Search:" + str(e)
             print(helpers.color(error, warning=True))
         if url is not None:
             try:
                 RawHtml = dl.requesturl(url, useragent=self.UserAgent)
             except Exception as e:
                 error = " [!] Fail during Request to Google (Check Connection):" + \
                     str(e)
                 print(helpers.color(error, warning=True))
         if RawHtml is not None:
             # check for captcha
             try:
                 dl.GoogleCaptchaDetection(RawHtml)
             except Exception as e:
                 print(e)
             soup = BeautifulSoup(RawHtml)
             # Result anchors carry the real target in the 'q' query
             # parameter, see https://stackoverflow.com/questions/21934004/
             for a in soup.findAll('a'):
                 try:
                     link = urlparse.parse_qs(
                         urlparse.urlparse(a['href']).query)['q'][0]
                 except Exception:
                     # anchor without href / 'q' parameter: not a result
                     continue
                 if link.startswith(('http', 'www')) and \
                         "webcache.googleusercontent.com" not in link:
                     self.urlList.append(link)
         self.Counter += 10
         helpers.modsleep(self.Sleep, jitter=self.Jitter)
     # now download the required files
     try:
         for url in self.urlList:
             if self.verbose:
                 p = ' [*] Google CSV search downloading: ' + str(url)
                 print(helpers.color(p, firewall=True))
             # Reset per URL so cleanup never sees a leftover name.
             FileName = None
             try:
                 filetype = ".csv"
                 FileName, FileDownload = dl.download_file2(url, filetype)
                 if FileDownload:
                     if self.verbose:
                         p = '[*] Google CSV file was downloaded: ' + \
                             str(url)
                         print(helpers.color(p, firewall=True))
                     with open(FileName) as f:
                         self.Text += f.read()
             except Exception:
                 print(helpers.color(" [!] Issue with opening CSV Files\n", firewall=True))
             if FileName:
                 try:
                     dl.delete_file(FileName)
                 except Exception as e:
                     print(e)
     except Exception:
         print(helpers.color(" [*] No CSV to download from Google!\n", firewall=True))
    def search(self):
        """Search Exalead for ``.pptx`` files mentioning ``@<Domain>`` and harvest text.

        Phase one requests Exalead result pages (while ``self.Counter`` is
        within ``self.Limit`` and 10, stepping by 30), appends the raw
        result HTML to ``self.Text`` and collects the links found under
        ``h4.media-heading`` into ``self.urlList``.  Phase two downloads
        each link, converts files identified as PowerPoint to text
        appended to ``self.Text`` and deletes the downloaded file.

        Fixes over the previous version:
        * links are accumulated across pages instead of ``self.urlList``
          being overwritten by every page;
        * headings without a link are skipped instead of aborting the
          parsing of the whole page;
        * a failed URL build no longer re-requests a stale or unbound URL;
        * the download log line named the wrong module
          ("ExaleadDOCSearch") and logged the console message instead of
          the URL;
        * the existing ``Download`` helper is reused rather than rebuilt
          for every file, and ``FileName`` is reset before cleanup.
        """
        dl = Download.Download(self.verbose)
        convert = Converter.Converter(verbose=self.verbose)
        while self.Counter <= self.Limit and self.Counter <= 10:
            helpers.modsleep(1)
            if self.verbose:
                p = ' [*] Exalead PPTX Search on page: ' + str(self.Counter)
                self.logger.info('ExaleadPPTXSearch on page: ' + str(self.Counter))
                print(helpers.color(p, firewall=True))
            url = None
            try:
                url = 'http://www.exalead.com/search/web/results/?q="%40' + self.Domain + \
                      '"+filetype:pptx&elements_per_page=' + \
                    str(self.Quanity) + '&start_index=' + str(self.Counter)
            except Exception as e:
                self.logger.error('ExaleadPPTXSearch could not build URL')
                error = " [!] Major issue with Exalead PPTX Search: " + str(e)
                print(helpers.color(error, warning=True))
            if url is not None:
                try:
                    RawHtml = dl.requesturl(url, useragent=self.UserAgent)
                    # sometimes url is broken but exalead search results
                    # contain e-mail
                    self.Text += RawHtml
                    soup = BeautifulSoup(RawHtml, "lxml")
                    for heading in soup.findAll('h4', class_='media-heading'):
                        anchor = heading.a
                        if anchor is not None and anchor.get("href"):
                            self.urlList.append(anchor["href"])
                except Exception as e:
                    self.logger.error('ExaleadPPTXSearch could not request / parse HTML')
                    error = " [!] Fail during parsing result: " + str(e)
                    print(helpers.color(error, warning=True))
            self.Counter += 30

        # now download the required files
        try:
            for url in self.urlList:
                if self.verbose:
                    p = ' [*] Exalead PPTX search downloading: ' + str(url)
                    self.logger.info('ExaleadPPTXSearch downloading: ' + str(url))
                    print(helpers.color(p, firewall=True))
                # Reset per URL so cleanup never sees a leftover name.
                FileName = None
                try:
                    filetype = ".pptx"
                    FileName, FileDownload = dl.download_file(url, filetype)
                    if FileDownload:
                        if self.verbose:
                            p = ' [*] Exalead PPTX file was downloaded: ' + \
                                str(url)
                            self.logger.info('ExaleadPPTXSearch downloaded: ' + str(url))
                            print(helpers.color(p, firewall=True))
                        ft = helpers.filetype(FileName).lower()
                        if 'powerpoint' in ft:
                            self.Text += convert.convert_zip_to_text(FileName)
                        else:
                            self.logger.warning('Downloaded file is not a PPTX: ' + ft)
                except Exception as e:
                    error = " [!] Issue with opening PPTX Files:%s" % (str(e))
                    print(helpers.color(error, warning=True))
                if FileName:
                    try:
                        dl.delete_file(FileName)
                    except Exception as e:
                        print(e)
        except Exception:
            self.logger.error("ExaleadPPTXSearch no doc's to download")
            print(helpers.color(" [*] No PPTX's to download from Exalead!\n", firewall=True))

        if self.verbose:
            p = ' [*] Searching PPTX from Exalead Complete'
            print(helpers.color(p, status=True))
Пример #9
0
    def search(self):
        """Harvest text from ``.pptx`` files Exalead lists for ``@<Domain>``.

        The first loop queries Exalead (while ``self.Counter`` is within
        ``self.Limit`` and 10, in steps of 30), adds the raw result HTML
        to ``self.Text`` and stores each ``h4.media-heading`` link in
        ``self.urlList``.  The second loop downloads each stored link,
        converts files reported as PowerPoint into text appended to
        ``self.Text`` and removes the downloaded file.

        Fixes over the previous version:
        * every page's links are appended; earlier each page replaced
          ``self.urlList`` wholesale;
        * a heading lacking a link is skipped rather than failing the
          whole page;
        * when the URL cannot be built, no request is attempted with a
          stale or unbound ``url``;
        * the "downloaded" log entry now names this module and the URL
          (it said "ExaleadDOCSearch" and logged the console message);
        * one ``Download`` helper serves all files, and ``FileName`` is
          reset before each cleanup.
        """
        dl = Download.Download(self.verbose)
        convert = Converter.Converter(verbose=self.verbose)
        while self.Counter <= self.Limit and self.Counter <= 10:
            helpers.modsleep(1)
            if self.verbose:
                p = ' [*] Exalead PPTX Search on page: ' + str(self.Counter)
                self.logger.info('ExaleadPPTXSearch on page: ' +
                                 str(self.Counter))
                print(helpers.color(p, firewall=True))
            url = None
            try:
                url = 'http://www.exalead.com/search/web/results/?q="%40' + self.Domain + \
                      '"+filetype:pptx&elements_per_page=' + \
                    str(self.Quanity) + '&start_index=' + str(self.Counter)
            except Exception as e:
                self.logger.error('ExaleadPPTXSearch could not build URL')
                error = " [!] Major issue with Exalead PPTX Search: " + str(e)
                print(helpers.color(error, warning=True))
            if url is not None:
                try:
                    RawHtml = dl.requesturl(url, useragent=self.UserAgent)
                    # sometimes url is broken but exalead search results
                    # contain e-mail
                    self.Text += RawHtml
                    soup = BeautifulSoup(RawHtml, "lxml")
                    for heading in soup.findAll('h4', class_='media-heading'):
                        anchor = heading.a
                        if anchor is not None and anchor.get("href"):
                            self.urlList.append(anchor["href"])
                except Exception as e:
                    self.logger.error(
                        'ExaleadPPTXSearch could not request / parse HTML')
                    error = " [!] Fail during parsing result: " + str(e)
                    print(helpers.color(error, warning=True))
            self.Counter += 30

        # now download the required files
        try:
            for url in self.urlList:
                if self.verbose:
                    p = ' [*] Exalead PPTX search downloading: ' + str(url)
                    self.logger.info('ExaleadPPTXSearch downloading: ' +
                                     str(url))
                    print(helpers.color(p, firewall=True))
                # Start each file with no name so a failed download leaves
                # nothing for the cleanup step to act on.
                FileName = None
                try:
                    filetype = ".pptx"
                    FileName, FileDownload = dl.download_file(url, filetype)
                    if FileDownload:
                        if self.verbose:
                            p = ' [*] Exalead PPTX file was downloaded: ' + \
                                str(url)
                            self.logger.info('ExaleadPPTXSearch downloaded: ' +
                                             str(url))
                            print(helpers.color(p, firewall=True))
                        ft = helpers.filetype(FileName).lower()
                        if 'powerpoint' in ft:
                            self.Text += convert.convert_zip_to_text(FileName)
                        else:
                            self.logger.warning(
                                'Downloaded file is not a PPTX: ' + ft)
                except Exception as e:
                    error = " [!] Issue with opening PPTX Files:%s" % (str(e))
                    print(helpers.color(error, warning=True))
                if FileName:
                    try:
                        dl.delete_file(FileName)
                    except Exception as e:
                        print(e)
        except Exception:
            self.logger.error("ExaleadPPTXSearch no doc's to download")
            print(helpers.color(" [*] No PPTX's to download from Exalead!\n",
                                firewall=True))

        if self.verbose:
            p = ' [*] Searching PPTX from Exalead Complete'
            print(helpers.color(p, status=True))
Пример #10
0
 def search(self):
     """Collect ``.csv`` links for ``self.Domain`` from Google and read the files.

     The search loop runs while ``self.Counter`` is within ``self.Limit``
     and 100, moving Google's ``start`` offset forward by 10 per page and
     storing result links in ``self.urlList``.  Afterwards every link is
     fetched with ``dl.download_file2``; downloaded content is appended to
     ``self.Text`` and the file is deleted.

     Fixes over the previous version:
     * when the request fails the page is skipped, where it used to parse
       an unbound or stale ``RawHtml``;
     * ``FileName`` is cleared before each download so cleanup only ever
       acts on the current file;
     * bare ``except:`` clauses were narrowed to ``except Exception`` so
       Ctrl-C and interpreter exit propagate.
     """
     dl = Download.Download(self.verbose)
     while self.Counter <= self.Limit and self.Counter <= 100:
         time.sleep(1)
         if self.verbose:
             p = " [*] Google CSV Search on page: " + str(self.Counter)
             print(helpers.color(p, firewall=True))
         url = None
         RawHtml = None
         try:
             url = ("https://www.google.com/search?q=site:" + self.Domain +
                    "+filetype:csv&start=" + str(self.Counter))
         except Exception as e:
             error = " [!] Major issue with Google Search:" + str(e)
             print(helpers.color(error, warning=True))
         if url is not None:
             try:
                 RawHtml = dl.requesturl(url, useragent=self.UserAgent)
             except Exception as e:
                 error = " [!] Fail during Request to Google (Check Connection):" + str(
                     e)
                 print(helpers.color(error, warning=True))
         if RawHtml is not None:
             # check for captcha
             try:
                 dl.GoogleCaptchaDetection(RawHtml)
             except Exception as e:
                 print(e)
             soup = BeautifulSoup(RawHtml)
             # The real target of a result anchor sits in its 'q' query
             # parameter, see https://stackoverflow.com/questions/21934004/
             for a in soup.findAll("a"):
                 try:
                     link = urlparse.parse_qs(urlparse.urlparse(
                         a["href"]).query)["q"][0]
                 except Exception:
                     # anchor without href / 'q' parameter: not a result
                     continue
                 if link.startswith(("http", "www")) and \
                         "webcache.googleusercontent.com" not in link:
                     self.urlList.append(link)
         self.Counter += 10
         helpers.modsleep(self.Sleep, jitter=self.Jitter)
     # now download the required files
     try:
         for url in self.urlList:
             if self.verbose:
                 p = " [*] Google CSV search downloading: " + str(url)
                 print(helpers.color(p, firewall=True))
             # Cleared per URL: cleanup must only see this file's name.
             FileName = None
             try:
                 filetype = ".csv"
                 FileName, FileDownload = dl.download_file2(url, filetype)
                 if FileDownload:
                     if self.verbose:
                         p = "[*] Google CSV file was downloaded: " + str(
                             url)
                         print(helpers.color(p, firewall=True))
                     with open(FileName) as f:
                         self.Text += f.read()
             except Exception:
                 print(helpers.color(" [!] Issue with opening CSV Files\n",
                                     firewall=True))
             if FileName:
                 try:
                     dl.delete_file(FileName)
                 except Exception as e:
                     print(e)
     except Exception:
         print(helpers.color(" [*] No CSV to download from Google!\n",
                             firewall=True))
Пример #11
0
 def search(self):
     """Find ``.pdf`` files for ``self.Domain`` via Google and extract their text.

     Phase one pages through Google results (``start`` advancing by 10
     while ``self.Counter`` is within ``self.Limit`` and 100) and collects
     result links into ``self.urlList``.  Phase two downloads each link,
     appends the text produced by ``convert.convert_pdf_to_txt`` to
     ``self.Text`` and deletes the downloaded file.

     Fixes over the previous version:
     * a failed request no longer falls through to an unbound or stale
       response object; the page is skipped;
     * captcha detection is guarded like in the sibling Google modules,
       so an error there does not abort the whole search;
     * ``FileName`` is reset for every URL before cleanup;
     * bare ``except:`` clauses became ``except Exception`` so Ctrl-C and
       interpreter exit are not swallowed.
     """
     # setup for helpers in the download class
     convert = Converter.Converter(verbose=self.verbose)
     dl = Download.Download(self.verbose)
     while self.Counter <= self.Limit and self.Counter <= 100:
         time.sleep(1)
         if self.verbose:
             p = " [*] Google PDF Search on page: " + str(self.Counter)
             print(helpers.color(p, firewall=True))
         urly = None
         RawHtml = None
         try:
             urly = ("https://www.google.com/search?q=" + self.Domain +
                     "+filetype:pdf&start=" + str(self.Counter))
         except Exception as e:
             error = " [!] Major issue with Google Search:" + str(e)
             print(helpers.color(error, warning=True))
         if urly is not None:
             try:
                 RawHtml = requests.get(urly).content
             except Exception as e:
                 error = " [!] Fail during Request to Google (Check Connection):" + str(
                     e)
                 print(helpers.color(error, warning=True))
         if RawHtml is not None:
             try:
                 dl.GoogleCaptchaDetection(RawHtml)
             except Exception as e:
                 print(e)
             soup = BeautifulSoup(RawHtml)
             # Result anchors carry the real target in the 'q' query
             # parameter, see https://stackoverflow.com/questions/21934004/
             for a in soup.findAll("a"):
                 try:
                     link = urlparse.parse_qs(urlparse.urlparse(
                         a["href"]).query)["q"][0]
                 except Exception:
                     # anchor without href / 'q' parameter: not a result
                     continue
                 if link.startswith(("http", "www")) and \
                         "webcache.googleusercontent.com" not in link:
                     self.urlList.append(link)
         self.Counter += 10
         helpers.modsleep(self.Sleep, jitter=self.Jitter)
     # now download the required files
     try:
         for url in self.urlList:
             if self.verbose:
                 p = " [*] Google PDF search downloading: " + str(url)
                 print(helpers.color(p, firewall=True))
             # Reset per URL so cleanup never sees a leftover name.
             FileName = None
             try:
                 filetype = ".pdf"
                 # use new helper class to download file
                 FileName, FileDownload = dl.download_file(url, filetype)
                 # check if the file was downloaded
                 if FileDownload:
                     if self.verbose:
                         p = " [*] Google PDF file was downloaded: " + str(
                             url)
                         print(helpers.color(p, firewall=True))
                     self.Text += convert.convert_pdf_to_txt(FileName)
             except Exception as e:
                 print(e)
             if FileName:
                 try:
                     # now remove any files left behind
                     dl.delete_file(FileName)
                 except Exception as e:
                     print(e)
     except Exception:
         print(helpers.color(" [*] No PDF's to download from Google!\n",
                             firewall=True))
Пример #12
0
 def search(self):
     """Find ``.pptx`` files for ``self.Domain`` via Google and extract their text.

     Phase one pages through Google results (``start`` advancing by 10
     while ``self.Counter`` is within ``self.Limit`` and 100).  For every
     result anchor it stores the ``q`` target when it looks like a URL,
     and additionally the part after the second ``:`` of that value (the
     original author's workaround for cached PPTX results).  Phase two
     downloads each stored link with ``dl.download_file2``, converts files
     identified as PowerPoint to text appended to ``self.Text`` and
     deletes successfully downloaded files.

     Fixes over the previous version:
     * a failed request no longer leads to parsing an unbound or stale
       ``RawHtml``; the page is skipped;
     * ``FileName``/``FileDownload`` are reset for every URL so cleanup
       cannot act on values left over from an earlier iteration;
     * the final message said "No CSV" in this PPTX module;
     * the ``q`` value is parsed once, and the redundant
       ``startswith("https")`` test is gone;
     * bare ``except:`` clauses became ``except Exception``.
     """
     convert = Converter.Converter(self.verbose)
     dl = Download.Download(self.verbose)
     while self.Counter <= self.Limit and self.Counter <= 100:
         time.sleep(1)
         if self.verbose:
             p = " [*] Google PPTX Search on page: " + str(self.Counter)
             print(helpers.color(p, firewall=True))
         url = None
         RawHtml = None
         try:
             url = (
                 "https://www.google.com/search?q="
                 + self.Domain
                 + "+filetype:pptx&start="
                 + str(self.Counter)
             )
         except Exception as e:
             error = " [!] Major issue with Google Search:" + str(e)
             print(helpers.color(error, warning=True))
         if url is not None:
             try:
                 RawHtml = dl.requesturl(url, useragent=self.UserAgent)
             except Exception as e:
                 error = " [!] Fail during Request to Google (Check Connection):" + str(
                     e
                 )
                 print(helpers.color(error, warning=True))
         if RawHtml is not None:
             # check for captcha
             try:
                 dl.GoogleCaptchaDetection(RawHtml)
             except Exception as e:
                 print(e)
             soup = BeautifulSoup(RawHtml)
             # Result anchors carry the real target in the 'q' query
             # parameter, see https://stackoverflow.com/questions/21934004/
             for a in soup.findAll("a"):
                 try:
                     link = urlparse.parse_qs(urlparse.urlparse(a["href"]).query)["q"][0]
                 except Exception:
                     # anchor without href / 'q' parameter: not a result
                     continue
                 if link.startswith(("http", "www")):
                     if "webcache.googleusercontent.com" not in link:
                         self.urlList.append(link)
                 # for some reason PPTX seems to be cached data:
                 parts = link.split(":", 2)
                 if len(parts) > 2 and \
                         "webcache.googleusercontent.com" not in parts[2]:
                     self.urlList.append(parts[2])
         self.Counter += 10
         helpers.modsleep(self.Sleep, jitter=self.Jitter)
     # now download the required files
     try:
         for url in self.urlList:
             if self.verbose:
                 p = " [*] Google PPTX search downloading: " + str(url)
                 print(helpers.color(p, firewall=True))
             # Reset per URL so cleanup only reflects this download.
             FileName = None
             FileDownload = False
             try:
                 filetype = ".pptx"
                 FileName, FileDownload = dl.download_file2(url, filetype)
                 if FileDownload:
                     if self.verbose:
                         p = " [*] Google PPTX file was downloaded: " + str(url)
                         print(helpers.color(p, firewall=True))
                     ft = helpers.filetype(FileName).lower()
                     if "powerpoint" in ft:
                         self.Text += convert.convert_zip_to_text(FileName)
                     else:
                         self.logger.warning("Downloaded file is not a PPTX: " + ft)
             except Exception:
                 print(helpers.color(
                     " [!] Issue with opening PPTX Files\n", firewall=True
                 ))
             try:
                 if FileDownload:
                     dl.delete_file(FileName)
             except Exception as e:
                 self.logger.warning("Issue deleting file: " + str(e))
     except Exception:
         print(helpers.color(" [*] No PPTX to download from Google!\n", firewall=True))
Пример #13
0
 def search(self):
     """Collect ``.pptx`` links for ``self.Domain`` from Google and harvest text.

     The search loop runs while ``self.Counter`` is within ``self.Limit``
     and 100, moving Google's ``start`` offset forward by 10 per page.
     Each result anchor's ``q`` value is stored when it looks like a URL;
     the text after its second ``:`` is stored as well (the original
     author's workaround for cached PPTX results).  The download loop then
     fetches every stored link with ``dl.download_file2``, appends the
     text of files identified as PowerPoint to ``self.Text`` and deletes
     files that were actually downloaded.

     Fixes over the previous version:
     * on a failed request the page is skipped, where it used to parse an
       unbound or stale ``RawHtml``;
     * ``FileName``/``FileDownload`` start fresh for each URL, so a failed
       download cannot trigger cleanup of an earlier file;
     * the closing message referred to CSV in this PPTX module;
     * the ``q`` value is parsed once and the redundant
       ``startswith('https')`` check is dropped;
     * bare ``except:`` clauses were narrowed to ``except Exception``.
     """
     convert = Converter.Converter(self.verbose)
     dl = Download.Download(self.verbose)
     while self.Counter <= self.Limit and self.Counter <= 100:
         time.sleep(1)
         if self.verbose:
             p = ' [*] Google PPTX Search on page: ' + str(self.Counter)
             print(helpers.color(p, firewall=True))
         url = None
         RawHtml = None
         try:
             url = "https://www.google.com/search?q=" + \
                 self.Domain + "+filetype:pptx&start=" + str(self.Counter)
         except Exception as e:
             error = " [!] Major issue with Google Search:" + str(e)
             print(helpers.color(error, warning=True))
         if url is not None:
             try:
                 RawHtml = dl.requesturl(url, useragent=self.UserAgent)
             except Exception as e:
                 error = " [!] Fail during Request to Google (Check Connection):" + \
                     str(e)
                 print(helpers.color(error, warning=True))
         if RawHtml is not None:
             # check for captcha
             try:
                 dl.GoogleCaptchaDetection(RawHtml)
             except Exception as e:
                 print(e)
             soup = BeautifulSoup(RawHtml)
             # The real target of a result anchor sits in its 'q' query
             # parameter, see https://stackoverflow.com/questions/21934004/
             for a in soup.findAll('a'):
                 try:
                     link = urlparse.parse_qs(urlparse.urlparse(a['href']).query)['q'][0]
                 except Exception:
                     # anchor without href / 'q' parameter: not a result
                     continue
                 if link.startswith(('http', 'www')):
                     if "webcache.googleusercontent.com" not in link:
                         self.urlList.append(link)
                 # for some reason PPTX seems to be cached data:
                 parts = link.split(':', 2)
                 if len(parts) > 2 and \
                         "webcache.googleusercontent.com" not in parts[2]:
                     self.urlList.append(parts[2])
         self.Counter += 10
         helpers.modsleep(self.Sleep, jitter=self.Jitter)
     # now download the required files
     try:
         for url in self.urlList:
             if self.verbose:
                 p = ' [*] Google PPTX search downloading: ' + str(url)
                 print(helpers.color(p, firewall=True))
             # Fresh state per URL: cleanup must reflect this download only.
             FileName = None
             FileDownload = False
             try:
                 filetype = ".pptx"
                 FileName, FileDownload = dl.download_file2(url, filetype)
                 if FileDownload:
                     if self.verbose:
                         p = ' [*] Google PPTX file was downloaded: ' + \
                             str(url)
                         print(helpers.color(p, firewall=True))
                     ft = helpers.filetype(FileName).lower()
                     if 'powerpoint' in ft:
                         self.Text += convert.convert_zip_to_text(FileName)
                     else:
                         self.logger.warning('Downloaded file is not a PPTX: ' + ft)
             except Exception:
                 print(helpers.color(" [!] Issue with opening PPTX Files\n", firewall=True))
             try:
                 if FileDownload:
                     dl.delete_file(FileName)
             except Exception as e:
                 self.logger.warning('Issue deleting file: ' + str(e))
     except Exception:
         print(helpers.color(" [*] No PPTX to download from Google!\n", firewall=True))
Пример #14
0
 def search(self):
     """Find XLSX files on Google for self.Domain and harvest their text.

     Phase 1 pages through Google results (self.Counter advances by 10 per
     page while it is <= self.Limit and <= 100) and appends every
     non-webcache result link to self.urlList.

     Phase 2 downloads each collected URL, converts downloaded files with
     convert_Xlsx_to_Csv, appends the output to self.Text, and then
     removes the local file.

     Returns nothing; results accumulate on self.urlList and self.Text.
     Failures of individual pages or files are reported and skipped.
     """
     convert = Converter.Converter(verbose=self.verbose)
     while self.Counter <= self.Limit and self.Counter <= 100:
         time.sleep(1)
         if self.verbose:
             p = " [*] Google XLSX Search on page: " + str(self.Counter)
             self.logger.info("Google XLSX Search on page: " +
                              str(self.Counter))
             print(helpers.color(p, firewall=True))
         # Reset per page so a failure can never reuse the previous page's
         # URL or response.
         urly = None
         RawHtml = None
         try:
             urly = ("https://www.google.com/search?q=site:" + self.Domain +
                     "+filetype:xlsx&start=" + str(self.Counter))
         except Exception as e:
             error = " [!] Major issue with Google XLSX Search:" + str(e)
             self.logger.error("GoogleXlsxSearch failed to build url: " +
                               str(e))
             print(helpers.color(error, warning=True))
         if urly is not None:
             try:
                 RawHtml = requests.get(urly).content
             except Exception as e:
                 error = " [!] Fail during Request to Google (Check Connection):" + str(
                     e)
                 self.logger.error(
                     "GoogleXlsxSearch failed to request url (Check Connection): "
                     + str(e))
                 print(helpers.color(error, warning=True))
         # Only parse when this page was actually fetched.
         if RawHtml is not None:
             soup = BeautifulSoup(RawHtml)
             # I use this to parse my results, for URLS to follow
             for a in soup.findAll("a"):
                 try:
                     # https://stackoverflow.com/questions/21934004/not-getting-proper-links-
                     # from-google-search-results-using-mechanize-and-beautifu/22155412#22155412?
                     # newreg=01f0ed80771f4dfaa269b15268b3f9a9
                     l = urlparse.parse_qs(urlparse.urlparse(
                         a["href"]).query)["q"][0]
                     if l.startswith(("http", "www")):
                         if "webcache.googleusercontent.com" not in l:
                             self.urlList.append(l)
                 except Exception:
                     # Best effort: anchors without an href or without a
                     # "q" parameter are simply not result links.
                     pass
         # Always advance, even after a failure, so the loop terminates.
         self.Counter += 10
         helpers.modsleep(self.Sleep, jitter=self.Jitter)
     # now download the required files
     self.logger.debug(
         "GoogleXlsxSearch completed HTML result query, starting downloads")
     try:
         filetype = ".xlsx"
         # One downloader is enough for every URL.
         dl = Download.Download(self.verbose)
         for url in self.urlList:
             if self.verbose:
                 p = " [*] Google XLSX search downloading: " + str(url)
                 self.logger.info("Google XLSX search downloading: " +
                                  str(url))
                 print(helpers.color(p, firewall=True))
             # Reset per URL so cleanup never targets a previous file.
             FileName = None
             try:
                 FileName, FileDownload = dl.download_file(url, filetype)
                 if FileDownload:
                     if self.verbose:
                         p = " [*] Google XLSX file was downloaded: " + str(
                             url)
                         self.logger.info(
                             "Google XLSX file was downloaded: " + str(url))
                         print(helpers.color(p, firewall=True))
                     self.Text += convert.convert_Xlsx_to_Csv(FileName)
             except Exception:
                 print(helpers.color(" [!] Issue with opening Xlsx Files\n",
                                     firewall=True))
                 self.logger.error("Google XLSX had issue opening file")
             if FileName is not None:
                 try:
                     dl.delete_file(FileName)
                 except Exception as e:
                     self.logger.error("Google XLSX failed to delete file: " +
                                       str(e))
     except Exception as e:
         print(helpers.color(" [*] No XLSX's to download from google!\n",
                             firewall=True))
         self.logger.error("No XLSX's to download from google! " + str(e))
Пример #15
0
 def search(self):
     """Find PDF files on Google for self.Domain and harvest their text.

     Phase 1 pages through Google results (self.Counter advances by 10 per
     page while it is <= self.Limit and <= 100). Each fetched page is
     passed to dl.GoogleCaptchaDetection, and every non-webcache result
     link is appended to self.urlList.

     Phase 2 downloads each collected URL, converts downloaded files with
     convert_pdf_to_txt, appends the output to self.Text, and then removes
     the local file.

     Returns nothing; results accumulate on self.urlList and self.Text.
     Failures of individual pages or files are printed and skipped.
     """
     # setup for helpers in the download class
     convert = Converter.Converter(verbose=self.verbose)
     dl = Download.Download(self.verbose)
     while self.Counter <= self.Limit and self.Counter <= 100:
         time.sleep(1)
         if self.verbose:
             p = ' [*] Google PDF Search on page: ' + str(self.Counter)
             print(helpers.color(p, firewall=True))
         # Reset per page so a failure can never reuse the previous page's
         # URL or response.
         urly = None
         RawHtml = None
         try:
             urly = "https://www.google.com/search?q=" + \
                 self.Domain + "+filetype:pdf&start=" + str(self.Counter)
         except Exception as e:
             error = " [!] Major issue with Google Search:" + str(e)
             print(helpers.color(error, warning=True))
         if urly is not None:
             try:
                 RawHtml = requests.get(urly).content
             except Exception as e:
                 error = " [!] Fail during Request to Google (Check Connection):" + \
                     str(e)
                 print(helpers.color(error, warning=True))
         # Only inspect and parse when this page was actually fetched.
         if RawHtml is not None:
             dl.GoogleCaptchaDetection(RawHtml)
             soup = BeautifulSoup(RawHtml)
             for a in soup.findAll('a'):
                 try:
                     # https://stackoverflow.com/questions/21934004/not-getting-proper-links-
                     # from-google-search-results-using-mechanize-and-beautifu/22155412#22155412?
                     # newreg=01f0ed80771f4dfaa269b15268b3f9a9
                     l = urlparse.parse_qs(
                         urlparse.urlparse(a['href']).query)['q'][0]
                     if l.startswith(('http', 'www')):
                         if "webcache.googleusercontent.com" not in l:
                             self.urlList.append(l)
                 except Exception:
                     # Best effort: anchors without an href or without a
                     # "q" parameter are simply not result links.
                     pass
         # Always advance, even after a failure, so the loop terminates.
         self.Counter += 10
         helpers.modsleep(self.Sleep, jitter=self.Jitter)
     # now download the required files
     try:
         filetype = ".pdf"
         for url in self.urlList:
             if self.verbose:
                 p = ' [*] Google PDF search downloading: ' + str(url)
                 print(helpers.color(p, firewall=True))
             # Reset per URL so cleanup never targets a previous file.
             FileName = None
             try:
                 # use new helper class to download file
                 FileName, FileDownload = dl.download_file(url, filetype)
                 # check if the file was downloaded
                 if FileDownload:
                     if self.verbose:
                         p = ' [*] Google PDF file was downloaded: ' + \
                             str(url)
                         print(helpers.color(p, firewall=True))
                     self.Text += convert.convert_pdf_to_txt(FileName)
             except Exception as e:
                 print(e)
             if FileName is not None:
                 try:
                     # now remove any files left behind
                     dl.delete_file(FileName)
                 except Exception as e:
                     print(e)
     except Exception:
         print(helpers.color(" [*] No PDF's to download from Google!\n", firewall=True))