Exemplo n.º 1
0
 def search(self):
     """Scrape Google for PPTX links on self.Domain and harvest their text.

     Walks Google result pages, advancing the ``start`` offset
     (self.Counter) by 10 per page, while self.Counter is within both
     self.Limit and the hard cap of 100. Candidate links found on each
     page are appended to self.urlList. Every collected link is then
     downloaded; files detected as PowerPoint are converted to text and
     appended to self.Text, and each downloaded file is deleted again.

     A page that cannot be built or requested is reported and skipped,
     as is any file that fails to download or convert. Nothing is
     returned; results are left on self.urlList and self.Text.
     """
     convert = Converter.Converter(self.verbose)
     dl = Download.Download(self.verbose)
     while self.Counter <= self.Limit and self.Counter <= 100:
         time.sleep(1)
         if self.verbose:
             p = ' [*] Google PPTX Search on page: ' + str(self.Counter)
             print(helpers.color(p, firewall=True))
         try:
             url = "https://www.google.com/search?q=" + \
                 self.Domain + "+filetype:pptx&start=" + str(self.Counter)
         except Exception as e:
             error = " [!] Major issue with Google Search:" + str(e)
             print(helpers.color(error, warning=True))
             # Without a URL there is nothing to request: skip this page
             # (still advancing, so the loop cannot spin forever).
             self.Counter += 10
             continue
         try:
             RawHtml = dl.requesturl(url, useragent=self.UserAgent)
         except Exception as e:
             error = " [!] Fail during Request to Google (Check Connection):" + \
                 str(e)
             print(helpers.color(error, warning=True))
             # No fresh HTML for this page. Parsing would either hit an
             # undefined name or silently re-parse the previous page.
             self.Counter += 10
             continue
         # check for captcha
         try:
             dl.GoogleCaptchaDetection(RawHtml)
         except Exception as e:
             print(e)
         soup = BeautifulSoup(RawHtml)
         # I use this to parse my results, for URLS to follow
         for a in soup.findAll('a'):
             # https://stackoverflow.com/questions/21934004/not-getting-proper-links-
             # from-google-search-results-using-mechanize-and-beautifu/22155412#22155412?
             # newreg=01f0ed80771f4dfaa269b15268b3f9a9
             try:
                 target = urlparse.parse_qs(
                     urlparse.urlparse(a['href']).query)['q'][0]
             except (KeyError, IndexError, ValueError):
                 # Anchor without an href, href without a q= parameter,
                 # or an href urlparse rejects: not a result link.
                 continue
             if target.startswith(('http', 'www')):
                 if "webcache.googleusercontent.com" not in target:
                     self.urlList.append(target)
                 # A direct link is complete as-is. Splitting it on ':'
                 # below would append a useless fragment whenever the URL
                 # carries a port or another colon.
                 continue
             # for some reason PPTX seems to be cached data: the real
             # link is whatever follows the second ':' of the q value.
             parts = target.split(':', 2)
             if len(parts) == 3 and parts[2] and \
                     "webcache.googleusercontent.com" not in parts[2]:
                 self.urlList.append(parts[2])
         self.Counter += 10
     # now download the required files
     try:
         for url in self.urlList:
             # Reset per link so cleanup never touches a file name left
             # over from a previous iteration.
             FileName = None
             if self.verbose:
                 p = ' [*] Google PPTX search downloading: ' + str(url)
                 print(helpers.color(p, firewall=True))
             try:
                 filetype = ".pptx"
                 FileName, FileDownload = dl.download_file2(url, filetype)
                 if FileDownload:
                     if self.verbose:
                         p = ' [*] Google PPTX file was downloaded: ' + \
                             str(url)
                         print(helpers.color(p, firewall=True))
                     ft = helpers.filetype(FileName).lower()
                     if 'powerpoint' in ft:
                         self.Text += convert.convert_pptx_to_txt(FileName)
                     else:
                         self.logger.warning(
                             'Downloaded file is not a PPTX: ' + ft)
             except Exception as e:
                 print(helpers.color(" [!] Issue with opening PPTX Files\n",
                                     firewall=True))
                 # Keep the cause; the console message alone drops it.
                 self.logger.error(
                     'Issue with opening PPTX file: ' + str(e))
             if FileName is not None:
                 try:
                     dl.delete_file(FileName)
                 except Exception as e:
                     print(e)
     except Exception:
         print(helpers.color(" [*] No PPTX to download from Google!\n",
                             firewall=True))
Exemplo n.º 2
0
    def search(self):
        """Query Exalead for PPTX files on self.Domain and harvest text.

        Requests Exalead result pages while self.Counter is within both
        self.Limit and the cap of 10, stepping self.Counter by 30 per page
        (so the loop body runs at most once when self.Counter starts
        non-negative). The raw result HTML is appended to self.Text, and
        the links found under ``h4.media-heading`` elements replace
        self.urlList. Each link is then downloaded; files detected as
        PowerPoint are converted to text and appended to self.Text, and
        each downloaded file is deleted again.

        Failures on a page or on an individual file are logged, printed
        and skipped. Nothing is returned.
        """
        dl = Download.Download(self.verbose)
        convert = Converter.Converter(verbose=self.verbose)
        while self.Counter <= self.Limit and self.Counter <= 10:
            helpers.modsleep(1)
            if self.verbose:
                p = ' [*] Exalead PPTX Search on page: ' + str(self.Counter)
                self.logger.info('ExaleadPPTXSearch on page: ' +
                                 str(self.Counter))
                print(helpers.color(p, firewall=True))
            try:
                url = 'http://www.exalead.com/search/web/results/?q="%40' + self.Domain + \
                      '"+filetype:pptx&elements_per_page=' + \
                    str(self.Quanity) + '&start_index=' + str(self.Counter)
            except Exception as e:
                self.logger.error('ExaleadPPTXSearch could not build URL')
                error = " [!] Major issue with Exalead PPTX Search: " + str(e)
                print(helpers.color(error, warning=True))
                # No URL means nothing to request. Skip the page instead of
                # reporting a misleading request/parse failure as well.
                self.Counter += 30
                continue
            try:
                RawHtml = dl.requesturl(url, useragent=self.UserAgent)
                # sometimes url is broken but exalead search results contain
                # e-mail
                self.Text += RawHtml
                soup = BeautifulSoup(RawHtml, "lxml")
                # Skip headings without a usable link, so one malformed
                # result does not discard every link on the page.
                self.urlList = [
                    heading.a["href"]
                    for heading in soup.findAll('h4', class_='media-heading')
                    if heading.a is not None and heading.a.get("href")
                ]
            except Exception as e:
                self.logger.error(
                    'ExaleadPPTXSearch could not request / parse HTML')
                error = " [!] Fail during parsing result: " + str(e)
                print(helpers.color(error, warning=True))
            self.Counter += 30

        # now download the required files
        try:
            for url in self.urlList:
                # Reset per link so cleanup never touches a file name left
                # over from a previous iteration.
                FileName = None
                if self.verbose:
                    p = ' [*] Exalead PPTX search downloading: ' + str(url)
                    self.logger.info('ExaleadPPTXSearch downloading: ' +
                                     str(url))
                    print(helpers.color(p, firewall=True))
                try:
                    filetype = ".pptx"
                    FileName, FileDownload = dl.download_file(url, filetype)
                    if FileDownload:
                        if self.verbose:
                            p = ' [*] Exalead PPTX file was downloaded: ' + \
                                str(url)
                            self.logger.info('ExaleadPPTXSearch downloaded: ' +
                                             str(url))
                            print(helpers.color(p, firewall=True))
                        ft = helpers.filetype(FileName).lower()
                        if 'powerpoint' in ft:
                            self.Text += convert.convert_zip_to_text(FileName)
                        else:
                            self.logger.warning(
                                'Downloaded file is not a PPTX: ' + ft)
                except Exception as e:
                    error = " [!] Issue with opening PPTX Files:%s" % (str(e))
                    print(helpers.color(error, warning=True))
                if FileName is not None:
                    try:
                        dl.delete_file(FileName)
                    except Exception as e:
                        print(e)
        except Exception:
            # Reached when self.urlList itself cannot be iterated, e.g. it
            # was never assigned because the page request failed.
            self.logger.error("ExaleadPPTXSearch no doc's to download")
            print(helpers.color(" [*] No PPTX's to download from Exalead!\n",
                                firewall=True))

        if self.verbose:
            p = ' [*] Searching PPTX from Exalead Complete'
            print(helpers.color(p, status=True))
    def search(self):
        """Query Exalead for PPTX files on self.Domain and harvest text.

        Requests Exalead result pages while self.Counter is within both
        self.Limit and the cap of 10, stepping self.Counter by 30 per page
        (so the loop body runs at most once when self.Counter starts
        non-negative). The raw result HTML is appended to self.Text, and
        the links found under ``h4.media-heading`` elements replace
        self.urlList. Each link is then downloaded; files detected as
        PowerPoint are converted to text and appended to self.Text, and
        each downloaded file is deleted again.

        Failures on a page or on an individual file are logged, printed
        and skipped. Nothing is returned.
        """
        dl = Download.Download(self.verbose)
        convert = Converter.Converter(verbose=self.verbose)
        while self.Counter <= self.Limit and self.Counter <= 10:
            helpers.modsleep(1)
            if self.verbose:
                p = ' [*] Exalead PPTX Search on page: ' + str(self.Counter)
                self.logger.info('ExaleadPPTXSearch on page: ' + str(self.Counter))
                print(helpers.color(p, firewall=True))
            try:
                url = 'http://www.exalead.com/search/web/results/?q="%40' + self.Domain + \
                      '"+filetype:pptx&elements_per_page=' + \
                    str(self.Quanity) + '&start_index=' + str(self.Counter)
            except Exception as e:
                self.logger.error('ExaleadPPTXSearch could not build URL')
                error = " [!] Major issue with Exalead PPTX Search: " + str(e)
                print(helpers.color(error, warning=True))
                # No URL means nothing to request. Skip the page instead of
                # reporting a misleading request/parse failure as well.
                self.Counter += 30
                continue
            try:
                RawHtml = dl.requesturl(url, useragent=self.UserAgent)
                # sometimes url is broken but exalead search results contain
                # e-mail
                self.Text += RawHtml
                soup = BeautifulSoup(RawHtml, "lxml")
                # Skip headings without a usable link, so one malformed
                # result does not discard every link on the page.
                self.urlList = [heading.a["href"]
                                for heading in soup.findAll('h4', class_='media-heading')
                                if heading.a is not None and heading.a.get("href")]
            except Exception as e:
                self.logger.error('ExaleadPPTXSearch could not request / parse HTML')
                error = " [!] Fail during parsing result: " + str(e)
                print(helpers.color(error, warning=True))
            self.Counter += 30

        # now download the required files
        try:
            for url in self.urlList:
                # Reset per link so cleanup never touches a file name left
                # over from a previous iteration.
                FileName = None
                if self.verbose:
                    p = ' [*] Exalead PPTX search downloading: ' + str(url)
                    self.logger.info('ExaleadPPTXSearch downloading: ' + str(url))
                    print(helpers.color(p, firewall=True))
                try:
                    filetype = ".pptx"
                    FileName, FileDownload = dl.download_file(url, filetype)
                    if FileDownload:
                        if self.verbose:
                            p = ' [*] Exalead PPTX file was downloaded: ' + \
                                str(url)
                            self.logger.info('ExaleadPPTXSearch downloaded: ' + str(url))
                            print(helpers.color(p, firewall=True))
                        ft = helpers.filetype(FileName).lower()
                        if 'powerpoint' in ft:
                            self.Text += convert.convert_zip_to_text(FileName)
                        else:
                            self.logger.warning('Downloaded file is not a PPTX: ' + ft)
                except Exception as e:
                    error = " [!] Issue with opening PPTX Files:%s" % (str(e))
                    print(helpers.color(error, warning=True))
                if FileName is not None:
                    try:
                        dl.delete_file(FileName)
                    except Exception as e:
                        print(e)
        except Exception:
            # Reached when self.urlList itself cannot be iterated, e.g. it
            # was never assigned because the page request failed.
            self.logger.error("ExaleadPPTXSearch no doc's to download")
            print(helpers.color(" [*] No PPTX's to download from Exalead!\n", firewall=True))

        if self.verbose:
            p = ' [*] Searching PPTX from Exalead Complete'
            print(helpers.color(p, status=True))
Exemplo n.º 4
0
 def search(self):
     """Scrape Google for PPTX links on self.Domain and harvest their text.

     Walks Google result pages, advancing the ``start`` offset
     (self.Counter) by 10 per page, while self.Counter is within both
     self.Limit and the hard cap of 100. Candidate links found on each
     page are appended to self.urlList. Every collected link is then
     downloaded; files detected as PowerPoint are converted to text and
     appended to self.Text, and successfully downloaded files are
     deleted again.

     A page that cannot be built or requested is reported and skipped,
     as is any file that fails to download or convert. Nothing is
     returned; results are left on self.urlList and self.Text.
     """
     convert = Converter.Converter(self.verbose)
     dl = Download.Download(self.verbose)
     while self.Counter <= self.Limit and self.Counter <= 100:
         time.sleep(1)
         if self.verbose:
             p = ' [*] Google PPTX Search on page: ' + str(self.Counter)
             print(helpers.color(p, firewall=True))
         try:
             url = "https://www.google.com/search?q=" + \
                 self.Domain + "+filetype:pptx&start=" + str(self.Counter)
         except Exception as e:
             error = " [!] Major issue with Google Search:" + str(e)
             print(helpers.color(error, warning=True))
             # Without a URL there is nothing to request: skip this page
             # (still advancing, so the loop cannot spin forever).
             self.Counter += 10
             continue
         try:
             RawHtml = dl.requesturl(url, useragent=self.UserAgent)
         except Exception as e:
             error = " [!] Fail during Request to Google (Check Connection):" + \
                 str(e)
             print(helpers.color(error, warning=True))
             # No fresh HTML for this page. Parsing would either hit an
             # undefined name or silently re-parse the previous page.
             self.Counter += 10
             continue
         # check for captcha
         try:
             dl.GoogleCaptchaDetection(RawHtml)
         except Exception as e:
             print(e)
         soup = BeautifulSoup(RawHtml)
         # I use this to parse my results, for URLS to follow
         for a in soup.findAll('a'):
             # https://stackoverflow.com/questions/21934004/not-getting-proper-links-
             # from-google-search-results-using-mechanize-and-beautifu/22155412#22155412?
             # newreg=01f0ed80771f4dfaa269b15268b3f9a9
             try:
                 target = urlparse.parse_qs(urlparse.urlparse(a['href']).query)['q'][0]
             except (KeyError, IndexError, ValueError):
                 # Anchor without an href, href without a q= parameter,
                 # or an href urlparse rejects: not a result link.
                 continue
             if target.startswith(('http', 'www')):
                 if "webcache.googleusercontent.com" not in target:
                     self.urlList.append(target)
                 # A direct link is complete as-is. Splitting it on ':'
                 # below would append a useless fragment whenever the URL
                 # carries a port or another colon.
                 continue
             # for some reason PPTX seems to be cached data: the real
             # link is whatever follows the second ':' of the q value.
             parts = target.split(':', 2)
             if len(parts) == 3 and parts[2] and \
                     "webcache.googleusercontent.com" not in parts[2]:
                 self.urlList.append(parts[2])
         self.Counter += 10
     # now download the required files
     try:
         for url in self.urlList:
             # Reset per link so cleanup never acts on values left over
             # from a previous iteration (or on undefined names).
             FileName = None
             FileDownload = False
             if self.verbose:
                 p = ' [*] Google PPTX search downloading: ' + str(url)
                 print(helpers.color(p, firewall=True))
             try:
                 filetype = ".pptx"
                 FileName, FileDownload = dl.download_file2(url, filetype)
                 if FileDownload:
                     if self.verbose:
                         p = ' [*] Google PPTX file was downloaded: ' + \
                             str(url)
                         print(helpers.color(p, firewall=True))
                     ft = helpers.filetype(FileName).lower()
                     if 'powerpoint' in ft:
                         self.Text += convert.convert_zip_to_text(FileName)
                     else:
                         self.logger.warning('Downloaded file is not a PPTX: ' + ft)
             except Exception as e:
                 print(helpers.color(" [!] Issue with opening PPTX Files\n", firewall=True))
                 # Keep the cause; the console message alone drops it.
                 self.logger.error('Issue with opening PPTX file: ' + str(e))
             try:
                 if FileDownload:
                     dl.delete_file(FileName)
             except Exception as e:
                 self.logger.warning('Issue deleting file: ' + str(e))
     except Exception:
         # This is the PPTX search; the message used to say "CSV".
         print(helpers.color(" [*] No PPTX to download from Google!\n", firewall=True))