Example #1
    def get(self, url, pause=2):
        """
        requests get
        url: 
        :return: result
        """
        time.sleep(pause)
        #domain = self.get_random_domain()
        # Add headers
        headers = {'user-agent': self.get_random_user_agent()}
        headers['X-ProxyMesh-Timeout'] = '100'
        # headers['X-ProxyMesh-Country'] = domain[domain.rfind('.') + 1:].upper()
        headers['X-ProxyMesh-Country'] = 'US'

        try:
            requests.packages.urllib3.disable_warnings(
                requests.packages.urllib3.exceptions.InsecureRequestWarning)
            r = requests.get(url=url,
                             proxies=self.proxies,
                             headers=headers,
                             allow_redirects=False,
                             verify=False,
                             timeout=30)
            LOGGER.info(url)
            charset = chardet.detect(r.content)
            # chardet may return encoding=None; fall back to UTF-8
            content = r.content.decode(charset['encoding'] or 'utf-8')
            return content
        except Exception as e:
            LOGGER.exception(e)
            return None
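These snippets are methods cut out of a crawler class, so the imports and helpers they rely on live elsewhere. A minimal sketch of the module context all seven examples assume (the class name, proxy entries, user-agent string, and the PyQuery-based pq_html are stand-ins inferred from the calls, not the project's actual code):

    import logging
    import re
    import subprocess
    import time
    from datetime import datetime
    from urllib.parse import parse_qs, quote_plus, urlparse

    import cchardet
    import chardet
    import requests
    from requests.exceptions import SSLError  # needed by Example #7

    LOGGER = logging.getLogger(__name__)

    class GoogleCrawler:  # hypothetical name for the class these methods belong to
        proxies = {'http': 'http://PROXY_HOST:PORT',
                   'https': 'http://PROXY_HOST:PORT'}

        def get_random_domain(self):
            # The real pool rotates many Google domains; one keeps the sketch runnable.
            return 'google.com'

        def get_random_user_agent(self):
            return ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                    'AppleWebKit/537.36 (KHTML, like Gecko) '
                    'Chrome/120.0 Safari/537.36')

        def pq_html(self, content):
            # Inferred from the CSS selectors in Example #6: the project
            # appears to parse pages with PyQuery.
            from pyquery import PyQuery as pq
            return pq(content)

The X-ProxyMesh-Timeout and X-ProxyMesh-Country headers are ProxyMesh-specific routing hints (request timeout and exit-node country); they only take effect when the request is routed through a ProxyMesh proxy.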
Example #2
 def search_page(self, query, language='en', num=None, start=0, pause=2):
     """
     Google search
     :param query: Keyword
     :param language: Language
     :return: result
     """
     time.sleep(pause)
     domain = self.get_random_domain()
     if num is None:
         url = URL_SEARCH
         url = url.format(
             domain=domain, language=language, query=quote_plus(query))
     else:
         url = URL_NUM
         url = url.format(
             domain=domain, language=language, query=quote_plus(query), num=num)
     # Add headers
     headers = {'user-agent': self.get_random_user_agent()}
     try:
         requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
         r = requests.get(url=url,
                          proxies=self.proxies,
                          headers=headers,
                          allow_redirects=False,
                          verify=False,
                          timeout=30)
         LOGGER.info(url)
          charset = chardet.detect(r.content)
          # chardet may return encoding=None; fall back to UTF-8
          content = r.content.decode(charset['encoding'] or 'utf-8')
         return content
     except Exception as e:
         LOGGER.exception(e)
         return None
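The URL_SEARCH and URL_NUM templates are not included in these excerpts. Going by the placeholders the format() calls fill in, and by the full domains get_random_domain returns (e.g. 'google.com', per the commented-out TLD extraction in Example #1), they plausibly look like this sketch, not necessarily the project's exact strings:

    URL_SEARCH = 'https://www.{domain}/search?hl={language}&q={query}'
    URL_NUM = 'https://www.{domain}/search?hl={language}&q={query}&num={num}'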
Example #3
    def change_ip_for_vps(self):
        """Redial the PPPoE (ADSL) link to obtain a fresh IP address."""
        try:
            subprocess.Popen('pppoe-stop', shell=True, stdout=subprocess.PIPE)
            time.sleep(2)
            subprocess.Popen('pppoe-start', shell=True, stdout=subprocess.PIPE)
            time.sleep(5)
            pppoe_restart = subprocess.Popen('pppoe-status',
                                             shell=True,
                                             stdout=subprocess.PIPE)
            pppoe_restart.wait()
            # communicate() returns bytes under Python 3; decode before matching
            pppoe_log = pppoe_restart.communicate()[0].decode(errors='ignore')
            adsl_ip = re.findall(r'inet (.+?) peer ', pppoe_log)[0]
            LOGGER.info('[*] New ip address : ' + adsl_ip)

            return True
        except Exception as e:
            LOGGER.info(e)
            return False
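pppoe-status prints the link's addresses in an 'inet <local> peer <remote>' line, which the regex above scrapes for the local address. A standalone check of the pattern, with a made-up status line (the sample output is hypothetical):

    import re

    sample = 'ppp0: Link is up\n    inet 10.64.12.7 peer 10.64.0.1\n'
    print(re.findall(r'inet (.+?) peer ', sample))  # ['10.64.12.7']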
Example #4
 def filter_link(self, link):
     """
     Returns None if the link doesn't yield a valid result.
     Token from https://github.com/MarioVilas/google
     :return: a valid result
     """
     try:
         o = urlparse(link, 'http')
         if o.netloc:
             return link
         if link.startswith('/url?'):
             link = parse_qs(o.query)['q'][0]
             o = urlparse(link, 'http')
             if o.netloc:
                 return link
     except Exception as e:
         LOGGER.exception(e)
         return None
Example #5
 def filter_link(self, link):
     """
     Returns None if the link doesn't yield a valid result.
     Token from https://github.com/MarioVilas/google
     :return: a valid result
     """
     try:
         # Valid results are absolute URLs not pointing to a Google domain
         # like images.google.com or googleusercontent.com
         o = urlparse(link, 'http')
         if o.netloc:
             return link
         # Decode hidden URLs.
         if link.startswith('/url?'):
             link = parse_qs(o.query)['q'][0]
             # Valid results are absolute URLs not pointing to a Google domain
             # like images.google.com or googleusercontent.com
             o = urlparse(link, 'http')
             if o.netloc:
                 return link
     # Otherwise, or on error, return None.
     except Exception as e:
         LOGGER.exception(e)
         return None
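A standalone check of the two paths filter_link handles, using the same stdlib calls (the sample links are illustrative):

    from urllib.parse import parse_qs, urlparse

    def filter_link(link):
        # Same logic as above, lifted out of the class for a quick test.
        try:
            o = urlparse(link, 'http')
            if o.netloc:
                return link
            if link.startswith('/url?'):
                link = parse_qs(o.query)['q'][0]
                o = urlparse(link, 'http')
                if o.netloc:
                    return link
        except Exception:
            return None

    print(filter_link('https://example.com/page'))          # absolute URL: kept
    print(filter_link('/url?q=https://example.com/&sa=U'))  # redirect: unwrapped
    print(filter_link('/search?q=next+page'))               # relative: None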
Example #6
    def search(self,
               query,
               language=None,
               num=None,
               start=0,
               pause=2,
               keyword=None,
               keytype=None):
        """
        Get the results you want,such as title,description,url

        :param keyword: 
        :param keytype: 
        :param pause: 
        :param query:
        :param language:
        :param num:
        :param start:
        :return: Generator
        """
        content = self.search_page(query, language, num, start, pause, keyword)
        try:
            pq_content = self.pq_html(content)
        except Exception as e:
            LOGGER.info(keyword + str(start) +
                        "-----------pq_html----error----------{}".format(e))
            return [], -1
        else:
            if pq_content and '302 Moved' == pq_content('h1').eq(0).text():
                try:
                    LOGGER.info(keyword + str(start) +
                                "-------------- change proxy--------------")
                    self.change_ip_for_vps()
                    content = self.search_page(query, language, num, start,
                                               pause, keyword)
                    pq_content = self.pq_html(content)
                except Exception as e:
                    LOGGER.info(
                        keyword + str(start) +
                        "------after change proxy error-------{}".format(e))
                    return [], -1

            try:
                result_num = pq_content('#resultStats')
                if result_num:
                    result_num = result_num.text()
                    result_num = re.search(
                        r'bout\s([\d,.]+) results', result_num)
                    if result_num:
                        # Take the captured number group directly instead of
                        # stripping affixes, then drop thousands separators.
                        result_num = int(result_num.group(1)
                                         .replace(',', '').replace('.', ''))
                    else:
                        return [], 0
                else:
                    return [], 0
            except Exception as e:
                result_num = -1
                LOGGER.info(keyword + str(start) +
                            "-------result_num errors------------{}".format(e))

            result_dict_one = []
            for item in pq_content('div.g').items():
                asin = ""
                url = ""
                title = item('h3.r>a').eq(0).text()
                # ------------------------------------------Amazon--begin--------------------
                rating = item('div.f.slp').eq(0).text()
                star, review = 0.0, 0
                if rating:
                    # e.g. "Rating: 4,5 - 1,234 reviews"; normalise the
                    # non-breaking spaces before matching (Python 3 str)
                    rating = rating.replace('\xa0', ' ')
                    m = re.search(
                        r'Rating:\s*([\d.,]+)\s*-\s*([\d.,]+)\s*reviews', rating)
                    if m:
                        star = float(m.group(1).replace(',', '.'))
                        review = int(m.group(2).replace(',', '').replace('.', ''))
                # ------------------------------------------Amazon--end--------------------
                href = item('h3.r>a').eq(0).attr('href')
                if href:
                    url = self.filter_link(href)
                    if url:
                        asin = re.search(r'dp/(\w{10})', url)
                        if asin:
                            asin = asin.groups()[0]
                insert_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

                result_dict = {
                    "asin": asin,
                    "insert_datetime": insert_datetime,
                    "url": url,
                    "title": title,
                    "review_num": review,
                    "review_value": star,
                    "keytype": keytype,
                    "keyword": keyword
                }
                result_dict_one.append(result_dict)
            return result_dict_one, result_num
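A hypothetical call, assuming g is an instance of the crawler class sketched after Example #1; search returns a (records, estimated_total) pair, with a total of -1 on parse errors and 0 when no result count is found:

    g = GoogleCrawler()
    records, total = g.search('bluetooth speaker site:amazon.com',
                              language='en', num=10,
                              keyword='bluetooth speaker', keytype='seed')
    print('~{} results'.format(total))
    for row in records:
        print(row['asin'], row['review_value'], row['review_num'], row['url'])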
Example #7
 def search_page(self,
                 query,
                 language=None,
                 num=None,
                 start=0,
                 pause=2,
                 keyword=None):
     """
     Google search
     :param num: 
     :param start: 
     :param pause: 
     :param query: Keyword
     :param language: Language
     :return: result
     """
      time.sleep(pause)
      domain = self.get_random_domain()
     if start > 0:
         url = URL_NEXT
         url = url.format(domain=domain,
                          language=language,
                          query=quote_plus(query),
                          num=num,
                          start=start)
     else:
         if num is None:
             url = URL_SEARCH
             url = url.format(domain=domain,
                              language=language,
                              query=quote_plus(query))
         else:
             url = URL_NUM
             url = url.format(domain=domain,
                              language=language,
                              query=quote_plus(query),
                              num=num)
     if language is None:
         url = url.replace('hl=None&', '')
     # Add headers
     headers = {'user-agent': self.get_random_user_agent()}
     try:
         requests.packages.urllib3.disable_warnings(
             requests.packages.urllib3.exceptions.InsecureRequestWarning)
         r = requests.get(
             url=url,
             # proxies=self.proxies,
             headers=headers,
             allow_redirects=False,
             verify=False,
             timeout=30)
          content = r.content
          charset = cchardet.detect(content)
          # cchardet may return encoding=None; fall back to UTF-8
          text = content.decode(charset['encoding'] or 'utf-8')
         return text
     except SSLError as e:
         LOGGER.exception(e)
         LOGGER.info(url)
         return {}
     except Exception as e:
         LOGGER.exception(e)
         LOGGER.info(url)
         LOGGER.info(keyword + str(start) +
                     "------- change proxy for bad proxy ---------")
         self.change_ip_for_vps()
         return {}
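This variant paginates through URL_NEXT once start > 0. A plausible template for it (hypothetical, matching the placeholders it formats) plus a paging loop:

    URL_NEXT = ('https://www.{domain}/search?hl={language}&q={query}'
                '&num={num}&start={start}')

    # Fetch three pages of ten results each.
    for start in range(0, 30, 10):
        html = g.search_page('python scraping', language='en',
                             num=10, start=start, keyword='python scraping')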