Example #1
    def download(url):
        try:
            r = requests.get(url=url, headers=config.get_header(), timeout=config.TIMEOUT)
            # print("应答码:{}".format(r))
            # 将获取内容的编码更改;通过chardet.detect检查内容,并拿到编码方式
            r.encoding = chardet.detect(r.content)['encoding']
            # Check whether the fetch succeeded
            if (not r.ok) or len(r.content) < 500:
                raise ConnectionError
            else:
                return r.text
        except Exception as e:
            print(e)
            count = 0  # retry counter
            # Fetch proxy IPs and retry the download through them
            proxylist = sql.select(10)
            if not proxylist:
                return None

            while count < config.RETRY_TIME:
                try:
                    proxy = random.choice(proxylist)
                    ip = proxy[0]
                    port = proxy[1]
                    proxies = {'http': 'http://%s:%s' % (ip, port), 'https': 'http://%s:%s' % (ip, port)}

                    r = requests.get(url=url, headers=config.get_header(), timeout=config.TIMEOUT, proxies=proxies)
                    r.encoding = chardet.detect(r.content)['encoding']
                    if (not r.ok) or len(r.content) < 500:
                        raise ConnectionError
                    else:
                        return r.text
                except Exception:
                    count += 1
        return None
Example #2
    def download(url):
        try:
            r = requests.get(url=url, headers=config.get_header(), timeout=config.TIMEOUT)
            r.encoding = chardet.detect(r.content)['encoding']
            if (not r.ok) or len(r.content) < 500:
                raise ConnectionError
            else:
                return r.text

        except Exception:
            count = 0  # retry counter
            proxylist = sqlhelper.select(10)
            if not proxylist:
                return None

            while count < config.RETRY_TIME:
                try:
                    proxy = random.choice(proxylist)
                    ip = proxy[0]
                    port = proxy[1]
                    proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}

                    r = requests.get(url=url, headers=config.get_header(), timeout=config.TIMEOUT, proxies=proxies)
                    r.encoding = chardet.detect(r.content)['encoding']
                    if (not r.ok) or len(r.content) < 500:
                        raise ConnectionError
                    else:
                        return r.text
                except Exception:
                    count += 1

        return None
Example #3
File: Main.py  Project: liusepiaoxu/scrapy-
def download_image(image_title, image_detail_websites):
    num = 1
    amount = len(image_detail_websites)
    path = 'F:/Images/temp/' + image_title
    for i in image_detail_websites:
        proxies = config.get_ips()
        for ip in proxies:
            proxy = {'http': ip.strip()}
            print(proxy)
            try:
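                # probe the image URL through this proxy first; only re-request and save it if the probe returns 200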
                r = requests.get(url=i,
                                 headers=config.get_header(),
                                 proxies=proxy,
                                 timeout=3)
                if r.status_code == 200:
                    response = requests.get(url=i,
                                            headers=config.get_header(),
                                            proxies=proxy,
                                            timeout=3)
                    if response.status_code == 200:
                        if not os.path.exists(path):
                            os.makedirs(path)
                        os.chdir('F:/Images/temp/' + image_title)
                        filename = '%s%s.jpg' % (image_title, num)
                        print('Downloading image %s: %s/%s' % (image_title, num, amount))
                        with open(filename, 'wb') as f:
                            f.write(response.content)
                        num += 1
                        break
                else:
                    continue
            except Exception:
                print('Proxy {} is no longer valid!'.format(proxy))
Example #4
def download(url):
    try:
        r = requests.get(url=url,
                         headers=config.get_header(),
                         timeout=config.TIMEOUT)
        r.encoding = chardet.detect(r.content)['encoding']
        if (not r.ok) or len(r.content) < 500:
            raise ConnectionError
        else:
            return r.text

    except Exception:
        count = 0  # retry counter

        while count < config.RETRY_TIME:
            try:
                r = requests.get(url=url,
                                 headers=config.get_header(),
                                 timeout=config.TIMEOUT)
                r.encoding = chardet.detect(r.content)['encoding']
                if (not r.ok) or len(r.content) < 500:
                    raise ConnectionError
                else:
                    return r.text
            except Exception:
                count += 1
    return None
Example #5
    def download(url):
        """
        获取网页
        :param url: 请求的网页地址
        :return: 返回网页内容
        """
        try:
            # Request the page
            r = requests.get(url=url,
                             headers=config.get_header(),
                             timeout=config.TIMEOUT,
                             proxies=spyder.HtmlHandler.proxy_list())
            # r = requests.get(url=url, headers=config.get_header(), timeout=config.TIMEOUT)

            # Detect the page encoding and use it as the decoding for response.text
            r.encoding = chardet.detect(r.content)['encoding']
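            # GB2312 is a subset of GBK, so decoding as GBK avoids errors on extended characters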
            if r.encoding == "GB2312":
                r.encoding = "GBK"

            # If the request is not OK or the content is too small, treat it as a failed connection
            if (not r.ok) or len(r.content) < 500:
                raise ConnectionError
            else:
                return r.text

        except Exception:
            count = 0  # retry counter
            # proxylist = sqlhelper.select(10)
            proxylist = json.loads(requests.get(config.PROXYURL).text)
            if not proxylist:
                return None

            while count < config.RETRY_TIME:
                try:
                    proxy = random.choice(proxylist)
                    ip = proxy[0]
                    port = proxy[1]
                    proxies = {
                        "http": "http://%s:%s" % (ip, port),
                        "https": "http://%s:%s" % (ip, port)
                    }

                    # r = requests.get(url=url, headers=config.get_header(), timeout=config.TIMEOUT, proxies=P)
                    r = requests.get(url=url,
                                     headers=config.get_header(),
                                     timeout=config.TIMEOUT,
                                     proxies=proxies)
                    r.encoding = chardet.detect(r.content)['encoding']
                    if (not r.ok) or len(r.content) < 500:
                        raise ConnectionError
                    else:
                        return r.text
                except Exception:
                    count += 1

        return None
Example #6
def check_server(proxy):
    ip = proxy['ip']
    port = proxy['port']
    proxies = {"http": "http://%s:%s" % (ip, port), "https": "https://%s:%s" % (ip, port)}
    url1 = config.GOAL_HTTPS_LIST[0]
    url2 = config.GOAL_HTTP_LIST[0]
    c_https = requests.get(url=url1, proxies=proxies, headers=config.get_header(), timeout=config.TIMEOUT)
    c_http = requests.get(url=url2, proxies=proxies, headers=config.get_header(), timeout=config.TIMEOUT)
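    # Return 2 if the proxy works for both HTTP and HTTPS, 1 for HTTPS only, 0 for HTTP only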
    if c_http and c_https:
        return 2
    elif c_https:
        return 1
    elif c_http:
        return 0
Example #7
def NovelCrawl_Main(count):
    url = r'http://www.yousuu.com/booklist'
    proxies = fetchproxies()
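    # Crawl the booklist pages, picking a random proxy and a fresh header for each request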
    for i in range(count):
        try:
            header = get_header()
            proxy = random.choice(proxies)
            url = CrawlNovelList(url, header, proxy)
        except Exception as e:
            print 'error'
            print e.message
            pass

    sql = 'select distinct novellisturl from pagenovel '
    a = MySQLHelper()
    NovelListUrl = a.SqlFecthAll(sql)
    a.CloseCon()
    print len(NovelListUrl)

    #proxies = fetchproxies()
    for i in NovelListUrl:
        try:
            header = get_header()
            proxy = random.choice(proxies)
            print i[0]
            CrawlNovel(i[0], header, proxy)
        except Exception as e:
            print 'error'
            print e.message
            pass

    a = MySQLHelper()
    sql = 'select distinct novelurl from novelurl'
    NovelUrl = a.SqlFecthAll(sql)
    a.CloseCon()
    print len(NovelUrl)

    #proxies = fetchproxies()
    for i in NovelUrl:
        try:
            header = get_header()
            proxy = random.choice(proxies)
            print i[0]
            CrawlNovelData(i[0], header, proxy)
        except Exception as e:
            print 'error'
            print e.message
            pass
Example #8
def getMyIP():
    try:
        r = requests.get(url=config.TEST_IP, headers=config.get_header(), timeout=config.TIMEOUT)
        ip = json.loads(r.text)
        return ip['origin']
    except Exception as e:
        raise Test_URL_Fail
Example #9
 def getHtml(self):
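     # Read product IDs from the Redis set for each index and dump every product page's HTML into data/<index>.csv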
     for i in range(0, 1):
         print i
         self.redis_pipline.smembers(i)
         resSet = self.redis_pipline.execute()[0]
         with open("data/" + str(i) + ".csv", "wb+") as f:
             csvHeader = ["id", "html"]
             f_csv = csv.writer(f)
             f_csv.writerow(csvHeader)
             for j, productID in enumerate(resSet):
                 if j == 500:
                     break
                 url = self.baseUrl + productID
                 flag = True
                 while flag:
                     try:
                         proxy = self.getProxy()
                         print "current ip" + proxy
                         res = requests.get(url,
                                            headers=get_header(),
                                            proxies={"http": proxy})
                         print res.status_code
                         if res.status_code != 200 or self.checkRobot(
                                 res.content):
                             deleteIP(proxy)
                             print proxy + "has been deleted"
                             continue
                         row = []
                         row.append(productID)
                         row.append(res.content)
                         f_csv.writerow(row)
                         flag = False
                     except Exception as e:
                         print "here" + str(e.message)
                         break
Example #10
def get_kdlspider():
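    # Scrape ip:port pairs from kuaidaili's free proxy list (pages 1-41) and append them to a2.txt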
    pattern_ip = '//*[@id="list"]/table/tbody/tr/td[1]/text()'
    pattern_port = '//*[@id="list"]/table/tbody/tr/td[2]/text()'
    start_url = []
    path = setting.save_ip + 'a2.txt'
    ip_port_list = []
    for i in range(1, 42):
        time.sleep(2)
        url = 'http://www.kuaidaili.com/free/inha/' + str(i) + '/'
        start_url.append(url)
    for i in start_url:
        print(i)
        time.sleep(2)
        response = requests.get(url=i, headers=config.get_header())
        content = response.content
        selector = html.fromstring(content)
        ip = selector.xpath(pattern_ip)
        port = selector.xpath(pattern_port)
        for i in zip(ip, port):
            ip_port = i[0] + ':' + i[1]
            ip_port_list.append(ip_port)
    with open(path, 'a') as f:
        f.write('\n')
        for i in ip_port_list:
            f.write(i + '\n')
Example #11
def get_xicidailispinder():
    url = 'http://www.xicidaili.com/'
    ip_port_list = []
    path = setting.save_ip + 'a3.txt'
    for i in range(4):
        k = 0
        for j in range(19):
            time.sleep(2)
            num = j + 3 + k
            pattern_ip = '//*[@id="ip_list"]/tbody/tr[num]/td[2]/text()'
            pattern_port = '//*[@id="ip_list"]/tbody/tr[num]/td[3]/text()'
            response = requests.get(url=url, headers=config.get_header())
            content = response.content
            selector = html.fromstring(content)
            ip = selector.xpath(pattern_ip)
            port = selector.xpath(pattern_port)
            for pair in zip(ip, port):
                ip_port = pair[0] + ':' + pair[1]
                ip_port_list.append(ip_port)

        k = i + 22
    with open(path, 'a') as f:
        f.write('\n')
        for i in ip_port_list:
            f.write(i + '\n')
Example #12
    def download(url):
        try:
            r = requests.get(url=url,
                             headers=config.get_header(),
                             timeout=config.TIMEOUT)
            r.encoding = chardet.detect(r.content)['encoding']
            if (not r.ok) or len(r.content) < 500:
                raise ConnectionError
            else:
                return r.text

        except Exception:
            count = 0  # retry counter
            # while count < config.RETRY_TIME:
            #     try:
            #         #proxy = random.choice(proxy_list)
            #         ip = proxy[0]
            #         port = proxy[1]
            #         proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
            #
            #         r = requests.get(url=url, headers=config.get_header(), timeout=config.TIMEOUT, proxies=proxies)
            #         r.encoding = chardet.detect(r.content)['encoding']
            #         if (not r.ok) or len(r.content) < 500:
            #             raise ConnectionError
            #         else:
            #             return r.text
            #     except Exception:
            #         count += 1

        return None
Example #13
def get_basics(stock_info):
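    # Pull the main financial indicators for one stock from Xueqiu's f10 JSON API and store a row per reporting period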
    base_url = 'https://xueqiu.com/stock/f10/finmainindex.json?symbol={0}&page=1&size=199'
    code_mk = stock_info['market']+stock_info['code']
    req = requests.get(base_url.format(code_mk),headers=get_header())
    json_data = json.loads(req.text)['list']
    for i in json_data:
        stock_code = stock_info['code']
        report_date = i['reportdate']
        mainbusiincome = i['mainbusiincome']
        mainbusiprofit = i['mainbusiprofit']
        totprofit = i['totprofit']
        netprofit = i['netprofit']
        totalassets = i['totalassets']
        totalliab = i['totalliab']
        totsharequi = i['totsharequi']
        basiceps = i['basiceps']
        naps = i['naps']
        opercashpershare = i['opercashpershare']
        peropecashpershare = i['peropecashpershare']
        operrevenue = i['operrevenue']
        invnetcashflow = i['invnetcashflow']
        finnetcflow = i['finnetcflow']
        chgexchgchgs = i['chgexchgchgs']
        cashnetr = i['cashnetr']
        cashequfinbal = i['cashequfinbal']
        createtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(stock_code, report_date)
        insert_stock_basics(stock_code, report_date, mainbusiincome, mainbusiprofit, totprofit, netprofit, totalassets,
                            totalliab, totsharequi, basiceps, naps, opercashpershare, peropecashpershare, operrevenue,
                            invnetcashflow, finnetcflow, chgexchgchgs, cashnetr, cashequfinbal,createtime)
Example #14
 def weather(self, addr):
     result = ''
     headers = config.get_header()
     format_data = {'addr': addr}
     res = requests.get(url=self.weather_url,
                        params=format_data,
                        headers=headers)
     #print(res.text)
     soup = etree.HTML(res.content)
     selector = soup.xpath('//ul[@class="query-hd"]')
     #print(selector)
     if len(selector) == 0:
         result = u'No weather forecast found for this city!'
     else:
         ul = soup.xpath('//ul[@class="query-hd"]/li')
         date = ul[0].xpath('./div[@class="date"]/text()')[0]
         phrase = ul[0].xpath('./div[@class="phrase"]/text()')[0]
         temperature = ul[0].xpath('./div[@class="temperature"]/text()')[0]
         result = result + date + u' (today)' + " " + phrase + ' ' + temperature + '\n'
         for li in ul[1:]:
             date = li.xpath('./div[@class="date"]/text()')[0]
             phrase = li.xpath('./div[@class="phrase"]/text()')[0]
             temperature = li.xpath('./div[@class="temperature"]/text()')[0]
             result = result + date + " " + phrase + ' ' + temperature + '\n'
     return result
Example #15
    def kuaidi(self, kuaidi_no):
        result = ''
        headers = config.get_header()
        headers['Referer'] = 'http://m.ip138.com/kuaidi/search.asp'

        format_data = {'no': kuaidi_no}
        try:
            res = requests.post(url=self.kuaidi_url,
                                data=format_data,
                                headers=headers,
                                timeout=config.TIMEOUT)
            res.encoding = 'utf-8'
            #print(res.text)
        except requests.exceptions.Timeout:
            print('Request timed out. (timeout=%s)' % config.TIMEOUT)
            return 'Request timed out,please try again!'
        soup = etree.HTML(res.content)
        selector = soup.xpath('//ul[@class="query-hd"]/li')
        title = soup.xpath('//ul[@class="query-hd"]/li[@class="title"]')
        comany = title[0].xpath('./span[@class="comany"]/text()')
        #print(comany)
        if len(comany) == 0:
            result = selector[-1].xpath('./text()')[0]
            #return result
        else:
            comany = title[0].xpath('./span/text()')[0]
            status = title[0].xpath('./span/text()')[1]
            #print(comany,status)
            result = result + comany + "," + status + '\n'
            for li in selector[1:-1]:
                time = li.xpath('./div[@class="time"]/text()')[0]
                detail = li.xpath('./div[@class="detail"]/text()')[0]
                result = result + time + " - " + detail + '\n'
            #print(result)
        return result
Example #16
def getMyIP():
    try:
        r = requests.get(url=config.TEST_IP, headers=config.get_header(), timeout=config.TIMEOUT)
        ip = json.loads(r.text)
        return ip['origin']
    except Exception as e:
        raise Test_URL_Fail
Example #17
def getMyIP():
    try:
        r = requests.get(url=TEST_IP, headers=get_header(), timeout=TIMEOUT)
        ip = json.loads(r.text)
        return ip['origin']
    except Exception as e:
        print e.message
Example #18
 def __sub_reply_req(self, url, reply_sub_main_id):
     try:
         header = config.get_header(lv=2)
         res_pones = requests.get(url, headers=header)
         if 200 == res_pones.status_code:
             try:
                 data = json.loads(str(res_pones.text))
                 if "code" in data and "100000" == data['code']:
                     self.__sub_reply_parse(
                         data['data']['html'],
                         reply_sub_main_id=reply_sub_main_id)
                 else:
                     print("sub interface error", data, self.count,
                           self.sub_count)
             except Exception as err:
                 print("json失败", err)
         elif 404 == res_pones.status_code:
             self.__sub_reply_req(url=url,
                                  reply_sub_main_id=reply_sub_main_id)
         else:
             print("sub request error", res_pones.status_code, self.count,
                   self.sub_count)
     except Exception as err:
         print("err3---->", err, self.count, self.sub_count)
         self.can_continue = False
Example #19
def checkSped(selfip, proxy):
    try:
        speeds = []
        test_url = config.GOAL_HTTPS_LIST
        proxies = {
            "http": "http://%s:%s" % (proxy['ip'], proxy['port']),
            "https": "https://%s:%s" % (proxy['ip'], proxy['port'])
        }
        for i in test_url:
            try:
                start = time.time()
                r = requests.get(url=i,
                                 headers=config.get_header(),
                                 timeout=config.TIMEOUT,
                                 proxies=proxies)
                if r.ok:
                    speed = round(time.time() - start, 2)
                    speeds.append(speed)
                else:
                    speeds.append(1000000)
            except Exception as e:
                speeds.append(1000000)
        return speeds
    except Exception as e:
        return None
Example #20
def _checkHttpProxy(selfip, proxies, isHttp=False):
    types = -1
    if isHttp:
        test_url = config.GOAL_HTTP_LIST
    else:
        test_url = config.GOAL_HTTPS_LIST
    try:
        r = requests.get(url=test_url[0],
                         headers=config.get_header(),
                         timeout=config.TIMEOUT,
                         proxies=proxies)
        if r.ok:
            content = json.loads(r.text)
            headers = content['headers']
            ip = content['origin']
            proxy_connection = headers.get('Proxy-Connection', None)
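            # Classify anonymity: several IPs in 'origin' suggest our real IP was forwarded (2),
            # a Proxy-Connection header reveals a proxy is in use (1),
            # otherwise the proxy looks highly anonymous (0)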
            if ',' in ip:
                types = 2
            elif proxy_connection:
                types = 1
            else:
                types = 0
            return True, types
        else:
            return False, types
    except Exception as e:
        return False, types
Example #21
def get_free():
    r_http = requests.get(free_url,
                          params={'proxy': free_params[0]},
                          headers=get_header())
    # r_socks1 = requests.get(free_url, params={'proxy': free_params[1]}, headers=get_header())
    # r_socks2 = requests.get(free_url, params={'proxy': free_params[2]}, headers=get_header())
    parse_free(r_http.text)
Example #22
    def download(self, url):
        count = 0  # retry counter
        r = ''
        logger.info("downloading url: %s", url)
        ls_p = sqlHelper.select(count=10, conditions={'protocol': 1})
        while count < config.RETRY_TIME:
            if r == '' or (not r.ok) or len(r.content) < 500:
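                # use a random proxy from the pool when more than five rows are available, otherwise request directly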
                if len(ls_p) > 5:
                    choose = random.choice(ls_p)
                    proxies = {
                        "https": "http://%s:%s" % (choose.ip, choose.port)
                    }
                else:
                    proxies = {}
                try:
                    r = requests.get(url=url,
                                     headers=config.get_header(),
                                     timeout=config.TIMEOUT,
                                     proxies=proxies)
                    r.encoding = 'gbk'
                    count += 1
                except Exception:
                    count += 1

            else:
                return r.text
Example #23
def _checkHttpProxy(selfip, proxies, isHttp=True):
    types = -1
    speed = -1
    if isHttp:
        test_url = config.TEST_HTTP_HEADER
    else:
        test_url = config.TEST_HTTPS_HEADER
    try:
        start = time.time()
        r = requests.get(url=test_url,
                         headers=config.get_header(),
                         timeout=config.TIMEOUT,
                         proxies=proxies)
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
        sys.stdout.write("check result:" + str(r.text) + " \n")
        if r.ok:
            speed = round(time.time() - start, 2)
            content = json.loads(r.text)
            headers = content['headers']
            ip = content['origin']
            proxy_connection = headers.get('Proxy-Connection', None)
            if ',' in ip:
                types = 2
            elif proxy_connection:
                types = 1
            else:
                types = 0

            return True, types, speed
        else:
            return False, types, speed
    except Exception as e:
        return False, types, speed
Example #24
def html_Download(proxy, page):
    u"""
    爬数据 写入mysql.
    爬一次,sleep 1s。
    """
    url = "https://www.zhipin.com/c101210100/h_101210100/?query=%%E6%%95%%B0%%E6%%8D%%AE%%E5%%88%%86%%E6%%9E%%90&page=%s&ka=page-%s"\
            %(page, page )  # %s  %% 百分号
    p = re.compile(r'\n+| +', re.S)
    Findjob_detail_url = re.compile(r'href="/job_detail/(.*?)" ka', re.S)
    try:
        c = TABLEzhipinHelper()
        c.CreateTablezhipin()
        headers = get_header()

        source_code = requests.get(url, headers=headers, proxies=proxy)
        soup = BeautifulSoup(source_code.text)
        job_list = soup.findAll('div', 'job-primary')
        for job in job_list:
            detail = re.split(',', re.sub(p, ',', job.text.strip()))
            detail_url = 'https://www.zhipin.com/job_detail/' + re.findall(
                Findjob_detail_url, str(job))[0]
            detail.append(detail_url)
            c.Insertzhipin(detail)
        print 'page %s is done at %s' % (page, time.ctime())
        c.CloseCon()
        #time.sleep(1)
    except Exception as e:
        #traceback.print_exc(file = open(r'./html_Download_Error.log','a+'))
        print e
        pass
Example #25
def _checkHttpProxy(selfip, proxies, isHttp=True):
    types = -1
    speed = -1
    if isHttp:
        test_url = config.TEST_HTTP_HEADER
    else:
        test_url = config.TEST_HTTPS_HEADER
    try:
        start = time.time()
        r = requests.get(url=test_url,
                         headers=config.get_header(),
                         timeout=config.TIMEOUT,
                         proxies=proxies)
        if r.ok:
            speed = round(time.time() - start, 2)
            content = json.loads(r.text)
            headers = content['headers']
            ip = content['origin']
            x_forwarded_for = headers.get('X-Forwarded-For', None)
            x_real_ip = headers.get('X-Real-Ip', None)
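            # 'origin' containing our own IP (or several IPs) means the proxy leaks the caller, so reject it;
            # no X-Forwarded-For and no X-Real-Ip looks highly anonymous (0),
            # forwarding headers without our IP looks anonymous (1), otherwise transparent (2)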
            if selfip in ip or ',' in ip:
                return False, types, speed
            elif x_forwarded_for is None and x_real_ip is None:
                types = 0
            elif selfip not in x_forwarded_for and selfip not in x_real_ip:
                types = 1
            else:
                types = 2
            return True, types, speed
        else:
            return False, types, speed
    except Exception as e:
        return False, types, speed
Example #26
def p_get(url,data=None,num_retries=3):
	try:
		request_data=b''
		headers_default = config.get_header()
		if data:
			request_data =urllib.parse.urlencode(data).encode('utf-8')

		print("Dowbloading ...",url)
		req = urllib.request.Request(url, headers=headers_default,method="GET")
		response = urllib.request.urlopen(req,data=request_data)
		html=response.read()
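		# the body may be gzip-compressed; fall back to the raw bytes if decompression fails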
		try:
			html=gzip.decompress(html)
		except:
			pass
		chardit1 = chardet.detect(html)
		html=html.decode(chardit1['encoding'])
	except urllib.error.URLError as e:
		print(e)
		html=None
		if num_retries>0:
			if hasattr(e,'code') and 500<=e.code<600:
				return p_get(url,data=data,num_retries=num_retries-1)
	except Exception as e:
		html=None
		print("错误----->",e)
	return html
Example #27
    def mobile(self, mobile_num):

        result = ''
        headers = config.get_header()
        headers['Referer'] = 'http://m.ip138.com/mobile.html'

        format_data = {'mobile': mobile_num}
        try:
            res = requests.get(url=self.mobile_url,
                               params=format_data,
                               headers=headers,
                               timeout=config.TIMEOUT)
            #print(res.text)
        except requests.exceptions.Timeout:
            print('Request timed out. (timeout=%s)' % config.TIMEOUT)
            return 'Request timed out,please try again!'
        if res.status_code == 200:
            soup = etree.HTML(res.content)
            selector = soup.xpath('//table[@class="table"]/tr')
            er = soup.xpath(
                '//table[@class="table"]/tr/td[@colspan="2"]/text()')
            if len(er):
                result = er[0]
                #return result
            else:
                for table in selector:
                    td_key = table.xpath('./td/text()')[0]
                    td_var = table.xpath('./td/span/text()')[0]
                    #result[td_key] = td_var
                    result = result + td_key + ":" + td_var + '\n'
                #print(result)

        else:
            return 'Query failed, please try again!'
        return result
Example #28
 def crawlData(self, url):
     # Configure PhantomJS
     desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
     desired_capabilities["phantomjs.page.settings.userAgent"] = (
         config.get_header())
     # Don't load images; crawling pages is much faster
     desired_capabilities["phantomjs.page.settings.loadImages"] = False
     # Use the DesiredCapabilities (proxy settings) to open a new session id: roughly like clearing the browser cache and revisiting the url through the proxy
     proxy = webdriver.Proxy()
     proxy.proxy_type = ProxyType.MANUAL
     # proxy.http_proxy = random.choice(ips)
     # proxy.add_to_capabilities(desired_capabilities)
     # Open a PhantomJS browser with the configured capabilities
     # driver = webdriver.PhantomJS(executable_path=phantomjs_driver,desired_capabilities=desired_capabilities)
     driver = webdriver.PhantomJS(desired_capabilities=desired_capabilities)
     driver.start_session(desired_capabilities)
     # Implicit wait of 5 seconds; adjust as needed
     driver.implicitly_wait(5)
     # Set a page-load timeout, similar to requests.get()'s timeout option; driver.get() has no timeout option of its own
     # driver.get(url) has been seen to hang without returning or raising, which stalls the program; this timeout avoids that
     driver.set_page_load_timeout(20)
     # Script execution timeout
     driver.set_script_timeout(20)
     #browser = webdriver.Chrome('/home/caidong/developProgram/selenium/chromedriver')
     driver.get(url)
     driver.implicitly_wait(1)
     driver.find_element_by_xpath(
         '//div[@class="house-chat-phone"]').click()
     html = driver.page_source
     return html
Example #29
 def getType(self, proxies, url):
     types = -1
     speed = -1
     try:
         start = time.time()
         r = requests.get(url=url,
                          headers=config.get_header(),
                          timeout=config.TIMEOUT,
                          proxies=proxies)
         if r.ok:
             speed = round(time.time() - start, 2)
             content = json.loads(r.text)
             headers = content['headers']
             ip = content['origin']
             proxy_connection = headers.get('Connection', None)
             # print 'proxy_connection',proxy_connection
             if ',' in ip:
                 types = 2
             elif proxy_connection:
                 types = 1
             else:
                 types = 0
             return True, types, speed
         else:
             return False, types, speed
     except Exception as e:
         print 'error'
         return False, types, speed
Example #30
def baidu_check(selfip, proxies):
    '''
    Check whether the proxy can reach https://www.baidu.com; returns (protocol, types, speed).
    :param
    :return:
    '''
    protocol = -1
    types = -1
    speed = -1

    try:
        start = time.time()
        r = requests.get(url='https://www.baidu.com',
                         headers=config.get_header(),
                         timeout=config.TIMEOUT,
                         proxies=proxies)
        r.encoding = chardet.detect(r.content)['encoding']
        if r.ok:
            speed = round(time.time() - start, 2)
            protocol = 0
            types = 0

        else:
            speed = -1
            protocol = -1
            types = -1
    except Exception as e:
        speed = -1
        protocol = -1
        types = -1
    return protocol, types, speed
Example #31
def _checkHttpProxy(selfip, proxies, isHttp=True):
    types = -1
    speed = -1
    if isHttp:
        test_url = config.TEST_HTTP_HEADER
    else:
        test_url = config.TEST_HTTPS_HEADER
    try:
        start = time.time()
        r = requests.get(url=test_url,
                         headers=config.get_header(),
                         timeout=config.TIMEOUT,
                         proxies=proxies)
        if r.ok:
            speed = round(time.time() - start, 2)
            content = json.loads(r.text)
            headers = content['headers']
            ip = content['origin']
            proxy_connection = headers.get('Proxy-Connection', None)
            if ',' in ip:
                types = 2
            elif proxy_connection:
                types = 1
            else:
                types = 0

            return True, types, speed
        else:
            return False, types, speed
    except Exception as e:
        return False, types, speed
Example #32
def _checkHttpProxy(selfip, proxies, isHttp=True):
    types = -1
    speed = -1
    if isHttp:
        test_url = config.TEST_HTTP_HEADER
    else:
        test_url = config.TEST_HTTPS_HEADER
    try:
        start = time.time()
        r = requests.get(url=test_url, headers=config.get_header(), timeout=config.TIMEOUT, proxies=proxies)
        if r.ok:
            speed = round(time.time() - start, 2)
            content = json.loads(r.text)
            headers = content['headers']
            ip = content['origin']
            proxy_connection = headers.get('Proxy-Connection', None)
            if ',' in ip:
                types = 2
            elif proxy_connection:
                types = 1
            else:
                types = 0

            return True, types, speed
        else:
            return False, types, speed
    except Exception as e:
        return False, types, speed
Example #33
def baidu_check(selfip, proxies):
    '''
    Detect the type of a proxy. The information published on the free proxy
    sites turned out to be unreliable, so we check the proxy type ourselves.
    :param
    :return:
    '''
    protocol = -1
    types = -1
    speed = -1
    # try:
    #     # http://ip.chinaz.com/getip.aspx is fairly stable and can be used to check the IP
    #     r = requests.get(url=config.TEST_URL, headers=config.get_header(), timeout=config.TIMEOUT,
    #                      proxies=proxies)
    #     r.encoding = chardet.detect(r.content)['encoding']
    #     if r.ok:
    #         if r.text.find(selfip)>0:
    #             return protocol, types, speed
    #     else:
    #         return protocol,types,speed
    #
    #
    # except Exception as e:
    #     return protocol, types, speed
    try:
        start = time.time()
        r = requests.get(url='https://www.baidu.com', headers=config.get_header(), timeout=config.TIMEOUT, proxies=proxies)
        r.encoding = chardet.detect(r.content)['encoding']
        if r.ok:
            speed = round(time.time() - start, 2)
            protocol = 0
            types = 0

        else:
            speed = -1
            protocol = -1
            types = -1
    except Exception as e:
        speed = -1
        protocol = -1
        types = -1
    return protocol, types, speed