示例#1
0
def get_HtmlText(keyword):
    """
    Yield the response text of JD search result pages for a keyword.

    Pages 1 through 9 of ``https://search.jd.com/search`` are requested in
    order, each with a fresh value from ``get_proxies()``. A page whose
    request fails is reported with ``print`` and skipped, so fewer than nine
    items may be yielded.

    :param keyword: product keyword, sent as the ``keyword`` query parameter
    :return: generator of page bodies decoded as UTF-8
    """
    headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3719.400 QQBrowser/10.5.3715.400',
        'referer': 'https://www.jd.com/2019',
    }
    url = 'https://search.jd.com/search?'
    for i in range(1, 10):
        proxies = get_proxies()
        param = {
            'keyword': keyword,
            'enc': 'utf-8',
            'page': i,
        }
        try:
            # requests never times out by default; bound the wait so one
            # unresponsive proxy cannot stall the whole generator.
            r = requests.get(url,
                             params=param,
                             headers=headers,
                             proxies=proxies,
                             timeout=10)
            r.raise_for_status()
            r.encoding = 'utf-8'
            html = r.text
        except Exception as e:
            print(e)
            continue
        # Yield outside the try block so an exception raised into the
        # generator by its consumer is not mistaken for a request failure.
        yield html
示例#2
0
def getHtmlText(url, times=5):
    """
    Fetch ``url`` through a proxy, retrying on failure.

    Each attempt obtains a new value from ``get_proxies()``. After a failed
    attempt the function waits 3 seconds and tries again, up to ``times``
    retries; when retries are exhausted the last error is printed.

    :param url: address to request
    :param times: number of retries allowed after the first attempt
    :return: response body decoded as gb2312, or ``None`` if every attempt
        failed
    """
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3469.400',
        'Referer': "www.baidu.com",
        'Connection': 'close',
    }
    # NOTE(review): requests treats this mapping as cookie-name -> value, so
    # this sends one cookie literally named "cookie" - confirm that is the
    # intent rather than a raw Cookie header.
    cookies = {
        'cookie':
        'id=220793df29bd00a4||t=1554689493|et=730|cs=002213fd48ec54ab198b47c57e',
    }

    retries_left = times
    while True:
        proxies = get_proxies()
        try:
            # The previous version opened a Session here that was never used
            # for the request and never closed; the plain call is equivalent.
            # A timeout is set because requests otherwise waits indefinitely.
            r = requests.get(url,
                             headers=headers,
                             proxies=proxies,
                             cookies=cookies,
                             timeout=10)
            r.raise_for_status()
            r.encoding = 'gb2312'
            return r.text
        except Exception as e:
            if retries_left <= 0:
                print(e)
                return None
            print('重试....')
            time.sleep(3)
            retries_left -= 1
示例#3
0
def getHtmlText(url):
    """
    Fetch ``url`` through a proxy with a 10 second timeout.

    :param url: address to request
    :return: response body text, or ``None`` when the request fails (the
        error is printed)
    """
    request_options = dict(
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3704.400 QQBrowser/10.4.3587.400',
            'Referer': 'https://search.bilibili.com/',
        },
        proxies=get_proxies(),
        timeout=10,
    )
    try:
        response = requests.get(url, **request_options)
        response.raise_for_status()
        page = response.text
    except Exception as error:
        print(error)
        page = None
    return page
示例#4
0
def getHtmlText(url):
    """
    Fetch ``url`` through a proxy and decode the response as JSON.

    :param url: address to request
    :return: the parsed JSON body, or ``None`` when the request or the
        decoding fails (the error is printed)
    """
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3719.400 QQBrowser/10.5.3715.400'
    request_headers = {
        'user-agent': user_agent,
        'referer': 'https://music.163.com/',
    }
    proxy_map = get_proxies()
    try:
        resp = requests.get(url,
                            headers=request_headers,
                            proxies=proxy_map,
                            timeout=5)
        resp.raise_for_status()
        # Decoding stays inside the try so malformed JSON is reported too.
        return resp.json()
    except Exception as err:
        print(err)
    return None
def getHtmlText(url):
    """
    Fetch ``url`` through a proxy with a 3 second timeout.

    :param url: address to request
    :return: response body text, or ``None`` when the request fails (the
        error is printed)
    """
    proxy_map = get_proxies_with_headers = None  # placeholder removed below
    del get_proxies_with_headers
    browser_headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3505.400',
        'Referer': 'https://maoyan.com/board',
    }
    proxy_map = get_proxies()
    html = None
    try:
        reply = requests.get(url,
                             headers=browser_headers,
                             proxies=proxy_map,
                             timeout=3)
        reply.raise_for_status()
        html = reply.text
    except Exception as exc:
        print(exc)
    return html
示例#6
0
def getHtmlText(url):
    """
    Fetch ``url`` through a proxy and return the raw response body.

    :param url: address to request
    :return: the undecoded body as ``bytes`` (``r.content``), or ``None``
        when the request fails (the error is printed)
    """
    headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3719.400 QQBrowser/10.5.3715.400',
    }
    proxies = get_proxies()
    try:
        # requests never times out by default; bound the wait explicitly.
        r = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        r.raise_for_status()
        # r.content is the raw bytes, so no text encoding is selected here;
        # the former apparent_encoding lookup only cost a charset detection
        # pass without influencing the returned value.
        return r.content
    except Exception as e:
        print(e)
        return None
示例#7
0
def getHtmlText(url):
    """
    Fetch ``url`` through a proxy, sending the hard-coded cookie mapping.

    :param url: address to request
    :return: response body decoded as UTF-8, or ``None`` when the request
        fails (the error is printed)
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3505.400',
    }
    # NOTE(review): this looks like a captured login session committed to
    # source - consider loading it from configuration. Also, requests treats
    # the mapping as cookie-name -> value, so this sends a single cookie
    # named "cookie"; confirm that is what the site expects.
    cookies = {
        'cookie': '_ga=GA1.2.1409714415.1554703585; __gads=ID=a5304af261dcd4c2:T=1554703585:S=ALNI_MbEsW4VT55Z3ElVFspuBPVYP8Thwg; UM_distinctid=16a1ec92e0bca-04c3b65518f387-4d764416-100200-16a1ec92e0c138; __utma=226521935.1409714415.1554703585.1556871669.1556871669.1; __utmz=226521935.1556871669.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _gid=GA1.2.839933537.1557463307; .Cnblogs.AspNetCore.Cookies=CfDJ8JcopKY7yQlPr3eegllP76ONU4DW7YjCWy40gI6hf9myPV5ajk7NVExemnfstB1tbKJF2OyOqLT2GisTVjvl4bcF-mn-4aXJ52G2n2gLQBobFwxenWoyAOvcGa13hlH32dXrq_PuOyuhgHXcH2nby0WfC7cPTAQTYOdCy7YW805ZP5e3ZT3ALtIt_0_2Vft90hmllh3jNhNPMlCJPejJlz2EXsNhEyWS1KjFXxJ84fNM_zoF6xhU7Kj4sCn9vHhfg3B7SznY0qaTT5f0uCEeIhrD1gZn4YPIP-QBLG2JVoA6O1gkhLwcl-Vw-WweKjuAS0VbMVgo4J_bF9qAVqWuphAWls-CdFR-dqwcIHA-Sj3PFpiKh1givGqsthFY4Pf6lZyRhCNU_Aopac48afzKxU9Oi6S-zeVx0iy07UmG7CF5vXFH4DT_Dtkmsam_Yd4nkQ; .CNBlogsCookie=70A254D254EC0CB9FD627D2B7FE7A278AF297DD82AD19D643B5D263438C4195FAF708B086BE305CC124EDD70B8A126ABF076B337AE4F8CCEB35326582BAE6F7A0565B1554A33A5982F7E83150ED7CB57F9173534; _gat=1'
    }
    proxies = get_proxies()
    try:
        # requests never times out by default; bound the wait explicitly.
        r = requests.get(url,
                         headers=headers,
                         proxies=proxies,
                         cookies=cookies,
                         timeout=10)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except Exception as e:
        print(e)
        return None
示例#8
0
def getHtmlText(url):
    """
    Fetch the source of a web page through a proxy (5 second timeout).

    :param url: link of the page to request
    :return: page content as text, or ``None`` when the request fails (the
        error is printed)
    """
    request_kwargs = {
        'headers': {
            'user-agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3719.400 QQBrowser/10.5.3715.400',
            'Referer': 'https://www.dota2.com.cn/news/index.htm',
        },
        'proxies': get_proxies(),
        'timeout': 5,
    }
    try:
        page = requests.get(url, **request_kwargs)
        page.raise_for_status()
        # The encoding guessed by requests is used as-is.
        return page.text
    except Exception as failure:
        print(failure)
    return None
示例#9
0
    def getHtmlText(self, url):
        """
        Fetch the response body for ``url`` through a proxy.

        :param url: address to request
        :return: response text, or ``None`` when the request fails (the
            error is printed)
        """
        user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3704.400 QQBrowser/10.4.3587.400'
        request_headers = {
            'user-agent': user_agent,
            'Origin': 'https://lianjia.com',
        }
        proxy_map = get_proxies()
        body = None
        try:
            reply = requests.get(url,
                                 headers=request_headers,
                                 proxies=proxy_map,
                                 timeout=10)
            reply.raise_for_status()
            body = reply.text
        except Exception as err:
            print(err)
        return body
示例#10
0
def _strip_jsonp(text, callback):
    """Return the JSON payload wrapped in a ``callback(...);`` JSONP reply."""
    payload = text.strip()
    prefix = callback + '('
    if payload.startswith(prefix):
        payload = payload[len(prefix):]
        # Only the trailing wrapper is removed; a ');' occurring inside the
        # payload itself (e.g. in comment text) must be left untouched.
        if payload.endswith(');'):
            payload = payload[:-2]
        elif payload.endswith(')'):
            payload = payload[:-1]
    return payload


def get_comments(productId):
    """
    Fetch comment pages 0-9 for one product and store every comment.

    Each page is requested with a fresh value from ``get_proxies()``; the
    JSONP wrapper is removed, the payload is parsed as JSON and the entries
    under ``'comments'`` are passed to ``save_into_mongo`` via ``Pool.map``.
    A page that fails at any step is reported with ``print`` and skipped.

    :param productId: product identifier sent as the ``productId`` parameter
    :return: None
    """
    callback = 'fetchJSON_comment98vv1216'
    headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3719.400 QQBrowser/10.5.3715.400',
        'referer': 'https://www.jd.com/2019',
    }
    url = 'https://sclub.jd.com/comment/productPageComments.action?'
    # One pool serves all pages and is released on exit; creating a pool per
    # page without closing it left its workers behind.
    with Pool() as pool:
        for page in range(0, 10):
            proxies = get_proxies()
            param_data = {
                'callback': callback,
                'productId': productId,
                'score': 0,
                'sortType': 5,
                'page': page,
                'pageSize': 10,
                'isShadowSku': 0,
                'fold': 1
            }
            try:
                # requests never times out by default; bound the wait.
                response = requests.get(url,
                                        params=param_data,
                                        headers=headers,
                                        proxies=proxies,
                                        timeout=10)
                response.raise_for_status()
                data = json.loads(_strip_jsonp(response.text, callback))
                comments = data['comments']
                pool.map(save_into_mongo, comments)
            except Exception as e:
                print(e)
示例#11
0
def getHtmlText(url):
    """
    Fetch ``url`` through a proxy and return the body decoded as UTF-8.

    :param url: address to request
    :return: response text, or ``None`` when the request fails (the error is
        printed)
    """
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3469.400',
        'Referer': 'https://www.biqudao.com/bqge127585/7403388.html'
    }
    # The cookie mapping that used to be built here was never passed to the
    # request, so it has been dropped; what is sent is unchanged.
    proxies = get_proxies()
    try:
        # requests never times out by default; bound the wait explicitly.
        r = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except Exception as e:
        print(e)
        return None
示例#12
0
def getHtmlText(url, ipcount=3):
    """
    Fetch ``url`` through a proxy, switching proxy on "443" errors.

    When a request fails and the error text contains ``'443'``, a new value
    is taken from ``get_proxies()`` and the request is repeated, up to
    ``ipcount`` extra times. Any other error is printed and ends the call.

    :param url: address to request
    :param ipcount: how many additional proxies may be tried
    :return: response text of the first successful attempt, otherwise
        ``None``
    """
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3650.400 QQBrowser/10.4.3341.400',
        'referer': 'https://weixin.sogou.com/',
    }
    remaining = ipcount
    while True:
        proxies = get_proxies()
        try:
            r = requests.get(url, headers=headers, proxies=proxies, timeout=10)
            r.raise_for_status()
            return r.text
        except Exception as e:
            # NOTE(review): '443' presumably identifies HTTPS/proxy
            # connection failures - confirm against real error messages.
            if '443' not in str(e):
                print(e)
                return None
            if remaining <= 0:
                print('finish')
                return None
            # Looping (instead of the former recursive call whose result was
            # discarded) lets a successful retry reach the caller.
            remaining -= 1
示例#13
0
import getIpfromWeb
import getProxy
import time


def main():
    """
    Collect proxy IPs, verify them and persist the verified ones.

    The candidates from ``getIpfromWeb.getIp()`` are passed through
    ``getIpfromWeb.verifyip_multithread`` (``getIpfromWeb.verifyIP`` is the
    alternative that was left commented out), stored with
    ``data_persistence`` and finally printed.
    """
    candidates = getIpfromWeb.getIp()
    verified = getIpfromWeb.verifyip_multithread(candidates)
    getIpfromWeb.data_persistence(verified)
    print('crawl verified ip list is : ', verified)


if __name__ == '__main__':
    # Time the full run: crawl + verify, refresh the stored proxies, then
    # print one proxy obtained from getProxy.
    started = time.time()
    main()
    getIpfromWeb.refresh_db()
    print(getProxy.get_proxies())
    elapsed = time.time() - started
    print('用时:', elapsed)
    # Timings noted by the author:
    #   49.479830265045166  single-threaded
    #   28.486629486083984  multi-threaded
示例#14
0
import getProxy
import cssselect
from bs4 import BeautifulSoup
# Listing page this script starts from.
url = 'https://sx.fang.anjuke.com/loupan/yuechengqu/'

# Browser-like request headers used by getHtmlText below.
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3650.400 QQBrowser/10.4.3341.400',
    'Referer': 'https://sx.fang.anjuke.com/loupan/yuechengqu/',
}
# The cookie value was split across two physical lines inside one quoted
# string, which does not parse; it is now two adjacent literals that Python
# joins into the same single value.
# NOTE(review): nothing in the visible part of this file passes `cookies` to
# a request - confirm whether it is still needed.
cookies = {
    'cookie':
    ('ctid=66; aQQ_ajkguid=DE7D9768-0C31-4978-1D21-SX0428133507; sessid=1ACFCBE2-691C-D1FA-3461-SX0428133507; isp=true; lps=http%3A%2F%2Fuser.anjuke.com%2Fajax%2FcheckMenu%2F%3Fr%3D0.8617607606270921%26callback%3DjQuery1113003545128319064261_1556429737063%26_%3D1556429737064%7Chttps%3A%2F%2Fsx.fang.anjuke.com%2Floupan%2F%3Fpi%3Dbaidu-cpcaf-sx-tyongsx1%26kwid%3D94538308235; twe=2; Hm_lvt_c5899c8768ebee272710c9c5f365a6d8=1556429737; 58tj_uuid=bd822b30-9606-4160-b237-b5d4e97b0bee; init_refer=https%253A%252F%252Fwww.baidu.com%252Fbaidu.php%253Fsc.Ks000001qLT2daZnZ-PnF6VfdHyzXE-ljY0UrvaFUKIRtpg9KJcd9o4fAIs3sHFn5I15dnr1cxvmqRXRvA2Fo_GSHwoy0YExi11QSADpD8TcHS8v_A8rvHYqMoR84_ePK0LVZrcC5YafivARH99CJDgO8zUPHBM3mU7GkjXDVYUvWFedqTpq8v7GfSFBLV_Z5Bv_RPU8MjGjGACins.DR_NR2Ar5Od66Wl5Qy1hthm_8jViBaBeKWuB6e3L_g_3_AXZZa1o1kqAX9CIhWl2SEUsmhPOu83erPh1FkLQDkELIrOklXrH-JuIz3qis1f_XPMZBC0.U1Yk0ZDqzI1fzeXOEP_0mywkXHLFLPjQVeStvsK9uZ7Y5Hc0TA-W5HD0IjLFLPjQzVaLEe1U0A-V5HczPfKM5yq-TZnk0ZNG5yF9pywdUAY0TA-b5Hnz0APGujYzP1T0UgfqnH0kPdtknjD4g1DsnHPxn10knNt1PW0k0AVG5H00TMfqP16L0ANGujY0mhbqnW0Y0AdW5H6srHRsPHRkPNtknj0kg1c4rHR1nHczn1IxnH01g100TgKGujY1n6Kkmv-b5HcLPsKzuLw9u1Ys0A7B5HKxn0K-ThTqn6KsTjYs0A4vTjYsQW0snj0snj0s0AdYTjYs0AwbUL0qn0KzpWYs0Aw-IWdsmsKhIjYs0ZKC5H00ULnqn0KBI1Yz0A4Y5H00TLCq0A71gv-bm1dsTzdCu0KYIgnqnHR4nHTzrHR3Pj6LrHbsn104Pj60ThNkIjYkPHRvPjc4PWb4nHcd0ZPGujY4ujFbPWFWuj0snjRLnjb30AP1UHdDwHwKfbuawbuDfbPKPDnY0A7W5HD0TA3qn0KkUgfqn0KkUgnqn0KlIjYs0AdWgvuzUvYqn7tsg1Kxn7ts0Aw9UMNBuNqsUA78pyw15HKxn7tsg1nkn1R3rjuxn0Ksmgwxuhk9u1Ys0AwWpyfqn0K-IA-b5iYk0A71TAPW5H00IgKGUhPW5H00Tydh5HDv0AuWIgfqn0KhXh6qn0Khmgfqn0KlTAkdT1Ys0A7buhk9u1Yk0Akhm1Ys0AqY5H00ULFsIjYsc10Wc10Wn0KWThnqn1nLrjc%2526word%253D%2525E7%2525BB%25258D%2525E5%252585%2525B4%2525E6%252588%2525BF%2525E4%2525BB%2525B7%2526ck%253D3354.1.58.325.187.335.173.595%2526shh%253Dwww.baidu.com%2526sht%253D95697632_hao_pg%2526us%253D1.0.1.0.0.0.0%2526bc%253D110101; new_uv=1; wmda_uuid=796de203d230c5efa426350a8e8970bd; wmda_new_uuid=1; '
     'wmda_session_id_8788302075828=1556429737519-93e198a7-972b-71b9; wmda_visited_projects=%3B8788302075828; als=0; new_session=0; __xsptplusUT_8=1; __xsptplus8=8.1.1556429738.1556430628.14%232%7Cwww.baidu.com%7C%7C%7C%25E7%25BB%258D%25E5%2585%25B4%25E6%2588%25BF%25E4%25BB%25B7%7C%23%23Czq2qvFzGKNhXO64wbNMLCm0VCYmDeIi%23; Hm_lpvt_c5899c8768ebee272710c9c5f365a6d8=1556430628')
}

# Resolved once, when the module is loaded; every request reuses this value.
proxies = getProxy.get_proxies()


def getHtmlText(url):
    """
    Fetch ``url`` using the module-level ``headers`` and ``proxies``.

    The ``url`` parameter shadows the module-level ``url`` constant; only
    the argument is used.

    :param url: address to request
    :return: response body decoded as UTF-8, or ``None`` when the request
        fails (the error is printed)
    """
    try:
        # requests never times out by default; bound the wait explicitly.
        r = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except Exception as e:
        print(e)
        return None


def getInfo(html, fname):
    """
    Load the workbook at ``fname`` and parse ``html`` with BeautifulSoup.

    NOTE(review): in this excerpt the body stops after these two statements
    and nothing is returned - the remainder of the function is presumably
    missing here; confirm against the full source.

    :param html: HTML markup to parse
    :param fname: path of an existing workbook, opened with openpyxl
    """
    wb = openpyxl.load_workbook(fname)
    # 'html.parser' selects the parser bundled with the standard library.
    soup = BeautifulSoup(html, 'html.parser')
示例#15
0
import requests
import time
import random
import openpyxl
import sys
sys.path.append(
    r'C:\Users\Administrator\Desktop\python\Py_webSpider\github\IP_pond\Improve'
)
from getProxy import get_proxies

# Request headers read by getHTML below.
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3469.400',
    'Referer': "https://movie.douban.com/tv/"
}
# Resolved once, when the module is loaded; getHTML reuses this value.
proxies = get_proxies()
# Douban search_subjects endpoint for type=tv, 20 items starting at offset
# 20, sorted by "recommend"; the tag value is a percent-encoded Chinese word
# (meaning "popular").
url = 'https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=20'


def getHTML(url):
    """
    Fetch ``url`` using the module-level ``headers`` and ``proxies``.

    The ``url`` parameter shadows the module-level ``url`` constant; only
    the argument is used.

    :param url: address to request
    :return: response body decoded as UTF-8, or ``None`` when the request
        fails (the error is printed)
    """
    try:
        # requests never times out by default; bound the wait explicitly.
        r = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except Exception as e:
        print(e)
        return None


def getTVinfo(html, fname):
    try: