Пример #1
0
 def __init__(self):
     """Prepare the default request headers and an empty result mapping.

     ``self.headers`` holds the value returned by ``mfu.UserAgent().random()``
     as User-Agent plus a hard-coded cookie string; ``self.list`` starts as
     an empty dict.
     """
     user_agent = mfu.UserAgent()
     self.headers = {
         "User-Agent": user_agent.random(),
         # The cookie is a single header value. It is written as two adjacent
         # string literals (joined by the compiler) because a quoted string
         # cannot contain a raw line break.
         # NOTE(review): this looks like a captured browser session and has
         # presumably expired - confirm before relying on it.
         "Cookie": (
             "p_phone_400=4007-999-999; p_phone_level=0; p_global_phone=%2B0086-25-8685-9999; _tacau=MCw4NjkwMmRjMi0yMmMxLWFlZmUtNTc5NC1jNzRlYTliNmI0OTUs; _tact=MGZhZmNjMTgtZjZmNC04ODJlLTNiM2YtMTdhOThlMzVmM2Vj; _tacz2=taccsr%3D%28direct%29%7Ctacccn%3D%28none%29%7Ctaccmd%3D%28none%29%7Ctaccct%3D%28none%29%7Ctaccrt%3D%28none%29; _tacc=1; _ga=GA1.2.697330034.1618750045; _gid=GA1.2.1481901232.1618750045; PageSwitch=1%2C213612736; OLBSESSID=gus2q36s4pog0o2v4hsbec22a6; PcHomeVisit=1; smidV2=20210418204730d1a4357137884d6a8cfbb49e7a12f61100f9c9152ac53cc00; tuniu_partner=MTUyOTEsMCwsYzQ1YjA0MjRjZDliMzA4ZDU0NzE5Y2I2ODlhOWE5NzA%3D; _taca=1618750044531.1618750044531.1618758928733.2; _tacb=ZDAyZjNjMjgtMjJkZi01MjJmLWVlM2QtNTQzODI2MjRmMjUw; Hm_lvt_fe3fbe4228e14b1544525f058df92f91=1618750046,1618758937; isHaveShowPriceTips=1; Hm_lvt_51d49a7cda10d5dd86537755f081cc02=1618758988; Hm_lpvt_51d49a7cda10d5dd86537755f081cc02=1618759159; tuniuuser_ip_citycode=MTAwOA==; ssxmod_itna=eqfxBQDQitG=qiIx0dD=n2uDUgjDuADx=ppmW40H2eGzDAxn40iDt=PHOG4pDrmGT3Gs36AyfGoF1ebfPq/m8o33DU4i8DCMqoqTDeW=D5xGoDPxDeDADYo6DAqiOD7k=DEDmb8DaxDoDY362DitD4qDBzxdDKqGgbLhNbY=D2+ig7eUNDCGxbqDMCeGXY7W3rmWTTYXWKqGyzPGuATUnzwrDCO4YfYpzvGvXG0GsLGD1S7qeI7wzkDA4iELKfAdtFAmoRI+eDD3P0G44eD=; "
             "ssxmod_itna2=eqfxBQDQitG=qiIx0dD=n2uDUgjDuADx=ppmDnKSIheDskaiDLC=HWgjBtW=qnR5ssYBSyGPnKlgFwNsFWl5mKu/iqR5wHLp7Km=KLjOKxji9t03TYKQ4/LxLq93VqOYMtS1GSlKpTUV/hGR/+Usfffr9Rfknu5sRijk/3hKafhKakG0YlLXnwIwU2Tc4fWHl4c4BxA450=st+u0F4YBi+vrpYcQ0pEoE4YOIwvaV1RPDfuWoMAGqTUwG=jOoTyOPj4H3luV732BCpNqn8WI4gCIHwlO9D=mvEh17RLLChKXGGkX23jvpt6lQVAps=pXGIphiK6jM=0OjXdEG3coHYAWaA4xIUOY/g+fBqYCo97EtUNnBwZ6qtaKnBv5Oetg5nlE/lPUh0kIpyD31r3b1DHZ=UZjPEEj0jPam3B6+Z2wWhEP+kYcKWe8mUHFEvAf36G1mc73vznUppvXGnf/p0AmujLgWwHzXaRw8cploo90bICFUEAXWe=RF7lUipX3RRY/t=PURRAr6u63ttU397fPTDexvMpuMj7E3TX1uNAomrqOmXfeecnnl4Q4TGaZW=cDdX2KG4DQKbCGIUExbDOUq9W023q4U25UHk90XauxBnRv24jpHYi2+DnL05Cu=2UkgTsttx0/=iLTjPn=seF94IAHz0Oy0RiRag9=pNkLNe3Rn5rajxNxkPxUDaxGXuDt4D08DG7Hb47DC5x7d1GfB81+xYDD; __guid=84647874.3517961395735585300.1618759287150.3435; monitor_count=1; tuniuuser_citycode=MTkwMw%3D%3D; _pzfxsvpc=1254673435147030907%7C1618758933444%7C4%7Chttp%3A%2F%2Fwww.tuniu.com%2F%3Fp%3D15291%26utm_source%3D360%26utm_medium%3Dcpc%26utm_campaign%3DPP; _pzfxuvpc=1618750044912%7C1057153772146331711%7C5%7C1618759314866%7C2%7C6257891389600454241%7C1254673435147030907; Hm_lpvt_fe3fbe4228e14b1544525f058df92f91=1618759315; _gat=1; clickCache=%5B%7B%22key%22%3A1618759288172%2C%22url%22%3A%22https%3A%2F%2Fjn.tuniu.com%2F%22%2C%22pageName%22%3A%22%E5%BA%A6%E5%81%87%3A%E6%B5%8E%E5%8D%97%3A%E9%A6%96%E9%A1%B5%3Ajn%22%2C%22referer%22%3A%22%22%2C%22events%22%3A%5B%7B%22text%22%3A%22%E7%82%B9%E5%87%BB_%E5%B7%A6%E4%BE%A7%E5%AF%BC%E8%88%AA_%E4%B8%80%E7%BA%A7%E5%AF%BC%E8%88%AA_6_%E9%97%A8%E7%A5%A8%20%E7%8E%A9%E4%B9%90%22%2C%22x%22%3A400%2C%22y%22%3A333%2C%22lg%22%3A1618759888442%7D%5D%7D%5D"
         ),
     }
     self.list = {}
Пример #2
0
def download(
    url, num_retries=3, headers=None, cookie="", params="", reFlag=0, timeout=(30, 300),
):
    """Fetch ``url`` with ``requests.get`` and return the result selected by ``reFlag``.

    :param url: address to request.
    :param num_retries: how many more attempts to make when the server
        answers with a 5xx status.
    :param headers: optional request headers; the mapping is copied, so the
        caller's dict is never modified.
    :param cookie: when not empty, sent as the ``cookie`` header.
    :param params: query parameters passed through to ``requests.get``.
    :param reFlag: 0 returns the decoded text, 1 returns the raw bytes, any
        other value returns the response object.
    :param timeout: timeout passed through to ``requests.get``.
    :return: the selected value. Text and bytes are ``None`` when the request
        failed or the status was >= 400; the response object is ``None`` when
        no response was received at all.
    """
    # Work on a private copy: a shared default dict would carry the
    # user-agent and cookie of one call over into every later call.
    headers = dict(headers) if headers else {}
    # Header names are case-insensitive, so accept any spelling of User-Agent.
    if not any(key.lower() == "user-agent" for key in headers):
        headers["user-agent"] = mfua.UserAgent().random()
    if cookie != "":
        headers["cookie"] = cookie

    html = None
    content = None
    resp = None
    try:
        resp = requests.get(url, headers=headers, params=params, timeout=timeout)
    except requests.exceptions.Timeout:
        # Timeout is a subclass of RequestException, so it has to be caught first.
        print("请求超时!")
    except requests.exceptions.RequestException as e:
        print("Download error!!!")
        print(e)
    else:
        html = resp.text
        content = resp.content
        resp.close()
        if resp.status_code >= 400:
            print("Download error: ", resp.text)
            html = None
            content = None
            if num_retries and 500 <= resp.status_code < 600:
                # Retry with the same request settings, not the defaults.
                return download(
                    url,
                    num_retries - 1,
                    headers=headers,
                    cookie=cookie,
                    params=params,
                    reFlag=reFlag,
                    timeout=timeout,
                )

    if reFlag == 0:
        return html
    elif reFlag == 1:
        return content
    else:
        return resp
Пример #3
0
 def __init__(self):
     """Prepare request headers, the start URL and the empty crawl state.

     ``self.headers`` holds the value returned by ``mfu.UserAgent().random()``
     as User-Agent plus a hard-coded cookie string.
     """
     user_agent = mfu.UserAgent()
     self.headers = {
         # Reuse the generator created above instead of building a second one.
         "User-Agent": user_agent.random(),
         # The cookie is a single header value. It is written as two adjacent
         # string literals (joined by the compiler) because a quoted string
         # cannot contain a raw line break.
         # NOTE(review): this looks like a captured browser session and has
         # presumably expired - confirm before relying on it.
         "Cookie": (
             "QN1=00006700306c31a87bc0c086; QN99=7358; QunarGlobal=10.86.213.148_1147fecb_178ee43b1e9_2f84|1618905465741; _i=ueHd86MuqdAuqc1AU6CwyeJQnI1X; QN601=9d3319d52ed7a25675d2802fd36a5283; QN48=000080802f1031a87bd81b32; QN269=CED74FF0269211EB9D3EFA163E6279D9; fid=121d0b6b-732b-4505-a294-09125f85e731; QN243=10; __guid=11325109.3948886401476819500.1618905602258.9663; QN57=16189056035600.7784914755548995; QN71=\"NTguMTk0LjE2OS4xNTI65rWO5Y2XOjE=\"; QN63=%E6%B5%8E%E5%8D%97%E5%8A%A8%E7%89%A9%E5%9B%AD%7C%E6%96%B9%E7%89%B9%7C%E6%96%B9%E7%89%B9%5B%E6%B5%8E%E5%8D%97%5D%7C%E6%B5%8E%E5%8D%97%E6%96%B9%E7%89%B9%7C%E8%A5%BF%E5%AE%89%E8%B6%B5%E7%AA%81%E6%B3%89%7C%E8%A5%BF%E5%AE%89%E8%B6%B5%E7%AA%81%E6%B3%89%5B%E8%A5%BF%E5%AE%89%5D%7C%E8%B6%B5%E7%AA%81%E6%B3%89%5B%E8%A5%BF%E5%AE%89%5D%7C%E8%B6%B5%E7%AA%81%E6%B3%89%5B%E6%B5%8E%E5%8D%97%5D%7C%E6%B5%8E%E5%8D%97%E6%B5%8E%E5%8D%97%E6%96%B9%E7%89%B9%7C%E6%B5%8E%E5%8D%97%E6%B5%8E%E5%8D%97%E5%8A%A8%E7%89%A9%E5%9B%AD%7C%E5%8A%A8%E7%89%A9%E5%9B%AD%7C%E6%B5%8E%E5%8D%97%E8%B6%B5%E7%AA%81%E6%B3%89; QN67=192250%2C215559%2C215527%2C196094; QN300=auto_5187810b; QN205=auto_5187810b; QN277=auto_5187810b; csrfToken=xpmqALG4a1v9B6SPTPrrPvw1BIlPx9cW; _vi=Eu0g5Wp7dQrg0p1IjSzpGhsekm3dIncKdeqvEM6UEH53RJHZwZYwUg6-9Z7qIeQ-dY4mmbanXaCXddc1Qz5Ae-KTUP9FC9kkgd9aSVCWatKEmkPMVB0WtufBrsomWIvhB1sUu2D9pSNGvNQEi1Bqhq4-BugojwQXWHtoqZFk-fFF; QN163=0; QN6=auto_5187810b; Hm_lvt_15577700f8ecddb1a927813c81166ade=1618905605,1619009041; QN58=1619009039769%7C1619009268902%7C5; Hm_lpvt_15577700f8ecddb1a927813c81166ade=1619009269; QN271=66845d5b-78e4-422a-a910-8e3fa1b0012b; "
             "__qt=v1%7CVTJGc2RHVmtYMStaN3RCUnBkVmZ5enF0ZmdOZVdCcmUvU3pQazFZZkZjSkVlbW5qK3h1RWhidzJMNWtPeUFNWXJBOWE3bk9uZVM5QWNzUmN5cUdyTzFsSHZ4VlVJZFBuN1RDazVkMU03TjZpOGhnVVo4WWRpczAxMnkrbEhnOVc0L3Fabk9ZTUliRlFEbzNocHBwY25BdEJockpPdmQyYU5sS01NZmpna2cwPQ%3D%3D%7C1619009279624%7CVTJGc2RHVmtYMTlxME1tOVYvVHp4YjJRSm5LU3NNMjM0QVk3WWxmbE1TcVZ0eG5KRGZKSzQwb0ZkUDlZNjdQcGdBajk3a2d0UnZERnk5akZ5WWZRemc9PQ%3D%3D%7CVTJGc2RHVmtYMS9BSFJJa3RsMTNjUmd4VS85UmhiQjcycUE2ZFAxWlBxVXo2dThObmRJcGkwSjQ2MUU5alNhNENEaEFqUzZKNUdrMjloWTRiYjkvaWdkTzhlYkM5VjJEalhDaGZldThWZjV4WHN4N01BMDhvYlk3ZVdHSjZwQ3Z2Q0drayt6ays0a2xOMTdTdjRnK1NzNEIwSm9INEJOdC8yMGc2ZG1Hak4wK3J5ME5nWHU1N0hBWlpacVdjZDFwK04rNzVoUTl1alE2cWxqNXY4UGdpb0owMFBGOEtlU2hTZklqcWRRcEdwOVRCN280cVA0dFcvc2lmMlZXZGJwc09jbkk1K2Z5bjEyUk9nS0hBTm10RWQ2UmZjWW53cW1TRnFYRFdXUkc1bUhwWlpCc0wzQzlyVFNCTFo3S21oTDBaMm1NdkFhczRSWUJqc242MnhxbFJLTUt5YnM1QkVmc21KVVZxTXdweHU2N29qUzdRSW1WS0pHVElzNk1tQUQxbU9BQU1QYjBmTzBWLzh2anJBNjlacTAvbzQ4V0JTd2U2WWVBb1dSOUpQWk9DUmJvWlhXaGpJS1JXdWJWcE1LKytrblE5L1BoRDBBemNMWmduVXBiWWd2MTdhdTJ3eDRxUS9mV0g0TXEvdXFSV0xKT1ByT3FYdG9xbGF0L2loUmdzOHZ1bnFDay9rNVljUmdvTXhxVkJhSjJ4My91MXU5UGxPT3dKSUJHWTh5NTdmK3REWFAwSFhrbWpaUzlsTEx3WWZZMTNYbGViMTV5WjBGa2E5dDZsU1Y5bTJWVzF3VkFCMi92M2djamtmM3N4aE51SXFlVFNhVTR3NVBYNkhDSFcvTjM1U2xqRVg2WnNyZEY4dGE4VHhqcEtDdjdLZVRoenlQUWVZNno3NEtXbm1HTE9VVWtYSlZ2c0ZwZWs1YkdLSjc3Z1pKd3FUZ3pOcUdsRUlqVFhLWUpYUWhGMVhMbUtDandqS0h6NHJ1Z3RzNEErcGVJR3ZJNE5VdXBYWVFoWkFUaExzT1d5Z3lMOStYRUxNT2dzNjU5c1lIL2cvMVE2OHljZnJ5TlVYSmp3eVVWMFlBdlpFSFQzSWNkTDdDdw%3D%3D; JSESSIONID=7EC69B67C1BFEDFA17C8CE319CC33E4F; monitor_count=6; QN267=012553093731ef52626"
         ),
     }
     self.start_url = "https://piao.qunar.com/"
     self.cityList = {}
     self.spots = []
     self.tickets = []
     # Holds the scenic-spot ticket data.
     self.spotsInfo = {}
     self.name = '去哪儿网'
     self.done = False
Пример #4
0
 def __init__(self):
     """Build the default request headers and an empty ``self.list`` dict.

     The headers carry the value returned by ``mfu.UserAgent().random()`` and
     a hard-coded cookie string.
     """
     agent_source = mfu.UserAgent()
     random_agent = agent_source.random()
     session_cookie = "__jsluid_h=c19e9d181bb271865e1be7d61d799c67; mfw_uuid=60b8420e-95c4-eda2-bfc9-5e02d4fd4333; oad_n=a%3A3%3A%7Bs%3A3%3A%22oid%22%3Bi%3A1029%3Bs%3A2%3A%22dm%22%3Bs%3A15%3A%22www.mafengwo.cn%22%3Bs%3A2%3A%22ft%22%3Bs%3A19%3A%222021-06-03+10%3A44%3A30%22%3B%7D; uva=s%3A91%3A%22a%3A3%3A%7Bs%3A2%3A%22lt%22%3Bi%3A1622688271%3Bs%3A10%3A%22last_refer%22%3Bs%3A23%3A%22http%3A%2F%2Fwww.mafengwo.cn%2F%22%3Bs%3A5%3A%22rhost%22%3BN%3B%7D%22%3B; __mfwurd=a%3A3%3A%7Bs%3A6%3A%22f_time%22%3Bi%3A1622688271%3Bs%3A9%3A%22f_rdomain%22%3Bs%3A15%3A%22www.mafengwo.cn%22%3Bs%3A6%3A%22f_host%22%3Bs%3A3%3A%22www%22%3B%7D; __mfwuuid=60b8420e-95c4-eda2-bfc9-5e02d4fd4333; UM_distinctid=179cfc20b6734d-00221eee7872ec-2363163-144000-179cfc20b68465; __omc_chl=; _r=csdn; _rp=a%3A2%3A%7Bs%3A1%3A%22p%22%3Bs%3A49%3A%22blog.csdn.net%2Fu011291072%2Farticle%2Fdetails%2F81266372%22%3Bs%3A1%3A%22t%22%3Bi%3A1622824665%3B%7D; __mfwothchid=referrer%7Cblog.csdn.net; __omc_r=blog.csdn.net; __mfwc=referrer%7Cblog.csdn.net; bottom_ad_status=0; PHPSESSID=tt8tvp1bg1287j9t4fpd66rpn5; __mfwa=1622688269985.12455.3.1622824668000.1622879218167; __mfwlv=1622879218; __mfwvn=3; Hm_lvt_8288b2ed37e5bc9b4c9f7008798d2de0=1622688271,1622824670,1622879218; CNZZDATA30065558=cnzz_eid%3D1705560339-1622684260-http%253A%252F%252Fwww.mafengwo.cn%252F%26ntime%3D1622878977; __jsl_clearance=1622879888.257|0|MruEnfhwo70geWcDqzscOHYZ%2FGM%3D; Hm_lpvt_8288b2ed37e5bc9b4c9f7008798d2de0=1622879993; __mfwb=0155e7e63b27.4.direct; __mfwlt=1622880000"
     # Same two entries, in the same order, as a single dict literal would give.
     self.headers = {}
     self.headers["User-Agent"] = random_agent
     self.headers["Cookie"] = session_cookie
     self.list = {}
Пример #5
0
 def getHtml(self, url, host):
     """Fetch ``url`` with a fresh User-Agent and cookie and return the body text.

     :param url: address to request.
     :param host: currently unused; kept so existing callers keep working.
     :return: the response body decoded as UTF-8, or "" when the request
         fails, times out or answers with an HTTP error status.
     """
     self.headers['User-Agent'] = mfu.UserAgent().random()
     self.headers['Cookie'] = self.choseCookie()
     try:
         # A timeout keeps a stalled server from blocking the caller forever.
         resp = requests.get(url, headers=self.headers, timeout=30)
         resp.raise_for_status()
     except requests.RequestException:
         # Only network/HTTP failures are treated as "no content";
         # KeyboardInterrupt and programming errors still propagate.
         return ""
     resp.encoding = 'utf-8'
     return resp.text
Пример #6
0
 def getRHtml(self, url):
     """Fetch ``url`` with a fixed Referer and return the response object.

     :param url: address to request.
     :return: the ``requests`` response with its encoding set to UTF-8, or
         the empty string "" when the request fails, times out or answers
         with an HTTP error status. Callers must handle both types.
     """
     self.headers['User-Agent'] = mfu.UserAgent().random()
     self.headers['Referer'] = 'https://piao.qunar.com/'
     try:
         # A timeout keeps a stalled server from blocking the caller forever.
         resp = requests.get(url, headers=self.headers, timeout=30)
         resp.raise_for_status()
     except requests.RequestException:
         # Only network/HTTP failures are treated as "no content";
         # KeyboardInterrupt and programming errors still propagate.
         return ""
     resp.encoding = 'utf-8'
     return resp
Пример #7
0
 def getproxies(self):
     """Request one proxy address from the proxy API and build a proxies mapping.

     :return: a dict with ``'http'`` and ``'https'`` entries built from the
         address returned by the API, in the form ``requests`` accepts for
         its ``proxies`` argument.
     """
     # NOTE(review): the order id is embedded in the URL; consider moving it
     # into configuration.
     url = 'http://dps.kdlapi.com/api/getdps/?orderid=908407943957242&num=1&pt=1&sep=1'  # kuaidaili
     headers = {'User-Agent': my_fake_useragent.UserAgent().random()}
     # A timeout keeps a stalled API from blocking the caller forever.
     response = requests.get(url=url, headers=headers, timeout=30)
     # Strip surrounding whitespace so a trailing line break from the API
     # cannot end up inside the proxy URLs.
     proxy = response.content.decode('utf-8').strip()
     print("代理IP为:" + proxy)
     # NOTE(review): the 'https' entry uses an https:// proxy scheme; for a
     # plain HTTP proxy this presumably should be http:// - confirm.
     proxies = {
         'http': 'http://' + proxy,
         'https': 'https://' + proxy,
     }
     return proxies
Пример #8
0
def send_request(url):
    """Fetch ``url`` with a random User-Agent, a fixed cookie and a fixed Referer.

    :param url: address to request.
    :return: the response body decoded as UTF-8, or "" when the request
        fails, times out or answers with an HTTP error status.
    """
    headers = {
        'User-Agent': mfu.UserAgent().random(),
        'Cookie': '_qyeruid=CgIBAWC4PHp2qkJVmSkPAg==; new_uv=1; new_session=1; _guid=Rc6e04dd-9db6-cafc-afa0-e9515fac0d3f; ql_guid=QL5c19c9-1f38-4377-82a6-18242efa0235; source_url=https://www.qyer.com/; isnew=1622686857075; __utma=253397513.1025643824.1622686844.1622825628.1622888267.5; __utmc=253397513; __utmz=253397513.1622888267.5.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; PHPSESSID=23bf82872f55096d2a8720eb8dfeb705; city_browse=a:2:{i:0;i:50;i:1;i:51;}; ql_created_session=1; ql_stt=1622892667772; ql_vts=7; __utmt=1; __utmb=253397513.27.10.1622888267; ql_seq=27',
        'Referer': 'https://place.qyer.com/china/sight/',
    }
    try:
        # A timeout keeps a stalled server from blocking the caller forever.
        resp = requests.get(url, headers=headers, timeout=30)
        resp.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are treated as "no content";
        # KeyboardInterrupt and programming errors still propagate.
        return ""
    resp.encoding = 'utf-8'
    return resp.text
Пример #9
0
 def getHtml(self, url):
     '''
     Fetch a page and return its text content (the original note names the
     Huqiu page).

     :param url: address to request.
     :return: the response body decoded as UTF-8, or "" when the request
         fails, times out or answers with an HTTP error status.
     '''
     self.headers['User-Agent'] = mfu.UserAgent().random()
     self.headers['Cookie'] = '__jsluid_h=a36cc55b4bf978581c27b5a0c5c38ecd; __guid=186442287.2875075106880838700.1621242079265.626; mfw_uuid=60a230e2-caec-fed0-1f63-726051666864; oad_n=a%3A3%3A%7Bs%3A3%3A%22oid%22%3Bi%3A1029%3Bs%3A2%3A%22dm%22%3Bs%3A15%3A%22www.mafengwo.cn%22%3Bs%3A2%3A%22ft%22%3Bs%3A19%3A%222021-05-17+17%3A01%3A22%22%3B%7D; __mfwc=direct; uva=s%3A91%3A%22a%3A3%3A%7Bs%3A2%3A%22lt%22%3Bi%3A1621242084%3Bs%3A10%3A%22last_refer%22%3Bs%3A23%3A%22http%3A%2F%2Fwww.mafengwo.cn%2F%22%3Bs%3A5%3A%22rhost%22%3BN%3B%7D%22%3B; __mfwurd=a%3A3%3A%7Bs%3A6%3A%22f_time%22%3Bi%3A1621242084%3Bs%3A9%3A%22f_rdomain%22%3Bs%3A15%3A%22www.mafengwo.cn%22%3Bs%3A6%3A%22f_host%22%3Bs%3A3%3A%22www%22%3B%7D; __mfwuuid=60a230e2-caec-fed0-1f63-726051666864; UM_distinctid=179798efea715-0839cc2a865b56-376b4502-1fa400-179798efea82b1; bottom_ad_status=0; PHPSESSID=hgla0nidhpna2qfe5othdi0of2; __mfwa=1621242083107.22706.2.1621242083107.1621255595846; __mfwlv=1621255595; __mfwvn=2; Hm_lvt_8288b2ed37e5bc9b4c9f7008798d2de0=1621242085,1621255596; __omc_chl=; __omc_r=; __jsl_clearance=1621257482.651|0|EmCk5dxrQc0Gxh14q4LThKkHf1A%3D; __jsluid_s=4dd191ae77bb2389976da37e6fecfc4d; CNZZDATA30065558=cnzz_eid%3D1690025938-1621239949-http%253A%252F%252Fwww.mafengwo.cn%252F%26ntime%3D1621258179; __jsl_clearance_s=1621260980.181|0|Fzx2Pddtt2RW5Lq3Tv2JnNYOdIA%3D; monitor_count=53; __mfwb=b283048fa80a.29.direct; __mfwlt=1621261029; Hm_lpvt_8288b2ed37e5bc9b4c9f7008798d2de0=1621261030'
     self.headers['Referer'] = 'http://www.mafengwo.cn/mdd/'
     try:
         # A timeout keeps a stalled server from blocking the caller forever.
         resp = requests.get(url, headers=self.headers, timeout=30)
         resp.raise_for_status()
     except requests.RequestException:
         # Only network/HTTP failures are treated as "no content";
         # KeyboardInterrupt and programming errors still propagate.
         return ""
     resp.encoding = 'utf-8'
     return resp.text
Пример #10
0
 def getHtml(self, url):
     '''
     Fetch a page and return its text content.

     :param url: address to request.
     :return: the response body decoded as UTF-8, or "" when the request
         fails, times out or answers with an HTTP error status.
     '''
     self.headers['User-Agent'] = mfu.UserAgent().random()
     self.headers['Cookie'] = '_abtest_userid=128990d6-ec49-40cb-b25d-fc8452c3d8a1; _ga=GA1.2.179469688.1614864484; MKT_CKID=1614864484805.yk39i.z4vz; _RSG=r2q6zDxpRN1sq9uB0iKSXA; _RGUID=15dbcfb3-7d1b-40b5-a85c-52c00be09d36; _RDG=287a9b7a6689de2a903820b27712075311; ibulanguage=CN; ibulocale=zh_cn; cookiePricesDisplayed=CNY; _gid=GA1.2.435881939.1618908478; Union=AllianceID=5376&SID=130860&OUID=&createtime=1618908478&Expires=1619513277722; Session=smartlinkcode=U130860&smartlinklanguage=zh&SmartLinkKeyWord=&SmartLinkQuary=&SmartLinkHost=; MKT_CKID_LMT=1618908477918; MKT_Pagesource=PC; GUID=09031023413294183609; __utma=1.179469688.1614864484.1618917016.1618917016.1; __utmc=1; __utmz=1.1618917016.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _RF1=223.99.16.76; _jzqco=%7C%7C%7C%7C1618908478117%7C1.94961373.1614864484796.1618917699648.1618917762688.1618917699648.1618917762688.undefined.0.0.7.7; __zpspc=9.3.1618917018.1618917762.3%233%7Cwww.so.com%7C%7C%7C%7C%23; _bfa=1.1614864482896.3146qi.1.1618908639843.1618917016691.3.15.10650038368; _bfs=1.5; _bfi=p1%3D10650034475%26p2%3D10650034475%26v1%3D15%26v2%3D14; appFloatCnt=4; U_TICKET_SELECTED_DISTRICT_CITY={%22value%22:{%22districtid%22:1%2C%22districtname%22:%22%E5%8C%97%E4%BA%AC%22%2C%22isoversea%22:false%2C%22stage%22:%22selectedCity%22}%2C%22updateDate%22:1618919559657%2C%22createTime%22:1618919434692}'
     self.headers['Referer'] = 'https://piao.ctrip.com/ticket'
     try:
         # A timeout keeps a stalled server from blocking the caller forever.
         resp = requests.get(url, headers=self.headers, timeout=30)
         resp.raise_for_status()
     except requests.RequestException:
         # Only network/HTTP failures are treated as "no content";
         # KeyboardInterrupt and programming errors still propagate.
         return ""
     resp.encoding = 'utf-8'
     return resp.text
Пример #11
0
 def __init__(self):
     """Prepare request headers, the start URL, empty result containers and the cookie pool.

     ``self.cookie`` is a list of hard-coded cookie strings; ``self.index``
     starts at 0 and ``self.length`` is the number of entries in that list.
     """
     user_agent = mfu.UserAgent()
     self.headers = {
         "User-Agent":
         user_agent.random(),
         "Cookie":
         "_lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=167ffca928ec8-0e654d87ed4011-4d045769-100200-167ffca928ec8; __mta=210679722.1546184730073.1546184730073.1546184730073.1; _lxsdk=167ffca928ec8-0e654d87ed4011-4d045769-100200-167ffca928ec8; _ga=GA1.2.268215992.1546188830; _gid=GA1.2.2085490335.1546188830; mtcdn=K; lsu=; token2=k5KFzZOmjNtI4RXwSn-MBwHYS_QFAAAAqgcAAM17q21drlYFsEkrWY8nBciWgigr_vFCL5FDakc3B15Z318X6W3X_Dkc15OrK0yCPQ; u=646978641; n=XwR964951585; lt=k5KFzZOmjNtI4RXwSn-MBwHYS_QFAAAAqgcAAM17q21drlYFsEkrWY8nBciWgigr_vFCL5FDakc3B15Z318X6W3X_Dkc15OrK0yCPQ; ci=146; rvct=146%2C224%2C527%2C1114%2C1268%2C758%2C835%2C811%2C729%2C113%2C402; unc=XwR964951585; uuid=d927d5e7a70f4031900e.1546184723.2.0.0; client-id=03aeb51b-56e7-4809-b3a0-1fd44f5b4ea4; lat=40.74812; lng=107.400892; _lxsdk_s=16803187a83-b3c-b35-5ba%7C%7C171",
     }
     self.start_url = "http://t.dianping.com/citylist"
     self.store = ''
     self.discountList = []
     self.storeList = []
     # NOTE(review): these look like captured browser sessions and have
     # presumably expired - confirm before relying on them.
     self.cookie = [
         '__guid=169583271.2127607620695126000.1618218440470.8167; _lxsdk_cuid=175bae02240c8-09a81e2cb0072f-376b4502-1fa400-175bae02241c8; _lxsdk=175bae02240c8-09a81e2cb0072f-376b4502-1fa400-175bae02241c8; _hc.v=8cc5f797-6103-3af5-abdb-f8baea888bad.1618218444; s_ViewType=10; ua=dpuser_8342278047; ctu=221cf049b9b2e4851a051dc19e113dc7457d5ccf8b929510e97ec6f95a5b6125; cityid=22; default_ab=shopList%3AC%3A5; _lx_utm=utm_source%3Dso.com%26utm_medium%3Dorganic; fspop=test; cy=22; cye=jinan; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1618658692,1618741564,1618745811,1618825724; dper=965e5e629ab69e006a873be31a15f0f5530e016824b2ee5217dfdb51e726c29e64c0e8182775f9d4eec550fdf72032b4157151fbf06393277d8eca821d67b29d9a4e50782a937ce01e24c4e5c87978acd7d79976a6ac115a95a294891bc878b8; ll=7fd06e815b796be3df069dec7836c3df; uamo=15662652209; monitor_count=90; dplet=2ab439395a6417e1c18d480e2c28b2b8; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1618825850; _lxsdk_s=178e9883cb9-dee-7b-1c%7C%7C77',
         'fspop=test; __guid=169583271.2127607620695126000.1618218440470.8167; _lxsdk_cuid=175bae02240c8-09a81e2cb0072f-376b4502-1fa400-175bae02241c8; _lxsdk=175bae02240c8-09a81e2cb0072f-376b4502-1fa400-175bae02241c8; _hc.v=8cc5f797-6103-3af5-abdb-f8baea888bad.1618218444; s_ViewType=10; ua=dpuser_8342278047; ctu=221cf049b9b2e4851a051dc19e113dc7457d5ccf8b929510e97ec6f95a5b6125; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1618218444,1618316416,1618571673,1618658692; dper=965e5e629ab69e006a873be31a15f0f579a0845eb2b3c176d6f00668ca011e840b98b8a129bf16d49dedcc1a42f7785ec1709dbe52244a9360e9868811b109712cb170512f86402e4a51395e5b3d9ab1213fdfb136f210a870d092056f663db6; ll=7fd06e815b796be3df069dec7836c3df; uamo=15662652209; dplet=da5c43b3460109b9c3507790ceb00f4f; cy=22; cye=jinan; _lx_utm=utm_source%3Dso.com%26utm_medium%3Dorganic; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1618667476; _lxsdk_s=178df9386bd-1a-2e0-50%7C%7C672; monitor_count=41',
     ]
     self.index = 0
     # Derived from the list so the count cannot drift when cookies are
     # added or removed (it evaluates to 2 for the list above).
     self.length = len(self.cookie)
Пример #12
0
 def getRHtml(self, url):
     """Fetch ``url`` with Host, Referer, cookie and random User-Agent headers.

     :param url: address to request.
     :return: the response body decoded as UTF-8, or "" when the request
         fails, times out or answers with an HTTP error status.
     """
     referer = 'https://www.dianping.com/search/keyword/22/0_%E9%AB%98%E7%AC%AC%E8%A1%97'
     header = {
         'User-Agent': mfu.UserAgent().random(),
         'Cookie': self.choseCookie(),
         'Host': 'www.dianping.com',
         'Referer': referer,
     }
     try:
         # A timeout keeps a stalled server from blocking the caller forever.
         resp = requests.get(url, headers=header, timeout=30)
         resp.raise_for_status()
     except requests.RequestException:
         # Only network/HTTP failures are treated as "no content";
         # KeyboardInterrupt and programming errors still propagate.
         return ""
     resp.encoding = 'utf-8'
     return resp.text
Пример #13
0
 def getHtml(self, url):
     '''
     Fetch a page and return its text content.

     :param url: address to request.
     :return: the response body decoded as UTF-8, or "" when the request
         fails, times out or answers with an HTTP error status.
     '''
     self.headers['User-Agent'] = mfu.UserAgent().random()
     self.headers['Cookie'] = 'Hm_lvt_64941895c0a12a3bdeb5b07863a52466=1618908738,1621066788,1621430892; pt__search_from=channel=scenery&page=scenery-index; __guid=28211134.3027891932986968000.1621433416753.135; _tcudid_v2=Heduqn08MGjn_99CGYUUHN_6jp48JY7iaPNJI2jFoKg; qdid=-9999; 17uCNRefId=RefId=0&SEFrom=&SEKeyWords=; TicketSEInfo=RefId=0&SEFrom=&SEKeyWords=; CNSEInfo=RefId=0&tcbdkeyid=&SEFrom=&SEKeyWords=&RefUrl=; Hm_lvt_c6a93e2a75a5b1ef9fb5d4553a2226e5=1621066808,1621430903,1621479164; __tctma=144323752.1618908737536602.1618908737482.1621430890844.1621479162672.4; __tctmu=144323752.0.0; __tctmz=144323752.1621479162672.4.1.utmccn=(direct)|utmcsr=(direct)|utmcmd=(none); longKey=1618908737536602; __tctrack=0; indexTopSearchHistory=%5B%22%E6%96%B9%E7%89%B9%22%2C%22%E6%B5%8E%E5%8D%97%E6%96%B9%E7%89%B9%22%2C%22%E6%B5%8E%E5%8D%97%E6%B5%8E%E5%8D%97%E6%96%B9%E7%89%B9%22%2C%22%E4%B8%8A%E6%B5%B7%E8%BF%AA%E5%A3%AB%E5%B0%BC%22%2C%22%E9%9D%92%E5%B2%9B%E6%96%B9%E7%89%B9%22%2C%22%E6%B7%B1%E5%9C%B3%E6%96%B9%E7%89%B9%22%2C%22%E8%BF%AA%E5%A3%AB%E5%B0%BC%22%5D; wwwscenery=35a2356ff0780dcdca4b45e9a0cd10b0; ASP.NET_SessionId=o2tyazukx3zehx1eeivhuvaj; pagestate=1; Hm_lpvt_c6a93e2a75a5b1ef9fb5d4553a2226e5=1621479269; __tctmc=144323752.210364819; __tctmd=144323752.737325; __tctmb=144323752.1502026337417512.1621479203036.1621479266761.5; monitor_count=13'
     self.headers['Referer'] = 'https://so.ly.com/scenery?q=%E6%96%B9%E7%89%B9'
     try:
         # A timeout keeps a stalled server from blocking the caller forever.
         resp = requests.get(url, headers=self.headers, timeout=30)
         resp.raise_for_status()
     except requests.RequestException:
         # Only network/HTTP failures are treated as "no content";
         # KeyboardInterrupt and programming errors still propagate.
         return ""
     resp.encoding = 'utf-8'
     return resp.text
Пример #14
0
class JianshuspiderSpider(CrawlSpider):
    """Crawl spider that follows links matching ``/p/`` and yields one
    ``JianshuItem`` (title, author, content) per matched page."""

    name = 'jianshuspider'
    allowed_domains = ['jianshu.com']
    start_urls = ['http://jianshu.com/']
    # The User-Agent is drawn once, when the class body is evaluated.
    custom_settings = {"USER_AGENT": ua.UserAgent().random()}

    rules = (
        Rule(
            LinkExtractor(allow=r'.*?/p/.*?'),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Extract title, author and body text from ``response`` and yield an item."""
        # Article title.
        heading = response.xpath('//h1[@title]/text()').get()
        # Article author.
        author_xpath = '//div[@class]/a[@href]/span[@class]/text()'
        writer = response.xpath(author_xpath).get()
        # Article content: every text node under <article>, joined by spaces.
        body_text = ' '.join(response.xpath('//article//text()').getall())
        print(heading, writer, body_text)
        yield JianshuItem(title=heading, author=writer, content=body_text)
Пример #15
0
 def getRHtml(self, url):
     '''
     Fetch a page, sending an explicit Host header with the request.

     :param url: address to request.
     :return: the response body decoded as UTF-8, or "" when the request
         fails, times out or answers with an HTTP error status.
     '''
     referer = 'https://www.dianping.com/search/keyword/22/0_%E9%AB%98%E7%AC%AC%E8%A1%97'
     header = {
         'User-Agent': mfu.UserAgent().random(),
         'Cookie': self.choseCookie(),
         'Host': 'www.dianping.com',
         'Referer': referer,
     }
     try:
         # A timeout keeps a stalled server from blocking the caller forever.
         resp = requests.get(url, headers=header, timeout=30)
         resp.raise_for_status()
     except requests.RequestException:
         # Only network/HTTP failures are treated as "no content";
         # KeyboardInterrupt and programming errors still propagate.
         return ""
     resp.encoding = 'utf-8'
     return resp.text
Пример #16
0
# -*- coding: utf-8 -*-
'''
一个随机切换user_agent的第三方python库:my_fake_useragent
参考:https://www.cnblogs.com/zrmw/p/11433200.html
# pip install my_fake_useragent
'''
import my_fake_useragent as ua

if __name__ == '__main__':
    agent_pool = ua.UserAgent()

    # Approach 1: print ten user-agents drawn with random(). Per the original
    # note, each call is expected to give a different value.
    sample_size = 10
    for _ in range(sample_size):
        random_agent = agent_pool.random()
        print(random_agent)

    # Approach 2: get_useragent_list() returns the list of user-agents;
    # print each entry together with its position.
    every_agent = agent_pool.get_useragent_list()
    for position, agent in enumerate(every_agent):
        print(position, ': ', agent)
'''
# 常见 User-Agent 大全
参考:https://www.cnblogs.com/zrmw/p/9332801.html

window.navigator.userAgent
 
1) Chrome
Win7:
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1
 
2) Firefox
Пример #17
0
# -*- coding:utf-8 -*-
# @Time:     2021/2/12 22:54
# @Author:   Top Programmer - Hacker(Administrator)
# @Software: PyCharm
import requests
from lxml import etree
import my_fake_useragent as mfu

# Request headers: the page refers to itself and uses a random User-Agent.
headers = {}
headers["Referer"] = "http://bitpush.news/covid19/"
headers["User-Agent"] = mfu.UserAgent().random()

# Page HTML, downloaded once when the module is loaded.
gets = requests.get(url="http://bitpush.news/covid19/", headers=headers).text


def x_paths(ps):
    """Parse the downloaded page held in ``gets`` and return the XPath result for ``ps``."""
    return etree.HTML(gets).xpath(ps)
Пример #18
0
 def getheaders(self):
     """Return a headers dict whose only entry is a random ``user-agent``."""
     random_agent = my_fake_useragent.UserAgent().random()
     header_map = {'user-agent': random_agent}
     return header_map
Пример #19
0
 def __init__(self, quality, url, hash):
     """Store the constructor arguments on the instance and create a ``ua.UserAgent``.

     :param quality: stored as ``self.quality``.
     :param url: stored as ``self.url``.
     :param hash: stored as ``self.hash``.
     """
     self.url, self.hash, self.quality = url, hash, quality
     agent_factory = ua.UserAgent()
     self.user_agent = agent_factory
Пример #20
0
 def process_request(self, request, spider):
     """Give ``request`` a random User-Agent header and route it through ``proxyServer``.

     :param request: the request whose headers and meta are updated in place.
     :param spider: not used here.
     """
     random_agent = my_fake_useragent.UserAgent().random()
     request.headers["User-Agent"] = random_agent
     proxy_address = proxyServer
     request.meta["proxy"] = proxy_address