Exemplo n.º 1
0
 def make_request(self, seed):
     """Build the request description for page 0 of a product's comments.

     Args:
         seed: object whose ``value`` attribute is substituted into the
             ``productId`` query parameter and into the ``Referer`` header.

     Returns:
         dict with the keys ``url``, ``encoding``, ``method``, ``proxies``
         and ``headers``.
     """
     product_id = seed.value
     comment_url = "https://club.jd.com/comment/skuProductPageComments.action?callback=fetchJSON_comment98&productId={0}&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1".format(
         product_id)

     # A proxy is drawn at random for each scheme on every call.
     http_proxy = random.choice(HttpProxy.getHttpProxy())
     https_proxy = random.choice(HttpProxy.getHttpsProxy())

     headers = {
         'Host': 'club.jd.com',
         'Connection': 'close',
         'User-Agent':
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
         'Referer': 'https://item.jd.com/{0}.html'.format(product_id),
     }
     return {
         "url": comment_url,
         "encoding": "gbk",
         "method": "get",
         "proxies": {"http": http_proxy, "https": https_proxy},
         "headers": headers,
     }
Exemplo n.º 2
0
 def __init__(self, **kwargs):
     """Set up the spider and enqueue one seed per distinct brand record.

     Keyword arguments are forwarded unchanged to the parent initialiser.
     Records are read from the "jingdong" database, from the collection
     that ``get_lasted_collection`` returns for names of the form
     ``brand20`` followed by six digits. Only documents with non-null
     ``cate_id`` and ``brand_id`` and ``_status == 0`` are selected.

     NOTE(review): ``self.seeds_queue`` is not created here; presumably
     the parent initialiser provides it - confirm.
     """
     super(GetProductId, self).__init__(**kwargs)
     self.retries = 3
     self.proxies = HttpProxy.getHttpsProxy()
     self.ua = UserAgent()

     # Three consecutive $match stages: each id must be non-null, then
     # the document must still be unprocessed (_status == 0).
     pipeline = [{"$match": {field: {"$ne": None}}}
                 for field in ("cate_id", "brand_id")]
     pipeline.append({"$match": {"_status": 0}})

     with op.DBManger() as manager:
         brand_collection = manager.get_lasted_collection(
             "jingdong",
             filter={"name": {"$regex": r"^brand20\d\d\d\d\d\d$"}})
         records = manager.read_from(
             db_collect=("jingdong", brand_collection),
             out_field=("cate_id", "brand_id", "name"),
             pipeline=pipeline)
         for record in collections.DataSet(records).distinct():
             self.seeds_queue.put(
                 Seed(value=record, retries=self.retries, type=0))

     # Patterns applied to fetched pages; the attribute names (including
     # their spelling) are kept because other code may refer to them.
     self.first_pettern = re.compile(r"search000014_log:{wids:'([,\d]*?)',")
     self.skuids_pettern = re.compile(r'{.*?"skuId":(\d+).*?}')
     self.totalpage_perttern = re.compile(
         r'<div id="J_topPage"[\s\S]*?<b>\d+</b><em>/</em><i>(\d+)</i>')
Exemplo n.º 3
0
               "Referer": "https://item.m.jd.com/72321801855.html",
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
               #"cookie":"shshshfpa=230b299e-b267-3f39-748a-5274ba04573e-1526388430; shshshfpb=0e4a63e00c3146f1205679ecef0af468fb452b7038a3edfd15afad6d12; __jdu=1595213903005116662704; pin=jd_49e6f74229a5c; unick=jd_188014ctk; _tp=f8skMf7S6k8VPMVjjhCn8S7vk6UqsuFMW8o68xx3ddc%3D; _pst=jd_49e6f74229a5c; ipLocation=%u5317%u4eac; pinId=CL2LG1jQi0fBGlwodztkXrV9-x-f3wj7; unpl=V2_ZzNtbRAFShd8AUZWfk0IB2JTRwgSBxBBfAtGUHseXFFkCxINclRCFnQURldnG1wUZwQZWUNcRhJFCEdkeB5fA2AFEFlBZxBFLV0CFi9JH1c%2bbRpdS1BKFnQLRlZLKV8FVwMTbUJTSxF2CERcehtdBGMDElpFUEATdA12ZHwpbDVjCxVUQVdzFEUJdhYvRVsNbwAaWw9XRx1xC0ZWcxheBGYHEl1FUEQWcwlDZHopXw%3d%3d; __jdv=76161171|direct|-|none|-|1609750532540; TrackID=14z86bRECmD_c8hnyUzWqPbiv0pHgxgGJ0tgMH9b8UmBPkuTndrN5VhNCH5t8h3LTmlYuJbzhHXbftdRKDtXKBnPgOEXXzqhXAH9ZY-6s5MAR2ncnCvnEbToPqbFrYgEt; user-key=43a6e8ea-993d-49e9-8763-4e756d81ae6f; cn=0; PCSYCityID=CN_110000_110100_110105; areaId=1; ipLoc-djd=1-72-55653-0; wxa_level=1; jxsid=16109508048628837173; webp=1; visitkey=31014972499970792; __jda=122270672.1595213903005116662704.1595213903.1610948455.1610955353.123; __jdc=122270672; 3AB9D23F7A4B3C9B=HV7XTTHFGASMIJRSRKK34KLHMYELLS47K4NBCIR2PEFYCZUMIX225JHQCMEJUTEKYFDA47E3QEMFC3TYKKQRYXFS2Q; shshshfp=8e6807b1ccf37dd2a527f63ee133d3e6; shshshsID=48908f8b4d08dd6a4ad6ea045c548f30_2_1610955836722; wq_logid=1610955927.1063071573; retina=1; cid=9; wqmnx1=MDEyNjM4NHMubXQxMzQyL25yOzVNQUszTEdoLjFsaTFzZjQyRUgmUg%3D%3D; __jdb=122270672.12.1595213903005116662704|123.1610955353; mba_muid=1595213903005116662704; mba_sid=16109559266537842608963232693.1"
           }}
# Regular expressions used to pull fields out of fetched page/API text.

# Captures the bracketed list that follows "navThird1: " .. "navThird9: ".
cate_pattern = re.compile(
    r'navThird[1-9]: (\[.*\])'
)

# Captures (data-sku digits, <em> text) for each "gl-item" <li>; the <em>
# text must not begin with the yen sign.
cate_pattern1 = re.compile(
    r'<li data-sku="(\d+)"[\s\S]*?class="gl-item">'
    r'[\s\S]*?<em>([^¥][\s\S]*?)</em>'
    r'[\s\S]*?</li>'
)

# Captures the comma-separated digits in search000014_log:{wids:'...',
first_pettern = re.compile(
    r"search000014_log:{wids:'([,\d]*?)',"
)

# Captures from the first "[" after "comments": through the last "]".
comments_pattern = re.compile(
    r'"comments":[\s\S]*?(\[[\s\S]*\])'
)

# Captures the digits of a quoted "CommentCount" value.
allcnt_pattern = re.compile(
    r'"CommentCount": \"(\d+)\",'
)
import json
import time
from ast import literal_eval
import json
from fake_useragent import UserAgent
from multiprocess.core import HttpProxy
# Scratch driver: repeatedly fetch `request` and print the comment count.
proxies = HttpProxy.getHttpsProxy()

# Per-proxy counters, all starting at zero; nothing in this fragment
# updates them afterwards.
countlist = dict.fromkeys(proxies, 0)
countlisttimeout = dict(countlist)

ua = UserAgent()
#,proxies={"https": "https://*****:*****@192.168.0.71:3128","http": "http://*****:*****@192.168.0.71:3128"}
for i in range(100000):
    # Fixed half-second pause before every request.
    time.sleep(0.5)
    src = requests.get(**request)
    print(src.headers)
    print(allcnt_pattern.findall(src.text))
# first_pettern = re.compile(r"search000014_log:{wids:'([,\d]*?)',")
# shopid_pettern = re.compile(r'shopId:\'(\d*)\',')
# venderid_pettern = re.compile(r'venderId:(\d*),')
Exemplo n.º 4
0
    # Entry-point body: build the job configuration and run the spider.
    import logging

    current_date = timeUtil.current_time()
    # NOTE(review): presumably stops earlier runs of this same script
    # before a new one starts - confirm against process_manger.
    process_manger.kill_old_process(sys.argv[0])

    # Results go to a collection whose name carries the run's date stamp.
    mongo_config = {
        "addr": "mongodb://192.168.0.13:27017",
        "db": "jicheng",
        "collection": "comment" + current_date,
    }
    # The log file sits next to the script: "<script path>.logging".
    log_config = {
        "level": logging.DEBUG,
        "filename": sys.argv[0] + '.logging',
        "filemode": 'a',
        "format":
        '%(asctime)s - %(filename)s - %(processName)s - [line:%(lineno)d] - %(levelname)s: %(message)s',
    }
    new_config = {
        "job_name": "jdcomment",
        "spider_num": 1,
        "retries": 3,
        "rest_time": 5,
        "complete_timeout": 1 * 60,
        "seeds_file": "resource/month202007",
        "dateindex": current_date,
        "mongo_config": mongo_config,
        "log_config": log_config,
        "proxies_pool": HttpProxy.getHttpsProxy(),
    }

    spider = GetComment(**new_config)
    spider.main_loop(show_process=True)