Example #1
 def init_start_urls(self):
     self.redis.delete(self.start_urls_redis_key)
     self.redis.delete(self.items_redis_key)
     buffer = []
     buffer_size = 1024
     with op.DBManger() as m:
         pipeline = [
             {
                 "$match": {
                     "_status": 3
                 }
             },
         ]
         data_set = collections.DataSet(
             m.read_from(db_collect=("jingdong", self.last_retry_collect),
                         out_field=("_seed", "_status"),
                         pipeline=pipeline))
         should_exit = True
         for i, (seed, status) in enumerate(data_set.distinct()):
             should_exit = False
             seed = Seed(value=seed, type=3)
             buffer.append(str(seed))
             if len(buffer) >= buffer_size:
                 self.redis.sadd(self.start_urls_redis_key, *buffer)
                 buffer = []
         if buffer:
             self.redis.sadd(self.start_urls_redis_key, *buffer)
         if should_exit:
             import sys
             sys.exit(0)
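
The buffered flush at the end of Example #1 is a general pattern for loading a large seed set into Redis without issuing one SADD per element. A minimal standalone sketch, assuming a plain redis-py client; the function name, key name, batch size and seed generator here are illustrative, not taken from the example:

import redis

def sadd_in_batches(client, key, seeds, batch_size=1024):
    # Accumulate seeds and issue one SADD round trip per full batch.
    buffer = []
    for seed in seeds:
        buffer.append(seed)
        if len(buffer) >= batch_size:
            client.sadd(key, *buffer)
            buffer = []
    if buffer:  # flush the final partial batch
        client.sadd(key, *buffer)

client = redis.Redis()
sadd_in_batches(client, "start_urls", (str(i) for i in range(10000)))
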
Example #2
 def __init__(self, seeds_file, **kwargs):
     super(GetBrands, self).__init__(**kwargs)
     self.ua = UserAgent()
     with open(seeds_file) as infile:
         data_set = collections.DataSet(infile)
         for i, seed in enumerate(data_set.map(lambda line: line.strip('\n').split("\t")[0].replace('-', ','))
                                          .shuffle(1024)):
             self.seeds_queue.put(Seed(seed, kwargs["retries"]))
     self.pattern = re.compile(r'<li id="brand-(\d+)[\s\S]*?品牌::([\s\S]*?)\'\)"')
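
The map lambda in Example #2 turns each tab-separated line of the seeds file into a comma-joined id string. A quick illustration with a made-up input line (the seeds-file layout is an assumption inferred from the code):

line = "1234-5678\tbrand-name\n"
seed = line.strip('\n').split("\t")[0].replace('-', ',')
print(seed)  # "1234,5678"
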
Example #3
 def __init__(self, seeds_file, dateindex, **kwargs):
     super(GetComment, self).__init__(**kwargs)
     self.ua = UserAgent()
     with open(seeds_file) as infile:
         data_set = collections.DataSet(infile)
         for i, seed in enumerate(
                 data_set.map(lambda line: line.strip('\n').split("\t")[0]).
                 shuffle(2048)):
             self.seeds_queue.put(Seed(seed, kwargs["retries"]))
     self.allcnt_pattern = re.compile(r'"commentCount":(\d+),')
     self.dateindex = dateindex
Example #4
 def __init__(self, seeds_file, **kwargs):
     super(GetBrands1, self).__init__(**kwargs)
     self.proxies = list(
         map(lambda x: ("http://u{}:[email protected]:3128".format(x)),
             range(28)))
     self.ua = UserAgent()
     with open(seeds_file) as infile:
         data_set = collections.DataSet(infile)
         for i, seed in enumerate(
                 data_set.map(lambda line: line.strip('\n').split("\t")[0].
                              replace('-', ',')).shuffle(1024)):
             self.seeds_queue.put(Seed(seed, kwargs["retries"]))
     self.pattern = re.compile(r'"id":.*?"name":".*?"')
Example #5
 def __init__(self, seeds_file, dateindex, **kwargs):
     super(GetComment1, self).__init__(**kwargs)
     self.proxies = list(
         map(lambda x: ("http://u{}:[email protected]:3128".format(x)),
             range(28)))
     self.ua = UserAgent()
     with open(seeds_file) as infile:
         data_set = collections.DataSet(infile)
         for i, seed in enumerate(
                 data_set.map(lambda line: line.strip('\n').split("\t")[0]).
                 shuffle(2048)):
             self.seeds_queue.put(Seed(seed, kwargs["retries"]))
     self.allcnt_pattern = re.compile(r'"CommentCount": "(\d+)"')
     self.dateindex = dateindex
Example #6
 def __init__(self, **kwargs):
     super(GetProductId, self).__init__(**kwargs)
     self.retries = 3
     self.proxies = HttpProxy.getHttpProxy()
     self.ua = UserAgent()
     with op.DBManger() as m:
         last_brand_collect = m.get_lasted_collection(
             "jingdong",
             filter={"name": {
                 "$regex": r"^brand20\d\d\d\d\d\d$"
             }})
         pipeline = [{
             "$match": {
                 "cate_id": {
                     "$ne": None
                 }
             }
         }, {
             "$match": {
                 "brand_id": {
                     "$ne": None
                 }
             }
         }, {
             "$match": {
                 "name": {
                     "$ne": None
                 }
             }
         }, {
             "$match": {
                 "_status": 0
             }
         }]
         data_set = collections.DataSet(
             m.read_from(db_collect=("jingdong", last_brand_collect),
                         out_field=("cate_id", "brand_id", "name"),
                         pipeline=pipeline))
         for i, seed in enumerate(data_set.distinct()):
             self.seeds_queue.put(
                 Seed(value=seed, retries=self.retries, type=0))
     self.first_pettern = re.compile(r"search000014_log:{wids:'([,\d]*?)',")
     self.skuids_pettern = re.compile(r'{.*?"skuId":(\d+).*?}')
     self.totalpage_perttern = re.compile(
         r'<div id="J_topPage"[\s\S]*?<b>\d+</b><em>/</em><i>(\d+)</i>')
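
The four consecutive $match stages in Example #6 can be collapsed into a single $match, since conditions listed in one stage are implicitly ANDed; an equivalent, more compact pipeline would look like this:

pipeline = [{
    "$match": {
        "cate_id": {"$ne": None},
        "brand_id": {"$ne": None},
        "name": {"$ne": None},
        "_status": 0,
    }
}]
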
Example #7
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from multiprocess.tools import timeUtil, collections
from mongo import op

print(timeUtil.getdate(0, format="%Y-%m%d"))
print(timeUtil.current_time())
dt = collections.DataSet([1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6])
for i in dt.shuffle(
        buffer_size=3).map(lambda x: x * 2).map(lambda x: x + 1).distinct():
    print(i)
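# Assuming shuffle(buffer_size=3) is a streaming shuffle and distinct()
# drops repeated values, the loop above prints each odd number from 3 to 19
# exactly once, in an order that depends on the shuffle buffer.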

with op.DBManger() as m:
    last_brand_collect = m.get_lasted_collection(
        "jingdong", filter={"name": {
            "$regex": r"^brand20\d\d\d\d\d\d$"
        }})
    pipeline = [{
        "$match": {
            "cate_id": {
                "$ne": None
            }
        }
    }, {
        "$match": {
            "brand_id": {
                "$ne": None
            }
        }
    }, {
        "$match": {