示例#1
0
    def parse_comment(self,
                      goodid,
                      callback=None,
                      meta=None,
                      keyword=None,
                      goodsname=None,
                      client=None):
        """Page through the comments of one product until switched off.

        Args:
            goodid: product id whose comments are being fetched.
            callback: per-page fetcher; defaults to self.comment_detail.
            meta, keyword, goodsname, client: passed through to the callback.
        """
        page = 0
        if callback is None:
            callback = self.comment_detail
        # comment_switch defaults to True for unknown goodids, so crawling
        # continues until something flips the switch for this product.
        while self.comment_switch.get(goodid, True):
            # Lazy %-style args: the message is only formatted if emitted.
            logging.info('Fetching %s comments, Page %s,begin', goodid, page + 1)
            callback(goodid=goodid,
                     page=page,
                     callback=self.comment_detail,
                     meta=meta,
                     keyword=keyword,
                     goodsname=goodsname,
                     client=client)
            logging.info('Fetching %s comments, Page %s,done', goodid, page + 1)
            page += 1

            # In update mode, look at no more than self.maxpage pages.
            if self.update_status and page >= self.maxpage:
                break
示例#2
0
 def __init__(self):
     """Open the MySQL connection described by the mysqlconfig mapping."""
     try:
         self.conn = pymysql.connect(**mysqlconfig)
     except Exception as exc:
         # A failed connect is logged, not raised; self.conn stays unset.
         logging.error(
             'Fatal Error :Mysql connect get an Fatal Error : %s' % exc)
     else:
         logging.info('Mysql connect success')
示例#3
0
 def __init__(self):
     """Connect to MongoDB and bind the eshop/reviews_JD6 collection."""
     try:
         client = pymongo.MongoClient(mongohost, int(mongoport))
         self.mconn = client
         self.db = client['eshop']
         self.collection = self.db['reviews_JD6']
     except Exception as exc:
         # Attributes may be missing afterwards if the connect failed.
         logging.error(
             'Fatal Error :Mongodb connect get an Fatal Error : %s' % exc)
     else:
         logging.info('Mongodb connect success!')
示例#4
0
 def insert(self, doc):
     """Insert one document (dict) or many (list/set/tuple) into the collection.

     Failures are logged and swallowed so the crawl keeps running; any
     other doc type is silently ignored (same as the original behavior).
     """
     try:
         if isinstance(doc, (list, set, tuple)):
             self.collection.insert_many(doc)  # bulk insert
         elif isinstance(doc, dict):
             self.collection.insert_one(doc)  # single document
     except Exception as e:
         # Was logging.info: a failed insert is an error, not progress info.
         logging.error('insert Failed,{}'.format(e))
示例#5
0
    def run(self):
        """Crawl search result pages for this keyword until self.switch turns off."""
        # Guard clause: a cleared lock means this keyword is already done.
        if not self.lock:
            print('{}爬取结束'.format(self.keyword2))
            return
        page = 0
        while self.switch:
            page += 1
            logging.info('%s %s: Page %s start..', self.goodsname, self.keyword, page)
            self.__search(page=page, callback=self.parse_comment)
            logging.info('%s %s: Page %s crawled..', self.goodsname, self.keyword, page)
            time.sleep(1)  # throttle between result pages

            print(self.keyword + '第%s页' % page)
        print('%s 爬取完成' % self.keyword)
示例#6
0
    def __search(self, page=1, callback=None, meta=None):
        """Fetch one half page of JD search results and dispatch matching goods.

        Args:
            page: 1-based index of the half page (JD's s_new.php endpoint
                  serves search results half a page at a time).
            callback: called as callback(goodid=..., callback=self.comment_detail)
                      for each good that passes all keyword filters.
            meta: unused here; kept for interface compatibility.

        Side effects: sets self.switch = False when a page comes back empty,
        and records dispatched goodids in self.setfilter for deduplication.
        """
        # Hard stop: never look past 200 half pages.
        if page >= 200:
            return
        url2 = 'https://search.jd.com/s_new.php'
        headers = {}
        headers['Referer'] = self.refer
        headers.update(self.headers)
        data2 = {
            'keyword': self.keyword,
            'enc': 'utf-8',
            'qrst': '1',
            'rt': '1',
            'stop': '1',
            'vt': '2',
            'wq': self.keyword,
            'page': page,
            's': (page - 1) * 30 + 1,  # absolute offset of the first item
            'scrolling': 'y',
            'log_id': time.time(),
            'tpl': '1_M',
        }
        try:
            proxy = get_one_proxy()
            proxies = {'http': proxy, 'https': proxy}
            resp = requests.get(url2,
                                params=data2,
                                headers=headers,
                                proxies=proxies)
        except Exception:
            logging.error('Fatal error:' + url2 + 'downloaded fail')
            return
        logging.info('status code : {}'.format(resp.status_code))
        result = resp.text
        html = etree.HTML(result)
        items = html.xpath(r'//li[@class = "gl-item"]')
        # An empty result page means we ran off the end: stop the crawl loop.
        if len(items) == 0:
            self.switch = False
        for item in items:
            temp_url = item.xpath(r'.//div[@class="p-img"]/a/@href')
            if not temp_url:
                continue
            found = re.findall(self.pattern, temp_url[0])
            if not found:
                continue
            goodid = found[0]

            # NOTE(review): original TODO — data completeness could be
            # improved here.

            res = etree.tostring(item)
            cod = chardet.detect(res).get("encoding")
            res = res.decode(cod)
            # Pattern marking an unwanted variant: keyword2 followed by a letter.
            reT = self.keyword2 + '[a-zA-Z]'

            # Strip the <font> highlight tags JD injects around search terms.
            res = re.sub(r'<font.+?>', '', res)
            res = re.sub(r'</font>', '', res)
            tres = etree.HTML(res)
            tres = tres.xpath(r'//a/em/text()')  # extract the item title
            if len(tres):
                res = tres[0]
            else:
                print('空')
                continue

            print(res)
            if re.search(reT, res, re.S):
                logging.info('Invalid Match ')
                continue
            if self.keyword2 not in res:
                continue
            if '显示器' not in res:
                continue
            logging.info('{}'.format(goodid))
            print(res)
            # Skip goods that were already crawled.
            if goodid in self.setfilter:
                continue
            self.setfilter.add(goodid)

            print(goodid)
            callback(goodid=goodid, callback=self.comment_detail)
示例#7
0
def jd():
    """Copy the scraped JD reviews in the global ``temp`` list into MySQL.

    Builds ``replace into eshop ... VALUES`` statements and executes them in
    chunks of 20000 rows so a single statement never grows too large.

    NOTE(review): values are spliced into the SQL string with hand-rolled
    quote escaping; parameterized queries via the driver would be safer, but
    the MysqlPipeline.run interface in view only accepts a full SQL string.
    """
    sqllist = []
    sql0 = 'replace into eshop (MD5_id,content,creationTime,score,referenceName,referenceTime,source) VALUES '
    for i, row in enumerate(temp):
        record = {
            "MD5_id": row.get("md5_id", None),
            "content": row.get("content", None),
            "creationTime": row.get("creationTime", None),
            "score": row.get("score", None),
            "referenceName": row.get("referenceName", None),
            "referenceTime": row.get("referenceTime", None),
            "source": row.get("source", None),
        }
        if record["MD5_id"] is None:
            logging.info("MD5_id is none:")
            continue
        # Guard against a missing content field: the original re.sub calls
        # raised TypeError when content was None.
        content = record["content"] if record["content"] is not None else ""
        content = re.sub(r"\'", "\\\'", content)
        content = re.sub(r'\"', '\\\"', content)
        record["content"] = content
        sqllist.append("({},{},{},{},{},{},{})".format(
            "'{}' ".format(record["MD5_id"]), "'{}' ".format(record["content"]),
            "'{}' ".format(record["creationTime"]), "{}".format(record["score"]),
            "'{}' ".format(record["referenceName"]),
            "'{}' ".format(record["referenceTime"]),
            "'{}' ".format(record["source"])))
        # Pause periodically so huge batches don't hammer the process.
        if i % 10000 == 0:
            print('#')
            time.sleep(1)

    # Execute in chunks of 20000 rows to keep each statement bounded.
    length = len(sqllist)
    chunk_size = 20000
    for first in range(0, length, chunk_size):
        last = first + chunk_size
        sql = sql0 + '\n,'.join(sqllist[first:last])
        mysqlconn = MysqlPipeline()
        try:
            s = mysqlconn.run(sql)
            print(s)
            if last > length:
                print('结束')  # final (short) chunk finished successfully
        except Exception as e:
            print(e)
        if last <= length:
            print(first, '~', last)