Example #1
def make_url(asin, url_type='goods', urltitle='', sessionId=''):
    url_tuple = tuple()
    if url_type == 'goods':
        url, referer = GoodsParser.make_goods_url(asin,
                                                  urltitle=urltitle,
                                                  sessionId=sessionId)
        url_tuple = (url, referer)
    elif url_type == 'reviews':
        url = ReviewsParser.make_reviews_url(asin, urltitle=urltitle)
        url_tuple = (url, )
    elif url_type == 'tosell':
        url = TosellParser.make_tosell_url(asin)
        url_tuple = (url, )
    return url_tuple
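
A quick usage sketch (not part of the original example): the shape of the returned tuple depends on url_type: two elements with a referer for 'goods', one element for 'reviews' and 'tosell', and an empty tuple if no branch matches. GoodsParser, ReviewsParser and TosellParser are the project's own helpers and are assumed to be importable; the ASIN below is a made-up placeholder.

goods_url, referer = make_url('B000000000', url_type='goods')
(reviews_url,) = make_url('B000000000', url_type='reviews')
(tosell_url,) = make_url('B000000000', url_type='tosell')
# An unrecognised url_type falls through every branch and yields an empty tuple.
assert make_url('B000000000', url_type='unknown') == ()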
Example #2
    def download(self, ip, asin_or_kw, url_dict):
        url_type = self.url_type
        asin = asin_or_kw
        monitor_type = url_dict.get('mtp') or 1
        print('url type: ', url_type)
        url_md5key = url_dict.get('md5') or ''
        if not url_md5key:
            url_md5key = self.get_md5_key(asin + url_type)
        startTime = return_PST().strftime("%Y-%m-%d %H:%M:%S")
        time1 = time.time()
        ua = self.get_ua()
        self.debug_log.debug('goods ua: %s' % (ua))
        # value_str = ip + ua
        # self.debug_log.debug('goods value_str: %s' % (value_str))
        # cookMd5key = self.get_md5_key(value_str)
        cookMd5key = ''
        cookie = self.get_cookie(cookMd5key)
        print('\ngoodsCookie: ', cookie)
        # url_title = urlQ.get_urlTitle_from_string(asin)
        url_title = ''
        sessionId = ''
        # if cookie:
        #     sessionId = cookie.get('session-id')
        retry = False
        old_dnum = url_dict.get('dnum') or 0
        if old_dnum > 3:
            retry = True
        url, referer = GoodsParser.make_goods_url(asin,
                                                  urltitle=url_title,
                                                  sessionId=sessionId,
                                                  retry=retry)
        cookies = cookie
        print('goods referer: %s' % (referer))
        print('[ip %s] working... [%s]' % (ip, url))
        if url:
            print('goods_url: ', url)
            html, cookiesObj, is_error = self.get_html(url,
                                                       ua,
                                                       ip,
                                                       cookies,
                                                       referer,
                                                       url_type=url_type,
                                                       asin=asin)
            print('This is the HTML')
            # with open('data/devtest/6_1.html', 'w', encoding='utf8') as f:
            #     f.write(html)
            durl = url_dict.get('durl') or []
            durl.append(url)
            url_dict['durl'] = list(set(durl))
            url_dict['dnum'] = old_dnum + 1
            if is_error:
                self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
                msgInt = 6
                proxyInfo = 'get Html error'
                self.record_log(asin, time1, msgInt, url_type, startTime, ip,
                                proxyInfo)

            else:
                analyze = self.analyze_html(html,
                                            cookie,
                                            cookiesObj,
                                            ip,
                                            asin_or_kw,
                                            url_dict,
                                            cookMd5key,
                                            time1,
                                            startTime,
                                            html_type=url_type)
                if analyze and analyze != 404:
                    # fetch url_title and save it
                    # self.save_url_title(asin, html)
                    cook = cookie
                    if not cookie:
                        cook = cookiesObj
                    result, is_error = self.parser(html,
                                                   html_type=url_type,
                                                   asin=asin,
                                                   ip=ip,
                                                   ua=ua,
                                                   debug_log=self.debug_log,
                                                   monitor_type=monitor_type,
                                                   cookie=cook,
                                                   url=url)
                    if is_error:
                        self.the_url_is_discard(asin, url_dict, url_type,
                                                url_md5key)
                        msgInt = 3
                        proxyInfo = 'get data error'
                        self.record_log(asin, time1, msgInt, url_type,
                                        startTime, ip, proxyInfo)
                    else:
                        if not result:
                            self.the_url_is_discard(asin, url_dict, url_type,
                                                    url_md5key)
                            msgInt = 2
                            proxyInfo = 'get data defeated'
                            self.record_log(asin, time1, msgInt, url_type,
                                            startTime, ip, proxyInfo)
                        else:
                            goods_datas = result[0]
                            if goods_datas:
                                print('goods_datas:', goods_datas)
                                qty, qtydt = inv_start(asin, html, cook, ua,
                                                       goods_datas[asin])
                                print('qty, qtydt:', qty, qtydt)
                                goods_datas[asin]['quantity'] = qty
                                goods_datas[asin]['qtydt'] = qtydt
                                print('goods_datas.add(qty, qtydt):',
                                      goods_datas)
                                bsr_data = result[1]
                                data_bytes = pickle.dumps(goods_datas)
                                if bsr_data:
                                    # print('bsr_data1', bsr_data)
                                    bsrData_bytes = pickle.dumps(bsr_data)
                                    # print('bsrData_bytes', bsrData_bytes)
                                    self.dataQ.add_bsrData_to_queue(
                                        bsrData_bytes)
                                    # print('bsr data ok', bsr_data)
                                # from pprint import pprint
                                # pprint(goods_datas)
                                result1 = self.dataQ.add_goods_data_to_queue(
                                    data_bytes)
                                print(result1)
                                if not result1:
                                    sys.exit()

                                # self.dataQ.record_data_ok_times()
                                # self.dataQ.record_goods_ok_times()
                                self.save_success_asin_keyword(
                                    asin, url_type=url_type)
                                msgInt = 1
                                proxyInfo = 'get data success'
                                self.record_log(asin, time1, msgInt, url_type,
                                                startTime, ip, proxyInfo)

                else:
                    self.the_url_is_discard(asin, url_dict, url_type,
                                            url_md5key)
                    time.sleep(1)
        else:
            print(url_type, 'no url')
            self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
            time.sleep(1)
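
The download method above keeps its retry bookkeeping inside url_dict: 'md5' caches the key derived from asin + url_type, 'dnum' counts download attempts (once it passes 3, the retry flag is handed to make_goods_url), and 'durl' stores the de-duplicated list of URLs already tried. Below is a minimal standalone sketch of that bookkeeping, with hashlib standing in for the crawler's get_md5_key helper (an assumption about how that key is built) and update_url_bookkeeping as a hypothetical helper name.

import hashlib

def update_url_bookkeeping(url_dict, asin, url_type, url):
    # Mirrors the bookkeeping in download(): derive the md5 key once,
    # count attempts and de-duplicate the URLs that were already tried.
    if not url_dict.get('md5'):
        url_dict['md5'] = hashlib.md5((asin + url_type).encode('utf-8')).hexdigest()
    old_dnum = url_dict.get('dnum') or 0
    retry = old_dnum > 3    # download() switches to a retry-style URL after 3 failed attempts
    durl = url_dict.get('durl') or []
    durl.append(url)
    url_dict['durl'] = list(set(durl))
    url_dict['dnum'] = old_dnum + 1
    return retry

url_dict = {}
for _ in range(5):
    retry = update_url_bookkeeping(url_dict, 'B000000000', 'goods',
                                   'https://example.com/dp/B000000000')
print(url_dict['dnum'], retry)    # 5 True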
Example #3
    def download(self, asin_or_kw, url_dict):
        print(asin_or_kw, url_dict)
        url_type = self.url_type
        print(url_type)
        asin = asin_or_kw
        monitor_type = url_dict.get('mtp') or 1
        print('url type: ', url_type)
        url_md5key = url_dict.get('md5') or ''
        if not url_md5key:
            url_md5key = self.get_md5_key(asin + url_type)
        startTime = return_PST().strftime("%Y-%m-%d %H:%M:%S")
        time1 = time.time()
        retry = False
        old_dnum = url_dict.get('dnum') or 0
        if old_dnum > 3:
            retry = True
        url, referer = GoodsParser.make_goods_url(asin, retry=retry)
        if url:
            print('goods_url: ', url)
            html, cookiesObj, is_error = self.get_html(url,
                                                       referer=referer,
                                                       url_type=url_type,
                                                       asin=asin)
            print('is_error:', is_error)
            durl = url_dict.get('durl') or []
            durl.append(url)
            url_dict['durl'] = list(set(durl))
            url_dict['dnum'] = old_dnum + 1
            if is_error:
                self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
                msgInt = 6
                proxyInfo = 'get Html error'
                self.record_log(asin, time1, msgInt, url_type, startTime,
                                proxyInfo)

            else:
                analyze = self.analyze_html(html,
                                            asin_or_kw,
                                            url_dict,
                                            time1,
                                            startTime,
                                            html_type=url_type)
                if analyze and analyze != 404:
                    result, is_error = self.parser(html,
                                                   html_type=url_type,
                                                   asin=asin,
                                                   debug_log=self.debug_log,
                                                   monitor_type=monitor_type,
                                                   url=url)
                    if is_error:
                        self.the_url_is_discard(asin, url_dict, url_type,
                                                url_md5key)
                        msgInt = 3
                        proxyInfo = 'get data error'
                        self.record_log(asin, time1, msgInt, url_type,
                                        startTime, proxyInfo)
                    else:
                        if not result:
                            self.the_url_is_discard(asin, url_dict, url_type,
                                                    url_md5key)
                            msgInt = 2
                            proxyInfo = 'get data defeated'
                            self.record_log(asin, time1, msgInt, url_type,
                                            startTime, proxyInfo)
                        else:
                            goods_datas = result[0]
                            if goods_datas:
                                cookies, headers = cookiesObj
                                user_anget = headers.get('User-Agent')
                                print(user_anget)
                                from pprint import pprint
                                pprint(cookies)
                                pprint(goods_datas)
                                msgInt = 1
                                proxyInfo = 'get data success'
                                log_param = (asin, time1, msgInt, url_type,
                                             startTime, proxyInfo)
                                start(asin=asin,
                                      goods_datas=goods_datas,
                                      user_anget=user_anget,
                                      url_dict=url_dict,
                                      goods_html=html,
                                      cookies=cookies,
                                      log_param=log_param,
                                      crawler_obj=self)
                else:
                    self.the_url_is_discard(asin, url_dict, url_type,
                                            url_md5key)
                    time.sleep(1)
        else:
            print(url_type, 'no url')
            self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
            time.sleep(1)
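
Both download() variants stamp startTime with return_PST().strftime("%Y-%m-%d %H:%M:%S") before the request goes out, but the helper itself is not shown in these examples. Here is a guess at what it does, assuming PST refers to US Pacific time and that the project only needs a timezone-aware "now":

from datetime import datetime
from zoneinfo import ZoneInfo  # Python 3.9+

def return_PST():
    # Hypothetical stand-in for the project's return_PST(): the current time
    # in US Pacific time, which the callers above format into startTime.
    return datetime.now(ZoneInfo('America/Los_Angeles'))

print(return_PST().strftime("%Y-%m-%d %H:%M:%S"))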