Example 1
def fetch_data(url, proxy=None, headers=None, **kwargs):
    """获取页面数据

    @param proxy    代理ip,[代理数量,代理列表]
    @param headers  头部信息,如user_agent
    @param kwargs   扩展参数,如fetch_update其表示是否为获取更新


    @return
        获取数据异常时返回信息为负值,成功为字典类型数据
    """
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = kwargs.get('proxies')
        if proxies is None and proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {
                'http': 'http://' + proxy[1][i],
                'https': 'https://' + proxy[1][i]
            }
        resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
    except Exception as e:
        # the caller will retry, so this can be ignored
        logger.debug('STATUS:-400 ; INFO:request exception, %s ; URL:%s' %
                     (util.traceback_info(e), url))
        return -400

    # force utf-8
    resp.encoding = 'utf-8'
    if '404.html' in resp.url:
        return 404
    return _parse_detail_data(resp, headers=_headers, **kwargs)
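The proxy argument above is documented as [proxy_count, proxy_list]. A minimal, self-contained sketch of that selection pattern (the proxy addresses below are made up for illustration):

import random

def pick_proxies(proxy):
    """Pick a random entry from a [proxy_count, proxy_list] structure."""
    if not proxy:
        return None
    i = random.randint(0, proxy[0] - 1)
    return {
        'http': 'http://' + proxy[1][i],
        'https': 'https://' + proxy[1][i],
    }

# hypothetical proxy pool: 2 proxies, given as [count, list]
print(pick_proxies([2, ['10.0.0.1:8080', '10.0.0.2:8080']]))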
Example 2
    def fetch_update_data(self, data_list=None, proxy=None, **kwargs):
        '''Fetch update data.

        @return
            status recorded for each requested entry in data_list:
                0       empty (ignore)
                -401    program/import error (retry; check the code for syntax errors or modules removed on exceptions)
                -402    data error (retry; verify the fetched data)
                -400    proxy error (retry; can be ignored)
                -200    non-200 response, proxy or data error (retry; watch out for infinite retry loops)
                200     success (not the HTTP status code)
                404     product no longer exists / has been deleted

        '''
        # avoid a shared mutable default argument
        data_list = [] if data_list is None else data_list
        # determine the target site from the url, then dispatch to that site's crawler module

        update_url = kwargs.get('update_url', '')
        if not update_url:
            return
        if '360' in update_url:
            return
        url_parts = update_url.split('.')
        if len(url_parts) < 2:
            return None
        supplier_name = url_parts[1]
        headers = {
            'user-agent': random.choice(config.USER_AGENT_LIST),
        }
        try:
            if not hasattr(supplier, supplier_name):
                module_name = 'supplier.{0}'.format(supplier_name)
                if module_name not in sys.modules:
                    __import__(module_name)
                obj = sys.modules[module_name]
            else:
                obj = getattr(supplier, supplier_name)
            if 'fetch_update_data' in dir(obj):
                _fetch_update_data = getattr(obj, 'fetch_update_data')
            else:
                kwargs['status'] = -401
                data_list.append(kwargs)
                return None
        except Exception as e:
            config.LOG.exception('STATUS: -401, ID: {0} import error, will retry: {1}'.format(kwargs['id'], e))
            kwargs['status'] = -401
            data_list.append(kwargs)
            return None
        try:
            kwargs['headers'] = headers
            kwargs['proxy'] = proxy
            data_list.append(_fetch_update_data(**kwargs))
        except Exception as e:
            kwargs['status'] = -402
            if 'headers' in kwargs:
                del kwargs['headers']
            if 'proxy' in kwargs:
                del kwargs['proxy']
            data_list.append(kwargs)
            config.LOG.exception('STATUS: -402, ID: %(id)s error: %(e)s',
                                 {'id': util.u2b(kwargs['id']), 'e': util.traceback_info(e)})
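The status codes listed in the docstring drive the caller's retry behaviour. A small helper sketch (my own illustration, not part of the original project) of how those codes could be interpreted:

def should_retry(status):
    """Interpret the status codes documented for fetch_update_data."""
    if status in (200, 404, 0):
        return False   # success, deleted product, or empty: nothing to retry
    if status in (-400, -401, -402, -200):
        return True    # proxy, program, or data errors: retry
    return False

print(should_retry(-400))  # True
print(should_retry(200))   # False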
Example 3
def fetch_data(url, proxy=None, headers=None, **kwargs):
    """获取页面数据

    @param proxy    代理ip,[代理数量,代理列表]
    @param headers  头部信息,如user_agent
    @param kwargs   扩展参数,如fetch_update其表示是否为获取更新


    @return
        获取数据异常时返回信息为负值,成功为字典类型数据
        :param url:


    """
    if 'goods_sn' in kwargs:
        del kwargs['goods_sn']
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    if url[0:2] == '//':
        url = 'http:' + url
    try:
        proxies = None
        if proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {'http': 'http://' + proxy[1][i]}
        ti_domain = urlparse.urlsplit(url)[1]
        if 'www.ti.com.cn' == ti_domain:
            product_path_pattern = re.compile(r'/cn/(.*)', re.IGNORECASE)
            product_path = product_path_pattern.search(url)
            if product_path:
                url = "http://www.ti.com/product/{path}".format(
                    path=product_path.group(1))
        elif 'store.ti.com' in ti_domain:
            kwargs['proxies'] = proxies
            return _parse_store_ti_com(url, **kwargs)
        resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
    except Exception as e:
        # the caller will retry, so this can be ignored
        logger.debug('STATUS:-400 ; INFO:request exception, %s ; URL:%s' %
                     (util.traceback_info(e), url))
        return -400
    # TODO: decide whether a 500-status check is needed here
    # force utf-8
    resp.encoding = 'utf-8'
    if '404.html' in resp.url:
        return 404
    if '/tool/' in resp.url:
        return _parse_tool_detail(resp, **kwargs)
    kwargs['proxies'] = proxies
    return _parse_detail_data(resp, headers=_headers, **kwargs)
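The ti.com.cn branch above rewrites Chinese-site product urls onto www.ti.com before fetching. A standalone sketch of that rewrite, using the same regular expression (the sample url is made up):

import re
try:
    from urllib.parse import urlsplit   # Python 3
except ImportError:
    from urlparse import urlsplit       # Python 2

def rewrite_ti_url(url):
    """Map www.ti.com.cn/cn/<path> to www.ti.com/product/<path>."""
    if urlsplit(url)[1] != 'www.ti.com.cn':
        return url
    match = re.search(r'/cn/(.*)', url, re.IGNORECASE)
    return 'http://www.ti.com/product/{0}'.format(match.group(1)) if match else url

print(rewrite_ti_url('http://www.ti.com.cn/cn/lm358'))  # http://www.ti.com/product/lm358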
Example 4
    def load_js(self):
        '''
        Load the js file.
        :return: the file contents as a string, or '' on error
        '''
        file_path = util.get_static_file(self.js_file)
        #
        try:
            with open(file_path, 'r', encoding='utf-8') as fp:
                js_str = fp.read()
        except Exception as e:
            _logger.info('INFO: error loading js file {0}'.format(util.traceback_info(e)))
            js_str = ''

        return js_str
Example 5
def run(args):
    if not isinstance(args, argparse.Namespace):
        print('invalid arguments')
        return
    interval = args.interval
    while 1:
        try:
            PutQueue(**args.__dict__)
            if args.interval <= 0:
                break
            print('------------- sleep %s sec -------------' % interval)
            time.sleep(interval)
        except Exception as e:
            if 'params_error' in str(e):
                break
            print(util.traceback_info(e, return_all=True))
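run() only needs an argparse.Namespace carrying an interval attribute (plus whatever PutQueue consumes from args.__dict__). A hedged sketch of how that argument might be declared; only interval is taken from the code above, the flag name is an assumption:

import argparse

parser = argparse.ArgumentParser(description='queue filler')
# interval <= 0 means "run once", matching the break in run()
parser.add_argument('--interval', type=int, default=0,
                    help='seconds to sleep between runs; <= 0 runs once')
args = parser.parse_args(['--interval', '60'])
print(args.interval)  # 60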
Example 6
def fetch_data(url, proxy=None, headers=None, **kwargs):
    '''
    Fetch page data.

    @param proxy    proxy IPs as [proxy_count, proxy_list]
    @param headers  extra headers, e.g. user_agent
    @param kwargs   extra options, e.g. fetch_update marks whether this is an update fetch

    @return
        a negative value when the request fails, a dict on success
    '''

    # fall back to the module-level default headers when none are given
    _headers = headers if isinstance(headers, dict) else default_headers
    try:
        proxies = None
        if proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {'http': 'http://' + proxy[1][i]}

        sess = requests.Session()
        rs = sess.get(url, headers=_headers, cookies=_cookies, timeout=30, proxies=proxies)
    except Exception as e:
        # the caller will retry, so this can be ignored
        _logger.info('STATUS:-400 ; INFO:request exception, %s ; URL:%s' % (util.traceback_info(e), url))
        return -400

    if rs.status_code != 200:
        if rs.status_code == 500:
            _logger.debug('STATUS:-500 ; INFO:request blocked ; PROXY:%s ; URL:%s ; User-Agent:%s' % (
                proxies['http'] if proxy else '', url, _headers.get('user_agent', '')))
            return -500
        # discontinued product (the url no longer exists)
        elif rs.status_code == 404:
            _logger.debug('STATUS:404 ; INFO:request error ; URL:%s' % url)
            return 404
        _logger.debug('STATUS:-405 ; INFO:request error, HTTP status %s ; PROXY:%s ; URL:%s' % (
            rs.status_code, proxies['http'] if proxy else '', url))
        return -405
    # force utf-8
    rs.encoding = 'utf-8'

    return _parse_detail_data(rs.text, url=url, **kwargs)
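Callers of this fetch_data have to tell the integer error codes apart from a successful dict result. A small dispatch sketch based only on the return values used above:

def handle_fetch_result(result):
    """Dispatch on the return conventions of fetch_data above."""
    if isinstance(result, dict):
        return 'parsed'    # successful detail data
    if result == 404:
        return 'deleted'   # product page no longer exists
    if result in (-400, -500, -405):
        return 'retry'     # transient request / proxy problems
    return 'unknown'

print(handle_fetch_result(404))        # deleted
print(handle_fetch_result({'sn': 1}))  # parsed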
Example 7
    def fetch_data(self):
        '''
        Fetch page data.
        '''
        headers = self.headers if self.headers else DEFAULT_HEADER

        try:
            sess = requests.Session()
            print('fetching url: {0}'.format(self.url))

            if self.method == 'GET':
                rs = sess.get(self.url,
                              headers=headers,
                              cookies=None,
                              timeout=30,
                              proxies=None)
            elif self.method == 'POST':
                rs = sess.post(self.url,
                               data=self.form_data,
                               headers=headers,
                               cookies=None,
                               timeout=30,
                               proxies=None)
            else:
                _logger.info('INFO:request method not defined ; URL: {0}'.format(self.url))
                # no request was made, treat it like a failed request
                return -400
        except Exception as e:
            # the caller will retry, so this can be ignored
            _logger.info('STATUS:-400 ; INFO:request exception, %s ; URL:%s' %
                         (util.traceback_info(e), self.url))
            return -400

        if rs.status_code != 200:
            if rs.status_code == 404:
                _logger.debug('STATUS:404 ; INFO:request error ; URL:%s' % self.url)
                return 404

        # prefer the detected encoding instead of forcing utf-8
        # rs.encoding = 'utf-8'
        rs.encoding = rs.apparent_encoding
        return self._parse_detail_data(rs.content)
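The GET/POST branch above can also be expressed as a dispatch table over the requests.Session methods; a sketch of that alternative (no request is actually sent here, the url is a placeholder):

import requests

def send(method, url, **kwargs):
    """Dispatch to Session.get / Session.post based on a method string."""
    sess = requests.Session()
    handlers = {'GET': sess.get, 'POST': sess.post}
    if method not in handlers:
        raise ValueError('request method not defined: %s' % method)
    return handlers[method](url, timeout=30, **kwargs)

# rs = send('POST', 'http://example.com/search', data={'q': 'lm358'})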
Example 8
    def fetch_search_data(self, data_list=None, err_list=None, proxy=None, supp=None, **kwargs):
        """
        Fetch product data for a search keyword (either detail-page urls or full details).
        """
        # avoid shared mutable default arguments
        data_list = [] if data_list is None else data_list
        err_list = [] if err_list is None else err_list
        if not supp or 'keyword' not in kwargs:
            return None
        headers = {
            'user-agent': random.choice(config.USER_AGENT_LIST),
        }
        keyword = util.u2b(kwargs['keyword'])
        supplier_name = config.DB_KEY[supp]
        try:
            if not hasattr(supplier, supplier_name):
                module_name = 'supplier.{0}'.format(supplier_name)
                if module_name not in sys.modules:
                    __import__(module_name)
                obj = sys.modules[module_name]
            else:
                obj = getattr(supplier, supplier_name)
            if hasattr(obj, 'api_search_data'):
                _fetch_function = getattr(obj, 'api_search_data')
            else:
                _fetch_function = getattr(obj, 'fetch_search_data')
        except Exception as e:
            config.LOG.exception('STATUS: -401, Keyword: %(keyword)s', {'keyword': keyword})
            if kwargs.get('count', 1) < self.exception_threshold:
                kwargs['status'] = -401
                kwargs['count'] = kwargs.get('count', 1) + 1
                err_list.append(kwargs)
            return None
        data_dict = {
            'detail': [],
            'list': [],
            'url': []
        }
        if self.optype == 'hot' and self.use:
            kwargs['hot_search'] = True
        del kwargs['keyword']
        try:
            _fetch_function(keyword, supp, data_dict, headers, **kwargs)
        except Exception as e:
            config.LOG.exception('STATUS: -402, Keyword: %(keyword)s', {'keyword': keyword})
            if kwargs.get('count', 1) < self.exception_threshold:
                kwargs['status'] = -402
                kwargs['count'] = kwargs.get('count', 1) + 1
                kwargs['keyword'] = keyword
                err_list.append(kwargs)
            return None
        if data_dict['list']:
            try:
                _fetch_function = getattr(obj, 'fetch_search_list')
            except Exception as e:
                _fetch_function = None
                print(util.traceback_info(e, return_all=1))
            if _fetch_function:
                res = self._crawl(_fetch_function, data_dict['list'], headers, proxy)
                if 'url' in res:
                    for url in res['url']:
                        data_dict['url'].append(url)
                if 'detail' in res:
                    for data in res['detail']:
                        data_dict['detail'].append(data)
        if data_dict['url']:
            try:
                _fetch_function = getattr(obj, 'fetch_data')
            except Exception as e:
                _fetch_function = None
                print(util.traceback_info(e, return_all=1))
            if _fetch_function:
                res = self._crawl(_fetch_function, data_dict['url'], headers, proxy)
                if 'detail' in res:
                    for data in res['detail']:
                        data_dict['detail'].append(data)
        for data in data_dict['detail']:
            # per-record cleaning / normalisation would happen here
            data_list.append(data)
        return data_list
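Both fetch_update_data and fetch_search_data resolve a crawler module by name with __import__ / sys.modules / getattr. A simplified, self-contained sketch of that dispatch pattern using importlib; the package and module names are illustrative, not the project's real supplier modules:

import importlib

def resolve_fetcher(package, supplier_name,
                    candidates=('api_search_data', 'fetch_search_data')):
    """Import <package>.<supplier_name> and return the first fetch function it defines."""
    module = importlib.import_module('{0}.{1}'.format(package, supplier_name))
    for name in candidates:
        if hasattr(module, name):
            return getattr(module, name)
    raise AttributeError('no fetch function in %s' % module.__name__)

# e.g. fetch = resolve_fetcher('supplier', 'avnet')  # assumes a supplier/avnet.py module exists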
Example 9
def fetch_search_data(keyword=None,
                      id=None,
                      data_dict=None,
                      headers=None,
                      proxy=None,
                      **kwargs):
    """获取搜索数据"""
    if keyword:
        print 'fetching richardsonrfpd data for keyword: %s' % keyword
        url = 'http://www.richardsonrfpd.com/Pages/AdvanceSearch.aspx'
    elif 'url' in kwargs:
        url = kwargs['url']
    else:
        return 404
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = kwargs.get('proxies')
        if proxies is None and proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {
                'http': 'http://' + proxy[1][i],
                'https': 'https://' + proxy[1][i]
            }
        response = requests.get(url,
                                headers=_headers,
                                timeout=30,
                                proxies=proxies)
        resp = do_search(response, keyword)
        if isinstance(resp, int):
            raise ValueError
    except Exception as e:
        logger.debug('STATUS:-400 ; INFO:request exception, %s ; URL:%s' %
                     (util.traceback_info(e), url))
        if 'Invalid URL' not in str(e):
            data_dict['list'].append({
                'status': -400,
                'url': url,
                'id': id,
                'count': kwargs.get('count', 1)
            })
        return -400
    if resp.status_code != 200:
        if resp.status_code == 404 and '404.html' in resp.url:
            logger.info('STATUS:404; INFO:invalid product; URL: %s' % url)
            return 404
        logger.debug('STATUS:-405 ; INFO:request error, HTTP status %s ; PROXY:%s ; URL:%s' %
                     (resp.status_code, proxies['http'] if proxy else '', url))
        data_dict['list'].append({
            'status': -405,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return -405
    resp.encoding = 'utf-8'
    # parse the response
    # number of search results
    if 'Search-Results.aspx' in resp.url:
        product_list = analyse_product_url(resp)
    root = lxml.html.fromstring(resp.text.encode('utf-8'))
    product_list = root.xpath('//tr[@valign="top"][@height=85]')
    if len(product_list) <= 0:
        data_dict['list'].append({
            'status': 404,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return 404
    for product in product_list:
        detail = product.xpath('.//a[@class="lnk12b-blackOff"]')
        detail_url = util.urljoin(
            resp.url, detail[0].xpath('./@href')[0]) if detail else ''
        match = goods_sn_pattern.search(detail_url)
        if not match:
            logger.debug(u"could not extract goods_sn from URL {url}".format(url=detail_url))
            return -404
        goods_sn = match.group(1)
        goods_name = detail[0].text_content() if detail else ''
        data_dict['url'].append({
            'id': id,
            'url': detail_url,
            'goods_sn': goods_sn,
            'goods_name': goods_name,
        })
    if 'showMore=true' in url:
        return 200
    count = root.xpath('//td[@class="medtext"]')
    count = util.number_format(count[0].text, places=0, index=999,
                               smart=True) if count else 0
    page_num = int(math.ceil(count / 10.0))
    if page_num <= 1:
        return 200
    max_list_num = util.intval(kwargs.get('max_list_num', 5))
    page_list = root.xpath('//td[@class="medtext"]/a/@href')
    for x in xrange(1, page_num + 1):
        if max_list_num and x > max_list_num:
            break
        page_url = 'http://shopping.netsuite.com/s.nl/c.402442/sc.2/.f?search={search}&range={start}%2C{end}%2C{total}'.format(
            search=keyword, start=x * 10 + 1, end=(x + 1) * 10, total=count)
        data_dict['list'].append({
            'id': id,
            'url': page_url,
        })
    return 200
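The paging logic above turns the result count into follow-up list urls, 10 results per page and capped by max_list_num. A standalone sketch of just that calculation (the url template is reduced to a placeholder query string):

import math

def build_page_urls(count, per_page=10, max_list_num=5):
    """Build follow-up page urls implied by a total result count."""
    pages = []
    page_num = int(math.ceil(count / float(per_page)))
    if page_num <= 1:
        return pages
    for x in range(1, page_num + 1):
        if max_list_num and x > max_list_num:
            break
        pages.append('?range={start},{end},{total}'.format(
            start=x * per_page + 1, end=(x + 1) * per_page, total=count))
    return pages

print(build_page_urls(47))  # 5 follow-up page urls for 47 results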
Example 10
def fetch_search_data(keyword=None,
                      id=None,
                      data_dict=None,
                      headers=None,
                      proxy=None,
                      **kwargs):
    """获取搜索数据"""
    if keyword:
        if not kwargs.get('other_usage', False):
            print 'fetching ti.com data for keyword: %s' % keyword
        url = 'http://www.ti.com/sitesearch/docs/partnumsearch.tsp?sort=asc&linkId=2&filter=p&sortBy=pstatus&searchTerm=%s' % keyword
    elif 'url' in kwargs:
        url = kwargs['url']
    else:
        return 404

    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = None
        if proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {'http': 'http://' + proxy[1][i]}
        resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
    except Exception as e:
        logger.debug('STATUS:-400 ; INFO:request exception, %s ; URL:%s' %
                     (util.traceback_info(e), url))
        if 'Invalid URL' not in str(e):
            data_dict['list'].append({
                'status': -400,
                'url': url,
                'id': id,
                'count': kwargs.get('count', 1)
            })
        return -400

    if resp.status_code != 200:
        if resp.status_code == 404 and '404.html' in resp.url:
            logger.info('STATUS:404; INFO:invalid product; URL: %s' % url)
            return 404
        logger.debug('STATUS:-405 ; INFO:request error, HTTP status %s ; PROXY:%s ; URL:%s' %
                     (resp.status_code, proxies['http'] if proxy else '', url))
        data_dict['list'].append({
            'status': -405,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return -405
    resp.encoding = 'utf-8'
    resp_json = {}
    try:
        resp_json = json.loads(resp.content)
        product = resp_json.get('response', {}).get('searchResults',
                                                    {}).get('PartNoArray', [])
        # print len(product)
    except Exception:
        product = []
        logger.debug('STATUS:-404 ; INFO:data error ; URL:%s' % url)
    if len(product) <= 0:
        data_dict['list'].append({
            'status': 404,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return 404
    links = product
    for vo in links:
        pn = vo.get('PartNumber', '')
        tn = vo.get('PartType', '')
        if pn:
            link = 'http://www.ti.com/product/%s' % pn
            if 'tool' in tn:
                link = 'http://www.ti.com/tool/%s' % pn
            data_dict['url'].append({'id': id, 'url': link, 'goods_sn': pn})
    if 'startNum=' in resp.url:
        return 200
    page_num = 0
    count = 0
    try:
        count = resp_json.get('response',
                              {}).get('searchResults',
                                      {}).get('filter',
                                              {}).get('MaxRecordCount', '')
        count = util.intval(count)
    except Exception:
        count = 0
    page_num = int(math.ceil(count / 25.0))
    if page_num <= 1:
        return 200
    # form_data for pagination
    max_list_num = util.intval(kwargs.get('max_list_num', 5))
    for x in xrange(1, page_num + 1):
        if max_list_num and x > max_list_num:
            break
        url = 'http://www.ti.com/sitesearch/docs/partnumsearch.tsp?sort=asc&linkId=2&startNum=%d&filter=p&sortBy=pstatus&searchTerm=%s' % (
            25 * x, keyword)
        page_url = url
        data_dict['list'].append({
            'id': id,
            'url': page_url,
        })
    return 200
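The JSON handling above drills through several nested dicts with chained .get calls. A small helper sketch showing the same defensive access pattern; the field names are copied from the response parsed above, the sample payload is made up:

def deep_get(data, *keys, **kw):
    """Walk nested dicts, returning a default when any level is missing."""
    default = kw.get('default')
    for key in keys:
        if not isinstance(data, dict):
            return default
        data = data.get(key, default)
    return data

sample = {'response': {'searchResults': {'filter': {'MaxRecordCount': '137'}}}}
print(deep_get(sample, 'response', 'searchResults', 'filter', 'MaxRecordCount'))  # '137'
print(deep_get(sample, 'response', 'missing', default=0))                         # 0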
Example 11
def fetch_search_data(keyword=None,
                      id=None,
                      data_dict=None,
                      headers=None,
                      proxy=None,
                      **kwargs):
    """获取搜索数据"""
    if keyword:
        print 'fetching avnet data for keyword: %s' % keyword
        url = "https://www.avnet.com/search/resources/store/715839038/productview/bySearchTerm/select?searchType=102&profileName=Avn_findProductsBySearchTermCatNav_Ajax&searchSource=Q&landingPage=true&storeId=715839038&catalogId=10001&langId=-1&currency=USD&orgEntityId=-2000&responseFormat=json&pageSize=20&pageNumber=1&_wcf.search.internal.boostquery=price_USD:{{0.00001+TO+*}}^499999.0+inStock:%22true%22^9000.0+topSellerFlag:%22Yes%22^0.085+newProductFlag:%22Yes%22^0.080+packageTypeCode:%22BKN%22^0.075&_wcf.search.internal.filterquery=-newProductFlag%3ANPI&q={keyword}&intentSearchTerm={keyword}&searchTerm={keyword}&wt=json".format(
            keyword=keyword)
    elif 'url' in kwargs:
        url = kwargs['url']
    else:
        return 404
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = kwargs.get('proxies')
        if proxies is None and proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {
                'http': 'http://' + proxy[1][i],
                'https': 'https://' + proxy[1][i]
            }
        resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
    except Exception as e:
        logger.debug('STATUS:-400 ; INFO:request exception, %s ; URL:%s' %
                     (util.traceback_info(e), url))
        if 'Invalid URL' not in str(e):
            data_dict['list'].append({
                'status': -400,
                'url': url,
                'id': id,
                'count': kwargs.get('count', 1)
            })
        return -400
    if resp.status_code != 200:
        if resp.status_code == 404 and '404.html' in resp.url:
            logger.info('STATUS:404; INFO:invalid product; URL: %s' % url)
            return 404
        logger.debug('STATUS:-405 ; INFO:request error, HTTP status %s ; PROXY:%s ; URL:%s' %
                     (resp.status_code, proxies['http'] if proxy else '', url))
        data_dict['list'].append({
            'status': -405,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return -405
    resp.encoding = 'utf-8'
    # parse the response
    # number of search results
    search_dict = {}
    try:
        search_dict = json.loads(resp.text.encode('utf-8'))
        product_list = search_dict.get('catalogEntryView', [])
    except Exception:
        product_list = []
        logger.debug('STATUS:-404 ; INFO:data error ; URL:%s' % url)
    if len(product_list) <= 0:
        data_dict['list'].append({
            'status': 404,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return 404
    # sn = product.xpath('.//td[@class="partColHeader"]//span[@class="defaultSearchText"]')
    for product in product_list:
        goods_sn = product.get('seo_token_ntk', '')
        base_url = 'https://www.avnet.com/shop/apac/'
        product_url = product.get('avn_pdp_seo_path', '')
        data_dict['url'].append({
            'id': id,
            'url': util.urljoin(base_url, product_url),
            'goods_sn': goods_sn
        })
    if 'showMore=true' in url:
        return 200
    count = search_dict.get('recordSetTotal', 0)
    page_num = int(math.ceil(count / 20.0))
    if page_num <= 1:
        return 200
    max_list_num = util.intval(kwargs.get('max_list_num', 5))
    for x in xrange(2, page_num + 1):
        if max_list_num and x > max_list_num:
            break
        page_url = 'https://www.avnet.com/search/resources/store/715839038/productview/bySearchTerm/select?searchType=102&profileName=Avn_findProductsBySearchTermCatNav_More_Ajax&searchSource=Q&landingPage=true&storeId=715839038&catalogId=10001&langId=-1&currency=USD&orgEntityId=-2000&responseFormat=json&pageSize=20&pageNumber={next_page}&_wcf.search.internal.boostquery=price_USD:{{0.00001+TO+*}}^499999.0+inStock:%22true%22^9000.0+topSellerFlag:%22Yes%22^0.085+newProductFlag:%22Yes%22^0.080+packageTypeCode:%22BKN%22^0.075&_wcf.search.internal.filterquery=-newProductFlag:NPI&q={keyword}&intentSearchTerm={keyword}&searchTerm={keyword}&showMore=true&wt=json'.format(
            next_page=x, keyword=keyword)
        # print page_url
        data_dict['list'].append({
            'id': id,
            'url': page_url,
        })
    return 200
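Note how the avnet search template escapes literal braces: the boost query must contain {0.00001+TO+*} in the final url, so it is written as {{0.00001+TO+*}} inside the str.format template. A tiny illustration of that escaping:

template = 'boostquery=price_USD:{{0.00001+TO+*}}^499999.0&q={keyword}'
print(template.format(keyword='lm358'))
# boostquery=price_USD:{0.00001+TO+*}^499999.0&q=lm358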
Example 12
def _parse_detail_data(resp, headers=None, **kwargs):
    """
    解析详情数据,独立出来

    @param  data    页面数据
    @param  url     解析的页面url(方便记录异常)
    @param  kwargs  扩展参数
    """
    item = {}
    try:
        soup = BeautifulSoup(resp.text, 'lxml')
        if soup is None:
            logger.debug('failed to initialise the product detail page URL: %s', resp.url)
            return -404
    except Exception as e:
        logger.debug('failed to initialise the product detail page URL: %s ERROR: %s',
                     resp.url, util.traceback_info(e))
        return -404
    # goods_sn
    url_path_list = resp.url.split('/')
    goods_sn_pattern = re.compile(r'.*-\d{19}')
    for path in url_path_list[::-1]:
        if goods_sn_pattern.findall(path):
            item['goods_sn'] = path
            break
    if not item.get('goods_sn', False):
        logger.debug("无法从链接中解析goods_sn URL: {url} ".format(url=resp.url))
        return -400
    # goods_name
    goods_info_div = soup.find('div', class_='section-left')
    item['goods_name'] = goods_info_div.find('h1').get_text(
        strip=True) if goods_info_div else item['goods_sn']
    # url
    item['url'] = resp.url
    # goods_img
    img_div = soup.find('div', id="outer-div1")
    img = img_div.find('img') if img_div else None
    item['goods_img'] = util.urljoin(resp.url, img.get('src')) if img else ''
    # goods_thumb
    item['goods_thumb'] = item['goods_img']
    # desc
    desc_p = soup.find('p', class_='RB-pdp_short_Desc')
    item['desc'] = desc_p.get_text(strip=True) if desc_p else ''
    # provider_name
    item['provider_name'] = "AVNET"
    # provider_url
    item['provider_url'] = ''
    # attr: [[None, None]]
    attr_body = soup.find('div', id="techAttr")
    attr_div = attr_body.find_all(
        'div', class_='pdpDescriptionsBodyContent') if attr_body else []
    attr = []
    for content in attr_div:
        att_name = content.find('div', class_='pdpDescriptionColumn')
        attr_value = content.find('div', class_='pdpValueColumn')
        if att_name and attr_value:
            attr.append([
                att_name.get_text(strip=True),
                attr_value.get_text(strip=True)
            ])
    item['attr'] = attr
    # tiered: [[0, 0.00]]
    tiered_span = soup.find_all('span', class_='usdpart1')
    tiered = []
    if tiered_span:
        for span in tiered_span:
            qty_span = span.find('span', class_='pdpTierMinQty')
            qty = qty_span.get_text(strip=True) if qty_span else 0
            price_p = span.find('p')
            price = price_p.get_text(strip=True) if price_p else 0.00
            if qty and price:
                tiered.append([util.intval(qty), util.floatval(price)])
            else:
                tiered = [[0, 0.00]]
                break
        item['tiered'] = tiered
    else:
        item['tiered'] = [[0, 0.00]]

    # stock: [0, 1]  >> [stock, qty]
    stock_input = soup.find('input', id='inStock')
    stock = stock_input.get('value') if stock_input else 0
    stock = util.intval(stock)
    # qty
    min_qty_input = soup.find('input', attrs={'name': 'min'})
    min_qty = min_qty_input.get('value') if min_qty_input else 1
    min_qty = util.intval(min_qty)
    item['stock'] = [stock, min_qty] if stock else [0, 1]
    # increment: 1
    multi_input = soup.find('input', attrs={'name': 'mult'})
    item['increment'] = util.intval(
        multi_input.get('value')) if multi_input else 1
    # doc
    doc_div = soup.find('div', class_='pdfcontent')
    if doc_div is not None:
        doc_url = doc_div.find('a', class_='datasheet_align')
        item['doc'] = doc_url.get('href') if doc_url else ''
    else:
        item['doc'] = ''
    # rohs: -1
    rohs_div = soup.find('div', class_='leafcontent')
    item['rohs'] = 1 if rohs_div else -1
    # catlog: [[name, url]]
    nav = soup.find('nav', class_='breadcrumb')
    catlog = []
    if nav is not None:
        for a in nav.find_all('a'):
            cat_name = a.get_text(strip=True)
            cat_url = util.urljoin(resp.url, a.get('href'))
            if cat_name and cat_url:
                catlog.append([cat_name, cat_url])
    item['catlog'] = catlog
    # goods_other_name
    item['goods_other_name'] = ''
    # product_id
    # family_sn
    return item
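The tiered-price block above collects [quantity, price] pairs from span.usdpart1 elements. A self-contained sketch of that extraction on a made-up HTML fragment (the class names are copied from the code, the markup itself is illustrative; the builtin html.parser is used to keep the sketch dependency-free):

from bs4 import BeautifulSoup

html = '''
<span class="usdpart1"><span class="pdpTierMinQty">1</span><p>3.42</p></span>
<span class="usdpart1"><span class="pdpTierMinQty">10</span><p>2.98</p></span>
'''
soup = BeautifulSoup(html, 'html.parser')

tiered = []
for span in soup.find_all('span', class_='usdpart1'):
    qty = span.find('span', class_='pdpTierMinQty')
    price = span.find('p')
    if qty and price:
        tiered.append([int(qty.get_text(strip=True)),
                       float(price.get_text(strip=True))])
print(tiered)  # [[1, 3.42], [10, 2.98]]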