Example #1
def fetch_data(url, proxy=None, headers=None, **kwargs):
    """获取页面数据

    @param proxy    代理ip,[代理数量,代理列表]
    @param headers  头部信息,如user_agent
    @param kwargs   扩展参数,如fetch_update其表示是否为获取更新


    @return
        获取数据异常时返回信息为负值,成功为字典类型数据
    """
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = kwargs.get('proxies')
        if proxies is None and proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {
                'http': 'http://' + proxy[1][i],
                'https': 'https://' + proxy[1][i]
            }
        resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
    except Exception as e:
        # the request will be retried, safe to ignore
        logger.debug('STATUS:-400 ; INFO:request failed, %s ; URL:%s' %
                     (util.traceback_info(e), url))
        return -400

    # force utf-8 decoding
    resp.encoding = 'utf-8'
    if '404.html' in resp.url:
        return 404
    return _parse_detail_data(resp, headers=_headers, **kwargs)
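
A minimal usage sketch (not part of the example above), assuming the [proxy count, proxy list] structure described in the docstring; the URL, proxy addresses and header values below are made-up, and fetch_data, default_headers and util come from the surrounding module.

# Hypothetical call; proxy is a two-element list: [number of proxies, list of "ip:port" strings]
proxy = [2, ['10.0.0.1:8080', '10.0.0.2:8080']]    # assumed example proxies
headers = {'user_agent': 'Mozilla/5.0'}            # merged into default_headers via util.rfc_headers

result = fetch_data('http://www.ti.com/product/opa333',
                    proxy=proxy, headers=headers, fetch_update=True)
if isinstance(result, int):
    # negative codes (and 404) signal a failed request or an invalid product page
    print 'fetch failed with code %s' % result
else:
    # on success _parse_detail_data returns a dict with the parsed detail data
    print sorted(result.keys())
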
Example #2
def fetch_data(url, proxy=None, headers=None, **kwargs):
    """获取页面数据

    @param proxy    代理ip,[代理数量,代理列表]
    @param headers  头部信息,如user_agent
    @param kwargs   扩展参数,如fetch_update其表示是否为获取更新


    @return
        获取数据异常时返回信息为负值,成功为字典类型数据
        :param url:


    """
    if 'goods_sn' in kwargs:
        del kwargs['goods_sn']
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    if url[0:2] == '//':
        url = 'http:' + url
    try:
        proxies = None
        if proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {'http': 'http://' + proxy[1][i]}
        ti_domain = urlparse.urlsplit(url)[1]
        if 'www.ti.com.cn' == ti_domain:
            product_path_pattern = re.compile(r'/cn/(.*)', re.IGNORECASE)
            product_path = product_path_pattern.search(url)
            if product_path:
                url = "http://www.ti.com/product/{path}".format(
                    path=product_path.group(1))
        elif 'store.ti.com' in ti_domain:
            kwargs['proxies'] = proxies
            return _parse_store_ti_com(url, **kwargs)
        resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
    except Exception as e:
        # the request will be retried, safe to ignore
        logger.debug('STATUS:-400 ; INFO:request failed, %s ; URL:%s' %
                     (util.traceback_info(e), url))
        return -400
    # consider whether a check for HTTP 500 is needed here
    # force utf-8 decoding
    resp.encoding = 'utf-8'
    if '404.html' in resp.url:
        return 404
    if '/tool/' in resp.url:
        return _parse_tool_detail(resp, **kwargs)
    kwargs['proxies'] = proxies
    return _parse_detail_data(resp, headers=_headers, **kwargs)
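
A small standalone sketch of the URL normalisation this variant applies to the Chinese TI site: protocol-relative links get an http: prefix, and www.ti.com.cn/cn/... paths are rewritten to www.ti.com/product/...; the input URL is an assumed example.

import re
import urlparse  # Python 2 stdlib module (urllib.parse in Python 3)

url = '//www.ti.com.cn/cn/OPA333'            # assumed example link
if url[0:2] == '//':
    url = 'http:' + url                      # protocol-relative -> absolute
ti_domain = urlparse.urlsplit(url)[1]
if ti_domain == 'www.ti.com.cn':
    match = re.search(r'/cn/(.*)', url, re.IGNORECASE)
    if match:
        url = 'http://www.ti.com/product/{path}'.format(path=match.group(1))
print url  # -> http://www.ti.com/product/OPA333
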
Example #3
def fetch_search_data(keyword=None,
                      id=None,
                      data_dict=None,
                      headers=None,
                      proxy=None,
                      **kwargs):
    """获取搜索数据"""
    if keyword:
        print 'Fetching data for keyword %s from richardsonrfpd' % keyword
        url = 'http://www.richardsonrfpd.com/Pages/AdvanceSearch.aspx'
    elif 'url' in kwargs:
        url = kwargs['url']
    else:
        return 404
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = kwargs.get('proxies')
        if proxies is None and proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {
                'http': 'http://' + proxy[1][i],
                'https': 'https://' + proxy[1][i]
            }
        response = requests.get(url,
                                headers=_headers,
                                timeout=30,
                                proxies=proxies)
        resp = do_search(response, keyword)
        if isinstance(resp, int):
            raise ValueError
    except Exception as e:
        logger.debug('STATUS:-400 ; INFO:request failed, %s ; URL:%s' %
                     (util.traceback_info(e), url))
        if 'Invalid URL' not in str(e):
            data_dict['list'].append({
                'status': -400,
                'url': url,
                'id': id,
                'count': kwargs.get('count', 1)
            })
        return -400
    if resp.status_code != 200:
        if resp.status_code == 404 and '404.html' in resp.url:
            logger.info('STATUS:404; INFO:invalid product; URL: %s' % url)
            return 404
        logger.debug('STATUS:-405 ; INFO:request error, HTTP status %s ; PROXY:%s ; URL:%s' %
                     (resp.status_code, proxies['http'] if proxy else '', url))
        data_dict['list'].append({
            'status': -405,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return -405
    resp.encoding = 'utf-8'
    # start parsing resp
    # get the number of search results
    if 'Search-Results.aspx' in resp.url:
        product_list = analyse_product_url(resp)
    root = lxml.html.fromstring(resp.text.encode('utf-8'))
    product_list = root.xpath('//tr[@valign="top"][@height=85]')
    if len(product_list) <= 0:
        data_dict['list'].append({
            'status': 404,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return 404
    for product in product_list:
        detail = product.xpath('.//a[@class="lnk12b-blackOff"]')
        detail_url = util.urljoin(
            resp.url, detail[0].xpath('./@href')[0]) if detail else ''
        match = goods_sn_pattern.search(detail_url)
        if not match and detail_url:
            logger.debug(u"无法匹配链接中的goods_sn URL{url}".format(url=detail_url))
            return -404
        goods_sn = match.group(1) if match else ''
        goods_name = detail[0].text_content() if detail else ''
        data_dict['url'].append({
            'id': id,
            'url': detail_url,
            'goods_sn': goods_sn,
            'goods_name': goods_name,
        })
    if 'showMore=true' in url:
        return 200
    count = root.xpath('//td[@class="medtext"]')
    count = util.number_format(count[0].text, places=0, index=999,
                               smart=True) if count else 0
    page_num = int(math.ceil(count / 10.0))
    if page_num <= 1:
        return 200
    max_list_num = util.intval(kwargs.get('max_list_num', 5))
    page_list = root.xpath('//td[@class="medtext"]/a/@href')
    for x in xrange(1, page_num + 1):
        if max_list_num and x > max_list_num:
            break
        page_url = 'http://shopping.netsuite.com/s.nl/c.402442/sc.2/.f?search={search}&range={start}%2C{end}%2C{total}'.format(
            search=keyword, start=x * 10 + 1, end=(x + 1) * 10, total=count)
        data_dict['list'].append({
            'id': id,
            'url': page_url,
        })
    return 200
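
The search variants in this and the following examples fill in a caller-supplied data_dict accumulator; a minimal sketch of the shape implied by the appends above (keyword, id and max_list_num are made-up values):

# Hypothetical driver; data_dict must already hold the two lists the function appends to.
data_dict = {
    'list': [],   # error/retry and pagination entries, e.g. {'status', 'url', 'id', 'count'} or {'id', 'url'}
    'url': [],    # matched products, e.g. {'id', 'url', 'goods_sn', 'goods_name'}
}
status = fetch_search_data(keyword='opa333', id=1, data_dict=data_dict,
                           proxy=None, max_list_num=3)
if status == 200:
    for item in data_dict['url']:
        print item['goods_sn'], item['url']
else:
    # non-200 return codes are also mirrored into data_dict['list'] so the caller can retry
    print 'search failed with code %s' % status
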
Example #4
def fetch_search_data(keyword=None,
                      id=None,
                      data_dict=None,
                      headers=None,
                      proxy=None,
                      **kwargs):
    """获取搜索数据"""
    if keyword:
        if not kwargs.get('other_usage', False):
            print 'Fetching data for keyword %s from ti.com' % keyword
        url = 'http://www.ti.com/sitesearch/docs/partnumsearch.tsp?sort=asc&linkId=2&filter=p&sortBy=pstatus&searchTerm=%s' % keyword
    elif 'url' in kwargs:
        url = kwargs['url']
    else:
        return 404

    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = None
        if proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {'http': 'http://' + proxy[1][i]}
        resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
    except Exception as e:
        logger.debug('STATUS:-400 ; INFO:request failed, %s ; URL:%s' %
                     (util.traceback_info(e), url))
        if 'Invalid URL' not in str(e):
            data_dict['list'].append({
                'status': -400,
                'url': url,
                'id': id,
                'count': kwargs.get('count', 1)
            })
        return -400

    if resp.status_code != 200:
        if resp.status_code == 404 and '404.html' in resp.url:
            logger.info('STATUS:404; INFO:invalid product; URL: %s' % url)
            return 404
        logger.debug('STATUS:-405 ; INFO:request error, HTTP status %s ; PROXY:%s ; URL:%s' %
                     (resp.status_code, proxies['http'] if proxy else '', url))
        data_dict['list'].append({
            'status': -405,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return -405
    resp.encoding = 'utf-8'
    resp_json = {}
    try:
        resp_json = json.loads(resp.content)
        product = resp_json.get('response', {}).get('searchResults',
                                                    {}).get('PartNoArray', [])
        # print len(product)
    except:
        product = []
        logger.debug('STATUS:-404 ; INFO:bad response data ; URL:%s' % url)
    if len(product) <= 0:
        data_dict['list'].append({
            'status': 404,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return 404
    links = product
    for vo in links:
        pn = vo.get('PartNumber', '')
        tn = vo.get('PartType', '')
        if pn:
            link = 'http://www.ti.com/product/%s' % pn
            if 'tool' in tn:
                link = 'http://www.ti.com/tool/%s' % pn
            data_dict['url'].append({'id': id, 'url': link, 'goods_sn': pn})
    if 'startNum=' in resp.url:
        return 200
    page_num = 0
    count = 0
    try:
        count = resp_json.get('response',
                              {}).get('searchResults',
                                      {}).get('filter',
                                              {}).get('MaxRecordCount', '')
        count = util.intval(count)
    except:
        count = 0
    page_num = int(math.ceil(count / 25.0))
    if page_num <= 1:
        return 200
    # parameters for paging through the remaining results
    max_list_num = util.intval(kwargs.get('max_list_num', 5))
    for x in xrange(1, page_num + 1):
        if max_list_num and x > max_list_num:
            break
        url = 'http://www.ti.com/sitesearch/docs/partnumsearch.tsp?sort=asc&linkId=2&startNum=%d&filter=p&sortBy=pstatus&searchTerm=%s' % (
            25 * x, keyword)
        page_url = url
        data_dict['list'].append({
            'id': id,
            'url': page_url,
        })
    return 200
Example #5
def fetch_search_data(keyword=None,
                      id=None,
                      data_dict=None,
                      headers=None,
                      proxy=None,
                      **kwargs):
    """获取搜索数据"""
    if keyword:
        print 'Fetching data for keyword %s from avnet' % keyword
        url = "https://www.avnet.com/search/resources/store/715839038/productview/bySearchTerm/select?searchType=102&profileName=Avn_findProductsBySearchTermCatNav_Ajax&searchSource=Q&landingPage=true&storeId=715839038&catalogId=10001&langId=-1&currency=USD&orgEntityId=-2000&responseFormat=json&pageSize=20&pageNumber=1&_wcf.search.internal.boostquery=price_USD:{{0.00001+TO+*}}^499999.0+inStock:%22true%22^9000.0+topSellerFlag:%22Yes%22^0.085+newProductFlag:%22Yes%22^0.080+packageTypeCode:%22BKN%22^0.075&_wcf.search.internal.filterquery=-newProductFlag%3ANPI&q={keyword}&intentSearchTerm={keyword}&searchTerm={keyword}&wt=json".format(
            keyword=keyword)
    elif 'url' in kwargs:
        url = kwargs['url']
    else:
        return 404
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = kwargs.get('proxies')
        if proxies is None and proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {
                'http': 'http://' + proxy[1][i],
                'https': 'https://' + proxy[1][i]
            }
        resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
    except Exception as e:
        logger.debug('STATUS:-400 ; INFO:request failed, %s ; URL:%s' %
                     (util.traceback_info(e), url))
        if 'Invalid URL' not in str(e):
            data_dict['list'].append({
                'status': -400,
                'url': url,
                'id': id,
                'count': kwargs.get('count', 1)
            })
        return -400
    if resp.status_code != 200:
        if resp.status_code == 404 and '404.html' in resp.url:
            logger.info('STATUS:404; INFO:invalid product; URL: %s' % url)
            return 404
        logger.debug('STATUS:-405 ; INFO:request error, HTTP status %s ; PROXY:%s ; URL:%s' %
                     (resp.status_code, proxies['http'] if proxy else '', url))
        data_dict['list'].append({
            'status': -405,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return -405
    resp.encoding = 'utf-8'
    # start parsing resp
    # get the number of search results
    search_dict = {}
    try:
        search_dict = json.loads(resp.text.encode('utf-8'))
        product_list = search_dict.get('catalogEntryView', [])
    except:
        product_list = []
        logger.debug('STATUS:-404 ; INFO:bad response data ; URL:%s' % url)
    if len(product_list) <= 0:
        data_dict['list'].append({
            'status': 404,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return 404
    # sn = product.xpath('.//td[@class="partColHeader"]//span[@class="defaultSearchText"]')
    for product in product_list:
        goods_sn = product.get('seo_token_ntk', '')
        base_url = 'https://www.avnet.com/shop/apac/'
        product_url = product.get('avn_pdp_seo_path', '')
        data_dict['url'].append({
            'id': id,
            'url': util.urljoin(base_url, product_url),
            'goods_sn': goods_sn
        })
    if 'showMore=true' in url:
        return 200
    count = search_dict.get('recordSetTotal', 0)
    page_num = int(math.ceil(count / 20.0))
    if page_num <= 1:
        return 200
    max_list_num = util.intval(kwargs.get('max_list_num', 5))
    for x in xrange(2, page_num + 1):
        if max_list_num and x > max_list_num:
            break
        page_url = 'https://www.avnet.com/search/resources/store/715839038/productview/bySearchTerm/select?searchType=102&profileName=Avn_findProductsBySearchTermCatNav_More_Ajax&searchSource=Q&landingPage=true&storeId=715839038&catalogId=10001&langId=-1&currency=USD&orgEntityId=-2000&responseFormat=json&pageSize=20&pageNumber={next_page}&_wcf.search.internal.boostquery=price_USD:{{0.00001+TO+*}}^499999.0+inStock:%22true%22^9000.0+topSellerFlag:%22Yes%22^0.085+newProductFlag:%22Yes%22^0.080+packageTypeCode:%22BKN%22^0.075&_wcf.search.internal.filterquery=-newProductFlag:NPI&q={keyword}&intentSearchTerm={keyword}&searchTerm={keyword}&showMore=true&wt=json'.format(
            next_page=x, keyword=keyword)
        # print page_url
        data_dict['list'].append({
            'id': id,
            'url': page_url,
        })
    return 200
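
All three search variants page through results the same way: a page count is derived from the total hit count and the per-page size, then capped by max_list_num. A standalone sketch of that arithmetic with the avnet page size of 20 (the hit count is an assumed value):

import math

count = 137            # assumed recordSetTotal returned by the first search page
page_size = 20         # avnet returns 20 products per page
max_list_num = 5       # default cap, kwargs.get('max_list_num', 5)

page_num = int(math.ceil(count / float(page_size)))   # 7 pages in total
extra_pages = []
for x in xrange(2, page_num + 1):                      # page 1 was already parsed
    if max_list_num and x > max_list_num:
        break
    extra_pages.append(x)
print extra_pages  # -> [2, 3, 4, 5]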