def get_time_desc(t):
    """Render a duration of ``t`` seconds as an hours/minutes/seconds string.

    :param t: duration in seconds (may be fractional)
    :return: concatenated description, e.g. ``u'1 小时2 分3.5 秒'``; hour and
        minute parts are omitted when zero, the seconds part is (normally)
        always present
    """
    parts = []
    hours = int(t / 3600)
    if hours >= 1:
        parts.append('%s 小时' % hours)
    minutes = int((t - hours * 3600) / 60)
    if minutes >= 1:
        parts.append('%s 分' % minutes)
    # Remaining seconds, formatted to 3 places by the project helper.
    seconds = util.number_format(t - hours * 3600 - minutes * 60, 3)
    if seconds >= 0:
        parts.append('%s 秒' % seconds)
    return ''.join(parts)
def fetch_search_data(keyword=None, id=None, data_dict=None, headers=None, proxy=None, **kwargs):
    """Fetch search-result data from richardsonrfpd for ``keyword`` (or for an
    explicit ``kwargs['url']``) and accumulate findings into ``data_dict``.

    :param keyword: search keyword; when given, the advanced-search page is queried
    :param id: caller-supplied task id, echoed back into every appended entry
    :param data_dict: shared accumulator; per-product detail links are appended
        to ``data_dict['url']`` and retry/paging items to ``data_dict['list']``
    :param headers: optional extra HTTP headers merged over ``default_headers``
    :param proxy: optional pool — assumes ``(pool_size, [host:port, ...])``;
        a random entry is chosen (TODO confirm shape against callers)
    :param kwargs: recognized keys: ``url``, ``proxies``, ``count``, ``max_list_num``
    :return: int status: 200 ok, 404 nothing found / invalid product,
        -400 request error, -404 goods_sn parse failure, -405 bad HTTP status
    """
    if keyword:
        # Python 2 print statement — this module is Python 2 only.
        print '正在获取 richardsonrfpd 中关键词:%s 的相关数据' % keyword
        url = 'http://www.richardsonrfpd.com/Pages/AdvanceSearch.aspx'
    elif 'url' in kwargs:
        url = kwargs['url']
    else:
        # Neither a keyword nor an explicit URL: nothing to fetch.
        return 404
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = kwargs.get('proxies')
        if proxies is None and proxy:
            # Pick one proxy at random from the pool.
            i = random.randint(0, proxy[0] - 1)
            proxies = {
                'http': 'http://' + proxy[1][i],
                'https': 'https://' + proxy[1][i]
            }
        response = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
        resp = do_search(response, keyword)
        # do_search signals failure by returning an int; escalate into the
        # except path so it is logged and queued like any request error.
        if isinstance(resp, int):
            raise ValueError
    except Exception as e:
        logger.debug('STATUS:-400 ; INFO:数据请求异常, %s ; URL:%s' % (util.traceback_info(e), url))
        if 'Invalid URL' not in str(e):
            # Queue for retry bookkeeping unless the URL itself is malformed.
            data_dict['list'].append({
                'status': -400,
                'url': url,
                'id': id,
                'count': kwargs.get('count', 1)
            })
        return -400
    if resp.status_code != 200:
        if resp.status_code == 404 and '404.html' in resp.url:
            logger.info('STATUS:404; INFO:无效产品; URL: %s' % url)
            return 404
        logger.debug('STATUS:-405 ; INFO:请求错误,网页响应码 %s ; PROXY:%s ; URL:%s' % (resp.status_code, proxies['http'] if proxy else '', url))
        data_dict['list'].append({
            'status': -405,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return -405
    resp.encoding = 'utf-8'
    # Begin parsing resp: collect the product rows from the result page.
    if 'Search-Results.aspx' in resp.url:
        # NOTE(review): this assignment is immediately overwritten by the xpath
        # below — presumably analyse_product_url is wanted for its side effects,
        # or the original indentation differed; confirm against history.
        product_list = analyse_product_url(resp)
    root = lxml.html.fromstring(resp.text.encode('utf-8'))
    product_list = root.xpath('//tr[@valign="top"][@height=85]')
    if len(product_list) <= 0:
        # No product rows on the page: record a 404 entry for this URL.
        data_dict['list'].append({
            'status': 404,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return 404
    for product in product_list:
        detail = product.xpath('.//a[@class="lnk12b-blackOff"]')
        detail_url = util.urljoin(
            resp.url, detail[0].xpath('./@href')[0]) if detail else ''
        match = goods_sn_pattern.search(detail_url)
        if not match and detail_url:
            logger.debug(u"无法匹配链接中的goods_sn URL{url}".format(url=detail_url))
            # One unparsable link aborts processing of the entire page.
            return -404
        # NOTE(review): if detail is empty, detail_url is '' and match is None,
        # so match.group(1) raises AttributeError here — verify that every
        # matched row always contains the detail anchor.
        goods_sn = match.group(1)
        goods_name = detail[0].text_content() if detail else ''
        data_dict['url'].append({
            'id': id,
            'url': detail_url,
            'goods_sn': goods_sn,
            'goods_name': goods_name,
        })
    if 'showMore=true' in url:
        # A "show more" page carries no further pagination to enqueue.
        return 200
    # Total result count drives pagination (10 results per page).
    count = root.xpath('//td[@class="medtext"]')
    count = util.number_format(count[0].text, places=0, index=999, smart=True) if count else 0
    page_num = int(math.ceil(count / 10.0))
    if page_num <= 1:
        return 200
    # Cap on how many listing pages to enqueue (0/None disables the cap).
    max_list_num = util.intval(kwargs.get('max_list_num', 5))
    page_list = root.xpath('//td[@class="medtext"]/a/@href')  # NOTE(review): unused
    for x in xrange(1, page_num + 1):
        if max_list_num and x > max_list_num:
            break
        page_url = 'http://shopping.netsuite.com/s.nl/c.402442/sc.2/.f?search={search}&range={start}%2C{end}%2C{total}'.format(
            search=keyword,
            start=x * 10 + 1,
            end=(x + 1) * 10,
            total=count)
        data_dict['list'].append({
            'id': id,
            'url': page_url,
        })
    return 200