def crawler(keyword, page, count, cat_list=''):
    """Crawl one page of goods, persist them and return the newly seen ones.

    Args:
        keyword: Search keyword forwarded to ``_crawler``. It is also stored
            as each item's ``source``; ``'crawler'`` is used when it is falsy.
        page: Page argument forwarded to ``_crawler``.
        count: Count argument forwarded to ``_crawler``.
        cat_list: Optional list of category ids (strings). A non-empty list
            is comma-joined into the ``cat`` argument; any other value
            results in ``cat=''``.

    Returns:
        list: The goods dicts that passed ``check_save()`` and for which
        ``find_goods_by_id()`` returned nothing. An empty list is returned
        when ``_crawler`` yields ``None``.
    """
    if isinstance(cat_list, list) and cat_list:
        cat = ','.join(cat_list)
    else:
        cat = ''

    fetched = _crawler(keyword=keyword, page=page, count=count, cat=cat)
    if fetched is None:
        return []

    new_goods = []
    for raw_goods in fetched:
        item = _ship_goods_supers(raw_goods)
        if not item:
            # Nothing usable came back for this entry.
            continue
        item['table'] = 'goods'

        # Record the category (and the sub-category when one is present).
        Category(
            id=item['category_id'],
            name=item['category_name'],
        ).save_category()
        if item.get("sub_category_id"):
            SubCategory(
                id=item['sub_category_id'],
                name=item.get('sub_category_name', ''),
                parent=item['category_id'],
            ).save_category()

        item['source'] = keyword or 'crawler'

        goods_instance = TbkGoods(**item)
        if not goods_instance.check_save():
            continue

        # Only goods that cannot be found by id get similar-goods ids
        # attached and are reported back to the caller.
        if not goods_instance.find_goods_by_id():
            goods_instance.similar_goods = crawler_similar(item['num_id'])
            new_goods.append(item)

        save_result = goods_instance.save()
        searcher.update_index(item)
        LOG.debug(save_result)

    return new_goods
def get_shangxi_content(shangxi_id):
    """Download the "shangxi" fragment for an id and return its text.

    The function sleeps for 10 seconds before issuing the request, then
    collects the text of every ``<p>`` directly under
    ``<div class="contyishang">``, strips the ``▲`` marker and terminates
    each paragraph with a newline.

    Args:
        shangxi_id: Value sent as the ``id`` query parameter.

    Returns:
        The concatenated paragraph text, or ``''`` when the download returned
        nothing (a debug message is logged in that case) or no paragraph
        matched.
    """
    url = 'https://so.gushiwen.org/shiwen2017/ajaxshangxi.aspx'
    params = {'id': shangxi_id}

    # Fixed pause before every request.
    time.sleep(10)
    page_content = HttpClient().get(url, params=params)
    if not page_content:
        LOG.debug("down page error: %s, params: %s", url, params)
        return ''

    # NOTE(review): `unicode` is presumably the Python 2 builtin - confirm
    # the interpreter version this module targets.
    dom = fromstring(unicode(page_content, 'utf-8'))
    paragraphs = dom.xpath("//div[@class='contyishang']/p")
    return ''.join(
        paragraph.xpath("string(.)").replace(u"▲", "") + '\n'
        for paragraph in paragraphs
    )
def crawler_author_record(author):
    """Crawl every poetry listing page of an author and save each poem.

    Starting from ``author['poetry_link']``, each listing page is passed to
    ``detail_crawler``, which returns the detail links on that page and the
    link of the following page. Every detail link is fetched with
    ``get_detail_url`` and stored with ``save_crawled_poetry``. Crawling stops
    when the next-page link is falsy.

    A failure on one poem is logged (with traceback) and does not abort the
    rest of the crawl.

    Args:
        author: Mapping with at least ``'poetry_link'`` (first listing page)
            and ``'id'`` (passed to ``get_detail_url`` and used in log output).

    Returns:
        None.
    """
    author_id = author['id']
    next_page = author['poetry_link']

    while next_page:
        # Keep the URL of the page being processed: detail_crawler() rebinds
        # next_page to the *following* page, so logging next_page afterwards
        # would report the wrong page (and a falsy value on the last one).
        current_page = next_page
        detail_links, next_page = detail_crawler(current_page)

        saved = 0  # poems saved from the current page only
        for poetry_link in detail_links:
            try:
                poetry_data = get_detail_url(poetry_link, author_id)
                poetry_id = save_crawled_poetry(poetry_data)
            except Exception as ex:
                # Best-effort crawl: record the failure and move on.
                LOGGER.error("link: %s, ex: %s", poetry_link, ex,
                             exc_info=True)
            else:
                if poetry_id:
                    saved += 1
                    LOGGER.debug("save poetry: %s, authorid: %s",
                                 poetry_id, author_id)
            # NOTE: a randomized delay between detail requests
            # (time.sleep(random.randint(6, 10))) is currently disabled.

        LOGGER.info("page: %s, save: %s", current_page, saved)
def get_one_goods(cat=None):
    """Pick one random, not-yet-sent goods item from a category.

    Args:
        cat: Category id to search in. When ``None``, a category is chosen at
            random among those returned by
            ``Category(recommend=1).all_category()``.

    Returns:
        One goods record chosen at random from up to 20 matches, or ``{}``
        when no category is available or no goods match the filter.
    """
    if cat is None:
        categories = Category(recommend=1).all_category()
        cat_ids = [int(category['id']) for category in categories]
        if not cat_ids:
            # random.choice() raises IndexError on an empty sequence; treat
            # "no recommended category" like "no goods found".
            return {}
        cat_id = random.choice(cat_ids)
    else:
        cat_id = cat

    # Only goods created within the last 8 days (86400 seconds per day).
    start = time.time() - 8 * 86400
    cond = {
        "coupon_amount": {'$gt': 5},
        # time.time() is in seconds; the stored value is presumably epoch
        # milliseconds, hence the factor 1000 - TODO confirm.
        "created": {'$gt': start * 1000},
        "sales": {'$gt': 3000},
        'category_id': cat_id,
        # Skip goods that already carry a "sended" field.
        "sended": {'$exists': False},
        "coupon_expire": 0,
    }
    LOG.debug(cond)

    # NOTE(review): the positional 1 is presumably the page number - confirm
    # against TbkGoods.find_goods_by_cond.
    goods_list = list(TbkGoods().find_goods_by_cond(cond, 1, count=20))
    if not goods_list:
        return {}
    return random.choice(goods_list)