def crawl(self): url = "http://mobile.gome.com.cn/mobile/product/allCategorys.jsp" jsons = ProcessData.get_json_data(url) if jsons == {}: return {} category1 = jsons['firstLevelCategories'] for first_item in category1: name1 = first_item['goodsTypeName'] #1 lev name try: category2 = first_item['goodsTypeList'] except: pass for second_item in category2: name2 = second_item['goodsTypeName'] try: category3 = second_item['goodsTypeList'] except: pass for third_item in category3: try: third_id = third_item['goodsTypeId'] name3 = third_item['goodsTypeLongName'] except: pass priorcategory = [] priorcategory.append(name1) priorcategory.append(name2) priorcategory.append(name3) data = {'priorcategory': priorcategory} # if name3 != u"冰箱" and name3 != u"空调": # continue Scheduler.schedule(ListCrawler.type, key=third_id, data=data, interval=86400)
def run(self):
    while True:
        if not self.runValue.value:
            print "%s stops" % self.name
            break
        self.signalget()
        start_t = time.time()
        self.ctask = self.taskqueue.get()
        if self.ctask.empty:
            time.sleep(10)
            continue
        end_t = time.time()
        self.log_wait_task(end_t - start_t)
        self.log_get_task()
        start_t = time.time()
        c = Crawler().create(self.ctask.type, self.ctask.key, self.ctask.data)
        if c:
            try:
                c.crawl()
                success = True
                logger.info("CRAWL SUCCEED - <%s> %s" % (self.taskqueue.queueid, c))
                end_t = time.time()
                self.log_done_task(end_t - start_t)
            except Exception:
                msg = get_exception_info()
                success = False
                logger.error("CRAWL FAILED - <%s> %s, %s" % (self.taskqueue.queueid, c, msg))
        else:
            logger.error("CRAWL FAILED - <%s> %s" % (self.taskqueue.queueid, self.ctask))
            success = False
        Scheduler.finish(self.ctask.type, self.ctask.key, c.data if c else {}, success)
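# get_exception_info() is defined elsewhere in the project; a minimal sketch of
# what the run() loop above assumes it does -- format the traceback of the
# exception currently being handled -- under that assumption:
import traceback

def get_exception_info():
    # return the active exception and its traceback as one string for logging
    return traceback.format_exc()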
def init(conf=None):
    from xlutils.copy import copy
    import xlrd
    import os

    SRC_PATH = os.path.dirname(__file__)
    bk = xlrd.open_workbook(os.path.join(SRC_PATH, "../../file/weixin.xls"))
    sh = bk.sheet_by_name('Sheet1')
    nrows = sh.nrows
    ncols = sh.ncols
    for i in xrange(1, nrows):
        data = {
            'publisher': sh.cell_value(i, 0).strip(),
            'province': sh.cell_value(i, 1).strip(),
            'city': sh.cell_value(i, 2).strip(),
            'district': sh.cell_value(i, 3).strip()
        }
        key = sh.cell_value(i, 6).strip()
        Scheduler.schedule(FirstCrawler.type, key=key, data=data,
                           interval=28800, reset=True)
def crawl(self):
    json_data = ProcessData.get_json_data(self.get_json_url(self.key))
    is_Bbc = self.get_is_Bbc(json_data)
    status = self.get_status(json_data)
    response = self.get_response(self.key)
    tree = etree.HTML(response.text)
    info = self.get_info(tree)
    crawl_data = {
        "source": self.data["source"],
        "source_id": self.key,
        "status": status,
        "comment": {
            "is_Bbc": is_Bbc,
        },
    }
    crawl_data.update(info)
    crawl_data.update(extract_category(self))
    crawl_data.update(get_ctime())
    model = EcDetailModel(crawl_data)
    export(model)
    comment_data = {
        "uuid": model["id"],
        "status": model["status"],
        "version": model["version"],
        "series": model["series"],
        "brand": model["brand"],
        "is_Bbc": model["comment"]["is_Bbc"],
    }
    Scheduler.schedule(CommentCrawler.type, key=self.key, data=comment_data)
def init(conf=None):
    from xlutils.copy import copy
    import xlrd
    import os

    SRC_PATH = os.path.dirname(__file__)
    bk = xlrd.open_workbook(os.path.join(SRC_PATH, "../../file/newyuqing.xls"))
    sh = bk.sheet_by_name('Sheet1')
    nrows = sh.nrows
    ncols = sh.ncols
    for i in range(1, nrows):
        types = sh.cell_value(i, 1).strip()
        province = sh.cell_value(i, 2).strip()
        city = sh.cell_value(i, 3).strip()
        district = sh.cell_value(i, 4).strip()
        data = {
            'type': types,
            'province': province,
            'city': city,
            'district': district,
            'publisher': (province + city + district + types)
        }
        key = sh.cell_value(i, 5).strip()
        # print data['publisher'].encode('utf-8')
        if key == '':
            continue
        Scheduler.schedule(FirstCrawler.type, key=key, data=data,
                           interval=14800, reset=True)
def crawl(self):
    global COOKIE
    category_data = extract_category(self)
    response = self.get_response(self.key)
    if COOKIE != response.headers.get("set-cookie", ""):
        COOKIE = response.headers.get("set-cookie", "")
    tree = etree.HTML(response.text)
    info = self.get_info(tree)
    crawl_data = {
        'source': "amazon",
        'source_id': self.key,
        'status': 1,
    }
    crawl_data.update(info)
    crawl_data.update(category_data)
    crawl_data.update(get_ctime())
    model = EcDetailModel(crawl_data)
    export(model)
    comment_data = {
        "uuid": model["id"],
        "brand": model["brand"],
        "version": model["version"],
        "series": model["series"],
        "is_Bbc": model["comment"]["is_Bbc"],
        'status': model["status"],
    }
    Scheduler.schedule(CommentCrawler.type, key=self.key, data=comment_data)
def crawl(self):
    # fid = '1662'
    # priorcategory = ["家居家装","清洁用品","衣物清洁"]
    # presentcategory = ['1','2','3']
    fid = self.key
    category_data = extract_category(self)
    count = 3   # initial page count, corrected after the first request
    pages = 1   # start from the first page
    while pages <= count:
        url = self.get_url(fid, pages)
        try:
            jsons = ProcessData.get_json_data(url)
            if pages == 1:
                # float divisor so the page count is rounded up, not truncated
                count = math.ceil(int(jsons['wareCount']) / 100.0)
            lists = jsons['wareInfo']
        except Exception, e:
            self.logger.error(url)
            self.logger.error(e)
            print 'error ', url
            return
        if lists == []:
            return {}
        for i in range(len(lists)):
            ids = uuid.uuid1()  # cassandra primary key
            wareId = lists[i]['wareId']
            try:
                # "good" is a percentage string such as "95%"
                f = lambda x: int(x[:-1]) / 100.00
                ecsumscores = float(f(lists[i]['good']))  # overall product score
            except:
                ecsumscores = 0
            crawl_data = {
                # 'id': uuid.uuid1(),
                'source_id': wareId,
                'source': self.data.get('source'),
                'summary': {},
                'title': lists[i]['wname'],
                'adword': lists[i]['adword'],
                'price': float(lists[i]['jdPrice']),
                'original_price': float(lists[i]['martPrice']),
                'score': ecsumscores
            }
            crawl_data.update(category_data)
            data = {
                # 'uuid': ids,
                'priorcategory': self.data['priorcategory'],
                'presentcategory': self.data['priorcategory']
                # 'presentcategory': self.data['presentcategory']
            }
            model = EcBasicModel(crawl_data)
            export(model)
            data["uuid"] = model["id"]
            Scheduler.schedule(DetailCrawler.type, key=wareId, data=data)
            Scheduler.schedule(CommentCrawler.type, key=wareId, data=data)
        pages += 1
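# The page-count and score conversions above are easy to get wrong in Python 2,
# so here is a standalone illustration with made-up values:
import math

ware_count = 250                              # hypothetical jsons['wareCount']
pages = int(math.ceil(ware_count / 100.0))    # -> 3 (float divisor avoids truncation)

good = "95%"                                  # the "good" field is a percentage string
score = int(good[:-1]) / 100.00               # -> 0.95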
def crawler_data(self, tree):
    category_data = extract_category(self)
    XPATH = self.search_list_xpath
    if len(tree.xpath(XPATH('list'))) == 0:
        XPATH = self.product_list_xpath
    dom = tree.xpath(XPATH('list'))
    for item in dom:
        crawl_data = {}
        craw = [
            'title', 'adword',
            'price', 'original_price',
            'source_id', 'score',
        ]
        for value in craw:
            crawl_data[value] = self.mackining(item.xpath(XPATH(value)))
        crawl_data['price'] = float(crawl_data['price'])
        try:
            f = lambda x: int(x[:-1]) / 100.00
            crawl_data['score'] = float(f(crawl_data['score']))
        except:
            crawl_data['score'] = 0
        crawl_data.update(category_data)
        crawl_data['source'] = 'yhd'
        model = EcBasicModel(crawl_data)
        export(model)
        data = {
            'priorcategory': self.data['priorcategory'],
            'presentcategory': self.data['priorcategory']
        }
        data["uuid"] = model["id"]
        Scheduler.schedule(DetailCrawler.type, key=str(self.key), data=data)
def crawl(self):
    catId = str(self.key)
    category_data = extract_category(self)
    totalpage = self.get_page(catId)
    if totalpage == 0:
        return {}
    for i in range(1, totalpage + 1):
        url = self.get_url(catId, i)
        jsons = ProcessData.get_json_data(url)
        try:
            goodsList = jsons['goodsList']
        except Exception, e:
            self.logger.error(url)
            self.logger.error(e)
            print "get goodsList fail"
            continue  # skip this page instead of reusing an undefined goodsList
        for j in range(len(goodsList)):
            goods = goodsList[j]
            goodsNo = goods['goodsNo']
            goodsName = goods['goodsName']
            skuID = goods['skuID']
            goods_find = self.has_goods(goodsNo)
            if not goods_find:
                data = {
                    'priorcategory': self.data['priorcategory'],
                    'skuID': skuID,
                }
                Scheduler.schedule(DetailCrawler.type, key=goodsNo, data=data)
                continue
            adword = self.extract_adword(goods['ad'])
            crawl_data = {
                'id': goods_find['uuid'],
                'source_id': goodsNo,
                'source': self.data.get('source'),
                'title': goods['goodsName'],
                'adword': adword,
                'status': goods_find['status'],
                'price': float(goods['lowestSalePrice']),
                'brand': goods_find['brand'],
                'version': goods_find['version'],
                'series': goods_find['series'],
                'comment': {
                    'is_Bbc': goods_find['isBbc'],
                    'skuId': goods_find['skuID'],
                },
            }
            crawl_data.update(category_data)
            crawl_data.update(get_ctime())
            model = EcBasicModel(crawl_data)
            export(model)
def crawl(self):
    global COOKIE
    keyid = self.key
    category_data = extract_category(self)
    priorcategory = self.data["priorcategory"]
    count = 3
    page = 1  # start from the first page
    while page <= count:
        url = self.get_url(keyid, page)
        html_stream = ProcessData.get_web_data(url)
        if COOKIE != html_stream.headers.get("set-cookie", ""):
            COOKIE = html_stream.headers.get("set-cookie", "")
        html = etree.HTML(html_stream.content)
        if page == 1:
            count = self.getPageSize(html)
        items = html.xpath(self.xpath["item"])
        if not len(items):
            if html.xpath("//input[@id='captchacharacters']"):
                # hit a captcha page: back off briefly and retry the same page
                time.sleep(random.randint(1, 3))
                continue
            else:
                self.remove_task(keyid)
        for item in items:
            source_id = self.get_source_id(item)
            task_data = self.has_goods(source_id)
            if not task_data:
                data = {
                    'priorcategory': priorcategory,
                }
                Scheduler.schedule(DetailCrawler.type, key=source_id, data=data)
            else:
                info = self.get_info(item)
                crawl_data = {
                    'id': task_data["uuid"],
                    'source_id': source_id,
                    'source': "amazon",
                    'brand': task_data["brand"],
                    'version': task_data["version"],
                    'series': task_data["series"],
                    'status': task_data["status"],
                    "comment": {
                        "is_Bbc": task_data["is_Bbc"],
                    }
                }
                crawl_data.update(info)
                crawl_data.update(category_data)
                crawl_data.update(get_ctime())
                model = EcBasicModel(crawl_data)
                export(model)
        page += 1
def crawl(self): homepage = "http://www.jxzj.gov.cn/jxzj/index.html" html_stream = _get_url(homepage) for item in HandleUrl.get_url(html_stream.text): item = HandleUrl.judge_url(item,homepage) text = '^(http|https).+(news).+\.(htm|html|net)$' url_t = re.match(text, item) data = {} if url_t != None: Scheduler.schedule(ContentCrawler.type, key=item, data=data) else: pass
def getchildurl(self, url, data={}):
    html_stream = _get_url(url)
    for item in HandleUrl.get_url(html_stream.text):
        text = r'^(http|https).+(news)\/(zjpd|xfpd|zhuanti|zgzlb).+\d\.(htm|html|net)$'
        url_t = re.match(text, item)
        if url_t is not None:
            # ContentCrawler(key=item).crawl()
            # print item
            Scheduler.schedule(ContentCrawler.type, key=item, data=data)
def crawl(self): homepage = "http://www.hzqts.gov.cn/zwpd/index.htm" html_stream = _get_url(homepage) for item in HandleUrl.get_url(html_stream.text): item = HandleUrl.judge_url(item, homepage) text = '^(http|https).+(zwpd|qypd|smpd).+[^(Index)]\.(htm|html|net)$' url_t = re.match(text, item) data = {} if url_t != None: # ContentCrawler(key=item).crawl() Scheduler.schedule(ContentCrawler.type, key=item, data=data) else: pass
def crawl(self): homepage = "http://www.hzqts.gov.cn/zwpd/index.htm" html_stream = _get_url(homepage) for item in HandleUrl.get_url(html_stream.text): item = HandleUrl.judge_url(item,homepage) text = '^(http|https).+(zwpd|qypd|smpd).+[^(Index)]\.(htm|html|net)$' url_t = re.match(text, item) data = {} if url_t != None: # ContentCrawler(key=item).crawl() Scheduler.schedule(ContentCrawler.type, key=item, data=data) else: pass
def crawl(self): homepage = "http://www.hbzljd.gov.cn/" html_stream = _get_url(homepage) for item in HandleUrl.get_url(html_stream.text): item = HandleUrl.judge_url(item,homepage) text = '^(http|https).+\d\.(htm|html|net)$' url_t = re.match(text, item) data = {} if url_t != None: # print item.encode('utf-8') Scheduler.schedule(ContentCrawler.type, key=item, data=data) else: pass
def crawl(self): homepage = "http://www.gzq.gov.cn/" html_stream = _get_url(homepage) for item in HandleUrl.get_url(html_stream.text): item = HandleUrl.judge_url(item, homepage) text = '^(http|https).+(public).+.+\d$' url_t = re.match(text, item) data = {} if url_t != None: # print item.encode('utf-8') Scheduler.schedule(ContentCrawler.type, key=item, data=data) else: pass
def crawl(self):
    homepage = self.key
    data = self.data
    html_stream = _get_url(homepage)
    for item in HandleUrl.get_url(html_stream.text):
        item = HandleUrl.judge_url(item, homepage)
        if re.search('(ndex)', item):
            continue
        text = r'^(http|https).+\d\.(htm|html|net|php)$'
        url_t = re.match(text, item)
        if url_t is not None:
            Scheduler.schedule(ContentCrawler.type, key=item, data=data)
def crawl(self): homepage = "http://www.fsjsjd.gov.cn/" html_stream = _get_url(homepage) for item in HandleUrl.get_url(html_stream.text): xp_putime = "//a[@href='%s']/parent::*/text()"%item pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime) item = HandleUrl.judge_url(item,homepage) text = '^(http|https).+\d\.(htm|html|net)$' url_t = re.match(text, item) data = {} if url_t != None: data['pubtime'] = pubtime Scheduler.schedule(ContentCrawler.type, key=item, data=data) else: pass
def crawl(self): homepage = "http://www.fsjsjd.gov.cn/" html_stream = _get_url(homepage) for item in HandleUrl.get_url(html_stream.text): xp_putime = "//a[@href='%s']/parent::*/text()" % item pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime) item = HandleUrl.judge_url(item, homepage) text = '^(http|https).+\d\.(htm|html|net)$' url_t = re.match(text, item) data = {} if url_t != None: data['pubtime'] = pubtime Scheduler.schedule(ContentCrawler.type, key=item, data=data) else: pass
def crawl(self): homepage = "http://www.bjtsb.gov.cn/" html_stream = _get_url(homepage) for item in HandleUrl.get_url(html_stream.text): # print '----',item item = HandleUrl.judge_url(item,homepage) # print '====',item text = ur'(http).+(infoview).+\d{3,8}$' url_t = re.match(text, item) data = {} if url_t != None: # print item.encode('utf-8') Scheduler.schedule(ContentCrawler.type, key=item, data=data) else: pass
class ThirdCrawler(Crawler):

    type = "ecommerce.yhd.thirdlvl"

    def crawl(self):
        cid = str(self.key)
        categorys = self.data['priorcategory']
        url = "http://interface.m.yhd.com/\
mcategory/servlet/CentralMobileFacadeJsonServlet/\
getNavCategoryWithKeywordByRootCategoryId?rootCategoryId=\
%s&categoryNavId=0&provinceId=1" % (cid)
        try:
            jsons = ProcessData.get_json_data(url.replace(' ', ''))
            data = jsons['data']
        except Exception, e:
            self.logger.error(url)
            self.logger.error(e)
            print 'error ', url
            return  # without data there is nothing to iterate below
        for item in data:
            priorcategory = []
            priorcategory.extend(categorys)
            priorcategory.append(item['categoryName'])
            if 'boundCategoryId' in item:
                keys = item['boundCategoryId']
            else:
                continue
            data = {
                'priorcategory': priorcategory,
            }
            # if priorcategory[2] != u"冰箱" and priorcategory[2] != u"空调":
            #     continue
            Scheduler.schedule(ListCrawler.type, key=keys, data=data, interval=86400)
def is_detail_done(self):
    terms = {
        "type": DetailCrawler.type,
        "$or": [{"status": 1}, {"status": 0}],
    }
    result = Scheduler.find_one(DetailCrawler.type, terms)
    return False if result else True
def crawl(self):
    world = self.key
    data = self.data
    # world = str(self.key)
    data.update({
        'type': u'元搜索',
        'origin_source': u'微信搜索',
        'key': world
    })
    homepage = "http://weixin.sogou.com/weixinwap?ie=utf8&w=&\
type=2&t=1427703547684&s_t=&fr=sgsearch&\
query=" + world + "&pg=webSearchList"
    homepage = clear_space(homepage)
    html_stream = _get_url(homepage)
    for item in HandleUrl.get_url(html_stream.text):
        item = HandleUrl.judge_url(item)
        if item == '':
            continue
        Scheduler.schedule(ContentCrawler.type, key=item, data=data)
def is_first(self, key):
    terms = {
        "type": ListCrawler.type,
        "key": key,
        "lastrun": datetime.min,
    }
    result = Scheduler.find_one(ListCrawler.type, terms)
    return True if result else False
def save_list(self, items, **args):
    for item in items:
        source_id = self.mackining(item.xpath(self.search_list_xpath('source_id')))
        if not source_id:
            continue
        task_data = self.has_goods(source_id)
        if not task_data:
            data = {
                'priorcategory': self.data['priorcategory'],
            }
            Scheduler.schedule(DetailCrawler.type, key=source_id, data=data)
            continue
        crawl_data = self.get_crawl_data(item,
                                         category_data=args['category_data'],
                                         source_id=source_id,
                                         task_data=task_data)
        model = EcBasicModel(crawl_data)
        export(model)
def crawl(self):
    worlds = str(self.key)
    world = '+'.join(worlds.split(','))
    data = self.data
    homepage = "http://news.baidu.com/ns?ct=0&rn=20&ie=utf-8&bs=" + world + "&\
rsv_bp=1&sr=0&cl=2&f=8&prevct=no&tn=news&word=" + world
    # homepage = "http://news.baidu.com/ns?ct=0&rn=20&ie=utf-8&bs=intitle:\
    # ("+world+")&rsv_bp=1&sr=0&cl=2&f=8&\
    # prevct=no&tn=newstitle&word="+world
    homepage = clear_space(homepage)
    html_stream = _get_url(str(homepage))
    xp_content = "//div[@id='content_left']/ul/li"
    items = HandleContent.get_item(html_stream, xp_content)
    xp_title = "h3[@class='c-title']//text()"
    xp_str = "div//p[@class='c-author']/text()"
    # xp_str = "div[@class='c-title-author']/text()"
    xp_url = "h3[@class='c-title']/a/@href"
    xp_count = "div//span[@class='c-info']/a[@class='c-more_link']/text()"
    for item in items:
        date = new_time()
        title = HandleContent.get_context(item, xp_title, text=True)
        pt_text = HandleContent.get_context(item, xp_str, text=True)
        publisher = HandleContent.get_author(pt_text, xp_text='', STR=True)
        pubtime = HandleContent.find_pubtime(pt_text)
        pubtime = local2utc(pubtime) if pubtime else date.get('utctime')
        url = HandleContent.get_context(item, xp_url, text=True)
        count = HandleContent.get_context(item, xp_count, text=True)
        try:
            count = int(count.split(u'条相同新闻', 1)[0]) if count else 0
        except:
            count = 0
        crawl_data = {
            # 'url': url,
            'title': title,
            'pubtime': pubtime,
            'source': u'baidu',
            'publisher': publisher,
            'count': str(count),
            'key': world,
            'source_type': data.get('source_type', ''),
        }
        # print title, url
        Scheduler.schedule(ContentCrawler.type, key=url, data=crawl_data)
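# A quick illustration of the duplicate-count parsing used above; the count
# text scraped from Baidu looks like "N条相同新闻" ("N identical articles").
# The sample value is made up:
count_text = u"23条相同新闻"
count = int(count_text.split(u'条相同新闻', 1)[0]) if count_text else 0   # -> 23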
def crawl(self):
    key = str(self.key)
    data = self.data
    homepage = "http://m.weibo.cn/p/index?containerid=100103type%3D36%26q%3D" + key + \
               "%26weibo_type%3Dlongwb&title=%E9%95%BF%E5%BE%AE%E5%8D%9A"
    homepage = clear_space(homepage)
    html_stream = _get_url(homepage)
    time.sleep(random.randint(0, 5))
    url_list = re.findall(r"(?<=scheme\":\").+?(?=\")", html_stream.text)
    data.update({'key': key})
    for item in url_list:
        item = unquote(item)
        cid = re.search(r'.+\/p\/(.+?)\?.+', item)
        if cid:
            Scheduler.schedule(TopicCrawler.type, key=cid.group(1), data=data)
        else:
            continue
def has_goods(self, key):
    terms = {
        "type": CommentCrawler.type,
        "$and": [
            {"data.source_id": key},
            {"data.brand": {"$exists": True}},
        ],
    }
    result = Scheduler.find_one(CommentCrawler.type, terms)
    return result["data"] if result else None
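# For illustration only: the kind of scheduler task document the query in
# has_goods() above would match. The exact field layout is an assumption
# inferred from the task documents saved elsewhere in this project.
example_task = {
    "type": CommentCrawler.type,
    "key": "1000123456",            # hypothetical product id
    "status": 1,
    "data": {
        "source_id": "1000123456",  # matches the data.source_id clause
        "brand": u"Haier",          # present, so the data.brand $exists clause is true
        "version": "",
        "series": "",
    },
}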
def crawl(self):
    catId = str(self.key)
    category_data = extract_category(self)
    totalpage = self.get_page(catId)
    if totalpage == 0:
        return {}
    for i in range(1, totalpage + 1):
        url = self.get_url(catId, i)
        jsons = ProcessData.get_json_data(url)
        try:
            goodsList = jsons['goodsList']
        except Exception, e:
            self.logger.error(url)
            self.logger.error(e)
            print "get goodsList fail"
            continue  # skip this page instead of reusing an undefined goodsList
        for j in range(len(goodsList)):
            goods = goodsList[j]
            goodsName = goods['goodsName']
            goodsNo = goods['goodsNo']
            skuID = goods['skuID']
            # print goodsNo
            # print skuID
            crawl_data = {
                # 'id': uuid.uuid1(),
                'source_id': goodsNo,
                'source': self.data.get('source'),
                'title': goods['goodsName'],
                'adword': goods['ad'],
                'price': float(goods['lowestSalePrice']),
                'original_price': float(goods['highestSalePrice']),
                # 'score': ecsumscores
            }
            crawl_data.update(category_data)
            model = EcBasicModel(crawl_data)
            export(model)
            data = {
                'priorcategory': self.data['priorcategory'],
                'presentcategory': self.data['priorcategory']
            }
            data["uuid"] = model["id"]
            Scheduler.schedule(DetailCrawler.type, key=goodsNo, data=data)
            Scheduler.schedule(CommentCrawler.type, key=goodsNo, data=data)
def crawl(self):
    world = self.key
    data = self.data
    homepage = "http://api.weibo.cn/2/profile?\
gsid=_2A254IZdKDeTxGeRM7lUR8CnKyT2IHXVZdq2CrDV6PUJbrdAKLUf7kWptw4_No8F1OjQMCarBH4hZxZcrwA..&\
wm=3333_2001&i=27bd163&b=1&from=1051293010&c=iphone&v_p=18&skin=default&\
v_f=1&s=d2672a12&luicode=10000194&uid=" + str(world)
    # homepage = "http://api.weibo.cn/2/profile?gsid=4wMJ47123kZuG0fKGxlRC15McKa50&uid="+str(world)+"&\
    # wm=3333_2001&i=27bd163&b=0&from=1052093010&checktoken=c54259b09129d101b9669b5d93a04c0e&\
    # c=iphone&v_p=18&skin=default&v_f=1&s=8a12fc6c&did=38d63734cc7427ebb2cb77612c1948cf&\
    # lang=zh_CN&ua=iPhone7,2__weibo__5.2.0__iphone__os8.2&uicode=10000198&uid="+str(world)+\
    # "&featurecode=10000085&luicode=10000003"
    homepage = clear_space(homepage)
    html_stream = _get_url(homepage)
    json_stream = change_to_json(str(html_stream.text))
    containerid = json_stream['tabsInfo']['tabs'][1]['containerid']
    data['id'] = str(world)
    Scheduler.schedule(ContentCrawler.type, key=containerid, data=data,
                       reset=True, interval=10800)
def crawl(self):
    category_data = extract_category(self)  # needed below, as in the other list crawlers
    page_size = 0
    page = 0
    while page <= page_size:
        url = self.get_url(self.key, page)
        json_data = ProcessData.get_json_data(url)
        if page == 0:
            page_size = self.get_page_size(json_data)
        for goods in json_data["goods"]:
            source_id = goods["partnumber"]
            # look the product up by its source_id rather than the category key
            task_data = self.has_goods(source_id)
            if not task_data:
                data = {
                    "priorcategory": self.data["priorcategory"],
                    "status": 1 if int(goods["saleStatus"]) == 0 else 0,
                }
                Scheduler.schedule(DetailCrawler.type, key=source_id, data=data)
            else:
                crawl_data = {
                    "id": task_data["uuid"],
                    "source": self.data["source"],
                    "source_id": source_id,
                    "title": goods["catentdesc"],
                    "adword": extract_adword(goods.get("auxdescription", "")),
                    "price": float(goods["price"]),
                    'status': task_data['status'],
                    'brand': task_data['brand'],
                    'version': task_data['version'],
                    'series': task_data['series'],
                    'comment': {
                        'is_Bbc': task_data['is_Bbc'],
                    },
                }
                crawl_data.update(category_data)
                crawl_data.update(get_ctime())
                model = EcBasicModel(crawl_data)
                export(model)
        page += 1
def crawl(self):
    wareId = str(self.key)
    url = "http://item.yhd.com/item/%s" % wareId
    html_stream = ProcessData.get_web_data(url)
    tree = etree.HTML(html_stream.text)
    crawl_data = self.crawler_data(tree)
    product_id = self.parse_productId(tree)
    model = EcDetailModel(crawl_data)
    export(model)
    comment_data = {
        'uuid': model['id'],
        'status': crawl_data['status'],
        # brand/series/version are taken from the parsed detail data
        'brand': crawl_data['brand'],
        'series': crawl_data['series'],
        'version': crawl_data['version'],
        'is_Bbc': crawl_data['comment']['is_Bbc'],
        'priorcategory': self.data['priorcategory'],
        'source_id': wareId,
    }
    Scheduler.schedule(CommentCrawler.type, key=product_id, data=comment_data)
def init(conf=None):
    from xlutils.copy import copy
    import xlrd
    import os

    SRC_PATH = os.path.dirname(__file__)
    bk = xlrd.open_workbook(os.path.join(SRC_PATH, "../../file/weibo.xls"))
    sh = bk.sheet_by_name('Sheet1')
    nrows = sh.nrows
    ncols = sh.ncols
    for i in xrange(1, nrows):
        data = {
            'publisher': sh.cell_value(i, 3).strip(),
            'province': sh.cell_value(i, 0).strip(),
            'city': sh.cell_value(i, 1).strip(),
            'district': sh.cell_value(i, 2).strip()
        }
        key = str(int(sh.cell_value(i, 5))).strip()
        Scheduler.schedule(FirstCrawler.type, key=key, data=data,
                           interval=3600, reset=True)
def crawl(self): url = "http://mobile.gome.com.cn/mobile/product/allCategorys.jsp" jsons = ProcessData.get_json_data(url) if jsons == {}: return {} category1 = jsons['firstLevelCategories'] for first_item in category1: name1 = first_item['goodsTypeName'] #1 lev name try: category2 = first_item['goodsTypeList'] except: pass for second_item in category2: name2 = second_item['goodsTypeName'] #print name try: category3 = second_item['goodsTypeList'] except: pass for third_item in category3: try: third_id = third_item['goodsTypeId'] name3 = third_item['goodsTypeLongName'] except: pass # print third_id # print name3.encode('utf-8') priorcategory = [] priorcategory.append(name1) priorcategory.append(name2) priorcategory.append(name3) #presentcategory = priorcategory data = { 'priorcategory': priorcategory #'presentcategory':presentcategory } Scheduler.schedule(ListCrawler.type, key=third_id, data=data)
def crawl(self):
    CatID = self.key
    category_data = extract_category(self)
    page = 1
    page_count = 1
    while page <= page_count:
        jsons = self.get_response(CatID, page)
        if page == 1:
            page_count = self.get_page_count(jsons)
        for goods in jsons['ProductListItems']:
            source_id = goods["Code"]
            task_data = self.has_goods(source_id)
            if task_data:
                crawl_data = {
                    "id": task_data["uuid"],
                    "title": goods["Title"],
                    "price": goods["Price"]["CurrentPrice"],
                    "source_id": source_id,
                    "source": self.data["source"],
                    "status": task_data["status"],
                    "brand": task_data["brand"],
                    "version": task_data["version"],
                    "series": task_data["series"],
                    "comment": {
                        "is_Bbc": task_data["isBbc"],
                    },
                }
                crawl_data.update(category_data)
                crawl_data.update(get_ctime())
                model = EcBasicModel(crawl_data)
                export(model)
            else:
                detail_data = {
                    "priorcategory": self.data["priorcategory"],
                }
                Scheduler.schedule(DetailCrawler.type, key=source_id, data=detail_data)
        page += 1
def crawl(self):
    global cookies
    global clocking
    global STATUS_CK
    TIME = time.time()
    hour = time.strftime('%H', time.localtime(TIME))
    if cookies == [] and TIME > STATUS_CK:
        print 'wait-----------To obtain cookie one '
        STATUS_CK = TIME + 35200
        clocking = hour
        cookies = get_cookies()
    elif not cookies:
        print 'Gets a cookies failure'
        # STATUS_CK = TIME + 35200
        return
    elif int(hour) % 2 == 0 and clocking != hour:
        print 'wait-----------To obtain cookie'
        clocking = hour
        cookies = []
        cookies = get_cookies()
    cookie = random.choice(cookies)
    world = self.key
    data = self.data
    homepage = "http://weixin.sogou.com/gzhjs?cb=sogou.weixin.gzhcb&openid=%s&repp=1" % str(world)
    html_stream = _get_url(homepage, cookie=cookie)
    mtre = r"sogou.weixin.gzhcb\((.*)\)"
    match = re.search(mtre, html_stream.text).group(1)
    all_xml = change_to_json(str(match)).get('items', {})
    for item in all_xml:
        item = item.replace('\"gbk\"', '\"utf-8\"')
        root = ElementTree.fromstring(item)
        geturl = root.getiterator('url')[0]
        Scheduler.schedule(ContentCrawler.type, key=geturl.text, data=data)
    time.sleep(random.randint(30, 100))
def crawl(self):
    # fid = '1620'
    # categorys = ["家居家装"]
    fid = self.key
    categorys = self.data['priorcategory']
    start_urls = "http://gw.m.360buy.com/client.action?functionId=catelogy&body="
    second_urls = {
        'catelogyId': str(fid),
        'isDescription': 'true',
        'isIcon': 'true',
        'level': '1'
    }
    url = start_urls + quote(str(second_urls))
    # print 'url ', url
    try:
        jsons = ProcessData.get_json_data(url)
        lists = jsons['catelogyList']
    except:
        print 'error ', url
        return
    if lists == []:
        return {}
    for i in range(len(lists)):
        cid = lists[i]['cid']
        # presentcategory = []
        priorcategory = []
        priorcategory.extend(categorys)
        priorcategory.append(extract_title(lists[i]['name']))
        data = {
            'priorcategory': priorcategory,
            # 'presentcategory': presentcategory
        }
        Scheduler.schedule(ThirdCrawler.type, key=cid, data=data)
def crawl(self):
    # key info
    # keyid = "/%E7%AC%94%E8%AE%B0%E6%9C%AC%E7%94%B5%E8%84%91/b?ie=UTF8&node=106200071"
    keyid = self.key
    source = "amazon"
    score = 0  # rating
    # original category data
    category_data = extract_category(self)
    # priorcategory
    priorcategory = self.data["priorcategory"]
    presentcategory = self.data["presentcategory"]
    count = getPageSize(self.get_url(keyid, 1))  # total number of pages
    page = 1  # start from the first page
    content = "//div[@id='mainResults']/div"
    while page <= count:
        # build the listing url for this page
        url = self.get_url(keyid, page)
        # print url
        # fetch the response for this url
        html_stream = ProcessData.get_web_data(url)
        # self.logger.info("processing page: " + url)
        # parse the goods-list html
        html = etree.HTML(html_stream.text)
        # each entry in itempath is one product node; xpath returns a list
        itempath = html.xpath(content)
        if itempath != None and itempath != []:
            for item in itempath:
                title = item.xpath("h3[@class='newaps']/a")
                # price node
                pric = item.xpath(
                    "ul[@class='rsltGridList grey']/li[@class='newp']/div")
                if pric == None:
                    pric = item.xpath("ul/li[@class='newp']/div")
                # product rating
                socreitmem = item.xpath(
                    "ul[@class='rsltGridList grey']/li[@class='rvw']/span/span/a")
                if socreitmem != []:
                    scoreinfo = socreitmem[0].get('alt')
                    if scoreinfo != None:
                        score = float(scoreinfo[2:-1])
                for t in title:
                    # product title and url
                    original_price = u"¥0.00"
                    if pric == None or pric == []:
                        price = u"¥0.00"
                    else:
                        try:
                            price = pric[0].xpath("a/span")[0].text
                        except:
                            print url
                            print "bad price node:", pric
                            price = u"¥0.00"  # fall back so the checks below still work
                    if pric != None and pric != [] and pric[0].xpath("a/del") != []:
                        # there is an original (pre-discount) price
                        original_price = pric[0].xpath("a/del")[0].text
                    else:
                        # no original price, use the current price
                        original_price = price
                    # i += 1
                    # task payload stored in mongodb
                    data = {
                        'priorcategory': priorcategory,
                        'presentcategory': presentcategory
                    }
                    if price != None and price.strip() != '' and pric != [] and pric[0] != '':
                        # self.logger.info("price: " + price)
                        # store the record in cassandra
                        try:
                            float(price.strip()[1:].replace(",", ""))
                            # float(original_price.strip()[1:].replace(",", ""))
                        except:
                            self.logger.error("bad price: " + price)
                            self.logger.error("bad price: " + original_price)
                        crawl_data = {
                            # 'id': uuid.uuid1(),
                            'source_id': t.get("href"),
                            'source': source,
                            'summary': {},
                            'title': t.xpath("span")[0].text,
                            'adword': '',
                            'price': float(price.strip()[1:].replace(",", "")),
                            'original_price': float(original_price.strip()[1:].replace(",", "")),
                            'score': 0
                        }
                        crawl_data.update(category_data)
                        # save to the cassandra database
                        model = EcBasicModel(crawl_data)
                        export(model)
                        data["uuid"] = model["id"]
                        # print "saving to cassandra ..."
                        Scheduler.schedule(
                            DetailCrawler.type, key=t.get("href"), data=data)
                        Scheduler.schedule(
                            CommentCrawler.type, key=t.get("href"), data=data)
                        # print repr(json.dumps(crawl_data))
        page += 1
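# The Amazon prices scraped above are strings like u"¥1,299.00"; this is the
# conversion the crawler relies on (the value shown is made up for illustration):
price = u"¥1,299.00"
value = float(price.strip()[1:].replace(",", ""))   # -> 1299.0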
def crawl(self): url = "http://www.amazon.cn/gp/site-directory" # 获取该url的流信息 html_stream = ProcessData.get_web_data(url) # 获取html 信息 html = etree.HTML(html_stream.text) # 整个一级二级三级分类的xpath xpath = "//div[@id='siteDirectory']/div[@class='a-row']/div[@class='a-row a-spacing-small a-spacing-top-medium']" dom = html.xpath(xpath) # 获取一级分类 onexpath = "div[@class='a-row a-spacing-extra-large a-spacing-top-small']/span/a" # binali tmp = "div[@class='a-row a-spacing-none a-spacing-top-mini sd-addPadding']/div[@class='a-column a-span3 sd-colMarginRight']" # 获取二级分类 twoxpath = "div[@class='a-column a-span12 sd-columnSize']/div[@class='a-row a-spacing-small']/span[@class='sd-fontSizeL2 a-text-bold']/a" threexpath = "div[@class='a-column a-span12 sd-columnSize']/div[@class='a-row a-spacing-small']/div[@class='a-row']/ul/li/span/span/a" # 连接mongodb conmn = pymongo.Connection(MONGO_CONN_STR) for item in dom: # 获取一级分类 a-row a-spacing-extra-large a-spacing-top-small oneitem = item.xpath(onexpath) oneinfo = "" # print oneitem for one in oneitem: oneinfo += one.text + ";" # 获取一级分类 oneinfo = oneinfo[:-1] # 把一级分类存储到mongodb中 conmn.crawler.ecommerce.save({'priority': 1, 'status': 1, 'timeout': 3600, 'key': '', 'data': { 'priorcategory': [oneinfo], 'presentcategory': {"1": ''} }, "interval": 0, "type": "ecommerce.amazon.firstlvl"}) tmpxpath = item.xpath(tmp) for itemtmp in tmpxpath: twoitem = itemtmp.xpath(twoxpath) i = 0 for two in twoitem: conmn.crawler.ecommerce.save({'priority': 1, 'status': 1, 'timeout': 3600, 'key': two.get("href"), 'data': { 'priorcategory': [oneinfo, two.text], 'presentcategory': {"1": '', "2": ''} }, "interval": 0, "type": "ecommerce.amazon.goodsdetail"}) threeitem = itemtmp.xpath( "div[@class='a-column a-span12 sd-columnSize']/div[@class='a-row a-spacing-small']/div[@class='a-row']") tmpc = threeitem[i].xpath("ul/li/span/span/a") for t in tmpc: conmn.crawler.ecommerce.save({'priority': 1, 'status': 1, 'timeout': 3600, 'key': t.get("href"), 'data': { 'priorcategory': [oneinfo, two.text, t.text], 'presentcategory': {"1": '', "2": '', "3": ''} }, "interval": 0, "type": "ecommerce.amazon.firstlvl"}) # 执行列表 Scheduler.schedule(ListCrawler.type, key=t.get("href"), data={ 'priorcategory': [oneinfo, two.text, t.text], 'presentcategory': {"1": '', "2": '', "3": ''} }) i = i + 1
def handle(self, id, priorcategory):
    data = {
        'priorcategory': priorcategory
    }
    Scheduler.schedule(ThirdCrawler.type, key=id, data=data)
def init(conf=None):
    Scheduler.schedule(FirstCrawler.type, interval=10800, reset=True)
def handle(self, CatID, priorcategory):
    data = {
        'priorcategory': priorcategory
    }
    Scheduler.schedule(ListCrawler.type, key=CatID, data=data)
def crawl(self):
    key = str(self.key)
    data = self.data
    homepage = "http://api.weibo.cn/2/cardlist?\
gsid=_2A254IZdKDeTxGeRM7lUR8CnKyT2IHXVZdq2CrDV6PUJbrdAKLUf7kWptw4_No8F1OjQMCarBH4hZxZcrwA..&\
wm=3333_2001&i=27bd163&b=1&from=1051293010&c=iphone&v_p=18&skin=default&\
v_f=1&s=d2672a12&lang=zh_CN&ua=iPhone7,2__weibo__5.1.2__iphone__os8.1.3&\
uicode=10000198&featurecode=10000085&luicode=10000003&count=20&\
extparam=100103type=1&cuid=2257007621&sid=t_wap_ios&category=1&\
pos=1_-1&wm=3333_2001&containerid=" + key + "_-_WEIBO_SECOND_PROFILE_WEIBO&\
fid=" + key + "_-_WEIBO_SECOND_PROFILE_WEIBO&lfid=100103type%3D1&\
sourcetype=page&lcardid=user&page=1"
    # homepage = "http://api.weibo.cn/2/guest/cardlist?gsid=4wMJ47123kZuG0fKGxlRC15McKa50&uid=1001503246310&\
    # wm=3333_2001&i=27bd163&b=0&from=1052093010&checktoken=c54259b09129d101b9669b5d93a04c0e&c=iphone&\
    # v_p=18&skin=default&v_f=1&s=8a12fc6c&did=38d63734cc7427ebb2cb77612c1948cf&lang=zh_CN&ua=iPhone7,\
    # 2__weibo__5.2.0__iphone__os8.2&uid=1001503246310&extparam=100103\
    # type%3D1%26q%3D%E5%8C%97%E4%BA%AC%E5%AE%89%E7%9B%91%26t%3D0%26sid%3Dt_wap_ios%26category%3D1%26pos%3D1_-1%26wm%3D3333_2001&\
    # count=20&luicode=10000003&containerid="+key+"_-_WEIBO_SECOND_PROFILE_WEIBO&featurecode=10000085&\
    # uicode=10000198&fid="+key+"_-_WEIBO_SECOND_PROFILE_WEIBO&checktoken=\
    # c54259b09129d101b9669b5d93a04c0e&did=38d63734cc7427ebb2cb77612c1948cf&page=1"
    homepage = clear_space(homepage)
    html_stream = _get_url(homepage)
    json_stream = change_to_json(str(html_stream.text))
    cards = json_stream['cards']
    for item in cards:
        scheme = re.search(r'=(.+?)$', item.get('scheme', ''))
        scheme = scheme.group(1) if scheme else ''
        url = "http://weibo.com/%s/%s?type=comment" % (data.get('id', ''), scheme)
        item = item.get('mblog', {})
        item = item.get('retweeted_status', item)
        text = item.get('text', '')
        title = re.search(ur'【(.+?)】', text)
        title = title.group(1) if title else ''
        if not title:
            title = re.search(ur'#(.+?)#', text)
            title = title.group(1) if title else text[0:20] + '...'
        subtitle = re.search(ur'#(.+?)#', text)
        subtitle = subtitle.group(1) if subtitle else ''
        pubtime = item.get('created_at', '')
        pubtime = HandleContent.strformat(str(pubtime))
        reposts_count = item.get('reposts_count', '')
        comments_count = item.get('comments_count', '')
        attitudes_count = item.get('attitudes_count', '')
        thumbnail_pic = item.get('thumbnail_pic', '')
        bmiddle_pic = item.get('bmiddle_pic', '')
        original_pic = item.get('original_pic', '')
        mid = item.get('mid', '')
        author = item.get('user', {}).get('name', '')
        comment = {
            'reposts_count': str(reposts_count),
            'attitudes_count': str(attitudes_count),
            'comments_count': str(comments_count)
        }
        subtitles = [subtitle]
        date = new_time()
        crawl_data = {
            'province': self.data.get('province', ''),
            'city': self.data.get('city', ''),
            'district': self.data.get('district', ''),
            'url': url,
            'title': title,
            'subtitle': subtitles,
            'content': text,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': 'weibo',
            'publisher': self.data.get('publisher', ''),
            'author': author,
            'origin_source': u'新浪微博',
            'type': u'微博',
            'comment': comment
        }
        model = WeiboArticleModel(crawl_data)
        if export(model):
            againt_data = {
                'wid': model['id'],
                'type': u'微博',
                'expire': date.get('crtime_int') / 1000000 + 604800,
            }
            Scheduler.schedule(AgainCrawler.type, key=mid, data=againt_data,
                               reset=True, interval=21600)
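# Illustration of the title/subtitle extraction used above, applied to a
# made-up weibo post (the text is hypothetical):
import re

text = u"【食品抽检结果公布】#质量监督# 本次抽检共涉及20家企业"
title = re.search(ur'【(.+?)】', text)
title = title.group(1) if title else ''            # -> u"食品抽检结果公布"
subtitle = re.search(ur'#(.+?)#', text)
subtitle = subtitle.group(1) if subtitle else ''   # -> u"质量监督"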