def fetchTuanFoodList(self, url, additions={}, timeout=TIMEOUT, implementor=None):
    headers = {
        "User-Agent": "iPhone; CPU iPhone OS 9_1 like Mac OS X AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1",
    }
    result = request.get(url, headers=headers, timeout=timeout, format='HTML')
    foods = result.findall('.//ul[@class="tg-floor-list Fix tg-floor-list-freak"]//a[@class="tg-floor-img"]')
    # A short page (< 40 deals) marks the end; otherwise bump the trailing
    # page number in the query string.
    if len(foods) < 40:
        nextpage = None
    else:
        index = url.split('=')
        index[-1] = str(int(index[-1]) + 1)
        nextpage = '='.join(index)  # was '-'.join, which rebuilt a broken URL
    yield nextpage
    for one in foods:
        detail = request.get('http://t.dianping.com%s' % request.getHtmlNodeContent(one, {'ATTR': 'href'}),
                             headers=headers, timeout=timeout, format='HTML')
        pic = []
        try:
            for pic_one in detail.findall('.//div[@class="detail"]'):
                if request.getHtmlNodeContent(pic_one.find('.//span[@class="name"]'), 'TEXT') == '商户介绍':
                    pic.extend([request.getHtmlNodeContent(img, {'ATTR': 'lazy-src-load'})
                                for img in pic_one.findall('.//img')])
                    break
        except:
            pass
        # Copy before mutating: the caller's dict (and the mutable default)
        # must not be shared between yielded items.
        one_additions = dict(additions)
        one_additions['pic'] = pic
        gid = request.getHtmlNodeContent(one, {'ATTR': 'href'}).split('/')[-1]
        url = 'http://t.dianping.com/ajax/dealGroupShopDetail?dealGroupId=%s&action=shops' % gid
        yield {'url': url, 'additions': one_additions}
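# The fetchers in this module lean on a project-local `request` helper that is
# not defined in this file. The sketch below is an assumption about its
# interface, built on requests + lxml; the real implementation may differ.
# `_get_sketch` fetches a URL and returns parsed HTML, parsed JSON, or raw
# text, optionally applying `dirtys` replacements first; `_node_content_sketch`
# reads a node's text or a named attribute.
import json as _json

import requests as _requests
from lxml import html as _lxml_html


def _get_sketch(url, headers=None, timeout=10, format='HTML', dirtys=None):
    text = _requests.get(url, headers=headers, timeout=timeout).text
    for old, new in (dirtys or []):  # e.g. stripping a JSONP wrapper
        text = text.replace(old, new)
    if format == 'HTML':
        return _lxml_html.fromstring(text)
    if format == 'JSON':
        return _json.loads(text)
    return text


def _node_content_sketch(node, what):
    if node is None:
        return ''
    if what == 'TEXT':
        return node.text_content()
    return node.get(what['ATTR'], '')  # what is {'ATTR': <attribute name>}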
def fetchList(self, url, additions={}, timeout=TIMEOUT, implementor=None):
    result = request.get(url, timeout=timeout, format='JSON')
    news = result['data']['data']
    if result['data']['is_more']:
        index = url.split('=')
        index[-1] = str(int(index[-1]) + 1)
        nextpage = '='.join(index)
    else:
        nextpage = None
    nextpage = None  # note: overrides the computation above, so paging is effectively disabled
    yield nextpage
    for one in news:
        name = one['title']
        icon = one['img_url']
        detail_link = one['link'].strip()
        desc = one['description']
        src = '虎扑新闻'
        category = '足球'
        group = 'text'
        detail = request.get(detail_link, timeout=timeout, format='HTML')
        # Publish time lives in span.stime; fall back to the pubtime_baidu span.
        atime = request.getHtmlNodeContent(detail.find('.//span[@class="stime"]'), 'TEXT').strip()
        if not atime:
            atime = request.getHtmlNodeContent(detail.find('.//span[@id="pubtime_baidu"]'), 'TEXT').strip()
        else:
            atime = '%s:00' % atime  # stime lacks seconds
        if not atime:
            continue
        else:
            atime = datetime.strptime(atime, '%Y-%m-%d %H:%M:%S')
        create_time = datetime.now()
        update_time = datetime.now()
        data = {"name": name, "icon": icon, "detail_link": detail_link,
                "desc": desc, "src": src, "category": category,
                'group': group, 'content': '', "atime": atime,
                "create_time": create_time, "update_time": update_time,
                'tid': self.tid}
        yield Data(**data)
def fetchShopList(self, url, additions={}, timeout=TIMEOUT, implementor=None):
    headers = {
        "User-Agent": "iPhone; CPU iPhone OS 9_1 like Mac OS X AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1",
    }
    result = request.get(url, headers=headers, timeout=timeout, format='JSON')
    shops = result['msg']['shops']
    for one in shops:
        additions = dict(additions)  # fresh copy per shop so yielded dicts do not alias
        additions['name'] = one['shopName'] + one['branchName']
        additions['address'] = one['address']
        additions['tel'] = one['contactPhone']
        additions['longitude'] = one['glng']
        additions['latitude'] = one['glat']
        additions['desc'] = one['businessHours'] + ',' + one['crossRoad']
        additions['average'] = one['avgPrice']
        yield {'url': 'http://www.dianping.com/shop/%s' % str(one['shopId']), 'additions': additions}
def fetchWWWFoodList(self, url, additions={}, timeout=TIMEOUT, implementor=None):
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
    }
    result = request.get(url, headers=headers, timeout=timeout, format='HTML')
    foods = result.findall('.//div[@id="shop-all-list"]//ul//li')
    print len(foods)  # debug trace
    # A short page (< 15 shops) marks the end; otherwise bump the trailing
    # path segment, which carries the page number.
    if len(foods) < 15:
        nextpage = None
    else:
        index = url.split('/')
        index[-1] = str(int(index[-1]) + 1)
        nextpage = '/'.join(index)
    yield nextpage
    for one in foods:
        groupbuy = request.getHtmlNodeContent(one.find('.//div[@class="svr-info"]//a'), {'ATTR': 'href'})
        detail = request.get(groupbuy, headers=headers, timeout=timeout, format='HTML')
        pic = []
        try:
            for pic_one in detail.findall('.//div[@class="detail"]'):
                if request.getHtmlNodeContent(pic_one.find('.//span[@class="name"]'), 'TEXT') == '商户介绍':
                    pic.extend([request.getHtmlNodeContent(img, {'ATTR': 'lazy-src-load'})
                                for img in pic_one.findall('.//img')])
                    break
        except:
            pass
        # Copy before mutating so yielded items do not share one dict.
        one_additions = dict(additions)
        one_additions['pic'] = pic
        gid = groupbuy.split('/')[-1]
        url = 'http://t.dianping.com/ajax/dealGroupShopDetail?dealGroupId=%s&action=shops' % gid
        yield {'url': url, 'additions': one_additions}
def fetchList(self, url, additions={}, timeout=TIMEOUT, implementor=None):
    result = request.get(url, timeout=timeout, format='HTML')
    news = result.findall('.//div[@class="news_nr"]//li')
    # Every five consecutive <li> form one record; see the chunking demo below.
    news = zip(*[iter(news)] * 5)
    for one in news:
        category, name, detail_link, _, icon = one
        name = request.getHtmlNodeContent(name.find('.//div'), 'TEXT')
        icon = request.getHtmlNodeContent(icon.find('.//div'), 'TEXT').replace('"', '')
        detail_link = request.getHtmlNodeContent(detail_link.find('.//div'), 'TEXT').replace('"', '')
        detail = request.get(detail_link, timeout=timeout, format='HTML')
        desc = ''
        src = '东方网'
        category = request.getHtmlNodeContent(category, 'TEXT')
        # The timestamp is delimited by an em space (u'\u2003') before it and
        # a run of no-break spaces (u'\xa0') after it.
        atime = request.getHtmlNodeContent(detail.find('.//div[@id="title"]//span[@class="src"]'), 'TEXT').split(u'\u2003')[-1]
        atime = atime.split(u'\xa0\xa0\xa0\xa0')[0]
        atime = datetime.strptime(atime, '%Y-%m-%d %H:%M')
        create_time = datetime.now()
        update_time = datetime.now()
        data = {"name": name, "icon": icon, "detail_link": detail_link,
                "desc": desc, "src": src, "category": category,
                'group': 'text', 'content': '', "atime": atime,
                "create_time": create_time, "update_time": update_time,
                'tid': self.tid}
        yield Data(**data)
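# The zip(*[iter(news)] * 5) trick above reuses one iterator five times per
# output tuple, collapsing the flat <li> sequence into
# (category, name, detail_link, _, icon) rows. A standalone demonstration:
def _chunks_of_five_demo():
    flat = range(10)                 # stands in for the flattened nodes
    rows = zip(*[iter(flat)] * 5)    # Python 2: zip returns a list
    assert rows == [(0, 1, 2, 3, 4), (5, 6, 7, 8, 9)]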
def getDatamodel(dmid):
    projection = {'filepath': 1, 'digest': 1, 'name': 1}
    datamodel = request.post('%sgdc/api/datamodel/%s' % (HOST, str(dmid)),
                             {'projection': json.dumps(projection), 'limit': 'one'},
                             format='JSON', s=session)
    datamodel = datamodel['datamodel']
    filepath = datamodel['filepath']
    if filepath in Store and datamodel['digest'] == Store[filepath]:
        return datamodel
    result = request.get('%sstatic/exe/%s' % (HOST, filepath), format='TEXT')
    persist(filepath, result)
    Store[filepath] = datamodel['digest']
    return datamodel
def fetchList(self, url, additions={}, timeout=TIMEOUT, implementor=None):
    result = request.get(url, timeout=timeout, format='JSON')
    news = result['result']
    if len(news) == 0:
        nextpage = None
    else:
        # Slide the cstart/cend window forward by its own width.
        urlobj, params = URLParse.decode(url)
        span = int(params['cend']) - int(params['cstart'])
        params['cstart'] = params['cend']
        params['cend'] = str(int(params['cstart']) + span)
        nextpage = URLParse.encode(urlobj, params)
    nextpage = None  # note: overrides the computation above, so paging is effectively disabled
    yield nextpage
    for one in news:
        if one['ctype'] != "news":
            continue
        name = one['title']
        if 'icon' in one:
            icon = one['icon']
        elif 'image_urls' in one and one['image_urls']:
            icon = 'http://i1.go2yd.com/image.php?url=%s&type=thumbnail_200x140' % one['image_urls'][0]
        else:
            icon = ''
        detail_link = one['url']
        desc = one['summary']
        src = '一点资讯'
        category = one['category']
        atime = datetime.strptime(one['date'], '%Y-%m-%d %H:%M:%S')
        create_time = datetime.now()
        update_time = datetime.now()
        data = {"name": name, "icon": icon, "detail_link": detail_link,
                "desc": desc, "src": src, "category": category,
                'group': 'text', 'content': '', "atime": atime,
                "create_time": create_time, "update_time": update_time,
                'tid': self.tid}
        yield Data(**data)
def fetchList(self, url, additions={}, timeout=TIMEOUT, implementor=None):
    result = request.get(url, timeout=timeout, format='JSON')
    news = result['data']
    if 'next' not in result:
        nextpage = None
    else:
        # Page by round-tripping the query through URLParse (sketched below).
        urlobj, params = URLParse.decode(url)
        if str(params['max_behot_time']) == str(result['next']['max_behot_time']):
            nextpage = None  # cursor did not advance: no more pages
        else:
            params['max_behot_time'] = result['next']['max_behot_time']
            nextpage = URLParse.encode(urlobj, params)
    nextpage = None  # note: overrides the computation above, so paging is effectively disabled
    yield nextpage
    for one in news:
        name = one['title']
        if 'image_url' in one:
            icon = one['image_url']
        elif 'image_list' in one and one['image_list']:
            icon = one['image_list'][0]['url']
        else:
            icon = ''
        detail_link = one['display_url']
        desc = one['source'] + ',' + one['abstract']
        src = '今日头条'
        category = additions.get('category', '')  # e.g. '财经'
        atime = datetime.strptime(one['datetime'], '%Y-%m-%d %H:%M')
        create_time = datetime.now()
        update_time = datetime.now()
        data = {"name": name, "icon": icon, "detail_link": detail_link,
                "desc": desc, "src": src, "category": category,
                'group': 'text', 'content': '', "atime": atime,
                "create_time": create_time, "update_time": update_time,
                'tid': self.tid}
        yield Data(**data)
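# fetchList above pages by round-tripping the query string through a
# project-local URLParse helper (also used by the 一点资讯 fetcher). A plausible
# minimal sketch of its decode/encode contract, assuming the stdlib urlparse
# module; the real helper may differ:
import urllib as _urllib
import urlparse as _urlparse


class _URLParseSketch(object):
    @staticmethod
    def decode(url):
        obj = _urlparse.urlsplit(url)
        return obj, dict(_urlparse.parse_qsl(obj.query))

    @staticmethod
    def encode(obj, params):
        query = _urllib.urlencode(params)
        return _urlparse.urlunsplit((obj.scheme, obj.netloc, obj.path, query, obj.fragment))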
def getUnit(uid):
    projection = {'name': 1, 'filepath': 1, 'digest': 1, 'dmid': 1}
    unit = request.post('%sgdc/api/unit/%s' % (HOST, str(uid)),
                        {'projection': json.dumps(projection), 'limit': 'one'},
                        format='JSON', s=session)
    unit = unit['unit']
    filepath = unit['filepath']
    if filepath in Store and unit['digest'] == Store[filepath]:
        return unit
    result = request.get('%sstatic/exe/%s' % (HOST, filepath), format='TEXT')
    persist(filepath, result)
    Store[filepath] = unit['digest']
    # Drop an __init__.py next to the unit so its directory imports as a package.
    filepath = os.path.join(os.path.dirname(os.path.join(CURRPATH, filepath)), '__init__.py')
    persist(filepath, '#!/usr/bin/env python\n# coding=utf8')
    return unit
def fetchList(self, url, additions={}, timeout=TIMEOUT, implementor=None):
    result = request.get(url, timeout=timeout, format='JSON')
    news = result['result']['data']
    if len(news) < 30:
        nextpage = None
    else:
        index = url.split('=')
        index[-1] = str(int(index[-1]) + 1)
        nextpage = '='.join(index)
    nextpage = None  # note: overrides the computation above, so paging is effectively disabled
    yield nextpage
    for one in news:
        name = one['stitle']
        icon = one['img']['u']
        detail_link = one.get('wapurl') or one.get('url', '')
        desc = one['intro']
        src = '新浪新闻'
        category = '足球'
        group = 'text'
        if 'http://video.sina.com.cn' in detail_link:
            group = 'video'
        # ctime is an epoch timestamp; see the round-trip note below.
        atime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(float(one['ctime'])))
        atime = datetime.strptime(atime, '%Y-%m-%d %H:%M:%S')
        create_time = datetime.now()
        update_time = datetime.now()
        data = {"name": name, "icon": icon, "detail_link": detail_link,
                "desc": desc, "src": src, "category": category,
                'group': group, 'content': '', "atime": atime,
                "create_time": create_time, "update_time": update_time,
                'tid': self.tid}
        yield Data(**data)
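# The ctime handling above converts an epoch timestamp to a datetime via a
# strftime/strptime round-trip; datetime.fromtimestamp is the direct
# equivalent for whole-second timestamps:
def _ctime_roundtrip_demo():
    import time
    from datetime import datetime
    ts = 1461859200.0
    via_strings = datetime.strptime(
        time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ts)),
        '%Y-%m-%d %H:%M:%S')
    assert via_strings == datetime.fromtimestamp(ts)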
def fetchList(self, url, additions={}, timeout=TIMEOUT, implementor=None):
    # dirtys strips the JSONP wrapper artiList({...}) down to plain JSON;
    # see the demo below.
    result = request.get(url, timeout=timeout,
                         dirtys=[('artiList({', '{'), ('})', '}')], format='JSON')
    news = result['BA8E6OEOwangning']  # channel key for this feed
    # The url ends in start-end.html; slide the window forward by its width.
    start, end = url[url.rindex('/') + 1:url.rindex('.')].split('-')
    span = int(end) - int(start)
    if len(news) < span:
        nextpage = None
    else:
        start = end
        end = str(int(start) + span)
        nextpage = '%s/%s-%s.html' % (url[:url.rindex('/')], start, end)
    nextpage = None  # note: overrides the computation above, so paging is effectively disabled
    yield nextpage
    for one in news:
        if '|' in url:  # loop-invariant guard kept as in the original
            continue
        name = one['title']
        icon = one['imgsrc']
        detail_link = 'http://3g.163.com/touch/article.html?channel=sports&docid=%s' % one['docid']
        desc = one['digest']
        src = '网易新闻'
        category = additions.get('category')
        atime = datetime.strptime(one['ptime'], '%Y-%m-%d %H:%M:%S')
        create_time = datetime.now()
        update_time = datetime.now()
        data = {"name": name, "icon": icon, "detail_link": detail_link,
                "desc": desc, "src": src, "category": category,
                'group': 'text', 'content': '', "atime": atime,
                "create_time": create_time, "update_time": update_time,
                'tid': self.tid}
        yield Data(**data)
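# The dirtys argument above rewrites NetEase's JSONP padding into plain JSON
# before parsing: artiList({...}) becomes {...}. Assuming dirtys is an ordered
# list of (old, new) substring replacements (as in the request sketch earlier):
def _strip_jsonp_demo():
    raw = 'artiList({"BA8E6OEOwangning": []})'
    for old, new in [('artiList({', '{'), ('})', '}')]:
        raw = raw.replace(old, new)
    assert raw == '{"BA8E6OEOwangning": []}'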
def getArticle(aid):
    projection = {'uid': 1, 'name': 1, 'clsname': 1, 'filepath': 1, 'digest': 1}
    article = request.post('%sgdc/api/article/%s' % (HOST, str(aid)),
                           {'projection': json.dumps(projection), 'limit': 'one'},
                           format='JSON', s=session)
    article = article['article']
    filepath = article['filepath']
    if filepath in Store and article['digest'] == Store[filepath]:
        return article
    result = request.get('%sstatic/exe/%s' % (HOST, filepath), format='TEXT')
    persist(filepath, result)
    Store[filepath] = article['digest']
    return article
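# getDatamodel, getUnit and getArticle share a digest cache: a file is only
# re-downloaded when the digest reported by the API differs from the one
# remembered in Store for its filepath. Store and persist are not defined in
# this file; a minimal sketch of the assumed contract (Store as a dict,
# persist writing below CURRPATH) -- an assumption, not the real helpers:
import os as _os


def _persist_sketch(filepath, content, currpath='.'):
    target = _os.path.join(currpath, filepath)
    dirname = _os.path.dirname(target)
    if dirname and not _os.path.isdir(dirname):
        _os.makedirs(dirname)  # create intermediate directories as needed
    with open(target, 'w') as fd:
        fd.write(content)

_store_sketch = {}  # filepath -> last persisted digest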
def fetchList(self, url, additions={}, timeout=TIMEOUT, implementor=None):
    result = request.get(url, timeout=timeout, format='HTML')

    def parse_atime(text):
        # "... 2016年05月20日 10:30" -> datetime(2016, 5, 20, 10, 30)
        text = ('20%s' % text.replace('日 ', 'T').split(' 20')[-1]).replace('年', '-').replace('月', '-')
        return datetime.strptime(text, '%Y-%m-%dT%H:%M')

    # Block 1: the carousel (dl#idcur), picture news.
    news = result.findall('.//dl[@id="idcur"]//dd')
    news.append(result.find('.//dl[@id="idcur"]//dt'))
    for one in news:
        name = request.getHtmlNodeContent(one.find('.//p'), 'TEXT')
        icon = request.getHtmlNodeContent(one.find('.//img'), {'ATTR': 'src'})
        detail_link = request.getHtmlNodeContent(one.find('.//a'), {'ATTR': 'href'})
        detail = request.get(detail_link, timeout=timeout, format='HTML')
        atime = parse_atime(request.getHtmlNodeContent(detail.find('.//div[@class="font_xx"]'), 'TEXT').strip())
        data = {"name": name, "icon": icon, "detail_link": detail_link,
                "desc": '', "src": 'cctv', "category": '足球',
                'group': 'pic', 'content': '', "atime": atime,
                "create_time": datetime.now(), "update_time": datetime.now(),
                'tid': self.tid}
        yield Data(**data)

    # Block 2: the thumbnail grid (ul.il_w120_b1), also picture news.
    news = result.findall('.//ul[@class="il_w120_b1"]//li')
    for one in news:
        name = request.getHtmlNodeContent(one.find('.//div[@class="text"]//a'), 'TEXT').replace('[高清组图]', '')
        icon = request.getHtmlNodeContent(one.find('.//div[@class="image"]//img'), {'ATTR': 'src'})
        detail_link = request.getHtmlNodeContent(one.find('.//div[@class="image"]//a'), {'ATTR': 'href'})
        detail = request.get(detail_link, timeout=timeout, format='HTML')
        atime = parse_atime(request.getHtmlNodeContent(detail.find('.//div[@class="font_xx"]'), 'TEXT').strip())
        data = {"name": name, "icon": icon, "detail_link": detail_link,
                "desc": '', "src": 'cctv', "category": '足球',
                'group': 'pic', 'content': '', "atime": atime,
                "create_time": datetime.now(), "update_time": datetime.now(),
                'tid': self.tid}
        yield Data(**data)

    # Block 3: the plain text list (div.text_list); icon comes from the detail page.
    news = result.findall('.//div[@class="text_list"]//ul//li')
    for one in news:
        name = request.getHtmlNodeContent(one.find('.//a'), 'TEXT')
        detail_link = request.getHtmlNodeContent(one.find('.//a'), {'ATTR': 'href'})
        detail = request.get(detail_link, timeout=timeout, format='HTML')
        icon = request.getHtmlNodeContent(detail.find('.//p[@align="center"]//img'), {'ATTR': 'src'})
        atime = parse_atime(request.getHtmlNodeContent(detail.find('.//div[@class="function"]//span[@class="info"]//i'), 'TEXT').strip())
        data = {"name": name, "icon": icon, "detail_link": detail_link,
                "desc": '', "src": 'cctv', "category": '足球',
                'group': 'text', 'content': '', "atime": atime,
                "create_time": datetime.now(), "update_time": datetime.now(),
                'tid': self.tid}
        yield Data(**data)
def fetchShopDetail(self, url, additions={}, timeout=TIMEOUT, implementor=None):
    headers = {
        "User-Agent": "iPhone; CPU iPhone OS 9_1 like Mac OS X AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1",
    }
    result = request.get(url, headers=headers, timeout=timeout, format='HTML')
    food_id = [additions['food_id'], ]
    name = additions['name']
    desc = additions['desc']
    tel = additions['tel']
    pic = additions['pic']
    province_id = additions['province_id']
    city_id = additions['city_id']
    tag = []
    try:
        for one in result.findall('.//p[@class="info info-indent"]'):
            prompt = request.getHtmlNodeContent(one.find('.//span'), 'TEXT')
            if '标签' in prompt:
                tag.extend([request.getHtmlNodeContent(a, 'TEXT') for a in one.findall('.//a')])
    except:
        pass
    area_id = additions['area_id']
    town_id = additions['town_id']
    country_id = additions['country_id']
    address = additions['address']
    longitude = additions['longitude']
    latitude = additions['latitude']
    dianping = {
        'url': url,
        'star': 0,
        'average': additions['average'],
        'taste': 0,
        'env': 0,
        'service': 0,
    }
    dianping_info = result.findall('.//div[@class="brief-info"]//span')
    try:
        # The star rating is encoded in the class name, e.g.
        # "mid-rank-stars mid-str40" -> "40" -> 4.0.
        dianping['star'] = request.getHtmlNodeContent(dianping_info[0], {'ATTR': 'class'})
        dianping['star'] = dianping['star'].replace('mid-rank-stars mid-str', '')
        dianping['star'] = float('%s.%s' % (dianping['star'][0], dianping['star'][1:]))
        dianping['taste'] = float(request.getHtmlNodeContent(dianping_info[3], 'TEXT').replace('口味:', ''))
        dianping['env'] = float(request.getHtmlNodeContent(dianping_info[4], 'TEXT').replace('环境:', ''))
        dianping['service'] = float(request.getHtmlNodeContent(dianping_info[5], 'TEXT').replace('服务:', ''))
    except:
        pass
    src = '大众点评'
    link_url = url
    atime = datetime.now()
    uptime = atime
    time = atime  # note: shadows the time module inside this function
    print link_url, name  # debug trace
    time_result = request.get(url + '/editmember', headers=headers, timeout=timeout, format='HTML')
    time_info = time_result.findall('.//ul[@class="block-inner desc-list contribute-list Fix"]//li')
    try:
        for one in time_info:
            prompt = request.getHtmlNodeContent(one.find('.//strong'), 'TEXT')
            # Dates come as "YY-MM-DD" after a no-break space ('\xc2\xa0' as
            # UTF-8 bytes), hence the '20' prefix before parsing.
            if '商户' in prompt:
                atime = datetime.strptime('20%s' % request.getHtmlNodeContent(one.find('.//span'), 'TEXT').split('\xc2\xa0')[-1], '%Y-%m-%d')
            elif '更新' in prompt:
                uptime = datetime.strptime('20%s' % request.getHtmlNodeContent(one.find('.//span'), 'TEXT').split('\xc2\xa0')[-1], '%Y-%m-%d')
    except:
        pass
    data = Data(food_id=food_id, name=name, desc=desc, tel=tel, pic=pic,
                province_id=province_id, city_id=city_id, tag=tag,
                area_id=area_id, town_id=town_id, country_id=country_id,
                address=address, longitude=longitude, latitude=latitude,
                dianping=dianping, src=src, link_url=link_url,
                atime=atime, time=time, uptime=uptime, tid=self.tid)
    yield data
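# A sketch of how the Dianping generators above appear designed to chain:
# each fetcher first yields the next listing page (or None), then work items
# whose url/additions feed the next stage. Illustrative only -- `spider`
# stands in for whatever object hosts these methods, and the initial
# additions must already carry the region/food ids fetchShopDetail expects.
def _crawl_dianping_sketch(spider, list_url, additions):
    while list_url:
        pages = spider.fetchWWWFoodList(list_url, additions=additions)
        list_url = next(pages)  # first yield: next listing page or None
        for deal in pages:      # remaining yields: {'url', 'additions'} items
            for shop in spider.fetchShopList(deal['url'], additions=deal['additions']):
                for item in spider.fetchShopDetail(shop['url'], additions=shop['additions']):
                    yield item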