Example #1
 # NB: the mutable default for `additions` is shared across calls and mutated below; callers should pass a fresh dict.
 def fetchWWWFoodList(self, url, additions={}, timeout=TIMEOUT, implementor=None):
     headers = {
         "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
     }
     result = request.get(url, headers=headers, timeout=timeout, format='HTML')
     foods = result.findall('.//div[@id="shop-all-list"]//ul//li')
     print len(foods)  # debug output; a short page (fewer than 15 shops) means the listing is exhausted
     if len(foods) < 15:
         nextpage = None
     else:
         index = url.split('/')
         index[-1] = str(int(index[-1]) + 1)
         nextpage = '/'.join(index)
     yield nextpage
     for one in foods:
         groupbuy = request.getHtmlNodeContent(one.find('.//div[@class="svr-info"]//a'), {'ATTR':'href'})
         detail = request.get(groupbuy, headers=headers, timeout=timeout, format='HTML')
         pic = []
         try:
             for pic_one in detail.findall('.//div[@class="detail"]'):
                 if request.getHtmlNodeContent(pic_one.find('.//span[@class="name"]'), 'TEXT') == '商户介绍':
                     pic.extend([request.getHtmlNodeContent(img, {'ATTR':'lazy-src-load'}) for img in pic_one.findall('.//img')])
                     break
         except:
             pass
         additions['pic'] = pic
         gid = groupbuy.split('/')[-1]
         url = 'http://t.dianping.com/ajax/dealGroupShopDetail?dealGroupId=%s&action=shops' % gid
         yield {'url': url, 'additions':additions}
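
Note: the list fetchers in these examples (fetchWWWFoodList, fetchTuanFoodList, and the various fetchList implementations) share one protocol: the first value yielded is the next-page URL (None once paging stops), and every later value is either a follow-up task dict ({'url': ..., 'additions': ...}) or a Data record. A minimal driver sketch of that protocol; the drain helper and the commented loop are illustrative, not part of the pholcus API:

def drain(fetch, url, additions=None):
    # First yield is the next-page URL (or None); the rest are items or tasks.
    gen = fetch(url, additions if additions is not None else {})
    nextpage = next(gen)
    items = list(gen)
    return nextpage, items

# Hypothetical usage: follow pages until the fetcher reports there are none left.
# while url:
#     url, items = drain(spider.fetchWWWFoodList, url)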
Example #2
 def fetchTuanFoodList(self, url, additions={}, timeout=TIMEOUT, implementor=None):
     headers = {
         "User-Agent":"iPhone; CPU iPhone OS 9_1 like Mac OS X AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1",
     }
     result = request.get(url, headers=headers, timeout=timeout, format='HTML')
     foods = result.findall('.//ul[@class="tg-floor-list Fix tg-floor-list-freak"]//a[@class="tg-floor-img"]')
     if len(foods) < 40:
         nextpage = None
     else:
         index = url.split('=')
         index[-1] = str(int(index[-1]) + 1)
         nextpage = '='.join(index)
     yield nextpage
     for one in foods:
         detail = request.get('http://t.dianping.com%s' % request.getHtmlNodeContent(one, {'ATTR':'href'}), headers=headers, timeout=timeout, format='HTML')
         pic = []
         try:
             for pic_one in detail.findall('.//div[@class="detail"]'):
                 if request.getHtmlNodeContent(pic_one.find('.//span[@class="name"]'), 'TEXT') == '商户介绍':
                     pic.extend([request.getHtmlNodeContent(img, {'ATTR':'lazy-src-load'}) for img in pic_one.findall('.//img')])
                     break
         except:
             pass
         additions['pic'] = pic
         gid = request.getHtmlNodeContent(one, {'ATTR':'href'}).split('/')[-1]
         url = 'http://t.dianping.com/ajax/dealGroupShopDetail?dealGroupId=%s&action=shops' % gid
         yield {'url': url, 'additions':additions}
Example #3
 def fetchList(self, url, additions={}, timeout=TIMEOUT, implementor=None):
     result = request.get(url, timeout=timeout, format='HTML')
     news = result.findall('.//div[@class="news_nr"]//li')
     news = zip(*[iter(news)]*5)  # chunk the flat <li> list into 5-tuples: (category, name, detail_link, unused, icon)
     for one in news:
         category, name, detail_link, _, icon = one
         name = request.getHtmlNodeContent(name.find('.//div'), 'TEXT')
         icon = request.getHtmlNodeContent(icon.find('.//div'), 'TEXT').replace('"', '')
         detail_link = request.getHtmlNodeContent(detail_link.find('.//div'), 'TEXT').replace('"', '')
         detail = request.get(detail_link, timeout=timeout, format='HTML')
         desc = ''
         src = '东方网'
         category = request.getHtmlNodeContent(category, 'TEXT')
         atime = request.getHtmlNodeContent(detail.find('.//div[@id="title"]//span[@class="src"]'), 'TEXT').split(u'\u2003')[-1]
         atime = atime.split(u'\xa0\xa0\xa0\xa0')[0]
         atime = datetime.strptime(atime, '%Y-%m-%d %H:%M')
         create_time = datetime.now()
         update_time = datetime.now()
         data = {"name":name,
             "icon":icon,
             "detail_link":detail_link,
             "desc":desc,
             "src":src,
             "category":category,
             'group':'text',
             'content':'',
             "atime":atime,
             "create_time":create_time,
             "update_time":update_time,
             'tid':self.tid}
         yield Data(**data)
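
Note: the zip(*[iter(news)]*5) line above is the standard grouping idiom: one iterator object is consumed five times per output tuple, so the flat <li> list collapses into rows of five. A standalone demonstration:

cells = ['cat', 'name', 'link', 'extra', 'icon',
         'cat2', 'name2', 'link2', 'extra2', 'icon2']
rows = zip(*[iter(cells)] * 5)  # -> [('cat', ..., 'icon'), ('cat2', ..., 'icon2')]
print(list(rows))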
Example #4
File: hupu.py Project: jerryshew/pholcus
 def fetchList(self, url, additions={}, timeout=TIMEOUT, implementor=None):
     result = request.get(url, timeout=timeout, format='JSON')
     news = result['data']['data']
     if result['data']['is_more']:
         index = url.split('=')
         index[-1] = str(int(index[-1]) + 1)
         nextpage = '='.join(index)
     else:
         nextpage = None
     nextpage = None  # unconditional override: paging stays disabled even when is_more is set
     yield nextpage
     for one in news:
         name = one['title']
         icon = one['img_url']
         detail_link = one['link'].strip()
         desc = one['description']
         src = '虎扑新闻'
         category = '足球'
         group = 'text'
         detail = request.get(detail_link, timeout=timeout, format='HTML')
         atime = request.getHtmlNodeContent(
             detail.find('.//span[@class="stime"]'), 'TEXT').strip()
         if not atime:
             atime = request.getHtmlNodeContent(
                 detail.find('.//span[@id="pubtime_baidu"]'),
                 'TEXT').strip()
         else:
             atime = '%s:00' % atime
         if not atime:
             continue
         else:
             atime = datetime.strptime(atime, '%Y-%m-%d %H:%M:%S')
         create_time = datetime.now()
         update_time = datetime.now()
         data = {
             "name": name,
             "icon": icon,
             "detail_link": detail_link,
             "desc": desc,
             "src": src,
             "category": category,
             'group': group,
             'content': '',
             "atime": atime,
             "create_time": create_time,
             "update_time": update_time,
             'tid': self.tid
         }
         yield Data(**data)
Example #5
 def fetchShopList(self,
                   url,
                   additions={},
                   timeout=TIMEOUT,
                   implementor=None):
     headers = {
         "User-Agent":
         "iPhone; CPU iPhone OS 9_1 like Mac OS X AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1",
     }
     result = request.get(url,
                          headers=headers,
                          timeout=timeout,
                          format='JSON')
     shops = result['msg']['shops']
     for one in shops:
         additions = dict(additions, **{})
         additions['name'] = one['shopName'] + one['branchName']
         additions['address'] = one['address']
         additions['tel'] = one['contactPhone']
         additions['longitude'] = one['glng']
         additions['latitude'] = one['glat']
         additions['desc'] = one['businessHours'] + ',' + one['crossRoad']
         additions['average'] = one['avgPrice']
         yield {
             'url': 'http://www.dianping.com/shop/%s' % str(one['shopId']),
             'additions': additions
         }
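
Note: unlike the other fetchers here, fetchShopList copies additions before mutating it, so each yielded task carries an independent dict; dict(additions, **{}) is just a verbose spelling of dict(additions). The copy semantics in isolation:

base = {'category': 'food'}
task = dict(base, **{})   # equivalent to dict(base): a fresh shallow copy
task['name'] = 'some shop'
print(base)               # {'category': 'food'} -- the original is untouched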
Example #6
def getArticle(aid):
    projection = {'uid':1, 'name':1, 'clsname':1, 'filepath':1, 'digest':1}
    article = request.post('%sgdc/api/article/%s' % (HOST, str(aid)), {'projection':json.dumps(projection), 'limit':'one'}, format='JSON', s=session)
    article = article['article']
    filepath = article['filepath']
    if filepath in Store and article['digest'] == Store[filepath]:
        return article
    result = request.get('%sstatic/exe/%s' % (HOST, filepath), format='TEXT')
    persist(filepath, result)
    Store[filepath] = article['digest']
    return article
Example #7
def getDatamodel(dmid):
    projection = {'filepath':1, 'digest':1, 'name':1}
    datamodel = request.post('%sgdc/api/datamodel/%s' % (HOST, str(dmid)), {'projection':json.dumps(projection), 'limit':'one'}, format='JSON', s=session)
    datamodel = datamodel['datamodel']
    filepath = datamodel['filepath']
    if filepath in Store and datamodel['digest'] == Store[filepath]:
        return datamodel
    result = request.get('%sstatic/exe/%s' % (HOST, filepath), format='TEXT')
    persist(filepath, result)
    Store[filepath] = datamodel['digest']
    return datamodel
Example #8
def getUnit(uid):
    projection = {'name':1, 'filepath':1, 'digest':1, 'dmid':1}
    unit = request.post('%sgdc/api/unit/%s' % (HOST, str(uid)), {'projection':json.dumps(projection), 'limit':'one'}, format='JSON', s=session)
    unit = unit['unit']
    filepath = unit['filepath']
    if filepath in Store and unit['digest'] == Store[filepath]:
        return unit
    result = request.get('%sstatic/exe/%s' % (HOST, filepath), format='TEXT')
    persist(filepath, result)
    Store[filepath] = unit['digest']

    filepath = os.path.join(os.path.dirname(os.path.join(CURRPATH, filepath)), '__init__.py')
    persist(filepath, '#!/usr/bin/env python\n# coding=utf8')
    return unit
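
Note: getArticle, getDatamodel, and getUnit share one digest cache: when the digest the API reports for a filepath matches the one recorded in Store, the download is skipped. The helpers they rely on are not among these results; a minimal sketch, assuming Store is an in-memory dict and persist writes below CURRPATH (both are assumptions about this codebase):

import os

CURRPATH = os.path.dirname(os.path.abspath(__file__))
Store = {}  # filepath -> digest of the copy persisted last

def persist(filepath, content):
    # Write the fetched source under CURRPATH, creating parent directories as needed.
    target = os.path.join(CURRPATH, filepath)
    parent = os.path.dirname(target)
    if parent and not os.path.isdir(parent):
        os.makedirs(parent)
    with open(target, 'w') as f:
        f.write(content)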
Example #9
    def fetchList(self, url, additions={}, timeout=TIMEOUT, implementor=None):
        result = request.get(url, timeout=timeout, format='JSON')
        news = result['result']
        if len(news) == 0:
            nextpage = None
        else:
            urlobj, params = URLParse.decode(url)

            span = int(params['cend']) - int(params['cstart'])
            params['cstart'] = params['cend']
            params['cend'] = str(int(params['cstart']) + span)
            
            nextpage = URLParse.encode(urlobj, params)
        nextpage = None
        yield nextpage
        for one in news:
            if not one['ctype'] == "news":
                continue
            name = one['title']
            if 'icon' in one:
                icon = one['icon']
            elif 'image_urls' in one and one['image_urls']:
                icon = 'http://i1.go2yd.com/image.php?url=%s&type=thumbnail_200x140' % one['image_urls'][0]
            else:
                icon = ''
            detail_link = one['url']
            desc = one['summary']
            src = '一点资讯'
            category = one['category']
            atime = datetime.strptime(one['date'], '%Y-%m-%d %H:%M:%S')
            create_time = datetime.now()
            update_time = datetime.now()
            data = {"name":name,
                "icon":icon,
                "detail_link":detail_link,
                "desc":desc,
                "src":src,
                "category":category,
                'group':'text',
                'content':'',
                "atime":atime,
                "create_time":create_time,
                "update_time":update_time,
                'tid':self.tid}
            yield Data(**data)
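
Note: this fetcher pages by sliding a [cstart, cend) window: the new cstart is the old cend and the width is preserved, though the unconditional nextpage = None then discards the result and disables paging. The window arithmetic in isolation:

params = {'cstart': '0', 'cend': '10'}
span = int(params['cend']) - int(params['cstart'])  # 10
params['cstart'] = params['cend']                   # '10'
params['cend'] = str(int(params['cstart']) + span)  # '20'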
Example #10
 def fetchList(self, url, additions={}, timeout=TIMEOUT, implementor=None):
     result = request.get(url, timeout=timeout, format='JSON')
     news = result['data']
     if 'next' not in result:
         nextpage = None
     else:
         urlobj, params = URLParse.decode(url)
         if str(params['max_behot_time']) == str(
                 result['next']['max_behot_time']):
             nextpage = None
         else:
             params['max_behot_time'] = result['next']['max_behot_time']
             nextpage = URLParse.encode(urlobj, params)
     nextpage = None
     yield nextpage
     for one in news:
         name = one['title']
         if 'image_url' in one:
             icon = one['image_url']
         elif 'image_list' in one and one['image_list']:
             icon = one['image_list'][0]['url']
         else:
             icon = ''
         detail_link = one['display_url']
         desc = one['source'] + ',' + one['abstract']
         src = '今日头条'
         category = additions.get('category', '')  # '财经'
         atime = datetime.strptime(one['datetime'], '%Y-%m-%d %H:%M')
         create_time = datetime.now()
         update_time = datetime.now()
         data = {
             "name": name,
             "icon": icon,
             "detail_link": detail_link,
             "desc": desc,
             "src": src,
             "category": category,
             'group': 'text',
             'content': '',
             "atime": atime,
             "create_time": create_time,
             "update_time": update_time,
             'tid': self.tid
         }
         yield Data(**data)
Example #11
 def fetchList(self, url, additions={}, timeout=TIMEOUT, implementor=None):
     result = request.get(url, timeout=timeout, format='JSON')
     news = result['result']['data']
     if len(news) < 30:
         nextpage = None
     else:
         index = url.split('=')
         index[-1] = str(int(index[-1]) + 1)
         nextpage = '='.join(index)
     nextpage = None
     yield nextpage
     for one in news:
         name = one['stitle']
         icon = one['img']['u']
         detail_link = one.get('wapurl') or one.get('url', '')
         desc = one['intro']
         src = '新浪新闻'
         category = '足球'
         group = 'text'
         if 'http://video.sina.com.cn' in detail_link:
             group = 'video'
         atime = time.strftime('%Y-%m-%d %H:%M:%S',
                               time.localtime(float(one['ctime'])))
         atime = datetime.strptime(atime, '%Y-%m-%d %H:%M:%S')
         create_time = datetime.now()
         update_time = datetime.now()
         data = {
             "name": name,
             "icon": icon,
             "detail_link": detail_link,
             "desc": desc,
             "src": src,
             "category": category,
             'group': group,
             'content': '',
             "atime": atime,
             "create_time": create_time,
             "update_time": update_time,
             'tid': self.tid
         }
         yield Data(**data)
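
Note: the strftime/strptime round-trip above turns the feed's epoch-seconds ctime into a datetime; datetime.fromtimestamp is the direct equivalent. A quick check, with a made-up ctime value:

import time
from datetime import datetime

ctime = '1463720400'  # hypothetical epoch-seconds string from the feed
via_strings = datetime.strptime(
    time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(float(ctime))),
    '%Y-%m-%d %H:%M:%S')
assert via_strings == datetime.fromtimestamp(float(ctime))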
Example #12
    def fetchList(self, url, additions={}, timeout=TIMEOUT, implementor=None):
        result = request.get(url, timeout=timeout, dirtys=[('artiList({', '{'), ('})', '}')], format='JSON')
        news = result['BA8E6OEOwangning']
        start, end = url[url.rindex('/')+1:url.rindex('.')].split('-')

        span = int(end) - int(start)
        if len(news) < span:
            nextpage = None
        else:
            start = end
            end = str(int(start) + span)
            nextpage = '%s/%s-%s.html' % (url[:url.rindex('/')], start, end)
        nextpage = None
        yield nextpage

        for one in news:
            if '|' in url:  # note: this tests the request url (not the item), so it skips every item or none
                continue
            name = one['title']
            icon = one['imgsrc']
            detail_link = 'http://3g.163.com/touch/article.html?channel=sports&docid=%s' % one['docid']
            desc = one['digest']
            src = '网易新闻'
            category = additions.get('category')
            atime = datetime.strptime(one['ptime'], '%Y-%m-%d %H:%M:%S')
            create_time = datetime.now()
            update_time = datetime.now()
            data = {"name":name,
                "icon":icon,
                "detail_link":detail_link,
                "desc":desc,
                "src":src,
                "category":category,
                'group':'text',
                'content':'',
                "atime":atime,
                "create_time":create_time,
                "update_time":update_time,
                'tid':self.tid}
            yield Data(**data)
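
Note: the dirtys=[('artiList({', '{'), ('})', '}')] argument evidently strips a JSONP wrapper before the body is parsed as JSON; reading it as a list of (find, replace) pairs is inferred from the values, not from a documented contract. The same cleanup in plain Python:

raw = 'artiList({"BA8E6OEOwangning": []})'  # hypothetical JSONP response body
for bad, good in [('artiList({', '{'), ('})', '}')]:
    raw = raw.replace(bad, good)
print(raw)  # {"BA8E6OEOwangning": []} -- plain JSON now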
Example #13
    def fetchShopDetail(self, url, additions={}, timeout=TIMEOUT, implementor=None):
        headers = {
            "User-Agent":"iPhone; CPU iPhone OS 9_1 like Mac OS X AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1",
        }
        result = request.get(url, headers=headers, timeout=timeout, format='HTML')
        food_id = [additions['food_id'], ]
        name = additions['name']
        desc = additions['desc']
        tel = additions['tel']
        pic = additions['pic']
        province_id = additions['province_id']
        city_id = additions['city_id']
        tag = []
        try:
            for one in result.findall('.//p[@class="info info-indent"]'):
                prompt = request.getHtmlNodeContent(one.find('.//span'), 'TEXT')
                if '标签' in prompt:
                    tag.extend([request.getHtmlNodeContent(a, 'TEXT') for a in one.findall('.//a')])
        except:
            pass

        area_id = additions['area_id']
        town_id = additions['town_id']
        country_id = additions['country_id']
        address = additions['address']
        longitude = additions['longitude']
        latitude = additions['latitude']

        dianping = {
            'url': url,
            'star': 0,
            'average': additions['average'],
            'taste': 0,
            'env': 0,
            'service': 0,
        }
        dianping_info = result.findall('.//div[@class="brief-info"]//span')
        try:
            dianping['star'] = request.getHtmlNodeContent(dianping_info[0], {'ATTR':'class'})
            dianping['star'] = dianping['star'].replace('mid-rank-stars mid-str', '')
            dianping['star'] = float('%s.%s' % (dianping['star'][0], dianping['star'][1:]))
            dianping['taste'] = float(request.getHtmlNodeContent(dianping_info[3], 'TEXT').replace('口味:', ''))
            dianping['env'] = float(request.getHtmlNodeContent(dianping_info[4], 'TEXT').replace('环境:', ''))
            dianping['service'] = float(request.getHtmlNodeContent(dianping_info[5], 'TEXT').replace('服务:', ''))
        except:
            pass

        src = '大众点评'
        link_url = url
        atime = datetime.now()
        uptime = atime
        time = atime

        print link_url, name

        time_result = request.get(url+'/editmember', headers=headers, timeout=timeout, format='HTML')
        time_info = time_result.findall('.//ul[@class="block-inner desc-list contribute-list Fix"]//li')
        try:
            for one in time_info:
                prompt = request.getHtmlNodeContent(one.find('.//strong'), 'TEXT')
                if '商户' in prompt:
                    atime = datetime.strptime('20%s' % request.getHtmlNodeContent(one.find('.//span'), 'TEXT').split('\xc2\xa0')[-1], '%Y-%m-%d')
                elif '更新' in prompt:
                    uptime = datetime.strptime('20%s' % request.getHtmlNodeContent(one.find('.//span'), 'TEXT').split('\xc2\xa0')[-1], '%Y-%m-%d')
        except:
            pass

        data = Data(food_id=food_id, name=name, desc=desc, tel=tel, pic=pic, province_id=province_id,
            city_id=city_id, tag=tag, area_id=area_id, town_id=town_id, country_id=country_id, address=address,
            longitude=longitude, latitude=latitude, dianping=dianping, src=src, link_url=link_url,
            atime=atime, time=time, uptime=uptime, tid=self.tid)
        yield data
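
Note: the star rating in this fetcher is recovered from a CSS class such as mid-rank-stars mid-str45 (the exact attribute value is an assumption): stripping the prefix leaves the digits, and float('%s.%s' % ...) inserts the decimal point. The arithmetic in isolation:

cls = 'mid-rank-stars mid-str45'  # hypothetical class attribute from the page
digits = cls.replace('mid-rank-stars mid-str', '')
star = float('%s.%s' % (digits[0], digits[1:]))
print(star)  # 4.5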
Example #14
 def fetchList(self, url, additions={}, timeout=TIMEOUT, implementor=None):
     result = request.get(url, timeout=timeout, format='HTML')
     news = result.findall('.//dl[@id="idcur"]//dd')
     news.append(result.find('.//dl[@id="idcur"]//dt'))
     for one in news:
         name = request.getHtmlNodeContent(one.find('.//p'), 'TEXT')
         icon = request.getHtmlNodeContent(one.find('.//img'), {'ATTR':'src'})
         detail_link = request.getHtmlNodeContent(one.find('.//a'), {'ATTR':'href'})
         detail = request.get(detail_link, timeout=timeout, format='HTML')
         desc = ''
         src = 'cctv'
         category = '足球'
         atime = request.getHtmlNodeContent(detail.find('.//div[@class="font_xx"]'), 'TEXT').strip()
         atime = ('20%s' % atime.replace('日 ', 'T').split(' 20')[-1]).replace('年', '-').replace('月', '-')
         atime = datetime.strptime(atime, '%Y-%m-%dT%H:%M')
         create_time = datetime.now()
         update_time = datetime.now()
         data = {"name":name,
             "icon":icon,
             "detail_link":detail_link,
             "desc":desc,
             "src":src,
             "category":category,
             'group':'pic',
             'content':'',
             "atime":atime,
             "create_time":create_time,
             "update_time":update_time,
             'tid':self.tid}
         yield Data(**data)
     news = result.findall('.//ul[@class="il_w120_b1"]//li')
     for one in news:
         name = request.getHtmlNodeContent(one.find('.//div[@class="text"]//a'), 'TEXT').replace('[高清组图]', '')
         icon = request.getHtmlNodeContent(one.find('.//div[@class="image"]//img'), {'ATTR':'src'})
         detail_link = request.getHtmlNodeContent(one.find('.//div[@class="image"]//a'), {'ATTR':'href'})
         detail = request.get(detail_link, timeout=timeout, format='HTML')
         desc = ''
         src = 'cctv'
         category = '足球'
         atime = request.getHtmlNodeContent(detail.find('.//div[@class="font_xx"]'), 'TEXT').strip()
         atime = ('20%s' % atime.replace('日 ', 'T').split(' 20')[-1]).replace('年', '-').replace('月', '-')
         atime = datetime.strptime(atime, '%Y-%m-%dT%H:%M')
         create_time = datetime.now()
         update_time = datetime.now()
         data = {"name":name,
             "icon":icon,
             "detail_link":detail_link,
             "desc":desc,
             "src":src,
             "category":category,
             'group':'pic',
             'content':'',
             "atime":atime,
             "create_time":create_time,
             "update_time":update_time,
             'tid':self.tid}
         yield Data(**data)
     news = result.findall('.//div[@class="text_list"]//ul//li')
     for one in news:
         name = request.getHtmlNodeContent(one.find('.//a'), 'TEXT')
         detail_link = request.getHtmlNodeContent(one.find('.//a'), {'ATTR':'href'})
         detail = request.get(detail_link, timeout=timeout, format='HTML')
         icon = request.getHtmlNodeContent(detail.find('.//p[@align="center"]//img'), {'ATTR':'src'})
         desc = ''
         src = 'cctv'
         category = '足球'
         atime = request.getHtmlNodeContent(detail.find('.//div[@class="function"]//span[@class="info"]//i'), 'TEXT').strip()
         atime = ('20%s' % atime.replace('日 ', 'T').split(' 20')[-1]).replace('年', '-').replace('月', '-')
         atime = datetime.strptime(atime, '%Y-%m-%dT%H:%M')
         create_time = datetime.now()
         update_time = datetime.now()
         data = {"name":name,
             "icon":icon,
             "detail_link":detail_link,
             "desc":desc,
             "src":src,
             "category":category,
             'group':'text',
             'content':'',
             "atime":atime,
             "create_time":create_time,
             "update_time":update_time,
             'tid':self.tid}
         yield Data(**data)
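
Note: the date normalization repeated three times in this fetcher rebuilds a parseable timestamp from the page's Chinese date line. Assuming the extracted text looks like '央视网消息 2016年05月20日 12:30' (the exact wording is an assumption), the chain works out as:

atime = u'央视网消息 2016年05月20日 12:30'  # hypothetical extracted text
atime = ('20%s' % atime.replace(u'日 ', u'T').split(u' 20')[-1])
atime = atime.replace(u'年', u'-').replace(u'月', u'-')
print(atime)  # 2016-05-20T12:30, ready for strptime('%Y-%m-%dT%H:%M')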