예제 #1
0
 def test_html_to_unicode(self):
     """html_to_unicode should return ('utf8', <unicode markup>) for CJK input."""
     # The expected pair is spelled with escapes so it does not depend on
     # how this source file itself is encoded.
     expected = (
         'utf8',
         u'<html><h1>\u6f22\u5b57\u6c49\u5b57</h1></html>',
     )
     result = html_to_unicode(
         'charset=("zh_cn")',
         '<html><h1>漢字汉字</h1></html>',
     )
     result.should.eq(expected)
예제 #2
0
def handle(job, *args, **kwargs):
    """Fetch the URL named in a JSON job and try to decode its body.

    Args:
        job: JSON text describing the task; it must contain a "url" key.
        *args: accepted but not used by the code visible here.
        **kwargs: must contain a 'queue' entry (a missing key raises
            KeyError); the value itself is not used by the code visible here.

    Returns:
        None. The decoded body is not returned by the code visible here.

    Raises:
        KeyError: if 'queue' is missing from kwargs or "url" from the job.
    """
    # Looked up eagerly so that a missing 'queue' fails before any fetching.
    queue = kwargs['queue']
    task = json.loads(job)
    url = task["url"]
    status, source = fetcher.fetch(url, use_proxy=False)
    logger.info('%s|%s' % (url, status))
    try:
        _, source = encoding.html_to_unicode('', source)
    except Exception as e:
        # Best effort: a body that cannot be decoded is reported, not fatal.
        # `as e` and single-argument print() behave the same on Python 2.6+
        # and Python 3, unlike the old `except X, e` / `print e` spellings.
        print(e)
예제 #3
0
def handle(job, *args, **kwargs):
    """Fetch the URL named in a JSON job and return a result tuple for it.

    Args:
        job: JSON text describing the task; it must contain a "url" key.
        *args: only echoed to stdout.
        **kwargs: only echoed to stdout.

    Returns:
        A 5-tuple (url, urlhash, status, domain, content).
        * If encoding or hashing the URL fails, urlhash is None and url is
          whatever value it held when the failure occurred.
        * If the body is not detected as 'text/html', content is returned
          exactly as fetched.
        * Otherwise content is the result of encoding.html_to_unicode.

    Side effects:
        Prints the call arguments, logs "url|status", and pushes the URL to
        `db` (detail=False) when an HTML response has a status other than 200.
    """
    # Single-argument print() emits the same text on Python 2 and Python 3.
    print('handle %s %s' % (args, kwargs))
    task = json.loads(job)
    url = task["url"]
    domain = tldextracter.extract_domain(url)
    status, content = fetch(url, use_proxy=False)
    try:
        url = url.encode('utf8')
        urlhash = cityhash.CityHash64(url)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate instead of being reported as a hash failure.
        return (url, None, status, domain, content)
    logger.info('%s|%s' % (url, status))
    if magic.from_buffer(content, mime=True) != 'text/html':
        return (url, urlhash, status, domain, content)
    _, content = encoding.html_to_unicode('', content)
    if status != 200:
        db.push(url, detail=False)
    # Both the non-200 and the 200 path return the same tuple.
    return (url, urlhash, status, domain, content)
예제 #4
0
 def format(self, note):
     """Return the note's content as unicode with encoding_match hits removed.

     A note of None yields the empty string.
     """
     if note is None:
         return ''
     _, decoded = encoding.html_to_unicode('', note.content)
     return encoding_match.sub('', decoded)
예제 #5
0
파일: test_linote.py 프로젝트: solos/linote
 def test_html_to_unicode(self):
     """html_to_unicode should yield the utf8 label and the unicode markup."""
     header = 'charset=("zh_cn")'
     body = '<html><h1>漢字汉字</h1></html>'
     wanted = ('utf8', u'<html><h1>\u6f22\u5b57\u6c49\u5b57</h1></html>')
     html_to_unicode(header, body).should.eq(wanted)
예제 #6
0
파일: extracter.py 프로젝트: solos/sohutv
        'nid': nid,
        'pid': pid,
        'cover': cover,
        'playlistId': playlistId,
        'o_playlistId': o_playlistId,
        'cid': cid,
        'subcid': subcid,
        'osubcid': osubcid,
        'category': category,
        'cateCode': cateCode,
        'pianhua': pianhua,
        'tag': tag,
        'tvid': tvid,
        'title': title,
        'last': last,
        'brief': brief
    }
    return item

if __name__ == '__main__':
    # Manual check: fetch one Sohu TV page, decode it to unicode and print
    # what the regex-based extractor returns for it.
    import fetcher
    # Only this URL was ever used; the earlier assignments were overwritten
    # before being read.
    url = 'http://tv.sohu.com/20131223/n392267093.shtml'
    status, content = fetcher.fetch(url)
    _, ucontent = encoding.html_to_unicode('', content)
    # Other extractors previously tried here with the same (url, ucontent)
    # arguments: extract_links, extract_content, extract_sohutv.
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(extract_sohutv_data_by_regex(url, ucontent))
예제 #7
0
 def format(self, note):
     """Decode note.content to unicode and strip every encoding_match hit.

     Returns '' when note is None.
     """
     if note is None:
         return ''
     decoded = encoding.html_to_unicode('', note.content)[1]
     cleaned = encoding_match.sub('', decoded)
     return cleaned