Python get_charset_of_html 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: parse

메소드/함수: get_charset_of_html

hotexamples.com에서의 예제들: 6

Python get_charset_of_html 예제 6개를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python parse.get_charset_of_html의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

파일: test.py 프로젝트: resite/news_crawler

def net_test(config_path='config.xml'):
    print "net_test ==>  "

    html = get_html_from_url(
        "http://news.163.com/13/0727/16/94Q724990001124J.html")

    charset = get_charset_of_html(html)
    print "ifeng charset: ", charset
    doc = HTML.fromstring(html.decode(charset, 'ignore'))

    ss = StringIO('')
    #doc = HTML.fromstring(html.decode('utf8', 'ignore'))
    #doc = HTML.fromstring(html)
    result = doc.xpath("//div[@class='left']")
    print "len ", len(result)

    for ret in result:
        value = ret.text_content().strip()
        value = re.sub('[\r\n]', '', value)
    print value

    regx = '(\d{4}.\d{1,2}.\d{1,2}.*\d{1,2}:\d{1,2})'
    print "时间：", re.search(regx, value).groups(0)[0]

    #regx = u'来源[:：\s]*([^\s]*)[有]*'
    #print "来源：", re.search(regx, value).groups(0)[0].strip()
    #regx = u'\d|-|:|(有.*)'

    regx = u'来源[:：\s]*([^\s]+)有.*'

    print "来源：", re.search(regx, value).groups(0)[0].strip()

예제 #2

파일 보기

파일: test.py 프로젝트: resite/news_crawler

def parse_config(config_path="config.xml"):
    print "parse_xml ==>  "

    html = get_html_from_url(
        "http://finance.sina.com.cn/china/20130727/015816259014.shtml")

    #print html[:600]

    ss = StringIO('')

    charset = get_charset_of_html(html)
    doc = HTML.fromstring(html.decode(charset, 'ignore'))

    #doc = HTML.fromstring(html)
    result = doc.xpath('//div[@class="artInfo"]')
    #result = doc.xpath("//div[@id='artibody']/p")
    print "len ", len(result)
    for ret in result:
        value = ret.text_content().strip()

    regx = '(\d{4}.\d{1,2}.\d{1,2}.*\d{1,2}:\d{1,2})'
    print "时间：", re.search(regx, value).groups(0)[0]

    print value
    regx = u':\d{2}([^\s]*)[\s]*'
    print "来源：", re.search(regx, value).groups(0)[0].strip()

예제 #3

파일 보기

파일: test.py 프로젝트: resite/news_crawler

def ifeng_test(config_path='config.xml'):
    print "sohu_test ==>  "

    html = get_html_from_url(
        "http://news.ifeng.com/society/1/detail_2013_07/27/27973995_0.shtml")
    print html[:800]

    charset = get_charset_of_html(html)
    print "ifeng charset: ", charset
    doc = HTML.fromstring(html.decode(charset, 'ignore'))

    ss = StringIO('')
    #doc = HTML.fromstring(html.decode('utf8', 'ignore'))
    #doc = HTML.fromstring(html)
    result = doc.xpath("//div[@id='artical_sth']/p")
    print "len ", len(result)

    for ret in result:
        value = ret.text_content().strip()
        value = re.sub('[\r\n]', '', value)
    print value

    regx = '(\d{4}.\d{1,2}.\d{1,2}.*\d{1,2}:\d{1,2})'
    print "时间：", re.search(regx, value).groups(0)[0]

    regx = u'来源[:：]*([^\s]*)[\s]*'
    print "来源：", re.search(regx, value).groups(0)[0].strip()

    xpath_cont = "//div[@id='main_content']/p"
    conts = doc.xpath(xpath_cont)
    if conts is not None:
        for cont in conts:
            value = cont.text_content().strip()
            print value

예제 #4

파일 보기

파일: test.py 프로젝트: qq40660/news_crawler

def net_test(config_path = 'config.xml'):
    print "net_test ==>  "
    
    html = get_html_from_url("http://news.163.com/13/0727/16/94Q724990001124J.html")

    charset = get_charset_of_html(html)
    print "ifeng charset: ", charset
    doc = HTML.fromstring(html.decode(charset, 'ignore'))
    
    ss = StringIO('')
    #doc = HTML.fromstring(html.decode('utf8', 'ignore'))
    #doc = HTML.fromstring(html)
    result = doc.xpath("//div[@class='left']")
    print "len ", len(result)
    
    for ret in result: 
        value = ret.text_content().strip()
        value = re.sub('[\r\n]','',value)
    print value

    regx = '(\d{4}.\d{1,2}.\d{1,2}.*\d{1,2}:\d{1,2})'
    print "时间：", re.search(regx, value).groups(0)[0]

    #regx = u'来源[:：\s]*([^\s]*)[有]*'
    #print "来源：", re.search(regx, value).groups(0)[0].strip()
#regx = u'\d|-|:|(有.*)'
    
    regx = u'来源[:：\s]*([^\s]+)有.*'
    
    print "来源：", re.search(regx, value).groups(0)[0].strip()

예제 #5

파일 보기

파일: test.py 프로젝트: qq40660/news_crawler

def parse_config(config_path = "config.xml"):
    print "parse_xml ==>  "
    
    html = get_html_from_url("http://finance.sina.com.cn/china/20130727/015816259014.shtml")

    #print html[:600]

    ss = StringIO('')

    charset = get_charset_of_html(html)
    doc = HTML.fromstring(html.decode(charset, 'ignore'))
    
    #doc = HTML.fromstring(html)
    result = doc.xpath('//div[@class="artInfo"]')
    #result = doc.xpath("//div[@id='artibody']/p")
    print "len ", len(result)
    for ret in result: 
        value = ret.text_content().strip()

    regx = '(\d{4}.\d{1,2}.\d{1,2}.*\d{1,2}:\d{1,2})'
    print "时间：", re.search(regx, value).groups(0)[0]

    print value
    regx = u':\d{2}([^\s]*)[\s]*'
    print "来源：", re.search(regx, value).groups(0)[0].strip()

예제 #6

파일 보기

파일: test.py 프로젝트: qq40660/news_crawler

def ifeng_test(config_path = 'config.xml'):
    print "sohu_test ==>  "
    
    html = get_html_from_url("http://news.ifeng.com/society/1/detail_2013_07/27/27973995_0.shtml")
    print html[:800]

    charset = get_charset_of_html(html)
    print "ifeng charset: ", charset
    doc = HTML.fromstring(html.decode(charset, 'ignore'))
    
    ss = StringIO('')
    #doc = HTML.fromstring(html.decode('utf8', 'ignore'))
    #doc = HTML.fromstring(html)
    result = doc.xpath("//div[@id='artical_sth']/p")
    print "len ", len(result)
    
    for ret in result: 
        value = ret.text_content().strip()
        value = re.sub('[\r\n]','',value)
    print value

    regx = '(\d{4}.\d{1,2}.\d{1,2}.*\d{1,2}:\d{1,2})'
    print "时间：", re.search(regx, value).groups(0)[0]

    regx = u'来源[:：]*([^\s]*)[\s]*'
    print "来源：", re.search(regx, value).groups(0)[0].strip()


    xpath_cont = "//div[@id='main_content']/p"
    conts = doc.xpath(xpath_cont)
    if conts is not None:
        for cont in conts:
            value = cont.text_content().strip()
            print value