Python IO.replaceSpecialChars示例

def getnewsitem2(url):
    """Download the page at *url* and build a NewsItem from its title and body.

    The title is the text between ``<title>`` and ``</title>``, cut at the
    first "/".  The body is the markup between the two hard-coded text
    markers below, with the HTML removed by ``nltk.clean_html``.  Both are
    passed through ``IO.encodingToutf8`` and ``IO.replaceSpecialChars``.

    Returns:
        ``NewsItem(title, "", text, "")``.
    """
    f = urllib2.urlopen(url)
    try:
        rawhtml = f.read()
        # Everything after the last "charset=" in the Content-Type header.
        # NOTE(review): when the header has no "charset=" parameter this is
        # the whole header value (e.g. "text/html") -- confirm that
        # IO.encodingToutf8 copes with such a value.
        encoding = f.headers['content-type'].split('charset=')[-1]
    finally:
        # Close the response even when reading it fails.
        f.close()

    markerTitle1 = '<title>'
    markerTitle2 = '</title>'
    title = extractitem(markerTitle1, markerTitle2, rawhtml)
    title = IO.encodingToutf8(title, encoding)
    # Keep only the part of the page title before the first "/".
    title = title.split("/")[0]
    title = IO.replaceSpecialChars(title)

    markerText1 = '<div id="metin2" class="fck_li">'
    markerText2 = '<div class="IndexKeywordsHeader"'    # or 'id="hiddenTitle"'
    text = extractitem(markerText1, markerText2, rawhtml)
    # Strip the markup before converting the encoding.
    text = nltk.clean_html(text)

    text = IO.encodingToutf8(text, encoding)
    text = IO.replaceSpecialChars(text)

    return NewsItem(title, "", text, "")

示例#2

显示文件

文件： Crawling2.py 项目： dicleoztur/subjectivity_detection

def getnewsitem(resource, url, newsid):

    '''
    hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}
    '''
    req = urllib2.Request(url)
    try:
        f = urllib2.urlopen(req)
    except URLError as e:
        message = ""
        if hasattr(e, 'reason'):
            message += 'Cannot reach a server.'
            message += '\nReason: ' + str(e.reason)
        elif hasattr(e, 'code'):
            message += 'The server couldn\'t fulfill the request.'
            message += '\nError code: ' + str(e.code)
        IO.log_connection_error(resource.name, url, message)
        return None
    else:
        rawhtml = f.read()
        #rawhtml = rawhtml.encode('iso-8859-9')
        f.close()
    
    
    encoding = resource.encoding
    if encoding == "":
        encoding = f.headers['content-type'].split('charset=')[-1]
        resource.setEncoding(encoding)
    
    markerTitle1 = resource.markerTitle1
    markerTitle2 = resource.markerTitle2
    title = extractitem(markerTitle1, markerTitle2, rawhtml)
    title = IO.encodingToutf8(title, encoding)    
    title = re.split(r"[/-]", title)[0]
    title = IO.replaceSpecialChars(title)  
    
    markerDate1 = resource.markerDate1 
    markerDate2 = resource.markerDate2
    date = extractitem(markerDate1, markerDate2, rawhtml)
    date = IO.encodingToutf8(date, encoding)
    #date = nltk.clean_html(date)
    
    #print "markers: ",markerDate1," ",markerDate2
    print "date: ",date
    
    
    markerAuthor1 = resource.markerAuthor1
    markerAuthor2 = resource.markerAuthor2
    author = extractitem(markerAuthor1, markerAuthor2, rawhtml)
    
    markerText1 = resource.markerText1
    markerText2 = resource.markerText2
    text = extractitem(markerText1, markerText2, rawhtml)
    
    #print isinstance(text, str)," ",isinstance(text, unicode)," ",type(text)
    text = IO.encodingToutf8(text, encoding)
    #print isinstance(text, str)," ",isinstance(text, unicode)," ",type(text)
    # text = nltk.clean_html(text)
    
    
    # added due to the mixes in solhaber
    try:
        text = nltk.clean_html(text)
    except HTMLParser.HTMLParseError as e:
        tag = e.__str__().split(",")[0].split(":")[-1][2:-2]
        text = text.replace(tag, "")
        text = nltk.clean_html(text)
    
    '''
    print isinstance(text, str)," ",isinstance(text, unicode)," ",type(text)
    text = text.decode('utf-8', 'ignore')
    print isinstance(text, str)," ",isinstance(text, unicode)," ",type(text)
    text = nltk.clean_html(text)
    
    text = IO.encodingToutf8(text, encoding)
    '''
    text = IO.replaceSpecialChars(text)
    
    if resource.name == "solhaber":
        date = date.split(",")[-1]
        newsid = newsid.split("-")[-1]
    
    return NewsItem(newsid, title, date, text, resource.name, author, url)