def getnewsitem2(url):
    """Fetch *url* and build a NewsItem using hard-coded page markers.

    The title is cut out of the ``<title>`` element (only the part before
    the first "/" is kept) and the article body out of the ``metin2`` div.
    Both are converted with ``IO.encodingToutf8`` using the charset taken
    from the response's Content-Type header and then passed through
    ``IO.replaceSpecialChars``.

    Args:
        url: Address of the article page to download.

    Returns:
        ``NewsItem(title, "", text, "")``.

    Raises:
        Any error raised by ``urllib2.urlopen`` / ``read`` propagates to the
        caller; nothing is caught here.
    """
    response = urllib2.urlopen(url)
    try:
        rawhtml = response.read()
        content_type = response.headers['content-type']
    finally:
        # Release the connection even when reading the body fails.
        response.close()

    # NOTE(review): if the header carries no "charset=" parameter this yields
    # the whole header value (e.g. "text/html") instead of an encoding name.
    encoding = content_type.split('charset=')[-1]

    markerTitle1 = '<title>'
    markerTitle2 = '</title>'
    title = extractitem(markerTitle1, markerTitle2, rawhtml)
    title = IO.encodingToutf8(title, encoding)
    title = title.split("/")[0]
    title = IO.replaceSpecialChars(title)

    markerText1 = '<div id="metin2" class="fck_li">'
    markerText2 = '<div class="IndexKeywordsHeader"'  # or 'id="hiddenTitle"'
    text = extractitem(markerText1, markerText2, rawhtml)
    text = nltk.clean_html(text)
    text = IO.encodingToutf8(text, encoding)
    text = IO.replaceSpecialChars(text)

    # NOTE(review): getnewsitem in this file builds NewsItem with seven
    # arguments; confirm this four-argument call still matches the constructor.
    return NewsItem(title, "", text, "")
def getnewsitem(resource, url, newsid):
    """Download one article and parse it into a NewsItem via *resource* markers.

    Args:
        resource: Site description object. This function reads ``name``,
            ``encoding`` and the marker pairs ``markerTitle1/2``,
            ``markerDate1/2``, ``markerAuthor1/2``, ``markerText1/2``, and
            calls ``setEncoding`` when ``encoding`` is the empty string.
        url: Address of the article page.
        newsid: Identifier of the article; for the "solhaber" resource only
            the part after the last "-" is kept.

    Returns:
        ``NewsItem(newsid, title, date, text, resource.name, author, url)``,
        or ``None`` when the URL cannot be opened (the failure is reported
        through ``IO.log_connection_error``).
    """
    # NOTE: a browser-like header set (User-Agent, Accept, ...) was once
    # sketched here but never passed to the request; the default urllib2
    # headers are used.
    req = urllib2.Request(url)
    try:
        f = urllib2.urlopen(req)
    except URLError as e:
        message = ""
        # HTTPError subclasses URLError and (on Python 2.7) exposes both
        # ``code`` and ``reason``, so ``code`` must be tested first or HTTP
        # status failures get reported as "Cannot reach a server".
        if hasattr(e, 'code'):
            message += 'The server couldn\'t fulfill the request.'
            message += '\nError code: ' + str(e.code)
        elif hasattr(e, 'reason'):
            message += 'Cannot reach a server.'
            message += '\nReason: ' + str(e.reason)
        IO.log_connection_error(resource.name, url, message)
        return None

    try:
        rawhtml = f.read()
        headers = f.headers
    finally:
        # Release the connection even when reading the body fails.
        f.close()

    encoding = resource.encoding
    if encoding == "":
        # No encoding configured yet: take it from the response and store it
        # on the resource.
        # NOTE(review): if the header carries no "charset=" parameter this
        # yields the whole header value (e.g. "text/html"), which is then
        # stored on the resource as well.
        encoding = headers['content-type'].split('charset=')[-1]
        resource.setEncoding(encoding)

    title = extractitem(resource.markerTitle1, resource.markerTitle2, rawhtml)
    title = IO.encodingToutf8(title, encoding)
    # Keep only the part of the title before the first "/" or "-".
    title = re.split(r"[/-]", title)[0]
    title = IO.replaceSpecialChars(title)

    date = extractitem(resource.markerDate1, resource.markerDate2, rawhtml)
    date = IO.encodingToutf8(date, encoding)
    # Debug trace; the two spaces reproduce the output of the former
    # Python 2 statement ``print "date: ",date`` exactly.
    print("date:  %s" % (date,))

    # NOTE(review): unlike title/date/text, the author is used exactly as
    # extracted, without encoding conversion - confirm this is intended.
    author = extractitem(resource.markerAuthor1, resource.markerAuthor2, rawhtml)

    text = extractitem(resource.markerText1, resource.markerText2, rawhtml)
    text = IO.encodingToutf8(text, encoding)
    # A plain clean_html call was not enough for the mixed markup found on
    # solhaber pages: on a parse error, cut the fragment named in the error
    # message (presumably the offending tag) out of the text and retry once.
    # A second parse error propagates to the caller.
    try:
        text = nltk.clean_html(text)
    except HTMLParser.HTMLParseError as e:
        tag = e.__str__().split(",")[0].split(":")[-1][2:-2]
        text = text.replace(tag, "")
        text = nltk.clean_html(text)
    text = IO.replaceSpecialChars(text)

    if resource.name == "solhaber":
        # For solhaber, keep only what follows the last "," of the date and
        # the last "-" of the id.
        date = date.split(",")[-1]
        newsid = newsid.split("-")[-1]

    return NewsItem(newsid, title, date, text, resource.name, author, url)