def get_raw_info(html): if not isinstance(html, unicode): return '', '', '' title = ''.join(re.findall(RE_TITLE, html)).strip() # + re.findall(RE_H, html) #cc filt title from RE_TITLE title = tf.filt(title) html = re.sub(ur"(?is)</a><a", '</a> <a', html) h = re.findall(RE_H, html) for ht in h: ht = tf.remove(ht) ht = tf.filt(ht) ht = ht.strip() if ht == '': continue if title.lower().startswith(ht.lower()) and len(ht.split(' ')) > 2: title = ht break #cc compare string size, choose longer if len(title.split(' ')) < len(ht.split(' ')): title = ht break for k, v in RE_IGNORE_BLOCK.iteritems(): html = re.sub(v, '', html) for k, v in RE_NEWLINE_BLOCK.iteritems(): html = re.sub(v, '\n', html) html = re.sub(RE_MULTI_NEWLINE, '\n', html) return html_util.unescape(title), html_util.unescape(html)
def get_main_content(html, bodyHtml, webInfo): if not isinstance(html, unicode): return '', '', '', {} title = ''.join(re.findall(RE_TITLE, html)).strip() # + re.findall(RE_H, html) html = re.sub(ur"(?is)</a><a", '</a> <a', html) h = re.findall(RE_H, html) for ht in h: ht = ht.strip() if ht == '': continue if title.startswith(ht): title = ht break title = html_util.unescape(title) text = re.sub(RE_TAG, '', html) # 抽取发表时间 time = '' t_time = re.findall(RE_TIME, text) if len(t_time) > 0: time = t_time[0] date = '' t_date = re.findall(RE_DATETIME, text) if len(t_date) > 0: date = t_date[0][0] images, text = HtmlTagStrip(bodyHtml, webInfo) return title, strtotime(date, time), text, images
def get_raw_info(html): if not isinstance(html, unicode): return '','','' title = ''.join(re.findall(RE_TITLE, html))# + re.findall(RE_H, html) html = re.sub(ur"(?is)</a><a",'</a> <a',html) h = re.findall(RE_H, html) for ht in h: ht = ht.strip() if ht == '': continue if title.startswith(ht): title = ht break for k,v in RE_IGNORE_BLOCK.iteritems(): html = re.sub(v, '', html) for k,v in RE_NEWLINE_BLOCK.iteritems(): html = re.sub(v, '\n', html) html = re.sub(RE_MULTI_NEWLINE, '\n', html) return html_util.unescape(title.strip()), html_util.unescape(html)
def get_raw_info(html): if not isinstance(html, unicode): return '', '', '' title = ''.join(re.findall(RE_TITLE, html)) # + re.findall(RE_H, html) html = re.sub(ur"(?is)</a><a", '</a> <a', html) h = re.findall(RE_H, html) for ht in h: ht = ht.strip() if ht == '': continue if title.startswith(ht): title = ht break for k, v in RE_IGNORE_BLOCK.iteritems(): html = re.sub(v, '', html) for k, v in RE_NEWLINE_BLOCK.iteritems(): html = re.sub(v, '\n', html) html = re.sub(RE_MULTI_NEWLINE, '\n', html) return html_util.unescape(title), html_util.unescape(html)
def get_datetime(html, title):
    """Extract the timestamp that follows ``title`` in the page.

    ``RE_HEAD`` matches are removed from the markup, then the text is cut so
    that the search starts at the position where ``title`` was found.  The
    first ``RE_TIME`` match (with its first ``'.'`` turned into ``':'``) and
    the first element of the first ``RE_DATETIME`` match are passed to
    ``strtotime``; each defaults to ``''`` when there is no match.

    :param html: page markup.
    :param title: title text to search for.
    :return: whatever ``strtotime(date, time)`` returns.
    """
    body = re.sub(RE_HEAD, '', html)

    # Keep only the content from the title onwards.
    # NOTE(review): the offset is located in the unescaped text but applied
    # to the raw text; the two can differ when the markup before the title
    # contains escaped entities -- confirm this is intended.
    start = html_util.unescape(body).find(title)
    if start >= 0:
        body = body[start:]

    # Pull the time and the date from the remaining text.
    time_hits = re.findall(RE_TIME, body)
    clock = time_hits[0].replace('.', ':', 1) if time_hits else ''
    date_hits = re.findall(RE_DATETIME, body)
    day = date_hits[0][0] if date_hits else ''

    return strtotime(day, clock)
def supplesubtitleimages(url, html, text, title): if text == '': return text imagetext = '' try: bodytext = re.sub(RE_HEAD, '', html) #subtitle titlepos = (html_util.unescape(bodytext)).find(title) if titlepos >= 0: bodytext = bodytext[titlepos:] #print "bodytext:%s"%bodytext[:1000] #upcontent contentpos = -1 text1 = re.sub(RE_TAG, '', text) if len(text1) > 30: lastcontent = text1[len(text1) - 30:] contentpos = (html_util.unescape(bodytext)).find(lastcontent) if contentpos > 0: bodytext = bodytext[:contentpos] #domain domain = '' try: domain = get_tld(url) #print "domain:%s"%domain except Exception, e: print e pass #words keywords = ['.jpg', '.gif', '.jpeg'] filtwords = ['thumb', 'twitter', 'facebook'] images = re.findall(RE_IMG_SRC, bodytext) for image in images: #domain if image.find(domain) < 0 and image.lower().find('news') < 0 \ and image.find('intoday') < 0 and image.startswith('/') == False: continue #one keyword at least bfind = False for keyword in keywords: if image.lower().find(keyword) != -1: bfind = True break if bfind == False: continue # no filtword bfind = False for filtword in filtwords: if image.lower().find(filtword) != -1: bfind = True break if bfind: continue if image.lower().startswith('/'): if domain.find('http://') == -1: image = 'http://' + domain + image else: image = domain + image imagetext += '<img from=\"subtitle\" src=\"' + image + '\" />\n' break