def parse_author_public(content):
    try:
        # (1) "记者 xxx报道" (reporter xxx reports)
        results = pattern_1.findall(content)
        for result in results:
            result = result.strip()
            # normalize full-width spaces and enumeration commas to plain spaces
            result = result.replace(u"\u3000", u" ")
            result = result.replace(u"、", u" ")
            if result:
                if result.find(u" ") != -1:
                    return result.split(u" ")
                return [result]
        # (2) "（记者 xxx）" (reporter xxx in parentheses)
        results = pattern_2.findall(content)
        for result in results:
            result = result.strip()
            result = result.replace(u"\u3000", u" ")
            result = result.replace(u"、", u" ")
            if result:
                if result.find(u" ") != -1:
                    return result.split(u" ")
                return [result]
        # (3) "记者 xxx摄" (photo by reporter xxx)
        results = pattern_3.findall(content)
        for result in results:
            result = result.strip()
            result = result.replace(u"\u3000", u" ")
            result = result.replace(u"、", u" ")
            if result:
                if result.find(u" ") != -1:
                    return result.split(u" ")
                return [result]
        # (4) "（xxx、xxx）" (bare name list), checked only in the first and last sentence
        cur_list = content.split(u"。")
        first = cur_list[0]
        second = cur_list[len(cur_list) - 1]
        results = pattern_4.findall(first)
        for result in results:
            result = result.strip()
            result = result.replace(u"\u3000", u" ")
            result = result.replace(u"、", u" ")
            if result:
                if result.find(u" ") != -1:
                    return result.split(u" ")
                # a single bare name is accepted only if it is 2-3 characters long
                if len(result) > 1 and len(result) < 4:
                    return [result]
        results = pattern_4.findall(second)
        for result in results:
            result = result.strip()
            result = result.replace(u"\u3000", u" ")
            result = result.replace(u"、", u" ")
            if result:
                if result.find(u" ") != -1:
                    return result.split(u" ")
                if len(result) > 1 and len(result) < 4:
                    return [result]
    except:
        log_util.write_err("作者解析失败:" + exce_parse.get_exce_info(sys.exc_info()))
    return []
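# Hedged sketch of the four module-level regexes that parse_author_public relies on.
# The real pattern_1..pattern_4 are defined elsewhere in the project; the expressions
# below are illustrative guesses derived only from the comments above.
import re

pattern_1 = re.compile(u"记者([^，。）)]{2,20}?)报道")     # "记者 xxx报道"
pattern_2 = re.compile(u"[（(]记者([^）)]{2,20}?)[）)]")   # "（记者 xxx）"
pattern_3 = re.compile(u"记者([^，。）)]{2,20}?)摄")       # "记者 xxx摄"
pattern_4 = re.compile(u"[（(]([^）)]{2,20}?)[）)]")       # "（xxx、xxx）"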
def register_handlers():
    try:
        # rotate one log file per day, keeping 30 days of history per level
        sys_handler = TimedRotatingFileHandler(path + os.sep + 'sys_logs.txt',
                                               level="DEBUG", date_format='%Y-%m-%d', backup_count=30)
        sys_handler.push_application()
        info_handler = TimedRotatingFileHandler(path + os.sep + 'info_logs.txt',
                                                level="INFO", date_format='%Y-%m-%d', backup_count=30)
        info_handler.push_application()
        error_handler = TimedRotatingFileHandler(path + os.sep + 'error_logs.txt',
                                                 level="ERROR", date_format='%Y-%m-%d', backup_count=30)
        error_handler.push_application()
    except:
        write_err("handlers注册失败,请检查!:" + exce_parse.get_exce_info(sys.exc_info()))
        print "handlers注册失败,请检查!:" + exce_parse.get_exce_info(sys.exc_info())
        exit(0)
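# Hedged sketch of the write_* helpers used throughout this project (write_sys,
# write_info, write_err); their real bodies are not in this excerpt. The assumption
# here is that they are thin wrappers around a logbook Logger, which is consistent
# with the TimedRotatingFileHandler setup above.
from logbook import Logger

_logger = Logger("crawler")

def write_sys(msg):
    _logger.debug(msg)

def write_info(msg):
    _logger.info(msg)

def write_err(msg):
    _logger.error(msg)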
def parse_page(self, response, ctype, url_ctype):
    log_util.write_info("$$$$详情页解析开始:" + response.url.encode("utf-8") + "$$$$")
    print "$$$$详情页解析开始:" + response.url.encode("utf-8") + "$$$$"
    item = DemoItem()  # container for the crawled fields
    item['crawlerid'] = self.crawlerid
    item['url'] = response.url.encode('UTF-8')
    html_code = response.body
    hxs = HtmlXPathSelector(response)
    # chardet is used here to detect the page encoding, because some Sina pages
    # declare gb2312 in their HTML header while the actual encoding differs.
    try:
        encoding = Encoding.getCoding(html_code)
        item['html_code'] = html_code.decode(encoding, 'ignore')
        item['encoding'] = encoding
    except:
        log_util.write_err("编码识别出错:" + exce_parse.get_exce_info(sys.exc_info()))
    # semantic parsing: begin
    item['ctype'] = ctype
    item['subtype'] = url_ctype.split('/')[1]
    try:
        # hand off to the module-level semantic parser of the same name (not this
        # method) to fill the extracted fields; the time check below relies on it
        parse_page(item, hxs, url_ctype)
    except:
        log_util.write_err("语义解析处理出错:" + exce_parse.get_exce_info(sys.exc_info()))
    # semantic parsing: end
    # record the crawl time
    crawl_time = time.strftime(WebsiteInfo.GMT_FORMAT, time.localtime())
    # if no publish time was extracted, fall back to the crawl time
    if not item["time"]:
        item["time"] = crawl_time
    log_util.write_info("$$$$详情页解析结束:" + item['url'] + "$$$$")
    print "$$$$详情页解析结束:" + item['url'] + "$$$$"
    return item
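# A minimal sketch of what the Encoding helper referenced above might look like; the
# class and method names follow the call site (Encoding.getCoding), but the body is
# an assumption based on the chardet library mentioned in the comment, not the
# project's actual code.
import chardet

class Encoding(object):
    @staticmethod
    def getCoding(html_code):
        # chardet.detect() returns e.g. {'encoding': 'GB2312', 'confidence': 0.99, ...}
        result = chardet.detect(html_code)
        return result.get('encoding') or 'utf-8'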
def __init__(self):
    log_util.write_sys('=====================init mongodb=========================')
    log_util.write_sys('init mongodb start')
    try:
        con = MongoClient(host=config.mongodb.host, port=int(config.mongodb.port))
        # look up the database and collection by name (equivalent to con.<database>.<table>)
        db = con[config.mongodb.database]
        self.dbpool = db[config.mongodb.table]
    except:
        log_util.write_err("MongoDB连接失败,请检查相关配置:" + exce_parse.get_exce_info(sys.exc_info()))
    log_util.write_sys('init mongodb end')
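# Hedged sketch of how the pipeline above might persist items. process_item is the
# standard Scrapy pipeline hook, but the exact write call is an assumption
# (insert_one requires pymongo >= 3.0; older deployments would use insert()).
def process_item(self, item, spider):
    try:
        self.dbpool.insert_one(dict(item))
    except:
        log_util.write_err("MongoDB写入失败:" + exce_parse.get_exce_info(sys.exc_info()))
    return item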
def parse_abstract(html, hxs, url_ctype):
    try:
        abstract_rules = DetailRule.allowed_channels[url_ctype]["abstract_rules"]
        for rule_id in abstract_rules:
            xpath = abstract_rules[rule_id]["xpath"]
            try:
                abstract_str = hxs.select(xpath).extract()[0].strip()
                if abstract_str:
                    return abstract_str
            except Exception, e:
                continue
    except:
        log_util.write_err("摘要解析失败:" + exce_parse.get_exce_info(sys.exc_info()))
    return ""
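# Hedged illustration of the rule table that parse_abstract and the other parsers
# below walk through. The nesting mirrors the lookups in the code
# (allowed_channels[url_ctype][<rule group>][rule_id][...]); the channel name,
# rule ids and XPaths are placeholders, not the project's real rules.
class DetailRule(object):
    allowed_channels = {
        "news/society": {
            "abstract_rules": {
                "1": {"xpath": "//meta[@name='description']/@content"},
            },
            "source_rules": {
                "1": {"xpath": "//span[@class='source']/text()"},
            },
            "title_rules": {
                # "sep" is a space-separated list of suffix separators to cut off
                "1": {"xpath": "//title/text()", "sep": u"_ |"},
            },
            # time_rules entries carry "xpath", "temp" (time format) and "regex";
            # editor_rules entries carry "xpath" and "prefix".
        },
    }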
def predict_type(x):
    """
    Uses a trained naive Bayes model to pick one of the body-text extraction results
    produced by goose, boilerpipe, readability and the in-house extractor based on
    line-block density.
    :returns: int -- 0: use the in-house line-block-density result
                     1: use the goose result
                     2: use the boilerpipe result
                     3: use the readability result
    """
    try:
        model = joblib.load(path + os.sep + "models/nb/nb.pkl")
        predicted = model.predict(x)
        return predicted[0]
    except:
        log_util.write_err("训练好的朴素贝叶斯分类器模型没有找到,请查看位置是否正确:" +
                           exce_parse.get_exce_info(sys.exc_info()))
        exit(0)
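# Hedged sketch of how a model file like models/nb/nb.pkl could be produced. The
# training data and the exact naive Bayes variant are not shown in this excerpt, so
# GaussianNB and the arguments below are assumptions; only the joblib dump location
# mirrors what predict_type() loads.
from sklearn.naive_bayes import GaussianNB
from sklearn.externals import joblib  # on newer scikit-learn: import joblib

def train_and_save(X, y, model_path):
    # X: feature vectors built like the vector in parse_content below
    # y: labels 0-3 naming the extractor that produced the best body text
    model = GaussianNB()
    model.fit(X, y)
    joblib.dump(model, model_path)
    return model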
def parse_source(html, hxs, url_ctype):
    try:
        source_rules = DetailRule.allowed_channels[url_ctype]["source_rules"]
        for rule_id in source_rules:
            xpath = source_rules[rule_id]["xpath"]
            try:
                source_str = hxs.select(xpath).extract()[0].strip()
                if source_str:
                    # strip a leading source label, covering both colon variants
                    source_str = source_str.rpartition(u"来源：")[2]
                    source_str = source_str.rpartition(u"来源:")[2]
                    return source_str
            except Exception, e:
                continue
    except:
        log_util.write_err("来源解析失败:" + exce_parse.get_exce_info(sys.exc_info()))
    return ""
def parse_title(html, hxs, url_ctype):
    try:
        title_rules = DetailRule.allowed_channels[url_ctype]["title_rules"]
        for rule_id in title_rules:
            xpath = title_rules[rule_id]["xpath"]
            title_seps = title_rules[rule_id]["sep"].split(" ")
            try:
                title_str = hxs.select(xpath).extract()[0].strip()
                # keep only the part before each separator (e.g. site-name suffixes)
                for title_sep in title_seps:
                    title_str = title_str.partition(title_sep)[0]
                return title_str
            except Exception, e:
                continue
    except:
        log_util.write_err("标题解析失败:" + exce_parse.get_exce_info(sys.exc_info()))
    return ""
def parse_content(html, hxs, url_ctype):
    """
    Public entry point for extracting the body text of news / government web pages.
    Integrates the underlying goose / boilerpipe / readability extractors.
    :param html: raw HTML of the page to extract from
    :returns: str -- the body text of the page, or None if nothing was found
    """
    content = ""
    try:
        bs = get_bs_by_html(html)
        sourceHtml = str(bs)
        # extract the body text
        try:
            # goose extraction
            content = content_goose.get_content_by_html(sourceHtml)
            g_content, g_paraNum, g_wordLen, g_cn_segs_num, g_segs_num, g_mid_word_num = \
                content_format(content.strip())
        except:
            g_content, g_paraNum, g_wordLen, g_cn_segs_num, g_segs_num, g_mid_word_num = \
                "", 0, 0, 0, 0, 0
        # boilerpipe extraction
        content = content_boilerpipe.get_content_by_html(sourceHtml)
        b_content, b_paraNum, b_wordLen, b_cn_segs_num, b_segs_num, b_mid_word_num = \
            content_format(content.strip())
        # build the page feature vector; the readability slot currently reuses the
        # boilerpipe features and result
        x = [
            g_paraNum, g_wordLen, g_cn_segs_num, g_segs_num, g_mid_word_num,
            b_paraNum, b_wordLen, b_cn_segs_num, b_segs_num, b_mid_word_num,
            b_paraNum, b_wordLen, b_cn_segs_num, b_segs_num, b_mid_word_num
        ]
        content = get_result_content(x, g_content, b_content, b_content)
    except:
        log_util.write_err("正文解析失败:" + exce_parse.get_exce_info(sys.exc_info()))
    if not content:
        return None
    return content
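# Hedged sketch of the selection step: get_result_content() is called above but not
# defined in this excerpt. A plausible shape, given predict_type()'s documented
# return values, is to map the predicted label back to one of the candidate texts;
# the fallback choice below is an assumption.
def get_result_content(x, g_content, b_content, r_content):
    label = predict_type([x])  # the model expects a 2-D feature array
    candidates = {1: g_content, 2: b_content, 3: r_content}
    # label 0 (the in-house line-block-density result) is not computed in
    # parse_content above, so fall back to the boilerpipe text in that case
    return candidates.get(label, b_content)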
def parse_editor(html, hxs, url_ctype):
    try:
        editor_rules = DetailRule.allowed_channels[url_ctype]["editor_rules"]
        for rule_id in editor_rules:
            xpath = editor_rules[rule_id]["xpath"]
            prefix = editor_rules[rule_id]["prefix"]
            try:
                for cur_str in hxs.select(xpath.decode("utf8")).extract():
                    # remove all whitespace, then keep the text after the configured editor prefix
                    cur_str = re.sub(r"\s", "", cur_str)
                    editor_str = cur_str.strip().partition(prefix)[2]
                    if editor_str:
                        res = []
                        results = pattern.findall(editor_str)
                        for result in results:
                            res.append(result)
                        return res
            except Exception, e:
                print e
                continue
    except:
        log_util.write_err("编辑解析失败:" + exce_parse.get_exce_info(sys.exc_info()))
    return []
def parse_time(html, hxs, url_ctype):
    try:
        time_rules = DetailRule.allowed_channels[url_ctype]["time_rules"]
        for rule_id in time_rules:
            xpath = time_rules[rule_id]["xpath"]
            time_format = time_rules[rule_id]["temp"]
            regex = time_rules[rule_id]["regex"]
            try:
                time_strs = hxs.select(xpath).extract()
                for time_str in time_strs:
                    rem = re.search(regex, time_str)
                    if rem:
                        time_str = rem.group()
                        dest_time = transfer_time(time_str.strip(), time_format, WebsiteInfo.GMT_FORMAT)
                        if dest_time:
                            return dest_time
            except Exception, e:
                print e
                continue
    except:
        log_util.write_err("时间解析失败:" + exce_parse.get_exce_info(sys.exc_info()))
    return ""
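# Hedged sketch of the transfer_time() helper imported from
# xinhuanet.utils.time_transfer.transfer. Its real implementation is not in this
# excerpt; the call site above only implies that it re-formats a matched time string
# from the rule's format ("temp") into WebsiteInfo.GMT_FORMAT.
import time

def transfer_time(time_str, src_format, dest_format):
    try:
        parsed = time.strptime(time_str, src_format)
        return time.strftime(dest_format, parsed)
    except ValueError:
        return ""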
import inspect
import os
import sys

from bs4 import BeautifulSoup

from xinhuanet.utils.time_transfer.transfer import transfer_time
from xinhuanet.utils.exce_info import exce_parse

# directory this file lives in
this_file = inspect.getfile(inspect.currentframe())
path = os.path.abspath(os.path.dirname(this_file))

# read and parse the configuration file configure.xml
try:
    log_util.write_sys("读取并解析配置文件configure.xml开始")
    txt = open(path + os.sep + "configure.xml").read()
    soup = BeautifulSoup(txt, "lxml")
    log_util.write_sys("读取并解析配置文件configure.xml结束")
except:
    log_util.write_err("配置文件configure.xml不存在、或者解析出错:" +
                       exce_parse.get_exce_info(sys.exc_info()))
    exit(0)


class mongodb:
    host = soup.xml.configuration.dbs.mongodb.host.string
    port = soup.xml.configuration.dbs.mongodb.port.string
    database = soup.xml.configuration.dbs.mongodb.database.string
    table = soup.xml.configuration.dbs.mongodb.table.string
    gridfsdb = soup.xml.configuration.dbs.mongodb.gridfsdb.string


class mysql:
    host = soup.xml.configuration.dbs.mysql.host.string
    port = soup.xml.configuration.dbs.mysql.port.string
    user = soup.xml.configuration.dbs.mysql.user.string
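# Hedged illustration of the configure.xml layout the classes above expect. The tag
# hierarchy is taken from the attribute chains in the code
# (xml > configuration > dbs > mongodb/mysql > ...); all values are placeholders.
sample_xml = u"""
<xml>
  <configuration>
    <dbs>
      <mongodb>
        <host>127.0.0.1</host>
        <port>27017</port>
        <database>crawler</database>
        <table>news</table>
        <gridfsdb>htmlfiles</gridfsdb>
      </mongodb>
      <mysql>
        <host>127.0.0.1</host>
        <port>3306</port>
        <user>crawler</user>
      </mysql>
    </dbs>
  </configuration>
</xml>
"""

if __name__ == "__main__":
    demo = BeautifulSoup(sample_xml, "lxml")
    print demo.xml.configuration.dbs.mongodb.host.string  # -> 127.0.0.1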