Example #1
def parse_author_public(content):
    # Extract reporter/author names from the article text using the
    # module-level regexes pattern_1..pattern_4; returns a list of names,
    # or [] if nothing matched.
    try:
        # (1) "记者 xxx报道" (reporter xxx reports)
        results = pattern_1.findall(content)
        for result in results:
            result = result.strip()
            result = result.replace(u"\u3000", u" ")
            result = result.replace(u"、", u" ")
            if result:
                if result.find(u" ") != -1:
                    return result.split(u" ")
                return [result]
        # (2) "(记者 xxx)" (reporter name in parentheses)
        results = pattern_2.findall(content)
        for result in results:
            result = result.strip()
            result = result.replace(u"\u3000", u" ")
            result = result.replace(u"、", u" ")
            if result:
                if result.find(u" ") != -1:
                    return result.split(u" ")
                return [result]
        # (3) "记者 xxx摄" (photo by reporter xxx)
        results = pattern_3.findall(content)
        for result in results:
            result = result.strip()
            result = result.replace(u"\u3000", u" ")
            result = result.replace(u"、", u" ")
            if result:
                if result.find(u" ") != -1:
                    return result.split(u" ")
                return [result]
        # (4) "(xxx、xxx)" (bare name list in parentheses), checked only in
        # the first and last sentence
        cur_list = content.split(u"。")
        first = cur_list[0]
        second = cur_list[-1]
        results = pattern_4.findall(first)
        for result in results:
            result = result.strip()
            result = result.replace(u"\u3000", u" ")
            result = result.replace(u"、", u" ")
            if result:
                if result.find(u" ") != -1:
                    return result.split(u" ")
                if 1 < len(result) < 4:
                    return [result]
        results = pattern_4.findall(second)
        for result in results:
            result = result.strip()
            result = result.replace(u"\u3000", u" ")
            result = result.replace(u"、", u" ")
            if result:
                if result.find(u" ") != -1:
                    return result.split(u" ")
                if 1 < len(result) < 4:
                    return [result]
    except:
        log_util.write_err("Author parsing failed: " + exce_parse.get_exce_info(sys.exc_info()))
    return []
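pattern_1 through pattern_4 are module-level regexes defined elsewhere in the repository and are not shown in this example. As a purely illustrative guess at what they might match, based on the numbered comments above:

# Illustrative assumptions only -- the real patterns live elsewhere in the repo.
import re

pattern_1 = re.compile(u"记者([^报]{1,20})报道")       # (1) 记者 xxx报道
pattern_2 = re.compile(u"[((]记者([^))]{1,20})[))]")  # (2) (记者 xxx)
pattern_3 = re.compile(u"记者([^摄]{1,20})摄")         # (3) 记者 xxx摄
pattern_4 = re.compile(u"[((]([^))]{1,10})[))]")      # (4) (xxx、xxx)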
Example #2
def register_handlers():
    try:
        # Three logbook handlers, one file per level, rotated daily and kept
        # for 30 days.
        sys_handler = TimedRotatingFileHandler(
            path + os.sep + 'sys_logs.txt',
            level="DEBUG", date_format='%Y-%m-%d', backup_count=30)
        sys_handler.push_application()

        info_handler = TimedRotatingFileHandler(
            path + os.sep + 'info_logs.txt',
            level="INFO", date_format='%Y-%m-%d', backup_count=30)
        info_handler.push_application()

        error_handler = TimedRotatingFileHandler(
            path + os.sep + 'error_logs.txt',
            level="ERROR", date_format='%Y-%m-%d', backup_count=30)
        error_handler.push_application()
    except:
        write_err("Handler registration failed, please check: " +
                  exce_parse.get_exce_info(sys.exc_info()))
        print "Handler registration failed, please check: " + \
            exce_parse.get_exce_info(sys.exc_info())
        exit(0)
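For context, the handlers above come from the logbook library, and path is the log directory resolved elsewhere in the module. A minimal usage sketch, assuming those pieces are in place; with logbook's default bubble=False, each record should land in the first file (from the top of the handler stack) whose level it satisfies:

from logbook import Logger, TimedRotatingFileHandler

register_handlers()
log = Logger("crawler")
log.debug("should end up in sys_logs.txt")
log.info("should end up in info_logs.txt")
log.error("should end up in error_logs.txt")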
Example #3
    def parse_page(self, response, ctype, url_ctype):
        log_util.write_info("$$$$Detail-page parsing started: " +
                            response.url.encode("utf-8") + "$$$$")
        print "$$$$Detail-page parsing started: " + response.url.encode("utf-8") + "$$$$"
        item = DemoItem()  # container for the fields scraped from this page
        item['crawlerid'] = self.crawlerid

        item['url'] = response.url.encode('UTF-8')

        html_code = response.body
        hxs = HtmlXPathSelector(response)

        # chardet is used here because some Sina pages declare gb2312 in the
        # HTML header but are not actually encoded that way (a sketch of the
        # assumed Encoding helper follows this example)
        try:
            encoding = Encoding.getCoding(html_code)
            item['html_code'] = html_code.decode(encoding, 'ignore')
            item['encoding'] = encoding
        except:
            log_util.write_err("编码识别出错:" +
                               exce_parse.get_exce_info(sys.exc_info()))

        # semantic parsing begin
        item['ctype'] = ctype
        item['subtype'] = url_ctype.split('/')[1]
        try:
            parse_page(item, hxs, url_ctype)
        except:
            log_util.write_err("语义解析处理出错:" +
                               exce_parse.get_exce_info(sys.exc_info()))
        # 语义解析处理end

        # record the crawl time
        crawl_time = time.strftime(WebsiteInfo.GMT_FORMAT, time.localtime())

        # if no publication time was extracted, fall back to the crawl time
        if not item.get("time"):
            item["time"] = crawl_time

        log_util.write_info("$$$$详情页解析结束:" + item['url'] + "$$$$")
        print "$$$$详情页解析结束:" + item['url'] + "$$$$"

        return item
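Encoding.getCoding is not part of this excerpt. Judging from the comment about chardet, it presumably wraps chardet.detect, roughly like this sketch (the class and method names are taken from the call above, the body is assumed):

import chardet

class Encoding(object):
    @staticmethod
    def getCoding(raw_html):
        # Prefer the statistically detected encoding over the charset declared
        # in the HTML, since the declaration is sometimes wrong.
        guess = chardet.detect(raw_html)
        return guess.get("encoding") or "utf-8"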
Example #4
    def __init__(self):
        log_util.write_sys(
            '=====================init mongodb=========================')
        log_util.write_sys('init mongodb start')
        try:
            con = MongoClient(host=config.mongodb.host,
                              port=int(config.mongodb.port))
            # look up the database and collection by name rather than
            # building attribute-access strings for exec()
            db = con[config.mongodb.database]
            self.dbpool = db[config.mongodb.table]
        except:
            log_util.write_err("MongoDB connection failed, please check the configuration: " +
                               exce_parse.get_exce_info(sys.exc_info()))
        log_util.write_sys('init mongodb end')
Example #5
def parse_abstract(html, hxs, url_ctype):
    try:
        abstract_rules = DetailRule.allowed_channels[url_ctype][
            "abstract_rules"]
        for rule_id in abstract_rules:
            xpath = abstract_rules[rule_id]["xpath"]
            try:
                abstract_str = hxs.select(xpath).extract()[0].strip()
                if abstract_str:
                    return abstract_str
            except Exception, e:
                continue
    except:
        log_util.write_err("摘要解析失败:" +
                           exce_parse.get_exce_info(sys.exc_info()))
    return ""
Example #6
def predict_type(x):
    """
        利用朴素贝叶斯模型用于选择goose、boilerpipe、readability以及自主开发的
        基于行块密度的正文抽取算法的正文抽取结果中的一种。
        :returns:int— — 0表示选择自主实现的基于行块密度的抽取结果
                        1表示选择goose抽取结果
                        2表示选择boilerpipe抽取结果
                        3表示选择readability抽取结果
    """
    try:
        model = joblib.load(path + os.sep + "models/nb/nb.pkl")
        predicted = model.predict(x)
        return predicted[0]
    except:
        log_util.write_err("训练好的朴素贝叶斯分类器模型没有找到,请查看位置是否正确:"\
        +exce_parse.get_exce_info(sys.exc_info()))
        exit(0)
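A minimal usage sketch, assuming the same 15-dimensional feature vector that parse_content (Example #9) builds; the numbers below are made up, and recent scikit-learn versions may require wrapping the vector in a 2-D array before calling predict:

x = [12, 2400, 2300, 2600, 180,   # goose: paragraphs, text length, cn segments, segments, mid-length words
     10, 2100, 2050, 2300, 150,   # boilerpipe features
     10, 2100, 2050, 2300, 150]   # third block (boilerpipe repeated, as in parse_content)
choice = predict_type(x)          # 0=line-block-density, 1=goose, 2=boilerpipe, 3=readability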
Example #7
def parse_source(html, hxs, url_ctype):
    try:
        source_rules = DetailRule.allowed_channels[url_ctype]["source_rules"]
        for rule_id in source_rules:
            xpath = source_rules[rule_id]["xpath"]
            try:
                source_str = hxs.select(xpath).extract()[0].strip()
                if source_str:
                    source_str = source_str.rpartition(u"来源:")[2]
                    source_str = source_str.rpartition(u"来源:")[2]
                    return source_str
            except Exception, e:
                continue
    except:
        log_util.write_err("来源解析失败:" +
                           exce_parse.get_exce_info(sys.exc_info()))
    return ""
Example #8
def parse_title(html, hxs, url_ctype):

    try:
        title_rules = DetailRule.allowed_channels[url_ctype]["title_rules"]
        for rule_id in title_rules:
            xpath = title_rules[rule_id]["xpath"]
            title_seps = title_rules[rule_id]["sep"].split(" ")
            try:
                title_str = hxs.select(xpath).extract()[0].strip()
                for title_sep in title_seps:
                    # keep only the part of the title before the separator
                    title_str = title_str.partition(title_sep)[0]
                return title_str
            except Exception, e:
                continue
    except:
        log_util.write_err("标题解析失败:" +
                           exce_parse.get_exce_info(sys.exc_info()))
    return ""
Example #9
def parse_content(html, hxs, url_ctype):
    """
        新闻、政府网站类网页正文抽取的对外接口函数
        三种抽取正文方式(goose/boilerpipe/readability)底层实现的集成调用接口
        :param sourceHtml:待抽取网页源码
        :returns:str— —网页中的正文信息,若未找到则返回""
    """
    try:
        bs = get_bs_by_html(html)
        sourceHtml = str(bs)

        # extract the body text
        try:
            # extraction via goose
            content = content_goose.get_content_by_html(sourceHtml)
            g_content, g_paraNum, g_wordLen, g_cn_segs_num, g_segs_num, g_mid_word_num = \
                content_format(content.strip())
        except:
            g_content, g_paraNum, g_wordLen, g_cn_segs_num, g_segs_num, g_mid_word_num = \
                "", 0, 0, 0, 0, 0

        # extraction via boilerpipe
        content = content_boilerpipe.get_content_by_html(sourceHtml)
        b_content, b_paraNum, b_wordLen, b_cn_segs_num, b_segs_num, b_mid_word_num = \
            content_format(content.strip())

        # build the page feature vector (the boilerpipe features are repeated
        # in the third block, where the docstring would suggest readability
        # features)
        x = [
            g_paraNum, g_wordLen, g_cn_segs_num, g_segs_num, g_mid_word_num,
            b_paraNum, b_wordLen, b_cn_segs_num, b_segs_num, b_mid_word_num,
            b_paraNum, b_wordLen, b_cn_segs_num, b_segs_num, b_mid_word_num
        ]
        content = get_result_content(x, g_content, b_content, b_content)
    except:
        log_util.write_err("正文解析失败:" +
                           exce_parse.get_exce_info(sys.exc_info()))

    if not content:
        return None
    return content
Example #10
def parse_editor(html, hxs, url_ctype):
    try:
        editor_rules = DetailRule.allowed_channels[url_ctype]["editor_rules"]
        for rule_id in editor_rules:
            xpath = editor_rules[rule_id]["xpath"]
            prefix = editor_rules[rule_id]["prefix"]
            try:
                for cur_str in hxs.select(xpath.decode("utf8")).extract():
                    # drop all whitespace before looking for the prefix
                    cur_str = u"".join(cur_str.split())
                    editor_str = cur_str.strip().partition(prefix)[2]
                    if editor_str:
                        res = []
                        results = pattern.findall(editor_str)
                        for result in results:
                            res.append([result])
                        return res
            except Exception, e:
                print e
                continue
    except:
        log_util.write_err("编辑解析失败:"+exce_parse.get_exce_info(sys.exc_info()))
    return []
Example #11
def parse_time(html, hxs, url_ctype):
    try:
        time_rules = DetailRule.allowed_channels[url_ctype]["time_rules"]
        for rule_id in time_rules:
            xpath = time_rules[rule_id]["xpath"]
            time_format = time_rules[rule_id]["temp"]
            regex = time_rules[rule_id]["regex"]
            try:
                time_strs = hxs.select(xpath).extract()
                for time_str in time_strs:
                    rem = re.search(regex, time_str)
                    if rem:
                        time_str = rem.group()
                        # convert the matched time string into GMT format
                        dest_time = transfer_time(time_str.strip(), time_format,
                                                  WebsiteInfo.GMT_FORMAT)
                        if dest_time:
                            return dest_time
            except Exception, e:
                print e
                continue
    except:
        log_util.write_err("Time parsing failed: " + exce_parse.get_exce_info(sys.exc_info()))
    return ""
Example #12
from xinhuanet.utils.time_transfer.transfer import transfer_time
from xinhuanet.utils.exce_info import exce_parse

# get the directory containing this file
this_file = inspect.getfile(inspect.currentframe())
path = os.path.abspath(os.path.dirname(this_file))

# read the configuration file configure.xml
try:
    log_util.write_sys("Reading and parsing configure.xml started")
    txt = open(path + os.sep + "configure.xml").read()
    soup = BeautifulSoup(txt, "lxml")
    log_util.write_sys("Reading and parsing configure.xml finished")
except:
    log_util.write_err("configure.xml is missing or could not be parsed: " +
                       exce_parse.get_exce_info(sys.exc_info()))
    exit(0)


class mongodb:
    host = soup.xml.configuration.dbs.mongodb.host.string
    port = soup.xml.configuration.dbs.mongodb.port.string
    database = soup.xml.configuration.dbs.mongodb.database.string
    table = soup.xml.configuration.dbs.mongodb.table.string
    gridfsdb = soup.xml.configuration.dbs.mongodb.gridfsdb.string


class mysql:
    host = soup.xml.configuration.dbs.mysql.host.string
    port = soup.xml.configuration.dbs.mysql.port.string
    user = soup.xml.configuration.dbs.mysql.user.string