def __init__(self, topic_info, log):
    """Run the DefaultExtractor initialiser and define ``lst_schema``.

    Args:
        topic_info: Forwarded unchanged to ``DefaultExtractor.__init__``.
        log: Forwarded unchanged to ``DefaultExtractor.__init__``.
    """
    DefaultExtractor.__init__(self, topic_info, log)

    # Field names kept in ``lst_schema``, in their original order.
    schema_fields = [
        u'inside_square',
        u'deal_type',
        u'type_building',
        u'mortgage',
        u'elevator',
        u'heat',
        u'own_year',
        u'house_type',
        u'unique',
        u'type_xiaoqu',
        u'property_right',
    ]
    self.lst_schema = schema_fields
def __init__(self, topic_info, log):
    """Run the DefaultExtractor initialiser and set up lookup tables.

    After delegating to ``DefaultExtractor.__init__`` this defines the
    period lookup maps, loads two mapping configurations through
    ``self.read_config`` and defines the currency and title tables.

    Args:
        topic_info: Forwarded unchanged to ``DefaultExtractor.__init__``.
        log: Forwarded unchanged to ``DefaultExtractor.__init__``.
    """
    DefaultExtractor.__init__(self, topic_info, log)

    # Date suffixes shared by quarter_map (as keys) and month_mode_map
    # (as values); bound once so both tables use the same strings.
    first_quarter_end = u"-03-31"
    half_year_end = u"-06-30"
    third_quarter_end = u"-09-30"
    year_end = u"-12-31"

    self.quarter_map = {
        first_quarter_end: u"一季",
        half_year_end: u"中期",
        third_quarter_end: u"三季",
        year_end: u"年度",
    }

    # Despite the "_list" suffix this attribute is a set.
    self.year_mode_list = set([u"2014", u"2015", u"2016"])

    self.month_mode_map = {
        u"1-3": first_quarter_end,
        u"1-6": half_year_end,
        u"1-9": third_quarter_end,
        u"1-12": year_end,
        u"一季度": first_quarter_end,
        u"一季": first_quarter_end,
        u"中期": half_year_end,
        u"三季度": third_quarter_end,
        u"三季": third_quarter_end,
        u"年度": year_end,
    }

    # The general mapping is loaded first, then the profit mapping.
    self.config_path = (
        self.basic_path
        + "i_entity_extractor/extractors/ssgs_caibao_profit/mapping.conf"
    )
    self.mapping_conf = self.read_config(self.config_path)

    self.profit_config_path = (
        self.basic_path
        + "i_entity_extractor/extractors/ssgs_caibao_profit/profit_mapping.conf"
    )
    self.profit_mapping_conf = self.read_config(self.profit_config_path)

    currency_names = [u'美元', u'欧元', u'港元', u'港币']
    self.money_type_list = currency_names

    self.title_map = {
        u"108": u"利润表",
        u"110": u"资产负债表",
        u"111": u"现金流量表",
        u"112": u"公司综合能力指标",
    }
def __init__(self, topic_info, log):
    """Run the DefaultExtractor initialiser and define ``info_dic``.

    Args:
        topic_info: Forwarded unchanged to ``DefaultExtractor.__init__``.
        log: Forwarded unchanged to ``DefaultExtractor.__init__``.
    """
    DefaultExtractor.__init__(self, topic_info, log)

    # Maps Chinese labels to field names; two labels share "acreage".
    label_to_field = {
        u"宗地编号": u"code",
        u"宗地总面积": u"acreage",
        u"宗地面积": u"acreage",
        u"宗地坐落": u"address",
        u"出让年限": u"land_use_year",
        u"保证金": u"margin",
        u"起始价": u"starting_price",
        u"容积率": u"volume_ratio",
    }
    self.info_dic = label_to_field
def __init__(self, topic_info, log):
    """Run the DefaultExtractor initialiser and set up lookup state.

    After delegating to ``DefaultExtractor.__init__`` this records the
    mapping configuration path, loads the mapping via
    ``self.read_config()``, compiles a regex matching runs of non-digit
    characters and defines the sector-code lookup table.

    Args:
        topic_info: Forwarded unchanged to ``DefaultExtractor.__init__``.
        log: Forwarded unchanged to ``DefaultExtractor.__init__``.
    """
    DefaultExtractor.__init__(self, topic_info, log)

    self.config_path = (
        self.basic_path
        + "i_entity_extractor/extractors/ssgs_caibao/mapping.conf"
    )
    # NOTE(review): read_config() is called without arguments; presumably
    # it picks up self.config_path - confirm against the base class.
    self.mapping_conf = self.read_config()

    # Raw string: "\D" is not a valid string escape, so the non-raw form
    # triggers an invalid-escape warning on modern Python. The compiled
    # pattern is unchanged.
    self.public_sector_regex = re.compile(r"\D+")

    self.public_sector_dict = {
        "szmb": u"深市主板",
        "szsme": u"中小企业板",
        "szcn": u"创业板",
        "shmb": u"沪市主板",
        "hkmb": u"香港主板",
        "hkgem": u"香港创业板",
    }
def __init__(self, topic_info, log):
    """Run the DefaultExtractor initialiser and load the negative words.

    After delegating to ``DefaultExtractor.__init__`` this defines the
    time-unit table, reads the negative-word configuration file (one
    entry per line) into ``negative_word_list`` and enters every
    non-empty entry into an ``esm.Index``.

    Args:
        topic_info: Forwarded unchanged to ``DefaultExtractor.__init__``.
        log: Forwarded unchanged to ``DefaultExtractor.__init__``.
    """
    DefaultExtractor.__init__(self, topic_info, log)

    # Multiplier for each time-unit label.
    self.time_map = {
        u"小时": 3600,
        u"分钟": 60,
        u"秒": 1,
    }

    negative_word_conf = (
        self.basic_path + 'i_entity_extractor/dict/negative_word.conf'
    )
    # Use a context manager so the file handle is always closed.
    with open(negative_word_conf) as conf_file:
        words = conf_file.read().split('\n')
    # A trailing newline leaves one empty final element; drop only that.
    # Unconditionally slicing off the last element would lose the final
    # word whenever the file does not end with a newline.
    if not words[-1]:
        words.pop()
    self.negative_word_list = words

    self.negative_word_index = esm.Index()
    for negative_word in self.negative_word_list:
        # Blank lines stay in negative_word_list but are not indexed.
        if negative_word:
            self.negative_word_index.enter(negative_word)
    self.negative_word_index.fix()
def __init__(self, topic_info, log):
    """Run the DefaultExtractor initialiser and compile the regexes.

    After delegating to ``DefaultExtractor.__init__`` this records the
    mapping configuration path, loads the mapping via
    ``self.read_config()``, compiles the period and name-extraction
    regexes and builds the punctuation list.

    Args:
        topic_info: Forwarded unchanged to ``DefaultExtractor.__init__``.
        log: Forwarded unchanged to ``DefaultExtractor.__init__``.
    """
    DefaultExtractor.__init__(self, topic_info, log)

    self.config_path = (
        self.basic_path + "i_entity_extractor/extractors/gsxx/mapping.conf"
    )
    self.mapping_conf = self.read_config()

    # Two date-like groups separated by arbitrary text, and a single one.
    self.period_regex = re.compile(
        u"(\d{4}.\d{1,2}.\d{1,2}).*?(\d{4}.\d{1,2}.\d{1,2})"
    )
    self.period_regex2 = re.compile(u"\d{4}.\d{1,2}.\d{1,2}")

    # Hand-picked symbols first, then every character of
    # string.punctuation appended in order.
    symbols = [
        '+', '!', '。', ',', '?', '&', '#', '@', '、', '~', '*', '……',
        '(', ')', ';',
    ]
    symbols.extend(string.punctuation)
    self.punctuation_list = symbols

    self.extract_re = re.compile(
        u'^(.{0,5}名称|企业基本信息:名称|企业\(机构\)名称|名称序号: 企业名称|变更前内容|变更后内容|【变更前内容|【变更后内容)\s{0,1}(:|:|】|\s) {0,3}([^;\:\.]+)'
    )
def __init__(self, topic_info, log):
    """Run the DefaultExtractor initialiser and define case-type maps.

    Args:
        topic_info: Forwarded unchanged to ``DefaultExtractor.__init__``.
        log: Forwarded unchanged to ``DefaultExtractor.__init__``.
    """
    DefaultExtractor.__init__(self, topic_info, log)

    # Case-type labels shared by both lookup tables.
    criminal = u"刑事案件"
    civil = u"民事案件"
    administrative = u"行政案件"
    compensation = u"赔偿案件"
    enforcement = u"执行案件"

    # Keyed by numeric code given as a string.
    self.case_type_map = {
        "1": criminal,
        "2": civil,
        "3": administrative,
        "4": compensation,
        "5": enforcement,
    }

    # Keyed by a single character; two characters map to the civil label.
    self.case_id_type_map = {
        u"刑": criminal,
        u"民": civil,
        u"商": civil,
        u"行": administrative,
        u"赔": compensation,
        u"执": enforcement,
    }
def __init__(self, topic_info, log):
    """Run the DefaultExtractor initialiser and compile money regexes.

    After delegating to ``DefaultExtractor.__init__`` this compiles four
    patterns: digit amounts with a unit, Chinese-numeral amounts, a
    keyword-prefixed alternation built from
    ``shixin_conf.money_keyword_list``, and a bare number pattern.

    Args:
        topic_info: Forwarded unchanged to ``DefaultExtractor.__init__``.
        log: Forwarded unchanged to ``DefaultExtractor.__init__``.
    """
    DefaultExtractor.__init__(self, topic_info, log)

    self.money_regex = re.compile(u'\d+\.\d+万元|\d+万元|\d+\.\d+元|\d+元')
    self.money_regex_chs = re.compile(
        u'[一二三四五六七八九十壹贰叁肆伍陆柒捌玖拾]万\S+元|[一二三四五六七八九十壹贰叁肆伍陆柒捌玖拾]千\S+元|[一二三四五六七八九十壹贰叁肆伍陆柒捌玖拾]百\S+元'
    )

    # Suffixes tried after each keyword. Order matters: regex alternation
    # takes the first branch that matches, so longer forms come first.
    amount_suffixes = ('\d+\.\d+万', '\d+万', '\d+\.\d+', '\d+')

    alternatives = [u'¥\d+\.\d+']
    for keyword in shixin_conf.money_keyword_list:
        alternatives.extend(keyword + suffix for suffix in amount_suffixes)
    self.money_regex_last = re.compile('|'.join(alternatives))

    self.money_regex3 = re.compile(u'\d+\.\d+|\d+')
def __init__(self, topic_info, log):
    """Initialise the instance by delegating to ``DefaultExtractor``.

    No additional state is set up here.

    Args:
        topic_info: Forwarded unchanged to ``DefaultExtractor.__init__``.
        log: Forwarded unchanged to ``DefaultExtractor.__init__``.
    """
    DefaultExtractor.__init__(self, topic_info, log)
def __init__(self, topic_info, log):
    """Run the DefaultExtractor initialiser and create the parser.

    Args:
        topic_info: Forwarded unchanged to ``DefaultExtractor.__init__``.
        log: Forwarded to ``DefaultExtractor.__init__`` and also handed
            to the ``CommonParser`` created here.
    """
    DefaultExtractor.__init__(self, topic_info, log)

    # self.parser_tool is not assigned in this method; it is expected to
    # exist after the base initialiser has run.
    self.parser_obj = CommonParser(self.parser_tool, log)

    separators = [',', ':', '\t']
    self.seps = separators
def __init__(self, topic_info, log):
    """Run the DefaultExtractor initialiser and define ``litiants_seps``.

    Args:
        topic_info: Forwarded unchanged to ``DefaultExtractor.__init__``.
        log: Forwarded unchanged to ``DefaultExtractor.__init__``.
    """
    DefaultExtractor.__init__(self, topic_info, log)

    # Entries are kept exactly as given, including repeated ones.
    separators = [
        ',', ':', ',', ':',
        '。', '、',
        ';', ';',
        '\t',
    ]
    self.litiants_seps = separators
def __init__(self, topic_info, log):
    """Run the DefaultExtractor initialiser and set ``src_table``.

    Args:
        topic_info: Forwarded unchanged to ``DefaultExtractor.__init__``.
        log: Forwarded unchanged to ``DefaultExtractor.__init__``.
    """
    DefaultExtractor.__init__(self, topic_info, log)

    source_table_name = 'stock_info'
    self.src_table = source_table_name
def __init__(self, topic_info, log):
    """Run the DefaultExtractor initialiser and define ``lst_schema``.

    Args:
        topic_info: Forwarded unchanged to ``DefaultExtractor.__init__``.
        log: Forwarded unchanged to ``DefaultExtractor.__init__``.
    """
    DefaultExtractor.__init__(self, topic_info, log)

    schema_fields = [
        u'building_usage',
        u'years',
        u'decoration',
    ]
    self.lst_schema = schema_fields
def __init__(self, topic_info, log):
    """Run the DefaultExtractor initialiser and load the mapping config.

    Args:
        topic_info: Forwarded unchanged to ``DefaultExtractor.__init__``.
        log: Forwarded unchanged to ``DefaultExtractor.__init__``.
    """
    DefaultExtractor.__init__(self, topic_info, log)

    # Path of the mapping file, appended directly to self.basic_path.
    mapping_file = "i_entity_extractor/extractors/annual_reports/mapping.conf"
    self.config_path = self.basic_path + mapping_file

    # config_path is assigned before read_config() runs, as originally.
    self.mapping_conf = self.read_config()
def __init__(self, topic_info, log):
    """Run the DefaultExtractor initialiser and create the area parser.

    After delegating to ``DefaultExtractor.__init__`` this defines
    ``lst_schema`` and builds a ``ProvinceParser`` from the four lookup
    objects ``province_city``, ``phone_city``, ``region_city`` and
    ``city_city``, which are defined outside this method.

    Args:
        topic_info: Forwarded unchanged to ``DefaultExtractor.__init__``.
        log: Forwarded unchanged to ``DefaultExtractor.__init__``.
    """
    DefaultExtractor.__init__(self, topic_info, log)

    schema_fields = [
        u'building_usage',
        u'years',
        u'decoration',
    ]
    self.lst_schema = schema_fields

    self.province_parser = ProvinceParser(
        province_city,
        phone_city,
        region_city,
        city_city,
    )
def __init__(self, topic_info, log):
    """Run the DefaultExtractor initialiser and define ``lst_cfg_area``.

    Args:
        topic_info: Forwarded unchanged to ``DefaultExtractor.__init__``.
        log: Forwarded unchanged to ``DefaultExtractor.__init__``.
    """
    DefaultExtractor.__init__(self, topic_info, log)

    area_levels = [
        u'province',
        u'city',
        u'county',
    ]
    self.lst_cfg_area = area_levels