def __init__(self, config, dataset, feature_extractor):
    """Set up training state: dataset, model (fresh or warm-started), optimizer,
    and the tag-index -> chunk-tag mapping used for chunk-level evaluation."""
    self.config = config
    self.X = dataset
    self.n_feature = dataset.n_feature  # total number of features
    self.n_tag = dataset.n_tag          # total number of tags (5 in the full scheme)

    if config.init_model is None:
        self.model = Model(self.n_feature, self.n_tag)
    else:
        # Warm start: load a previously trained model and grow its weight
        # matrices to cover any newly-seen features/tags.
        self.model = Model.load(config.init_model)
        self.model.expand(self.n_feature, self.n_tag)

    self.optim = self._get_optimizer(dataset, self.model)
    self.feature_extractor = feature_extractor

    # Collapse fine-grained tags into chunk-level labels, e.g.
    # {'B': 0, 'B_single': 1, 'I': 2, 'I_end': 3, 'I_first': 4}
    # becomes {0: 'B', 1: 'B_single', 2: 'I', 3: 'I', 4: 'I'}:
    # every I_* variant maps to plain "I", every O* variant to plain "O".
    self.idx_to_chunk_tag = {}
    for raw_tag, tag_idx in feature_extractor.tag_to_idx.items():
        chunk_tag = raw_tag
        if chunk_tag.startswith("I"):
            chunk_tag = "I"
        if chunk_tag.startswith("O"):
            chunk_tag = "O"
        self.idx_to_chunk_tag[tag_idx] = chunk_tag
def __init__(self, config, dataset, feature_extractor):
    """Initialize trainer state: bind the dataset, build or warm-start the
    model, create the optimizer, and derive chunk-level tag labels."""
    self.config = config
    self.X = dataset
    self.n_feature = dataset.n_feature
    self.n_tag = dataset.n_tag

    if config.init_model is not None:
        # Resume from an existing model, expanding it to the current
        # feature/tag space.
        self.model = Model.load(config.init_model)
        self.model.expand(self.n_feature, self.n_tag)
    else:
        self.model = Model(self.n_feature, self.n_tag)

    self.optim = self._get_optimizer(dataset, self.model)
    self.feature_extractor = feature_extractor

    def _to_chunk(tag):
        # Fold fine-grained tags onto chunk labels: any I_* -> "I", any O* -> "O";
        # everything else (B, B_single, ...) passes through unchanged.
        if tag.startswith("I"):
            return "I"
        if tag.startswith("O"):
            return "O"
        return tag

    self.idx_to_chunk_tag = {
        idx: _to_chunk(tag)
        for tag, idx in feature_extractor.tag_to_idx.items()
    }
class Trainer:
    """Trains a CRF-style segmentation model and evaluates it with
    token-accuracy, string-accuracy, or chunk F1 metrics."""

    def __init__(self, config, dataset, feature_extractor):
        """Bind dataset, build or warm-start the model, create the optimizer,
        and precompute the tag-index -> chunk-tag map used by F1 scoring."""
        self.config = config
        self.X = dataset
        self.n_feature = dataset.n_feature  # total number of features
        self.n_tag = dataset.n_tag          # total number of tags (5 in the full scheme)
        if config.init_model is None:
            self.model = Model(self.n_feature, self.n_tag)
        else:
            # Warm start: load an existing model and grow its weights to
            # cover any newly-seen features/tags.
            self.model = Model.load(config.init_model)
            self.model.expand(self.n_feature, self.n_tag)
        self.optim = self._get_optimizer(dataset, self.model)
        self.feature_extractor = feature_extractor
        # Collapse fine-grained tags into chunk labels, e.g.
        # tag_to_idx {'B': 0, 'B_single': 1, 'I': 2, 'I_end': 3, 'I_first': 4}
        # yields {0: 'B', 1: 'B_single', 2: 'I', 3: 'I', 4: 'I'}.
        self.idx_to_chunk_tag = {}
        for tag, idx in feature_extractor.tag_to_idx.items():
            if tag.startswith("I"):  # ['I', 'I_end', 'I_first'] -> 'I'
                tag = "I"
            if tag.startswith("O"):
                tag = "O"
            self.idx_to_chunk_tag[idx] = tag

    def _get_optimizer(self, dataset, model):
        """Return the optimizer named in config.modelOptimizer (only ADF supported)."""
        config = self.config
        if "adf" in config.modelOptimizer:  # e.g. modelOptimizer == "crf.adf"
            return ADF(config, dataset, model)
        raise ValueError("Invalid Optimizer")

    def train_epoch(self):
        """Run one optimization pass; returns whatever the optimizer reports."""
        return self.optim.optimize()

    def test(self, testset, iteration):
        """Decode `testset`, score it with the configured metric, write the
        predictions to the per-iteration output file, and return the scores.

        NOTE(review): this uses the module-level `config`, not `self.config` —
        presumably they are the same object; confirm before changing."""
        outfile = os.path.join(config.outDir, config.fOutput.format(iteration))
        func_mapping = {
            "tok.acc": self._decode_tokAcc,
            "str.acc": self._decode_strAcc,
            "f1": self._decode_fscore,
        }
        with open(outfile, "w", encoding="utf8") as writer:
            score_list = func_mapping[config.evalMetric](testset, self.model, writer)
        # Drop predictions so they cannot leak into the next evaluation.
        for example in testset:
            example.predicted_tags = None
        return score_list

    def _decode(self, testset: DataSet, model: Model):
        """Fill example.predicted_tags for every example, serially or in parallel."""
        if config.nThread == 1:
            self._decode_single(testset, model)
        else:
            self._decode_multi_proc(testset, model)

    def _decode_single(self, testset: DataSet, model: Model):
        """Viterbi-decode every example in the current process."""
        for example in testset:
            _, tags = _inf.decodeViterbi_fast(example.features, model)
            example.predicted_tags = tags

    @staticmethod
    def _decode_proc(model, in_queue, out_queue):
        """Worker loop: decode (idx, features) items until a None sentinel arrives."""
        while True:
            item = in_queue.get()
            if item is None:
                return
            idx, features = item
            _, tags = _inf.decodeViterbi_fast(features, model)
            out_queue.put((idx, tags))

    def _decode_multi_proc(self, testset: DataSet, model: Model):
        """Fan decoding out to config.nThread worker processes."""
        in_queue = Queue()
        out_queue = Queue()
        procs = []
        nthread = self.config.nThread
        for i in range(nthread):
            p = Process(
                target=self._decode_proc, args=(model, in_queue, out_queue)
            )
            procs.append(p)
        for idx, example in enumerate(testset):
            in_queue.put((idx, example.features))
        # One sentinel per worker so each one terminates exactly once.
        for proc in procs:
            in_queue.put(None)
            proc.start()
        for _ in range(len(testset)):
            idx, tags = out_queue.get()
            testset[idx].predicted_tags = tags
        for p in procs:
            p.join()

    def _decode_tokAcc(self, dataset, model, writer):
        """Token-level accuracy: per-tag precision/recall/f-score plus overall.

        Returns a single-element list [overall_fscore]."""
        config = self.config
        self._decode(dataset, model)
        n_tag = model.n_tag
        all_correct = [0] * n_tag
        all_pred = [0] * n_tag
        all_gold = [0] * n_tag
        for example in dataset:
            pred = example.predicted_tags
            gold = example.tags
            if writer is not None:
                writer.write(",".join(map(str, pred)))
                writer.write("\n")
            for pred_tag, gold_tag in zip(pred, gold):
                all_pred[pred_tag] += 1
                all_gold[gold_tag] += 1
                if pred_tag == gold_tag:
                    all_correct[gold_tag] += 1
        config.swLog.write(
            "% tag-type #gold #output #correct-output token-precision token-recall token-f-score\n"
        )
        sumGold = 0
        sumOutput = 0
        sumCorrOutput = 0
        for i, (correct, gold, pred) in enumerate(
            zip(all_correct, all_gold, all_pred)
        ):
            sumGold += gold
            sumOutput += pred
            sumCorrOutput += correct
            if gold == 0:
                rec = 0
            else:
                rec = correct * 100.0 / gold
            if pred == 0:
                prec = 0
            else:
                prec = correct * 100.0 / pred
            # BUG FIX: the per-tag f-score used to be computed inline without a
            # zero guard; a tag with prec == rec == 0 (never predicted, never
            # correct) raised ZeroDivisionError. Guard it like the overall score.
            if prec == 0 and rec == 0:
                tag_fscore = 0
            else:
                tag_fscore = 2 * prec * rec / (prec + rec)
            config.swLog.write(
                "% {}: {} {} {} {:.2f} {:.2f} {:.2f}\n".format(
                    i, gold, pred, correct, prec, rec, tag_fscore
                )
            )
        if sumGold == 0:
            rec = 0
        else:
            rec = sumCorrOutput * 100.0 / sumGold
        if sumOutput == 0:
            prec = 0
        else:
            prec = sumCorrOutput * 100.0 / sumOutput
        if prec == 0 and rec == 0:
            fscore = 0
        else:
            fscore = 2 * prec * rec / (prec + rec)
        config.swLog.write(
            "% overall-tags: {} {} {} {:.2f} {:.2f} {:.2f}\n".format(
                sumGold, sumOutput, sumCorrOutput, prec, rec, fscore
            )
        )
        config.swLog.flush()
        return [fscore]

    def _decode_strAcc(self, dataset, model, writer):
        """Whole-sequence accuracy: a sequence counts only if every tag matches.

        Returns a single-element list [accuracy_percent]."""
        config = self.config
        self._decode(dataset, model)
        correct = 0
        total = len(dataset)
        for example in dataset:
            pred = example.predicted_tags
            gold = example.tags
            if writer is not None:
                writer.write(",".join(map(str, pred)))
                writer.write("\n")
            # for/else: the else runs only when no tag mismatched.
            for pred_tag, gold_tag in zip(pred, gold):
                if pred_tag != gold_tag:
                    break
            else:
                correct += 1
        acc = correct / total * 100.0
        config.swLog.write(
            "total-tag-strings={} correct-tag-strings={} string-accuracy={}%"
            .format(total, correct, acc)
        )
        return [acc]

    def _decode_fscore(self, dataset, model, writer):
        """Chunk-level F1 via getFscore on comma-joined tag-index strings.

        Returns getFscore's score list ([f, precision, recall])."""
        config = self.config
        self._decode(dataset, model)
        gold_tags = []
        pred_tags = []
        for example in dataset:
            pred = example.predicted_tags
            gold = example.tags
            pred_str = ",".join(map(str, pred))
            pred_tags.append(pred_str)
            if writer is not None:
                writer.write(pred_str)
                writer.write("\n")
            gold_tags.append(",".join(map(str, gold)))
        scoreList, infoList = getFscore(gold_tags, pred_tags, self.idx_to_chunk_tag)
        config.swLog.write(
            "#gold-chunk={} #output-chunk={} #correct-output-chunk={} precision={:.2f} recall={:.2f} f-score={:.2f}\n"
            .format(
                infoList[0],
                infoList[1],
                infoList[2],
                scoreList[1],
                scoreList[2],
                scoreList[0],
            )
        )
        return scoreList
def __init__(self, model_name="default", user_dict="default", postag=False):
    """Load the segmentation model, the user dictionary, and (optionally)
    the POS-tagging model.

    model_name: "default" (bundled), a downloadable named model, or a path.
    user_dict:  None for no dictionary, a named built-in, or a file path.
    postag:     when True, also download/load the POS tagger.
    """
    self.postag = postag
    # Resolve the model directory: bundled default, a known downloadable
    # model (fetched into pkuseg_home), or a user-supplied path.
    if model_name in ["default"]:
        config.modelDir = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            "models",
            model_name,
        )
    elif model_name in config.available_models:
        config.modelDir = os.path.join(
            config.pkuseg_home,
            model_name,
        )
        download_model(
            config.model_urls[model_name],
            config.pkuseg_home,
            config.model_hash[model_name],
        )
    else:
        config.modelDir = model_name
    # Resolve dictionaries: file_name is the user's own dictionary file (if
    # any); other_names are bundled/model-specific pickled dictionaries.
    if user_dict is None:
        file_name = None
        other_names = None
    else:
        if user_dict not in config.available_models:
            file_name = user_dict
        else:
            file_name = None
        default_name = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            "dicts",
            "default.pkl",
        )
        if model_name in config.models_with_dict:
            # Model ships its own dictionary; consult it before the default.
            other_name = os.path.join(
                config.pkuseg_home,
                model_name,
                model_name + "_dict.pkl",
            )
            other_names = [other_name, default_name]
        else:
            other_names = [default_name]
    self.preprocesser = Preprocesser(file_name)
    self.postprocesser = Postprocesser(None, other_names)
    self.feature_extractor = FeatureExtractor.load()
    self.model = Model.load()
    self.idx_to_tag = {
        idx: tag for tag, idx in self.feature_extractor.tag_to_idx.items()
    }
    self.n_feature = len(self.feature_extractor.feature_to_idx)
    self.n_tag = len(self.feature_extractor.tag_to_idx)
    if postag:
        # BUG FIX: the hash must be the postag model's own hash; it was
        # previously checked against config.model_hash[model_name] (the
        # segmentation model), which mismatches the downloaded file.
        download_model(
            config.model_urls["postag"],
            config.pkuseg_home,
            config.model_hash["postag"],
        )
        postag_dir = os.path.join(
            config.pkuseg_home,
            "postag",
        )
        self.tagger = Postag(postag_dir)
def __init__(self, model_name="default", user_dict="default"):
    """Load the segmentation model and resolve the user dictionary."""
    # Model directory: the bundled default, or a caller-supplied path.
    if model_name in ["default"]:
        config.modelDir = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            "models",
            model_name,
        )
    else:
        config.modelDir = model_name
    # Dictionary resolution: with "default", use only the bundled
    # post-processing dictionary; otherwise treat user_dict as a file path.
    if user_dict == "default":
        user_dict_path = None
        builtin_dict_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            "dicts",
            "default.txt",
        )
    else:
        user_dict_path = user_dict
        builtin_dict_path = None
    # NOTE(review): the preprocesser is deliberately given an empty word
    # list here; user_dict_path is consumed only by the postprocesser.
    self.preprocesser = Preprocesser([])
    self.postprocesser = Postprocesser(user_dict_path, builtin_dict_path)
    self.feature_extractor = FeatureExtractor.load()
    self.model = Model.load()
    self.idx_to_tag = {
        idx: tag for tag, idx in self.feature_extractor.tag_to_idx.items()
    }
    self.n_feature = len(self.feature_extractor.feature_to_idx)
    self.n_tag = len(self.feature_extractor.tag_to_idx)