def load_dict(self, dict_name, dict_file_path, sep='\t'):
    """Load a dictionary from file into the dict manager (cached by name).

    Each line is either a single token (mapped to its 1-based line index)
    or a ``token<sep>value`` pair whose value is parsed with
    ``ast.literal_eval``. Lines of the form ``[...]`` are skipped.

    Args:
        dict_name: cache key inside ``self.dict_manager``.
        dict_file_path: path of the dictionary file to read.
        sep: field separator, defaults to tab.

    Returns:
        The loaded (or previously cached) dict object.

    Raises:
        NotImplementedError: if a line has more than two fields.
    """
    if dict_name not in self.dict_manager:
        dict_object = {}
        print('load dict from file {}'.format(dict_file_path))
        with utils.create_read_file(dict_file_path) as f_dict:
            for idx, line in enumerate(f_dict):
                line = line.strip()
                # BUG FIX: the bracket check must run after strip(); the raw
                # line ends with '\n', so line[-1] == ']' could never match
                if line and line[0] == '[' and line[-1] == ']':
                    continue
                fields = line.split(sep)
                if len(fields) == 1:
                    # single token -> 1-based line index
                    dict_object[fields[0]] = idx + 1
                elif len(fields) == 2:
                    # NOT eval: value is a plain literal, parsed safely
                    dict_object[fields[0]] = ast.literal_eval(fields[1])
                else:
                    raise NotImplementedError
        self.dict_manager[dict_name] = dict_object
    return self.dict_manager[dict_name]
def train(self, train_instances, train_file, out_list=None):
    """Extract features, train the classifier, and predict on the train set.

    Args:
        train_instances: list of instance objects used for training.
        train_file: raw training file path (used to name feature/output files).
        out_list: optional collection of feature-line indices to exclude from
            training; only meaningful once all features have been made.

    Returns:
        The trained classifier.
    """
    # 1. extract features
    self.make_feature_file(train_instances, train_file)

    if out_list:
        exclude = set(out_list)  # O(1) membership test in the filter below
        lines = utils.create_read_file(self.train_feature_file).readlines()
        kept = [line.strip() for idx, line in enumerate(lines)
                if idx not in exclude]
        f_filter = utils.create_write_file(self.train_feature_file)
        print('\n'.join(kept), file=f_filter)
        f_filter.close()
        # BUG FIX: was print('... %d', len(dev)) which printed the raw tuple
        print('finish filter, train examples %d' % len(kept))

    # 2. train classifier
    self.classifier.train_model(self.train_feature_file, self.model_file)

    # 3. predict answers on the training data itself
    self.output_file = self.get_output_file(train_file)
    predict_label = self.classifier.test_model(self.train_feature_file,
                                               self.model_file,
                                               self.output_file)
    f_out = utils.create_write_file(self.output_file)
    # do not shadow the train_instances parameter inside the loop
    for label, instance in zip(predict_label, train_instances):
        print('%.2f\t#\t%s' % (label, instance.get_instance_string()),
              file=f_out)
    f_out.close()
    return self.classifier
def extract_information(self, train_instances):
    """Build (training) or load (inference) the unigram idf dictionary."""
    idf_path = config.RESOURCE_DIR + '/idf_dict.txt'
    if self.is_training:
        # collect warrant0/warrant1/reason/claim word sequences; title and
        # info are not used for idf statistics
        sents = []
        for instance in train_instances:
            warrant0, warrant1, reason, claim, _title, _info = instance.get_six(
                type='word')
            sents.extend([warrant0, warrant1, reason, claim])
        idf_dict = utils.idf_calculator(sents)
        with utils.create_write_file(idf_path) as fw:
            for key, value in idf_dict.items():
                print('{}\t{}'.format(key, value), file=fw)
        print(len(idf_dict))
    else:
        idf_dict = {}
        with utils.create_read_file(idf_path) as fr:
            for raw in fr:
                parts = raw.strip().split('\t')
                idf_dict[parts[0]] = float(parts[1])
    self.unigram_dict = idf_dict
def load_dict(self, dict_name, path=config.DICT_DIR):
    """Load ``dict_<name>.txt`` from the resources directory (cached).

    Args:
        dict_name: basename suffix of the dict file and the cache key.
        path: nominal dict dir (config.DICT_DIR / config.DICT_EX_DIR).
            NOTE(review): the body unconditionally overwrites this with the
            '../resources' dir next to this module, so the argument is
            effectively ignored — kept as-is to preserve behavior.

    Returns:
        The loaded (or previously cached) dict object.

    Raises:
        NotImplementedError: if a line has more than two fields.
    """
    if dict_name not in self.dict_manager:
        dict_object = {}
        cur_dir = os.path.dirname(__file__)
        path = os.path.join(cur_dir, '../resources')
        file_name = path + '/dict_%s.txt' % dict_name
        print('load dict from file %s \n' % file_name)
        with utils.create_read_file(file_name) as f_dict:
            for idx, line in enumerate(f_dict):
                fields = line.strip().split('\t')
                if len(fields) == 1:
                    # single token -> 1-based line index
                    dict_object[fields[0]] = idx + 1
                elif len(fields) == 2:
                    # BUG FIX: ast.literal_eval instead of eval — values are
                    # plain literals (the sibling load_dict already does this)
                    # and eval would execute arbitrary file content
                    dict_object[fields[0]] = ast.literal_eval(fields[1])
                else:
                    raise NotImplementedError
        self.dict_manager[dict_name] = dict_object
    return self.dict_manager[dict_name]
def load_STS(train_file):
    """Read an STS file: tab-separated, score in column 4, sentences in 5/6.

    Returns:
        list of (sentence_a, sentence_b, score) tuples.
    """
    data = []
    with utils.create_read_file(train_file) as f:
        for raw in f:
            fields = raw.strip().split('\t')
            data.append((fields[5], fields[6], float(fields[4])))
    return data
def load_model_score(self, train_file):
    """Read predicted labels back from the output file for *train_file*.

    Returns:
        list of strings ``'1:<label index>'`` in libsvm-like feature form.
    """
    self.output_file = self.get_output_file(train_file)
    # close the file handle (it was previously left open)
    with utils.create_read_file(self.output_file) as f:
        y_pred = f.readlines()
    return [
        '1:{}'.format(DICT_LABEL_TO_INDEX[x.strip().split("\t#\t")[0]])
        for x in y_pred
    ]
def extract_instances(self, train_instances):
    """Compute one idf-weighted alignment-coverage feature per instance.

    Reuses the word alignments written by AlignmentFeature and weights each
    aligned token by its idf score (OOV words get the minimum idf weight).

    Returns:
        (features, infos): features is ``[[coverage]]`` per instance; infos
        carries the weighted aligned/total sums for both sentences.
    """
    self.extract_information(train_instances)
    features, infos = [], []
    process_bar = pyprind.ProgPercent(len(train_instances))

    # reuse the alignments produced by the plain AlignmentFeature run
    alignment_feature_file = self.feature_file.replace(
        'IdfAlignmentFeature', 'AlignmentFeature')
    alignment_features = utils.create_read_file(
        alignment_feature_file).readlines()

    idf_weight = self.idf_weight
    default_idf_weight = min(idf_weight.values())  # fallback for OOV words

    # alignment_features[0] is the header line, hence the [1:]
    for train_instance, alignment_feature in zip(train_instances,
                                                 alignment_features[1:]):
        process_bar.update()
        alignment_feature = alignment_feature.split('\t#\t')[1]
        # list of [sa_idx, sb_idx] pairs; indices start from 1
        myWordAlignments = json.loads(alignment_feature)[0]
        word_sa, word_sb = train_instance.get_word(type='lemma', lower=True)

        sent1_aligned = [0] * len(word_sa)
        sent2_aligned = [0] * len(word_sb)
        for sa_idx, sb_idx in myWordAlignments:
            sent1_aligned[sa_idx - 1] = 1
            sent2_aligned[sb_idx - 1] = 1

        # idf-weighted aligned mass vs total mass per sentence
        sent1_sum = 0
        sent2_sum = 0
        sent1_ali = 0
        sent2_ali = 0
        for idx, word in enumerate(word_sa):
            weight = idf_weight.get(word, default_idf_weight)
            sent1_ali += sent1_aligned[idx] * weight
            sent1_sum += weight
        for idx, word in enumerate(word_sb):
            weight = idf_weight.get(word, default_idf_weight)
            sent2_ali += sent2_aligned[idx] * weight
            sent2_sum += weight

        feature = [1.0 * (sent1_ali + sent2_ali) /
                   (sent1_sum + sent2_sum + 1e-6)]
        # BUG FIX: the info list recorded sent2_ali twice; the first entry
        # is meant to be sent1_ali
        info = [sent1_ali, sent2_ali, sent1_sum, sent2_sum]
        features.append(feature)
        infos.append(info)
    return features, infos
def load_parse_data(train_file, parser=None, flag=False):
    """Load parsed STS data (POS, NER, ...) as a list of SentPair objects.

    Args:
        train_file: path of the raw data file.
        parser: StanfordNLP wrapper; required only when (re)parsing.
        flag: False (default) -> load the cached parse file when it exists;
            True -> re-parse and rewrite the cache first.

    Returns:
        [SentPair, ...]
    """
    # the cached parse file mirrors ./data under ./generate/parse
    parse_train_file = train_file.replace('./data', './generate/parse')

    if flag or not os.path.isfile(parse_train_file):
        print(train_file)
        if parser is None:
            raise RuntimeError(
                "parser should be init by ``nlp = stst.StanfordNLP('http://localhost:9000')``"
            )

        # parse every sentence pair
        data = load_STS(train_file)
        print('*' * 50)
        print("Parse Data, train_file=%s, n_train=%d\n" % (train_file, len(data)))
        process_bar = pyprind.ProgPercent(len(data))
        parse_data = []
        for sa, sb, score in data:
            process_bar.update()
            parse_data.append((parser.parse(sa), parser.parse(sb), score))

        # cache one json object per line
        with utils.create_write_file(parse_train_file) as f_parse:
            for parse_instance in parse_data:
                print(json.dumps(parse_instance), file=f_parse)

    # always (re)load from the cache file
    print('*' * 50)
    parse_data = []
    with utils.create_read_file(parse_train_file) as f:
        for line in f:
            parse_data.append(SentPair(json.loads(line)))
    print("Load Data, train_file=%s, n_train=%d\n" % (train_file, len(parse_data)))
    return parse_data
def extract_information(self, train_instances):
    """Build (when training) and load the rf and vocab dictionaries.

    When training, computes a relevance-frequency dict from the instances
    and dumps it as json; the dict is then reloaded from disk in all modes,
    so key/value types always match the serialized form.
    """
    if self.is_training:
        sents, labels = [], []
        for train_instance in train_instances:
            sent = train_instance.get_word()
            label = train_instance.get_label()
            sents.append(sent)
            labels.append(label)
        # rf (relevance frequency) statistics, capped at max_cnt=1000
        rf_dict = utils.rf_calculator(sents, labels, max_cnt=1000)
        with utils.create_write_file(config.DICTIONARY_DIR + '/rf_dict.txt', 'w') as fw:
            json.dump(rf_dict, fw, ensure_ascii=False)
    # NOTE(review): opened 'rb' although the file was written in text mode —
    # json.load copes with bytes, but confirm utils.create_read_file semantics
    with utils.create_read_file(config.DICTIONARY_DIR + '/rf_dict.txt', 'rb') as fr:
        rf_dict = json.load(fr)
    # vocab file: one '<word>\t<count>' entry per line
    with utils.create_read_file(config.DICTIONARY_DIR + '/vocab.txt') as fr:
        vocab_dict = {}
        for line in fr:
            line = line.strip().split('\t')
            vocab_dict[line[0]] = int(line[1])
    self.rf_dict = rf_dict
    self.vocab_dict = vocab_dict
def load_parse_data(train_file, nlp=None, flag=False):
    """Load preprocessed sentence/label data as a list of Sent objects.

    Args:
        train_file: path of the raw data file.
        nlp: unused in this variant; kept for interface compatibility.
        flag: False (default) -> load the cached file when it exists;
            True -> preprocess and rewrite the cache first.

    Returns:
        [Sent, ...]
    """
    # the cached parse file mirrors ./data under ./generate/parse
    parse_train_file = train_file.replace('./data', './generate/parse')

    if flag or not os.path.isfile(parse_train_file):
        print(train_file)

        # preprocess every sentence
        data = load_data(train_file)
        print('*' * 50)
        print("Parse Data, train_file=%s, n_train=%d\n" % (train_file, len(data)))
        process_bar = pyprind.ProgPercent(len(data))
        parse_data = []
        for sent, label in data:
            process_bar.update()
            parse_data.append((preprocess(sent), label))

        # cache one json object per line
        with utils.create_write_file(parse_train_file) as f_parse:
            for parse_instance in parse_data:
                print(json.dumps(parse_instance, ensure_ascii=False), file=f_parse)

    # always (re)load from the cache file
    print('*' * 50)
    parse_data = []
    with utils.create_read_file(parse_train_file) as f:
        for line in f:
            sent, label = json.loads(line)
            parse_data.append(Sent(sent, label))
    print("Load Data, train_file=%s, n_train=%d\n" % (train_file, len(parse_data)))
    return parse_data
def load_feature_from_file(feature_file):
    """Load features from a feature file.

    The first line is ``'<n_instance> <n_dim>'``; every following line is
    ``'<feature string>\\t#\\t<instance string>'``.

    Returns:
        (features, n_dim, n_instance) — features are the raw feature strings.
    """
    # close the file handle (it was previously left open)
    with utils.create_read_file(feature_file) as f_feature:
        feature_information = f_feature.readline()
        n_instance, n_dim = feature_information.strip().split()
        n_instance, n_dim = int(n_instance), int(n_dim)
        # keep only the feature part of each line; instance string unused
        features = [line.split("\t#\t")[0] for line in f_feature]
    return features, n_dim, n_instance
def extract_instances(self, train_instances):
    """Read precomputed NN scores, scaled to [0, 1], as single features.

    Returns:
        (features, infos): one ``[score]`` per line of the NN output file.
    """
    features = []
    infos = []
    # the NN output file is named after this feature's directory
    input_file = self.feature_file.split('/')[-2] + '.txt'
    # close the file handle (it was previously left open)
    with utils.create_read_file(config.NN_FEATURE_PATH + '/' +
                                self.nntype + '/' + input_file) as f_in:
        for line in f_in:
            obj = json.loads(line.strip())
            sc = obj[0] / 5.0  # raw score is on a 0-5 scale
            features.append([sc])
            infos.append([])
    print(len(features), features[0])
    return features, infos
def extract_instances(self, train_instances):
    """Build kernel-similarity features from precomputed sentence embeddings.

    Each line of the NN output file is ``[score, emb1, emb2]``; both
    embeddings are normalized and compared with every kernel in ``vk``.

    Returns:
        (features, infos) as produced by ``vk.get_all_kernel``.
    """
    features = []
    infos = []
    # the NN output file is named after this feature's directory
    input_file = self.feature_file.split('/')[-2] + '.txt'
    # close the file handle (it was previously left open)
    with utils.create_read_file(config.NN_FEATURE_PATH + '/' +
                                self.nntype + '/' + input_file) as f_in:
        for line in f_in:
            obj = json.loads(line.strip())
            emb1 = vk.normalize(obj[1])
            emb2 = vk.normalize(obj[2])
            feats, info = vk.get_all_kernel(emb1, emb2)
            features.append(feats)
            infos.append(info)
    print(len(features), features[0], infos[0])
    return features, infos
def extract_information(self, train_instances):
    """Build or load the typed idf dict and derive a word->index map."""
    dict_path = config.DICTIONARY_DIR + '/{}_idf_dict.txt'.format(self.type)
    if self.is_training:
        sents = [instance.get_sent(self.type) for instance in train_instances]
        idf_dict = utils.idf_calculator(sents)
        # persist sorted by idf value, highest first
        with utils.create_write_file(dict_path) as fw:
            ranked = sorted(idf_dict.items(), key=lambda x: x[1], reverse=True)
            for key, value in ranked:
                print('{}\t{}'.format(key, value), file=fw)
    else:
        idf_dict = {}
        with utils.create_read_file(dict_path) as fr:
            for raw in fr:
                parts = raw.strip().split('\t')
                idf_dict[parts[0]] = float(parts[1])
    self.unigram_dict = idf_dict
    # index words by reverse-sorted key order
    word_keys = sorted(idf_dict.keys(), reverse=True)
    self.word2index = {word: i for i, word in enumerate(word_keys)}
def extract_information(self, train_instances):
    """Build (training) or load (inference) the unigram idf dictionary."""
    dict_path = config.DICTIONARY_DIR + '/idf_dict.txt'
    if self.is_training:
        sents = [instance.get_word() for instance in train_instances]
        idf_dict = utils.idf_calculator(sents)
        with utils.create_write_file(dict_path) as fw:
            for key, value in idf_dict.items():
                print('{}\t{}'.format(key, value), file=fw)
        print(len(idf_dict))
    else:
        idf_dict = {}
        with utils.create_read_file(dict_path) as fr:
            for raw in fr:
                parts = raw.strip().split('\t')
                idf_dict[parts[0]] = float(parts[1])
    self.unigram_dict = idf_dict
def load_idf_dict(self, dict_name='idf_dict'):
    """Load idf weights derived from a word-frequency file (cached).

    The first line of the file holds the total corpus frequency; remaining
    lines are ``'<word> <freq>'``. Words with freq < 10 are dropped and the
    weight is log2(total / freq).

    Returns:
        The dict mapping word -> idf weight.
    """
    if dict_name not in self.dict_manager:
        word_frequencies = {}
        file_name = config.EX_DICT_DIR + '/word-frequencies.txt'
        print('load dict from file %s \n' % file_name)
        # close the file handle (it was previously left open)
        with utils.create_read_file(file_name) as f_dict:
            for idx, line in enumerate(f_dict):
                if idx == 0:
                    totfreq = int(line)
                else:
                    w, freq = line.strip().split()
                    freq = float(freq)
                    if freq < 10:
                        continue  # drop rare, noisy words
                    word_frequencies[w] = math.log(totfreq / freq) / math.log(2)
        self.dict_manager[dict_name] = word_frequencies
    return self.dict_manager[dict_name]
def load_model_score(self, train_file):
    """Read predicted scores back from the output file for *train_file*.

    Returns:
        list of strings ``'1:<score>'`` in libsvm-like feature form.
    """
    self.output_file = self.get_output_file(train_file)
    # close the file handle (it was previously left open)
    with utils.create_read_file(self.output_file) as f:
        y_pred = f.readlines()
    return ['1:' + x.strip().split("\t#\t")[0] for x in y_pred]
def cross_validation(self, data_instances, data_file, k_fold=5, shuffle=False):
    """Run k-fold cross validation over *data_instances*.

    For each fold, writes fold-specific train/dev feature files, trains and
    tests the classifier, and finally writes the out-of-fold predictions to
    the output file in original instance order.

    Args:
        data_instances: list of instance objects.
        data_file: raw data file path (used to name feature/output files).
        k_fold: number of folds.
        shuffle: whether to shuffle the instance order before splitting.
    """
    self.make_feature_file(data_instances, data_file)

    n_data = len(data_instances)
    n_batch = n_data // k_fold
    data_instances = list(zip(range(n_data), data_instances))
    # BUG FIX: random.shuffle needs a mutable sequence; a bare range()
    # raises TypeError on Python 3
    id_map = list(range(n_data))
    if shuffle is True:
        random.shuffle(id_map)

    preds = [None] * n_data
    for fold in range(k_fold):
        st = fold * n_batch
        ed = min((fold + 1) * n_batch, n_data)

        data = utils.create_read_file(self.dev_feature_file).readlines()
        fold_idx = set(range(st, ed))  # O(1) membership in the filter below

        # train split: every line outside the current fold
        train = [
            data[id_map[idx]].strip() for idx in range(len(data))
            if idx not in fold_idx
        ]
        dev_feature_file_train = self.dev_feature_file.replace('txt', 'train')
        f_train = utils.create_write_file(dev_feature_file_train)
        print('\n'.join(train), file=f_train)
        f_train.close()

        # dev split: the current fold
        dev = [data[id_map[idx]].strip() for idx in range(st, ed)]
        dev_feature_file_dev = self.dev_feature_file.replace('txt', 'dev')
        f_dev = utils.create_write_file(dev_feature_file_dev)
        print('\n'.join(dev), file=f_dev)
        f_dev.close()

        # train on the k-1 folds (attention: fold-specific file, not
        # self.dev_feature_file)
        self.classifier.train_model(dev_feature_file_train, self.model_file)

        # predict the held-out fold and scatter back to original positions
        self.output_file = self.get_output_file(data_file)
        predict_label = self.classifier.test_model(dev_feature_file_dev,
                                                   self.model_file,
                                                   self.output_file)
        for idx in range(st, ed):
            preds[id_map[idx]] = predict_label[idx - st]

    # write out-of-fold predictions, one line per instance
    self.output_file = self.get_output_file(data_file)
    f_out = utils.create_write_file(self.output_file)
    for label, train_instance in zip(preds, data_instances):
        print('%.2f\t#\t%s' % (label, train_instance[1].get_instance_string()),
              file=f_out)
    f_out.close()
def extract_instances(self, train_instances):
    """Per-POS idf-weighted alignment-coverage features.

    For each POS group in n/v/a/r/#, computes the idf-weighted fraction of
    aligned words across both sentences, reusing the word alignments written
    by AlignmentFeature.

    Returns:
        (features, infos): features has one coverage score per POS group;
        infos carries the per-POS weighted sums.
    """
    self.extract_information(train_instances)
    idf_weight = self.idf_weight
    default_idf_weight = min(idf_weight.values())  # fallback for OOV words

    features = []
    infos = []
    process_bar = pyprind.ProgPercent(len(train_instances))

    # reuse the alignments produced by the plain AlignmentFeature run
    alignment_feature_file = self.feature_file.replace(
        'PosAlignmentFeature', 'AlignmentFeature')
    alignment_features = utils.create_read_file(
        alignment_feature_file).readlines()

    # alignment_features[0] is the header line, hence the [1:]
    for train_instance, alignment_feature in zip(train_instances,
                                                 alignment_features[1:]):
        process_bar.update()
        alignment_feature = alignment_feature.split('\t#\t')[1]
        # list of [sa_idx, sb_idx] pairs; indices start from 1
        myWordAlignments = json.loads(alignment_feature)[0]

        pos_sa, pos_sb = train_instance.get_pos_tag(stopwords=False)
        # ner tags are fetched but unused here; kept for parity with original
        ner_sa, ner_sb = train_instance.get_word(type='ner', stopwords=False)
        word_sa, word_sb = train_instance.get_word(type='lemma', lower=True)

        feature, info = [], []
        sent1_aligned = [0] * len(word_sa)
        sent2_aligned = [0] * len(word_sb)
        for sa_idx, sb_idx in myWordAlignments:
            sent1_aligned[sa_idx - 1] = 1
            sent2_aligned[sb_idx - 1] = 1

        pos_groups = ['n', 'v', 'a', 'r', '#']
        sent1_sum = {pos: 0. for pos in pos_groups}
        sent2_sum = {pos: 0. for pos in pos_groups}
        sent1_ali = {pos: 0. for pos in pos_groups}
        sent2_ali = {pos: 0. for pos in pos_groups}
        for idx, word in enumerate(word_sa):
            pos = pos_sa[idx][1]
            weight = idf_weight.get(word, default_idf_weight)
            # BUG FIX: was '=' which clobbered the accumulator and kept only
            # the last word's contribution (the sent2 loop already uses '+=')
            sent1_ali[pos] += sent1_aligned[idx] * weight
            sent1_sum[pos] += weight
        for idx, word in enumerate(word_sb):
            pos = pos_sb[idx][1]
            weight = idf_weight.get(word, default_idf_weight)
            sent2_ali[pos] += sent2_aligned[idx] * weight
            sent2_sum[pos] += weight

        for pos in pos_groups:
            total = sent1_sum[pos] + sent2_sum[pos]
            score = 1.0 * (sent1_ali[pos] + sent2_ali[pos]) / (total + 1e-6) \
                if total > 1e-6 else 0.0
            feature.append(score)

        info = [sent1_sum, sent2_sum, sent1_ali, sent2_ali]
        features.append(feature)
        infos.append(info)
    return features, infos