def ngram_match_remove_stopwords(sa, sb, n):
    """Overlap F1 between the n-grams of two sentences after stopword removal.

    Bug fix: the original filtered ``sa`` with a case-sensitive stopword
    test but ``sb`` with ``x.lower()``, so the two sentences were treated
    asymmetrically.  Both sides now use the same case-insensitive test.

    Args:
        sa: first tokenized sentence (sequence of word strings).
        sb: second tokenized sentence.
        n: n-gram order.

    Returns:
        (f1, info): ``f1`` is the overlap F1 of the filtered n-gram lists;
        ``info`` is ``[filtered_nga, filtered_ngb]``.
    """
    stopwords = dict_utils.DictLoader().load_dict('stopwords')

    def _remove_stopwords(ngrams):
        # Drop stopword tokens inside each n-gram (case-insensitive);
        # discard n-grams that become empty after filtering.
        kept = []
        for ng in ngrams:
            filtered = tuple(tok for tok in ng if tok.lower() not in stopwords)
            if filtered:
                kept.append(filtered)
        return kept

    new_nga = _remove_stopwords(utils.make_ngram(sa, n))
    new_ngb = _remove_stopwords(utils.make_ngram(sb, n))

    f1 = utils.overlap_f1(new_nga, new_ngb)
    info = [new_nga, new_ngb]
    return f1, info
def weighted_ngram_match(sa, sb, n, idf_weight):
    """weighted_ngram_match

    IDF-weighted n-gram overlap F1 between two tokenized sentences.
    Each n-gram is weighted by the sum of its tokens' IDF values; tokens
    absent from ``idf_weight`` fall back to the smallest known IDF.

    Returns:
        (f1, info): weighted F1 and the list of matched n-grams from ``sb``.
        NOTE(review): when either side has zero total IDF mass, f1 stays at
        its default of 1.0 — preserved from the original implementation.
    """
    grams_a = utils.make_ngram(sa, n)
    grams_b = utils.make_ngram(sb, n)
    fallback = min(idf_weight.values())

    def ngram_idf(gram):
        # Sum of per-token IDF weights, with the fallback for unseen tokens.
        return sum((idf_weight.get(tok, fallback) for tok in gram), 0.0)

    total_a = sum((ngram_idf(g) for g in grams_a), 0.0)
    total_b = sum((ngram_idf(g) for g in grams_b), 0.0)

    # Clipped (multiset) matching of b's n-grams against a's counts.
    remaining = Counter(grams_a)
    matched_weight = 0
    info = []
    for gram in grams_b:
        if remaining[gram] > 0:
            remaining[gram] -= 1
            matched_weight += ngram_idf(gram)
            info.append(gram)

    precision, recall, f1 = 0., 0., 1.
    if total_a > 0 and total_b > 0:
        precision = matched_weight / float(total_a)
        recall = matched_weight / float(total_b)
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.
    return f1, info
def extract(self, train_instance):
    """Build sparse bigram features for a single training instance.

    Returns:
        (feats, infos): the serialized sparse feature string and
        ``[vocabulary_size, 'bigram']``.
    """
    # Bigrams of the instance's sentence for this feature's sentence type.
    bigrams = utils.make_ngram(train_instance.get_sent(self.type), 2)
    # Vectorize against the vocabulary built by extract_information().
    feat_dict = utils.sparse_vectorize(bigrams, self.bigram_dict, self.word2index, self.convey)
    infos = [len(self.bigram_dict), 'bigram']
    return Feature._feat_dict_to_string(feat_dict), infos
def extract_information(self, train_instances):
    """Build (in training) or load (otherwise) the bigram IDF dictionary.

    Training mode computes IDF over every instance's sentence bigrams and
    persists them to ``{type}_bigram_dict.txt``; otherwise the file is read
    back.  Side effects: sets ``self.bigram_dict`` (bigram -> idf) and
    ``self.word2index`` (bigram -> dense index).
    """
    dict_path = config.DICTIONARY_DIR + '/{}_bigram_dict.txt'.format(self.type)
    if self.is_training:
        corpus = []
        for instance in train_instances:
            corpus.append(utils.make_ngram(instance.get_sent(self.type), 2))
        idf_dict = utils.idf_calculator(corpus)
        with utils.create_write_file(dict_path) as fw:
            # One bigram per line, descending IDF: "<w1>\t<w2>\t<idf>".
            for key, value in sorted(idf_dict.items(), key=lambda kv: kv[1], reverse=True):
                print('{}\t{}\t{}'.format(key[0], key[1], value), file=fw)
    else:
        idf_dict = {}
        with utils.create_read_file(dict_path) as fr:
            for raw in fr:
                parts = raw.strip().split('\t')
                idf_dict[(parts[0], parts[1])] = float(parts[2])
    self.bigram_dict = idf_dict
    # Deterministic index per bigram key; reverse lexicographic order kept
    # from the original so existing serialized features stay compatible.
    ordered_keys = sorted(idf_dict.keys(), reverse=True)
    self.word2index = {word: i for i, word in enumerate(ordered_keys)}