예제 #1
0
    def ngram_match_remove_stopwords(sa, sb, n):
        """Score the n-gram overlap of two sentences after dropping stopwords.

        Both sentences are turned into n-grams with ``utils.make_ngram``. Every
        token whose lowercased form is found in the ``'stopwords'`` dictionary
        is removed from each n-gram, and n-grams left empty are discarded.

        Args:
            sa: First sentence, in whatever form ``utils.make_ngram`` accepts.
            sb: Second sentence, in the same form as ``sa``.
            n: N-gram order passed to ``utils.make_ngram``.

        Returns:
            A ``(f1, info)`` pair: the result of ``utils.overlap_f1`` on the two
            filtered n-gram lists, and ``[filtered_a, filtered_b]`` holding
            those lists.
        """
        nga = utils.make_ngram(sa, n)
        ngb = utils.make_ngram(sb, n)

        stopwords = dict_utils.DictLoader().load_dict('stopwords')

        def strip_stopwords(ngrams):
            """Return the n-grams with stopword tokens removed, empties dropped."""
            kept = []
            for ng in ngrams:
                # The same case-insensitive test must be applied to both
                # sentences; otherwise a capitalised stopword would survive in
                # one side only and distort the overlap.
                filtered = tuple(x for x in ng if x.lower() not in stopwords)
                if filtered:
                    kept.append(filtered)
            return kept

        new_nga = strip_stopwords(nga)
        new_ngb = strip_stopwords(ngb)

        f1 = utils.overlap_f1(new_nga, new_ngb)
        info = [new_nga, new_ngb]
        return f1, info
예제 #2
0
    def weighted_ngram_match(sa, sb, n, idf_weight):
        """Compute an IDF-weighted n-gram overlap F1 between two sentences.

        Each n-gram's weight is the sum of ``idf_weight`` over its elements;
        elements missing from ``idf_weight`` fall back to the smallest weight
        present in the mapping.

        Args:
            sa: First sentence, passed to ``utils.make_ngram``.
            sb: Second sentence, passed to ``utils.make_ngram``.
            n: N-gram order passed to ``utils.make_ngram``.
            idf_weight: Non-empty mapping from n-gram element to its weight.

        Returns:
            A ``(f1, info)`` pair, where ``info`` lists the n-grams of ``sb``
            that were matched against ``sa`` (each occurrence in ``sa`` can be
            consumed only once). ``f1`` is 1.0 when either sentence has a
            non-positive total weight.
        """
        grams_a = utils.make_ngram(sa, n)
        grams_b = utils.make_ngram(sb, n)
        fallback_weight = min(idf_weight.values())

        def gram_weight(gram):
            """Sum the weights of the elements of one n-gram."""
            total = 0.0
            for element in gram:
                total += idf_weight.get(element, fallback_weight)
            return total

        def total_weight(grams):
            """Sum the weights of every n-gram in a sentence."""
            acc = 0.0
            for gram in grams:
                acc += gram_weight(gram)
            return acc

        weight_a = total_weight(grams_a)
        weight_b = total_weight(grams_b)

        # Multiset matching: each n-gram occurrence in sa is usable once.
        remaining = Counter(grams_a)
        matched_weight = 0
        info = []
        for gram in grams_b:
            if remaining[gram] <= 0:
                continue
            remaining[gram] -= 1
            matched_weight += gram_weight(gram)
            info.append(gram)

        if not (weight_a > 0 and weight_b > 0):
            return 1., info

        p = matched_weight / float(weight_a)
        r = matched_weight / float(weight_b)
        if p + r > 0:
            f1 = 2 * p * r / (p + r)
        else:
            f1 = 0.
        return f1, info
예제 #3
0
 def extract(self, train_instance):
     """Build the bigram feature string for a single instance.

     The sentence selected by ``self.type`` is converted to 2-grams, which
     are vectorized by ``utils.sparse_vectorize`` using ``self.bigram_dict``,
     ``self.word2index`` and ``self.convey``.

     Returns:
         A ``(feats, infos)`` pair: the vector serialized by
         ``Feature._feat_dict_to_string``, and
         ``[len(self.bigram_dict), 'bigram']``.
     """
     bigrams = utils.make_ngram(train_instance.get_sent(self.type), 2)
     vector = utils.sparse_vectorize(
         bigrams, self.bigram_dict, self.word2index, self.convey)
     infos = [len(self.bigram_dict), 'bigram']
     return Feature._feat_dict_to_string(vector), infos
예제 #4
0
 def extract_information(self, train_instances):
     """Prepare ``self.bigram_dict`` and ``self.word2index``.

     When ``self.is_training`` is true, the bigrams of every instance are
     passed to ``utils.idf_calculator`` and the resulting dictionary is
     written to ``<DICTIONARY_DIR>/<type>_bigram_dict.txt``, one
     ``first<TAB>second<TAB>value`` line per bigram, highest value first.
     Otherwise the dictionary is read back from that same file.

     Args:
         train_instances: Iterable of instances exposing ``get_sent``; only
             used in training mode.
     """
     dict_path = config.DICTIONARY_DIR + '/{}_bigram_dict.txt'.format(self.type)
     if self.is_training:
         sents = [
             utils.make_ngram(train_instance.get_sent(self.type), 2)
             for train_instance in train_instances
         ]
         idf_dict = utils.idf_calculator(sents)
         with utils.create_write_file(dict_path) as fw:
             idf_dict_tuple = sorted(idf_dict.items(), key=lambda x: x[1], reverse=True)
             for key, value in idf_dict_tuple:
                 print('{}\t{}\t{}'.format(key[0], key[1], value), file=fw)
     else:
         idf_dict = {}
         with utils.create_read_file(dict_path) as fr:
             for line in fr:
                 # Remove only the line terminator: a bare strip() would also
                 # eat leading whitespace (or an empty first field), so the
                 # key read back would differ from the key that was written.
                 line = line.rstrip('\r\n')
                 if not line:
                     continue  # tolerate blank lines instead of IndexError
                 fields = line.split('\t')
                 idf_dict[(fields[0], fields[1])] = float(fields[2])
     self.bigram_dict = idf_dict
     # Indices follow the reverse-sorted order of the bigram keys.
     word_keys = sorted(idf_dict, reverse=True)
     self.word2index = {word: i for i, word in enumerate(word_keys)}