예제 #1
0
    def generate_non_matching(self, words_fpath, output_fpath, continue_locally=True, threshold_percentile=PERCENTILE):

        with codecs.open(output_fpath, "w", "utf-8") as output_file:
            print >> output_file, "word\tsense_i\tsense_j\tsim\tsense_i_cluster\tsense_j_cluster"

            df = read_csv(words_fpath, encoding='utf-8', delimiter=WORDS_SEP, error_bad_lines=False)
            for i, row in df.iterrows():
                try:
                    senses1 = self._fetcher1.get_senses(row.word)
                    senses2 = self._fetcher2.get_senses(row.word)
                    res = self._match(row.word, senses1, senses2, q=threshold_percentile)

                    for sid1, bow1 in senses1:
                        for sid2, bow2 in senses2:
                            if sid1 not in res and sid2 not in res[sid1]:
                                cluster1 = ','.join(take(CLUSTER_SIZE,[x[0] for x in sorted(self._fetcher1.get_cluster(row.word, sid1).items(), reverse=True, key=operator.itemgetter(1))]))
                                cluster2 = ','.join(take(CLUSTER_SIZE,[x[0] for x in sorted(self._fetcher2.get_cluster(row.word, sid2).items(), reverse=True, key=operator.itemgetter(1))]))
                                output_file.write("%s\t%s\t%s\t%.2f\t%s\t%s\n" % (
                                    row.word, sid1, sid2, 0.0, cluster1, cluster2))
                except KeyboardInterrupt:
                    print "Keyboard interrupt"
                    return
                except DailyLimitException:
                    if continue_locally:
                        print "Skipping due to API limit:", row.word
                        continue
                    else:
                        print "BabelNet API daily limit reached"
                        return
                except:
                    print "Error:", row
                    print format_exc()
        print "Matched senses:", output_fpath

        return output_fpath
예제 #2
0
from jnt.common import load_voc
import codecs 
from jnt.matching.synset_fetchers import BabelNet, BABELNET_KEYS
from jnt.common import take

MAX_WORDS = 999

voc_fpath = "/Users/alex/work/joint/src/data/ambigous-words-mine.csv"
output_fpath = voc_fpath + "-babelnet.csv"
babelnet_dir = "/Users/alex/tmp/matching/babelnet-eval/"
adagram_voc_fpath = "/Users/alex/tmp/adagram/HugeModel-voc.csv"


babelnet = BabelNet(babelnet_keys=BABELNET_KEYS, babelnet_fpath=babelnet_dir,
                    freq_fpath="", divide_by_freq=False)
adagram_voc = load_voc(adagram_voc_fpath)
voc = load_voc(voc_fpath)


with codecs.open(output_fpath, "w", "utf-8") as out:
    for word in voc:
        senses = babelnet.get_senses(word)
        for sense_id, bow in senses:
            bow_words = []
            for w in sorted(bow, key=bow.get, reverse=True):
                if w in adagram_voc and w != word:
                    bow_words.append(w) 
            out.write("%s\t%s\t%s\n" % (word, sense_id, ' '.join(take(MAX_WORDS,bow_words))))
        
print "Output:", output_fpath
예제 #3
0
    def match_file(self,
                   words_fpath,
                   output_fpath,
                   continue_locally=True,
                   threshold_percentile=PERCENTILE):

        with codecs.open(output_fpath, "w", "utf-8") as output_file:
            print >> output_file, "word\tsense_i\tsense_j\tsim\tsense_i_cluster\tsense_j_cluster"

            df = read_csv(words_fpath,
                          encoding='utf-8',
                          delimiter=WORDS_SEP,
                          error_bad_lines=False)
            for i, row in df.iterrows():
                try:
                    senses1 = self._fetcher1.get_senses(
                        row.word, min_prob=MIN_SENSE_PROB)
                    senses2 = self._fetcher2.get_senses(
                        row.word, min_prob=MIN_SENSE_PROB)
                    res = self._match(row.word,
                                      senses1,
                                      senses2,
                                      q=threshold_percentile)

                    for sid1 in res:
                        for sid2, sim in sorted(res[sid1].items(),
                                                key=operator.itemgetter(1),
                                                reverse=True):
                            cluster1 = ','.join(
                                take(CLUSTER_SIZE, [
                                    x[0]
                                    for x in sorted(self._fetcher1.get_cluster(
                                        row.word, sid1).items(),
                                                    reverse=True,
                                                    key=operator.itemgetter(1))
                                ]))
                            cluster2 = ','.join(
                                take(CLUSTER_SIZE, [
                                    x[0]
                                    for x in sorted(self._fetcher2.get_cluster(
                                        row.word, sid2).items(),
                                                    reverse=True,
                                                    key=operator.itemgetter(1))
                                ]))
                            output_file.write("%s\t%s\t%s\t%.6f\t%s\t%s\n" %
                                              (row.word, sid1, sid2, sim,
                                               cluster1, cluster2))
                except KeyboardInterrupt:
                    print "Keyboard interrupt"
                    return
                except DailyLimitException:
                    if continue_locally:
                        print "Skipping due to API limit:", row.word
                        continue
                    else:
                        print "BabelNet API daily limit reached"
                        return
                except:
                    print "Error:", row
                    print format_exc()
        print "Matched senses:", output_fpath

        return output_fpath