Example No. 1
def big_test(version="3.0", max_length=3):
    from collections import defaultdict
    from topicmod.util.wordnet import load_wn
    from nltk.corpus import brown
    from nltk.util import ingrams

    wn = load_wn(version)

    # Count how often each WordNet noun (including multi-word collocations
    # joined with "_") occurs in the Brown corpus.
    term_counts = defaultdict(int)

    # n-gram lengths 1 through max_length
    for ngram_length in xrange(1, max_length + 1):
        token = 0
        for w in ingrams(brown.words(), ngram_length):
            token += 1
            normalized = "_".join(w).lower()
            if wn.synsets(normalized, 'n'):
                term_counts[wn.morphy(normalized)] += 1

    filename = "wn/wordnet.wn"
    if version != "3.0":
        filename = "wn/wordnet_%s.wn" % version
    # OntologyWriter and orderedTraversal are assumed to be provided by the
    # surrounding topicmod module; they are not imported in this snippet.
    o = OntologyWriter(filename)
    for ii in orderedTraversal(wn):
        o.AddSynset(ii.offset,
                    ii.name,
                    [x.offset for x in ii.hyponyms() + ii.instance_hyponyms()],
                    [(0, x.name.lower(), term_counts[x.name] + 1)
                     for x in ii.lemmas])
    o.Finalize()
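
A minimal usage sketch, assuming the Brown corpus has been downloaded with nltk.download("brown"), topicmod is on the import path, and a "wn/" output directory already exists:

if __name__ == "__main__":
    # Builds noun counts from Brown and writes the ontology to wn/wordnet.wn
    big_test(version="3.0", max_length=3)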
Example No. 2
    def read_german(self, directory="../../data/germanet/"):
        # Build a German-word -> WordNet 3.0 synset mapping from GermaNet's
        # ILI files.  The ILI records reference WordNet 2.0 offsets, so both
        # versions are loaded and find_equiv() translates between them.
        old_wn = load_wn("2.0")
        new_wn = load_wn("3.0")

        self._mapping["de"] = defaultdict(set)
        for ii in glob(directory + "/ILI*"):
            print "Reading mapping from ", ii
            for jj in codecs.open(ii, 'r', "latin-1"):
                fields = jj.split()
                word = fields[0]
                if word.startswith("$"):
                    print "Spurious symbol: %s" % word.encode(
                        "ascii", "ignore")
                    word = word.replace("$", "")
                if word.startswith("?"):
                    print "Spurious symbol: %s" % word.encode(
                        "ascii", "ignore")
                    word = word.replace("?", "")

                # The remaining columns are (link type, English word, English
                # sense, synset id) quadruples; reverse them so they can be
                # consumed with pop() below.
                fields = fields[4:]
                fields.reverse()

                while fields:
                    try:
                        link_type = fields.pop()
                        eng_word = fields.pop()
                        eng_sense = fields.pop()
                        synset = fields.pop()
                    except IndexError:
                        print "Pop error:", jj.encode("ascii",
                                                      'ignore'), fields
                        break

                    if synset.startswith("ENG20"):
                        vers, offset, pos = synset.split("-")
                        assert vers == "ENG20", "Wrong version of WordNet: %s" % vers
                    else:
                        if "-" in synset:
                            offset, pos = synset.split("-")
                        else:
                            continue

                    new_synset = find_equiv(pos, eng_word, offset, old_wn,
                                            new_wn)
                    if new_synset and link_type in flags.gn_valid_relations:
                        self._mapping["de"][new_synset.name].add(word.lower())
Example No. 3
    def load_wn(self, location, version):
        # Load the requested WordNet version from `location` and expose it
        # as the module-level global `wn` used elsewhere in the module.
        globals()["wn"] = load_wn(version, location)
Example No. 4

# Load all the languages we have as a test

if __name__ == "__main__":
    flags.InitFlags()

    #gn = GermaNet()
    mapping = MultilingMapping()
    mapping.read_german()

    wn = load_wn()
    print [list(mapping.related_words(x)) for x in wn.synsets("dog")]
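
related_words() is used above but not shown in any of these listings. A sketch of how it could be implemented on top of the self._mapping structure built in Example No. 2; this is an illustration, not the real method:

    def related_words(self, synset, lang="de"):
        # Words in the given language that were mapped onto this synset.
        return self._mapping[lang].get(synset.name, set())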

Example No. 5

    # (This snippet begins mid-function: it is the tail of a loop over the
    # vocabulary words in `multipaths`; the enclosing loop that binds `word`
    # is cut off above.)
    word_senses_count[word] = 0
    count_word += 1
    tmp = word
    for pos in multipaths[word]:
      tmp += '\t' + pos
      for index in multipaths[word][pos]:
        word_senses_count[word] += 1
        count_sense += 1
        tmp += '\t' + str(index)
    if word_senses_count[word] > 1:
      im_words += word + " "
    outfile.write(tmp + '\n')
  outfile.write("\nThe total number of cons words: " + str(count_word) + "\n")
  outfile.write("\nThe total number of cons words senses: " + str(count_sense) + "\n")
  outfile.write("\nInteresting words: " + im_words + "\n")
  outfile.close()
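
The record format written above, for reference (one tab-separated line per word; placeholders shown):

    # word \t pos \t sense_index [\t sense_index ...] [\t pos \t sense_index ...]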


flags.define_string("vocab", None, "The input vocab")
flags.define_string("output", None, "The output constraint file")
flags.define_int("num_cons", 0, "The number of constraints we want")

if __name__ == "__main__":

  flags.InitFlags()
  wordnet_path = "../../../data/wordnet/" 
  eng_wn = load_wn("3.0", wordnet_path, "wn")
  vocab = readVocab(flags.vocab)
  generateCons(vocab, eng_wn, flags.output, flags.num_cons)
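
How this script might be invoked, assuming topicmod's flags module accepts gflags-style --name=value options (the script and file names are illustrative):

    python generate_constraints.py --vocab=vocab.txt --output=output.cons --num_cons=50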