Example #1
def jaccard(sen_1, sen_2):
  tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(sen_1))
  words = [word for word, pos in tagged_sent if pos in ('NN', 'NNS', 'JJ', '', 'VB', 'VBN', 'VBD', 'RB')]

  sen_set_1 = set(words)

  tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(sen_2))
  words = [word for word, pos in tagged_sent if pos in ('NN', 'NNS', 'JJ', '', 'VB', 'VBN', 'VBD', 'RB')]

  sen_set_2 = set(words)

  jaccard_value = jaccard_distance(sen_set_1, sen_set_2)
  return jaccard_value
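
A minimal usage sketch. Assumptions: POSWrapper is this project's POS-tagging wrapper (its pos_tag is used like nltk.pos_tag throughout these snippets), and jaccard_distance is nltk's set-based distance; both are imported at module level:

import nltk
from nltk.metrics import jaccard_distance
import POSWrapper  # project-specific wrapper, assumed available

# 0.0 means the filtered word sets are identical, 1.0 means no overlap
print jaccard("the bank of a wide river", "a wide river and its bank")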
Example #2
def similarity_by_jaccard(ox_defis, wn_defis):

  matrix_similarity_jaccard = [[0 for x in range(len(ox_defis))] for x in range(len(wn_defis))]

  for iWnWord in range(len(wn_defis)):

    tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wn_defis[iWnWord]))
    words = [word for word, pos in tagged_sent if pos in ('NN', 'NNS', 'JJ', '', 'VB', 'VBN', 'VBD', 'RB')]

    words = [wordnet_lemmatizer.lemmatize(word) for word in words]
    wn_set = set(words)

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # word-word
    for iDictWord in range(len(ox_defis)):


      tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(ox_defis[iDictWord]))
      words = [word for word, pos in tagged_sent if pos in ('NN', 'NNS', 'JJ', '', 'VB', 'VBN', 'VBD', 'RB')]

      words = [wordnet_lemmatizer.lemmatize(word) for word in words]
      dict_set = set(words)
      matrix_similarity_jaccard[iWnWord][iDictWord] = 1 - jaccard_distance(wn_set, dict_set)

  ########################################
  return matrix_similarity_jaccard
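
A quick sanity check with hypothetical definition strings (the real inputs are Oxford and WordNet glosses):

ox_defis = ["an organization that keeps money for people",
            "land along the side of a river"]
wn_defis = ["a financial institution",
            "sloping land beside a body of water"]

# matrix[i][j] is the Jaccard similarity (1 - distance) between
# wn_defis[i] and ox_defis[j]
matrix = similarity_by_jaccard(ox_defis, wn_defis)
print matrix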
Example #3
def get_nouns(dict_words):

    dict_words_nouns = []

    for iWord in range(len(dict_words)):

        print iWord

        dict_words_nouns.append([])

        wordDict = dict_words[iWord]

        # - - - - - - - - - - - - - - - - - - - - - - - - - - -
        #
        # sd

        tagged_sent = POSWrapper.pos_tag(nltk.word_tokenize(wordDict["sd"]))
        nouns = [word for word, pos in tagged_sent if (pos == "NN" or pos == "NNS")]

        for noun in nouns:
            if noun != wordDict["en"] and noun != "sth" and noun not in dict_words_nouns[iWord]:
                print noun
                dict_words_nouns[iWord].append(noun)

        # - - - - - - - - - - - - - - - - - - - - - - - - - - -
        #
        # d

        tagged_sent = POSWrapper.pos_tag(nltk.word_tokenize(wordDict["d"]))
        nouns = [word for word, pos in tagged_sent if (pos == "NN" or pos == "NNS")]

        for noun in nouns:
            if noun != wordDict["en"] and noun != "sth" and noun not in dict_words_nouns[iWord]:
                dict_words_nouns[iWord].append(noun)

        # - - - - - - - - - - - - - - - - - - - - - - - - - - -
        print wordDict["tv"]
        print dict_words_nouns[iWord]

    ########################################
    return dict_words_nouns
Example #4
def split_and_POS(sen):
    # tokenize
    tokens = nltk.wordpunct_tokenize(sen)
    # pos tag
    tagged_words = POSWrapper.pos_tag(tokens)

    arr_pos = [pos for (word, pos) in tagged_words]

    return arr_pos
def preprocess_sentence_to_nouns(sentence):
  # tokenize
  tokens = nltk.wordpunct_tokenize(sentence)
  # pos tag
  tagged_words = POSWrapper.pos_tag(tokens)
  # get n
  nouns = [word for word, pos in tagged_words if check_pos_noun(pos)]
  return nouns
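
preprocess_sentence_to_nouns calls a check_pos_noun helper that is not shown in these snippets; a minimal sketch, assuming it simply tests for the Penn Treebank noun tags the other examples filter on:

def check_pos_noun(pos):
  # NN/NNS (common nouns) and NNP/NNPS (proper nouns)
  return pos in ('NN', 'NNS', 'NNP', 'NNPS')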
def read_nouns():
  dict_wn = {}
  for synset in list(wn.all_synsets('n')):
    key = synset.name() + "-" + synset.definition()
    lemmas = [str(lemma.name()) for lemma in synset.lemmas()]
    key += "="
    for lemma in lemmas:
      key = key + "-" + lemma
    dict_wn[key] = lemmas

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # get hypernyms
    for hypernym in synset.hypernyms():
      for lemma in wn.synset(hypernym.name()).lemmas():
        lemma_name = lemma.name()
        dict_wn[key].append(lemma_name)

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # get hyponyms
    for hyponym in synset.hyponyms():
      for lemma in wn.synset(hyponym.name()).lemmas():
        lemma_name = lemma.name()
        dict_wn[key].append(lemma_name)
    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # get nouns from the definition
    tagged_sent = POSWrapper.pos_tag(nltk.word_tokenize(synset.definition()))
    nouns = [word for word, pos in tagged_sent if pos == 'NN']
    nouns = [wordnet_lemmatizer.lemmatize(noun) for noun in nouns]

    for noun in nouns:
      dict_wn[key].append(noun)
  return dict_wn
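
read_nouns iterates over every noun synset in WordNet, so it takes a while; a hedged usage sketch (wn is assumed to be nltk.corpus.wordnet, imported at module level):

dict_wn = read_nouns()
print len(dict_wn)  # one entry per noun synset
# each key packs the synset name, definition and lemmas; each value
# lists the synset's lemmas plus hypernym/hyponym lemmas and the
# nouns of its definition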
Example #7
def get_nbest_synsets_n_v_with_word_vn(dict_words,word_concept):

  dict_words_nouns = []
  dict_synsets_nouns = []

  wn_words = wn.synsets(word_concept, pos='n')


  for iWord in range(len(dict_words)):

    print iWord

    dict_words_nouns.append([])
    dict_synsets_nouns.append([])

    wordDict = dict_words[iWord]

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    # sd

    if not wordDict.has_key('tv'):
      continue

    if not wordDict.has_key('d'):
      continue

    nouns = []
    if wordDict.has_key("sd"):
      tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["sd"]))
      nouns = [word for word, pos in tagged_sent if pos in ('NN', 'NNS') and word not in ('sth', 'etc')]

      if len(nouns) == 0:
        tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["d"]))
        print tagged_sent
        nouns = [word for word, pos in tagged_sent if pos in ('NN', 'NNS') and word not in ('sth', 'etc')]

    elif wordDict.has_key("d") and wordDict["d"] is not None:
      tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["d"]))
      print tagged_sent
      # 'and' here; the original (word != 'sth' or word != 'etc') was always true
      nouns = [word for word, pos in tagged_sent if pos in ('NN', 'NNS') and word not in ('sth', 'etc')]
    else:
      continue

    for noun in nouns:
      noun = wordnet_lemmatizer.lemmatize(noun, pos='n')
      if noun is None:
        continue

      if noun != "sth" and noun != 'etc' and noun not in dict_words_nouns[iWord]:
        dict_words_nouns[iWord].append(noun)

    if len(dict_words_nouns[iWord]) == 0:
      continue

    print dict_words_nouns[iWord]
    synsetsSD = []

    for word in dict_words_nouns[iWord]:
      synsets = wn.synsets(word, pos='n')
      for synset in synsets:
        synsetsSD.append(synset)

    if len(synsetsSD) == 0:
      continue

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    # d

    tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["d"]))
    nouns = [word for word, pos in tagged_sent if pos in ('NN', 'NNS')]

    if wordDict.has_key('xh0') and wordDict['xh0'] is not None and wordDict['xh0'] != 'nn':
      nouns.append(wordDict['xh0'])


    for noun in nouns:
      noun = wordnet_lemmatizer.lemmatize(noun, pos='n')
      if noun is None:
        continue

      if noun.encode('utf8') != word_concept and noun != "sth" and noun not in dict_words_nouns[iWord]:
        dict_words_nouns[iWord].append(noun)

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    print wordDict["tv"]
    print dict_words_nouns[iWord]

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    # synsets

    iSDMax = 0
    pSD_max = 0

    for iSyn in range(len(synsetsSD)):
      synsetSD = synsetsSD[iSyn]

      arr_p = []
      for synset in wn_words:
        p = synsetSD.path_similarity(synset)
        # path_similarity may return None; treat that as 0
        arr_p.append(p if p else 0)

      arr_p = sorted(arr_p, reverse=True)

      # score the candidate by its single best similarity to any sense
      # of word_concept (the original xrange(0, len(arr_p)-1) loop with
      # i <= 0 did the same, except it dropped a one-element list)
      pSD = arr_p[0] if arr_p else 0

      if pSD > pSD_max:
        pSD_max = pSD
        iSDMax = iSyn

    synsetRoot = synsetsSD[iSDMax]
    print "synsetroot"
    print synsetRoot

    for noun in dict_words_nouns[iWord]:
      synsets_noun = wn.synsets(noun, pos='n')
      if len(synsets_noun) <= 0:
        continue

      p_noun_max = 0
      synMax = synsets_noun[0]

      for synset_noun in synsets_noun:
        for synset in wn_words:
          p = synset.path_similarity(synset_noun)
          if p > p_noun_max:
            p_noun_max = p
            synMax = synset_noun

      if synMax not in dict_synsets_nouns[iWord]:
        dict_synsets_nouns[iWord].append(synMax)

    tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["d"]))
    verbs = [word for word, pos in tagged_sent if pos in ('VB', 'VBN', 'VBD')]

    print "VVVVV"
    print verbs
    for verb in verbs:
      verb = wordnet_lemmatizer.lemmatize(verb, pos='v')
      if verb is None:
        continue

      if verb.encode('utf8') != word_concept and verb != "sth" and verb not in dict_words_nouns[iWord]:
        dict_words_nouns[iWord].append(verb)

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    print wordDict["tv"]
    print dict_words_nouns[iWord]

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    # synsets

    iSDMax = 0
    pSD_max = 0

    for iSyn in range(len(synsetsSD)):
      synsetSD = synsetsSD[iSyn]

      arr_p = []
      for synset in wn_words:
        p = synsetSD.path_similarity(synset)
        arr_p.append(p if p else 0)

      arr_p = sorted(arr_p, reverse=True)

      # this pass keeps the sum of the two best similarities
      pSD = sum(arr_p[:2])

      if pSD > pSD_max:
        pSD_max = pSD
        iSDMax = iSyn

    synsetRoot = synsetsSD[iSDMax]
    print "synsetroot"
    print synsetRoot

    for noun in dict_words_nouns[iWord]:
      synsets_noun = wn.synsets(noun, pos='v')
      if len(synsets_noun) <= 0:
        continue

      p_noun_max = 0
      synMax = synsets_noun[0]

      for synset_noun in synsets_noun:
        p = synsetRoot.path_similarity(synset_noun)
        if p > p_noun_max:
          p_noun_max = p
          synMax = synset_noun

      if synMax not in dict_synsets_nouns[iWord]:
        dict_synsets_nouns[iWord].append(synMax)

    print "dict_synsets_nouns"
    print dict_synsets_nouns[iWord]

  ########################################
  return dict_synsets_nouns
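
A hedged sketch of the expected input shape, inferred from how the code reads the fields ('tv' headword, 'd' definition, optional 'sd' short definition and 'xh0' cross-reference):

dict_words = [{
  'tv': 'bedroom',                    # headword, printed in the trace
  'sd': 'a room for sleeping',        # short definition (optional)
  'd':  'a room where people sleep',  # full definition
  'xh0': 'room',                      # cross-reference (optional)
}]
synsets_per_entry = get_nbest_synsets_n_v_with_word_vn(dict_words, 'bedroom')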
Example #8
def similarity_by_synsets_synsets_nbest_withword_average(WORD, dict_words):


  if WORD == "bank":
    asf = 0;
  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # dictionary data
  dict_words_synsets = get_nbest_synsets_n_v_with_word(dict_words, WORD)
  # print "dict-word_synsets"
  # print dict_words_synsets

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # wordnet data

  wn_words = wn.synsets(WORD, pos='n')
  print "wn_words -------"
  print wn_words

  wn_words_synsets = WordnetProcess.get_synsets_n_v(WORD, wn_words)

  print wn_words_synsets

  # matrix for similarity dict_words vs wn_words
  matrix_similarity = [[0 for x in range(len(dict_words))] for x in range(len(wn_words))]

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -

  ####################################################################################################
  #
  # calculate 2d matrix of p

  for iWnWord in range(len(wn_words)):

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # word-word
    for iDictWord in range(len(dict_words)):

      p_iWnWord_iDictWord = 0.

      arr_p_word = []

      for dict_synset in dict_words_synsets[iDictWord]:

        p_dictNoun_wnNouns = 0

        # some nouns have no synsets, so only defined scores are kept
        arr_p = []

        for wn_synset in wn_words_synsets[iWnWord]:
          p_max = dict_synset.path_similarity(wn_synset)
          if p_max is None:
            continue

          arr_p.append(p_max)

        arr_p = sorted(arr_p, reverse=True)

        # average of the (at most) nBest best scores; count starts at
        # 0.0001 to avoid dividing by zero when arr_p is empty (the
        # original xrange(0, len(arr_p)-1) dropped the last element)
        nBest = 8
        count = 0.0001
        for i in range(len(arr_p)):
          if i < nBest:
            p_dictNoun_wnNouns += arr_p[i]
            count += 1

        p_dictNoun_wnNouns = p_dictNoun_wnNouns/count
        arr_p_word.append(p_dictNoun_wnNouns)

      arr_p_word = sorted(arr_p_word, reverse=True)

      # weighted average of the best word scores: the top score counts
      # five times, the rest once (both branches of the original
      # if/else were identical, so they are collapsed here)
      nBest = 10
      count = 5
      for i in range(len(arr_p_word)):
        if i < nBest:
          if i == 0:
            p_iWnWord_iDictWord += arr_p_word[i]*5.
          else:
            p_iWnWord_iDictWord += arr_p_word[i]*1
          count += 1

      if count == 0:
        p_iWnWord_iDictWord = 0
      else:
        p_iWnWord_iDictWord = p_iWnWord_iDictWord/count
      matrix_similarity[iWnWord][iDictWord] = p_iWnWord_iDictWord
      # - - - - - - - - - - - - - - - - - - - - - - - - -

    # word-word
    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  ####################################################################################################

  print "----------------------------------------------------"
  s = [[str(e) for e in row] for row in matrix_similarity]
  lens = [max(map(len, col)) for col in zip(*s)]
  fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
  table = [fmt.format(*row) for row in s]
  print '\n'.join(table)

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # dictionary data

  wn_words = dict_words
  wn_words_synsets = get_nbest_synsets_n_v_with_word(wn_words, WORD)

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # wordnet data

  dict_words = wn.synsets(WORD, pos='n')
  dict_words_synsets = WordnetProcess.get_synsets_n_v(WORD, dict_words)

  print "synsets ------------------------------------"

  # matrix for similarity dict_words vs wn_words
  matrix_similarity_reverse = [[0 for x in range(len(dict_words))] for x in range(len(wn_words))]

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -

  ####################################################################################################
  #
  # calculate 2d matrix of p

  for iWnWord in range(len(wn_words)):

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # word-word
    for iDictWord in range(len(dict_words)):

      p_iWnWord_iDictWord = 0.

      arr_p_word = []

      for dict_synset in dict_words_synsets[iDictWord]:

        p_dictNoun_wnNouns = 0

        # some nouns have no synsets, so only defined scores are kept
        arr_p = []

        for wn_synset in wn_words_synsets[iWnWord]:

          p_max = dict_synset.path_similarity(wn_synset)
          if p_max is not None:
            arr_p.append(p_max)

        arr_p = sorted(arr_p, reverse=True)

        nBest = 8
        count = 0.0001
        for i in range(len(arr_p)):
          if i < nBest:
            p_dictNoun_wnNouns += arr_p[i]
            count += 1

        p_dictNoun_wnNouns = p_dictNoun_wnNouns/count
        arr_p_word.append(p_dictNoun_wnNouns)

      arr_p_word = sorted(arr_p_word, reverse=True)

      # same weighting as the forward direction; range(len(...)) also
      # fixes the original xrange(0, len(arr_p_word)-1) off-by-one
      nBest = 10
      count = 5
      for i in range(len(arr_p_word)):
        if i < nBest:
          if i == 0:
            p_iWnWord_iDictWord += arr_p_word[i]*5.
          else:
            p_iWnWord_iDictWord += arr_p_word[i]*1
          count += 1

      if count == 0:
        p_iWnWord_iDictWord = 0
      else:
        p_iWnWord_iDictWord = p_iWnWord_iDictWord/count
      matrix_similarity_reverse[iWnWord][iDictWord] = p_iWnWord_iDictWord
      # - - - - - - - - - - - - - - - - - - - - - - - - -

    # word-word
    # - - - - - - - - - - - - - - - - - - - - - - - - - - -

  ####################################################################################################

  print "----------------------------------------------------"
  s = [[str(e) for e in row] for row in matrix_similarity_reverse]
  lens = [max(map(len, col)) for col in zip(*s)]
  fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
  table = [fmt.format(*row) for row in s]
  print '\n'.join(table)


  dict_words = wn_words
  wn_words = wn.synsets(WORD, pos='n')

  for iWnWord in range(len(wn_words)):
    for iDictWord in range(len(dict_words)):
      matrix_similarity[iWnWord][iDictWord] = matrix_similarity[iWnWord][iDictWord] + matrix_similarity_reverse[iDictWord][iWnWord]
      matrix_similarity[iWnWord][iDictWord] /= 2

  ####################################################################################################

  print "----------------------------------------------------"
  s = [[str(e) for e in row] for row in matrix_similarity]
  lens = [max(map(len, col)) for col in zip(*s)]
  fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
  table = [fmt.format(*row) for row in s]
  print '\n'.join(table)


  ####################################################################################################
  #
  # @brief:
  #

  matrix_similarity_jaccard = [[0 for x in range(len(dict_words))] for x in range(len(wn_words))]

  for iWnWord in range(len(wn_words)):

    tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wn.synset(wn_words[iWnWord].name()).definition()))
    words = [word for word, pos in tagged_sent if pos in ('NN', 'NNS', 'JJ', '', 'VB', 'VBN', 'VBD', 'RB')]

    words = [wordnet_lemmatizer.lemmatize(word) for word in words]
    wn_set = set(words)

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # word-word
    for iDictWord in range(len(dict_words)):

      if not dict_words[str(iDictWord)].has_key("d") or dict_words[str(iDictWord)]["d"] is None:
        matrix_similarity_jaccard[iWnWord][iDictWord] = 1
        continue

      tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(dict_words[str(iDictWord)]["d"]))
      words = [word for word, pos in tagged_sent if pos in ('NN', 'NNS', 'JJ', '', 'VB', 'VBN', 'VBD', 'RB')]

      words = [wordnet_lemmatizer.lemmatize(word) for word in words]
      dict_set = set(words)
      matrix_similarity_jaccard[iWnWord][iDictWord] = jaccard_distance(wn_set, dict_set)


  for iWnWord in range(len(wn_words)):
    for iDictWord in range(len(dict_words)):
      # blend: path-based similarity (weight 10) + jaccard similarity (weight 2)
      matrix_similarity[iWnWord][iDictWord] = matrix_similarity[iWnWord][iDictWord]*10 + 2*(1 - matrix_similarity_jaccard[iWnWord][iDictWord])
      matrix_similarity[iWnWord][iDictWord] /= 12

  ####################################################################################################

  print "----------------------------------------------------"
  s = [[str(e) for e in row] for row in matrix_similarity]
  lens = [max(map(len, col)) for col in zip(*s)]
  fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
  table = [fmt.format(*row) for row in s]
  print '\n'.join(table)


  ####################################################################################################
  #
  # write file

  # - - - - - - - - - - - - - - - - - - - - - - - - -
  # col
  for i in range(len(wn_words)):
    matrix_similarity[i].insert(0, wn.synset(wn_words[i].name()).definition())

  # - - - - - - - - - - - - - - - - - - - - - - - - -
  # row
  arrRowDict = []
  arrRowDict.append("--")
  for i in range(len(dict_words)):
    if not dict_words[str(i)].has_key('tv') or dict_words[str(i)]['tv'] is None:
      dict_words[str(i)]['tv'] = "--"
    arrRowDict.append(dict_words[str(i)]["tv"].encode('utf8'))

  FileProcess.write_to_excel_file("Results/"+WORD+"_synsets_synsets_nbest_withword_average.csv",arrRowDict,matrix_similarity)
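
FileProcess.write_to_excel_file is project code that is not shown here; a minimal stand-in, assuming it just writes the header row and the matrix as CSV:

import csv

def write_to_excel_file(path, header, rows):
  # hypothetical replacement for FileProcess.write_to_excel_file
  with open(path, 'wb') as f:  # 'wb' for the csv module on Python 2
    writer = csv.writer(f)
    writer.writerow(header)
    for row in rows:
      writer.writerow(row)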
Example #9
def get_nouns(word_origin, wn_words):
  
  wn_words_nouns = []

  for iWord in range(len(wn_words)):

    print "- - - - - - - - - - - - - - - - - - - - - - - - - - -"
    print iWord
    wn_words_nouns.append([])
    # get one sense of the word from wn_words
    wordDict = wn_words[iWord]

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # get the lemmas of this sense
    print "synsets -------"
    synset_of_word = wn.synset(wordDict.name())

    for lemma in synset_of_word.lemmas():

      lemma_name = lemma.name()
      if lemma_name != word_origin:

        print lemma_name
        wn_words_nouns[iWord].append(lemma_name)


    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # get hyponyms
    print "\nhyponyms ------"
    for hyponym in wn.synset(wordDict.name()).hyponyms():

      for lemma in wn.synset(hyponym.name()).lemmas():
        lemma_name = lemma.name()
        # compare against word_origin (the original hard-coded "bank")
        if lemma_name != word_origin:

          if lemma_name not in wn_words_nouns[iWord]:
            print lemma_name
            wn_words_nouns[iWord].append(lemma_name)
    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # get nouns from the definition
    print "\ndefinition ------"

    tagged_sent = POSWrapper.pos_tag(nltk.word_tokenize(wn.synset(wordDict.name()).definition()))
    nouns = [word for word, pos in tagged_sent if pos == 'NN']

    for noun in nouns:
      if noun != word_origin and noun != "sth" and noun not in wn_words_nouns[iWord]:
        print noun
        wn_words_nouns[iWord].append(noun)


  ########################################
  return wn_words_nouns
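
Hedged usage: wn_words is a list of WordNet Synset objects for word_origin, e.g.:

from nltk.corpus import wordnet as wn

wn_words = wn.synsets('bank', pos='n')
nouns_per_sense = get_nouns('bank', wn_words)
# nouns_per_sense[i] collects, for sense i: its other lemmas, hyponym
# lemmas, and the nouns of its definition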
Example #10
def get_synsets_n_v(word_origin, wn_words):

  wn_words_synset = []

  for iWord in range(len(wn_words)):

    print "- - - - - - - - - - - - - - - - - - - - - - - - - - -"
    print iWord
    wn_words_synset.append([])
    # get one sense of the word from wn_words
    wordDict = wn_words[iWord]

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # keep the sense itself
    synset_of_word = wn.synset(wordDict.name())
    wn_words_synset[iWord].append(synset_of_word)
    print synset_of_word
    print "---"


    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # get part meronyms
    print "---"
    for meronym in wn.synset(wordDict.name()).part_meronyms():
      print meronym
      wn_words_synset[iWord].append(meronym)


    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # get hyponyms
    print "---"
    for hyponym in wn.synset(wordDict.name()).hyponyms():
      print hyponym
      wn_words_synset[iWord].append(hyponym)

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # get noun/adjective words from the definition
    print "\ndefinition ------"

    tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wn.synset(wordDict.name()).definition()))
    print tagged_sent
    nouns = [word for word, pos in tagged_sent if pos in ('NN', 'NNS', 'JJ')]

    for noun in nouns:

      noun = wordnet_lemmatizer.lemmatize(noun, pos='n')

      if noun is None:
        continue

      synsetsDictNoun = wn.synsets(noun, pos="n")

      if len(synsetsDictNoun) > 0:
        synsetMax = synsetsDictNoun[0]
        p_max = 0

        for synsetNoun in synsetsDictNoun:
          p = synsetNoun.path_similarity(synset_of_word)
          if p > p_max:
            p_max = p
            # track the argmax (this line was commented out in the
            # original, so synsetMax never moved off the first synset)
            synsetMax = synsetNoun

        print synsetMax
        if synsetMax not in wn_words_synset[iWord]:
          wn_words_synset[iWord].append(synsetMax)



    tagged_sent = POSWrapper.pos_tag(nltk.word_tokenize(wn.synset(wordDict.name()).definition()))
    verbs = [word for word, pos in tagged_sent if pos in ('VB', 'VBD', 'VBN')]

    for verb in verbs:

      # lemmatize as a verb (the original used pos='n' here)
      verb = wordnet_lemmatizer.lemmatize(verb, pos='v')

      if verb is None:
        continue

      synsetsDictVerb = wn.synsets(verb, pos="v")

      if len(synsetsDictVerb) > 0:
        synsetMax = synsetsDictVerb[0]
        p_max = 0

        for synsetVerb in synsetsDictVerb:
          p = synsetVerb.path_similarity(synset_of_word)
          if p > p_max:
            p_max = p
            synsetMax = synsetVerb

        print synsetMax
        if synsetMax not in wn_words_synset[iWord]:
          wn_words_synset[iWord].append(synsetMax)


  ########################################
  return wn_words_synset
Example #11
def split_words(sen):
    tokens = nltk.wordpunct_tokenize(sen)
    tagged_words = POSWrapper.pos_tag(tokens)
    return tagged_words
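
A small check of split_words (POSWrapper.pos_tag is assumed to return (word, tag) pairs like nltk.pos_tag):

print split_words("the quick brown fox jumps")
# e.g. [('the', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'),
#       ('fox', 'NN'), ('jumps', 'VBZ')]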
def get_synsets_for_word_in_wn(word_origin, wn_synsets_for_word_origin):

  # arr synsets for arr words
  # each word has an array of synsets
  wn_synsets_for_words = []

  # add p
  p_synsets_for_words = []

  for iWord in range(len(wn_synsets_for_word_origin)):

    # print "- - - - - - - - - - - - - - - - - - - - - - - - - - -";
    # print iWord;
    wn_synsets_for_words.append([])

    # add p
    p_synsets_for_words.append([])

    # get one sense of the word
    wordDict = wn_synsets_for_word_origin[iWord]

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # keep the sense itself, with the highest weight
    synset_of_word = wn.synset(wordDict.name())
    wn_synsets_for_words[iWord].append(synset_of_word)

    # add p
    p_synsets_for_words[iWord].append(1.5)

    # print synset_of_word
    # print "---"

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # get hypernyms

    if PARAMETERS.DICT_WN_FEATURE_RELATION_hypernyms == 1:
      for hypernym in wn.synset(wordDict.name()).hypernyms():
        wn_synsets_for_words[iWord].append(hypernym)

        # add p
        p_synsets_for_words[iWord].append(1.2)


    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # get meronyms
    if PARAMETERS.DICT_WN_FEATURE_RELATION_part_meronyms == 1:
      for meronym in wn.synset(wordDict.name()).part_meronyms():
        wn_synsets_for_words[iWord].append(meronym)

        # add p
        p_synsets_for_words[iWord].append(1.2)

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # get holonyms
    if PARAMETERS.DICT_WN_FEATURE_RELATION_member_holonyms == 1:
      for holonym in wn.synset(wordDict.name()).member_holonyms():
        wn_synsets_for_words[iWord].append(holonym)

        # add p
        p_synsets_for_words[iWord].append(1.2)

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # get hyponyms
    if PARAMETERS.DICT_WN_FEATURE_RELATION_hyponyms == 1:
      for hyponym in wn.synset(wordDict.name()).hyponyms():
        wn_synsets_for_words[iWord].append(hyponym)

        # add p
        p_synsets_for_words[iWord].append(1.2)

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # get words from the definition

    if PARAMETERS.DICT_WN_FEATURE_RELATION_definition == 1:

      # print "\ndefinition ------";

      tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wn.synset(wordDict.name()).definition()))
      # print tagged_sent

      # - - - - - - - - - - - - - - - - - - - - - - - - - - -
      if PARAMETERS.POS_FEATURE_n == 1:
        nouns = [word for word, pos in tagged_sent if pos in ('NN', 'NNS', 'JJ')]

        for noun in nouns:

          noun = wordnet_lemmatizer.lemmatize(noun, pos='n')

          if noun is None:
            continue

          if noun != word_origin and noun != "sth":
            synsetsDictNoun = WordnetHandler.get_synsets_for_word(noun, "n")

            if len(synsetsDictNoun) > 0:
              synsetMax = synsetsDictNoun[0]
              p_max = 0

              for synsetNoun in synsetsDictNoun:
                p = synsetNoun.path_similarity(synset_of_word)
                if p > p_max:
                  p_max = p
                  synsetMax = synsetNoun

              if synsetMax not in wn_synsets_for_words[iWord]:
                wn_synsets_for_words[iWord].append(synsetMax)

                # add p
                p_synsets_for_words[iWord].append(1.)
      # - - - - - - - - - - - - - - - - - - - - - - - - - - -

      # - - - - - - - - - - - - - - - - - - - - - - - - - - -
      if PARAMETERS.POS_FEATURE_v == 1:
        verbs = [word for word, pos in tagged_sent if pos in ('VB', 'VBD', 'VBN')]

        for verb in verbs:

          verb = wordnet_lemmatizer.lemmatize(verb, pos='v')

          if verb is None:
            continue

          # compare against word_origin (the original hard-coded "bank")
          if verb != word_origin:
            synsetsDictVerb = WordnetHandler.get_synsets_for_word(verb, "v")

            if len(synsetsDictVerb) > 0:
              synsetMax = synsetsDictVerb[0]
              p_max = 0

              for synsetVerb in synsetsDictVerb:
                p = synsetVerb.path_similarity(synset_of_word)
                if p > p_max:
                  p_max = p
                  synsetMax = synsetVerb

              if synsetMax not in wn_synsets_for_words[iWord]:
                wn_synsets_for_words[iWord].append(synsetMax)

                # add p
                p_synsets_for_words[iWord].append(1.)

    # print wn_synsets_for_words[iWord]

  ########################################
  return wn_synsets_for_words, p_synsets_for_words
def get_nbest_synsets_for_word_in_oxford(dict_words,word_concept):

  dict_words_nouns = []
  dict_words_verbs = []
  dict_synsets_for_words = []

  wn_words = WordnetHandler.get_synsets_for_word(word_concept, 'n')

  # add p
  p_synsets_for_words = []

  for iWord in range(len(dict_words)):

    # print iWord;

    dict_words_nouns.append([])
    dict_words_verbs.append([])
    dict_synsets_for_words.append([])

    # add p
    p_synsets_for_words.append([])

    wordDict = dict_words[str(iWord)]

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    # sd

    if not wordDict.has_key('tv'):
      continue

    if not wordDict.has_key('d'):
      continue

    nouns = []
    if wordDict.has_key("sd") and PARAMETERS.DICT_OX_FEATURE_RELATION_sd == 1:
      tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["sd"]))
      nouns = [word for word, pos in tagged_sent if pos in ('NN', 'NNS') and word not in ('sth', 'etc')]

      if len(nouns) == 0:
        tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["d"]))
        nouns = [word for word, pos in tagged_sent if pos in ('NN', 'NNS') and word not in ('sth', 'etc')]

    elif wordDict.has_key("d") and wordDict["d"] is not None:
      tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["d"]))
      # 'and', as above (the original 'or' made the filter a no-op)
      nouns = [word for word, pos in tagged_sent if pos in ('NN', 'NNS') and word not in ('sth', 'etc')]
    else:
      continue

    for noun in nouns:
      noun = wordnet_lemmatizer.lemmatize(noun, pos='n')
      if noun is None:
        continue

      if noun != "sth" and noun != 'etc' and noun not in dict_words_nouns[iWord]:
        dict_words_nouns[iWord].append(noun)

    if len(dict_words_nouns[iWord]) == 0:
      continue

    # print dict_words_nouns[iWord]
    synsetsSD = []

    for word in dict_words_nouns[iWord]:
      synsets = WordnetHandler.get_synsets_for_word(word, 'n')
      for synset in synsets:
        synsetsSD.append(synset)

    if len(synsetsSD) == 0:
      continue

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    # d

    if PARAMETERS.DICT_OX_FEATURE_RELATION_d == 1:
      tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["d"]))
      nouns = [word for word, pos in tagged_sent if pos in ('NN', 'NNS')]

    if PARAMETERS.DICT_OX_FEATURE_RELATION_xh == 1:
      if wordDict.has_key('xh0') and wordDict['xh0'] is not None and wordDict['xh0'] != 'nn':
        nouns.append(wordDict['xh0'])
      if wordDict.has_key('xh1') and wordDict['xh1'] is not None:
        nouns.append(wordDict['xh1'])
      if wordDict.has_key('xh2') and wordDict['xh2'] is not None:
        nouns.append(wordDict['xh2'])

    # print  tagged_sent

    for noun in nouns:
      noun = wordnet_lemmatizer.lemmatize(noun, pos='n')
      if noun is None:
        continue

      if noun.encode('utf8') != word_concept and noun != "sth" and noun not in dict_words_nouns[iWord]:
        dict_words_nouns[iWord].append(noun)

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # print wordDict["tv"]
    # print dict_words_nouns[iWord]
    #
    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    # synsets

    iSDMax = 0
    pSD_max = 0

    for iSyn in range(len(synsetsSD)):
      synsetSD = synsetsSD[iSyn]

      arr_p = []
      for synset in wn_words:
        p = synsetSD.path_similarity(synset)
        # path_similarity can return None; treat that as 0
        arr_p.append(p if p else 0)

      arr_p = sorted(arr_p, reverse=True)

      # best similarity to any sense of word_concept
      pSD = arr_p[0] if arr_p else 0

      if pSD > pSD_max:
        pSD_max = pSD
        iSDMax = iSyn

    # print "\n"

    synsetRoot = synsetsSD[iSDMax]
    # print "synsetroot"
    # print synsetRoot

    for noun in dict_words_nouns[iWord]:
      synsets_noun = WordnetHandler.get_synsets_for_word(noun, 'n')
      if len(synsets_noun) <= 0:
        continue

      p_noun_max = 0
      synMax = synsets_noun[0]

      for synset_noun in synsets_noun:
        for synset in wn_words:
          p = synset.path_similarity(synset_noun)
          if p > p_noun_max:
            p_noun_max = p
            synMax = synset_noun

      if synMax not in dict_synsets_for_words[iWord]:
        dict_synsets_for_words[iWord].append(synMax)

    if PARAMETERS.POS_FEATURE_v:

      tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["d"]))
      verbs = [word for word, pos in tagged_sent if pos in ('VB', 'VBN', 'VBD')]

      for verb in verbs:
        verb = wordnet_lemmatizer.lemmatize(verb, pos='v')
        if verb is None:
          continue

        if verb.encode('utf8') != word_concept and verb != "sth" and verb not in dict_words_verbs[iWord]:
          dict_words_verbs[iWord].append(verb)

      # - - - - - - - - - - - - - - - - - - - - - - - - - - -
      # print dict_words_verbs[iWord]

      # - - - - - - - - - - - - - - - - - - - - - - - - - - -
      #
      # synsets

      iSDMax = 0
      pSD_max = 0

      for iSyn in range(len(synsetsSD)):
        synsetSD = synsetsSD[iSyn]

        arr_p = []
        for synset in wn_words:
          p = synsetSD.path_similarity(synset)
          arr_p.append(p if p else 0)

        arr_p = sorted(arr_p, reverse=True)

        # sum of the two best similarities
        pSD = sum(arr_p[:2])

        if pSD > pSD_max:
          pSD_max = pSD
          iSDMax = iSyn

      # print "\n"

      synsetRoot = synsetsSD[iSDMax]
      # print "synsetroot"
      # print synsetRoot

      for verb in dict_words_verbs[iWord]:
        synsets_verb = WordnetHandler.get_synsets_for_word(verb, 'v')
        if len(synsets_verb) <= 0:
          continue

        p_verb_max = 0
        synMax = synsets_verb[0]

        for synset_verb in synsets_verb:
          for synset in wn_words:
            p = synset.path_similarity(synset_verb)

            if p > p_verb_max:
              p_verb_max = p
              synMax = synset_verb

        if synMax not in dict_synsets_for_words[iWord]:
          dict_synsets_for_words[iWord].append(synMax)

    # print "dict_synsets_nouns"
    # print dict_synsets_for_words[iWord]

  ########################################
  return dict_synsets_for_words
Example #14
def get_synsets(dict_words):

    dict_words_nouns = []
    dict_synsets_nouns = []

    for iWord in range(len(dict_words)):

        print iWord

        dict_words_nouns.append([])
        dict_synsets_nouns.append([])

        wordDict = dict_words[iWord]

        # - - - - - - - - - - - - - - - - - - - - - - - - - - -
        #
        # sd

        tagged_sent = POSWrapper.pos_tag(nltk.word_tokenize(wordDict["sd"]))
        nouns = [word for word, pos in tagged_sent if (pos == "NN" or pos == "NNS")]

        for noun in nouns:
            if noun != "sth" and noun != "etc" and noun not in dict_words_nouns[iWord]:
                dict_words_nouns[iWord].append(noun)

        print dict_words_nouns[iWord]
        synsetsSD = wn.synsets(dict_words_nouns[iWord][-1], pos="n")

        # - - - - - - - - - - - - - - - - - - - - - - - - - - -
        #
        # d

        tagged_sent = POSWrapper.pos_tag(nltk.word_tokenize(wordDict["d"]))
        nouns = [word for word, pos in tagged_sent if (pos == "NN" or pos == "NNS")]

        for noun in nouns:
            if noun != wordDict["en"] and noun != "sth" and noun not in dict_words_nouns[iWord]:
                # print noun;
                dict_words_nouns[iWord].append(noun)

        # - - - - - - - - - - - - - - - - - - - - - - - - - - -
        print wordDict["tv"]
        print dict_words_nouns[iWord]

        # - - - - - - - - - - - - - - - - - - - - - - - - - - -
        #
        # synsets

        iSDMax = 0
        pSD_max = 0

        for iSyn in range(len(synsetsSD)):  # the original skipped the last synset
            synsetSD = synsetsSD[iSyn]
            pSD = 0

            for iNoun in range(len(dict_words_nouns[iWord]) - 1):
                if iNoun == 0:
                    continue
                synsets_noun = wn.synsets(dict_words_nouns[iWord][iNoun], pos="n")
                p_noun_max = 0

                for synset_noun in synsets_noun:
                    p = synsetSD.path_similarity(synset_noun)
                    if p > p_noun_max:
                        p_noun_max = p

                pSD += p_noun_max

            if pSD > pSD_max:
                pSD_max = pSD
                iSDMax = iSyn

        synsetRoot = synsetsSD[iSDMax]  # use the best-scoring sense (the original always took synsetsSD[0])
        print "synsetroot"
        print synsetRoot

        for noun in dict_words_nouns[iWord]:
            synsets_noun = wn.synsets(noun, pos="n")
            if len(synsets_noun) <= 0:
                continue

            p_noun_max = 0
            synMax = synsets_noun[0]

            for synset_noun in synsets_noun:
                p = synsetRoot.path_similarity(synset_noun)
                if p > p_noun_max:
                    p_noun_max = p
                    synMax = synset_noun

            dict_synsets_nouns[iWord].append(synMax)

        print "dict_synsets_nouns"
        print dict_synsets_nouns[iWord]

    ########################################
    return dict_synsets_nouns
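
Hedged usage for get_synsets; the entries need 'sd', 'd', 'en' and 'tv' fields (meanings inferred from how the code uses them):

dict_words = [{
  'en': 'bank',                    # English headword
  'tv': 'bank',                    # label printed in the trace
  'sd': 'a place for money',       # short definition
  'd':  'an organization offering financial services',
}]
print get_synsets(dict_words)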
Example #15
def get_nbest_synsets_n_v_x_with_word(dict_words, word_concept):

    dict_words_nouns = []
    dict_synsets_nouns = []

    wn_words = wn.synsets(word_concept, pos="n")

    for iWord in range(len(dict_words)):

        print iWord

        dict_words_nouns.append([])
        dict_synsets_nouns.append([])

        wordDict = dict_words[iWord]

        # - - - - - - - - - - - - - - - - - - - - - - - - - - -
        #
        # sd

        tagged_sent = POSWrapper.pos_tag(nltk.word_tokenize(wordDict["sd"]))
        nouns = [word for word, pos in tagged_sent if (pos == "NN" or pos == "NNS")]

        for noun in nouns:
            noun = wn.morphy(noun)
            if noun is None:
                continue

            if noun != "sth" and noun != "etc" and noun not in dict_words_nouns[iWord]:
                dict_words_nouns[iWord].append(noun)

        print dict_words_nouns[iWord]
        synsetsSD = wn.synsets(dict_words_nouns[iWord][-1], pos="n")

        # - - - - - - - - - - - - - - - - - - - - - - - - - - -
        #
        # d

        tagged_sent = POSWrapper.pos_tag(nltk.word_tokenize(wordDict["d"]))
        nouns = [word for word, pos in tagged_sent if (pos == "NN" or pos == "NNS")]

        for noun in nouns:
            noun = wn.morphy(noun)
            if noun is None:
                continue

            if noun != wordDict["en"] and noun != "sth" and noun not in dict_words_nouns[iWord]:
                # print noun;
                dict_words_nouns[iWord].append(noun)

        # - - - - - - - - - - - - - - - - - - - - - - - - - - -
        print wordDict["tv"]
        print dict_words_nouns[iWord]

        # - - - - - - - - - - - - - - - - - - - - - - - - - - -
        #
        # synsets

        iSDMax = 0
        pSD_max = 0

        for iSyn in range(len(synsetsSD)):  # the original skipped the last candidate
            synsetSD = synsetsSD[iSyn]

            arr_p = []

            for synset in wn_words:
                p = synsetSD.path_similarity(synset)
                # path_similarity can return None; treat that as 0
                arr_p.append(p if p else 0)

            arr_p = sorted(arr_p, reverse=True)

            # sum of the two best similarities to the senses of word_concept
            # (the original xrange(0, len(arr_p)-1) dropped the last element)
            pSD = sum(arr_p[:2])

            if pSD > pSD_max:
                pSD_max = pSD
                iSDMax = iSyn

        synsetRoot = synsetsSD[iSDMax]
        print "synsetroot"
        print synsetRoot

        for noun in dict_words_nouns[iWord]:
            synsets_noun = wn.synsets(noun, pos="n")
            if len(synsets_noun) <= 0:
                continue

            p_noun_max = 0
            synMax = synsets_noun[0]

            for synset_noun in synsets_noun:
                p = synsetRoot.path_similarity(synset_noun)
                if p > p_noun_max:
                    p_noun_max = p
                    synMax = synset_noun

            if synMax not in dict_synsets_nouns[iWord]:
                dict_synsets_nouns[iWord].append(synMax)
            if synsets_noun[0] not in dict_synsets_nouns[iWord]:
                dict_synsets_nouns[iWord].append(synsets_noun[0])


        tagged_sent = POSWrapper.pos_tag(nltk.word_tokenize(wordDict["d"]))
        nouns = [word for word, pos in tagged_sent if (pos == "VB" or pos == "VBN" or pos == "VBD")]
        print "VVVVV"
        print nouns
        for noun in nouns:
            noun = wn.morphy(noun)
            if noun == None:
                continue

            if noun != wordDict["en"] and noun != "sth" and noun not in dict_words_nouns[iWord]:
                # print noun;
                dict_words_nouns[iWord].append(noun)

        # - - - - - - - - - - - - - - - - - - - - - - - - - - -
        print wordDict["tv"]
        print dict_words_nouns[iWord]

        # - - - - - - - - - - - - - - - - - - - - - - - - - - -
        #
        # synsets

        iSDMax = 0
        pSD_max = 0

        for iSyn in range(len(synsetsSD)):
            synsetSD = synsetsSD[iSyn]

            arr_p = []

            for synset in wn_words:
                p = synsetSD.path_similarity(synset)
                arr_p.append(p if p else 0)

            arr_p = sorted(arr_p, reverse=True)

            pSD = sum(arr_p[:2])

            if pSD > pSD_max:
                pSD_max = pSD
                iSDMax = iSyn

        synsetRoot = synsetsSD[iSDMax]
        print "synsetroot"
        print synsetRoot

        for noun in dict_words_nouns[iWord]:
            synsets_noun = wn.synsets(noun, pos="v")
            if len(synsets_noun) <= 0:
                continue

            p_noun_max = 0
            synMax = synsets_noun[0]

            for synset_noun in synsets_noun:
                p = synsetRoot.path_similarity(synset_noun)
                if p > p_noun_max:
                    p_noun_max = p
                    synMax = synset_noun

            if synMax not in dict_synsets_nouns[iWord]:
                print synMax
                dict_synsets_nouns[iWord].append(synMax)
            if synsets_noun[0] not in dict_synsets_nouns[iWord]:
                dict_synsets_nouns[iWord].append(synsets_noun[0])

        # - - - - - - - - - - - - - - - - - - - - - - - - - - -
        #
        # x1

        if wordDict.has_key("x1"):
            tagged_sent = POSWrapper.pos_tag(nltk.word_tokenize(wordDict["x1"]))
            nouns = [word for word, pos in tagged_sent if (pos == "NN" or pos == "NNS")]

            for noun in nouns:
                noun = wn.morphy(noun)
                if noun is None:
                    continue

                if noun != wordDict["en"] and noun != "sth" and noun not in dict_words_nouns[iWord]:
                    # print noun;
                    dict_words_nouns[iWord].append(noun)

            # - - - - - - - - - - - - - - - - - - - - - - - - - - -
            print wordDict["tv"]
            print dict_words_nouns[iWord]

            # - - - - - - - - - - - - - - - - - - - - - - - - - - -
            #
            # synsets

            iSDMax = 0
            pSD_max = 0

            for iSyn in range(len(synsetsSD)):
                synsetSD = synsetsSD[iSyn]

                arr_p = []

                for synset in wn_words:
                    p = synsetSD.path_similarity(synset)
                    arr_p.append(p if p else 0)

                arr_p = sorted(arr_p, reverse=True)

                pSD = sum(arr_p[:2])

                if pSD > pSD_max:
                    pSD_max = pSD
                    iSDMax = iSyn

            synsetRoot = synsetsSD[iSDMax]
            print "synsetroot"
            print synsetRoot

            for noun in dict_words_nouns[iWord]:
                synsets_noun = wn.synsets(noun, pos="n")
                if len(synsets_noun) <= 0:
                    continue

                p_noun_max = 0
                synMax = synsets_noun[0]

                for synset_noun in synsets_noun:
                    p = synsetRoot.path_similarity(synset_noun)
                    if p > p_noun_max:
                        p_noun_max = p
                        synMax = synset_noun

                if synMax not in dict_synsets_nouns[iWord]:
                    dict_synsets_nouns[iWord].append(synMax)
                if synsets_noun[0] not in dict_synsets_nouns[iWord]:
                    dict_synsets_nouns[iWord].append(synsets_noun[0])


            tagged_sent = POSWrapper.pos_tag(nltk.word_tokenize(wordDict["x1"]))
            nouns = [word for word, pos in tagged_sent if (pos == "VB" or pos == "VBN" or pos == "VBD")]
            print "VVVVV"
            print nouns
            for noun in nouns:
                noun = wn.morphy(noun)
                if noun == None:
                    continue

                if noun != wordDict["en"] and noun != "sth" and noun not in dict_words_nouns[iWord]:
                    # print noun;
                    dict_words_nouns[iWord].append(noun)

            # - - - - - - - - - - - - - - - - - - - - - - - - - - -
            print wordDict["tv"]
            print dict_words_nouns[iWord]

            # - - - - - - - - - - - - - - - - - - - - - - - - - - -
            #
            # synsets

            iSDMax = 0
            pSD_max = 0

            for iSyn in range(len(synsetsSD)):
                synsetSD = synsetsSD[iSyn]

                arr_p = []

                for synset in wn_words:
                    p = synsetSD.path_similarity(synset)
                    arr_p.append(p if p else 0)

                arr_p = sorted(arr_p, reverse=True)

                pSD = sum(arr_p[:2])

                if pSD > pSD_max:
                    pSD_max = pSD
                    iSDMax = iSyn

            synsetRoot = synsetsSD[iSDMax]
            print "synsetroot"
            print synsetRoot

            for noun in dict_words_nouns[iWord]:
                synsets_noun = wn.synsets(noun, pos="v")
                if len(synsets_noun) <= 0:
                    continue

                p_noun_max = 0
                synMax = synsets_noun[0]

                for synset_noun in synsets_noun:
                    p = synsetRoot.path_similarity(synset_noun)
                    if p > p_noun_max:
                        p_noun_max = p
                        synMax = synset_noun

                if synMax not in dict_synsets_nouns[iWord]:
                    print synMax
                    dict_synsets_nouns[iWord].append(synMax)
                if synsets_noun[0] not in dict_synsets_nouns[iWord]:
                    dict_synsets_nouns[iWord].append(synsets_noun[0])

        print "dict_synsets_nouns"
        print dict_synsets_nouns[iWord]

    ########################################
    return dict_synsets_nouns
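
All of the snippets above assume the same module-level setup; a hedged sketch of the imports they rely on (POSWrapper, WordnetProcess, WordnetHandler, FileProcess and PARAMETERS are project-specific modules):

import nltk
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.metrics import jaccard_distance

import POSWrapper       # POS-tagger wrapper
import WordnetProcess   # WordNet helpers (get_synsets_n_v, ...)
import WordnetHandler   # more WordNet helpers (get_synsets_for_word, ...)
import FileProcess      # CSV/Excel output helpers
import PARAMETERS       # feature flags (DICT_WN_FEATURE_RELATION_*, ...)

wordnet_lemmatizer = WordNetLemmatizer()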