Пример #1
0
def synsets_for_word(word_pos):
    """Look up the WordNet synsets for a ``(word, pos)`` pair.

    The noun predicate is consulted first and the verb predicate second;
    both are always evaluated.  When both accept the tag, the result of the
    verb lookup is the one returned.  When neither accepts it, an empty
    list is returned.
    """
    word, pos = word_pos
    found = []
    for accepts, wn_pos in ((pos_is_noun, "n"), (pos_is_verb, "v")):
        if accepts(pos):
            found = WordnetHandler.get_synsets_for_word(word, wn_pos)
    return found
Пример #2
0
def sim_ox_wn_value_main_synsets(word):
  """Build a WordNet-by-Oxford matrix of cosine values for ``word``.

  Entry ``[i][j]`` is ``spatial.distance.cosine`` between column ``j`` of the
  definition-similarity matrix returned by
  ``sim_ox_wn_defi_WDS_via_main_syns(word)`` and the i-th WordNet value
  vector.  The Oxford value vectors only determine the number of columns
  (and are printed); they do not enter the computation.

  Side effects: prints the definition-similarity matrix through
  ``DebugHandler``, then every WordNet vector, a blank separator and every
  Oxford vector.
  """
  wn_values_by_key = WordnetParseDefinition.get_dict_vectors_value_for(word)
  noun_synsets = WordnetHandler.get_synsets_for_word(word, 'n')
  ox_values_by_key = OxParseDefinition.get_vectors_value_for_word(word, noun_synsets)

  _, vectors_wn = Util.get_keys_values_of_dict(wn_values_by_key)
  _, vectors_ox = Util.get_keys_values_of_dict(ox_values_by_key)

  defi_sim = sim_ox_wn_defi_WDS_via_main_syns(word)
  DebugHandler.print_2d_matrix(defi_sim)

  n_wn = len(vectors_wn)
  n_ox = len(vectors_ox)

  # Transpose so that each row belongs to one Oxford sense.
  defi_sim_by_ox = [[defi_sim[i][j] for i in range(n_wn)] for j in range(n_ox)]

  m2d_sim = []
  for i in range(n_wn):
    vector_wn = vectors_wn[i]
    print(vector_wn)
    m2d_sim.append([
      spatial.distance.cosine(defi_sim_by_ox[j], vector_wn)
      for j in range(n_ox)
    ])

  print("\n")
  for j in range(n_ox):
    print(vectors_ox[j])
  return m2d_sim
Пример #3
0
def get_greatest_synsets_similarity_between(synsets_wn, nouns):
  """Return the synset in ``synsets_wn`` that is most similar to ``nouns``.

  The score of a candidate synset is the mean, over all ``nouns``, of the
  mean ``WordnetHandler.cal_similarity`` between the candidate and every
  noun synset of that noun.  A noun without noun synsets adds nothing to the
  sum but still counts in the denominator.

  Args:
    synsets_wn: candidate synsets.
    nouns: words to compare the candidates against.

  Returns:
    The candidate with the highest score strictly above 0 (the first one
    wins ties), or ``None`` when ``nouns`` is empty (after printing
    "no nouns") or no candidate scores above 0.
  """
  if len(nouns) == 0:
    print("no nouns")
    return None

  # The synsets of each noun do not depend on the candidate: look them up once.
  synsets_per_noun = [WordnetHandler.get_synsets_for_word(noun, 'n') for noun in nouns]

  synset_wn_max = None
  p_max = 0
  for synset_wn in synsets_wn:
    p_noun = 0
    for synsets_of_noun in synsets_per_noun:
      if len(synsets_of_noun) > 0:
        p_each_noun = 0
        for synset_of_noun in synsets_of_noun:
          p_each_noun += WordnetHandler.cal_similarity(synset_wn, synset_of_noun)
        p_noun += p_each_noun/len(synsets_of_noun)

    p = p_noun/len(nouns)
    if p > p_max:
      # Remember the best score so far; without this, any later candidate
      # with a positive score would replace a better earlier one.
      p_max = p
      synset_wn_max = synset_wn

  return synset_wn_max
Пример #4
0
def sim_ox_wn_defi_WDS_via_main_syns(word):
  """Compare the Oxford and WordNet synset vectors of ``word``.

  The Oxford vectors are built against all noun synsets of ``word``.  The
  result is whatever ``sim_wn_ox_vector(vectors_ox, vectors_wn)`` returns.
  ``cal_sim_ngrams(word)`` is also invoked; its return value is discarded.
  """
  wn_vectors_by_key = WordnetParseDefinition.get_dict_vectors_synsets_for_word(word)
  noun_synsets = WordnetHandler.get_synsets_for_word(word, 'n')
  ox_vectors_by_key = OxParseDefinition.get_dict_vectors_synsets_for_word(word, noun_synsets)

  _, vectors_wn = Util.get_keys_values_of_dict(wn_vectors_by_key)
  _, vectors_ox = Util.get_keys_values_of_dict(ox_vectors_by_key)

  m2d_sim = sim_wn_ox_vector(vectors_ox, vectors_wn)

  cal_sim_ngrams(word)

  # An export of the matrix to 'Results/vector_definition/<word>.csv' used to
  # happen here; it is currently disabled.
  return m2d_sim
def get_dict_vectors_value_for(word):
  """Map each noun-synset definition of ``word`` to its value vector.

  Keys are the definition strings, in synset order; values come from
  ``get_value_synset_for(synset, all_noun_synsets)``.  Synsets sharing the
  same definition text end up under one key (the later value wins).
  """
  noun_synsets = WordnetHandler.get_synsets_for_word(word, 'n')
  by_definition = OrderedDict()
  for current in noun_synsets:
    by_definition[current.definition()] = get_value_synset_for(current, noun_synsets)
  return by_definition
def get_gloss_for_jacc(word):
  """Map each noun-synset definition of ``word`` to its gloss data.

  Keys are the definition strings, in synset order; values come from
  ``get_gloss_synset_for(synset)``.  Synsets sharing the same definition
  text end up under one key (the later value wins).
  """
  noun_synsets = WordnetHandler.get_synsets_for_word(word, 'n')
  by_definition = OrderedDict()
  for current in noun_synsets:
    by_definition[current.definition()] = get_gloss_synset_for(current)
  return by_definition
def get_dict_vectors_words_for_word(word):
  """Map each noun-synset definition of ``word`` to its preprocessed form.

  Keys are the raw definition strings, in synset order; values are the
  result of ``PreprocessDefinition.preprocess_sentence`` applied to that
  same definition.
  """
  noun_synsets = WordnetHandler.get_synsets_for_word(word, 'n')
  by_definition = OrderedDict()
  for current in noun_synsets:
    by_definition[current.definition()] = PreprocessDefinition.preprocess_sentence(
      current.definition())
  return by_definition
Пример #8
0
def sim_ox_wn_defi_WDS_via_1_main_syn(word):
  """Compare the Oxford and WordNet synset vectors of ``word``.

  The Oxford vectors are built against all noun synsets of ``word``; the
  result is whatever ``sim_wn_ox_vector(vectors_ox, vectors_wn)`` returns.
  """
  wn_vectors_by_key = WordnetParseDefinition.get_dict_vectors_synsets_for_word(word)
  noun_synsets = WordnetHandler.get_synsets_for_word(word, 'n')
  ox_vectors_by_key = OxParseDefinition.get_dict_vectors_synsets_for_word(word, noun_synsets)

  _, vectors_wn = Util.get_keys_values_of_dict(wn_vectors_by_key)
  _, vectors_ox = Util.get_keys_values_of_dict(ox_vectors_by_key)

  return sim_wn_ox_vector(vectors_ox, vectors_wn)
def get_greatest_synset_similarity_between(synset_1, noun_2):
  """Return the first WordNet synset of the word in ``noun_2``, or ``None``.

  ``noun_2`` is a ``(word, pos)`` pair.  The tag is ignored: the noun
  synsets of the word are followed by its verb synsets and the first entry
  of that combined list is returned (``None`` when the list is empty).

  ``synset_1`` is not used at the moment: the similarity/frequency weighted
  selection that consumed it is disabled.
  """
  word, _pos = noun_2

  candidates = (WordnetHandler.get_synsets_for_word(word, 'n')
                + WordnetHandler.get_synsets_for_word(word, 'v'))

  # NOTE: this smoothed frequency total only feeds the disabled weighting
  # logic; it is still computed so the same lookups are performed.
  total_count = 0.1 + len(candidates)*__SMOOTH_WEIGHT__
  for candidate in candidates:
    total_count += WordnetHandler.get_freq_count_of_synset(candidate)

  if len(candidates) > 0:
    return candidates[0]
  return None
Пример #10
0
def sim_ox_wn_defi_WDS_via_align(word):
  """Score every WordNet definition of ``word`` against every Oxford one.

  Entry ``[i][j]`` of the returned matrix is
  ``WordnetHandler.sim_for_words_words_no_order`` applied to the j-th Oxford
  word list, the i-th WordNet word list and the i-th noun synset of
  ``word``.  ``cal_sim_ngrams(word)`` is also invoked; its return value is
  discarded.
  """
  wn_words_by_key = WordnetParseDefinition.get_dict_vectors_words_for_word(word)
  keys_wn, vectors_wn = Util.get_keys_values_of_dict(wn_words_by_key)
  ox_words_by_key = OxParseDefinition.get_dict_vectors_word_for_word(word)
  keys_ox, vectors_ox = Util.get_keys_values_of_dict(ox_words_by_key)

  synsets_wn = WordnetHandler.get_synsets_for_word(word, 'n')

  n_wn = len(keys_wn)
  n_ox = len(keys_ox)

  m2d_sim = []
  for i in range(n_wn):
    vector_wn = vectors_wn[i]
    # The i-th synset is paired with the i-th WordNet definition.
    synset_wn = synsets_wn[i]
    m2d_sim.append([
      WordnetHandler.sim_for_words_words_no_order(vectors_ox[j], vector_wn, synset_wn)
      for j in range(n_ox)
    ])

  cal_sim_ngrams(word)

  return m2d_sim
Пример #11
0
def sim_ox_wn_defi_WDS_via_curr_main_syn(word):
  """Score WordNet senses against Oxford senses, one WordNet synset at a time.

  For the i-th WordNet vector the Oxford vectors are rebuilt against only
  the i-th noun synset of ``word``; entry ``[i][j]`` is then
  ``sim_2_vector`` of the j-th Oxford vector and the i-th WordNet vector.
  The matrix has one column per Oxford definition of ``word``; columns that
  receive no Oxford vector keep the value 0.  ``cal_sim_ngrams(word)`` is
  also invoked; its return value is discarded.
  """
  wn_vectors_by_key = WordnetParseDefinition.get_dict_vectors_synsets_for_word(word)
  _, vectors_wn = Util.get_keys_values_of_dict(wn_vectors_by_key)
  synsets_wn = WordnetHandler.get_synsets_for_word(word, 'n')

  definitions = OxfordParser.get_definitions_of_word(word)
  n_cols = len(definitions)

  m2d_sim = [[0] * n_cols for _ in range(len(vectors_wn))]

  for i, vector_wn in enumerate(vectors_wn):
    ox_vectors_by_key = OxParseDefinition.get_dict_vectors_synsets_for_word(
      word, [synsets_wn[i]])
    _, vectors_ox = Util.get_keys_values_of_dict(ox_vectors_by_key)

    row = m2d_sim[i]
    for j, vector_ox in enumerate(vectors_ox):
      row[j] = sim_2_vector(vector_ox, vector_wn)

  cal_sim_ngrams(word)

  return m2d_sim
Пример #12
0
def create_input_for_test_svm():
  """Write the SVM test input for every usable noun of the Oxford dictionary.

  A word is skipped when it has no Oxford senses, or when it has exactly one
  Oxford sense and exactly one WordNet noun synset.  For every other word
  the readable sense listing and the feature values are written to the
  test files.
  """
  dict_ox = OxfordParser.get_dict_nouns()
  for word in dict_ox:
    syns_ox = dict_ox[word]
    if len(syns_ox) == 0:
      continue

    syns_wn = WordnetHandler.get_synsets_for_word(word, 'n')

    # A single sense on both sides leaves nothing to classify.
    if len(syns_ox) == 1 and len(syns_wn) == 1:
      continue

    write_sens_for_reading(syns_wn, syns_ox, __filename_input_sen_test__)
    cal_features_and_write_to_file_for(syns_wn, syns_ox, __filename_input_test_feature_values__)
def similarity_nbest_withword_average(WORD, dict_words):
  """Blend two directional similarity matrices with a Jaccard-based term.

  The dictionary-to-WordNet matrix is averaged cell by cell with the
  transposed WordNet-to-dictionary matrix; each cell is then mixed with
  ``1 - jaccard`` using ``PARAMETERS.JACCARD_WEIGHT``.

  Returns:
    ``(matrix, wn_synsets)`` where the matrix is indexed
    ``[wordnet sense][dictionary sense]``, or ``([], [])`` when the first
    similarity matrix is unavailable.
  """
  wn_words = WordnetHandler.get_synsets_for_word(WORD, 'n')

  matrix_similarity = similarity_by_synsets_synsets_nbest_withword_dict_wn(WORD, dict_words, wn_words)
  if matrix_similarity is None:
    return [], []

  matrix_similarity_reverse = similarity_by_synsets_synsets_nbest_withword_wn_dict(WORD, wn_words, dict_words)

  n_wn = len(wn_words)
  n_dict = len(dict_words)

  # First pass: average both directions (the reverse matrix is transposed).
  for i_wn in range(n_wn):
    row = matrix_similarity[i_wn]
    for i_dict in range(n_dict):
      row[i_dict] = (row[i_dict] + matrix_similarity_reverse[i_dict][i_wn]) / 2

  matrix_similarity_jaccard = similarity_by_jaccard(WORD, dict_words, wn_words)

  # Second pass: weighted mix with the Jaccard term.
  for i_wn in range(n_wn):
    row = matrix_similarity[i_wn]
    jaccard_row = matrix_similarity_jaccard[i_wn]
    for i_dict in range(n_dict):
      row[i_dict] = row[i_dict]*(1-PARAMETERS.JACCARD_WEIGHT) + PARAMETERS.JACCARD_WEIGHT*(1-jaccard_row[i_dict])

  return matrix_similarity, wn_words
Пример #14
0
def create_input_for_train_svm():
  """Write the SVM training input for Oxford nouns that have gold data.

  Words without Oxford senses or without an entry in the gold data are
  skipped, as are words with exactly one Oxford sense and exactly one
  WordNet noun synset.  For every other word the labels, the readable sense
  listing and the feature values are written.

  Processing stops completely at the first eligible word equal to "brook".
  """
  dict_ox = OxfordParser.get_dict_nouns()
  dict_gold = CompareWithGold.goldData

  for word in dict_ox:
    if len(dict_ox[word]) == 0 or word not in dict_gold:
      continue

    # Hard stop: nothing from "brook" onwards goes into the training set.
    if word == "brook":
      return

    syns_wn = WordnetHandler.get_synsets_for_word(word, 'n')
    syns_ox = dict_ox[word]

    # A single sense on both sides leaves nothing to classify.
    if len(syns_ox) == 1 and len(syns_wn) == 1:
      continue

    write_label_for_svm(syns_wn, syns_ox, dict_gold[word])
    write_sens_for_reading(syns_wn, syns_ox, __filename_input_sen_train__)
    cal_features_and_write_to_file_for(syns_wn, syns_ox, __filename_input_train_feature_values__)
Пример #15
0
def pair_0_1_reducing_m2d_sim(matrix_similarity, num_rows, num_cols, word):
  """Mark confident matches in ``matrix_similarity`` in place and return it.

  Behaviour depends on the matrix shape:

  * 1 x 1: the cell becomes 1 when it exceeds ``CHOICE_1_1_MIN``.
  * several rows, 1 column: the largest cell becomes 1 when it is at least
    ``CHOICE_1_COL_RANGE_FIRST`` times the second largest, or exceeds
    ``CHOICE_1_COL_MIN_FIRST``.
  * at least 1 row, several columns: the matrix is reduced repeatedly via
    ``reducing_m2d_sim``; after each round that reported an update the
    similarities are recomputed for the current synset status and merged
    back with ``match_matrix_sim_with_temp_matrix``.

  Any other shape leaves the matrix untouched.
  """
  if num_cols == 1:
    if num_rows == 1:
      if matrix_similarity[0][0] > Parameters.PARAMETERS_CHOICE_0_1.CHOICE_1_1_MIN:
        matrix_similarity[0][0] = 1
    elif num_rows > 1:
      column = [matrix_similarity[row][0] for row in range(num_rows)]
      # Row indices of the two largest values, largest first.
      best, second = heapq.nlargest(2, range(num_rows), key=column.__getitem__)
      limits = Parameters.PARAMETERS_CHOICE_0_1
      if (column[best] >= limits.CHOICE_1_COL_RANGE_FIRST*column[second]
          or column[best] > limits.CHOICE_1_COL_MIN_FIRST):
        matrix_similarity[best][0] = 1
  elif num_cols > 1 and num_rows >= 1:
    synsets_wn = WordnetHandler.get_synsets_for_word(word, 'n')
    status_synsets = create_status_array(synsets_wn)
    updated = reducing_m2d_sim(matrix_similarity, status_synsets)
    while updated == 1:
      m2d = sim_ox_wn_defi_WDS_via_main_syns_for_reduce(synsets_wn, status_synsets, word)
      updated = reducing_m2d_sim(m2d, status_synsets)
      match_matrix_sim_with_temp_matrix(matrix_similarity, m2d)

  return matrix_similarity
def _ox_tokens_with_pos(text, pos_tags, excluded=()):
  """Return the tokens of ``text`` tagged with one of ``pos_tags``, minus ``excluded``."""
  tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(text))
  return [token for token, tag in tagged_sent
          if tag in pos_tags and token not in excluded]


def _ox_collect_lemmas(tokens, wn_pos, collected, skipped=("sth",), word_concept=None):
  """Lemmatise ``tokens`` and append each new, non-skipped lemma to ``collected``."""
  for token in tokens:
    lemma = wordnet_lemmatizer.lemmatize(token, pos=wn_pos)
    if lemma is None or lemma in skipped or lemma in collected:
      continue
    # The headword itself must not be used to describe its own sense.
    if word_concept is not None and lemma.encode('utf8') == word_concept:
      continue
    collected.append(lemma)


def _ox_best_synset_for(candidates, wn_synsets):
  """Return the candidate most path-similar to any of ``wn_synsets`` (default: the first)."""
  best = candidates[0]
  best_p = 0
  for candidate in candidates:
    for wn_synset in wn_synsets:
      p = wn_synset.path_similarity(candidate)
      # A missing similarity (None) never beats the current best.
      if p is not None and p > best_p:
        best_p = p
        best = candidate
  return best


def get_nbest_synsets_for_word_in_oxford(dict_words, word_concept):
  """Select one WordNet synset per content word of each Oxford sense.

  For every sense, content nouns (and optionally verbs) are extracted from
  its texts, lemmatised, and each lemma is mapped to the one of its synsets
  that is most path-similar to any noun synset of ``word_concept``.

  Args:
    dict_words: mapping whose keys are the strings "0" .. str(len - 1); each
      value is a mapping describing one Oxford sense.  Only the keys 'tv',
      'd', 'sd', 'xh0', 'xh1' and 'xh2' are read here.  Their meaning is not
      visible in this file -- presumably 'd' is the definition text and
      'sd' a short definition; confirm against the Oxford parser.
    word_concept: the headword.  Its noun synsets are the reference for the
      similarity comparison, and the word itself is excluded from the extra
      nouns and verbs.

  Returns:
    A list with one entry per sense, each a list of distinct synsets (those
    chosen for nouns first, then those chosen for verbs when
    ``PARAMETERS.POS_FEATURE_v`` is set).  The entry stays empty when the
    sense lacks 'tv' or 'd', yields no seed noun, or none of its seed nouns
    has a noun synset.
  """
  noun_tags = ('NN', 'NNS')
  verb_tags = ('VB', 'VBN', 'VBD')
  placeholders = ('sth', 'etc')

  wn_words = WordnetHandler.get_synsets_for_word(word_concept, 'n')
  dict_synsets_for_words = []

  for iWord in range(len(dict_words)):
    synsets_of_sense = []
    dict_synsets_for_words.append(synsets_of_sense)

    wordDict = dict_words[str(iWord)]
    if 'tv' not in wordDict or 'd' not in wordDict:
      continue

    # - - - seed nouns: from 'sd' when enabled and present, else from 'd' - - -
    if 'sd' in wordDict and PARAMETERS.DICT_OX_FEATURE_RELATION_sd == 1:
      nouns = _ox_tokens_with_pos(wordDict['sd'], noun_tags, placeholders)
      if len(nouns) == 0:
        nouns = _ox_tokens_with_pos(wordDict['d'], noun_tags, placeholders)
    elif wordDict['d'] is not None:
      # The placeholders are excluded here exactly as in the 'sd' branch
      # (this filter used to be `!= 'sth' or != 'etc'`, which is always true).
      nouns = _ox_tokens_with_pos(wordDict['d'], noun_tags, placeholders)
    else:
      continue

    nouns_of_sense = []
    _ox_collect_lemmas(nouns, 'n', nouns_of_sense, skipped=placeholders)
    if len(nouns_of_sense) == 0:
      continue

    # The sense is only usable when some seed noun is known to WordNet.
    if not any(len(WordnetHandler.get_synsets_for_word(noun, 'n')) > 0
               for noun in nouns_of_sense):
      continue

    # - - - extra nouns: full definition and cross references - - -
    if PARAMETERS.DICT_OX_FEATURE_RELATION_d == 1:
      nouns = _ox_tokens_with_pos(wordDict['d'], noun_tags)

    if PARAMETERS.DICT_OX_FEATURE_RELATION_xh == 1:
      for key in ('xh0', 'xh1', 'xh2'):
        value = wordDict.get(key)
        if value is None:
          continue
        # 'nn' in 'xh0' is not treated as a word.
        if key == 'xh0' and value == 'nn':
          continue
        nouns.append(value)

    _ox_collect_lemmas(nouns, 'n', nouns_of_sense, word_concept=word_concept)

    # - - - one synset per noun - - -
    for noun in nouns_of_sense:
      synsets_noun = WordnetHandler.get_synsets_for_word(noun, 'n')
      if len(synsets_noun) == 0:
        continue
      synMax = _ox_best_synset_for(synsets_noun, wn_words)
      if synMax not in synsets_of_sense:
        synsets_of_sense.append(synMax)

    # - - - one synset per verb of the definition - - -
    if PARAMETERS.POS_FEATURE_v:
      verbs = _ox_tokens_with_pos(wordDict['d'], verb_tags)
      verbs_of_sense = []
      _ox_collect_lemmas(verbs, 'v', verbs_of_sense, word_concept=word_concept)

      # NOTE(review): the reference synsets are nouns; if the similarity
      # between a noun and a verb synset is always None, the first verb
      # synset is always the one chosen -- confirm this is intended.
      for verb in verbs_of_sense:
        synsets_verb = WordnetHandler.get_synsets_for_word(verb, 'v')
        if len(synsets_verb) == 0:
          continue
        synMax = _ox_best_synset_for(synsets_verb, wn_words)
        if synMax not in synsets_of_sense:
          synsets_of_sense.append(synMax)

  return dict_synsets_for_words
def _wn_best_synset_towards(candidates, target):
  """Return the candidate most path-similar to ``target`` (default: the first)."""
  best = candidates[0]
  best_p = 0
  for candidate in candidates:
    p = candidate.path_similarity(target)
    # A missing similarity (None) never beats the current best.
    if p is not None and p > best_p:
      best_p = p
      best = candidate
  return best


def _wn_add_definition_synsets(tagged_sent, pos_tags, wn_pos, skipped, target, synsets, weights):
  """Add, with weight 1.0, the best synset of every definition word tagged with ``pos_tags``."""
  for token, tag in tagged_sent:
    if tag not in pos_tags:
      continue
    lemma = wordnet_lemmatizer.lemmatize(token, pos=wn_pos)
    if lemma is None or lemma in skipped:
      continue
    candidates = WordnetHandler.get_synsets_for_word(lemma, wn_pos)
    if len(candidates) == 0:
      continue
    best = _wn_best_synset_towards(candidates, target)
    if best not in synsets:
      synsets.append(best)
      weights.append(1.)


def get_synsets_for_word_in_wn(word_origin, wn_synsets_for_word_origin):
  """Collect, for each WordNet sense of a word, its related synsets and weights.

  For every sense the collected list starts with the sense itself
  (weight 1.5).  Depending on the ``PARAMETERS.DICT_WN_FEATURE_RELATION_*``
  switches it is extended with the hypernyms, part meronyms, member holonyms
  and hyponyms of the sense (weight 1.2 each) and with one synset per
  content word of the sense's definition (weight 1.0): the synset of that
  word that is most path-similar to the sense.  Definition words equal to
  ``word_origin`` are ignored.

  Args:
    word_origin: the headword whose senses are being expanded.
    wn_synsets_for_word_origin: the WordNet synsets (senses) of that word.

  Returns:
    ``(synsets, weights)``: two parallel lists with one inner list per
    sense; ``weights[i][k]`` belongs to ``synsets[i][k]``.
  """
  wn_synsets_for_words = []
  p_synsets_for_words = []

  for sense in wn_synsets_for_word_origin:
    synset_of_word = wn.synset(sense.name())

    synsets = [synset_of_word]
    weights = [1.5]
    wn_synsets_for_words.append(synsets)
    p_synsets_for_words.append(weights)

    # - - - structural relations of the sense - - -
    relations = (
      (PARAMETERS.DICT_WN_FEATURE_RELATION_hypernyms, synset_of_word.hypernyms),
      (PARAMETERS.DICT_WN_FEATURE_RELATION_part_meronyms, synset_of_word.part_meronyms),
      (PARAMETERS.DICT_WN_FEATURE_RELATION_member_holonyms, synset_of_word.member_holonyms),
      (PARAMETERS.DICT_WN_FEATURE_RELATION_hyponyms, synset_of_word.hyponyms),
    )
    for enabled, related_synsets in relations:
      if enabled == 1:
        for related in related_synsets():
          synsets.append(related)
          weights.append(1.2)

    # - - - content words of the definition - - -
    if PARAMETERS.DICT_WN_FEATURE_RELATION_definition == 1:
      tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(synset_of_word.definition()))

      if PARAMETERS.POS_FEATURE_n == 1:
        # Adjectives ('JJ') are looked up as nouns as well.
        _wn_add_definition_synsets(tagged_sent, ('NN', 'NNS', 'JJ'), 'n',
                                   (word_origin, "sth"),
                                   synset_of_word, synsets, weights)

      if PARAMETERS.POS_FEATURE_v == 1:
        # Skip the headword itself (this used to compare against the
        # hard-coded word "bank" instead of ``word_origin``).
        _wn_add_definition_synsets(tagged_sent, ('VB', 'VBD', 'VBN'), 'v',
                                   (word_origin,),
                                   synset_of_word, synsets, weights)

  return wn_synsets_for_words, p_synsets_for_words
Пример #18
0
def sim_ox_wn_via_svm():
  """Evaluate the SVM-based Oxford/WordNet sense matching against gold data.

  For every Oxford noun a sense-pair matrix is obtained (a trivial ``[[1]]``
  when both sides have exactly one sense, otherwise the SVM result), reduced
  to 0/1 choices with ``choose_pair_0_1`` and compared with the gold data.
  Words without an SVM matrix are skipped; words for which the comparison
  reports ``tp == -1`` do not contribute to the confusion counts.

  The totals and the derived precision, recall, F-score and accuracy are
  printed and appended to the results file.

  Returns:
    ``(f_score, current_params)`` where ``current_params`` is a deep copy of
    ``Parameters.get_current_params()``.
  """
  # Small non-zero start values keep the ratios below defined.
  total_tp = total_tn = total_fn = total_fp = 0.00001
  total_pair = 0

  dict_ox = OxfordParser.get_dict_nouns()
  for word in dict_ox:
    word_syns_ox = dict_ox[word]
    wn_synsets = WordnetHandler.get_synsets_for_word(word, "n")

    if len(word_syns_ox) == 1 and len(wn_synsets) == 1:
      m2d_sim = [[1]]
    else:
      m2d_sim = get_m2d_sim_for_word_from_svm_result(word)

    if m2d_sim is None:
      continue

    m2d_sim = choose_pair_0_1(m2d_sim, len(m2d_sim), len(m2d_sim[0]))

    total_pair += count_pair(m2d_sim)

    tp, tn, fn, fp = CompareWithGold.compareGoldWithResult_without_cal_result(m2d_sim, word)
    if tp != -1:
      total_tp += tp
      total_tn += tn
      total_fn += fn
      total_fp += fp

  precision = total_tp / (total_tp + total_fp)
  recall = total_tp / (total_tp + total_fn)
  accuracy = (total_tp + total_tn) / (total_tp + total_tn + total_fp + total_fn)

  if precision != 0 or recall != 0:
    f_score = 2*(precision*recall)/(precision + recall)
  else:
    f_score = 0

  print("total:")
  for value in (total_pair, total_tp, total_tn, total_fn, total_fp,
                precision, recall, f_score, accuracy):
    print(value)

  Parameters.append_result_to_file( precision, recall, f_score, accuracy)
  current_params = copy.deepcopy(Parameters.get_current_params())
  return f_score, current_params