Пример #1
0
def similarity_by_synsets_synsets_nbest_withword(WORD, dict_words):

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # dictionary data
  dict_words_synsets = get_nbest_synsets_n_v_with_word(dict_words,WORD);
  print "dict-word_synsets"
  print dict_words_synsets

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # wordnet data

  wn_words = wn.synsets(WORD, pos = 'n');
  print "wn_words -------"
  print wn_words;

  wn_words_synsets = WordnetProcess.get_synsets_n_v(WORD, wn_words);

  print wn_words_synsets

  # matrix for similarity dict_words vs wn_words
  matrix_similarity = [[0 for x in range(len(dict_words))] for x in range(len(wn_words))];

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -

  ####################################################################################################
  #
  # calculate 2d matrix of p

  for iWnWord in range(len(wn_words)):

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # word-word
    for iDictWord in range(len(dict_words)):

      p_iWnWord_iDictWord = 0.;

      arr_p_word = [];
      #
      for dict_synset in dict_words_synsets[iDictWord]:

        # print "------------ dict noun"
        # print dictNoun;
        p_dictNoun_wnNouns = 0;

        # for some nouns don't have synsets

        arr_p  = [];

        # - - - - - - - - - - - - - - - - - - - - - - - -

        for wn_synset in wn_words_synsets[iWnWord]:
          #
          p_max = dict_synset.path_similarity(wn_synset);
          if p_max == None:
            continue

          arr_p.append(p_max);

          # print p_max

        arr_p = sorted(arr_p, reverse=True);

        nBest = 3;
        count = 0.0001;
        for i in xrange(0, len(arr_p)-1):
          if i < nBest:
            p_dictNoun_wnNouns += arr_p[i];
            count += 1;

        p_dictNoun_wnNouns = p_dictNoun_wnNouns/count;
        arr_p_word.append(p_dictNoun_wnNouns);

      arr_p_word = sorted(arr_p_word, reverse=True);
      nBest = 40;
      count = 5;
      for i in xrange(0, len(arr_p_word)-1):
        if i < nBest:
          if nBest > len(arr_p_word):
            if i == 0:
              p_iWnWord_iDictWord += arr_p_word[i]*10.;
            elif i< nBest/3:
              p_iWnWord_iDictWord += arr_p_word[i]*1;
            else:
              p_iWnWord_iDictWord += arr_p_word[i]*1;
          else:
            if i == 0:
              p_iWnWord_iDictWord += arr_p_word[i]*10.;
            elif i< len(arr_p_word)/3:
              p_iWnWord_iDictWord += arr_p_word[i]*1;
            else:
              p_iWnWord_iDictWord += arr_p_word[i]*1;

          count += 1;

      if count == 0:
        p_iWnWord_iDictWord = 0;
      else:
        p_iWnWord_iDictWord = p_iWnWord_iDictWord/count
      matrix_similarity[iWnWord][iDictWord] = p_iWnWord_iDictWord;
      # - - - - - - - - - - - - - - - - - - - - - - - - -

    # word-word
    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  ####################################################################################################

  print "----------------------------------------------------"
  s = [[str(e) for e in row] for row in matrix_similarity]
  lens = [max(map(len, col)) for col in zip(*s)]
  fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
  table = [fmt.format(*row) for row in s]
  print '\n'.join(table)

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # write file

  # - - - - - - - - - - - - - - - - - - - - - - - - -
   # - - - - - - - - - - - - - - - - - - - - - - - - -
  # col
  arrColWn = [];
  for i in range(len(wn_words)):
    matrix_similarity[i].insert(0,wn.synset(wn_words[i].name()).definition());

  # - - - - - - - - - - - - - - - - - - - - - - - - -
  # row
  arrRowDict = [];
  arrRowDict.append("--");
  for i in range(len(dict_words)):
    if not dict_words[str(i)].has_key('tv'):
      dict_words[str(i)]['tv'] = "--";
    if dict_words[str(i)]['tv'] == None:
      dict_words[str(i)]['tv'] = "--"
    arrRowDict.append(dict_words[str(i)]["tv"].encode('utf8'));

  FileProcess.write_to_excel_file("Results/"+WORD+"_synsets_synsets_nbest_withword.csv",arrRowDict,matrix_similarity)
Пример #2
0
def similarity_by_synsets_synsets_nbest_withword_average(WORD, dict_words):


  if WORD == "bank":
    asf = 0;
  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # dictionary data
  dict_words_synsets = get_nbest_synsets_n_v_with_word(dict_words,WORD);
  # print "dict-word_synsets"
  # print dict_words_synsets

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # wordnet data

  wn_words = wn.synsets(WORD, pos = 'n');
  print "wn_words -------"
  print wn_words;

  wn_words_synsets = WordnetProcess.get_synsets_n_v(WORD, wn_words);

  print wn_words_synsets

  # matrix for similarity dict_words vs wn_words
  matrix_similarity = [[0 for x in range(len(dict_words))] for x in range(len(wn_words))];

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -

  ####################################################################################################
  #
  # calculate 2d matrix of p

  for iWnWord in range(len(wn_words)):

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # word-word
    for iDictWord in range(len(dict_words)):

      p_iWnWord_iDictWord = 0.;

      arr_p_word = [];
      #
      for dict_synset in dict_words_synsets[iDictWord]:

        # print "------------ dict noun"
        # print dictNoun;
        p_dictNoun_wnNouns = 0;

        # for some nouns don't have synsets

        arr_p  = [];

        # - - - - - - - - - - - - - - - - - - - - - - - -

        for wn_synset in wn_words_synsets[iWnWord]:
          #
          p_max = dict_synset.path_similarity(wn_synset);
          if p_max == None:
            continue

          arr_p.append(p_max);

          # print p_max

        arr_p = sorted(arr_p, reverse=True);

        nBest = 8;
        count = 0.0001;
        for i in xrange(0, len(arr_p)-1):
          if i < nBest:
            p_dictNoun_wnNouns += arr_p[i];
            count += 1;

        p_dictNoun_wnNouns = p_dictNoun_wnNouns/count;
        arr_p_word.append(p_dictNoun_wnNouns);

      arr_p_word = sorted(arr_p_word, reverse=True);
      nBest = 10;
      count = 5;
      for i in range(len(arr_p_word)):
        if i < nBest:
          if nBest > len(arr_p_word):
            if i == 0:
              p_iWnWord_iDictWord += arr_p_word[i]*5.;
            elif i< nBest/3:
              p_iWnWord_iDictWord += arr_p_word[i]*1;
            else:
              p_iWnWord_iDictWord += arr_p_word[i]*1;
          else:
            if i == 0:
              p_iWnWord_iDictWord += arr_p_word[i]*5.;
            elif i< len(arr_p_word)/3:
              p_iWnWord_iDictWord += arr_p_word[i]*1;
            else:
              p_iWnWord_iDictWord += arr_p_word[i]*1;

          count += 1;

      if count == 0:
        p_iWnWord_iDictWord = 0;
      else:
        p_iWnWord_iDictWord = p_iWnWord_iDictWord/count
      matrix_similarity[iWnWord][iDictWord] = p_iWnWord_iDictWord;
      # - - - - - - - - - - - - - - - - - - - - - - - - -

    # word-word
    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  ####################################################################################################

  print "----------------------------------------------------"
  s = [[str(e) for e in row] for row in matrix_similarity]
  lens = [max(map(len, col)) for col in zip(*s)]
  fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
  table = [fmt.format(*row) for row in s]
  print '\n'.join(table)

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # dictionary data

  wn_words = dict_words;
  wn_words_synsets = get_nbest_synsets_n_v_with_word(wn_words,WORD);

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # wordnet data

  dict_words = wn.synsets(WORD, pos = 'n');
  # print wn_words;
  dict_words_synsets = WordnetProcess.get_synsets_n_v(WORD, dict_words);

  print "sysnets -----------------------.----.-----.--.-"

  # matrix for similarity dict_words vs wn_words
  matrix_similarity_reverse = [[0 for x in range(len(dict_words))] for x in range(len(wn_words))];

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -

  ####################################################################################################
  #
  # calculate 2d matrix of p

  for iWnWord in range(len(wn_words)):

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # word-word
    for iDictWord in range(len(dict_words)):

      p_iWnWord_iDictWord = 0.;

      arr_p_word = [];

      for dict_synset in dict_words_synsets[iDictWord]:

        # print dictNoun;
        p_dictNoun_wnNouns = 0;

        # for some nouns don't have synsets
        countwnNouns = 0.00000001;

        arr_p  = [];

        # - - - - - - - - - - - - - - - - - - - - - - - -

        for wn_synset in wn_words_synsets[iWnWord]:

          p_max = dict_synset.path_similarity(wn_synset);
          if p_max != None:
            arr_p.append(p_max);

          # print p_max
          # - - - - - - - - - - - - - - - - - - - - - - - -

        arr_p = sorted(arr_p, reverse=True);

        nBest = 8;
        count = 0.0001
        for i in range(len(arr_p)):
          if i < nBest:
            p_dictNoun_wnNouns += arr_p[i];
            count +=1

        p_dictNoun_wnNouns = p_dictNoun_wnNouns/count;
        arr_p_word.append(p_dictNoun_wnNouns);

      arr_p_word = sorted(arr_p_word, reverse=True);
      nBest = 10;
      count = 5;
      for i in xrange(0, len(arr_p_word)-1):
        if i < nBest:
          if nBest > len(arr_p_word):
            if i == 0:
              p_iWnWord_iDictWord += arr_p_word[i]*5;
            elif i< nBest/3:
              p_iWnWord_iDictWord += arr_p_word[i]*1.;
            else:
              p_iWnWord_iDictWord += arr_p_word[i]*1;
          else:
            if i == 0:
              p_iWnWord_iDictWord += arr_p_word[i]*5.;
            elif i< len(arr_p_word)/3:
              p_iWnWord_iDictWord += arr_p_word[i]*1.;
            else:
              p_iWnWord_iDictWord += arr_p_word[i]*1;

          count += 1;

      if count == 0:
        p_iWnWord_iDictWord = 0;
      else:
        p_iWnWord_iDictWord = p_iWnWord_iDictWord/count
      matrix_similarity_reverse[iWnWord][iDictWord] = p_iWnWord_iDictWord;
      # - - - - - - - - - - - - - - - - - - - - - - - - -

    # word-word
    # - - - - - - - - - - - - - - - - - - - - - - - - - - -

  ####################################################################################################

  print "----------------------------------------------------"
  s = [[str(e) for e in row] for row in matrix_similarity_reverse]
  lens = [max(map(len, col)) for col in zip(*s)]
  fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
  table = [fmt.format(*row) for row in s]
  print '\n'.join(table)

    # word-word
    # - - - - - - - - - - - - - - - - - - - - - - - - - - -

  dict_words = wn_words;
  wn_words = wn.synsets(WORD, pos = 'n');

  for iWnWord in range(len(wn_words)):
    for iDictWord in range(len(dict_words)):
      matrix_similarity[iWnWord][iDictWord] = matrix_similarity[iWnWord][iDictWord] + matrix_similarity_reverse[iDictWord][iWnWord];
      matrix_similarity[iWnWord][iDictWord] /= 2;

  ####################################################################################################

  print "----------------------------------------------------"
  s = [[str(e) for e in row] for row in matrix_similarity]
  lens = [max(map(len, col)) for col in zip(*s)]
  fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
  table = [fmt.format(*row) for row in s]
  print '\n'.join(table)


  ####################################################################################################
  #
  # @brief:
  #

  matrix_similarity_jaccard = [[0 for x in range(len(dict_words))] for x in range(len(wn_words))];

  for iWnWord in range(len(wn_words)):

    tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wn.synset(wn_words[iWnWord].name()).definition()));
    words = [word for word,pos in tagged_sent if (pos == 'NN' or pos == 'NNS' or pos == 'JJ' or pos == '' or pos == 'VB' or pos == 'VBN' or pos == 'VBD' or pos == 'RB')];

    # words = nltk.wordpunct_tokenize(wn.synset(wn_words[iWnWord].name()).definition());
    # print words
    for i in range(len(words)):
      words[i] = wordnet_lemmatizer.lemmatize(words[i]);
    wn_set = set(words);
    # wn_set = set(wn.synset(wn_words[iWnWord].name()).definition().split())
    # print wn_set

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # word-word
    for iDictWord in range(len(dict_words)):

      if not dict_words[str(iDictWord)].has_key("d") or dict_words[str(iDictWord)]["d"] == None:
        matrix_similarity_jaccard[iWnWord][iDictWord] = 1;
        continue

      tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(dict_words[str(iDictWord)]["d"]));
      words = [word for word,pos in tagged_sent if (pos == 'NN' or pos == 'NNS' or pos == 'JJ' or pos == '' or pos == 'VB' or pos == 'VBN' or pos == 'VBD' or pos == 'RB')];

      # words = nltk.wordpunct_tokenize(dict_words[str(iDictWord)]["d"]);
      # print words
      for i in range(len(words)):
        words[i] = wordnet_lemmatizer.lemmatize(words[i]);
      dict_set = set(words);
      # print
      # dict_set = set(dict_words[str(iDictWord)]["d"].encode('utf8').split());
      matrix_similarity_jaccard[iWnWord][iDictWord] = jaccard_distance(wn_set,dict_set);


  for iWnWord in range(len(wn_words)):
    for iDictWord in range(len(dict_words)):
      matrix_similarity[iWnWord][iDictWord] = matrix_similarity[iWnWord][iDictWord]*10 + 2*(1-matrix_similarity_jaccard[iWnWord][iDictWord]);
      matrix_similarity[iWnWord][iDictWord] /= 12;

  ####################################################################################################

  print "----------------------------------------------------"
  s = [[str(e) for e in row] for row in matrix_similarity]
  lens = [max(map(len, col)) for col in zip(*s)]
  fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
  table = [fmt.format(*row) for row in s]
  print '\n'.join(table)


  ####################################################################################################
  #
  # write file

  # - - - - - - - - - - - - - - - - - - - - - - - - -
  # col
  arrColWn = [];
  for i in range(len(wn_words)):
    matrix_similarity[i].insert(0,wn.synset(wn_words[i].name()).definition());

  # - - - - - - - - - - - - - - - - - - - - - - - - -
  # row
  arrRowDict = [];
  arrRowDict.append("--");
  for i in range(len(dict_words)):
    if not dict_words[str(i)].has_key('tv'):
      dict_words[str(i)]['tv'] = "--";
    if dict_words[str(i)]['tv'] == None:
      dict_words[str(i)]['tv'] = "--"
    arrRowDict.append(dict_words[str(i)]["tv"].encode('utf8'));

  FileProcess.write_to_excel_file("Results/"+WORD+"_synsets_synsets_nbest_withword_average.csv",arrRowDict,matrix_similarity)