Example #1
def write_sens_for_reading(syns_wn, syns_ox, filename_output):
  for i_wn in range(len(syns_wn)):
    for i_ox in range(len(syns_ox)):
      defi_wn = syns_wn[i_wn].definition()
      defi_ox = syns_ox[str(i_ox)]["d"]
      value = defi_wn + "\t" + defi_ox
      FileProcess.append_value_to_file(value, filename_output)
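Nearly every snippet on this page appends its output through FileProcess.append_value_to_file. The module itself is not shown here, so the following is only a minimal sketch of the helper these examples appear to assume (one value per line, appended to a text file):

def append_value_to_file(value, filename):
    # Hypothetical helper: append a single value as one line of the output file.
    with open(filename, "a") as f:
        f.write(value + "\n")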
Example #2
def parse_ox_wn_defi_to_input(word):
  defis_wn = WordnetHandler.get_definitions_for_word(word)
  defis_ox = OxfordParser.get_definitions_of_word_for_svm(word)

  for defi_wn in defis_wn:
    for defi_ox in defis_ox:
      value = defi_wn + "\t" + defi_ox
      FileProcess.append_value_to_file(value, __filename_input_sen__)
Example #3
def map_wordnet_EVD():
  print "loading EVD"
  dict_EVD = EVDParser.readEVDFile()
  print "loading WN"
  dict_wn = WordnetProcessForEVD.read_nouns()

  for key, values in dict_wn.items():
    key_lemmas = key.split("=")[1]
    key_lemmas = key_lemmas.split("-")
    key = key.split("=")[0]
    key_definition = key.split("-")[1]
    key = key.split("-")[0]

    test_flag = 0
    for lemma in key_lemmas:
      if lemma[:1] == "b" :
        test_flag = 1
    if test_flag == 0:
      continue

    print "map_wordnet_EVD " + key

    vi_means = get_EVD_means(key, key_lemmas, values, dict_EVD)
    ox_means = get_Ox_means(key, key_lemmas)

    means = get_best_mean(vi_means, ox_means, 2)

################################################################################
# get greatest duplicated mean
#    if len(values) == 1:
#      means = vi_means
#      item_count = [(item,count) for item, count in collections.Counter(vi_means).items() if count > 1]
#      if len(item_count) > 0:
#        means = [max(item_count,key = itemgetter(1))[0]]
#
#        items_2 = [item for item, count in collections.Counter(vi_means).items() if count > 2]
#        for item in items_2:
#          means.append(item)
#        means = list(set(means))
#
#    else:
#      item_count = [(item,count) for item, count in collections.Counter(vi_means).items() if count > 1]
#      if len(item_count) == 0:
#        continue
#      means = [max(item_count,key = itemgetter(1))[0]]
#
#      items_2 = [item for item, count in collections.Counter(vi_means).items() if count > 2]
#      for item in items_2:
#        means.append(item)
#      means = list(set(means))
################################################################################

    if len(means) > 0:
      means = [means[0]]
      means.insert(0,key + "-" + key_definition)
      filename = "Results/EVD/wn_evd_b_0_1.csv"
      FileProcess.append_result_to_excel_file(filename, means)
Example #4
def get_synset_gloss(synset, filename):
  result = ""
  for lemma in synset.lemmas():
    gloss = lemma.name().replace("_", " ")
    result += gloss + ". "

  result += synset.definition() + ". "
  for example in synset.examples():
    result += example + "."

  FileProcess.append_value_to_file(result, filename)

  for hypo in synset.hyponyms():
    get_synset_gloss(hypo, filename)
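A hedged usage sketch for the recursive gloss dump above, assuming NLTK's WordNet corpus is available and the function is importable: starting from any noun synset, it walks the whole hyponym subtree.

from nltk.corpus import wordnet as wn

# Hypothetical driver: write glosses for the first "dog" sense and all of its hyponyms.
root = wn.synsets("dog", pos="n")[0]
get_synset_gloss(root, "dog_glosses.txt")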
Example #5
def create_input_sens_test(dict_ox):

  flag_can_go = False
  for word in dict_ox:

    if word == "blockage":
      flag_can_go = True

    if flag_can_go == False:
      continue

    if len(dict_ox[word]) == 0:
      continue

    defis_wn = WordnetHandler.get_definitions_for_word(word)
    defis_ox = OxfordParser.get_definitions_of_word_for_svm(word)

    if len(defis_ox) == 1 and len(defis_wn) == 1:
      continue

    if len(defis_ox) == 1 and len(defis_wn) > 1:
      all_defi_wn = ""
      for defi_wn in defis_wn:
        all_defi_wn += defi_wn + "\t"

      if all_defi_wn != "":
        all_defi_wn = all_defi_wn[:-1]
      for defi_wn in defis_wn:
        for defi_ox in defis_ox:
          value = defi_wn + "\t" + defi_ox + "\t" + all_defi_wn
          FileProcess.append_value_to_file(value, __filename_input_sen_test__)
    else:
      for defi_wn in defis_wn:
        all_defi_ox = ""
        for defi_ox in defis_ox:
          all_defi_ox += defi_ox + "\t"

        if all_defi_ox != "":
          all_defi_ox = all_defi_ox[:-1]

        for defi_ox in defis_ox:
          value = defi_wn + "\t" + defi_ox + "\t" + all_defi_ox
          FileProcess.append_value_to_file(value, __filename_input_sen_test__)
Example #6
def cal_features_from_sens_write_to_file(filename_sens, filename_output):
  f = open(filename_sens,'r');
  line = f.readline();
  while (line):
    if len(line) > 0:

      feature_values = ""

      sens = line.split("\t")

      sen_1 = sens[0]
      sen_2 = sens[1]

      feature_values += str(Literal.levenshtein_in_context(sen_1, sen_2, sens)) + "\t"
#      feature_values += str(ShallowSyntactic.jaccard_POS_in_context(sen_1, sen_2, sens)) + "\t"
      feature_values += str(WordnetBased.wordnet_based_in_context(sen_1, sen_2, sens, 0))
#      feature_values += str(WordnetBased.wordnet_based_in_context(sen_1, sen_2, sens, 1))

      FileProcess.append_value_to_file(feature_values, filename_output)

      line = f.readline();

  f.close()
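cal_features_from_sens_write_to_file expects one sentence pair per line, tab-separated, with any extra columns passed along as context (this matches the files written by Examples #2 and #5). A hypothetical input line would look like this:

# Hypothetical input line: two tab-separated definitions, optionally followed by context columns.
line = "a large natural stream of water\ta long wide body of water that flows across land"
sen_1, sen_2 = line.split("\t")[:2]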
Example #7
def create_input_sen_via_ox_vn(dict_vn, dict_ox):

  for word in dict_ox:
    if len(dict_ox[word]) == 0:
      continue

    if word in dict_vn:

      word_syns_vn = dict_vn[word]
      word_syns_ox = dict_ox[word]
      if len(word_syns_vn) == 1 and len(word_syns_ox) == 1:
        continue
      for i_vn in word_syns_vn:
        syn_vn = word_syns_vn[i_vn]

        all_defi_ox = ""
        for i_ox in word_syns_ox:
          syn_ox = word_syns_ox[i_ox]
          if "tv" not in syn_ox:
            continue
          defi_ox = syn_ox['d']
          all_defi_ox += defi_ox + "\t"

        flag_can_use = False
        for i_ox in word_syns_ox:
          syn_ox = word_syns_ox[i_ox]
          if "tv" not in syn_ox:
            continue
          if check_tv_similar(syn_vn['tv'], syn_ox['tv']) == 1:
            defi_vn = syn_vn['d']
            defi_ox = syn_ox['d']
            value = defi_vn + "\t" + defi_ox + all_defi_ox
            FileProcess.append_value_to_file(value, __filename_input_sen__)
            FileProcess.append_value_to_file("1", __filename_input_gs__)
            flag_can_use = True
          else:
            if flag_can_use == True:
              defi_vn = syn_vn['d']
              defi_ox = syn_ox['d']
              value = defi_vn + "\t" + defi_ox + all_defi_ox
              FileProcess.append_value_to_file(value, __filename_input_sen__)
              FileProcess.append_value_to_file("0", __filename_input_gs__)
Example #8
seed = 2017
np.random.seed(seed)
'''
data = load_iris()
idx = np.random.permutation(150)
X = data.data[idx]
y = data.target[idx]
print("Iris data shape and format:")
print(type(X))
print(type(y))
print(y.shape)
'''
# open and load csv files
time_load_start = time.clock()

X_train, y_train = fipr.load_csv("train_file.csv", True)
X_test, y_test = fipr.load_csv("test_file.csv", True)

time_load_end = time.clock()
print("Loading finished, loading time: %g seconds" %
      (time_load_end - time_load_start))

X_test_even, y_test_even = fipr.load_csv("test_file_even.csv", True)

training_data = X_train
training_labels = y_train.flatten()
test_data = X_test
test_labels = y_test.flatten()

test_data_even = X_test_even
test_labels_even = y_test_even.flatten()
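This loading block, and the scripts below, rely on FileProcess.load_csv (imported as fipr) returning feature and label arrays. The module is not shown, so this is only a minimal sketch under the assumption that the last CSV column holds the label and the first row may be a header:

import numpy as np

def load_csv(filename, has_header):
    # Hypothetical loader: every column but the last is a feature, the last is the label.
    data = np.genfromtxt(filename, delimiter=",", skip_header=1 if has_header else 0)
    X = data[:, :-1]
    y = data[:, -1:].astype(int)  # shape (n, 1), hence the .flatten() calls in these scripts
    return X, y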
Example #9
def compareVietNetAndOxford(dict_VietNet, dict_Oxford):

    for WORD in dict_Oxford:

        if len(dict_Oxford[WORD]) == 0:
            continue

        # if WORD == "BA":
        # print "holyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyyholyyy"

        wn_words = wn.synsets(WORD, pos="n")
        if wn_words == None:
            continue

        if WORD == "baby":
            a = 1

        if dict_VietNet.has_key(WORD):

            arr_VietNet = dict_VietNet[WORD]
            arr_Oxford = dict_Oxford[WORD]

            matrix_similarity = [[0 for x in range(len(arr_Oxford))] for x in range(len(wn_words))]

            for iWn in range(len(wn_words)):

                definitionWn = wn.synset(wn_words[iWn].name()).definition()

                vietNet = {}
                for iVietNet in arr_VietNet:

                    levenshtein_vn_wn = Util.levenshtein(arr_VietNet[iVietNet]["d"], definitionWn)

                    if levenshtein_vn_wn < len(definitionWn) / 2.0:
                        vietNet = arr_VietNet[iVietNet]
                        break

                if not vietNet.has_key("tv"):
                    vietNet["tv"] = ""

                viet_net_tv = vietNet["tv"]

                for iOxford in range(len(arr_Oxford)):
                    oxford = arr_Oxford[str(iOxford)]

                    vietNet_tv = viet_net_tv

                    if not oxford.has_key("tv"):
                        continue
                    oxford_tv = oxford["tv"].encode("utf-8")

                    vietNet_tv.replace(";", "")
                    oxford_tv = oxford_tv.replace(";", "")
                    oxford_tv = oxford_tv.replace(",", "")
                    oxford_tv = oxford_tv.replace("/", " ")

                    arr_tv_oxford = set(oxford_tv.split(" "))
                    arr_tv_vietnet = set(vietNet_tv.split(" "))

                    jaccard = jaccard_distance(arr_tv_oxford, arr_tv_vietnet)
                    print arr_tv_vietnet
                    print arr_tv_oxford
                    print jaccard
                    matrix_similarity[iWn][iOxford] = 0
                    if jaccard < 0.95:
                        matrix_similarity[iWn][iOxford] = 1

                matrix_similarity[iWn].insert(0, viet_net_tv + "<>" + definitionWn.encode("utf-8"))

            print matrix_similarity
            # - - - - - - - - - - - - - - - - - - - - - - - - -
            # col
            # for i in range(len(dict_VietNet[WORD])):
            #   matrix_similarity[i].insert(0,dict_VietNet[WORD][i]["tv"] + "<>" + dict_VietNet[WORD][i]["d"]);

            # - - - - - - - - - - - - - - - - - - - - - - - - -
            # row
            arrRowDict = []
            arrRowDict.append(WORD)
            for i in range(len(dict_Oxford[WORD])):
                if not dict_Oxford[WORD][str(i)].has_key("tv"):
                    dict_Oxford[WORD][str(i)]["tv"] = "-"
                if not dict_Oxford[WORD][str(i)].has_key("d"):
                    dict_Oxford[WORD][str(i)]["d"] = "-"
                if dict_Oxford[WORD][str(i)]["d"] == None:
                    dict_Oxford[WORD][str(i)]["d"] = "-"

                arrRowDict.append(
                    dict_Oxford[WORD][str(i)]["tv"].encode("utf-8")
                    + "<>"
                    + dict_Oxford[WORD][str(i)]["d"].encode("utf-8")
                )

            FileProcess.append_to_excel_file(
                "Results/parameters/VN_Ox/" + "compare_VN_Ox_2_2.1.csv", arrRowDict, matrix_similarity
            )
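jaccard_distance is not imported in this snippet; assuming it is NLTK's set-based Jaccard distance (nltk.metrics.distance.jaccard_distance), the 0.95 threshold above treats two translation word sets as related unless they share almost nothing:

from nltk.metrics.distance import jaccard_distance

# Two hypothetical Vietnamese translation sets with two words in common.
a = set("ngan hang".split())
b = set("ngan hang tien".split())
print(jaccard_distance(a, b))  # 1 - 2/3 = 0.33..., well below the 0.95 cut-off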
Example #10
def create_input_sen_via_gold_data(dict_vn, dict_ox, dict_gold):

  for word in dict_ox:

    if len(dict_ox[word]) == 0 or word not in dict_gold:
      continue

    if word == "blockage":
      return

    if word in dict_vn:
      word_syns_vn = dict_vn[word]
      word_syns_ox = dict_ox[word]

      if len(word_syns_ox) == 1 and len(word_syns_vn) == 1:
        continue

      if len(word_syns_ox) == 1 and len(word_syns_vn) > 1:
        all_defi_vn = ""
        for i_vn in word_syns_vn:
          syn_vn = word_syns_vn[i_vn]
          if "tv" not in syn_vn:
            continue
          defi_vn = syn_vn['d']
          all_defi_vn += defi_vn + "\t"

        if all_defi_vn != "":
          all_defi_vn = all_defi_vn[:-1]

        for i_vn in word_syns_vn:
          syn_vn = word_syns_vn[i_vn]


          for i_ox in word_syns_ox:
            syn_ox = word_syns_ox[i_ox]
            if "tv" not in syn_ox:
              continue

            defi_vn = syn_vn['d']
            defi_ox = syn_ox['d']
            value = defi_vn + "\t" + defi_ox + "\t" + all_defi_vn
            if dict_gold[word][int(i_vn)][int(i_ox)] == "1":
              FileProcess.append_value_to_file(value, __filename_input_sen__)
              FileProcess.append_value_to_file("1", __filename_input_gs__)
            else:
              FileProcess.append_value_to_file(value, __filename_input_sen__)
              FileProcess.append_value_to_file("0", __filename_input_gs__)
      else:
        for i_vn in word_syns_vn:
          syn_vn = word_syns_vn[i_vn]

          all_defi_ox = ""
          for i_ox in word_syns_ox:
            syn_ox = word_syns_ox[i_ox]
            if "tv" not in syn_ox:
              continue
            defi_ox = syn_ox['d']
            all_defi_ox += defi_ox + "\t"

          if all_defi_ox != "":
            all_defi_ox = all_defi_ox[:-1]

          for i_ox in word_syns_ox:
            syn_ox = word_syns_ox[i_ox]
            if "tv" not in syn_ox:
              continue

            defi_vn = syn_vn['d']
            defi_ox = syn_ox['d']
            value = defi_vn + "\t" + defi_ox + "\t" + all_defi_ox
            if dict_gold[word][int(i_vn)][int(i_ox)] == "1":
              FileProcess.append_value_to_file(value, __filename_input_sen__)
              FileProcess.append_value_to_file("1", __filename_input_gs__)
            else:
              FileProcess.append_value_to_file(value, __filename_input_sen__)
              FileProcess.append_value_to_file("0", __filename_input_gs__)
Example #11
def similarity_by_synsets_synsets_nbest_withword_average(WORD, dict_words):


  if WORD == "bank":
    asf = 0;
  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # dictionary data
  dict_words_synsets = get_nbest_synsets_n_v_with_word(dict_words,WORD);
  # print "dict-word_synsets"
  # print dict_words_synsets

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # wordnet data

  wn_words = wn.synsets(WORD, pos = 'n');
  print "wn_words -------"
  print wn_words;

  wn_words_synsets = WordnetProcess.get_synsets_n_v(WORD, wn_words);

  print wn_words_synsets

  # matrix for similarity dict_words vs wn_words
  matrix_similarity = [[0 for x in range(len(dict_words))] for x in range(len(wn_words))];

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -

  ####################################################################################################
  #
  # calculate 2d matrix of p

  for iWnWord in range(len(wn_words)):

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # word-word
    for iDictWord in range(len(dict_words)):

      p_iWnWord_iDictWord = 0.;

      arr_p_word = [];
      #
      for dict_synset in dict_words_synsets[iDictWord]:

        # print "------------ dict noun"
        # print dictNoun;
        p_dictNoun_wnNouns = 0;

        # for some nouns don't have synsets

        arr_p  = [];

        # - - - - - - - - - - - - - - - - - - - - - - - -

        for wn_synset in wn_words_synsets[iWnWord]:
          #
          p_max = dict_synset.path_similarity(wn_synset);
          if p_max == None:
            continue

          arr_p.append(p_max);

          # print p_max

        arr_p = sorted(arr_p, reverse=True);

        nBest = 8;
        count = 0.0001;
        for i in xrange(0, len(arr_p)-1):
          if i < nBest:
            p_dictNoun_wnNouns += arr_p[i];
            count += 1;

        p_dictNoun_wnNouns = p_dictNoun_wnNouns/count;
        arr_p_word.append(p_dictNoun_wnNouns);

      arr_p_word = sorted(arr_p_word, reverse=True);
      nBest = 10;
      count = 5;
      for i in range(len(arr_p_word)):
        if i < nBest:
          if nBest > len(arr_p_word):
            if i == 0:
              p_iWnWord_iDictWord += arr_p_word[i]*5.;
            elif i< nBest/3:
              p_iWnWord_iDictWord += arr_p_word[i]*1;
            else:
              p_iWnWord_iDictWord += arr_p_word[i]*1;
          else:
            if i == 0:
              p_iWnWord_iDictWord += arr_p_word[i]*5.;
            elif i< len(arr_p_word)/3:
              p_iWnWord_iDictWord += arr_p_word[i]*1;
            else:
              p_iWnWord_iDictWord += arr_p_word[i]*1;

          count += 1;

      if count == 0:
        p_iWnWord_iDictWord = 0;
      else:
        p_iWnWord_iDictWord = p_iWnWord_iDictWord/count
      matrix_similarity[iWnWord][iDictWord] = p_iWnWord_iDictWord;
      # - - - - - - - - - - - - - - - - - - - - - - - - -

    # word-word
    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  ####################################################################################################

  print "----------------------------------------------------"
  s = [[str(e) for e in row] for row in matrix_similarity]
  lens = [max(map(len, col)) for col in zip(*s)]
  fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
  table = [fmt.format(*row) for row in s]
  print '\n'.join(table)

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # dictionary data

  wn_words = dict_words;
  wn_words_synsets = get_nbest_synsets_n_v_with_word(wn_words,WORD);

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # wordnet data

  dict_words = wn.synsets(WORD, pos = 'n');
  # print wn_words;
  dict_words_synsets = WordnetProcess.get_synsets_n_v(WORD, dict_words);

  print "sysnets -----------------------.----.-----.--.-"

  # matrix for similarity dict_words vs wn_words
  matrix_similarity_reverse = [[0 for x in range(len(dict_words))] for x in range(len(wn_words))];

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -

  ####################################################################################################
  #
  # calculate 2d matrix of p

  for iWnWord in range(len(wn_words)):

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # word-word
    for iDictWord in range(len(dict_words)):

      p_iWnWord_iDictWord = 0.;

      arr_p_word = [];

      for dict_synset in dict_words_synsets[iDictWord]:

        # print dictNoun;
        p_dictNoun_wnNouns = 0;

        # for some nouns don't have synsets
        countwnNouns = 0.00000001;

        arr_p  = [];

        # - - - - - - - - - - - - - - - - - - - - - - - -

        for wn_synset in wn_words_synsets[iWnWord]:

          p_max = dict_synset.path_similarity(wn_synset);
          if p_max != None:
            arr_p.append(p_max);

          # print p_max
          # - - - - - - - - - - - - - - - - - - - - - - - -

        arr_p = sorted(arr_p, reverse=True);

        nBest = 8;
        count = 0.0001
        for i in range(len(arr_p)):
          if i < nBest:
            p_dictNoun_wnNouns += arr_p[i];
            count +=1

        p_dictNoun_wnNouns = p_dictNoun_wnNouns/count;
        arr_p_word.append(p_dictNoun_wnNouns);

      arr_p_word = sorted(arr_p_word, reverse=True);
      nBest = 10;
      count = 5;
      for i in xrange(0, len(arr_p_word)-1):
        if i < nBest:
          if nBest > len(arr_p_word):
            if i == 0:
              p_iWnWord_iDictWord += arr_p_word[i]*5;
            elif i< nBest/3:
              p_iWnWord_iDictWord += arr_p_word[i]*1.;
            else:
              p_iWnWord_iDictWord += arr_p_word[i]*1;
          else:
            if i == 0:
              p_iWnWord_iDictWord += arr_p_word[i]*5.;
            elif i< len(arr_p_word)/3:
              p_iWnWord_iDictWord += arr_p_word[i]*1.;
            else:
              p_iWnWord_iDictWord += arr_p_word[i]*1;

          count += 1;

      if count == 0:
        p_iWnWord_iDictWord = 0;
      else:
        p_iWnWord_iDictWord = p_iWnWord_iDictWord/count
      matrix_similarity_reverse[iWnWord][iDictWord] = p_iWnWord_iDictWord;
      # - - - - - - - - - - - - - - - - - - - - - - - - -

    # word-word
    # - - - - - - - - - - - - - - - - - - - - - - - - - - -

  ####################################################################################################

  print "----------------------------------------------------"
  s = [[str(e) for e in row] for row in matrix_similarity_reverse]
  lens = [max(map(len, col)) for col in zip(*s)]
  fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
  table = [fmt.format(*row) for row in s]
  print '\n'.join(table)

    # word-word
    # - - - - - - - - - - - - - - - - - - - - - - - - - - -

  dict_words = wn_words;
  wn_words = wn.synsets(WORD, pos = 'n');

  for iWnWord in range(len(wn_words)):
    for iDictWord in range(len(dict_words)):
      matrix_similarity[iWnWord][iDictWord] = matrix_similarity[iWnWord][iDictWord] + matrix_similarity_reverse[iDictWord][iWnWord];
      matrix_similarity[iWnWord][iDictWord] /= 2;

  ####################################################################################################

  print "----------------------------------------------------"
  s = [[str(e) for e in row] for row in matrix_similarity]
  lens = [max(map(len, col)) for col in zip(*s)]
  fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
  table = [fmt.format(*row) for row in s]
  print '\n'.join(table)


  ####################################################################################################
  #
  # @brief:
  #

  matrix_similarity_jaccard = [[0 for x in range(len(dict_words))] for x in range(len(wn_words))];

  for iWnWord in range(len(wn_words)):

    tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wn.synset(wn_words[iWnWord].name()).definition()));
    words = [word for word,pos in tagged_sent if (pos == 'NN' or pos == 'NNS' or pos == 'JJ' or pos == '' or pos == 'VB' or pos == 'VBN' or pos == 'VBD' or pos == 'RB')];

    # words = nltk.wordpunct_tokenize(wn.synset(wn_words[iWnWord].name()).definition());
    # print words
    for i in range(len(words)):
      words[i] = wordnet_lemmatizer.lemmatize(words[i]);
    wn_set = set(words);
    # wn_set = set(wn.synset(wn_words[iWnWord].name()).definition().split())
    # print wn_set

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # word-word
    for iDictWord in range(len(dict_words)):

      if not dict_words[str(iDictWord)].has_key("d") or dict_words[str(iDictWord)]["d"] == None:
        matrix_similarity_jaccard[iWnWord][iDictWord] = 1;
        continue

      tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(dict_words[str(iDictWord)]["d"]));
      words = [word for word,pos in tagged_sent if (pos == 'NN' or pos == 'NNS' or pos == 'JJ' or pos == '' or pos == 'VB' or pos == 'VBN' or pos == 'VBD' or pos == 'RB')];

      # words = nltk.wordpunct_tokenize(dict_words[str(iDictWord)]["d"]);
      # print words
      for i in range(len(words)):
        words[i] = wordnet_lemmatizer.lemmatize(words[i]);
      dict_set = set(words);
      # print
      # dict_set = set(dict_words[str(iDictWord)]["d"].encode('utf8').split());
      matrix_similarity_jaccard[iWnWord][iDictWord] = jaccard_distance(wn_set,dict_set);


  for iWnWord in range(len(wn_words)):
    for iDictWord in range(len(dict_words)):
      matrix_similarity[iWnWord][iDictWord] = matrix_similarity[iWnWord][iDictWord]*10 + 2*(1-matrix_similarity_jaccard[iWnWord][iDictWord]);
      matrix_similarity[iWnWord][iDictWord] /= 12;

  ####################################################################################################

  print "----------------------------------------------------"
  s = [[str(e) for e in row] for row in matrix_similarity]
  lens = [max(map(len, col)) for col in zip(*s)]
  fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
  table = [fmt.format(*row) for row in s]
  print '\n'.join(table)


  ####################################################################################################
  #
  # write file

  # - - - - - - - - - - - - - - - - - - - - - - - - -
  # col
  arrColWn = [];
  for i in range(len(wn_words)):
    matrix_similarity[i].insert(0,wn.synset(wn_words[i].name()).definition());

  # - - - - - - - - - - - - - - - - - - - - - - - - -
  # row
  arrRowDict = [];
  arrRowDict.append("--");
  for i in range(len(dict_words)):
    if not dict_words[str(i)].has_key('tv'):
      dict_words[str(i)]['tv'] = "--";
    if dict_words[str(i)]['tv'] == None:
      dict_words[str(i)]['tv'] = "--"
    arrRowDict.append(dict_words[str(i)]["tv"].encode('utf8'));

  FileProcess.write_to_excel_file("Results/"+WORD+"_synsets_synsets_nbest_withword_average.csv",arrRowDict,matrix_similarity)
Example #12
        print("File open")
        print("Receiving data...")
        i = 0
        while True:
            data = conn.recv(1024)
            i += 1
            if not data:
                break
            file.write(data)

    print("File receive")

    conn.close()

    conn, addr = sock.accept()
    FileProcess.readFromExcel(fileInput)
    FileProcess.supervisorInRoom()
    FileProcess.supervisorOutRoom()
    FileProcess.writeToExcel(fileOutput)
    FileProcess.connectToDatabase()

    with open(fileOutput, "rb") as file:
        data = file.read(1024)

        i = 0
        while data:
            conn.send(data)
            #print(f"Sent {data!r}")
            i += 1
            data = file.read(1024)
    print("File send")
Example #13
def cal_features_and_write_to_file_for(syns_wn, syns_ox, filename_output):
  if len(syns_ox) == 1 and len(syns_wn) > 1:

    # cal all features between syns in ox with syn in wn
    syns_values_in_row = []
    for i_wn in range(len(syns_wn)):
      syn_wn = syns_wn[i_wn]
      syn_ox = syns_ox[str(0)]
      feature_values = cal_feature_values_for(syn_wn, syn_ox)
      syns_values_in_row.append(feature_values)

    # cal max values of each feature
    arr_root_values_of_feature = []
    for i_feature in range(len(syns_values_in_row[0])):
      root = root_values_of_a_feature_in_row(syns_values_in_row, i_feature)
      arr_root_values_of_feature.append(root)

    for i_wn in range(len(syns_wn)):

      # cal value for svm
      for i_ox in range(len(syns_ox)):
        feature_values_for_svm = ""
        feature_values_1_syn = syns_values_in_row[i_wn]
        for i_feature in range(len(feature_values_1_syn)):
          root_value = arr_root_values_of_feature[i_feature]
          feature_value = feature_values_1_syn[i_feature]
          feature_value_for_svm = feature_value/root_value
          feature_values_for_svm += str(feature_value_for_svm) + "\t"

        if feature_values_for_svm != "":
          feature_values_for_svm = feature_values_for_svm[:-1]

        FileProcess.append_value_to_file(feature_values_for_svm, filename_output)
  else:
    for i_wn in range(len(syns_wn)):

      # cal all features between syns in ox with syn in wn
      syns_values_in_row = []
      for i_ox in range(len(syns_ox)):
        syn_wn = syns_wn[i_wn]
        syn_ox = syns_ox[str(i_ox)]
        feature_values = cal_feature_values_for(syn_wn, syn_ox)
        syns_values_in_row.append(feature_values)

      # cal max values of each feature
      arr_root_values_of_feature = []
      for i_feature in range(len(syns_values_in_row[0])):
        root = root_values_of_a_feature_in_row(syns_values_in_row, i_feature)
        arr_root_values_of_feature.append(root)

      # cal value for svm
      for i_ox in range(len(syns_ox)):
        feature_values_for_svm = ""
        feature_values_1_syn = syns_values_in_row[i_ox]
        for i_feature in range(len(feature_values_1_syn)):
          root_value = arr_root_values_of_feature[i_feature]
          feature_value = feature_values_1_syn[i_feature]
          feature_value_for_svm = feature_value/root_value
          feature_values_for_svm += str(feature_value_for_svm) + "\t"

        if feature_values_for_svm != "":
          feature_values_for_svm = feature_values_for_svm[:-1]

        FileProcess.append_value_to_file(feature_values_for_svm, filename_output)
Example #14
def write_label_for_svm(syns_wn, syns_ox, dict_gold):
  for i_wn in range(len(syns_wn)):
    for i_ox in range(len(syns_ox)):
      FileProcess.append_value_to_file(dict_gold[i_wn][i_ox], __filename_input_gs_train__)
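write_label_for_svm assumes dict_gold is already indexed by integer sense positions and holds the gold alignment labels as strings; a tiny hypothetical instance for a word with two WordNet senses and two Oxford senses:

# Hypothetical gold matrix: dict_gold[i_wn][i_ox] is "1" for aligned senses, "0" otherwise.
dict_gold = [["1", "0"],
             ["0", "1"]]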
Example #15
import Edge
import FileProcess
import KruskalAlgorithm
import Graph
import SteinerTree

# Reading file

FILENAME = "Data/"+input("Enter the file's name : Data/")

inputFile = open(FILENAME, "r")

# _____________________________________________________________________________________________________________________________
# Process

tuple = FileProcess.fileProcess(inputFile.read())

inputFile.close()

Edges = tuple[4]
connectedNodes = tuple[3]
terminalNodes = tuple[5]
edgesNum = tuple[1]  # number of edges
nodesNum = tuple[0]  # number of nodes
terminalsNum = tuple[2]  # number of terminal nodes


newEdges = KruskalAlgorithm.KruskalAlgorithm(Edges, connectedNodes, nodesNum, edgesNum)
print(Graph.calculateCost(newEdges))  # MST cost

newEdges = SteinerTree.buildSteinerTree(nodesNum, terminalNodes, connectedNodes, newEdges)
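The script prints the MST cost but stops right after building the Steiner tree; assuming Graph.calculateCost accepts any edge list (as it already does for the MST above), the Steiner tree cost can be printed the same way:

print(Graph.calculateCost(newEdges))  # Steiner tree cost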
Example #16
def main():

    # open and load csv files
    time_load_start = time.clock()
    X_train, y_train = fipr.load_csv("train_file.csv", True)
    X_test, y_test = fipr.load_csv("test_file.csv", True)
    #y_train = y_train.flatten()
    #y_test = y_test.flatten()
    time_load_end = time.clock()
    print("Loading finished, loading time: %g seconds" %
          (time_load_end - time_load_start))

    X_test_even, y_test_even = fipr.load_csv("test_file_even.csv", True)

    training_data = X_train
    training_labels = y_train

    test_data = X_test
    test_labels = y_test

    test_data_even = X_test_even
    test_labels_even = y_test_even

    # building the SDA
    sDA = StackedDA([100])

    # start counting time for training
    time_train_start = time.clock()
    print('Pre-training...')

    # pre-trainning the SDA
    sDA.pre_train(training_data[:1000], noise_rate=0.3, epochs=100)
    print('Training Network...')

    # adding the final layer
    sDA.finalLayer(training_data, training_labels, epochs=500)

    # trainning the whole network
    sDA.fine_tune(training_data, training_labels, epochs=500)

    # print training time
    time_train_end = time.clock()
    print("Training finished, training time: %g seconds \n" %
          (time_train_end - time_train_start))

    # start counting time for testing
    time_test_start = time.clock()

    print('Testing performance...')
    # predicting using the SDA
    y_pred = sDA.predict(test_data).argmax(1)

    # print simple precision metric to the console
    print('Accuracy:  ' + str(fipr.compute_accuracy(y_test, y_pred)))

    # print testing time
    time_test_end = time.clock()
    print("Testing finished, testing time: %g seconds  \n" %
          (time_test_end - time_test_start))

    # Even set test
    y_pred_even = sDA.predict(test_data_even).argmax(1)

    # print simple precision metric to the console
    print('Accuracy on EVEN set:  ' +
          str(fipr.compute_accuracy(y_test_even, y_pred_even)))

    return sDA
Example #17
def append_params_and_result_to_file(values):
  FileProcess.append_result_to_excel_file(result_file_name,values)
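append_params_and_result_to_file simply forwards one row of values to the shared CSV appender; a hedged usage sketch (result_file_name is assumed to be a module-level output path, and the row layout here is illustrative only):

# Hypothetical row: parameter settings followed by the measured score.
append_params_and_result_to_file(["nBest=8", "threshold=0.95", 0.8123])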
Example #18
# Import labs
import numpy as np
import matplotlib.pyplot as plt
from RBFN import RBFN
import sys
import FileProcess as fipr
import time

# Start counting time
start_time = time.clock()

# Open and load csv files
time_load_start = time.clock()
X_train, y_train = fipr.load_csv("train_file.csv", True)
X_test, y_test = fipr.load_csv("test_file.csv", True)
y_train = y_train.flatten() 
y_test = y_test.flatten()
time_load_end = time.clock()
print("Loading finished, loading time: %g seconds" % (time_load_end - time_load_start))

# Training the network
'''
x = np.linspace(0,10,100)
y = np.sin(x)
'''
# start counting time for training
time_train_start = time.clock()

# start training
Example #19
# -*- coding: utf-8 -*-
# @Time    : 2020/10/26 9:30
# @Author  : SanZhi
# @File    : index.py
# @Software: PyCharm
import FileProcess
import getdoc2vec
import get_tSNE
import get_DBscan

predata, namespace = FileProcess.preprocess(r'use5.csv')
DocData = getdoc2vec.do_doc2vec(predata)
TsneData = get_tSNE.getTsneData(DocData, namespace)
# FileProcess.write_json('1-20alltsne.json', TsneData)
# FileProcess.write_json('1-20peopleList.json', namespace)
DBscanData = get_DBscan.getDbscanData(TsneData)
FileProcess.write_json('1-20top7.json', DBscanData)
Example #20
def main():

    # open and load csv files
    time_load_start = time.clock()
    X_train, y_train = fipr.load_csv("train_file.csv", True)
    X_test, y_test = fipr.load_csv("test_file.csv", True)
    y_train = y_train.flatten()
    y_test = y_test.flatten()
    time_load_end = time.clock()
    print("Loading finished, loading time: %g seconds" %
          (time_load_end - time_load_start))

    X_test_even, y_test_even = fipr.load_csv("test_file_even.csv", True)
    y_test_even = y_test_even.flatten()
    # scale features to encourage gradient descent convergence
    X_train = fipr.scale_features(X_train, 0.0, 1.0)
    X_test = fipr.scale_features(X_test, 0.0, 1.0)

    X_test_even = fipr.scale_features(X_test_even, 0.0, 1.0)

    Pattern_train = []
    for i, sample_train in enumerate(X_train):
        Pattern_train.append([sample_train, y_train[i]])

    Pattern_test = []
    for j, sample_test in enumerate(X_test):
        Pattern_test.append([sample_test, y_test[j]])

    Pattern_test_even = []
    for k, sample_test_even in enumerate(X_test_even):
        Pattern_test_even.append([sample_test_even, y_test_even[k]])

    #print(Pattern_train)
    #print(Pattern_test)
    # Teach network XOR function (for test only)
    '''pat = [
        [[0,0], [0]],
        [[0,1], [1]],
        [[1,0], [1]],
        [[1,1], [0]]
        ]
    print(pat)

    # create a network with two input, two hidden, and one output nodes
    n = NN(2, 2, 1)
    # train it with some patterns
    n.train(pat)
    # test it
    n.test(pat)'''

    # Test on Iris data
    #pattern = irisdemo()

    # create a network with 200 input nodes, 4 hidden nodes, and 1 output node
    n = NN(200, 4, 1)

    # start counting time for training
    time_train_start = time.clock()

    # train it with some patterns
    n.train(Pattern_train)

    # print training time
    time_train_end = time.clock()
    print("Training finished, training time: %g seconds \n" %
          (time_train_end - time_train_start))

    # start counting time for testing
    time_test_start = time.clock()

    # test it
    n.test(Pattern_test)

    # print testing time
    time_test_end = time.clock()
    print("Testing finished, testing time: %g seconds  \n" %
          (time_test_end - time_test_start))

    # test on EVEN data set
    n.test(Pattern_test_even)
Example #21
				 lmbda - the regularization term
				 model_file - the name of the file to store the final classification model
	"""


# Start counting time
start_time = time.clock()

# Set parameters
alpha = 0.01
lmbda = 0
maxiter = 100

# open and load csv files
time_load_start = time.clock()
X_train, y_train = fipr.load_csv("train_file.csv", True)
X_test, y_test = fipr.load_csv("test_file.csv", True)
y_train = y_train.flatten()
y_test = y_test.flatten()
time_load_end = time.clock()
print("Loading finished, loading time: %g seconds" %
      (time_load_end - time_load_start))

X_test_even, y_test_even = fipr.load_csv("test_file_even.csv", True)
y_test_even = y_test_even.flatten()

# scale features to encourage gradient descent convergence
X_train = fipr.scale_features(X_train, 0.0, 1.0)
X_test = fipr.scale_features(X_test, 0.0, 1.0)

X_test_even = fipr.scale_features(X_test_even, 0.0, 1.0)
Example #22
### Import Library of Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB
import sys
import numpy as np
import FileProcess as fipr
import time

# Start counting time
start_time = time.clock()

# open and load csv files
time_load_start = time.clock()
X_train, y_train = fipr.load_csv("train_file_400atb.csv", True)
X_test, y_test = fipr.load_csv("test_file_400atb.csv", True)
y_train = y_train.flatten()
y_test = y_test.flatten()
time_load_end = time.clock()
print("Loading finished, loading time: %g seconds" %
      (time_load_end - time_load_start))

X_test_even, y_test_even = fipr.load_csv("test_file_400atb_even.csv", True)
y_test_even = y_test_even.flatten()

# Create a Gaussian Classifier
model = GaussianNB()

# start counting time for training
time_train_start = time.clock()

# Train the model using the training sets
model.fit(X_train, y_train)
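The snippet ends right after fitting the model; following the evaluation pattern of the other examples (compute_accuracy is the same FileProcess helper used in Example #16), a plausible continuation is:

# Hypothetical continuation: score the held-out test set.
y_pred = model.predict(X_test)
print("Accuracy: " + str(fipr.compute_accuracy(y_test, y_pred)))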
Example #23
def similarity_by_synsets_synsets_nbest_withword(WORD, dict_words):

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # dictionary data
  dict_words_synsets = get_nbest_synsets_n_v_with_word(dict_words,WORD);
  print "dict-word_synsets"
  print dict_words_synsets

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # wordnet data

  wn_words = wn.synsets(WORD, pos = 'n');
  print "wn_words -------"
  print wn_words;

  wn_words_synsets = WordnetProcess.get_synsets_n_v(WORD, wn_words);

  print wn_words_synsets

  # matrix for similarity dict_words vs wn_words
  matrix_similarity = [[0 for x in range(len(dict_words))] for x in range(len(wn_words))];

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -

  ####################################################################################################
  #
  # calculate 2d matrix of p

  for iWnWord in range(len(wn_words)):

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # word-word
    for iDictWord in range(len(dict_words)):

      p_iWnWord_iDictWord = 0.;

      arr_p_word = [];
      #
      for dict_synset in dict_words_synsets[iDictWord]:

        # print "------------ dict noun"
        # print dictNoun;
        p_dictNoun_wnNouns = 0;

        # for some nouns don't have synsets

        arr_p  = [];

        # - - - - - - - - - - - - - - - - - - - - - - - -

        for wn_synset in wn_words_synsets[iWnWord]:
          #
          p_max = dict_synset.path_similarity(wn_synset);
          if p_max == None:
            continue

          arr_p.append(p_max);

          # print p_max

        arr_p = sorted(arr_p, reverse=True);

        nBest = 3;
        count = 0.0001;
        for i in xrange(0, len(arr_p)-1):
          if i < nBest:
            p_dictNoun_wnNouns += arr_p[i];
            count += 1;

        p_dictNoun_wnNouns = p_dictNoun_wnNouns/count;
        arr_p_word.append(p_dictNoun_wnNouns);

      arr_p_word = sorted(arr_p_word, reverse=True);
      nBest = 40;
      count = 5;
      for i in xrange(0, len(arr_p_word)-1):
        if i < nBest:
          if nBest > len(arr_p_word):
            if i == 0:
              p_iWnWord_iDictWord += arr_p_word[i]*10.;
            elif i< nBest/3:
              p_iWnWord_iDictWord += arr_p_word[i]*1;
            else:
              p_iWnWord_iDictWord += arr_p_word[i]*1;
          else:
            if i == 0:
              p_iWnWord_iDictWord += arr_p_word[i]*10.;
            elif i< len(arr_p_word)/3:
              p_iWnWord_iDictWord += arr_p_word[i]*1;
            else:
              p_iWnWord_iDictWord += arr_p_word[i]*1;

          count += 1;

      if count == 0:
        p_iWnWord_iDictWord = 0;
      else:
        p_iWnWord_iDictWord = p_iWnWord_iDictWord/count
      matrix_similarity[iWnWord][iDictWord] = p_iWnWord_iDictWord;
      # - - - - - - - - - - - - - - - - - - - - - - - - -

    # word-word
    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  ####################################################################################################

  print "----------------------------------------------------"
  s = [[str(e) for e in row] for row in matrix_similarity]
  lens = [max(map(len, col)) for col in zip(*s)]
  fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
  table = [fmt.format(*row) for row in s]
  print '\n'.join(table)

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # write file

  # - - - - - - - - - - - - - - - - - - - - - - - - -
  # col
  arrColWn = [];
  for i in range(len(wn_words)):
    matrix_similarity[i].insert(0,wn.synset(wn_words[i].name()).definition());

  # - - - - - - - - - - - - - - - - - - - - - - - - -
  # row
  arrRowDict = [];
  arrRowDict.append("--");
  for i in range(len(dict_words)):
    if not dict_words[str(i)].has_key('tv'):
      dict_words[str(i)]['tv'] = "--";
    if dict_words[str(i)]['tv'] == None:
      dict_words[str(i)]['tv'] = "--"
    arrRowDict.append(dict_words[str(i)]["tv"].encode('utf8'));

  FileProcess.write_to_excel_file("Results/"+WORD+"_synsets_synsets_nbest_withword.csv",arrRowDict,matrix_similarity)
Example #24
import urllib
import time
import tensorflow as tf
import FileProcess as fipr

from Mnist import Mnist
mnist = Mnist()

sess = tf.InteractiveSession()

# Start counting time
start_time = time.clock()

# open and load csv files
time_load_start = time.clock()
X_train, y_train = fipr.load_csv("train_file.csv", True)
#X_test, y_test = fipr.load_csv("test_file.csv", True)
#y_train = y_train.flatten()
#y_test = y_test.flatten()
time_load_end = time.clock()
#print("Loading finished, loading time: %g seconds" % (time_load_end - time_load_start))

training_data = X_train
training_labels = y_train

print(type(training_labels))
print(type(training_labels[0, 0]))
print(training_labels.shape)

print('original labels:')
print(training_labels[3])