def preprocessing(filenames):
    data = ""
    sentences = []
    words = []

    # Read the corpus, then find sentences and save them to file (cached)
    data = F.readData(filenames.corpus_name)
    import os
    if not os.path.isfile(filenames.output_folder + '/' + filenames.sents_file_name):
        sentences = F.getSentences(data)
        F.save_to_file(filenames.sents_file_name, sentences, filenames.output_folder)
    else:
        print("Sentences File Found")
        sentences = F.load_to_file(filenames.sents_file_name, filenames.output_folder)

    # Find words and save them to file (cached)
    if not os.path.isfile(filenames.output_folder + '/' + filenames.words_file_name):
        words = F.getWords(sentences)
        F.save_to_file(filenames.words_file_name, words, filenames.output_folder)
    else:
        print("Words File Found")
        words = F.load_to_file(filenames.words_file_name, filenames.output_folder)

    print("Length of text data: ", len(data))

    # Minimum-frequency threshold for the vocabulary; other thresholds
    # (400, 300, 200, 100, 75, 50, 25, 20, 15, 5, 4, 3, 2, 1, 0) were
    # tried during experimentation.
    updated_words, vocab = F.getVocabulary(words, 10, filenames)

    F.save_to_file(filenames.vocab_file, vocab, filenames.output_folder)
    F.save_to_file(filenames.updated_words_file_name, updated_words, filenames.output_folder)

    word_to_index = {}
    index_to_word = {}
    for k, v in enumerate(vocab):
        word_to_index[v] = k
        index_to_word[k] = v
    F.save_to_file(filenames.w2i_file, word_to_index, filenames.output_folder)
    F.save_to_file(filenames.i2w_file, index_to_word, filenames.output_folder)
    print(len(sentences), len(words))
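# Every function here relies on F.save_to_file / F.load_to_file from
# functions.py, which is not shown in this section. A minimal sketch of what
# those helpers are assumed to do (pickle an object under
# <output_folder>/<name>); the real module also provides readData,
# getSentences, getWords, datetime, sys, and folder.
import os
import pickle


def save_to_file(name, obj, output_folder='.'):
    # Serialise obj to <output_folder>/<name> with pickle
    os.makedirs(output_folder, exist_ok=True)
    with open(os.path.join(output_folder, name), 'wb') as fh:
        pickle.dump(obj, fh)


def load_to_file(name, output_folder='.'):
    # Inverse of save_to_file: unpickle <output_folder>/<name>
    with open(os.path.join(output_folder, name), 'rb') as fh:
        return pickle.load(fh)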
def filter_dp_triplets(filenames, vocab, files):
    # Filter DP triples based on vocab: keep only triples whose head and tail
    # words are both in the vocabulary. (The second parameter was previously
    # an unused thread index; the caller actually passes the vocabulary.)
    for f in files:
        relation = []
        final_triplet = []
        triplet_data = F.load_to_file("dp_data_pos/" + f, filenames.output_folder)
        # Find H (head), R (relation), T (tail)
        c = 0
        for sent in triplet_data:
            # print(c)  # per-sentence progress, very noisy
            (H, HPOS), R, (T, TPOS) = sent
            H = H.lower()
            R = R.lower()
            T = T.lower()
            if R not in relation and R != "":
                relation.append(R)
            if H not in vocab or T not in vocab:
                continue
            final_triplet.append((H, R, T))
            c += 1
        print(f)
        F.save_to_file("Filtered_DP/" + filenames.dp_triplet_file + "_" + f,
                       final_triplet, filenames.output_folder)
        F.save_to_file("Relations_DP/" + filenames.dp_relation_file + "_" + f,
                       relation, filenames.output_folder)
def find_co_occurences(filenames):
    # Co-occurrence counting within a sliding window
    os.system("mkdir -p " + filenames.output_folder + "/occurences")
    data = F.load_to_file(filenames.updated_words_file_name, filenames.output_folder)
    vocab = F.load_to_file(filenames.vocab_file, filenames.output_folder)
    word_to_index = F.load_to_file(filenames.w2i_file, filenames.output_folder)
    index_to_word = F.load_to_file(filenames.i2w_file, filenames.output_folder)
    print(word_to_index)
    print(index_to_word)
    print(len(vocab), len(data))

    data_index = [word_to_index[w] for w in data]
    unknown_id = word_to_index['UKN']
    occurrence = {}
    window = 2
    print("Words:", len(data_index))
    for i in range(-window, window + 1):
        occurrence[i] = []
    for c in range(len(data_index)):
        start = max(0, c - window)
        end = min(len(data_index) - 1, c + window)
        if data_index[c] != unknown_id:
            for j in range(start, end + 1):
                # Skip the centre word itself and unknown context words
                # (was `!= 0`, which wrongly excluded the first vocab word)
                if c != j and data_index[j] != unknown_id:
                    occurrence[j - c].append((data_index[c], data_index[j]))
        # Flush to disk every 10M words to bound memory usage
        if c % 10000000 == 9999999:
            F.save_to_file(
                "occurences/" + filenames.updated_words_file_name + str((c // 10000000) + 1),
                occurrence, filenames.output_folder)
            for i in range(-window, window + 1):
                occurrence[i] = []
    # Save whatever remains: the only chunk for corpora under 10M words,
    # otherwise the final partial chunk (previously lost for long corpora)
    F.save_to_file(
        "occurences/" + filenames.updated_words_file_name + str(len(data_index)),
        occurrence, filenames.output_folder)
    for k in occurrence:
        print(k, len(occurrence[k]))
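# A minimal, self-contained illustration (not part of the pipeline) of what
# the window loop above collects: for window = 2, each pair of word ids is
# stored under its signed offset j - c.
if __name__ == '__main__':
    toy = ['the', 'cat', 'sat', 'on', 'the']
    toy_w2i = {'the': 0, 'cat': 1, 'sat': 2, 'on': 3}
    ids = [toy_w2i[w] for w in toy]
    window = 2
    occ = {i: [] for i in range(-window, window + 1) if i != 0}
    for c in range(len(ids)):
        for j in range(max(0, c - window), min(len(ids) - 1, c + window) + 1):
            if j != c:
                occ[j - c].append((ids[c], ids[j]))
    # e.g. occ[1] holds (word, next-word) pairs: [(0, 1), (1, 2), (2, 3), (3, 0)]
    print(occ)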
def getVocabulary(words, less, filenames):
    import os
    # Lower-case all words (cached to file)
    if not os.path.isfile(filenames.output_folder + '/' + filenames.lower_words_file_name):
        words_lower = [w.lower() for w in words]
        F.save_to_file(filenames.lower_words_file_name, words_lower, filenames.output_folder)
    else:
        print("Lower Words File Found")
        words_lower = F.load_to_file(filenames.lower_words_file_name, filenames.output_folder)
    print("Lower words count", len(words_lower))

    # Remove words occurring fewer than `less` times
    d = Counter(words_lower)
    v = list(d.keys())

    # Write all words with their counts
    f = open(filenames.output_folder + '/count_of_all_words.csv', 'w')
    for k in d:
        f.write(str(k) + "\t" + str(d[k]) + "\n")
    f.close()

    for k in v:
        if d[k] < less:
            del d[k]
    vocab = list(d.keys())
    print("Removing less", str(less), len(vocab))

    # Drop words containing digits
    vocab = [w for w in vocab if not re.match(r'.*[0-9]+.*', w)]
    print("Removing Numbers", len(vocab))
    # Drop words containing special characters
    vocab = [w for w in vocab if not re.match(r'.*[:;,_`=!@#$%^&*()/<>"\'\?\\\+\-\{\}\[\]\|\.]+.*', w)]
    print("Removing Special", len(vocab))

    # Write filtered words with their counts (set lookup for speed)
    vocab_set = set(vocab)
    f = open(filenames.output_folder + '/count_of_filtered_words_' + str(less) + '.csv', 'w')
    for k in d:
        if k in vocab_set:
            f.write(str(k) + "\t" + str(d[k]) + "\n")
    f.close()

    # Replace out-of-vocabulary words with the 'UKN' token
    updated_words = []
    vocab_dict = {}
    for w in vocab:
        vocab_dict[w] = ""
    for w in words_lower:
        if w in vocab_dict:
            updated_words.append(w)
        else:
            updated_words.append('UKN')
    vocab.append('UKN')
    print(len(updated_words))
    return updated_words, vocab
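# Tiny standalone sanity check of the thresholding idea (it does not call
# getVocabulary itself, which also writes cache files to disk): with
# less = 2, words seen fewer than twice collapse to 'UKN'.
if __name__ == '__main__':
    from collections import Counter
    toy = ['good', 'good', 'film', 'bad', 'good']
    counts = Counter(toy)
    keep = {w for w, n in counts.items() if n >= 2}
    print([w if w in keep else 'UKN' for w in toy])
    # -> ['good', 'good', 'UKN', 'UKN', 'good']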
def find_temp_co_occurences(filenames):
    # Stream co-occurrence pairs to a text file instead of accumulating them
    # in memory (the chunked in-memory logic from find_co_occurences is
    # dropped in this variant)
    f = open(filenames.output_folder + '/occurences.txt', 'w')
    data = F.load_to_file(filenames.updated_words_file_name, filenames.output_folder)
    word_to_index = F.load_to_file(filenames.w2i_file, filenames.output_folder)
    index_to_word = F.load_to_file(filenames.i2w_file, filenames.output_folder)
    print(word_to_index)
    print(index_to_word)
    print(len(data))

    data_index = [word_to_index[w] for w in data]
    unknown_id = word_to_index['UKN']
    window = 2
    for c in range(len(data_index)):
        start = max(0, c - window)
        end = min(len(data_index) - 1, c + window)
        if data_index[c] != unknown_id:
            for j in range(start, end + 1):
                if c != j and data_index[j] != unknown_id:
                    # centre-word id, context-word id, signed offset
                    f.write(str(data_index[c]) + "\t" + str(data_index[j]) + "\t" + str(j - c) + "\n")
    f.close()
def find_dp_triplets(filenames):
    # Filter DP triples based on vocab: convert the DP dicts into (H, R, T) triples
    vocab = F.load_to_file(filenames.vocab_file, filenames.output_folder)
    files = os.listdir(filenames.output_folder + "/dp_data_pos")
    relation = []
    final_triplet = []
    for f in files:
        triplet_data = F.load_to_file("dp_data_pos/" + f, filenames.output_folder)
        # Find H (head), R (relation), T (tail)
        for sent in triplet_data:
            (H, HPOS), R, (T, TPOS) = sent
            H = H.lower()
            R = R.lower()
            T = T.lower()
            if R not in relation and R != "":
                relation.append(R)
            if H not in vocab or T not in vocab:
                continue
            final_triplet.append((H, R, T))
    print(len(final_triplet), len(relation))
    print(final_triplet)
    F.save_to_file(filenames.dp_triplet_file, final_triplet, filenames.output_folder)
    F.save_to_file(filenames.dp_relation_file, relation, filenames.output_folder)
    print(relation)
def combine_dp_triplets(filenames):
    # Concatenate the per-file filtered DP triples into a single file
    files = os.listdir(filenames.output_folder + "/Filtered_DP")
    all_triplets = []
    c = 0
    for f in files:
        triplet_data = F.load_to_file("Filtered_DP/" + f, filenames.output_folder)
        all_triplets += triplet_data
        print(c)
        c += 1
    F.save_to_file('all_dp_triplet', all_triplets, filenames.output_folder)
def find_wn_relations(filenames):
    # WordNet relations between every pair of (non-stopword) vocabulary words;
    # get_relation is defined in the WordNet script elsewhere in this repo
    vocab = F.load_to_file(filenames.vocab_file, filenames.output_folder)
    stop = stopwords.words('english')
    d = {}
    count = 1
    for w1 in vocab:
        print(count)
        countj = 0
        d[w1] = {}
        if w1 not in stop and len(w1) > 2:
            for w2 in vocab:
                countj += 1
                if w1 != w2 and w2 not in stop and len(w2) > 2:
                    rel = get_relation(w1, w2)
                    if len(rel) > 0:
                        d[w1][w2] = rel
        print(count, countj)
        count += 1
    F.save_to_file(filenames.wordnet_triplet_file, d, filenames.output_folder)
    a = F.load_to_file(filenames.wordnet_triplet_file, filenames.output_folder)
    print(a)
def combine_dp_relations(filenames):
    # Merge the per-file relation lists and de-duplicate
    files = os.listdir(filenames.output_folder + "/Relations_DP")
    all_triplets = []
    c = 0
    for f in files:
        triplet_data = F.load_to_file("Relations_DP/" + f, filenames.output_folder)
        all_triplets += triplet_data
        print(c)
        c += 1
    all_triplets = list(set(all_triplets))
    print(all_triplets)
    F.save_to_file(filenames.dp_relation_file, all_triplets, filenames.output_folder)
def find_dp_triplets(filenames, NO_OF_THREADS=2):
    files = os.listdir(filenames.output_folder + "/dp_data_pos")
    vocab = F.load_to_file(filenames.vocab_file, filenames.output_folder)
    os.system("mkdir -p " + filenames.output_folder + "/Filtered_DP")
    os.system("mkdir -p " + filenames.output_folder + "/Relations_DP")

    def f(i):
        # Give thread i its contiguous slice of the file list. Slicing with
        # int(i * n / k) boundaries leaves no gaps (the original
        # files[start, end] tuple indexing raised a TypeError), and the
        # vocabulary is passed through to filter_dp_triplets.
        start = int(i * len(files) / NO_OF_THREADS)
        end = int((i + 1) * len(files) / NO_OF_THREADS)
        filter_dp_triplets(filenames, vocab, files[start:end])

    t = [threading.Thread(target=f, args=(i,)) for i in range(NO_OF_THREADS)]
    for temp in t:
        temp.start()
    for temp in t:
        temp.join()
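# Standalone sanity check for the thread chunking above: the int(i * n / k)
# boundaries partition the list with no gaps or overlaps, even when the
# number of files is not a multiple of the thread count.
if __name__ == '__main__':
    n, k = 11, 4
    chunks = [list(range(n))[int(i * n / k):int((i + 1) * n / k)] for i in range(k)]
    assert sum(chunks, []) == list(range(n))
    print(chunks)  # [[0, 1], [2, 3, 4], [5, 6, 7], [8, 9, 10]]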
import torch
import torch.nn as nn
import torch.nn.functional as fun
import functions as F
import torch.optim as optim
from torch.autograd import Variable
import numpy as np
import matplotlib.pyplot as plt
import os

word_to_index_file = 'word_to_index'
index_to_word_file = 'index_to_word'
word_to_index = F.load_to_file(word_to_index_file)
index_to_word = F.load_to_file(index_to_word_file)
relation_to_index_file = 'relation_to_index'
index_to_relation_file = 'index_to_relation'
relation_to_index = F.load_to_file(relation_to_index_file)
index_to_relation = F.load_to_file(index_to_relation_file)


def get_word_vectors(one, func, epoch, folder, name):
    embedding_dim = 100
    vocab_dim = len(index_to_word)
    relation_dim = len(index_to_relation)
    model_file = F.folder + folder + 'training_t' + name + str(epoch) + '.pt'
    print(model_file)
    if one:
        net = NetOne(embedding_dim, vocab_dim, relation_dim, func)
    else:
        net = Net(embedding_dim, vocab_dim, relation_dim, func)
dp_relation_file = 'dp_relation'
dp_triplet_file = 'dp_triplets'
wordnet_triplet_file = 'wordnet_relation'
occ_triplet_file = 'occurrence'
word_to_index_file = 'word_to_index'
index_to_word_file = 'index_to_word'
wn_num_file = 'wn_num'
occ_num_file = 'occ_num'
dp_num_file = 'dp_num'
occ_num_dups_file = 'occ_num_dups'
relation_to_index_file = 'relation_to_index'
index_to_relation_file = 'index_to_relation'
positive_table_file = 'Positive_Table'

word_to_index = F.load_to_file(word_to_index_file)
index_to_word = F.load_to_file(index_to_word_file)
relation_to_index = F.load_to_file(relation_to_index_file)
index_to_relation = F.load_to_file(index_to_relation_file)
wn_num = F.load_to_file(wn_num_file)
occ_num = F.load_to_file(occ_num_file)
dp_num = F.load_to_file(dp_num_file)
occ_num_dups = F.load_to_file(occ_num_dups_file)
positive_table = F.load_to_file(positive_table_file)

# count_r accumulates the number of (head, relation) entries in the table
count = 0
count_r = 0
for t in positive_table:
    count_r += len(positive_table[t])
# Occurrence
import functions as F

dt = F.datetime.now()
time_t = F.datetime.strftime(dt, "%x %X")
print("Start", time_t)

data_file = 'updated_words'
vocab_file = 'vocab'
w2i_file = 'word_to_index'
i2w_file = 'index_to_word'
occurrence_data_file = 'occurrence'

data = F.load_to_file(data_file)
vocab = F.load_to_file(vocab_file)
word_to_index = F.load_to_file(w2i_file)
index_to_word = F.load_to_file(i2w_file)
print(word_to_index)
print(index_to_word)
print(len(vocab), len(data))

data_index = [word_to_index[w] for w in data]
unknown_id = word_to_index['unknown']
occurrence = {}
window = 2
print("Words:", len(data_index))
for i in range(-window, window + 1):
    occurrence[i] = []
def combine_all_triplets(filenames):
    # Build numeric (positive) triples from all three relation sources
    vocab = F.load_to_file(filenames.vocab_file, filenames.output_folder)
    word_to_index = F.load_to_file(filenames.w2i_file, filenames.output_folder)
    index_to_word = F.load_to_file(filenames.i2w_file, filenames.output_folder)
    dp_relation = F.load_to_file(filenames.dp_relation_file, filenames.output_folder)
    dp_triplet = F.load_to_file(filenames.dp_triplet_file, filenames.output_folder)
    wordnet_triplet = F.load_to_file(filenames.wordnet_triplet_file, filenames.output_folder)

    # Merge the per-chunk co-occurrence files
    files = os.listdir(filenames.output_folder + "/occurences")
    occ = {}
    flag = 1
    for f in files:
        print(f)
        if flag:
            occ = F.load_to_file("occurences/" + f, filenames.output_folder)
            flag = 0
        else:
            temp_occ = F.load_to_file("occurences/" + f, filenames.output_folder)
            for k in occ:
                occ[k] += temp_occ[k]

    wordnet_relation = ['antonym', 'synset', 'hyponym', 'hypernym', 'holonym', 'strong', 'weak']
    print("DP rel: ", dp_relation)
    print("WN rel: ", wordnet_relation)
    print("OC rel: ", list(occ.keys()))

    # Index every relation: DP labels, WordNet labels, and window offsets
    relations = dp_relation + wordnet_relation + list(occ.keys())
    relation_to_index = {}
    index_to_relation = {}
    for k, v in enumerate(relations):
        relation_to_index[v] = k
        index_to_relation[k] = v
    F.save_to_file(filenames.r2i_file, relation_to_index, filenames.output_folder)
    F.save_to_file(filenames.i2r_file, index_to_relation, filenames.output_folder)
    relation_to_index = F.load_to_file(filenames.r2i_file, filenames.output_folder)
    index_to_relation = F.load_to_file(filenames.i2r_file, filenames.output_folder)
    print(relation_to_index)
    print(index_to_relation)

    # DP triples -> id triples; skip triples with out-of-vocabulary words
    dp_number_triple = []
    dp_relation_num = [relation_to_index[r] for r in dp_relation]
    count = 0
    for dp_triple in dp_triplet:
        try:
            a, b, c = dp_triple
            a = word_to_index[a]
            b = relation_to_index[b]
            c = word_to_index[c]
            dp_number_triple.append((a, b, c))
        except KeyError:
            print(dp_triple)
            count += 1
    print(len(dp_number_triple))

    # WordNet triples -> id triples
    wn_number_triple = []
    wn_relation_num = [relation_to_index[r] for r in wordnet_relation]
    for w1 in wordnet_triplet:
        for w2 in wordnet_triplet[w1]:
            a = word_to_index[w1]
            b = word_to_index[w2]
            for c in wordnet_triplet[w1][w2]:
                c = relation_to_index[c]
                wn_number_triple.append((a, c, b))
    print(len(wn_number_triple))

    # Co-occurrence triples -> id triples (all, with duplicates)
    occ_number_triple = []
    occ_relation_num = [relation_to_index[r] for r in list(occ.keys())]
    for r in occ:
        c = relation_to_index[r]
        for a, b in occ[r]:
            occ_number_triple.append((a, c, b))
    print(len(occ_number_triple))

    # Co-occurrence triples without duplicates (dict keys act as a set)
    occ_number_triple_without_duplicate = {}
    occ_relation_num_without_duplicate = [relation_to_index[r] for r in list(occ.keys())]
    for r in occ:
        if -10 < r < 10:
            c = relation_to_index[r]
            print(r, c)
            # snapshot so the per-offset increment is printed below
            l = len(occ_number_triple_without_duplicate)
            for a, b in occ[r]:
                occ_number_triple_without_duplicate[(a, c, b)] = 1
            print(len(occ_number_triple_without_duplicate) - l)
    print(list(occ_number_triple_without_duplicate.keys())[:10])
    print(len(list(occ_number_triple_without_duplicate.keys())))
    occ_number_triple_without_dup = list(occ_number_triple_without_duplicate.keys())

    F.save_to_file(filenames.all_relations, relations, filenames.output_folder)
    print(len(relations))
    print(len(wn_number_triple))
    print(len(dp_number_triple))
    print(len(occ_number_triple))
    print(len(occ_number_triple_without_duplicate))
    print(index_to_relation)
    F.save_to_file(filenames.wn_num_file, wn_number_triple, filenames.output_folder)
    F.save_to_file(filenames.occ_num_file, occ_number_triple, filenames.output_folder)
    F.save_to_file(filenames.dp_num_file, dp_number_triple, filenames.output_folder)
    F.save_to_file(filenames.occ_num_dups_file, occ_number_triple_without_dup, filenames.output_folder)
    print(len(wn_number_triple), len(occ_number_triple), len(dp_number_triple))

    # Positive table: head id -> relation id -> list of tail ids
    positive_table = {}
    total_triple = wn_number_triple + dp_number_triple + occ_number_triple_without_dup
    for triple in total_triple:
        a, b, c = triple
        if a not in positive_table:
            positive_table[a] = {}
        if b not in positive_table[a]:
            positive_table[a][b] = [c]
        else:
            positive_table[a][b].append(c)
    F.save_to_file(filenames.positive_table_file, positive_table, filenames.output_folder)
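# Standalone illustration of the positive_table layout built above:
# head id -> relation id -> list of tail ids. The ids here are made up.
if __name__ == '__main__':
    triples = [(0, 5, 1), (0, 5, 2), (3, 7, 1)]
    table = {}
    for a, b, c in triples:
        table.setdefault(a, {}).setdefault(b, []).append(c)
    print(table)        # {0: {5: [1, 2]}, 3: {7: [1]}}
    print(table[0][5])  # all positive tails for (head 0, relation 5)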
# File-name constants; sents_file_name and words_file_name were missing from
# this fragment and are filled in with the names used elsewhere in the pipeline
sents_file_name = 'sents'
words_file_name = 'words'
updated_words_file_name = 'updated_words'
vocab_file = 'vocab'
w2i_file = 'word_to_index'
i2w_file = 'index_to_word'
corpus_name = '../Data/reviews.txt'

data = ""
sentences = []
words = []

# 's' / 'w' command-line flags skip recomputing sentences / words
if 's' not in F.sys.argv:
    print("A")
    data = F.readData(corpus_name)
    sentences = F.getSentences(data)
    F.save_to_file(sents_file_name, sentences)
else:
    print("B")
    sentences = F.load_to_file(sents_file_name)

if 'w' not in F.sys.argv:
    print("C")
    words = F.getWords(sentences)
    F.save_to_file(words_file_name, words)
else:
    print("D")
    words = F.load_to_file(words_file_name)

updated_words, vocab = F.getVocabulary(words, 400)
F.save_to_file(vocab_file, vocab)
F.save_to_file(updated_words_file_name, updated_words)

word_to_index = {}
index_to_word = {}
def combine_all_triplets(filenames):
    # Variant of combine_all_triplets that only rebuilds the relation index;
    # the number-triple construction is disabled in this version
    vocab = F.load_to_file(filenames.vocab_file, filenames.output_folder)
    word_to_index = F.load_to_file(filenames.w2i_file, filenames.output_folder)
    index_to_word = F.load_to_file(filenames.i2w_file, filenames.output_folder)
    dp_relation = F.load_to_file(filenames.dp_relation_file, filenames.output_folder)
    # dp_triplet = F.load_to_file(filenames.dp_triplet_file, filenames.output_folder)
    # wordnet_triplet = F.load_to_file(filenames.wordnet_triplet_file, filenames.output_folder)
    files = os.listdir(filenames.output_folder + "/occurences")

    wordnet_relation = ['antonym', 'synset', 'hyponym', 'hypernym', 'holonym', 'strong', 'weak']
    # Window offsets used as co-occurrence relations, hard-coded here rather
    # than read from the merged occurrence files (was a list mistakenly
    # queried with .keys())
    occ_relation = [0, 1, 2, -1, -2]
    print("DP rel: ", dp_relation)
    print("WN rel: ", wordnet_relation)
    print("OC rel: ", occ_relation)

    relations = dp_relation + wordnet_relation + occ_relation
    print(relations)
    relation_to_index = {}
    index_to_relation = {}
    for k, v in enumerate(relations):
        relation_to_index[v] = k
        index_to_relation[k] = v
    F.save_to_file(filenames.r2i_file, relation_to_index, filenames.output_folder)
    F.save_to_file(filenames.i2r_file, index_to_relation, filenames.output_folder)
    # (The DP/WordNet/co-occurrence number-triple construction and duplicate
    # removal are identical to the full combine_all_triplets above and remain
    # disabled in this version.)
    F.save_to_file(filenames.all_relations, relations, filenames.output_folder)
    # (The count printing, per-source number-triple saving, and the
    # positive-table build are disabled in this version; see the full
    # combine_all_triplets above.)
import functions as F
import sys

# Usage: python load_pickle.py <folder> <name>
data = F.load_to_file(sys.argv[2], sys.argv[1])
print(len(data))
for k in data:
    # print(str(k) + "\t" + str(data[k]))
    print(k)
# WordNet Relation
import functions as F
from nltk.corpus import wordnet as wn1
from nltk.corpus import stopwords

dt = F.datetime.now()
time_t = F.datetime.strftime(dt, "%x %X")
print("Start", time_t)

vocab_file = 'vocab'
vocab = F.load_to_file(vocab_file)
wordnet_relation_file = 'wordnet_relation'

stop = stopwords.words('english')

# Definition-based relation codes:
#   strong -> 3
#   weak   -> 1 : w1 present in w2's definition
#   weak   -> 2 : w2 present in w1's definition
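# get_relation itself is not shown in this file; below is a minimal sketch of
# what it is assumed to return, based on the comments above and on the
# relation names listed in combine_all_triplets ('antonym', 'synset',
# 'hyponym', 'hypernym', 'weak', ...). The holonym check and the exact
# strong/weak scoring are omitted; the authors' implementation may differ.
def get_relation(w1, w2):
    rel = []
    s1 = wn1.synsets(w1)
    s2 = wn1.synsets(w2)
    if not s1 or not s2:
        return rel
    # synset: the two words share a synset
    if set(s1) & set(s2):
        rel.append('synset')
    # antonym: some lemma of w1 has an antonym lemma named w2
    for s in s1:
        for l in s.lemmas():
            if any(a.name() == w2 for a in l.antonyms()):
                rel.append('antonym')
                break
    # hypernym / hyponym: direct hypernym links between any synset pair
    if {h for s in s1 for h in s.hypernyms()} & set(s2):
        rel.append('hypernym')
    if {h for s in s2 for h in s.hypernyms()} & set(s1):
        rel.append('hyponym')
    # weak: one word appears in the other's gloss (definition)
    if any(w1 in s.definition().split() for s in s2):
        rel.append('weak')
    if any(w2 in s.definition().split() for s in s1):
        rel.append('weak')
    return list(set(rel))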
occ_triplet_file = 'occurrence'
word_to_index_file = 'word_to_index'
index_to_word_file = 'index_to_word'
wn_num_file = 'wn_num'
occ_num_file = 'occ_num'
dp_num_file = 'dp_num'
occ_num_dups_file = 'occ_num_dups'
relation_to_index_file = 'relation_to_index'
index_to_relation_file = 'index_to_relation'
all_relations = 'all_relations'
# vocab_file was missing from this fragment; the name matches the other scripts
vocab_file = 'vocab'

vocab = F.load_to_file(vocab_file)
word_to_index = F.load_to_file(word_to_index_file)
index_to_word = F.load_to_file(index_to_word_file)
dp_relation = F.load_to_file(dp_relation_file)
dp_triplet = F.load_to_file(dp_triplet_file)
wordnet_triplet = F.load_to_file(wordnet_triplet_file)

# An earlier version merged the per-chunk occurrence files here:
# files = os.listdir(F.folder + "occurences")
occ = {}
#!/usr/bin/env python
# coding: utf-8

# Filter DP triples based on vocab

# DP Dict to Triplet
import functions as F

dt = F.datetime.now()
time_t = F.datetime.strftime(dt, "%x %X")
print("Start", time_t)

vocab_file = 'vocab'
vocab = F.load_to_file(vocab_file)

# triplets_dict_file = 'dp_triplets_dict'
# dp_triplets = F.load_to_file(triplets_dict_file)

final_triplet_file = 'dp_triplets'
dp_relation_file = 'dp_relation'

# Concatenate all triplets from threads
# final_triplet_with_pos = []
# print(len(dp_triplets))
# for m in dp_triplets:
#     for n in dp_triplets[m]:
# Preprocessing and DP
import functions as F

sents_file_name = 'sents'
words_file_name = 'words'
updated_words_file_name = 'updated_words'
vocab_file = 'vocab'
w2i_file = 'word_to_index'
i2w_file = 'index_to_word'

dt = F.datetime.now()
time_t = F.datetime.strftime(dt, "%x %X")
print("Start", time_t)

sentences = F.load_to_file(sents_file_name)
# sent_data = F.remove_special_from_sent_data(sent_data)
# F.save_to_file('filtered_sent_data', sent_data)
# sent_data = F.load_to_file('filtered_sent_data')
print("Sentence:", len(sentences))

import threading
NO_OF_THREADS = 25
triplets_dict = {}
F.count = 0

from nltk.parse.stanford import StanfordDependencyParser
path_to_jar = '/home/cs17mtech11004/stanford-parser-full-2018-02-27/stanford-parser.jar'
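# A hedged sketch (not the original script's continuation, which is not shown
# here) of how this old nltk wrapper is typically driven. path_to_models_jar
# is an assumption based on the 2018-02-27 release layout. Note that
# parse.triples() yields exactly the ((head, POS), rel, (dep, POS)) shape
# unpacked in filter_dp_triplets / find_dp_triplets above.
path_to_models_jar = '/home/cs17mtech11004/stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar'
dep_parser = StanfordDependencyParser(path_to_jar=path_to_jar,
                                      path_to_models_jar=path_to_models_jar)
parse = next(dep_parser.raw_parse('The quick brown fox jumps over the lazy dog'))
for (head, hpos), rel, (dep, dpos) in parse.triples():
    print(head, rel, dep)  # e.g. jumps nsubj fox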