def compute_NER(corpus):
    NER = []
    #fi=open("NER_features_train.txt","w")
    st = NERTagger(read_property('StanfordNerClassifier'),
                   read_property('StanfordNerJarPath'))
    for sentence in corpus:
        ner = st.tag(sentence.split())
        ner_tag = ""
        for n in ner:
            ner_tag = ner_tag + n[1] + " "
        NER.append(ner_tag)
    return NER
def fine_train_label(label):
    file_label = read_property('FineInputFiles') + label + "_training.txt"
    train_corpus, temp, train_class = file_preprocess(file_label)
    file_word = read_property('FineOutputfilesPath') + label + "_training_word.txt"
    file_POS = read_property('FineOutputfilesPath') + label + "_training_POS.txt"
    file_NER = read_property('FineOutputfilesPath') + label + "_training_NER.txt"
    file_Chunk = read_property('FineOutputfilesPath') + label + "_training_Chunk.txt"

    vectorizer_words = CountVectorizer(min_df=1, ngram_range=(1, 2))
    vectorizer_POS = CountVectorizer(min_df=1, ngram_range=(1, 2))
    vectorizer_Chunk = CountVectorizer(min_df=1, ngram_range=(1, 2))
    vectorizer_NER = CountVectorizer(min_df=1, ngram_range=(1, 2))

    X_words = vectorizer_words.fit_transform(append_noread(file_word))
    X_POS = vectorizer_POS.fit_transform(append_noread(file_POS))
    X_NER = vectorizer_NER.fit_transform(append_noread(file_NER))
    X_Chunk = vectorizer_Chunk.fit_transform(append_noread(file_Chunk))

    X = hstack((X_words, X_POS))
    X_train = hstack((X, X_NER))
    X_train = hstack((X_train, X_Chunk))

    '''saving the vectorizers to secondary memory'''
    pickle_out = open("TrainedModels/" + label + "_vectorizer_words.pickle", "wb")
    cPickle.dump(vectorizer_words, pickle_out, protocol=cPickle.HIGHEST_PROTOCOL)
    pickle_out = open("TrainedModels/" + label + "_vectorizer_POS.pickle", "wb")
    cPickle.dump(vectorizer_POS, pickle_out, protocol=cPickle.HIGHEST_PROTOCOL)
    pickle_out = open("TrainedModels/" + label + "_vectorizer_NER.pickle", "wb")
    cPickle.dump(vectorizer_NER, pickle_out, protocol=cPickle.HIGHEST_PROTOCOL)
    pickle_out = open("TrainedModels/" + label + "_vectorizer_Chunk.pickle", "wb")
    cPickle.dump(vectorizer_Chunk, pickle_out, protocol=cPickle.HIGHEST_PROTOCOL)
    '''storing done'''

    print "Applying SVC"
    label_model = LinearSVC(loss='squared_hinge', dual=False, tol=1e-3)
    label_model = label_model.fit(X_train, train_class)
    print(label, " training done")
    return label_model
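# Usage sketch (not taken verbatim from the repo): the six coarse labels and the
# "TrainedModels/<label>_model.pickle" naming mirror what the surrounding scripts
# use, so training and persisting every fine-grained model would look roughly like:
import cPickle

for coarse_label in ["LOC", "HUM", "NUM", "ABBR", "ENTY", "DESC"]:
    fine_model = fine_train_label(coarse_label)
    pickle_out = open("TrainedModels/" + coarse_label + "_model.pickle", "wb")
    cPickle.dump(fine_model, pickle_out, protocol=cPickle.HIGHEST_PROTOCOL)
    pickle_out.close()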
def append(filename):
    f = open(read_property(filename), "r")
    corpus = []
    for lines in f:
        l = lines.split()
        words = ""
        for w in l:
            words = words + w + " "
        corpus.append(words)
    return corpus
def compute_NER(corpus):
    #NER=[]
    fi = open(read_property('NER_features_train_coarse_path'), "w")
    st = NERTagger(read_property('StanfordNerClassifier'),
                   read_property('StanfordNerJarPath'))
    for sentence in corpus:
        ner = st.tag(sentence.split())
        #print ner
        #pos_seq=nltk.pos_tag(text)
        #print pos_seq
        ner_tag = ""
        for n in ner:
            #print n[1]
            ner_tag = ner_tag + n[1] + " "
        #print pos_tags
        fi.write(ner_tag + "\n")
        #NER.append(ner_tag)
    #print "The bag of words of NER is ",NER
    fi.close()
def compute_NER(corpus):
    NER = []
    fi = open(read_property('NER_features_test_coarse_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        ners = annotator.getAnnotations(sentence)['ner']
        ner = ""
        for elem in ners:
            ner = ner + elem[1] + " "
        print ner
        fi.write(ner + "\n")
        NER.append(ner)
    return NER
def compute_POS_Tags(corpus):
    #POS=[]
    fi = open(read_property('POS_features_train_coarse_path'), "w")
    for sentence in corpus:
        text = nltk.word_tokenize(sentence)
        pos_seq = nltk.pos_tag(text)
        #print pos_seq
        pos_tags = ""
        for pos in pos_seq:
            pos_tags = pos_tags + pos[1] + " "
        fi.write(pos_tags + "\n")
        #print pos_tags
        #POS.append(pos_tags)
    #print "The bag of words of POS is ",POS
    fi.close()
def compute_POS_Tags(corpus):
    #POS=[]
    fi = open(read_property('POS_features_test_coarse_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        pos_seq = annotator.getAnnotations(sentence)['pos']
        #print pos_seq
        pos_tags = ""
        for pos in pos_seq:
            pos_tags = pos_tags + pos[1] + " "
        fi.write(pos_tags + "\n")
        print pos_tags  ###############
        #POS.append(pos_tags)
    #print "The bag of words of POS is ",POS
    fi.close()
def file_preprocess(filename):
    corpus = []
    classes = []
    f = open(filename, 'r')
    fi = open(read_property('word_features_test_coarse_path'), "w")
    lines = f.readlines()
    for line in lines:
        line = line.rstrip('\n')
        line = preprocess(line)
        print "The line is ", line  ###################
        sentence = ""
        words = line.split()
        # skip words[0] (the class label); keep only the question words
        for i in range(0, len(words)):
            if not (i == 0):
                sentence = sentence + (words[i]) + " "
        fi.write(sentence + "\n")
        corpus.append(sentence)
    f.close()
    fi.close()
    return corpus, classes
def file_preprocess(filename):
    corpus = []
    classes = []
    f = open(filename, 'r')
    fi = open(read_property('word_features_train_coarse_path'), "w")
    lines = f.readlines()
    for line in lines:
        line = line.rstrip('\n')
        line = preprocess(line)
        #print "The line is ",line
        sentence = ""
        words = line.split()
        # skip words[0] (the class label); keep only the question words
        for i in range(0, len(words)):
            if not (i == 0):
                sentence = sentence + (words[i]) + " "
        fi.write(sentence + "\n")
        corpus.append(sentence)
    f.close()
    fi.close()
    return corpus, classes
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 30 23:58:14 2015

@author: nausheenfatma
"""
from readproperties import read_property

f = open(read_property('trainingfilepath'), "r")
fi_1 = open(read_property('FineInputFiles') + 'LOC_training.txt', "w")
fi_2 = open(read_property('FineInputFiles') + 'HUM_training.txt', "w")
fi_3 = open(read_property('FineInputFiles') + 'NUM_training.txt', "w")
fi_4 = open(read_property('FineInputFiles') + 'ABBR_training.txt', "w")
fi_5 = open(read_property('FineInputFiles') + 'ENTY_training.txt', "w")
fi_6 = open(read_property('FineInputFiles') + 'DESC_training.txt', "w")

classes = []
lines = f.readlines()
f.close()
i = 0
for line in lines:
    i = i + 1
    line = line.rstrip('\n')
    if not (line == "\n"):
        classes.append((line.split()[0]).split(":")[0])
        label = (line.split()[0]).split(":")[0]
        if label == "LOC":
            fi_1.write(str(i) + " ")
            fi_1.write(line + "\n")
            print line
        if label == "HUM":
            print ner
            fi.write(ner + "\n")
            NER.append(ner)
    return NER

##Compute Chunks##
def compute_Chunks(corpus):
    #Chunk_Tags=[]
    fi = open(read_property('Chunk_features_test_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        chunks = annotator.getAnnotations(sentence)['chunk']
        chunk = ""
        for elem in chunks:
            chunk = chunk + elem[1] + " "
        print chunk
        fi.write(chunk + "\n")
        #Chunk_Tags.append(chunk)
    #print "The bag of words for Chunks is ",Chunk_Tags
    fi.close()
    #return Chunk_Tags

filename_train = read_property('testfilepath')
corpus, train_class = file_preprocess(filename_train)
compute_POS_Tags(corpus)
compute_NER(corpus)
compute_Chunks(corpus)
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
#from nltk.tag.stanford import NERTagger
from nltk.tag.stanford import StanfordNERTagger
from scipy.sparse import hstack
from sklearn.svm import LinearSVC
from practnlptools.tools import Annotator
from readproperties import read_property
from sklearn.externals import joblib

##removing special characters from sentence##
annotator = Annotator()
st = StanfordNERTagger(read_property('StanfordNerClassifier'),
                       read_property('StanfordNerJarPath'))

f = open(read_property('coarse_classification_path'), "r")
fi = open(read_property('FineOutputfilesPath') + 'fine_classification.txt', "w")
t_class = []
handled_class = []
for line in f:
    label = line.split()[0]
    #if label in handled_class :
    #    continue
    #handled_class.append(label)
    file_w = read_property('FineInputFiles') + label + "_training.txt"
    fa = open(file_w, "r")
    train_class = []
    for each_line in fa:
chunk="" for elem in chunks: chunk=chunk+elem[1]+" " Chunk_Tags.append(chunk) return Chunk_Tags ######################################TRAINING############################################ #######Train class labels##### train_class=[] f=open(read_property('trainingfilepath'),'r') lines=f.readlines() for line in lines: line=line.rstrip('\n') if not (line=="\n"): train_class.append((line.split()[0]).split(":")[0]) ###words in question### print "Training" f=open(read_property('word_features_train_coarse_path'),"r") corpus=[] for lines in f: l=lines.split() words=""
#    self = LinearSVC(loss='l2', dual=False, tol=1e-3)
#    self = LinearSVC.fit(self, X_train, train_class)
#    test_class = LinearSVC.predict(self, X_test)
#    print test_class
#    fi.write(label + ":")
#    fi.write(test_class[0]+" ")
#    fi.write(line.split(":")[1])
#    t_class.append(label+":"+test_class[0])
#
#fi.close()
#f.close()

###################################### Accuracy Calculation ################################################
test_class_gold = []
f = open(read_property('testfilepath'), 'r')
for lines in f:
    test_class_gold.append(lines.split()[0])
print t_class
print test_class_gold
print len(t_class)
print len(test_class_gold)
hits = 0.00
for i in range(0, len(t_class)):
    if t_class[i] == test_class_gold[i]:
        print t_class[i]
        hits = hits + 1
print "Number of hits = ", hits
print "The accuracy is ", ((hits / len(t_class)) * 100.0), " %"
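# The same figure can be cross-checked with sklearn.metrics (a sketch; it assumes,
# exactly as the loop above does, that t_class and test_class_gold are equally long):
from sklearn.metrics import accuracy_score
print "Cross-check accuracy: ", accuracy_score(test_class_gold, t_class) * 100.0, " %"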
        #NER.append(ner_tag)
    #print "The bag of words of NER is ",NER
    fi.close()
    #return NER

##Compute Chunks##
def compute_Chunks(corpus):
    #Chunk_Tags=[]
    fi = open(read_property('Chunk_features_train_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        chunks = annotator.getAnnotations(sentence)['chunk']
        chunk = ""
        for elem in chunks:
            chunk = chunk + elem[1] + " "
        #print chunk
        fi.write(chunk + "\n")
        #Chunk_Tags.append(chunk)
    #print "The bag of words for Chunks is ",Chunk_Tags
    fi.close()
    #return Chunk_Tags

filename_train = read_property('trainingfilepath')
corpus, train_class = file_preprocess(filename_train)
compute_POS_Tags(corpus)
compute_NER(corpus)
compute_Chunks(corpus)
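# Note: the three compute_* calls above each write one space-separated tag string per
# training question to the files configured in the properties file
# (POS_features_train_coarse_path, NER_features_train_coarse_path and
# Chunk_features_train_path); the coarse trainer then reads those files back
# (e.g. via append()) to build its bag-of-n-grams features.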
    f = open(read_property(filename), "r")
    corpus = []
    for lines in f:
        l = lines.split()
        words = ""
        for w in l:
            words = words + w + " "
        corpus.append(words)
    return corpus

######################################TRAINING############################################
#######Train class labels#####
train_class = []
f = open(read_property('trainingfilepath'), 'r')
lines = f.readlines()
for line in lines:
    line = line.rstrip('\n')
    if not (line == "\n"):
        train_class.append((line.split()[0]).split(":")[0])

print("Training")
vectorizer_words = CountVectorizer(min_df=1, ngram_range=(1, 2))
X_words = vectorizer_words.fit_transform(append('word_features_train_coarse_path'))
f.close()
print("word feature extraction done")
vectorizer_POS = CountVectorizer(min_df=1, ngram_range=(1, 2))
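# The remainder of this script is not reproduced above; a plausible continuation,
# sketched after the same hstack + LinearSVC pattern used in fine_train_label
# (the property keys are the ones already used by the feature-extraction scripts):
X_POS = vectorizer_POS.fit_transform(append('POS_features_train_coarse_path'))
vectorizer_NER = CountVectorizer(min_df=1, ngram_range=(1, 2))
X_NER = vectorizer_NER.fit_transform(append('NER_features_train_coarse_path'))
vectorizer_Chunk = CountVectorizer(min_df=1, ngram_range=(1, 2))
X_Chunk = vectorizer_Chunk.fit_transform(append('Chunk_features_train_path'))
X_train = hstack((hstack((hstack((X_words, X_POS)), X_NER)), X_Chunk))
coarse_model = LinearSVC(loss='squared_hinge', dual=False, tol=1e-3).fit(X_train, train_class)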
annotator = Annotator() for sentence in corpus: chunks = annotator.getAnnotations(sentence)['chunk'] chunk = "" for elem in chunks: chunk = chunk + elem[1] + " " Chunk_Tags.append(chunk) return Chunk_Tags ######################################TRAINING############################################ #######Train class labels##### train_class = [] f = open(read_property('trainingfilepath'), 'r') lines = f.readlines() for line in lines: line = line.rstrip('\n') if not (line == "\n"): train_class.append((line.split()[0]).split(":")[0]) ###words in question### print "Training" f = open(read_property('word_features_train_coarse_path'), "r") corpus = [] for lines in f: l = lines.split() words = "" for w in l:
# -*- coding: utf-8 -*-
from readproperties import read_property

f = open(read_property('trainingfilepath'), "r")
fi_1 = open(read_property('FineInputFiles') + 'LOC_training.txt', "w")
fi_2 = open(read_property('FineInputFiles') + 'HUM_training.txt', "w")
fi_3 = open(read_property('FineInputFiles') + 'NUM_training.txt', "w")
fi_4 = open(read_property('FineInputFiles') + 'ABBR_training.txt', "w")
fi_5 = open(read_property('FineInputFiles') + 'ENTY_training.txt', "w")
fi_6 = open(read_property('FineInputFiles') + 'DESC_training.txt', "w")

classes = []
lines = f.readlines()
f.close()
i = 0
for line in lines:
    i = i + 1
    line = line.rstrip('\n')
    if not (line == "\n"):
        classes.append((line.split()[0]).split(":")[0])
        label = (line.split()[0]).split(":")[0]
        if label == "LOC":
            fi_1.write(str(i) + " ")
            fi_1.write(line + "\n")
            print(line)
        if label == "HUM":
            fi_2.write(str(i) + " ")
            fi_2.write(line + "\n")
            print(line)
        if label == "NUM":
            fi_3.write(str(i) + " ")
annotator = Annotator() for sentence in corpus: chunks = annotator.getAnnotations(sentence)['chunk'] chunk = "" for elem in chunks: chunk = chunk + elem[1] + " " # print chunk To see what these chucks are Chunk_Tags.append(chunk) return Chunk_Tags ######################################TRAINING############################ #######Train class labels##### train_class = [] f = open(read_property('trainingfilepath'), 'r') lines = f.readlines() for line in lines: line = line.rstrip('\n') if not (line == "\n"): train_class.append((line.split()[0]).split(":")[0]) ###words in question### f = open(read_property('word_features_train_coarse_path'), "r") corpus = [] for lines in f: l = lines.split() words = "" for w in l: words = words + w + " " corpus.append(words)
        for elem in chunks:
            chunk = chunk + elem[1] + " "
        # print chunk  # to see what these chunks are
        Chunk_Tags.append(chunk)
    return Chunk_Tags

##removing special characters from sentence##
def preprocess(raw_sentence):
    sentence = re.sub(r'[$|.|!|"|(|)|,|;|`|\']', r'', raw_sentence)
    return sentence

f = open(os.path.join(query_proc_dir,
                      read_property('word_features_train_coarse_path')), "r")
corpus = []
for lines in f:
    l = lines.split()
    words = ""
    for w in l:
        words = words + w + " "
    corpus.append(words)
vectorizer_words = CountVectorizer(min_df=1)
vectorizer_words.fit_transform(corpus)
f.close()

###POS tags in question###
f = open(os.path.join(query_proc_dir,
                      read_property('POS_features_train_coarse_path')), "r")
    for line in fineindexedfile:
        print line
        indexno = line.split()[0]
        print indexno
        # linesfrom=fullfeaturefie.readlines()
        lineread = linesfromfeaturefile[int(indexno) - 1]
        print lineread
        write_file.write(lineread)
    fineindexedfile.close()
    fullfeaturefile.close()
    write_file.close()
    #write_ner_file.close()

#################LOC class################
#f1="LOC_training.txt"
Full_train_features_POS_file = read_property('POS_features_train_coarse_path')
#Fine_write_file="LOC_training_POS.txt"
#extract(f1,Full_train_features_file,Fine_write_file)
Full_train_features_NER_file = read_property('NER_features_train_coarse_path')
#Fine_write_file="LOC_training_NER.txt"
#extract(f1,Full_train_features_file,Fine_write_file)
Full_train_features_WORD_file = read_property('word_features_train_coarse_path')
#Fine_write_file="LOC_training_word.txt"
#extract(f1,Full_train_features_file,Fine_write_file)
Full_train_features_CHUNK_file = read_property('Chunk_features_train_path')
#Fine_write_file="LOC_training_Chunk.txt"
#extract(f1,Full_train_features_file,Fine_write_file)
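# Sketch of how the commented-out calls above would be issued for the LOC class,
# assuming extract(fine_indexed_filename, full_feature_filename, fine_write_filename)
# is the helper whose loop body appears at the top of this snippet; the file names
# come straight from the commented lines:
f1 = "LOC_training.txt"
extract(f1, Full_train_features_POS_file, "LOC_training_POS.txt")
extract(f1, Full_train_features_NER_file, "LOC_training_NER.txt")
extract(f1, Full_train_features_WORD_file, "LOC_training_word.txt")
extract(f1, Full_train_features_CHUNK_file, "LOC_training_Chunk.txt")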
pickle_out = open("TrainedModels/ABBR_model.pickle", "wb") cPickle.dump(ABBR_model, pickle_out, protocol=cPickle.HIGHEST_PROTOCOL) ''' storing to secondory memory done ''' coarse_class = get_coarse_output_class() coarse_corpus = [] f = open('TestOutputfiles/word_features_test.txt', 'r') lines = f.readlines() for line in lines: line = line.rstrip('\n') if not (line == "\n"): coarse_corpus.append(line) f.close() filename_test = read_property('testfilepath') corpus_test, test_class_gold = file_preprocess_test(filename_test) def compute_word(line, v): corpus = [] words = "" l = line.split() for w in l: words = words + w + " " corpus.append(words) X_words = v.transform(corpus) return X_words def compute_POS(line, v):