def compute_NER(corpus):
    NER = []
    #fi=open("NER_features_train.txt","w")
    st = NERTagger(read_property('StanfordNerClassifier'),
                   read_property('StanfordNerJarPath'))
    for sentence in corpus:
        ner = st.tag(sentence.split())
        ner_tag = ""
        for n in ner:
            ner_tag = ner_tag + n[1] + " "
        NER.append(ner_tag)
    return NER
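
# A minimal usage sketch (hedged: NERTagger is deprecated in newer NLTK
# releases in favour of StanfordNERTagger, and the classifier/jar properties
# must point at a local Stanford NER install):
#
#   tags = compute_NER(["Where is the Eiffel Tower located"])
#   # -> e.g. ["O O O LOCATION LOCATION O "] -- one space-joined tag string
#   #    per input sentence, ready for CountVectorizer
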
def fine_train_label(label):
    file_label = read_property('FineInputFiles') + label + "_training.txt"
    train_corpus, temp, train_class = file_preprocess(file_label)

    file_word = read_property(
        'FineOutputfilesPath') + label + "_training_word.txt"
    file_POS = read_property(
        'FineOutputfilesPath') + label + "_training_POS.txt"
    file_NER = read_property(
        'FineOutputfilesPath') + label + "_training_NER.txt"
    file_Chunk = read_property(
        'FineOutputfilesPath') + label + "_training_Chunk.txt"

    vectorizer_words = CountVectorizer(min_df=1, ngram_range=(1, 2))
    vectorizer_POS = CountVectorizer(min_df=1, ngram_range=(1, 2))
    vectorizer_Chunk = CountVectorizer(min_df=1, ngram_range=(1, 2))
    vectorizer_NER = CountVectorizer(min_df=1, ngram_range=(1, 2))

    X_words = vectorizer_words.fit_transform(append_noread(file_word))
    X_POS = vectorizer_POS.fit_transform(append_noread(file_POS))
    X_NER = vectorizer_NER.fit_transform(append_noread(file_NER))
    X_Chunk = vectorizer_Chunk.fit_transform(append_noread(file_Chunk))

    # stack the four sparse feature views side by side
    X_train = hstack((X_words, X_POS, X_NER, X_Chunk))
    '''saving the vectorizers to secondary memory'''
    pickle_out = open("TrainedModels/" + label + "_vectorizer_words.pickle",
                      "wb")
    cPickle.dump(vectorizer_words,
                 pickle_out,
                 protocol=cPickle.HIGHEST_PROTOCOL)
    pickle_out.close()
    pickle_out = open("TrainedModels/" + label + "_vectorizer_POS.pickle",
                      "wb")
    cPickle.dump(vectorizer_POS, pickle_out, protocol=cPickle.HIGHEST_PROTOCOL)
    pickle_out.close()
    pickle_out = open("TrainedModels/" + label + "_vectorizer_NER.pickle",
                      "wb")
    cPickle.dump(vectorizer_NER, pickle_out, protocol=cPickle.HIGHEST_PROTOCOL)
    pickle_out.close()
    pickle_out = open("TrainedModels/" + label + "_vectorizer_Chunk.pickle",
                      "wb")
    cPickle.dump(vectorizer_Chunk,
                 pickle_out,
                 protocol=cPickle.HIGHEST_PROTOCOL)
    pickle_out.close()
    ''' storing done '''
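
    # At prediction time the pickled vectorizers can be restored; a hedged
    # sketch mirroring the file names used above:
    #
    #   pickle_in = open("TrainedModels/" + label + "_vectorizer_words.pickle",
    #                    "rb")
    #   vectorizer_words = cPickle.load(pickle_in)
    #   pickle_in.close()
    #   # vectorizer_words.transform(...) then reproduces the word features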

    print "Applying SVC"
    label_model = LinearSVC(loss='squared_hinge', dual=False, tol=1e-3)
    label_model = LinearSVC.fit(label_model, X_train, train_class)
    print(label, " training done")

    return label_model
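
# fine_train_label is presumably driven once per coarse class; an illustrative
# loop (the six labels match the coarse classes used elsewhere in this file,
# but this exact driver is a sketch, not the author's code):
#
#   fine_models = {}
#   for label in ["LOC", "HUM", "NUM", "ABBR", "ENTY", "DESC"]:
#       fine_models[label] = fine_train_label(label)
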
def append(filename):
    f = open(read_property(filename), "r")
    corpus = []
    for lines in f:
        l = lines.split()
        words = ""
        for w in l:
            words = words + w + " "
        corpus.append(words)
    f.close()
    return corpus
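
# append() just re-joins each line's tokens with single spaces (keeping a
# trailing space); an equivalent one-liner, shown only for comparison:
#
#   corpus = [" ".join(line.split()) + " " for line in open(read_property(filename))]
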
def compute_NER(corpus):
    #NER=[]
    fi = open(read_property('NER_features_train_coarse_path'), "w")
    st = NERTagger(read_property('StanfordNerClassifier'),
                   read_property('StanfordNerJarPath'))
    for sentence in corpus:
        ner = st.tag(sentence.split())
        #print ner
        #pos_seq=nltk.pos_tag(text)
        #print pos_seq
        ner_tag = ""
        for n in ner:
            #print n[1]
            ner_tag = ner_tag + n[1] + " "
        #print pos_tags
        fi.write(ner_tag + "\n")
        #NER.append(ner_tag)
    #print "The bag of words of NER is ",NER
    fi.close()
Example #7
def compute_NER(corpus):
    NER = []
    fi = open(read_property('NER_features_test_coarse_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        ners = annotator.getAnnotations(sentence)['ner']
        ner = ""
        for elem in ners:
            ner = ner + elem[1] + " "
        print ner
        fi.write(ner + "\n")
        NER.append(ner)
    return NER
Example #8
def compute_Chunks(corpus):
    #Chunk_Tags=[]
    fi = open(read_property('Chunk_features_test_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        chunks = annotator.getAnnotations(sentence)['chunk']
        chunk = ""
        for elem in chunks:
            chunk = chunk + elem[1] + " "
        print chunk
        fi.write(chunk + "\n")
        #Chunk_Tags.append(chunk)
    #print "The bag of words for Chunks is ",Chunk_Tags
    fi.close()
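
# For reference, practnlptools' getAnnotations(sentence)['chunk'] returns
# (word, IOB-chunk-tag) pairs; a hedged example (exact tags depend on the
# SENNA models behind the Annotator):
#
#   Annotator().getAnnotations("What is the capital of France")['chunk']
#   # -> [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'),
#   #     ('capital', 'I-NP'), ('of', 'B-PP'), ('France', 'B-NP')]
#   # written line: "B-NP B-VP B-NP I-NP B-PP B-NP "
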
def compute_POS_Tags(corpus):
    #POS=[]
    fi = open(read_property('POS_features_train_coarse_path'), "w")
    for sentence in corpus:
        text = nltk.word_tokenize(sentence)
        pos_seq = nltk.pos_tag(text)
        #print pos_seq
        pos_tags = ""
        for pos in pos_seq:
            pos_tags = pos_tags + pos[1] + " "
        fi.write(pos_tags + "\n")
        #print pos_tags
        #POS.append(pos_tags)
    #print "The bag of words of POS is ",POS
    fi.close()
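
# nltk.pos_tag emits Penn Treebank tags; roughly what one written line is
# built from (illustrative output):
#
#   nltk.pos_tag(nltk.word_tokenize("Where is the Eiffel Tower"))
#   # -> [('Where', 'WRB'), ('is', 'VBZ'), ('the', 'DT'),
#   #     ('Eiffel', 'NNP'), ('Tower', 'NNP')]
#   # written line: "WRB VBZ DT NNP NNP "
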
Example #12
def compute_POS_Tags(corpus):
    #POS=[]
    fi = open(read_property('POS_features_test_coarse_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        pos_seq = annotator.getAnnotations(sentence)['pos']
        #print pos_seq
        pos_tags = ""
        for pos in pos_seq:
            pos_tags = pos_tags + pos[1] + " "
        fi.write(pos_tags + "\n")
        print pos_tags
        #POS.append(pos_tags)
    #print "The bag of words of POS is ",POS
    fi.close()
Example #13
def file_preprocess(filename):
    corpus = []
    classes = []
    f = open(filename, 'r')
    fi = open(read_property('word_features_test_coarse_path'), "w")
    lines = f.readlines()
    for line in lines:
        line = line.rstrip('\n')
        line = preprocess(line)
        print "The line is  ", line  ###################
        sentence = ""
        words = line.split()
        for i in range(0, len(words)):
            if not (i == 0):
                sentence = sentence + (words[i]) + " "
        fi.write(sentence + "\n")
        corpus.append(sentence)
    f.close()
    fi.close()
    return corpus, classes
def file_preprocess(filename):
	corpus=[]
	classes=[]
	f=open(filename,'r')
	fi=open(read_property('word_features_train_coarse_path'),"w")
	lines=f.readlines()
	for line in lines:
		line=line.rstrip('\n')
		line=preprocess(line)
		#print "The line is  ",line
		sentence=""
		words=line.split()
		for i in range(0,len(words)):
			if not(i==0):
				sentence=sentence+(words[i])+" "
		fi.write(sentence+"\n")
		corpus.append(sentence)
	f.close()
	fi.close()
	return corpus,classes
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 30 23:58:14 2015

@author: nausheenfatma
"""

from readproperties import read_property

f=open(read_property('trainingfilepath'),"r")
fi_1=open(read_property('FineInputFiles')+'LOC_training.txt',"w")
fi_2=open(read_property('FineInputFiles')+'HUM_training.txt',"w")
fi_3=open(read_property('FineInputFiles')+'NUM_training.txt',"w")
fi_4=open(read_property('FineInputFiles')+'ABBR_training.txt',"w")
fi_5=open(read_property('FineInputFiles')+'ENTY_training.txt',"w")
fi_6=open(read_property('FineInputFiles')+'DESC_training.txt',"w")
classes=[]
lines=f.readlines()
f.close()
i=0
for line in lines:
    i=i+1
    line=line.rstrip('\n')
    if not (line=="\n"):
        classes.append((line.split()[0]).split(":")[0])
        label=(line.split()[0]).split(":")[0]
        if label=="LOC":
            fi_1.write(str(i)+" ")
            fi_1.write(line+"\n")
            print line
        if label=="HUM":
Example #16
        print ner
        fi.write(ner + "\n")
        NER.append(ner)
    return NER


##Compute Chunks##
def compute_Chunks(corpus):
    #Chunk_Tags=[]
    fi = open(read_property('Chunk_features_test_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        chunks = annotator.getAnnotations(sentence)['chunk']
        chunk = ""
        for elem in chunks:
            chunk = chunk + elem[1] + " "
        print chunk
        fi.write(chunk + "\n")
        #Chunk_Tags.append(chunk)
    #print "The bag of words for Chunks is ",Chunk_Tags
    fi.close()
    #return Chunk_Tags


filename_test = read_property('testfilepath')
corpus, test_class = file_preprocess(filename_test)

compute_POS_Tags(corpus)
compute_NER(corpus)
compute_Chunks(corpus)
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
#from nltk.tag.stanford import NERTagger
from nltk.tag.stanford import StanfordNERTagger
from scipy.sparse import hstack
from sklearn.svm import LinearSVC
from practnlptools.tools import Annotator
from readproperties import read_property
from sklearn.externals import joblib

annotator = Annotator()
st = StanfordNERTagger(read_property('StanfordNerClassifier'),
                       read_property('StanfordNerJarPath'))
f = open(read_property('coarse_classification_path'), "r")
fi = open(
    read_property('FineOutputfilesPath') + 'fine_classification.txt', "w")
t_class = []
handled_class = []

for line in f:
    label = line.split()[0]
    #if label in handled_class :
    #	continue
    #handled_class.append(label)

    file_w = read_property('FineInputFiles') + label + "_training.txt"
    fa = open(file_w, "r")
    train_class = []
    for each_line in fa:
            chunk=""
            for elem in chunks:
                  chunk=chunk+elem[1]+" "
            Chunk_Tags.append(chunk)
      return Chunk_Tags





######################################TRAINING############################################

#######Train class labels#####

train_class = []
f = open(read_property('trainingfilepath'), 'r')
lines = f.readlines()
for line in lines:
    line = line.rstrip('\n')
    if not (line == "\n"):
        train_class.append((line.split()[0]).split(":")[0])


###words in question###

print "Training"
f = open(read_property('word_features_train_coarse_path'), "r")
corpus = []
for lines in f:
    l = lines.split()
    words = ""
#	self = LinearSVC(loss='l2', dual=False, tol=1e-3)
#	self = LinearSVC.fit(self, X_train, train_class)
#	test_class = LinearSVC.predict(self, X_test)
#	print test_class
#	fi.write(label + ":")
#	fi.write(test_class[0]+" ")
#	fi.write(line.split(":")[1])
#	t_class.append(label+":"+test_class[0])
#
#fi.close()
#f.close()


###################################### Accuracy Calculation ################################################
test_class_gold = []
f = open(read_property('testfilepath'), 'r')
for lines in f:
    test_class_gold.append(lines.split()[0])
print t_class
print test_class_gold
print len(t_class)
print len(test_class_gold)
hits = 0.00
for i in range(0, len(t_class)):
    if t_class[i] == test_class_gold[i]:
        print t_class[i]
        hits = hits + 1
print "Number of hits = ", hits
print "The accuracy is ", ((hits / len(t_class)) * 100.0), " %"

        print line
        indexno = line.split()[0]
        print indexno
        #  linesfrom=fullfeaturefie.readlines()
        lineread = linesfromfeaturefile[int(indexno) - 1]
        print lineread
        write_file.write(lineread)
    fineindexedfile.close()
    fullfeaturefile.close()
    write_file.close()
#write_ner_file.close()


#################LOC class################
#f1="LOC_training.txt"
Full_train_features_POS_file = read_property('POS_features_train_coarse_path')
#Fine_write_file="LOC_training_POS.txt"
#extract(f1,Full_train_features_file,Fine_write_file)

Full_train_features_NER_file = read_property('NER_features_train_coarse_path')
#Fine_write_file="LOC_training_NER.txt"
#extract(f1,Full_train_features_file,Fine_write_file)

Full_train_features_WORD_file = read_property(
    'word_features_train_coarse_path')
#Fine_write_file="LOC_training_word.txt"
#extract(f1,Full_train_features_file,Fine_write_file)

Full_train_features_CHUNK_file = read_property('Chunk_features_train_path')
#Fine_write_file="LOC_training_Chunk.txt"
#extract(f1,Full_train_features_file,Fine_write_file)
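
# Following the commented pattern above, the intended per-class calls would
# presumably look like this (file names are hypothetical, mirroring the
# commented-out examples):
#
#   extract("LOC_training.txt", Full_train_features_POS_file, "LOC_training_POS.txt")
#   extract("LOC_training.txt", Full_train_features_NER_file, "LOC_training_NER.txt")
#   extract("LOC_training.txt", Full_train_features_WORD_file, "LOC_training_word.txt")
#   extract("LOC_training.txt", Full_train_features_CHUNK_file, "LOC_training_Chunk.txt")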
        #NER.append(ner_tag)
    #print "The bag of words of NER is ",NER
    fi.close()
    #return NER


##Compute Chunks##
def compute_Chunks(corpus):
    #Chunk_Tags=[]
    fi = open(read_property('Chunk_features_train_path'), "w")
    annotator = Annotator()
    for sentence in corpus:
        chunks = annotator.getAnnotations(sentence)['chunk']
        chunk = ""
        for elem in chunks:
            chunk = chunk + elem[1] + " "
        #print chunk
        fi.write(chunk + "\n")
        #Chunk_Tags.append(chunk)
    #print "The bag of words for Chunks is ",Chunk_Tags
    fi.close()
    #return Chunk_Tags


filename_train = read_property('trainingfilepath')
corpus, train_class = file_preprocess(filename_train)

compute_POS_Tags(corpus)
compute_NER(corpus)
compute_Chunks(corpus)
def append(filename):
    f = open(read_property(filename), "r")
    corpus = []
    for lines in f:
        l = lines.split()
        words = ""
        for w in l:
            words = words + w + " "
        corpus.append(words)
    f.close()
    return corpus


######################################TRAINING############################################

#######Train class labels#####
train_class = []
f = open(read_property('trainingfilepath'), 'r')
lines = f.readlines()
for line in lines:
    line = line.rstrip('\n')
    if not (line == "\n"):
        train_class.append((line.split()[0]).split(":")[0])
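
# Each TREC-style training line starts with a "COARSE:fine" label, which the
# expression above reduces to its coarse part; a tiny illustration (the sample
# line is made up):
#
#   "LOC:city What is the capital of France ?".split()[0].split(":")[0]
#   # -> "LOC"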

print("Training")

vectorizer_words = CountVectorizer(min_df=1, ngram_range=(1, 2))
X_words = vectorizer_words.fit_transform(
    append('word_features_train_coarse_path'))
f.close()
print("word feature extraction done")

vectorizer_POS = CountVectorizer(min_df=1, ngram_range=(1, 2))
    annotator = Annotator()
    for sentence in corpus:
        chunks = annotator.getAnnotations(sentence)['chunk']
        chunk = ""
        for elem in chunks:
            chunk = chunk + elem[1] + " "
        Chunk_Tags.append(chunk)
    return Chunk_Tags


######################################TRAINING############################################

#######Train class labels#####

train_class = []
f = open(read_property('trainingfilepath'), 'r')
lines = f.readlines()
for line in lines:
    line = line.rstrip('\n')
    if not (line == "\n"):
        train_class.append((line.split()[0]).split(":")[0])

###words in question###

print "Training"
f = open(read_property('word_features_train_coarse_path'), "r")
corpus = []
for lines in f:
    l = lines.split()
    words = ""
    for w in l:
Example #24
# -*- coding: utf-8 -*-

from readproperties import read_property

f=open(read_property('trainingfilepath'),"r")
fi_1=open(read_property('FineInputFiles')+'LOC_training.txt',"w")
fi_2=open(read_property('FineInputFiles')+'HUM_training.txt',"w")
fi_3=open(read_property('FineInputFiles')+'NUM_training.txt',"w")
fi_4=open(read_property('FineInputFiles')+'ABBR_training.txt',"w")
fi_5=open(read_property('FineInputFiles')+'ENTY_training.txt',"w")
fi_6=open(read_property('FineInputFiles')+'DESC_training.txt',"w")
classes=[]
lines=f.readlines()
f.close()
i=0
for line in lines:
    i=i+1
    line=line.rstrip('\n')
    if not (line=="\n"):
        classes.append((line.split()[0]).split(":")[0])
        label=(line.split()[0]).split(":")[0]
        if label=="LOC":
            fi_1.write(str(i)+" ")
            fi_1.write(line+"\n")
            print (line)
        if label=="HUM":
            fi_2.write(str(i)+" ")
            fi_2.write(line+"\n")
            print (line)
        if label=="NUM":
            fi_3.write(str(i)+" ")

    annotator = Annotator()
    for sentence in corpus:
        chunks = annotator.getAnnotations(sentence)['chunk']
        chunk = ""
        for elem in chunks:
            chunk = chunk + elem[1] + " "
        # print chunk  To see what these chunks are
        Chunk_Tags.append(chunk)
    return Chunk_Tags


######################################TRAINING############################
#######Train class labels#####

train_class = []
f = open(read_property('trainingfilepath'), 'r')
lines = f.readlines()
for line in lines:
    line = line.rstrip('\n')
    if not (line == "\n"):
        train_class.append((line.split()[0]).split(":")[0])

###words in question###
f = open(read_property('word_features_train_coarse_path'), "r")
corpus = []
for lines in f:
    l = lines.split()
    words = ""
    for w in l:
        words = words + w + " "
    corpus.append(words)
Example #27
        for elem in chunks:
            chunk = chunk + elem[1] + " "
        # print chunk  To see what these chunks are
        Chunk_Tags.append(chunk)
    return Chunk_Tags


##removing special characters from sentence##
def preprocess(raw_sentence):
    # '|' is literal inside a character class, so the original pattern also
    # stripped '|'; listing the characters directly matches the intent
    sentence = re.sub(r'[$.!"(),;`\']', r'', raw_sentence)
    return sentence
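
# A quick illustration of the cleaning (output under the pattern above):
#
#   print preprocess('What is "Hamlet", Shakespeare\'s play?')
#   # -> What is Hamlet Shakespeares play?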


f = open(
    os.path.join(query_proc_dir,
                 read_property('word_features_train_coarse_path')), "r")
corpus = []
for lines in f:
    l = lines.split()
    words = ""
    for w in l:
        words = words + w + " "
    corpus.append(words)
vectorizer_words = CountVectorizer(min_df=1)
vectorizer_words.fit_transform(corpus)
f.close()

###POS tags in question###
f = open(
    os.path.join(query_proc_dir,
                 read_property('POS_features_train_coarse_path')), "r")
    for line in fineindexedfile:
        print line
        indexno = line.split()[0]
        print indexno
        #  linesfrom=fullfeaturefie.readlines()
        lineread = linesfromfeaturefile[int(indexno) - 1]
        print lineread
        write_file.write(lineread)
    fineindexedfile.close()
    fullfeaturefile.close()
    write_file.close()
    #write_ner_file.close()

pickle_out = open("TrainedModels/ABBR_model.pickle", "wb")
cPickle.dump(ABBR_model, pickle_out, protocol=cPickle.HIGHEST_PROTOCOL)
''' storing to secondary memory done '''

coarse_class = get_coarse_output_class()
coarse_corpus = []
f = open('TestOutputfiles/word_features_test.txt', 'r')
lines = f.readlines()
for line in lines:
    line = line.rstrip('\n')
    if not (line == "\n"):
        coarse_corpus.append(line)
f.close()

filename_test = read_property('testfilepath')
corpus_test, test_class_gold = file_preprocess_test(filename_test)


def compute_word(line, v):
    corpus = []
    words = ""
    l = line.split()
    for w in l:
        words = words + w + " "
    corpus.append(words)
    X_words = v.transform(corpus)
    return X_words
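
# A hedged usage sketch for compute_word (vectorizer_words standing in for a
# vectorizer unpickled from TrainedModels/, as above):
#
#   X = compute_word("What is the capital of France", vectorizer_words)
#   # X is a 1 x |vocabulary| sparse count matrix for this single question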


def compute_POS(line, v):