def main2():
    """Score several parsers on a random sample of sentences from the m2 file.

    Reads ``conll14st-preprocessed.m2``, splits it into blank-line separated
    blocks, shuffles the blocks and keeps the first 150.  Each kept sentence
    is parsed with ``dts.tbankparser()`` to produce its target, and ``main``
    is then called with first argument 0, 1, 4 and 5 on the same
    ``(inputs, targets)`` pair.

    Returns nothing; all results are produced by the ``main`` calls.
    """
    filename = '../release3.2/data/conll14st-preprocessed.m2'
    # Single-argument print(...) gives the same output under Python 2 and 3.
    print("Load data from %s" % filename)

    # The context manager closes the file even if reading raises.
    with open(filename, 'r') as f:
        blocks = f.read().split('\n\n')

    # The chunk after the last blank line is discarded, as before.  Within a
    # block the first line is the sentence (its two leading characters are
    # cut off) and every following line is split on '|||' into a tuple.
    sentence_tuples = [
        (lines[0][2:], [tuple(entry.split('|||')) for entry in lines[1:]])
        for lines in (block.split('\n') for block in blocks[:-1])
    ]

    random.shuffle(sentence_tuples)
    sents = sentence_tuples[:150]

    tbank_s = dts.tbankparser()
    inputs = [pair[0] for pair in sents]
    targets = [tbank_s.parse(text) for text in inputs]
    train_test = (inputs, targets)

    print("main 0")
    main(0, None, train_test)
    print("main 1")
    main(1, None, train_test)
    print("main 4")
    main(4, None, train_test)

    # NOTE(review): reload(sys)/setdefaultencoding is a Python 2 only
    # workaround; it is kept so the last run behaves as it did before.
    reload(sys)
    sys.setdefaultencoding('utf8')
    print("main 5")
    main(5, None, train_test)
def main2():
    """Preprocess the m2 data and score parsers 0, 1, 4 and 5 on a sample.

    The file ``conll14st-preprocessed.m2`` is read and split on blank lines.
    Every block becomes ``(sentence, [tuple, ...])``: the sentence is the
    block's first line without its first two characters, and each remaining
    line is split on ``'|||'``.  The blocks are shuffled, 150 are selected
    for testing, parsed with ``dts.tbankparser()`` to get the targets, and
    passed to ``main`` once per parser type.
    """
    filename = '../release3.2/data/conll14st-preprocessed.m2'
    # Single-argument print(...) gives the same output under Python 2 and 3.
    print("Load data from %s" % filename)

    # Use a context manager so the handle is released even when the read
    # fails part-way (the old open()/close() pair leaked it in that case).
    with open(filename, 'r') as f:
        paragraphs = f.read().split('\n\n')

    sentence_tuples = []
    # The last chunk (whatever follows the final blank line) is skipped,
    # matching the previous behaviour.
    for paragraph in paragraphs[:-1]:
        lines = paragraph.split('\n')
        entries = [tuple(line.split('|||')) for line in lines[1:]]
        sentence_tuples.append((lines[0][2:], entries))

    random.shuffle(sentence_tuples)
    sents = sentence_tuples[:150]  # select 150 sentences for testing

    tbank_s = dts.tbankparser()
    targets = [tbank_s.parse(item[0]) for item in sents]
    inputs = [item[0] for item in sents]

    for parser_type in (0, 1, 4):
        main(parser_type, None, (inputs, targets))

    # NOTE(review): Python 2 only default-encoding workaround, applied only
    # before the last run exactly as in the original code.
    reload(sys)
    sys.setdefaultencoding('utf8')
    main(5, None, (inputs, targets))
def main(history=1, tiny='.tiny', tbank=None):
    """Build features, train the perceptron and report validation results.

    Parameters:
        history: forwarded to ``dp.process``, ``flaws`` and
            ``sp.train_perceptron``; must be at least 1.
        tiny: text placed between the history number and ``.npy`` in the
            name of the saved weights file.
        tbank: tree bank parser to use; when None a ``dts.tbankparser()``
            is created.

    Returns:
        The tuple ``(feature_dict, weights)``.

    Side effects: prints progress, reports results through ``out`` and saves
    the weights to ``weights<history><tiny>.npy`` in the working directory.
    """
    assert history >= 1, "use at least some history"
    t1 = time()
    TRAIN_FILE = '../release3.2/final_data/train-data.pre'
    VAL_FILE = '../release3.2/final_data/validate-data.pre'

    # Single-argument print(...) gives the same output under Python 2 and 3.
    print('loading tree bank')
    if tbank is None:
        tbank = dts.tbankparser()
    # Fix: take the reading after the tree bank is available.  It used to be
    # taken before loading, so the reported tree bank time was always ~0 and
    # the real cost was silently added to the sentence loading time.
    t2 = time() - t1

    print('loading sentences')
    dp._init_(tbank)
    all_sentences, feature_dict = dp.process(TRAIN_FILE, history)
    val_sentences, _val_feat = dp.process(VAL_FILE, history)
    t3 = time() - t1 - t2  # time spent building the sentences/features
    print("features has been made")

    print("init perceptron")
    sp._init_(len(feature_dict), dts, False)
    print("end init")

    # Baseline results with the freshly initialised perceptron.
    out(('SSE random weights, only Ne-tags',
         flaws(dts, val_sentences, feature_dict, tbank, history,
               with_tags=False)))
    print("SSE random weights, only Ne-tags")
    out(('SSE random weights',
         flaws(dts, val_sentences, feature_dict, tbank, history)))
    print("SSE random weight")

    t4 = time()
    print("learning")
    weights = sp.train_perceptron(all_sentences, feature_dict, tbank, history)
    np.save('weights' + str(history) + tiny + '.npy', weights)
    t4 = time() - t4  # training time, including saving the weights
    print(weights.shape)

    # The total is taken here, so it does not include the validation below.
    t1 = time() - t1
    print("validating")
    out(('after %d sentences, only Ne-tags' % (len(all_sentences)),
         flaws(dts, val_sentences, feature_dict, tbank, history, weights,
               False)))
    out(('after %d sentences' % (len(all_sentences)),
         flaws(dts, val_sentences, feature_dict, tbank, history, weights)))
    # Fix: the summary line was missing its closing parenthesis.
    out(('total %f sec (loading: %f, %f; training: %f)' % (t1, t2, t3, t4)))
    return feature_dict, weights
def main(history=1, tiny='.tiny', tbank=None):
    """Run the whole process: load data, train, save weights and validate.

    Parameters:
        history: forwarded to ``dp.process``, ``flaws`` and
            ``sp.train_perceptron``; must be at least 1.
        tiny: suffix used in the weights file name
            ``weights<history><tiny>.npy``.
        tbank: tree bank parser; a ``dts.tbankparser()`` is created when
            this is None.

    Returns:
        ``(feature_dict, weights)``.

    Progress is printed, results are reported through ``out`` and the
    trained weights are written with ``np.save``.
    """
    # Fix: the message used to start with a stray double quote.
    assert history >= 1, "use at least some history"
    started = time()
    TRAIN_FILE = '../release3.2/final_data/train-data.pre'
    VAL_FILE = '../release3.2/final_data/validate-data.pre'

    # Single-argument print(...) gives the same output under Python 2 and 3.
    print('loading tree bank')
    if tbank is None:
        tbank = dts.tbankparser()
    # Fix: measured after the tree bank exists; previously this was sampled
    # before loading, so it was always ~0 and the load time leaked into the
    # sentence-loading figure below.
    tbank_seconds = time() - started

    print('loading sentences')
    dp._init_(tbank)
    all_sentences, feature_dict = dp.process(TRAIN_FILE, history)
    val_sentences, _val_feat = dp.process(VAL_FILE, history)
    sentence_seconds = time() - started - tbank_seconds
    print("features has been made")

    print("init perceptron")
    sp._init_(len(feature_dict), dts, False)
    print("end init")

    # Results before any training has happened.
    untrained_ne = flaws(dts, val_sentences, feature_dict, tbank, history,
                         with_tags=False)
    out(('SSE random weights, only Ne-tags', untrained_ne))
    print("SSE random weights, only Ne-tags")
    untrained_all = flaws(dts, val_sentences, feature_dict, tbank, history)
    out(('SSE random weights', untrained_all))
    print("SSE random weight")

    train_started = time()
    print("learning")
    weights = sp.train_perceptron(all_sentences, feature_dict, tbank, history)
    np.save('weights' + str(history) + tiny + '.npy', weights)
    train_seconds = time() - train_started
    print(weights.shape)

    # Total run time up to this point; validation is not part of it.
    total_seconds = time() - started
    print("validating")
    sentence_count = len(all_sentences)
    trained_ne = flaws(dts, val_sentences, feature_dict, tbank, history,
                       weights, False)
    out(('after %d sentences, only Ne-tags' % (sentence_count), trained_ne))
    trained_all = flaws(dts, val_sentences, feature_dict, tbank, history,
                        weights)
    out(('after %d sentences' % (sentence_count), trained_all))
    # Fix: the closing parenthesis of the summary was missing.
    out(('total %f sec (loading: %f, %f; training: %f)'
         % (total_seconds, tbank_seconds, sentence_seconds, train_seconds)))
    return feature_dict, weights
sentence.insert(0,parw) sentence.insert(0,parw) sentence.insert(0,parw) return sentence else: parw = parent.orth_ current_word = parent sentence.insert(0,parw) return recursive_tree_climb(current_word, sentence) if __name__ == '__main__': print 'start' TRAIN_FILE = 'test_data/test_linear.txt' #'../release3.2/data/test.txt' all_sentences, feature_dict = dp.process(TRAIN_FILE,1) tbank = dts.tbankparser() text_file = open("preprocessed-4gram-sentences2.txt", "w") print "start looping through sentece" for sentence in all_sentences: try: seen_mistakes = [] parsed_sentence = tbank.parse(sentence.raw_sentence) context_tags = [word_tag[1] for word_tag in sentence.words_tags] for i in range(0,len(sentence.raw_sentence.split(' '))): if context_tags[i] != "Ne": cur = parsed_sentence[i] sentence_array = [] sentence_array.insert(0,cur.orth_) result = recursive_tree_climb(cur, sentence_array) four_gram = result[len(result)-4:]
def main(xin=0, tbank=None, train_test=None):
    """Load a given treebank, score its accuracy and time the run.

    Parameters:
        xin: type of treebank to build when ``tbank`` is None:
            0 -> ``dts.tbankparser()`` ("spacy");
            1 -> ``dto.tbankparser()`` with ``getParser(X)``;
            2..5 -> ``dto.tbankparser()`` truncated to X trees, with
            ``add_noise(Y, ...)`` applied before ``getParser()``.
        tbank: ready-made parser; when given, ``xin`` is ignored and the
            run is named "user".
        train_test: optional ``(inputs, targets)`` pair.  When None,
            sentences X:Z of ``nltk.corpus.dependency_treebank`` are used.

    Returns:
        The value returned by ``score``; it is also written with ``np.save``
        to ``<name><timestamp>data.npy``.

    Raises:
        ValueError: if ``tbank`` is None and ``xin`` is not one of 0..5.
    """
    # name and the two flags handed to add_noise for the noisy variants.
    # NOTE(review): judging by the names the first flag keeps the original
    # trees and the second selects "flaws" noise -- confirm in dto.
    noise_variants = {
        2: ("nltk random noise", (True, False)),
        3: ("ntlk flaws noise", (True, True)),
        4: ("nltk only random noise", (False, False)),
        5: ("ntlk only flaws noise", (False, True)),
    }

    # x is the type of treebank
    if tbank is None:
        x = xin
        # Fail fast: an unknown type used to fall through every branch and
        # crash later with an unbound name / missing parser.
        if x not in (0, 1) and x not in noise_variants:
            raise ValueError(
                "xin must be one of 0..5 when no tbank is given, got %r"
                % (xin,))
    else:
        x = -1
        name = "user"

    # X is the amount of train-trees for nltk-based tbank
    # Y is the amount of added flaws to the nltk-based tbank
    # slice X:Z are the sentences tested on
    X, Y, Z = 3750, 15000, 3900

    out('making targets')
    tt = time()
    if train_test is None:
        data = nltk.corpus.dependency_treebank
        testing_targets = [t.tree() for t in data.parsed_sents()[X:Z]]
        testing_inputs = data.sents()[X:Z]
    else:
        testing_targets = train_test[1]
        testing_inputs = train_test[0]
    tt = time() - tt
    out('in', tt, 'sec')

    out("loading tbank")
    tl = time()
    if x == 0:
        name = "spacy"
        tbank = dts.tbankparser()
    elif x == 1:
        name = "ntlk no noise"
        tbank = dto.tbankparser()
        tbank.getParser(X)
    elif x in noise_variants:
        name, (first_flag, second_flag) = noise_variants[x]
        tbank = dto.tbankparser()
        tbank.truncate(X)
        tbank.add_noise(Y, first_flag, second_flag)
        tbank.getParser()
    tl = time() - tl

    out("scoring...")
    ts = time()
    s = score(tbank, testing_inputs, testing_targets)
    ts = time() - ts
    out("%s loaded in %f sec. Scored %f on %d targets in %f sec."
        % (name, tl, s.sum(), len(testing_targets), ts))
    np.save(name + str(time()) + 'data.npy', s)
    return s
def main(xin=0, tbank=None, train_test=None):
    """Load a given treebank, score its accuracy and time the run.

    Parameters:
        xin: selects the parser built when ``tbank`` is None.  0 builds
            ``dts.tbankparser()``; 1 builds ``dto.tbankparser()`` and calls
            ``getParser(X)``; 2, 3, 4 and 5 build ``dto.tbankparser()``,
            truncate it to X trees, call ``add_noise`` with Y and a pair of
            flags, then ``getParser()``.
        tbank: parser supplied by the caller; ``xin`` is then ignored and
            the run is reported under the name "user".
        train_test: optional ``(inputs, targets)``; when None the slice X:Z
            of ``nltk.corpus.dependency_treebank`` is used.

    Returns:
        Whatever ``score`` returns (also saved via ``np.save``).

    Raises:
        ValueError: when no ``tbank`` is given and ``xin`` is not 0..5.
    """
    names = {
        0: "spacy",
        1: "ntlk no noise",
        2: "nltk random noise",
        3: "ntlk flaws noise",
        4: "nltk only random noise",
        5: "ntlk only flaws noise",
    }
    # The two trailing add_noise arguments per noisy type.
    # NOTE(review): their exact meaning lives in dto -- verify there.
    noise_flags = {
        2: (True, False),
        3: (True, True),
        4: (False, False),
        5: (False, True),
    }

    # x is the type of treebank
    if tbank is not None:
        x = -1
        name = "user"
    elif xin in names:
        x = xin
        name = names[x]
    else:
        # Previously an unknown type skipped every branch and only failed
        # later on, with an error unrelated to the real cause.
        raise ValueError("unknown treebank type %r; expected 0..5" % (xin,))

    # X is the amount of train-trees for nltk-based tbank
    # Y is the amount of added flaws to the nltk-based tbank
    # slice X:Z are the sentences tested on
    X, Y, Z = 3750, 15000, 3900

    out('making targets')
    tt = time()
    if train_test is not None:
        testing_targets = train_test[1]
        testing_inputs = train_test[0]
    else:
        data = nltk.corpus.dependency_treebank
        testing_targets = [t.tree() for t in data.parsed_sents()[X:Z]]
        testing_inputs = data.sents()[X:Z]
    tt = time() - tt
    out('in', tt, 'sec')

    out("loading tbank")
    tl = time()
    if x == 0:
        tbank = dts.tbankparser()
    elif x == 1:
        tbank = dto.tbankparser()
        tbank.getParser(X)
    elif x in noise_flags:
        tbank = dto.tbankparser()
        tbank.truncate(X)
        tbank.add_noise(Y, *noise_flags[x])
        tbank.getParser()
    tl = time() - tl

    out("scoring...")
    ts = time()
    s = score(tbank, testing_inputs, testing_targets)
    ts = time() - ts
    summary = "%s loaded in %f sec. Scored %f on %d targets in %f sec." % (
        name, tl, s.sum(), len(testing_targets), ts)
    out(summary)
    np.save(name + str(time()) + 'data.npy', s)
    return s
sentence.insert(0, parw) sentence.insert(0, parw) return sentence else: parw = parent.orth_ current_word = parent sentence.insert(0, parw) return recursive_tree_climb(current_word, sentence) if __name__ == '__main__': print 'start' TRAIN_FILE = 'test_data/test_linear.txt' #'../release3.2/data/test.txt' all_sentences, feature_dict = dp.process(TRAIN_FILE, 1) tbank = dts.tbankparser() text_file = open("preprocessed-4gram-sentences2.txt", "w") print "start looping through sentece" for sentence in all_sentences: try: seen_mistakes = [] parsed_sentence = tbank.parse(sentence.raw_sentence) context_tags = [word_tag[1] for word_tag in sentence.words_tags] for i in range(0, len(sentence.raw_sentence.split(' '))): if context_tags[i] != "Ne": cur = parsed_sentence[i] sentence_array = [] sentence_array.insert(0, cur.orth_) result = recursive_tree_climb(cur, sentence_array) four_gram = result[len(result) - 4:]
if __name__ == '__main__':
    # Script entry point: build the n-gram dictionaries from the preparation
    # file, then run the corrector that matches parse_type on a test file.
    use_spacy = False
    # "linear" runs correct(); the other supported value, "dep", runs
    # correct_dep().
    parse_type = "linear"
    # An alternative preparation file is "preprocessed-BrownCorpus.txt".
    filename_prep = "test.txt"
    quatrogram_dict, trigram_dict, bigram_dict, unigram_dict = prepare(
        parse_type, filename_prep)

    # The spacy-backed parser is only created when use_spacy is switched on.
    if use_spacy:
        print("Starting with the spacy stuff..")
        from spacy.en import LOCAL_DATA_DIR, English
        tbank = dt.tbankparser()

    print("Finding corrections...")
    if parse_type == "dep":
        # An alternative input is "preprocessed-4gram-sentences.txt".
        filename = "test_correction.txt"
        _corrector = correct_dep
    else:
        filename = 'test_linear.txt'
        _corrector = correct

    # Both correctors take the four dictionaries and the file name; the
    # spacy pipeline is passed as a sixth argument only when it was loaded.
    _call_args = [quatrogram_dict, trigram_dict, bigram_dict, unigram_dict,
                  filename]
    if use_spacy:
        _call_args.append(tbank.nlp)
    _corrector(*_call_args)
return quatrogram_dict, trigram_dict, bigram_dict, unigram_dict if __name__ == '__main__': use_spacy = False parse_type = "linear" filename_prep = "test_data/test.txt" quatrogram_dict, trigram_dict, bigram_dict, unigram_dict = prepare( parse_type, filename_prep) # comment these three lines out if you don't want spacy, uncomment if you do if use_spacy: print "Starting with the spacy stuff.." from spacy.en import LOCAL_DATA_DIR, English tbank = dt.tbankparser() print "Finding corrections..." if parse_type == "dep": filename = "test_data/test_correction.txt" if use_spacy: correct_dep(quatrogram_dict, trigram_dict, bigram_dict, unigram_dict, filename, tbank.nlp) else: correct_dep(quatrogram_dict, trigram_dict, bigram_dict, unigram_dict, filename) else: filename = 'test_data/test_linear.txt' if use_spacy: correct(quatrogram_dict, trigram_dict, bigram_dict, unigram_dict, filename, tbank.nlp)