def word_freq_test():
    """Sweep the word-frequency vocabulary size and record classifier scores.

    Grows the vocabulary from 0 to 20000 entries in steps of 500; at each
    step it regenerates the training data restricted to the `size` most
    frequent words, re-runs `main()`, and collects its score.

    Returns:
        list: the score returned by `main()` for each vocabulary size.

    Side effects: deletes and regenerates the files at `c.word_freq_path`
    and `c.training_data`, and prints the accumulated scores each step.
    """
    penta_freq = gen_vector.gen_word_pentagram_freq(1000, './data/corpus/runeberg/')
    tri_freq = gen_vector.gen_trigram_freq(15000)
    values = []
    size = 0
    max_size = 20000
    if os.path.exists(c.word_freq_path):
        os.remove(c.word_freq_path)
    word_freq = error_correction.calc_freq(0, max_size)
    # Rank words once per loop iteration, most frequent first; the original
    # manual count/break loop is just the first `size` entries of this order.
    while size <= max_size:
        if os.path.exists(c.training_data):
            os.remove(c.training_data)
        ranked = sorted(word_freq.items(), key=lambda item: item[1], reverse=True)
        sorted_output = dict(ranked[:size])
        gen_vector.get_training_data(c.training_data, c.main_db, 13000,
                                     tri_freq, penta_freq, sorted_output)
        values.append(main())
        print(values)
        size += 500
    return values
def filter_test():
    """Run a single end-to-end evaluation with fixed vocabulary sizes.

    Builds pentagram (1000), trigram (10000) and word (10000) frequency
    tables, regenerates the training data, and runs `main()` once.

    Returns:
        list: a one-element list with the score from `main()`.  The
        original implementation built this list and then discarded it;
        returning it makes the result usable to callers (backward
        compatible — previous callers ignored the implicit None).

    Side effects: deletes and regenerates the file at `c.training_data`.
    """
    values = []
    penta_freq = gen_vector.gen_word_pentagram_freq(1000, './data/corpus/runeberg/')
    tri_freq = gen_vector.gen_trigram_freq(10000)
    word_freq = error_correction.calc_freq(0, 10000)
    if os.path.exists(c.training_data):
        os.remove(c.training_data)
    gen_vector.get_training_data(c.training_data, c.main_db, 13000,
                                 tri_freq, penta_freq, word_freq)
    values.append(main())
    return values
def process_file(plain_text, output_file, db_size, training_size, svm_kernal,
                 c_value, gamma, word_freq_size, tri_freq, penta_freq, word_freq):
    """Classify each word of an input text and write a corrected version.

    Trains an SVM word classifier, predicts a label for every word in
    `plain_text`, error-corrects the words labelled 0, and writes the
    resulting words space-separated to `output_file`.

    Args:
        plain_text: path to the input text to process.
        output_file: path the corrected text is written to.
        db_size: number of database rows used to build training data.
        training_size: number of samples used to train the SVM.
        svm_kernal: kernel name passed to the classifier.
        c_value, gamma: SVM hyperparameters.
        word_freq_size: unused here; kept for interface compatibility.
        tri_freq, penta_freq, word_freq: frequency tables for vectorization
            and error correction.
    """
    gen_vector.get_training_data(c.training_data, c.main_db, db_size,
                                 tri_freq, penta_freq, word_freq)
    gen_vector.get_input(plain_text, c.input, tri_freq, penta_freq, word_freq)
    svclassifier = word_classifier.train(c.svm_model, c.training_data,
                                         training_size, svm_kernal, c_value, gamma)
    classified_words = word_classifier.predict(c.input, svclassifier)
    output = []
    for entry in classified_words:
        token, label = entry[0], entry[1]
        if label == 0:
            corrected = error_correction.updated_correct_word(token, word_freq)
        else:
            corrected = token
        # updated_correct_word may return a list of replacement words
        # (e.g. when one token is split into several); flatten it.
        # Fixes: isinstance(x, (list,)) -> isinstance(x, list), and the
        # inner loop no longer shadows the outer loop variable.
        if isinstance(corrected, list):
            output.extend(corrected)
        else:
            output.append(corrected)
    with open(output_file, 'w') as f:
        for item in output:
            f.write("%s " % item)