def word_freq_test():
    """Evaluate correction accuracy while growing the word-frequency list.

    Sweeps the number of most-frequent words used for training from 0 up to
    ``max_size`` in steps of 500, regenerating the training data and running
    ``main()`` at each step.  The accumulated results are printed after every
    step.  Side effects: deletes and regenerates ``c.word_freq_path`` /
    ``c.training_data`` on disk.
    """
    penta_freq = gen_vector.gen_word_pentagram_freq(1000, './data/corpus/runeberg/')
    tri_freq = gen_vector.gen_trigram_freq(15000)
    values = []
    size = 0
    max_size = 20000
    # Start from a clean cached frequency file so stale data is not reused.
    if os.path.exists(c.word_freq_path):
        os.remove(c.word_freq_path)
    word_freq = error_correction.calc_freq(0, max_size)
    # Sort once, outside the loop: the ranking never changes between steps,
    # so re-sorting on every iteration was pure wasted work.
    ranked = sorted(word_freq.items(), key=lambda item: item[1], reverse=True)
    while size <= max_size:
        if os.path.exists(c.training_data):
            os.remove(c.training_data)
        # Keep only the `size` most frequent words for this step.
        top_words = dict(ranked[:size])
        gen_vector.get_training_data(c.training_data, c.main_db, 13000,
                                     tri_freq, penta_freq, top_words)
        values.append(main())
        print(values)
        size += 500
def filter_test():
    """Run one end-to-end evaluation with fixed frequency-table sizes.

    Builds pentagram, trigram, and word frequency tables, regenerates the
    training data from scratch, and records the result of ``main()``.
    Side effect: deletes any existing ``c.training_data`` file first.
    """
    results = []
    pentagram_frequencies = gen_vector.gen_word_pentagram_freq(1000, './data/corpus/runeberg/')
    trigram_frequencies = gen_vector.gen_trigram_freq(10000)
    word_frequencies = error_correction.calc_freq(0, 10000)
    # Remove stale training data so it is rebuilt with the tables above.
    if os.path.exists(c.training_data):
        os.remove(c.training_data)
    gen_vector.get_training_data(
        c.training_data, c.main_db, 13000,
        trigram_frequencies, pentagram_frequencies, word_frequencies)
    results.append(main())
def process_dir(input_dir, test, sample_size, db_size, training_size, svm_kernal, c_value, gamma,word_freq_size, tri_freq_size): count=1 tri_freq=gen_vector.gen_trigram_freq(tri_freq_size) penta_freq=gen_vector.gen_word_pentagram_freq(1000,'./data/corpus/runeberg/') word_freq=error_correction.calc_freq(0, word_freq_size) for file in os.listdir(input_dir): plain = input_dir+file output_dir= "./output/%s/%s"%(test,file) print(plain) if(not os.path.isfile(output_dir)): process_file(plain, output_dir, db_size, training_size, svm_kernal, c_value, gamma,word_freq_size,tri_freq,penta_freq,word_freq) print("Corrected page %i out of %i)" %(count, len(os.listdir(input_dir)))) count+=1 if(sample_size): if(sample_size<count): break