def generate_official_test_data(
        edl_output_file='/home/wyin3/LORELEI/2019/retest/UPENN+18-rules.tab',
        mono_text_dir='/home/wyin3/LORELEI/2019/retest/monolingual_text/',
        mt_dir='/home/wyin3/LORELEI/2019/retest/BBN_MT/',
        output_file='/home/wyin3/LORELEI/2019/retest/il3-uyghur-setE-as-test-input_ner_filtered'):
    """Build the official test input from EDL output and MT output.

    Loads the EDL result table with ``load_EDL2018_output`` and hands it,
    together with the monolingual-text and MT locations, to
    ``IL_into_test_withMT_filteredby_NER_2018``, which writes ``output_file``.

    Every argument defaults to the path that used to be hard-coded here, so
    calling the function without arguments behaves exactly as before.

    Args:
        edl_output_file: path of the EDL ``.tab`` file to load.
        mono_text_dir: directory with the setE monolingual text.
        mt_dir: directory with the machine-translation output.
        output_file: path of the filtered test-input file to produce.
    """
    docid2entity_pos_list = load_EDL2018_output(edl_output_file)

    # Earlier variant without MT input (il9 paths), kept for reference:
    # IL_into_test_filteredby_NER_2018(
    #     '/save/wenpeng/datasets/LORELEI/il9/monolingual_text/',
    #     '/save/wenpeng/datasets/LORELEI/il9/il9-setE-as-test-input_ner_filtered',
    #     docid2entity_pos_list, 2)

    # NOTE(review): the meaning of the trailing ``2`` is defined by the helper
    # in preprocess_common and is not visible here -- confirm before changing.
    IL_into_test_withMT_filteredby_NER_2018(
        mono_text_dir,
        mt_dir,
        output_file,
        docid2entity_pos_list,
        2)
import pickle
import codecs

from preprocess_common import (
    IL_into_test_filteredby_NER_2018,
    load_EDL2018_output,
    IL_into_test_withMT_filteredby_NER_2018,
)


if __name__ == '__main__':
    # Driver: build the il9 setE test input from an EDL table plus BBN MT output.
    edl_tab_file = (
        '/save/wenpeng/datasets/LORELEI/il9/'
        'il9_sub4_setE-anno-v2-wiki_candgen_v21_nilcluster_exact_english.tab'
    )
    mono_text_dir = '/save/wenpeng/datasets/LORELEI/il9/monolingual_text/'
    bbn_mt_dir = '/save/wenpeng/datasets/LORELEI/il9/BBN-MT/'
    test_input_file = '/save/wenpeng/datasets/LORELEI/il9/il9-setE-as-test-input_ner_filtered'

    docid2entity_pos_list = load_EDL2018_output(edl_tab_file)

    # Variant without MT input, kept for reference:
    # IL_into_test_filteredby_NER_2018(mono_text_dir, test_input_file, docid2entity_pos_list, 2)
    IL_into_test_withMT_filteredby_NER_2018(
        mono_text_dir, bbn_mt_dir, test_input_file, docid2entity_pos_list, 2)

    # Optional check of a system output file, currently disabled:
    # json_validation('/save/wenpeng/datasets/LORELEI/il9/il9_system_output_forfun_w2.json')
import pickle
import codecs

from preprocess_common import (
    IL_into_test_filteredby_NER_2018,
    load_EDL2018_output,
)


if __name__ == '__main__':
    # Driver: build the il9 setE test input from an EDL table (no MT input).
    edl_tab_file = '/save/wenpeng/datasets/LORELEI/il9/il9_setE-anno_cand_v4.tab'
    mono_text_dir = '/save/wenpeng/datasets/LORELEI/il9/monolingual_text/'
    test_input_file = '/save/wenpeng/datasets/LORELEI/il9/il9-setE-as-test-input_ner_filtered'

    docid2entity_pos_list = load_EDL2018_output(edl_tab_file)
    IL_into_test_filteredby_NER_2018(
        mono_text_dir, test_input_file, docid2entity_pos_list, 2)

    # Optional check of a system output file, currently disabled:
    # json_validation('/save/wenpeng/datasets/LORELEI/il9/il9_system_output_forfun_w2.json')
import pickle
import codecs

from preprocess_common import (
    IL_eng_into_test_filteredby_NER_2018,
    load_EDL2018_output,
    IL_into_test_filteredby_NER_2018,
)


if __name__ == '__main__':
    # Driver: build the il10-eng setE test input from an EDL table.
    #
    # Earlier runs, kept for reference (all disabled):
    # IL_eng_into_test_filteredby_NER_2018(
    #     '/save/wenpeng/datasets/LORELEI/il9-eng/monolingual_text/',
    #     '/save/wenpeng/datasets/LORELEI/il9-eng/il9-eng-setE-as-test-input_ner_filtered', 2)
    # IL_eng_into_test_filteredby_NER_2018(
    #     '/save/wenpeng/datasets/LORELEI/il10-eng/monolingual_text/',
    #     '/save/wenpeng/datasets/LORELEI/il10-eng/il10-eng-setE-as-test-input_ner_filtered', 2)
    #
    # docid2entity_pos_list = load_EDL2018_output(
    #     '/save/wenpeng/datasets/LORELEI/il9-eng/il9_cp2_english.nil.fix.tab')
    # IL_into_test_filteredby_NER_2018(
    #     '/save/wenpeng/datasets/LORELEI/il9-eng/monolingual_text/',
    #     '/save/wenpeng/datasets/LORELEI/il9-eng/il9-eng-setE-as-test-input_ner_filtered',
    #     docid2entity_pos_list, 2)
    edl_tab_file = '/save/wenpeng/datasets/LORELEI/il10-eng/il10_cp2_english.nil.fix.tab'
    mono_text_dir = '/save/wenpeng/datasets/LORELEI/il10-eng/monolingual_text/'
    test_input_file = (
        '/save/wenpeng/datasets/LORELEI/il10-eng/'
        'il10-eng-setE-as-test-input_ner_filtered'
    )

    docid2entity_pos_list = load_EDL2018_output(edl_tab_file)
    IL_into_test_filteredby_NER_2018(
        mono_text_dir, test_input_file, docid2entity_pos_list, 2)
if __name__ == '__main__':
    # ------------------------------------------------------------------
    # Step 1: generate bilingual embeddings.
    # Xin Li generated Somali bilingual embeddings before, so this step is
    # skipped; the il11 pipeline is kept below for reference only.
    # ------------------------------------------------------------------
    # dict_file = '/scratch/wyin3/dickens_save_dataset/LORELEI/il11/il11/source/il11/set0/docs/categoryI_dictionary/IL11_dictionary.txt'#, '/scratch/wyin3/dickens_save_dataset/LORELEI/il11/il11/source/il11/set0/docs/categoryI_dictionary/IL11_dictionary_rev.txt']
    # sub_dict_file = '/home/wyin3/LORELEI/2019/il11/il11_dictionary_w2w.txt'
    # ltf_path = ['/scratch/wyin3/dickens_save_dataset/LORELEI/il11/il11/source/il11/setE/data/monolingual_text/il11/ltf/',
    #             '/scratch/wyin3/dickens_save_dataset/LORELEI/il11/il11/source/il11/set0/data/monolingual_text/ltf/',
    #             '/scratch/wyin3/dickens_save_dataset/LORELEI/il11/il11/source/il11/set1/data/monolingual_text/ltf/']
    # mono_text = '/home/wyin3/LORELEI/2019/il11/raw.text.for.train.word2vec.txt'
    # IL_mono_emb_file = '/home/wyin3/LORELEI/2019/il11/il11.w2v.txt'
    # eng_mono_emb_file = '/home/wyin3/Datasets/word2vec_words_300d_insertedline0.txt'#'/home/wyin3/Datasets/word2vec_words_300d.txt'
    # extract_dictionary(dict_file, sub_dict_file, False)
    # extract_monolingual_text_4_train_word2vec(ltf_path, mono_text)
    # run_word2vec(mono_text, IL_mono_emb_file)
    # generate_bilingual_wordembeddings(eng_mono_emb_file, IL_mono_emb_file, sub_dict_file, 'il11longerw2v', True)

    # generate_official_test_data()

    # ------------------------------------------------------------------
    # Step 2: generate the test set with EDL and MT output.
    # ------------------------------------------------------------------
    set_E_mono_text = '/home1/w/wenpeng/dataset/LORELEI/final/candgen/'
    # MT input is empty for this run; the previous location was
    # '/scratch/wyin3/dickens_save_dataset/LORELEI/il11/MT/'
    MT_input = ''
    EDL_input = ['/home1/w/wenpeng/dataset/LORELEI/final/candgen.tab']
    output_file = '/home1/w/wenpeng/dataset/LORELEI/2019_09_07_somali-setE-as-test-input_edl0_filtered'

    # Only the first EDL table is loaded.
    first_edl_table = EDL_input[0]
    docid2entity_pos_list = load_EDL2018_output(first_edl_table)

    Somali_into_test_filteredby_NER_2019(
        set_E_mono_text,
        output_file,
        docid2entity_pos_list,
        2)
    # MT-aware variant, currently disabled:
    # IL_into_test_withMT_filteredby_NER_2018(set_E_mono_text, MT_input, output_file, docid2entity_pos_list, 2)

    # json_validation('/save/wenpeng/datasets/LORELEI/il9/il9_system_output_forfun_w2.json')

    # ------------------------------------------------------------------
    # Step 3: translate_bbn_2_il (disabled).
    # ------------------------------------------------------------------
    # translate_bbn_2_il()