示例#1
0
def generate_official_test_data(
        edl_output_path='/home/wyin3/LORELEI/2019/retest/UPENN+18-rules.tab',
        mono_text_dir='/home/wyin3/LORELEI/2019/retest/monolingual_text/',
        mt_dir='/home/wyin3/LORELEI/2019/retest/BBN_MT/',
        output_file='/home/wyin3/LORELEI/2019/retest/il3-uyghur-setE-as-test-input_ner_filtered'):
    """Generate the NER-filtered test input for the official retest data.

    Loads an EDL output table with ``load_EDL2018_output`` and hands the
    result, together with the three paths below, to
    ``IL_into_test_withMT_filteredby_NER_2018``.

    Every parameter defaults to the path that was previously hard-coded, so
    calling the function with no arguments behaves exactly as before.

    Args:
        edl_output_path: Path of the EDL ``.tab`` file to load.
        mono_text_dir: Directory passed as the first argument of
            ``IL_into_test_withMT_filteredby_NER_2018`` (monolingual text).
        mt_dir: Directory passed as the second argument (MT output).
        output_file: Path passed as the third argument (the file to produce).

    Returns:
        None.
    """
    docid2entity_pos_list = load_EDL2018_output(edl_output_path)
    # NOTE(review): the meaning of the trailing ``2`` is defined by
    # IL_into_test_withMT_filteredby_NER_2018, which is not visible in this
    # file; it is kept as in the original call. TODO confirm and name it.
    IL_into_test_withMT_filteredby_NER_2018(
        mono_text_dir, mt_dir, output_file, docid2entity_pos_list, 2)
示例#2
0
import pickle
import codecs
from preprocess_common import IL_into_test_filteredby_NER_2018, load_EDL2018_output, IL_into_test_withMT_filteredby_NER_2018

if __name__ == '__main__':
    # Script entry point for the il9 run that uses MT output.
    # The paths are named first so the two calls below read as a short
    # pipeline: load the EDL table, then build the filtered test input.
    edl_tab = '/save/wenpeng/datasets/LORELEI/il9/il9_sub4_setE-anno-v2-wiki_candgen_v21_nilcluster_exact_english.tab'
    mono_text_dir = '/save/wenpeng/datasets/LORELEI/il9/monolingual_text/'
    mt_dir = '/save/wenpeng/datasets/LORELEI/il9/BBN-MT/'
    test_input_out = '/save/wenpeng/datasets/LORELEI/il9/il9-setE-as-test-input_ner_filtered'

    docid2entity_pos_list = load_EDL2018_output(edl_tab)

    # NOTE(review): the final positional ``2`` is interpreted by the helper in
    # preprocess_common; its meaning is not visible here.
    IL_into_test_withMT_filteredby_NER_2018(
        mono_text_dir,
        mt_dir,
        test_input_out,
        docid2entity_pos_list,
        2,
    )

    # Disabled alternative that skips the MT directory:
    # IL_into_test_filteredby_NER_2018('/save/wenpeng/datasets/LORELEI/il9/monolingual_text/','/save/wenpeng/datasets/LORELEI/il9/il9-setE-as-test-input_ner_filtered', docid2entity_pos_list, 2)
    # Disabled follow-up check:
    # json_validation('/save/wenpeng/datasets/LORELEI/il9/il9_system_output_forfun_w2.json')
示例#3
0
import pickle
import codecs
from preprocess_common import IL_into_test_filteredby_NER_2018, load_EDL2018_output

if __name__ == '__main__':
    # Script entry point for the il9 run without MT output: load the EDL
    # table, then build the filtered test input from the monolingual text.
    edl_tab = '/save/wenpeng/datasets/LORELEI/il9/il9_setE-anno_cand_v4.tab'
    mono_text_dir = '/save/wenpeng/datasets/LORELEI/il9/monolingual_text/'
    test_input_out = '/save/wenpeng/datasets/LORELEI/il9/il9-setE-as-test-input_ner_filtered'

    docid2entity_pos_list = load_EDL2018_output(edl_tab)

    # NOTE(review): the final positional ``2`` is interpreted by the helper in
    # preprocess_common; its meaning is not visible here.
    IL_into_test_filteredby_NER_2018(
        mono_text_dir,
        test_input_out,
        docid2entity_pos_list,
        2,
    )

    # Disabled follow-up check:
    # json_validation('/save/wenpeng/datasets/LORELEI/il9/il9_system_output_forfun_w2.json')
import pickle
import codecs
from preprocess_common import IL_eng_into_test_filteredby_NER_2018, load_EDL2018_output, IL_into_test_filteredby_NER_2018

if __name__ == '__main__':
    # Script entry point for the il10-eng run. Only the il10-eng EDL-filtered
    # variant is active; the earlier variants are kept below, disabled.

    # Disabled: English-only variants that take no EDL table.
    # IL_eng_into_test_filteredby_NER_2018('/save/wenpeng/datasets/LORELEI/il9-eng/monolingual_text/','/save/wenpeng/datasets/LORELEI/il9-eng/il9-eng-setE-as-test-input_ner_filtered', 2)
    # IL_eng_into_test_filteredby_NER_2018('/save/wenpeng/datasets/LORELEI/il10-eng/monolingual_text/','/save/wenpeng/datasets/LORELEI/il10-eng/il10-eng-setE-as-test-input_ner_filtered', 2)

    # Disabled: the same pipeline as below, pointed at il9-eng.
    # docid2entity_pos_list = load_EDL2018_output('/save/wenpeng/datasets/LORELEI/il9-eng/il9_cp2_english.nil.fix.tab')
    # IL_into_test_filteredby_NER_2018('/save/wenpeng/datasets/LORELEI/il9-eng/monolingual_text/','/save/wenpeng/datasets/LORELEI/il9-eng/il9-eng-setE-as-test-input_ner_filtered', docid2entity_pos_list, 2)

    edl_tab = '/save/wenpeng/datasets/LORELEI/il10-eng/il10_cp2_english.nil.fix.tab'
    mono_text_dir = '/save/wenpeng/datasets/LORELEI/il10-eng/monolingual_text/'
    test_input_out = '/save/wenpeng/datasets/LORELEI/il10-eng/il10-eng-setE-as-test-input_ner_filtered'

    docid2entity_pos_list = load_EDL2018_output(edl_tab)

    # NOTE(review): the final positional ``2`` is interpreted by the helper in
    # preprocess_common; its meaning is not visible here.
    IL_into_test_filteredby_NER_2018(
        mono_text_dir,
        test_input_out,
        docid2entity_pos_list,
        2,
    )
示例#5
0
if __name__ == '__main__':
    # Script entry point. Three stages are sketched here; only the middle one
    # (building the test set) is currently active.

    # --- Stage 1 (disabled): bilingual embeddings for il11 ---
    '''generate bilingual embeddings'''
    '''xin li generated somali bilingual emb before, so this step is neglected'''
    # dict_file = '/scratch/wyin3/dickens_save_dataset/LORELEI/il11/il11/source/il11/set0/docs/categoryI_dictionary/IL11_dictionary.txt'#, '/scratch/wyin3/dickens_save_dataset/LORELEI/il11/il11/source/il11/set0/docs/categoryI_dictionary/IL11_dictionary_rev.txt']
    # sub_dict_file = '/home/wyin3/LORELEI/2019/il11/il11_dictionary_w2w.txt'
    # ltf_path = ['/scratch/wyin3/dickens_save_dataset/LORELEI/il11/il11/source/il11/setE/data/monolingual_text/il11/ltf/',
    # '/scratch/wyin3/dickens_save_dataset/LORELEI/il11/il11/source/il11/set0/data/monolingual_text/ltf/',
    # '/scratch/wyin3/dickens_save_dataset/LORELEI/il11/il11/source/il11/set1/data/monolingual_text/ltf/']
    # mono_text = '/home/wyin3/LORELEI/2019/il11/raw.text.for.train.word2vec.txt'
    # IL_mono_emb_file = '/home/wyin3/LORELEI/2019/il11/il11.w2v.txt'
    # eng_mono_emb_file = '/home/wyin3/Datasets/word2vec_words_300d_insertedline0.txt'#'/home/wyin3/Datasets/word2vec_words_300d.txt'
    # extract_dictionary(dict_file, sub_dict_file, False)
    # extract_monolingual_text_4_train_word2vec(ltf_path, mono_text)
    # run_word2vec(mono_text, IL_mono_emb_file)
    # generate_bilingual_wordembeddings(eng_mono_emb_file, IL_mono_emb_file, sub_dict_file, 'il11longerw2v', True)

    # generate_official_test_data()

    # --- Stage 2 (active): build the test set from the EDL output ---
    '''generate test set with EDL and MT output'''
    EDL_input = ['/home1/w/wenpeng/dataset/LORELEI/final/candgen.tab']
    # MT_input is only consumed by the disabled withMT call further down.
    MT_input = ''  #'/scratch/wyin3/dickens_save_dataset/LORELEI/il11/MT/'
    set_E_mono_text = '/home1/w/wenpeng/dataset/LORELEI/final/candgen/'
    output_file = '/home1/w/wenpeng/dataset/LORELEI/2019_09_07_somali-setE-as-test-input_edl0_filtered'

    # Only the first entry of EDL_input is loaded.
    docid2entity_pos_list = load_EDL2018_output(EDL_input[0])

    # NOTE(review): the final positional ``2`` is interpreted by the callee,
    # which is not defined in the visible code.
    Somali_into_test_filteredby_NER_2019(
        set_E_mono_text,
        output_file,
        docid2entity_pos_list,
        2,
    )
    # Disabled alternative that also feeds MT_input:
    # IL_into_test_withMT_filteredby_NER_2018(set_E_mono_text, MT_input, output_file, docid2entity_pos_list, 2)
    # json_validation('/save/wenpeng/datasets/LORELEI/il9/il9_system_output_forfun_w2.json')

    # --- Stage 3 (disabled) ---
    '''translate_bbn_2_il'''
    # translate_bbn_2_il()