def build_combined_english_dataset(udbase_dir, tokenizer_dir, extern_dir, short_name, dataset, prepare_labels):
    output_txt = f"{tokenizer_dir}/{short_name}.{dataset}.txt"
    output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"

    if dataset == 'train':
        # TODO: include more UD treebanks, possibly with xpos removed
        #   UD_English-ParTUT, UD_English-Pronouns - xpos are different
        # also include "external" treebanks such as PTB
        treebanks = ["UD_English-EWT", "UD_English-GUM"]
        sents = []
        for treebank in treebanks:
            conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu", fail=True)
            sents.extend(read_sentences_from_conllu(conllu_file))
    else:
        ewt_conllu = common.find_treebank_dataset_file("UD_English-EWT", udbase_dir, dataset, "conllu")
        sents = read_sentences_from_conllu(ewt_conllu)

    sents = strip_mwt_from_sentences(sents)
    write_sentences_to_conllu(output_conllu, sents)
    convert_conllu_to_txt(output_conllu, output_txt)

    if prepare_labels:
        prepare_dataset_labels(output_txt, output_conllu, tokenizer_dir, short_name, "en", dataset)

def build_combined_english_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
    """
    en_combined is currently EWT, GUM, GUMReddit, PUD, and Pronouns

    TODO: use more of the handparsed data
    """
    check_gum_ready(udbase_dir)

    if dataset == 'train':
        # TODO: include more UD treebanks, possibly with xpos removed
        #   UD_English-ParTUT - xpos are different
        # also include "external" treebanks such as PTB
        # NOTE: in order to get the best results, make sure each of these treebanks has the latest edits applied
        train_treebanks = ["UD_English-EWT", "UD_English-GUM", "UD_English-GUMReddit"]
        test_treebanks = ["UD_English-PUD", "UD_English-Pronouns"]
        sents = []
        for treebank in train_treebanks:
            conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu", fail=True)
            sents.extend(read_sentences_from_conllu(conllu_file))
        for treebank in test_treebanks:
            conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "test", "conllu", fail=True)
            sents.extend(read_sentences_from_conllu(conllu_file))
    else:
        ewt_conllu = common.find_treebank_dataset_file("UD_English-EWT", udbase_dir, dataset, "conllu")
        sents = read_sentences_from_conllu(ewt_conllu)

    sents = strip_mwt_from_sentences(sents)
    return sents

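
# Hedged sketch (not part of the original file): the helpers above are assumed
# to represent each sentence as a list of raw CoNLL-U lines, comments included.
# Under that assumption, stripping multi-word tokens means dropping the range
# lines whose ID column looks like "4-5" and keeping the individual word lines.
import re

def _strip_mwt_sketch(sentences):
    """Illustrative only; the real strip_mwt_from_sentences may differ."""
    return [[line for line in sentence if not re.match(r"^[0-9]+-[0-9]+\t", line)]
            for sentence in sentences]
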
def build_combined_korean_dataset(udbase_dir, tokenizer_dir, short_name, dataset, output_txt, output_conllu, prepare_labels=True):
    """
    Builds a combined dataset out of multiple Korean datasets.

    Currently this uses GSD and Kaist.  If a segmenter-appropriate
    dataset was requested, spaces are removed.

    TODO: we need to handle the difference in xpos tags somehow.
    """
    gsd_conllu = common.find_treebank_dataset_file("UD_Korean-GSD", udbase_dir, dataset, "conllu")
    kaist_conllu = common.find_treebank_dataset_file("UD_Korean-Kaist", udbase_dir, dataset, "conllu")
    sents = read_sentences_from_conllu(gsd_conllu) + read_sentences_from_conllu(kaist_conllu)

    segmenter = short_name.endswith("_seg")
    if segmenter:
        sents = remove_spaces_from_sentences(sents)

    write_sentences_to_conllu(output_conllu, sents)
    convert_conllu_to_txt(output_conllu, output_txt)

    if prepare_labels:
        prepare_dataset_labels(output_txt, output_conllu, tokenizer_dir, short_name, "ko", dataset)

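
# Hedged sketch (assumption, not the original helper): the "_seg" datasets are
# meant for a character-level segmenter, so whitespace between tokens is
# removed.  One plausible way to do that on raw CoNLL-U lines is to rewrite
# the "# text" comment without spaces and force SpaceAfter=No in MISC.
def _remove_spaces_sketch(sentences):
    """Illustrative only; the real remove_spaces_from_sentences may differ."""
    new_sentences = []
    for sentence in sentences:
        new_sentence = []
        for line in sentence:
            if line.startswith("# text"):
                key, _, value = line.partition("=")
                line = key + "= " + value.replace(" ", "")
            elif not line.startswith("#"):
                pieces = line.split("\t")
                pieces[-1] = "SpaceAfter=No"
                line = "\t".join(pieces)
            new_sentence.append(line)
        new_sentences.append(new_sentence)
    return new_sentences
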
def build_combined_spanish_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
    """
    es_combined is AnCora and GSD put together

    TODO: remove features which aren't shared between datasets
    TODO: consider mixing in PUD?
    """
    if dataset == 'train':
        treebanks = ["UD_Spanish-AnCora", "UD_Spanish-GSD"]
        sents = []
        for treebank in treebanks:
            conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu", fail=True)
            new_sents = read_sentences_from_conllu(conllu_file)
            if treebank.endswith("GSD"):
                new_sents = replace_semicolons(new_sents)
            sents.extend(new_sents)

        extra_spanish = os.path.join(handparsed_dir, "spanish-mwt", "spanish.mwt")
        if not os.path.exists(extra_spanish):
            raise FileNotFoundError("Cannot find the extra dataset 'spanish.mwt' which includes various multi-words retokenized, expected {}".format(extra_spanish))
        extra_sents = read_sentences_from_conllu(extra_spanish)
        sents.extend(extra_sents)
    else:
        conllu_file = common.find_treebank_dataset_file("UD_Spanish-AnCora", udbase_dir, dataset, "conllu", fail=True)
        sents = read_sentences_from_conllu(conllu_file)

    return sents

def build_combined_italian_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
    if dataset == 'train':
        # could maybe add ParTUT, but that dataset has a slightly different xpos set
        # (no DE or I) and I didn't feel like sorting through the differences
        # Note: currently these each have small changes compared with
        # the UD 2.7 release.  See the issues (possibly closed by now)
        # filed by AngledLuffa on each of the treebanks for more info.
        treebanks = ["UD_Italian-ISDT", "UD_Italian-VIT", "UD_Italian-TWITTIRO", "UD_Italian-PoSTWITA"]
        sents = []
        for treebank in treebanks:
            conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu", fail=True)
            sents.extend(read_sentences_from_conllu(conllu_file))

        extra_italian = os.path.join(handparsed_dir, "italian-mwt", "italian.mwt")
        if not os.path.exists(extra_italian):
            raise FileNotFoundError("Cannot find the extra dataset 'italian.mwt' which includes various multi-words retokenized, expected {}".format(extra_italian))
        extra_sents = read_sentences_from_conllu(extra_italian)
        for sentence in extra_sents:
            if not sentence[2].endswith("_") or not MWT_RE.match(sentence[2]):
                raise AssertionError("Unexpected format of the italian.mwt file.  Has it already been modified to have SpaceAfter=No everywhere?")
            sentence[2] = sentence[2][:-1] + "SpaceAfter=No"
        sents = sents + extra_sents
    else:
        istd_conllu = common.find_treebank_dataset_file("UD_Italian-ISDT", udbase_dir, dataset, "conllu")
        sents = read_sentences_from_conllu(istd_conllu)

    return sents

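
# Hedged illustration (assumption, not part of the original file): MWT_RE is
# taken to be a regex for multi-word token range lines, e.g.
# re.compile(r"^[0-9]+-[0-9]+\t"), and each sentence in italian.mwt is assumed
# to carry its MWT line at index 2 with an empty ("_") MISC column, which the
# loop above rewrites to SpaceAfter=No.
import re
MWT_RE_SKETCH = re.compile(r"^[0-9]+-[0-9]+\t")

def _mark_mwt_no_space_sketch(mwt_line):
    # mirrors the in-place edit in build_combined_italian_dataset above
    assert MWT_RE_SKETCH.match(mwt_line) and mwt_line.endswith("_")
    return mwt_line[:-1] + "SpaceAfter=No"
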
def process_treebank(treebank, paths, args):
    """
    Processes a single treebank into train, dev, test parts

    TODO
    Currently assumes it is always a UD treebank.  There are Thai
    treebanks which are not included in UD.

    Also, there is no specific mechanism for UD_Arabic-NYUAD or
    similar treebanks, which need integration with LDC datasets
    """
    udbase_dir = paths["UDBASE"]
    tokenizer_dir = paths["TOKENIZE_DATA_DIR"]
    handparsed_dir = paths["HANDPARSED_DIR"]

    short_name = common.project_to_short_name(treebank)
    short_language = short_name.split("_")[0]

    os.makedirs(tokenizer_dir, exist_ok=True)

    if short_name.startswith("ko_combined"):
        build_combined_korean(udbase_dir, tokenizer_dir, short_name)
    elif short_name in ("it_combined", "en_combined", "es_combined"):
        build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, args.augment)
    elif short_name.startswith("en_gum"):
        # we special case GUM because it should include a filled-out GUMReddit
        print("Preparing data for %s: %s, %s" % (treebank, short_name, short_language))
        build_combined_english_gum(udbase_dir, tokenizer_dir, short_name, args.augment)
    else:
        # check that we can find the train file where we expect it
        train_conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu", fail=True)

        print("Preparing data for %s: %s, %s" % (treebank, short_name, short_language))

        if not common.find_treebank_dataset_file(treebank, udbase_dir, "dev", "conllu", fail=False):
            process_partial_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language)
        else:
            process_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language, args.augment)

    convert_conllu_to_txt(tokenizer_dir, short_name)

    if args.prepare_labels:
        prepare_treebank_labels(tokenizer_dir, short_name)

def process_treebank(treebank, paths, args):
    """
    Processes a single treebank into train, dev, test parts

    Includes processing for a few external tokenization datasets:
      vi_vlsp, th_orchid, th_lst20, th_best

    Also, there is no specific mechanism for UD_Arabic-NYUAD or
    similar treebanks, which need integration with LDC datasets
    """
    udbase_dir = paths["UDBASE"]
    tokenizer_dir = paths["TOKENIZE_DATA_DIR"]
    handparsed_dir = paths["HANDPARSED_DIR"]

    short_name = common.project_to_short_name(treebank)
    short_language = short_name.split("_")[0]

    os.makedirs(tokenizer_dir, exist_ok=True)

    if short_name == "vi_vlsp":
        convert_vi_vlsp.convert_vi_vlsp(paths["EXTERN_DIR"], tokenizer_dir, args)
    elif short_name == "th_orchid":
        convert_th_orchid.main(paths["EXTERN_DIR"], tokenizer_dir)
    elif short_name == "th_lst20":
        convert_th_lst20.convert(paths["EXTERN_DIR"], tokenizer_dir, args)
    elif short_name == "th_best":
        convert_th_best.main(paths["EXTERN_DIR"], tokenizer_dir)
    elif short_name.startswith("ko_combined"):
        build_combined_korean(udbase_dir, tokenizer_dir, short_name)
    elif short_name in ("it_combined", "en_combined", "es_combined"):
        build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, args.augment)
    elif short_name.startswith("en_gum"):
        # we special case GUM because it should include a filled-out GUMReddit
        print("Preparing data for %s: %s, %s" % (treebank, short_name, short_language))
        build_combined_english_gum(udbase_dir, tokenizer_dir, short_name, args.augment)
    else:
        # check that we can find the train file where we expect it
        train_conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu", fail=True)

        print("Preparing data for %s: %s, %s" % (treebank, short_name, short_language))

        if not common.find_treebank_dataset_file(treebank, udbase_dir, "dev", "conllu", fail=False):
            process_partial_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language)
        else:
            process_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language, args.augment)

    if short_name not in ('th_orchid', 'th_lst20'):
        convert_conllu_to_txt(tokenizer_dir, short_name)

    if args.prepare_labels:
        prepare_treebank_labels(tokenizer_dir, short_name)

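
# Hedged usage sketch (assumption): the paths dict is presumed to carry the
# directory layout referenced throughout this file; the values below are
# placeholders for illustration, not the project's actual defaults.
import argparse

def _prepare_one_treebank_sketch():
    paths = {
        "UDBASE": "extern_data/ud2",                 # hypothetical location
        "TOKENIZE_DATA_DIR": "data/tokenize",        # hypothetical location
        "HANDPARSED_DIR": "extern_data/handparsed",  # hypothetical location
        "EXTERN_DIR": "extern_data",                 # hypothetical location
    }
    args = argparse.Namespace(augment=True, prepare_labels=True)
    process_treebank("UD_English-EWT", paths, args)
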
def process_partial_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language):
    """
    Process a UD treebank with only train/test splits

    For example, in UD 2.7:
      UD_Buryat-BDT
      UD_Galician-TreeGal
      UD_Indonesian-CSUI
      UD_Kazakh-KTB
      UD_Kurmanji-MG
      UD_Latin-Perseus
      UD_Livvi-KKPP
      UD_North_Sami-Giella
      UD_Old_Russian-RNC
      UD_Sanskrit-Vedic
      UD_Slovenian-SST
      UD_Upper_Sorbian-UFAL
      UD_Welsh-CCG
    """
    train_input_conllu = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu")
    train_output_conllu = f"{tokenizer_dir}/{short_name}.train.gold.conllu"
    dev_output_conllu = f"{tokenizer_dir}/{short_name}.dev.gold.conllu"

    if not split_train_file(treebank=treebank,
                            train_input_conllu=train_input_conllu,
                            train_output_conllu=train_output_conllu,
                            dev_output_conllu=dev_output_conllu):
        return

    # the test set is already fine
    # currently we do not do any augmentation of these partial treebanks
    prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, "test", augment=False)

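
# Hedged sketch (assumption, not the original split_train_file): for treebanks
# with no dev data, one reasonable approach is to shuffle the train sentences
# with a fixed seed and hold out a small fraction as dev, refusing to split
# treebanks that are too small.
def _split_train_sketch(train_input_conllu, train_output_conllu, dev_output_conllu,
                        dev_fraction=0.1, seed=1234):
    sents = read_sentences_from_conllu(train_input_conllu)
    if len(sents) < 20:
        return False
    random.Random(seed).shuffle(sents)
    num_dev = int(len(sents) * dev_fraction)
    write_sentences_to_conllu(dev_output_conllu, sents[:num_dev])
    write_sentences_to_conllu(train_output_conllu, sents[num_dev:])
    return True
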
def build_combined_english_gum_dataset(udbase_dir, tokenizer_dir, short_name, dataset, augment):
    """
    Build the GUM dataset by combining GUM with GUMReddit

    It checks to make sure GUMReddit is filled out using the included script
    """
    check_gum_ready(udbase_dir)
    random.seed(1234)

    output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"

    treebanks = ["UD_English-GUM", "UD_English-GUMReddit"]
    sents = []
    for treebank in treebanks:
        conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu", fail=True)
        sents.extend(read_sentences_from_conllu(conllu_file))

    if dataset == 'train' and augment:
        sents = augment_punct(sents)

    write_sentences_to_conllu(output_conllu, sents)

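
# Hedged sketch (assumption, not the original augment_punct): punctuation
# augmentation is presumed to duplicate a small fraction of sentences with the
# final "." dropped, so the tokenizer does not learn that every sentence ends
# with punctuation.  A real implementation would also keep the "# text"
# comment in sync.
def _augment_punct_sketch(sents, ratio=0.1):
    extra = []
    for sentence in sents:
        if random.random() > ratio:
            continue
        word_lines = [line for line in sentence if not line.startswith("#")]
        if word_lines and word_lines[-1].split("\t")[1] == ".":
            extra.append([line for line in sentence if line is not word_lines[-1]])
    return sents + extra
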
def process_treebank(treebank, paths, args):
    """
    Processes a single treebank into train, dev, test parts

    TODO
    Currently assumes it is always a UD treebank.  There are Thai
    treebanks which are not included in UD.

    Also, there is no specific mechanism for UD_Arabic-NYUAD or
    similar treebanks, which need integration with LDC datasets
    """
    udbase_dir = paths["UDBASE"]
    tokenizer_dir = paths["TOKENIZE_DATA_DIR"]
    extern_dir = paths["EXTERN_DIR"]

    short_name = common.project_to_short_name(treebank)
    short_language = short_name.split("_")[0]

    if short_name.startswith("ko_combined"):
        build_combined_korean(udbase_dir, tokenizer_dir, short_name, args.prepare_labels)
    elif short_name.startswith("it_combined"):
        build_combined_italian(udbase_dir, tokenizer_dir, extern_dir, short_name, args.prepare_labels)
    elif short_name.startswith("en_combined"):
        build_combined_english(udbase_dir, tokenizer_dir, extern_dir, short_name, args.prepare_labels)
    else:
        # check that we can find the train file where we expect it
        train_txt_file = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "txt")
        if not train_txt_file:
            raise ValueError("Cannot find train file for treebank %s" % treebank)

        print("Preparing data for %s: %s, %s" % (treebank, short_name, short_language))

        if not common.find_treebank_dataset_file(treebank, udbase_dir, "dev", "txt"):
            process_partial_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language, args.prepare_labels)
        else:
            process_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language, args.augment, args.prepare_labels)

def check_gum_ready(udbase_dir):
    gum_conllu = common.find_treebank_dataset_file("UD_English-GUMReddit", udbase_dir, "train", "conllu")
    if common.mostly_underscores(gum_conllu):
        raise ValueError("Cannot process UD_English-GUMReddit in its current form.  There should be a download script available in the directory which will help integrate the missing proprietary values.  Please run that script to update the data, then try again.")

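
# Hedged sketch (assumption, not the original common.mostly_underscores): the
# check is presumed to detect the redacted GUMReddit distribution, whose FORM
# column ships as "_" until the treebank's download script restores the text.
def _mostly_underscores_sketch(conllu_file, threshold=0.5):
    underscore, total = 0, 0
    with open(conllu_file, encoding="utf-8") as fin:
        for line in fin:
            if line.startswith("#") or not line.strip():
                continue
            total += 1
            if line.split("\t")[1] == "_":
                underscore += 1
    return total > 0 and underscore / total > threshold
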
def prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, dataset, augment=True, prepare_labels=True):
    # TODO: do this higher up
    os.makedirs(tokenizer_dir, exist_ok=True)

    input_txt = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "txt")
    input_txt_copy = f"{tokenizer_dir}/{short_name}.{dataset}.txt"

    input_conllu = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu")
    input_conllu_copy = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"

    if short_name == "sl_ssj":
        preprocess_ssj_data.process(input_txt, input_conllu, input_txt_copy, input_conllu_copy)
    elif short_name == "te_mtg" and dataset == 'train' and augment:
        write_augmented_dataset(input_conllu, input_conllu_copy, input_txt_copy, augment_telugu)
    elif short_name == "ar_padt" and dataset == 'train' and augment:
        write_augmented_dataset(input_conllu, input_conllu_copy, input_txt_copy, augment_arabic_padt)
    elif short_name.startswith("es_ancora") and dataset == 'train':
        # note that we always do this for AnCora, since this token is bizarre and confusing
        fix_spanish_ancora(input_conllu, input_conllu_copy, input_txt_copy, augment=augment)
    elif short_name.startswith("ko_") and short_name.endswith("_seg"):
        remove_spaces(input_conllu, input_conllu_copy, input_txt_copy)
    else:
        shutil.copyfile(input_txt, input_txt_copy)
        shutil.copyfile(input_conllu, input_conllu_copy)

    if prepare_labels:
        prepare_dataset_labels(input_txt_copy, input_conllu_copy, tokenizer_dir, short_name, short_language, dataset)

def prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, dataset, augment=True):
    input_conllu = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu", fail=True)
    output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"

    if short_name == "te_mtg" and dataset == 'train' and augment:
        write_augmented_dataset(input_conllu, output_conllu, augment_telugu)
    elif short_name == "ar_padt" and dataset == 'train' and augment:
        write_augmented_dataset(input_conllu, output_conllu, augment_arabic_padt)
    elif short_name.startswith("ko_") and short_name.endswith("_seg"):
        remove_spaces(input_conllu, output_conllu)
    elif dataset == 'train' and augment:
        write_augmented_dataset(input_conllu, output_conllu, augment_punct)
    else:
        shutil.copyfile(input_conllu, output_conllu)

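
# Hedged usage sketch (assumption): process_ud_treebank is presumed to call
# prepare_ud_dataset once per split, applying augmentation only to train.
def _process_ud_treebank_sketch(treebank, udbase_dir, tokenizer_dir, short_name, short_language, augment=True):
    prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, "train", augment)
    prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, "dev", augment=False)
    prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, "test", augment=False)
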
def process_treebank(treebank, paths, args):
    if treebank.startswith("UD_"):
        udbase_dir = paths["UDBASE"]
        train_conllu = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu", fail=True)
        augment = check_lemmas(train_conllu)
        if not augment:
            print("No lemma information found in %s.  Not augmenting the dataset" % train_conllu)
    else:
        # TODO: check the data to see if there are lemmas or not
        augment = True

    prepare_tokenizer_treebank.copy_conllu_treebank(treebank, paths, paths["LEMMA_DATA_DIR"], augment=augment)

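
# Hedged sketch (assumption, not the original check_lemmas): lemmatizer
# augmentation only makes sense when the treebank annotates lemmas, so the
# check presumably looks for any non-underscore value in the LEMMA column.
def _check_lemmas_sketch(conllu_file):
    with open(conllu_file, encoding="utf-8") as fin:
        for line in fin:
            if line.startswith("#") or not line.strip():
                continue
            pieces = line.split("\t")
            if len(pieces) > 2 and pieces[2] not in ("_", ""):
                return True
    return False
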