def _tmx_to_source_target(tmx_file, source_resfile, target_resfile,
                          do_cleaning=False):
  """Writes the sentence pairs read from `tmx_file` to two parallel files.

  Pairs are obtained from `cleaner_en_xx.paracrawl_v3_pairs(tmx_file)` and,
  when `do_cleaning` is set, passed through
  `cleaner_en_xx.clean_en_xx_pairs` before being written. Each pair
  contributes one line to each output file, so line N of `source_resfile`
  corresponds to line N of `target_resfile`.

  Args:
    tmx_file: open file-like object handed to `paracrawl_v3_pairs`.
    source_resfile: writable file-like object receiving the source side.
    target_resfile: writable file-like object receiving the target side.
    do_cleaning: whether to filter the pairs with `clean_en_xx_pairs`.
  """
  pairs = cleaner_en_xx.paracrawl_v3_pairs(tmx_file)
  if do_cleaning:
    pairs = cleaner_en_xx.clean_en_xx_pairs(pairs)

  for src_text, tgt_text in pairs:
    # Source line first, then target line, each followed by a newline.
    for sink, text in ((source_resfile, src_text), (target_resfile, tgt_text)):
      sink.write(text)
      sink.write("\n")
def _strip_gz_suffix(path):
  """Returns `path` with exactly one trailing ".gz" removed, if present."""
  # NOTE: do not use path.strip(".gz") here: str.strip treats its argument as
  # a *set of characters* and trims both ends, so "./data/a.seg.gz" would
  # become "/data/a.se".
  suffix = ".gz"
  if path.endswith(suffix):
    return path[:-len(suffix)]
  return path


def compile_data(tmp_dir, datasets, filename, datatypes_to_clean=None):
  """Concatenates all `datasets` and saves to `filename`.

  Writes two line-aligned files, `<tmp_dir>/<filename>.lang1` and
  `<tmp_dir>/<filename>.lang2`. If both already exist, nothing is done.

  Each entry of `datasets` is a sequence whose first element is a URL (or
  path) and whose second element selects how the data is read:
    * `("tmx", tmx_filename)`: a TMX file, gunzipped first if it ends in
      ".gz".
    * `("tsv", src_column, trg_column, glob_pattern)`: tab-separated files
      matching `glob_pattern` under `tmp_dir`; the given columns are used.
    * `(lang1_filename, lang2_filename)`: two parallel text files (SGM
      preprocessing is applied when both names end in "sgm").
  Entries whose URL starts with "http" are downloaded into `tmp_dir` first.
  For the "tsv" and plain-text forms the downloaded file is extracted as a
  tar archive when the expected files are not yet present.

  Args:
    tmp_dir: directory used for downloads, extracted files and the outputs.
    datasets: iterable of dataset descriptions as described above.
    filename: base name (relative to `tmp_dir`) of the output files.
    datatypes_to_clean: optional collection containing any of "tmx", "tsv",
      "txt"; pairs of those kinds are passed through
      `cleaner_en_xx.clean_en_xx_pairs` before being written.

  Returns:
    The joined path `os.path.join(tmp_dir, filename)` (without the
    ".lang1"/".lang2" extensions).
  """
  datatypes_to_clean = datatypes_to_clean or []
  filename = os.path.join(tmp_dir, filename)
  lang1_fname = filename + ".lang1"
  lang2_fname = filename + ".lang2"
  if tf.gfile.Exists(lang1_fname) and tf.gfile.Exists(lang2_fname):
    tf.logging.info("Skipping compile data, found files:\n%s\n%s", lang1_fname,
                    lang2_fname)
    return filename

  with tf.gfile.GFile(lang1_fname, mode="w") as lang1_resfile:
    with tf.gfile.GFile(lang2_fname, mode="w") as lang2_resfile:
      for dataset in datasets:
        url = dataset[0]
        compressed_filename = os.path.basename(url)
        compressed_filepath = os.path.join(tmp_dir, compressed_filename)
        if url.startswith("http"):
          generator_utils.maybe_download(tmp_dir, compressed_filename, url)

        if dataset[1][0] == "tmx":
          tmx_filename = os.path.join(tmp_dir, dataset[1][1])
          if tmx_filename.endswith(".gz"):
            new_filename = _strip_gz_suffix(tmx_filename)
            # Reuse a previously decompressed copy when there is one.
            if not tf.gfile.Exists(new_filename):
              generator_utils.gunzip_file(tmx_filename, new_filename)
            tmx_filename = new_filename
          with tf.gfile.Open(tmx_filename) as tmx_file:
            _tmx_to_source_target(
                tmx_file, lang1_resfile, lang2_resfile,
                do_cleaning="tmx" in datatypes_to_clean)

        elif dataset[1][0] == "tsv":
          _, src_column, trg_column, glob_pattern = dataset[1]
          filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          if not filenames:
            # Capture *.tgz and *.tar.gz too.
            # NOTE(review): extractall() trusts member paths inside the
            # archive; confirm the archives come from trusted sources.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
            filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          for tsv_filename in filenames:
            if tsv_filename.endswith(".gz"):
              new_filename = _strip_gz_suffix(tsv_filename)
              generator_utils.gunzip_file(tsv_filename, new_filename)
              tsv_filename = new_filename
            with tf.gfile.Open(tsv_filename) as tsv_file:
              for line in tsv_file:
                if line and "\t" in line:
                  parts = line.split("\t")
                  source, target = parts[src_column], parts[trg_column]
                  source, target = source.strip(), target.strip()
                  clean_pairs = [(source, target)]
                  if "tsv" in datatypes_to_clean:
                    clean_pairs = cleaner_en_xx.clean_en_xx_pairs(clean_pairs)
                  for clean_source, clean_target in clean_pairs:
                    # Skip pairs where either side is empty.
                    if clean_source and clean_target:
                      lang1_resfile.write(clean_source)
                      lang1_resfile.write("\n")
                      lang2_resfile.write(clean_target)
                      lang2_resfile.write("\n")

        else:
          lang1_filename, lang2_filename = dataset[1]
          lang1_filepath = os.path.join(tmp_dir, lang1_filename)
          lang2_filepath = os.path.join(tmp_dir, lang2_filename)
          is_sgm = (
              lang1_filename.endswith("sgm") and lang2_filename.endswith("sgm"))

          if not (tf.gfile.Exists(lang1_filepath) and
                  tf.gfile.Exists(lang2_filepath)):
            # For .tar.gz and .tgz files, we read compressed.
            # NOTE(review): extractall() trusts member paths inside the
            # archive; confirm the archives come from trusted sources.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
          if lang1_filepath.endswith(".gz"):
            new_filepath = _strip_gz_suffix(lang1_filepath)
            generator_utils.gunzip_file(lang1_filepath, new_filepath)
            lang1_filepath = new_filepath
          if lang2_filepath.endswith(".gz"):
            new_filepath = _strip_gz_suffix(lang2_filepath)
            generator_utils.gunzip_file(lang2_filepath, new_filepath)
            lang2_filepath = new_filepath
          for example in text_problems.text2text_txt_iterator(
              lang1_filepath, lang2_filepath):
            line1res = _preprocess_sgm(example["inputs"], is_sgm)
            line2res = _preprocess_sgm(example["targets"], is_sgm)
            clean_pairs = [(line1res, line2res)]
            if "txt" in datatypes_to_clean:
              clean_pairs = cleaner_en_xx.clean_en_xx_pairs(clean_pairs)
            for clean_line1, clean_line2 in clean_pairs:
              # Skip pairs where either side is empty.
              if clean_line1 and clean_line2:
                lang1_resfile.write(clean_line1)
                lang1_resfile.write("\n")
                lang2_resfile.write(clean_line2)
                lang2_resfile.write("\n")

  return filename