Exemplo n.º 1
0
def _tmx_to_source_target(tmx_file, source_resfile, target_resfile,
                          do_cleaning=False):
  """Writes the sentence pairs of a TMX file to two parallel line files.

  Args:
    tmx_file: open file object containing ParaCrawl-v3 TMX data.
    source_resfile: writable file object receiving source sentences, one
      per line.
    target_resfile: writable file object receiving target sentences, one
      per line.
    do_cleaning: if True, pass the pairs through
      `cleaner_en_xx.clean_en_xx_pairs` before writing.
  """
  pairs = cleaner_en_xx.paracrawl_v3_pairs(tmx_file)
  if do_cleaning:
    pairs = cleaner_en_xx.clean_en_xx_pairs(pairs)
  for src_line, trg_line in pairs:
    source_resfile.write(src_line + "\n")
    target_resfile.write(trg_line + "\n")
Exemplo n.º 2
0
def compile_data(tmp_dir, datasets, filename, datatypes_to_clean=None):
  """Concatenates all `datasets` and saves to `filename`.

  Args:
    tmp_dir: directory holding (and receiving) all intermediate files.
    datasets: list of (url, spec) entries.  `spec` is one of:
      ("tmx", tmx_filename),
      ("tsv", src_column, trg_column, glob_pattern), or
      (lang1_filename, lang2_filename) for plain-text / .sgm corpora.
    filename: base name (relative to `tmp_dir`) for the output; the pair
      "<filename>.lang1" / "<filename>.lang2" is written.
    datatypes_to_clean: optional list of datatype tags ("tmx", "tsv",
      "txt") whose sentence pairs are additionally passed through
      `cleaner_en_xx.clean_en_xx_pairs`.

  Returns:
    The joined path `tmp_dir/filename` (without the .lang1/.lang2 suffix).
  """

  def _without_gz(path):
    # NOTE: the previous code used `path.strip(".gz")`, but str.strip
    # removes any of the characters '.', 'g', 'z' from *both* ends
    # (e.g. "gzipped.gz" -> "ipped").  Slice the suffix off instead.
    return path[:-len(".gz")]

  datatypes_to_clean = datatypes_to_clean or []
  filename = os.path.join(tmp_dir, filename)
  lang1_fname = filename + ".lang1"
  lang2_fname = filename + ".lang2"
  # Outputs are reused across runs: if both halves exist, skip recompiling.
  if tf.gfile.Exists(lang1_fname) and tf.gfile.Exists(lang2_fname):
    tf.logging.info("Skipping compile data, found files:\n%s\n%s", lang1_fname,
                    lang2_fname)
    return filename
  with tf.gfile.GFile(lang1_fname, mode="w") as lang1_resfile:
    with tf.gfile.GFile(lang2_fname, mode="w") as lang2_resfile:
      for dataset in datasets:
        url = dataset[0]
        compressed_filename = os.path.basename(url)
        compressed_filepath = os.path.join(tmp_dir, compressed_filename)
        if url.startswith("http"):
          generator_utils.maybe_download(tmp_dir, compressed_filename, url)

        if dataset[1][0] == "tmx":
          tmx_filename = os.path.join(tmp_dir, dataset[1][1])
          if tmx_filename.endswith(".gz"):
            new_filename = _without_gz(tmx_filename)
            if not tf.gfile.Exists(new_filename):
              generator_utils.gunzip_file(tmx_filename, new_filename)
            tmx_filename = new_filename
          source, target = None, None
          with tf.gfile.Open(tmx_filename) as tmx_file:
            stream = cleaner_en_xx.paracrawl_v3_pairs(tmx_file)
            if "tmx" in datatypes_to_clean:
              stream = cleaner_en_xx.clean_en_xx_pairs(stream)
            for source, target in stream:
              lang1_resfile.write(source)
              lang1_resfile.write("\n")
              lang2_resfile.write(target)
              lang2_resfile.write("\n")

        elif dataset[1][0] == "tsv":
          _, src_column, trg_column, glob_pattern = dataset[1]
          filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          if not filenames:
            # Capture *.tgz and *.tar.gz too.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
            filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          for tsv_filename in filenames:
            if tsv_filename.endswith(".gz"):
              new_filename = _without_gz(tsv_filename)
              generator_utils.gunzip_file(tsv_filename, new_filename)
              tsv_filename = new_filename
            with tf.gfile.Open(tsv_filename) as tsv_file:
              for line in tsv_file:
                if line and "\t" in line:
                  parts = line.split("\t")
                  source, target = parts[src_column], parts[trg_column]
                  source, target = source.strip(), target.strip()
                  clean_pairs = [(source, target)]
                  if "tsv" in datatypes_to_clean:
                    clean_pairs = cleaner_en_xx.clean_en_xx_pairs(clean_pairs)
                  for source, target in clean_pairs:
                    # Cleaning may drop a pair by emptying one side.
                    if source and target:
                      lang1_resfile.write(source)
                      lang1_resfile.write("\n")
                      lang2_resfile.write(target)
                      lang2_resfile.write("\n")

        else:
          lang1_filename, lang2_filename = dataset[1]
          lang1_filepath = os.path.join(tmp_dir, lang1_filename)
          lang2_filepath = os.path.join(tmp_dir, lang2_filename)
          is_sgm = (
              lang1_filename.endswith("sgm") and lang2_filename.endswith("sgm"))

          if not (tf.gfile.Exists(lang1_filepath) and
                  tf.gfile.Exists(lang2_filepath)):
            # For .tar.gz and .tgz files, we read compressed.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
          if lang1_filepath.endswith(".gz"):
            new_filepath = _without_gz(lang1_filepath)
            generator_utils.gunzip_file(lang1_filepath, new_filepath)
            lang1_filepath = new_filepath
          if lang2_filepath.endswith(".gz"):
            new_filepath = _without_gz(lang2_filepath)
            generator_utils.gunzip_file(lang2_filepath, new_filepath)
            lang2_filepath = new_filepath

          for example in text_problems.text2text_txt_iterator(
              lang1_filepath, lang2_filepath):
            line1res = _preprocess_sgm(example["inputs"], is_sgm)
            line2res = _preprocess_sgm(example["targets"], is_sgm)
            clean_pairs = [(line1res, line2res)]
            if "txt" in datatypes_to_clean:
              clean_pairs = cleaner_en_xx.clean_en_xx_pairs(clean_pairs)
            for line1res, line2res in clean_pairs:
              if line1res and line2res:
                lang1_resfile.write(line1res)
                lang1_resfile.write("\n")
                lang2_resfile.write(line2res)
                lang2_resfile.write("\n")

  return filename