Example #1
def _maybe_download_corpora(tmp_dir, is_training):
  """Download corpora if necessary and unzip them.

  Args:
    tmp_dir: directory containing dataset.
    is_training: whether we are in training mode.

  Returns:
    list of all files generated and path to file containing train/dev/test split info.
  """
  cnn_filename = "cnn_stories.tgz"
  cnn_finalpath = os.path.join(tmp_dir, "cnn/stories/")
  dailymail_filename = "dailymail_stories.tgz"
  dailymail_finalpath = os.path.join(tmp_dir, "dailymail/stories/")
  if not tf.gfile.Exists(cnn_finalpath):
    cnn_file = generator_utils.maybe_download_from_drive(
        tmp_dir, cnn_filename, _CNN_STORIES_DRIVE_URL)
    with tarfile.open(cnn_file, "r:gz") as cnn_tar:
      cnn_tar.extractall(tmp_dir)
  if not tf.gfile.Exists(dailymail_finalpath):
    dailymail_file = generator_utils.maybe_download_from_drive(
        tmp_dir, dailymail_filename, _DAILYMAIL_STORIES_DRIVE_URL)
    with tarfile.open(dailymail_file, "r:gz") as dailymail_tar:
      dailymail_tar.extractall(tmp_dir)

  cnn_files = tf.gfile.Glob(cnn_finalpath + "*")
  dailymail_files = tf.gfile.Glob(dailymail_finalpath + "*")
  all_files = cnn_files + dailymail_files

  if is_training:
    urls_path = generator_utils.maybe_download(tmp_dir, "all_train.txt", _TRAIN_URLS)
  else:
    urls_path = generator_utils.maybe_download(tmp_dir, "all_val.txt", _DEV_URLS)

  return all_files, urls_path
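All of these examples follow the same pattern: generator_utils.maybe_download_from_drive(directory, filename, url) downloads the file from Google Drive only if it is not already present in directory and returns the local path, and the caller then extracts the archive behind a tf.gfile.Exists guard so repeated runs skip both steps. Below is a minimal, self-contained sketch of that pattern; the archive name, Drive URL, and the tensor2tensor import path are assumptions standing in for a real problem's constants (TensorFlow 1.x style, matching the examples).

import os
import tarfile

import tensorflow as tf
from tensor2tensor.data_generators import generator_utils

# Hypothetical placeholders; a real problem defines its own constants.
_ARCHIVE_NAME = "my_corpus.tar.gz"
_DRIVE_URL = "https://drive.google.com/uc?export=download&id=FILE_ID"


def maybe_fetch_and_extract(tmp_dir):
  """Download the archive from Drive and extract it, skipping work already done."""
  extracted_dir = os.path.join(tmp_dir, "my_corpus")
  if not tf.gfile.Exists(extracted_dir):
    archive_path = generator_utils.maybe_download_from_drive(
        tmp_dir, _ARCHIVE_NAME, _DRIVE_URL)
    with tarfile.open(archive_path, "r:gz") as archive_tar:
      archive_tar.extractall(tmp_dir)
  return extracted_dir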
Example #2
def _get_celeba(directory):
    """Download and extract CELEBA to directory unless it is there."""
    # path = os.path.join(directory, _CELEBA_NAME)
    path = generator_utils.maybe_download_from_drive(directory, _CELEBA_NAME,
                                                     _CELEBA_URL)
    if not tf.gfile.Exists(path):
        zipfile.ZipFile(path + ".zip", "r").extractall(directory)
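Example #2 opens the zip archive without a context manager, so the file handle is closed only when the ZipFile object is garbage-collected. A small variant of the same extraction step (a sketch keeping the example's own path convention) closes it deterministically:

    if not tf.gfile.Exists(path):
        with zipfile.ZipFile(path + ".zip", "r") as celeba_zip:
            celeba_zip.extractall(directory)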
Example #3
  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    # Thresholds in the number of characters for LM examples
    lo_thresh = 10
    up_thresh = 256*8

    if dataset_split == problem.DatasetSplit.TRAIN:
      (fname, fid) = self.train_name_id
    else:
      (fname, fid) = self.dev_name_id

    wikifiles = []
    url = "https://drive.google.com/uc?export=download&id=" + fid
    download_path = generator_utils.maybe_download_from_drive(
        tmp_dir, fname, url)
    wiki_file = os.path.join(tmp_dir, fname[:-3])
    if not tf.gfile.Exists(wiki_file):
      generator_utils.gunzip_file(download_path, wiki_file)
    wikifiles.append(wiki_file)

    txt = ""
    for wiki_file in wikifiles:
      for line in tf.gfile.Open(wiki_file):
        line = line.strip()
        if len(txt) + len(line) > up_thresh:
          ret = txt
          txt = ""
          if len(ret) > lo_thresh and len(ret) < up_thresh:
            yield {"targets": ret}

        if not txt:
          txt = line
        else:
          txt = " ".join([txt, line])
Example #4
def _get_vqa_v2_image_feature_dataset(
    directory, feature_url, feature_filename="mscoco_feat.tar.gz"):
  """Extract the VQA V2 feature data set to directory unless it's there."""
  feature_file = generator_utils.maybe_download_from_drive(
      directory, feature_filename, feature_url)
  with tarfile.open(feature_file, "r:gz") as feature_tar:
    feature_tar.extractall(directory)
Example #5
def _get_vqa_v2_image_feature_dataset(
    directory, feature_url, feature_filename="mscoco_feat.tar.gz"):
  """Extract the VQA V2 feature data set to directory unless it's there."""
  feature_file = generator_utils.maybe_download_from_drive(
      directory, feature_filename, feature_url)
  with tarfile.open(feature_file, "r:gz") as feature_tar:
    feature_tar.extractall(directory)
Example #6
def _get_vqa_v2_annotations(directory,
                            annotation_url,
                            annotation_filename="vqa_v2.tar.gz"):
  """Extract the VQA V2 annotation files to directory unless it's there."""
  annotation_file = generator_utils.maybe_download_from_drive(
      directory, annotation_filename, annotation_url)
  with tarfile.open(annotation_file, "r:gz") as annotation_tar:
    annotation_tar.extractall(directory)
Example #7
def _get_vqa_v2_annotations(directory,
                            annotation_url,
                            annotation_filename="vqa_v2.tar.gz"):
    """Extract the VQA V2 annotation files to directory unless it's there."""
    annotation_file = generator_utils.maybe_download_from_drive(
        directory, annotation_filename, annotation_url)
    with tarfile.open(annotation_file, "r:gz") as annotation_tar:
        annotation_tar.extractall(directory)
Example #8
def generator_samples(tmp_dir):
    """Generator for the dataset samples.

  If not present, download and extract the dataset.

  Args:
    tmp_dir: path to the directory where to download the dataset.

  Yields:
    A CodingPbInfo object containing the next challenge information.
  """
    # Step 1: Download the dataset if necessary
    data_zip_path = generator_utils.maybe_download_from_drive(
        directory=tmp_dir,
        filename=_DATASET_FILENAME,
        url=_DATASET_URL,
    )
    tf.logging.info("Data downloaded in: {}".format(data_zip_path))

    # Step 2: Extract the dataset
    # We could deduce _DATASET_PB_PATH from the zip file (instead of
    # hardcoded path)
    data_rootdir = os.path.join(tmp_dir, _DATASET_PB_PATH)
    if not tf.gfile.Exists(data_rootdir):
        with zipfile.ZipFile(data_zip_path, "r") as corpus_zip:
            corpus_zip.extractall(tmp_dir)
        # We could remove the extracted __MACOSX folder
        tf.logging.info("Data extracted in: {}".format(tmp_dir))
    else:
        tf.logging.info("Data already extracted in: {}".format(tmp_dir))

    # Step 3: Extract the problem list from the extracted folder
    def contains_samples(subdir, dirs, files):  # pylint: disable=unused-argument
        """Check that the folder contains a problem."""
        return (_DESC_DIR_NAME in dirs and _CODE_PY_DIR_NAME in dirs)

    def next_sample(subdir, dirs, files):  # pylint: disable=unused-argument
        """Return the filenames of the problem."""
        # More could be extracted (like the expected inputs/outputs
        # pairs, the problem difficulty, the names of the algorithmic techniques
        # needed)
        desc_file = os.path.join(subdir, _DESC_DIR_NAME, "description.txt")
        code_rootdir = os.path.join(subdir, _CODE_PY_DIR_NAME)
        code_files = tf.gfile.Glob(os.path.join(code_rootdir, "*.txt"))
        return CodingPbInfo(desc_file=desc_file, code_files=code_files)

    # The dataset contains problems from two different sources (CodeChef
    # and CodeForces). Due to the limited number of samples, all problems from
    # both sources are merged.
    for w in tf.gfile.Walk(data_rootdir):
        if contains_samples(*w):
            yield next_sample(*w)
Example #9
def _maybe_download_corpora(tmp_dir, dataset_split):
  """Download corpora if necessary and unzip them.

  Args:
    tmp_dir: directory containing dataset.
    dataset_split: whether we're in train/dev/test mode.

  Returns:
    List of all files generated and path to file containing
      train/dev/test split info.
  """
  cnn_filename = "cnn_stories.tgz"
  cnn_finalpath = os.path.join(tmp_dir, "cnn/stories/")
  dailymail_filename = "dailymail_stories.tgz"
  dailymail_finalpath = os.path.join(tmp_dir, "dailymail/stories/")
  if not tf.gfile.Exists(cnn_finalpath):
    cnn_file = generator_utils.maybe_download_from_drive(
        tmp_dir, cnn_filename, _CNN_STORIES_DRIVE_URL)
    with tarfile.open(cnn_file, "r:gz") as cnn_tar:
      cnn_tar.extractall(tmp_dir)
  if not tf.gfile.Exists(dailymail_finalpath):
    dailymail_file = generator_utils.maybe_download_from_drive(
        tmp_dir, dailymail_filename, _DAILYMAIL_STORIES_DRIVE_URL)
    with tarfile.open(dailymail_file, "r:gz") as dailymail_tar:
      dailymail_tar.extractall(tmp_dir)

  cnn_files = tf.gfile.Glob(cnn_finalpath + "*")
  dailymail_files = tf.gfile.Glob(dailymail_finalpath + "*")
  all_files = cnn_files + dailymail_files

  if dataset_split == problem.DatasetSplit.TRAIN:
    urls_path = generator_utils.maybe_download(tmp_dir, "all_train.txt",
                                               _TRAIN_URLS)
  elif dataset_split == problem.DatasetSplit.EVAL:
    urls_path = generator_utils.maybe_download(tmp_dir, "all_val.txt",
                                               _DEV_URLS)
  else:
    urls_path = generator_utils.maybe_download(tmp_dir, "all_test.txt",
                                               _TEST_URLS)

  return all_files, urls_path
Example #10
def _get_wmt_ende_bpe_dataset(directory, filename):
  """Extract the WMT en-de corpus `filename` to directory unless it's there."""
  train_path = os.path.join(directory, filename)
  if not (tf.gfile.Exists(train_path + ".de") and
          tf.gfile.Exists(train_path + ".en")):
    url = ("https://drive.google.com/uc?export=download&id="
           "0B_bZck-ksdkpM25jRUN2X2UxMm8")
    corpus_file = generator_utils.maybe_download_from_drive(
        directory, "wmt16_en_de.tar.gz", url)
    with tarfile.open(corpus_file, "r:gz") as corpus_tar:
      corpus_tar.extractall(directory)
  return train_path
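Example #10 returns only the shared path prefix; callers append the ".en" and ".de" suffixes themselves. A hedged sketch of such a caller (hypothetical ende_bpe_pairs name, reading the two files in lockstep with tf.gfile.Open as the other examples do):

def ende_bpe_pairs(directory, filename):
  """Yield aligned English/German BPE lines from the extracted corpus."""
  train_path = _get_wmt_ende_bpe_dataset(directory, filename)
  with tf.gfile.Open(train_path + ".en") as en_file, \
      tf.gfile.Open(train_path + ".de") as de_file:
    for en_line, de_line in zip(en_file, de_file):
      yield {"inputs": en_line.strip(), "targets": de_line.strip()}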
Example #11
def _get_ltltstr_dataset(directory, filename):
  train_path = os.path.join(directory, filename)
  # Generate the stress label file from the ASCII stressed text if it is missing.
  if not tf.gfile.Exists(train_path + ".lt_str_lbl"):
    if (not tf.gfile.Exists(train_path + ".lt") or
        not tf.gfile.Exists(train_path + ".lt_str_ascii")):
      for url, _ in _LTLTSTR_TRAIN_DATASETS:
        corpus_file = generator_utils.maybe_download_from_drive(
            directory, url.split('/')[-1], url)
        with tarfile.open(corpus_file, "r:gz") as corpus_tar:
          corpus_tar.extractall(directory)

    stressed_text2stress(train_path + ".lt_str_ascii",
                         train_path + ".lt_str_lbl", _STRESS_CLASS_LABELS)
  return train_path
Example #12
def _get_wmt_ende_bpe_dataset(directory, filename):
  """Extract the WMT en-de corpus `filename` to directory unless it's there."""
  train_path = os.path.join(directory, filename)
  if not (tf.gfile.Exists(train_path + ".de") and
          tf.gfile.Exists(train_path + ".en")):
    url = ("https://drive.google.com/uc?export=download&id="
           "0B_bZck-ksdkpM25jRUN2X2UxMm8")
    corpus_file = generator_utils.maybe_download_from_drive(
        directory, "wmt16_en_de.tar.gz", url)
    with tarfile.open(corpus_file, "r:gz") as corpus_tar:
      corpus_tar.extractall(directory)
  return train_path
Example #13
    def testMaybeDownloadFromDrive(self):
        tmp_dir = self.get_temp_dir()
        (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
        tmp_file_name = os.path.basename(tmp_file_path)

        # Download the Google index page to the temporary file with the ".http" suffix.
        res_path = generator_utils.maybe_download_from_drive(
            tmp_dir, tmp_file_name + ".http", "http://drive.google.com")
        self.assertEqual(res_path, tmp_file_path + ".http")

        # Clean up.
        os.remove(tmp_file_path + ".http")
        os.remove(tmp_file_path)
Example #14
  def testMaybeDownloadFromDrive(self):
    tmp_dir = self.get_temp_dir()
    (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
    tmp_file_name = os.path.basename(tmp_file_path)

    # Download the Google index page to the temporary file with the ".http" suffix.
    res_path = generator_utils.maybe_download_from_drive(
        tmp_dir, tmp_file_name + ".http", "http://drive.google.com")
    self.assertEqual(res_path, tmp_file_path + ".http")

    # Clean up.
    os.remove(tmp_file_path + ".http")
    os.remove(tmp_file_path)
Example #15
def _get_num2text_dataset(directory, filename):
    """Extract the num2text corpus `filename` to directory unless it's there."""
    train_path = os.path.join(directory, filename)
    if not (tf.gfile.Exists(train_path + "_num_p8_v7.txt")
            and tf.gfile.Exists(train_path + "_txt_p8_v7.txt")):
        url = _NUM2TEXT_DATASETS[0][0]
        corpus_file = generator_utils.maybe_download_from_drive(
            directory, "num2text-p8_5-v7.tar.gz", url
        ) if "drive.google.com" in url else generator_utils.maybe_download(
            directory, "num2text-p8_5-v7.tar.gz", url)
        with tarfile.open(corpus_file, "r:gz") as corpus_tar:
            corpus_tar.extractall(directory)
    return train_path
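The num2text example above picks the downloader inline by checking for "drive.google.com" in the URL. When several datasets mix Drive links with plain HTTP hosts, the same check can live in a small helper; a sketch under a hypothetical maybe_download_any name:

def maybe_download_any(directory, filename, url):
  """Dispatch to the Drive-aware downloader only for Google Drive URLs."""
  if "drive.google.com" in url:
    return generator_utils.maybe_download_from_drive(directory, filename, url)
  return generator_utils.maybe_download(directory, filename, url)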
Example #16
def _get_vqa_v2_dataset(directory):
    """Extract the VQA V2 data set to directory unless it's there."""
    for url in _MSCOCO_IMAGE_URLS:
        filename = os.path.basename(url)
        download_url = os.path.join(_MSCOCO_ROOT_URL, url)
        path = generator_utils.maybe_download(directory, filename,
                                              download_url)
        unzip_dir = os.path.join(directory, filename.strip(".zip"))
        if not tf.gfile.Exists(unzip_dir):
            zipfile.ZipFile(path, "r").extractall(directory)

    annotation_file = generator_utils.maybe_download_from_drive(
        directory, "vqa_v2.tar.gz", _VQA_V2_ANNOTATION_URL)
    with tarfile.open(annotation_file, "r:gz") as annotation_tar:
        annotation_tar.extractall(directory)
Example #17
def _maybe_download_corpora(tmp_dir):
    """Download corpora if necessary and unzip them.

  Args:
    tmp_dir: directory containing dataset.

  Returns:
    filepath of the downloaded corpus file.
  """
    cnn_filename = "cnn_stories.tgz"
    cnn_finalpath = os.path.join(tmp_dir, "cnn/stories/")
    dailymail_filename = "dailymail_stories.tgz"
    dailymail_finalpath = os.path.join(tmp_dir, "dailymail/stories/")
    if not tf.gfile.Exists(cnn_finalpath):
        cnn_file = generator_utils.maybe_download_from_drive(
            tmp_dir, cnn_filename, _CNN_STORIES_DRIVE_URL)
        with tarfile.open(cnn_file, "r:gz") as cnn_tar:
            cnn_tar.extractall(tmp_dir)
    if not tf.gfile.Exists(dailymail_finalpath):
        dailymail_file = generator_utils.maybe_download_from_drive(
            tmp_dir, dailymail_filename, _DAILYMAIL_STORIES_DRIVE_URL)
        with tarfile.open(dailymail_file, "r:gz") as dailymail_tar:
            dailymail_tar.extractall(tmp_dir)
    return [cnn_finalpath, dailymail_finalpath]
Example #18
def _get_wmt_enit_bpe_dataset(directory, filename):
    """Extract the WMT en-it corpus `filename` to directory unless it's there."""
    train_path = os.path.join(directory, filename)
    if not (tf.gfile.Exists(train_path + ".it")
            and tf.gfile.Exists(train_path + ".en")):
        url = (
            "https://drive.google.com/open?id=1F3apMpe1lijbUzZfMNPBJlvURgV3Sx2t"
        )
        corpus_file = generator_utils.maybe_download_from_drive(
            directory, "News-Commentary-enit.tar", url)

        # convert zip to tar

        with tarfile.open(corpus_file, "r:gz") as corpus_tar:
            corpus_tar.extractall(directory)
    return train_path
Example #19
def _prepare_serchqa_data(tmp_dir):
    file_path = generator_utils.maybe_download_from_drive(
        tmp_dir, _FILENAME + '.zip', _DRIVE_URL)
    try:
        zip_ref = zipfile.ZipFile(file_path, 'r')
        zip_ref.extractall(tmp_dir)
        zip_ref.close()

    except zipfile.BadZipfile:
        tf.logging.error(
            "Please dowload the file 'SearchQA.zip' to the tmp_dir "
            "through address: "
            "https://drive.google.com/open?id=0B51lBZ1gs1XTR3BIVTJQWkREQU0")
        raise

    return os.path.join(tmp_dir, _FILENAME)
Example #20
  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    """Generate samples."""
    if dataset_split == problem.DatasetSplit.TRAIN:
      file_names_ids = self.train_names_ids
    elif dataset_split == problem.DatasetSplit.TEST:
      file_names_ids = self.test_names_ids
    else:
      file_names_ids = self.eval_names_ids

    wiki_generators = []
    for (fname, fid) in file_names_ids:
      url = "https://drive.google.com/uc?export=download&id=" + fid
      download_path = generator_utils.maybe_download_from_drive(
          tmp_dir, fname, url)
      wiki_file = os.path.join(tmp_dir, fname[:-3])
      if not tf.gfile.Exists(wiki_file):
        generator_utils.gunzip_file(download_path, wiki_file)
      wiki_generators.append(
          concat_generator(wiki_file, self.combine_characters_threshold))

    for example in mix_generators(wiki_generators):
      yield example
Example #21
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        """Generate samples."""
        if dataset_split == problem.DatasetSplit.TRAIN:
            file_names_ids = self.train_names_ids
        elif dataset_split == problem.DatasetSplit.TEST:
            file_names_ids = self.test_names_ids
        else:
            file_names_ids = self.eval_names_ids

        wiki_generators = []
        for (fname, fid) in file_names_ids:
            url = "https://drive.google.com/uc?export=download&id=" + fid
            download_path = generator_utils.maybe_download_from_drive(
                tmp_dir, fname, url)
            wiki_file = os.path.join(tmp_dir, fname[:-3])
            if not tf.gfile.Exists(wiki_file):
                generator_utils.gunzip_file(download_path, wiki_file)
            wiki_generators.append(
                concat_generator(wiki_file, self.combine_characters_threshold))

        for example in mix_generators(wiki_generators):
            yield example
Example #22
def generator_samples(tmp_dir, pb_cst):
  """Generator for the dataset samples.

  If not present, download and extract the dataset.

  Args:
    tmp_dir: path to the directory where to download the dataset.
    pb_cst: CodingPbConstants object defining paths

  Yields:
    A CodingPbInfo object containing the next challenge information.
  """
  # Step 1: Download the dataset if necessary
  data_zip_path = generator_utils.maybe_download_from_drive(
      directory=tmp_dir,
      filename=_DATASET_FILENAME,
      url=_DATASET_URL,
  )
  tf.logging.info("Data downloaded in: {}".format(data_zip_path))

  # Step 2: Extract the dataset
  # We could deduce _DATASET_PB_PATH from the zip file (instead of
  # hardcoded path)
  data_rootdir = os.path.join(tmp_dir, _DATASET_PB_PATH)
  if not tf.gfile.Exists(data_rootdir):
    with zipfile.ZipFile(data_zip_path, "r") as corpus_zip:
      corpus_zip.extractall(tmp_dir)
    # We could remove the extracted __MACOSX folder
    tf.logging.info("Data extracted in: {}".format(tmp_dir))
  else:
    tf.logging.info("Data already extracted in: {}".format(tmp_dir))

  # Step 3: Extract the problem list from the extracted folder
  def contains_samples(subdir, dirs, files):  # pylint: disable=unused-argument
    """Check that the folder contains a problem."""
    return (
        _DESC_DIR_NAME in dirs and
        pb_cst.code_dir_name in dirs
    )

  def next_sample(subdir, dirs, files):  # pylint: disable=unused-argument
    """Return the filenames of the problem."""
    # More could be extracted (like the expected inputs/outputs
    # pairs, the problem difficulty, the names of the algorithmic techniques
    # needed)
    desc_file = os.path.join(subdir, _DESC_DIR_NAME, "description.txt")
    code_files = []
    # As the dataset is noisy, the program deduces the language from the file
    # content.
    code_pattern = os.path.join(subdir, pb_cst.code_dir_name, "*.txt")
    for f in tf.gfile.Glob(code_pattern):
      with tf.gfile.GFile(f, mode="r") as target_file:
        # Hack to filter C++/Java files. In theory some python comments could
        # make the file be considered as C++ but in practice the chance of
        # getting a false negative is low.
        content = target_file.read()
        if not any(p in content for p in pb_cst.filter_patterns):
          code_files.append(f)
    return CodingPbInfo(
        desc_file=desc_file,
        code_files=code_files
    )

  # The dataset contains problems from two different sources (CodeChef
  # and CodeForces). Due to the limited number of samples, all problems from
  # both sources are merged.
  for w in tf.gfile.Walk(data_rootdir):
    if contains_samples(*w):
      yield next_sample(*w)
Example #23
    def generator(self, tmp_dir, how_many, start_from=0):
        """Image generator for CELEBA dataset.

    Args:
      tmp_dir: path to temporary storage directory.
      how_many: how many images and labels to generate.
      start_from: from which image to start.

    Yields:
      A dictionary representing the images with the following fields:
      * image/encoded: the string encoding the image as JPEG,
      * image/format: the string "jpeg" representing image format,
    """
        out_paths = []
        for fname, url in [self.IMG_DATA, self.LANDMARKS_DATA, self.ATTR_DATA]:
            path = generator_utils.maybe_download_from_drive(
                tmp_dir, fname, url)
            out_paths.append(path)

        img_path, landmarks_path, attr_path = out_paths  # pylint: disable=unbalanced-tuple-unpacking
        unzipped_folder = img_path[:-4]
        if not tf.gfile.Exists(unzipped_folder):
            zipfile.ZipFile(img_path, "r").extractall(tmp_dir)

        with tf.gfile.Open(landmarks_path) as f:
            landmarks_raw = f.read()

        with tf.gfile.Open(attr_path) as f:
            attr_raw = f.read()

        def process_landmarks(raw_data):
            landmarks = {}
            lines = raw_data.split("\n")
            headings = lines[1].strip().split()
            for line in lines[2:-1]:
                values = line.strip().split()
                img_name = values[0]
                landmark_values = [int(v) for v in values[1:]]
                landmarks[img_name] = landmark_values
            return landmarks, headings

        def process_attrs(raw_data):
            attrs = {}
            lines = raw_data.split("\n")
            headings = lines[1].strip().split()
            for line in lines[2:-1]:
                values = line.strip().split()
                img_name = values[0]
                attr_values = [int(v) for v in values[1:]]
                attrs[img_name] = attr_values
            return attrs, headings

        img_landmarks, _ = process_landmarks(landmarks_raw)
        img_attrs, _ = process_attrs(attr_raw)

        image_files = list(sorted(tf.gfile.Glob(unzipped_folder + "/*.jpg")))
        for filename in image_files[start_from:start_from + how_many]:
            img_name = os.path.basename(filename)
            landmarks = img_landmarks[img_name]
            attrs = img_attrs[img_name]

            with tf.gfile.Open(filename, "rb") as f:
                encoded_image_data = f.read()
                yield {
                    "image/encoded": [encoded_image_data],
                    "image/format": ["jpeg"],
                    "attributes": attrs,
                    "landmarks": landmarks,
                }
Example #24
  def generator(self, tmp_dir, how_many, start_from=0):
    """Image generator for CELEBA dataset.

    Args:
      tmp_dir: path to temporary storage directory.
      how_many: how many images and labels to generate.
      start_from: from which image to start.

    Yields:
      A dictionary representing the images with the following fields:
      * image/encoded: the string encoding the image as JPEG,
      * image/format: the string "jpeg" representing image format,
    """
    out_paths = []
    for fname, url in [self.IMG_DATA, self.LANDMARKS_DATA, self.ATTR_DATA]:
      path = generator_utils.maybe_download_from_drive(tmp_dir, fname, url)
      out_paths.append(path)

    img_path, landmarks_path, attr_path = out_paths  # pylint: disable=unbalanced-tuple-unpacking
    unzipped_folder = img_path[:-4]
    if not tf.gfile.Exists(unzipped_folder):
      zipfile.ZipFile(img_path, "r").extractall(tmp_dir)

    with tf.gfile.Open(landmarks_path) as f:
      landmarks_raw = f.read()

    with tf.gfile.Open(attr_path) as f:
      attr_raw = f.read()

    def process_landmarks(raw_data):
      landmarks = {}
      lines = raw_data.split("\n")
      headings = lines[1].strip().split()
      for line in lines[2:-1]:
        values = line.strip().split()
        img_name = values[0]
        landmark_values = [int(v) for v in values[1:]]
        landmarks[img_name] = landmark_values
      return landmarks, headings

    def process_attrs(raw_data):
      attrs = {}
      lines = raw_data.split("\n")
      headings = lines[1].strip().split()
      for line in lines[2:-1]:
        values = line.strip().split()
        img_name = values[0]
        attr_values = [int(v) for v in values[1:]]
        attrs[img_name] = attr_values
      return attrs, headings

    img_landmarks, _ = process_landmarks(landmarks_raw)
    img_attrs, _ = process_attrs(attr_raw)

    image_files = tf.gfile.Glob(unzipped_folder + "/*.jpg")
    for filename in image_files[start_from:start_from + how_many]:
      img_name = os.path.basename(filename)
      landmarks = img_landmarks[img_name]
      attrs = img_attrs[img_name]

      with tf.gfile.Open(filename, "rb") as f:
        encoded_image_data = f.read()
        yield {
            "image/encoded": [encoded_image_data],
            "image/format": ["jpeg"],
            "attributes": attrs,
            "landmarks": landmarks,
        }
Example #25
def generator_samples(tmp_dir, pb_cst):
  """Generator for the dataset samples.

  If not present, download and extract the dataset.

  Args:
    tmp_dir: path to the directory where to download the dataset.
    pb_cst: CodingPbConstants object defining paths

  Yields:
    A CodingPbInfo object containing the next challenge information.
  """
  # Step 1: Download the dataset if necessary
  data_zip_path = generator_utils.maybe_download_from_drive(
      directory=tmp_dir,
      filename=_DATASET_FILENAME,
      url=_DATASET_URL,
  )
  tf.logging.info("Data downloaded in: {}".format(data_zip_path))

  # Step 2: Extract the dataset
  # We could deduce _DATASET_PB_PATH from the zip file (instead of
  # hardcoded path)
  data_rootdir = os.path.join(tmp_dir, _DATASET_PB_PATH)
  if not tf.gfile.Exists(data_rootdir):
    with zipfile.ZipFile(data_zip_path, "r") as corpus_zip:
      corpus_zip.extractall(tmp_dir)
    # We could remove the extracted __MACOSX folder
    tf.logging.info("Data extracted in: {}".format(tmp_dir))
  else:
    tf.logging.info("Data already extracted in: {}".format(tmp_dir))

  # Step 3: Extract the problem list from the extracted folder
  def contains_samples(subdir, dirs, files):  # pylint: disable=unused-argument
    """Check that the folder contains a problem."""
    return (
        _DESC_DIR_NAME in dirs and
        pb_cst.code_dir_name in dirs
    )

  def next_sample(subdir, dirs, files):  # pylint: disable=unused-argument
    """Return the filenames of the problem."""
    # More could be extracted (like the expected inputs/outputs
    # pairs, the problem difficulty, the names of the algorithmic techniques
    # needed)
    desc_file = os.path.join(subdir, _DESC_DIR_NAME, "description.txt")
    code_files = []
    # As the dataset is noisy, the program deduces the language from the file
    # content.
    code_pattern = os.path.join(subdir, pb_cst.code_dir_name, "*.txt")
    for f in tf.gfile.Glob(code_pattern):
      with tf.gfile.GFile(f, mode="r") as target_file:
        # Hack to filter C++/Java files. In theory some python comments could
        # make the file be considered as C++ but in practice the chance of
        # getting a false negative is low.
        content = target_file.read()
        if not any(p in content for p in pb_cst.filter_patterns):
          code_files.append(f)
    return CodingPbInfo(
        desc_file=desc_file,
        code_files=code_files
    )

  # The dataset contains problems from two different sources (CodeChef
  # and CodeForces). Due to the limited number of samples, all problems from
  # both sources are merged.
  for w in tf.gfile.Walk(data_rootdir):
    if contains_samples(*w):
      yield next_sample(*w)