def _maybe_download_corpora(tmp_dir, is_training):
  """Download corpora if necessary and unzip them.

  Args:
    tmp_dir: directory containing dataset.
    is_training: whether we are in training mode.

  Returns:
    List of all files generated and path to file containing
      train/dev/test split info.
  """
  cnn_filename = "cnn_stories.tgz"
  cnn_finalpath = os.path.join(tmp_dir, "cnn/stories/")
  dailymail_filename = "dailymail_stories.tgz"
  dailymail_finalpath = os.path.join(tmp_dir, "dailymail/stories/")
  if not tf.gfile.Exists(cnn_finalpath):
    cnn_file = generator_utils.maybe_download_from_drive(
        tmp_dir, cnn_filename, _CNN_STORIES_DRIVE_URL)
    with tarfile.open(cnn_file, "r:gz") as cnn_tar:
      cnn_tar.extractall(tmp_dir)
  if not tf.gfile.Exists(dailymail_finalpath):
    dailymail_file = generator_utils.maybe_download_from_drive(
        tmp_dir, dailymail_filename, _DAILYMAIL_STORIES_DRIVE_URL)
    with tarfile.open(dailymail_file, "r:gz") as dailymail_tar:
      dailymail_tar.extractall(tmp_dir)

  cnn_files = tf.gfile.Glob(cnn_finalpath + "*")
  dailymail_files = tf.gfile.Glob(dailymail_finalpath + "*")
  all_files = cnn_files + dailymail_files

  if is_training:
    urls_path = generator_utils.maybe_download(
        tmp_dir, "all_train.txt", _TRAIN_URLS)
  else:
    urls_path = generator_utils.maybe_download(
        tmp_dir, "all_val.txt", _DEV_URLS)

  return all_files, urls_path
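# All of the snippets in this section lean on
# generator_utils.maybe_download_from_drive(directory, filename, url).
# The sketch below is NOT tensor2tensor's actual implementation; it is a
# minimal restatement of the contract the callers rely on (return the local
# path, skip the download when the file exists, and handle Google Drive's
# large-file confirmation token). The `requests`-based token handling is an
# assumption.
import os

import requests
import tensorflow as tf


def maybe_download_from_drive_sketch(directory, filename, url):
  """Download `url` to `directory/filename` unless it is already there."""
  if not tf.gfile.Exists(directory):
    tf.gfile.MakeDirs(directory)
  filepath = os.path.join(directory, filename)
  if tf.gfile.Exists(filepath):
    return filepath  # Callers rely on this early return to skip re-downloads.
  session = requests.Session()
  response = session.get(url, stream=True)
  # Large Drive files first answer with an HTML warning page plus a cookie
  # token; re-request with confirm=<token> to get the actual payload.
  token = next((v for k, v in response.cookies.items()
                if k.startswith("download_warning")), None)
  if token:
    response = session.get(url, params={"confirm": token}, stream=True)
  with open(filepath, "wb") as f:
    for chunk in response.iter_content(chunk_size=1 << 15):
      if chunk:
        f.write(chunk)
  return filepath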
def _get_celeba(directory):
  """Download and extract CELEBA to directory unless it is there."""
  # `path` is the extracted folder; the zip archive sits alongside it.
  path = os.path.join(directory, _CELEBA_NAME)
  if not tf.gfile.Exists(path):
    zip_path = generator_utils.maybe_download_from_drive(
        directory, _CELEBA_NAME + ".zip", _CELEBA_URL)
    with zipfile.ZipFile(zip_path, "r") as celeba_zip:
      celeba_zip.extractall(directory)
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  # Thresholds on the number of characters for LM examples.
  lo_thresh = 10
  up_thresh = 256 * 8

  if dataset_split == problem.DatasetSplit.TRAIN:
    (fname, fid) = self.train_name_id
  else:
    (fname, fid) = self.dev_name_id

  wikifiles = []
  url = "https://drive.google.com/uc?export=download&id=" + fid
  download_path = generator_utils.maybe_download_from_drive(
      tmp_dir, fname, url)
  wiki_file = os.path.join(tmp_dir, fname[:-3])
  if not tf.gfile.Exists(wiki_file):
    generator_utils.gunzip_file(download_path, wiki_file)
  wikifiles.append(wiki_file)

  txt = ""
  for wiki_file in wikifiles:
    for line in tf.gfile.Open(wiki_file):
      line = line.strip()
      if len(txt) + len(line) > up_thresh:
        ret = txt
        txt = ""
        if len(ret) > lo_thresh and len(ret) < up_thresh:
          yield {"targets": ret}
      if not txt:
        txt = line
      else:
        txt = " ".join([txt, line])
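# The accumulate-then-flush loop above is easy to misread: a chunk is emitted
# only when the *next* line would overflow `up_thresh`, and the final partial
# buffer is silently dropped. A condensed, self-contained restatement with
# hypothetical toy thresholds (not part of the original code):
def chunk_lines(lines, lo_thresh=10, up_thresh=40):
  txt = ""
  for line in lines:
    line = line.strip()
    if len(txt) + len(line) > up_thresh:
      ret, txt = txt, ""
      # Only mid-sized chunks survive the length filter.
      if lo_thresh < len(ret) < up_thresh:
        yield ret
    txt = line if not txt else " ".join([txt, line])


# Five 11-char lines yield one 35-char chunk; the trailing 23-char buffer
# is dropped.
print(list(chunk_lines(["hello world"] * 5)))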
def _get_vqa_v2_image_feature_dataset(
    directory, feature_url, feature_filename="mscoco_feat.tar.gz"):
  """Extract the VQA V2 feature data set to directory unless it's there."""
  feature_file = generator_utils.maybe_download_from_drive(
      directory, feature_filename, feature_url)
  with tarfile.open(feature_file, "r:gz") as feature_tar:
    feature_tar.extractall(directory)
def _get_vqa_v2_annotations(directory,
                            annotation_url,
                            annotation_filename="vqa_v2.tar.gz"):
  """Extract the VQA V2 annotation files to directory unless it's there."""
  annotation_file = generator_utils.maybe_download_from_drive(
      directory, annotation_filename, annotation_url)
  with tarfile.open(annotation_file, "r:gz") as annotation_tar:
    annotation_tar.extractall(directory)
def generator_samples(tmp_dir):
  """Generator for the dataset samples.

  If not present, download and extract the dataset.

  Args:
    tmp_dir: path to the directory where to download the dataset.

  Yields:
    A CodingPbInfo object containing the next challenge information.
  """
  # Step 1: Download the dataset (if necessary).
  data_zip_path = generator_utils.maybe_download_from_drive(
      directory=tmp_dir,
      filename=_DATASET_FILENAME,
      url=_DATASET_URL,
  )
  tf.logging.info("Data downloaded in: {}".format(data_zip_path))

  # Step 2: Extract the dataset.
  # We could deduce _DATASET_PB_PATH from the zip file (instead of
  # a hardcoded path).
  data_rootdir = os.path.join(tmp_dir, _DATASET_PB_PATH)

  if not tf.gfile.Exists(data_rootdir):
    with zipfile.ZipFile(data_zip_path, "r") as corpus_zip:
      corpus_zip.extractall(tmp_dir)
    # We could remove the extracted __MACOSX folder.
    tf.logging.info("Data extracted in: {}".format(tmp_dir))
  else:
    tf.logging.info("Data already extracted in: {}".format(tmp_dir))

  # Step 3: Extract the problem list from the extracted folder.
  def contains_samples(subdir, dirs, files):  # pylint: disable=unused-argument
    """Check that the folder contains a problem."""
    return (_DESC_DIR_NAME in dirs and _CODE_PY_DIR_NAME in dirs)

  def next_sample(subdir, dirs, files):  # pylint: disable=unused-argument
    """Return the filenames of the problem."""
    # More could be extracted (like the expected input/output pairs, the
    # problem difficulty, and the names of the algorithmic techniques needed).
    desc_file = os.path.join(subdir, _DESC_DIR_NAME, "description.txt")
    code_rootdir = os.path.join(subdir, _CODE_PY_DIR_NAME)
    code_files = tf.gfile.Glob(os.path.join(code_rootdir, "*.txt"))
    return CodingPbInfo(desc_file=desc_file, code_files=code_files)

  # The dataset contains problems from two different sources (CodeChef
  # and CodeForces). Due to the limited number of samples, all problems from
  # both sources are merged.
  for w in tf.gfile.Walk(data_rootdir):
    if contains_samples(*w):
      yield next_sample(*w)
def _maybe_download_corpora(tmp_dir, dataset_split):
  """Download corpora if necessary and unzip them.

  Args:
    tmp_dir: directory containing dataset.
    dataset_split: whether we're in train/dev/test mode.

  Returns:
    List of all files generated and path to file containing
      train/dev/test split info.
  """
  cnn_filename = "cnn_stories.tgz"
  cnn_finalpath = os.path.join(tmp_dir, "cnn/stories/")
  dailymail_filename = "dailymail_stories.tgz"
  dailymail_finalpath = os.path.join(tmp_dir, "dailymail/stories/")
  if not tf.gfile.Exists(cnn_finalpath):
    cnn_file = generator_utils.maybe_download_from_drive(
        tmp_dir, cnn_filename, _CNN_STORIES_DRIVE_URL)
    with tarfile.open(cnn_file, "r:gz") as cnn_tar:
      cnn_tar.extractall(tmp_dir)
  if not tf.gfile.Exists(dailymail_finalpath):
    dailymail_file = generator_utils.maybe_download_from_drive(
        tmp_dir, dailymail_filename, _DAILYMAIL_STORIES_DRIVE_URL)
    with tarfile.open(dailymail_file, "r:gz") as dailymail_tar:
      dailymail_tar.extractall(tmp_dir)

  cnn_files = tf.gfile.Glob(cnn_finalpath + "*")
  dailymail_files = tf.gfile.Glob(dailymail_finalpath + "*")
  all_files = cnn_files + dailymail_files

  if dataset_split == problem.DatasetSplit.TRAIN:
    urls_path = generator_utils.maybe_download(
        tmp_dir, "all_train.txt", _TRAIN_URLS)
  elif dataset_split == problem.DatasetSplit.EVAL:
    urls_path = generator_utils.maybe_download(
        tmp_dir, "all_val.txt", _DEV_URLS)
  else:
    urls_path = generator_utils.maybe_download(
        tmp_dir, "all_test.txt", _TEST_URLS)

  return all_files, urls_path
def _get_wmt_ende_bpe_dataset(directory, filename):
  """Extract the WMT en-de corpus `filename` to directory unless it's there."""
  train_path = os.path.join(directory, filename)
  if not (tf.gfile.Exists(train_path + ".de") and
          tf.gfile.Exists(train_path + ".en")):
    url = ("https://drive.google.com/uc?export=download&id="
           "0B_bZck-ksdkpM25jRUN2X2UxMm8")
    corpus_file = generator_utils.maybe_download_from_drive(
        directory, "wmt16_en_de.tar.gz", url)
    with tarfile.open(corpus_file, "r:gz") as corpus_tar:
      corpus_tar.extractall(directory)
  return train_path
def _get_ltltstr_dataset(directory, filename):
  """Download the stressed-text corpus `filename` and build stress labels."""
  train_path = os.path.join(directory, filename)
  if not tf.gfile.Exists(train_path + ".lt_str_lbl"):
    # Generate the stress label file from the ASCII stressed text.
    if (not tf.gfile.Exists(train_path + ".lt") or
        not tf.gfile.Exists(train_path + ".lt_str_ascii")):
      for url, _ in _LTLTSTR_TRAIN_DATASETS:
        corpus_file = generator_utils.maybe_download_from_drive(
            directory, url.split("/")[-1], url)
        with tarfile.open(corpus_file, "r:gz") as corpus_tar:
          corpus_tar.extractall(directory)
    stressed_text2stress(train_path + ".lt_str_ascii",
                         train_path + ".lt_str_lbl",
                         _STRESS_CLASS_LABELS)
  return train_path
def testMaybeDownloadFromDrive(self):
  tmp_dir = self.get_temp_dir()
  (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
  tmp_file_name = os.path.basename(tmp_file_path)

  # Download the Google index page to the temporary file <name>.http.
  res_path = generator_utils.maybe_download_from_drive(
      tmp_dir, tmp_file_name + ".http", "http://drive.google.com")
  self.assertEqual(res_path, tmp_file_path + ".http")

  # Clean up.
  os.remove(tmp_file_path + ".http")
  os.remove(tmp_file_path)
def _get_num2text_dataset(directory, filename):
  """Extract the num2text corpus `filename` to directory unless it's there."""
  train_path = os.path.join(directory, filename)
  if not (tf.gfile.Exists(train_path + "_num_p8_v7.txt") and
          tf.gfile.Exists(train_path + "_txt_p8_v7.txt")):
    url = _NUM2TEXT_DATASETS[0][0]
    # Google Drive links need the Drive-specific download helper.
    if "drive.google.com" in url:
      corpus_file = generator_utils.maybe_download_from_drive(
          directory, "num2text-p8_5-v7.tar.gz", url)
    else:
      corpus_file = generator_utils.maybe_download(
          directory, "num2text-p8_5-v7.tar.gz", url)
    with tarfile.open(corpus_file, "r:gz") as corpus_tar:
      corpus_tar.extractall(directory)
  return train_path
def _get_vqa_v2_dataset(directory):
  """Extract the VQA V2 data set to directory unless it's there."""
  for url in _MSCOCO_IMAGE_URLS:
    filename = os.path.basename(url)
    download_url = os.path.join(_MSCOCO_ROOT_URL, url)
    path = generator_utils.maybe_download(directory, filename, download_url)
    # Note: str.strip(".zip") strips *characters*, not the suffix; slice off
    # the extension instead.
    unzip_dir = os.path.join(directory, filename[:-len(".zip")])
    if not tf.gfile.Exists(unzip_dir):
      zipfile.ZipFile(path, "r").extractall(directory)

  annotation_file = generator_utils.maybe_download_from_drive(
      directory, "vqa_v2.tar.gz", _VQA_V2_ANNOTATION_URL)
  with tarfile.open(annotation_file, "r:gz") as annotation_tar:
    annotation_tar.extractall(directory)
def _maybe_download_corpora(tmp_dir):
  """Download corpora if necessary and unzip them.

  Args:
    tmp_dir: directory containing dataset.

  Returns:
    List of paths to the extracted CNN and Daily Mail story directories.
  """
  cnn_filename = "cnn_stories.tgz"
  cnn_finalpath = os.path.join(tmp_dir, "cnn/stories/")
  dailymail_filename = "dailymail_stories.tgz"
  dailymail_finalpath = os.path.join(tmp_dir, "dailymail/stories/")
  if not tf.gfile.Exists(cnn_finalpath):
    cnn_file = generator_utils.maybe_download_from_drive(
        tmp_dir, cnn_filename, _CNN_STORIES_DRIVE_URL)
    with tarfile.open(cnn_file, "r:gz") as cnn_tar:
      cnn_tar.extractall(tmp_dir)
  if not tf.gfile.Exists(dailymail_finalpath):
    dailymail_file = generator_utils.maybe_download_from_drive(
        tmp_dir, dailymail_filename, _DAILYMAIL_STORIES_DRIVE_URL)
    with tarfile.open(dailymail_file, "r:gz") as dailymail_tar:
      dailymail_tar.extractall(tmp_dir)
  return [cnn_finalpath, dailymail_finalpath]
def _get_wmt_enit_bpe_dataset(directory, filename):
  """Extract the WMT en-it corpus `filename` to directory unless it's there."""
  train_path = os.path.join(directory, filename)
  if not (tf.gfile.Exists(train_path + ".it") and
          tf.gfile.Exists(train_path + ".en")):
    url = "https://drive.google.com/open?id=1F3apMpe1lijbUzZfMNPBJlvURgV3Sx2t"
    # The hosted archive was converted from zip to tar.
    corpus_file = generator_utils.maybe_download_from_drive(
        directory, "News-Commentary-enit.tar", url)
    with tarfile.open(corpus_file, "r:gz") as corpus_tar:
      corpus_tar.extractall(directory)
  return train_path
def _prepare_serchqa_data(tmp_dir):
  file_path = generator_utils.maybe_download_from_drive(
      tmp_dir, _FILENAME + ".zip", _DRIVE_URL)
  try:
    with zipfile.ZipFile(file_path, "r") as zip_ref:
      zip_ref.extractall(tmp_dir)
  except zipfile.BadZipfile:
    tf.logging.error(
        "Please download the file 'SearchQA.zip' to the tmp_dir "
        "through address: "
        "https://drive.google.com/open?id=0B51lBZ1gs1XTR3BIVTJQWkREQU0")
    raise
  return os.path.join(tmp_dir, _FILENAME)
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  """Generate samples."""
  if dataset_split == problem.DatasetSplit.TRAIN:
    file_names_ids = self.train_names_ids
  elif dataset_split == problem.DatasetSplit.TEST:
    file_names_ids = self.test_names_ids
  else:
    file_names_ids = self.eval_names_ids

  wiki_generators = []
  for (fname, fid) in file_names_ids:
    url = "https://drive.google.com/uc?export=download&id=" + fid
    download_path = generator_utils.maybe_download_from_drive(
        tmp_dir, fname, url)
    wiki_file = os.path.join(tmp_dir, fname[:-3])
    if not tf.gfile.Exists(wiki_file):
      generator_utils.gunzip_file(download_path, wiki_file)
    wiki_generators.append(
        concat_generator(wiki_file, self.combine_characters_threshold))

  for example in mix_generators(wiki_generators):
    yield example
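# `concat_generator` and `mix_generators` are defined elsewhere in the module.
# From their use above they plausibly behave as sketched below: the first
# yields {"targets": ...} chunks of bounded length from one file, the second
# round-robins across per-file generators so shards are interleaved. This is
# an assumption-labeled sketch, not the module's actual code.
def concat_generator(filename, up_threshold, low_threshold=10):
  """Yield lines of `filename` concatenated up to `up_threshold` chars."""
  txt = ""
  for line in tf.gfile.Open(filename):
    line = line.strip()
    if len(txt) + len(line) + 1 >= up_threshold:
      ret, txt = txt, ""
      # Skip very short leftovers to avoid noisy examples.
      if low_threshold < len(ret) < up_threshold:
        yield {"targets": ret}
    txt = line if not txt else " ".join([txt, line])


def mix_generators(generator_list):
  """Round-robin over the given generators until all are exhausted."""
  iterators = [iter(g) for g in generator_list]
  while iterators:
    alive = []
    for it in iterators:
      try:
        yield next(it)
        alive.append(it)
      except StopIteration:
        pass
    iterators = alive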
def generator_samples(tmp_dir, pb_cst):
  """Generator for the dataset samples.

  If not present, download and extract the dataset.

  Args:
    tmp_dir: path to the directory where to download the dataset.
    pb_cst: CodingPbConstants object defining paths.

  Yields:
    A CodingPbInfo object containing the next challenge information.
  """
  # Step 1: Download the dataset (if necessary).
  data_zip_path = generator_utils.maybe_download_from_drive(
      directory=tmp_dir,
      filename=_DATASET_FILENAME,
      url=_DATASET_URL,
  )
  tf.logging.info("Data downloaded in: {}".format(data_zip_path))

  # Step 2: Extract the dataset.
  # We could deduce _DATASET_PB_PATH from the zip file (instead of
  # a hardcoded path).
  data_rootdir = os.path.join(tmp_dir, _DATASET_PB_PATH)

  if not tf.gfile.Exists(data_rootdir):
    with zipfile.ZipFile(data_zip_path, "r") as corpus_zip:
      corpus_zip.extractall(tmp_dir)
    # We could remove the extracted __MACOSX folder.
    tf.logging.info("Data extracted in: {}".format(tmp_dir))
  else:
    tf.logging.info("Data already extracted in: {}".format(tmp_dir))

  # Step 3: Extract the problem list from the extracted folder.
  def contains_samples(subdir, dirs, files):  # pylint: disable=unused-argument
    """Check that the folder contains a problem."""
    return (_DESC_DIR_NAME in dirs and pb_cst.code_dir_name in dirs)

  def next_sample(subdir, dirs, files):  # pylint: disable=unused-argument
    """Return the filenames of the problem."""
    # More could be extracted (like the expected input/output pairs, the
    # problem difficulty, and the names of the algorithmic techniques needed).
    desc_file = os.path.join(subdir, _DESC_DIR_NAME, "description.txt")
    code_files = []
    # As the dataset is noisy, the program deduces the language from the file
    # content.
    code_pattern = os.path.join(subdir, pb_cst.code_dir_name, "*.txt")
    for f in tf.gfile.Glob(code_pattern):
      with tf.gfile.GFile(f, mode="r") as target_file:
        # Hack to filter C++/Java files. In theory some Python comments could
        # make a file be considered as C++, but in practice the chance of
        # getting a false negative is low.
        content = target_file.read()
        if not any(p in content for p in pb_cst.filter_patterns):
          code_files.append(f)
    return CodingPbInfo(desc_file=desc_file, code_files=code_files)

  # The dataset contains problems from two different sources (CodeChef
  # and CodeForces). Due to the limited number of samples, all problems from
  # both sources are merged.
  for w in tf.gfile.Walk(data_rootdir):
    if contains_samples(*w):
      yield next_sample(*w)
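# Hypothetical driver for generator_samples: `my_tmp_dir` and `my_pb_cst` are
# placeholders for the caller's temp directory and CodingPbConstants instance.
for i, pb_info in enumerate(generator_samples(my_tmp_dir, my_pb_cst)):
  if i >= 3:
    break  # Preview only the first few problems.
  with tf.gfile.GFile(pb_info.desc_file, mode="r") as f:
    first_line = f.readline().strip()
  tf.logging.info("Problem %d: %s (%d candidate solutions)",
                  i, first_line, len(pb_info.code_files))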
def generator(self, tmp_dir, how_many, start_from=0):
  """Image generator for CELEBA dataset.

  Args:
    tmp_dir: path to temporary storage directory.
    how_many: how many images and labels to generate.
    start_from: from which image to start.

  Yields:
    A dictionary representing the images with the following fields:
    * image/encoded: the string encoding the image as JPEG,
    * image/format: the string "jpeg" representing image format,
    * attributes: the list of integer attribute values for the image,
    * landmarks: the list of integer landmark coordinates for the image.
  """
  out_paths = []
  for fname, url in [self.IMG_DATA, self.LANDMARKS_DATA, self.ATTR_DATA]:
    path = generator_utils.maybe_download_from_drive(tmp_dir, fname, url)
    out_paths.append(path)
  img_path, landmarks_path, attr_path = out_paths  # pylint: disable=unbalanced-tuple-unpacking
  unzipped_folder = img_path[:-4]
  if not tf.gfile.Exists(unzipped_folder):
    zipfile.ZipFile(img_path, "r").extractall(tmp_dir)

  with tf.gfile.Open(landmarks_path) as f:
    landmarks_raw = f.read()

  with tf.gfile.Open(attr_path) as f:
    attr_raw = f.read()

  def process_landmarks(raw_data):
    landmarks = {}
    lines = raw_data.split("\n")
    headings = lines[1].strip().split()
    for line in lines[2:-1]:
      values = line.strip().split()
      img_name = values[0]
      landmark_values = [int(v) for v in values[1:]]
      landmarks[img_name] = landmark_values
    return landmarks, headings

  def process_attrs(raw_data):
    attrs = {}
    lines = raw_data.split("\n")
    headings = lines[1].strip().split()
    for line in lines[2:-1]:
      values = line.strip().split()
      img_name = values[0]
      attr_values = [int(v) for v in values[1:]]
      attrs[img_name] = attr_values
    return attrs, headings

  img_landmarks, _ = process_landmarks(landmarks_raw)
  img_attrs, _ = process_attrs(attr_raw)

  image_files = list(sorted(tf.gfile.Glob(unzipped_folder + "/*.jpg")))
  for filename in image_files[start_from:start_from + how_many]:
    img_name = os.path.basename(filename)
    landmarks = img_landmarks[img_name]
    attrs = img_attrs[img_name]
    with tf.gfile.Open(filename, "rb") as f:
      encoded_image_data = f.read()
    yield {
        "image/encoded": [encoded_image_data],
        "image/format": ["jpeg"],
        "attributes": attrs,
        "landmarks": landmarks,
    }
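# Both nested parsers assume the standard CelebA metadata layout: line 0 is
# the image count, line 1 the column headings, and every following row is an
# image name followed by integer values (hence `lines[2:-1]`, which also
# drops the trailing empty line). A toy check with synthetic data, assuming
# process_attrs were lifted to module scope:
toy_attr_raw = "\n".join([
    "2",                   # image count
    "Smiling Young",       # attribute headings
    "000001.jpg  1 -1",    # image name, then +/-1 attribute values
    "000002.jpg -1  1",
    "",                    # trailing newline
])
attrs, headings = process_attrs(toy_attr_raw)
assert headings == ["Smiling", "Young"]
assert attrs["000001.jpg"] == [1, -1]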