def parsed_files_generator(path_to_dir_with_preprocessed_projects, train_test_valid, percent, start_from, dao):
    for file in os.listdir(os.path.join(path_to_dir_with_preprocessed_projects, train_test_valid)):
        # Skip hidden files and projects that were already processed.
        if file.startswith(".") or get_project_name(file) in dao.processed_projects_cache:
            continue
        if include_to_df(file, percent, start_from):
            yield file
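# A hedged usage sketch (not from the original code): driving the generator
# over one split of a preprocessed corpus. `SimpleNamespace` stands in for
# the real DAO, which only needs a `processed_projects_cache` attribute here;
# the path and parameter values are illustrative assumptions.
from types import SimpleNamespace

dao = SimpleNamespace(processed_projects_cache=set())
for filename in parsed_files_generator("/data/preprocessed", "train",
                                       percent=50.0, start_from=0.0, dao=dao):
    print(f"would parse chunk file: {filename}")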
def __init__(self, path, text_field, label_field, **kwargs):
    """Create a ContextsDataset instance given a path and fields.

    Arguments:
        path: Path to the dataset's highest level directory.
        text_field: The field that will be used for text data.
        label_field: The field that will be used for label data.
        Remaining keyword arguments: Passed to the constructor of data.Dataset.
    """
    threshold = kwargs.pop("threshold", 0.0)
    context_len = kwargs.pop("context_len", 0)
    data_params = kwargs.pop("data", None)

    path_to_ignored_projects = os.path.join(
        path, '..', '..', '..', f"{IGNORED_PROJECTS_FILE_NAME}.{threshold}")
    logger.info(f"Loading ignored projects from {path_to_ignored_projects} ...")
    ignored_projects_set = set(read_list(path_to_ignored_projects))

    fields = [('text', text_field), ('label', label_field)]
    examples = []

    for c_filename_before, c_filename_after, l_filename in file_mapper(path, ContextsDataset._get_pair, extension='label'):
        # Keep only files inside the requested chunk window.
        if not include_to_df(os.path.basename(l_filename), data_params.percent, data_params.start_from):
            continue

        proj_name = re.sub(rf"\.{ContextsDataset.LABEL_FILE_EXT}$", "", get_dir_and_file(l_filename))
        if proj_name in ignored_projects_set:
            continue

        c_file_before = None
        c_file_after = None
        l_file = None
        try:
            c_file_before = open(c_filename_before, 'r')
            c_file_after = open(c_filename_after, 'r')
            l_file = open(l_filename, 'r')
            # The three files are line-aligned: one context pair per label.
            for context_before, context_after, level in zip(c_file_before, c_file_after, l_file):
                level = level.rstrip('\n')
                if level:
                    context_for_prediction = ContextsDataset._get_context_for_prediction(
                        context_before, context_after, context_len, data_params.backwards)
                    example = data.Example.fromlist([context_for_prediction, level], fields)
                    examples.append(example)
        except FileNotFoundError:
            project_name = c_filename_before[:-len(ContextsDataset.FW_CONTEXTS_FILE_EXT)]
            logger.error(f"Project context not loaded: {project_name}")
            continue
        finally:
            if c_file_before is not None:
                c_file_before.close()
            if c_file_after is not None:
                c_file_after.close()
            if l_file is not None:
                l_file.close()

    if not examples:
        raise ValueError(
            f"Examples list is empty. (percent={data_params.percent}, start from={data_params.start_from})")

    random.shuffle(examples)
    logger.debug(f"Number of examples gathered from {path}: {len(examples)}")
    super(ContextsDataset, self).__init__(examples, fields, **kwargs)
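# A minimal construction sketch, assuming the legacy torchtext `Field` API
# (`torchtext.legacy.data` on newer releases). The path, threshold, and
# context_len values are illustrative; `data_params` mirrors the attributes
# the constructor above reads (percent, start_from, backwards).
from types import SimpleNamespace
from torchtext import data

text_field = data.Field(sequential=True, lower=True)
label_field = data.LabelField()
data_params = SimpleNamespace(percent=50.0, start_from=0.0, backwards=False)

dataset = ContextsDataset("/data/contexts/train",  # hypothetical dataset root
                          text_field, label_field,
                          threshold=0.0, context_len=10, data=data_params)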
def test_include_to_df_smoke_true(self):
    result = include_to_df('120_file', 50.0, 0.0)
    self.assertTrue(result)

def test_include_to_df_invalid_filename(self):
    with self.assertRaises(ValueError):
        include_to_df('file', 0.1, 99.9)

def test_include_to_df_invalid_start_from(self):
    with self.assertRaises(ValueError):
        include_to_df('file', 0.1, 150)

def test_include_to_df_invalid_percent(self):
    with self.assertRaises(ValueError):
        include_to_df('30_file', 101, 99.9)

def test_include_to_df_zero_percent(self):
    with self.assertRaises(ValueError):
        include_to_df('990_file', 0.0, 99.0)

def test_include_to_df_999(self):
    result = include_to_df('999_file', 0.1, 99.9)
    self.assertTrue(result)

def test_include_to_df_zero_chunk_false(self):
    result = include_to_df('0_file', 0.1, 0.1)
    self.assertFalse(result)

def test_include_to_df_zero_chunk_true(self):
    result = include_to_df('0_file', 0.1, 0.0)
    self.assertTrue(result)

def test_include_to_df_smoke_false(self):
    result = include_to_df('120_file', 50.0, 50.0)
    self.assertFalse(result)
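# The tests above pin down a contract for include_to_df: the filename must
# start with a numeric chunk id, percent must lie in (0, 100], start_from in
# [0, 100), and a chunk is kept when its position falls inside the window
# [start_from, start_from + percent). Below is a minimal sketch consistent
# with those tests, assuming chunk ids span 0..999; it is an illustration,
# not the project's actual implementation.
import re

CHUNK_COUNT = 1000  # assumption: chunk ids in filenames range over 0..999

def include_to_df(filename: str, percent: float, start_from: float) -> bool:
    match = re.match(r"^(\d+)_", filename)
    if match is None:
        raise ValueError(f"Filename must start with a numeric chunk id: {filename}")
    if not 0.0 < percent <= 100.0:
        raise ValueError(f"percent must be in (0, 100], got {percent}")
    if not 0.0 <= start_from < 100.0:
        raise ValueError(f"start_from must be in [0, 100), got {start_from}")
    # Map the chunk id onto the 0-100 percent scale and test window membership.
    position = int(match.group(1)) * 100.0 / CHUNK_COUNT
    return start_from <= position < start_from + percent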