import os


def parsed_files_generator(path_to_dir_with_preprocessed_projects,
                           train_test_valid, percent, start_from, dao):
    """Yield files from the given train/test/valid split directory, skipping
    hidden files, projects already present in the DAO's processed-projects
    cache, and files that fall outside the requested chunk window."""
    for file in os.listdir(
            os.path.join(path_to_dir_with_preprocessed_projects,
                         train_test_valid)):
        if file.startswith(".") or get_project_name(
                file) in dao.processed_projects_cache:
            continue
        if include_to_df(file, percent, start_from):
            yield file
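
Both this generator and the ContextsDataset constructor below filter files through include_to_df, which is not part of this listing. Judging from the unit tests at the end of this section, a minimal sketch could look as follows; the chunk-prefix convention and the 1000-chunk total are inferred from those tests, not confirmed by the source:

def include_to_df(filename, percent, start_from):
    """Decide whether a chunk-prefixed file (e.g. '120_file') falls inside the
    [start_from, start_from + percent) percentage window of chunks 0..999.

    Sketch only: semantics reverse-engineered from the tests below.
    """
    prefix = filename.split('_')[0]
    if not prefix.isdigit():
        raise ValueError(f"Filename must start with a chunk number: {filename}")
    if not 0.0 < percent <= 100.0 or not 0.0 <= start_from < 100.0 \
            or start_from + percent > 100.0:
        raise ValueError(f"Invalid window: percent={percent}, start_from={start_from}")
    chunk = int(prefix)
    # One percent corresponds to 10 chunks; round to avoid float drift
    # (e.g. 99.9 * 10 == 999.0000000000001 would wrongly exclude chunk 999).
    return round(start_from * 10) <= chunk < round((start_from + percent) * 10)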
Example #2
    def __init__(self, path, text_field, label_field, **kwargs):
        """Create an IMDB dataset instance given a path and fields.

        Arguments:
            path: Path to the dataset's highest-level directory.
            text_field: The field that will be used for text data.
            label_field: The field that will be used for label data.
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.
        """
        threshold = kwargs.pop("threshold", 0.0)
        context_len = kwargs.pop("context_len", 0)
        data_params = kwargs.pop("data", None)

        path_to_ignored_projects = os.path.join(path, '..', '..', '..', f"{IGNORED_PROJECTS_FILE_NAME}.{threshold}")
        logger.info(f"Loading ignored projects from {path_to_ignored_projects} ...")
        ignored_projects_set = set(read_list(path_to_ignored_projects))

        fields = [('text', text_field), ('label', label_field)]
        examples = []

        for c_filename_before, c_filename_after, l_filename in file_mapper(path, ContextsDataset._get_pair,
                                                                           extension='label'):
            if not include_to_df(os.path.basename(l_filename), data_params.percent, data_params.start_from):
                continue

            proj_name = re.sub(rf"\.{ContextsDataset.LABEL_FILE_EXT}$", "", get_dir_and_file(l_filename))
            if proj_name in ignored_projects_set:
                continue

            try:
                # Read the three parallel files line by line; each
                # (context_before, context_after, level) triple forms one example.
                with open(c_filename_before, 'r') as c_file_before, \
                        open(c_filename_after, 'r') as c_file_after, \
                        open(l_filename, 'r') as l_file:
                    for context_before, context_after, level in zip(c_file_before, c_file_after, l_file):
                        level = level.rstrip('\n')
                        if level:
                            context_for_prediction = ContextsDataset._get_context_for_prediction(context_before,
                                                                                                 context_after,
                                                                                                 context_len,
                                                                                                 data_params.backwards)
                            example = data.Example.fromlist([context_for_prediction, level], fields)
                            examples.append(example)
            except FileNotFoundError:
                project_name = c_filename_before[:-len(ContextsDataset.FW_CONTEXTS_FILE_EXT)]
                logger.error(f"Project context not loaded: {project_name}")
                continue

        if not examples:
            raise ValueError(
                f"Examples list is empty. (percent={data_params.percent}, start from={data_params.start_from})")

        random.shuffle(examples)
        logger.debug(f"Number of examples gathered from {path}: {len(examples)} ")
        super(ContextsDataset, self).__init__(examples, fields, **kwargs)
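
A hypothetical way to construct this dataset, assuming the legacy torchtext Field API that the data.Example.fromlist call above implies; DataParams is a stand-in for whatever object actually supplies percent, start_from, and backwards, and all paths and values here are illustrative only:

from collections import namedtuple
from torchtext import data  # legacy torchtext API, as assumed by the snippet above

DataParams = namedtuple('DataParams', ['percent', 'start_from', 'backwards'])

text_field = data.Field(lower=True)
label_field = data.Field(sequential=False)

# Hypothetical path and parameter values, for illustration only.
dataset = ContextsDataset('/path/to/contexts/train', text_field, label_field,
                          threshold=0.5, context_len=10,
                          data=DataParams(percent=50.0, start_from=0.0, backwards=False))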
    def test_include_to_df_smoke_true(self):
        result = include_to_df('120_file', 50.0, 0.0)

        self.assertTrue(result)

    def test_include_to_df_invalid_filename(self):
        with self.assertRaises(ValueError):
            include_to_df('file', 0.1, 99.9)

    def test_include_to_df_invalid_start_from(self):
        with self.assertRaises(ValueError):
            include_to_df('file', 0.1, 150)

    def test_include_to_df_invalid_percent(self):
        with self.assertRaises(ValueError):
            include_to_df('30_file', 101, 99.9)

    def test_include_to_df_zero_percent(self):
        with self.assertRaises(ValueError):
            include_to_df('990_file', 0.0, 99.0)

    def test_include_to_df_999(self):
        result = include_to_df('999_file', 0.1, 99.9)

        self.assertTrue(result)

    def test_include_to_df_zero_chunk_false(self):
        result = include_to_df('0_file', 0.1, 0.1)

        self.assertFalse(result)
Example #10
    def test_include_to_df_zero_chunk_true(self):
        result = include_to_df('0_file', 0.1, 0.0)

        self.assertTrue(result)
Example #11
    def test_include_to_df_smoke_false(self):
        result = include_to_df('120_file', 50.0, 50.0)

        self.assertFalse(result)
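
The test methods above are shown without their enclosing class; a minimal harness under the standard unittest module might look like this (the class name and import path are hypothetical):

import unittest

# Hypothetical import; adjust to wherever include_to_df actually lives.
# from dataprep.util import include_to_df


class IncludeToDfTest(unittest.TestCase):
    # ... paste the test methods above here ...
    pass


if __name__ == '__main__':
    unittest.main()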