def create_cases(case_creators, case_creators_picker, filename: str) -> Tuple[List[Optional[Tuple[List[str], List[str], bool]]], str]:
    """Build one case entry per line of *filename*.

    Each line is split on spaces and stripped of some log statements; if the
    log-statement placeholder token is present, a case creator chosen by
    ``case_creators_picker`` builds a case from the words, otherwise the entry
    is ``None`` (keeping the result aligned with the file's lines).

    :param case_creators: pool of case-creator objects to pick from.
    :param case_creators_picker: callable selecting one creator from the pool.
    :param filename: path of the file to parse.
    :return: tuple of (per-line cases, project-relative path of the file).
    """
    rel_path = get_dir_and_file(filename)
    cases: List[Optional[Tuple[List[str], List[str], bool]]] = []
    with open(filename, 'r') as source:
        for raw_line in source:
            words = remove_some_log_statements(raw_line.rstrip('\n').split(" "))
            if placeholders['log_statement'] in words:
                cases.append(case_creators_picker(case_creators).create_from(words))
            else:
                cases.append(None)
    return cases, rel_path
def calc_logged_stats(path_to_label_file):
    """Tally labeled vs. unlabeled lines in a per-project label file.

    Lines that are exactly ``'1'`` or ``'0'`` count toward ``WITH_LOGGING``;
    empty lines count toward ``NO_LOGGING``; any other content is treated as a
    corrupt file and raises.

    :param path_to_label_file: path to a ``.<LABEL_FILE_EXT>`` file.
    :return: ``(stats, project_name)`` where *stats* maps the two counters to
        counts and *project_name* is the label-file path with its extension
        stripped, or ``None`` if the file is empty.
    :raises AssertionError: on a line that is neither '1', '0', nor empty.
    """
    stats = defaultdict(int)
    with open(path_to_label_file, 'r') as f:
        for line in f:
            stripped_line = line.rstrip('\n')
            # NOTE(review): both '1' and '0' are counted as WITH_LOGGING here
            # (the label value itself is ignored) — confirm this is intended.
            if stripped_line in ['1', '0']:
                stats[WITH_LOGGING] += 1
            elif stripped_line == '':
                stats[NO_LOGGING] += 1
            else:
                raise AssertionError(
                    f"Invalid line: {stripped_line} in file: {path_to_label_file}"
                )
    if not stats:  # idiomatic emptiness test (was `stats == {}`)
        logger.warning(
            f"The project {path_to_label_file} contains no files. Skipping...")
        return None
    # rf-string: raw form avoids the invalid-escape warning for `\.`
    return stats, re.sub(rf"\.{ContextsDataset.LABEL_FILE_EXT}$", "",
                         get_dir_and_file(path_to_label_file))
def __init__(self, path, text_field, label_field, **kwargs):
    """Create an IMDB dataset instance given a path and fields.

    Arguments:
        path: Path to the dataset's highest level directory
        text_field: The field that will be used for text data.
        label_field: The field that will be used for label data.
        Remaining keyword arguments: Passed to the constructor of
            data.Dataset.

    Raises:
        ValueError: if no examples could be gathered from *path*.
    """
    threshold = kwargs.pop("threshold", 0.0)
    context_len = kwargs.pop("context_len", 0)
    data_params = kwargs.pop("data", None)

    path_to_ignored_projects = os.path.join(
        path, '..', '..', '..', f"{IGNORED_PROJECTS_FILE_NAME}.{threshold}")
    logger.info(
        f"Loading ignored projects from {path_to_ignored_projects} ...")
    ignored_projects_set = set(read_list(path_to_ignored_projects))

    fields = [('text', text_field), ('label', label_field)]
    examples = []
    for c_filename_before, c_filename_after, l_filename in file_mapper(
            path, ContextsDataset._get_pair, lambda fi: fi.endswith('label')):
        # Honor the configured data fraction before doing any file I/O.
        if not included_in_fraction(os.path.basename(l_filename),
                                    data_params.percent,
                                    data_params.start_from):
            continue
        # rf-string: raw form avoids the invalid-escape warning for `\.`
        proj_name = re.sub(rf"\.{ContextsDataset.LABEL_FILE_EXT}$", "",
                           get_dir_and_file(l_filename))
        if proj_name in ignored_projects_set:
            continue
        try:
            # Context managers guarantee all three handles are closed on
            # every path, replacing the original manual try/finally closes.
            with open(c_filename_before, 'r') as c_file_before, \
                    open(c_filename_after, 'r') as c_file_after, \
                    open(l_filename, 'r') as l_file:
                for context_before, context_after, level in zip(
                        c_file_before, c_file_after, l_file):
                    level = level.rstrip('\n')
                    if level:  # skip unlabeled (empty) lines
                        context_for_prediction = \
                            ContextsDataset._get_context_for_prediction(
                                context_before, context_after, context_len,
                                data_params.backwards)
                        examples.append(data.Example.fromlist(
                            [context_for_prediction, level], fields))
        except FileNotFoundError:
            project_name = c_filename_before[
                :-len(ContextsDataset.FW_CONTEXTS_FILE_EXT)]
            logger.error(f"Project context not loaded: {project_name}")
            continue

    if not examples:
        raise ValueError(
            f"Examples list is empty. (percent={data_params.percent}, start from={data_params.start_from})"
        )
    random.shuffle(examples)
    logger.debug(
        f"Number of examples gathered from {path}: {len(examples)} ")
    super(ContextsDataset, self).__init__(examples, fields, **kwargs)