def test_filecount(self):
    """The parsed SectLabel corpus should contain lines from exactly 40 files."""
    parsed = convert_sectlabel_to_json(SECTLABEL_FILENAME)
    lines = parsed["parse_sect"]
    # Collect the distinct file identifiers directly into a set.
    distinct_files = {line["file_no"] for line in lines}
    assert len(distinct_files) == 40  # number of files expected
def get_lines_labels(self, filename: str) -> (List[str], List[str]):
    """Load SectLabel lines and labels for this dataset's split.

    Parses ``filename`` with ``convert_sectlabel_to_json``, performs a
    stratified train/valid/test split, and returns the (texts, labels)
    pair for ``self.dataset_type``. In debug mode a random subsample of
    ``self.debug_dataset_proportion`` of the split is returned instead.

    :param filename: path to the raw SectLabel data file
    :return: tuple of (texts, labels), parallel lists of equal length
    """
    parsect_json = convert_sectlabel_to_json(filename)
    records = parsect_json["parse_sect"]
    texts = [line_json["text"] for line_json in records]
    labels = [line_json["label"] for line_json in records]

    (train_lines, train_labels), (validation_lines, validation_labels), (
        test_lines,
        test_labels,
    ) = self.get_train_valid_test_stratified_split(texts, labels, self.classname2idx)

    if self.dataset_type == "train":
        texts = train_lines
        labels = train_labels
    elif self.dataset_type == "valid":
        texts = validation_lines
        labels = validation_labels
    elif self.dataset_type == "test":
        texts = test_lines
        labels = test_labels

    if self.debug:
        # Randomly sample `self.debug_dataset_proportion` of the split.
        num_text = len(texts)
        np.random.seed(1729)  # so we can debug deterministically
        # BUG FIX: np.random.randint's `high` bound is EXCLUSIVE, so the
        # original `randint(0, num_text - 1, ...)` could never select the
        # last sample and raised ValueError when num_text == 1. Sampling
        # over [0, num_text) covers every index.
        sample_size = int(self.debug_dataset_proportion * num_text)
        random_ints = np.random.randint(0, num_text, size=sample_size)
        texts = [texts[i] for i in random_ints]
        labels = [labels[i] for i in random_ints]

    return texts, labels
def get_parsect_data():
    """Parse the SectLabel corpus file and return its JSON representation."""
    return convert_sectlabel_to_json(SECT_LABEL_FILE)
def test_label_not_empty(self):
    """Every parsed line must carry a non-empty label string."""
    parsed = convert_sectlabel_to_json(SECTLABEL_FILENAME)
    lines = parsed["parse_sect"]
    # A falsy label (empty string) anywhere fails the test.
    assert all(bool(line["label"]) for line in lines)