def __init__(self, filename, batch_size, opt, vocab, evaluation=False):
    self.batch_size = batch_size
    self.opt = opt
    self.vocab = vocab
    self.eval = evaluation
    with open(filename) as infile:
        data = jsonl.load(infile)

    # filter and sample data
    if opt.get('sample_train', 1.0) < 1.0 and not self.eval:
        keep = int(opt['sample_train'] * len(data))
        data = random.sample(data, keep)
        print("Subsample training set with rate {:g}".format(
            opt['sample_train']))
    self.raw_data = data
    data = self.preprocess(data, vocab, opt)

    # shuffle for training
    if not evaluation:
        indices = list(range(len(data)))
        random.shuffle(indices)
        data = [data[i] for i in indices]
        self.raw_data = [self.raw_data[i] for i in indices]
    self.num_examples = len(data)

    # chunk into batches
    data = [
        data[i:i + batch_size] for i in range(0, len(data), batch_size)
    ]
    self.data = data
    print("{} batches created for {}.".format(len(data), filename))
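# `jsonl` above is not a standard-library module; it appears to be a small
# project utility for reading JSON Lines files. A minimal sketch of what its
# `load` helper might look like -- an assumption, the real module is not shown
# in this section:
import json

def load(infile):
    """Read a JSON Lines file object and return a list of records."""
    return [json.loads(line) for line in infile if line.strip()]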
def __init__(self, filename, batch_size, opt, vocab, char_vocab,
             evaluation=False):
    self.batch_size = batch_size
    self.opt = opt
    self.vocab = vocab
    self.char_vocab = char_vocab
    self.eval = evaluation
    self.label2id = constant.TYPE_TO_ID_IOB if opt['scheme'] == 'iob' \
        else constant.TYPE_TO_ID_IOBES
    with open(filename) as infile:
        data = jsonl.load(infile)
    self.raw_data = data
    data = self.preprocess(data, opt)

    # shuffle for training
    if not evaluation:
        indices = list(range(len(data)))
        random.shuffle(indices)
        data = [data[i] for i in indices]
        self.raw_data = [self.raw_data[i] for i in indices]

    self.id2label = dict([(v, k) for k, v in self.label2id.items()])
    self.labels = [[self.id2label[lid] for lid in d[-1]] for d in data]
    self.num_examples = len(data)

    # chunk into batches
    data = [
        data[i:i + batch_size] for i in range(0, len(data), batch_size)
    ]
    self.data = data
    print("{} batches created for {}.".format(len(data), filename))
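# The 'scheme' option above selects between IOB and IOBES tag inventories for
# sequence labeling. An illustrative, hypothetical mapping for a single entity
# type -- NOT the actual constant.TYPE_TO_ID_* dictionaries from this repo:
TYPE_TO_ID_IOB = {'O': 0, 'B-DISEASE': 1, 'I-DISEASE': 2}
TYPE_TO_ID_IOBES = {'O': 0, 'B-DISEASE': 1, 'I-DISEASE': 2,
                    'E-DISEASE': 3, 'S-DISEASE': 4}
# The three-token span "acute kidney injury" tagged under each scheme:
#   IOB:   B-DISEASE I-DISEASE I-DISEASE
#   IOBES: B-DISEASE I-DISEASE E-DISEASE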
def __init__(self, filename, batch_size, opt, vocab, evaluation=False):
    self.batch_size = batch_size
    self.opt = opt
    self.vocab = vocab
    self.eval = evaluation
    with open(filename) as infile:
        data = jsonl.load(infile)
    self.raw_data = data
    data = self.preprocess(data, vocab, opt)

    # shuffle for training
    if not evaluation:
        indices = list(range(len(data)))
        random.shuffle(indices)
        data = [data[i] for i in indices]
        self.raw_data = [self.raw_data[i] for i in indices]
    self.num_examples = len(data)

    # batching
    data = [
        data[i:i + batch_size] for i in range(0, len(data), batch_size)
    ]
    self.data = data
    print("{} batches created for {}.".format(len(data), filename))
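# The batching idiom shared by the loaders above, shown on its own: slicing a
# list into consecutive chunks of at most batch_size items. Purely
# illustrative, self-contained sketch.
examples = list(range(10))
batch_size = 4
batches = [examples[i:i + batch_size]
           for i in range(0, len(examples), batch_size)]
assert batches == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]  # last batch may be smaller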
def load_tokens(filename):
    with open(filename) as infile:
        data = jsonl.load(infile)
    tokens = []
    for d in data:
        tokens += d['findings'] + d['impression'] + d['background']
    tokens = list(map(vocab.normalize_token, tokens))
    print("{} tokens from {} examples loaded from {}.".format(
        len(tokens), len(data), filename))
    return tokens
def load_tokens(filename):
    with open(filename) as infile:
        data = jsonl.load(infile)
    tokens = []
    chars = []
    for d in data:
        tokens += d['token']
        if 'char' in d:
            chars += sum(d['char'], [])  # flatten the per-token character lists
    tokens = list(map(vocab.normalize_token, tokens))
    print("{} tokens, {} chars from {} examples loaded from {}.".format(
        len(tokens), len(chars), len(data), filename))
    # return None for chars when the dataset carries no character field
    chars = chars if len(chars) > 0 else None
    return tokens, chars
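# A hedged sketch of how the tokens/chars returned above are typically
# consumed: counting frequencies before building a vocabulary. The use of
# collections.Counter and the min-count cutoff are assumptions for
# illustration, not code from this repository.
from collections import Counter

def count_with_cutoff(items, min_count=1):
    """Count items and keep those seen at least min_count times, most common first."""
    counts = Counter(items)
    return [item for item, c in counts.most_common() if c >= min_count]

# e.g. count_with_cutoff(tokens, min_count=2) -> frequent tokens only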
if __name__ == "__main__": random.seed(RANDOM_SEED) train_percent = opt["train_percent"] dev_percent = opt["dev_percent"] test_percent = 100 - train_percent - dev_percent if not (1 <= train_percent < 100 and 1 <= dev_percent < 100 and train_percent + dev_percent < 100): print( "Error: --train_percent and --dev_percent needs to be a integer in range [1, 99] and their sum must be < 100" ) exit(1) with open(opt['data_path']) as infile: data = jsonl.load(infile) random.shuffle(data) print(f"Data shuffled... random seed = {RANDOM_SEED}") split_1 = int((train_percent / 100) * len(data)) split_2 = int((dev_percent / 100) * len(data)) train_data = data[:split_1] dev_data = data[split_1:split_1 + split_2] test_data = data[split_1 + split_2:] print( f"Split train/dev/test by {train_percent}/{dev_percent}/{test_percent} ratio" ) print( f"train set: {len(train_data)} examples, dev set: {len(dev_data)} examples, test set: {len(test_data)} examples" ) with open(os.path.join(opt['output_dir'], "train.jsonl"),
def __init__(self, filename, img_folder_path, batch_size, opt, vocab,
             evaluation=False):
    self.batch_size = batch_size
    self.opt = opt
    self.vocab = vocab
    self.eval = evaluation
    with open(filename) as infile:
        data = jsonl.load(infile)

    # filter and sample data
    if opt.get('sample_train', 1.0) < 1.0 and not self.eval:
        keep = int(opt['sample_train'] * len(data))
        data = random.sample(data, keep)
        print("Subsample training set with rate {:g}".format(
            opt['sample_train']))

    # raw_data is not kept for now, since data will be mapped to images below
    # self.raw_data = data
    data = self.preprocess(data, vocab, opt)

    combined_data = []
    # map each report to its images
    report_without_image = 0
    for report in data:
        subject_id = report[0]
        study_id = report[1]
        report_images_path = os.path.join(
            img_folder_path,
            'p' + str(subject_id)[:2],  # 10000032 -> p10
            'p' + str(subject_id),
            's' + str(study_id))
        if os.path.isdir(report_images_path):
            images = self.get_image_from_folder(report_images_path)
            # TODO: 1) add transform options 2) image.to_tensor?
            # produce one sample for each image & report combination
            for image in images:
                combined_data.append(report + [image])
        else:
            report_without_image += 1
    data = combined_data
    print(
        f"Combined xray with report data, {report_without_image} reports have no corresponding image"
    )

    # shuffle for training
    if not evaluation:
        indices = list(range(len(data)))
        random.shuffle(indices)
        data = [data[i] for i in indices]
        # self.raw_data = [self.raw_data[i] for i in indices]
    self.num_examples = len(data)

    # chunk into batches
    data = [
        data[i:i + batch_size] for i in range(0, len(data), batch_size)
    ]
    self.data = data
    print("{} batches created for {}.".format(len(data), filename))
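# get_image_from_folder is referenced above but not shown in this section. A
# minimal sketch of what it might do, assuming the study folders contain
# .jpg/.jpeg/.png files and that PIL is used for loading; the filename filter
# and return type are assumptions, not the repository's actual implementation.
import os
from PIL import Image

def get_image_from_folder(folder_path):
    """Load every image file found in folder_path and return the PIL images."""
    images = []
    for name in sorted(os.listdir(folder_path)):
        if name.lower().endswith(('.jpg', '.jpeg', '.png')):
            images.append(Image.open(os.path.join(folder_path, name)))
    return images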