Example #1
    def __init__(self, filename, batch_size, opt, vocab, evaluation=False):
        self.batch_size = batch_size
        self.opt = opt
        self.vocab = vocab
        self.eval = evaluation

        with open(filename) as infile:
            data = jsonl.load(infile)

        # filter and sample data
        if opt.get('sample_train', 1.0) < 1.0 and not self.eval:
            keep = int(opt['sample_train'] * len(data))
            data = random.sample(data, keep)
            print("Subsample training set with rate {:g}".format(
                opt['sample_train']))

        self.raw_data = data
        data = self.preprocess(data, vocab, opt)
        # shuffle for training
        if not evaluation:
            indices = list(range(len(data)))
            random.shuffle(indices)
            data = [data[i] for i in indices]
            self.raw_data = [self.raw_data[i] for i in indices]
        self.num_examples = len(data)

        # chunk into batches
        data = [
            data[i:i + batch_size] for i in range(0, len(data), batch_size)
        ]
        self.data = data
        print("{} batches created for {}.".format(len(data), filename))
Example #2
    def __init__(self,
                 filename,
                 batch_size,
                 opt,
                 vocab,
                 char_vocab,
                 evaluation=False):
        self.batch_size = batch_size
        self.opt = opt
        self.vocab = vocab
        self.char_vocab = char_vocab
        self.eval = evaluation
        self.label2id = constant.TYPE_TO_ID_IOB if opt['scheme'] == 'iob' \
                else constant.TYPE_TO_ID_IOBES

        with open(filename) as infile:
            data = jsonl.load(infile)
        self.raw_data = data
        data = self.preprocess(data, opt)
        # shuffle for training
        if not evaluation:
            indices = list(range(len(data)))
            random.shuffle(indices)
            data = [data[i] for i in indices]
            self.raw_data = [self.raw_data[i] for i in indices]
        self.id2label = {v: k for k, v in self.label2id.items()}
        self.labels = [[self.id2label[lid] for lid in d[-1]] for d in data]
        self.num_examples = len(data)

        # chunk into batches
        data = [
            data[i:i + batch_size] for i in range(0, len(data), batch_size)
        ]
        self.data = data
        print("{} batches created for {}.".format(len(data), filename))
Example #3
    def __init__(self, filename, batch_size, opt, vocab, evaluation=False):
        self.batch_size = batch_size
        self.opt = opt
        self.vocab = vocab
        self.eval = evaluation

        with open(filename) as infile:
            data = jsonl.load(infile)

        self.raw_data = data
        data = self.preprocess(data, vocab, opt)

        # shuffle for training
        if not evaluation:
            indices = list(range(len(data)))
            random.shuffle(indices)
            data = [data[i] for i in indices]
            self.raw_data = [self.raw_data[i] for i in indices]

        self.num_examples = len(data)

        # batching
        data = [
            data[i:i + batch_size] for i in range(0, len(data), batch_size)
        ]
        self.data = data
        print("{} batches created for {}.".format(len(data), filename))
Example #4
def load_tokens(filename):
    with open(filename) as infile:
        data = jsonl.load(infile)
        tokens = []
        for d in data:
            tokens += d['findings'] + d['impression'] + d['background']
        tokens = list(map(vocab.normalize_token, tokens))
    print("{} tokens from {} examples loaded from {}.".format(len(tokens), len(data), filename))
    return tokens
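vocab.normalize_token is project code not shown in this snippet; a sketch of the same map-over-tokens pattern with a hypothetical stand-in normalizer:

# Stand-in for vocab.normalize_token (hypothetical -- the real one is project code).
def normalize_token(tok):
    return tok.lower()

tokens = list(map(normalize_token, ['The', 'LUNGS', 'are', 'Clear']))
print(tokens)  # ['the', 'lungs', 'are', 'clear']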
Example #5
def load_tokens(filename):
    with open(filename) as infile:
        data = jsonl.load(infile)
        tokens = []
        chars = []
        for d in data:
            tokens += d['token']
            if 'char' in d:
                chars += sum(d['char'], [])
        tokens = list(map(vocab.normalize_token, tokens))
    print("{} tokens, {} chars from {} examples loaded from {}.".format(
        len(tokens), len(chars), len(data), filename))
    chars = chars if len(chars) > 0 else None
    return tokens, chars
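sum(d['char'], []) flattens a list of character lists by one level. A minimal sketch with made-up input; since sum-based flattening is quadratic, itertools.chain is the usual linear-time alternative for large inputs:

import itertools

# Flattening a list of lists, as done for d['char'] above; input is hypothetical.
char_lists = [['a', 'b'], ['c'], ['d', 'e']]
flat = sum(char_lists, [])                              # quadratic, fine for small inputs
same = list(itertools.chain.from_iterable(char_lists))  # linear alternative
print(flat == same, flat)  # True ['a', 'b', 'c', 'd', 'e']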
Example #6
if __name__ == "__main__":
    random.seed(RANDOM_SEED)
    train_percent = opt["train_percent"]
    dev_percent = opt["dev_percent"]
    test_percent = 100 - train_percent - dev_percent

    if not (1 <= train_percent < 100 and 1 <= dev_percent < 100
            and train_percent + dev_percent < 100):
        print(
            "Error: --train_percent and --dev_percent must be integers in [1, 99] and their sum must be < 100"
        )
        exit(1)

    with open(opt['data_path']) as infile:
        data = jsonl.load(infile)
        random.shuffle(data)
        print(f"Data shuffled... random seed = {RANDOM_SEED}")
        split_1 = int((train_percent / 100) * len(data))
        split_2 = int((dev_percent / 100) * len(data))
        train_data = data[:split_1]
        dev_data = data[split_1:split_1 + split_2]
        test_data = data[split_1 + split_2:]
        print(
            f"Split train/dev/test by {train_percent}/{dev_percent}/{test_percent} ratio"
        )
        print(
            f"train set: {len(train_data)} examples, dev set: {len(dev_data)} examples, test set: {len(test_data)} examples"
        )

        # The snippet is truncated here; the completion below assumes the
        # project-local jsonl module provides dump() as the counterpart of
        # the jsonl.load() used above.
        with open(os.path.join(opt['output_dir'], "train.jsonl"),
                  'w') as outfile:
            jsonl.dump(train_data, outfile)
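The split indices come from straightforward percentage arithmetic, with int() truncation leaving any remainder in the test split; a standalone sketch with hypothetical sizes:

# Percentage split arithmetic from above, on made-up data.
data = list(range(103))
train_percent, dev_percent = 70, 15
split_1 = int((train_percent / 100) * len(data))   # 72
split_2 = int((dev_percent / 100) * len(data))     # 15
train = data[:split_1]
dev = data[split_1:split_1 + split_2]
test = data[split_1 + split_2:]
print(len(train), len(dev), len(test))  # 72 15 16 -- rounding leftovers land in test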
Example #7
    def __init__(self,
                 filename,
                 img_folder_path,
                 batch_size,
                 opt,
                 vocab,
                 evaluation=False):
        self.batch_size = batch_size
        self.opt = opt
        self.vocab = vocab
        self.eval = evaluation

        with open(filename) as infile:
            data = jsonl.load(infile)

        # filter and sample data
        if opt.get('sample_train', 1.0) < 1.0 and not self.eval:
            keep = int(opt['sample_train'] * len(data))
            data = random.sample(data, keep)
            print("Subsample training set with rate {:g}".format(
                opt['sample_train']))

        # removed raw_data for now, since data will be mapped with images
        # self.raw_data = data
        data = self.preprocess(data, vocab, opt)

        combined_data = []
        # map data with images
        report_without_image = 0
        for report in data:
            subject_id = report[0]
            study_id = report[1]
            report_images_path = os.path.join(
                img_folder_path,
                'p' + str(subject_id)[:2],  # 10000032 -> p10
                'p' + str(subject_id),
                's' + str(study_id))
            if os.path.isdir(report_images_path):
                images = self.get_image_from_folder(report_images_path)
                # TODO: 1) add transform options 2) image.to_tensor?
                # produce one sample for each image&report combination
                for image in images:
                    combined_data.append(report + [image])
            else:
                report_without_image += 1
        data = combined_data
        print(
            f"Combined X-ray images with report data; {report_without_image} reports have no corresponding image"
        )
        # shuffle for training
        if not evaluation:
            indices = list(range(len(data)))
            random.shuffle(indices)
            data = [data[i] for i in indices]
            # self.raw_data = [self.raw_data[i] for i in indices]
        self.num_examples = len(data)

        # chunk into batches
        data = [
            data[i:i + batch_size] for i in range(0, len(data), batch_size)
        ]
        self.data = data
        print("{} batches created for {}.".format(len(data), filename))