Example #1
0
    def create_fake_data(cls, binary_class: Optional[str] = None) -> BIODataset:
        """Build a small in-memory BIODataset for tests.

        Constructs four hand-written entries and injects them directly
        into a dataset instance, skipping file parsing entirely.
        """
        # (tokens, tags, entry_id, weight) tuples fed to create_entry.
        raw_entries = [
            (['single'], ['B-Tag'], 0, 1.0),
            (['single', 'double'], ['B-Tag', 'I-Tag'], 1, 1.0),
            (['single', 'double', 'triple'], ['B-Tag', 'I-Tag', 'O'], 2, 1.0),
            (['no_label'], ['O'], 3, 1.0),
        ]

        fake_dataset = BIODataset(0, 'fake_file.txt', binary_class)

        # hack around reading a file: assign parsed entries directly
        fake_dataset.data = [cls.create_entry(*entry) for entry in raw_entries]

        return fake_dataset
Example #2
0
    def create_fake_data(cls,
                         binary_class: Optional[str] = None) -> BIODataset:
        """Build a tiny in-memory BIODataset with ADR-tagged sentences.

        Skips file parsing by assigning pre-built entries directly onto
        the dataset instance.
        """
        # Token/tag pairs; the entry index is derived from list position.
        sentences = [
            (['this', 'is', 'an', 'reaction'],
             ['O', 'O', 'O', 'ADR']),
            (['this', 'is', 'an', 'reaction', 'an', 'reaction'],
             ['O', 'O', 'O', 'O', 'O', 'ADR']),
        ]

        fake_dataset = BIODataset(0, 'fake_file.txt', binary_class)

        # hack around reading a file: inject entries directly
        fake_dataset.data = [
            create_entry(tokens, tags, index)
            for index, (tokens, tags) in enumerate(sentences)
        ]

        return fake_dataset
Example #3
0
def main():
    """Cache SpaCy features for the train/valid splits and pickle them.

    Parses both BIO files, builds a shared vocabulary, runs the SpaCy
    feature extractor over each split, and saves the cached features.
    """
    args = get_args().parse_args()
    train_file, valid_file, test_file = get_dataset_files(dataset=args.dataset)

    train_bio = BIODataset(
        dataset_id=0,
        file_name=train_file,
    )
    train_bio.parse_file()

    train_reader = BIODatasetReader(bio_dataset=train_bio)

    # The reader wraps the already-parsed dataset; the path argument is
    # a dummy value (presumably ignored — same pattern as the other scripts).
    train_data: Iterator[Instance] = train_reader.read('temp.txt')

    valid_bio = BIODataset(
        dataset_id=1,
        file_name=valid_file,
    )
    valid_bio.parse_file()

    valid_reader = BIODatasetReader(bio_dataset=valid_bio)

    valid_data: Iterator[Instance] = valid_reader.read('temp.txt')
    vocab = Vocabulary.from_instances(train_data + valid_data)

    # BUG FIX: the original referenced an undefined `cached_embedder`
    # inside `if args.cuda:` (NameError whenever --cuda was passed) — a
    # leftover from the embedder-caching script. SpaCy feature extraction
    # here never uses a cuda_device, so the whole branch (and the unused
    # `device` string) is removed.

    feature_extractor = SpaCyFeatureExtractor()
    feature_extractor.cache(
        dataset_id=train_bio.dataset_id,
        dataset=train_data,
        vocab=vocab,
    )
    feature_extractor.cache(
        dataset_id=valid_bio.dataset_id,
        dataset=valid_data,
        vocab=vocab,
    )

    save_file_name = get_save_file(
        feature_extractor_type=args.feature_extractor,
        dataset_type=args.dataset)

    save_file = PickleSaveFile(file_name=save_file_name)

    feature_extractor.save(save_file=save_file)
    save_file.close()
Example #4
0
def main():
    """Pre-compute and cache text-field embeddings for train/valid splits.

    Parses both BIO files, builds a shared vocabulary, runs every
    instance through the (optionally CUDA-backed) cached embedder, and
    writes the cache to an HDF5 file.
    """
    args = get_args().parse_args()
    device = 'cuda' if torch.cuda.is_available() and args.cuda else 'cpu'
    train_file, valid_file, test_file = get_dataset_files(dataset=args.dataset)
    token_embedder, token_indexer, text_field_embedder_kwargs = get_embedder_info(
        args.embedder)

    def _load_split(dataset_id, file_name):
        # Parse one BIO file and read it through a dataset reader.
        # The reader wraps the parsed dataset; the path is a dummy value.
        bio = BIODataset(dataset_id=dataset_id, file_name=file_name)
        bio.parse_file()
        reader = BIODatasetReader(
            bio_dataset=bio,
            token_indexers={
                'tokens': token_indexer,
            },
        )
        return bio, reader.read('temp.txt')

    train_bio, train_data = _load_split(0, train_file)
    valid_bio, valid_data = _load_split(1, valid_file)

    vocab = Vocabulary.from_instances(train_data + valid_data)
    embedder = BasicTextFieldEmbedder({"tokens": token_embedder},
                                      **text_field_embedder_kwargs)
    cached_embedder = CachedTextFieldEmbedder(text_field_embedder=embedder)

    # cuda_device follows the AllenNLP convention: GPU 0 or CPU (-1).
    cuda_device = 0 if args.cuda else -1
    if args.cuda:
        cached_embedder = cached_embedder.cuda(cuda_device)

    for split_bio, split_data in ((train_bio, train_data),
                                  (valid_bio, valid_data)):
        cached_embedder.cache(
            dataset_id=split_bio.dataset_id,
            dataset=split_data,
            vocab=vocab,
            cuda_device=cuda_device,
        )

    save_file_name = get_save_file(embedder_type=args.embedder,
                                   dataset_type=args.dataset)

    save_file = H5SaveFile(file_name=save_file_name)

    cached_embedder.save(save_file=save_file)
    save_file.close()
Example #5
0
def main():
    """Run active-learning training over the selected BIO dataset.

    Builds the train and evaluation datasets, a vocabulary, the model
    and a gold-label oracle, then hands everything to active_train with
    the command-line hyperparameters.
    """
    args = get_active_args().parse_args()
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)

    device = 'cuda' if torch.cuda.is_available() and args.cuda else 'cpu'

    train_file, valid_file, test_file = get_dataset_files(dataset=args.dataset)

    class_labels: List[str] = construct_f1_class_labels(args.binary_class)

    train_dataset = BIODataset(
        dataset_id=0,
        file_name=train_file,
        binary_class=args.binary_class,
    )
    train_dataset.parse_file()

    if args.test:
        print('using test set')
    # Evaluate on the test split when --test is given, otherwise on valid.
    eval_file = test_file if args.test else valid_file
    eval_dataset = BIODataset(
        dataset_id=1,
        file_name=eval_file,
        binary_class=args.binary_class,
    )
    eval_dataset.parse_file()

    vocab = construct_vocab([train_dataset, eval_dataset])

    # Active learning draws from the training data with labels stripped.
    unlabeled_corpus = UnlabeledBIODataset(
        dataset_id=train_dataset.dataset_id,
        bio_data=train_dataset,
    )

    model = build_model(
        model_type=args.model_type,
        vocab=vocab,
        hidden_dim=args.hidden_dim,
        class_labels=class_labels,
        cached=args.cached,
    )

    # The oracle answers label queries with the true training labels.
    oracle = GoldOracle(train_dataset)

    active_train(
        model=model,
        unlabeled_dataset=unlabeled_corpus,
        valid_dataset=eval_dataset,
        vocab=vocab,
        oracle=oracle,
        optimizer_type=args.opt_type,
        optimizer_learning_rate=args.opt_lr,
        optimizer_weight_decay=args.opt_weight_decay,
        use_weak=args.use_weak,
        weak_fine_tune=args.use_weak_fine_tune,
        weak_weight=args.weak_weight,
        weak_function=args.weak_function,
        weak_collator=args.weak_collator,
        sample_strategy=args.sample_strategy,
        batch_size=args.batch_size,
        patience=args.patience,
        num_epochs=args.num_epochs,
        device=device,
        log_dir=args.log_dir,
        model_name=args.model_name,
    )
Example #6
0
def main():
    """Split the original CADEC validation file into valid/test halves."""
    source = BIODataset(dataset_id=0, file_name=CADEC_VALID_ORIGINAL)
    source.parse_file()
    # 50/50 split of the parsed entries, written back out as two files.
    valid_half, test_half = create_split(source, 0.5)
    serialize_split(valid_half, CADEC_VALID)
    serialize_split(test_half, CADEC_TEST)