示例#1
0
def annotate_with_deepnog(identifier: str,
                          protein_list: List[SeqRecord],
                          database: str = 'eggNOG5',
                          tax_level: int = 2,
                          confidence_threshold: float = None,
                          verb: bool = True) -> GenotypeRecord:
    """
    Assign proteins belonging to a sample to orthologous groups using deepnog.

    :param identifier: The name associated with the sample.
    :param protein_list: A list of SeqRecords containing protein sequences.
    :param database: Orthologous group/family database to use.
    :param tax_level: The NCBI taxon ID of the taxonomic level to use from the given database.
    :param confidence_threshold: Confidence threshold of deepnog annotations below which annotations
                                 will be discarded. If given, it takes precedence over a threshold
                                 stored on the loaded model. If ``None``, the model's ``threshold``
                                 attribute is used when present; otherwise no threshold is passed on.
    :param verb: Whether to print verbose progress messages.
    :returns: a GenotypeRecord suitable for use with phenotrex.
    :raises RuntimeError: if the ``(database, tax_level)`` combination is not
                          listed in ``DEEPNOG_VALID_CONFIG``.
    """
    if (database, tax_level) not in DEEPNOG_VALID_CONFIG:
        raise RuntimeError(
            f'Unknown database and/or tax level: {database}/{tax_level}')

    device = set_device('auto')
    # Limit torch to a single intra-op thread.
    torch.set_num_threads(1)
    weights_path = get_weights_path(
        database=database,
        level=str(tax_level),
        architecture=DEEPNOG_ARCH,
    )
    # NOTE: torch.load unpickles the file; only load weights from a trusted source.
    model_dict = torch.load(weights_path, map_location=device)
    model = load_nn(
        architecture=DEEPNOG_ARCH,
        model_dict=model_dict,
        device=device,
    )
    class_labels = model_dict['classes']
    dataset = PreloadedProteinDataset(protein_list)
    preds, confs, ids, indices = predict(model,
                                         dataset,
                                         device,
                                         batch_size=1,
                                         num_workers=1,
                                         verbose=3 if verb else 0)

    # An explicitly requested threshold must win over the model's built-in
    # one; previously the caller's value was silently ignored whenever the
    # model carried a ``threshold`` attribute.
    if confidence_threshold is not None:
        threshold = float(confidence_threshold)
    elif hasattr(model, 'threshold'):
        threshold = float(model.threshold)
    else:
        threshold = None

    df = create_df(
        class_labels,
        preds,
        confs,
        ids,
        indices,
        threshold=threshold,
    )

    # Keep each predicted group once, skipping falsy (e.g. empty) labels.
    cogs = [x for x in df.prediction.unique() if x]
    feature_type_str = f'{database}-tax-{tax_level}'
    return GenotypeRecord(identifier=identifier,
                          feature_type=feature_type_str,
                          features=cogs)
示例#2
0
def test_create_df():
    """ Check that ``create_df`` returns the expected rows, ordered by index. """
    labels = ['class1', 'class2']
    predicted = torch.tensor([1, 0])
    confidences = torch.tensor([0.8, 0.3])
    sequence_ids = ['sequence2', 'sequence1']
    positions = [2, 1]

    df = create_df(labels, predicted, confidences, sequence_ids, positions)

    assert df.shape == (2, 4)

    # Inputs are given in reverse index order; the result is expected sorted.
    expected_by_column = {
        'index': [1, 2],
        'sequence_id': ['sequence1', 'sequence2'],
        'prediction': ['class1', 'class2'],
    }
    for column, expected in expected_by_column.items():
        assert (df[column] == expected).all()

    # Confidences are floats, so only check on which side of 0.5 they fall.
    first_conf, second_conf = df['confidence'].tolist()
    assert first_conf < 0.5
    assert second_conf > 0.5
示例#3
0
def test_fit_model_and_predict(architecture):
    """ Train the given architecture for two epochs on the dummy data
        (also used as validation data), then predict on that same data
        and require every prediction to equal its true ``eggnog_id``.
    """
    with TemporaryDirectory(prefix='deepnog_pytest_') as tmp_dir:
        arch_config = get_config(DEEPNOG_CONFIG)['architecture'][architecture]

        fit_kwargs = dict(
            architecture=architecture,
            module=arch_config['module'],
            cls=arch_config['class'],
            training_sequences=TRAINING_FASTA,
            validation_sequences=TRAINING_FASTA,
            training_labels=TRAINING_CSV,
            validation_labels=TRAINING_CSV,
            n_epochs=2,
            shuffle=True,
            tensorboard_dir=None,
            random_seed=123,
            config_file=DEEPNOG_CONFIG,
            verbose=0,
            out_dir=Path(tmp_dir),
        )
        training_result = fit(**fit_kwargs)

        # Run inference on the very same sequences the model was fitted on.
        dataset = ProteinIterableDataset(TRAINING_FASTA, TRAINING_CSV)
        preds, confs, ids, indices = predict(
            training_result.model,
            dataset,
            num_workers=0,
            verbose=0,
        )

        # A tiny threshold is passed so that low-confidence rows are kept.
        predicted = create_df(
            dataset.label_encoder.classes_,
            preds,
            confs,
            ids,
            indices,
            threshold=1e-15,
        )
        truth = pd.read_csv(TRAINING_CSV)
        merged = truth.merge(predicted,
                             left_on="protein_id",
                             right_on="sequence_id")
        np.testing.assert_array_equal(merged['prediction'],
                                      merged['eggnog_id'])
示例#4
0
def test_create_df_with_duplicates():
    """ Check that repeated sequence ids are dropped with a warning. """
    labels = ['class1', 'class2']
    predicted = torch.tensor([1, 0, 0, 1, 0])
    confidences = torch.tensor([0.8, 0.3, 0.1, 0.6, 0.8])
    # 'sequence2' and 'sequence1' each occur twice.
    sequence_ids = [
        'sequence2', 'sequence1', 'sequence2', 'sequence3', 'sequence1',
    ]
    positions = [1, 2, 3, 4, 5]

    with pytest.warns(UserWarning, match='Detected 2 duplicate sequences'):
        df = create_df(labels, predicted, confidences, sequence_ids, positions)

    assert df.shape == (3, 4)

    # Only the first occurrence of each sequence id is expected to remain.
    expected_by_column = {
        'index': [1, 2, 4],
        'sequence_id': ['sequence2', 'sequence1', 'sequence3'],
        'prediction': ['class2', 'class1', 'class2'],
    }
    for column, expected in expected_by_column.items():
        assert (df[column] == expected).all()

    # Confidences are floats, so only check on which side of 0.5 they fall.
    conf_a, conf_b, conf_c = df['confidence'].tolist()
    assert conf_a > 0.5
    assert conf_b < 0.5
    assert conf_c > 0.5
示例#5
0
def _start_inference(args, arch_module, arch_cls):
    """ Predict groups for the sequences in ``args.file`` and write the results.

    Weights are loaded from ``args.weights`` or, if that is ``None``, from
    the path returned by ``get_weights_path``. Predictions are written as
    delimited text to ``args.out`` (or stdout when ``args.out`` is ``None``).
    If ``args.test_labels`` is given, performance estimates are additionally
    written next to the output file (or to stderr).

    Parameters
    ----------
    args
        Namespace-like object (presumably the parsed command-line arguments).
        Attributes read here: ``verbose``, ``weights``, ``database``, ``tax``,
        ``architecture``, ``device``, ``file``, ``test_labels``, ``fformat``,
        ``phase``, ``confidence_threshold``, ``batch_size``, ``num_workers``,
        ``out``, ``outformat``.
    arch_module, arch_cls
        Passed together as the ``architecture`` tuple to ``load_nn``.

    Notes
    -----
    Calls ``sys.exit(1)`` if ``args.confidence_threshold`` is given but does
    not lie in the interval (0, 1].
    """
    from pandas import read_csv, DataFrame
    import torch
    from deepnog.data import ProteinIterableDataset
    from deepnog.learning import predict
    from deepnog.utils import create_df, get_logger, get_weights_path, load_nn
    from deepnog.utils.metrics import estimate_performance

    log = get_logger(__name__, verbose=args.verbose)
    # Intra-op parallelization appears rather inefficient.
    # Users may override with environmental variable: export OMP_NUM_THREADS=8
    torch.set_num_threads(1)

    # Locate the stored network parameters: explicit path wins over lookup.
    params_path = args.weights
    if params_path is None:
        params_path = get_weights_path(
            database=args.database,
            level=str(args.tax),
            architecture=args.architecture,
            verbose=args.verbose,
        )

    # NOTE: torch.load unpickles the file; weights should come from a
    # trusted source.
    log.info(f'Loading NN-parameters from {params_path} ...')
    checkpoint = torch.load(params_path, map_location=args.device)

    log.info(f'Accessing dataset from {args.file} ...')
    dataset = ProteinIterableDataset(
        args.file,
        labels_file=args.test_labels,
        f_format=args.fformat,
    )

    # Prefer class names stored with the weights; fall back to the dataset's.
    try:
        class_names = checkpoint['classes']
    except KeyError:
        class_names = dataset.label_encoder.classes_

    network = load_nn(
        architecture=(arch_module, arch_cls),
        model_dict=checkpoint,
        phase=args.phase,
        device=args.device,
    )

    # Threshold precedence: explicit argument, then model attribute, then none.
    requested = args.confidence_threshold
    if requested is None:
        if hasattr(network, 'threshold'):
            threshold = float(network.threshold)
            log.info(f'Applying confidence threshold from model: {threshold}')
        else:
            threshold = None
    elif not (0.0 < requested <= 1.0):
        log.error(f'Invalid confidence threshold specified: '
                  f'{args.confidence_threshold} not in range (0, 1].')
        sys.exit(1)
    else:
        threshold = float(requested)

    log.info('Starting protein sequence group/family inference ...')
    log.debug(
        f'Processing {args.batch_size} sequences per iteration (minibatch)')
    preds, confs, ids, indices = predict(
        network,
        dataset,
        args.device,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        verbose=args.verbose,
    )

    df = create_df(class_names, preds, confs, ids, indices,
                   threshold=threshold)

    # Decide where the predictions go.
    to_stdout = args.out is None
    if to_stdout:
        save_file = sys.stdout
        log.info('Writing predictions to stdout')
    else:
        save_file = args.out
        Path(args.out).parent.mkdir(parents=True, exist_ok=True)
        log.info(f'Writing prediction to {save_file}')

    columns = ['sequence_id', 'prediction', 'confidence']
    separators = {'csv': ',', 'tsv': '\t', 'legacy': ';'}
    df.to_csv(save_file,
              sep=separators.get(args.outformat),
              index=False,
              columns=columns)

    # Measure test set performance, if labels were provided
    if args.test_labels is not None:
        if to_stdout:
            perf_file = sys.stderr
            log.info('Writing test set performance to stderr')
        else:
            perf_file = Path(save_file).with_suffix('.performance.csv')
            log.info(f'Writing test set performance to {perf_file}')
        # Ensure object dtype to avoid int-str mismatches
        df_true = read_csv(args.test_labels, dtype=object, index_col=0)
        prediction_column = columns[1]
        df = df.astype(dtype={prediction_column: object})
        perf = estimate_performance(df_true=df_true, df_pred=df)
        df_perf = DataFrame(data=[perf])
        df_perf['experiment'] = args.file
        df_perf.to_csv(perf_file)

    log.info('All done.')