Пример #1
0
def test_unit_train_classify(tmpdir):
    """Train a classifier through the CLI and check the saved model's output.

    Runs the ``train`` command with the random forest test config and the
    BGC0000015 class/domain fixtures, reloads the saved model and verifies
    the type, columns, row count and 0.5-thresholded values of its
    predictions.
    """
    workdir = str(tmpdir)
    model_file = os.path.join(workdir, 'model.pkl')

    # Assemble the command line piece by piece, in the order the CLI receives it.
    train_args = ['train']
    train_args += ['--model', get_test_file('random_forest_test.json')]
    train_args += ['--classes', get_test_file('BGC0000015.classes.csv')]
    train_args += ['--output', model_file]
    train_args.append(get_test_file('BGC0000015.pfam.csv'))
    run(train_args)

    assert os.path.exists(model_file)

    trained = SequenceModelWrapper.load(model_file)
    domain_table = pd.read_csv(get_test_file('BGC0000015.pfam.csv'))

    # One sample per distinct sequence_id, in groupby order.
    samples = [group for _, group in domain_table.groupby('sequence_id')]
    predicted = trained.predict(samples)

    assert isinstance(predicted, pd.DataFrame)
    expected_columns = ['class1', 'class2', 'class3', 'class4']
    assert list(predicted.columns) == expected_columns

    assert len(predicted.index) == 2

    # Compare each predicted row, thresholded at 0.5, with its expected flags.
    expected_flags = [
        [True, False, True, False],
        [False, True, False, True],
    ]
    for row_number, flags in enumerate(expected_flags):
        assert list(predicted.iloc[row_number] > 0.5) == flags
Пример #2
0
    def run(self, inputs, output, model, target, classes, config, log,
            validation, verbose):
        """Train a model pipeline and save it to ``output``.

        The pipeline is built with ``SequenceModelWrapper.from_config`` from
        ``model`` and ``config`` (converted with ``dict``). When ``classes``
        is given, training and validation samples are read together with
        their class labels from that file (indexed by ``sequence_id``);
        otherwise they are read with ``util.read_samples`` using ``target``.
        The fitted pipeline is saved to ``output``; ``log`` is passed to
        ``fit`` as ``debug_progress_path``.
        """
        model_vars = dict(config)
        pipeline = SequenceModelWrapper.from_config(model, vars=model_vars)

        if not classes:
            train_samples, train_y = util.read_samples(inputs, target)
            validation_samples, validation_y = util.read_samples(
                validation, target)
        else:
            class_df = util.read_compatible_csv(classes)
            class_df = class_df.set_index('sequence_id').astype('int8')

            train_samples, train_y = util.read_samples_with_classes(
                inputs, class_df)
            logging.info('Training samples:\n%s', train_y.sum())

            validation_samples, validation_y = util.read_samples_with_classes(
                validation, class_df)
            # Only summarise validation labels when there are any.
            if len(validation_y) > 0:
                logging.info('Validation samples:\n%s', validation_y.sum())

        fit_kwargs = {
            'samples': train_samples,
            'y': train_y,
            'debug_progress_path': log,
            'validation_samples': validation_samples,
            'validation_y': validation_y,
            'verbose': verbose,
        }
        pipeline.fit(**fit_kwargs)

        pipeline.save(output)

        if log:
            logging.info('Progress log saved to: %s', log)
        logging.info('Trained model saved to: %s', output)
Пример #3
0
def test_unit_train_detect(model, tmpdir):
    """Train a detector through the CLI and check predictions on both inputs.

    ``model`` is the name of a test file passed to ``--model``. After
    training on the BGC0000015 and negative fixtures, the saved model is
    reloaded and its predictions must be Series aligned with the input
    rows, averaging above 0.5 for BGC0000015 and below 0.5 for the
    negative file.
    """
    model_file = os.path.join(str(tmpdir), 'model.pkl')
    train_args = [
        'train',
        '--model', get_test_file(model),
        '--config', 'PFAM2VEC', get_test_file('pfam2vec.test.tsv'),
        '--output', model_file,
        get_test_file('BGC0000015.pfam.csv'),
        get_test_file('negative.pfam.csv'),
    ]
    run(train_args)

    assert os.path.exists(model_file)

    detector = SequenceModelWrapper.load(model_file)

    positives = pd.read_csv(get_test_file('BGC0000015.pfam.csv'))
    negatives = pd.read_csv(get_test_file('negative.pfam.csv'))

    pos_scores = detector.predict(positives)
    neg_scores = detector.predict(negatives)

    for scores in (pos_scores, neg_scores):
        assert isinstance(scores, pd.Series)

    # Each prediction must carry exactly the index of the frame it came from.
    for scores, domains in ((pos_scores, positives), (neg_scores, negatives)):
        assert scores.index.equals(domains.index)

    assert pos_scores.mean() > 0.5
    assert neg_scores.mean() < 0.5
Пример #4
0
 def __init__(self, classifier, score_threshold=0.5):
     """Load a trained classifier model by name.

     :param classifier: Classifier name; resolved to a model file with
         ``util.get_model_path(classifier, 'classifier')``.
     :param score_threshold: Stored as ``self.score_threshold``.
     :raises ValueError: If ``classifier`` is not a string (including None).
     """
     # isinstance() already rejects None, so no separate None check is needed.
     if not isinstance(classifier, six.string_types):
         raise ValueError(
             'Expected classifier name, got {}'.format(classifier))
     self.classifier_name = classifier
     self.score_threshold = score_threshold
     classifier_path = util.get_model_path(self.classifier_name,
                                           'classifier')
     self.model = SequenceModelWrapper.load(classifier_path)
     # Explicit dtype: a bare pd.Series() warns on pandas >= 1.0 and its
     # default dtype changed from float64 to object in pandas 2.0. Pinning
     # float64 keeps the historical behaviour on every pandas version.
     self.total_class_counts = pd.Series(dtype='float64')
Пример #5
0
 def print_model(self, name, model_path):
     """Log basic information about the model stored at ``model_path``.

     Logs a separator and the model ``name``, then tries to load the model
     and log its type, version and timestamp. Any exception raised while
     loading or describing the model is logged as a warning.

     :return: True if the model was loaded and described, False otherwise.
     """
     separator = "-" * 80
     logging.info(separator)
     logging.info('Model: %s', name)
     try:
         wrapper = SequenceModelWrapper.load(model_path)
         model_type = type(wrapper.model).__name__
         logging.info('Type: %s', model_type)
         logging.info('Version: %s', wrapper.version)
         created = wrapper.timestamp
         created_iso = datetime.fromtimestamp(created).isoformat()
         logging.info('Timestamp: %s (%s)', created, created_iso)
     except Exception as err:
         # Best-effort listing: report the problem instead of propagating it.
         logging.warning('Model not supported: %s', err)
         return False
     else:
         return True
Пример #6
0
 def __init__(self, classifier, score_threshold=0.5):
     """Load a trained classifier model by name or by file path.

     ``classifier`` is treated as a file path when it exists on disk or
     contains a path separator, and is not a directory; the classifier name
     is then the file name without its extension. Otherwise it is treated
     as a model name and resolved with
     ``util.get_model_path(classifier, 'classifier')``.

     :param classifier: Classifier name or path to a model file.
     :param score_threshold: Stored as ``self.score_threshold``.
     :raises ValueError: If ``classifier`` is not a string (including None).
     """
     # isinstance() already rejects None, so no separate None check is needed.
     if not isinstance(classifier, six.string_types):
         raise ValueError('Expected classifier name or path, got {}'.format(classifier))
     if (os.path.exists(classifier) or os.path.sep in classifier) and not os.path.isdir(classifier):
         classifier_path = classifier
         # Set classifier name to filename without suffix
         classifier, _ = os.path.splitext(os.path.basename(classifier))
     else:
         classifier_path = util.get_model_path(classifier, 'classifier')
     self.classifier_name = classifier
     self.score_threshold = score_threshold
     self.model = SequenceModelWrapper.load(classifier_path)
     # Explicit dtype: a bare pd.Series() warns on pandas >= 1.0 and its
     # default dtype changed from float64 to object in pandas 2.0. Pinning
     # float64 keeps the historical behaviour on every pandas version.
     self.total_class_counts = pd.Series(dtype='float64')
Пример #7
0
 def __init__(self, detector, label=None, score_threshold=0.5, merge_max_protein_gap=0,
              merge_max_nucl_gap=0, min_nucl=1, min_proteins=1, min_domains=1, min_bio_domains=0):
     """Load a trained detector model by name and store the given settings.

     :param detector: Detector name; resolved to a model file with
         ``util.get_model_path(detector, 'detector')``.
     :param label: Stored as ``self.detector_label``; falls back to the
         detector name when empty or None.
     :param score_threshold: Stored as ``self.score_threshold``.

     The remaining keyword arguments (``merge_max_protein_gap``,
     ``merge_max_nucl_gap``, ``min_nucl``, ``min_proteins``, ``min_domains``,
     ``min_bio_domains``) are stored on the instance under the same names.

     :raises ValueError: If ``detector`` is not a string (including None).
     """
     self.score_threshold = score_threshold
     # isinstance() is False for None, so this also rejects a missing name.
     if not isinstance(detector, six.string_types):
         raise ValueError('Expected detector name, got {}'.format(detector))

     # Identity of the detector and the score column derived from its name.
     self.detector_name = detector
     self.detector_label = label if label else detector
     self.score_column = util.format_bgc_score_column(detector)

     # Merging and filtering settings, stored as given.
     self.merge_max_protein_gap = merge_max_protein_gap
     self.merge_max_nucl_gap = merge_max_nucl_gap
     self.min_nucl = min_nucl
     self.min_proteins = min_proteins
     self.min_domains = min_domains
     self.min_bio_domains = min_bio_domains

     self.model = SequenceModelWrapper.load(
         util.get_model_path(detector, 'detector'))
     self.num_detected = 0
Пример #8
0
    def __init__(self, detector, label=None, score_threshold=0.5, merge_max_protein_gap=0,
                 merge_max_nucl_gap=0, min_nucl=1, min_proteins=1, min_domains=1, min_bio_domains=0):
        """Load a trained detector model by name or by file path.

        ``detector`` is treated as a file path when it exists on disk or
        contains a path separator, and is not a directory; the detector name
        is then the file name without its extension. Otherwise it is treated
        as a model name and resolved with
        ``util.get_model_path(detector, 'detector')``.

        :param detector: Detector name or path to a model file.
        :param label: Stored as ``self.detector_label``; falls back to the
            detector name when empty or None.
        :param score_threshold: Stored as ``self.score_threshold``.

        The remaining keyword arguments (``merge_max_protein_gap``,
        ``merge_max_nucl_gap``, ``min_nucl``, ``min_proteins``,
        ``min_domains``, ``min_bio_domains``) are stored on the instance
        under the same names.

        :raises ValueError: If ``detector`` is not a string (including None).
        """
        self.score_threshold = score_threshold
        # isinstance() is False for None, so this also rejects a missing name.
        if not isinstance(detector, six.string_types):
            raise ValueError('Expected detector name or path, got {}'.format(detector))

        looks_like_path = os.path.exists(detector) or os.path.sep in detector
        if looks_like_path and not os.path.isdir(detector):
            model_path = detector
            # Name the detector after the model file, without its extension.
            detector = os.path.splitext(os.path.basename(model_path))[0]
        else:
            model_path = util.get_model_path(detector, 'detector')

        # Identity of the detector and the score column derived from its name.
        self.detector_name = detector
        self.detector_label = label if label else detector
        self.score_column = util.format_bgc_score_column(detector)

        # Merging and filtering settings, stored as given.
        self.merge_max_protein_gap = merge_max_protein_gap
        self.merge_max_nucl_gap = merge_max_nucl_gap
        self.min_nucl = min_nucl
        self.min_proteins = min_proteins
        self.min_domains = min_domains
        self.min_bio_domains = min_bio_domains

        self.model = SequenceModelWrapper.load(model_path)
        self.num_detected = 0