예제 #1
0
    def run(self, inputs, limit_to_record, output_gbk, output_tsv,
            prodigal_meta_mode, protein):
        """Annotate the selected records of all inputs and write them out.

        Args:
            inputs: Input sequence file paths; iterated in order and measured
                with ``len`` for progress logging.
            limit_to_record: Container of record ids to keep. When falsy,
                every record is processed.
            output_gbk: Path handed to ``GenbankWriter``; falsy to skip it.
            output_tsv: Path handed to ``PfamTSVWriter``; falsy to skip it.
            prodigal_meta_mode: Forwarded unchanged to ``DeepBGCAnnotator``.
            protein: Forwarded unchanged to ``util.SequenceParser``.

        Raises:
            ValueError: If both ``output_gbk`` and ``output_tsv`` are falsy.
        """
        primary_output = output_gbk if output_gbk else output_tsv
        if not primary_output:
            raise ValueError(
                'Specify at least one of --output-gbk or --output-tsv')

        # The scratch directory is the first requested output path + '.tmp'.
        scratch_dir = primary_output + '.tmp'
        logging.debug('Using TMP dir: %s', scratch_dir)
        if not os.path.exists(scratch_dir):
            os.mkdir(scratch_dir)

        annotator = DeepBGCAnnotator(
            tmp_dir_path=scratch_dir,
            prodigal_meta_mode=prodigal_meta_mode,
        )

        # GenBank writer (if requested) comes first, then the TSV writer.
        sinks = [GenbankWriter(out_path=output_gbk)] if output_gbk else []
        if output_tsv:
            sinks += [PfamTSVWriter(out_path=output_tsv)]

        saved_count = 0
        for file_number, path in enumerate(inputs, start=1):
            logging.info('Processing input file %s/%s: %s', file_number,
                         len(inputs), path)
            with util.SequenceParser(path, protein=protein) as parser:
                for record in parser.parse():
                    # An empty/missing filter selects every record.
                    is_selected = (not limit_to_record
                                   or record.id in limit_to_record)
                    if is_selected:
                        annotator.run(record)
                        for sink in sinks:
                            sink.write(record)
                        saved_count += 1
                    else:
                        logging.debug(
                            'Skipping record %s not matching filter %s',
                            record.id, limit_to_record)

        # The scratch directory is removed before the summary is printed
        # and before the writers are closed.
        logging.debug('Removing TMP directory: %s', scratch_dir)
        shutil.rmtree(scratch_dir)

        annotator.print_summary()
        for sink in sinks:
            sink.close()

        logging.info('Saved %s fully annotated records to %s', saved_count,
                     primary_output)
예제 #2
0
    def run(self, inputs, output_gbk, output_tsv):
        """Annotate every record of the input sequence files and save them.

        Each input file is parsed with ``SeqIO.parse`` using the format
        returned by ``util.guess_format``. Every record is passed through a
        ``DeepBGCAnnotator`` and then written by each configured writer.

        Args:
            inputs: Iterable of input sequence file paths.
            output_gbk: Path handed to ``GenbankWriter``; falsy to skip it.
            output_tsv: Path handed to ``PfamTSVWriter``; falsy to skip it.

        Raises:
            ValueError: If both ``output_gbk`` and ``output_tsv`` are falsy.
            NotImplementedError: If ``util.guess_format`` returns a falsy
                value for one of the input paths.
        """
        first_output = output_gbk or output_tsv
        if not first_output:
            raise ValueError(
                'Specify at least one of --output-gbk or --output-tsv')

        # The temporary directory is the first requested output path + '.tmp'.
        tmp_dir_path = first_output + '.tmp'
        logging.debug('Using TMP dir: %s', tmp_dir_path)
        if not os.path.exists(tmp_dir_path):
            os.mkdir(tmp_dir_path)

        prepare_step = DeepBGCAnnotator(tmp_dir_path=tmp_dir_path)

        writers = []
        if output_gbk:
            writers.append(GenbankWriter(out_path=output_gbk))
        if output_tsv:
            writers.append(PfamTSVWriter(out_path=output_tsv))

        num_records = 0
        for input_path in inputs:
            fmt = util.guess_format(input_path)
            if not fmt:
                # Build ONE message string. A comma between the two parts
                # would pass them as separate exception arguments, and the
                # error would then render as a tuple repr.
                raise NotImplementedError(
                    "Sequence file type not recognized: {}, "
                    "Please provide a GenBank or FASTA sequence "
                    "with an appropriate file extension.".format(input_path))
            records = SeqIO.parse(input_path, fmt)
            for record in records:
                prepare_step.run(record)
                for writer in writers:
                    writer.write(record)
                num_records += 1

        # The temporary directory is removed before the summary is printed
        # and before the writers are closed.
        logging.debug('Removing TMP directory: %s', tmp_dir_path)
        shutil.rmtree(tmp_dir_path)

        prepare_step.print_summary()

        for writer in writers:
            writer.close()

        logging.info('Saved %s fully annotated records to %s', num_records,
                     first_output)
예제 #3
0
    def run(self, inputs, output, detectors, no_detector, labels, classifiers,
            no_classifier, is_minimal_output, limit_to_record, score,
            classifier_score, merge_max_protein_gap, merge_max_nucl_gap,
            min_nucl, min_proteins, min_domains, min_bio_domains,
            prodigal_meta_mode, protein):
        """Run all processing steps on each input record and write the results.

        The step list is a ``DeepBGCAnnotator``, followed by one
        ``DeepBGCDetector`` per detector (unless ``no_detector``) and one
        ``DeepBGCClassifier`` per classifier (unless ``no_classifier``).
        Every selected record is passed through all steps and then handed
        to every writer.

        Args:
            inputs: Input sequence file paths.
            output: Output directory path. When falsy, the base name of
                ``inputs[0]`` without its extension is used.
            detectors: Detector identifiers given to ``DeepBGCDetector``;
                defaults to ``['deepbgc']`` when falsy.
            no_detector: When truthy, no detector steps are added.
            labels: One label per detector, or falsy to use ``None`` for
                each detector.
            classifiers: Classifier identifiers given to
                ``DeepBGCClassifier``; defaults to
                ``['product_class', 'product_activity']`` when falsy.
            no_classifier: When truthy, no classifier steps are added.
            is_minimal_output: When truthy, only the full GenBank, antiSMASH
                JSON and README writers are created.
            limit_to_record: Container of record ids to keep. When falsy,
                every record is processed.
            score: Passed to each detector as ``score_threshold``.
            classifier_score: Passed to each classifier as
                ``score_threshold``.
            merge_max_protein_gap: Forwarded to ``DeepBGCDetector``.
            merge_max_nucl_gap: Forwarded to ``DeepBGCDetector``.
            min_nucl: Forwarded to ``DeepBGCDetector``.
            min_proteins: Forwarded to ``DeepBGCDetector``.
            min_domains: Forwarded to ``DeepBGCDetector``.
            min_bio_domains: Forwarded to ``DeepBGCDetector``.
            prodigal_meta_mode: Forwarded to ``DeepBGCAnnotator``.
            protein: Forwarded to ``util.SequenceParser``.

        Raises:
            ValueError: If detectors are enabled and ``labels`` is given but
                its length differs from the number of detectors.
        """
        if not detectors:
            detectors = ['deepbgc']
        if not classifiers:
            classifiers = ['product_class', 'product_activity']
        if not output:
            # If not specified, set the output path to the name of the first
            # input file without its extension.
            output, _ = os.path.splitext(
                os.path.basename(os.path.normpath(inputs[0])))

        if not os.path.exists(output):
            os.mkdir(output)

        # Save log to the log file inside the output directory.
        logger = logging.getLogger('')
        logger.addHandler(
            logging.FileHandler(os.path.join(output, self.LOG_FILENAME)))

        # Define report dir paths.
        tmp_path = os.path.join(output, self.TMP_DIRNAME)
        evaluation_path = os.path.join(output, self.PLOT_DIRNAME)
        output_file_name = os.path.basename(os.path.normpath(output))

        def result_path(suffix, root=output):
            """Return ``root/<output_file_name><suffix>``."""
            return os.path.join(root, output_file_name + suffix)

        steps = [
            DeepBGCAnnotator(tmp_dir_path=tmp_path,
                             prodigal_meta_mode=prodigal_meta_mode),
        ]
        if not no_detector:
            if not labels:
                labels = [None] * len(detectors)
            elif len(labels) != len(detectors):
                raise ValueError(
                    'A separate label should be provided for each of the detectors: {}'
                    .format(detectors))

            for detector, label in zip(detectors, labels):
                steps.append(
                    DeepBGCDetector(
                        detector=detector,
                        label=label,
                        score_threshold=score,
                        merge_max_protein_gap=merge_max_protein_gap,
                        merge_max_nucl_gap=merge_max_nucl_gap,
                        min_nucl=min_nucl,
                        min_proteins=min_proteins,
                        min_domains=min_domains,
                        min_bio_domains=min_bio_domains))

        # Writers are constructed in the same order in which they later
        # receive each record.
        writers = []
        writers.append(GenbankWriter(out_path=result_path('.full.gbk')))
        writers.append(
            AntismashJSONWriter(out_path=result_path('.antismash.json')))

        # Plot writers (and their directory) are only used for non-minimal
        # output.
        is_evaluation = False
        if not is_minimal_output:
            writers.append(
                BGCGenbankWriter(out_path=result_path('.bgc.gbk')))
            writers.append(
                ClusterTSVWriter(out_path=result_path('.bgc.tsv')))
            writers.append(
                PfamTSVWriter(out_path=result_path('.pfam.tsv')))

            is_evaluation = True
            writers.append(
                PfamScorePlotWriter(
                    out_path=result_path('.score.png', evaluation_path)))
            writers.append(
                BGCRegionPlotWriter(
                    out_path=result_path('.bgc.png', evaluation_path)))
            writers.append(
                ROCPlotWriter(
                    out_path=result_path('.roc.png', evaluation_path)))
            writers.append(
                PrecisionRecallPlotWriter(
                    out_path=result_path('.pr.png', evaluation_path)))

        # NOTE: the README writer receives the very list object it is then
        # appended to, so that list also ends up containing the README
        # writer itself.
        writers.append(
            ReadmeWriter(out_path=os.path.join(output, 'README.txt'),
                         root_path=output,
                         writers=writers))

        if not no_classifier:
            for classifier in classifiers:
                steps.append(
                    DeepBGCClassifier(classifier=classifier,
                                      score_threshold=classifier_score))

        # Create temp and evaluation dir.
        if not os.path.exists(tmp_path):
            os.mkdir(tmp_path)
        if is_evaluation:
            if not os.path.exists(evaluation_path):
                os.mkdir(evaluation_path)

        record_idx = 0
        for i, input_path in enumerate(inputs):
            logging.info('Processing input file %s/%s: %s', i + 1, len(inputs),
                         input_path)
            with util.SequenceParser(input_path, protein=protein) as parser:
                for record in parser.parse():
                    if limit_to_record and record.id not in limit_to_record:
                        logging.debug(
                            'Skipping record %s not matching filter %s',
                            record.id, limit_to_record)
                        continue

                    record_idx += 1
                    logging.info('=' * 80)
                    logging.info('Processing record #%s: %s', record_idx,
                                 record.id)
                    for step in steps:
                        step.run(record)

                    logging.info('Saving processed record %s', record.id)
                    for writer in writers:
                        writer.write(record)

        logging.info('=' * 80)
        for step in steps:
            step.print_summary()

        for writer in writers:
            writer.close()

        logging.info('=' * 80)
        # Lazy %-style arguments, consistent with the other logging calls in
        # this method; the emitted text is unchanged.
        logging.info('Saved DeepBGC result to: %s', output)