示例#1
0
    def write_to_genbank(self, filename: str = None, directory: str = None, record: SeqRecord = None) -> None:
        """ Writes a genbank file containing only the information contained
            within the Region.

            Arguments:
                filename: the name of the file to write, if not given it is
                          built from the parent record's id and the region number
                directory: an optional directory to prepend to the filename
                record: the biopython SeqRecord to cut the region out of,
                        if not given the parent record is converted instead

            Returns:
                None
        """
        if not filename:
            filename = "%s.region%03d.gbk" % (self.parent_record.id, self.get_region_number())
        if directory:
            filename = os.path.join(directory, filename)

        if record is None:
            record = self.parent_record.to_biopython()
        assert isinstance(record, SeqRecord)
        # any warnings raised while slicing the record are deliberately silenced
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            cluster_record = record[self.location.start:self.location.end]

        # carry the relevant annotations of the full record over to the slice
        cluster_record.annotations["date"] = record.annotations.get("date", '')
        cluster_record.annotations["source"] = record.annotations.get("source", '')
        cluster_record.annotations["organism"] = record.annotations.get("organism", '')
        cluster_record.annotations["taxonomy"] = record.annotations.get("taxonomy", [])
        cluster_record.annotations["data_file_division"] = record.annotations.get("data_file_division", 'UNK')
        cluster_record.annotations["comment"] = record.annotations.get("comment", '')

        # update the antiSMASH annotation to include some cluster details,
        # the extra text is inserted just before the end marker (if present)
        comment_end_marker = "##antiSMASH-Data-END"
        cluster_comment = ("NOTE: This is a single cluster extracted from a larger record!\n"
                           "Orig. start  :: {start}\n"
                           "Orig. end    :: {end}\n"
                           "{end_marker}").format(start=self.location.start,
                                                  end=self.location.end,
                                                  end_marker=comment_end_marker)
        original = cluster_record.annotations["comment"]
        cluster_record.annotations["comment"] = original.replace(comment_end_marker, cluster_comment)

        # our cut-out clusters are always linear
        cluster_record.annotations["topology"] = "linear"

        # renumber clusters, superclusters and regions to reflect changes,
        # a bare min() raises ValueError on an empty collection, so fall back
        # to 0 in the same way as is done for subregions
        first_supercluster = min((sc.get_supercluster_number() for sc in self.superclusters), default=0)
        first_cluster = min((cluster.get_cluster_number() for cluster in self.get_unique_clusters()), default=0)
        first_subregion = min(sub.get_subregion_number() for sub in self.subregions) if self.subregions else 0
        # NOTE(review): the renumbered values start at 0, confirm that
        # consumers of the output do not expect 1-based numbering
        for feature in cluster_record.features:
            if feature.type == "region":
                supers = feature.qualifiers.get("candidate_cluster_numbers")
                if not supers:
                    continue
                feature.qualifiers["candidate_cluster_numbers"] = [str(int(num) - first_supercluster) for num in supers]
            elif feature.type == SuperCluster.FEATURE_TYPE:
                new = str(int(feature.qualifiers["candidate_cluster_number"][0]) - first_supercluster)
                feature.qualifiers["candidate_cluster_number"] = [new]
                new_clusters = [str(int(num) - first_cluster) for num in feature.qualifiers["protoclusters"]]
                feature.qualifiers["protoclusters"] = new_clusters
            elif feature.type in ["protocluster", "protocluster_core"]:
                new = str(int(feature.qualifiers["protocluster_number"][0]) - first_cluster)
                # the new value has to replace the qualifier it was read from,
                # otherwise the old number is kept alongside a stray qualifier
                feature.qualifiers["protocluster_number"] = [new]
            elif feature.type == "subregion":
                new = str(int(feature.qualifiers["subregion_number"][0]) - first_subregion)
                feature.qualifiers["subregion_number"] = [new]

        seqio.write([cluster_record], filename, 'genbank')
示例#2
0
def run_prodigal(record: Record, options: ConfigType) -> None:
    """ Run prodigal to annotate prokaryotic sequences

        Writes the record as FASTA into a temporary directory, runs prodigal
        on it and adds a CDS feature to the record for every gene line of
        prodigal's output that can be parsed.

        Arguments:
            record: the record to find genes in, new features are added to it
            options: the configuration, 'prodigal.basedir' and
                     'genefinding_tool' are read from it

        Returns:
            None

        Raises:
            RuntimeError: if prodigal's stderr contains 'Error'
    """
    if "basedir" in options.get('prodigal', ''):
        basedir = options.prodigal.basedir
    else:
        basedir = ""
    with TemporaryDirectory(change=True):
        # presumably leading dashes are dropped so the file names can't be
        # mistaken for command line options - confirm
        name = record.id.lstrip('-')
        if not name:
            name = "unknown"
        fasta_file = '%s.fasta' % name
        result_file = '%s.predict' % name
        with open(fasta_file, 'w') as handle:
            seqio.write([record.to_biopython()], handle, 'fasta')

        # run prodigal
        prodigal = [path.join(basedir, 'prodigal')]
        prodigal.extend(['-i', fasta_file, '-f', 'sco', '-o', result_file])
        # short records and explicit requests use prodigal's meta mode
        if options.genefinding_tool == "prodigal-m" or len(record.seq) < 20000:
            prodigal.extend(['-p', 'meta'])

        err = execute(prodigal).stderr
        if 'Error' in err:
            logging.error("Failed to run prodigal: %r", err)
            raise RuntimeError("prodigal error: %s" % err)
        found = 0
        # the context manager ensures the result file is closed again
        with open(result_file, 'r') as results:
            for line in results:
                # only lines starting with '>' describe a gene
                if not line.startswith('>'):
                    continue

                # a wrong number of fields and non-numeric coordinates are
                # both reported as ValueError and treated as malformed lines
                try:
                    gene_name, start_chunk, end_chunk, prodigal_strand = line[1:].rstrip().split("_")
                    start = int(start_chunk)
                    end = int(end_chunk)
                except ValueError:
                    logging.error('Malformatted prodigal output line %r',
                                  line.rstrip())
                    continue

                strand = 1 if prodigal_strand == "+" else -1
                # reversed coordinates always mean the reverse strand
                if start > end:
                    strand = -1
                    start, end = end, start

                # the start is shifted by one for the location, the end is not
                loc = FeatureLocation(start - 1, end, strand=strand)
                translation = record.get_aa_translation_from_location(loc)
                feature = CDSFeature(loc,
                                     locus_tag='ctg%s_%s' % (record.record_index, gene_name),
                                     translation=translation,
                                     translation_table=record.transl_table)
                record.add_cds_feature(feature)
                found += 1
    logging.debug("prodigal found %d CDS features", found)
示例#3
0
def write(seq_records, options):
    """ Writes the given records to a single EMBL file in the output folder.

        The file is named after the id of the first record and is only
        written when the input type is 'nucl', the debug message is logged
        either way.
    """
    first_id = seq_records[0].id
    embl_name = "%s.final.embl" % first_id
    destination = path.join(options.outputfoldername, embl_name)
    logging.debug("Writing seq_records to %r" % destination)
    # nothing is written for other input types
    if options.input_type != 'nucl':
        return
    seqio.write(seq_records, destination, 'embl')
示例#4
0
    def write_to_genbank(self, filename=None, directory=None, record=None):
        """ Cuts the section covered by this Cluster out of a record and
            writes it to a genbank file of its own.

            Without a filename, one is built from the parent record's id and
            the cluster number. Without a record, the parent record is
            converted to biopython and used instead.
        """
        if not filename:
            name_parts = (self.parent_record.id, self.get_cluster_number())
            filename = "%s.cluster%03d.gbk" % name_parts
        target = os.path.join(directory, filename) if directory else filename

        source = self.parent_record.to_biopython() if record is None else record
        assert isinstance(source, SeqRecord)

        # warnings raised while slicing are deliberately ignored
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            extract = source[self.location.start:self.location.end]

        # annotations to carry over, along with the value used if missing
        carried_over = (
            ("date", ''),
            ("source", ''),
            ("organism", ''),
            ("taxonomy", []),
            ("data_file_division", 'UNK'),
        )
        for key, fallback in carried_over:
            extract.annotations[key] = source.annotations.get(key, fallback)
        # our cut-out clusters are always linear
        extract.annotations["topology"] = "linear"

        seqio.write([extract], target, 'genbank')
示例#5
0
def write_search_fasta(record: Record) -> str:
    """ Writes a record in FASTA format to a file in the current directory,
        the file is named after the record's id.

        Arguments:
            record: the record to write

        Returns:
            the name of the file created
    """
    fasta_name = "{name}.fasta".format(name=record.id)
    with open(fasta_name, 'w') as fasta:
        bio_records = [record.to_biopython()]
        seqio.write(bio_records, fasta, 'fasta')
    return fasta_name
示例#6
0
def write(seq_records, options):
    """ Writes all records to a single genbank file in the output folder.

        For nucleotide input a genbank file is also written for every
        cluster feature, numbered consecutively across all records.
        For any other input type the records are converted to protein
        records first and written to a '.final.gp' file.
    """
    basename = seq_records[0].id
    folder_owner = options

    if folder_owner.input_type != 'nucl':
        seq_records = seq_record_convert_nucl_to_prot(seq_records, folder_owner)
        protein_path = path.join(folder_owner.outputfoldername, "%s.final.gp" % basename)
        logging.debug("Writing seq_records to %r" % protein_path)
        seqio.write(seq_records, protein_path, 'genbank')
        return

    full_path = path.join(folder_owner.outputfoldername, "%s.final.gbk" % basename)
    logging.debug("Writing %s seq_records to %r" % (len(seq_records), full_path))
    seqio.write(seq_records, full_path, 'genbank')

    cluster_counter = 1
    for rec in seq_records:
        # For compatibility with the database importer, only records with a
        # class named SeqRecord (obtained from a file) have their clusters
        # extracted. Records obtained from a database (DBSeqRecord) do not
        # support splitting and would throw an exception.
        if rec.__class__.__name__ != 'SeqRecord':
            continue
        for cluster in utils.get_cluster_features(rec):
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                cluster_rec = rec[cluster.location.start:cluster.location.end]
            cluster_file = "%s.cluster%03d.gbk" % (basename, cluster_counter)
            cluster_path = path.join(folder_owner.outputfoldername, cluster_file)
            seqio.write([cluster_rec], cluster_path, 'genbank')
            cluster_counter += 1
示例#7
0
def write(seq_records, options):
    """ Writes all records to a single genbank file in the output folder.

        For nucleotide input, a genbank file is first written for every
        cluster feature of every record, named by the cluster's number.
        For any other input type the records are converted to protein
        records and the combined file gets a '.final.gp' suffix instead.
    """
    basename = seq_records[0].id
    # annotations copied from the full record, with the value used if missing
    inherited = (
        ("date", ''),
        ("source", ''),
        ("organism", ''),
        ("taxonomy", []),
        ("data_file_division", 'UNK'),
    )

    if options.input_type == 'nucl':
        output_name = path.join(options.outputfoldername, "%s.final.gbk" % basename)
        for rec in seq_records:
            for cluster in utils.get_cluster_features(rec):
                # warnings raised while slicing are deliberately ignored
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    piece = rec[cluster.location.start:cluster.location.end]

                for key, fallback in inherited:
                    piece.annotations[key] = rec.annotations.get(key, fallback)
                # our cut-out clusters are always linear
                piece.annotations["topology"] = "linear"

                piece_file = "%s.cluster%03d.gbk" % (basename, utils.get_cluster_number(cluster))
                piece_path = path.join(options.outputfoldername, piece_file)
                seqio.write([piece], piece_path, 'genbank')
    else:
        seq_records = seq_record_convert_nucl_to_prot(seq_records, options)
        output_name = path.join(options.outputfoldername, "%s.final.gp" % basename)

    logging.debug("Writing seq_records to %r" % output_name)
    seqio.write(seq_records, output_name, 'genbank')
示例#8
0
def run_glimmer(seq_record, options):
    """Run glimmer3 to annotate prokaryotic sequences

    Runs long-orfs, extract, build-icm and glimmer3 in turn inside a
    temporary directory, then appends a CDS feature to seq_record.features
    for every prediction line that can be parsed.

    If any of the external tools reports a failure, the error is logged and
    the function returns early without adding features.
    """
    basedir = utils.get_genefinding_basedir(options)
    with TemporaryDirectory(change=True):
        utils.fix_record_name_id(seq_record, options)
        # presumably leading dashes are dropped so the file names can't be
        # mistaken for command line options - confirm
        name = seq_record.id.lstrip('-')
        if not name:
            name = "unknown"
        fasta_file = '%s.fasta' % name
        longorfs_file = '%s.longorfs' % name
        icm_file = '%s.icm' % name
        result_file = '%s.predict' % name

        # run long-orfs
        with open(fasta_file, 'w') as handle:
            seqio.write([seq_record], handle, 'fasta')
        long_orfs = [path.join(basedir, 'long-orfs')]
        long_orfs.extend([
            '-l', '-n', '-t', '1.15', '--trans_table', '11', fasta_file,
            longorfs_file
        ])
        _, err, _ = execute(long_orfs)
        if 'ERROR' in err:
            logging.error("Locating long orfs failed: %r", err)
            return

        # run extract
        extract = [
            path.join(basedir, 'extract'), '-t', fasta_file, longorfs_file
        ]
        out, err, _ = execute(extract)
        if out == '':
            logging.error("Failed to extract genes from model, aborting: %r", err)
            return

        # build the gene model, feeding it the output of extract
        build_icm = [path.join(basedir, 'build-icm'), '-r', icm_file]
        _, err, _ = execute(build_icm, input=out)
        if err != '':
            logging.error("Failed to build gene model: %r", err)
            return

        # run glimmer3
        glimmer = [path.join(basedir, 'glimmer3')]
        glimmer.extend([
            '-l', '-o', '50', '-g', '90', '-q', '3000', '-t', '30',
            '--trans_table', '11', fasta_file, icm_file, name
        ])

        _, err, _ = execute(glimmer)
        if 'ERROR' in err:
            logging.error("Failed to run glimmer3: %r", err)
            return

        # the context manager ensures the result file is closed again
        with open(result_file, 'r') as results:
            for line in results:
                # lines starting with '>' carry no prediction
                if line.startswith('>'):
                    continue

                # a wrong number of fields and non-numeric values are both
                # reported as ValueError and treated as malformed lines
                try:
                    orf_name, start_chunk, end_chunk, frame, score = line.split()
                    start = int(start_chunk)
                    end = int(end_chunk)
                    # only checked for being numeric, the strand used below
                    # is derived from the order of the coordinates
                    int(frame)
                except ValueError:
                    logging.error('Malformatted glimmer output line %r',
                                  line.rstrip())
                    # without valid coordinates no feature can be built
                    continue

                if start > end:
                    bpy_strand = -1
                    start, end = end, start
                else:
                    bpy_strand = 1

                # the start is shifted by one for the location, the end is not
                loc = FeatureLocation(start - 1, end, strand=bpy_strand)
                feature = SeqFeature(location=loc,
                                     id=orf_name,
                                     type="CDS",
                                     qualifiers={
                                         'locus_tag':
                                         ['ctg%s_%s' % (options.record_idx, orf_name)],
                                         'note': ['Glimmer score: %s' % score]
                                     })
                seq_record.features.append(feature)
示例#9
0
def run_prodigal(seq_record, options):
    """Run prodigal to annotate prokaryotic sequences

    Writes the record as FASTA into a temporary directory, runs prodigal on
    it and appends a CDS feature to seq_record.features for every gene line
    of prodigal's output that can be parsed.

    If prodigal's stderr contains 'Error', the problem is logged and the
    function returns without adding features.
    """
    # an empty basedir leaves just the bare command name, it is also the
    # fallback when a prodigal section exists but does not set a basedir
    basedir = ""
    if "prodigal" in options and "basedir" in options.prodigal:
        basedir = options.prodigal.basedir

    with TemporaryDirectory(change=True):
        utils.fix_record_name_id(seq_record, options)
        # presumably leading dashes are dropped so the file names can't be
        # mistaken for command line options - confirm
        name = seq_record.id.lstrip('-')
        if not name:
            name = "unknown"
        fasta_file = '%s.fasta' % name
        result_file = '%s.predict' % name
        with open(fasta_file, 'w') as handle:
            seqio.write([seq_record], handle, 'fasta')

        # run prodigal
        prodigal = [path.join(basedir, 'prodigal')]
        prodigal.extend(['-i', fasta_file, '-f', 'sco', '-o', result_file])
        # short records and explicit requests use prodigal's meta mode
        if options.genefinding == "prodigal-m" or len(seq_record.seq) < 20000:
            prodigal.extend(['-p', 'meta'])

        # the second element of execute()'s result is used as the error output
        err = execute(prodigal)[1]
        if 'Error' in err:
            logging.error("Failed to run prodigal: %r", err)
            return

        # the context manager ensures the result file is closed again
        with open(result_file, 'r') as results:
            for line in results:
                # only lines starting with '>' describe a gene
                if not line.startswith('>'):
                    continue

                # a wrong number of fields and non-numeric coordinates are
                # both reported as ValueError and treated as malformed lines
                try:
                    gene_name, start_chunk, end_chunk, prodigal_strand = line[1:].rstrip().split("_")
                    start = int(start_chunk)
                    end = int(end_chunk)
                except ValueError:
                    logging.error('Malformatted prodigal output line %r',
                                  line.rstrip())
                    continue

                strand = 1 if prodigal_strand == "+" else -1
                # reversed coordinates always mean the reverse strand
                if start > end:
                    strand = -1
                    start, end = end, start

                # the start is shifted by one for the location, the end is not
                loc = FeatureLocation(start - 1, end, strand=strand)
                feature = SeqFeature(location=loc,
                                     id=gene_name,
                                     type="CDS",
                                     qualifiers={
                                         'locus_tag':
                                         ['ctg%s_%s' % (options.record_idx, gene_name)]
                                     })
                seq_record.features.append(feature)
示例#10
0
 def test_write_calls_biopython(self):
     "Check that seqio.write() passes its arguments on to Bio.SeqIO.write"
     expected_trace = "    Called Bio.SeqIO.write(['fake'], DummyHandle('test.gbk'), 'genbank')"
     # replace the biopython writer so the call is recorded by the tracker
     mock("Bio.SeqIO.write", tracker=self.tt, returns=[])
     records = ['fake']
     seqio.write(records, self.handle, "genbank")
     assert_same_trace(self.tt, expected_trace)
示例#11
0
 def test_write_calls_biopython(self):
     "Writing through seqio must end up as a single Bio.SeqIO.write call"
     # the biopython writer is swapped for a mock reporting to the tracker
     mock("Bio.SeqIO.write", tracker=self.tt, returns=[])
     fake_records, file_format = ['fake'], "genbank"
     seqio.write(fake_records, self.handle, file_format)
     wanted = "    Called Bio.SeqIO.write(['fake'], DummyHandle('test.gbk'), 'genbank')"
     assert_same_trace(self.tt, wanted)
示例#12
0
def run_glimmerhmm(seq_record, options):
    """Run GlimmerHMM on a record and append the predicted genes as CDS features

    The record is written as FASTA into a temporary directory, glimmerhmm is
    run on it with the training folder named in
    options.glimmerhmm_train_folder, and its standard output is parsed.
    Each parsed gene becomes a SeqFeature of type "CDS" appended to
    seq_record.features, with a CompoundLocation for genes with several exons.

    Returns None. If glimmerhmm's error output contains 'ERROR', the problem
    is logged and no features are added.
    """
    # NOTE(review): basedir is never used below, glimmerhmm is run by bare name
    basedir = utils.get_genefinding_basedir(options)
    with TemporaryDirectory(change=True):
        # Write FASTA file and run GlimmerHMM
        utils.fix_record_name_id(seq_record, options)
        # strip any leading dashes from the id before using it in file names
        name = seq_record.id
        while len(name) > 0 and name[0] == '-':
            name = name[1:]
        if name == "":
            name = "unknown"
        fasta_file = '%s.fasta' % name
        # NOTE(review): result_file is never used, the predictions are read
        # from the output of the command instead
        result_file = '%s.predict' % name
        with open(fasta_file, 'w') as handle:
            seqio.write([seq_record], handle, 'fasta')
        glimmerhmm = ['glimmerhmm']
        glimmerhmm.extend([
            fasta_file,
            utils.get_full_path(__file__,
                                "train_%s" % options.glimmerhmm_train_folder),
            "-g"
        ])
        out, err, retcode = execute(glimmerhmm)
        if err.find('ERROR') > -1:
            logging.error("Failed to run GlimmerHMM: %r" % err)
            return

        # Parse GlimmerHMM predictions
        resultstext = out
        # a missing "CDS" is only logged, parsing continues regardless
        if "CDS" not in resultstext:
            logging.error("GlimmerHMM gene prediction failed: no genes found.")
        resultstext = resultstext.replace("\r", " ")
        lines = resultstext.split("\n")
        # drop the first two lines and the last one, presumably header lines
        # and the empty string after the final newline - TODO confirm
        lines = lines[2:-1]
        orfnames = []   # one generated name per gene
        positions = []  # per gene, a list of [start, end] pairs, one per exon
        strands = []    # per gene, 1 or -1
        x = 0           # index of the current line within lines
        orfnr = 0       # number of genes completed so far
        starts = []     # exon starts collected for the gene being built
        ends = []       # exon ends collected for the gene being built
        for line in lines:
            # columns 3, 4 and 6 are read as start, end and strand, which
            # assumes GFF-style tab separated output - TODO confirm
            columns = line.split("\t")
            if len(columns) > 1:
                if x == 0:
                    # very first line: only collect coordinates if it is not
                    # an mRNA line
                    if columns[6] == "+":
                        bpy_strand = 1
                    else:
                        bpy_strand = -1
                    if "mRNA" not in line:
                        starts.append(int(columns[3]))
                        ends.append(int(columns[4]))
                elif x == (len(lines) - 1) or "mRNA" in lines[x + 1]:
                    # the last line overall, or the last line before the next
                    # mRNA line, completes the gene being built
                    if columns[6] == "+":
                        bpy_strand = 1
                    else:
                        bpy_strand = -1
                    strands.append(bpy_strand)
                    starts.append(int(columns[3]))
                    ends.append(int(columns[4]))
                    # NOTE(review): the padding loses one zero per gene
                    # instead of following the number of digits, so names
                    # are not fixed width (orf000000, orf00001, ...)
                    orfnames.append("orf" + (5 - orfnr) * "0" + str(orfnr))
                    orfnr += 1
                    if len(starts) == 1:
                        # single exon: coordinates of 0 are raised to 1
                        # before the start is shifted down by one
                        if starts[0] == 0:
                            starts[0] = 1
                        if ends[0] == 0:
                            ends[0] = 1
                        positions.append([[starts[0] - 1, ends[0]]])
                    else:
                        pos = []
                        # exons on the reverse strand are stored in reverse order
                        if bpy_strand == -1:
                            starts.reverse()
                            ends.reverse()
                        # NOTE(review): each end is found via starts.index(),
                        # which returns the first match only, so duplicate
                        # start values pair with the wrong end, and the
                        # replacement of 0 by 1 is looked up as 1 - verify
                        for i in starts:
                            if i == 0:
                                i = 1
                            if ends[starts.index(i)] == 0:
                                ends[starts.index(i)] = 1
                            pos.append([i - 1, ends[starts.index(i)]])
                        positions.append(pos)
                    # start collecting for the next gene
                    starts = []
                    ends = []
                elif "mRNA" not in line:
                    # an exon in the middle of a gene
                    starts.append(int(columns[3]))
                    ends.append(int(columns[4]))
            # counted for every line, including those with a single column
            x += 1
        if len(orfnames) == 0:
            logging.error("GlimmerHMM gene prediction failed. Please check the " \
                "format of your input FASTA file.")
        # Create seq_record features for identified genes
        idx = 0
        for orfname in orfnames:
            bpy_strand = strands[idx]
            genepositions = positions[idx]
            # For genes with only one CDS
            if len(genepositions) == 1:
                gstart, gend = genepositions[0]
                loc = FeatureLocation(gstart, gend, strand=bpy_strand)
                feature = SeqFeature(
                    location=loc,
                    id=orfname,
                    type="CDS",
                    qualifiers={
                        'locus_tag':
                        ['ctg%s_%s' % (options.record_idx, orfname)]
                    })
                seq_record.features.append(feature)
            # For genes with multiple exons
            else:
                # NOTE(review): gstart and gend are not used in this branch
                gstart, gend = min(genepositions[0]), max(genepositions[-1])
                sublocations = []
                for exonstart, exonend in genepositions:
                    exonloc = FeatureLocation(exonstart,
                                              exonend,
                                              strand=bpy_strand)
                    sublocations.append(exonloc)
                loc = CompoundLocation(sublocations)
                feature = SeqFeature(
                    location=loc,
                    id=orfname,
                    type="CDS",
                    qualifiers={
                        'locus_tag':
                        ['ctg%s_%s' % (options.record_idx, orfname)]
                    })
                seq_record.features.append(feature)
            idx += 1
示例#13
0
    def write_to_genbank(self, filename: str = None, directory: str = None, record: SeqRecord = None) -> None:
        """ Cuts the section covered by this Region out of a record and
            writes it to a genbank file of its own.

            Arguments:
                filename: the name of the file to write, if not given it is
                          built from the parent record's id and the region number
                directory: an optional directory to prepend to the filename
                record: the biopython SeqRecord to cut the region out of,
                        if not given the parent record is converted instead

            Returns:
                None
        """
        if not filename:
            name_parts = (self.parent_record.id, self.get_region_number())
            filename = "%s.region%03d.gbk" % name_parts
        if directory:
            filename = os.path.join(directory, filename)

        if record is None:
            record = self.parent_record.to_biopython()
        assert isinstance(record, SeqRecord)
        # warnings raised while slicing are deliberately ignored
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            cluster_record = record[self.location.start:self.location.end]

        # annotations copied from the full record, with the value used if missing
        inherited = (
            ("date", ''),
            ("source", ''),
            ("organism", ''),
            ("taxonomy", []),
            ("data_file_division", 'UNK'),
            ("comment", ''),
        )
        for key, fallback in inherited:
            cluster_record.annotations[key] = record.annotations.get(key, fallback)
        # biopython does not persist the molecule_type annotation in slices,
        # despite it being required for output to the genbank format
        cluster_record.annotations["molecule_type"] = record.annotations["molecule_type"]

        # update the antiSMASH annotation to include some cluster details,
        # the extra text is inserted just before the end marker
        comment_end_marker = "##antiSMASH-Data-END"
        template = ("NOTE: This is a single cluster extracted from a larger record!\n"
                    "Orig. start  :: {start}\n"
                    "Orig. end    :: {end}\n"
                    "{end_marker}")
        cluster_comment = template.format(start=self.location.start,
                                          end=self.location.end,
                                          end_marker=comment_end_marker)
        existing_comment = cluster_record.annotations["comment"]
        cluster_record.annotations["comment"] = existing_comment.replace(comment_end_marker, cluster_comment)

        # our cut-out clusters are always linear
        cluster_record.annotations["topology"] = "linear"

        # renumber clusters, candidate_clusters and regions to reflect changes
        # also update positions of RiPP component locations
        first_candidate_cluster = 0
        first_cluster = 0
        if self.candidate_clusters:
            first_candidate_cluster = min(cand.get_candidate_cluster_number() for cand in self.candidate_clusters)
            first_cluster = min(proto.get_protocluster_number() for proto in self.get_unique_protoclusters())
        first_subregion = 0
        if self.subregions:
            first_subregion = min(sub.get_subregion_number() for sub in self.subregions)

        def renumber(value: str, first: int) -> str:
            """ Shifts a number so that the first one within the region is 1 """
            return str(int(value) - first + 1)

        for feature in cluster_record.features:
            quals = feature.qualifiers
            if feature.type == Region.FEATURE_TYPE:
                candidates = quals.get("candidate_cluster_numbers")
                if candidates:
                    quals["candidate_cluster_numbers"] = [renumber(num, first_candidate_cluster)
                                                          for num in candidates]
            elif feature.type == CandidateCluster.FEATURE_TYPE:
                quals["candidate_cluster_number"] = [renumber(quals["candidate_cluster_number"][0],
                                                              first_candidate_cluster)]
                quals["protoclusters"] = [renumber(num, first_cluster) for num in quals["protoclusters"]]
            elif feature.type in ["protocluster", "proto_core"]:
                quals["protocluster_number"] = [renumber(quals["protocluster_number"][0], first_cluster)]
            elif feature.type == "subregion":
                quals["subregion_number"] = [renumber(quals["subregion_number"][0], first_subregion)]
            elif feature.type == "CDS_motif":
                for qual in ["leader_location", "tail_location"]:
                    if qual in quals:
                        # make the stored location relative to the region's start
                        loc = location_from_string(quals[qual][0])
                        parts = [FeatureLocation(part.start - self.location.start,
                                                 part.end - self.location.start,
                                                 part.strand)
                                 for part in loc.parts]
                        quals[qual] = [str(build_location_from_others(parts))]

        seqio.write([cluster_record], filename, 'genbank')