Exemplo n.º 1
0
    def test_get_cluster_cds_features(self):
        """Test utils.get_cluster_cds_features()"""
        # The fixture record must yield exactly two cluster features, which
        # are the first and last entries of self.features.
        first_cluster, last_cluster = utils.get_cluster_features(self.record)
        self.assertEqual(self.features[0], first_cluster)
        self.assertEqual(self.features[-1], last_cluster)

        # For each cluster, the slice of self.features that the helper is
        # expected to return as that cluster's CDS features.
        expectations = (
            (first_cluster, slice(3, 6)),
            (last_cluster, slice(-3, -1)),
        )
        for cluster, expected_span in expectations:
            cds_features = utils.get_cluster_cds_features(cluster, self.record)
            self.assertEqual(self.features[expected_span], cds_features)
Exemplo n.º 2
0
def write(seq_records, options):
    """Write SVG images for every record that contains gene clusters.

    The SVG directory (``<options.outputfoldername>/svg``) is stored on
    ``options.svgdir`` and created when it does not exist yet. Records
    without any cluster feature are skipped; for the others
    ``prepare_visualization`` and ``create_svgs`` are run in that order.
    """
    svg_dir = path.join(options.outputfoldername, "svg")
    options.svgdir = svg_dir
    logging.debug("Writing seq_records SVGs to %r" % svg_dir)
    if not path.exists(svg_dir):
        os.mkdir(svg_dir)

    for seq_record in seq_records:
        if len(utils.get_cluster_features(seq_record)) == 0:
            # Nothing to draw for records without clusters
            continue
        # Parse clusterblast output to prepare visualization
        prepare_visualization(options, seq_record)
        create_svgs(options, seq_record)
Exemplo n.º 3
0
    def test_get_cluster_aSDomain_features(self):
        """Test utils.get_cluster_aSDomain_features()"""
        # The fixture record must yield exactly two cluster features, which
        # are the first and last entries of self.features.
        first_cluster, last_cluster = utils.get_cluster_features(self.record)
        self.assertEqual(self.features[0], first_cluster)
        self.assertEqual(self.features[-1], last_cluster)

        # No aSDomain feature is expected for the first cluster.
        domains = utils.get_cluster_aSDomain_features(first_cluster,
                                                      self.record)
        self.assertEqual([], domains)

        # The second cluster should contain exactly one aSDomain feature:
        # the fifth-from-last entry of the fixture's feature list.
        domains = utils.get_cluster_aSDomain_features(last_cluster,
                                                      self.record)
        expected = [self.features[-5]]
        self.assertEqual(expected, domains)
Exemplo n.º 4
0
def load_genecluster_info(seq_record, options):
    """Gather per-cluster data and store it on ``seq_record``.

    ``seq_record.qgeneclusterdata`` is reset to an empty dict and then
    filled with one entry per cluster feature, keyed by cluster number.
    Each entry is a 20-element list whose positional layout is fixed by the
    final assignment in the loop body. ClusterBlast hit data is only
    retrieved when ``options.clusterblast`` is truthy; otherwise an empty
    dict is stored in its slot.
    """
    smcogdict, _ = utils.get_smcog_annotations(seq_record)

    # smCOG identifiers handed to retrieve_gene_cluster_annotations.
    # NOTE(review): presumably glycosyltransferase and transporter smCOGs,
    # judging by the variable names -- confirm against the smCOG table.
    gtrcoglist = ['SMCOG1045', 'SMCOG1062', 'SMCOG1102']
    transportercoglist = [
        'SMCOG1000', 'SMCOG1005', 'SMCOG1011', 'SMCOG1020', 'SMCOG1029',
        'SMCOG1033', 'SMCOG1035', 'SMCOG1044', 'SMCOG1065', 'SMCOG1067',
        'SMCOG1069', 'SMCOG1074', 'SMCOG1085', 'SMCOG1096', 'SMCOG1106',
        'SMCOG1118', 'SMCOG1131', 'SMCOG1166', 'SMCOG1169', 'SMCOG1184',
        'SMCOG1202', 'SMCOG1205', 'SMCOG1214', 'SMCOG1234', 'SMCOG1243',
        'SMCOG1245', 'SMCOG1252', 'SMCOG1254', 'SMCOG1288',
    ]

    seq_record.qgeneclusterdata = {}
    for cluster in utils.get_cluster_features(seq_record):
        cluster_nr = utils.get_cluster_number(cluster)

        (clustergenes, clustertype, annotations, colors, starts, ends,
         strands, pksnrpsprots, gtrs, transporters,
         clustersize) = retrieve_gene_cluster_annotations(
             seq_record, smcogdict, gtrcoglist, transportercoglist,
             cluster_nr)

        hitgeneclusterdata = (
            retrieve_clusterblast_info(seq_record, cluster_nr)
            if options.clusterblast else {})

        (pksnrpsprotsnames, pksnrpsdomains, substrspecnrpspredictordict,
         substrspecminowadict, substrspecpkssigdict, substrspecconsensusdict,
         krpredictionsdict, structpred) = retrieve_pksnrps_info(
             seq_record, cluster_nr, pksnrpsprots)

        # The order of this list is the record layout consumers index into;
        # do not reorder.
        seq_record.qgeneclusterdata[cluster_nr] = [
            clustertype,
            clustersize,
            clustergenes,
            annotations,
            starts,
            ends,
            strands,
            pksnrpsprots,
            pksnrpsprotsnames,
            pksnrpsdomains,
            substrspecnrpspredictordict,
            substrspecminowadict,
            substrspecpkssigdict,
            substrspecconsensusdict,
            gtrs,
            transporters,
            colors,
            hitgeneclusterdata,
            structpred,
            krpredictionsdict,
        ]
Exemplo n.º 5
0
def convert_clusters(record, annotations, options):
    """Convert cluster SeqFeatures to JSON

    Builds one dictionary per cluster feature found in ``record`` and
    returns them as a list, in the order the clusters are reported by
    ``utils.get_cluster_features``.

    Every dictionary carries the keys ``start``, ``end``, ``idx``, ``orfs``,
    ``type``, ``knowncluster``, ``BGCid`` and ``domains``. Depending on the
    options and the cluster qualifiers, ``geo``, ``probability``,
    ``unordered`` and ``cdhitclusters`` may be added. ``knowncluster`` and
    ``BGCid`` stay ``"-"`` unless exactly one rank-1 knownclusterblast hit
    is present and can be parsed.
    """
    js_clusters = []
    for cluster in utils.get_cluster_features(record):
        features = utils.get_cluster_cds_features(cluster, record)

        js_cluster = {}
        # Only the start is shifted by one.
        # NOTE(review): presumably converts a 0-based, end-exclusive
        # location into 1-based inclusive coordinates -- confirm.
        js_cluster['start'] = int(cluster.location.start) + 1
        js_cluster['end'] = int(cluster.location.end)
        js_cluster['idx'] = utils.get_cluster_number(cluster)
        js_cluster['orfs'] = convert_cds_features(record, features,
                                                  annotations, options)
        js_cluster['type'] = utils.get_cluster_type(cluster)
        if options.coexpress:
            js_cluster["geo"] = utils.get_geotable_json(features)
        if 'probability' in cluster.qualifiers:
            js_cluster['probability'] = cluster.qualifiers['probability'][0]
        if options.input_type == 'prot':
            js_cluster['unordered'] = True
        # Placeholders, overwritten below when a best known-cluster hit exists
        js_cluster['knowncluster'] = "-"
        js_cluster['BGCid'] = "-"
        js_cluster['domains'] = utils.get_cluster_domains(cluster, record)

        if options.enable_cdhit:
            js_cluster['cdhitclusters'] = utils.get_cluster_cdhit_table(
                cluster, record)

        if 'knownclusterblast' in cluster.qualifiers:
            knownclusters = cluster.qualifiers['knownclusterblast']
            # The best hit is the entry ranked "1."
            bestcluster = [
                kcluster for kcluster in knownclusters
                if kcluster.startswith('1.')
            ]
            if len(bestcluster) != 1:
                logging.warning(
                    "Error parsing best knowncluster hit; knownclusters array = %s. Possibly no significant hits to known biosynthetic gene clusters."
                    % str(knownclusters))
            else:
                # Expected layout: "<rank>. <BGC id><TAB><description>".
                # Raw string avoids invalid-escape warnings; the regex
                # escape \t matches the same literal tab as before.
                hit = re.match(r'\d+\. (\S+)\t(.*)', bestcluster[0])
                if hit is None:
                    # A malformed entry used to crash with AttributeError;
                    # keep the "-" placeholders and report it instead.
                    logging.warning(
                        "Could not parse best knowncluster hit %r for cluster no. %s",
                        bestcluster[0], utils.get_cluster_number(cluster))
                else:
                    js_cluster['knowncluster'] = hit.group(2)
                    js_cluster['BGCid'] = hit.group(1)
                    logging.debug(
                        'Found closest cluster "%s" for cluster no. %s' %
                        (js_cluster['knowncluster'],
                         utils.get_cluster_number(cluster)))
        js_clusters.append(js_cluster)

    return js_clusters
Exemplo n.º 6
0
def generate_structure_images(seq_records, options):
    """Generate structure images from the predictions stored on cluster features.

    For every record, the structure prediction of each cluster is collected
    into a fresh ``utils.Storage`` keyed by cluster number; records that
    yield at least one prediction are passed on to
    ``generate_chemical_structure_preds``.
    """
    for seq_record in seq_records:
        # Ugly temporary solution:
        # the pksnrpsvars container has to be rebuilt from the information
        # stored in the seq_record before the images can be generated.
        pksnrpsvars = utils.Storage()
        pksnrpsvars.compound_pred_dict = {}
        pksnrpsvars.failedstructures = []

        for genecluster in utils.get_cluster_features(seq_record):
            cluster_nr = utils.get_cluster_number(genecluster)
            prediction = utils.get_structure_pred(genecluster)
            pksnrpsvars.compound_pred_dict[cluster_nr] = prediction

        if len(pksnrpsvars.compound_pred_dict) == 0:
            # No clusters in this record, so nothing to render
            continue
        generate_chemical_structure_preds(pksnrpsvars, seq_record, options)
Exemplo n.º 7
0
def store_detection_details(results_by_id, rulesdict, seq_record):
    """Append the detection rule(s) of each cluster to its ``note`` qualifier.

    For every cluster feature of ``seq_record`` the cluster type is split on
    ``-`` into its component types, and one note of the form
    ``Detection rule(s) for this cluster type: <type>: (<rule>);`` (one
    ``<type>: (<rule>);`` part per component) is appended. The rule text is
    the first element of ``rulesdict[<type>]``. ``results_by_id`` is not
    used by this function.
    """
    for cluster in utils.get_cluster_features(seq_record):
        # str.split gives a one-element list when there is no '-', so plain
        # and hybrid cluster types are handled by the same call.
        component_types = utils.get_cluster_type(cluster).split('-')

        # Make sure the note list exists before the rules are looked up
        notes = cluster.qualifiers.setdefault('note', [])

        rule_parts = [
            " %s: (%s);" % (component, rulesdict[component][0])
            for component in component_types
        ]
        notes.append("Detection rule(s) for this cluster type:" +
                     "".join(rule_parts))
Exemplo n.º 8
0
def write(seq_records, options):
    """Write all cluster proteins to a file

    The output is a FASTA file named ``<id>_genecluster_proteins.fa`` inside
    ``options.outputfoldername``, where ``<id>`` is the id of the first
    record. Each CDS feature of each cluster contributes one entry whose
    header is ``>locus_tag:protein_id clustertype #clusternr - product`` and
    whose sequence is the translation wrapped at 60 characters.

    Args:
        seq_records (sequence): Bio.SeqRecords; must support indexing, as
            the first record provides the base name
        options (argparse.Namespace): The options passed to the program
    """
    first_id = seq_records[0].id
    output_name = path.join(options.outputfoldername,
                            "%s_genecluster_proteins.fa" % first_id)
    logging.debug("Writing seq_records to %r" % output_name)

    with open(output_name, 'w+') as handle:
        for seq_record in seq_records:
            for cluster in utils.get_cluster_features(seq_record):
                clustertype = utils.get_cluster_type(cluster)
                clusternr = utils.get_cluster_number(cluster)
                cds_features = utils.get_cluster_cds_features(cluster,
                                                              seq_record)
                for feature in cds_features:
                    qual = feature.qualifiers
                    header_fields = (qual['locus_tag'][0],
                                     qual['protein_id'][0],
                                     clustertype,
                                     clusternr,
                                     qual['product'][0])
                    handle.write('>%s:%s %s #%s - %s\n' % header_fields)
                    sequence_lines = textwrap.wrap(qual['translation'][0], 60)
                    handle.write('%s\n' % '\n'.join(sequence_lines))
Exemplo n.º 9
0
def write(seq_records, options):
    """Write the final GenBank output for ``seq_records``.

    For nucleotide input (``options.input_type == 'nucl'``) every cluster is
    additionally cut out of its record and written to its own
    ``<id>.cluster<NNN>.gbk`` file, and the full records go to
    ``<id>.final.gbk``. For any other input type the records are converted
    with ``seq_record_convert_nucl_to_prot`` and written to
    ``<id>.final.gp``. ``<id>`` is the id of the first record, and all files
    are placed in ``options.outputfoldername``.
    """
    basename = seq_records[0].id
    out_dir = options.outputfoldername

    if options.input_type == 'nucl':
        output_name = path.join(out_dir, "%s.final.gbk" % basename)
        for rec in seq_records:
            for cluster in utils.get_cluster_features(rec):
                # Warnings raised while slicing the record are suppressed
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    location = cluster.location
                    cluster_rec = rec[location.start:location.end]

                # Annotations copied over from the parent record, with the
                # fallback used when the parent lacks them. Built per
                # cluster so the list fallback is never shared.
                inherited = (
                    ("date", ''),
                    ("source", ''),
                    ("organism", ''),
                    ("taxonomy", []),
                    ("data_file_division", 'UNK'),
                )
                for key, fallback in inherited:
                    cluster_rec.annotations[key] = rec.annotations.get(
                        key, fallback)
                # our cut-out clusters are always linear
                cluster_rec.annotations["topology"] = "linear"

                cluster_file = "%s.cluster%03d.gbk" % (
                    basename, utils.get_cluster_number(cluster))
                cluster_name = path.join(out_dir, cluster_file)
                seqio.write([cluster_rec], cluster_name, 'genbank')
    else:
        seq_records = seq_record_convert_nucl_to_prot(seq_records, options)
        output_name = path.join(out_dir, "%s.final.gp" % basename)

    logging.debug("Writing seq_records to %r" % output_name)
    seqio.write(seq_records, output_name, 'genbank')
Exemplo n.º 10
0
 def test_find_clusters(self):
     # Mark every gene except GENE_X as a secondary-metabolite hit,
     # alternating between the two metabolite types, then check which genes
     # find_clusters groups together.
     nseqdict = {"Metabolite0": "?", "Metabolite1": "?"}
     self.config.next_clusternr = 1
     tagged = 0
     for gene_id in self.feature_by_id:
         if gene_id == "GENE_X":
             continue
         clustertype = "Metabolite%d" % (tagged % 2)
         hmm_detection._update_sec_met_entry(self.feature_by_id[gene_id],
                                             self.results_by_id[gene_id],
                                             clustertype, nseqdict)
         tagged += 1

     hmm_detection.find_clusters(self.record, self.rulesdict)

     # One sorted list of gene ids per detected cluster
     result_clusters = []
     for cluster in utils.get_cluster_features(self.record):
         cds_features = utils.get_cluster_cds_features(cluster, self.record)
         result_clusters.append(
             sorted(utils.get_gene_id(cds) for cds in cds_features))

     expected_clusters = [
         ["GENE_1", "GENE_2"],
         ["GENE_3"],
         ["GENE_4", "GENE_5"],
     ]
     failure_msg = "\nResult : %s\nExpected : %s" % (result_clusters,
                                                      expected_clusters)
     self.assertEqual(result_clusters, expected_clusters, msg=failure_msg)
Exemplo n.º 11
0
def insert_modified_monomers(pksnrpsvars, seq_record):
    locusTag_domain = []

    #Extracting gene cluster type (e.g., "transatpks")
    for f in utils.get_cluster_features(seq_record):
        cluster_info = f.qualifiers

    #pksnrpsvars.domainnamesdict = {'CRYAR_RS43165': ['PKS_KS', 'PKS_AT',...]}
    #Get a unique set of genes having ATs
    for key in pksnrpsvars.domainnamesdict.keys():
        if key not in locusTag_domain:
            locusTag_domain.append(key)

    locusTag_domain = sorted(set(locusTag_domain))

    for locusTag in locusTag_domain:
        at_list = find_duplicate_position(
            pksnrpsvars.domainnamesdict[locusTag], 'PKS_AT')
        #For transatpks
        ks_list = find_duplicate_position(
            pksnrpsvars.domainnamesdict[locusTag], 'PKS_KS')
        kr_list = find_duplicate_position(
            pksnrpsvars.domainnamesdict[locusTag], 'PKS_KR')
        dh_list = find_duplicate_position(
            pksnrpsvars.domainnamesdict[locusTag], 'PKS_DH')
        er_list = find_duplicate_position(
            pksnrpsvars.domainnamesdict[locusTag], 'PKS_ER')

        if 'transatpks' not in cluster_info['product'][0]:
            for at_idx in range(len(at_list)):
                #Monomer change caused by only KR
                for kr_idx in range(len(kr_list)):
                    if at_idx + 1 <= len(at_list) - 1:
                        if kr_list[kr_idx] > at_list[at_idx] and kr_list[
                                kr_idx] < at_list[at_idx + 1]:
                            if pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                          str(at_idx +
                                                              1)] == "mal":
                                pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                           str(at_idx +
                                                               1)] = "ohmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                          str(at_idx +
                                                              1)] == "mmal":
                                pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                           str(at_idx +
                                                               1)] = "ohmmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                          str(at_idx +
                                                              1)] == "mxmal":
                                pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                           str(at_idx +
                                                               1)] = "ohmxmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                          str(at_idx +
                                                              1)] == "emal":
                                pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                           str(at_idx +
                                                               1)] = "ohemal"
                    if at_idx + 1 > len(at_list) - 1:
                        if kr_list[kr_idx] > at_list[at_idx]:
                            if pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                          str(at_idx +
                                                              1)] == "mal":
                                pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                           str(at_idx +
                                                               1)] = "ohmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                          str(at_idx +
                                                              1)] == "mmal":
                                pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                           str(at_idx +
                                                               1)] = "ohmmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                          str(at_idx +
                                                              1)] == "mxmal":
                                pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                           str(at_idx +
                                                               1)] = "ohmxmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                          str(at_idx +
                                                              1)] == "emal":
                                pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                           str(at_idx +
                                                               1)] = "ohemal"

    #Monomer change caused by KR and DH
                for dh_idx in range(len(dh_list)):
                    if at_idx + 1 <= len(at_list) - 1:
                        if dh_list[dh_idx] > at_list[at_idx] and dh_list[
                                dh_idx] < at_list[at_idx + 1]:
                            if pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                          str(at_idx +
                                                              1)] == "ohmal":
                                pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                           str(at_idx +
                                                               1)] = "ccmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                          str(at_idx +
                                                              1)] == "ohmmal":
                                pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                           str(at_idx +
                                                               1)] = "ccmmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                          str(at_idx +
                                                              1)] == "ohmxmal":
                                pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                           str(at_idx +
                                                               1)] = "ccmxmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                          str(at_idx +
                                                              1)] == "ohemal":
                                pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                           str(at_idx +
                                                               1)] = "ccemal"
                    if at_idx + 1 > len(at_list) - 1:
                        if dh_list[dh_idx] > at_list[at_idx]:
                            if pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                          str(at_idx +
                                                              1)] == "ohmal":
                                pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                           str(at_idx +
                                                               1)] = "ccmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                          str(at_idx +
                                                              1)] == "ohmmal":
                                pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                           str(at_idx +
                                                               1)] = "ccmmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                          str(at_idx +
                                                              1)] == "ohmxmal":
                                pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                           str(at_idx +
                                                               1)] = "ccmxmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                          str(at_idx +
                                                              1)] == "ohemal":
                                pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                           str(at_idx +
                                                               1)] = "ccemal"

    #Monomer change caused by KR, DH and ER
                for er_idx in range(len(er_list)):
                    if at_idx + 1 <= len(at_list) - 1:
                        if er_list[er_idx] > at_list[at_idx] and er_list[
                                er_idx] < at_list[at_idx + 1]:
                            if pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                          str(at_idx +
                                                              1)] == "ccmal":
                                pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                           str(at_idx +
                                                               1)] = "redmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                          str(at_idx +
                                                              1)] == "ccmmal":
                                pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                           str(at_idx +
                                                               1)] = "redmmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                          str(at_idx +
                                                              1)] == "ccmxmal":
                                pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                           str(at_idx +
                                                               1)] = "redmxmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                          str(at_idx +
                                                              1)] == "ccemal":
                                pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                           str(at_idx +
                                                               1)] = "redemal"
                    if at_idx + 1 > len(at_list) - 1:
                        if er_list[er_idx] > at_list[at_idx]:
                            if pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                          str(at_idx +
                                                              1)] == "ccmal":
                                pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                           str(at_idx +
                                                               1)] = "redmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                          str(at_idx +
                                                              1)] == "ccmmal":
                                pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                           str(at_idx +
                                                               1)] = "redmmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                          str(at_idx +
                                                              1)] == "ccmxmal":
                                pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                           str(at_idx +
                                                               1)] = "redmxmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                          str(at_idx +
                                                              1)] == "ccemal":
                                pksnrpsvars.consensuspreds[locusTag + "_AT" +
                                                           str(at_idx +
                                                               1)] = "redemal"

        if 'transatpks' in cluster_info['product'][0]:
            for ks_idx in range(len(ks_list)):
                #Monomer change caused by only KR
                for kr_idx in range(len(kr_list)):
                    if ks_idx + 1 <= len(ks_list) - 1:
                        if kr_list[kr_idx] > ks_list[ks_idx] and kr_list[
                                kr_idx] < ks_list[ks_idx + 1]:
                            if pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                          str(ks_idx +
                                                              1)] == "mal":
                                pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                           str(ks_idx +
                                                               1)] = "ohmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                          str(ks_idx +
                                                              1)] == "mmal":
                                pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                           str(ks_idx +
                                                               1)] = "ohmmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                          str(ks_idx +
                                                              1)] == "mxmal":
                                pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                           str(ks_idx +
                                                               1)] = "ohmxmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                          str(ks_idx +
                                                              1)] == "emal":
                                pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                           str(ks_idx +
                                                               1)] = "ohemal"
                    if ks_idx + 1 > len(ks_list) - 1:
                        if kr_list[kr_idx] > ks_list[ks_idx]:
                            if pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                          str(ks_idx +
                                                              1)] == "mal":
                                pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                           str(ks_idx +
                                                               1)] = "ohmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                          str(ks_idx +
                                                              1)] == "mmal":
                                pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                           str(ks_idx +
                                                               1)] = "ohmmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                          str(ks_idx +
                                                              1)] == "mxmal":
                                pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                           str(ks_idx +
                                                               1)] = "ohmxmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                          str(ks_idx +
                                                              1)] == "emal":
                                pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                           str(ks_idx +
                                                               1)] = "ohemal"

    #Monomer change caused by KR and DH
                for dh_idx in range(len(dh_list)):
                    if ks_idx + 1 <= len(ks_list) - 1:
                        if dh_list[dh_idx] > ks_list[ks_idx] and dh_list[
                                dh_idx] < ks_list[ks_idx + 1]:
                            if pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                          str(ks_idx +
                                                              1)] == "ohmal":
                                pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                           str(ks_idx +
                                                               1)] = "ccmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                          str(ks_idx +
                                                              1)] == "ohmmal":
                                pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                           str(ks_idx +
                                                               1)] = "ccmmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                          str(ks_idx +
                                                              1)] == "ohmxmal":
                                pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                           str(ks_idx +
                                                               1)] = "ccmxmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                          str(ks_idx +
                                                              1)] == "ohemal":
                                pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                           str(ks_idx +
                                                               1)] = "ccemal"
                    if ks_idx + 1 > len(ks_list) - 1:
                        if dh_list[dh_idx] > ks_list[ks_idx]:
                            if pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                          str(ks_idx +
                                                              1)] == "ohmal":
                                pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                           str(ks_idx +
                                                               1)] = "ccmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                          str(ks_idx +
                                                              1)] == "ohmmal":
                                pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                           str(ks_idx +
                                                               1)] = "ccmmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                          str(ks_idx +
                                                              1)] == "ohmxmal":
                                pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                           str(ks_idx +
                                                               1)] = "ccmxmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                          str(ks_idx +
                                                              1)] == "ohemal":
                                pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                           str(ks_idx +
                                                               1)] = "ccemal"

    #Monomer change caused by KR, DH and ER
                for er_idx in range(len(er_list)):
                    if ks_idx + 1 <= len(ks_list) - 1:
                        if er_list[er_idx] > ks_list[ks_idx] and er_list[
                                er_idx] < ks_list[ks_idx + 1]:
                            if pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                          str(ks_idx +
                                                              1)] == "ccmal":
                                pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                           str(ks_idx +
                                                               1)] = "redmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                          str(ks_idx +
                                                              1)] == "ccmmal":
                                pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                           str(ks_idx +
                                                               1)] = "redmmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                          str(ks_idx +
                                                              1)] == "ccmxmal":
                                pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                           str(ks_idx +
                                                               1)] = "redmxmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                          str(ks_idx +
                                                              1)] == "ccemal":
                                pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                           str(ks_idx +
                                                               1)] = "redemal"
                    if ks_idx + 1 > len(ks_list) - 1:
                        if er_list[er_idx] > ks_list[ks_idx]:
                            if pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                          str(ks_idx +
                                                              1)] == "ccmal":
                                pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                           str(ks_idx +
                                                               1)] = "redmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                          str(ks_idx +
                                                              1)] == "ccmmal":
                                pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                           str(ks_idx +
                                                               1)] = "redmmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                          str(ks_idx +
                                                              1)] == "ccmxmal":
                                pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                           str(ks_idx +
                                                               1)] = "redmxmal"
                            if pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                          str(ks_idx +
                                                              1)] == "ccemal":
                                pksnrpsvars.consensuspreds[locusTag + "_KS" +
                                                           str(ks_idx +
                                                               1)] = "redemal"
Exemplo n.º 12
0
def run_coexpress(seq_record, all_gene_expressions, geo):
    """Run the CoExpress analysis on every cluster of ``seq_record``.

    For each cluster, the expression entries of its CDS genes are collected,
    pairwise correlation ("cor") and distance ("dist") tables are filled in,
    strongly correlated biosynthetic genes from outside the cluster ("remote
    genes") are added to the distance tables and then pruned, and the result
    is handed to ``update_features``.

    The per-gene dicts stored in ``all_gene_expressions`` are modified in
    place: they gain (or have updated) the "cor" and "dist" keys.

    Arguments:
        seq_record: record whose clusters are analysed; ``seq_record.id``
            must be a key of ``all_gene_expressions``.
        all_gene_expressions: mapping of record id -> gene id -> per-gene
            expression dict (entries carrying an "exp" key take part in the
            in-cluster correlation pass).
        geo: dict for the expression data set; ``geo["info"]["id"]`` is used
            in log messages and ``geo`` itself is passed to
            ``update_features``.
    """
    options = get_config()
    cl_list = utils.get_cluster_features(seq_record)

    gene_expressions = all_gene_expressions[seq_record.id]

    logging.info('Running CoExpress analysis on the clusters..')
    for cl_count, cluster in enumerate(cl_list, 1):
        logging.debug(
            'Running CoExpress analysis on record "%s".. (Cluster %s of %s)',
            geo["info"]["id"], cl_count, len(cl_list))
        features = utils.get_cluster_cds_features(cluster, seq_record)

        # only cluster genes that have an expression entry take part
        cluster_genes = {}
        for feature in features:
            gene_id = utils.get_gene_id(feature)
            if gene_id in gene_expressions:
                cluster_genes[gene_id] = gene_expressions[gene_id]

        #calculate correlation value between genes
        for gene_info in cluster_genes.values():
            gene_info.setdefault("cor", {})
        for gene_1, info_1 in cluster_genes.items():
            if "exp" not in info_1:
                continue
            for gene_2, info_2 in cluster_genes.items():
                if gene_2 == gene_1 or "exp" not in info_2:
                    continue
                # the value is stored symmetrically, so each pair is
                # computed only once
                if gene_1 in info_2["cor"]:
                    continue
                cor_val = calc_correlation_value(info_1, info_2)
                info_1["cor"][gene_2] = cor_val
                info_2["cor"][gene_1] = cor_val

        #calculate distance value for building dendogram
        for gene_info in cluster_genes.values():
            gene_info.setdefault("dist", {})
        for gene_1, info_1 in cluster_genes.items():
            for gene_2, info_2 in cluster_genes.items():
                # pairs without a correlation value (including a gene with
                # itself) get the maximum distance
                dist = 100.0
                if gene_2 in info_1["cor"]:
                    cor_val = min(1.00, info_1["cor"][gene_2])
                    dist = 100.0 * (1.0 - cor_val)
                info_1["dist"][gene_2] = dist
                info_2["dist"][gene_1] = dist

        # check for remote genes, add if correlation value >= 0.9
        # The candidate list does not depend on the cluster gene being
        # compared, so it is built once per cluster.
        remote_candidates = []
        for seqid in all_gene_expressions:
            seq_prefix = "%s:" % seqid.replace(":", "_")
            seq_expressions = all_gene_expressions[seqid]
            for gene_2 in seq_expressions:
                # only add biosynthetic remote genes
                if (seq_prefix + gene_2) not in options.hmm_results:
                    continue
                if gene_2 in cluster_genes:
                    continue
                remote_candidates.append((gene_2, seq_expressions[gene_2]))
        # NOTE(review): unlike the in-cluster pass above, no "exp" check is
        # made before calling calc_correlation_value here -- confirm that it
        # tolerates entries without expression values.
        for info_1 in cluster_genes.values():
            for gene_2, info_2 in remote_candidates:
                cor_val = min(1.00, calc_correlation_value(info_1, info_2))
                if 1.00 > cor_val >= 0.9:
                    info_1["dist"][gene_2] = 100.0 * (1.0 - cor_val)

        # review the remote genes, discard genes with less than 2 edges
        edges_count = {}
        for info_1 in cluster_genes.values():
            for gene_2 in info_1["dist"]:
                if gene_2 not in cluster_genes:
                    edges_count[gene_2] = edges_count.get(gene_2, 0) + 1
        for info_1 in cluster_genes.values():
            info_1["dist"] = {
                gene_2: dist
                for gene_2, dist in info_1["dist"].items()
                if gene_2 in cluster_genes or edges_count[gene_2] >= 2
            }

        # review the remote genes, discard genes without any connection to cluster's biosynthetic genes
        prefix = "%s:" % seq_record.id.replace(":", "_")
        have_connections = set()
        for gene_1, info_1 in cluster_genes.items():
            if (prefix + gene_1) in options.hmm_results:
                have_connections.update(
                    gene_2 for gene_2 in info_1["dist"]
                    if gene_2 not in cluster_genes)
        for info_1 in cluster_genes.values():
            info_1["dist"] = {
                gene_2: dist
                for gene_2, dist in info_1["dist"].items()
                if gene_2 in cluster_genes or gene_2 in have_connections
            }

        #update seq_record
        update_features(features, cluster_genes, geo)

    # NOTE: the block below never runs. It references ``overlaps``, which is
    # not defined anywhere in this function, and relies on ``prefix`` from
    # the cluster loop above; both must be sorted out before re-enabling it.
    if False:  #This feature is temporarily disabled, saved for next version #options.coexpress_signal_cluster_size < len(overlaps):
        logging.info('Running expression signal analysis on seq_record..')
        signals = []
        n = options.coexpress_signal_cluster_size - 1
        #build list of cluster locations (for annotating signal regions)
        clrefs = []
        for cluster in cl_list:
            clrefs.append(((cluster.location.start, cluster.location.end),
                           utils.get_cluster_number(cluster)))
        clrefs = sorted(clrefs, key=lambda cl: cl[0][0])
        #build signals
        for i in xrange(0, len(overlaps) - n):
            genes = []
            for overlap in overlaps[i:i + n]:
                gene = overlap[0]
                for feature in overlap:
                    if utils.get_gene_id(feature) in gene_expressions:
                        gene = feature
                        break
                genes.append(gene)
            cors = []
            checked = []
            hits = []
            for x in xrange(0, len(genes)):
                gene_x = utils.get_gene_id(genes[x])
                if prefix + gene_x in options.hmm_results:
                    hits.append(options.hmm_results[prefix +
                                                    gene_x][0].query_id)
                for y in xrange(0, len(genes)):
                    if ((x, y) in checked) or ((y, x) in checked):
                        continue
                    cor_val = 0
                    gene_y = utils.get_gene_id(genes[y])
                    if (gene_x in gene_expressions) and (gene_y
                                                         in gene_expressions):
                        cor_val = calc_correlation_value(
                            gene_expressions[gene_x], gene_expressions[gene_y])
                    cors.append(cor_val)
                    checked.append((x, y))
            sloc = (genes[0].location.start + genes[-1].location.end) / 2
            cor_val = 0
            if len(cors) > 0 and len(list(set(hits))) > 1:
                cor_val = np.median(cors)
            cl_idx = -1
            for clref in clrefs:
                if sloc < clref[0][0]:
                    continue
                if sloc <= clref[0][1]:
                    cl_idx = clref[1]
                    break
            signals.append((sloc, cor_val, cl_idx))
        if "coexpress_signal" not in options:
            options.coexpress_signal = {}
        if geo["info"]["id"] not in options.coexpress_signal:
            options.coexpress_signal[geo["info"]["id"]] = {}
        options.coexpress_signal[geo["info"]["id"]][seq_record.id] = signals
Exemplo n.º 13
0
def _strip_subdir_prefixes(hybrid_type):
    """Return a '-'-joined cluster type with the 'subdir/' prefix removed from each part."""
    return "-".join(name.split('/')[-1] for name in hybrid_type.split('-'))


def _replace_plant_type(hybrid_type):
    """Drop 'plant' from a hybrid cluster type; turn a lone 'plant' into 'putative'.

    Duplicate parts are collapsed through a set, so the order of the
    remaining parts is not guaranteed.
    """
    names = list(set(hybrid_type.split('-')))
    if (len(names) > 1) and ("plant" in names):
        names.remove("plant")
    elif names == ["plant"]:
        names = ["putative"]
    return "-".join(names)


def main():
    """Run a mock-up antiSMASH detection per genome file and dump statistics.

    ``sys.argv[1]`` names a text file listing one genome file path per line.
    Every listed file is parsed, cluster detection and the cluster analyses
    are run on each of its records, and per-record counts (genes, hits per
    domain, clusters per type, cd-hit and domain numbers) are collected in a
    dict keyed by file path and then record id.

    After each processed file the complete dict is (re)written to
    ``result.js`` in the current directory as ``var result = <json>;``.
    """
    multiprocessing.freeze_support()
    res_object = {}

    # get genome files
    with open(sys.argv[1], 'r') as list_file:
        files = [path.expanduser(line.replace("\n", "")) for line in list_file]

    # mockup antismash run per files
    for i, fpath in enumerate(files, 1):
        res_object[fpath] = {}
        print("Processing %s... (%d/%d)" % (fpath, i, len(files)))
        options = get_mockup_config()
        options.sequences = [fpath]
        config.set_config(options)
        run_antismash.setup_logging(
            options)  #To-DO: get antismash logging to works!

        # load plugins
        plugins = run_antismash.load_detection_plugins()
        run_antismash.filter_plugins(plugins, options,
                                     options.enabled_cluster_types)

        # parse to seq_records
        seq_records = run_antismash.parse_input_sequences(options)
        options.next_clusternr = 1

        # Drop short nucleotide records *before* iterating: filtering inside
        # the loop only rebinds the name and leaves the records being
        # iterated untouched.
        if options.input_type == 'nucl':
            seq_records = [
                record for record in seq_records if len(record.seq) > 1000
            ]

        for seq_record in seq_records:
            utils.sort_features(seq_record)
            run_antismash.strip_record(seq_record)
            utils.fix_record_name_id(seq_record, options)

            # fetch results_by_id
            feature_by_id = utils.get_feature_dict(seq_record)
            results = []
            results_by_id = {}
            prefix = "%s:" % seq_record.id.replace(":", "_")
            for feature in utils.get_cds_features(seq_record):
                gene_id = utils.get_gene_id(feature)
                if (prefix + gene_id) in options.hmm_results:
                    results_by_id[gene_id] = options.hmm_results[prefix +
                                                                 gene_id]
                    results.extend(results_by_id[gene_id])

            # ignore short aa's
            # Short CDS without hits are set aside and re-added after cluster
            # finding. The kept list is built separately because removing
            # from a list while iterating it skips elements.
            min_length_aa = 100
            short_cds_buffer = []
            kept_features = []
            for f in seq_record.features:
                if (f.type == "CDS"
                        and len(f.qualifiers['translation'][0]) < min_length_aa
                        and utils.get_gene_id(f) not in results_by_id):
                    short_cds_buffer.append(f)
                else:
                    kept_features.append(f)
            seq_record.features[:] = kept_features

            overlaps = utils.get_overlaps_table(seq_record)
            rulesdict = hmm_detection.create_rules_dict(
                options.enabled_cluster_types)
            # find total cdhit numbers in the chromosome
            total_cdhit = len(
                utils.get_cdhit_table(utils.get_cds_features(seq_record))[0])
            stats = {
                "total_clusters": 0,
                "total_genes": len(overlaps[0]),
                "total_cdhit": total_cdhit,
                "genes_with_hits": 0,
                "largest_cdhit": 0,
                "largest_domain_variations": 0,
                "per_hits": {},
                "cluster_types": {}
            }
            res_object[fpath][seq_record.id] = stats

            # filter overlap hits
            results, results_by_id = hmm_detection.filter_results(
                results, results_by_id, overlaps, feature_by_id)

            # count hits
            per_hits = stats["per_hits"]
            for gene_id in results_by_id:
                res_gene = results_by_id[gene_id]
                if len(res_gene) > 0:
                    stats["genes_with_hits"] += 1
                for hsp in res_gene:
                    domain_name = hsp.query_id.replace("plants/", "")
                    per_hits[domain_name] = per_hits.get(domain_name, 0) + 1

            # do cluster finding algorithm
            typedict = hmm_detection.apply_cluster_rules(
                results_by_id, feature_by_id, options.enabled_cluster_types,
                rulesdict, overlaps)
            hmm_detection.fix_hybrid_clusters_typedict(typedict)
            nseqdict = hmm_detection.get_nseq()
            for cds in results_by_id.keys():
                feature = feature_by_id[cds]
                if typedict[cds] != "none":
                    hmm_detection._update_sec_met_entry(
                        feature, results_by_id[cds], typedict[cds], nseqdict)
            hmm_detection.find_clusters(seq_record, rulesdict, overlaps)
            seq_record.features.extend(short_cds_buffer)
            stats["total_clusters"] += len(
                utils.get_cluster_features(seq_record))

            # do cluster specific and unspecific analysis
            if len(utils.get_cluster_features(seq_record)) > 0:
                run_antismash.cluster_specific_analysis(
                    plugins, seq_record, options)
            run_antismash.unspecific_analysis(seq_record, options)

            #Rearrange hybrid clusters name alphabetically
            hmm_detection.fix_hybrid_clusters(seq_record)

            #before writing to output, remove all hmm_detection's subdir prefixes from clustertype
            for cluster in utils.get_cluster_features(seq_record):
                cluster.qualifiers['product'] = [
                    _strip_subdir_prefixes(prod)
                    for prod in cluster.qualifiers['product']
                ]
            for cds in utils.get_cds_features(seq_record):
                if 'sec_met' in cds.qualifiers:
                    temp_qual = []
                    for row in cds.qualifiers['sec_met']:
                        if row.startswith('Type: '):
                            temp_qual.append(
                                'Type: ' + _strip_subdir_prefixes(
                                    row.split('Type: ')[-1]))
                        elif row.startswith('Domains detected: '):
                            cluster_results = []
                            for cluster_result in row.split(
                                    'Domains detected: ')[-1].split(';'):
                                # strip the prefix from the domain name only,
                                # leaving the E-value part as it was
                                cluster_results.append(
                                    cluster_result.split(' (E-value')[0].split(
                                        '/')[-1] + ' (E-value' +
                                    cluster_result.split(' (E-value')[-1])
                            temp_qual.append('Domains detected: ' +
                                             ";".join(cluster_results))
                        else:
                            temp_qual.append(row)
                    cds.qualifiers['sec_met'] = temp_qual

            #on plants, remove plant clustertype from hybrid types, and replace single
            #plant clustertype with "putative"
            for cluster in utils.get_cluster_features(seq_record):
                cluster.qualifiers['product'] = [
                    _replace_plant_type(prod)
                    for prod in cluster.qualifiers['product']
                ]
            for cds in utils.get_cds_features(seq_record):
                if 'sec_met' in cds.qualifiers:
                    temp_qual = []
                    for row in cds.qualifiers['sec_met']:
                        if row.startswith('Type: '):
                            temp_qual.append(
                                'Type: ' + _replace_plant_type(
                                    row.split('Type: ')[-1]))
                        else:
                            temp_qual.append(row)
                    cds.qualifiers['sec_met'] = temp_qual

            # find largest cdhit number & largest domain diversity in a cluster
            # NOTE: the "average_*" entries hold the median, not the mean.
            stats["average_cdhit"] = 0
            stats["average_domain_variations"] = 0
            cluster_types = stats["cluster_types"]
            cdhit_numbers = []
            domain_numbers = []
            for cluster in utils.get_cluster_features(seq_record):
                cluster_type = utils.get_cluster_type(cluster)
                cluster_types[cluster_type] = cluster_types.get(
                    cluster_type, 0) + 1
                num_cdhit = len(
                    utils.get_cluster_cdhit_table(cluster, seq_record))
                num_domain = len(utils.get_cluster_domains(
                    cluster, seq_record))
                cdhit_numbers.append(num_cdhit)
                domain_numbers.append(num_domain)
                if num_cdhit > stats["largest_cdhit"]:
                    stats["largest_cdhit"] = num_cdhit
                if num_domain > stats["largest_domain_variations"]:
                    stats["largest_domain_variations"] = num_domain
            if len(cdhit_numbers) > 0:
                stats["average_cdhit"] = numpy.median(cdhit_numbers)
            if len(domain_numbers) > 0:
                stats["average_domain_variations"] = numpy.median(
                    domain_numbers)

        # rewritten after every file, so partial results survive a later failure
        with open('result.js', 'w') as h:
            h.write('var result = %s;' % json.dumps(res_object, indent=4))
Exemplo n.º 14
0
def _write_cell_or_notice(sheet, row, col, value, notice):
    """Write ``value`` into a sheet cell, or ``notice`` if the sheet rejects it."""
    try:
        sheet.write(row, col, value)
    except Exception:
        # presumably raised when the text exceeds the cell size limit of the
        # workbook library -- the notices passed in say as much
        sheet.write(row, col, notice)


def write(seq_records, options):
    """Write a per-cluster overview as a tab-separated file and an XLS sheet.

    Nothing is written for protein input (``options.input_type == 'prot'``).
    Otherwise ``geneclusters.txt`` is created in
    ``options.full_outputfolder_path`` with one line per cluster (record id,
    description, cluster type, gene ids, gene accessions), and the same data
    goes into a workbook saved as ``<id>.geneclusters.xls``, named after the
    last record in ``seq_records``. When ``seq_records`` is empty there is no
    id to name the workbook after, so only the (empty) text file is created.

    Arguments:
        seq_records: records whose cluster features are reported.
        options: run configuration; ``input_type``,
            ``full_outputfolder_path`` and ``knownclusterblast`` are read.
    """
    if options.input_type == 'prot':
        return
    #Open up TXT file and XLS record
    outfolder = options.full_outputfolder_path
    last_record = None
    with open(path.join(outfolder, "geneclusters.txt"), "w") as txtfile:
        wb = Workbook()
        font1 = Font()
        style1 = XFStyle()
        style1.font = font1
        font1.bold = True
        ws0 = wb.add_sheet('0')
        headers = (
            "Input accession number",
            "Input name",
            "Gene cluster type",
            "Gene cluster genes",
            "Gene cluster gene accessions",
        )
        for col, title in enumerate(headers):
            ws0.write(0, col, title, style1)
        if options.knownclusterblast:
            ws0.write(0, 5, "Compound with gene cluster of highest homology",
                      style1)
        #For each gene cluster, write out info
        row = 1  # row 0 holds the headers
        for seq_record in seq_records:
            last_record = seq_record
            for cluster in utils.get_cluster_features(seq_record):
                clustertype = utils.get_cluster_type(cluster)
                clusternr = utils.get_cluster_number(cluster)
                cluster_cds = utils.get_cluster_cds_features(
                    cluster, seq_record)
                clustergenes = [utils.get_gene_id(cds) for cds in cluster_cds]
                accessions = [utils.get_gene_acc(cds) for cds in cluster_cds]
                ws0.write(row, 0, seq_record.id)
                _write_cell_or_notice(
                    ws0, row, 1, seq_record.description,
                    "Name to long to be contained in Excel cell; see txt file in downloadable zip archive."
                )
                ws0.write(row, 2, clustertype)
                _write_cell_or_notice(
                    ws0, row, 3, ";".join(clustergenes),
                    "Too many genes to be contained in Excel cell; see txt file in downloadable zip archive."
                )
                _write_cell_or_notice(
                    ws0, row, 4, ";".join(accessions),
                    "Too many genes to be contained in Excel cell; see txt file in downloadable zip archive."
                )
                if hasattr(seq_record, 'closestcompounddict') and \
                   clusternr in seq_record.closestcompounddict:
                    ws0.write(row, 5,
                              seq_record.closestcompounddict[clusternr])
                row += 1
                txtfile.write("\t".join([
                    seq_record.id, seq_record.description, clustertype,
                    ";".join(clustergenes), ";".join(accessions)
                ]) + "\n")
    if last_record is not None:
        wb.save(path.join(outfolder,
                          "%s.geneclusters.xls" % last_record.id))
Exemplo n.º 15
0
def write_data_to_seq_record(pksnrpsvars, seq_record, options):
    #Save substrate specificity predictions in NRPS/PKS domain sec_met info of seq_record
    #
    # Workaround to extract positional information for CDS_motifs from the sec_met qualifiers

    for f in utils.get_cluster_features(seq_record):
	cluster_info = f.qualifiers

    for feature in pksnrpsvars.pksnrpscoregenes:
        nrat = 0
        nra = 0
        nrcal = 0
        nrkr = 0
        nrXdom = 0
        secmetqualifiers = feature.qualifiers['sec_met']
        updated_secmetqualifiers = []
        # BiosynML:creating object to add detailed substrate predictions
        updated_secmetqualifiers_predictions = []
        domainFeatures = []
        gene_id = utils.get_gene_id(feature)
        for qualifier in secmetqualifiers:
            if "NRPS/PKS Domain:" not in qualifier:
                updated_secmetqualifiers.append(qualifier)
                updated_secmetqualifiers_predictions.append(qualifier)
            else:
                # extract domain type, start and end position from qualifier string
                match_pos_obj = re.search("NRPS/PKS Domain: ([\w-]+) \((\d+)\-(\d+)\)\. E-value: ([\de\.-]+)\. Score: ([\de\.a-]+);", qualifier)
                if not match_pos_obj:
                    logging.exception("Exception: could not extract domain string from qualifier %s:" % qualifier)
                    sys.exit(1)
                domain_type = match_pos_obj.group(1)
                start_aa = int(match_pos_obj.group(2))
                end_aa = int(match_pos_obj.group(3))
                evalue = float(match_pos_obj.group(4))
                score = float (match_pos_obj.group(5))

                #calculate respective positions based on aa coordinates
                if feature.location.strand==1:
                    start = feature.location.start + ( 3 * start_aa )
                    end = feature.location.start + ( 3* end_aa )
                else:
                    end = feature.location.end - ( 3 * start_aa )
                    start = feature.location.end - ( 3 * end_aa)
                loc = FeatureLocation(start, end, strand=feature.strand)

                # set up new CDS_motif feature
                domainFeature = SeqFeature(loc, type=options.FeatureTags.pksnrpsdomains_tag)
                domainFeature.qualifiers['domain'] = [domain_type]
                if feature.qualifiers.has_key('locus_tag'):
                    domainFeature.qualifiers['locus_tag'] = feature.qualifiers['locus_tag']
                else:
                    domainFeature.qualifiers['locus_tag'] = [gene_id]
                domainFeature.qualifiers['detection'] = ["hmmscan"]
                domainFeature.qualifiers['database'] = ["nrpspksdomains.hmm"]
                domainFeature.qualifiers['evalue'] = [str("{:.2E}".format(float(evalue)))]
                domainFeature.qualifiers['score'] = [score]
                if feature.qualifiers.has_key('transl_table'):
                    [transl_table] = feature.qualifiers['transl_table']
                else:
                    transl_table = 1
                domainFeature.qualifiers['translation'] = [str(domainFeature.extract(seq_record).seq.translate(table=transl_table))]

                domainFeature_specificity = []

                if domain_type == "AMP-binding":
                    nra += 1
                    domainname = gene_id + "_A" + str(nra)
                    domainFeature.qualifiers['label'] = [domainname]
                    domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_"+domainname]
                    domainFeature_specificity.append("NRPSpredictor2 SVM: %s" % pksnrpsvars.nrps_svm_preds[domainname])
                    domainFeature_specificity.append("Stachelhaus code: %s" % pksnrpsvars.nrps_code_preds[domainname])
                    domainFeature_specificity.append("Minowa: %s" % pksnrpsvars.minowa_nrps_preds[domainname])
                    domainFeature_specificity.append("consensus: %s" % pksnrpsvars.consensuspreds[domainname])


                    newqualifier = qualifier + " NRPS/PKS Domain: %s; Substrate specificity predictions: %s (NRPSPredictor2 SVM), %s (Stachelhaus code), %s (Minowa), %s (consensus);" % (domainname, pksnrpsvars.nrps_svm_preds[domainname], pksnrpsvars.nrps_code_preds[domainname], pksnrpsvars.minowa_nrps_preds[domainname], pksnrpsvars.consensuspreds[domainname])
                    # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                    newqualifier_detailed = qualifier + " NRPS/PKS Domain: %s; Substrate specificity predictions: %s (NRPSPredictor2 SVM), %s (Stachelhaus code), %s (Minowa), %s (consensus);" % (domainname,pksnrpsvars.nrps_code_preds_details[domainname], pksnrpsvars.nrps_svm_preds_details[domainname],  pksnrpsvars.minowa_nrps_preds_details[domainname], pksnrpsvars.consensuspreds[domainname])
                    updated_secmetqualifiers.append(newqualifier)
                    updated_secmetqualifiers_predictions.append(newqualifier_detailed)
                elif domain_type == "PKS_AT":
                    nrat += 1
                    domainname = gene_id + "_AT" + str(nrat)
                    domainFeature.qualifiers['label'] = [domainname]
                    domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_"+domainname]
                    domainFeature_specificity.append("PKS signature: %s" % pksnrpsvars.pks_code_preds[domainname])
                    domainFeature_specificity.append("Minowa: %s" % pksnrpsvars.minowa_pks_preds[domainname])
                    #For t1pks, t2pks and t3pks
                    if 'transatpks' not in cluster_info['product'][0]:
                        domainFeature_specificity.append("consensus: %s" % pksnrpsvars.consensuspreds[domainname])
                        newqualifier = qualifier + " Substrate specificity predictions: %s (PKS signature), %s (Minowa), %s (consensus);" %(pksnrpsvars.pks_code_preds[domainname], pksnrpsvars.minowa_pks_preds[domainname], pksnrpsvars.consensuspreds[domainname])
                        # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                        newqualifier_detailed = qualifier + " Substrate specificity predictions: %s (PKS signature), %s (Minowa), %s (consensus);" %(pksnrpsvars.pks_code_preds_details[domainname], pksnrpsvars.minowa_pks_preds_details[domainname], pksnrpsvars.consensuspreds[domainname])
                        updated_secmetqualifiers.append(newqualifier)
                        updated_secmetqualifiers_predictions.append(newqualifier_detailed)
                    #For transatpks
                    elif 'transatpks' in cluster_info['product'][0]:
                        domainFeature_specificity.append("consensus: %s" % pksnrpsvars.consensuspreds_transat[domainname])
                        newqualifier = qualifier + " Substrate specificity predictions: %s (PKS signature), %s (Minowa), %s (consensus);" %(pksnrpsvars.pks_code_preds[domainname], pksnrpsvars.minowa_pks_preds[domainname], pksnrpsvars.consensuspreds_transat[domainname])
                        # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                        newqualifier_detailed = qualifier + " Substrate specificity predictions: %s (PKS signature), %s (Minowa), %s (consensus);" %(pksnrpsvars.pks_code_preds_details[domainname], pksnrpsvars.minowa_pks_preds_details[domainname], pksnrpsvars.consensuspreds_transat[domainname])

                        updated_secmetqualifiers.append(newqualifier)
                        updated_secmetqualifiers_predictions.append(newqualifier_detailed)
                elif domain_type == "CAL_domain":
                    nrcal += 1
                    domainname = gene_id + "_CAL" + str(nrcal)
                    domainFeature.qualifiers['label'] = [domainname]
                    domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_"+domainname]
                    domainFeature_specificity.append("Minowa: %s" % pksnrpsvars.minowa_cal_preds[domainname])
                    newqualifier = qualifier + " Substrate specificity predictions: %s (Minowa);" %(pksnrpsvars.minowa_cal_preds[domainname])
                    # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                    newqualifier_detailed = qualifier + " Substrate specificity predictions: %s (Minowa);" %(pksnrpsvars.minowa_cal_preds_details[domainname])
                    updated_secmetqualifiers.append(newqualifier)
                    updated_secmetqualifiers_predictions.append(newqualifier_detailed)
                elif domain_type == "PKS_KR":
                    nrkr += 1
                    domainname = gene_id + "_KR" + str(nrkr)
                    domainFeature.qualifiers['label'] = [domainname]
                    domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_"+domainname]
                    domainFeature_specificity.append("KR activity: %s" % pksnrpsvars.kr_activity_preds[domainname])
                    domainFeature_specificity.append("KR stereochemistry: %s" % pksnrpsvars.kr_stereo_preds[domainname])
                    newqualifier = qualifier + " Predicted KR activity: %s; Predicted KR stereochemistry: %s;" %(pksnrpsvars.kr_activity_preds[domainname], pksnrpsvars.kr_stereo_preds[domainname])
                    # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                    newqualifier_detailed = qualifier + " Predicted KR activity: %s; Predicted KR stereochemistry: %s;" %(pksnrpsvars.kr_activity_preds[domainname], pksnrpsvars.kr_stereo_preds[domainname])
                    updated_secmetqualifiers.append(newqualifier)
                    updated_secmetqualifiers_predictions.append(newqualifier_detailed)
                else:
                    nrXdom += 1
                    domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + gene_id.partition(".")[0] + "_Xdom"+'{:02d}'.format(nrXdom)]
                    updated_secmetqualifiers.append(qualifier)
                domainFeature.qualifiers['specificity'] = domainFeature_specificity
                if _map_domaintype(domain_type):
                    domainFeature.qualifiers['domain_subtype'] = [domain_type]
                    domainFeature.qualifiers['domain'] = [_map_domaintype(domain_type)]
                domainFeatures.append(domainFeature)

        feature.qualifiers['sec_met'] = updated_secmetqualifiers
        # BiosynML: creating new 'sec_met_predictions' qualifier
        #feature.qualifiers['sec_met_predictions'] = updated_secmetqualifiers_predictions
        seq_record.features.extend(domainFeatures)

        if pksnrpsvars.consensuspred_gene_dict.has_key(gene_id):
            feature.qualifiers[options.QualifierTags.product_prediction] = "-".join(pksnrpsvars.consensuspred_gene_dict[gene_id])

    #Save consensus structure + link to structure image to seq_record
    clusters = utils.get_cluster_features(seq_record)
    for cluster in clusters:
        clusternr = utils.get_cluster_number(cluster)
        if pksnrpsvars.compound_pred_dict.has_key(clusternr):
            structpred = pksnrpsvars.compound_pred_dict[clusternr]
            cluster.qualifiers['note'].append("Monomers prediction: " + structpred)
            cluster.qualifiers['note'].append("Structure image: structures/genecluster%s.png" % clusternr)
Exemplo n.º 16
0
def generate_substrates_order(genecluster, geneorder, pksnrpsvars, seq_record):
    """Build the monomer-order prediction string for one gene cluster.

    For each gene in ``geneorder`` the consensus predictions of its relevant
    domains are joined with "-" and wrapped in parentheses; the per-gene groups
    are joined with " + " (e.g. "(ala-gly) + (mal)"). Genes without any
    relevant domain contribute no group.

    Side effects:
        * ``pksnrpsvars.consensuspred_gene_dict[gene]`` is set to the list of
          consensus predictions of every gene in ``geneorder`` (an existing
          entry is overwritten after logging a warning).
        * ``pksnrpsvars.compound_pred_dict[genecluster]`` is set to the final
          prediction string.

    Args:
        genecluster: key under which the prediction string is stored.
        geneorder: iterable of gene identifiers, in predicted order.
        pksnrpsvars: object providing ``domainnamesdict``, ``consensuspreds``,
            ``consensuspred_gene_dict`` and ``compound_pred_dict``.
        seq_record: record whose cluster features decide whether the
            trans-AT PKS rules are applied.

    Raises:
        KeyError: if a gene is missing from ``domainnamesdict`` or an expected
            domain key is missing from ``consensuspreds``.
    """
    # NOTE(review): as before, the qualifiers of the LAST cluster feature of
    # the record are used, not those of `genecluster`. This mirrors
    # calculate_consensus_prediction(), which decides which keys exist in
    # consensuspreds, so the two must stay in sync.
    cluster_info = {}
    for cluster_feature in utils.get_cluster_features(seq_record):
        cluster_info = cluster_feature.qualifiers
    # A record without cluster features (or without a product qualifier) is
    # treated as "not trans-AT" instead of failing on an unbound name.
    products = cluster_info.get('product') or [""]
    is_transat = 'transatpks' in products[0]

    prediction = ""
    for gene in geneorder:
        # Open a new group unless an empty one is still pending.
        if not prediction or prediction[-1] != "(":
            prediction += "("

        # Per-gene running number of each domain kind; it forms the key suffix.
        domain_counts = {"_AT": 0, "_KS": 0, "_A": 0, "_CAL": 0}
        consensuspred_list = []

        for domain_name in pksnrpsvars.domainnamesdict[gene]:
            # Collect, in the original evaluation order, every key suffix this
            # domain name triggers (substring matches, not exclusive).
            suffixes = []
            if is_transat:
                if "PKS_KS" in domain_name:
                    suffixes.append("_KS")
            elif "PKS_AT" in domain_name:
                suffixes.append("_AT")
            if "AMP-binding" in domain_name or "A-OX" in domain_name:
                suffixes.append("_A")
            if "CAL_domain" in domain_name:
                suffixes.append("_CAL")

            for suffix in suffixes:
                if consensuspred_list:
                    prediction += "-"
                domain_counts[suffix] += 1
                monomer = pksnrpsvars.consensuspreds[
                    gene + suffix + str(domain_counts[suffix])]
                prediction += monomer
                consensuspred_list.append(monomer)

        if gene in pksnrpsvars.consensuspred_gene_dict:
            logging.warning(
                "WARNING: Consensus specificity prediction already defined for %s; possibly duplicate genename? Overwriting entries for %s",
                gene, gene)
        pksnrpsvars.consensuspred_gene_dict[gene] = consensuspred_list

        if prediction[-3:] == "+ (":
            # The gene added nothing: drop the group that was just opened.
            prediction = prediction[:-1]
        elif prediction[-1] != "(":
            prediction += ") + "

    # Strip the trailing " + " separator (or the lone "(" of an empty result).
    prediction = prediction[:-3]
    pksnrpsvars.compound_pred_dict[genecluster] = prediction
Exemplo n.º 17
0
def get_inter_cluster_relation(seq_records, geo_id):
    """Predict related pairs of gene clusters from per-gene distance data.

    A graph is built from the GEO annotations of all cluster genes that belong
    to ``geo_id``, partitioned into communities, and every pair of clusters is
    then checked for a shared, well-connected group of genes.

    Args:
        seq_records: iterable of records; clusters are numbered consecutively
            from 1 across all records, in iteration order.
        geo_id: only GEO entries whose ``'rec_id'`` equals this value are used.

    Returns:
        A list with one dict per accepted (cluster pair, gene group):
        ``{'source': {'id': i, 'links': [...]},
           'target': {'id': j, 'links': [...]},
           'links': [...]}``
        where ``i < j`` are cluster numbers, the nested ``links`` hold the
        edges inside each cluster and the top-level ``links`` the edges
        between the two clusters.
    """
    logging.debug('Calculating inter cluster relations on geo_record "%s"..',
                  geo_id)
    data = []
    full_g = nx.Graph()
    cluster_genes = {}
    bio_genes = set()
    cur_cluster1 = 0

    # First, inspect all clusters to fill cluster_genes and the full graph.
    for record in seq_records:
        for cluster in utils.get_cluster_features(record):
            cur_cluster1 += 1
            cluster_genes[cur_cluster1] = set()

            for cluster_gene in utils.get_cluster_cds_features(
                    cluster, record):
                # We only care about cluster genes that have a geo match ...
                for cluster_gene_geo in utils.parse_geo_feature(cluster_gene):
                    # ... and only about data from the current geo_id.
                    if cluster_gene_geo['rec_id'] != geo_id:
                        continue

                    cur_gene1 = utils.get_gene_id(cluster_gene)
                    # presumably a mapping gene id -> distance; only its
                    # iteration (the neighbour ids) is relied on here.
                    cur_gene1_distances = cluster_gene_geo['dist']
                    cur_gene1_neighbors = set(cur_gene1_distances)

                    # Register the gene in its cluster, in the full graph and,
                    # if it carries a sec_met qualifier, as biosynthetic gene.
                    cluster_genes[cur_cluster1].add(cur_gene1)
                    full_g.add_node(cur_gene1)
                    if 'sec_met' in cluster_gene.qualifiers:
                        bio_genes.add(cur_gene1)

                    # Intra-cluster edges.
                    interactions = cur_gene1_neighbors.intersection(
                        cluster_genes[cur_cluster1])
                    update_g(cur_gene1, interactions, cur_gene1_distances,
                             full_g)

                    # Inter-cluster edges are added backwards, i.e. 2-1, 3-1,
                    # 3-2, 4-1, ... For the first cluster the loop finds no
                    # other cluster, so no special case is needed. Cluster
                    # numbers are compared by value (==), not identity.
                    for cur_cluster2, other_genes in cluster_genes.items():
                        if cur_cluster2 != cur_cluster1:
                            interactions = cur_gene1_neighbors.intersection(
                                other_genes)
                            update_g(cur_gene1, interactions,
                                     cur_gene1_distances, full_g)

    # Remove single nodes. Iterate over a snapshot: the node container may be
    # a live view that must not change size during iteration.
    for node in list(full_g.nodes()):
        if full_g.degree(node) == 0:
            full_g.remove_node(node)

    # Get communities
    community_dict = community.best_partition(full_g)

    number_of_clusters = len(cluster_genes)

    # Now check inter-cluster interactions for every pair i < j.
    for i in range(1, number_of_clusters + 1):
        cluster1 = cluster_genes[i]

        for j in range(i + 1, number_of_clusters + 1):
            cluster2 = cluster_genes[j]
            cluster3 = cluster1.union(cluster2)

            cluster_pair_g = full_g.subgraph(cluster3)

            communities_present = np.unique(
                [community_dict[n] for n in cluster3 if n in community_dict])

            # CRITERION 1: only intra-community edges
            for cur_community in communities_present:
                cur_community_nodes = [
                    n for n in cluster3
                    if n in community_dict
                    and community_dict[n] == cur_community
                ]
                cur_community_g = cluster_pair_g.subgraph(cur_community_nodes)

                for cur_g in list(
                        nx.connected_component_subgraphs(cur_community_g)):
                    # CRITERION 2: no isolates; every node with a clustering
                    # coefficient of 0 is pruned.
                    clustering_coefficient = nx.clustering(cur_g)
                    pred_nodes = [
                        n for n in clustering_coefficient
                        if clustering_coefficient[n] > 0
                    ]
                    pred_g = cur_g.subgraph(pred_nodes)
                    pred_edges = pred_g.edges()

                    prediction = set(pred_g.nodes())
                    prediction_cluster1 = prediction.intersection(cluster1)
                    prediction_cluster2 = prediction.intersection(cluster2)

                    bio_prediction = prediction.intersection(bio_genes)
                    bio_prediction_cluster1 = prediction_cluster1.intersection(
                        bio_genes)
                    bio_prediction_cluster2 = prediction_cluster2.intersection(
                        bio_genes)

                    # CRITERION 3: at least 2 genes per cluster
                    # CRITERION 4: at least 3 biosynthetic genes overall
                    # CRITERION 5: at least 1 biosynthetic gene per cluster
                    if not (len(prediction_cluster1) >= 2
                            and len(prediction_cluster2) >= 2
                            and len(bio_prediction_cluster1) >= 1
                            and len(bio_prediction_cluster2) >= 1
                            and len(bio_prediction) >= 3):
                        continue

                    pred_edges1 = [
                        n for n in pred_edges
                        if n[0] in cluster1 and n[1] in cluster1
                    ]
                    pred_edges2 = [
                        n for n in pred_edges
                        if n[0] in cluster2 and n[1] in cluster2
                    ]
                    # Edges crossing the pair, in either orientation.
                    pred_edges12 = [
                        n for n in pred_edges
                        if n[0] in cluster1 and n[1] in cluster2
                    ]
                    pred_edges21 = [
                        n for n in pred_edges
                        if n[0] in cluster2 and n[1] in cluster1
                    ]

                    data.append({
                        'source': {'id': i, 'links': pred_edges1},
                        'target': {'id': j, 'links': pred_edges2},
                        'links': pred_edges12 + pred_edges21,
                    })
    return data
Exemplo n.º 18
0
 def test_get_cluster_features(self):
     """get_cluster_features() must equal the list of all "cluster" features."""
     # Left: function under test; right: generic lookup used as reference.
     self.assertListEqual(
         utils.get_cluster_features(self.rec),
         utils.get_all_features_of_type(self.rec, "cluster"))
Exemplo n.º 19
0
def calculate_consensus_prediction(pksnrpsvars, seq_record):
    """Combine substrate specificity predictions into consensus predictions.

    Resets and fills ``pksnrpsvars.consensuspreds`` (and, for trans-AT PKS
    records, ``pksnrpsvars.consensuspreds_transat``). Keys have the form
    ``<gene id>_<AT|KS|A|CAL><n>`` where ``n`` counts the domains of that kind
    within the gene, starting at 1.

    Rules per domain type (first element of each ``domaindict`` entry):
        * ``PKS_AT``: if the Minowa and PKS-signature predictions agree on a
          monomer with a known SMILES part, that monomer; otherwise "pk".
          Stored in ``consensuspreds_transat`` for trans-AT records and in
          ``consensuspreds`` otherwise.
        * ``PKS_KS`` (trans-AT records only): always "mal", stored in
          ``consensuspreds`` for the structure display.
        * ``AMP-binding`` / ``A-OX``: the SANDPUMA result, or "nrp" when that
          result is "no_call".
        * ``CAL_domain``: the Minowa prediction if it has a known SMILES part,
          otherwise "pk".

    Args:
        pksnrpsvars: object providing ``pksnrpscoregenes``, ``domaindict``,
            ``minowa_pks_preds``, ``pks_code_preds``, ``sandpuma_res`` and
            ``minowa_cal_preds``.
        seq_record: record whose cluster features decide whether the
            trans-AT PKS rules are applied.

    Raises:
        KeyError: if a gene or an expected domain key is missing from one of
            the prediction dictionaries.
    """
    pksnrpsvars.consensuspreds = {}
    pksnrpsvars.consensuspreds_transat = {}
    # Monomers for which a SMILES building block is available. A frozenset
    # gives constant-time membership tests (predictions are assumed to be
    # strings, hence hashable).
    available_smiles_parts = frozenset([
        'GLY', 'ALA', 'VAL', 'LEU', 'ILE', 'MET', 'PRO', 'PHE', 'TRP', 'SER',
        'THR', 'ASN', 'GLN', 'TYR', 'CYS', 'LYS', 'ARG', 'HIS', 'ASP', 'GLU',
        'MPRO', 'ORN', 'PGLY', 'DAB', 'BALA', 'AEO', 'DHA', 'PIP', 'BMT',
        'gly', 'ala', 'val', 'leu', 'ile', 'met', 'pro', 'phe', 'trp', 'ser',
        'thr', 'asn', 'gln', 'tyr', 'cys', 'lys', 'arg', 'his', 'asp', 'glu',
        'aaa', 'mpro', 'dhb', '2hiva', 'orn', 'pgly', 'dab', 'bala', 'aeo',
        '4mha', 'pico', 'phg', 'dha', 'scy', 'pip', 'bmt', 'adds', 'aad',
        'abu', 'hiv', 'dhpg', 'bht', '3-me-glu', '4pPro', 'ala-b', 'ala-d',
        'dht', 'Sal', 'tcl', 'lys-b', 'hpg', 'hyv-d', 'iva', 'vol', 'mal',
        'mmal', 'ohmal', 'redmal', 'mxmal', 'emal', 'nrp', 'pk', 'Gly', 'Ala',
        'Val', 'Leu', 'Ile', 'Met', 'Pro', 'Phe', 'Trp', 'Ser', 'Thr', 'Asn',
        'Gln', 'Tyr', 'Cys', 'Lys', 'Arg', 'His', 'Asp', 'Glu', 'Mpro',
        '23Dhb', '34Dhb', '2Hiva', 'Orn', 'Pgly', 'Dab', 'Bala', 'Aeo', '4Mha',
        'Pico', 'Aaa', 'Dha', 'Scy', 'Pip', 'Bmt', 'Adds', 'DHpg', 'DHB',
    ])

    # Extracting gene cluster type (e.g., "transatpks").
    # NOTE(review): as before, the LAST cluster feature of the record decides;
    # generate_substrates_order() relies on the same choice.
    cluster_info = {}
    for cluster_feature in utils.get_cluster_features(seq_record):
        cluster_info = cluster_feature.qualifiers
    # A record without cluster features (or without a product qualifier) is
    # treated as "not trans-AT" instead of failing on an unbound name.
    products = cluster_info.get('product') or [""]
    is_transat = 'transatpks' in products[0]

    # AT consensus of trans-AT records is only kept for writing to the record.
    if is_transat:
        at_consensus = pksnrpsvars.consensuspreds_transat
    else:
        at_consensus = pksnrpsvars.consensuspreds

    for feature in pksnrpsvars.pksnrpscoregenes:
        locus = utils.get_gene_id(feature)
        nra = 0
        nrat = 0
        nrcal = 0
        nrtransat = 0

        for domain in pksnrpsvars.domaindict[locus]:
            domain_type = domain[0]

            if domain_type == "PKS_AT":
                nrat += 1
                key = locus + "_AT" + str(nrat)
                minowa_pred = pksnrpsvars.minowa_pks_preds[key]
                code_pred = pksnrpsvars.pks_code_preds[key]
                # Consensus only if both methods agree on a drawable monomer.
                if (minowa_pred == code_pred
                        and minowa_pred in available_smiles_parts):
                    at_consensus[key] = minowa_pred
                else:
                    at_consensus[key] = "pk"

            elif domain_type == "PKS_KS":
                # For chemical display purpose for chemicals from trans-AT
                # PKS gene clusters: mal is always assumed for trans-AT.
                if is_transat:
                    nrtransat += 1
                    pksnrpsvars.consensuspreds[
                        locus + "_KS" + str(nrtransat)] = "mal"

            elif domain_type == "AMP-binding" or domain_type == "A-OX":
                nra += 1
                key = locus + "_A" + str(nra)
                sandpuma_pred = pksnrpsvars.sandpuma_res[key]
                if sandpuma_pred == "no_call":
                    pksnrpsvars.consensuspreds[key] = "nrp"
                else:
                    pksnrpsvars.consensuspreds[key] = sandpuma_pred

            elif domain_type == "CAL_domain":
                nrcal += 1
                key = locus + "_CAL" + str(nrcal)
                cal_pred = pksnrpsvars.minowa_cal_preds[key]
                if cal_pred in available_smiles_parts:
                    pksnrpsvars.consensuspreds[key] = cal_pred
                else:
                    pksnrpsvars.consensuspreds[key] = "pk"
Exemplo n.º 20
0
def fix_hybrid_clusters(seq_record):
    """Put the "-"-separated parts of each cluster's first product in sorted order.

    The first entry of every cluster feature's 'product' qualifier is rewritten
    in place.
    """
    for cluster_feature in utils.get_cluster_features(seq_record):
        products = cluster_feature.qualifiers['product']
        products[0] = "-".join(sorted(products[0].split("-")))
Exemplo n.º 21
0
def generate_chemical_structure_preds(pksnrpsvars, seq_record, options):
    """Turn monomer predictions into SMILES strings and structure images.

    For every cluster feature of ``seq_record``:
        * if ``pksnrpsvars.compound_pred_dict`` holds a prediction for the
          cluster number, the monomers are translated to SMILES via
          aaSMILES.txt and, when there is more than one residue, depicted;
        * otherwise, if the cluster type is "ectoine", the fixed ectoine
          SMILES is depicted and "ectoine" is recorded as the prediction;
        * in all cases ``_update_sec_met_entry`` is called with the (possibly
          empty) SMILES string.

    Cluster numbers whose depiction reports "failed" are appended to
    ``pksnrpsvars.failedstructures``.

    Args:
        pksnrpsvars: object providing ``compound_pred_dict`` and
            ``failedstructures``.
        seq_record: record whose cluster features are processed.
        options: configuration object; ``outputfoldername`` is read and
            ``structuresfolder`` is set to the absolute output directory.
    """
    #Create directory to store structures
    options.structuresfolder = path.abspath(path.join(options.outputfoldername, "structures"))
    if not os.path.exists(options.structuresfolder):
        os.mkdir(options.structuresfolder)

    # Monomer -> SMILES table; read from disk only once, when first needed.
    aa_smiles_dict = None

    #Combine predictions into a prediction of the final chemical structure and generate images
    for genecluster in utils.get_cluster_features(seq_record):
        geneclusternr = utils.get_cluster_number(genecluster)
        smiles_string = ""
        if geneclusternr in pksnrpsvars.compound_pred_dict:
            # Flatten "(a-b) + (c)" into the residue list ["a", "b", "c"].
            residues = pksnrpsvars.compound_pred_dict[geneclusternr].replace("(","").replace(")","").replace(" + "," ").replace("-"," ")

            #Now generates SMILES of predicted secondary metabolites without NP.searcher
            residuesList = residues.split(" ")

            #Counts the number of malonate and its derivatives in polyketides
            mal_count = sum(1 for residue in residuesList if "mal" in residue)

            # Taken before the list is adjusted below, as in the original.
            nrresidues = len(residuesList)

            #Reflecting reduction states of ketide groups starting at beta carbon of type 1 polyketide
            if "pk" in residuesList and "mal" in residuesList[-1]:
                residuesList.pop(residuesList.index('pk')+1)
                residuesList.append('pks-end1')
            elif mal_count == len(residuesList):
                if residuesList[0] == "mal":
                    residuesList[0] = "pks-start1"
                if residuesList[-1] == "ccmal":
                    residuesList.append('pks-end2')

            if nrresidues > 1:
                #Conventionally used aaSMILES was used;
                #chirality expressed with "@@" causes indigo error
                if aa_smiles_dict is None:
                    aa_smiles_dict = _load_monomer_smiles()

                # Residues without a SMILES entry are silently skipped.
                smiles_string = "".join(aa_smiles_dict[monomer]
                                        for monomer in residuesList
                                        if monomer in aa_smiles_dict)
                logging.debug("Cluster %s: smiles_string: %s", geneclusternr, smiles_string)
                depictstatus = _depict_cluster_smiles(
                    geneclusternr, smiles_string, options.structuresfolder)
                if depictstatus == "failed":
                    pksnrpsvars.failedstructures.append(geneclusternr)
        elif utils.get_cluster_type(genecluster) == "ectoine":
            smiles_string = "CC1=NCCC(N1)C(=O)O"
            depictstatus = _depict_cluster_smiles(
                geneclusternr, smiles_string, options.structuresfolder)
            if depictstatus == "failed":
                pksnrpsvars.failedstructures.append(geneclusternr)
            elif geneclusternr in pksnrpsvars.failedstructures:
                # failedstructures stores cluster NUMBERS, so the membership
                # test must use the number as well (it used the feature
                # object before and therefore never matched).
                pksnrpsvars.failedstructures.remove(geneclusternr)
            pksnrpsvars.compound_pred_dict[geneclusternr] = "ectoine"
        _update_sec_met_entry(genecluster, smiles_string)


def _load_monomer_smiles():
    """Read aaSMILES.txt (located next to this module) into a dict.

    The first line of the file is skipped; of every following line with at
    least two whitespace-separated fields, the first field becomes the key
    and the second the SMILES value.
    """
    smiles_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                               'aaSMILES.txt')
    aa_smiles_dict = {}
    with open(smiles_path, 'r') as smiles_monomer:
        next(smiles_monomer, None)  # skip the first line (presumably a header)
        for line in smiles_monomer:
            fields = line.split()
            if len(fields) > 1:
                aa_smiles_dict[fields[0]] = fields[1]
    return aa_smiles_dict


def _depict_cluster_smiles(geneclusternr, smiles_string, structuresfolder):
    """Write genecluster<nr>.smi in a temporary directory and depict it.

    Returns whatever ``depict_smile`` returns (callers compare it to "failed").
    """
    with TemporaryDirectory(change=True):
        with open("genecluster" + str(geneclusternr) + ".smi", "w") as smilesfile:
            smilesfile.write(smiles_string)
        return depict_smile(geneclusternr, structuresfolder)
Exemplo n.º 22
0
def annotate_geneclusters(seq_record, options):
    """Re-annotate gene clusters in the seq_record using ClusterFinder results.

    Every ClusterFinder (cf) cluster is compared with the existing cluster
    features:

    * Overlapping, ``options.borderpredict`` set: if the midpoint of the
      existing cluster lies inside the cf cluster, the cf cluster is widened
      to contain all signature genes of the existing cluster and appended to
      the record as a "cluster_border" feature.
    * Overlapping, ``options.borderpredict`` not set: the existing cluster is
      extended to cover the cf cluster wherever the latter reaches further.
    * In both overlapping cases the existing cluster receives the cf
      probability as 'probability' qualifier.
    * Not overlapping any cluster: a new "cluster" feature is created, unless
      ``options.borderpredict_only`` is present and true.

    If new clusters were added, ``renumber_clusters`` is called.

    Args:
        seq_record: record to annotate; its feature list is modified in place.
        options: configuration object providing ``borderpredict`` and,
            optionally, ``borderpredict_only``.
    """
    pfam_features = utils.get_pfam_features(seq_record)
    cf_clusters = find_cf_clusters(pfam_features, seq_record, options)
    # Integrate ClusterFinder clusters with existing cluster features
    newclusters = []
    cluster_features = utils.get_cluster_features(seq_record)
    secmet_cds_features = utils.get_secmet_cds_features(seq_record)

    for cf_cluster in cf_clusters:
        overlaps = False
        for cluster in cluster_features:
            if not utils.features_overlap(cf_cluster, cluster):
                continue

            overlaps = True

            # Predict gene cluster borders using ClusterFinder
            if options.borderpredict:
                # Floor division keeps the midpoint an integer position, which
                # the location membership test requires, regardless of
                # whether true division is in effect.
                midpoint = (cluster.location.end + cluster.location.start) // 2
                if midpoint in cf_cluster.location:
                    # Signature genes are only needed on this path, so they
                    # are looked up here rather than for every overlap.
                    features_in_cluster = utils.get_cluster_cds_features(
                        cluster, seq_record)
                    # Make sure that antiSMASH signature genes are still
                    # included in the cluster
                    for sig_gene in secmet_cds_features:
                        if sig_gene not in features_in_cluster:
                            continue
                        startpoint = min(
                            [sig_gene.location.start, sig_gene.location.end])
                        endpoint = max(
                            [sig_gene.location.start, sig_gene.location.end])
                        if cf_cluster.location.start > startpoint:
                            cf_cluster.location = FeatureLocation(
                                startpoint, cf_cluster.location.end)
                        if cf_cluster.location.end < endpoint:
                            cf_cluster.location = FeatureLocation(
                                cf_cluster.location.start, endpoint)
                    cluster_border = SeqFeature(cf_cluster.location,
                                                type="cluster_border")
                    cluster_border.qualifiers = {
                        "tool": ["clusterfinder"],
                        "probability": [cf_cluster.probability],
                        "note": ["best prediction"],
                    }
                    seq_record.features.append(cluster_border)
            elif cf_cluster.location.start < cluster.location.start and cf_cluster.location.end > cluster.location.end:
                cluster.location = cf_cluster.location
            elif cf_cluster.location.start < cluster.location.start:
                cluster.location = FeatureLocation(cf_cluster.location.start,
                                                   cluster.location.end)
            elif cf_cluster.location.end > cluster.location.end:
                cluster.location = FeatureLocation(cluster.location.start,
                                                   cf_cluster.location.end)
            cluster.qualifiers['probability'] = [
                "%01.4f" % cf_cluster.probability
            ]

        if not overlaps and not ('borderpredict_only' in options
                                 and options.borderpredict_only):
            cf_type = _classify_cf_cluster(
                utils.get_cluster_cds_features(cf_cluster, seq_record))
            new_cluster = SeqFeature(cf_cluster.location, type="cluster")
            new_cluster.qualifiers['product'] = [cf_type]
            new_cluster.qualifiers['probability'] = [
                "%01.4f" % cf_cluster.probability
            ]
            newclusters.append(new_cluster)

    if newclusters:
        seq_record.features.extend(newclusters)
        renumber_clusters(seq_record, options)


def _classify_cf_cluster(cds_features):
    """Derive the product type of a new ClusterFinder cluster from its CDSs.

    Starts at "cf_putative" and is refined by the "Type: " entries of the
    'sec_met' qualifiers: fatty-acid hits give "cf_fatty_acid", saccharide
    hits "cf_saccharide", and both together "cf_fatty_acid-saccharide".
    """
    cf_type = "cf_putative"
    for cds in cds_features:
        if 'sec_met' not in cds.qualifiers:
            continue
        for qualifier in cds.qualifiers['sec_met']:
            if "Type: " not in qualifier:
                continue
            if "cf_fatty_acid" in qualifier:
                if cf_type == "cf_putative":
                    cf_type = "cf_fatty_acid"
                elif cf_type == "cf_saccharide":
                    cf_type = "cf_fatty_acid-saccharide"
            # Deliberately not an elif: one qualifier may name both types.
            if "cf_saccharide" in qualifier:
                if cf_type == "cf_putative":
                    cf_type = "cf_saccharide"
                elif cf_type == "cf_fatty_acid":
                    cf_type = "cf_fatty_acid-saccharide"
    return cf_type