def write(seq_records, options):
    """Write all cluster proteins to a file

    Args:
        seq_records (iterable): An iterable containing Bio.SeqRecords
        options (argparse.Namespace): The options passed to the program
    """
    # Output file is named after the first record's id
    basename = seq_records[0].id
    output_name = path.join(options.outputfoldername,
                            "%s_genecluster_proteins.fa" % basename)
    logging.debug("Writing seq_records to %r" % output_name)

    with open(output_name, 'w+') as handle:
        for seq_record in seq_records:
            for cluster in utils.get_cluster_features(seq_record):
                cluster_type = utils.get_cluster_type(cluster)
                cluster_num = utils.get_cluster_number(cluster)
                for cds in utils.get_cluster_cds_features(cluster, seq_record):
                    quals = cds.qualifiers
                    # FASTA header: >locus_tag:protein_id type #number - product
                    header = '>%s:%s %s #%s - %s\n' % (
                        quals['locus_tag'][0], quals['protein_id'][0],
                        cluster_type, cluster_num, quals['product'][0])
                    # wrap the translation to 60 characters per line
                    body = '\n'.join(textwrap.wrap(quals['translation'][0], 60))
                    handle.write(header)
                    handle.write('%s\n' % body)
def convert_clusters(record, annotations, options):
    """Convert cluster SeqFeatures to JSON

    Args:
        record (SeqRecord): record whose cluster features are converted
        annotations (dict): per-gene annotations forwarded to convert_cds_features
        options (argparse.Namespace): the options passed to the program

    Returns:
        list: one JSON-serializable dict per cluster
    """
    js_clusters = []
    # Hoisted out of the cluster loop: the record's misc_features do not
    # change per cluster, so compute them once instead of once per cluster.
    all_misc_features = utils.get_all_features_of_type(record, 'misc_feature')
    for cluster in utils.get_cluster_features(record):
        features = utils.get_cluster_cds_features(cluster, record)
        borders = utils.get_cluster_cluster_border_features(cluster, record)

        # Collect misc_features overlapping this cluster that are annotated
        # as TTA leucine codons.
        tta_codons = []
        for feature in all_misc_features:
            if not utils.features_overlap(cluster, feature):
                continue
            if 'note' not in feature.qualifiers:
                continue
            for note in feature.qualifiers['note']:
                if note.startswith('tta leucine codon'):
                    tta_codons.append(feature)
                    break

        js_cluster = {}
        # biopython locations are 0-based; JSON output is 1-based
        js_cluster['start'] = int(cluster.location.start) + 1
        js_cluster['end'] = int(cluster.location.end)
        js_cluster['idx'] = utils.get_cluster_number(cluster)
        js_cluster['orfs'] = convert_cds_features(record, features, annotations,
                                                 options)
        js_cluster['borders'] = convert_cluster_border_features(borders)
        js_cluster['tta_codons'] = convert_tta_codons(tta_codons)
        js_cluster['type'] = utils.get_cluster_type(cluster)
        if 'probability' in cluster.qualifiers:
            js_cluster['probability'] = cluster.qualifiers['probability'][0]
        if options.input_type == 'prot':
            js_cluster['unordered'] = True
        js_cluster['knowncluster'] = "-"
        js_cluster['BGCid'] = "-"

        if 'knownclusterblast' in cluster.qualifiers:
            knownclusters = cluster.qualifiers['knownclusterblast']
            # The best hit is the entry ranked '1.'; anything else means the
            # qualifier could not be parsed as expected.
            bestcluster = [
                kcluster for kcluster in knownclusters
                if kcluster.startswith('1.')
            ]
            if len(bestcluster) != 1:
                logging.warning(
                    "Error parsing best knowncluster hit; knownclusters array = %s. Possibly no significant hits to known biosynthetic gene clusters."
                    % str(knownclusters))
            else:
                # entries look like "<rank>. <BGC id>\t<compound description>"
                reObj = re.match(r'\d+\. (\S+)\t(.*)', bestcluster[0])
                js_cluster['knowncluster'] = reObj.group(2)
                js_cluster['BGCid'] = reObj.group(1)
                logging.debug('Found closest cluster "%s" for cluster no. %s' %
                              (js_cluster['knowncluster'],
                               utils.get_cluster_number(cluster)))
        js_clusters.append(js_cluster)

    return js_clusters
def retrieve_gene_cluster_annotations(seq_record, smcogdict, gtrcoglist,
                                      transportercoglist, geneclusternr):
    """Collect per-gene display data for one gene cluster.

    Args:
        seq_record (SeqRecord): record containing the cluster
        smcogdict (dict): gene id -> list of smCOG hits
        gtrcoglist (list): smCOG ids considered glycosyltransferases
        transportercoglist (list): smCOG ids considered transporters
        geneclusternr (int): number of the cluster to process

    Returns:
        tuple: (clustergenes, clustertype, annotations, colors, starts, ends,
                strands, pksnrpsprots, gtrs, transporters, clustersize)
    """
    allcoregenes = [
        utils.get_gene_id(cds)
        for cds in utils.get_secmet_cds_features(seq_record)
    ]
    pksnrpscoregenes = [
        utils.get_gene_id(cds)
        for cds in utils.get_pksnrps_cds_features(seq_record)
    ]
    feature_by_id = utils.get_feature_dict(seq_record)
    # Look up the cluster once instead of twice
    cluster = utils.get_cluster_by_nr(seq_record, geneclusternr)
    clustergenes = [
        utils.get_gene_id(cds)
        for cds in utils.get_cluster_cds_features(cluster, seq_record)
    ]
    clustertype = utils.get_cluster_type(cluster)
    annotations = {}
    colors = []
    starts = []
    ends = []
    strands = []
    pksnrpsprots = []
    gtrs = []
    transporters = []
    for gene_id in clustergenes:
        cdsfeature = feature_by_id[gene_id]
        # 'x in dict' instead of Py2-only dict.has_key(x)
        if 'product' in cdsfeature.qualifiers:
            annotations[gene_id] = cdsfeature.qualifiers['product'][0]
        else:
            annotations[gene_id] = 'Unannotated gene'
        starts.append(cdsfeature.location.start)
        ends.append(cdsfeature.location.end)
        strands.append("-" if cdsfeature.strand == -1 else "+")
        # Core secondary-metabolite genes get a highlight color
        colors.append("#810E15" if gene_id in allcoregenes else "grey")
        if gene_id in pksnrpscoregenes:
            pksnrpsprots.append(gene_id)
        if gene_id in smcogdict and len(smcogdict[gene_id]) > 0:
            # classify by the best (first) smCOG hit
            if smcogdict[gene_id][0] in gtrcoglist:
                gtrs.append(gene_id)
            if smcogdict[gene_id][0] in transportercoglist:
                transporters.append(gene_id)
    # Guard against a cluster without CDS features (max()/min() of empty
    # sequences would raise ValueError)
    clustersize = max(ends) - min(starts) if clustergenes else 0
    return clustergenes, clustertype, annotations, colors, starts, ends, strands, pksnrpsprots, gtrs, transporters, clustersize
def write(seq_records, options):
    """Export antiSMASH cluster information as per-record TXT tables.

    Writes one ``<record>_<table>.txt`` file per table name into a ``txt``
    subdirectory of the output folder, for every record that has clusters.

    Args:
        seq_records (iterable): An iterable containing Bio.SeqRecords
        options (argparse.Namespace): The options passed to the program
    """
    logging.debug("Exporting antiSMASH information as txt tables")
    # Don't store TXT tables for protein input
    if options.input_type == 'prot':
        return
    # Localize output folder, create TXT subdirectory
    txt_outfolder = path.join(options.full_outputfolder_path, "txt")
    if not os.path.exists(txt_outfolder):
        os.mkdir(txt_outfolder)
    # Define table names; write_tables must provide a write_<name> for each
    tables = ("genome", "BGC", "signature_gene_info", "gene", "NRPS_PKS",
              "smCOG", "RiPP", "transltable")
    # For each gene cluster, write out info to TXT files
    for seq_record in seq_records:
        if len(utils.get_cluster_features(seq_record)) == 0:
            continue
        # Open up TXT files, named after the record id (accession without
        # version suffix)
        txt_files = {}
        for table in tables:
            txt_files[table] = open(
                path.join(
                    txt_outfolder,
                    "%s_%s.txt" % (seq_record.id.partition(".")[0], table)),
                "w")
        try:
            # Gather all information
            info = utils.Storage()
            info.clustertypes, info.clustergenes, info.accessions, info.cdsmotifs, info.clusternrs = {}, {}, {}, {}, []
            clusters = utils.get_cluster_features(seq_record)
            for cluster in clusters:
                clusternr = utils.get_cluster_number(cluster)
                info.clusternrs.append(clusternr)
                info.clustertypes[clusternr] = utils.get_cluster_type(cluster)
                # fetch the cluster's CDS features once for both lists
                cds_features = utils.get_cluster_cds_features(cluster, seq_record)
                info.clustergenes[clusternr] = [
                    utils.get_gene_id(cds) for cds in cds_features
                ]
                info.accessions[clusternr] = [
                    utils.get_gene_acc(cds) for cds in cds_features
                ]
                info.cdsmotifs[clusternr] = utils.get_all_features_of_type(
                    seq_record, ["CDS_motif"])
            info.seq_record = seq_record
            # Write information to tables
            for table in tables:
                getattr(write_tables, 'write_' + table)(txt_files[table], info,
                                                        options)
        finally:
            # Always close the table files, even if a writer raises
            for table in tables:
                txt_files[table].close()
def convert_clusters(record, annotations, options):
    """Convert cluster SeqFeatures to JSON

    Args:
        record (SeqRecord): record whose cluster features are converted
        annotations (dict): per-gene annotations forwarded to convert_cds_features
        options (argparse.Namespace): the options passed to the program

    Returns:
        list: one JSON-serializable dict per cluster
    """
    js_clusters = []
    for cluster in utils.get_cluster_features(record):
        features = utils.get_cluster_cds_features(cluster, record)

        js_cluster = {}
        # biopython locations are 0-based; JSON output is 1-based
        js_cluster['start'] = int(cluster.location.start) + 1
        js_cluster['end'] = int(cluster.location.end)
        js_cluster['idx'] = utils.get_cluster_number(cluster)
        js_cluster['orfs'] = convert_cds_features(record, features, annotations,
                                                 options)
        js_cluster['type'] = utils.get_cluster_type(cluster)
        if options.coexpress:
            js_cluster["geo"] = utils.get_geotable_json(features)
        if 'probability' in cluster.qualifiers:
            js_cluster['probability'] = cluster.qualifiers['probability'][0]
        if options.input_type == 'prot':
            js_cluster['unordered'] = True
        js_cluster['knowncluster'] = "-"
        js_cluster['BGCid'] = "-"
        js_cluster['domains'] = utils.get_cluster_domains(cluster, record)
        if options.enable_cdhit:
            js_cluster['cdhitclusters'] = utils.get_cluster_cdhit_table(
                cluster, record)

        if 'knownclusterblast' in cluster.qualifiers:
            knownclusters = cluster.qualifiers['knownclusterblast']
            # The best hit is the entry ranked '1.'; anything else means the
            # qualifier could not be parsed as expected.
            bestcluster = [
                kcluster for kcluster in knownclusters
                if kcluster.startswith('1.')
            ]
            if len(bestcluster) != 1:
                logging.warning(
                    "Error parsing best knowncluster hit; knownclusters array = %s. Possibly no significant hits to known biosynthetic gene clusters."
                    % str(knownclusters))
            else:
                # entries look like "<rank>. <BGC id>\t<compound description>"
                reObj = re.match(r'\d+\. (\S+)\t(.*)', bestcluster[0])
                js_cluster['knowncluster'] = reObj.group(2)
                js_cluster['BGCid'] = reObj.group(1)
                logging.debug('Found closest cluster "%s" for cluster no. %s' %
                              (js_cluster['knowncluster'],
                               utils.get_cluster_number(cluster)))
        js_clusters.append(js_cluster)

    return js_clusters
def store_detection_details(results_by_id, rulesdict, seq_record):
    """Annotate each cluster feature with the detection rule(s) that fired.

    Appends a "Detection rule(s) for this cluster type: ..." entry to every
    cluster's 'note' qualifier, one rule per (hybrid) cluster type.

    Args:
        results_by_id (dict): unused here; kept for interface compatibility
        rulesdict (dict): cluster type -> (rule string, ...) tuples
        seq_record (SeqRecord): record whose cluster features are annotated
    """
    for cluster in utils.get_cluster_features(seq_record):
        # str.split('-') already returns a one-element list when there is no
        # '-', so hybrid and single cluster types are handled uniformly.
        clustertypes = utils.get_cluster_type(cluster).split('-')
        if 'note' not in cluster.qualifiers:
            cluster.qualifiers['note'] = []
        rule_string = "Detection rule(s) for this cluster type:"
        for clustertype in clustertypes:
            rule_string += " %s: (%s);" % (clustertype,
                                           rulesdict[clustertype][0])
        cluster.qualifiers['note'].append(rule_string)
def write(seq_records, options):
    """Write all cluster proteins to a file

    Args:
        seq_records (iterable): An iterable containing Bio.SeqRecords
        options (argparse.Namespace): The options passed to the program
    """
    # The output file is named after the id of the first record
    output_name = path.join(options.outputfoldername,
                            "%s_genecluster_proteins.fa" % seq_records[0].id)
    logging.debug("Writing seq_records to %r" % output_name)

    with open(output_name, 'w+') as handle:
        for record in seq_records:
            for cluster in utils.get_cluster_features(record):
                kind = utils.get_cluster_type(cluster)
                number = utils.get_cluster_number(cluster)
                for feature in utils.get_cluster_cds_features(cluster, record):
                    quals = feature.qualifiers
                    # header: >locus_tag:protein_id type #number - product
                    handle.write('>%s:%s %s #%s - %s\n' %
                                 (quals['locus_tag'][0],
                                  quals['protein_id'][0],
                                  kind, number, quals['product'][0]))
                    # translation wrapped at 60 characters per line
                    handle.write('%s\n' % '\n'.join(
                        textwrap.wrap(quals['translation'][0], 60)))
def generate_chemical_structure_preds(pksnrpsvars, seq_record, options):
    """Predict chemical structures for each gene cluster and draw them.

    Combines the per-cluster monomer predictions in
    ``pksnrpsvars.compound_pred_dict`` into a SMILES string, writes it to a
    ``genecluster<nr>.smi`` file in a temporary directory and calls
    depict_smile() to render an image into ``options.structuresfolder``.

    Args:
        pksnrpsvars (Storage): holds compound_pred_dict and failedstructures
        seq_record (SeqRecord): record whose clusters are processed
        options (argparse.Namespace): the options passed to the program
    """
    # Create directory to store structures
    options.structuresfolder = path.abspath(
        path.join(options.outputfoldername, "structures"))
    if not os.path.exists(options.structuresfolder):
        os.mkdir(options.structuresfolder)

    # Combine predictions into a prediction of the final chemical structure
    # and generate images
    geneclusters = utils.get_cluster_features(seq_record)
    for genecluster in geneclusters:
        geneclusternr = utils.get_cluster_number(genecluster)
        smiles_string = ""
        # 'in' instead of Py2-only dict.has_key()
        if geneclusternr in pksnrpsvars.compound_pred_dict:
            # Normalize the prediction string into space-separated residues
            residues = pksnrpsvars.compound_pred_dict[geneclusternr].replace(
                "(", "").replace(")", "").replace(" + ", " ").replace("-", " ")
            # Now generates SMILES of predicted secondary metabolites without
            # NP.searcher
            residuesList = residues.split(" ")

            # Counts the number of malonate and its derivatives in polyketides
            mal_count = 0
            for residue in residuesList:
                if "mal" in residue:
                    mal_count += 1

            nrresidues = len(residuesList)

            # Reflecting reduction states of ketide groups starting at the
            # beta carbon of type 1 polyketides
            if "pk" in residuesList and "mal" in residuesList[-1]:
                residuesList.pop(residuesList.index('pk') + 1)
                residuesList.append('pks-end1')
            elif mal_count == len(residuesList):
                if residuesList[0] == "mal":
                    residuesList[0] = "pks-start1"
                if residuesList[-1] == "ccmal":
                    residuesList.append('pks-end2')

            if nrresidues > 1:
                # Conventionally used aaSMILES was used;
                # chirality expressed with "@@" causes indigo error
                aa_smiles_dict = {}
                smiles_path = os.path.join(
                    os.path.dirname(os.path.realpath(__file__)), 'aaSMILES.txt')
                with open(smiles_path, 'r') as smiles_monomer:
                    # skip the header line
                    smiles_monomer.readline()
                    for line in smiles_monomer:
                        parts = line.split()
                        if len(parts) > 1:
                            aa_smiles_dict[parts[0].strip()] = parts[1].strip()

                for monomer in residuesList:
                    if monomer in aa_smiles_dict:
                        smiles_string += aa_smiles_dict[monomer]

                logging.debug("Cluster %s: smiles_string: %s", geneclusternr,
                              smiles_string)
                with TemporaryDirectory(change=True):
                    smilesfile = open(
                        "genecluster" + str(geneclusternr) + ".smi", "w")
                    smilesfile.write(smiles_string)
                    smilesfile.close()
                    depictstatus = depict_smile(geneclusternr,
                                                options.structuresfolder)
                    if depictstatus == "failed":
                        pksnrpsvars.failedstructures.append(geneclusternr)
        elif utils.get_cluster_type(genecluster) == "ectoine":
            # Ectoine clusters have a fixed, known product structure
            smiles_string = "CC1=NCCC(N1)C(=O)O"
            with TemporaryDirectory(change=True):
                smilesfile = open("genecluster" + str(geneclusternr) + ".smi",
                                  "w")
                smilesfile.write(smiles_string)
                smilesfile.close()
                depictstatus = depict_smile(geneclusternr,
                                            options.structuresfolder)
                if depictstatus == "failed":
                    pksnrpsvars.failedstructures.append(geneclusternr)
                # BUGFIX: failedstructures holds cluster *numbers*, so the
                # membership test must use geneclusternr (the original tested
                # the SeqFeature 'genecluster', which could never match).
                elif geneclusternr in pksnrpsvars.failedstructures:
                    pksnrpsvars.failedstructures.remove(geneclusternr)
                pksnrpsvars.compound_pred_dict[geneclusternr] = "ectoine"
        _update_sec_met_entry(genecluster, smiles_string)
def write(seq_records, options):
    """Write a per-cluster overview as geneclusters.txt and an XLS workbook.

    For every cluster of every record, one row is written to the shared
    ``geneclusters.txt`` and to sheet '0' of a per-record
    ``<id>.geneclusters.xls`` workbook.

    Args:
        seq_records (iterable): An iterable containing Bio.SeqRecords
        options (argparse.Namespace): The options passed to the program
    """
    # No tabular output for protein input
    if options.input_type == 'prot':
        return
    # Open up TXT file and XLS record
    outfolder = options.full_outputfolder_path
    txtfile = open(path.join(outfolder, "geneclusters.txt"), "w")
    try:
        wb = Workbook()
        font1 = Font()
        style1 = XFStyle()
        style1.font = font1
        font1.bold = True
        ws0 = wb.add_sheet('0')
        ws0.write(0, 0, "Input accession number", style1)
        ws0.write(0, 1, "Input name", style1)
        ws0.write(0, 2, "Gene cluster type", style1)
        ws0.write(0, 3, "Gene cluster genes", style1)
        ws0.write(0, 4, "Gene cluster gene accessions", style1)
        if options.knownclusterblast:
            ws0.write(0, 5, "Compound with gene cluster of highest homology",
                      style1)
        # For each gene cluster, write out info
        column = 1
        for seq_record in seq_records:
            clusters = utils.get_cluster_features(seq_record)
            for cluster in clusters:
                clustertype = utils.get_cluster_type(cluster)
                clusternr = utils.get_cluster_number(cluster)
                # fetch the cluster's CDS features once for both lists
                cds_features = utils.get_cluster_cds_features(cluster, seq_record)
                clustergenes = [utils.get_gene_id(cds) for cds in cds_features]
                accessions = [utils.get_gene_acc(cds) for cds in cds_features]
                ws0.write(column, 0, seq_record.id)
                # xlwt raises when a cell value exceeds Excel's limits; fall
                # back to a pointer at the full TXT output in that case.
                try:
                    ws0.write(column, 1, seq_record.description)
                except Exception:
                    ws0.write(
                        column, 1,
                        "Name too long to be contained in Excel cell; see txt file in downloadable zip archive."
                    )
                ws0.write(column, 2, clustertype)
                try:
                    ws0.write(column, 3, ";".join(clustergenes))
                except Exception:
                    ws0.write(
                        column, 3,
                        "Too many genes to be contained in Excel cell; see txt file in downloadable zip archive."
                    )
                try:
                    ws0.write(column, 4, ";".join(accessions))
                except Exception:
                    ws0.write(
                        column, 4,
                        "Too many genes to be contained in Excel cell; see txt file in downloadable zip archive."
                    )
                if hasattr(seq_record, 'closestcompounddict') and \
                        clusternr in seq_record.closestcompounddict:
                    ws0.write(column, 5,
                              seq_record.closestcompounddict[clusternr])
                column += 1
                txtfile.write("\t".join([
                    seq_record.id, seq_record.description, clustertype,
                    ";".join(clustergenes), ";".join(accessions)
                ]) + "\n")
            wb.save(path.join(outfolder,
                              "%s.geneclusters.xls" % seq_record.id))
    finally:
        # ensure the TXT handle is released even if xlwt raises
        txtfile.close()
def main():
    """Mockup antiSMASH run over a list of genome files (Python 2 script).

    Reads one genome file path per line from the file given as sys.argv[1],
    runs the hmm_detection cluster-finding pipeline on each record, collects
    per-record statistics into res_object and dumps everything as
    ``var result = ...;`` into result.js.
    """
    multiprocessing.freeze_support()
    res_object = {}
    # get genome files: one path per line in the file given as argv[1]
    files = []
    for line in open(sys.argv[1], 'r'):
        files.append(path.expanduser(line.replace("\n", "")))
    # mockup antismash run per files
    i = 1
    for fpath in files:
        res_object[fpath] = {}
        # NOTE: Python 2 print statement — this whole module is Python 2
        print "Processing %s... (%d/%d)" % (fpath, i, len(files))
        i += 1
        options = get_mockup_config()
        options.sequences = [fpath]
        config.set_config(options)
        run_antismash.setup_logging(
            options)  #To-DO: get antismash logging to works!
        # load plugins
        plugins = run_antismash.load_detection_plugins()
        run_antismash.filter_plugins(plugins, options,
                                     options.enabled_cluster_types)
        # parse to seq_records
        seq_records = run_antismash.parse_input_sequences(options)
        options.next_clusternr = 1
        for seq_record in seq_records:
            if options.input_type == 'nucl':
                # drop records shorter than 1 kb
                # NOTE(review): rebinding seq_records while iterating it looks
                # suspicious — verify the intended filtering semantics.
                seq_records = [
                    record for record in seq_records if len(record.seq) > 1000
                ]
                if len(seq_records) < 1:
                    continue
            utils.sort_features(seq_record)
            run_antismash.strip_record(seq_record)
            utils.fix_record_name_id(seq_record, options)
            # fetch results_by_id: pre-computed HMM hits keyed by
            # "<record id>:<gene id>" in options.hmm_results
            feature_by_id = utils.get_feature_dict(seq_record)
            results = []
            results_by_id = {}
            for feature in utils.get_cds_features(seq_record):
                prefix = "%s:" % seq_record.id.replace(":", "_")
                gene_id = utils.get_gene_id(feature)
                if (prefix + gene_id) in options.hmm_results:
                    results_by_id[gene_id] = options.hmm_results[prefix +
                                                                 gene_id]
                    for res in results_by_id[gene_id]:
                        results.append(res)
            # ignore short aa's
            min_length_aa = 100
            short_cds_buffer = []
            for f in seq_record.features:
                # temporarily remove short aa
                # NOTE(review): removing from seq_record.features while
                # iterating it can skip elements — confirm this is acceptable.
                if f.type == "CDS" and len(
                        f.qualifiers['translation']
                        [0]) < min_length_aa and not results_by_id.has_key(
                            utils.get_gene_id(f)):
                    short_cds_buffer.append(f)
                    seq_record.features.remove(f)
            overlaps = utils.get_overlaps_table(seq_record)
            rulesdict = hmm_detection.create_rules_dict(
                options.enabled_cluster_types)
            # find total cdhit numbers in the chromosome
            total_cdhit = len(
                utils.get_cdhit_table(utils.get_cds_features(seq_record))[0])
            # per-record statistics accumulated for result.js
            res_object[fpath][seq_record.id] = {
                "total_clusters": 0,
                "total_genes": len(overlaps[0]),
                "total_cdhit": total_cdhit,
                "genes_with_hits": 0,
                "largest_cdhit": 0,
                "largest_domain_variations": 0,
                "per_hits": {},
                "cluster_types": {}
            }
            # filter overlap hits
            results, results_by_id = hmm_detection.filter_results(
                results, results_by_id, overlaps, feature_by_id)
            # count hits per gene and per HMM domain
            for gene_id in results_by_id:
                res_gene = results_by_id[gene_id]
                if len(res_gene) > 0:
                    res_object[fpath][seq_record.id]["genes_with_hits"] += 1
                for hsp in res_gene:
                    # strip the profile subdirectory prefix from the domain id
                    domain_name = hsp.query_id.replace("plants/", "")
                    if domain_name not in res_object[fpath][
                            seq_record.id]["per_hits"]:
                        res_object[fpath][
                            seq_record.id]["per_hits"][domain_name] = 0
                    res_object[fpath][
                        seq_record.id]["per_hits"][domain_name] += 1
            # do cluster finding algorithm
            typedict = hmm_detection.apply_cluster_rules(
                results_by_id, feature_by_id, options.enabled_cluster_types,
                rulesdict, overlaps)
            hmm_detection.fix_hybrid_clusters_typedict(typedict)
            nseqdict = hmm_detection.get_nseq()
            for cds in results_by_id.keys():
                feature = feature_by_id[cds]
                if typedict[cds] != "none":
                    hmm_detection._update_sec_met_entry(
                        feature, results_by_id[cds], typedict[cds], nseqdict)
            hmm_detection.find_clusters(seq_record, rulesdict, overlaps)
            # restore the temporarily removed short CDS features
            seq_record.features.extend(short_cds_buffer)
            res_object[fpath][seq_record.id]["total_clusters"] += len(
                utils.get_cluster_features(seq_record))
            # do cluster specific and unspecific analysis
            if len(utils.get_cluster_features(seq_record)) > 0:
                run_antismash.cluster_specific_analysis(
                    plugins, seq_record, options)
            run_antismash.unspecific_analysis(seq_record, options)
            #Rearrange hybrid clusters name alphabetically
            hmm_detection.fix_hybrid_clusters(seq_record)
            #before writing to output, remove all hmm_detection's subdir prefixes from clustertype
            for cluster in utils.get_cluster_features(seq_record):
                prod_names = []
                for prod in cluster.qualifiers['product']:
                    prod_name = []
                    for name in prod.split('-'):
                        prod_name.append(name.split('/')[-1])
                    prod_names.append("-".join(prod_name))
                cluster.qualifiers['product'] = prod_names
            # strip the same subdir prefixes from CDS sec_met qualifiers
            for cds in utils.get_cds_features(seq_record):
                if 'sec_met' in cds.qualifiers:
                    temp_qual = []
                    for row in cds.qualifiers['sec_met']:
                        if row.startswith('Type: '):
                            clustertypes = [
                                (ct.split('/')[-1])
                                for ct in row.split('Type: ')[-1].split('-')
                            ]
                            temp_qual.append('Type: ' + "-".join(clustertypes))
                        elif row.startswith('Domains detected: '):
                            cluster_results = []
                            for cluster_result in row.split(
                                    'Domains detected: ')[-1].split(';'):
                                cluster_results.append(
                                    cluster_result.split(' (E-value')[0].split(
                                        '/')[-1] + ' (E-value' +
                                    cluster_result.split(' (E-value')[-1])
                            temp_qual.append('Domains detected: ' +
                                             ";".join(cluster_results))
                        else:
                            temp_qual.append(row)
                    cds.qualifiers['sec_met'] = temp_qual
            #on plants, remove plant clustertype from hybrid types, and replace single
            #plant clustertype with "putative"
            for cluster in utils.get_cluster_features(seq_record):
                prod_names = []
                for prod in cluster.qualifiers['product']:
                    prod_name = list(set(prod.split('-')))
                    if (len(prod_name) > 1) and ("plant" in prod_name):
                        prod_name.remove("plant")
                    elif prod_name == ["plant"]:
                        prod_name = ["putative"]
                    prod_names.append("-".join(prod_name))
                cluster.qualifiers['product'] = prod_names
            # apply the same plant-type rewrite to CDS sec_met qualifiers
            for cds in utils.get_cds_features(seq_record):
                if 'sec_met' in cds.qualifiers:
                    temp_qual = []
                    for row in cds.qualifiers['sec_met']:
                        if row.startswith('Type: '):
                            clustertypes = list(
                                set(row.split('Type: ')[-1].split('-')))
                            if (len(clustertypes) > 1) and ("plant"
                                                            in clustertypes):
                                clustertypes.remove("plant")
                            elif clustertypes == ["plant"]:
                                clustertypes = ["putative"]
                            temp_qual.append('Type: ' +
                                             "-".join(clustertypes))
                        else:
                            temp_qual.append(row)
                    cds.qualifiers['sec_met'] = temp_qual
            # find largest cdhit number & largest domain diversity in a cluster
            res_object[fpath][seq_record.id]["average_cdhit"] = 0
            res_object[fpath][seq_record.id]["average_domain_variations"] = 0
            cdhit_numbers = []
            domain_numbers = []
            for cluster in utils.get_cluster_features(seq_record):
                cluster_type = utils.get_cluster_type(cluster)
                if cluster_type not in res_object[fpath][
                        seq_record.id]["cluster_types"]:
                    res_object[fpath][
                        seq_record.id]["cluster_types"][cluster_type] = 0
                res_object[fpath][
                    seq_record.id]["cluster_types"][cluster_type] += 1
                num_cdhit = len(
                    utils.get_cluster_cdhit_table(cluster, seq_record))
                num_domain = len(utils.get_cluster_domains(
                    cluster, seq_record))
                cdhit_numbers.append(num_cdhit)
                domain_numbers.append(num_domain)
                if num_cdhit > res_object[fpath][
                        seq_record.id]["largest_cdhit"]:
                    res_object[fpath][
                        seq_record.id]["largest_cdhit"] = num_cdhit
                if num_domain > res_object[fpath][
                        seq_record.id]["largest_domain_variations"]:
                    res_object[fpath][seq_record.id][
                        "largest_domain_variations"] = num_domain
            # "average" keys actually store the median — kept as named
            if len(cdhit_numbers) > 0:
                res_object[fpath][seq_record.id][
                    "average_cdhit"] = numpy.median(cdhit_numbers)
            if len(domain_numbers) > 0:
                res_object[fpath][seq_record.id][
                    "average_domain_variations"] = numpy.median(domain_numbers)
    # dump the accumulated statistics as a JS assignment
    with open('result.js', 'w') as h:
        h.write('var result = %s;' % json.dumps(res_object, indent=4))
def test_get_cluster_type(self):
    "Test utils.get_cluster_type()"
    feature = FakeFeature('cluster', FeatureLocation(23, 42),
                          {'product': ['fake']})
    self.assertEqual('fake', utils.get_cluster_type(feature))