Example #1
def write(seq_records, options):
    """Write all cluster proteins to a file

    Args:
        seq_records (iterable): An iterable containing Bio.SeqRecords
        options (argparse.Namespace): The options passed to the program
    """
    basename = seq_records[0].id
    output_name = path.join(options.outputfoldername,
                            "%s_genecluster_proteins.fa" % basename)
    logging.debug("Writing seq_records to %r" % output_name)

    with open(output_name, 'w+') as handle:
        for seq_record in seq_records:
            clusters = utils.get_cluster_features(seq_record)
            for cluster in clusters:
                clustertype = utils.get_cluster_type(cluster)
                clusternr = utils.get_cluster_number(cluster)
                for feature in utils.get_cluster_cds_features(
                        cluster, seq_record):
                    qual = feature.qualifiers
                    fasta_header = '>%s:%s %s #%s - %s\n' % (
                        qual['locus_tag'][0], qual['protein_id'][0],
                        clustertype, clusternr, qual['product'][0])
                    handle.write(fasta_header)
                    handle.write(
                        '%s\n' %
                        '\n'.join(textwrap.wrap(qual['translation'][0], 60)))
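A minimal way to drive the function above (hypothetical sketch; the input file name, the outputfoldername value and the record parsing are illustrative assumptions, and the snippet still relies on antiSMASH's own utils, path, logging and textwrap imports):

# Hypothetical usage sketch -- not part of the original module.
from argparse import Namespace

from Bio import SeqIO

options = Namespace(outputfoldername="output")                       # assumed options object
seq_records = list(SeqIO.parse("annotated_genome.gbk", "genbank"))   # assumed annotated input
write(seq_records, options)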
Example #2
File: js.py  Project: chevrm/transPACT
def convert_clusters(record, annotations, options):
    """Convert cluster SeqFeatures to JSON"""
    js_clusters = []
    for cluster in utils.get_cluster_features(record):
        features = utils.get_cluster_cds_features(cluster, record)
        borders = utils.get_cluster_cluster_border_features(cluster, record)

        tta_codons = []
        all_misc_features = utils.get_all_features_of_type(
            record, 'misc_feature')
        for feature in all_misc_features:
            if not utils.features_overlap(cluster, feature):
                continue
            if 'note' not in feature.qualifiers:
                continue

            for note in feature.qualifiers['note']:
                if note.startswith('tta leucine codon'):
                    tta_codons.append(feature)
                    break

        js_cluster = {}
        js_cluster['start'] = int(cluster.location.start) + 1
        js_cluster['end'] = int(cluster.location.end)
        js_cluster['idx'] = utils.get_cluster_number(cluster)
        js_cluster['orfs'] = convert_cds_features(record, features,
                                                  annotations, options)
        js_cluster['borders'] = convert_cluster_border_features(borders)
        js_cluster['tta_codons'] = convert_tta_codons(tta_codons)
        js_cluster['type'] = utils.get_cluster_type(cluster)
        if 'probability' in cluster.qualifiers:
            js_cluster['probability'] = cluster.qualifiers['probability'][0]
        if options.input_type == 'prot':
            js_cluster['unordered'] = True
        js_cluster['knowncluster'] = "-"
        js_cluster['BGCid'] = "-"

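        # Each 'knownclusterblast' qualifier entry has the form
        # "<rank>. <accession>\t<description>"; only the rank-1 (best) hit is parsed below.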
        if 'knownclusterblast' in cluster.qualifiers:
            knownclusters = cluster.qualifiers['knownclusterblast']
            bestcluster = [
                kcluster for kcluster in knownclusters
                if kcluster.startswith('1.')
            ]
            if len(bestcluster) != 1:
                logging.warning(
                    "Error parsing best knowncluster hit; knownclusters array = %s. Possibly no significant hits to known biosynthetic gene clusters."
                    % str(knownclusters))
            else:
                reObj = re.match(r'\d+\. (\S+)\t(.*)', bestcluster[0])
                js_cluster['knowncluster'] = reObj.group(2)
                js_cluster['BGCid'] = reObj.group(1)
                logging.debug('Found closest cluster "%s" for cluster no. %s' %
                              (js_cluster['knowncluster'],
                               utils.get_cluster_number(cluster)))
        js_clusters.append(js_cluster)

    return js_clusters
Example #3
def retrieve_gene_cluster_annotations(seq_record, smcogdict, gtrcoglist,
                                      transportercoglist, geneclusternr):
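    """Collect genes, cluster type, product annotations, coordinates, strands, colors,
    PKS/NRPS core genes and smCOG-based gtr/transporter assignments for cluster
    *geneclusternr* of *seq_record*, plus the overall cluster size."""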
    allcoregenes = [
        utils.get_gene_id(cds)
        for cds in utils.get_secmet_cds_features(seq_record)
    ]
    pksnrpscoregenes = [
        utils.get_gene_id(cds)
        for cds in utils.get_pksnrps_cds_features(seq_record)
    ]
    feature_by_id = utils.get_feature_dict(seq_record)
    clustergenes = [
        utils.get_gene_id(cds) for cds in utils.get_cluster_cds_features(
            utils.get_cluster_by_nr(seq_record, geneclusternr), seq_record)
    ]
    clustertype = utils.get_cluster_type(
        utils.get_cluster_by_nr(seq_record, geneclusternr))
    annotations = {}
    colors = []
    starts = []
    ends = []
    strands = []
    pksnrpsprots = []
    gtrs = []
    transporters = []
    for j in clustergenes:
        cdsfeature = feature_by_id[j]
        if 'product' in cdsfeature.qualifiers:
            annotations[j] = cdsfeature.qualifiers['product'][0]
        else:
            annotations[j] = 'Unannotated gene'
        starts.append(cdsfeature.location.start)
        ends.append(cdsfeature.location.end)
        if cdsfeature.strand == -1:
            strands.append("-")
        else:
            strands.append("+")
        if j in allcoregenes:
            colors.append("#810E15")
        else:
            colors.append("grey")
        if j in pksnrpscoregenes:
            pksnrpsprots.append(j)
        if j in smcogdict:
            if len(smcogdict[j]) > 0 and smcogdict[j][0] in gtrcoglist:
                gtrs.append(j)
            if len(smcogdict[j]) > 0 and smcogdict[j][0] in transportercoglist:
                transporters.append(j)
    clustersize = max(ends) - min(starts)
    return clustergenes, clustertype, annotations, colors, starts, ends, strands, pksnrpsprots, gtrs, transporters, clustersize
Example #4
def write(seq_records, options):
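    """Export antiSMASH results as TXT tables, one set of table files per sequence record."""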
    logging.debug("Exporting antiSMASH information as txt tables")
    #Don't store TXT tables for protein input
    if options.input_type == 'prot':
        return
    #Localize output folder, create TXT subdirectory
    txt_outfolder = options.full_outputfolder_path + os.sep + "txt"
    if not os.path.exists(txt_outfolder):
        os.mkdir(txt_outfolder)
    #Define table names
    tables = "genome", "BGC", "signature_gene_info", "gene", "NRPS_PKS", "smCOG", "RiPP", "transltable"
    #For each gene cluster, write out info to TXT files
    for seq_record in seq_records:
        if len(utils.get_cluster_features(seq_record)) > 0:
            #Open up TXT files
            txt_files = {}
            for table in tables:
                txt_files[table] = open(
                    path.join(
                        txt_outfolder, "%s_%s.txt" %
                        (seq_record.id.partition(".")[0], table)), "w")
            #Gather all information
            info = utils.Storage()
            info.clustertypes, info.clustergenes, info.accessions, info.cdsmotifs, info.clusternrs = {}, {}, {}, {}, []
            clusters = utils.get_cluster_features(seq_record)
            for cluster in clusters:
                clusternr = utils.get_cluster_number(cluster)
                info.clusternrs.append(clusternr)
                info.clustertypes[clusternr] = utils.get_cluster_type(cluster)
                info.clustergenes[clusternr] = [
                    utils.get_gene_id(cds)
                    for cds in utils.get_cluster_cds_features(
                        cluster, seq_record)
                ]
                info.accessions[clusternr] = [
                    utils.get_gene_acc(cds)
                    for cds in utils.get_cluster_cds_features(
                        cluster, seq_record)
                ]
                info.cdsmotifs[clusternr] = utils.get_all_features_of_type(
                    seq_record, ["CDS_motif"])
            info.seq_record = seq_record
            #Write information to tables
            for table in tables:
                getattr(write_tables, 'write_' + table)(txt_files[table], info,
                                                        options)
            for table in tables:
                txt_files[table].close()
Example #5
def convert_clusters(record, annotations, options):
    """Convert cluster SeqFeatures to JSON"""
    js_clusters = []
    for cluster in utils.get_cluster_features(record):
        features = utils.get_cluster_cds_features(cluster, record)

        js_cluster = {}
        js_cluster['start'] = int(cluster.location.start) + 1
        js_cluster['end'] = int(cluster.location.end)
        js_cluster['idx'] = utils.get_cluster_number(cluster)
        js_cluster['orfs'] = convert_cds_features(record, features,
                                                  annotations, options)
        js_cluster['type'] = utils.get_cluster_type(cluster)
        if options.coexpress:
            js_cluster["geo"] = utils.get_geotable_json(features)
        if 'probability' in cluster.qualifiers:
            js_cluster['probability'] = cluster.qualifiers['probability'][0]
        if options.input_type == 'prot':
            js_cluster['unordered'] = True
        js_cluster['knowncluster'] = "-"
        js_cluster['BGCid'] = "-"
        js_cluster['domains'] = utils.get_cluster_domains(cluster, record)

        if options.enable_cdhit:
            js_cluster['cdhitclusters'] = utils.get_cluster_cdhit_table(
                cluster, record)

        if 'knownclusterblast' in cluster.qualifiers:
            knownclusters = cluster.qualifiers['knownclusterblast']
            bestcluster = [
                kcluster for kcluster in knownclusters
                if kcluster.startswith('1.')
            ]
            if len(bestcluster) != 1:
                logging.warning(
                    "Error parsing best knowncluster hit; knownclusters array = %s. Possibly no significant hits to known biosynthetic gene clusters."
                    % str(knownclusters))
            else:
                reObj = re.match(r'\d+\. (\S+)\t(.*)', bestcluster[0])
                js_cluster['knowncluster'] = reObj.group(2)
                js_cluster['BGCid'] = reObj.group(1)
                logging.debug('Found closest cluster "%s" for cluster no. %s' %
                              (js_cluster['knowncluster'],
                               utils.get_cluster_number(cluster)))
        js_clusters.append(js_cluster)

    return js_clusters
Example #6
def store_detection_details(results_by_id, rulesdict, seq_record):
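    """Append the detection rule(s) used for each of the cluster's types to the cluster's 'note' qualifier."""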
    clusters = utils.get_cluster_features(seq_record)
    for cluster in clusters:
        type_combo = utils.get_cluster_type(cluster)
        if '-' in type_combo:
            clustertypes = type_combo.split('-')
        else:
            clustertypes = [type_combo]

        if 'note' not in cluster.qualifiers:
            cluster.qualifiers['note'] = []
        rule_string = "Detection rule(s) for this cluster type:"
        for clustertype in clustertypes:
            rule_string += " %s: (%s);" % (clustertype,
                                           rulesdict[clustertype][0])

        cluster.qualifiers['note'].append(rule_string)
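For a hypothetical hybrid cluster the appended note reads as follows (the cluster type and rule bodies are placeholders, not the real antiSMASH detection rules):

# Illustration only: assuming utils.get_cluster_type(cluster) == "t1pks-nrps" and
# rulesdict == {"t1pks": ("PKS_KS & PKS_AT", ...), "nrps": ("Condensation & AMP-binding", ...)},
# the note appended above becomes:
#   Detection rule(s) for this cluster type: t1pks: (PKS_KS & PKS_AT); nrps: (Condensation & AMP-binding);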
Example #7
def generate_chemical_structure_preds(pksnrpsvars, seq_record, options):
    #Create directory to store structures
    options.structuresfolder = path.abspath(path.join(options.outputfoldername, "structures"))
    if not os.path.exists(options.structuresfolder):
        os.mkdir(options.structuresfolder)

    #Combine predictions into a prediction of the final chemical structure and generate images
    geneclusters = utils.get_cluster_features(seq_record)

    for genecluster in geneclusters:
        geneclusternr = utils.get_cluster_number(genecluster)
        smiles_string = ""
        if geneclusternr in pksnrpsvars.compound_pred_dict:

            #print "output_modules/html/pksnrpsvars.compound_pred_dict:"
            #print pksnrpsvars.compound_pred_dict

            residues = pksnrpsvars.compound_pred_dict[geneclusternr].replace("(","").replace(")","").replace(" + "," ").replace("-"," ")


            #Now generates SMILES of predicted secondary metabolites without NP.searcher
            residuesList = residues.split(" ")

            #Counts the number of malonate and its derivatives in polyketides
            mal_count = 0
            for i in residuesList:
                if "mal" in i:
                    mal_count += 1

            nrresidues = len(residuesList)

            #Reflecting reduction states of ketide groups starting at beta carbon of type 1 polyketide
            if "pk" in residuesList and "mal" in residuesList[-1]:
                residuesList.pop(residuesList.index('pk')+1)
                residuesList.append('pks-end1')
            elif mal_count == len(residuesList):
                if residuesList[0] == "mal":
                    residuesList[0] = "pks-start1"
                if residuesList[-1] == "ccmal":
                    residuesList.append('pks-end2')

            if nrresidues > 1:
                #Conventionally used aaSMILES was used;
                #chirality expressed with "@@" causes indigo error
                smiles_monomer = open(os.path.dirname(os.path.realpath(__file__)) + os.sep + 'aaSMILES.txt','r')
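                # The first readline() result is discarded to skip the first line of
                # aaSMILES.txt; the remaining lines are whitespace-separated
                # "monomer SMILES" pairs collected into aa_smiles_dict.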
                smiles = smiles_monomer.readline()
                smiles = smiles_monomer.readline()

                aa_smiles_dict = {}
                while smiles:
                    smiles = smiles.split()
                    if len(smiles) > 1:
                        smiles[0] = smiles[0].strip()
                        smiles[1] = smiles[1].strip()
                        aa_smiles_dict[smiles[0]] = smiles[1]
                    smiles = smiles_monomer.readline()
                smiles_monomer.close()

                for monomer in residuesList:
                    if monomer in aa_smiles_dict.keys():
                        smiles_string += aa_smiles_dict[monomer]
                logging.debug("Cluster %s: smiles_string: %s", geneclusternr, smiles_string)
                with TemporaryDirectory(change=True):
                    smilesfile = open("genecluster" + str(geneclusternr) + ".smi", "w")
                    smilesfile.write(smiles_string)
                    smilesfile.close()
                    depictstatus = depict_smile(geneclusternr, options.structuresfolder)
                if depictstatus == "failed":
                    pksnrpsvars.failedstructures.append(geneclusternr)
        elif utils.get_cluster_type(genecluster) == "ectoine":
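            # No monomer prediction exists for ectoine clusters; use the known
            # SMILES string for ectoine directly.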
            smiles_string = "CC1=NCCC(N1)C(=O)O"
            with TemporaryDirectory(change=True):
                smilesfile = open("genecluster" + str(geneclusternr) + ".smi", "w")
                smilesfile.write(smiles_string)
                smilesfile.close()
                depictstatus = depict_smile(geneclusternr, options.structuresfolder)
            if depictstatus == "failed":
                pksnrpsvars.failedstructures.append(geneclusternr)
            elif geneclusternr in pksnrpsvars.failedstructures:
                pksnrpsvars.failedstructures.remove(geneclusternr)
            pksnrpsvars.compound_pred_dict[geneclusternr] = "ectoine"
        _update_sec_met_entry(genecluster, smiles_string)
Example #8
def write(seq_records, options):
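    """Write a per-cluster overview table to geneclusters.txt and an Excel workbook."""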
    if options.input_type == 'prot':
        return
    #Open up TXT file and XLS record
    outfolder = options.full_outputfolder_path
    txtfile = open(path.join(outfolder, "geneclusters.txt"), "w")
    wb = Workbook()
    font1 = Font()
    style1 = XFStyle()
    style1.font = font1
    font1.bold = True
    ws0 = wb.add_sheet('0')
    ws0.write(0, 0, "Input accession number", style1)
    ws0.write(0, 1, "Input name", style1)
    ws0.write(0, 2, "Gene cluster type", style1)
    ws0.write(0, 3, "Gene cluster genes", style1)
    ws0.write(0, 4, "Gene cluster gene accessions", style1)
    if options.knownclusterblast:
        ws0.write(0, 5, "Compound with gene cluster of highest homology",
                  style1)
    #For each gene cluster, write out info
    column = 1
    for seq_record in seq_records:
        clusters = utils.get_cluster_features(seq_record)
        for cluster in clusters:
            clustertype = utils.get_cluster_type(cluster)
            clusternr = utils.get_cluster_number(cluster)
            clustergenes = [
                utils.get_gene_id(cds)
                for cds in utils.get_cluster_cds_features(cluster, seq_record)
            ]
            accessions = [
                utils.get_gene_acc(cds)
                for cds in utils.get_cluster_cds_features(cluster, seq_record)
            ]
            ws0.write(column, 0, seq_record.id)
            try:
                ws0.write(column, 1, seq_record.description)
            except Exception:
                ws0.write(
                    column, 1,
                    "Name too long to be contained in Excel cell; see txt file in downloadable zip archive."
                )
            ws0.write(column, 2, clustertype)
            try:
                ws0.write(column, 3, ";".join(clustergenes))
            except Exception:
                ws0.write(
                    column, 3,
                    "Too many genes to be contained in Excel cell; see txt file in downloadable zip archive."
                )
            try:
                ws0.write(column, 4, ";".join(accessions))
            except Exception:
                ws0.write(
                    column, 4,
                    "Too many genes to be contained in Excel cell; see txt file in downloadable zip archive."
                )
            if hasattr(seq_record, 'closestcompounddict') and \
               clusternr in seq_record.closestcompounddict:
                ws0.write(column, 5, seq_record.closestcompounddict[clusternr])
            column += 1
            txtfile.write("\t".join([
                seq_record.id, seq_record.description, clustertype, ";".join(
                    clustergenes), ";".join(accessions)
            ]) + "\n")
    wb.save(path.join(outfolder, "%s.geneclusters.xls" % seq_record.id))
Example #9
def main():
    multiprocessing.freeze_support()
    res_object = {}

    # get genome files
    files = []
    for line in open(sys.argv[1], 'r'):
        files.append(path.expanduser(line.replace("\n", "")))

    # mockup antismash run per files
    i = 1
    for fpath in files:
        res_object[fpath] = {}
        print "Processing %s... (%d/%d)" % (fpath, i, len(files))
        i += 1
        options = get_mockup_config()
        options.sequences = [fpath]
        config.set_config(options)
        run_antismash.setup_logging(options)  # TODO: get antiSMASH logging to work!

        # load plugins
        plugins = run_antismash.load_detection_plugins()
        run_antismash.filter_plugins(plugins, options,
                                     options.enabled_cluster_types)

        # parse to seq_records
        seq_records = run_antismash.parse_input_sequences(options)
        options.next_clusternr = 1

        for seq_record in seq_records:
            if options.input_type == 'nucl':
                seq_records = [
                    record for record in seq_records if len(record.seq) > 1000
                ]
                if len(seq_records) < 1:
                    continue
            utils.sort_features(seq_record)
            run_antismash.strip_record(seq_record)
            utils.fix_record_name_id(seq_record, options)

            # fetch results_by_id
            feature_by_id = utils.get_feature_dict(seq_record)
            results = []
            results_by_id = {}
            for feature in utils.get_cds_features(seq_record):
                prefix = "%s:" % seq_record.id.replace(":", "_")
                gene_id = utils.get_gene_id(feature)
                if (prefix + gene_id) in options.hmm_results:
                    results_by_id[gene_id] = options.hmm_results[prefix +
                                                                 gene_id]
                    for res in results_by_id[gene_id]:
                        results.append(res)

            # ignore short aa's
            min_length_aa = 100
            short_cds_buffer = []
            for f in list(seq_record.features):  # temporarily remove short aa; iterate over a copy
                if (f.type == "CDS"
                        and len(f.qualifiers['translation'][0]) < min_length_aa
                        and utils.get_gene_id(f) not in results_by_id):
                    short_cds_buffer.append(f)
                    seq_record.features.remove(f)

            overlaps = utils.get_overlaps_table(seq_record)
            rulesdict = hmm_detection.create_rules_dict(
                options.enabled_cluster_types)
            # find total cdhit numbers in the chromosome
            total_cdhit = len(
                utils.get_cdhit_table(utils.get_cds_features(seq_record))[0])
            res_object[fpath][seq_record.id] = {
                "total_clusters": 0,
                "total_genes": len(overlaps[0]),
                "total_cdhit": total_cdhit,
                "genes_with_hits": 0,
                "largest_cdhit": 0,
                "largest_domain_variations": 0,
                "per_hits": {},
                "cluster_types": {}
            }

            # filter overlap hits
            results, results_by_id = hmm_detection.filter_results(
                results, results_by_id, overlaps, feature_by_id)

            # count hits
            for gene_id in results_by_id:
                res_gene = results_by_id[gene_id]
                if len(res_gene) > 0:
                    res_object[fpath][seq_record.id]["genes_with_hits"] += 1
                for hsp in res_gene:
                    domain_name = hsp.query_id.replace("plants/", "")
                    if domain_name not in res_object[fpath][
                            seq_record.id]["per_hits"]:
                        res_object[fpath][
                            seq_record.id]["per_hits"][domain_name] = 0
                    res_object[fpath][
                        seq_record.id]["per_hits"][domain_name] += 1

            # do cluster finding algorithm
            typedict = hmm_detection.apply_cluster_rules(
                results_by_id, feature_by_id, options.enabled_cluster_types,
                rulesdict, overlaps)
            hmm_detection.fix_hybrid_clusters_typedict(typedict)
            nseqdict = hmm_detection.get_nseq()
            for cds in results_by_id.keys():
                feature = feature_by_id[cds]
                if typedict[cds] != "none":
                    hmm_detection._update_sec_met_entry(
                        feature, results_by_id[cds], typedict[cds], nseqdict)
            hmm_detection.find_clusters(seq_record, rulesdict, overlaps)
            seq_record.features.extend(short_cds_buffer)
            res_object[fpath][seq_record.id]["total_clusters"] += len(
                utils.get_cluster_features(seq_record))

            # do cluster specific and unspecific analysis
            if len(utils.get_cluster_features(seq_record)) > 0:
                run_antismash.cluster_specific_analysis(
                    plugins, seq_record, options)
            run_antismash.unspecific_analysis(seq_record, options)

            #Rearrange hybrid clusters name alphabetically
            hmm_detection.fix_hybrid_clusters(seq_record)

            #before writing to output, remove all hmm_detection's subdir prefixes from clustertype
            for cluster in utils.get_cluster_features(seq_record):
                prod_names = []
                for prod in cluster.qualifiers['product']:
                    prod_name = []
                    for name in prod.split('-'):
                        prod_name.append(name.split('/')[-1])
                    prod_names.append("-".join(prod_name))
                cluster.qualifiers['product'] = prod_names
            for cds in utils.get_cds_features(seq_record):
                if 'sec_met' in cds.qualifiers:
                    temp_qual = []
                    for row in cds.qualifiers['sec_met']:
                        if row.startswith('Type: '):
                            clustertypes = [
                                (ct.split('/')[-1])
                                for ct in row.split('Type: ')[-1].split('-')
                            ]
                            temp_qual.append('Type: ' + "-".join(clustertypes))
                        elif row.startswith('Domains detected: '):
                            cluster_results = []
                            for cluster_result in row.split(
                                    'Domains detected: ')[-1].split(';'):
                                cluster_results.append(
                                    cluster_result.split(' (E-value')[0].split(
                                        '/')[-1] + ' (E-value' +
                                    cluster_result.split(' (E-value')[-1])
                            temp_qual.append('Domains detected: ' +
                                             ";".join(cluster_results))
                        else:
                            temp_qual.append(row)
                    cds.qualifiers['sec_met'] = temp_qual

            #on plants, remove plant clustertype from hybrid types, and replace single
            #plant clustertype with "putative"
            for cluster in utils.get_cluster_features(seq_record):
                prod_names = []
                for prod in cluster.qualifiers['product']:
                    prod_name = list(set(prod.split('-')))
                    if (len(prod_name) > 1) and ("plant" in prod_name):
                        prod_name.remove("plant")
                    elif prod_name == ["plant"]:
                        prod_name = ["putative"]
                    prod_names.append("-".join(prod_name))
                cluster.qualifiers['product'] = prod_names
            for cds in utils.get_cds_features(seq_record):
                if 'sec_met' in cds.qualifiers:
                    temp_qual = []
                    for row in cds.qualifiers['sec_met']:
                        if row.startswith('Type: '):
                            clustertypes = list(
                                set(row.split('Type: ')[-1].split('-')))
                            if (len(clustertypes) > 1) and ("plant"
                                                            in clustertypes):
                                clustertypes.remove("plant")
                            elif clustertypes == ["plant"]:
                                clustertypes = ["putative"]
                            temp_qual.append('Type: ' + "-".join(clustertypes))
                        else:
                            temp_qual.append(row)
                    cds.qualifiers['sec_met'] = temp_qual

            # find largest cdhit number & largest domain diversity in a cluster
            res_object[fpath][seq_record.id]["average_cdhit"] = 0
            res_object[fpath][seq_record.id]["average_domain_variations"] = 0
            cdhit_numbers = []
            domain_numbers = []
            for cluster in utils.get_cluster_features(seq_record):
                cluster_type = utils.get_cluster_type(cluster)
                if cluster_type not in res_object[fpath][
                        seq_record.id]["cluster_types"]:
                    res_object[fpath][
                        seq_record.id]["cluster_types"][cluster_type] = 0
                res_object[fpath][
                    seq_record.id]["cluster_types"][cluster_type] += 1
                num_cdhit = len(
                    utils.get_cluster_cdhit_table(cluster, seq_record))
                num_domain = len(utils.get_cluster_domains(
                    cluster, seq_record))
                cdhit_numbers.append(num_cdhit)
                domain_numbers.append(num_domain)
                if num_cdhit > res_object[fpath][
                        seq_record.id]["largest_cdhit"]:
                    res_object[fpath][
                        seq_record.id]["largest_cdhit"] = num_cdhit
                if num_domain > res_object[fpath][
                        seq_record.id]["largest_domain_variations"]:
                    res_object[fpath][seq_record.id][
                        "largest_domain_variations"] = num_domain
            if len(cdhit_numbers) > 0:
                res_object[fpath][seq_record.id][
                    "average_cdhit"] = numpy.median(cdhit_numbers)
            if len(domain_numbers) > 0:
                res_object[fpath][seq_record.id][
                    "average_domain_variations"] = numpy.median(domain_numbers)

        with open('result.js', 'w') as h:
            h.write('var result = %s;' % json.dumps(res_object, indent=4))
Example #10
    def test_get_cluster_type(self):
        "Test utils.get_cluster_type()"
        cluster = FakeFeature('cluster', FeatureLocation(23, 42),
                              {'product': ['fake']})
        self.assertEqual('fake', utils.get_cluster_type(cluster))
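Taken together, these examples indicate that utils.get_cluster_type() simply reports the cluster feature's 'product' qualifier as one string, with hybrid types already hyphen-joined (Example #6 splits the result on '-'). Below is a minimal sketch consistent with the test above, assuming that qualifier layout; the real antiSMASH implementation may differ.

# Hypothetical reimplementation, inferred from the usage examples above.
def get_cluster_type(cluster):
    """Return the product type of a 'cluster' feature, e.g. 'fake' or 't1pks-nrps'."""
    return cluster.qualifiers['product'][0]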