Exemplo n.º 1
0
 def test_get_overlaps_table(self):
     """Test utils.get_overlaps_table()

     Five CDS features are built; G2 (40-50) and G3 (45-70) are expected to
     share one overlap group while every other gene gets a group of its own.
     The expected value pairs the list of groups with a mapping from locus
     tag to the index of the group holding that gene.
     """
     spans = [(10, 40, "G1"), (40, 50, "G2"), (45, 70, "G3"),
              (75, 100, "G4"), (101, 110, "G5")]
     cds_list = [
         FakeFeature('CDS', FeatureLocation(start, end), {"locus_tag": [tag]})
         for start, end, tag in spans
     ]
     record = FakeRecord(cds_list)

     g1, g2, g3, g4, g5 = cds_list
     groups = [[g1], [g2, g3], [g4], [g5]]
     group_index = {'G5': 3, 'G4': 2, 'G3': 1, 'G2': 1, 'G1': 0}

     observed = utils.get_overlaps_table(record)
     self.assertEqual(observed, (groups, group_index), msg=observed)
Exemplo n.º 2
0
 def test_apply_cluster_rules(self):
     """Check hmm_detection.apply_cluster_rules() against expected types.

     The "-"-joined type string returned for every gene is split into a set
     so the comparison does not depend on the order of the type names.
     """
     cluster_types = list(set(self.rulesdict.keys()))
     overlaps = utils.get_overlaps_table(self.record)
     detected_types = hmm_detection.apply_cluster_rules(
         self.results_by_id, self.feature_by_id, cluster_types,
         self.rulesdict, overlaps)
     for gid, joined in list(detected_types.items()):
         detected_types[gid] = set(joined.split("-"))

     four_types = ["MetaboliteA", "MetaboliteB", "MetaboliteC", "MetaboliteD"]
     three_types = four_types[:3]
     expected_types = {}
     for gene in ("GENE_1", "GENE_2", "GENE_3"):
         expected_types[gene] = set(four_types)
     for gene in ("GENE_4", "GENE_5"):
         expected_types[gene] = set(three_types)

     failure_msg = "\nResult : %s\nExpected : %s" % (detected_types,
                                                     expected_types)
     self.assertEqual(detected_types, expected_types, msg=failure_msg)
Exemplo n.º 3
0
def _read_genome_paths(list_path):
    """Return the genome paths listed one per line in *list_path*.

    Each line has its newline removed and ``~`` expanded. Blank lines are
    skipped so they do not turn into empty paths. The file is closed before
    returning.
    """
    with open(list_path, 'r') as handle:
        return [
            path.expanduser(line.replace("\n", "")) for line in handle
            if line.strip()
        ]


def _detach_short_cds(seq_record, results_by_id, min_length_aa):
    """Remove short CDS features without hits from *seq_record*.

    A feature is detached when it is a CDS, its first ``translation``
    qualifier is shorter than *min_length_aa*, and its gene id has no entry
    in *results_by_id*. Returns the detached features so the caller can
    re-add them later.
    """
    kept = []
    detached = []
    # Partition into two new lists instead of calling features.remove()
    # while iterating: removing during iteration skips the element that
    # follows every removed one.
    for feature in seq_record.features:
        if (feature.type == "CDS"
                and len(feature.qualifiers['translation'][0]) < min_length_aa
                and utils.get_gene_id(feature) not in results_by_id):
            detached.append(feature)
        else:
            kept.append(feature)
    # Slice assignment keeps the same list object on the record.
    seq_record.features[:] = kept
    return detached


def _strip_type_prefix(joined_types):
    """Drop everything up to the last '/' in each '-'-separated name."""
    return "-".join(name.split('/')[-1] for name in joined_types.split('-'))


def _strip_subdir_prefixes(seq_record):
    """Remove 'subdir/' prefixes from cluster products and sec_met rows."""
    for cluster in utils.get_cluster_features(seq_record):
        cluster.qualifiers['product'] = [
            _strip_type_prefix(prod) for prod in cluster.qualifiers['product']
        ]
    for cds in utils.get_cds_features(seq_record):
        if 'sec_met' not in cds.qualifiers:
            continue
        cleaned = []
        for row in cds.qualifiers['sec_met']:
            if row.startswith('Type: '):
                cleaned.append('Type: ' +
                               _strip_type_prefix(row.split('Type: ')[-1]))
            elif row.startswith('Domains detected: '):
                domains = []
                for entry in row.split('Domains detected: ')[-1].split(';'):
                    # Only the domain name (before " (E-value") loses its
                    # prefix; the E-value part is re-attached unchanged.
                    parts = entry.split(' (E-value')
                    domains.append(parts[0].split('/')[-1] + ' (E-value' +
                                   parts[-1])
                cleaned.append('Domains detected: ' + ";".join(domains))
            else:
                cleaned.append(row)
        cds.qualifiers['sec_met'] = cleaned


def _drop_plant_type(joined_types):
    """Return *joined_types* without "plant", or "putative" if it was alone.

    Duplicate names are collapsed. The names are sorted so the result is
    deterministic rather than following set iteration order.
    """
    names = sorted(set(joined_types.split('-')))
    if len(names) > 1 and "plant" in names:
        names.remove("plant")
    elif names == ["plant"]:
        names = ["putative"]
    return "-".join(names)


def _relabel_plant_types(seq_record):
    """Apply _drop_plant_type to cluster products and sec_met 'Type: ' rows."""
    for cluster in utils.get_cluster_features(seq_record):
        cluster.qualifiers['product'] = [
            _drop_plant_type(prod) for prod in cluster.qualifiers['product']
        ]
    for cds in utils.get_cds_features(seq_record):
        if 'sec_met' not in cds.qualifiers:
            continue
        relabelled = []
        for row in cds.qualifiers['sec_met']:
            if row.startswith('Type: '):
                relabelled.append('Type: ' +
                                  _drop_plant_type(row.split('Type: ')[-1]))
            else:
                relabelled.append(row)
        cds.qualifiers['sec_met'] = relabelled


def main():
    """Collect per-record cluster statistics for a list of genome files.

    ``sys.argv[1]`` names a text file holding one genome path per line. For
    every genome the mock-up configuration is installed, the input is parsed
    into sequence records, and for each record the HMM hits found in
    ``options.hmm_results`` are filtered, cluster rules are applied and
    clusters are located. Counts (genes, hits per domain, clusters, cluster
    types, CD-HIT / domain numbers per cluster) are gathered in a nested
    dict keyed by genome path and record id.

    After each genome the whole dict is written to ``result.js`` in the
    current directory as ``var result = <json>;``.
    """
    multiprocessing.freeze_support()
    res_object = {}

    # get genome files
    files = _read_genome_paths(sys.argv[1])

    # mockup antismash run per files
    for file_number, fpath in enumerate(files, 1):
        res_object[fpath] = {}
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Processing %s... (%d/%d)" % (fpath, file_number, len(files)))
        options = get_mockup_config()
        options.sequences = [fpath]
        config.set_config(options)
        # TODO: get antismash logging to work
        run_antismash.setup_logging(options)

        # load plugins
        plugins = run_antismash.load_detection_plugins()
        run_antismash.filter_plugins(plugins, options,
                                     options.enabled_cluster_types)

        # parse to seq_records
        seq_records = run_antismash.parse_input_sequences(options)
        options.next_clusternr = 1

        # Drop nucleotide records of 1000 nt or less *before* looping.
        # Rebinding seq_records inside the loop never affected the list
        # already being iterated, so short records were still processed.
        if options.input_type == 'nucl':
            seq_records = [
                record for record in seq_records if len(record.seq) > 1000
            ]

        for seq_record in seq_records:
            utils.sort_features(seq_record)
            run_antismash.strip_record(seq_record)
            utils.fix_record_name_id(seq_record, options)

            # fetch results_by_id
            feature_by_id = utils.get_feature_dict(seq_record)
            results = []
            results_by_id = {}
            prefix = "%s:" % seq_record.id.replace(":", "_")
            for feature in utils.get_cds_features(seq_record):
                gene_id = utils.get_gene_id(feature)
                hmm_key = prefix + gene_id
                if hmm_key in options.hmm_results:
                    results_by_id[gene_id] = options.hmm_results[hmm_key]
                    results.extend(results_by_id[gene_id])

            # ignore short aa's (temporarily removed, re-added further down)
            min_length_aa = 100
            short_cds_buffer = _detach_short_cds(seq_record, results_by_id,
                                                 min_length_aa)

            overlaps = utils.get_overlaps_table(seq_record)
            rulesdict = hmm_detection.create_rules_dict(
                options.enabled_cluster_types)
            # find total cdhit numbers in the chromosome
            total_cdhit = len(
                utils.get_cdhit_table(utils.get_cds_features(seq_record))[0])
            stats = {
                "total_clusters": 0,
                "total_genes": len(overlaps[0]),
                "total_cdhit": total_cdhit,
                "genes_with_hits": 0,
                "largest_cdhit": 0,
                "largest_domain_variations": 0,
                "per_hits": {},
                "cluster_types": {}
            }
            res_object[fpath][seq_record.id] = stats

            # filter overlap hits
            results, results_by_id = hmm_detection.filter_results(
                results, results_by_id, overlaps, feature_by_id)

            # count hits
            per_hits = stats["per_hits"]
            for gene_id in results_by_id:
                gene_hits = results_by_id[gene_id]
                if gene_hits:
                    stats["genes_with_hits"] += 1
                for hsp in gene_hits:
                    domain_name = hsp.query_id.replace("plants/", "")
                    per_hits[domain_name] = per_hits.get(domain_name, 0) + 1

            # do cluster finding algorithm
            typedict = hmm_detection.apply_cluster_rules(
                results_by_id, feature_by_id, options.enabled_cluster_types,
                rulesdict, overlaps)
            hmm_detection.fix_hybrid_clusters_typedict(typedict)
            nseqdict = hmm_detection.get_nseq()
            for cds in list(results_by_id):
                feature = feature_by_id[cds]
                if typedict[cds] != "none":
                    hmm_detection._update_sec_met_entry(
                        feature, results_by_id[cds], typedict[cds], nseqdict)
            hmm_detection.find_clusters(seq_record, rulesdict, overlaps)
            seq_record.features.extend(short_cds_buffer)
            stats["total_clusters"] += len(
                utils.get_cluster_features(seq_record))

            # do cluster specific and unspecific analysis
            if len(utils.get_cluster_features(seq_record)) > 0:
                run_antismash.cluster_specific_analysis(
                    plugins, seq_record, options)
            run_antismash.unspecific_analysis(seq_record, options)

            # Rearrange hybrid clusters name alphabetically
            hmm_detection.fix_hybrid_clusters(seq_record)

            # before writing to output, remove all hmm_detection's subdir
            # prefixes from clustertype
            _strip_subdir_prefixes(seq_record)

            # on plants, remove plant clustertype from hybrid types, and
            # replace single plant clustertype with "putative"
            _relabel_plant_types(seq_record)

            # find largest cdhit number & largest domain diversity in a
            # cluster. NOTE: the "average_*" entries hold the median.
            stats["average_cdhit"] = 0
            stats["average_domain_variations"] = 0
            cdhit_numbers = []
            domain_numbers = []
            cluster_types = stats["cluster_types"]
            for cluster in utils.get_cluster_features(seq_record):
                cluster_type = utils.get_cluster_type(cluster)
                cluster_types[cluster_type] = cluster_types.get(
                    cluster_type, 0) + 1
                cdhit_numbers.append(
                    len(utils.get_cluster_cdhit_table(cluster, seq_record)))
                domain_numbers.append(
                    len(utils.get_cluster_domains(cluster, seq_record)))
            if cdhit_numbers:
                # Lengths are never negative, so max() matches the original
                # "keep the larger value, starting from 0" update.
                stats["largest_cdhit"] = max(cdhit_numbers)
                stats["average_cdhit"] = numpy.median(cdhit_numbers)
            if domain_numbers:
                stats["largest_domain_variations"] = max(domain_numbers)
                stats["average_domain_variations"] = numpy.median(
                    domain_numbers)

        # Rewritten after every genome, so partial results survive a crash.
        with open('result.js', 'w') as h:
            h.write('var result = %s;' % json.dumps(res_object, indent=4))
Exemplo n.º 4
0
def detect_signature_genes(seq_record, enabled_clustertypes, options):
    """Annotate *seq_record* with gene clusters derived from HMM hits.

    Hits are read from ``options.hmm_results`` under the key
    ``"<record id with ':' replaced by '_'>:<gene id>"``. They are filtered,
    written back to ``options.hmm_results`` (entries for genes that lost all
    their hits are deleted), and passed to the cluster rules. The resulting
    annotations are stored on the record's features.

    Arguments:
        seq_record: record to annotate; its feature list is modified.
        enabled_clustertypes: cluster types passed to create_rules_dict()
            and apply_cluster_rules().
        options: configuration object; reads ``hmm_results``,
            ``ignore_short_aa``, ``eukaryotic``, ``all_orfs`` and
            ``enable_cdhit``, and updates ``hmm_results``.

    Returns None.
    """
    logging.info('Detecting gene clusters using HMM library')
    feature_by_id = utils.get_feature_dict(seq_record)
    rulesdict = create_rules_dict(enabled_clustertypes)
    results = []
    # NOTE(review): sig_by_name is never read in this function. It is kept
    # because get_sig_profiles() is opaque from here - confirm it has no
    # needed side effect before removing.
    sig_by_name = {}
    results_by_id = {}
    for sig in get_sig_profiles():
        sig_by_name[sig.name] = sig

    # The key prefix only depends on the record, so compute it once.
    prefix = "%s:" % seq_record.id.replace(":", "_")

    for feature in utils.get_cds_features(seq_record):
        gene_id = utils.get_gene_id(feature)
        hmm_key = prefix + gene_id
        if hmm_key in options.hmm_results:
            results_by_id[gene_id] = options.hmm_results[hmm_key]
            results.extend(results_by_id[gene_id])

    short_cds_buffer = []
    if options.ignore_short_aa:
        # Temporarily filter out cds with < prot_min_length AA length
        min_length_aa = 100 if options.eukaryotic else 50
        kept_features = []
        # Partition into new lists instead of calling features.remove()
        # while iterating: removing during iteration skips the element that
        # follows every removed one.
        for feature in seq_record.features:
            if (feature.type == "CDS"
                    and len(feature.qualifiers['translation'][0]) <
                    min_length_aa
                    and utils.get_gene_id(feature) not in results_by_id):
                short_cds_buffer.append(feature)
            else:
                kept_features.append(feature)
        # Slice assignment keeps the same list object on the record.
        seq_record.features[:] = kept_features

    # Get overlap tables (for overlap filtering etc)
    overlaps = utils.get_overlaps_table(seq_record)

    # Filter results by comparing scores of different models (for PKS
    # systems). Snapshot the ids first so dropped genes can be detected.
    ids_before_filter = list(results_by_id)
    results, results_by_id = filter_results(results, results_by_id, overlaps,
                                            feature_by_id)

    # Update filtered results back to the options.hmm_results
    for gene_id in results_by_id:
        hmm_key = prefix + gene_id
        if hmm_key in options.hmm_results:
            options.hmm_results[hmm_key] = results_by_id[gene_id]
    # Dict membership replaces list.remove(), which was O(n) per gene and
    # raised ValueError for an id missing from the snapshot.
    for gene_id in ids_before_filter:
        if gene_id in results_by_id:
            continue
        hmm_key = prefix + gene_id
        if hmm_key in options.hmm_results:
            del options.hmm_results[hmm_key]

    # Use rules to determine gene clusters
    typedict = apply_cluster_rules(results_by_id, feature_by_id,
                                   enabled_clustertypes, rulesdict, overlaps)

    # Rearrange hybrid clusters name in typedict alphabetically
    fix_hybrid_clusters_typedict(typedict)

    # Find number of sequences on which each pHMM is based
    nseqdict = get_nseq()

    # Save final results to seq_record
    for cds in list(results_by_id):
        feature = feature_by_id[cds]
        if typedict[cds] != "none":
            _update_sec_met_entry(feature, results_by_id[cds], typedict[cds],
                                  nseqdict)

    find_clusters(seq_record, rulesdict, overlaps)

    # Find additional NRPS/PKS genes in gene clusters
    add_additional_nrpspks_genes(typedict, results_by_id, seq_record, nseqdict)

    # Rearrange hybrid clusters name alphabetically
    fix_hybrid_clusters(seq_record)

    # Add details of gene cluster detection to cluster features
    store_detection_details(results_by_id, rulesdict, seq_record)

    # Re-add the short CDSs
    seq_record.features.extend(short_cds_buffer)
    utils.sort_features(seq_record)

    # If all-orfs option on, remove irrelevant short orfs
    if options.all_orfs:
        remove_irrelevant_allorfs(seq_record)

    # Display %identity
    if options.enable_cdhit:
        store_percentage_identities(seq_record)
Exemplo n.º 5
0
def detect_signature_genes(seq_record, enabled_clustertypes, options):
    """Run the seed HMMs on *seq_record* and annotate the clusters found.

    hmmsearch is run with ``bgc_seeds.hmm`` against the record's multi-FASTA.
    Every HSP scoring above its signature's cutoff is kept, the hits go
    through the filters, cluster rules are applied, and the results are
    stored on the record. Returns None.
    """
    feature_by_id = utils.get_feature_dict(seq_record)
    full_fasta = utils.get_multifasta(seq_record)
    rulesdict = create_rules_dict(enabled_clustertypes)
    sig_by_name = dict((sig.name, sig) for sig in _signature_profiles)
    results = []
    results_by_id = {}

    seeds_path = utils.get_full_path(__file__, 'bgc_seeds.hmm')
    for hit in utils.run_hmmsearch(seeds_path, full_fasta, use_tempfile=True):
        accession = hit.accession.split('.')[0]
        for hsp in hit.hsps:
            # Look the signature up by query id first, then by accession.
            profile = sig_by_name.get(hsp.query_id)
            if profile is None:
                profile = sig_by_name.get(accession)
            if profile is None:
                logging.error(
                    'BUG: Failed to find signature for ID %s / ACC %s',
                    hsp.query_id, accession)
                continue
            # Store result if it is above cut-off
            if hsp.bitscore > profile.cutoff:
                results.append(hsp)
                results_by_id.setdefault(hsp.hit_id, []).append(hsp)

    # Overlap tables (for overlap filtering etc)
    overlaps = utils.get_overlaps_table(seq_record)

    # Compare scores of different models (for PKS systems)
    results, results_by_id = filter_results(results, results_by_id)

    # Overlapping-gene filter, only applied for plants
    if options.taxon == 'plants':
        results, results_by_id = filter_result_overlapping_genes(
            results, results_by_id, overlaps, feature_by_id)

    # Collapse multiple results of the same model in one gene
    results, results_by_id = filter_result_multiple(results, results_by_id)

    # Apply the rules to determine gene clusters
    typedict = apply_cluster_rules(results_by_id, feature_by_id,
                                   enabled_clustertypes, rulesdict, overlaps)

    # Number of sequences on which each pHMM is based
    nseqdict = get_nseq()

    # Save final results to seq_record
    for cds, cds_hits in list(results_by_id.items()):
        _update_sec_met_entry(feature_by_id[cds], cds_hits, typedict[cds],
                              nseqdict)

    find_clusters(seq_record, rulesdict)

    # Additional NRPS/PKS genes in gene clusters
    add_additional_nrpspks_genes(typedict, results_by_id, seq_record, nseqdict)
    # Details of gene cluster detection go onto the cluster features
    store_detection_details(results_by_id, rulesdict, seq_record)
    # With the all-orfs option on, remove irrelevant short orfs
    if options.all_orfs:
        remove_irrelevant_allorfs(seq_record)