Example #1
def add_diversity_data_to_serovar(serovar_data):
    """Attach alpha-diversity metrics (entropy, Shannon, Simpson, Chao1) for the
    associated taxa (humans excluded) and the plasmids of each serovar record."""
    for serovar in serovar_data:
        human_removed_taxa = {}
        data = serovar_data[serovar]
        for taxon in data['associated_taxa']:
            if taxon == 'homo sapien':
                continue
            human_removed_taxa[taxon] = data['associated_taxa'][taxon]

        taxa_counts = list(human_removed_taxa.values())
        if len(taxa_counts) > 0 and sum(taxa_counts) >= 10:
            serovar_data[serovar]['taxa_entropy'] = calc_shanon_entropy(
                taxa_counts)
            serovar_data[serovar]['taxa_shannon_index'] = alpha.shannon(
                taxa_counts)
            serovar_data[serovar]['taxa_simpson_index'] = alpha.simpson(
                taxa_counts)
            serovar_data[serovar]['taxa_simpson_index_e'] = alpha.simpson_e(
                taxa_counts)
            serovar_data[serovar]['taxa_chao1'] = alpha.chao1(taxa_counts)

        plasmid_counts = list(serovar_data[serovar]['plasmids'].values())
        if len(plasmid_counts) > 0 and sum(plasmid_counts) >= 10:
            serovar_data[serovar]['plasmid_entropy'] = calc_shanon_entropy(
                plasmid_counts)
            serovar_data[serovar]['plasmid_shannon_index'] = alpha.shannon(
                plasmid_counts)
            serovar_data[serovar]['plasmid_simpson_index'] = alpha.simpson(
                plasmid_counts)
            serovar_data[serovar]['plasmid_simpson_index_e'] = alpha.simpson_e(
                plasmid_counts)
            serovar_data[serovar]['plasmid_chao1'] = alpha.chao1(
                plasmid_counts)

    return serovar_data
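
A minimal way to exercise this function is sketched below; the input dict, the skbio `alpha` import, and the stand-in for the project's `calc_shanon_entropy` helper are assumptions for illustration, not part of the original source.

import numpy as np
from skbio.diversity import alpha  # assumed import, matching the calls above


def calc_shanon_entropy(counts):
    # stand-in for the project helper, which is not shown in the snippet
    p = np.asarray(counts, dtype=float) / sum(counts)
    return float(-(p * np.log2(p)).sum())


# Hypothetical per-serovar tallies of associated taxa and plasmids
serovar_data = {
    'Typhimurium': {
        'associated_taxa': {'homo sapien': 8, 'bos taurus': 6, 'gallus gallus': 5},
        'plasmids': {'IncFIB': 7, 'IncI1': 4, 'ColRNAI': 2},
    }
}

serovar_data = add_diversity_data_to_serovar(serovar_data)
print(serovar_data['Typhimurium']['taxa_chao1'],
      serovar_data['Typhimurium']['plasmid_shannon_index'])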
Example #2
def full_seq_freq(report, alns):
    """
    return a list of tuples of the form:
      (sequence, fraction, count, total_reads)
    where sequence is the complete sequence
    sequences with only 1 read are dropped
    """
    failed = sum(1 for x in alns if x is None)
    tot = len(report) + failed
    good_ctr, all_ctr, seq_to_cdr3 = Counter(), Counter(), {}
    for _, row in report.iterrows():
        seq = "".join(row[domain] for domain, _ in DOMAIN_LENS)
        seq = seq.replace("-", "").upper()
        seq_to_cdr3[seq] = row["H-CDR3"].replace("-", "").upper()
        all_ctr[seq] += 1
        if row["complete?"] and not row["frameshift?"]:
            good_ctr[seq] += 1

    singletons, ret = 0, []
    for seq, ct in good_ctr.most_common():
        if ct < 2:
            singletons += 1
        else:
            ret.append((seq, seq_to_cdr3[seq], ct / tot, ct, tot))
    ret.insert(0, ("", "", "", "", ""))
    ret.insert(
        0,
        ("unique_singleton_sequences", "", singletons / tot, singletons, tot))
    ret.insert(
        0,
        ("chao1_estimated_diversity", "", "", "",
         alpha.chao1(list(all_ctr.values()))),
    )
    return ret
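
A hypothetical invocation of full_seq_freq; the DOMAIN_LENS layout, the DataFrame contents, and the imports are assumptions used only for illustration.

import pandas as pd
from collections import Counter    # needed by full_seq_freq
from skbio.diversity import alpha  # needed for alpha.chao1

DOMAIN_LENS = [('FR1', 25), ('H-CDR3', 12)]   # hypothetical domain layout

report = pd.DataFrame({
    'FR1':         ['evqlv', 'evqlv', 'evqla'],
    'H-CDR3':      ['ardyw', 'ardyw', 'ardfw'],
    'complete?':   [True, True, True],
    'frameshift?': [False, False, False],
})
alns = ['aln1', 'aln2', None]   # one failed alignment, counted in the total

rows = full_seq_freq(report, alns)
# rows[0] -> ('chao1_estimated_diversity', '', '', '', <chao1 of all sequences>)
# rows[1] -> ('unique_singleton_sequences', '', 0.25, 1, 4)
# rows[2] -> ('', '', '', '', '')
# rows[3] -> ('EVQLVARDYW', 'ARDYW', 0.5, 2, 4)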
Example #3
    def testChao1(self, otu):
        diversity = [0] * len(otu[0])
        for j in range(len(otu[0])):
            diversity[j] = alpha.chao1([row[j] for row in otu],
                                       bias_corrected=True)
        print(diversity)
        print(self.chao1(otu, bias_corrected=True))
Example #4
    def test_chao1(self):
        self.assertEqual(chao1(self.counts), 9.75)
        self.assertEqual(chao1(self.counts, bias_corrected=False), 10.5)

        self.assertEqual(chao1(self.no_singles), 4)
        self.assertEqual(chao1(self.no_singles, bias_corrected=False), 4)

        self.assertEqual(chao1(self.no_doubles), 5)
        self.assertEqual(chao1(self.no_doubles, bias_corrected=False), 5)
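
The test fixtures are not shown here, but the expected numbers follow directly from the Chao1 definition: with S_obs observed species, F1 singletons and F2 doubletons, the bias-corrected estimate is S_obs + F1*(F1-1)/(2*(F2+1)) and the classic estimate is S_obs + F1**2/(2*F2). A hypothetical count vector with 9 observed species, 3 singletons and 3 doubletons reproduces both expected values:

from skbio.diversity.alpha import chao1

counts = [0, 1, 1, 1, 2, 2, 2, 4, 4, 5]     # hypothetical: S_obs=9, F1=3, F2=3
print(chao1(counts))                        # 9 + 3*2/(2*4)  = 9.75
print(chao1(counts, bias_corrected=False))  # 9 + 3**2/(2*3) = 10.5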
Example #5
def mercat_compute_alpha_beta_diversity(counts, bif):
    """Compute a panel of skbio alpha-diversity metrics for `counts` and write
    them to <bif>_diversity_metrics.txt."""

    abm = dict()

    abm['shannon'] = skbio_alpha.shannon(counts)
    abm['simpson'] = skbio_alpha.simpson(counts)
    abm['simpson_e'] = skbio_alpha.simpson_e(counts)
    abm['goods_coverage'] = skbio_alpha.goods_coverage(counts)
    abm['fisher_alpha'] = skbio_alpha.fisher_alpha(counts)
    abm['dominance'] = skbio_alpha.dominance(counts)
    abm['chao1'] = skbio_alpha.chao1(counts)
    abm['chao1_ci'] = skbio_alpha.chao1_ci(counts)
    abm['ace'] = skbio_alpha.ace(counts)

    with open(bif + "_diversity_metrics.txt", 'w') as dmptr:
        for abmetric in abm:
            dmptr.write(abmetric + " = " + str(abm[abmetric]) + "\n")
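
A hypothetical invocation; the counts vector and the output prefix are made up, and the skbio_alpha alias is assumed to match the snippet's import.

from skbio.diversity import alpha as skbio_alpha

kmer_counts = [120, 45, 45, 7, 3, 1, 1, 1]        # hypothetical abundance vector
mercat_compute_alpha_beta_diversity(kmer_counts, 'sampleA')
# writes sampleA_diversity_metrics.txt with one "metric = value" line per entry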
Example #6
def get_chao_subsample(ids_list, subset_size):
    """Calculate a single Chao1 using an id list and a target subset size"""

    id_subset = random.sample(ids_list, subset_size)

    otu_dict = dict()
    for otu_id in id_subset:

        if otu_id not in otu_dict:
            otu_dict[otu_id] = 1
        else:
            otu_dict[otu_id] += 1

    count_list_subset = list(otu_dict.values())
    chao_estimate = skbio.chao1(count_list_subset)

    return chao_estimate
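
For example, with a hypothetical list of per-read OTU assignments; the skbio alias below is an assumption chosen to match the snippet's skbio.chao1 call.

import random
from skbio.diversity import alpha as skbio

ids_list = ['otu_1'] * 50 + ['otu_2'] * 30 + ['otu_3'] * 5 + ['otu_4']
print(get_chao_subsample(ids_list, subset_size=40))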
Example #7
def alpha_diversity(args):
    """
        Our counts data in the biomfile is per OTU NOT per sample as needed.
        So it must be transformed
    """

    try:
        json_data = open(args.in_file, 'r')
    except IOError:
        print("ERROR: could not open input file: {}".format(args.in_file))
        sys.exit(1)

    data = json.load(json_data)
    json_data.close()
    A = np.zeros(shape=(len(data['rows']), len(data['columns'])))
    for i, counts in enumerate(data['data']):
        A[i] = counts

    X = A.astype(int)  # ensure integer counts
    Y = np.transpose(X)
    txt = "Dataset\tobserved richness\tACE\tchao1\tShannon\tSimpson"
    print(txt)
    for i, row in enumerate(Y):
        ds = data['columns'][i]['id']
        row = row.tolist()

        try:
            ace = alpha.ace(row)
        except:
            ace = 'error'

        try:
            chao1 = alpha.chao1(row)
        except:
            chao1 = 'error'

        try:
            osd = alpha.osd(row)
        except:
            osd = ['error']

        try:
            simpson = alpha.simpson(row)
        except:
            simpson = 'error'

        try:
            shannon = alpha.shannon(row)
        except:
            shannon = 'error'
        txt = ds + "\t" + str(osd[0]) + "\t" + str(ace) + "\t" + str(
            chao1) + "\t" + str(shannon) + "\t" + str(simpson)

        print(txt)
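
The function expects a dense, BIOM-style JSON document; a minimal hypothetical input would look like this (only the keys the code actually reads are shown).

biom_doc = {
    'rows':    [{'id': 'OTU_1'}, {'id': 'OTU_2'}, {'id': 'OTU_3'}],
    'columns': [{'id': 'sample_A'}, {'id': 'sample_B'}],
    'data':    [[10, 0],          # data[i] = counts for OTU i across samples
                [3, 7],
                [1, 1]],
}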
Example #8
def mobtyper_plasmid_summarize(mobtyper):
    """Aggregate per-sample mobtyper plasmid records into per-plasmid summaries:
    replicons, relaxases, mobility, metadata tallies and diversity metrics."""
    summary = {}
    for sample_id in mobtyper:
        plasmids = mobtyper[sample_id]
        for plasmid_id in plasmids:
            data = plasmids[plasmid_id]
            if not plasmid_id in summary:
                summary[plasmid_id] = {
                    'replicons': {},
                    'relaxases': {},
                    'overall_mobility': '',
                    'mobility': {
                        'conjugative': 0,
                        'mobilizable': 0,
                        'non-mobilizable': 0
                    },
                    'overall_serovar': '',
                    'serovar': {},
                    'continent': {},
                    'country': {},
                    'primary_sample_category': {},
                    'secondary_sample_category': {},
                    'associated_taxa': {},
                    'earliest_year': 0,
                    'year': {},
                    'samples': [],
                    'total_samples': 0,
                    'num_resistant': 0,
                    'proportion_resistant': 0,
                    'resistance_genes': {},
                    'serovar_entropy': -1,
                    'serovar_shannon_index': -1,
                    'serovar_simpson_index': -1,
                    'serovar_simpson_index_e': -1,
                    'serovar_chao1': 0,
                    'num_serovars': 0,
                    'proportion_human': 0,
                    'taxa_entropy': -1,
                    'taxa_shannon_index': -1,
                    'taxa_simpson_index': -1,
                    'taxa_simpson_index_e': -1,
                    'taxa_chao1': -1,
                }
            summary[plasmid_id]['total_samples'] += 1
            summary[plasmid_id]['samples'].append(sample_id)
            mobility = data['predicted_mobility']
            summary[plasmid_id]['mobility'][mobility] += 1

            rep = data['rep_type(s)'].split(",")
            for r in rep:
                if r not in summary[plasmid_id]['replicons']:
                    summary[plasmid_id]['replicons'][r] = 0
                summary[plasmid_id]['replicons'][r] += 1

            mob = data['relaxase_type(s)'].split(",")
            for m in mob:
                if m not in summary[plasmid_id]['relaxases']:
                    summary[plasmid_id]['relaxases'][m] = 0
                summary[plasmid_id]['relaxases'][m] += 1

            res_genes = data['resistance_genes']

            if len(res_genes) > 0:
                summary[plasmid_id]['num_resistant'] += 1
                for gene_id in res_genes:
                    if not gene_id in summary[plasmid_id]['resistance_genes']:
                        summary[plasmid_id]['resistance_genes'][gene_id] = 0
                    summary[plasmid_id]['resistance_genes'][
                        gene_id] += res_genes[gene_id]

            if not 'metadata' in data:
                continue

            for field_id in data['metadata']:
                value = data['metadata'][field_id]

                if value == 'nan' or value == '':
                    value = 'unknown'

                if not field_id in summary[plasmid_id]:
                    continue

                if field_id == 'associated_taxa':
                    for v in value:
                        if v == '' or v == 'nan':
                            continue
                        if not v in summary[plasmid_id][field_id]:
                            summary[plasmid_id][field_id][v] = 0
                        summary[plasmid_id][field_id][v] += 1
                    continue
                if field_id in ('resistance_genes',):  # tuple, not a bare string
                    continue

                if not value in summary[plasmid_id][field_id]:
                    summary[plasmid_id][field_id][value] = 0
                summary[plasmid_id][field_id][value] += 1

    for plasmid_id in summary:
        serovar_counts = list(summary[plasmid_id]['serovar'].values())
        if len(summary[plasmid_id]['year']) > 0:
            summary[plasmid_id]['earliest_year'] = min(
                list(summary[plasmid_id]['year'].keys()))
        if 'human' in summary[plasmid_id]['primary_sample_category']:
            value = summary[plasmid_id]['primary_sample_category']['human']
        else:
            value = 0

        summary[plasmid_id][
            'proportion_human'] = value / summary[plasmid_id]['total_samples']

        summary[plasmid_id]['num_serovars'] = len(
            summary[plasmid_id]['serovar'])
        summary[plasmid_id]['proportion_resistant'] = summary[plasmid_id][
            'num_resistant'] / summary[plasmid_id]['total_samples']

        summary[plasmid_id]['overall_mobility'] = max(
            summary[plasmid_id]['mobility'],
            key=summary[plasmid_id]['mobility'].get)
        if len(summary[plasmid_id]['serovar']) > 0:
            summary[plasmid_id]['overall_serovar'] = max(
                summary[plasmid_id]['serovar'],
                key=summary[plasmid_id]['serovar'].get)

        if len(serovar_counts) > 0 and sum(serovar_counts) >= 10:
            summary[plasmid_id]['serovar_entropy'] = calc_shanon_entropy(
                serovar_counts)
            summary[plasmid_id]['serovar_shannon_index'] = alpha.shannon(
                serovar_counts)
            summary[plasmid_id]['serovar_simpson_index'] = alpha.simpson(
                serovar_counts)
            summary[plasmid_id]['serovar_simpson_index_e'] = alpha.simpson_e(
                serovar_counts)
            summary[plasmid_id]['serovar_chao1'] = alpha.chao1(serovar_counts)
        else:
            print("{}\t{}".format(plasmid_id, sum(serovar_counts)))
            print(summary[plasmid_id])
        human_removed_taxa = {}
        for taxon in summary[plasmid_id]['associated_taxa']:
            if taxon == 'homo sapiens':
                continue
            human_removed_taxa[taxon] = summary[plasmid_id]['associated_taxa'][
                taxon]

        taxa_counts = list(human_removed_taxa.values())
        if len(taxa_counts) > 0 and sum(taxa_counts) >= 10:
            summary[plasmid_id]['taxa_entropy'] = calc_shanon_entropy(
                taxa_counts)
            summary[plasmid_id]['taxa_shannon_index'] = alpha.shannon(
                taxa_counts)
            summary[plasmid_id]['taxa_simpson_index'] = alpha.simpson(
                taxa_counts)
            summary[plasmid_id]['taxa_simpson_index_e'] = alpha.simpson_e(
                taxa_counts)
            summary[plasmid_id]['taxa_chao1'] = alpha.chao1(taxa_counts)

    return summary
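
A minimal, hypothetical input record illustrating the fields this summarizer reads; the sample and plasmid identifiers, gene names and metadata values are made up.

mobtyper = {
    'SRR0000001': {
        'AA474': {
            'predicted_mobility': 'conjugative',
            'rep_type(s)': 'IncFIB,IncFII',
            'relaxase_type(s)': 'MOBF',
            'resistance_genes': {'blaTEM-1': 1},
            'metadata': {
                'serovar': 'Typhimurium',
                'continent': 'europe',
                'country': 'germany',
                'primary_sample_category': 'human',
                'secondary_sample_category': 'clinical',
                'associated_taxa': ['homo sapiens'],
                'year': '2015',
            },
        }
    }
}

summary = mobtyper_plasmid_summarize(mobtyper)
print(summary['AA474']['overall_serovar'], summary['AA474']['proportion_resistant'])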