def add_diversity_data_to_serovar(serovar_data):
    """Attach taxa and plasmid diversity metrics to each serovar record.

    For every serovar, computes entropy, Shannon, Simpson, Simpson evenness
    and chao1 over (a) the associated-taxa counts with the human taxon
    removed and (b) the plasmid counts.  Metrics are only added when at
    least 10 observations are available; otherwise the record is left as-is.
    Mutates ``serovar_data`` in place and returns it.
    """
    for serovar, data in serovar_data.items():
        # Exclude the human taxon before measuring host-taxa diversity.
        # (The label appears redacted in the source data.)
        non_human = {
            taxon: count
            for taxon, count in data['associated_taxa'].items()
            if taxon != 'h**o sapien'
        }

        taxa_counts = list(non_human.values())
        if taxa_counts and sum(taxa_counts) >= 10:
            data['taxa_entropy'] = calc_shanon_entropy(taxa_counts)
            data['taxa_shannon_index'] = alpha.shannon(taxa_counts)
            data['taxa_simpson_index'] = alpha.simpson(taxa_counts)
            data['taxa_simpson_index_e'] = alpha.simpson_e(taxa_counts)
            data['taxa_chao1'] = alpha.chao1(taxa_counts)

        plasmid_counts = list(data['plasmids'].values())
        if plasmid_counts and sum(plasmid_counts) >= 10:
            data['plasmid_entropy'] = calc_shanon_entropy(plasmid_counts)
            data['plasmid_shannon_index'] = alpha.shannon(plasmid_counts)
            data['plasmid_simpson_index'] = alpha.simpson(plasmid_counts)
            data['plasmid_simpson_index_e'] = alpha.simpson_e(plasmid_counts)
            data['plasmid_chao1'] = alpha.chao1(plasmid_counts)

    return serovar_data
def full_seq_freq(report, alns):
    """Build frequency records for complete sequences.

    Returns a list of tuples ``(sequence, cdr3, fraction, count,
    total_reads)`` where *sequence* is the full concatenated domain
    sequence.  Sequences with only one good read are dropped and tallied
    as singletons.  The list is prefixed with three summary rows: the
    chao1 diversity estimate (over all sequences), the singleton tally,
    and a blank spacer row.
    """
    n_failed = sum(x is None for x in alns)
    total = len(report) + n_failed

    good_counts = Counter()
    all_counts = Counter()
    cdr3_by_seq = {}
    for _, row in report.iterrows():
        # Concatenate the domains, strip gap characters, normalize case.
        sequence = "".join(row[dom] for dom, _ in DOMAIN_LENS)
        sequence = sequence.replace("-", "").upper()
        cdr3_by_seq[sequence] = row["H-CDR3"].replace("-", "").upper()
        all_counts[sequence] += 1
        if row["complete?"] and not row["frameshift?"]:
            good_counts[sequence] += 1

    n_singletons = 0
    records = []
    for sequence, count in good_counts.most_common():
        if count < 2:
            n_singletons += 1
        else:
            records.append(
                (sequence, cdr3_by_seq[sequence], count / total, count, total))

    header = [
        ("chao1_estimated_diversity", "", "", "",
         alpha.chao1(list(all_counts.values()))),
        ("unique_singleton_sequences", "", n_singletons / total,
         n_singletons, total),
        ("", "", "", "", ""),
    ]
    return header + records
def testChao1(self, otu): diversity = [0] * len(otu[0]) for j in range(len(otu[0])): diversity[j] = alpha.chao1([row[j] for row in otu], bias_corrected=True) print(diversity) print(self.chao1(otu, bias_corrected=True))
def test_chao1(self):
    """chao1 reproduces the hand-computed reference values, both with the
    default bias correction and with ``bias_corrected=False``.  Without
    singletons or doubletons, correction has no effect."""
    cases = (
        (self.counts, 9.75, 10.5),
        (self.no_singles, 4, 4),
        (self.no_doubles, 5, 5),
    )
    for counts, corrected, uncorrected in cases:
        self.assertEqual(chao1(counts), corrected)
        self.assertEqual(chao1(counts, bias_corrected=False), uncorrected)
def mercat_compute_alpha_beta_diversity(counts, bif):
    """Compute a panel of alpha-diversity metrics for *counts* and write
    them to ``<bif>_diversity_metrics.txt`` as ``name = value`` lines."""
    metric_funcs = (
        ('shannon', skbio_alpha.shannon),
        ('simpson', skbio_alpha.simpson),
        ('simpson_e', skbio_alpha.simpson_e),
        ('goods_coverage', skbio_alpha.goods_coverage),
        ('fisher_alpha', skbio_alpha.fisher_alpha),
        ('dominance', skbio_alpha.dominance),
        ('chao1', skbio_alpha.chao1),
        ('chao1_ci', skbio_alpha.chao1_ci),
        ('ace', skbio_alpha.ace),
    )
    abm = {name: func(counts) for name, func in metric_funcs}

    with open(bif + "_diversity_metrics.txt", 'w') as dmptr:
        for name, value in abm.items():
            dmptr.write(name + " = " + str(value) + "\n")
def get_chao_subsample(ids_list, subset_size):
    """Calculate a single Chao1 estimate from a random subsample of ids.

    Parameters
    ----------
    ids_list : sequence
        Observation identifiers, one entry per observed individual
        (repeats indicate abundance).
    subset_size : int
        Number of ids to draw without replacement.

    Returns
    -------
    The Chao1 richness estimate for the subsample's abundance counts.

    Raises
    ------
    ValueError
        From ``random.sample`` when ``subset_size`` exceeds the number
        of available ids.
    """
    # Local import so this fix is self-contained within the function.
    from collections import Counter

    id_subset = random.sample(ids_list, subset_size)
    # Counter replaces the hand-rolled tally loop: identical counts,
    # one C-accelerated pass.
    count_list_subset = list(Counter(id_subset).values())
    return skbio.chao1(count_list_subset)
def alpha_diversity(args): """ Our counts data in the biomfile is per OTU NOT per sample as needed. So it must be transformed """ try: json_data = open(args.in_file, 'r') except: print("NO FILE FOUND ERROR") sys.exit() data = json.load(json_data) json_data.close() #size = len(data['rows'])*len(data['columns']) #A = np.arange(size).reshape((len(data['rows']),len(data['columns']))) A = np.zeros(shape=(len(data['rows']), len(data['columns']))) #A.astype(int) #print A for i, counts in enumerate(data['data']): #print 'OTU:',data['rows'][i]['id'], counts #print alpha.chao1(counts) A[i] = counts #pass X = A.astype(int) # insure int #print X Y = np.transpose(X) txt = "Dataset\tobserved richness\tACE\tchao1\tShannon\tSimpson" print(txt) for i, row in enumerate(Y): ds = data['columns'][i]['id'] row = row.tolist() try: ace = alpha.ace(row) except: ace = 'error' try: chao1 = alpha.chao1(row) except: chao1 = 'error' try: osd = alpha.osd(row) except: osd = ['error'] try: simpson = alpha.simpson(row) except: simpson = 'error' try: shannon = alpha.shannon(row) except: shannon = 'error' txt = ds + "\t" + str(osd[0]) + "\t" + str(ace) + "\t" + str( chao1) + "\t" + str(shannon) + "\t" + str(simpson) print(txt)
def mobtyper_plasmid_summarize(mobtyper):
    """Aggregate per-sample mob-typer plasmid calls into per-plasmid summaries.

    Parameters
    ----------
    mobtyper : dict
        Mapping ``sample_id -> {plasmid_id -> record}``.  Each record has
        the mob-typer fields read below ('predicted_mobility',
        'rep_type(s)', 'relaxase_type(s)', 'resistance_genes') and
        optionally a 'metadata' dict of sample annotations.

    Returns
    -------
    dict
        ``plasmid_id -> summary`` with raw tallies (replicons, relaxases,
        serovars, taxa, years, ...) plus derived values (overall mobility
        and serovar, proportions, diversity indices).

    Notes
    -----
    Diversity indices are only computed for plasmids with >= 10
    observations; below that threshold the sentinel values from the
    template remain and a debug line is printed.
    """
    summary = {}
    # ---- pass 1: tally raw observations per plasmid ----------------------
    for sample_id in mobtyper:
        plasmids = mobtyper[sample_id]
        for plasmid_id in plasmids:
            data = plasmids[plasmid_id]
            if not plasmid_id in summary:
                # Template record; -1 marks "not computed" for the indices.
                # NOTE(review): 'poportion_human' is a typo but is kept
                # as-is because downstream consumers read this key.
                summary[plasmid_id] = {
                    'replicons': {},
                    'relaxases': {},
                    'overall_mobility': '',
                    'mobility': {
                        'conjugative': 0,
                        'mobilizable': 0,
                        'non-mobilizable': 0
                    },
                    'overall_serovar': '',
                    'serovar': {},
                    'continent': {},
                    'country': {},
                    'primary_sample_category': {},
                    'secondary_sample_category': {},
                    'associated_taxa': {},
                    'earliest_year': 0,
                    'year': {},
                    'samples': [],
                    'total_samples': 0,
                    'num_resistant': 0,
                    'proportion_resistant': 0,
                    'resistance_genes': {},
                    'serovar_entropy': -1,
                    'serovar_shannon_index': -1,
                    'serovar_simpson_index': -1,
                    'serovar_simpson_index_e': -1,
                    'serovar_chao1': 0,
                    'num_serovars': 0,
                    'poportion_human': 0,
                    'taxa_entropy': -1,
                    'taxa_shannon_index': -1,
                    'taxa_simpson_index': -1,
                    'taxa_simpson_index_e': -1,
                    'taxa_chao1': -1,
                }
            summary[plasmid_id]['total_samples'] += 1
            summary[plasmid_id]['samples'].append(sample_id)
            mobility = data['predicted_mobility']
            summary[plasmid_id]['mobility'][mobility] += 1

            # Replicon and relaxase fields are comma-delimited lists.
            rep = data['rep_type(s)'].split(",")
            for r in rep:
                if r not in summary[plasmid_id]['replicons']:
                    summary[plasmid_id]['replicons'][r] = 0
                summary[plasmid_id]['replicons'][r] += 1
            mob = data['relaxase_type(s)'].split(",")
            for m in mob:
                if m not in summary[plasmid_id]['relaxases']:
                    summary[plasmid_id]['relaxases'][m] = 0
                summary[plasmid_id]['relaxases'][m] += 1

            # Any resistance gene marks the whole observation as resistant.
            res_genes = data['resistance_genes']
            if len(res_genes) > 0:
                summary[plasmid_id]['num_resistant'] += 1
                for gene_id in res_genes:
                    if not gene_id in summary[plasmid_id]['resistance_genes']:
                        summary[plasmid_id]['resistance_genes'][gene_id] = 0
                    summary[plasmid_id]['resistance_genes'][
                        gene_id] += res_genes[gene_id]

            if not 'metadata' in data:
                continue
            # Tally metadata fields whose names match the template keys.
            for field_id in data['metadata']:
                value = data['metadata'][field_id]
                if value == 'nan' or value == '':
                    value = 'unknown'
                if not field_id in summary[plasmid_id]:
                    continue
                if field_id == 'associated_taxa':
                    # Taxa come as a list; skip blank entries.
                    for v in value:
                        if v == '' or v == 'nan':
                            continue
                        if not v in summary[plasmid_id][field_id]:
                            summary[plasmid_id][field_id][v] = 0
                        summary[plasmid_id][field_id][v] += 1
                    continue
                # FIX: was ``in ('resistance_genes')`` — a *string*, so this
                # tested substring containment, not tuple membership.
                if field_id in ('resistance_genes',):
                    continue
                if not value in summary[plasmid_id][field_id]:
                    summary[plasmid_id][field_id][value] = 0
                summary[plasmid_id][field_id][value] += 1

    # ---- pass 2: derive per-plasmid statistics ---------------------------
    for plasmid_id in summary:
        serovar_counts = list(summary[plasmid_id]['serovar'].values())
        if len(summary[plasmid_id]['year']) > 0:
            summary[plasmid_id]['earliest_year'] = min(
                list(summary[plasmid_id]['year'].keys()))
        if 'human' in summary[plasmid_id]['primary_sample_category']:
            value = summary[plasmid_id]['primary_sample_category']['human']
        else:
            value = 0
        summary[plasmid_id][
            'poportion_human'] = value / summary[plasmid_id]['total_samples']
        summary[plasmid_id]['num_serovars'] = len(
            summary[plasmid_id]['serovar'])
        summary[plasmid_id]['proportion_resistant'] = summary[plasmid_id][
            'num_resistant'] / summary[plasmid_id]['total_samples']
        # Most frequent mobility/serovar wins the "overall" label.
        summary[plasmid_id]['overall_mobility'] = max(
            summary[plasmid_id]['mobility'],
            key=summary[plasmid_id]['mobility'].get)
        if len(summary[plasmid_id]['serovar']) > 0:
            summary[plasmid_id]['overall_serovar'] = max(
                summary[plasmid_id]['serovar'],
                key=summary[plasmid_id]['serovar'].get)

        if len(serovar_counts) > 0 and sum(serovar_counts) >= 10:
            summary[plasmid_id]['serovar_entropy'] = calc_shanon_entropy(
                serovar_counts)
            summary[plasmid_id]['serovar_shannon_index'] = alpha.shannon(
                serovar_counts)
            summary[plasmid_id]['serovar_simpson_index'] = alpha.simpson(
                serovar_counts)
            summary[plasmid_id]['serovar_simpson_index_e'] = alpha.simpson_e(
                serovar_counts)
            summary[plasmid_id]['serovar_chao1'] = alpha.chao1(serovar_counts)
        else:
            # Debug trace for plasmids below the diversity threshold.
            print("{}\t{}".format(plasmid_id, sum(serovar_counts)))
            print(summary[plasmid_id])

        # Exclude the human taxon before measuring host-taxa diversity.
        # NOTE(review): the label is redacted here ('h**o sapiens') and a
        # sibling function filters 'h**o sapien' (no trailing s) — confirm
        # which spelling the data actually uses.
        human_removed_taxa = {}
        for taxon in summary[plasmid_id]['associated_taxa']:
            if taxon == 'h**o sapiens':
                continue
            human_removed_taxa[taxon] = summary[plasmid_id]['associated_taxa'][
                taxon]
        taxa_counts = list(human_removed_taxa.values())
        if len(taxa_counts) > 0 and sum(taxa_counts) >= 10:
            summary[plasmid_id]['taxa_entropy'] = calc_shanon_entropy(
                taxa_counts)
            summary[plasmid_id]['taxa_shannon_index'] = alpha.shannon(
                taxa_counts)
            summary[plasmid_id]['taxa_simpson_index'] = alpha.simpson(
                taxa_counts)
            summary[plasmid_id]['taxa_simpson_index_e'] = alpha.simpson_e(
                taxa_counts)
            summary[plasmid_id]['taxa_chao1'] = alpha.chao1(taxa_counts)
    return summary
def alpha_diversity(args): """ Our counts data in the biomfile is per OTU NOT per sample as needed. So it must be transformed """ try: json_data = open(args.in_file, 'r') except: print("NO FILE FOUND ERROR") sys.exit() data = json.load(json_data) json_data.close() #size = len(data['rows'])*len(data['columns']) #A = np.arange(size).reshape((len(data['rows']),len(data['columns']))) A = np.zeros(shape=(len(data['rows']),len(data['columns']))) #A.astype(int) #print A for i,counts in enumerate(data['data']): #print 'OTU:',data['rows'][i]['id'], counts #print alpha.chao1(counts) A[i] = counts #pass X = A.astype(int) # insure int #print X Y = np.transpose(X) txt = "Dataset\tobserved richness\tACE\tchao1\tShannon\tSimpson" print(txt) for i,row in enumerate(Y): ds = data['columns'][i]['id'] row = row.tolist() try: ace = alpha.ace(row) except: ace = 'error' try: chao1 = alpha.chao1(row) except: chao1 = 'error' try: osd = alpha.osd(row) except: osd = ['error'] try: simpson = alpha.simpson(row) except: simpson = 'error' try: shannon = alpha.shannon(row) except: shannon = 'error' txt = ds+"\t"+str(osd[0])+"\t"+str(ace)+"\t"+str(chao1)+"\t"+str(shannon)+"\t"+str(simpson) print(txt)