def remove_samples(self, samps):
    """
    Drop one or more samples from the loaded genotype data.

    Parameters
    ----------
    samps : str or list of str
        Name(s) of the samples to remove. A single name may be passed
        as a plain string.

    Raises
    ------
    IPyradError
        If any of the requested names is not in self.samples_vcforder.

    Notes
    -----
    Updates self.samples_vcforder and self.genotypes in place, and lowers
    self.ncomponents if fewer samples than components remain.
    """
    ## Allow to just pass in one sample as a string
    if isinstance(samps, str):
        samps = [samps]

    ## Every requested name has to exist. A strict-superset comparison
    ## (set(samps) > set(all_samples)) would only trigger when *all*
    ## samples plus an unknown one were requested, so test for a subset.
    if not set(samps).issubset(self.samples_vcforder):
        raise IPyradError(
            " Trying to remove samples not present in the vcf file: {}".format(samps))

    ## Remove the samples from the sample list and the matching
    ## columns from the genotype array
    mask = np.isin(self.samples_vcforder, samps)
    self.samples_vcforder = self.samples_vcforder[~mask]
    self.genotypes = self.genotypes[:, ~mask]

    ## Remove biallelic singletons. If you don't do this you get
    ## a nasty error during svd, like this:
    ## https://stackoverflow.com/questions/33447808/sklearns-plsregression-valueerror-array-must-not-contain-infs-or-nans
    ac = self.genotypes.count_alleles()
    flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
    self.genotypes = self.genotypes.compress(flt, axis=0)

    ## Can't ask for more components than there are samples left
    if len(self.samples_vcforder) < self.ncomponents:
        self.ncomponents = len(self.samples_vcforder)
        print(
            " INFO: Number of PCs may not exceed the number of samples.\n Setting number of PCs = {}"
            .format(self.ncomponents))
def plot_pairwise_dist(self, labels=None, ax=None, cmap=None, cdict=None, metric="euclidean"):
    """
    Draw the matrix of pairwise distances between all samples.

    labels: bool or list
        Labels are left off by default. Pass True to use the sample
        names in vcf order, or pass a list with exactly one entry per
        sample.
    ax: axis to draw on; a new 5x5 figure is made if none is given.
    metric: distance metric name handed to allel.pairwise_distance.

    cmap and cdict are accepted but not used by this method.
    """
    ## distances are computed on alt-allele counts
    dist = allel.pairwise_distance(self.genotypes.to_n_alt(), metric=metric)

    if not ax:
        ax = plt.figure(figsize=(5, 5)).add_subplot(1, 1, 1)

    if labels is not None and not isinstance(labels, bool):
        ## A user supplied sequence: it has to line up with the samples
        nlabels = len(labels)
        nsamples = len(self.samples_vcforder)
        if nlabels != nsamples:
            raise IPyradError(LABELS_LENGTH_ERROR.format(nlabels, nsamples))
    elif labels:
        ## labels=True means "use the names from the vcf"
        labels = list(self.samples_vcforder)

    allel.plot.pairwise_distance(dist, labels=labels, ax=ax, colorbar=False)
def plot(self, show_test_labels=True, use_edge_lengths=False, collapse_outgroup=False, pct_tree_x=0.5, pct_tree_y=0.7, *args, **kwargs):
    """
    Draw the tree / tests / results panel figure.

    Needs both self.newick and self.tests to be set. Any extra positional
    or keyword arguments are handed on to baba_panel_plot unchanged.
    Returns the (canvas, axes, panel) triple produced by baba_panel_plot.
    """
    ## nothing can be drawn without a tree and at least one test
    if not self.newick:
        raise IPyradError("baba plot requires a newick treefile")
    if not self.tests:
        raise IPyradError("baba plot must have a .tests attribute")

    ## a lone test dict is stored back as a one-item list
    if isinstance(self.tests, dict):
        self.tests = [self.tests]

    ## rebuild the tree object from the newick string
    tree = toytree.tree(self.newick, orient='down', use_edge_lengths=use_edge_lengths)

    ## hand everything to the panel plotting function
    canvas, axes, panel = baba_panel_plot(
        *args,
        ttree=tree,
        tests=self.tests,
        boots=self.results_boots,
        show_test_labels=show_test_labels,
        use_edge_lengths=use_edge_lengths,
        collapse_outgroup=collapse_outgroup,
        pct_tree_x=pct_tree_x,
        pct_tree_y=pct_tree_y,
        **kwargs)
    return canvas, axes, panel
def showstats(parsedict):
    """
    Load a saved Assembly and print its stats to the screen, or die trying.

    Parameters
    ----------
    parsedict : dict
        Parsed params. Only the "project_dir" and "assembly_name" entries
        are read. An empty project_dir falls back to "./".

    Raises
    ------
    SystemExit
        If the project directory does not exist.
    IPyradError
        If the assembly name is empty or unset.
    """
    project_dir = parsedict["project_dir"]
    if not project_dir:
        project_dir = "./"
    assembly_name = parsedict["assembly_name"]

    ## If the project_dir doesn't exist don't even bother trying harder.
    if not os.path.isdir(project_dir):
        msg = """ Trying to print stats for Assembly ({}) that doesn't exist. You must first run steps before you can show results. """.format(project_dir)
        sys.exit(msg)

    if not assembly_name:
        msg = """ Assembly name is not set in params.txt, meaning it was either changed or erased since the Assembly was started. Please restore the original name. You can find the name of your Assembly in the "project dir": {}. """.format(project_dir)
        raise IPyradError(msg)

    ## Build the path only after both pieces have been validated. Joining
    ## up front raised a bare TypeError for a missing (None) name before
    ## either of the helpful messages above could be shown.
    my_assembly = os.path.join(project_dir, assembly_name)
    data = ip.load_json(my_assembly, quiet=True, cli=True)

    print("\nSummary stats of Assembly {}".format(data.name) \
         +"\n------------------------------------------------")

    if data.stats.empty:
        print("No stats to display")
        return

    print(data.stats)
    print("\n\nFull stats files"\
         +"\n------------------------------------------------")

    ## show stats file paths relative to the current directory
    fullcurdir = os.path.realpath(os.path.curdir)
    for i in range(1, 8):
        try:
            val = data.stats_files["s" + str(i)].replace(fullcurdir, ".")
        except (KeyError, AttributeError):
            ## step not run yet, or no file recorded for it
            val = None
        print("step {}: {}".format(i, val))
    print("\n")
def __init__(self, data=None, pops=None, ncomps=10, quiet=True):
    """
    ipyrad.analysis PCA Class object.

    Parameters
    ----------
    data : Assembly object or path to file
        Either an ipyrad assembly or a string path to a .vcf file. If
        it's a string path then you'll probably want to specify pops as
        well or else all your dots will be the same color.
    pops : dict or path to file
        A dictionary specifying the population assignment of each
        sample. This is optional, since by default if you used a pops
        file during your assembly the assembly object will include the
        pops info internally. If a file path is given and the file has
        no "#" line, a "# pop:1 ..." line is appended to that file.
    ncomps : int
        The number of PCs to calculate.
    quiet : bool
        If False, print the populations that will be used.

    Raises
    ------
    IPyradError
        If the assembly has no vcf outfile, or the pops file is missing.
    """
    self.quiet = quiet
    self.ncomponents = ncomps

    ## parse data as (1) an Assembly, or (2) a path to a vcf file
    if isinstance(data, Assembly):
        self.assembly = data
        self.pops = data.populations
        try:
            self.data = data.outfiles.vcf
        except AttributeError:
            raise IPyradError(MISSING_VCF_ERROR)
    else:
        ## You need a dummy assembly because we use the machinery
        ## of _link_populations below to read in the pops data
        self.assembly = Assembly("ipyrad-pca-tmp", quiet=True)
        self.data = os.path.realpath(data)
        self.pops = {}

    if pops:
        if isinstance(pops, dict):
            ## Wrap the values as (minsamp, samples) so a passed-in dict
            ## has the same layout as an assembly.populations dict; the
            ## minsamp part is stripped again just below.
            self.pops = {x: (0, y) for x, y in pops.items()}
        else:
            if not os.path.isfile(pops):
                raise IPyradError("popfile does not exist - {}".format(pops))

            ## Read the file once, and make sure the handle is closed
            with open(pops, 'r') as infile:
                lines = infile.readlines()

            ## If the file has no "#" minsamp line, append one that
            ## asks for 1 sample per population.
            if not any(line.startswith("#") for line in lines):
                ## blank or one-column lines carry no population name
                fields = [line.split() for line in lines]
                popnames = sorted(set(f[1] for f in fields if len(f) > 1))
                with open(pops, 'a') as outfile:
                    ## don't glue the new line onto the last record
                    if lines and not lines[-1].endswith("\n"):
                        outfile.write("\n")
                    outfile.write(
                        "# " + " ".join(["{}:1".format(x) for x in popnames]))

            self.assembly.paramsdict["pop_assign_file"] = os.path.realpath(pops)
            self.assembly._link_populations()
            self.pops = self.assembly.populations

    ## The populations dict still carries the minsamp info as the first
    ## item of each value; keep only the second item (the samples).
    self.pops = {samp: self.pops[samp][1] for samp in self.pops}

    ## Read in the vcf and extract the samples and the data
    ## This will set self.samples_vcforder which is a list of sample names
    ## in the order they appear in the vcf file
    self._load_calldata()

    ## If no pops linked yet (either none in the assembly or none passed in)
    ## then everybody goes into one giant default population.
    if not self.pops:
        self.pops = {"All_samples": self.samples_vcforder}

    ## self.pops is never empty at this point, so the former
    ## "no populations assigned" message could not be reached.
    if not self.quiet:
        print(" Using populations:\n{}".format(self.pops))
def plot(self, pcs=[1, 2], ax=None, cmap=None, cdict=None, legend=True, title=None, outfile=None):
    """
    Do the PCA and plot it.

    Parameters
    ---------
    pcs: list of ints
        The two PCs to plot (1-indexed), x axis first. Default is
        PC1 vs PC2.
    ax: matplotlib axis
        Axis to draw on. If not given a new 6x5 figure is created.
    cmap: matplotlib colormap
        A colormap, or the name of one. Populations are colored at
        evenly spaced positions along it. Defaults to 'Spectral' when
        neither cmap nor cdict is given.
    cdict: dictionary mapping pop names to colors
        Only used when no cmap is passed in.
    legend: boolean, whether or not to show the legend
    title: optional title for the axis
    outfile: optional path; the figure is written there as png

    Returns
    -------
    The matplotlib axis that was drawn on. The PC coordinates are also
    stored on self.pcs as a DataFrame indexed by sample.

    Raises
    ------
    IPyradError
        If a requested PC is outside 1..ncomponents or cmap is invalid.
    """
    ## Cap the number of components first, so the requested PCs are
    ## validated against the number that will actually be computed.
    if self.ncomponents > len(self.samples_vcforder):
        self.ncomponents = len(self.samples_vcforder)
        print(" INFO: # PCs < # samples. Forcing # PCs = {}".format(self.ncomponents))

    ## Specify which 2 pcs to plot, default is pc1 and pc2. Both ends
    ## of the range are checked for both axes.
    pc1 = pcs[0] - 1
    pc2 = pcs[1] - 1
    if min(pc1, pc2) < 0 or max(pc1, pc2) > self.ncomponents - 1:
        raise IPyradError("PCs are 1-indexed. 1 is min & {} is max".format(self.ncomponents))

    ## Convert genotype data to allele count data
    ## We do this here because we might want to try different ways
    ## of accounting for missing data and "alt" allele counts treat
    ## missing data as "ref"
    allele_counts = self.genotypes.to_n_alt()

    ## Actually do the pca
    coords, model = allel.pca(allele_counts, n_components=self.ncomponents, scaler='patterson')

    self.pcs = pd.DataFrame(
        coords,
        index=self.samples_vcforder,
        columns=["PC{}".format(x) for x in range(1, self.ncomponents + 1)])

    ## Just allow folks to pass in the name of the cmap they want to use
    if isinstance(cmap, str):
        try:
            cmap = cm.get_cmap(cmap)
        except Exception:
            raise IPyradError(" Bad cmap value: {}".format(cmap))

    if not cmap and not cdict:
        if not self.quiet:
            print(" Using default cmap: Spectral")
        cmap = cm.get_cmap('Spectral')

    if cmap:
        if cdict:
            print(" Passing in both cmap and cdict defaults to using the cmap value.")
        ## float() keeps the fractions from collapsing to 0 under
        ## integer division
        npops = len(self.pops)
        popcolors = cmap(np.arange(npops) / float(npops))
        cdict = dict(zip(self.pops.keys(), popcolors))

    ## fig stays None when the caller owns the axis (and its layout)
    fig = None
    if not ax:
        fig = plt.figure(figsize=(6, 5))
        ax = fig.add_subplot(1, 1, 1)

    x = coords[:, pc1]
    y = coords[:, pc2]
    for pop in self.pops:
        ## Don't include pops with no samples, it makes the legend look stupid
        ## TODO: This doesn't prevent empty pops from showing up in the
        ## legend for some reason.
        if len(self.pops[pop]) > 0:
            mask = np.isin(self.samples_vcforder, self.pops[pop])
            ax.plot(x[mask], y[mask], marker='o', linestyle=' ', color=cdict[pop],
                    label=pop, markersize=6, mec='k', mew=.5)

    ax.set_xlabel('PC%s (%.1f%%)' % (pc1+1, model.explained_variance_ratio_[pc1]*100))
    ax.set_ylabel('PC%s (%.1f%%)' % (pc2+1, model.explained_variance_ratio_[pc2]*100))

    if legend:
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')

    if fig is not None:
        fig.tight_layout()

    if title:
        ax.set_title(title)

    if outfile:
        ## Save the figure that owns this axis; pyplot's "current figure"
        ## may be a different one when the caller passed in ax. Saving
        ## stays best-effort: report the failure and still return ax.
        try:
            ax.figure.savefig(outfile, format="png", bbox_inches="tight")
        except Exception:
            print(" Saving pca.plot() failed to save figure to {}".format(outfile))

    return ax
def _loci_to_arr(loci, taxdict, mindict):
    """
    Build a frequency array from the loci that have enough samples for
    every taxon in taxdict, with the minimum per-taxon coverage taken
    from mindict (an int applied to all taxa, a dict keyed like taxdict,
    or anything else to require 1 per taxon).

    Returns (arr, keep): arr holds only the retained loci after being
    passed through masknulls, and keep is the boolean mask over the
    input loci marking which ones were retained.
    """
    nloci = len(loci)
    ntaxa = len(taxdict)

    ## mask of loci passing the coverage filter
    keep = np.zeros(nloci, dtype=np.bool_)

    ## 4 rows for a 4-taxon test; 6 rows for a 5-taxon test (one for each
    ## p3, and for the fused p3 ancestor). Sites are capped at 300.
    nrows = 6 if ntaxa == 5 else 4
    arr = np.zeros((nloci, nrows, 300), dtype=np.float64)

    ## expand mindict to one minimum-coverage entry per taxon
    if isinstance(mindict, int):
        mindict = dict((tax, mindict) for tax in taxdict)
    elif isinstance(mindict, dict):
        mindict = dict((tax, mindict[tax]) for tax in taxdict)
    else:
        mindict = dict((tax, 1) for tax in taxdict)

    ## raise error if names are not 'p[int]'
    allowed_names = ('p1', 'p2', 'p3', 'p4', 'p5')
    if not all(tax in allowed_names for tax in taxdict):
        raise IPyradError(
            "keys in taxdict must be named 'p1' through 'p4' or 'p5'")

    ## the highest-sorting key is used as the outgroup
    keys = sorted(tax for tax in taxdict if tax[0] == 'p')
    outg = keys[-1]

    def _rows_of(names, tax):
        """ indices of the locus rows whose sample belongs to taxon tax """
        return np.where([name in taxdict[tax] for name in names])[0]

    for loc in xrange(nloci):

        ## split the locus into sample names and a character matrix
        ## (the final line of each locus is dropped)
        lines = loci[loc].split("\n")[:-1]
        names = [line.split()[0] for line in lines]
        seqs = np.array([list(line.split()[1]) for line in lines])

        ## does every taxon have enough of its samples in this locus?
        ## (still need to check by site)
        covs = [
            sum([samp in names for samp in taxdict[tax]]) >= mindict[tax]
            for tax in taxdict
        ]
        if not all(covs):
            continue
        keep[loc] = True

        ## outgroup rows give the reference / ancestral state
        refseq = seqs[_rows_of(names, outg)].view(np.uint8)
        ancestral = np.array([reftrick(refseq, GETCONS2)[:, 0]])

        ## freq of ref in outgroup goes in the last row
        iseq = _reffreq2(ancestral, refseq, GETCONS2)
        arr[loc, -1, :iseq.shape[1]] = iseq

        if ntaxa == 4:
            ## one row per non-outgroup taxon, in sorted key order
            for tidx, key in enumerate(keys[:-1]):
                sidx = seqs[_rows_of(names, key)].view(np.uint8)
                iseq = _reffreq2(ancestral, sidx, GETCONS2)
                arr[loc, tidx, :iseq.shape[1]] = iseq
            continue

        ## otherwise: enter p5 (again) into the last row
        iseq = _reffreq2(ancestral, refseq, GETCONS2)
        arr[loc, -1, :iseq.shape[1]] = iseq

        ## enter p1 and p2 into rows 0 and 1
        for row, key in ((0, 'p1'), (1, 'p2')):
            sidx = seqs[_rows_of(names, key)].view(np.uint8)
            iseq = _reffreq2(ancestral, sidx, GETCONS2)
            arr[loc, row, :iseq.shape[1]] = iseq

        ## enter p3 with p4 masked, and p4 with p3 masked
        nidx = _rows_of(names, 'p3')
        nidy = _rows_of(names, 'p4')
        sidx = seqs[nidx].view(np.uint8)
        sidy = seqs[nidy].view(np.uint8)
        xseq = _reffreq2(ancestral, sidx, GETCONS2)
        yseq = _reffreq2(ancestral, sidy, GETCONS2)
        ## both masks are taken before either array is zeroed
        mask3 = xseq != 0
        mask4 = yseq != 0
        xseq[mask4] = 0
        yseq[mask3] = 0
        arr[loc, 2, :xseq.shape[1]] = xseq
        arr[loc, 3, :yseq.shape[1]] = yseq

        ## enter p34: p3 and p4 samples pooled together
        pooled = nidx.tolist() + nidy.tolist()
        sidx = seqs[pooled].view(np.uint8)
        iseq = _reffreq2(ancestral, sidx, GETCONS2)
        arr[loc, 4, :iseq.shape[1]] = iseq

    ## size-down to the loci that have taxa for the test, then let
    ## masknulls trim the sites
    return masknulls(arr[keep, :, :]), keep
def batch(
    baba,
    ipyclient=None,
    ):
    """
    Distribute the tests of a baba object to the parallel client and
    collect the results.

    Parameters
    ----------
    baba : object
        Reads baba.data (path to a loci file, or a generator of
        simulations), baba.tests (a test dict or list of them),
        baba.params.mincov and baba.params.nboots.
    ipyclient : ipyparallel.Client
        Required. At most len(ipyclient) jobs are queued at a time.

    Returns
    -------
    (resarr, bootsarr)
        For 4-taxon tests: a DataFrame with one row per test and the
        (ntests, nboots) array of bootstrap values. Otherwise: a
        multi-index DataFrame with 'p3', 'p4', 'shared' rows per test,
        and None.

    Raises
    ------
    IPyradError
        If no ipyclient is given or there are no tests to run.
    IPyradWarningExit
        If a job fails on the cluster.
    """
    ## parse args
    handle = baba.data
    taxdicts = baba.tests
    mindicts = baba.params.mincov
    nboots = baba.params.nboots

    ## if ms generator make into reusable list
    sims = 0
    if isinstance(handle, types.GeneratorType):
        handle = list(handle)
        sims = 1
    else:
        ## expand locifile path to full path
        handle = os.path.realpath(handle)

    ## test names are not supported yet, so results are indexed 0..n-1
    names = []
    if isinstance(taxdicts, dict):
        taxdicts = [taxdicts]

    ## arrays to hold results (len(taxdicts), nboots)
    tot = len(taxdicts)
    resarr = np.zeros((tot, 7), dtype=np.float64)
    bootsarr = np.zeros((tot, nboots), dtype=np.float64)
    paneldict = {}

    ## TODO: Setup a wrapper to find and cleanup ipyclient
    if not ipyclient:
        raise IPyradError("you must enter an ipyparallel.Client() object")
    lbview = ipyclient.load_balanced_view()

    ## The shape of the result is decided from the tests, so there has
    ## to be at least one of them.
    if not taxdicts:
        raise IPyradError("no tests to run: the .tests attribute is empty")

    ## submit jobs to run on the cluster queue
    start = time.time()
    asyncs = {}
    idx = 0

    ## prepare data before sending to engines
    ## if it's a str (locifile) then parse it here just once.
    loci = None
    if isinstance(handle, str):
        with open(handle, 'r') as infile:
            loci = infile.read().strip().split("|\n")

    ## iterate over tests (the same mindicts is reused for every test)
    itests = iter(taxdicts)
    imdict = itertools.cycle([mindicts])

    def _submit_next(jobid):
        """ queue the next unsubmitted test under jobid; False if none left """
        try:
            nexttest = next(itests)
            mindict = next(imdict)
        except StopIteration:
            return False
        if sims:
            ## sim data is converted per test, but running it on the
            ## cluster is not supported yet so nothing is queued
            _msp_to_arr(handle, nexttest)
            print("not yet implemented")
        else:
            asyncs[jobid] = lbview.apply(dstat, loci, nexttest, mindict, nboots)
        return True

    ## start with (at most) one job per engine
    for _ in xrange(len(ipyclient)):
        if not _submit_next(idx):
            break
        idx += 1

    ## block until finished, print progress if requested.
    finished = 0
    try:
        while 1:
            ready = [key for (key, job) in asyncs.items() if job.ready()]
            for key in ready:
                job = asyncs[key]

                ## check for failures
                if not job.successful():
                    raise IPyradWarningExit(\
                        " error: {}: {}".format(key, job.exception()))

                ## enter results for successful jobs
                _res, _bot = job.result()
                if _res.shape[0] == 1:
                    ## store D4 results (.values works on every pandas
                    ## version; .as_matrix() is gone in pandas >= 1.0)
                    resarr[key] = _res.T.values[:, 0]
                    bootsarr[key] = _bot
                else:
                    ## or store D5 results
                    paneldict[key] = _res.T

                ## remove old job
                del asyncs[key]
                finished += 1

                ## submit next job if there is one.
                if _submit_next(idx):
                    idx += 1

            ## report progress and break if all are done.
            elap = datetime.timedelta(seconds=int(time.time() - start))
            printstr = " calculating D-stats | {} | "
            progressbar(tot, finished, printstr.format(elap), spacer="")
            time.sleep(0.1)
            if not asyncs:
                print("")
                break

    except KeyboardInterrupt as inst:
        ## cancel all jobs (ipy & multiproc modes) and then raise error
        try:
            ipyclient.abort()
        except Exception:
            pass
        raise inst

    ## dress up resarr as a Pandas DataFrame if 4-part test. The size is
    ## read from the test list itself rather than from a loop variable.
    if len(taxdicts[-1]) == 4:
        if not names:
            names = range(len(taxdicts))
        resarr = pd.DataFrame(resarr,
            index=names,
            columns=["dstat", "bootmean", "bootstd", "Z", "ABBA", "BABA", "nloci"])

        ## sort results and bootsarr to match if test names were supplied
        resarr = resarr.sort_index()
        order = [list(resarr.index).index(i) for i in names]
        bootsarr = bootsarr[order]
        return resarr, bootsarr

    ## order results dfs by job number
    listres = [paneldict[key] for key in range(len(paneldict))]

    ## make into a multi-index dataframe
    ntests = len(paneldict)
    multi_index = [
        np.array([[i] * 3 for i in range(ntests)]).flatten(),
        np.array(['p3', 'p4', 'shared'] * ntests),
    ]
    resarr = pd.DataFrame(
        data=pd.concat(listres).values,
        index=multi_index,
        columns=listres[0].columns,
        )
    return resarr, None
def plot(
    self,
    show_test_labels=True,
    use_edge_lengths=True,
    collapse_outgroup=False,
    pct_tree_x=0.5,
    pct_tree_y=0.2,
    subset_tests=None,
    *args,
    **kwargs):
    """
    Draw a multi-panel figure with tree, tests, and results

    Parameters:
    -----------
    show_test_labels: bool
        Passed through to baba_panel_plot.
    use_edge_lengths: bool
        Used when building the tree and passed through to
        baba_panel_plot.
    collapse_outgroup: bool
        Passed through to baba_panel_plot.
    pct_tree_x: float
        Passed through to baba_panel_plot.
    pct_tree_y: float
        Passed through to baba_panel_plot.
    subset_tests: list
        Indices of the tests to show. The same indices select the
        matching entries of self.results_boots. None shows all tests.
    *args, **kwargs:
        Anything else (e.g. height, width) is forwarded unchanged to
        baba_panel_plot.

    Returns:
    --------
    The (canvas, axes, panel) triple from baba_panel_plot.

    Raises:
    -------
    IPyradError if self.newick or self.tests is not set.
    """
    ## check for attributes
    if not self.newick:
        raise IPyradError("baba plot requires a newick treefile")
    if not self.tests:
        raise IPyradError("baba plot must have a .tests attribute")

    ## ensure tests is a list
    if isinstance(self.tests, dict):
        self.tests = [self.tests]

    ## re-decompose the tree
    ttree = toytree.tree(
        self.newick,
        orient='down',
        use_edge_lengths=use_edge_lengths,
        )

    ## subset test to show fewer. Identity test on purpose: "!= None"
    ## compares elementwise (and then fails) for an array of indices.
    if subset_tests is not None:
        tests = [self.tests[i] for i in subset_tests]
        boots = self.results_boots[subset_tests]
    else:
        tests = self.tests
        boots = self.results_boots

    ## make the plot
    canvas, axes, panel = baba_panel_plot(
        ttree=ttree,
        tests=tests,
        boots=boots,
        show_test_labels=show_test_labels,
        use_edge_lengths=use_edge_lengths,
        collapse_outgroup=collapse_outgroup,
        pct_tree_x=pct_tree_x,
        pct_tree_y=pct_tree_y,
        *args,
        **kwargs)
    return canvas, axes, panel