def _imap_consensus_reduce(self):
    """
    Collapse the sequence array to one consensus row per imap group.

    Does nothing when ``self.imap`` is empty. Otherwise, for every group in
    ``self.imap`` the rows of ``self.seqarr`` belonging to the group's
    samples are passed to ``reftrick`` and the first column of its result
    is stored as that group's sequence. Groups are ordered by sorted key.

    Side effects: ``self.seqarr`` and ``self.names`` are replaced by the
    reduced array and the sorted group names, and ``self._longname`` /
    ``self._pnames`` are rebuilt for the new names.

    Raises IndexError if a sample listed in the imap is absent from
    ``self.names``.
    """
    # nothing to reduce without an imap
    if not self.imap:
        return

    # one uint8 row per group, as wide as the current sequence array
    nsites = self.seqarr.shape[1]
    reduced = np.zeros((len(self.imap), nsites), dtype=np.uint8)
    group_names = np.array(sorted(self.imap.keys()))

    for group, members in self.imap.items():

        # rows of seqarr holding this group's samples (first match per name)
        rows = [
            np.where(self.names == member)[0][0] for member in members
        ]
        group_seqs = self.seqarr[rows, :]

        # consensus is taken from column 0 of reftrick's output
        consensus = reftrick(group_seqs, GETCONS)[:, 0]

        # store it in the row matching the group's sorted position
        target = np.where(group_names == group)[0][0]
        reduced[target] = consensus

    # any remaining zeros become 78, the ASCII code for 'N'
    reduced[reduced == 0] = 78

    # swap the reduced data in for the per-sample data
    self.seqarr = reduced
    self.names = group_names

    # names are right-padded with spaces to a common width (longest + 1)
    self._longname = 1 + max(len(name) for name in self.names)
    padded = []
    for name in self.names:
        fill = " " * (self._longname - len(name))
        padded.append("{}{}".format(name, fill))
    self._pnames = np.array(padded)
def _loci_to_arr(loci, taxdict, mindict):
    """
    Return a frequency array from a loci file for all loci with taxa from
    taxdict and min coverage from mindict.

    Parameters
    ----------
    loci : sequence of str
        One string per locus. Each string is split on newlines, the last
        element of the split is discarded, and every remaining line is
        read as "<name> <sequence>".
    taxdict : dict
        Maps taxon labels ('p1' ... 'p5') to the sample names belonging to
        that taxon. The label that sorts last is used as the outgroup.
    mindict : int, dict, or anything else
        Minimum number of samples that must be present per taxon for a
        locus to be kept. An int applies to every taxon; a dict must have
        an entry for every key of taxdict; any other value means 1.

    Returns
    -------
    (arr, keep)
        ``keep`` is a boolean array with one entry per input locus, True
        where the locus met the coverage requirement. ``arr`` holds the
        float64 frequencies of the kept loci (4 rows per locus, or 6 when
        taxdict has five entries) after being passed through masknulls.

    Raises
    ------
    IPyradError
        If a key of taxdict is not one of 'p1' ... 'p5'.
    ValueError
        If loci is empty.
    KeyError
        If mindict is a dict lacking one of the taxdict keys.
    """

    def _rows_for(names, members):
        ## indices of the locus lines whose sample name is in `members`
        return np.where([name in members for name in names])[0]

    ## site axis is as wide as the longest first line among the loci
    nloci = len(loci)
    maxlen = max(len(locus.split("\n")[0]) for locus in loci)

    ## mask of loci with enough coverage, and the frequency array; six rows
    ## in the 5-taxon case b/c one for each p3, and for the fused p3 ancestor
    keep = np.zeros(nloci, dtype=np.bool_)
    nrows = 6 if len(taxdict) == 5 else 4
    arr = np.zeros((nloci, nrows, maxlen), dtype=np.float64)

    ## if not mindict, make one that requires 1 in each taxon
    if isinstance(mindict, int):
        mindict = {tax: mindict for tax in taxdict}
    elif isinstance(mindict, dict):
        mindict = {tax: mindict[tax] for tax in taxdict}
    else:
        mindict = {tax: 1 for tax in taxdict}

    ## raise error if names are not 'p[int]'
    allowed_names = ['p1', 'p2', 'p3', 'p4', 'p5']
    if any(tax not in allowed_names for tax in taxdict):
        raise IPyradError(
            "keys in taxdict must be named 'p1' through 'p4' or 'p5'")

    ## every key is a valid 'p' label here; the last one is the outgroup
    keys = sorted(taxdict)
    outg = keys[-1]

    ## grab seqs just for the good guys
    for loc, locus in enumerate(loci):

        ## parse the locus (the final split element is not a sample line)
        lines = locus.split("\n")[:-1]
        names = [line.split()[0] for line in lines]

        ## force a 1-byte dtype so that .view(np.uint8) below yields exactly
        ## one code per site; the default str dtype on Python 3 is 4 bytes
        ## per character, which would quadruple the site axis.
        seqs = np.array(
            [list(line.split()[1]) for line in lines], dtype="S1")

        ## check that names cover the taxdict (still need to check by site)
        covered = all(
            sum(sample in names for sample in taxdict[tax]) >= mindict[tax]
            for tax in taxdict
        )
        if not covered:
            continue
        keep[loc] = True

        ## get the refseq from the outgroup samples
        refidx = _rows_for(names, taxdict[outg])
        refseq = seqs[refidx].view(np.uint8)
        ancestral = np.array([reftrick(refseq, GETCONS2)[:, 0]])

        ## freq of ref in outgroup goes in the last row (both layouts)
        iseq = _reffreq2(ancestral, refseq, GETCONS2)
        arr[loc, -1, :iseq.shape[1]] = iseq

        ## enter 4-taxon freqs
        if len(taxdict) == 4:
            for tidx, key in enumerate(keys[:-1]):
                nidx = _rows_for(names, taxdict[key])
                sidx = seqs[nidx].view(np.uint8)
                iseq = _reffreq2(ancestral, sidx, GETCONS2)
                arr[loc, tidx, :iseq.shape[1]] = iseq
            continue

        ## 5-taxon layout: the outgroup row was already written above, so
        ## only the ingroup rows remain. Enter p1 and p2 first.
        for tidx, key in enumerate(('p1', 'p2')):
            nidx = _rows_for(names, taxdict[key])
            sidx = seqs[nidx].view(np.uint8)
            iseq = _reffreq2(ancestral, sidx, GETCONS2)
            arr[loc, tidx, :iseq.shape[1]] = iseq

        ## enter p3 with p4 masked, and p4 with p3 masked
        nidx = _rows_for(names, taxdict['p3'])
        nidy = _rows_for(names, taxdict['p4'])
        xseq = _reffreq2(ancestral, seqs[nidx].view(np.uint8), GETCONS2)
        yseq = _reffreq2(ancestral, seqs[nidy].view(np.uint8), GETCONS2)

        ## both masks are taken before either array is zeroed
        mask3 = xseq != 0
        mask4 = yseq != 0
        xseq[mask4] = 0
        yseq[mask3] = 0
        arr[loc, 2, :xseq.shape[1]] = xseq
        arr[loc, 3, :yseq.shape[1]] = yseq

        ## enter p34: the p3 and p4 samples pooled together
        pooled = nidx.tolist() + nidy.tolist()
        sidx = seqs[pooled].view(np.uint8)
        iseq = _reffreq2(ancestral, sidx, GETCONS2)
        arr[loc, 4, :iseq.shape[1]] = iseq

    ## size-down array to the number of loci that have taxa for the test
    arr = arr[keep, :, :]

    ## size-down sites via masknulls (defined elsewhere)
    arr = masknulls(arr)

    return arr, keep