def importIsoggSnps(self):
    """Import ISOGG SNPs.

    Hands this object to SNP.setClassVariables and then invokes, strictly
    in the order listed, the read / parse / sort / write / check methods
    named in ``stepNames``.
    """
    SNP.setClassVariables(self)

    # Every name is looked up on self immediately before it is called, so
    # the sequence is the same as writing the calls out one after another.
    stepNames = (
        'readPreferredSNPnameSet',
        'readRepresentativeSNPnameSet',
        'readIsoggMultiAllelicPosSet',
        'readIsoggOmitSet',
        'readIsoggCorrectionsDict',
        'parseIsoggTable',
        'setDepthFirstNodeList',
        'sortSNPlistsAndSetRepresentatives',
        'writeIsoggCounts',
        'writeUniqueSNPtable',
        'writeNewick',
        'checkMultiAllelics',
    )
    for stepName in stepNames:
        getattr(self, stepName)()
def prioritySortSNPlistAndSetHgSNP(self):
    '''
    priority-sorts this node's SNP list (or, failing that, its dropped
    marker list) and then sets the representative-SNP-based label,
    self.hgSNP.

    the standard form includes the truncated haplogroup label and the
    label of a representative SNP, separated by a hyphen (e.g. R-V88).

    the label source is chosen in this order:
      1. root node            -> self.haplogroup
      2. non-empty snpList    -> hgSNP of the most highly ranked SNP
      3. non-empty dropped    -> '<hgTrunc>-<name of top dropped marker>'
         marker list
      4. parent has a label   -> parent label plus '*' (leaf) or '+'
                                 (non-leaf), with a numeric suffix appended
                                 if that label is already in Node.hgSNPset
      5. otherwise            -> warning is logged, self.haplogroup is used

    whatever label results is added to Node.hgSNPset.
    '''
    if self.isRoot():
        # root: no markers
        self.hgSNP = self.haplogroup

    elif self.snpList:
        # normal case
        self.snpList = SNP.prioritySortMarkerList(self.snpList)
        self.hgSNP = self.mostHighlyRankedSNP.hgSNP

    elif self.droppedMarkerList:
        # backup: use discarded marker name
        self.droppedMarkerList = SNP.prioritySortMarkerList(
            self.droppedMarkerList)
        fallbackName = self.mostHighlyRankedDroppedMarker.name
        self.hgSNP = '%s-%s' % (self.hgTrunc, fallbackName)

    elif self.parent.hgSNP:
        # no markers to use: build a star/plus label from the parent's label
        suffix = '*' if self.isLeaf() else '+'
        baseLabel = self.parent.hgSNP + suffix
        self.hgSNP = baseLabel

        # uniquify if necessary: append the smallest unused integer >= 1
        if baseLabel in Node.hgSNPset:
            counter = 1
            while '%s%d' % (baseLabel, counter) in Node.hgSNPset:
                counter += 1
            self.hgSNP = '%s%d' % (baseLabel, counter)

    else:
        # no markers and the parent has no label yet
        warningTemplate = ('WARNING. Attempted to set star label, '
                           'but parent.hgSNP not set yet: %s\n')
        Node.errAndLog(warningTemplate % self.haplogroup)
        self.hgSNP = self.haplogroup

    Node.hgSNPset.add(self.hgSNP)
def snp(self, rsid):
    """Return the SNP for the given integer-only RSID.

    The entry stored in ``self._genome`` under ``rsid`` is unpacked into
    genotype, chromosome and position; each genotype element is mapped
    through ``Nucleotide`` and the result is handed to ``_to_snp``.

    If a KeyError is raised anywhere in that process (e.g. ``rsid`` is not
    in ``self._genome``), an SNP with an empty genotype and 0 for the two
    trailing arguments is returned instead.
    """
    try:
        genotype, chromo, position = self._genome[rsid]
        nucleotides = map(Nucleotide, genotype)
        result = _to_snp("rs%d" % rsid, self._orientation,
                         (nucleotides, chromo, position))
    except KeyError:
        result = SNP([], "rs%d" % rsid, self._orientation, 0, 0)
    return result
def constructSNP(self, name, haplogroup, position, mutation):
    '''
    typically, instantiates a SNP and adds it to various containers.
    note that when SNPs are instantiated, they are added to the tree,
    and this process may entail growing the tree to include the
    corresponding node. more specialized things occur if a SNP already
    exists at this position.

    name       : SNP name; used as a key in self.snpDict and added to
                 self.snpNameSet
    haplogroup : haplogroup label; combined with position to form the
                 (haplogroup, position) key
    position   : SNP position; also used on its own as a key in
                 self.snpDict and added to self.snpPosSet
    mutation   : indexable value; elements 0 and 3 are taken as the
                 ancestral and derived alleles (presumably a string such
                 as "A->G" -- TODO confirm against the caller)

    returns nothing. calls sys.exit if a SNP is already stored under the
    same (haplogroup, position) key but its alleles do not match.
    '''
    # NOTE(review): nothing at all happens when self.hg2nodeDict is falsy.
    if self.hg2nodeDict:
        ancestral, derived = mutation[0], mutation[3]
        snpKey = (haplogroup, position)
        if snpKey in self.snpDict:  # snp exists under an alias
            snp = self.snpDict[snpKey]
            if snp.isAncestral(ancestral) and snp.isDerived(derived):
                # same alleles: register the new name as an alias of the
                # existing SNP object
                snp.addName(name)
                self.snpDict[name] = snp
            else:
                # built only so both SNPs can be shown in the error message
                newSNP = SNP(name, haplogroup, position, ancestral, derived)
                sys.exit('\n\nERROR! Conlicting SNPs:\n%s\n%s\n' % \
                         (snp, newSNP))
        else:
            if position in self.snpDict:  # another snp with same position
                oldSNP = self.snpDict[position]
                # record the position if either allele is new relative to
                # the SNP already stored there
                if ancestral not in oldSNP.alleleSet or \
                   derived not in oldSNP.alleleSet:
                    self.multiAllelicNewPosSet.add(position)

            # typical behavior
            snp = SNP(name, haplogroup, position, ancestral, derived)
            # the same SNP object is reachable under three kinds of key;
            # the bare position key is overwritten by the newest SNP
            self.snpDict[(haplogroup, position)] = snp
            self.snpDict[name] = snp
            self.snpDict[position] = snp
            self.snpList.append(snp)
            self.snpPosSet.add(position)
            self.snpNameSet.add(name)
            self.isoggCountsDict['unique'] += 1
def updateWithBranchAssessment(self, ancSNPlist, derSNPlist):
    '''
    folds one branch assessment into this object:
      - adds the number of ancestral alleles seen to self.numAncestral
      - extends self.derSNPlist with the derived SNPs
      - if any derived SNPs were given, sets self.mostDerivedSNP to the
        most highly ranked of them
      - manages tracking of whether or not the path has pushed through
        an (anc,der)==(1,0) branch: once self.hasPushedThrough is set the
        push-through variables are updated with both counts; otherwise
        they are initialised when this branch is exactly (1,0)
    '''
    ancCount = len(ancSNPlist)
    derCount = len(derSNPlist)

    self.numAncestral += ancCount
    self.derSNPlist.extend(derSNPlist)
    if derSNPlist:
        self.mostDerivedSNP = SNP.mostHighlyRankedMarkerOnList(derSNPlist)

    if self.hasPushedThrough:
        self.updatePushThroughVars(ancCount, derCount)
        return

    if ancCount == 1 and derCount == 0:
        self.setPushThroughVars()
def positions_from_csv(filename, min_allele_freq):
    '''
    Read SNP positions from a CSV file in following format:

    chrom,pos,freq
    1,1110,0.02631
    1,2271,0.03125
    1,2402,0.03125
    1,4559,0.02777

    `filename` is joined onto the directory containing this module. The
    first row is discarded as a header. Every remaining row is converted
    to SNP(int(chrom), int(pos), float(freq)), and the SNPs whose `freq`
    is strictly greater than `min_allele_freq` are returned as a list,
    in file order.
    '''
    csv_path = os.path.join(os.path.dirname(__file__), filename)
    with open(csv_path) as handle:
        rows = csv.reader(handle)
        next(rows, None)  # skip the header row, if there is one
        parsed = (
            SNP(int(chrom), int(pos), float(freq))
            for chrom, pos, freq in rows
        )
        # the list is fully built before the file is closed
        return [snp for snp in parsed if snp.freq > min_allele_freq]
# Build a list of SNP objects from the already-split rows in variant_lines.
# success_count and fail_count are expected to exist before this point;
# they are only incremented here.
# NOTE(review): l_count is passed to SNP and printed as the line number, but
# it is never advanced in the part of the loop visible here -- confirm it is
# incremented elsewhere.
l_count = 0
snps = []
for variant_line in variant_lines:
    # remove \n from end of line
    # variant_line = variant_line.strip()
    # remove " from lines
    # variant_line[0] = variant_line[0].translate(None, '"')
    # variant_line[-1] = variant_line[-1].translate(None, '"')
    if len(variant_line) >= 16:
        # full record: fields 5 and 6 are split on commas into lists,
        # fields 1 and 10 are converted to int; fields beyond index 15
        # are ignored
        success_count += 1
        context = variant_line[5].split(",")
        consequences = variant_line[6].split(",")
        snps.append(
            SNP(l_count, variant_line[0], int(variant_line[1]), variant_line[2], variant_line[3], variant_line[4],
                context, consequences, variant_line[7], variant_line[8], variant_line[9],
                int(variant_line[10]), variant_line[11], variant_line[12], variant_line[13],
                variant_line[14], variant_line[15]))
    elif 16 > len(variant_line) > 4:
        # short record: the first six fields are used and the remaining
        # ten SNP arguments are filled with "." placeholders; field 5 is
        # passed through unsplit here.
        # NOTE(review): a row with exactly 5 fields satisfies this
        # condition, but variant_line[5] would then raise IndexError.
        success_count += 1
        # context = variant_line[5].split(",")
        # consequences = variant_line[6].split(",")
        snps.append(
            SNP(l_count, variant_line[0], int(variant_line[1]), variant_line[2], variant_line[3], variant_line[4],
                variant_line[5], ".", ".", ".", ".", ".", ".", ".", ".", ".", "."))
    else:
        # reached only when the row has 4 or fewer fields.
        # NOTE(review): the message text describes the range handled by
        # the branch above rather than the rows that end up here.
        print "INVALID DATA (16 >= length > 4) in Line {}".format(l_count)
        print variant_line
        fail_count += 1
def mostHighlyRankedSNP(self):
    'the marker SNP.mostHighlyRankedMarkerOnList selects from self.snpList'
    topSNP = SNP.mostHighlyRankedMarkerOnList(self.snpList)
    return topSNP
def mostHighlyRankedDroppedMarker(self):
    'the marker SNP.mostHighlyRankedMarkerOnList selects from self.droppedMarkerList'
    topDroppedMarker = SNP.mostHighlyRankedMarkerOnList(self.droppedMarkerList)
    return topDroppedMarker
def _to_snp(rsid, orientation, value):
    """Build an SNP from an RSID, an orientation and a 3-item ``value``.

    ``value`` is unpacked as (genotype, chromo, position); the genotype is
    copied into a new list before being passed to the SNP constructor.
    """
    genotype, chromo, position = value
    alleles = list(genotype)
    return SNP(alleles, rsid, orientation, chromo, position)