def __init__(self, format, inputPath):
    """
    Construct the bioinformatics file object from the file at inputPath.

    format    -- accepted but not referenced in the code visible here
                 (NOTE(review): the name also shadows the builtin `format`).
    inputPath -- path to the input file. The attributes below are only
                 assigned when this path exists on disk.

    The input file is expected to provide at least:
        - rsID
        - chromosome
        - position
        - sample genotype

    TODO: if the input filetype is not specified, send the file through
    parseNstream to predict the filetype?
    """
    # NOTE(review): when inputPath does not exist nothing is assigned in the
    # code visible here, so the instance is left without inputPath,
    # referencePath or genome -- confirm whether that is intended.
    if os.path.exists(inputPath):
        self.inputPath = inputPath
        # NOTE(review): hard-coded absolute path to the reference file.
        self.referencePath = '/Volumes/teyden/BIOINFORMATICS/23andme2vcf/23andme_v4_hg19_ref.txt'
        # Build the genome object from the input file via parseNstream,
        # requesting the 'CHROMOSOME' sort option.
        self.genome = parseNstream._23andmeObject(self.inputPath, sortOption='CHROMOSOME')
        ## self.chrList = [ '%s' % i for i in range(1,26)] # + ['X', 'Y', 'MT']
133275309, 114364328, 107043718, 101991189, 90338345, 83257441, \
80373285, 58617616, 64444167, 46709983, 50818468, 156040895, 57227415, 16569]
        # NOTE(review): the two lines above are the tail of a list literal
        # that opens before this excerpt (presumably self.chrLengths, which
        # size() indexes below -- confirm). They are reproduced unchanged.
        self.positions = []   # SNP positions; filled in by the script below
        self.minSNP = 0
        self.maxSNP = 0

    def size(self):
        """Return the chrLengths entry for this chromosome (index self.num - 1)."""
        return self.chrLengths[self.num-1]

    def totalSNPs(self):
        """Return how many SNP positions are stored in self.positions."""
        return len(self.positions)

# NOTE(review): the statements below run as soon as this code is executed and
# read a hard-coded relative path.
ChromosomeDict = _23andmeObject("../../BIO_DATA/raw23andme_SNPs_teyden.txt", 'CHROMOSOME')
ChromosomePositions = {}
Chromosomes = {} # Dictionary of Chromosome objects

### (1) One list of SNP positions for every chromosome (1-22, X, Y, MT)
""" ChromosomePositions['1'] = a list of positions for chromosome 1 ... etc. """

# Make dict of all SNP positions. Keys = Chromosome #, Values = lst of positions
for chrom in ChromosomeDict:
    # NOTE(review): calling .sort() on the result of .keys() requires keys()
    # to return a list, which is Python 2 behaviour for plain dicts.
    ChromosomePositions[chrom] = ChromosomeDict[chrom].keys()
    ChromosomePositions[chrom].sort() # Could go without
    Chromosomes[chrom] = Chromosome(chrom=chrom)
    # The Chromosome object shares the same list object stored in
    # ChromosomePositions (no copy is made).
    Chromosomes[chrom].positions = ChromosomePositions[chrom]
def _scoreGenotype(sampleAllele, refAllele):
    """
    Compare one genotype call against the single-letter reference allele.

    sampleAllele -- genotype string from the user's file: two characters for
                    an allele pair, one character for a single allele; '-'
                    marks a base that was not called.
    refAllele    -- the reference allele (one letter). The comparison is
                    case-insensitive.

    RETURNS: (variant, score)
        variant -- the letter(s) that differ from the reference, or '-' when
                   nothing differs or there is no call.
        score   -- how many letters match the reference:
                     2  both letters of a pair match
                     1  one letter of a pair matches, or a single-letter
                        genotype matches
                     0  nothing matches
                   or '-' when there is no call.

    A genotype that is neither one nor two characters long is reported as a
    no-call ('-', '-'), so callers never see an unbound or stale result.

    (*) Variants with score 0 may need to be switched to the complementary
        letters (A<->T, C<->G); the strand has to be checked separately.
    """
    ref = refAllele.lower()

    # Allele pair.
    if len(sampleAllele) == 2:
        if sampleAllele == '--':
            return '-', '-'
        firstMatches = sampleAllele[0].lower() == ref
        secondMatches = sampleAllele[1].lower() == ref
        if firstMatches and secondMatches:
            return '-', 2
        if firstMatches:
            return sampleAllele[1], 1
        if secondMatches:
            return sampleAllele[0], 1
        return sampleAllele, 0

    # Single allele.
    if len(sampleAllele) == 1:
        if sampleAllele == '-':
            return '-', '-'
        if sampleAllele.lower() == ref:
            return '-', 1
        return sampleAllele, 0

    # Unexpected genotype length: treat it as "no call".
    return '-', '-'


def _iterScoredSNPs(RefSNPdict, UserSNPdict, nonMatchedRSIDs, indel):
    """
    Walk the reference SNPs and score every user SNP found at the same position.

    Chromosomes are visited in parseNstream.CHROMOSOME_LIST order and the
    reference positions of each chromosome in sorted order. Positions missing
    from the user's data are skipped.

    nonMatchedRSIDs -- list, extended in place with (reference rsid, user rsid)
                       for every shared position whose rsids differ.
    indel           -- dict, filled in place: indel[chrom][pos] holds the
                       user's RSID and GENOTYPE for every genotype that
                       isIndel() accepts. Such SNPs are not scored or yielded.

    YIELDS: (chrom, rsid, pos, refAllele, sampleAllele, variant, score), where
            rsid is the reference rsid and refAllele is upper-cased.
    """
    for chrom in parseNstream.CHROMOSOME_LIST:
        refSNPs = RefSNPdict[chrom]
        indel[chrom] = {}
        # sorted() also works when keys() returns a view instead of a list.
        for pos in sorted(refSNPs.keys()):
            userSNPs = UserSNPdict[chrom]
            if pos not in userSNPs:
                continue

            refSNP = refSNPs[pos]
            userSNP = userSNPs[pos]
            rsid = refSNP[RSID]
            if rsid != userSNP[RSID]:
                nonMatchedRSIDs.append((rsid, userSNP[RSID]))

            sampleAllele = userSNP[GENOTYPE]
            if isIndel(sampleAllele):
                indel[chrom][pos] = {
                    RSID: userSNP[RSID],
                    GENOTYPE: sampleAllele,
                }
                continue

            refAllele = refSNP[GENOTYPE].upper()
            variant, score = _scoreGenotype(sampleAllele, refAllele)
            yield chrom, rsid, pos, refAllele, sampleAllele, variant, score


def concatenateSNPs(RefSNPFile, UserSNPFile):
    """
    Concatenate the user's called SNP letters into one sequence on disk.

    Both files are parsed with parseNstream, keyed by chromosome. For every
    reference position that the user also has (indels excluded), the genotype
    is scored against the reference allele (see _scoreGenotype) and:
        - no call (score '-')   -> nothing is written
        - no variant            -> the first letter of the genotype is written
        - otherwise             -> the variant letter(s) are written

    The letters are written back to back to DEFAULT_OUTPUT_FILEPATH, with no
    header and no line breaks. Summary counts are reported at the end.

    RETURNS: none, streams to file...
    """
    RefSNPdict = parseNstream._referenceObject(RefSNPFile, 'CHROMOSOME')
    UserSNPdict = parseNstream._23andmeObject(UserSNPFile, 'CHROMOSOME')

    nonMatchedRSIDs = []   # (ref rsid, user rsid) pairs that disagree
    numMatchedRSIDs = 0    # shared, non-indel positions
    snpsConcatenated = 0   # SNPs that contributed letters to the output

    # 'with' guarantees the file is closed even if a lookup fails part-way.
    with open(DEFAULT_OUTPUT_FILEPATH, 'w') as F:
        for snp in _iterScoredSNPs(RefSNPdict, UserSNPdict, nonMatchedRSIDs, {}):
            sampleAllele, variant, score = snp[4:]
            numMatchedRSIDs += 1
            if score == '-':
                continue
            snpsConcatenated += 1
            if variant == '-':
                F.write(sampleAllele[0])
            else:
                F.write(variant)

    parseNstream.printMsg("Completed scoring user SNPs to REF SNPs. Streamed to a .FASTA file (path=%s)" % DEFAULT_OUTPUT_FILEPATH)
    parseNstream.printMsg('Number of matched rsid values: %s' % numMatchedRSIDs)
    # Parenthesised single argument prints identically on Python 2 and 3.
    print("Only the RSID's of the user that existed in the reference data base were used. Also, INDELs were not concatenated.")
    parseNstream.printMsg('Number of unmatched rsid values: %s' % len(nonMatchedRSIDs))
    parseNstream.printMsg('Number of SNPs concatenated: %s' % snpsConcatenated)


def scoreAlleles(RefSNPFile, UserSNPFile):
    """
    Score the user's SNPs against the reference and write one row per SNP.

    Structure of RefSNPdict and UserSNPdict:
        key = chromosome string as '1', '2', ..., '23' (X), '24' (Y), '25' (MT/M)

    (*) Reference allele is only ONE letter.
        - Genotype is the original allele string from the 23andme file
        - Variant(s) are the letters different from the Reference, '-' if none
        - Match Score is the number of letters equal to the Reference
          (0, 1 or 2), or '-' when there is no call (see _scoreGenotype)

    Every reference position that the user also has is written to
    DEFAULT_OUTPUT_FILEPATH as a tab-separated row, except indels, which are
    collected and returned instead.

    ### BASIC FORMAT:
        #Chrom  RSID  Pos  Ref  Genotype  Variant(s)  Match Score
        1       rs1   1000 A    AA        -           2

    RETURNS: dict of indels, indel[chromosome][position] = {RSID: ..., GENOTYPE: ...}
             (user's values). Every chromosome in parseNstream.CHROMOSOME_LIST
             has an entry, possibly empty.
    """
    RefSNPdict = parseNstream._referenceObject(RefSNPFile, 'CHROMOSOME')
    UserSNPdict = parseNstream._23andmeObject(UserSNPFile, 'CHROMOSOME')

    # Keep track of RSIDs that share a position but differ in value,
    # as tuples (ref rsid, user rsid).
    nonMatchedRSIDs = []
    # Count the shared, non-indel positions and report it at the end.
    numMatchedRSIDs = 0
    # Keep track of indels
    indel = {}

    # 'with' guarantees the file is closed even if a lookup fails part-way.
    with open(DEFAULT_OUTPUT_FILEPATH, 'w') as F:
        F.write('#Chrom\tRSID\tPos\tRef\tGenotype\tVariant(s)\tMatch Score\n')
        for row in _iterScoredSNPs(RefSNPdict, UserSNPdict, nonMatchedRSIDs, indel):
            numMatchedRSIDs += 1
            F.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % row)

    parseNstream.printMsg("Completed scoring user SNPs to REF SNPs. Streamed to a .teyden (LOL) file format (path=%s)" % DEFAULT_OUTPUT_FILEPATH)
    parseNstream.printMsg('Number of matched rsid values: %s' % numMatchedRSIDs)
    parseNstream.printMsg('Number of unmatched rsid values: %s' % len(nonMatchedRSIDs))
    return indel