def test_sequence(faSequence):
    """Check that all sequences are named, non-empty and uniquely named.

    :param faSequence: Object exposing *nSpecies* (the number of
      sequences to inspect) and *seqL* (an indexable collection whose
      items have *name* and *data* attributes).

    :raises: :class:`sb.SequenceDataError` if a sequence has an empty
      name or empty data, or if two sequences share the same name.

    """
    nSeqs = faSequence.nSpecies
    collectedNames = []
    for idx in range(nSeqs):
        entry = faSequence.seqL[idx]
        collectedNames.append(entry.name)
        if entry.name == '' or entry.data == '':
            raise sb.SequenceDataError("Sequence name or data is missing.")
    # Duplicates collapse inside a set, so fewer distinct names than
    # sequences means at least one name occurs twice.
    if len(set(collectedNames)) < nSeqs:
        raise sb.SequenceDataError("Sequence names are not unique.")
def get_seq_base(self, seq, pos):
    """Return base at 1-based position `pos` in sequence with name `seq`.

    :param seq: Name of the sequence, looked up in the list returned
      by ``self.get_seq_names()``.
    :param pos: 1-based position within that sequence.

    :raises: :class:`sb.SequenceDataError` if no sequence is named
      `seq`, or if `pos` is smaller than 1 or larger than the length
      of the sequence.

    """
    names = self.get_seq_names()
    try:
        i = names.index(seq)
    except ValueError:
        # Only a failed lookup means "unknown name"; any other error
        # (or a keyboard interrupt) must propagate untouched.
        raise sb.SequenceDataError("Sequence name not found.") from None
    sequence = self.seqL[i]
    # Positions are 1-based, so anything below 1 is just as invalid as
    # a position beyond the end of the sequence.
    if pos < 1 or pos > sequence.dataLen:
        raise sb.SequenceDataError("Position out of range.")
    return sequence.get_base(pos)
def get_nuc_base(self, chrom, pos):
    """Look up the stored base on chromosome *chrom* at position *pos*.

    The first ``self.nBases`` entries of ``self.baseL`` are scanned in
    order and the first entry whose *pos* and *chrom* attributes both
    match is returned.

    :raises: :class:`sb.SequenceDataError` if no entry matches.

    """
    for idx in range(self.nBases):
        candidate = self.baseL[idx]
        if pos == candidate.pos and chrom == candidate.chrom:
            return candidate
    message = "Base at position {!s} on chromosome {!s} not found.".format(
        pos, chrom)
    raise sb.SequenceDataError(message)
def save_as_vcf(faSeq, ref, VCFFileName):
    """Save the given :class:`FaSeq` in VCF format.

    In general, we want to convert a fasta file with various
    individuals with the help of a reference that contains one
    sequence to a VCF file that contains all the SNPs.  This can be
    done with this function.  Until now it is not possible to do this
    conversion for several chromosomes for each individual in one run.
    Still, the conversion can be done chromosome by chromosome.

    This function saves the SNPs of *faSeq*, a given :class:`FaSeq`
    (fasta sequence) object, in VCF format to the file *VCFFileName*.
    The reference genome *ref*, to which *faSeq* is compared, needs to
    be passed as a :class:`Seq <cflib.seqbase.Seq>` object.

    The function compares all sequences in *faSeq* to the sequence
    given in *ref*.  The names of the individuals in the saved VCF file
    will be the sequence names of the *faSeq* object.  Only positions
    at which at least one sequence differs from the reference are
    written.

    ::

      #CHROM = sequence name of the reference
      POS    = position relative to reference
      ID     = .
      REF    = base of reference
      ALT    = SNP (e.g. 'C' or 'G,T' if 2 different SNPs are present)
      QUAL   = .
      FILTER = .
      INFO   = .
      FORMAT = GT

    :param FaSeq faSeq: :class:`FaSeq` object to be converted.
    :param Seq ref: :class:`Seq <cflib.seqbase.Seq>` object of the
                    reference sequence.
    :param str VCFFileName: Name of the VCF output file.

    :raises: :class:`sb.SequenceDataError` if *faSeq* or *ref* has the
      wrong type, if *faSeq* holds no sequences, or if a sequence has
      a different length than the reference.

    """
    def get_altBases_string(sAltBases):
        """Return the comma separated ALT column for sorted `sAltBases`."""
        return ','.join(sAltBases)

    def get_indiv_string(indivData, sAltBases):
        """Return the tab separated genotype columns of all individuals.

        Every base in `indivData` is encoded as 0 if it is not one of
        the alternative bases (i.e. it equals the reference), otherwise
        as the 1-based rank of the base within `sAltBases`.

        E.g.: REF = A, ALT = C,G; individuals carrying A, C and G are
        encoded as 0, 1 and 2, respectively.

        """
        altCode = {base: n for n, base in enumerate(sAltBases, start=1)}
        return '\t'.join(str(altCode.get(base, 0)) for base in indivData)

    def get_vcf_line(chromName, pos, refBase, altBaseString, indivString):
        """Return one VCF data line built from the given columns."""
        columns = [
            chromName,
            str(pos),
            '.',            # id
            refBase,
            altBaseString,
            '.',            # qual
            '.',            # filter
            '.',            # info
            "GT",           # format
            indivString,
        ]
        return '\t'.join(columns)

    if not isinstance(faSeq, FaSeq):
        raise sb.SequenceDataError("`faSeq` is not an FaSeq object.")
    if not isinstance(ref, sb.Seq):
        raise sb.SequenceDataError("`ref` is not a Seq object.")
    if faSeq.nSpecies == 0:
        raise sb.SequenceDataError("`faSeq` has no saved sequences.")
    for i in range(faSeq.nSpecies):
        if faSeq.seqL[i].dataLen != ref.dataLen:
            raise sb.SequenceDataError("Sequence " + faSeq.seqL[i].name +
                                       " has different length than reference.")

    VCFFile = sb.gz_open(VCFFileName, mode='w')
    # Close the output even if writing fails half way through.
    try:
        print(vcf.get_header_line_string(faSeq.get_seq_names()),
              file=VCFFile)
        # Loop over bases.
        for i in range(ref.dataLen):
            refBase = ref.data[i]
            # Bases of all individuals at this site.
            indivData = [faSeq.seqL[s].data[i]
                         for s in range(faSeq.nSpecies)]
            altBases = {base for base in indivData if base != refBase}
            if not altBases:
                # No SNP at this position; nothing to write.
                continue
            sAltBases = sorted(altBases)
            print(get_vcf_line(ref.name, i + 1, refBase,
                               get_altBases_string(sAltBases),
                               get_indiv_string(indivData, sAltBases)),
                  file=VCFFile)
    finally:
        VCFFile.close()
    return
def set_seq(self, seq):
    """Store *seq* as the reference sequence (``self.refSeq``).

    :param seq: The new reference; must be an instance of ``sb.Seq``.

    :raises: :class:`sb.SequenceDataError` if *seq* is not an
      ``sb.Seq`` instance; ``self.refSeq`` is left untouched then.

    """
    if isinstance(seq, sb.Seq):
        self.refSeq = seq
        return
    raise sb.SequenceDataError("`seq` is not a Seq object.")
def __fill_cD(self, iL=None, snpL=None):
    """Fill *self.cD*.

    Fill *self.cD* with data from reference at chromosome
    *self.chrom* and position *self.pos*.  Possible SNPs in
    *self.vcfL* at this position are considered.

    If *iL* is None, every individual contributes the reference base
    with weight *self.ploidy*.  Otherwise individuals of VCF files
    whose index is not in *iL* contribute the reference base, and the
    genotypes of the SNPs in *snpL* are counted allele by allele.  If
    an indel is detected at a SNP, the reference base is counted for
    all non-missing alleles of that SNP.

    :param [int] iL: List with vcf indices of the SNPs in *snpL*,
      must be sorted.
    :param [NucBase] snpL: List with :class:`NucBase
      <cflib.vcf.NucBase>` SNPs at this position.  None, if there is
      no SNP.

    :raises: :class:`NotAValidRefBase
      <cflib.seqbase.NotAValidRefBase>`,
      :class:`SequenceDataError
      <cflib.seqbase.SequenceDataError>`,
      :class:`NoSynBase`, :class:`ValueError`

    :class:`NotAValidRefBase <cflib.seqbase.NotAValidRefBase>` is
    raised if the reference base is not valid (e.g. N).

    :class:`SequenceDataError <cflib.seqbase.SequenceDataError>` is
    raised if the reference bases of fasta and VCF do not match, or if
    *iL* and *snpL* are inconsistent.

    :class:`NoSynBase` is raised if only synonymous bases are
    requested and the base at *self.pos* is not synonymous.

    :class:`ValueError` is raised if a population index is out of
    range.

    """
    if snpL is not None:
        logging.debug("Next SNP(s):")
        for s in snpL:
            logging.debug(s.get_info())

    def get_refBase():
        """Get reference base on *chrom* at *pos*."""
        return self.refSeq.data[self.pos].lower()

    def update_cD(pop, baseI, delta=self.ploidy):
        """Add counts to the countsDictionary cD."""
        # FIXME: IUPAC code not handled here. Is this even necessary?
        if baseI == dna['n'] or baseI == dna['*']:
            logging.debug("Reference base is unknown. Continue.")
            return
        if pop in range(0, self.nPop):
            self.cD[pop][baseI] += delta
            logging.debug(
                "Updating counts dictionary; population %s, "
                "base index %s.", pop, baseI)
        else:
            logging.info(
                "Ignoring data because population index %s is "
                "out of range.", pop)
            raise ValueError()

    self.purge_cD()
    # If we check for synonymous bases, do not do anything if base
    # is not 4-fold degenerate.
    if self.onlySynonymous is True:
        if self.refSeq.is_synonymous(self.pos) is False:
            logging.debug(
                "Rejection; %s at position %s "
                "is not a synonymous base.",
                self.refSeq.data[self.pos], self.pos)
            raise NoSynBase()
    refBase = get_refBase()
    try:
        r = dna[refBase]
    except KeyError:
        raise sb.NotAValidRefBase()
    # If there are no SNPS, fill *self.cD* with data from reference.
    if iL is None:
        for i in range(self.nV):
            for pop in self.assM[i]:
                update_cD(pop, r)
    elif (snpL is not None) and (len(iL) == len(snpL)):
        # Else, only fill *self.cD* where the individual has no SNP.
        for i in range(self.nV):
            if i not in iL:
                for pop in self.assM[i]:
                    update_cD(pop, r)
        # Now traverse the SNPs.
        for sI in range(len(iL)):
            # Check if the reference bases match.
            vcfRefBase = snpL[sI].get_ref_base().lower()
            # Thu Jun 9 09:26:55 CEST 2016: Just use first base if
            # there are more.
            indel = False
            if len(vcfRefBase) > 1:
                # `logging.warn` is deprecated (removed in Python
                # 3.13); `logging.warning` is the supported spelling.
                logging.warning("Indel at chrom %s pos %d.",
                                self.chrom, self.pos + self.offset)
                indel = True
                vcfRefBase = vcfRefBase[0]
            if dna[vcfRefBase] != r:
                print("Error at NucBase:")
                snpL[sI].print_info()
                print("The reference base at position", self.pos,
                      "on chromosome", self.chrom,
                      "is", refBase, end=".\n")
                print("The reference base of the VCF file is",
                      vcfRefBase, end=".\n")
                raise sb.SequenceDataError("Reference bases do not match.")
            altBases = snpL[sI].get_alt_base_list()
            for altBase in altBases:
                if len(altBase) > 1:
                    indel = True
                    logging.warning("Indel at chrom %s pos %d.",
                                    self.chrom, self.pos + self.offset)
            spData = snpL[sI].get_speciesData()
            vI = iL[sI]
            # Loop over individuals.
            for i in range(0, len(spData)):
                # Loop over chromatides (e.g. diploid).
                for d in range(0, self.ploidy):
                    if spData[i][d] is None:
                        # Missing allele; nothing is counted.
                        pass
                    elif indel or spData[i][d] == 0:
                        # Reference allele, or an indel for which the
                        # reference base is counted instead.
                        bI = r
                        update_cD(self.assM[vI][i], bI, delta=1)
                    else:
                        # Allele codes are 1-based into the ALT list.
                        bI = dna[altBases[spData[i][d] - 1]]
                        logging.debug("Use SNP of %s, population %s",
                                      self.indM[vI][i], self.assM[vI][i])
                        update_cD(self.assM[vI][i], bI, delta=1)
    else:
        raise sb.SequenceDataError("SNP information is not correct.")
def fasta_to_cf(fastaFN, countsFN, splitChar='-', chromName="NA",
                double_fixed_sites=False):
    """Convert fasta to counts format.

    The (aligned) sequences in the fasta file are read in and the data
    is written to a counts format file.

    Sequence names are cut at the last occurrence of *splitChar* (a
    dash by default) and only the part before it is kept.  If the
    stripped sequence names coincide, individuals are put into the same
    population.

    E.g., homo_sapiens-XXX and homo_sapiens-YYY will be in the same
    population homo_sapiens.

    Take care with large files, this uses a lot of memory.

    The input as well as the output files can additionally be gzipped
    (indicated by a .gz file ending).

    :param str fastaFN: File name of the fasta input file.
    :param str countsFN: File name of the counts format output file.
    :param str splitChar: Separator between population name and
      individual identifier in the sequence names.
    :param str chromName: Chromosome name written for every site.
    :param bool double_fixed_sites: Set to true if heterozygotes are
      encoded with IUPAC codes.  Then, fixed sites will be counted
      twice so that the level of polymorphism stays correct.

    :raises: :class:`sb.SequenceDataError` if the sequences do not all
      have the same length.

    """
    # NOTE(review): the fasta stream is never closed in this function;
    # confirm whether the object returned by `fasta.init_seq` needs it.
    FaStr = fasta.init_seq(fastaFN)
    logging.debug("Read in fasta file %s.", fastaFN)
    # The stream reuses its `seq` attribute, hence the deep copies.
    seqL = [copy.deepcopy(FaStr.seq)]
    while FaStr.read_next_seq() is not None:
        seqL.append(copy.deepcopy(FaStr.seq))

    nSeqs = len(seqL)
    logging.debug("Number of sequences: %s", nSeqs)

    # Reduce every sequence name to its population name.
    for s in seqL:
        s.name = s.name.rsplit(splitChar, maxsplit=1)[0]

    logging.debug("Checking sequence lengths.")
    nSites = seqL[0].dataLen
    for s in seqL[1:]:
        if nSites != s.dataLen:
            raise sb.SequenceDataError("Sequences do not have equal length.")

    logging.debug("Creating assignment list.")
    # `nameL` keeps populations in order of first appearance; `assL`
    # maps every sequence to the index of its population in `nameL`.
    nameL = []
    assL = []
    popIndex = {}
    for s in seqL:
        if s.name not in popIndex:
            popIndex[s.name] = len(nameL)
            nameL.append(s.name)
        assL.append(popIndex[s.name])

    nPops = len(nameL)
    logging.debug("Number of Populations: %s", nPops)
    logging.debug("Number of Sites: %s", nSites)
    logging.debug("Populations: %s", nameL)
    logging.debug("Assignment list: %s", assL)

    cfw = CFWriter([], countsFN)
    # Close the writer even if writing fails half way through.
    try:
        logging.debug("Manually initializing CFWriter.")
        cfw.nL = nameL
        cfw.nPop = len(nameL)
        cfw.write_HLn()

        # Loop over sites.
        for i in range(nSites):
            cfw.purge_cD()
            cfw.pos = i
            cfw.chrom = chromName
            # Loop over sequences / individuals.
            for seq, pop in zip(seqL, assL):
                base = seq.data[i].lower()
                cfw.add_base_to_sequence(pop, base, double_fixed_sites)
            cfw.write_Ln()
    finally:
        cfw.close()