Exemplo n.º 1
0
 def test_sequence(faSequence):
     """Validate that every sequence carries a name and data.

     Inspects the first ``faSequence.nSpecies`` entries of
     ``faSequence.seqL``.

     :raises sb.SequenceDataError: if an entry has an empty name or
         empty data, or if the collected names are not all distinct.
     """
     expected = faSequence.nSpecies
     seen = []
     for idx in range(expected):
         entry = faSequence.seqL[idx]
         seen.append(entry.name)
         # Both fields have to be non-empty for the entry to be usable.
         if not (entry.name != '' and entry.data != ''):
             raise sb.SequenceDataError("Sequence name or data is missing.")
     # A set collapses duplicates, so a smaller size means a name repeats.
     if len(set(seen)) < expected:
         raise sb.SequenceDataError("Sequence names are not unique.")
Exemplo n.º 2
0
 def get_seq_base(self, seq, pos):
     """Return base at 1-based position `pos` in sequence with name `seq`.

     :param seq: Name of the sequence, as listed by
         ``self.get_seq_names()``.
     :param pos: 1-based position within that sequence.
     :raises sb.SequenceDataError: if no sequence is named `seq`, or if
         `pos` is smaller than 1 or larger than the sequence length.
     """
     names = self.get_seq_names()
     try:
         i = names.index(seq)
     except ValueError:
         # Only a failed lookup means "unknown name"; any other exception
         # (e.g. KeyboardInterrupt) must propagate instead of being
         # reported as a missing sequence.
         raise sb.SequenceDataError("Sequence name not found.") from None
     # Positions are 1-based, so everything outside 1..dataLen is invalid.
     if pos < 1 or pos > self.seqL[i].dataLen:
         raise sb.SequenceDataError("Position out of range.")
     return self.seqL[i].get_base(pos)
Exemplo n.º 3
0
 def get_nuc_base(self, chrom, pos):
     """Look up the stored base located at *pos* on chromosome *chrom*.

     Scans the first ``self.nBases`` entries of ``self.baseL`` in order
     and returns the first entry whose ``pos`` and ``chrom`` both match.

     :raises sb.SequenceDataError: if no scanned entry matches.
     """
     for idx in range(self.nBases):
         candidate = self.baseL[idx]
         # Skip anything that differs in position or chromosome.
         if pos != candidate.pos or chrom != candidate.chrom:
             continue
         return candidate
     message = ('Base at position ' + str(pos)
                + ' on chromosome ' + str(chrom)
                + ' not found.')
     raise sb.SequenceDataError(message)
Exemplo n.º 4
0
def save_as_vcf(faSeq, ref, VCFFileName):
    """Save the given :class:`FaSeq` in VCF format.

    In general, we want to convert a fasta file with various
    individuals with the help of a reference that contains one
    sequence to a VCF file that contains all the SNPs.  This can be
    done with this function.  Until now it is not possible to do this
    conversion for several chromosomes for each individual in one run.
    Still, the conversion can be done chromosome by chromosome.

    This function saves the SNPs of *faSeq*, a given :class:`FaSeq`
    (fasta sequence) object in VCF format to the file *VCFFileName*.
    The reference genome *ref*, to which *faSeq* is compared, needs
    to be passed as a :class:`Seq <cflib.seqbase.Seq>` object.

    The function compares all sequences in *faSeq* to the sequence
    given in *ref*.  The names of the individuals in the saved VCF
    file will be the sequence names of the *faSeq* object.  Only
    positions at which at least one sequence differs from the
    reference are written.

    ::

      #CHROM = sequence name of the reference
      POS    = position relative to reference
      ID     = .
      REF    = base of reference
      ALT    = SNP (e.g. 'C' or 'G,T' if 2 different SNPs are present)
      QUAL   = .
      FILTER = .
      INFO   = .
      FORMAT = GT

    :param FaSeq faSeq: :class:`FaSeq` object to be converted.
    :param Seq ref: :class:`Seq <cflib.seqbase.Seq>` object of the
                    reference sequence.
    :param str VCFFileName: Name of the VCF output file.
    :raises: :class:`SequenceDataError
        <cflib.seqbase.SequenceDataError>` if *faSeq* or *ref* have the
        wrong type, if *faSeq* holds no sequences, or if a sequence
        differs in length from the reference.

    """
    def get_altBases_string(sAltBases):
        """Return the comma-separated ALT column for `sAltBases`."""
        return ','.join(str(base) for base in sAltBases)

    def get_indiv_string(indivData, sAltBases):
        """Return the tab-separated genotype columns.

        Each base of `indivData` is encoded as 0 if it is not one of
        the alternative bases (i.e. it equals REF) and otherwise as the
        1-based index of that base in the sorted list `sAltBases`.

        E.g.:
        REF = A
        ALT = C,G
        individual i1 has A
        individual i2 has C
        individual i3 has G

        Then the string should look like:
        '0\t1\t2'
        -> 0 for REF, 1 for first ALT and 2 for second ALT

        """
        altCode = {base: n + 1 for n, base in enumerate(sAltBases)}
        return '\t'.join(str(altCode.get(base, 0)) for base in indivData)

    def get_vcf_line(chromName, pos, refBase, altBaseString, indivString):
        """Return a VCF data line assembled from the given fields."""
        return '\t'.join([
            chromName,
            str(pos),
            '.',  # id
            refBase,
            altBaseString,
            '.',  # qual
            '.',  # filter
            '.',  # info
            "GT",  # format
            indivString,
        ])

    if not isinstance(faSeq, FaSeq):
        raise sb.SequenceDataError("`faSeq` is not an FaSeq object.")
    if not isinstance(ref, sb.Seq):
        raise sb.SequenceDataError("`ref` is not a Seq object.")
    if faSeq.nSpecies == 0:
        raise sb.SequenceDataError("`faSeq` has no saved sequences.")
    seqs = [faSeq.seqL[s] for s in range(faSeq.nSpecies)]
    for seq in seqs:
        if seq.dataLen != ref.dataLen:
            raise sb.SequenceDataError("Sequence " + seq.name +
                                       " has different length than reference.")
    VCFFile = sb.gz_open(VCFFileName, mode='w')
    # Close the output file even if writing a line fails half-way.
    try:
        print(vcf.get_header_line_string(faSeq.get_seq_names()), file=VCFFile)
        # loop over bases
        for i in range(ref.dataLen):
            refBase = ref.data[i]
            indivData = [seq.data[i] for seq in seqs]
            # every base that deviates from the reference is a SNP allele
            altBases = {base for base in indivData if base != refBase}
            if not altBases:
                # no SNP at this site, nothing to write
                continue
            sAltBases = sorted(altBases)
            # VCF positions are 1-based, hence i + 1
            print(get_vcf_line(ref.name, i + 1, refBase,
                               get_altBases_string(sAltBases),
                               get_indiv_string(indivData, sAltBases)),
                  file=VCFFile)
    finally:
        VCFFile.close()
Exemplo n.º 5
0
 def set_seq(self, seq):
     """Store *seq* as the reference sequence in ``self.refSeq``.

     :raises sb.SequenceDataError: if *seq* is not an ``sb.Seq`` instance.
     """
     if isinstance(seq, sb.Seq):
         self.refSeq = seq
         return
     raise sb.SequenceDataError("`seq` is not a Seq object.")
Exemplo n.º 6
0
    def __fill_cD(self, iL=None, snpL=None):
        """Fill *self.cD*.

        Fill *self.cD* with data from reference at chromosome
        *self.chrom* and position *self.pos*. Possible SNPs in
        *self.vcfL* at this position are considered.

        :param [int] iL: List with vcf indices of the SNPs in *snpL*,
            must be sorted.
        :param [NucBase] snpL: List with :class:`NucBase
            <cflib.vcf.NucBase>` SNPs at this position. None, if
            there is no SNP.
        :raises: :class:`NotAValidRefBase
            <cflib.seqbase.NotAValidRefBase>`,
            :class:`SequenceDataError
            <cflib.seqbase.SequenceDataError>`,
            ``NoSynBase``, ``ValueError``

        :class:`NotAValidRefBase <cflib.seqbase.NotAValidRefBase>` is
        raised if the reference base is not valid (e.g. N).

        :class:`SequenceDataError <cflib.seqbase.SequenceDataError>`
        is raised if the reference base of a SNP does not match the
        reference sequence, or if *iL* and *snpL* are inconsistent
        (*snpL* missing or of different length than *iL*).

        ``NoSynBase`` is raised if *self.onlySynonymous* is True and the
        reference reports the current position as not synonymous.

        ``ValueError`` is raised if a population index lies outside
        ``range(self.nPop)``.

        """
        if snpL is not None:
            logging.debug("Next SNP(s):")
            for s in snpL:
                logging.debug(s.get_info())

        def get_refBase():
            """Get reference base on *chrom* at *pos*."""
            return self.refSeq.data[self.pos].lower()

        def update_cD(pop, baseI, delta=self.ploidy):
            """Add *delta* counts of base index *baseI* to population *pop*.

            *delta* defaults to the ploidy, which is what the
            reference-only paths below rely on; the SNP path passes
            ``delta=1`` for every single chromatid.
            """
            # FIXME: IUPAC code not handled here.  Is this even necessary?
            if baseI == dna['n'] or baseI == dna['*']:
                logging.debug("Reference base is unknown.  Continue.")
                return
            if pop in range(0, self.nPop):
                self.cD[pop][baseI] += delta
                logging.debug(
                    "Updating counts dictionary; population %s, "
                    "base index %s.", pop, baseI)
            else:
                logging.info(
                    "Ignoring data because population index %s is "
                    "out of range.", pop)
                raise ValueError()

        self.purge_cD()

        # If we check for synonymous bases, do not do anything if base
        # is not 4-fold degenerate.
        if self.onlySynonymous is True:
            if self.refSeq.is_synonymous(self.pos) is False:
                logging.debug(
                    "Rejection; %s at position %s "
                    "is not a synonymous base.", self.refSeq.data[self.pos],
                    self.pos)
                raise NoSynBase()

        refBase = get_refBase()
        try:
            r = dna[refBase]
        except KeyError:
            raise sb.NotAValidRefBase()
        # If there are no SNPs, fill *self.cD* with data from reference.
        if iL is None:
            for i in range(self.nV):
                for pop in self.assM[i]:
                    update_cD(pop, r)
        elif (snpL is not None) and (len(iL) == len(snpL)):
            # Else, only fill *self.cD* from the reference for those vcf
            # indices that have no SNP at this position.
            for i in range(self.nV):
                if i not in iL:
                    for pop in self.assM[i]:
                        update_cD(pop, r)
            # Now traverse the SNPs.
            for sI in range(len(iL)):
                # Check if the reference bases match.
                vcfRefBase = snpL[sI].get_ref_base().lower()
                # Thu Jun 9 09:26:55 CEST 2016: Just use first base if
                # there are more.
                indel = False
                if len(vcfRefBase) > 1:
                    # `logging.warn` is a deprecated alias (removed in
                    # Python 3.13); `logging.warning` is the supported
                    # spelling.
                    logging.warning("Indel at chrom %s pos %d.", self.chrom,
                                    self.pos + self.offset)
                    indel = True
                    vcfRefBase = vcfRefBase[0]
                if dna[vcfRefBase] != r:
                    print("Error at NucBase:")
                    snpL[sI].print_info()
                    print("The reference base at position",
                          self.pos,
                          "on chromosome",
                          self.chrom,
                          "is",
                          refBase,
                          end=".\n")
                    print("The reference base of the VCF file is",
                          vcfRefBase,
                          end=".\n")
                    raise sb.SequenceDataError("Reference bases do not match.")
                altBases = snpL[sI].get_alt_base_list()
                # Any alternative allele longer than one base marks the
                # whole record as an indel.
                for altBase in altBases:
                    if len(altBase) > 1:
                        indel = True
                        logging.warning("Indel at chrom %s pos %d.",
                                        self.chrom, self.pos + self.offset)
                spData = snpL[sI].get_speciesData()
                vI = iL[sI]
                # Loop over individuals.
                for i in range(0, len(spData)):
                    # Loop over chromatids (e.g. diploid).
                    for d in range(0, self.ploidy):
                        if spData[i][d] is None:
                            # No genotype call for this chromatid.
                            pass
                        elif indel or spData[i][d] == 0:
                            # Indel records and reference calls are both
                            # counted as the reference base.
                            bI = r
                            update_cD(self.assM[vI][i], bI, delta=1)
                        else:
                            # Genotype k > 0 selects the k-th entry
                            # (1-based) of the alternative base list.
                            bI = dna[altBases[spData[i][d] - 1]]
                            logging.debug("Use SNP of %s, population %s",
                                          self.indM[vI][i], self.assM[vI][i])
                            update_cD(self.assM[vI][i], bI, delta=1)
        else:
            raise sb.SequenceDataError("SNP information is not correct.")
Exemplo n.º 7
0
def fasta_to_cf(fastaFN,
                countsFN,
                splitChar='-',
                chromName="NA",
                double_fixed_sites=False):
    """Convert fasta to counts format.

    The (aligned) sequences in the fasta file are read in and the data
    is written to a counts format file.

    Sequence names are cut at the last occurrence of *splitChar* (a
    dash by default); names without *splitChar* are kept as they are.
    If the stripped sequence names coincide, individuals are put into
    the same population.

    E.g., homo_sapiens-XXX and homo_sapiens-YYY will be in the same
    population homo_sapiens.

    Take care with large files, this uses a lot of memory.

    The input as well as the output files can additionally be gzipped
    (indicated by a .gz file ending).

    :param str fastaFN: Name of the fasta input file.
    :param str countsFN: Name of the counts output file.
    :param str splitChar: Separator between population name and
        individual identifier within a sequence name.
    :param str chromName: Chromosome name assigned to every site.
    :param bool double_fixed_sites: Set to true if heterozygotes are
        encoded with IUPAC codes.  Then, fixed sites will be counted
        twice so that the level of polymorphism stays correct.
    :raises: :class:`SequenceDataError
        <cflib.seqbase.SequenceDataError>` if the sequences do not all
        have the same length.

    """

    FaStr = fasta.init_seq(fastaFN)
    logging.debug("Read in fasta file %s.", fastaFN)
    # Each sequence is deep-copied before the next one is read.
    seqL = [copy.deepcopy(FaStr.seq)]

    while FaStr.read_next_seq() is not None:
        seqL.append(copy.deepcopy(FaStr.seq))

    nSeqs = len(seqL)
    logging.debug("Number of sequences: %s", nSeqs)

    # Reduce every sequence name to its population part.
    for s in seqL:
        s.name = s.name.rsplit(splitChar, maxsplit=1)[0]

    logging.debug("Checking sequence lengths.")
    nSites = seqL[0].dataLen
    if any(s.dataLen != nSites for s in seqL[1:]):
        raise sb.SequenceDataError("Sequences do not have equal length.")

    logging.debug("Creating assignment list.")
    # nameL lists the populations in order of first appearance; assL maps
    # every sequence to the index of its population in nameL.
    popIndex = {}
    nameL = []
    assL = []
    for s in seqL:
        if s.name not in popIndex:
            popIndex[s.name] = len(nameL)
            nameL.append(s.name)
        assL.append(popIndex[s.name])
    nPops = len(nameL)

    logging.debug("Number of Populations: %s", nPops)
    logging.debug("Number of Sites: %s", nSites)
    logging.debug("Populations: %s", nameL)
    logging.debug("Assignment list: %s", assL)

    cfw = CFWriter([], countsFN)
    # Close the writer even if a site cannot be converted.
    try:
        logging.debug("Manually initializing CFWriter.")
        cfw.nL = nameL
        cfw.nPop = nPops
        cfw.write_HLn()

        # Loop over sites.
        for site in range(nSites):
            cfw.purge_cD()
            cfw.pos = site
            cfw.chrom = chromName
            # Loop over sequences / individuals.
            for s in range(nSeqs):
                base = seqL[s].data[site].lower()
                cfw.add_base_to_sequence(assL[s], base, double_fixed_sites)
            cfw.write_Ln()
    finally:
        cfw.close()