Exemplo n.º 1
0
def fasta_to_cf(fastaFN,
                countsFN,
                splitChar='-',
                chromName="NA",
                double_fixed_sites=False):
    """Convert fasta to counts format.

    The (aligned) sequences in the fasta file are read in and the data
    is written to a counts format file.

    Sequence names are cut at the last occurrence of `splitChar`; the
    part in front of it is used as population name.  If the stripped
    sequence names coincide, individuals are put into the same
    population.

    E.g., homo_sapiens-XXX and homo_sapiens-YYY will be in the same
    population homo_sapiens.

    Take care with large files, this uses a lot of memory (a copy of
    every sequence is kept in memory at the same time).

    The input as well as the output files can additionally be gzipped
    (indicated by a .gz file ending).

    :param str fastaFN: Name of the fasta file that is read.
    :param str countsFN: Name of the counts file that is written.
    :param str splitChar: Separator between population name and
      individual identifier in the sequence names.
    :param str chromName: Chromosome name assigned to every site.
    :param bool double_fixed_sites: Set to true if heterozygotes are
      encoded with IUPAC codes.  Then, fixed sites will be counted twice
      so that the level of polymorphism stays correct.

    :raises SequenceDataError: If the sequences differ in length.

    """

    FaStr = fasta.init_seq(fastaFN)
    try:
        logging.debug("Read in fasta file %s.", fastaFN)
        seqL = [copy.deepcopy(FaStr.seq)]
        while FaStr.read_next_seq() is not None:
            seqL.append(copy.deepcopy(FaStr.seq))
    finally:
        # Only the deep copies in seqL are used from here on, so the
        # stream can be released right away, even if reading failed.
        FaStr.close()

    nSeqs = len(seqL)
    logging.debug("Number of sequences: %s", nSeqs)

    # Reduce every sequence name to its population name.
    for s in seqL:
        s.name = s.name.rsplit(splitChar, maxsplit=1)[0]

    logging.debug("Checking sequence lengths.")
    nSites = seqL[0].dataLen
    for s in seqL[1:]:
        if nSites != s.dataLen:
            raise sb.SequenceDataError("Sequences do not have equal length.")

    logging.debug("Creating assignment list.")
    # nameL lists the populations in order of first appearance; assL
    # holds, for every sequence, the index of its population in nameL.
    nameL = []
    assL = []
    popIndex = {}
    for s in seqL:
        idx = popIndex.get(s.name)
        if idx is None:
            idx = len(nameL)
            popIndex[s.name] = idx
            nameL.append(s.name)
        assL.append(idx)
    nPops = len(nameL)

    logging.debug("Number of Populations: %s", nPops)
    logging.debug("Number of Sites: %s", nSites)
    logging.debug("Populations: %s", nameL)
    logging.debug("Assignment list: %s", assL)

    cfw = CFWriter([], countsFN)
    try:
        logging.debug("Manually initializing CFWriter.")
        cfw.nL = nameL
        cfw.nPop = nPops
        cfw.write_HLn()

        # Loop over sites.
        for i in range(nSites):
            cfw.purge_cD()
            cfw.pos = i
            cfw.chrom = chromName
            # Loop over sequences / individuals.
            for popI, s in zip(assL, seqL):
                base = s.data[i].lower()
                cfw.add_base_to_sequence(popI, base, double_fixed_sites)
            cfw.write_Ln()
    finally:
        # Close the writer even when a site could not be written.
        cfw.close()
Exemplo n.º 2
0
    ploidy = args.ploidy[0]
else:
    ploidy = None

# Set up the counts file writer.  When merging is requested, every VCF
# file is flagged for merging and named after the part of its file name
# in front of the first dot.
if args.merge is not None:
    nameList = []
    for fn in vcfFnL:
        nameList.append(fn.split('.', maxsplit=1)[0])
    mergeList = [True] * len(nameList)
    cfw = cf.CFWriter(vcfFnL, output,
                      mergeL=mergeList, nameL=nameList, verb=vb)
else:
    cfw = cf.CFWriter(vcfFnL, output, verb=vb)

if ploidy is not None:
    cfw.set_ploidy(int(ploidy))

# Determine the region from the first sequence of the reference; the
# offset is only passed on when one was given.
faR = fa.init_seq(fastaRef)
rg = (faR.seq.get_region_no_description() if offset is None
      else faR.seq.get_region_no_description(offset))

# Write the header line followed by the data of the region.
cfw.set_seq(faR.seq)
cfw.write_HLn()
cfw.write_Rn(rg)

cfw.close()
faR.close()
Exemplo n.º 3
0

# Manual test script for the fasta module: every step prints what it is
# about to do and then calls the corresponding `fa` function.
test_sequence = 'data/fasta-sample2.dat'
print("Testing libPoMo/fasta module.")

# Step 1: open the sample file with fa.open_seq and print its info.
print("\n##################################################")
print("Read in test sequence " + test_sequence + ".")
seq = fa.open_seq(test_sequence)

print("Print sequence information.")
seq.print_info()

# Step 2: walk through the same file with the stream object returned
# by fa.init_seq until read_next_seq() returns None.
print("\n##################################################")
print("Test FaStream object.")
faStr = fa.init_seq(test_sequence)
faStr.print_info(maxB=None)
while True:
    if faStr.read_next_seq() is None:
        break
    faStr.print_info()
faStr.close()

# Step 3: open the wolf sample and its reference for comparison.
print("\n##################################################")
test_sequence = "data/fasta-sample-wolfs.dat"
ref_sequence = "data/fasta-reference-wolf.dat"
fn = "vcf-test-tmp.dat"
print("Compare " + test_sequence + " to " + ref_sequence + ".")
faSeq = fa.open_seq(test_sequence)
faRef = fa.open_seq(ref_sequence)
refSeq = faRef.get_seq_by_id(0)
Exemplo n.º 4
0
def fasta_to_cf(fastaFN, countsFN, splitChar='-', chromName="NA"):
    """Convert fasta to counts format.

    The (aligned) sequences in the fasta file are read in and the data
    is written to a counts format file.

    Sequence names are cut at the last occurrence of `splitChar`; the
    part in front of it is used as population name.  If the stripped
    sequence names coincide, individuals are put into the same
    population.

    E.g., homo_sapiens-XXX and homo_sapiens-YYY will be in the same
    population homo_sapiens.

    Take care with large files, this uses a lot of memory (a copy of
    every sequence is kept in memory at the same time).

    The input as well as the output files can additionally be gzipped
    (indicated by a .gz file ending).

    :param str fastaFN: Name of the fasta file that is read.
    :param str countsFN: Name of the counts file that is written.
    :param str splitChar: Separator between population name and
      individual identifier in the sequence names.
    :param str chromName: Chromosome name assigned to every site.

    :raises SequenceDataError: If the sequences differ in length.
    :raises NotAValidRefBase: If a (lower-cased) base is not a key of
      `dna`.

    """

    FaStr = fasta.init_seq(fastaFN)
    try:
        logging.info("Read in fasta file.")
        seqL = [copy.deepcopy(FaStr.seq)]
        while FaStr.read_next_seq() is not None:
            seqL.append(copy.deepcopy(FaStr.seq))
    finally:
        # Only the deep copies in seqL are used from here on, so the
        # stream can be released right away, even if reading failed.
        FaStr.close()

    nSeqs = len(seqL)
    logging.info("Number of sequences: %s", nSeqs)

    # Reduce every sequence name to its population name.
    for s in seqL:
        s.name = s.name.rsplit(splitChar, maxsplit=1)[0]

    logging.info("Checking sequence lengths")
    nSites = seqL[0].dataLen
    for s in seqL[1:]:
        if nSites != s.dataLen:
            raise sb.SequenceDataError("Sequences do not have equal length.")

    logging.info("Creating assignment list.")
    # nameL lists the populations in order of first appearance; assL
    # holds, for every sequence, the index of its population in nameL.
    nameL = []
    assL = []
    popIndex = {}
    for s in seqL:
        idx = popIndex.get(s.name)
        if idx is None:
            idx = len(nameL)
            popIndex[s.name] = idx
            nameL.append(s.name)
        assL.append(idx)
    nPops = len(nameL)

    logging.info("Number of Populations: %s", nPops)
    logging.info("Number of Sites: %s", nSites)
    logging.info("Populations: %s", nameL)
    logging.info("Assignment list: %s", assL)

    cfw = CFWriter([], countsFN)
    try:
        logging.warning("Manually initializing CFWriter.")
        cfw.nL = nameL
        cfw.nPop = nPops
        cfw.write_HLn()

        # Loop over sites.
        for i in range(nSites):
            cfw.purge_cD()
            cfw.pos = i
            cfw.chrom = chromName
            # Loop over sequences / individuals and count each base for
            # the population the individual belongs to.
            for popI, s in zip(assL, seqL):
                base = s.data[i].lower()
                try:
                    baseI = dna[base]
                except KeyError:
                    raise sb.NotAValidRefBase()
                cfw.cD[popI][baseI] += 1
            cfw.write_Ln()
    finally:
        # Close the writer even when a site could not be written.
        cfw.close()
Exemplo n.º 5
0
Arquivo: cf.py Projeto: pomo-dev/PoMo
def fasta_to_cf(fastaFN, countsFN, splitChar="-", chromName="NA", double_fixed_sites=False):
    """Convert fasta to counts format.

    The (aligned) sequences in the fasta file are read in and the data
    is written to a counts format file.

    Sequence names are cut at the last occurrence of `splitChar`; the
    part in front of it is used as population name.  If the stripped
    sequence names coincide, individuals are put into the same
    population.

    E.g., homo_sapiens-XXX and homo_sapiens-YYY will be in the same
    population homo_sapiens.

    Take care with large files, this uses a lot of memory (a copy of
    every sequence is kept in memory at the same time).

    The input as well as the output files can additionally be gzipped
    (indicated by a .gz file ending).

    :param str fastaFN: Name of the fasta file that is read.
    :param str countsFN: Name of the counts file that is written.
    :param str splitChar: Separator between population name and
      individual identifier in the sequence names.
    :param str chromName: Chromosome name assigned to every site.
    :param bool double_fixed_sites: Set to true if heterozygotes are
      encoded with IUPAC codes.  Then, fixed sites will be counted twice
      so that the level of polymorphism stays correct.

    :raises SequenceDataError: If the sequences differ in length.

    """

    FaStr = fasta.init_seq(fastaFN)
    try:
        logging.debug("Read in fasta file %s.", fastaFN)
        seqL = [copy.deepcopy(FaStr.seq)]
        while FaStr.read_next_seq() is not None:
            seqL.append(copy.deepcopy(FaStr.seq))
    finally:
        # Only the deep copies in seqL are used from here on, so the
        # stream can be released right away, even if reading failed.
        FaStr.close()

    nSeqs = len(seqL)
    logging.debug("Number of sequences: %s", nSeqs)

    # Reduce every sequence name to its population name.
    for s in seqL:
        s.name = s.name.rsplit(splitChar, maxsplit=1)[0]

    logging.debug("Checking sequence lengths.")
    nSites = seqL[0].dataLen
    for s in seqL[1:]:
        if nSites != s.dataLen:
            raise sb.SequenceDataError("Sequences do not have equal length.")

    logging.debug("Creating assignment list.")
    # nameL lists the populations in order of first appearance; assL
    # holds, for every sequence, the index of its population in nameL.
    nameL = []
    assL = []
    popIndex = {}
    for s in seqL:
        idx = popIndex.get(s.name)
        if idx is None:
            idx = len(nameL)
            popIndex[s.name] = idx
            nameL.append(s.name)
        assL.append(idx)
    nPops = len(nameL)

    logging.debug("Number of Populations: %s", nPops)
    logging.debug("Number of Sites: %s", nSites)
    logging.debug("Populations: %s", nameL)
    logging.debug("Assignment list: %s", assL)

    cfw = CFWriter([], countsFN)
    try:
        logging.debug("Manually initializing CFWriter.")
        cfw.nL = nameL
        cfw.nPop = nPops
        cfw.write_HLn()

        # Loop over sites.
        for i in range(nSites):
            cfw.purge_cD()
            cfw.pos = i
            cfw.chrom = chromName
            # Loop over sequences / individuals.
            for popI, s in zip(assL, seqL):
                base = s.data[i].lower()
                cfw.add_base_to_sequence(popI, base, double_fixed_sites)
            cfw.write_Ln()
    finally:
        # Close the writer even when a site could not be written.
        cfw.close()
Exemplo n.º 6
0
# Set up the counts file writer.  When merging is requested, every VCF
# file is flagged for merging and named after the part of its file name
# in front of the first dot.
if args.merge is not None:
    nameList = []
    for fn in vcfFnL:
        nameList.append(fn.split('.', maxsplit=1)[0])
    mergeList = [True] * len(nameList)
    cfw = cf.CFWriter(vcfFnL, output,
                      mergeL=mergeList, nameL=nameList, verb=vb)
else:
    cfw = cf.CFWriter(vcfFnL, output, verb=vb)

if ploidy is not None:
    cfw.set_ploidy(int(ploidy))

# Determine the region from the first sequence of the reference; the
# offset is only passed on when one was given.
faR = fa.init_seq(fastaRef)
rg = (faR.seq.get_region_no_description() if offset is None
      else faR.seq.get_region_no_description(offset))

# Write the header line followed by the data of the region.
cfw.set_seq(faR.seq)
cfw.write_HLn()
cfw.write_Rn(rg)

cfw.close()
faR.close()
Exemplo n.º 7
0
import libPoMo.fasta as fa  # noqa

# Manual test script for the fasta module: every step prints what it is
# about to do and then calls the corresponding `fa` function.
test_sequence = 'data/fasta-sample2.dat'
print("Testing libPoMo/fasta module.")

# Step 1: open the sample file with fa.open_seq and print its info.
print("\n##################################################")
print("Read in test sequence " + test_sequence + ".")
seq = fa.open_seq(test_sequence)

print("Print sequence information.")
seq.print_info()

# Step 2: walk through the same file with the stream object returned
# by fa.init_seq until read_next_seq() returns None.
print("\n##################################################")
print("Test FaStream object.")
faStr = fa.init_seq(test_sequence)
faStr.print_info(maxB=None)
while True:
    if faStr.read_next_seq() is None:
        break
    faStr.print_info()
faStr.close()

# Step 3: open the wolf sample and its reference, then save the
# comparison of the sample against reference sequence 0 as VCF to `fn`.
print("\n##################################################")
test_sequence = "data/fasta-sample-wolfs.dat"
ref_sequence = "data/fasta-reference-wolf.dat"
fn = "vcf-test-tmp.dat"
print("Compare " + test_sequence + " to " + ref_sequence + ".")
faSeq = fa.open_seq(test_sequence)
faRef = fa.open_seq(ref_sequence)
refSeq = faRef.get_seq_by_id(0)
fa.save_as_vcf(faSeq, refSeq, fn)