def fasta_to_cf(fastaFN, countsFN, splitChar='-', chromName="NA",
                double_fixed_sites=False):
    """Convert fasta to counts format.

    The (aligned) sequences in the fasta file are read in and the data
    is written to a counts format file.

    Every sequence name is cut at the *last* occurrence of `splitChar`
    and only the part in front of it is kept.  If the stripped
    sequence names coincide, individuals are put into the same
    population.  E.g., homo_sapiens-XXX and homo_sapiens-YYY will be
    in the same population homo_sapiens.

    Take care with large files, this uses a lot of memory because all
    sequences are copied into a list before anything is written.

    The input as well as the output files can additionally be gzipped
    (indicated by a .gz file ending).

    :param str fastaFN: Name of the fasta file to read.
    :param str countsFN: Name of the counts file to write.
    :param str splitChar: Separator between population name and
      individual identifier in the sequence names.
    :param str chromName: Chromosome name written for every site.
    :param bool double_fixed_sites: Set to true if heterozygotes are
      encoded with IUPAC codes.  Then, fixed sites will be counted
      twice so that the level of polymorphism stays correct.

    :raises sb.SequenceDataError: If the sequences do not all have
      the same length.

    """
    FaStr = fasta.init_seq(fastaFN)
    # Close the fasta stream even if reading fails; the copied
    # sequences do not depend on the stream afterwards.
    try:
        logging.debug("Read in fasta file %s.", fastaFN)
        seqL = [copy.deepcopy(FaStr.seq)]
        while FaStr.read_next_seq() is not None:
            seqL.append(copy.deepcopy(FaStr.seq))
    finally:
        FaStr.close()

    nSeqs = len(seqL)
    logging.debug("Number of sequences: %s", nSeqs)

    # Reduce every sequence name to its population name.
    for s in seqL:
        s.name = s.name.rsplit(splitChar, maxsplit=1)[0]

    logging.debug("Checking sequence lengths.")
    nSites = seqL[0].dataLen
    for s in seqL[1:]:
        if nSites != s.dataLen:
            raise sb.SequenceDataError("Sequences do not have equal length.")

    logging.debug("Creating assignment list.")
    # nameL: population names in order of first appearance.
    # assL:  for every sequence, the index of its population in nameL.
    # popIndexD gives constant-time lookup instead of list.index().
    popIndexD = {}
    nameL = []
    assL = []
    for s in seqL:
        popI = popIndexD.get(s.name)
        if popI is None:
            popI = len(nameL)
            popIndexD[s.name] = popI
            nameL.append(s.name)
        assL.append(popI)

    nPops = len(nameL)
    logging.debug("Number of Populations: %s", nPops)
    logging.debug("Number of Sites: %s", nSites)
    logging.debug("Populations: %s", nameL)
    logging.debug("Assignment list: %s", assL)

    cfw = CFWriter([], countsFN)
    # Make sure the output is closed even when writing a line fails.
    try:
        logging.debug("Manually initializing CFWriter.")
        cfw.nL = nameL
        cfw.nPop = len(nameL)
        cfw.write_HLn()

        # Loop over sites.
        for i in range(nSites):
            cfw.purge_cD()
            cfw.pos = i
            cfw.chrom = chromName
            # Loop over sequences / individuals.
            for popI, s in zip(assL, seqL):
                base = s.data[i].lower()
                cfw.add_base_to_sequence(popI, base, double_fixed_sites)
            cfw.write_Ln()
    finally:
        cfw.close()
# NOTE(review): this span starts inside an if/else whose `if` header lies
# above the visible part of the file; the first statement below is the body
# of that `if`.  Confirm the condition against the full script.
    ploidy = args.ploidy[0]
else:
    ploidy = None

# Create the counts-format writer.  Without a merge option the VCF files
# are passed on as they are; with it, every file gets a merge flag of True
# and a name taken from the file name up to its first '.'.
# NOTE(review): presumably mergeL makes CFWriter pool the individuals of a
# VCF file into one population -- confirm against cf.CFWriter.
if args.merge is None:
    cfw = cf.CFWriter(vcfFnL, output, verb=vb)
else:
    mergeList = []
    nameList = []
    for fn in vcfFnL:
        mergeList.append(True)
        nameList.append(fn.split('.', maxsplit=1)[0])
    cfw = cf.CFWriter(vcfFnL, output,
                      mergeL=mergeList, nameL=nameList, verb=vb)

# Only override the writer's ploidy when a value was given.
if ploidy is not None:
    cfw.set_ploidy(int(ploidy))

# Open the fasta reference as a stream positioned on its first sequence.
faR = fa.init_seq(fastaRef)

# Determine the region of the reference sequence; the offset is only
# forwarded when one was supplied.
if offset is None:
    rg = faR.seq.get_region_no_description()
else:
    rg = faR.seq.get_region_no_description(offset)

# Write the header line, then the lines for the region.
cfw.set_seq(faR.seq)
cfw.write_HLn()
cfw.write_Rn(rg)

# Release the writer and the reference stream.
cfw.close()
faR.close()
test_sequence = 'data/fasta-sample2.dat'


def _print_divider():
    """Print the blank line and hash rule that separates test sections."""
    print("\n##################################################")


print("Testing libPoMo/fasta module.")

# Section 1: read the whole test file at once and show its summary.
_print_divider()
print("Read in test sequence ", test_sequence, '.', sep='')
seq = fa.open_seq(test_sequence)
print("Print sequence information.")
seq.print_info()

# Section 2: walk through the same file sequence by sequence.
_print_divider()
print("Test FaStream object.")
faStr = fa.init_seq(test_sequence)
faStr.print_info(maxB=None)
while True:
    if faStr.read_next_seq() is None:
        break
    faStr.print_info()
faStr.close()

# Section 3: load a sample and a reference file for comparison.
_print_divider()
test_sequence = "data/fasta-sample-wolfs.dat"
ref_sequence = "data/fasta-reference-wolf.dat"
fn = "vcf-test-tmp.dat"
print("Compare ", test_sequence, " to ", ref_sequence, '.', sep='')
faSeq = fa.open_seq(test_sequence)
faRef = fa.open_seq(ref_sequence)
refSeq = faRef.get_seq_by_id(0)
def fasta_to_cf(fastaFN, countsFN, splitChar='-', chromName="NA"):
    """Convert fasta to counts format.

    The (aligned) sequences in the fasta file are read in and the data
    is written to a counts format file.

    Every sequence name is cut at the *last* occurrence of `splitChar`
    and only the part in front of it is kept.  If the stripped
    sequence names coincide, individuals are put into the same
    population.  E.g., homo_sapiens-XXX and homo_sapiens-YYY will be
    in the same population homo_sapiens.

    Take care with large files, this uses a lot of memory because all
    sequences are copied into a list before anything is written.

    The input as well as the output files can additionally be gzipped
    (indicated by a .gz file ending).

    :param str fastaFN: Name of the fasta file to read.
    :param str countsFN: Name of the counts file to write.
    :param str splitChar: Separator between population name and
      individual identifier in the sequence names.
    :param str chromName: Chromosome name written for every site.

    :raises sb.SequenceDataError: If the sequences do not all have
      the same length.
    :raises sb.NotAValidRefBase: If a (lower-cased) base is not a key
      of the module-level `dna` mapping.

    """
    FaStr = fasta.init_seq(fastaFN)
    # Close the fasta stream even if reading fails; the copied
    # sequences do not depend on the stream afterwards.
    try:
        logging.info("Read in fasta file.")
        seqL = [copy.deepcopy(FaStr.seq)]
        while FaStr.read_next_seq() is not None:
            seqL.append(copy.deepcopy(FaStr.seq))
    finally:
        FaStr.close()

    nSeqs = len(seqL)
    logging.info("Number of sequences: %s", nSeqs)

    # Reduce every sequence name to its population name.
    for s in seqL:
        s.name = s.name.rsplit(splitChar, maxsplit=1)[0]

    logging.info("Checking sequence lengths")
    nSites = seqL[0].dataLen
    for s in seqL[1:]:
        if nSites != s.dataLen:
            raise sb.SequenceDataError("Sequences do not have equal length.")

    logging.info("Creating assignment list.")
    # nameL: population names in order of first appearance.
    # assL:  for every sequence, the index of its population in nameL.
    # popIndexD gives constant-time lookup instead of list.index().
    popIndexD = {}
    nameL = []
    assL = []
    for s in seqL:
        popI = popIndexD.get(s.name)
        if popI is None:
            popI = len(nameL)
            popIndexD[s.name] = popI
            nameL.append(s.name)
        assL.append(popI)

    nPops = len(nameL)
    logging.info("Number of Populations: %s", nPops)
    logging.info("Number of Sites: %s", nSites)
    logging.info("Populations: %s", nameL)
    logging.info("Assignment list: %s", assL)

    cfw = CFWriter([], countsFN)
    # Make sure the output is closed even when a site cannot be written.
    try:
        logging.warning("Manually initializing CFWriter.")
        cfw.nL = nameL
        cfw.nPop = len(nameL)
        cfw.write_HLn()

        # Loop over sites.
        for i in range(nSites):
            cfw.purge_cD()
            cfw.pos = i
            cfw.chrom = chromName
            # Loop over sequences / individuals.
            for popI, s in zip(assL, seqL):
                base = s.data[i].lower()
                try:
                    baseI = dna[base]
                except KeyError:
                    raise sb.NotAValidRefBase()
                # Count this individual's base for its population.
                cfw.cD[popI][baseI] += 1
            cfw.write_Ln()
    finally:
        cfw.close()
def fasta_to_cf(fastaFN, countsFN, splitChar="-", chromName="NA",
                double_fixed_sites=False):
    """Convert fasta to counts format.

    The (aligned) sequences in the fasta file are read in and the data
    is written to a counts format file.

    Every sequence name is cut at the *last* occurrence of `splitChar`
    and only the part in front of it is kept.  If the stripped
    sequence names coincide, individuals are put into the same
    population.  E.g., homo_sapiens-XXX and homo_sapiens-YYY will be
    in the same population homo_sapiens.

    Take care with large files, this uses a lot of memory because all
    sequences are copied into a list before anything is written.

    The input as well as the output files can additionally be gzipped
    (indicated by a .gz file ending).

    :param str fastaFN: Name of the fasta file to read.
    :param str countsFN: Name of the counts file to write.
    :param str splitChar: Separator between population name and
      individual identifier in the sequence names.
    :param str chromName: Chromosome name written for every site.
    :param bool double_fixed_sites: Set to true if heterozygotes are
      encoded with IUPAC codes.  Then, fixed sites will be counted
      twice so that the level of polymorphism stays correct.

    :raises sb.SequenceDataError: If the sequences do not all have
      the same length.

    """
    FaStr = fasta.init_seq(fastaFN)
    # The stream is only needed while copying the sequences; release it
    # on success and on failure alike.
    try:
        logging.debug("Read in fasta file %s.", fastaFN)
        seqL = [copy.deepcopy(FaStr.seq)]
        while FaStr.read_next_seq() is not None:
            seqL.append(copy.deepcopy(FaStr.seq))
    finally:
        FaStr.close()

    nSeqs = len(seqL)
    logging.debug("Number of sequences: %s", nSeqs)

    # Keep only the population part of every sequence name.
    for s in seqL:
        s.name = s.name.rsplit(splitChar, maxsplit=1)[0]

    logging.debug("Checking sequence lengths.")
    nSites = seqL[0].dataLen
    for s in seqL[1:]:
        if nSites != s.dataLen:
            raise sb.SequenceDataError("Sequences do not have equal length.")

    logging.debug("Creating assignment list.")
    # nameL lists the populations in order of first appearance; assL maps
    # each sequence to its population index.  The dictionary avoids a
    # linear search through nameL for every sequence.
    popIndexD = {}
    nameL = []
    assL = []
    for s in seqL:
        popI = popIndexD.get(s.name)
        if popI is None:
            popI = len(nameL)
            popIndexD[s.name] = popI
            nameL.append(s.name)
        assL.append(popI)

    nPops = len(nameL)
    logging.debug("Number of Populations: %s", nPops)
    logging.debug("Number of Sites: %s", nSites)
    logging.debug("Populations: %s", nameL)
    logging.debug("Assignment list: %s", assL)

    cfw = CFWriter([], countsFN)
    # Close the counts file even if a line cannot be written.
    try:
        logging.debug("Manually initializing CFWriter.")
        cfw.nL = nameL
        cfw.nPop = len(nameL)
        cfw.write_HLn()

        # Loop over sites.
        for i in range(nSites):
            cfw.purge_cD()
            cfw.pos = i
            cfw.chrom = chromName
            # Loop over sequences / individuals.
            for popI, s in zip(assL, seqL):
                base = s.data[i].lower()
                cfw.add_base_to_sequence(popI, base, double_fixed_sites)
            cfw.write_Ln()
    finally:
        cfw.close()
# Create the counts-format writer.  With a merge option every VCF file gets
# a merge flag of True and a name taken from its file name up to the first
# '.'; without it the files are handed over unchanged.
if args.merge is not None:
    nameList = [fn.split('.', maxsplit=1)[0] for fn in vcfFnL]
    mergeList = [True] * len(nameList)
    cfw = cf.CFWriter(vcfFnL, output,
                      mergeL=mergeList, nameL=nameList, verb=vb)
else:
    cfw = cf.CFWriter(vcfFnL, output, verb=vb)

# A ploidy of None means "leave the writer's ploidy untouched".
if ploidy is not None:
    cfw.set_ploidy(int(ploidy))

# Open the fasta reference and determine its region; the offset is only
# forwarded when one was supplied.
faR = fa.init_seq(fastaRef)
rg = (faR.seq.get_region_no_description() if offset is None
      else faR.seq.get_region_no_description(offset))

# Write the header line followed by the lines for the region.
cfw.set_seq(faR.seq)
cfw.write_HLn()
cfw.write_Rn(rg)

# Release the writer first, then the reference stream.
cfw.close()
faR.close()
import libPoMo.fasta as fa  # noqa

test_sequence = 'data/fasta-sample2.dat'


def _print_divider():
    """Print the blank line and hash rule that separates test sections."""
    print("\n##################################################")


print("Testing libPoMo/fasta module.")

# Section 1: read the whole test file at once and show its summary.
_print_divider()
print("Read in test sequence ", test_sequence, '.', sep='')
seq = fa.open_seq(test_sequence)
print("Print sequence information.")
seq.print_info()

# Section 2: walk through the same file sequence by sequence.
_print_divider()
print("Test FaStream object.")
faStr = fa.init_seq(test_sequence)
faStr.print_info(maxB=None)
while True:
    if faStr.read_next_seq() is None:
        break
    faStr.print_info()
faStr.close()

# Section 3: compare a sample file against the first reference sequence
# and save the result as VCF.
_print_divider()
test_sequence = "data/fasta-sample-wolfs.dat"
ref_sequence = "data/fasta-reference-wolf.dat"
fn = "vcf-test-tmp.dat"
print("Compare ", test_sequence, " to ", ref_sequence, '.', sep='')
faSeq = fa.open_seq(test_sequence)
faRef = fa.open_seq(ref_sequence)
refSeq = faRef.get_seq_by_id(0)
fa.save_as_vcf(faSeq, refSeq, fn)