Example No. 1
    def test_exception(self):
        with self.assertRaises(TypeError):
            pyfastx.Fasta(flat_fasta, key_func=1)

        with self.assertRaises(FileExistsError):
            pyfastx.Fasta('a_file_not_exists')

        with self.assertRaises(ValueError):
            self.fastx.fetch('seq1', {'a': 1})

        with self.assertRaises(NameError):
            self.fastx.fetch('seq1', (1, 10))

        with self.assertRaises(ValueError):
            self.fastx.fetch(self.fastx[0].name, (1, 10, 20))

        with self.assertRaises(ValueError):
            self.fastx.fetch(self.fastx[0].name, (20, 10))

        with self.assertRaises(ValueError):
            self.fastx.fetch(self.fastx[0].name, [20, 10])

        with self.assertRaises(IndexError):
            _ = self.fastx[self.count]

        with self.assertRaises(KeyError):
            _ = self.fastx[list()]

        with self.assertRaises(ValueError):
            self.fastx.nl(101)
Example No. 2
    def setUp(self):
        self.fastx = pyfastx.Fasta(gzip_fasta)

        self.fasta = pyfastx.Fasta(flat_fasta)

        self.faidx = pyfaidx.Fasta(flat_fasta, sequence_always_upper=True)

        self.count = len(self.fastx)
Example No. 3
    def test_seq_type(self):
        #test dna format
        self.assertEqual(self.fastx.type, 'DNA')

        #test rna format
        rna = pyfastx.Fasta(rna_fasta)
        self.assertEqual(rna.type, "RNA")

        #test protein format
        prot = pyfastx.Fasta(protein_fasta)
        self.assertEqual(prot.type, "protein")
Example No. 4
def load_seqfile(infile):
    fxifile = infile + ".fxi"
    if os.path.exists(fxifile) and infile.endswith(FASTA_SUFFIX):
        seqfile = pyfastx.Fasta(infile, build_index=False)
    elif not os.path.exists(fxifile) and infile.endswith(FASTA_SUFFIX):
        seqfile = pyfastx.Fasta(infile, build_index=True)
    elif os.path.exists(fxifile) and infile.endswith(FASTQ_SUFFIX):
        seqfile = pyfastx.Fastq(infile, build_index=False)
    elif not os.path.exists(fxifile) and infile.endswith(FASTQ_SUFFIX):
        seqfile = pyfastx.Fastq(infile, build_index=True)
    else:
        raise ValueError("unrecognized sequence file suffix: {}".format(infile))
    return seqfile
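A minimal usage sketch for load_seqfile; FASTA_SUFFIX and FASTQ_SUFFIX are assumed to be suffix tuples defined elsewhere in the module (the values below are hypothetical):

import os
import pyfastx

# Hypothetical suffix constants; the original module defines its own.
FASTA_SUFFIX = (".fa", ".fasta", ".fa.gz", ".fasta.gz")
FASTQ_SUFFIX = (".fq", ".fastq", ".fq.gz", ".fastq.gz")

seqfile = load_seqfile("reads.fa")  # first call builds reads.fa.fxi
for record in seqfile:
    # (name, seq) tuples when build_index=False, Sequence objects otherwise
    pass

Note that pyfastx iteration yields plain tuples with build_index=False and record objects once an index is loaded, so callers see different record types depending on whether the .fxi file already existed.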
Example No. 5
	def setUp(self):
		self.fastx = pyfastx.Fasta(gzip_fasta, build_index=False)
		self.fastx.build_index()
		self.fastx.rebuild_index()

		#reload index
		self.fastx = pyfastx.Fasta(gzip_fasta)

		self.fasta = pyfastx.Fasta(flat_fasta)

		self.faidx = pyfaidx.Fasta(flat_fasta, sequence_always_upper=True)
		
		self.count = len(self.fastx)
Example No. 6
def fasta_sample(args):
	fa = pyfastx.Fasta(args.fastx)

	if args.num is not None and args.num > 0:
		seq_num = args.num
		if seq_num > len(fa):
			seq_num = len(fa)

	elif args.prop is not None and 0 < args.prop <= 1:
		seq_num = round(len(fa)*args.prop)
		if seq_num == 0:
			raise RuntimeError("the proportion is too small")

	else:
		raise RuntimeError("specify a right number for seq number or proportion")

	selected = random.sample(range(len(fa)), k=seq_num)

	if args.outfile is None:
		fw = sys.stdout
	else:
		fw = open(args.outfile, 'w')

	for idx in selected:
		s = fa[idx]
		fw.write(">{}\n{}\n".format(s.name, s.seq))

	if args.outfile is None:
		fw.flush()
	else:
		fw.close()
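fasta_sample expects an argparse-style namespace; a minimal sketch of driving it directly, assuming the imports the function needs (pyfastx, random, sys) are in scope and using a hypothetical file name:

from types import SimpleNamespace

# Sample 10% of the records from a hypothetical fasta file and write them to stdout.
args = SimpleNamespace(fastx="genome.fa", num=None, prop=0.1, outfile=None)
fasta_sample(args)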
Example No. 7
def fastx_subseq(args):
	fa = pyfastx.Fasta(args.fastx)

	if args.chr is not None:
		if args.chr not in fa:
			raise RuntimeError("no sequence named {} in fasta file".format(args.chr))

		subseq = fa[args.chr]

	else:
		if args.id <= 0:
			raise RuntimeError("sequence id must be a integer between 1 and {}".format(len(fa)))

		subseq = fa[args.id]

	if args.region:
		start, end = args.region.split(':')
		if start:
			start = int(start) - 1
		else:
			start = 0

		if end:
			end = int(end)
		else:
			end = len(subseq)

		sys.stdout.write("{}\n".format(subseq[start:end].seq))
	else:
		sys.stdout.write("{}\n".format(subseq.seq))
	
	sys.stdout.flush()
Example No. 8
def get_output_handle(fpath: str, fastx: bool = False, out: bool = True):

    if fpath == "-":
        if out:
            handle = sys.stdout
        else:
            handle = sys.stdin
    else:
        p = Path(fpath)
        if not p.parent.is_dir():
            raise NotADirectoryError(
                "Directory specified for output file does not exist: {}".format(
                    p.parent
                )
            )

        if fastx:
            # crude suffix check: ".fa"/".fasta" end in "a", fastq paths do not
            if fpath.endswith("a"):
                handle = pyfastx.Fasta(str(p))
            else:
                handle = pyfastx.Fastq(str(p))
        else:
            handle = p.open("w")

    return handle
Example No. 9
def main():
    # Configure argparser
    argparser = get_argparser()

    # Parse the arguments
    args = argparser.parse_args()

    # Input/output fasta file paths
    in_fasta = args.in_fasta
    out_fasta = args.out_fasta

    # Configure logging appropriate for verbosity
    configure_logging(args.verbosity_level)

    # Count total length of bases in fasta
    logging.info("Reading input fasta...")
    fa = pyfastx.Fasta(in_fasta)
    total_contigs = len(fa)
    total_bases = fa.size
    logging.info("Total input contigs: {0}".format(total_contigs))
    logging.info("Total input bases: {0}".format(total_bases))

    # Filter contigs and write output
    logging.info("Filtering fasta records")
    filter_fasta(fa, out_fasta)
Example No. 10
    def test_build(self):
        self.fastx = pyfastx.Fasta(gzip_fasta, build_index=False)

        if os.path.exists('{}.fxi'.format(gzip_fasta)):
            os.remove('{}.fxi'.format(gzip_fasta))

        self.fastx.build_index()
Example No. 11
	def generateHtml(self):
		sql = "SELECT * FROM primer,primer_meta WHERE id=pid AND id=%s" % self.id
		primer = self.db.get_row(sql)

		table = primer.category
		tid = primer.target

		#table, tid = primer.target.split('-')

		sql = "SELECT path FROM fasta LIMIT 1"

		fasta_file = self.db.get_one(sql.format(table, tid))
		self.fasta = pyfastx.Fasta(fasta_file)

		sql = "SELECT * FROM %s WHERE id=%s" % (table, tid)
		ssr = self.db.get_row(sql)
		seq, left, right = self.getSequence(ssr.sequence, ssr.start, ssr.end)

		tandem = "%s%s%s" % (
			self.formatPrimer(left, primer.start1, primer.length1),
			self.formatTarget(seq),
			self.formatPrimer(right, primer.start2-primer.length2-len(seq)-len(left)+1, primer.length2)
		)

		return template_render("sequence.html", tandem=tandem, ssr=ssr, table=self.table)
Example No. 12
def parse_fasta(fasta_file):
    busco_seqs = pyfastx.Fasta(fasta_file)
    ids = busco_seqs.keys()
    busco_names = pd.DataFrame(data=list(ids), columns=['seqNames'])
    busco_names = pd.DataFrame(busco_names.seqNames.str.split("_", n=1).tolist(),
                    columns=['buscoId', 'sampleId'])
    return busco_names, busco_seqs
Example No. 13
def fastx_info(args):
	fastx_type = fastx_format_check(args.fastx)

	if fastx_type == 'fasta':
		fa = pyfastx.Fasta(args.fastx)
		comp = fa.composition
		print("Sequence counts: {}".format(len(fa)))
		print("Total bases: {}".format(fa.size))
		print("GC content: {:.2f}%".format(fa.gc_content))
		for b in comp:
			print("{} counts: {}".format(b, comp[b]))
		print("Mean length: {:.2f}".format(fa.mean))
		print("Median length: {:.2f}".format(fa.median))
		print("Max length: {}".format(len(fa.longest)))
		print("Min length: {}".format(len(fa.shortest)))
		print("N50, L50: {}, {}".format(*fa.nl()))
		print("length >= 1000: {}".format(fa.count(1000)))

	elif fastx_type == 'fastq':
		fq = pyfastx.Fastq(args.fastx)
		comp = fq.composition
		print("Read counts: {}".format(len(fq)))
		print("Total bases: {}".format(fq.size))
		print("GC content: {:.2f}%".format(fq.gc_content))
		for b in comp:
			print("{} counts: {}".format(b, comp[b]))
		print("Quality encoding system maybe: {}".format(", ".join(fq.encoding_type)))
Example No. 14
def create_fastx_index(fastx):

    if is_fasta(fastx):
        return pyfastx.Fasta(str(fastx), build_index=True), build_read_fasta
    elif is_fastq(fastx):
        return pyfastx.Fastq(str(fastx), build_index=True), build_read_fastq
    else:
        raise ValueError(f'Could not determine input file format: {fastx}')
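is_fasta and is_fastq are not shown in this snippet; a suffix-based sketch of what they might do (hypothetical, the real helpers could sniff file contents instead) is:

from pathlib import Path

def _base_name(path) -> str:
    # Drop a trailing .gz so compressed files are classified by their inner suffix.
    name = Path(path).name
    return name[:-3] if name.endswith(".gz") else name

def is_fasta(path) -> bool:
    return _base_name(path).endswith((".fa", ".fasta", ".fna"))

def is_fastq(path) -> bool:
    return _base_name(path).endswith((".fq", ".fastq"))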
Example No. 15
def stat_query_mismatch(alnfile: str, reffile: str, sitesfile: str):
    """
    统计指定位点的错配情况
    alnfile为bam/cram文件
    reffile为对应的参考基因组文件
    sitesfile为待检测的位点,两列,gzip或bgzip压缩,第一列为染色体,第二列为坐标位置(1-based),vcf.gz文件符合这一格式,可以直接用vcf.gz作为输入
    """
    print(f'stat_query_mismatch: {alnfile}, {sitesfile}')
    # open the reference genome
    reffa = pyfastx.Fasta(reffile, uppercase=True)  # always uppercase, to simplify the comparison below
    
    # read the site list
    sites = []
    with gzip.open(sitesfile, 'rb') as f:
        for line in f:
            tline = line.decode().strip().split()
            if tline[0][0] != '#':
                sites.append([tline[0], int(tline[1])-1]) # pysam expects 0-based positions, so convert here
    
    # open the alignment file
    if alnfile.endswith('bam'):
        alnfile = pysam.AlignmentFile(alnfile, 'rb', threads=10)
    elif alnfile.endswith('.cram'):
        alnfile = pysam.AlignmentFile(alnfile, 'rc', reference_filename=reffile, threads=10)
    
    # iterate over the sites and tally mismatches
    reads2QMis = defaultdict(int) # mismatches per read at the query sites relative to the reference; with mismatch counts, filling missing values with 0 later is reasonable
    reads2nQuery = defaultdict(int) # number of query sites each read covers, to tell "no mismatch" apart from "no coverage of the query sites"
    
    for chrom, pos in sites:
        refbase = reffa[chrom][pos: pos+1].seq
        # stepper='all': skip reads in which any of the following flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
        for ncolumn, pileupcolumn in enumerate(alnfile.pileup(chrom, pos, pos+1, 
                                                              truncate=True, stepper='all', ignore_orphans=False,
                                                              min_base_quality=0, min_mapping_quality=0)):
            assert pileupcolumn.reference_pos == pos
            for pileupread in pileupcolumn.pileups:
                readname = pileupread.alignment.query_name
                reads2nQuery[readname] += 1
                if not pileupread.is_del:
                    # query_position is None if is_del or is_refskip is set; compare in uppercase (the fasta was already uppercased above)
                    if pileupread.alignment.query_sequence[pileupread.query_position].upper() != refbase:
                        reads2QMis[readname] += 1
                else:
                    reads2QMis[readname] += 1
    alnfile.close()
    
    maxnQuery = max(reads2nQuery.values())
    nQuery_dtype = select_min_dtype_uint(maxnQuery)
    reads2nQuery = pd.Series(reads2nQuery, dtype=nQuery_dtype)
    print(f'max(nQuery): {maxnQuery}, select dtype: {nQuery_dtype}')
    
    maxQMis = max(reads2QMis.values())
    QMis_dtype = select_min_dtype_uint(maxQMis)
    reads2QMis = pd.Series(reads2QMis, dtype=QMis_dtype)
    print(f'max(QMis): {maxQMis}, select dtype: {QMis_dtype}')

    return reads2nQuery, reads2QMis
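select_min_dtype_uint is assumed from the surrounding module; a plausible sketch consistent with its use above (smallest unsigned numpy dtype that can hold a maximum value) is:

import numpy as np

def select_min_dtype_uint(max_value: int):
    # Hypothetical helper: return the smallest unsigned dtype that fits max_value.
    for dtype in (np.uint8, np.uint16, np.uint32, np.uint64):
        if max_value <= np.iinfo(dtype).max:
            return dtype
    raise OverflowError(f"{max_value} exceeds the uint64 range")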
Example No. 16
    def build_fasta_index(self, fasta_id, fasta_path):
        '''
        Build index for fasta file and write fasta sequence to database
        @param fasta_id: int, the fasta file id in database
        @param fasta_path: str, the file path of fasta
        @return Fasta object
        '''
        #seqs = fasta.GzipFasta(fasta_path)
        self.emit_message("Building fasta index for %s" % fasta_path)

        with multiprocessing.Pool() as pool:
            pool.apply_async(build_full_index, (fasta_path, ))
            pool.close()
            pool.join()

        seqs = pyfastx.Fasta(fasta_path)

        #get sequence detail information
        #sql = "SELECT * FROM seq INNER JOIN fasta ON (seq.fid=fasta.id) WHERE fasta.path='{}' LIMIT 1".format(fasta_path)
        #if not self.db.get_one(sql):
        #	rows = []
        #	for seq in seqs:
        #		compos = seq.composition
        #		ns = sum(compos[b] for b in compos if b not in ['A', 'T', 'G', 'C'])
        #		row = (None, seq.name, fasta_id, len(seq), compos.get('G',0)+compos.get('C',0), ns)
        #		rows.append(row)
        #	self.db.insert("INSERT INTO seq VALUES (?,?,?,?,?,?)", rows)

        sql = "SELECT * FROM option WHERE name='gc_content'"
        if not self.db.get_one(sql):
            gc = seqs.gc_content
            compos = seqs.composition
            ns = sum(compos[b] for b in compos
                     if b not in ['A', 'T', 'G', 'C'])
            self.db.insert("INSERT INTO option (name, value) VALUES (?,?)",
                           [('total_base', str(seqs.size)),
                            ('total_seqs', str(len(seqs))),
                            ('gc_content', str(gc)), ('unkown_base', str(ns))])

        self.total_bases = seqs.size
        seqs = pyfastx.Fasta(fasta_path, build_index=False)
        return seqs
Example No. 17
def randomizeMatchingPositions():

    ########################
    #command line arguments#
    ########################

    parser = argparse.ArgumentParser()

    #MANDATORY PARAMETERS
    parser.add_argument("outfile",help="Output fasta-file name.",type=str)

    #OPTIONAL PARAMETERS
    parser.add_argument("--wt",help="Full path to a fasta-file containing the wild type sequence.",type=str)
    parser.add_argument("--seqs",help="Full path to the fasta-file containing the sequences where we want to randomize the positions matching to wild type.",type=str)
    parser.add_argument("--N",help="Exact number of mismatches in seqs needed for including to output.",type=int,default=2)
    parser.add_argument("--addToReadName",help="String added to read names to distinguish them from input reads (default=:randomized).",type=str,default=":randomized")
    parser.add_argument("--alphabet",help="Alphabet used as a string containing each possible character (case sensitive, default=ACGT).",type=str,default='ACGT')

    args = parser.parse_args()

    #read in the wild type sequence
    for name,seq in pyfastx.Fasta(args.wt, build_index=False):
        wtseq = seq

    #read in rest of the sequences, save to outfile those that have N mismatches to wtseq
    #and randomize other positions from them
    with open(args.outfile,'wt') as outfile:
        w = csv.writer(outfile,delimiter='\t')
        for name,seq in pyfastx.Fasta(args.seqs, build_index=False):
            rands = np.random.randint(0,high=len(args.alphabet),size=len(seq)) #draw the random sequence
            newseq = ""
            N_mismatch = 0 #mismatch counter
            for i in range(0,len(seq)):
                if seq[i]!=wtseq[i]:
                    N_mismatch += 1
                    newseq += seq[i]
                else: newseq += args.alphabet[rands[i]]
                if N_mismatch>args.N: break
            if N_mismatch==args.N:
                #save the sequence
                w.writerow(['>'+name+args.addToReadName])
                w.writerow([newseq])
Example No. 18
def get_total_seq_len(fasta_file):
    """Simple function that uses pyfastx to quickly read in a fasta,
    and then the sum of sequence lengths is returned
    """
    try:
        x = [
            len(seq) for h, seq in pyfastx.Fasta(fasta_file, build_index=False)
        ]
    except RuntimeError:
        x = [0]
    return sum(x)
Example No. 19
    def test_key_func(self):
        del self.fastx

        #remove previously created index file
        if os.path.exists("{}.fxi".format(gzip_fasta)):
            os.remove("{}.fxi".format(gzip_fasta))

        self.fastx = pyfastx.Fasta(gzip_fasta, key_func=lambda x: x.split()[1])
        idx = self.get_random_index()
        self.assertEqual(self.fastx[idx].name,
                         self.fastx[idx].description.split()[1])
Example No. 20
def build_index(infile):
    fxifile = infile + ".fxi"
    if os.path.exists(fxifile):
        print("fxi index is present")
    else:
        print("buliding fxi index for {}".format(infile))
        if infile.endswith((".fa", ".fa.gz", ".fasta", ".fasta.gz")):
            pyfastx.Fasta(infile)
        else:
            pyfastx.Fastq(infile)
        print("fxi index has been created for {}".format(infile))
Example No. 21
def main():
    if len(sys.argv) < 3:
        print(f"USAGE: {sys.argv[0]} <a.fa> <b.fa>")
        sys.exit(1)

    fp1 = sys.argv[1]
    fp2 = sys.argv[2]
    if CLEAN_IDX:
        for fp in (fp1, fp2):
            Path(fp + ".fxi").unlink(missing_ok=True)

    fa1 = pyfastx.Fasta(fp1)
    fa2 = pyfastx.Fasta(fp2)

    n_seqs = len(fa1)

    if n_seqs != len(fa2):
        raise ValueError("Different number of sequences in the two files")

    ids = sorted(fa1.keys())

    all_seq_ids_same = all(x == y for x, y in zip(ids, sorted(fa2.keys())))
    if not all_seq_ids_same:
        raise ValueError(
            "Sequence IDs are not identical between the two files")

    mtx = np.zeros(shape=(n_seqs, n_seqs), dtype=int)  # np.int was removed in NumPy 1.24

    for i, j in tqdm(product(range(n_seqs), range(n_seqs)),
                     total=n_seqs * n_seqs):
        dist = hamming(fa1[ids[i]].seq, fa2[ids[j]].seq)
        mtx[i][j] = dist

    print(DELIM.join(["sample", *ids]))
    for i, sample in enumerate(ids):
        row = DELIM.join(map(str, mtx[i]))
        print(f"{sample}{DELIM}{row}")

    if CLEAN_IDX:
        for fp in (fp1, fp2):
            Path(fp + ".fxi").unlink(missing_ok=True)
Example No. 22
	def generateHtml(self):
		sql = "SELECT path FROM fasta LIMIT 1"
		fasta_file = self.db.get_one(sql.format(self.table, self.id))
		self.fasta = pyfastx.Fasta(fasta_file)

		sql = "SELECT * FROM %s WHERE id=%s" % (self.table, self.id)
		ssr = self.db.get_row(sql)
		seq, left, right = self.getSequence(ssr.sequence, ssr.start, ssr.end)

		tandem = "%s%s%s" % (self.formatFlank(left), self.formatTarget(seq), self.formatFlank(right))

		return template_render("sequence.html", tandem=tandem, ssr=ssr, table=self.table)
Example No. 23
def chunkPolishSeq(chunkBarcodeWithReadIter, nanoporeReadPath, tempDirPath,
                   finalDirPath, penaltyPath, i, minimapPath, poaPath,
                   raconPath):
    nanoporeRead = pyfastx.Fasta(nanoporeReadPath)
    commandExecuted = list(
        map(polishSeq, chunkBarcodeWithReadIter, repeat(nanoporeRead),
            repeat(tempDirPath), repeat(finalDirPath), repeat(penaltyPath),
            repeat(minimapPath), repeat(poaPath), repeat(raconPath)))
    commandExecuted = ' ;\\\n'.join(commandExecuted)
    os.system(commandExecuted)
    if i % 100 == 0:
        logger.info(f'{i*100} reads processed')
Example No. 24
def generate_coverage(read1, read2, mapping, ref, pwid=0.95, ncpu=1, chunk_size=500000, quiet=False):

    if not quiet: print("Building index and data structures...")

    seq_cov = {}
    for name, seq in pyfastx.Fasta(ref, build_index=False):
        seq_cov[name] = np.zeros(len(seq), dtype=int)

    nreads = 0
    read_len = 0
    for r in mp.fastx_read(read1):
        nreads+=1
        read_len += len(r[1])
    read_len /= nreads
    min_chain_score = int(0.9*read_len)
    min_mis_match = int(read_len-pwid*read_len)

    a = mp.Aligner(ref, preset='sr', n_threads=ncpu, best_n=1000, min_chain_score=min_chain_score)  # load or build index 
    if not a: raise Exception("ERROR: failed to load/build index")

    def mpile(seqs):
        if seqs is None: return([])
        thrbuf = mp.ThreadBuffer()
        hits = []
        chrom=None
        for hit in a.map(seqs[1], buf=thrbuf):
            if (hit.NM<=min_mis_match) and ('S' not in hit.cigar_str) and ('H' not in hit.cigar_str):
                if chrom is None:
                    chrom=mapping[hit.ctg]
                    hits.append((hit.ctg, hit.r_st-1, hit.r_en))
                elif mapping[hit.ctg] == chrom:
                    hits.append((hit.ctg, hit.r_st-1, hit.r_en))
                else:
                    break
        return(hits)

    if not quiet: print("Aligning reads...")
    pool = ThreadPool(ncpu)
    for reads in tqdm(grouper(chain(
        mp.fastx_read(read1),
        mp.fastx_read(read2)), chunk_size), 
        total=int(1+2*nreads/chunk_size), disable=quiet):
        hits = pool.map(mpile, reads)
        for hit in chain.from_iterable(hits):
            if hit is None: continue
            seq_cov[hit[0]][hit[1]:hit[2]] += 1

    #close the pool and wait for the work to finish
    pool.close()
    pool.join()

    return(seq_cov)
Example No. 25
def fasta_split(args):
	fa = pyfastx.Fasta(args.fastx)

	if args.seq_count:
		parts_num = math.ceil(len(fa)/args.seq_count)
	else:
		parts_num = args.file_num

	name, suffix1 = os.path.splitext(os.path.basename(args.fastx))

	if fa.is_gzip:
		name, suffix2 = os.path.splitext(name)

	digit = len(str(parts_num))
	lens = [0] * parts_num
	
	if args.seq_count:
		seqs = [0] * parts_num

	fhs = []

	for i in range(1, parts_num+1):
		if fa.is_gzip:
			subfile = "{}.{}{}{}".format(name, str(i).zfill(digit), suffix2, suffix1)
		else:
			subfile = "{}.{}{}".format(name, str(i).zfill(digit), suffix1)

		if args.outdir is not None:
			subfile = os.path.join(args.outdir, subfile)

		if fa.is_gzip:
			fh = gzip.open(subfile, 'wt')
		else:
			fh = open(subfile, 'w')

		fhs.append(fh)

	ids = fa.keys()
	for chrom in ids.sort('length', reverse=True):
		idx = min_index(lens)
		fhs[idx].write(">%s\n%s\n" % (chrom, fa[chrom].seq))
		lens[idx] += len(fa[chrom])

		if args.seq_count:
			seqs[idx] += 1

			if seqs[idx] == args.seq_count:
				lens[idx] = fa.size

	for fh in fhs:
		fh.close()
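min_index is assumed to be defined elsewhere; a one-line sketch consistent with its use above (index of the smallest bin, for greedy balancing) is:

def min_index(lst):
    # Return the index of the smallest value, i.e. the least-filled output file.
    return lst.index(min(lst))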
Example No. 26
def main(args):
    with open(args.prefix + ".fasta", "w") as O:
        with open(args.prefix + ".meta.csv", "w") as M:
            writer = csv.DictWriter(M,
                                    fieldnames=[
                                        "id", "iso_a3", "country", "continent",
                                        "date", "seqlen", "missing_fraction"
                                    ])
            writer.writeheader()
            for entry in tqdm(pyfastx.Fasta(args.fasta, full_name=True)):
                seqname = entry.name
                # print(seqname)
                if check_for_disallowed_countries(seqname): continue
                meta = seqname.split("|")
                if len(meta) != 3: continue
                if len(meta[0].split("/")) == 1: continue

                country = meta[0].split("/")[1]
                country = country2country.get(country, country)
                iso_a3 = country2iso_a3[country2country.get(country, country)]
                if country == "": continue
                continent = country2continent[country]

                date = meta[2]
                if not date_qc(date): continue

                if entry.end < args.seqlen: continue

                missing_chars = sum([
                    n for d, n in entry.composition.items()
                    if d.upper() not in acgt
                ])
                if missing_chars / entry.end > args.missing: continue

                seqid = meta[1]

                writer.writerow({
                    "id": seqid,
                    "country": country,
                    "continent": continent,
                    "iso_a3": iso_a3,
                    "date": date,
                    "seqlen": entry.end,
                    "missing_fraction": missing_chars / entry.end,
                })

                seq = list(entry.seq)
                for pos in [i for i, n in enumerate(seq) if n not in acgt]:
                    seq[pos] = "N"
                seq = "".join(seq)
                O.write(">%s\n%s\n" % (seqid, seq))
Example No. 27
def main():
    args = get_options()

    print("Calling genes from reads is still under active development and may change frequently!")

    # make sure trailing forward slash is present
    args.output_dir = os.path.join(args.output_dir, "")

    # create temporary directory
    temp_dir = os.path.join(tempfile.mkdtemp(dir=args.output_dir), "")

    # check files exist

    # clean database and remove sequences of 100 bp or shorter.
    temp_db = temp_dir + "temp_db.fasta"
    mapping = {}
    mapping_clust = {}
    with open(temp_db, 'w') as outfile:
        index = 0
        for name, seq in pyfastx.Fasta(args.db, build_index=False):
            if len(seq)<=100: continue 
            outfile.write('>' + str(index) + '\n' + seq + '\n')
            mapping[str(index)] = name
            mapping_clust[str(index)] = name.split("__")[0]
            index += 1

    # align reads
    coverage = generate_coverage(
            read1=args.r1, 
            read2=args.r2,
            ref=temp_db,
            mapping=mapping_clust,
            pwid=args.pwid, 
            ncpu=args.n_cpu, 
            chunk_size=500000, 
            quiet=args.quiet)

    # call genes and write output
    prefix = os.path.basename(args.r1).split('.')[0].strip('1').strip('_')
    prefix += '_' + os.path.basename(args.db).split('.')[0]
    find_genes(coverage, mapping, 
        cov_threshold=args.min_cov,
        prefix=prefix, 
        outdir=args.output_dir, 
        fold_threshold=args.min_fold, 
        quiet=args.quiet)

    # clean up
    shutil.rmtree(temp_dir)
    
    return
Example No. 28
def main():
    # Parse arguments: Requires n_words and outfile destination
    parser = argparse.ArgumentParser(
        description='Produce Byte Pair Encoder trained from .FASTA file.')

    parser.add_argument('-i',
                        metavar='INFILE',
                        type=str,
                        help='Path of FASTA file for training model.')

    args = parser.parse_args()

    assert args.i
    OUTFILE = "{}mer_compare_bpe_dna_wordsize_256.model"

    # Load all seqs into memory. This may be an issue depending on the machine.
    fa = pyfastx.Fasta(args.i)
    #Make everything upper case DNA
    #seqs = ['{}'.format(record.seq).upper().replace('U', 'T') for record in fa]
    seqs = ['{}'.format(record.seq).upper() for record in fa]

    k_compares = [6, 8]
    #Calculate target vocabulary sizes, in descending order
    k_compares = sorted(k_compares, reverse=True)
    vocab_sizes = [4**k for k in k_compares]

    #Calculation loop
    # SentencePiece does not appear to have a natural way of continuing
    # training from a checkpoint.
    # Therefore, approach here is train -> encode -> train etc...

    for i, vocab_size in enumerate(vocab_sizes):
        #Create iterable for model input
        s_iter = iter(seqs)
        #Create bytes stream for model output
        model = BytesIO()
        #Train encoder.
        spm.SentencePieceTrainer.train(sentence_iterator=s_iter,
                                       model_writer=model,
                                       vocab_size=vocab_size,
                                       hard_vocab_limit="False",
                                       max_sentencepiece_length=256)
        #Save the model
        with open(OUTFILE.format(k_compares[i]), 'wb') as f:
            f.write(model.getvalue())

        #Encode the corpus to the reduced vocabulary
        sp = spm.SentencePieceProcessor(model_proto=model.getvalue())

    print("DONE.")
Example No. 29
def create_fastx_index(fastx: Path) -> (pyfastx.Fasta, Path):

    if is_fasta(fastx):
        return pyfastx.Fasta(
            str(fastx), build_index=True
        ), Path(str(fastx) + '.fxi')
    elif is_fastq(fastx):
        return pyfastx.Fastq(
            str(fastx), build_index=True
        ), Path(str(fastx) + '.fxi')
    else:
        raise ValueError(
            f'Could not determine input file format: {fastx}'
        )
Example No. 30
    def process(self):
        self.emit_message("Exporting fasta sequence to %s" % self.outfile)

        table_name = self.model.tableName()

        whole_ssrs = self.db.get_one("SELECT COUNT(1) FROM %s" % table_name)
        total_ssrs = whole_ssrs

        if self.selected == 'whole' or len(self.model.selected) == whole_ssrs:
            sql = "SELECT * FROM {}".format(table_name)
        else:
            ids = sorted(self.model.selected)
            total_ssrs = len(ids)
            sql = "SELECT * FROM {} WHERE id IN ({})".format(
                table_name, ",".join(map(str, ids)))

        current = 0
        progress = 0
        prev_progress = 0
        current_seq = None
        current_name = None

        with open(self.outfile, 'wt') as fp:
            for item in self.db.query(sql):
                if item.sequence != current_name:
                    sql = "SELECT path FROM fasta LIMIT 1"
                    seqfile = self.db.get_one(sql)
                    seqs = pyfastx.Fasta(seqfile)
                    current_seq = seqs[item.sequence].seq
                    current_name = item.sequence

                start = item.start - self.flank
                if start < 1:
                    start = 1
                end = item.end + self.flank
                #ssr = seqs.fetch(item.sequence, (start, end))
                ssr = current_seq[start - 1:end]
                name = ">{}{} {}:{}-{}|motif:{}".format(
                    table_name.upper(), item.id, item.sequence, item.start,
                    item.end, item.motif)
                fp.write("{}\n{}".format(name, format_fasta_sequence(ssr, 70)))

                current += 1
                progress = int(current / total_ssrs * 100)
                if progress > prev_progress:
                    self.emit_progress(progress)
                    prev_progress = progress

        self.emit_finish("Successfully exported to fasta %s" % self.outfile)
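format_fasta_sequence comes from the surrounding codebase; a sketch consistent with its use above (wrap a sequence into fixed-width, newline-terminated lines) is:

def format_fasta_sequence(seq, width=70):
    # Wrap the sequence into lines of at most `width` characters, each newline-terminated.
    return "".join(seq[i:i + width] + "\n" for i in range(0, len(seq), width))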