예제 #1
0
파일: umitrans.py 프로젝트: bjpop/umitrans
def process_files(options):
    """Write copies of the sequence FASTQ files with the UMI appended to read names.

    The UMI file (``options.umi``) is read in lockstep with every file in
    ``options.seq``. For each position, a sequence record whose name equals
    the UMI record's name is written to the matching output file with
    ``options.sep`` and the UMI sequence appended to its name. A sequence
    record whose name differs from the UMI record's name is not written.

    Output paths are produced by ``make_output_filename``. ``exit_with_error``
    is called with ``EXIT_FILE_IO_ERROR`` when the inputs do not contain the
    same number of records or when a record does not have four fields.
    """
    # Local import so this function does not depend on the module's import block.
    from contextlib import ExitStack

    input_filenames = [options.umi] + options.seq
    # FASTX does not appear to provide a proper context manager for files, so
    # the input readers cannot be closed explicitly here.
    # Issue has been created on GitHub: https://github.com/lmdu/pyfastx/issues/27
    input_files = [pyfastx.Fastx(fname) for fname in input_filenames]
    output_filenames = [make_output_filename(name) for name in options.seq]

    # ExitStack closes every output file that was opened, even if a later
    # open() fails or the loop is left through an exception / SystemExit.
    with ExitStack() as stack:
        output_files = [
            stack.enter_context(open(fname, "w")) for fname in output_filenames
        ]
        # zip_longest pads exhausted inputs with None, which is how a
        # record-count mismatch between the files is detected below.
        # input_files always holds at least the UMI reader, so records[0]
        # always exists.
        for records in zip_longest(*input_files):
            umi_record = records[0]
            if umi_record is not None and len(umi_record) == 4:
                umi_name, umi_seq, _umi_qual, _umi_comment = umi_record
            elif umi_record is None:
                exit_with_error("Input FASTQ files do not have the same number of records", EXIT_FILE_IO_ERROR)
            else:
                exit_with_error(f"Badly formed UMI record in input UMI FASTQ file: {umi_record}", EXIT_FILE_IO_ERROR)
            fastq_records = records[1:]
            for output_file, this_record in zip(output_files, fastq_records):
                if this_record is not None and len(this_record) == 4:
                    this_name, this_seq, this_qual, this_comment = this_record
                    # Only records that pair up with the UMI record by name
                    # are emitted.
                    if this_name == umi_name:
                        new_name = this_name + options.sep + umi_seq
                        print(f"@{new_name} {this_comment}\n{this_seq}\n+\n{this_qual}", file=output_file)
                elif this_record is None:
                    exit_with_error("Input FASTQ files do not have the same number of records", EXIT_FILE_IO_ERROR)
                else:
                    exit_with_error(f"Badly formed FASTQ record in input FASTQ file: {this_record}", EXIT_FILE_IO_ERROR)
예제 #2
0
def find_tandem_repeats_with_multicore(args):
	"""Search for tandem repeats using a pool of ``args.threads`` worker processes.

	Sequences read from ``args.fasta`` are pushed onto a bounded shared queue
	that the workers consume. Once every sequence has been queued the shared
	event is set and the pool is closed and joined. Afterwards any per-worker
	file named ``<args.out_file>.<worker id>`` is appended to ``args.out_file``
	and then deleted (presumably these are written by the workers; confirm in
	``find_tandem_repeats_worker``).
	"""
	worker_count = args.threads
	pool = mp.Pool(worker_count)
	manager = mp.Manager()
	# Bounded queue: put() below blocks while the queue is full.
	tasks = manager.Queue(worker_count*2)
	event = manager.Event()

	# Zero-padded worker ids, all of the same width.
	width = len(str(worker_count))
	worker_ids = [str(index).zfill(width) for index in range(worker_count)]

	# start the workers
	for work_id in worker_ids:
		pool.apply_async(find_tandem_repeats_worker, (args, tasks, event, work_id))

	# feed the queue
	for record in pyfastx.Fastx(args.fasta, uppercase=True):
		name, seq, _ = record
		tasks.put((name, seq), block=True, timeout=None)

	# The event is set only after the last sequence has been queued.
	event.set()
	pool.close()
	pool.join()

	# merge the per-worker files in worker-id order
	with open(args.out_file, 'wb') as merged:
		for work_id in worker_ids:
			temp_file = "{}.{}".format(args.out_file, work_id)

			if not os.path.isfile(temp_file):
				continue

			with open(temp_file, 'rb') as part:
				shutil.copyfileobj(part, merged)

			# the partial file is no longer needed once copied
			os.remove(temp_file)
예제 #3
0
    def test_fastq_iter(self):
        """Check Fastx iteration over the gzipped FASTQ against the flat FASTQ file.

        The flat file is parsed into ``name -> [comment, sequence, quality]``
        and every record yielded by ``pyfastx.Fastx`` must match its entry.
        """
        reads = {}
        with open(flat_fastq) as fh:
            # Parse by position (four lines per record) instead of by leading
            # character: a quality string may itself begin with '@', and a
            # separator line may carry more than a bare '+'. Passing the same
            # file iterator four times makes zip() consume four lines per
            # step; a trailing incomplete group is dropped.
            for header, seq_line, _separator, qual_line in zip(*[fh] * 4):
                fields = header.split()
                name = fields[0][1:]
                comment = ' '.join(fields[1:])
                reads[name] = [comment, seq_line.strip(), qual_line.strip()]

        for name, seq, qual, comment in pyfastx.Fastx(gzip_fastq, "fastq"):
            r = reads[name]

            self.assertEqual(r[0], comment)
            self.assertEqual(r[1], seq)
            self.assertEqual(r[2], qual)
예제 #4
0
파일: workers.py 프로젝트: lmdu/krait2
    def process(self):
        """Search every sequence of every input file and store the results.

        For each entry of ``self.fastas`` a table is created through
        ``DB.create_table``, the file at ``fasta[4]`` is iterated with
        pyfastx, each sequence is searched via ``self.search`` in a one-process
        pool, and the resulting rows are inserted with ``DB.insert_rows``.
        Status, message and progress updates are emitted along the way;
        progress is an integer percentage and is only emitted when it grows.
        """
        progress = 0
        processed_file = 0

        with multiprocessing.Pool(1) as pool:
            for fasta in self.fastas:
                self.findex = fasta[0]
                total_size = fasta[2]
                # Reset per file: total_size is per file, so a counter that
                # keeps growing across files would distort the fraction below
                # for every file after the first.
                processed_size = 0

                #create ssr table for current file
                DB.create_table(self._table, self.findex)
                self.change_status('running')

                seqs = pyfastx.Fastx(fasta[4], uppercase=True)
                sql = self.sql()

                for name, seq, _ in seqs:
                    self.signals.messages.emit(
                        'processing sequence {} in file {}'.format(
                            name, fasta[1]))

                    # Run the search in the pool process and wait for it.
                    proc = pool.apply_async(self.search, self.args(name, seq))
                    trs = proc.get()
                    DB.insert_rows(sql, self.rows(trs))

                    processed_size += len(seq)
                    # Fall back to 0 when the fraction is not meaningful:
                    # a non-positive total (avoids ZeroDivisionError) or a
                    # count that overshoots the recorded total.
                    # NOTE(review): overshoot presumably means fasta[2] is not
                    # a base count (e.g. a compressed file size) - confirm.
                    if total_size <= 0 or processed_size > total_size:
                        r = 0
                    else:
                        r = processed_size / total_size
                    p = int((processed_file + r) / self.total_file * 100)
                    if p > progress:
                        self.signals.progress.emit(p)
                        progress = p

                processed_file += 1
                self.change_status('success')
예제 #5
0
def find_tandem_repeats_with_singlecore(args):
	"""Search every sequence of ``args.fasta`` in this process and write to ``args.out_file``.

	Each sequence is passed to ``find_tandem_repeats_by_type`` and the result
	is handed to ``format_and_write_to_file`` together with the open output
	file.
	"""
	with open(args.out_file, 'w') as out_handle:
		sequences = pyfastx.Fastx(args.fasta, uppercase=True)
		for record in sequences:
			name, seq, _ = record
			repeats = find_tandem_repeats_by_type(name, seq, args)
			format_and_write_to_file(args, out_handle, repeats)
예제 #6
0
    def test_exception(self):
        """Check the errors raised by Fastx for a bad path and a bad format name."""
        # Opening the path 'test_file' is expected to raise FileExistsError.
        self.assertRaises(FileExistsError, pyfastx.Fastx, 'test_file')

        # An unrecognised format name is expected to raise RuntimeError.
        self.assertRaises(RuntimeError, pyfastx.Fastx, gzip_fasta, format="fastx")
예제 #7
0
    def test_fastx_repr(self):
        """Check that repr() of a Fastx reader shows its format and source path."""
        fasta_reader = pyfastx.Fastx(gzip_fasta, "fasta")
        fasta_repr = repr(fasta_reader)
        self.assertEqual(fasta_repr, "<Fastx> fasta {}".format(gzip_fasta))

        fastq_reader = pyfastx.Fastx(gzip_fastq, "fastq")
        fastq_repr = repr(fastq_reader)
        self.assertEqual(fastq_repr, "<Fastx> fastq {}".format(gzip_fastq))
예제 #8
0
 def test_fasta_upper(self):
     """Check sequences read with uppercase=True against the indexed FASTA."""
     reader = pyfastx.Fastx(flat_fasta, uppercase=True)
     for record in reader:
         name, seq, _ = record
         expected = str(self.faidx[name])
         self.assertEqual(expected, seq)
예제 #9
0
 def test_fasta_iter(self):
     """Check sequences and comments yielded by Fastx against the indexed FASTA."""
     for name, seq, comment in pyfastx.Fastx(gzip_fasta):
         s = self.faidx[name]
         # Compare against the indexed sequence, not against `seq` itself
         # (which could never fail). The comparison ignores case because the
         # reader is opened without uppercase=True here.
         # NOTE(review): confirm whether self.faidx upper-cases its sequences.
         self.assertEqual(str(s).upper(), seq.upper())
         self.assertEqual(' '.join(s.long_name.split()[1:]), comment)
예제 #10
0
import pyfastx
import simplesam
import os

# All relative paths below are resolved against this hard-coded directory.
os.chdir(
    '/research/projects/yu3grp/IO_JY/yu3grp/LVXSCID/patients_scATACseq/multiome_P1'
)
bam_file = './03_chimeric/P1_scMulti_ATAC_S1_pe.mated.filter.bam'
out_sam_file = './04_match_CB/P1_scMulti_ATAC_S1_pe.mated.filter_wCB.sam'
cellID_file = './04_match_CB/P1_scMulti_ATAC_S1_pe.mated.filter_R2.fastq'
barcode_tag = 'CB'

# Alternative input used previously:
#   './LVX_SCID_P1_S1_L001_pe.mated.filter2.bam_readbarcode'
fa = pyfastx.Fastx(cellID_file)

# Read name -> sequence of the record with that name in the R2 FASTQ file.
barcodes = {name: seq for name, seq, _qual, _comment in fa}

# Copy every read to the output SAM, storing the sequence looked up by read
# name under the barcode tag. A read name missing from `barcodes` raises
# KeyError.
with simplesam.Reader(open(bam_file)) as in_bam, \
        simplesam.Writer(open(out_sam_file, 'w'), in_bam.header) as out_sam:
    for read in in_bam:
        read[barcode_tag] = barcodes[read.qname]
        out_sam.write(read)