def test_split_ionfile_by_results(self):
    # Run split_ionfile_by_results on the fixture chunk and check that the
    # resulting NGS/_reaa.fastq contains 1001 lines starting with "@".
    ion_file = "NGS/ion_file.fastq"
    blast_chunk = "NGS/_reaa.csv"
    try:
        NGS.split_ionfile_by_results(ion_file, blast_chunk)
        # An argument list keeps the shell out of the picture entirely.
        count = subprocess.check_output(
            ["grep", "-c", "^@", "NGS/_reaa.fastq"])
        # check_output returns bytes on Python 3, so compare as an integer
        # instead of against the literal string '1001'.
        self.assertEqual(int(count.strip()), 1001)
    finally:
        # Restore the fixture from its backup even when the call or the
        # assertion above fails, so later tests see the original file.
        shutil.copyfile("NGS/_reaa.fastq.bak", "NGS/_reaa.fastq")
def test_parse_blast_results(self):
    # parse_blast_results is run on the fixture blast table and FASTQ reads;
    # afterwards 21 paths matching output/gene* are expected.
    blast_table = os.path.join("NGS", "blast_table.csv")
    ion_file = os.path.join("NGS", "ion_file.fastq")
    try:
        NGS.parse_blast_results(blast_table, ion_file)
        result = glob.glob("output/gene*")
        self.assertEqual(len(result), 21)
    finally:
        # Always remove the generated directory: leftovers from a failed run
        # would change the glob count of the next run. ignore_errors keeps a
        # missing directory from masking the original failure.
        shutil.rmtree("output", ignore_errors=True)
def test_split_ionfile_by_results(self):
    # Run split_ionfile_by_results on the fixture chunk (paths anchored at
    # self.cwd) and check that NGS/_reaa.fastq holds 1001 lines starting
    # with "@".
    ion_file = os.path.join(self.cwd, "NGS/ion_file.fastq")
    blast_chunk = os.path.join(self.cwd, "NGS/_reaa.csv")
    fastq_chunk = os.path.join(self.cwd, "NGS", "_reaa.fastq")
    try:
        NGS.split_ionfile_by_results(ion_file, blast_chunk)
        # Passing an argument list (no shell) keeps the command correct even
        # when self.cwd contains spaces or shell metacharacters.
        count = subprocess.check_output(["grep", "-c", "^@", fastq_chunk])
        self.assertEqual(int(count.strip()), 1001)
    finally:
        # Remove the generated file whether or not the test passed; the
        # existence check keeps cleanup from masking an earlier failure.
        if os.path.exists(fastq_chunk):
            os.remove(fastq_chunk)
def test_filter_reads(self):
    # filter_reads is run on the fixture chunk; it should generate many
    # gene_ files (21 paths matching NGS/gene* are expected).
    ion_chunk = "NGS/_reaa.fastq"
    blast_chunk = "NGS/_reaa.csv"
    folder = "NGS"
    try:
        NGS.filter_reads(ion_chunk, blast_chunk, folder)
        result = glob.glob("NGS/gene*")
        self.assertEqual(len(result), 21)
    finally:
        # Clean up even when the assertion fails, otherwise stale gene*
        # files would be counted by the next run.
        for path in glob.glob("NGS/gene*"):
            os.remove(path)
def test_prepare_data(self):
    # prepare_data is called with the fixture FASTQ and the value 8; both
    # working copies must then exist under data/modified.
    NGS.prepare_data(os.path.join(self.cwd, "NGS", "ion_file.fastq"), 8)

    expected_files = [
        os.path.join("data", "modified", "wrk_ionfile.fasta"),
        os.path.join("data", "modified", "wrk_ionfile.fastq"),
    ]

    # Check both files first, then delete both (same order as before).
    for path in expected_files:
        self.assertTrue(os.path.isfile(path))
    for path in expected_files:
        os.remove(path)
def test_filter_reads(self):
    # Pattern matching every gene* file inside the NGS fixture folder.
    gene_glob = os.path.join("NGS", "gene*")

    # Start from a clean folder so leftovers cannot inflate the count.
    for stale in glob.glob(gene_glob):
        os.remove(stale)

    NGS.filter_reads(
        os.path.join("NGS", "reaa.fastq"),
        os.path.join("NGS", "reaa.csv"),
        "NGS",
    )

    # Count the lines starting with "@" across all generated gene* files.
    count_cmd = "cat " + gene_glob + " | grep -c '^@'"
    output = subprocess.check_output(count_cmd, shell=True)

    # Remove the generated files before asserting so a failed assertion
    # does not leave them behind.
    for produced in glob.glob(gene_glob):
        os.remove(produced)

    self.assertEqual(int(output.strip()), 23)
def separate_by_index(fastq_file, index_list, folder="", levenshtein_distance=1):
    '''
    Divide FASTQ reads into bins according to a list of indexes (or barcodes).

    The *index_list* should be in FASTA format.
    Every read of *fastq_file* is compared against every index with
    ``NGS.find_index_in_seq``; indexes are accepted when they differ from the
    template by no more than the *levenshtein* distance (default 1 base pair
    difference).

    See http://en.wikipedia.org/wiki/Levenshtein_distance

    Matching reads are appended to a file named
    ``index_<index id>_<input basename>.fastq``, so running the function
    twice on the same input appends the reads a second time.

    * ``fastq_file`` FASTQ format containing reads as produced by IonTorrent
    * ``index_list`` FASTA format file containing indexes (or barcodes)
    * ``folder`` *Optional*: Directory the output files are written into.
      When empty, files are created relative to the current working directory.
    * ``levenshtein_distance`` *Optional*, default = 1: Maximum number of
      different nucleotides that will be accepted when comparing template and
      sequenced indexes (due to errors in base calling during sequencing).

    Returns nothing; the result is the set of files written to disk.

    Example:

    >>> from pyphylogenomics import NGS
    >>> fastq_file = "gene_rps5.fastq"
    >>> index_list = "indexes.fasta"
    >>> folder = "output"
    >>> NGS.separate_by_index(fastq_file, index_list, folder)

    You can also automate parsing many FASTQ files at once:

    >>> from pyphylogenomics import NGS
    >>> import glob  # lets us select many files by using wildcards
    >>> index_list = "indexes.fasta"
    >>> folder = "output"
    >>> for file in glob.glob("output/gene*.fastq"):
    ...     NGS.separate_by_index(file, index_list, folder)
    '''
    from Bio import SeqIO
    from pyphylogenomics import NGS

    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("Processing file " + fastq_file)
    if folder != "":
        # abspath() already normalises away a trailing slash, and unlike a
        # manual strip it does not turn "/" into the current directory.
        folder = os.path.abspath(folder)
        print("Output files will be written into " + folder)

    # Strip only a genuine trailing ".fastq" extension: the dot is escaped
    # and the pattern anchored so names such as "my_fastq_run.fastq" are
    # not mangled.
    stem = re.sub(r"\.fastq$", "", os.path.basename(fastq_file))

    for seq_record in SeqIO.parse(index_list, "fasta"):
        # The output name depends only on the index, so build it once here.
        filename = "index_" + str(seq_record.id) + "_" + stem + ".fastq"
        if folder != "":
            filename = os.path.join(folder, filename)

        for fastq_record in SeqIO.parse(fastq_file, "fastq"):
            found_index = NGS.find_index_in_seq(seq_record, fastq_record,
                                                levenshtein_distance)
            if found_index == "TRUE":
                # Open lazily so indexes without any matching read do not
                # leave empty files behind; "with" closes the handle even
                # if writing fails.
                with open(filename, "a") as output_handle:
                    SeqIO.write(fastq_record, output_handle, "fastq")
def test_prune(self):
    # Feed prune the fixture blast rows and check how many rows it returns.
    folder = "NGS"

    # The context manager closes the file even if reading fails.
    with open("NGS/blast_data.csv", "r") as handle:
        blast_data = [line.strip() for line in handle]

    # NOTE(review): this hands prune the SeqIO.parse iterator itself rather
    # than a single record -- confirm that is what NGS.prune expects.
    seq_record = SeqIO.parse("NGS/seq_record.fastq", "fastq")
    ion_id = "3856"
    # NOTE(review): passed as a string here; verify NGS.prune does not need
    # an integer for its length comparison.
    min_aln_length = "40"

    result = NGS.prune(folder, blast_data, seq_record, ion_id, min_aln_length)

    # It should drop one seq_record from the blast_data
    self.assertEqual(len(result), 998)
def filter_reads(ion_chunk, blast_chunk, folder):
    r'''
    \* *Internal function* \*

    Walk the FASTQ reads of *ion_chunk* alongside the rows of *blast_chunk*
    and hand every read whose id equals the id of the first remaining blast
    row to ``NGS.prune``.

    * ``ion_chunk`` FASTQ file with the reads to inspect. Read ids are
      converted with ``int()``, so they must be numeric.
    * ``blast_chunk`` comma-separated text file; the first field of each row
      is compared with the read ids.
    * ``folder`` passed through unchanged to ``NGS.prune`` (presumably the
      directory its output goes to -- confirm against ``NGS.prune``).

    The list returned by ``NGS.prune`` replaces the remaining blast rows, and
    the scan stops as soon as no rows are left. Returns nothing.
    '''
    from Bio import SeqIO
    from pyphylogenomics import NGS

    # Accepting alignment lengths higher than 40 bp longer than our primer
    # lengths
    min_aln_length = 40

    # Blank lines are skipped: they carry no id and would otherwise make the
    # int() conversion below fail.
    with open(blast_chunk, "r") as blast_file:
        blast_data = [line.strip() for line in blast_file if line.strip()]

    # iterate over ion torrent reads
    for seq_record in SeqIO.parse(ion_chunk, "fastq"):
        if not blast_data:
            # Every blast row has been consumed; nothing left to match.
            break

        first_id_in_blast_data = blast_data[0].split(",")[0]

        # avoid processing seq_records that are not in blast file
        # (presumably both inputs are sorted by ascending id -- TODO confirm)
        if int(seq_record.id) < int(first_id_in_blast_data):
            continue

        if str(seq_record.id) == first_id_in_blast_data:
            blast_data = NGS.prune(folder, blast_data, seq_record,
                                   first_id_in_blast_data, min_aln_length)
# Driver script: pass the file named on the command line to NGS.prepare_data.
import sys

from pyphylogenomics import NGS

# First command-line argument, with surrounding whitespace removed.
ionfile = sys.argv[1].strip()

# Second argument of prepare_data; presumably 0 means there is no index to
# account for -- confirm against NGS.prepare_data.
index_length = 0

NGS.prepare_data(ionfile, index_length)
# Driver script: pass the blast table named on the command line, together
# with the fixed working FASTQ path, to NGS.parse_blast_results.
import sys

from pyphylogenomics import NGS

# First command-line argument, with surrounding whitespace removed.
blast_table = sys.argv[1].strip()

# Fixed location of the working FASTQ file used by this script.
ion_file = "data/modified/wrk_ionfile.fastq"

NGS.parse_blast_results(blast_table, ion_file)