def format_fastqc(self, rawDataPath, currSample):
    """Fill the ``fastqc.html`` template with values parsed from FastQC data.

    Parameters:
        rawDataPath: path handed to ``Fadapa`` to parse the FastQC data file.
        currSample: accepted for interface compatibility; not used here.

    Returns:
        str: the template text, each line passed through ``str.format``
        with the ``MODULE_*`` and ``FQC_*`` placeholders substituted.

    Raises:
        IndexError: if the parsed summary or 'Basic Statistics' table has
            fewer rows than the template needs.
        KeyError: if the template uses a placeholder that is not supplied
            (raised by ``str.format``).
    """
    ## FastQC templates
    fastqc_template = os.path.join(self.TEMPLATES_BASE, 'fastqc.html')

    ## TODO: every value is handed to the template as-is for now; decide
    ## later which ones need extra formatting
    fqc_object = Fadapa(rawDataPath)

    ## Module status data: column 0 of summary rows 1..11
    ## (presumably the pass/warn/fail status -- confirm against the Fadapa
    ## version in use)
    module_summary = fqc_object.summary()
    fields = {
        'MODULE_STATS': module_summary[1][0],
        'MODULE_PBSQ': module_summary[2][0],
        'MODULE_PTSQ': module_summary[3][0],
        'MODULE_PSQS': module_summary[4][0],
        'MODULE_PBSC': module_summary[5][0],
        'MODULE_PSGCC': module_summary[6][0],
        'MODULE_PBNC': module_summary[7][0],
        'MODULE_SEQLENDIST': module_summary[8][0],
        'MODULE_SEQDUP': module_summary[9][0],
        'MODULE_OVERREP': module_summary[10][0],
        'MODULE_ADAPTER': module_summary[11][0],
    }

    ## Basic statistics data: column 1 of rows 1..7 (row 0 is skipped)
    basic_stats = fqc_object.clean_data('Basic Statistics')
    fields.update({
        'FQC_FILENAME': basic_stats[1][1],
        'FQC_FILETYPE': basic_stats[2][1],
        'FQC_ENCODING': basic_stats[3][1],
        'FQC_TOTALSEQ': basic_stats[4][1],
        'FQC_POORQUAL': basic_stats[5][1],
        'FQC_SEQLEN': basic_stats[6][1],
        'FQC_GCPCNT': basic_stats[7][1],
    })

    ## FastQC html template file with data inserted.
    ## The context manager closes the template even if format() raises,
    ## and join() avoids re-copying the growing result on every line.
    with open(fastqc_template, 'r') as template:
        ## return formatted FastQC report
        return ''.join(line.format(**fields) for line in template)
def save_sections_into_file(self):
    """Export a slice of the FastQC modules as tab-separated files.

    Looks for ``fastqc_data.txt`` inside ``self._dir``. When the file exists
    and ``Fadapa`` is truthy, summary entries 2..8 are taken, and for each one
    the table returned by ``self._get_module`` is written to
    ``<self._dir>/<module name with spaces as underscores>.tsv``.
    Otherwise the method does nothing.
    """
    data_file = os.path.join(self._dir, "fastqc_data.txt")

    # Guard clause: no data file, or no usable parser (presumably Fadapa is
    # falsy when its import failed -- confirm at the import site).
    if not (os.path.exists(data_file) and Fadapa):
        return

    parser = Fadapa(data_file)
    # Column 1 of every summary row is used as the module name.
    names = [row[1] for row in parser.summary()]
    for name in names[2:9]:
        file_stem = name.replace(" ", "_")
        target = os.path.join(self._dir, file_stem + ".tsv")
        table = self._get_module(parser, name)
        table.to_csv(target, sep="\t", index=False)
class TestFadapa(unittest.TestCase):
    """Exercise the ``Fadapa`` parser against ``tests/fastqc_data.txt``."""

    def setUp(self):
        # A fresh parser per test, built from the bundled fixture file.
        self.p_data = Fadapa('tests/fastqc_data.txt')

    def test_summary(self):
        """The first summary row is the column header."""
        summary = self.p_data.summary()
        self.assertEqual(summary[0], ['Module Name', 'Status'])

    def test_content(self):
        """``content()`` output begins with the ``##FastQC`` marker."""
        captured = StringIO()
        original_stdout = sys.stdout
        sys.stdout = captured
        try:
            self.p_data.content()
        finally:
            # Always put the real stream back; otherwise every later test
            # in the same process would keep writing into the buffer.
            sys.stdout = original_stdout
        self.assertEqual(captured.getvalue()[:8], '##FastQC')

    def test_raw_data(self):
        """Raw module data ends with the end-of-module marker."""
        data = self.p_data.raw_data('Basic Statistics')
        self.assertEqual(data[-1], '>>END_MODULE')

    def test_cleaned_data(self):
        """Cleaned module data starts with the header cell 'Measure'."""
        data = self.p_data.clean_data('Basic Statistics')
        self.assertEqual(data[0][0], 'Measure')
def format_fastqc(self, rawDataPath, currSample):
    """Render ``fastqc.html`` with values taken from a FastQC data file.

    Parameters:
        rawDataPath: path handed to ``Fadapa`` to parse the FastQC data file.
        currSample: accepted for interface compatibility; not used here.

    Returns:
        str: the template text, formatted line by line with ``str.format``
        using the ``MODULE_*`` and ``FQC_*`` placeholders.

    Raises:
        IndexError: if the summary has fewer than 12 rows or the
            'Basic Statistics' table has fewer than 8 rows.
        KeyError: if the template uses a placeholder that is not supplied
            (raised by ``str.format``).
    """
    ## FastQC templates
    fastqc_template = os.path.join(self.TEMPLATES_BASE, 'fastqc.html')

    ## TODO: all values go into the template unformatted for now; work out
    ## later what deserves nicer formatting
    fqc_object = Fadapa(rawDataPath)

    ## Placeholder names in the order of the rows they are read from;
    ## row 0 of each table is skipped, so numbering starts at 1.
    summary_keys = (
        'MODULE_STATS', 'MODULE_PBSQ', 'MODULE_PTSQ', 'MODULE_PSQS',
        'MODULE_PBSC', 'MODULE_PSGCC', 'MODULE_PBNC', 'MODULE_SEQLENDIST',
        'MODULE_SEQDUP', 'MODULE_OVERREP', 'MODULE_ADAPTER',
    )
    stats_keys = (
        'FQC_FILENAME', 'FQC_FILETYPE', 'FQC_ENCODING', 'FQC_TOTALSEQ',
        'FQC_POORQUAL', 'FQC_SEQLEN', 'FQC_GCPCNT',
    )

    fields = {}

    ## Module status data: column 0 of summary rows 1..11
    module_summary = fqc_object.summary()
    for row, key in enumerate(summary_keys, 1):
        fields[key] = module_summary[row][0]

    ## Basic statistics data: column 1 of rows 1..7
    basic_stats = fqc_object.clean_data('Basic Statistics')
    for row, key in enumerate(stats_keys, 1):
        fields[key] = basic_stats[row][1]

    ## FastQC html template file with data inserted.
    ## `with` guarantees the file is closed even when format() raises, and
    ## the pieces are joined once instead of re-building the string per line.
    with open(fastqc_template, 'r') as template:
        rendered = [line.format(**fields) for line in template]

    ## return formatted FastQC report
    return ''.join(rendered)
def fastqc_result(self, r1, r2, sample, output, type):
    """Write FastQC report tables for a read pair and return a summary dict.

    Parameters:
        r1, r2: paths handed to ``Fadapa`` for the first and second read.
        sample: sample name, used in the names of the files written.
        output: base directory; files go to ``<output>/data/stat/``.
        type: when equal to ``"raw"``, a ``'message'`` of ``"pass"`` or
            ``"warn"`` is added to the mean-read-quality entry and
            ``self._qc_pass_count`` is incremented on ``"pass"``.
            (The name shadows the builtin but is part of the interface.)

    Files written (data rows only, marker/header lines dropped):
        ``<sample>.fastq_quality_score_r1.txt`` and ``..._r2.txt`` from
        'Per sequence quality scores'; ``<sample>.base_sequence_quality.txt``
        from R1 'Per base sequence quality';
        ``<sample>.sequence_length_distribution.txt`` from R1
        'Sequence Length Distribution'.

    Returns:
        dict with keys ``mean_read_quality``,
        ``fastqc_sequence_length_distribution``,
        ``per_sequence_quality_score``, ``per_base_quality``, ``summary``,
        ``pass`` and ``fastq_file_name``.
    """

    def dump_rows(parser, module, skip_prefixes, path):
        # Copy the module's raw lines to `path`, one per line, leaving out
        # any line that starts with one of `skip_prefixes`.
        skipped = 0
        # Context manager: the file is closed even if raw_data() raises.
        with open(path, "w") as handle:
            for row in parser.raw_data(module):
                if skipped > 3:
                    break
                if row.startswith(skip_prefixes):
                    skipped += 1
                else:
                    handle.write(row + "\n")

    def tally_quality(parser):
        # Sum the counts of 'Per sequence quality scores' and the part of
        # them whose quality value is at least 30.
        total = 0
        above_30 = 0
        for row in parser.clean_data('Per sequence quality scores'):
            if row[0] != "Quality":
                count = float(row[1])
                total += count
                if int(row[0]) >= 30:
                    above_30 += count
        return total, above_30

    fq1 = Fadapa(r1)
    fq2 = Fadapa(r2)

    # fastqc per sequence quality scores for report (R1, then R2)
    dump_rows(fq1, 'Per sequence quality scores',
              ('>>Per', '#Qual', '>>END'),
              "%s/data/stat/%s.fastq_quality_score_r1.txt" % (output, sample))
    dump_rows(fq2, 'Per sequence quality scores',
              ('>>Per', '#Qual', '>>END'),
              "%s/data/stat/%s.fastq_quality_score_r2.txt" % (output, sample))

    # fastqc per base sequence quality for report (R1 only)
    dump_rows(fq1, 'Per base sequence quality',
              ('>>Per', '#Base', '>>END'),
              "%s/data/stat/%s.base_sequence_quality.txt" % (output, sample))

    # fastqc sequence length distribution for report (R1 only)
    dump_rows(fq1, 'Sequence Length Distribution',
              ('>>Seq', '#', '>>END'),
              "%s/data/stat/%s.sequence_length_distribution.txt"
              % (output, sample))

    # fastqc parse for json
    total_reads_r1, above_30_r1 = tally_quality(fq1)
    total_reads_r2, above_30_r2 = tally_quality(fq2)
    total_reads = total_reads_r1 + total_reads_r2
    above_30 = above_30_r1 + above_30_r2

    # mean read quality (percentage of reads with mean Phred base quality
    # above 30); report 0.0 instead of dividing by zero when neither read
    # has any quality rows.
    if total_reads:
        mean_read_quality_percentage = above_30 / total_reads * 100
    else:
        mean_read_quality_percentage = 0.0

    mean_read_quality = {
        'total_read': total_reads,
        'above_30': above_30,
        'percentage': mean_read_quality_percentage,
    }
    if type == "raw":
        if 90 < mean_read_quality_percentage:
            mean_read_quality['message'] = "pass"
            self._qc_pass_count = self._qc_pass_count + 1
        else:
            mean_read_quality['message'] = "warn"

    # base, mean, median
    # NOTE(review): "base_Base" may never match the header cell, in which
    # case the header row is stored too -- confirm against Fadapa.
    fastqc_per_base_quality = {}
    for row in fq1.clean_data('Per base sequence quality'):
        if row[0] != "base_Base":
            fastqc_per_base_quality[row[0]] = "{0}:{1}:{2}".format(
                row[0], row[1], row[2])

    fastqc_summary = {}
    for row in fq1.clean_data('Basic Statistics'):
        if row[0] != "Measure":
            fastqc_summary[row[0]] = row[1]

    # Keyed by column 1, valued by column 0 of each summary row.
    # NOTE(review): the header test assumes "Module Name" sits in column 1;
    # verify the column order of summary() in the Fadapa version in use.
    fastqc_pass = {}
    for row in fq1.summary():
        if row[1] != "Module Name":
            fastqc_pass[row[1]] = row[0]

    fastqc = {}
    fastqc['mean_read_quality'] = mean_read_quality
    # NOTE(review): these two entries have always been returned empty; the
    # per-read values were collected but never exported. Kept empty here for
    # output compatibility -- decide whether they should be populated.
    fastqc['fastqc_sequence_length_distribution'] = {}
    fastqc['per_sequence_quality_score'] = {}
    fastqc['per_base_quality'] = fastqc_per_base_quality
    fastqc['summary'] = fastqc_summary
    fastqc['pass'] = fastqc_pass
    fastqc['fastq_file_name'] = "%s-%s" % (r1, r2)
    return fastqc