Exemplo n.º 1
0
    def format_fastqc(self, rawDataPath, currSample):

        ## FastQC templates
        fastqc_template = os.path.join(self.TEMPLATES_BASE, 'fastqc.html')

        ## just f****n lump it all in there for now and figure out what you want to format next
        fqc_object = Fadapa(rawDataPath)

        ## Module status data
        module_summary = fqc_object.summary()
        module_stats = module_summary[1][0]
        module_pbsq = module_summary[2][0]
        module_ptsq = module_summary[3][0]
        module_psqs = module_summary[4][0]
        module_pbsc = module_summary[5][0]
        module_psgcc = module_summary[6][0]
        module_pbnc = module_summary[7][0]
        module_seqlendist = module_summary[8][0]
        module_seqdup = module_summary[9][0]
        module_overrep = module_summary[10][0]
        module_adapter = module_summary[11][0]

        ## Basic statistics data
        basic_stats = fqc_object.clean_data('Basic Statistics')
        file_name = basic_stats[1][1]
        file_type = basic_stats[2][1]
        encoding = basic_stats[3][1]
        total_sequences = basic_stats[4][1]
        poor_quality = basic_stats[5][1]
        seq_len = basic_stats[6][1]
        gc_pcnt = basic_stats[7][1]

        ## FastQC html template file with data inserted
        fqc_return = ''
        f = open(fastqc_template, 'r')
        for line in f:
            line = line.format(MODULE_STATS=module_stats,
                               MODULE_PBSQ=module_pbsq,
                               MODULE_PTSQ=module_ptsq,
                               MODULE_PSQS=module_psqs,
                               MODULE_PBSC=module_pbsc,
                               MODULE_PSGCC=module_psgcc,
                               MODULE_PBNC=module_pbnc,
                               MODULE_SEQLENDIST=module_seqlendist,
                               MODULE_SEQDUP=module_seqdup,
                               MODULE_OVERREP=module_overrep,
                               MODULE_ADAPTER=module_adapter,
                               FQC_FILENAME=file_name,
                               FQC_FILETYPE=file_type,
                               FQC_ENCODING=encoding,
                               FQC_TOTALSEQ=total_sequences,
                               FQC_POORQUAL=poor_quality,
                               FQC_SEQLEN=seq_len,
                               FQC_GCPCNT=gc_pcnt)
            fqc_return = '{0}{1}'.format(fqc_return, line)
        f.close()

        ## return formatted FastQC report
        return fqc_return
Exemplo n.º 2
0
    def save_sections_into_file(self):

        data_file = os.path.join(self._dir, "fastqc_data.txt")
        if os.path.exists(data_file) and Fadapa:
            parser = Fadapa(data_file)
            module = [m[1] for m in parser.summary()][2:9]
            for m in module:
                out_file = os.path.join(self._dir, m.replace(" ", "_") + ".tsv")
                dt = self._get_module(parser, m)
                dt.to_csv(out_file, sep="\t", index=False)
Exemplo n.º 3
0
class TestFadapa(unittest.TestCase):
    def setUp(self):
        self.p_data = Fadapa('tests/fastqc_data.txt')

    def test_summary(self):
        summary = self.p_data.summary()
        self.assertEqual(summary[0], ['Module Name', 'Status'])

    def test_content(self):
        sys.stdout = StringIO()
        self.p_data.content()
        self.assertEqual(sys.stdout.getvalue()[:8], '##FastQC')

    def test_raw_data(self):
        data = self.p_data.raw_data('Basic Statistics')
        self.assertEqual(data[-1], '>>END_MODULE')

    def test_cleaned_data(self):
        data = self.p_data.clean_data('Basic Statistics')
        self.assertEqual(data[0][0], 'Measure')
Exemplo n.º 4
0
class TestFadapa(unittest.TestCase):

    def setUp(self):
        self.p_data = Fadapa('tests/fastqc_data.txt')

    def test_summary(self):
        summary = self.p_data.summary()
        self.assertEqual(summary[0], ['Module Name', 'Status'])

    def test_content(self):
        sys.stdout = StringIO()
        self.p_data.content()
        self.assertEqual(sys.stdout.getvalue()[:8], '##FastQC')

    def test_raw_data(self):
        data = self.p_data.raw_data('Basic Statistics')
        self.assertEqual(data[-1], '>>END_MODULE')

    def test_cleaned_data(self):
        data = self.p_data.clean_data('Basic Statistics')
        self.assertEqual(data[0][0], 'Measure')
Exemplo n.º 5
0
    def format_fastqc(self, rawDataPath, currSample):

        ## FastQC templates
        fastqc_template = os.path.join(self.TEMPLATES_BASE, 'fastqc.html')

        ## just f****n lump it all in there for now and figure out what you want to format next
        fqc_object = Fadapa(rawDataPath)

        ## Module status data
        module_summary = fqc_object.summary()
        module_stats = module_summary[1][0]; module_pbsq = module_summary[2][0]; module_ptsq = module_summary[3][0];
        module_psqs = module_summary[4][0]; module_pbsc = module_summary[5][0]; module_psgcc = module_summary[6][0];
        module_pbnc = module_summary[7][0]; module_seqlendist = module_summary[8][0]; module_seqdup = module_summary[9][0];
        module_overrep = module_summary[10][0]; module_adapter = module_summary[11][0]

        ## Basic statistics data
        basic_stats = fqc_object.clean_data('Basic Statistics')
        file_name = basic_stats[1][1]; file_type = basic_stats[2][1]; encoding = basic_stats[3][1]
        total_sequences = basic_stats[4][1]; poor_quality = basic_stats[5][1]; seq_len = basic_stats[6][1]
        gc_pcnt = basic_stats[7][1]

        ## FastQC html template file with data inserted
        fqc_return = ''
        f = open(fastqc_template, 'r')
        for line in f:
            line = line.format(
            MODULE_STATS = module_stats, MODULE_PBSQ = module_pbsq, MODULE_PTSQ = module_ptsq,
            MODULE_PSQS = module_psqs, MODULE_PBSC = module_pbsc, MODULE_PSGCC = module_psgcc,
            MODULE_PBNC = module_pbnc, MODULE_SEQLENDIST = module_seqlendist, MODULE_SEQDUP = module_seqdup,
            MODULE_OVERREP = module_overrep, MODULE_ADAPTER = module_adapter,
            FQC_FILENAME = file_name, FQC_FILETYPE = file_type, FQC_ENCODING = encoding,
            FQC_TOTALSEQ = total_sequences, FQC_POORQUAL = poor_quality, FQC_SEQLEN = seq_len,
            FQC_GCPCNT = gc_pcnt
            )
            fqc_return = '{0}{1}'.format(fqc_return, line)
        f.close()

        ## return formatted FastQC report
        return fqc_return
Exemplo n.º 6
0
    def fastqc_result(self, r1, r2, sample, output, type):
        fq1 = Fadapa(r1)
        fq2 = Fadapa(r2)
        fastqc = {}
        fastqc_summary = {}
        fastqc_pass = {}
        fastqc_per_base_quality = {}
        fastqc_per_sequence_quality = {}
        fastqc_per_sequence_quality_r1 = {}
        fastqc_per_sequence_quality_r2 = {}
        fastqc_sequence_length_distribution = {}
        fastqc_sequence_length_distribution_r1 = {}
        fastqc_sequence_length_distribution_r2 = {}

        # fastqc per sequence quality scores for report
        # R1
        f = open(
            "%s/data/stat/%s.fastq_quality_score_r1.txt" % (output, sample),
            "w")
        count = 0
        for data in fq1.raw_data('Per sequence quality scores'):
            if (count > 3):
                break
            if data.startswith('>>Per') or data.startswith(
                    '#Qual') or data.startswith(">>END"):
                count = count + 1
            else:
                f.write(data)
                f.write("\n")
        f.close()
        # R2
        f = open(
            "%s/data/stat/%s.fastq_quality_score_r2.txt" % (output, sample),
            "w")
        count = 0
        for data in fq2.raw_data('Per sequence quality scores'):
            if (count > 3):
                break
            if data.startswith('>>Per') or data.startswith(
                    '#Qual') or data.startswith(">>END"):
                count = count + 1
            else:
                f.write(data)
                f.write("\n")
        f.close()

        # fastqc per base squence for report
        f = open(
            "%s/data/stat/%s.base_sequence_quality.txt" % (output, sample),
            "w")
        count = 0
        for data in fq1.raw_data('Per base sequence quality'):
            if (count > 3):
                break
            if data.startswith(">>Per") or data.startswith(
                    "#Base") or data.startswith(">>END"):
                count = count + 1
            else:
                f.write(data)
                f.write("\n")
        f.close()

        # fastqc sequence length distribution for report
        f = open(
            "%s/data/stat/%s.sequence_length_distribution.txt" %
            (output, sample), "w")
        count = 0
        for data in fq1.raw_data('Sequence Length Distribution'):
            if (count > 3):
                break
            if data.startswith(">>Seq") or data.startswith(
                    "#") or data.startswith(">>END"):
                count = count + 1
            else:
                f.write(data)
                f.write("\n")
        f.close()

        # fastqc parse for json
        for data in fq1.clean_data('Sequence Length Distribution'):
            if data[0] != "Length":
                fastqc_sequence_length_distribution_r1[data[0]] = data[1]
        for data in fq2.clean_data('Sequence Length Distribution'):
            if data[0] != "Length":
                fastqc_sequence_length_distribution_r2[data[0]] = data[1]

        total_reads_r1 = 0
        total_reads_r2 = 0
        above_30_r1 = 0
        above_30_r2 = 0
        for data in fq1.clean_data('Per sequence quality scores'):
            if data[0] != "Quality":
                fastqc_per_sequence_quality_r1[data[0]] = data[1]
                total_reads_r1 = total_reads_r1 + float(data[1])
                if int(data[0]) >= 30:
                    above_30_r1 = above_30_r1 + float(data[1])

        for data in fq2.clean_data('Per sequence quality scores'):
            if data[0] != "Quality":
                fastqc_per_sequence_quality_r2[data[0]] = data[1]
                total_reads_r2 = total_reads_r2 + float(data[1])
                if int(data[0]) >= 30:
                    above_30_r2 = above_30_r2 + float(data[1])

        #mean read quality(percentage of reads with mean Phred base quality above 30)
        mean_read_quality_percentage = (above_30_r1 + above_30_r2) / (
            total_reads_r1 + total_reads_r2) * 100
        mean_read_quality = {}
        mean_read_quality['total_read'] = total_reads_r1 + total_reads_r2
        mean_read_quality['above_30'] = above_30_r1 + above_30_r2
        mean_read_quality['percentage'] = mean_read_quality_percentage
        if type == "raw":
            if 90 < mean_read_quality_percentage:
                mean_read_quality['message'] = "pass"
                self._qc_pass_count = self._qc_pass_count + 1
            else:
                message = "warn"
                mean_read_quality['message'] = "warn"

        for data in fq1.clean_data('Per base sequence quality'):
            #base, mean, median
            if data[0] != "base_Base":
                fastqc_per_base_quality[data[0]] = "{0}:{1}:{2}".format(
                    data[0], data[1], data[2])

        for data in fq1.clean_data('Basic Statistics'):
            if data[0] != "Measure":
                fastqc_summary[data[0]] = data[1]

        for data in fq1.summary():
            if data[1] != "Module Name":
                fastqc_pass[data[1]] = data[0]

        fastqc['mean_read_quality'] = mean_read_quality
        fastqc[
            'fastqc_sequence_length_distribution'] = fastqc_sequence_length_distribution
        fastqc['per_sequence_quality_score'] = fastqc_per_sequence_quality
        fastqc['per_base_quality'] = fastqc_per_base_quality
        fastqc['summary'] = fastqc_summary
        fastqc['pass'] = fastqc_pass
        fastqc['fastq_file_name'] = "%s-%s" % (r1, r2)
        return fastqc