Пример #1
0
class TestFadapa(unittest.TestCase):
    """Smoke tests for the Fadapa parser, run against tests/fastqc_data.txt."""

    def setUp(self):
        # A fresh parser per test, so no test can influence another.
        self.p_data = Fadapa('tests/fastqc_data.txt')

    def test_summary(self):
        """summary() starts with the ['Module Name', 'Status'] header row."""
        summary = self.p_data.summary()
        self.assertEqual(summary[0], ['Module Name', 'Status'])

    def test_content(self):
        """content() prints the report, which begins with '##FastQC'."""
        captured = StringIO()
        original_stdout = sys.stdout
        sys.stdout = captured
        try:
            self.p_data.content()
        finally:
            # Always restore stdout; otherwise the redirect leaks into every
            # test that runs afterwards and hides the runner's own output.
            sys.stdout = original_stdout
        self.assertEqual(captured.getvalue()[:8], '##FastQC')

    def test_raw_data(self):
        """raw_data() returns the module lines up to the '>>END_MODULE' marker."""
        data = self.p_data.raw_data('Basic Statistics')
        self.assertEqual(data[-1], '>>END_MODULE')

    def test_cleaned_data(self):
        """clean_data() yields rows whose header cell has no leading '#'."""
        data = self.p_data.clean_data('Basic Statistics')
        self.assertEqual(data[0][0], 'Measure')
Пример #2
0
class TestFadapa(unittest.TestCase):
    """Smoke tests for the Fadapa parser, run against tests/fastqc_data.txt."""

    def setUp(self):
        # A fresh parser per test, so no test can influence another.
        self.p_data = Fadapa('tests/fastqc_data.txt')

    def test_summary(self):
        """summary() starts with the ['Module Name', 'Status'] header row."""
        summary = self.p_data.summary()
        self.assertEqual(summary[0], ['Module Name', 'Status'])

    def test_content(self):
        """content() prints the report, which begins with '##FastQC'."""
        captured = StringIO()
        original_stdout = sys.stdout
        sys.stdout = captured
        try:
            self.p_data.content()
        finally:
            # Always restore stdout; otherwise the redirect leaks into every
            # test that runs afterwards and hides the runner's own output.
            sys.stdout = original_stdout
        self.assertEqual(captured.getvalue()[:8], '##FastQC')

    def test_raw_data(self):
        """raw_data() returns the module lines up to the '>>END_MODULE' marker."""
        data = self.p_data.raw_data('Basic Statistics')
        self.assertEqual(data[-1], '>>END_MODULE')

    def test_cleaned_data(self):
        """clean_data() yields rows whose header cell has no leading '#'."""
        data = self.p_data.clean_data('Basic Statistics')
        self.assertEqual(data[0][0], 'Measure')
Пример #3
0
###Parses fastqc output to find overrepresented sequences to feed into cutadapt with a capital -A command

# imports
from fadapa import Fadapa
import sys

# The calling bash script passes the cell UID (its $j) as the first argument.
name = sys.argv[1]

# Build the path to this cell's FastQC report and hand it to the parser.
report_path = '/home/graham/Downloads/trial/' + name + '_fastqc/fastqc_data.txt'
f = Fadapa(report_path)

# First raw line of the module: its ">>name<TAB>status" header.
pass_seq = f.raw_data('Overrepresented sequences')[0]

# The clean parser breaks when the module holds no overrepresented
# sequences, so clean_data is only consulted when the status is not "pass".
if pass_seq == ">>Overrepresented sequences\tpass":
    list_of_seqs = []
else:
    # Keep the first column of every cleaned row: the column heading comes
    # first, the actual sequences follow.
    list_of_seqs = [
        data[0] for data in f.clean_data('Overrepresented sequences')
    ]

# Start from an empty output string.
output = ""

#Loop through the list of sequences from index 1 onwards (as index 0 will be #Sequence)
Пример #4
0
#Searches for overrepresented sequences within a genome sequence and produces an output for the extraction of these sequences in an upper-case format, as the sequences can be in lower or upper case. It is used instead of FASTQC when FASTQC cannot find any sequences. The output can then be used by cutadapt.

from fadapa import Fadapa
import sys
# Input: name of the sample directory that holds the FastQC report.
file_one = sys.argv[1]

f = Fadapa('/home/rsk17/Group_project/1_Original_pipeline/' + file_one +
           '/fastqc_data.txt')
# First raw line of the module: its ">>Overrepresented sequences<TAB>status"
# header.
good_seq = f.raw_data("Overrepresented sequences")[0]
# Overrepresented sequences collected from the report.
seq_list = []

# Only read the cleaned table when the module did not pass. FastQC separates
# the module name and its status with a tab, so a space-padded literal can
# never match; compare the trailing status token instead, which works for
# any whitespace separator.
if good_seq.split()[-1] != "pass":
    for data in f.clean_data('Overrepresented sequences'):
        seq_list.append(data[0])

# Format the sequences for cutadapt: one "-A<sequence>" option per sequence.
# seq_list[0] is the column heading, so it is skipped. The options are
# separated by spaces so that several sequences do not fuse into a single
# unusable token.
output = " ".join("-A" + items for items in seq_list[1:])

print(output)
Пример #5
0
    def fastqc_result(self, r1, r2, sample, output, type):
        """Collect FastQC metrics for a read pair and write the report tables.

        Both FastQC data files are parsed with ``Fadapa``. Four plain-text
        tables are written under ``<output>/data/stat/`` (that directory is
        not created here), and a dict of metrics is returned.

        Args:
            r1: Path of the FastQC data file for read 1 (given to ``Fadapa``).
            r2: Path of the FastQC data file for read 2.
            sample: Sample name, used as the prefix of the written files.
            output: Base output directory.
            type: Run stage. Only ``"raw"`` gets a pass/warn ``message`` and
                may increment ``self._qc_pass_count``. The name shadows the
                builtin but is kept for caller compatibility.

        Returns:
            dict with the keys ``mean_read_quality``,
            ``fastqc_sequence_length_distribution``,
            ``per_sequence_quality_score``, ``per_base_quality``,
            ``summary``, ``pass`` and ``fastq_file_name``.
        """
        fq1 = Fadapa(r1)
        fq2 = Fadapa(r2)

        # Per-sequence quality scores for the report, one file per read.
        self._write_module_rows(
            "%s/data/stat/%s.fastq_quality_score_r1.txt" % (output, sample),
            fq1.raw_data('Per sequence quality scores'),
            ('>>Per', '#Qual', '>>END'))
        self._write_module_rows(
            "%s/data/stat/%s.fastq_quality_score_r2.txt" % (output, sample),
            fq2.raw_data('Per sequence quality scores'),
            ('>>Per', '#Qual', '>>END'))

        # Per-base sequence quality for the report (read 1 only).
        self._write_module_rows(
            "%s/data/stat/%s.base_sequence_quality.txt" % (output, sample),
            fq1.raw_data('Per base sequence quality'),
            ('>>Per', '#Base', '>>END'))

        # Sequence length distribution for the report (read 1 only).
        self._write_module_rows(
            "%s/data/stat/%s.sequence_length_distribution.txt" %
            (output, sample),
            fq1.raw_data('Sequence Length Distribution'),
            ('>>Seq', '#', '>>END'))

        total_reads_r1, above_30_r1 = self._read_quality_totals(
            fq1.clean_data('Per sequence quality scores'))
        total_reads_r2, above_30_r2 = self._read_quality_totals(
            fq2.clean_data('Per sequence quality scores'))
        total_reads = total_reads_r1 + total_reads_r2
        above_30 = above_30_r1 + above_30_r2

        # Mean read quality: percentage of reads whose mean Phred quality is
        # 30 or higher. A report without reads yields 0.0 instead of raising
        # ZeroDivisionError.
        if total_reads:
            mean_read_quality_percentage = above_30 / total_reads * 100
        else:
            mean_read_quality_percentage = 0.0
        mean_read_quality = {
            'total_read': total_reads,
            'above_30': above_30,
            'percentage': mean_read_quality_percentage,
        }
        if type == "raw":
            if 90 < mean_read_quality_percentage:
                mean_read_quality['message'] = "pass"
                self._qc_pass_count = self._qc_pass_count + 1
            else:
                mean_read_quality['message'] = "warn"

        # "base:mean:median" per base position. The cleaned header cell is
        # "Base" (the raw "#Base" with its marker stripped), so that is the
        # row to skip.
        fastqc_per_base_quality = {}
        for row in fq1.clean_data('Per base sequence quality'):
            if row[0] != "Base":
                fastqc_per_base_quality[row[0]] = "{0}:{1}:{2}".format(
                    row[0], row[1], row[2])

        fastqc_summary = {}
        for row in fq1.clean_data('Basic Statistics'):
            if row[0] != "Measure":
                fastqc_summary[row[0]] = row[1]

        # summary() rows are [module name, status]. Key by module name so
        # that modules sharing a status do not overwrite each other, and skip
        # the ['Module Name', 'Status'] header row.
        fastqc_pass = {}
        for row in fq1.summary():
            if row[0] != "Module Name":
                fastqc_pass[row[0]] = row[1]

        fastqc = {}
        fastqc['mean_read_quality'] = mean_read_quality
        # NOTE(review): these two entries have always been returned empty;
        # the per-read data was collected but never attached. Kept empty so
        # the returned shape is unchanged - confirm whether consumers expect
        # data here.
        fastqc['fastqc_sequence_length_distribution'] = {}
        fastqc['per_sequence_quality_score'] = {}
        fastqc['per_base_quality'] = fastqc_per_base_quality
        fastqc['summary'] = fastqc_summary
        fastqc['pass'] = fastqc_pass
        fastqc['fastq_file_name'] = "%s-%s" % (r1, r2)
        return fastqc

    @staticmethod
    def _write_module_rows(path, rows, marker_prefixes):
        """Write *rows* to *path*, one per line, skipping marker lines.

        A marker line is any row starting with one of *marker_prefixes*
        (a tuple of strings): the module header, the column header and the
        end-of-module line.
        """
        # The context manager closes the file even if a write fails.
        with open(path, "w") as handle:
            for row in rows:
                if not row.startswith(marker_prefixes):
                    handle.write(row)
                    handle.write("\n")

    @staticmethod
    def _read_quality_totals(rows):
        """Return (total reads, reads with quality >= 30) for cleaned rows.

        *rows* are cleaned 'Per sequence quality scores' rows of the form
        [quality, count]; the "Quality" header row is ignored.
        """
        total = 0
        above_30 = 0
        for row in rows:
            if row[0] == "Quality":
                continue
            count = float(row[1])
            total = total + count
            if int(row[0]) >= 30:
                above_30 = above_30 + count
        return total, above_30