class TestFadapa(unittest.TestCase):
    """Smoke tests for ``Fadapa`` run against the bundled FastQC fixture."""

    def setUp(self):
        # unittest calls setUp before every test, so each test gets its own
        # freshly parsed report.
        self.p_data = Fadapa('tests/fastqc_data.txt')

    def test_summary(self):
        """The first summary row is the column header."""
        summary = self.p_data.summary()
        self.assertEqual(summary[0], ['Module Name', 'Status'])

    def test_content(self):
        """``content()`` writes the report to stdout, starting at ##FastQC."""
        captured = StringIO()
        saved_stdout = sys.stdout
        sys.stdout = captured
        try:
            self.p_data.content()
        finally:
            # Always hand the real stream back; otherwise every later test
            # (and the test runner's own output) would write into the buffer.
            sys.stdout = saved_stdout
        self.assertEqual(captured.getvalue()[:8], '##FastQC')

    def test_raw_data(self):
        """Raw module data ends with the module terminator line."""
        data = self.p_data.raw_data('Basic Statistics')
        self.assertEqual(data[-1], '>>END_MODULE')

    def test_cleaned_data(self):
        """Cleaned module data starts with the header row, '#' removed."""
        data = self.p_data.clean_data('Basic Statistics')
        self.assertEqual(data[0][0], 'Measure')
### Parses FastQC output to find overrepresented sequences to feed into
### cutadapt with a capital -A option.

# imports
from fadapa import Fadapa
import sys

# Take the argument from the bash script (which will be $j - the UID of the cell)
name = sys.argv[1]

# Load the file into the fadapa parser
f = Fadapa('/home/graham/Downloads/trial/' + name + '_fastqc/fastqc_data.txt')

# First raw line of the module; it carries the module name and its status.
pass_seq = f.raw_data('Overrepresented sequences')[0]

# If there are no overrepresented sequences, the clean parser breaks!
# Therefore .clean_data is only consulted when the module did not pass.
# The header is compared token by token so the check does not depend on
# which whitespace character separates the module name from the status
# (NOTE(review): FastQC presumably uses a tab here - confirm).
if pass_seq.split() != ">>Overrepresented sequences pass".split():
    # First column of every cleaned row. The first entry will be the column
    # header ("Sequence"), subsequent entries are the actual sequences.
    list_of_seqs = [
        data[0] for data in f.clean_data('Overrepresented sequences')
    ]
else:
    list_of_seqs = []

# Create empty output string
output = ""

# Loop through the list of sequences from index 1 onwards (as index 0 will be
# the "Sequence" header)
# Reads the "Overrepresented sequences" module of a FastQC report and prints
# the sequences as cutadapt adapter options (-A<sequence>), separated by
# spaces. Nothing but an empty line is printed when the module passed.
from fadapa import Fadapa
import sys

# input: name of the folder that holds fastqc_data.txt
file_one = sys.argv[1]
f = Fadapa('/home/rsk17/Group_project/1_Original_pipeline/' + file_one + '/fastqc_data.txt')

# First raw line of the module; it carries the module name and its status.
good_seq = f.raw_data("Overrepresented sequences")[0]

# The cleaned table is only read when the module did NOT pass. The header is
# compared token by token so the check does not depend on which whitespace
# character separates the module name from the status
# (NOTE(review): FastQC presumably uses a tab here - confirm).
if good_seq.split() != ">>Overrepresented sequences pass".split():
    seq_list = [data[0] for data in f.clean_data('Overrepresented sequences')]
else:
    seq_list = []

# seq_list[0] is the column header, so only seq_list[1:] are sequences.
# Each option is kept as a single "-A<sequence>" token and the tokens are
# separated by spaces; without a separator several sequences would fuse into
# one unusable token.
output = " ".join("-A" + items for items in seq_list[1:])
print(output)
def fastqc_result(self, r1, r2, sample, output, type):
    """Collect FastQC metrics for a read pair and write report tables.

    Four text tables are written under ``<output>/data/stat/`` (per-sequence
    quality scores for both reads, per-base quality and sequence length
    distribution for read 1). Afterwards the parsed metrics are returned.

    Args:
        r1: Argument handed to ``Fadapa`` for read 1 (presumably the path of
            its FastQC data file -- confirm with the caller).
        r2: Same as ``r1``, for read 2.
        sample: Sample name used as prefix of the written file names.
        output: Base directory; the tables go to ``<output>/data/stat``.
        type: Stage label. Only ``"raw"`` adds a ``message`` entry to the
            mean read quality and can increment ``self._qc_pass_count``.

    Returns:
        dict with the keys ``mean_read_quality``,
        ``fastqc_sequence_length_distribution``,
        ``per_sequence_quality_score``, ``per_base_quality``, ``summary``,
        ``pass`` (module name -> status) and ``fastq_file_name``.
    """
    fq1 = Fadapa(r1)
    fq2 = Fadapa(r2)

    def write_module_rows(report, module, path_template, marker_prefixes):
        # Copy the data lines of one module to a file, skipping marker lines
        # (module header, column header, module terminator).
        marker_count = 0
        with open(path_template % (output, sample), "w") as handle:
            for line in report.raw_data(module):
                # Stop once more than three marker lines have been skipped.
                if marker_count > 3:
                    break
                if line.startswith(marker_prefixes):
                    marker_count += 1
                else:
                    handle.write(line)
                    handle.write("\n")

    def tally_quality(report):
        # Return (score -> count mapping, total reads, reads with score >= 30)
        # for the 'Per sequence quality scores' module of one report.
        scores = {}
        total = 0
        above_30 = 0
        for row in report.clean_data('Per sequence quality scores'):
            if row[0] == "Quality":
                continue  # column header
            scores[row[0]] = row[1]
            reads = float(row[1])
            total = total + reads
            if int(row[0]) >= 30:
                above_30 = above_30 + reads
        return scores, total, above_30

    # fastqc per sequence quality scores for report (R1, then R2)
    write_module_rows(fq1, 'Per sequence quality scores',
                      "%s/data/stat/%s.fastq_quality_score_r1.txt",
                      ('>>Per', '#Qual', '>>END'))
    write_module_rows(fq2, 'Per sequence quality scores',
                      "%s/data/stat/%s.fastq_quality_score_r2.txt",
                      ('>>Per', '#Qual', '>>END'))
    # fastqc per base sequence quality for report
    write_module_rows(fq1, 'Per base sequence quality',
                      "%s/data/stat/%s.base_sequence_quality.txt",
                      ('>>Per', '#Base', '>>END'))
    # fastqc sequence length distribution for report
    write_module_rows(fq1, 'Sequence Length Distribution',
                      "%s/data/stat/%s.sequence_length_distribution.txt",
                      ('>>Seq', '#', '>>END'))

    # fastqc parse for json
    # NOTE(review): the per-read length distributions and per-read quality
    # score mappings are parsed, but the result only carries the two combined
    # dicts below, which are never filled -- confirm what consumers expect.
    fastqc_sequence_length_distribution = {}
    fastqc_per_sequence_quality = {}

    fastqc_sequence_length_distribution_r1 = {}
    for row in fq1.clean_data('Sequence Length Distribution'):
        if row[0] != "Length":
            fastqc_sequence_length_distribution_r1[row[0]] = row[1]
    fastqc_sequence_length_distribution_r2 = {}
    for row in fq2.clean_data('Sequence Length Distribution'):
        if row[0] != "Length":
            fastqc_sequence_length_distribution_r2[row[0]] = row[1]

    (fastqc_per_sequence_quality_r1, total_reads_r1,
     above_30_r1) = tally_quality(fq1)
    (fastqc_per_sequence_quality_r2, total_reads_r2,
     above_30_r2) = tally_quality(fq2)

    # mean read quality (percentage of reads with mean Phred quality >= 30)
    total_reads = total_reads_r1 + total_reads_r2
    above_30 = above_30_r1 + above_30_r2
    if total_reads:
        mean_read_quality_percentage = above_30 / total_reads * 100
    else:
        # Empty quality tables: report 0% instead of dividing by zero.
        mean_read_quality_percentage = 0.0

    mean_read_quality = {}
    mean_read_quality['total_read'] = total_reads
    mean_read_quality['above_30'] = above_30
    mean_read_quality['percentage'] = mean_read_quality_percentage
    if type == "raw":
        if 90 < mean_read_quality_percentage:
            mean_read_quality['message'] = "pass"
            self._qc_pass_count = self._qc_pass_count + 1
        else:
            mean_read_quality['message'] = "warn"

    # base -> "base:mean:median"
    fastqc_per_base_quality = {}
    for row in fq1.clean_data('Per base sequence quality'):
        # Skip the column header. The cleaned header of this module is
        # "Base"; "base_Base" is kept as an accepted spelling as well.
        if row[0] not in ("Base", "base_Base"):
            fastqc_per_base_quality[row[0]] = "{0}:{1}:{2}".format(
                row[0], row[1], row[2])

    fastqc_summary = {}
    for row in fq1.clean_data('Basic Statistics'):
        if row[0] != "Measure":
            fastqc_summary[row[0]] = row[1]

    # summary() rows are [module name, status]; the first row is the header.
    # Key by module name so every module keeps its own status.
    fastqc_pass = {}
    for row in fq1.summary():
        if row[0] != "Module Name":
            fastqc_pass[row[0]] = row[1]

    fastqc = {}
    fastqc['mean_read_quality'] = mean_read_quality
    fastqc['fastqc_sequence_length_distribution'] = (
        fastqc_sequence_length_distribution)
    fastqc['per_sequence_quality_score'] = fastqc_per_sequence_quality
    fastqc['per_base_quality'] = fastqc_per_base_quality
    fastqc['summary'] = fastqc_summary
    fastqc['pass'] = fastqc_pass
    fastqc['fastq_file_name'] = "%s-%s" % (r1, r2)
    return fastqc