def writeReport(combined, uncombined_R1, out_report):
    """
    @summary: Produces the JSON report describing the outcome of the pairs combination.
    @param combined: [str] Path to the fastq file holding the combined reads.
    @param uncombined_R1: [str] Path to the fastq file holding the R1 of the pairs left uncombined.
    @param out_report: [str] Path to the JSON report to create.
    """
    import json
    from anacore.sequenceIO import FastqIO

    # Count the combined reads and build their length distribution
    nb_combined = 0
    count_by_length = dict()
    with FastqIO(combined) as reader:
        for read in reader:
            nb_combined += 1
            read_len = len(read.string)
            count_by_length[read_len] = count_by_length.get(read_len, 0) + 1

    # Count the pairs that were not combined (one R1 per pair)
    nb_uncombined = 0
    with FastqIO(uncombined_R1) as reader:
        for _ in reader:
            nb_uncombined += 1

    # Serialise the report; sort_keys makes the key order deterministic
    report = {
        "nb_combined_pairs": nb_combined,
        "nb_uncombined_pairs": nb_uncombined,
        "nb_by_lengths": count_by_length
    }
    with open(out_report, "w") as writer:
        json.dump(report, writer, sort_keys=True)
def pickSequences(in_path, out_path, retained_ids):
    """
    Copy to out_path the records of in_path whose ID is listed in retained_ids.

    :param in_path: Path to the sequences file to filter (opened with FastqIO).
    :param out_path: Path to the file receiving the retained records (opened with FastqIO in "w" mode).
    :param retained_ids: Iterable of the record IDs to keep.
    """
    # A set gives constant-time membership tests on the IDs
    wanted_ids = set(retained_ids)
    with FastqIO(out_path, "w") as writer:
        with FastqIO(in_path) as reader:
            for seq in reader:
                if seq.id not in wanted_ids:
                    continue
                writer.write(seq)
def testNbSeqAndNt(self):
    # The plain and the gzipped fixtures must yield the same counts:
    # 4 sequences and 41 nucleotids.
    for fixture_path in (self.tmp_seq, self.tmp_seq_gz):
        counts = FastqIO.nbSeqAndNt(fixture_path)
        observed_nb_seq, observed_nb_nt = counts
        self.assertEqual(observed_nb_seq, 4)
        self.assertEqual(observed_nb_nt, 41)
def writeReport(combined, R1, out_report):
    """
    Produce the JSON report describing the outcome of the pairs combination.

    The number of uncombined pairs is derived as the number of records in R1
    minus the number of records in the combined file.

    :param combined: Path to the file containing the combined reads (format: fastq).
    :type combined: str
    :param R1: Path to the initial R1 file (format: fastq).
    :type R1: str
    :param out_report: Path to the outputted report file (format: json).
    :type out_report: str
    """
    # Count the combined reads and build their length distribution
    nb_combined = 0
    count_by_length = dict()
    with FastqIO(combined) as reader:
        for read in reader:
            nb_combined += 1
            read_len = len(read.string)
            count_by_length[read_len] = count_by_length.get(read_len, 0) + 1

    # Every record of the initial R1 stands for one pair
    with FastqIO(R1) as reader:
        nb_total_pairs = sum(1 for _ in reader)

    # Serialise the report; sort_keys makes the key order deterministic
    report = {
        "nb_combined_pairs": nb_combined,
        "nb_uncombined_pairs": nb_total_pairs - nb_combined,
        "nb_by_length": count_by_length
    }
    with open(out_report, "w") as writer:
        json.dump(report, writer, sort_keys=True)
def get_seq_ids(fastq_path):
    """
    Return the IDs of all the records of a sequences file, in file order.

    :param fastq_path: Path to the sequences file (opened with FastqIO).
    :return: The list of record IDs.
    :rtype: list
    """
    ids = list()
    FH = FastqIO(fastq_path)
    # try/finally guarantees the handle is released even when parsing fails.
    # NOTE(review): the `with FastqIO(...)` form was commented out in the
    # previous version of this function, so context-manager support is not
    # assumed here -- switch to `with` once confirmed.
    try:
        for record in FH:
            ids.append(record.id)
    finally:
        FH.close()
    return ids
def testQualOffset(self):
    # Check FastqIO.qualOffset() on three fixtures written to self.tmp_out:
    # two are expected to be reported with an offset of 33 and one with an
    # offset of 64.

    # Illumina 1.8 with under 59
    content = """@M70265:234:000000000-CCC3N:1:1101:21165:1697 1:N:0:13
ATGTCCTTGTGCACAATGCCCTGGCTATGCAGGTACTCCAGGCCGTCAATCAGCTGACAGAAGTACCTGCGGGCAGCACACACCCGTCCTGGGGCCGAGGCCTCCCTGCCCCTCTCAGGGGCGAATTTCGACGATCGTTGCATTAACTCGC
+
-A-A@EFF9E,C9C,,C,CEEF,,6C9E,@,,,C<EEEE,,,:B7@:,,CC,,CE,,;,,,,,<9@CE,C+++@+,@,C,C,B@>+BBFE,,,+87++++8ABA=FE,B?BFDC==,,,,+6+++@BFD,+8++>>7@D,<,@@,,@7>*>
@M70265:234:000000000-CCC3N:1:1101:14142:1764 1:N:0:13
TGTCAATCAATATCAGGACAAGCAGTGTGTCCTCACGGAAAGGAGCCTGCCCTGCCTGGCCCCCGGCCCCCGCCCCACCCTGGCCCCTGCCCCGCGCACCCACCCGTTGGCCTTGCCCCCTCGGAAACGCTTCTCCCGCACCCTTGCGAAT
+
B<CB9-CF9,F9FDC,,,C8<,C8,C,C,,<CEE9C,,+,6,<,<CBF@<@EE,CFD,,@ADCF7::C@B@+@>CC,:FFC,,4CDC<,,CED+@+>+6+8+?CC+8,+,4:>,,:,:+83++++++8+83>:,33,+3+5*68,,,**1*
@M70265:234:000000000-CCC3N:1:1101:9715:1775 1:N:0:13
TCCAGGGCTTTTGTCTTCTTCCCTTTAGATTCTCTTCTTCTGTACTGCCTGTGCTTTTGCATTCTCTACACTCATCTGTGCCACCGTTTGGAAAGCTAGTGGTTCAGAGTTCTATATATTCTCGAATTTCGCCGATCGTTTCATTAACTCT
+
-8A----8FFGG,E@E@EEF<@6CFF9,,<,;C6C,6CE@C,C,CF,@C,,;,,,<,;,;,,,6,;C,,,6;;,<<E,,,6C,C<+CBA,,,,,,6C,,:,,CC@,,,,<E@F,C,,,<EA,C,,,9?E,,,8++8>+BE+559E,,5=E,"""
    with open(self.tmp_out, "w") as FH_out:
        FH_out.write(content)
    # Expected offset for this fixture: 33
    self.assertEqual(FastqIO.qualOffset(self.tmp_out), 33)
    # Illumina 1.8
    content = """@M70265:234:000000000-CCC3N:1:1102:19767:8584 1:N:0:35
TCATGACTGATATGGTAGACAGAGCCTAAACATCCCCTTAAATTGGATTAAAAAGAAATATACCTTTGTTGTTACCTTTAAATGCAAAGTTAAAATAGGCAGAAGTCTTGCCCACATCGTTGTAGGCCTTACATTCAACCGGCGAATTTCG
+
CCCCCFGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGFFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFFGGGGGGGGGGGGFGGGGCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGEGGGGFGF
@M70265:234:000000000-CCC3N:1:1102:17014:8587 1:N:0:35
TCCATAACTTCTTGCTAAGTCCTGAGCCTGTTTTGTGTCTACTGTTCTAGAAGGCAAATCACATTTATTTCCTACTAGGACCACAGGTACATCTTCAGAGTCCTTAACTCTTTTAATTTGTTCTCTGGGAAAGAGCGAATTTCGACGATCG
+
CCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
@M70265:234:000000000-CCC3N:1:1102:16174:8588 1:N:0:35
CTTGAGTGAAGGACTGAGAAAATCCCTGTTCCCACTCATACAGGACTTGGGAGGTATCCACATCCTCTTCCTCAGGATTGCCTTTACCACTCTGAGAAGGAGCTGTGGTAGTGGCACCAGAATGGATTCCAGAGTCCAGGTAAGACTGCGC
+
CCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"""
    with open(self.tmp_out, "w") as FH_out:
        FH_out.write(content)
    # Expected offset for this fixture: 33
    self.assertEqual(FastqIO.qualOffset(self.tmp_out), 33)
    # Solexa
    content = """@SRR1296011.1 1 length=107
CGGCAAGTTAACAAAAAGAAAAATGGTGAATGATACCCGGTGCTGGCAATCTCGTTTAAACTACATGCAGGAACAGCAAAGGAAATCCGGCAAATTTGCGCAGTCAT
+SRR1296011.1 1 length=107
dddddaa]aafffc`c_ccc`cccf_^cddf_fcddd`ddWdd^a]daadf[fdcffaafcffcfcff]fcfffW^I^a^^KZdaffc_cWbc[cN[[X^]`a``ca
@SRR1296011.2 2 length=107
AAATTTGCCGGATTTCCTTTGCTGTTCCTGCATGTAGTTTAAACGAGATTGCCAGCACCGGGTATCATTCACCATTTTTCTTTTTGTTAACTTGCCGTCAGCCTTTT
+SRR1296011.2 2 length=107
gggggggaggggggggggfgfgggggggcgggggggc_geggfggggggggggaggggggggggdggffggfgggaggeegcgggeggggeffgac]dbcaggeab_
@SRR1296011.3 3 length=107
CTTTCTGTTCATGTGTATCTGCTGTCTCTTAGCCCAGACTTCCCGTGTCCTTTCCACTGGGCCTTTGGGAGGTCACAGGGTCTTGATGCTGTGGTCTTGATCTGCAG
+SRR1296011.3 3 length=107
fffdffgggggc_aggaggggfe_afffffgggggfgggggggggddgge_aWdaggggg]]cfffffedfeUeaacff_Wcfcc`bb]d__b^Zacaa[]\```_b"""
    with open(self.tmp_out, "w") as FH_out:
        FH_out.write(content)
    # Expected offset for this fixture: 64
    self.assertEqual(FastqIO.qualOffset(self.tmp_out), 64)
def testIter(self):
    # Direct iteration on the plain then on the gzipped fixture: every record
    # must match the expected one and 4 records must have been seen.
    for fixture_path in (self.tmp_seq, self.tmp_seq_gz):
        with FastqIO(fixture_path) as reader:
            for idx, observed in enumerate(reader):
                expected = self.expected_rec[idx]
                self.assertTrue(cmpSequences(observed, expected))
        self.assertEqual(idx + 1, 4)
    # Iteration through zip(): the reader must behave like any other iterable
    with FastqIO(self.tmp_seq) as reader:
        nb_compared = 0
        for expected, observed in zip(self.expected_rec, reader):
            self.assertTrue(cmpSequences(observed, expected))
            nb_compared += 1
    self.assertEqual(nb_compared, 4)
def testIsValid(self):
    def validity_of(text):
        # Write text to the scratch file and return FastqIO's verdict on it
        with open(self.tmp_out, "w") as writer:
            writer.write(text)
        return FastqIO.isValid(self.tmp_out)

    # Valid: reference fixtures (plain and gzipped)
    for fixture_path in (self.tmp_seq, self.tmp_seq_gz):
        self.assertTrue(FastqIO.isValid(fixture_path))
    # Valid: file with eleven records
    eleven_records = "@seq1\nATGC\n+\n####\n@seq2\nATGC\n+\n####\n@seq3\nATGC\n+\n####\n@seq4\nATGC\n+\n####\n@seq5\nATGC\n+\n####\n@seq6\nATGC\n+\n####\n@seq7\nATGC\n+\n####\n@seq8\nATGC\n+\n####\n@seq9\nATGC\n+\n####\n@seq10\nATGC\n+\n####\n@seq11\nATGC\n+\n####\n"
    self.assertTrue(validity_of(eleven_records))
    # Valid: empty file
    self.assertTrue(validity_of(""))
    # Valid: one record with an empty sequence
    self.assertTrue(validity_of("@seq1\n\n+\n\n"))
    # Invalid: fasta content
    self.assertTrue(not validity_of(">seq1\nATGC\n>seq2\nATGC"))
    # Invalid: the sequence line contains digits
    self.assertTrue(not validity_of("@seq1\nAT1GC2\n+\n######"))
def getCountByBarcode(in_seq):
    """
    Return the number of reads by barcode in the fastq file(s).

    The barcode of each read is the "barcode" entry returned by
    getInfFromSeqDesc() for the record description.

    :param in_seq: The path(s) to the sequences file(s) (format: fastq). A single path given as a string is also accepted.
    :type in_seq: list or str
    :return: The number of reads by barcode.
    :rtype: dict
    """
    # A lone string would otherwise be iterated character by character
    if isinstance(in_seq, str):
        in_seq = [in_seq]
    count_by_barcode = dict()
    for curr_seq in in_seq:
        with FastqIO(curr_seq) as FH_in:
            for record in FH_in:
                barcode = getInfFromSeqDesc(record.description)["barcode"]
                count_by_barcode[barcode] = count_by_barcode.get(barcode, 0) + 1
    return count_by_barcode
librairies = getLibFromDataFolder(raw_folder) status_by_spl = getStatus(annotation_path, [lib["spl_name"] for lib in librairies]) for lib in librairies: lib["status"] = status_by_spl[lib["spl_name"]] loci = set( locus.name for locus in getAreas(os.path.join(design_folder, "targets.bed"))) # Get nb nt log.info("Get the number of nucleotids by sample") for spl in librairies: spl["nb_nt"] = 0 spl["nb_reads"] = 0 for fastq in [spl["R1"], spl["R2"]]: with FastqIO(fastq) as FH_in: for rec in FH_in: spl["nb_nt"] += len(rec.string) spl["nb_reads"] += 1 # Process assessment app_config = os.path.join(APP_FOLDER, "jflow", "application.properties") app_config_bck = app_config + ".bck" shutil.copyfile(app_config, app_config_bck) try: is_first = True for nb_spl in args.nb_samples: # [1, 50, 100] for eval_idx in range(args.nb_tests): # 10 log.info("Create datasets for test #{} on {} samples".format( eval_idx + 1, nb_spl)) samples = shuffle(librairies,
help="The maximun quality. [Default: No maximun]") group_input = parser.add_argument_group('Inputs') # Inputs group_input.add_argument( '-i', '--input-file', required=True, help='Path to the sequences file (format: fastq).') group_output = parser.add_argument_group('Outputs') # Outputs group_output.add_argument('-o', '--output-file', required=True, help='Path to the output (format: fastq).') args = parser.parse_args() # Process old_offset = args.old_offset if args.old_offset is not None else FastqIO.qualOffset( args.input_file) if old_offset is None: raise Exception( "The quality offset in {} cannot be determined.".format( args.input_file)) offset_modifier = args.new_offset - old_offset with FastqIO(args.output_file, "w") as FH_out: with FastqIO(args.input_file) as FH_in: for record in FH_in: new_qual = "" for curr_qual in record.quality: new_qual_numer = ord(curr_qual) + offset_modifier if args.min_qual is not None: new_qual_numer = max(args.new_offset + args.min_qual, new_qual_numer) if args.max_qual is not None:
'--output-reads-2', help='The path to the outputted reads file R2 (format: FASTQ).') args = parser.parse_args() # Logger logging.basicConfig( format= '%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] -- %(message)s' ) log = logging.getLogger(os.path.basename(__file__)) log.setLevel(logging.INFO) log.info("Command: " + " ".join(sys.argv)) # Process if args.output_reads_2: # Write all reads in a pair of files (R1 and R2) with FastqIO(args.output_reads_2, "w") as writer_r2: with FastqIO(args.output_reads, "w") as writer_r1: with AlignmentFile(args.input_aln, "rb", check_sq=False) as reader: for curr_read in reader.fetch(until_eof=True): if not curr_read.is_secondary and not curr_read.is_supplementary: if args.keep_qc_failed or not curr_read.is_qcfail: barcode = args.reads_barcode if barcode is None and curr_read.has_tag( args.barcode_tag): barcode = curr_read.get_tag( args.barcode_tag).replace("-", "+") description = "{}:{}:0:{} {}={}".format( "1" if curr_read.is_read1 else "2", "Y" if curr_read.is_qcfail else "N", "" if barcode is None else barcode,
def testNbSeq(self):
    # The plain fixture holds 4 sequences
    self.assertEqual(FastqIO.nbSeq(self.tmp_seq), 4)
def testWrite(self):
    # Write the expected records with FastqIO ...
    with FastqIO(self.tmp_out, "w") as writer:
        for expected in self.expected_rec:
            writer.write(expected)
    # ... then check the result is valid and identical to the reference file
    is_valid = FastqIO.isValid(self.tmp_out)
    self.assertTrue(is_valid)
    same_as_reference = filecmp.cmp(self.tmp_out, self.tmp_seq)
    self.assertTrue(same_as_reference)
def get_seq_ids(fastq_path):
    """
    Return the IDs of all the records of a sequences file, in file order.

    :param fastq_path: Path to the sequences file (opened with FastqIO).
    :return: The list of record IDs.
    :rtype: list
    """
    with FastqIO(fastq_path) as reader:
        return [seq.id for seq in reader]
def process(args, log):
    """
    Combine R1 and R2 by their overlapping segment.

    For each pair, R2 is reverse-complemented and slid along R1. The shift
    with the highest number of matching positions whose mismatch ratio is
    <= args.max_contradict_ratio is retained. The consensus takes, at each
    overlapping position, the nucleotid with the highest quality character.
    Pairs without an acceptable overlap, or whose fragment length falls
    outside [args.min_frag_length, args.max_frag_length], are not written.

    :param args: The namespace extract from the script arguments. The attributes read are: input_R1, input_R2, output_combined, output_report, min_overlap, max_contradict_ratio, min_frag_length and max_frag_length.
    :type args: Namespace
    :param log: The logger of the script.
    :type log: logging.Logger
    """
    nb_pairs = 0
    combined = 0
    with FastqIO(args.output_combined, "w") as FH_combined:
        with FastqIO(args.input_R1) as FH_r1:
            with FastqIO(args.input_R2) as FH_r2:
                for R1 in FH_r1:
                    R2 = seqRevCom(FH_r2.nextSeq())
                    nb_pairs += 1
                    best_overlap = None
                    max_nb_support = -1
                    R1_len = len(R1.string)
                    R2_len = len(R2.string)
                    # First shift: the start of R1 faces the last min_overlap nt of R2
                    R1_start = 0
                    R2_start = R2_len - args.min_overlap
                    is_valid = R1_len >= args.min_overlap and R2_len >= args.min_overlap
                    can_be_better = True
                    while is_valid and can_be_better:  # For each shift
                        curr_overlap_len = min(R1_len - R1_start, R2_len - R2_start)
                        if best_overlap is not None and R1_start != 0 and curr_overlap_len < best_overlap["nb_support"]:
                            # R1 is first and overlap become lower than nb support:
                            # the remaining (shorter) shifts cannot beat the best
                            can_be_better = False
                        else:
                            # Evaluate overlap
                            R1_ov_s = R1.string[R1_start:R1_start + curr_overlap_len]
                            R2_ov_s = R2.string[R2_start:R2_start + curr_overlap_len]
                            nb_support = sum(
                                1 for nt_R1, nt_R2 in zip(R1_ov_s, R2_ov_s) if nt_R1 == nt_R2
                            )
                            nb_contradict = curr_overlap_len - nb_support
                            # Filter consensus and select the best. An empty overlap
                            # (possible when min_overlap is 0) is skipped to avoid a
                            # division by zero.
                            if curr_overlap_len > 0 and nb_support >= max_nb_support:
                                if float(nb_contradict) / curr_overlap_len <= args.max_contradict_ratio:
                                    max_nb_support = nb_support
                                    best_overlap = {
                                        "nb_support": nb_support,
                                        "nb_contradict": nb_contradict,
                                        "R1_start": R1_start,
                                        "R2_start": R2_start,
                                        "length": curr_overlap_len
                                    }
                            # Next shift: first move the start on R2 down to 0,
                            # then move the start on R1 forward
                            if R1_start == 0:
                                if R2_start == 0:
                                    R1_start = 1
                                else:
                                    R2_start -= 1
                            else:
                                R1_start += 1
                            if R1_len - R1_start < args.min_overlap:
                                is_valid = False
                    if best_overlap is not None:  # Current pair has valid combination
                        # Filter fragment on length
                        valid_frag_len = True
                        if args.max_frag_length is not None or args.min_frag_length is not None:
                            # The length comes from the selected overlap, not from
                            # the last shift evaluated by the loop above
                            curr_frag_len = best_overlap["length"]
                            if best_overlap["R1_start"] != 0:  # R1 is first
                                curr_frag_len = R1_len + R2_len - best_overlap["length"]
                            # Both bounds must hold: a failed minimum must not be
                            # overwritten by a satisfied maximum
                            if args.min_frag_length is not None and curr_frag_len < args.min_frag_length:
                                valid_frag_len = False
                            if args.max_frag_length is not None and curr_frag_len > args.max_frag_length:
                                valid_frag_len = False
                        # Write combined sequence
                        if valid_frag_len:
                            combined += 1
                            ov_R1_start = best_overlap["R1_start"]
                            ov_R2_start = best_overlap["R2_start"]
                            ov_len = best_overlap["length"]
                            R1_ov_s = R1.string[ov_R1_start:ov_R1_start + ov_len]
                            R1_ov_q = R1.quality[ov_R1_start:ov_R1_start + ov_len]
                            R2_ov_s = R2.string[ov_R2_start:ov_R2_start + ov_len]
                            R2_ov_q = R2.quality[ov_R2_start:ov_R2_start + ov_len]
                            seq_parts = []
                            qual_parts = []
                            for nt_R1, qual_R1, nt_R2, qual_R2 in zip(R1_ov_s, R1_ov_q, R2_ov_s, R2_ov_q):  # For each nt in overlap
                                if nt_R1 == nt_R2:
                                    seq_parts.append(nt_R1)
                                    qual_parts.append(max(qual_R1, qual_R2))
                                elif qual_R1 >= qual_R2:
                                    seq_parts.append(nt_R1)
                                    qual_parts.append(qual_R1)
                                else:
                                    seq_parts.append(nt_R2)
                                    qual_parts.append(qual_R2)
                            complete_seq = "".join(seq_parts)
                            complete_qual = "".join(qual_parts)
                            if ov_R1_start > 0:  # If R1 start before R2 (insert size > read length)
                                complete_seq = R1.string[0:ov_R1_start] + complete_seq + R2.string[ov_len:]
                                complete_qual = R1.quality[0:ov_R1_start] + complete_qual + R2.quality[ov_len:]
                            consensus_record = Sequence(
                                R1.id,
                                complete_seq,
                                "Support_ratio:{}/{};R1_start:{};R2_start:{}".format(
                                    best_overlap["nb_support"],
                                    ov_len,
                                    ov_R1_start,
                                    ov_R2_start
                                ),
                                complete_qual
                            )
                            FH_combined.write(consensus_record)
    # Log
    log.info("Nb pair: {} ; Nb combined: {} ({}%)".format(
        nb_pairs,
        combined,
        (0 if nb_pairs == 0 else round(float(combined * 100) / nb_pairs, 2))
    ))
    # The report is built once the combined file has been closed
    if args.output_report is not None:
        writeReport(args.output_combined, args.input_R1, args.output_report)