Exemplo n.º 1
0
def writeReport(combined, uncombined_R1, out_report):
    """
    @summary: Counts the combined and uncombined pairs and writes them, with the lengths distribution of the combined reads, in a JSON report.
    @param combined: [str] Path to the file containing the combined reads (format: fastq).
    @param uncombined_R1: [str] Path to the file containing the uncombined R1 (format: fastq).
    @param out_report: [str] Path to the outputted report file (format: json).
    """
    import json
    from anacore.sequenceIO import FastqIO
    # Combined reads: number of records and number of records by length
    nb_combined = 0
    nb_by_lengths = dict()
    with FastqIO(combined) as reader:
        for read in reader:
            nb_combined += 1
            read_len = len(read.string)
            nb_by_lengths[read_len] = nb_by_lengths.get(read_len, 0) + 1
    # Uncombined pairs: one R1 record by pair
    nb_uncombined = 0
    with FastqIO(uncombined_R1) as reader:
        for _ in reader:
            nb_uncombined += 1
    # The keys are sorted at dump so the insertion order does not matter
    report = {
        "nb_by_lengths": nb_by_lengths,
        "nb_combined_pairs": nb_combined,
        "nb_uncombined_pairs": nb_uncombined
    }
    with open(out_report, "w") as writer:
        json.dump(report, writer, sort_keys=True)
Exemplo n.º 2
0
def pickSequences(in_path, out_path, retained_ids):
    """
    Write in out_path the records of in_path whose ID is in retained_ids.

    :param in_path: Path to the input sequences file (read with FastqIO).
    :param out_path: Path to the output sequences file (written with FastqIO).
    :param retained_ids: Iterable of the IDs of the records to keep.
    """
    # Hashed container to get a constant time membership test by record
    kept_ids = set(retained_ids)
    with FastqIO(out_path, "w") as writer, FastqIO(in_path) as reader:
        for read in reader:
            if read.id not in kept_ids:
                continue
            writer.write(read)
Exemplo n.º 3
0
 def testNbSeqAndNt(self):
     """Check the sequences and nucleotids counts on the plain and gzipped fixtures."""
     for seq_path in (self.tmp_seq, self.tmp_seq_gz):
         counts = FastqIO.nbSeqAndNt(seq_path)
         observed_nb_seq, observed_nb_nt = counts
         self.assertEqual(observed_nb_seq, 4)
         self.assertEqual(observed_nb_nt, 41)
Exemplo n.º 4
0
def writeReport(combined, R1, out_report):
    """
    Write the JSON report of the combination: number of combined pairs, number of uncombined pairs and number of combined reads by length.

    :param combined: Path to the file containing the combined reads (format: fastq).
    :type combined: str
    :param R1: Path to the initial R1 file (format: fastq).
    :type R1: str
    :param out_report: Path to the outputted report file (format: json).
    :type out_report: str
    """
    # Combined reads: number of records and number of records by length
    nb_combined = 0
    nb_by_length = dict()
    with FastqIO(combined) as reader:
        for read in reader:
            nb_combined += 1
            read_len = len(read.string)
            nb_by_length[read_len] = nb_by_length.get(read_len, 0) + 1
    # Every initial R1 record stands for one pair
    nb_pairs = 0
    with FastqIO(R1) as reader:
        for _ in reader:
            nb_pairs += 1
    # The keys are sorted at dump so the insertion order does not matter
    report = {
        "nb_by_length": nb_by_length,
        "nb_combined_pairs": nb_combined,
        "nb_uncombined_pairs": nb_pairs - nb_combined
    }
    with open(out_report, "w") as writer:
        json.dump(report, writer, sort_keys=True)
Exemplo n.º 5
0
def get_seq_ids(fastq_path):
    """
    Return the IDs of the records of a sequences file, in iteration order.

    :param fastq_path: Path to the sequences file (opened with FastqIO).
    :type fastq_path: str
    :return: The ID of each record.
    :rtype: list
    """
    FH = FastqIO(fastq_path)
    try:
        # The handle is closed in a finally so it is released even when the
        # parsing of a record raises.
        # NOTE(review): a `with FastqIO(...)` form was previously commented
        # out here; the explicit close is kept in case this FastqIO version
        # does not implement the context manager protocol - confirm.
        return [record.id for record in FH]
    finally:
        FH.close()
Exemplo n.º 6
0
    def testQualOffset(self):
        """
        Check the offset returned by FastqIO.qualOffset() on three fixtures.

        Each fixture is written in self.tmp_out (overwriting the previous one)
        before the call. The first two fixtures are expected to give 33 and the
        last one 64.
        """
        # Illumina 1.8 where the quality strings contain characters with an
        # ASCII code under 59 (e.g. ',' and '+'): expected offset is 33
        content = """@M70265:234:000000000-CCC3N:1:1101:21165:1697 1:N:0:13
ATGTCCTTGTGCACAATGCCCTGGCTATGCAGGTACTCCAGGCCGTCAATCAGCTGACAGAAGTACCTGCGGGCAGCACACACCCGTCCTGGGGCCGAGGCCTCCCTGCCCCTCTCAGGGGCGAATTTCGACGATCGTTGCATTAACTCGC
+
-A-A@EFF9E,C9C,,C,CEEF,,6C9E,@,,,C<EEEE,,,:B7@:,,CC,,CE,,;,,,,,<9@CE,C+++@+,@,C,C,B@>+BBFE,,,+87++++8ABA=FE,B?BFDC==,,,,+6+++@BFD,+8++>>7@D,<,@@,,@7>*>
@M70265:234:000000000-CCC3N:1:1101:14142:1764 1:N:0:13
TGTCAATCAATATCAGGACAAGCAGTGTGTCCTCACGGAAAGGAGCCTGCCCTGCCTGGCCCCCGGCCCCCGCCCCACCCTGGCCCCTGCCCCGCGCACCCACCCGTTGGCCTTGCCCCCTCGGAAACGCTTCTCCCGCACCCTTGCGAAT
+
B<CB9-CF9,F9FDC,,,C8<,C8,C,C,,<CEE9C,,+,6,<,<CBF@<@EE,CFD,,@ADCF7::C@B@+@>CC,:FFC,,4CDC<,,CED+@+>+6+8+?CC+8,+,4:>,,:,:+83++++++8+83>:,33,+3+5*68,,,**1*
@M70265:234:000000000-CCC3N:1:1101:9715:1775 1:N:0:13
TCCAGGGCTTTTGTCTTCTTCCCTTTAGATTCTCTTCTTCTGTACTGCCTGTGCTTTTGCATTCTCTACACTCATCTGTGCCACCGTTTGGAAAGCTAGTGGTTCAGAGTTCTATATATTCTCGAATTTCGCCGATCGTTTCATTAACTCT
+
-8A----8FFGG,E@E@EEF<@6CFF9,,<,;C6C,6CE@C,C,CF,@C,,;,,,<,;,;,,,6,;C,,,6;;,<<E,,,6C,C<+CBA,,,,,,6C,,:,,CC@,,,,<E@F,C,,,<EA,C,,,9?E,,,8++8>+BE+559E,,5=E,"""
        with open(self.tmp_out, "w") as FH_out:
            FH_out.write(content)
        self.assertEqual(FastqIO.qualOffset(self.tmp_out), 33)
        # Illumina 1.8 where the quality strings only contain the characters
        # 'C' to 'G': expected offset is still 33
        content = """@M70265:234:000000000-CCC3N:1:1102:19767:8584 1:N:0:35
TCATGACTGATATGGTAGACAGAGCCTAAACATCCCCTTAAATTGGATTAAAAAGAAATATACCTTTGTTGTTACCTTTAAATGCAAAGTTAAAATAGGCAGAAGTCTTGCCCACATCGTTGTAGGCCTTACATTCAACCGGCGAATTTCG
+
CCCCCFGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGFFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFFGGGGGGGGGGGGFGGGGCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGEGGGGFGF
@M70265:234:000000000-CCC3N:1:1102:17014:8587 1:N:0:35
TCCATAACTTCTTGCTAAGTCCTGAGCCTGTTTTGTGTCTACTGTTCTAGAAGGCAAATCACATTTATTTCCTACTAGGACCACAGGTACATCTTCAGAGTCCTTAACTCTTTTAATTTGTTCTCTGGGAAAGAGCGAATTTCGACGATCG
+
CCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
@M70265:234:000000000-CCC3N:1:1102:16174:8588 1:N:0:35
CTTGAGTGAAGGACTGAGAAAATCCCTGTTCCCACTCATACAGGACTTGGGAGGTATCCACATCCTCTTCCTCAGGATTGCCTTTACCACTCTGAGAAGGAGCTGTGGTAGTGGCACCAGAATGGATTCCAGAGTCCAGGTAAGACTGCGC
+
CCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"""
        with open(self.tmp_out, "w") as FH_out:
            FH_out.write(content)
        self.assertEqual(FastqIO.qualOffset(self.tmp_out), 33)
        # Solexa fixture (the '+' lines repeat the record header): expected
        # offset is 64
        content = """@SRR1296011.1 1 length=107
CGGCAAGTTAACAAAAAGAAAAATGGTGAATGATACCCGGTGCTGGCAATCTCGTTTAAACTACATGCAGGAACAGCAAAGGAAATCCGGCAAATTTGCGCAGTCAT
+SRR1296011.1 1 length=107
dddddaa]aafffc`c_ccc`cccf_^cddf_fcddd`ddWdd^a]daadf[fdcffaafcffcfcff]fcfffW^I^a^^KZdaffc_cWbc[cN[[X^]`a``ca
@SRR1296011.2 2 length=107
AAATTTGCCGGATTTCCTTTGCTGTTCCTGCATGTAGTTTAAACGAGATTGCCAGCACCGGGTATCATTCACCATTTTTCTTTTTGTTAACTTGCCGTCAGCCTTTT
+SRR1296011.2 2 length=107
gggggggaggggggggggfgfgggggggcgggggggc_geggfggggggggggaggggggggggdggffggfgggaggeegcgggeggggeffgac]dbcaggeab_
@SRR1296011.3 3 length=107
CTTTCTGTTCATGTGTATCTGCTGTCTCTTAGCCCAGACTTCCCGTGTCCTTTCCACTGGGCCTTTGGGAGGTCACAGGGTCTTGATGCTGTGGTCTTGATCTGCAG
+SRR1296011.3 3 length=107
fffdffgggggc_aggaggggfe_afffffgggggfgggggggggddgge_aWdaggggg]]cfffffedfeUeaacff_Wcfcc`bb]d__b^Zacaa[]\```_b"""
        with open(self.tmp_out, "w") as FH_out:
            FH_out.write(content)
        self.assertEqual(FastqIO.qualOffset(self.tmp_out), 64)
Exemplo n.º 7
0
 def testIter(self):
     """Check that the iteration on the plain and gzipped fixtures returns the four expected records."""
     # Direct iteration: each record is compared with the expected one at the same index
     for in_path in (self.tmp_seq, self.tmp_seq_gz):
         with FastqIO(in_path) as reader:
             for idx, observed in enumerate(reader):
                 self.assertTrue(cmpSequences(observed, self.expected_rec[idx]))
             self.assertEqual(idx + 1, 4)
     # Iteration through zip(): the reader is consumed in lockstep with the expected records
     with FastqIO(self.tmp_seq) as reader:
         nb_compared = 0
         for expected, observed in zip(self.expected_rec, reader):
             self.assertTrue(cmpSequences(observed, expected))
             nb_compared += 1
         self.assertEqual(nb_compared, 4)
Exemplo n.º 8
0
 def testIsValid(self):
     """Check FastqIO.isValid() on the fixtures and on contents written in self.tmp_out."""
     # Valid fixtures (plain and gzipped)
     for seq_path in (self.tmp_seq, self.tmp_seq_gz):
         self.assertTrue(FastqIO.isValid(seq_path))
     # Contents written one after the other in the temporary file with the expected validity
     long_content = "".join(
         "@seq{}\nATGC\n+\n####\n".format(seq_idx) for seq_idx in range(1, 12))
     cases = [
         (long_content, True),  # Valid long file (seq1 to seq11)
         ("", True),  # Valid empty file
         ("@seq1\n\n+\n\n", True),  # Valid empty sequence
         (">seq1\nATGC\n>seq2\nATGC", False),  # Invalid file (fasta)
         ("@seq1\nAT1GC2\n+\n######", False)  # Invalid file (not a sequence)
     ]
     for content, expect_valid in cases:
         with open(self.tmp_out, "w") as FH_out:
             FH_out.write(content)
         is_valid = FastqIO.isValid(self.tmp_out)
         self.assertTrue(is_valid if expect_valid else not is_valid)
Exemplo n.º 9
0
def getCountByBarcode(in_seq):
    """
    Return the number of reads by barcode in the fastq files.

    The barcode of each read is the "barcode" entry returned by
    getInfFromSeqDesc() on the record description.

    :param in_seq: The paths to the sequences files (format: fastq). The argument is iterated and each element is opened with FastqIO.
    :type in_seq: list
    :return: The number of reads by barcode.
    :rtype: dict
    """
    nb_by_barcode = dict()
    for seq_path in in_seq:
        with FastqIO(seq_path) as reader:
            for read in reader:
                read_barcode = getInfFromSeqDesc(read.description)["barcode"]
                nb_by_barcode[read_barcode] = nb_by_barcode.get(read_barcode, 0) + 1
    return nb_by_barcode
Exemplo n.º 10
0
    librairies = getLibFromDataFolder(raw_folder)
    status_by_spl = getStatus(annotation_path,
                              [lib["spl_name"] for lib in librairies])
    for lib in librairies:
        lib["status"] = status_by_spl[lib["spl_name"]]
    loci = set(
        locus.name
        for locus in getAreas(os.path.join(design_folder, "targets.bed")))

    # Get nb nt
    log.info("Get the number of nucleotids by sample")
    for spl in librairies:
        spl["nb_nt"] = 0
        spl["nb_reads"] = 0
        for fastq in [spl["R1"], spl["R2"]]:
            with FastqIO(fastq) as FH_in:
                for rec in FH_in:
                    spl["nb_nt"] += len(rec.string)
                    spl["nb_reads"] += 1

    # Process assessment
    app_config = os.path.join(APP_FOLDER, "jflow", "application.properties")
    app_config_bck = app_config + ".bck"
    shutil.copyfile(app_config, app_config_bck)
    try:
        is_first = True
        for nb_spl in args.nb_samples:  # [1, 50, 100]
            for eval_idx in range(args.nb_tests):  # 10
                log.info("Create datasets for  test #{} on {} samples".format(
                    eval_idx + 1, nb_spl))
                samples = shuffle(librairies,
Exemplo n.º 11
0
                        help="The maximun quality. [Default: No maximun]")
    group_input = parser.add_argument_group('Inputs')  # Inputs
    group_input.add_argument(
        '-i',
        '--input-file',
        required=True,
        help='Path to the sequences file (format: fastq).')
    group_output = parser.add_argument_group('Outputs')  # Outputs
    group_output.add_argument('-o',
                              '--output-file',
                              required=True,
                              help='Path to the output (format: fastq).')
    args = parser.parse_args()

    # Process
    old_offset = args.old_offset if args.old_offset is not None else FastqIO.qualOffset(
        args.input_file)
    if old_offset is None:
        raise Exception(
            "The quality offset in {} cannot be determined.".format(
                args.input_file))
    offset_modifier = args.new_offset - old_offset
    with FastqIO(args.output_file, "w") as FH_out:
        with FastqIO(args.input_file) as FH_in:
            for record in FH_in:
                new_qual = ""
                for curr_qual in record.quality:
                    new_qual_numer = ord(curr_qual) + offset_modifier
                    if args.min_qual is not None:
                        new_qual_numer = max(args.new_offset + args.min_qual,
                                             new_qual_numer)
                    if args.max_qual is not None:
Exemplo n.º 12
0
        '--output-reads-2',
        help='The path to the outputted reads file R2 (format: FASTQ).')
    args = parser.parse_args()

    # Logger
    logging.basicConfig(
        format=
        '%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] -- %(message)s'
    )
    log = logging.getLogger(os.path.basename(__file__))
    log.setLevel(logging.INFO)
    log.info("Command: " + " ".join(sys.argv))

    # Process
    if args.output_reads_2:  # Write all reads in a pair of files (R1 and R2)
        with FastqIO(args.output_reads_2, "w") as writer_r2:
            with FastqIO(args.output_reads, "w") as writer_r1:
                with AlignmentFile(args.input_aln, "rb",
                                   check_sq=False) as reader:
                    for curr_read in reader.fetch(until_eof=True):
                        if not curr_read.is_secondary and not curr_read.is_supplementary:
                            if args.keep_qc_failed or not curr_read.is_qcfail:
                                barcode = args.reads_barcode
                                if barcode is None and curr_read.has_tag(
                                        args.barcode_tag):
                                    barcode = curr_read.get_tag(
                                        args.barcode_tag).replace("-", "+")
                                description = "{}:{}:0:{} {}={}".format(
                                    "1" if curr_read.is_read1 else "2",
                                    "Y" if curr_read.is_qcfail else "N",
                                    "" if barcode is None else barcode,
Exemplo n.º 13
0
 def testNbSeq(self):
     """Check that FastqIO.nbSeq() counts four sequences in the plain fixture."""
     self.assertEqual(FastqIO.nbSeq(self.tmp_seq), 4)
Exemplo n.º 14
0
 def testWrite(self):
     """Check that the expected records written with FastqIO reproduce the reference file."""
     out_path = self.tmp_out
     with FastqIO(out_path, "w") as writer:
         for record in self.expected_rec:
             writer.write(record)
     # The output must be a valid file and identical to the reference fixture
     self.assertTrue(FastqIO.isValid(out_path))
     self.assertTrue(filecmp.cmp(out_path, self.tmp_seq))
Exemplo n.º 15
0
def get_seq_ids(fastq_path):
    """
    Return the IDs of the records of a sequences file, in iteration order.

    :param fastq_path: Path to the sequences file (opened with FastqIO).
    :return: The ID of each record.
    :rtype: list
    """
    with FastqIO(fastq_path) as reader:
        return [read.id for read in reader]
Exemplo n.º 16
0
def _findBestOverlap(R1_seq, R2_seq, min_overlap, max_contradict_ratio):
    """
    Return the best overlap between R1 and the reverse complemented R2.

    The shifts are evaluated in two phases: first R1 starts at 0 and the start
    on R2 decreases from ``len(R2_seq) - min_overlap`` to 0, then R2 starts at
    0 and the start on R1 increases. An overlap is retained when its ratio of
    contradicting positions is lower than or equal to max_contradict_ratio and
    its number of supporting positions is greater than or equal to the one of
    the previously retained overlap (on tie the last evaluated shift wins).

    :param R1_seq: Nucleotids of R1.
    :type R1_seq: str
    :param R2_seq: Nucleotids of the reverse complemented R2.
    :type R2_seq: str
    :param min_overlap: Minimum length of the overlap.
    :type min_overlap: int
    :param max_contradict_ratio: Maximum ratio of contradicting positions in the overlap.
    :type max_contradict_ratio: float
    :return: None if no overlap is retained, otherwise a dict with the keys "nb_support", "nb_contradict", "R1_start", "R2_start" and "length".
    :rtype: dict
    """
    R1_len = len(R1_seq)
    R2_len = len(R2_seq)
    if R1_len < min_overlap or R2_len < min_overlap:
        return None
    best_overlap = None
    max_nb_support = -1
    R1_start = 0
    R2_start = R2_len - min_overlap
    while True:  # For each shift
        curr_overlap_len = min(R1_len - R1_start, R2_len - R2_start)
        if best_overlap is not None and R1_start != 0 and curr_overlap_len < best_overlap["nb_support"]:
            # R1 is first and the overlap only shrinks from here: no further
            # shift can reach the support of the best overlap
            break
        # Evaluate overlap
        R1_ov_s = R1_seq[R1_start:R1_start + curr_overlap_len]
        R2_ov_s = R2_seq[R2_start:R2_start + curr_overlap_len]
        nb_support = sum(1 for nt_R1, nt_R2 in zip(R1_ov_s, R2_ov_s) if nt_R1 == nt_R2)
        nb_contradict = curr_overlap_len - nb_support
        # Filter consensus and select the best (an empty overlap is skipped to
        # prevent a division by zero when min_overlap is 0)
        if curr_overlap_len > 0 and nb_support >= max_nb_support:
            if float(nb_contradict) / curr_overlap_len <= max_contradict_ratio:
                max_nb_support = nb_support
                best_overlap = {
                    "nb_support": nb_support,
                    "nb_contradict": nb_contradict,
                    "R1_start": R1_start,
                    "R2_start": R2_start,
                    "length": curr_overlap_len
                }
        # Next shift
        if R1_start == 0 and R2_start != 0:
            R2_start -= 1
        else:
            R1_start += 1
            # Checked on every increase of R1_start, including the first one
            # (0 -> 1), so an overlap shorter than min_overlap is never evaluated
            if R1_len - R1_start < min_overlap:
                break
    return best_overlap


def _mergeOnOverlap(R1, R2, best_overlap):
    """
    Return the sequence and the quality of the fragment built from the pair.

    In the overlap the nucleotid of R1 is used when the two reads agree or
    when the quality character of R1 is greater than or equal to the one of R2,
    otherwise the nucleotid of R2 is used. The retained quality is always the
    highest of the two characters. When R1 starts before R2 the consensus is
    flanked by the start of R1 and the end of R2.

    :param R1: The first read (attributes string and quality are used).
    :type R1: Sequence
    :param R2: The reverse complemented second read (attributes string and quality are used).
    :type R2: Sequence
    :param best_overlap: The retained overlap (keys "R1_start", "R2_start" and "length" are used).
    :type best_overlap: dict
    :return: The nucleotids and the quality string of the combined fragment.
    :rtype: tuple
    """
    ov_len = best_overlap["length"]
    R1_start = best_overlap["R1_start"]
    R2_start = best_overlap["R2_start"]
    R1_ov_s = R1.string[R1_start:R1_start + ov_len]
    R1_ov_q = R1.quality[R1_start:R1_start + ov_len]
    R2_ov_s = R2.string[R2_start:R2_start + ov_len]
    R2_ov_q = R2.quality[R2_start:R2_start + ov_len]
    # Lists joined once instead of repeated string concatenations
    consensus_nt = []
    consensus_qual = []
    for nt_R1, qual_R1, nt_R2, qual_R2 in zip(R1_ov_s, R1_ov_q, R2_ov_s, R2_ov_q):  # For each nt in overlap
        if nt_R1 == nt_R2 or qual_R1 >= qual_R2:
            consensus_nt.append(nt_R1)
        else:
            consensus_nt.append(nt_R2)
        consensus_qual.append(max(qual_R1, qual_R2))
    complete_seq = "".join(consensus_nt)
    complete_qual = "".join(consensus_qual)
    if R1_start > 0:  # If R1 start before R2 (insert size > read length)
        complete_seq = R1.string[0:R1_start] + complete_seq + R2.string[ov_len:]
        complete_qual = R1.quality[0:R1_start] + complete_qual + R2.quality[ov_len:]
    return complete_seq, complete_qual


def process(args, log):
    """
    Combine R1 and R2 by their overlapping segment.

    The pairs are read in parallel from args.input_R1 and args.input_R2. Each
    R2 is reverse complemented, the best overlap with R1 is searched and, when
    the overlap and the fragment length are valid, the combined read is written
    in args.output_combined. The number of pairs and of combined pairs are
    logged and the report is written if args.output_report is set.

    :param args: The namespace extract from the script arguments. The attributes used are input_R1, input_R2, output_combined, output_report, min_overlap, max_contradict_ratio, min_frag_length and max_frag_length.
    :type args: Namespace
    :param log: The logger of the script.
    :type log: logging.Logger
    """
    nb_pairs = 0
    combined = 0
    with FastqIO(args.output_combined, "w") as FH_combined:
        with FastqIO(args.input_R1) as FH_r1:
            with FastqIO(args.input_R2) as FH_r2:
                for R1 in FH_r1:
                    R2 = seqRevCom(FH_r2.nextSeq())
                    nb_pairs += 1
                    best_overlap = _findBestOverlap(
                        R1.string, R2.string, args.min_overlap,
                        args.max_contradict_ratio)
                    if best_overlap is None:  # Current pair has no valid combination
                        continue
                    # Filter fragment on length. The length comes from the
                    # retained overlap and not from the last evaluated shift.
                    curr_frag_len = best_overlap["length"]
                    if best_overlap["R1_start"] != 0:  # R1 is first
                        curr_frag_len = len(R1.string) + len(R2.string) - best_overlap["length"]
                    # The two bounds are cumulative: failing one is enough to
                    # discard the pair
                    if args.min_frag_length is not None and curr_frag_len < args.min_frag_length:
                        continue
                    if args.max_frag_length is not None and curr_frag_len > args.max_frag_length:
                        continue
                    # Write combined sequence
                    combined += 1
                    complete_seq, complete_qual = _mergeOnOverlap(R1, R2, best_overlap)
                    consensus_record = Sequence(
                        R1.id, complete_seq,
                        "Support_ratio:{}/{};R1_start:{};R2_start:{}".format(
                            best_overlap["nb_support"],
                            best_overlap["length"],
                            best_overlap["R1_start"],
                            best_overlap["R2_start"]),
                        complete_qual)
                    FH_combined.write(consensus_record)
    # Log
    log.info("Nb pair: {} ; Nb combined: {} ({}%)".format(
        nb_pairs, combined,
        (0 if nb_pairs == 0 else round(float(combined * 100) / nb_pairs, 2))))
    # The report is written after the closing of the combined file because it is read again
    if args.output_report is not None:
        writeReport(args.output_combined, args.input_R1, args.output_report)