Example #1
def get_reads_per_pos(reads_file, transcript_bed):
    """
    Given a BED file of reads and a BED file of transcript coordinates,
    make a dictionary with transcript IDs as keys and, as values, the number
    of reads per position along with the absolute coordinates of the transcript.
    :param reads_file: BED file with read coordinates
    :param transcript_bed: BED file with transcript coordinates
    :return: dictionary with numbers of reads per position
    """
    # intersect the transcripts and the reads, so you'd have an output file where
    # the transcript coordinates are followed by the overlapping read
    intermediate_file = "{0}_{1}_read_per_pos_intermediate.bed".format(
        reads_file[:-4],
        transcript_bed.split("/")[-1][:-4])
    co.intersect_bed(transcript_bed,
                     reads_file,
                     force_strand=True,
                     write_both=True,
                     no_dups=False,
                     write_zero=False,
                     output_file=intermediate_file)
    reads_per_pos = {}
    total = hk.line_count(intermediate_file)
    print("Calculating the number of reads per position in each transcript...")
    with open(intermediate_file, newline="") as file:
        file_reader = csv.reader(file, delimiter="\t")
        for pos, line in enumerate(file_reader):
            if pos % 100000 == 0:
                print("{0}/{1}".format(pos, total))
            # prefix the chromosome and the strand to the transcript name
            # because you'll need them later
            trans_name = line[3]
            trans_name = "{0}.{1}.{2}".format(line[0], line[5], trans_name)
            reads_per_pos = hk.add_key(trans_name, {"reads": {}},
                                       reads_per_pos)
            strand = line[5]
            if strand == "+":
                position = int(line[8]) - 1
            else:
                position = int(line[7])
            reads_per_pos[trans_name]["reads"] = hk.add_key(
                position, 0, reads_per_pos[trans_name]["reads"])
            reads_per_pos[trans_name]["reads"][
                position] = reads_per_pos[trans_name]["reads"][position] + 1
            reads_per_pos[trans_name] = hk.add_key(
                "coords", (int(line[1]), int(line[2])),
                reads_per_pos[trans_name])
    hk.remove_file(intermediate_file)
    return reads_per_pos
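
A minimal usage sketch, with hypothetical file names (the reads and transcript BED files are invented for illustration):

# hypothetical inputs: a BED of reads and a BED of transcript coordinates
reads_per_pos = get_reads_per_pos("reads.bed", "transcripts.bed")
for trans_name, data in reads_per_pos.items():
    start, end = data["coords"]
    print(trans_name, start, end, sum(data["reads"].values()))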
Example #2
def main():
    description = "Make a file with intron lariat read counts per exon."
    args = hk.parse_arguments(description,
                              ["intron_lariat_file", "regions_file"])
    intron_lariat_file, regions_file = args.intron_lariat_file, args.regions_file

    # the intron_lariat_file contains only those reads whose
    # 3' ends map to the last position of an intron
    snr_name = "{0}_snr.bed".format(intron_lariat_file[:-4])
    co.snr_bed(intron_lariat_file, snr_name)

    co.intersect_bed(regions_file,
                     snr_name,
                     force_strand=True,
                     hit_count=True,
                     no_dups=False,
                     output_file="{0}_il_counts.bed".format(regions_file[:-4]))
Example #3
def main():
    description = "Given a BED file of reads, filter out reads whose " \
                  "3' end maps to the last nucleotide of an intron or" \
                  "the last nucleotide of an exon."
    args = hk.parse_arguments(description, ["reads_file", "gtf", "outfile"])
    reads_file, gtf, outfile = args.reads_file, args.gtf, args.outfile

    print("Getting intron lariat positions...")

    # read in exon coordinates
    exons = rw.read_gtf(gtf, element="exon", gene=False)
    # make a BED file with the last positions of introns
    intron_lariat_bed = "{0}_intron_lariat_pos_all_exons.bed".format(reads_file[:-4])
    co.write_intron_lariat_pos_from_exons(exons, intron_lariat_bed, add_chr=True)

    # intersect the reads with intron lariat positions
    intron_lariat_intersect_file_name = "{0}_intersect_with_intron_lariat_pos_all_exons.bed".format(reads_file[:-4])
    co.intersect_bed(reads_file, intron_lariat_bed, force_strand=True, write_both=True, no_dups=False, output_file=intron_lariat_intersect_file_name)
    hk.remove_file(intron_lariat_bed)
    intron_lariat_reads_file = "{0}_intron_lariat_reads_all_exons.bed".format(reads_file[:-4])
    # check that the reads end exactly at intron lariat positions
    check_3prime_match(intron_lariat_intersect_file_name, intron_lariat_reads_file)
    hk.remove_file(intron_lariat_intersect_file_name)

    # write BED with the last positions of exons
    splice_intermediate_bed = "{0}_splice_intermediate_pos_all_exons.bed".format(reads_file[:-4])
    co.write_si_pos_from_exons(exons, splice_intermediate_bed, add_chr=True)

    print("Getting splice intermediate positions.")

    # intersect the reads with splice intermediate positions
    splice_intermediate_intersect_file_name = "{0}_intersect_with_SI_pos_all_exons.bed".format(reads_file[:-4])
    co.intersect_bed(reads_file, splice_intermediate_bed, force_strand=True, write_both=True, no_dups=False, output_file=splice_intermediate_intersect_file_name)
    hk.remove_file(splice_intermediate_bed)
    SI_reads_file = "{0}_SI_reads_all_exons.bed".format(reads_file[:-4])
    # check that the reads end exactly at the end of the exon
    check_3prime_match(splice_intermediate_intersect_file_name, SI_reads_file)
    hk.remove_file(splice_intermediate_intersect_file_name)

    print("Concatenating the two files.")

    # concatenate the IL and SI read files so you could exclude both in one go
    combined_file = "{0}_SI_and_IL_reads_all_exons.bed".format(reads_file[:-4])
    hk.run_process(["cat", SI_reads_file, intron_lariat_reads_file], file_for_output=combined_file)

    hk.remove_file(SI_reads_file)
    hk.remove_file(intron_lariat_reads_file)

    # do an exclusive intersect, requiring 1.0 overlap for both A and B, to remove the
    # putative intron lariat reads from the main reads file
    co.intersect_bed(reads_file, combined_file, overlap=1, overlap_rec=1, force_strand=True, no_dups=False, exclude=True, output_file=outfile)

    hk.remove_file(combined_file)
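
check_3prime_match is called above but not defined in this excerpt. A minimal sketch of what it plausibly does, assuming the intersect output holds a six-column read followed by the six-column annotation interval (the column layout is an assumption):

import csv

def check_3prime_match_sketch(intersect_file, out_file):
    # keep only reads whose 3' end coincides exactly with the annotated
    # single-nucleotide position (read in columns 0-5, annotation in 6-11)
    with open(intersect_file) as infile, open(out_file, "w", newline="") as outfile:
        reader = csv.reader(infile, delimiter="\t")
        writer = csv.writer(outfile, delimiter="\t")
        for line in reader:
            if line[5] == "+":
                match = line[2] == line[8]
            else:
                match = line[1] == line[7]
            if match:
                writer.writerow(line[:6])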
Example #4
def main():
    description = "Generate a NET-seq control set that would have the same distribution of -2:2 nucleotides" \
                  "as the true set."
    args = hk.parse_arguments(description, [
        "active_genes_file", "gtf", "PolII_file", "fasta", "outfile",
        "chrom_sizes"
    ])
    active_genes_file, gtf, PolII_file, fasta, outfile, chrom_sizes = args.active_genes_file, args.gtf, args.PolII_file, args.fasta, args.outfile, args.chrom_sizes

    chrom_sizes = rw.read_many_fields(chrom_sizes, delimiter="\t")
    chrom_sizes = hk.list_to_dict(chrom_sizes, 0, 1, intify=True)

    # get transcriptionally active genes and make a BED file with their coordinates
    print("Getting the coordinates of transcriptionally active genes...")
    trans_active_genes = rw.read_many_fields(active_genes_file, "\t")[1:]
    trans_active_genes = [i[3] for i in trans_active_genes]
    transcripts_file = "{0}_transcripts_all.bed".format(gtf[:-4])
    co.get_transcripts(gtf, transcripts_file, add_chr=True)

    transcripts_dict = {}
    # this will be used for getting the k-mers in the transcripts
    filtered_transcripts_file_plus3 = "{0}_trans_act_only_plus3.bed".format(
        transcripts_file[:-4])
    # this will be used for filtering the reads
    filtered_transcripts_file = "{0}_trans_act_only.bed".format(
        transcripts_file[:-4])
    with open(filtered_transcripts_file,
              "w") as ft_file, open(transcripts_file) as t_file, open(
                  filtered_transcripts_file_plus3, "w") as ft_file2:
        reader = csv.reader(t_file, delimiter="\t")
        writer = csv.writer(ft_file, delimiter="\t")
        writer2 = csv.writer(ft_file2, delimiter="\t")
        for line in reader:
            if line[3] in trans_active_genes:
                # if line[0][0] not in ["G", "K"]:
                #     line[0] = "chr{0}".format(line[0])
                writer.writerow(line)
                # this is because if a read falls at the first position, you will
                # need to know the preceding three bases. Same if it falls at the
                # last position.
                line[1] = str(int(line[1]) - 3)
                line[2] = str(int(line[2]) + 3)
                writer2.writerow(line)
                transcripts_dict[line[3]] = line

    print("Filtering reads to the transcripts...")
    # filter reads to only ones that overlap these transcripts
    transcripts_PolII = "{0}_transcripts.bed".format(PolII_file[:-4])
    co.intersect_bed(PolII_file,
                     filtered_transcripts_file,
                     force_strand=True,
                     output_file=transcripts_PolII)

    print("Extracting FASTA from the transcript coordinates...")
    # the genome FASTA is formatted as N rather than chrN
    filtered_transcripts_file_no_chr = "{0}_trans_act_only_plus3_no_chr.bed".format(
        transcripts_file[:-4])
    hk.run_process(["sed", "s/^chr//", filtered_transcripts_file_plus2],
                   file_for_output=filtered_transcripts_file_no_chr)
    filtered_transcripts_fasta_no_chr = "{0}_trans_act_only_plus3.fasta".format(
        transcripts_file[:-4])
    hk.run_process([
        "bedtools", "getfasta", "-fi", fasta, "-bed",
        filtered_transcripts_file_no_chr, "-fo",
        filtered_transcripts_fasta_no_chr, "-s", "-name"
    ])

    print("Mapping kmers to transcript positions...")
    kmer_dict = map_kmers_to_positions(filtered_transcripts_fasta_no_chr,
                                       k=6,
                                       focal_pos=3)

    print("Extracting the starting dinucleotide for each read...")
    starting_dints_PolII = "{0}_transcripts_starting_6mers.bed".format(
        PolII_file[:-4])
    starting_dints_PolII_fasta = "{0}_transcripts_starting_6mers.fasta".format(
        PolII_file[:-4])
    co.extend_intervals(transcripts_PolII,
                        starting_dints_PolII,
                        3,
                        3,
                        remove_chr=True)
    hk.run_process([
        "bedtools", "getfasta", "-fi", fasta, "-bed", starting_dints_PolII,
        "-fo", starting_dints_PolII_fasta, "-s"
    ])

    print("Picking random control positions...")
    pick_random_positions(transcripts_PolII,
                          starting_dints_PolII_fasta,
                          outfile,
                          kmer_dict,
                          transcripts_dict,
                          chrom_sizes=chrom_sizes)

    print("Making single nucleotide resolution file...")
    snr_file = "{0}_snr.bed".format(outfile[:-4])
    co.snr_bed(outfile, snr_file)

    print(
        "Removing reads that overlap potential splice intermediate positions..."
    )
    no_si_snr_file = "{0}_snr_no_si.bed".format(outfile[:-4])
    co.intersect_bed(snr_file,
                     "data/Genomes/GTFs/dm6/dmel-all-r6.18_exon_ends_chr.gtf",
                     force_strand=True,
                     exclude=True,
                     no_dups=False,
                     output_file=no_si_snr_file)
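
map_kmers_to_positions is called above but not shown. Given k=6 and focal_pos=3, it presumably maps each 6-mer to the transcript positions at which its focal base sits; here is a self-contained sketch (the FASTA parsing and the position arithmetic, which undoes the 3-nt extension applied above, are assumptions):

def map_kmers_to_positions_sketch(fasta_file, k=6, focal_pos=3):
    # map each k-mer to (transcript, position) pairs; because the transcript
    # BED was extended by 3 nt on each side, i + focal_pos - 3 is a position
    # in the original, un-extended transcript
    records = []
    name, seq = None, []
    with open(fasta_file) as f:
        for line in f:
            line = line.rstrip("\n")
            if line.startswith(">"):
                if name is not None:
                    records.append((name, "".join(seq)))
                name, seq = line[1:], []
            else:
                seq.append(line)
    if name is not None:
        records.append((name, "".join(seq)))
    kmer_dict = {}
    for name, sequence in records:
        for i in range(len(sequence) - k + 1):
            kmer = sequence[i:i + k].upper()
            kmer_dict.setdefault(kmer, []).append((name, i + focal_pos - 3))
    return kmer_dict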
Example #5
def filter_peaks(in_peak_bed,
                 read_bed,
                 read_count_file,
                 out_peak_bed,
                 min_reads_per_peak,
                 min_peak_length,
                 stats_file,
                 no_PCR_filter=False):
    """
    Filter a BED file of peaks by removing peaks that overlap fewer than
    _min_reads_per_peak_ reads or are shorter than _min_peak_length_.
    Also write some stats in _stats_file_.
    :param in_peak_bed: BED file with peak coordinates
    :param read_bed: BED file with read coordinates
    :param read_count_file: text file with one row per transcript, containing all the significant positions
    along with their read counts
    :param out_peak_bed: file for filtered BED
    :param min_reads_per_peak: minimum number of reads per peak
    :param min_peak_length: minimum length of peak
    :param stats_file: file for the output stats
    :param no_PCR_filter: if True, no filtering of potential PCR duplicates will be performed
    :return: None
    """
    PCR_threshold = 0.9
    if no_PCR_filter:
        PCR_threshold = 1
    # parse read counts per significant position
    read_counts = {}
    with open(read_count_file) as file:
        for line in file:
            line = line.rstrip("\n").split("\t")
            curr_counts = [i.split(":") for i in line[1:]]
            curr_counts = {int(i[0]): int(i[1]) for i in curr_counts}
            read_counts[line[0]] = curr_counts
    # intersect the peaks with the original reads to count the
    # number of reads per peak
    intersect_file = "{0}_{1}_intersect.bed".format(
        in_peak_bed[:-4],
        read_bed.split("/")[-1][:-4])
    co.intersect_bed(in_peak_bed,
                     read_bed,
                     force_strand=True,
                     write_both=True,
                     no_dups=False,
                     output_file=intersect_file,
                     hit_count=True)
    lengths = []
    counts = []
    with open(intersect_file) as in_file, \
            open(out_peak_bed, "w") as out_file, \
            open(stats_file, "w") as stats_f:
        stats_f.write("transcript\tlength\tread_count\n")
        for line in in_file:
            line = line.rstrip("\n").split("\t")
            read_count = int(line[6])
            if read_count >= min_reads_per_peak:
                end = int(line[2])
                start = int(line[1])
                length = end - start
                if length >= min_peak_length:
                    # check that the peak doesn't merge regions from more than one transcript
                    trans = line[3]
                    if "," not in trans:
                        # check that the most enriched position doesn't account for more than 90%
                        # of the density
                        curr_read_counts = [
                            read_counts[trans].get(i, 0)
                            for i in range(start, end)
                        ]
                        maximum = np.max(curr_read_counts)
                        all_pos = np.sum(curr_read_counts)
                        # guard against an all-zero window to avoid division by zero
                        if all_pos > 0 and maximum / all_pos <= PCR_threshold:
                            # so the file could be visualized in a genome browser
                            line[4] = "100"
                            out_file.write("{0}\n".format("\t".join(line)))
                            stats_file.write("{0}\t{1}\t{2}\n".format(
                                trans, length, read_count))
                            lengths.append(length)
                            counts.append(read_count)
        print("Found a total of {0} peaks.".format(len(lengths)))
        print("The median peak length is {0}.".format(np.median(lengths)))
        print("The median peak read count is {0}.".format(np.median(counts)))
Example #6
def main():
    description = "Prepare a BED file with the TES coordinates of transcriptionally" \
                  "active genes and make a metagene of reads within this region."

    args = hk.parse_arguments(description, ["trans_act_file", "gtf", "start_coord", "end_coord", "outname", "reads_file"], ints = [2, 3])
    trans_act_file, gtf, start_coord, end_coord, outname, reads_file = args.trans_act_file, args.gtf, args.start_coord, args.end_coord, args.outname, args.reads_file

    trans_act_genes = []
    with open(trans_act_file) as f:
        reader = csv.reader(f, delimiter="\t")
        for line in reader:
            trans_act_genes.append(line[3])

    exons = rw.read_gtf(gtf, "exon")
    CDSs = rw.read_gtf(gtf, "CDS")

    exons = {i: exons[i] for i in exons if i in trans_act_genes}
    # protein-coding only
    exons = {i: exons[i] for i in exons if i in CDSs}

    ds_500 = "{0}_ds_500.bed".format(outname[:-4])
    with open(outname, "w") as out, open(ds_500, "w") as out_ds:
        writer = csv.writer(out, delimiter="\t")
        writer_ds = csv.writer(out_ds, delimiter="\t")
        for trans in exons:
            strand = exons[trans][0][6]
            chrom = "chr{0}".format(exons[trans][0][0])
            if strand == "+":
                TES = exons[trans][-1][4]
                new_start = TES - start_coord
                new_end = TES + end_coord
                new_start_ds = TES
                new_end_ds = TES + 500
            else:
                TES = exons[trans][-1][3]
                # mirror the window on the - strand: end_coord extends downstream
                # (towards lower coordinates) and start_coord upstream; the -1
                # converts the 1-based GTF start to a 0-based BED coordinate
                new_start = TES - end_coord - 1
                new_end = TES + start_coord - 1
                new_start_ds = TES - 500 - 1
                new_end_ds = TES - 1
            writer.writerow([chrom, new_start, new_end, trans, "0", strand])
            chrom = chrom.lstrip("chr")
            writer_ds.writerow([chrom, new_start_ds, new_end_ds, trans, "0", strand])

    intersect = "{0}_ds500_intersect.bed".format(outname[:-4])
    transcripts_file = "{0}_transcripts.bed".format(gtf[:-4])
    co.intersect_bed(ds_500, transcripts_file, write_both = True, force_strand=False, no_dups = False, output_file=intersect)

    co.get_transcripts(gtf, transcripts_file, with_detail=True)
    mapping = co.transcript_mapping(transcripts_file)

    to_exclude = []
    with open(intersect) as int_file:
        reader = csv.reader(int_file, delimiter="\t")
        for line in reader:
            curr_gene = mapping[line[3]]
            other_gene = mapping[line[9]]
            if curr_gene != other_gene:
                to_exclude.append(line[3])

    filtered_out_name = "{0}_filt.txt".format(outname[:-4])
    with open(filtered_out_name, "w") as filt_f:
        for name in to_exclude:
            filt_f.write("{0}\n".format(name))

    final_out_name = "{0}_distrib.bed".format(outname[:-4])

    distances = co.peak_pos_in_exon(outname, reads_file, from_end=True, reads_mode=True)[0]
    write_dist_mat(distances, start_coord + end_coord, final_out_name, None, "{0}_names.txt".format(final_out_name[:-4]), None)
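
The strand-dependent window arithmetic above is easy to get wrong, so here is the same logic as a small worked sketch, assuming half-open BED coordinates and a 1-based GTF start (this formalizes the corrected code above, not a function from the project):

def tes_window(TES, start_coord, end_coord, strand):
    # mirror the window on the - strand, where downstream means lower
    # coordinates; the -1 converts the 1-based GTF start to 0-based BED
    if strand == "+":
        return TES - start_coord, TES + end_coord
    return TES - end_coord - 1, TES + start_coord - 1

# e.g. TES = 1000, 200 nt upstream, 300 nt downstream:
assert tes_window(1000, 200, 300, "+") == (800, 1300)
assert tes_window(1000, 200, 300, "-") == (699, 1199)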
Example #7
def main():
    description = "Record splicing distance."
    args = hk.parse_arguments(description, ["input_file", "gtf", "output_folder", "trans_active_file", "window_size", "intron_window_size", "outsuffix", "leave_terminal"], ints = [4, 5], flags = [7])
    input_file, gtf, output_folder, trans_active_file, window_size, intron_window_size, outsuffix, leave_terminal = args.input_file, args.gtf, args.output_folder, args.trans_active_file, args.window_size, args.intron_window_size, args.outsuffix, args.leave_terminal

    if outsuffix == "None":
        outsuffix = ""

    bare_input_path = input_file.split("/")[-1]
    bed = "{0}.bed".format(input_file[:-4])
    # hk.convert2bed(input_file, bed)

    # get descriptive stats of the reads
    length_file = "{0}/{1}_read_lengths.txt".format(output_folder, bare_input_path[:-4])
    write_read_lengths(bed, length_file)

    # read in CDS coordinates
    exons = rw.read_gtf(gtf, "CDS", gene=False)
    # only leave transcriptionally active genes (one isoform per gene)
    trans_active_genes = rw.read_many_fields(trans_active_file, "\t")[1:]
    # pull out the column with transcript IDs
    trans_active_genes = [i[3] for i in trans_active_genes]
    exons = {i: exons[i] for i in exons if i in trans_active_genes}
    terminal_suff = "_with_terminal"
    if not leave_terminal:
        # remove last exons
        exons = {i: exons[i][:-1] for i in exons}
        terminal_suff = ""
    # prepare exon-exon junctions
    exon_junctions_file = "{0}_exon_junctions{1}{2}.bed".format(gtf[:-4], outsuffix, terminal_suff)
    all_junctions = co.extract_3ss(exons, exon_junctions_file)

    out_bed = "{0}/{1}_first_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed, exons, window_size, add_chr=True)
    out_bed_end = "{0}/{1}_last_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed_end, exons, window_size, add_chr=True, from_end=True)
    intron_bed = "{0}/{1}_first_{2}_intronic_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], intron_window_size, outsuffix, terminal_suff)
    write_intron_starts(all_junctions, intron_bed, exons, intron_window_size, add_chr=True)
    out_bed = "{0}/{1}_first_centred_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed, exons, window_size, add_chr=True, centre=True)
    out_bed_end = "{0}/{1}_last_centred_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed_end, exons, window_size, add_chr=True, from_end=True, centre=True)
    out_bed_si = "{0}/{1}_si_pos{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff)
    write_si_pos(all_junctions, out_bed_si, exons, add_chr=True)
    out_bed_si_current = "{0}/{1}_si_pos_current{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff)
    write_si_pos(all_junctions, out_bed_si_current, exons, add_chr=True, curr_exon=True)
    # check which junctions are associated with a splicing intermediate read
    snr_bed = "{0}_snr.bed".format(bed[:-4])
    co.snr_bed(bed, snr_bed)
    si_counts_bed = "{0}/{1}_si_counts{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff)
    co.intersect_bed(out_bed_si, snr_bed, force_strand=True, hit_count=True, no_dups=False, output_file=si_counts_bed)
    si_counts_current_bed = "{0}/{1}_si_counts_current{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff)
    co.intersect_bed(out_bed_si_current, snr_bed, force_strand=True, hit_count=True, no_dups=False, output_file=si_counts_current_bed)

    # filter out reads that don't overlap exon-exon junctions
    exon_junction_bed = "{0}_exon_junctions{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff)
    co.intersect_bed(bed, exon_junctions_file, write_both=True,
                     output_file=exon_junction_bed,
                     force_strand=True, no_dups=False)

    spliced_bed = "{0}_spliced{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff)
    unspliced_bed = "{0}_unspliced{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff)
    sr_distances = {}
    ur_distances = {}
    found_count = 0
    file_size = hk.line_count(exon_junction_bed)

    # will store all the intron names for which there are
    # either spliced or unspliced reads
    valid_junctions = []
    with open(exon_junction_bed) as file, open(spliced_bed, "w") as sfile, open(unspliced_bed, "w") as ufile:
        for pos, line in enumerate(file):

            if pos % 100000 == 0:
                print("{0}/{1}".format(pos, file_size))
                print("Found {0} spliced reads.".format(found_count))
                print("\n")

            line = line.split("\t")

            # reads that end at the last nucleotide of an exon
            intermediate_read = NGS.check_intermediate_read(line, exons)
            intron_name = line[20]

            if not intermediate_read:

                # check that it ends within the exon just downstream of
                # the 3' ss that is being analyzed

                in_dwns_exon = NGS.check_position_in_exon(line, exons)

                if in_dwns_exon:

                    # 'S' (spliced), 'U' (unspliced) or None (= can't analyze)
                    read_type = NGS.analyze_cigar(line, overhang=5)

                    if read_type:
                        if intron_name not in valid_junctions:
                            valid_junctions.append(intron_name)
                        splice_dist = NGS.get_splice_dist(line)
                        if read_type == "S":
                            sfile.write("\t".join([str(i) for i in line]))
                            found_count = found_count + 1
                            sr_distances = update_dist_dict(intron_name, sr_distances, splice_dist)
                        else:
                            ufile.write("\t".join([str(i) for i in line]))
                            ur_distances = update_dist_dict(intron_name, ur_distances, splice_dist)

    print("Proportion of spliced reads: {0}.".format(found_count/(pos + 1)))

    # for each valid junction, calculate the length of the downstream exonic
    # sequence, so that intronic sequence won't be counted in the distance matrix
    lengths_dict = co.get_lengths(exons, valid_junctions)

    write_dist_mat(sr_distances, window_size,
                   "{0}/{1}_spliced_read_distances_{2}{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff),
                   lengths_dict,
                   "{0}/{1}_spliced_read_{2}_intron_names{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff),
                   "{0}/{1}_spliced_read_first_spliced{2}{3}.txt".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff))

    write_dist_mat(ur_distances, window_size,
                   "{0}/{1}_unspliced_read_distances_{2}{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff),
                   lengths_dict,
                   "{0}/{1}_unspliced_read_{2}_intron_names{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff),
                   "{0}/{1}_unspliced_read_first_unspliced{2}{3}.txt".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff))