Exemplo n.º 1
0
def main():
    """Convert RepeatMasker .out records to a gzipped GFF file.

    Reads the tabular RepeatMasker output from args.input (or stdin),
    skips the three header lines plus the blank line, and writes one GFF
    feature per repeat hit. Relies on module-level `args`, `metaopen`
    and `metaoutput`.
    """
    if args.input == stdin:
        outprefix = "repeatmasker.out"
    elif args.output:
        outprefix = args.output
    else:
        # strip the last extension from the input file name
        outprefix = '.'.join(args.input.split('.')[:-1])
    print("OUT to GFF converter...")
    # BUG FIX: the original never closed the output handle; use `with`
    # so the gzip stream is flushed and closed deterministically.
    with metaopen(metaoutput(outprefix, ".gff.gz"), "wt") as outfile, \
            metaopen(args.input, "r", buffering=args.buffering) as data:
        count = 0
        for l in data:
            # RepeatMasker .out starts with a 3-line header + blank line
            if count < 4:
                count += 1
                continue
            line = l.strip().split()
            # column 8 is "+" or "C" (complement); map anything non-"+" to "-"
            strand = "+" if line[8] == "+" else "-"
            class_family_info = line[10].split("/")
            if len(class_family_info) == 1:
                class_family_info.append(".")  # no family part present
            additional_info = "Class=%s;Family=%s;Matching_repeat=%s;SW_score=%s;Perc_div=%s;Perc_del=%s;Pers_ins=%s"\
                               % (class_family_info[0], class_family_info[1],
                                  line[9], line[0], line[1], line[2], line[3])
            gff_line = "%s\tRepeatMasker\trepeat\t%s\t%s\t.\t%s\t.\t%s\n" % (
                line[4], line[5], line[6], strand, additional_info)
            outfile.write(gff_line)
Exemplo n.º 2
0
def main():
    """Select, for every gene, the longest protein listed in the tabular
    file and write the matching FASTA records from args.input to
    <output>.faa.

    Assumes the tabular file is grouped by gene id (column 5); column 9
    holds the length and column 8 the quoted protein id. Relies on
    module-level `args`, `metaopen`, `metaoutput` and `Fasta_opener`.
    """
    with metaopen(args.tabular_file, 'rt') as tab:
        rows = tab.readlines()[1:]  # drop the header row
    faa_dict = Fasta_opener(args.input).parse_sequences_without_join()

    with metaopen(metaoutput(args.output, ".faa"), "wt") as outfile:

        def write_best(protein_id):
            # emit every FASTA record whose header mentions the protein id
            for header, seq in faa_dict.items():
                if protein_id in header:
                    outfile.write(header + seq)

        geneid_prev = None
        best_len, protein = 0, None
        for ln in rows:
            line = ln.strip().split(",")
            geneid, length = line[5], int(line[9])
            if geneid_prev is None or geneid == geneid_prev:
                # first row, or same gene: keep the longest entry seen so far
                if geneid_prev is None or length >= best_len:
                    best_len, protein = length, line[8][1:-1]  # strip quotes
            else:
                write_best(protein)
                # BUG FIX: the original reset the tracked length to 0 on a
                # gene switch, so every later row of the new gene overwrote
                # the candidate regardless of length; seed with this row's
                # length instead.
                best_len, protein = length, line[8][1:-1]
            geneid_prev = geneid
        # BUG FIX: the original never flushed the final gene group
        if protein is not None:
            write_best(protein)
Exemplo n.º 3
0
def main():
    """Convert WindowMasker interval output to a gzipped GFF file.

    Header lines (">") carry the sequence name in the second "|"-delimited
    field; interval lines are "start - stop" with a 0-based start, which is
    converted to GFF's 1-based coordinates. Relies on module-level `args`,
    `metaopen` and `metaoutput`.
    """
    if args.input == stdin:
        outprefix = "windowmasker.asnb"
    elif args.output:
        outprefix = args.output
    else:
        # strip the last extension from the input file name
        outprefix = '.'.join(args.input.split('.')[:-1])
    print("INTERVAL to GFF converter...")
    # BUG FIX: the original never closed the output handle; use `with`
    # so the gzip stream is flushed and closed deterministically.
    with metaopen(metaoutput(outprefix, ".gff.gz"), "wt") as outfile, \
            metaopen(args.input, "r", buffering=args.buffering) as data:
        count = 0
        seq_name = None  # set by the first ">" header line
        for l in data:
            if l.startswith(">"):
                # assumes headers like ">lcl|name ..." — TODO confirm format
                seq_name = l.strip().split("|")[1]
            else:
                line = l.strip().split(" ")
                # interval start is 0-based; GFF is 1-based inclusive
                start, stop = str(int(line[0]) + 1), line[2]
                gff_line = [
                    seq_name, 'WindowMasker', 'repeat', start, stop, '.', '.',
                    '.', 'ID=' + str(count) + '\n'
                ]  # INFO is just information from a FASTA file "+ ';INFO=' + info + '\n'"
                count += 1
                outfile.write('\t'.join(gff_line))
Exemplo n.º 4
0
def main():
    """Compute per-window median coverage and write tab-separated rows.

    Reads whitespace-separated lines from args.input (column 1 = coverage
    value), groups them into frames of args.frame_size lines, and writes
    "MT\\tstart\\tstop\\tmedian" per frame, plus a final partial frame.
    Relies on module-level `args`, `metaopen`, `metaoutput`, `Counter`
    and `CoveragesMetrics`.
    """

    def _flush_frame(counts, start, stop, out):
        # one output row per frame; the median comes from CoveragesMetrics
        coverage = CoveragesMetrics(counts).median_value()
        out.write("\t".join(["MT", str(start), str(stop), str(coverage)]) +
                  "\n")

    frame_coverages_amounts_dict = Counter()
    line_counter = 0
    frame_line_counter = 0
    # BUG FIX: the original never closed the output handle; use `with`
    # so the file is flushed and closed deterministically.
    with metaopen(metaoutput(args.output, ".csv"), "wt") as outfile:
        for ln in args.input:
            line = ln.strip().split()
            line_counter += 1
            frame_line_counter += 1
            frame_coverages_amounts_dict[int(line[1])] += 1
            if frame_line_counter == args.frame_size:
                _flush_frame(frame_coverages_amounts_dict,
                             line_counter - args.frame_size + 1,
                             line_counter, outfile)
                frame_coverages_amounts_dict.clear()
                frame_line_counter = 0
        # trailing partial frame, if any lines remain
        if frame_coverages_amounts_dict:
            _flush_frame(
                frame_coverages_amounts_dict,
                line_counter - sum(frame_coverages_amounts_dict.values()) + 1,
                line_counter, outfile)
Exemplo n.º 5
0
def main():
    # Write a gzipped BED mask of windows whose coverage is an outlier
    # (above 2.5x or below 0.5x args.whole_median); adjacent outlier
    # windows on the same scaffold are merged into one interval.
    outfile = metaopen(metaoutput(args.output, ".mask.bed.gz"), "wt")
    # First loop: seed scaffold_name/start/stop with the first outlier row.
    # NOTE(review): mixing `for line in args.input` with
    # args.input.readline() discards `line` and reads the NEXT line each
    # iteration, so every other input row is skipped here — looks
    # unintentional; confirm against the expected input.
    # NOTE(review): if no outlier row exists, scaffold_name/start/stop stay
    # unbound and the final write below raises NameError — confirm inputs.
    for line in args.input:
        scaffold_name, start, stop, coverage_value = args.input.readline(
        ).strip().split()
        coverage_value = int(coverage_value)
        if (coverage_value >
            (2.5 * args.whole_median)) or coverage_value < (0.5 *
                                                            args.whole_median):
            break

    # Second loop: scan the remaining rows, merging contiguous outliers.
    for i in args.input:
        line = i.strip().split()
        line[3] = int(line[3])
        if (line[3] >
            (2.5 * args.whole_median)) or (line[3] <
                                           (0.5 * args.whole_median)):
            if line[0] == scaffold_name:
                # same scaffold: extend the open interval if contiguous
                # (the previous stop equals this window's start)
                if stop == line[1]:
                    stop = line[2]
                    continue
                else:
                    # gap on the same scaffold: flush and start a new interval
                    outfile.write("\t".join([scaffold_name, start, stop]) +
                                  "\n")
                    start = line[1]
                    stop = line[2]
            else:
                # new scaffold: flush the open interval and start another
                outfile.write("\t".join([scaffold_name, start, stop]) + "\n")
                scaffold_name, start, stop = line[:3]
    # flush the last open interval
    # NOTE(review): outfile is never closed — consider outfile.close()
    outfile.write("\t".join([scaffold_name, start, stop]) + "\n")
Exemplo n.º 6
0
def main():
    """Split whitespace-joined EMA bin records into three gzipped streams:
    a barcode list plus forward and reverse FASTQ files, preserving the
    original record order. Relies on module-level `args` and `metaopen`.
    """
    barcode_out = metaopen(args.prefix + "_barcodes.txt.gz", 'wt')
    forward_out = metaopen(args.prefix + "_ema-bin-all_1.fastq.gz", 'wt')
    reverse_out = metaopen(args.prefix + "_ema-bin-all_2.fastq.gz", 'wt')

    if args.input == '-':
        source = stdin
    else:
        # gzipped inputs need text mode "rt"; plain files use "r"
        source = metaopen(args.input, "rt" if args.compressed else "r",
                          args.buffering)

    for record in source:
        fields = record.split()
        read_name = fields[1]
        barcode_out.write(fields[0] + '\n')
        forward_out.write(
            '\n'.join((read_name, fields[2], '+', fields[3])) + '\n')
        reverse_out.write(
            '\n'.join((read_name, fields[4], '+', fields[5])) + '\n')

    for handle in (barcode_out, forward_out, reverse_out):
        handle.close()
    if args.input != '-':
        source.close()
Exemplo n.º 7
0
def main():
    """Convert Tandem Repeats Finder .dat output to a gzipped GFF file.

    "Sequence:" lines set the current sequence name; data lines (starting
    with a digit) carry the 15 TRF fields, which are emitted as GFF
    attributes. Relies on module-level `args`, `metaopen` and `metaoutput`.
    """
    if args.input == stdin:
        outprefix = "trf.dat"
    elif args.output:
        outprefix = args.output
    else:
        # strip the last extension from the input file name
        outprefix = '.'.join(args.input.split('.')[:-1])
    print("DAT to GFF converter...")
    # BUG FIX: the original never closed the output handle; use `with`
    # so the gzip stream is flushed and closed deterministically.
    with metaopen(metaoutput(outprefix, ".gff.gz"), "wt") as outfile, \
            metaopen(args.input, "r", buffering=args.buffering) as data:
        count = 0
        seq_name = None  # set by "Sequence:" header lines
        for l in data:
            line = l.strip().split(" ")
            if l.startswith('Sequence:'):
                # information about authors and parameters is not processed
                seq_name = line[1]
            elif l[0].isdigit():
                (start, stop, period, copies, consensus_size, perc_match,
                 perc_indels, align_score, perc_A, perc_C, perc_G, perc_T,
                 entropy, cons_seq, repeat_seq) = line
                # BUG FIX: the original concatenated 'perc_A'..'perc_T'
                # without the ';' separator and '=' sign, producing
                # malformed attributes like "...align_score=50perc_A28...".
                attributes = ';'.join([
                    'ID=' + str(count),
                    'period=' + period,
                    'copies=' + copies,
                    'consensus_size=' + consensus_size,
                    'perc_match=' + perc_match,
                    'perc_indels=' + perc_indels,
                    'align_score=' + align_score,
                    'perc_A=' + perc_A,
                    'perc_C=' + perc_C,
                    'perc_G=' + perc_G,
                    'perc_T=' + perc_T,
                    'entropy=' + entropy,
                    'cons_seq=' + cons_seq,
                    'repeat_seq=' + repeat_seq,
                ])
                gff_line = [
                    seq_name, 'TRF', 'repeat', start, stop, '.', '.', '.',
                    attributes + '\n'
                ]
                count += 1
                outfile.write('\t'.join(gff_line))
Exemplo n.º 8
0
 def parse_sequences_without_join(self, buffering=None) -> dict:
     """Parse the FASTA file at self.path into an OrderedDict mapping each
     raw header line (including '>' and its newline) to that record's
     sequence lines concatenated verbatim (internal newlines preserved).

     Arguments:
       buffering: buffer text value passed through to metaopen
     """
     print("parse_sequences started")
     records = OrderedDict()
     current = None
     handle = metaopen(self.path, 'rt', buffering)
     for raw in handle:
         if raw.startswith('>'):
             current = raw
             records[current] = []
         else:
             records[current].append(raw)
     handle.close()
     # join each record's line chunks into one string, preserving order
     return OrderedDict(
         (head, ''.join(chunks)) for head, chunks in records.items())
Exemplo n.º 9
0
 def parse_sequences(self, buffering=None) -> dict:
     """
     Parse the FASTA file at self.path into an OrderedDict mapping the
     first space-delimited token of each header (without the leading '>')
     to the joined sequence string; also records each sequence's length
     in self.lengths.

     Arguments:
       buffering: buffer text value passed through to metaopen
     """
     print("parse_sequences started")
     sequences = OrderedDict()
     self.lengths = {}
     current = None
     handle = metaopen(self.path, 'rt', buffering)
     for raw in handle:
         if raw.startswith('>'):
             # key is the header up to the first space, '>' stripped
             current = raw[1:].split(' ')[0]
             sequences[current] = []
         else:
             sequences[current].append(raw.rstrip())
     handle.close()
     # collapse the per-record chunks and record lengths
     for name, chunks in list(sequences.items()):
         joined = ''.join(chunks)
         sequences[name] = joined
         self.lengths[name] = len(joined)
     return sequences
Exemplo n.º 10
0
    
    # (fragment) Dispatch the statistics requested on the command line;
    # `metrics` and `args` are defined earlier in the enclosing scope,
    # which lies outside this excerpt.
    if args.whole_genome_stats:
        metrics.get_whole_genome_stats()
    if args.scaffolds_stats:
        metrics.get_scaffolds_stats()
    if args.nonoverlapping_windows_stats:
        metrics.get_nonoverlapping_windows_stats(args.frame_size)
    if args.universal_windows_stats: # in development for mosdepth
        metrics.get_universal_windows_stats(args.frame_size, args.frame_shift)


if __name__ == "__main__":
    # CLI for coverage statistics over genomecov/mosdepth tables.
    # NOTE(review): this excerpt is truncated — parse_args() and the call
    # into main() are expected to follow below this view.
    parser = argparse.ArgumentParser(description="script for calculating median, average, maximum and minimum coverage in genomecov.tab.gz or mosdepth.bed.gz. Report to files.csv")

    group_required = parser.add_argument_group('Required options')
    # the input file is opened eagerly via metaopen; defaults to stdin
    group_required.add_argument('-i', '--input', type=lambda s: metaopen(s, "rt"),
                                help="input file.bam.gz (don`t use for STDIN)", default=stdin)

    group_additional = parser.add_argument_group('Additional options')
    group_additional.add_argument('-o', '--output', metavar='PATH', type=str, default=False,
                                  help='output file prefix without frame size')
    group_additional.add_argument('-t', '--tool-name', type=str, default="mosdepth",
                                  help="tool name parameter (you can use 'mosdepth' or 'genomecov')") 
    group_additional.add_argument('-f', '--frame-size', type=int, 
                                  help="<f> bp windows size (for windows statistics)", default=1000000)
    group_additional.add_argument('--frame-shift', type=int,
                                  help="window shift step (for windows statistics)", default=1000000)
    group_additional.add_argument('-g', '--whole-genome-stats', action="store_true", default=False,
                                  help="to calculate statistics for whole genome only")
    group_additional.add_argument('-s', '--scaffolds-stats', action="store_true", default=False,
                                  help="to calculate statistics for scaffolds only")
Exemplo n.º 11
0
        # (fragment) Write the merged PAR coordinates as BED lines; the
        # enclosing function and `coordinates_merge_by_median` are defined
        # above this excerpt.
        outfile.writelines(
            coordinates_list_to_BED(args.scaffold_name,
                                    coordinates_merge_by_median))


if __name__ == "__main__":
    # CLI for the PAR-coordinate detection script.
    # NOTE(review): this excerpt is truncated — parse_args() and the call
    # into main() are expected to follow below this view.
    parser = argparse.ArgumentParser(
        description=
        "Script for determining the coordinates of the PAR. Output of coordinates to BED file"
    )

    group_required = parser.add_argument_group('Required options')
    # the input file is opened eagerly via metaopen; defaults to stdin
    group_required.add_argument(
        '-i',
        '--input',
        type=lambda s: metaopen(s, "rt"),
        help="input coverage_statistics_output.csv (don`t use for STDIN)",
        default=stdin)

    group_additional = parser.add_argument_group('Additional options')
    group_additional.add_argument('-o',
                                  '--output',
                                  metavar='PATH',
                                  type=str,
                                  default=False,
                                  help='output file prefix')
    group_additional.add_argument('-f',
                                  '--window-size',
                                  type=int,
                                  help="the window size used in your data",
                                  default=10000)