def pick_longest_rep(fasta_filename, gff_filename, group_filename, output_filename):
    """
    For each group, select the longest member record as the representative.

    One record is written per (non-blank) line of the group file, with the
    header "<pb_id>|<chrom>:<start>-<end>(<strand>)|<representative_id>" and
    the representative's sequence.

    :param fasta_filename: FASTA with the member sequences; read through
        LazyFastaReader and looked up by member id
    :param gff_filename: tab-delimited GFF; only rows whose third column is
        "transcript" are used, and their last column must look like
        gene_id "PB.1"; transcript_id "PB.1.1";
    :param group_filename: one "<pb_id><TAB><member>,<member>,..." per line
    :param output_filename: FASTA file the representatives are written to

    Raises KeyError if a group's pb_id has no "transcript" row in the GFF.
    """
    fastad = LazyFastaReader(fasta_filename)
    fout = FastaWriter(output_filename)

    try:
        coords = {}
        with open(gff_filename) as gff:
            for line in gff:
                # ex: chr1    PacBio  transcript      27567   29336   .       -       .       gene_id "PB.1"; transcript_id "PB.1.1";
                line = line.strip()
                # Blank lines and "#" header/comment lines have no columns to index.
                if not line or line.startswith("#"):
                    continue
                raw = line.split("\t")
                if raw[2] == "transcript":
                    # Second attribute is `transcript_id "PB.1.1";` -- slice off
                    # the opening quote and the closing `";`.
                    tid = raw[-1].split("; ")[1].split()[1][1:-2]
                    coords[tid] = "{0}:{1}-{2}({3})".format(raw[0], raw[3], raw[4], raw[6])

        with open(group_filename) as groups:
            for line in groups:
                line = line.strip()
                if not line:
                    continue
                pb_id, members = line.split("\t")
                best_id = None
                best_seq = None
                max_len = 0
                for x in members.split(","):
                    # Fetch the record once per member instead of once per use.
                    seq = fastad[x].sequence
                    # ">=" means the last member wins when lengths tie.
                    if len(seq) >= max_len:
                        best_id = x
                        best_seq = seq
                        max_len = len(seq)
                fout.writeRecord("{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id), best_seq)
    finally:
        # Close the writer even when parsing fails part-way through.
        fout.close()
Пример #2
0
def pick_longest_rep(fasta_filename, gff_filename, group_filename,
                     output_filename):
    """
    For each group, select the longest member record as the representative.

    One record is written per (non-blank) line of the group file, with the
    header "<pb_id>|<chrom>:<start>-<end>(<strand>)|<representative_id>" and
    the representative's sequence.

    :param fasta_filename: FASTA with the member sequences; read through
        LazyFastaReader and looked up by member id
    :param gff_filename: tab-delimited GFF; only rows whose third column is
        'transcript' are used, and their last column must look like
        gene_id "PB.1"; transcript_id "PB.1.1";
    :param group_filename: one '<pb_id><TAB><member>,<member>,...' per line
    :param output_filename: FASTA file the representatives are written to

    Raises KeyError if a group's pb_id has no 'transcript' row in the GFF.
    """
    fastad = LazyFastaReader(fasta_filename)
    fout = FastaWriter(output_filename)

    try:
        coords = {}
        with open(gff_filename) as gff_handle:
            for line in gff_handle:
                # ex: chr1    PacBio  transcript      27567   29336   .       -       .       gene_id "PB.1"; transcript_id "PB.1.1";
                stripped = line.strip()
                # Header/comment ('#') and blank lines cannot be indexed by column.
                if not stripped or stripped.startswith('#'):
                    continue
                raw = stripped.split('\t')
                if raw[2] != 'transcript':
                    continue
                # Second attribute is `transcript_id "PB.1.1";` -- drop the
                # opening quote and the trailing `";`.
                tid = raw[-1].split('; ')[1].split()[1][1:-2]
                coords[tid] = "{0}:{1}-{2}({3})".format(raw[0], raw[3], raw[4],
                                                        raw[6])

        with open(group_filename) as group_handle:
            for line in group_handle:
                stripped = line.strip()
                if not stripped:
                    continue
                pb_id, members = stripped.split('\t')
                best_id = None
                best_seq = None
                max_len = 0
                for x in members.split(','):
                    # One lookup per member; the reader is consulted lazily.
                    seq = fastad[x].sequence
                    # '>=' keeps the last member when several share the max length.
                    if len(seq) >= max_len:
                        best_id, best_seq, max_len = x, seq, len(seq)
                fout.writeRecord("{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id),
                                 best_seq)
    finally:
        # Always release the output file, including on malformed input.
        fout.close()
Пример #3
0
def main(argv):
    """
    Copy records from the input file to the output FASTA, dropping contigs
    whose proportion of lowercase bases is at or above ``--filt``.

    The lowercase proportion of every record is printed to stdout, one value
    per record. A record with an empty sequence counts as proportion 0.0.

    :param argv: currently unused; see the note at parse_args() below
    """
    desc = 'A tool to trim quiver results for contigs majority lowercase'
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('inputFile', help='input sequence')
    parser.add_argument('outputFile', help='output fasta')
    parser.add_argument(
        '--filt',
        default=0.5,
        dest='filt',
        type=float,
        help=
        'proportion of lowercase bases a contig can have before being filtered out'
    )
    # NOTE(review): `argv` is not forwarded, so argparse falls back to
    # sys.argv[1:]. Kept as-is because the calling convention is not visible
    # here -- confirm before passing argv through.
    args = parser.parse_args()

    writer = FastaWriter(args.outputFile)
    try:
        for record in FastaReader(args.inputFile):
            sequence = record.sequence
            lower = sum(1 for c in sequence if c.islower())
            # Guard the division: an empty sequence has no lowercase bases.
            pro = lower / float(len(sequence)) if sequence else 0.0
            print(pro)
            if pro < args.filt:
                writer.writeRecord(record)
    finally:
        # Close the writer so buffered records reach the output file.
        writer.close()
Пример #4
0
    def convert_to_dazz_fasta(self):
        """
        Convert input fasta/fastq file to daligner-compatible fasta with ids:
        <prefix>/<index>/0_<seqlen>

        <prefix> is self.dazz_movie_name, <index> counts records from 1 in
        input order, and <seqlen> is the length of the record's sequence.

        Also record index -> original record id in self.dazz_mapping and
        write that mapping out to "<self.dazz_filename>.pickle".
        """
        if self.filetype == "fasta":
            reader = FastaReader(self.input_filename)
        else:
            # Any filetype other than "fasta" is read as fastq.
            reader = FastqReader(self.input_filename)

        writer = FastaWriter(self.dazz_filename)
        try:
            for i, r in enumerate(reader, 1):
                writer.writeRecord("{p}/{i}/0_{len}".format(p=self.dazz_movie_name, i=i, len=len(r.sequence)), r.sequence)
                self.dazz_mapping[i] = r.id
        finally:
            # Close the fasta even if a record fails to convert.
            writer.close()

        # Pickle output is a byte stream, so the file must be opened in
        # binary mode (presumably `dump` is pickle's dump -- confirm import).
        with open(self.dazz_filename + ".pickle", "wb") as handle:
            dump(self.dazz_mapping, handle)
Пример #5
0
    def convert_to_dazz_fasta(self):
        """
        Convert input fasta/fastq file to daligner-compatible fasta with ids:
        <prefix>/<index>/0_<seqlen>

        <prefix> is self.dazz_movie_name, <index> is the 1-based position of
        the record in the input, and <seqlen> is len(record.sequence).

        Also store index -> original record id in self.dazz_mapping and
        write that mapping out to '<self.dazz_filename>.pickle'.
        """
        # Anything that is not declared 'fasta' is treated as fastq.
        reader_cls = FastaReader if self.filetype == 'fasta' else FastqReader
        reader = reader_cls(self.input_filename)

        fasta_out = FastaWriter(self.dazz_filename)
        try:
            for index, rec in enumerate(reader, 1):
                new_id = "{p}/{i}/0_{len}".format(p=self.dazz_movie_name, i=index, len=len(rec.sequence))
                fasta_out.writeRecord(new_id, rec.sequence)
                self.dazz_mapping[index] = rec.id
        finally:
            # Make sure the fasta is closed even when conversion fails.
            fasta_out.close()

        # Binary mode: pickle writes bytes, not text (presumably `dump` is
        # pickle's dump -- confirm against the file's imports).
        with open(self.dazz_filename + '.pickle', 'wb') as pickle_out:
            dump(self.dazz_mapping, pickle_out)