def pick_longest_rep(fasta_filename, gff_filename, group_filename, output_filename):
    """
    For each group, select the representative record to be the longest

    fasta_filename --- FASTA of member sequences, accessed by id through
                       LazyFastaReader
    gff_filename --- GFF whose "transcript" lines provide the coordinates
    group_filename --- one group per line: <pb_id> TAB <comma-separated member ids>
    output_filename --- FASTA to write, one record per group, named
                        "<pb_id>|<chr>:<start>-<end>(<strand>)|<member id>"

    When several members share the maximum length, the one listed last wins.
    Raises KeyError if a group id has no transcript line in the GFF.
    """
    fastad = LazyFastaReader(fasta_filename)

    coords = {}
    with open(gff_filename) as gff:
        for line in gff:
            # ex: chr1    PacBio  transcript      27567   29336   .       -       .       gene_id "PB.1"; transcript_id "PB.1.1";
            raw = line.strip().split("\t")
            # Lines with fewer than three columns (blank lines, headers)
            # cannot be transcript records; skip them instead of crashing.
            if len(raw) > 2 and raw[2] == "transcript":
                # second attribute is transcript_id "<id>"; -> drop the
                # leading quote and the trailing '";'
                tid = raw[-1].split("; ")[1].split()[1][1:-2]
                coords[tid] = "{0}:{1}-{2}({3})".format(raw[0], raw[3], raw[4], raw[6])

    fout = FastaWriter(output_filename)
    try:
        with open(group_filename) as groups:
            for line in groups:
                pb_id, members = line.strip().split("\t")
                best_id = None
                best_seq = None
                max_len = 0
                for x in members.split(","):
                    # Look the record up once; each lookup goes through the
                    # lazy reader.
                    seq = fastad[x].sequence
                    if len(seq) >= max_len:
                        best_id = x
                        best_seq = seq
                        max_len = len(seq)
                fout.writeRecord("{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id), best_seq)
    finally:
        # Close the writer even when a group fails, so records already
        # written are not lost.
        fout.close()
Пример #2
0
def main(argv):
    """
    Copy the contigs of a FASTA file whose proportion of lowercase bases is
    below --filt to the output FASTA, printing each contig's proportion.

    argv --- kept for interface compatibility; it is not consulted, because
             parse_args() is called without arguments and therefore reads
             the process command line.

    Contigs with an empty sequence are skipped, since a proportion cannot
    be computed for them.
    """
    desc = 'A tool to trim quiver results for contigs majority lowercase'
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('inputFile', help='input sequence')
    parser.add_argument('outputFile', help='output fasta')
    parser.add_argument(
        '--filt',
        default=0.5,
        dest='filt',
        type=float,
        help=
        'proportion of lowercase bases a contig can have before being filtered out'
    )
    args = parser.parse_args()

    # The context manager guarantees the output is flushed and closed.
    with FastaWriter(args.outputFile) as writer:
        for record in FastaReader(args.inputFile):
            seq_len = len(record.sequence)
            if seq_len == 0:
                # Avoid ZeroDivisionError on an empty contig.
                continue
            lower = sum(1 for c in record.sequence if c.islower())
            pro = lower / float(seq_len)
            # Single-argument parenthesised form prints identically on
            # Python 2 and is valid syntax on Python 3.
            print(pro)
            if pro < args.filt:
                writer.writeRecord(record)
Пример #3
0
def separate_listed_sequences(fasta_file, good_values, good_output,
                              bad_output):
    """
    Split the records of fasta_file between two FASTA outputs.

    A record is written to good_output when its base sequence name (as
    returned by get_base_sequence_name) is in good_values; every other
    record is written to bad_output.
    """
    with FastaWriter(good_output) as accepted, \
            FastaWriter(bad_output) as rejected:
        for entry in FastaReader(fasta_file):
            is_listed = get_base_sequence_name(entry.name) in good_values
            destination = accepted if is_listed else rejected
            destination.writeRecord(entry)
Пример #4
0
def separate_aligned_sequences(fasta_file, dictionary, good_values,
                               good_output, bad_output):
    """
    Split the records of fasta_file between two FASTA outputs.

    Each record's base sequence name is looked up in dictionary (names
    without an entry map to "Unmapped").  The record goes to good_output
    when the looked-up value is in good_values and to bad_output otherwise.
    """
    with FastaWriter(good_output) as accepted, \
            FastaWriter(bad_output) as rejected:
        for entry in FastaReader(fasta_file):
            base_name = get_base_sequence_name(entry.name)
            mapped_to = dictionary.get(base_name, "Unmapped")
            destination = accepted if mapped_to in good_values else rejected
            destination.writeRecord(entry)
Пример #5
0
    def _updateChimeraInfo(self,
                           suspicous_hits,
                           in_read_fn,
                           out_nc_fn,
                           out_c_fn,
                           primer_report_fn,
                           write_report_header=True):
        """
        Flag every read of in_read_fn as chimeric or non-chimeric.

        suspicous_hits --- container of read ids considered chimeric
        in_read_fn --- input FASTA of reads
        out_nc_fn --- output FASTA receiving the non-chimeric reads
        out_c_fn --- output FASTA receiving the chimeric reads
        primer_report_fn --- comma-delimited report, one row per read
        write_report_header --- write the header row first when True

        A read is chimeric when the first whitespace-delimited token of its
        name is in suspicous_hits.  Each read is written with its updated
        annotation as the record name.
        Return:
            (num_nc, num_c, num_nc_bases, num_c_bases)
        """
        logging.debug(
            "Update chimera info for reads in {f} ".format(f=in_read_fn))
        logging.debug(
            "Write primer report to {rpt}".format(rpt=primer_report_fn))

        # Index 0 holds the non-chimeric tallies, index 1 the chimeric ones.
        read_counts = [0, 0]
        base_counts = [0, 0]
        with FastaReader(in_read_fn) as reader, \
                FastaWriter(out_nc_fn) as writer, \
                FastaWriter(out_c_fn) as writer_chimera, \
                open(primer_report_fn, 'w') as reporter:
            if write_report_header:
                reporter.write(ReadAnnotation.header(delimiter=",") + "\n")
            sinks = (writer, writer_chimera)
            for r in reader:
                # e.g. r.name="movie/zmw/0_100_CCS fiveend=1;threeend=100;"
                readid = r.name.split()[0]
                annotation = ReadAnnotation.fromString(
                    r.name, ignore_polyA=self.ignore_polyA)
                flag = 1 if readid in suspicous_hits else 0
                annotation.chimera = flag
                read_counts[flag] += 1
                base_counts[flag] += len(r.sequence)
                sinks[flag].writeRecord(annotation.toAnnotation(), r.sequence)

                reporter.write(annotation.toReportRecord(delimitor=",") + "\n")
            return (read_counts[0], read_counts[1],
                    base_counts[0], base_counts[1])
Пример #6
0
def writerProcess(outDir):
    """
    Consume results from the queue and write the report and sequences.

    outDir --- output directory (created if missing).  It receives
               Report.csv and a Demultiplexed/ sub-directory holding one
               FASTA or FASTQ file per barcode.

    Exactly totalNumber results are taken from resultQueue.  Every result
    gets a row in Report.csv; its sequence is written to the barcode file
    only when none of its failure flags is set.  The output format follows
    args.fastq.  totalNumber, resultQueue and args are module-level names.
    """
    # makes output directories
    fastOutDir = os.path.join(outDir, "Demultiplexed/")
    for directory in (outDir, fastOutDir):
        if not os.path.exists(directory):
            os.makedirs(directory)

    writers = {}
    # opens files
    csvOut = open(os.path.join(outDir, "Report.csv"), "w")
    try:
        csvOut.write("Name,Barcode,NumPasses,Coverage,AvgConfidence,MinConfidence,TrimFail,MappingFail\n")
        for _ in range(totalNumber):
            result = resultQueue.get()
            csvOut.write("%s,%s,%d,%d,%0.6f,%0.6f,%s,%s\n" % (
                result.name, result.barcode, result.numPasses, result.coverage, result.predictedAccuracy,
                result.minConfidence, result.trimFail, result.mappingFail))
            # A barcode file is created the first time the barcode is seen,
            # even if that particular result ends up failing the filters.
            if result.barcode not in writers:
                if args.fastq:
                    writers[result.barcode] = FastqWriter(os.path.join(fastOutDir, result.barcode + ".fastq"))
                else:
                    writers[result.barcode] = FastaWriter(os.path.join(fastOutDir, result.barcode + ".fasta"))
            if not any((result.minNumPassesFail, result.mappingFail, result.trimFail, result.minCoverageFail,
                        result.minAvgConfidenceFail, result.minConfidenceFail)):
                if args.fastq:
                    writers[result.barcode].writeRecord(result.name, result.seq, result.qual)
                else:
                    writers[result.barcode].writeRecord(result.name, result.seq)
    finally:
        # Close everything explicitly so buffered output reaches disk even
        # if the process is interrupted or a result is malformed.
        csvOut.close()
        for barcode_writer in writers.values():
            barcode_writer.close()
Пример #7
0
def _write_assigned_reads(input_fasta, assignments):
    """
    Write out subreads to the appropriate file

    input_fasta --- FASTA of subreads to distribute
    assignments --- mapping of group -> container of read names

    One "<root>_<group>.fasta" file is created per group, where <root> is
    input_fasta without its extension.  A read is written to the first
    group (in iteration order of assignments) whose container holds the
    first whitespace-delimited token of the read name; reads found in no
    group are dropped.

    Returns the list of output file names, one per group.
    """
    # Local import keeps this block self-contained.
    import os

    log.info("Separating subreads based on their amplicon assignments")
    output_files = []
    writers = {}
    # splitext only strips a real extension of the final path component, so
    # dots in directory names or extension-less inputs are handled.
    root_name = os.path.splitext(input_fasta)[0]
    try:
        # Open up output writers for each group
        for group in assignments:
            output_file = "%s_%s.fasta" % (root_name, group)
            output_files.append(output_file)
            writers[group] = FastaWriter(output_file)

        # Write each record to its appropriate group
        for record in FastaReader(input_fasta):
            name = record.name.split()[0]
            for group in assignments:
                if name in assignments[group]:
                    writers[group].writeRecord(record)
                    break
    finally:
        # Close all of the output writers, including after a failure
        for group in writers:
            writers[group].close()
    return output_files
Пример #8
0
 def open_writer(self):
     """
     Open the trimmed-output writer that matches self.filetype.

     'fasta' opens "<prefix>.trim.fasta" with a FastaWriter and 'fastq'
     opens "<prefix>.trim.fastq" with a FastqWriter; the writer is stored
     in self.writer.  Any other filetype leaves self.writer untouched.
     """
     kind = self.filetype
     if kind == 'fastq':
         self.writer = FastqWriter('%s.trim.fastq' % self.prefix)
     elif kind == 'fasta':
         self.writer = FastaWriter('%s.trim.fasta' % self.prefix)
Пример #9
0
 def add_writer(self, group):
     """
     Open an output writer for group and store it in self.writers[group].

     The file is "<prefix>.g<group>.fasta" (FastaWriter) when self.filetype
     is 'fasta' and "<prefix>.g<group>.fastq" (FastqWriter) when it is
     'fastq'.  Any other filetype adds nothing.
     """
     kind = self.filetype
     if kind == 'fasta':
         self.writers[group] = FastaWriter(
             '%s.g%s.fasta' % (self.prefix, group))
     elif kind == 'fastq':
         self.writers[group] = FastqWriter(
             '%s.g%s.fastq' % (self.prefix, group))
Пример #10
0
 def writeSequenceData(self, sequenceData):
     """
     Write the records of sequenceData to "temp_<counter>.fasta".

     The file name is built from self.counter, remembered in
     self.tempFiles, and returned.
     """
     tempName = 'temp_%s.fasta' % self.counter
     with FastaWriter(tempName) as fastaOut:
         for entry in sequenceData:
             fastaOut.writeRecord(entry)
     self.tempFiles.append(tempName)
     return tempName
Пример #11
0
def pick_rep(fa_fq_filename,
             gff_filename,
             group_filename,
             output_filename,
             is_fq=False,
             pick_least_err_instead=True):
    """
    For each group, select the representative record

    If is FASTA file (is_fq False) -- then always pick the longest one
    If is FASTQ file (is_fq True) -- then
          If pick_least_err_instead is True, pick the one w/ least number of expected base errors
          Else, pick the longest one

    fa_fq_filename --- FASTA/FASTQ of member records, accessed by id
    gff_filename --- GFF whose "transcript" lines provide the coordinates
    group_filename --- one group per line: <pb_id> TAB <comma-separated member ids>
    output_filename --- FASTA/FASTQ to write, one record per group, named
                        "<pb_id>|<chr>:<start>-<end>(<strand>)|<member id>"

    The expected number of errors of a read is the sum over its bases of
    10**(-Q/10), Q being the Phred quality of the base.  On equal length
    the member listed last wins; on equal expected error the first wins.
    Raises KeyError if a group id has no transcript line in the GFF.
    """
    if is_fq:
        fd = LazyFastqReader(fa_fq_filename)
    else:
        fd = LazyFastaReader(fa_fq_filename)

    coords = {}
    with open(gff_filename) as gff:
        for line in gff:
            # ex: chr1    PacBio  transcript      27567   29336   .       -       .       gene_id "PB.1"; transcript_id "PB.1.1";
            raw = line.strip().split('\t')
            # Lines with fewer than three columns (blank lines, headers)
            # cannot be transcript records; skip them instead of crashing.
            if len(raw) > 2 and raw[2] == 'transcript':
                tid = raw[-1].split('; ')[1].split()[1][1:-2]
                coords[tid] = "{0}:{1}-{2}({3})".format(raw[0], raw[3], raw[4],
                                                        raw[6])

    by_least_err = is_fq and pick_least_err_instead
    fout = FastqWriter(output_filename) if is_fq else FastaWriter(output_filename)
    try:
        with open(group_filename) as groups:
            for line in groups:
                pb_id, members = line.strip().split('\t')
                sys.stderr.write(
                    "Picking representative sequence for {0}\n".format(pb_id))
                best_id = None
                best_seq = None
                best_qual = None
                best_err = float('inf')
                max_len = 0
                for x in members.split(','):
                    # One lookup per member; each goes through the lazy reader.
                    rec = fd[x]
                    if by_least_err:
                        # Phred Q -> error probability 10**(-Q/10)
                        err = sum(10 ** -(q / 10.) for q in rec.quality)
                        is_better = err < best_err
                    else:
                        is_better = len(rec.sequence) >= max_len
                    if is_better:
                        best_id = x
                        best_seq = rec.sequence
                        if is_fq:
                            best_qual = rec.quality
                        if by_least_err:
                            best_err = err
                        max_len = len(rec.sequence)

                _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
                if is_fq:
                    fout.writeRecord(_id_, best_seq, best_qual)
                else:
                    fout.writeRecord(_id_, best_seq)
    finally:
        # Close the writer even when a group fails, so records already
        # written are not lost.
        fout.close()
Пример #12
0
def write_references(reference_file, references):
    """
    Write each requested reference sequence to its own FASTA file.

    reference_file --- FASTA containing the candidate references
    references --- names to extract; the i-th name (0-based) is written to
                   "reference_<i+1>.fasta"

    A record matches when the first whitespace-delimited token of its name
    equals the requested name.  If several records match one name, the last
    one in the file is kept.  No file is created for a name that matches
    nothing.
    """
    references = list(references)
    if not references:
        return

    # Read the reference file once instead of once per requested name.
    wanted = set(references)
    found = {}
    for record in FastaReader(reference_file):
        name = record.name.split()[0]
        if name in wanted:
            found[name] = record

    for i, ref in enumerate(references):
        if ref in found:
            filename = 'reference_%s.fasta' % (i + 1)
            with FastaWriter(filename) as writer:
                writer.writeRecord(found[ref])
Пример #13
0
def subset_references(reference_file, reference_names):
    """
    Copy the selected records of reference_file into 'references.fasta'.

    A record is selected when the first whitespace-delimited token of its
    name is in reference_names.  Returns the output file name.
    """
    output = 'references.fasta'
    with FastaWriter(output) as subset_out:
        selected = (entry for entry in FastaReader(reference_file)
                    if entry.name.split()[0] in reference_names)
        for entry in selected:
            subset_out.writeRecord(entry)
    return output
Пример #14
0
def get_temp_fasta(record):
    """
    Write record to a temporary '.fasta' file for external tools.

    The record is first converted with get_temp_fasta_record.  The
    NamedTemporaryFile object itself is returned; use its .name attribute
    to get the path.
    """
    converted = get_temp_fasta_record(record)
    holder = NamedTemporaryFile(suffix='.fasta')
    # The writer opens the file a second time, by name.
    with FastaWriter(holder.name) as fasta_out:
        fasta_out.writeRecord(converted)
    return holder
Пример #15
0
def combine_fasta(fasta_files, destination):
    """
    Concatenate the records of several FASTA files into destination.

    fasta_files --- iterable of input FASTA paths
    destination --- output FASTA path

    An input that fails while being read or copied is reported with a
    warning and skipped; records copied from it before the failure remain
    in the output.  The output is validated with check_output_file.
    """
    with FastaWriter(destination) as handle:
        for fasta in fasta_files:
            try:
                for record in FastaReader(fasta):
                    handle.writeRecord(record)
            # Not a bare except: KeyboardInterrupt and SystemExit must
            # still be able to stop the program.
            except Exception:
                log.warn('Could not open "%s" as Fasta' % fasta)
    check_output_file(destination)
Пример #16
0
def write_fasta(records, output_file):
    """
    Write a FastaRecord, or a list of FastaRecords, out to file

    records --- a single FastaRecord or an iterable of FastaRecords
    output_file --- output FASTA path

    Raises AssertionError if an item is not a FastaRecord.  The output is
    validated with check_output_file and its path is returned.
    """
    # Accept a lone record, as the summary line promises.
    if isinstance(records, FastaRecord):
        records = [records]
    with FastaWriter(output_file) as handle:
        for record in records:
            # Kept as an assert so callers see the same exception type.
            assert isinstance(record, FastaRecord)
            handle.writeRecord(record)
    check_output_file(output_file)
    return output_file
Пример #17
0
def extract_subreads(input_file,
                     output_file,
                     min_length,
                     max_length,
                     min_score,
                     min_snr,
                     max_count,
                     white_list=None):
    """
    Extract, filter and subset subreads from Bas/Bax/Fofn Files

    For the N-th input file (1-based) yielded by _iterate_input_files, the
    extracted subreads are written to "subreads_<N>.fasta" in the directory
    of output_file, and output_file itself is written as a FOFN listing
    those FASTA files.  Input files with an unsupported extension are
    skipped with a warning.

    max_count is logged but not applied (see the TODO below).

    Returns output_file.
    """
    log.info('Extracting subreads from %s' % os.path.basename(input_file))
    log.debug('\tMinimum Length:\t%s' % min_length)
    log.debug('\tMaximum Length:\t%s' % max_length)
    log.debug('\tMinimum Score:\t%s' % min_score)
    log.debug('\tMinimum SNR:\t%s' % min_snr)
    log.debug('\tMax Count:\t%s' % max_count)
    log.debug('\tWhitelisted ZMWs:\t%s' % white_list)

    if white_list:
        white_list = set(_parse_white_list(white_list))

    output_prefix = os.path.dirname(output_file)
    output_file_list = []

    subread_count = 0
    # Stays 0 when there are no input files, so the summary log below
    # cannot hit an undefined loop variable.
    file_count = 0
    for file_count, filename in enumerate(_iterate_input_files(input_file), 1):
        if filename.endswith(('.bas.h5', 'bax.h5')):
            subreads = _extract_from_bash5(filename, min_length, max_length,
                                           min_score, min_snr, white_list)
        elif filename.endswith(('.fa', '.fasta')):
            subreads = _extract_from_fasta(filename, min_length, max_length)
        else:
            # Without this branch `subreads` would be undefined, or would
            # still hold the previous file's reads and be written twice.
            log.warning('Skipping unsupported input file %s' % filename)
            continue

        curr_output = os.path.join(output_prefix,
                                   'subreads_%s.fasta' % file_count)
        with FastaWriter(curr_output) as writer:
            for record in subreads:
                writer.writeRecord(record)

        subread_count += len(subreads)
        output_file_list.append(curr_output)
    log.info("Extracted %s subreads from %s files" % (subread_count,
                                                      file_count))

    log.info("Writing FOFN of subread files")
    with open(output_file, 'w') as handle:
        for filename in output_file_list:
            handle.write(filename + '\n')

    # TODO: Fix MaxCount function
    #if max_count:
    #    subreads = _subset_subreads( subreads, max_count )

    log.info("Finished extracting subreads")
    return output_file
Пример #18
0
def make_current_fasta(icec_obj, flnc_filename, root_dir):
    """
    Copy every record of flnc_filename into the current FASTA of root_dir.

    The destination path is obtained from current_fasta(root_dir).
    icec_obj is accepted but not used by this function.
    """
    destination = current_fasta(root_dir)
    with FastaWriter(destination) as fasta_out:
        for entry in FastaReader(flnc_filename):
            fasta_out.writeRecord(entry)
Пример #19
0
def _open_output_handle(output_file, output_type):
    """
    Return a sequence writer for output_file chosen by output_type.

    'fasta' yields a FastaWriter and 'fastq' a FastqWriter.  Any other
    value is logged as an error and raises TypeError.
    """
    if output_type == 'fasta':
        writer_class = FastaWriter
    elif output_type == 'fastq':
        writer_class = FastqWriter
    else:
        msg = 'Output type must be Fasta or Fastq'
        log.error(msg)
        raise TypeError(msg)
    return writer_class(output_file)
Пример #20
0
 def outputReferenceFasta(self, reference, count):
     """
     Write the reference sequence of cluster number count to
     "cluster<count>_ref.fasta" and return that file name.

     The record is named "cluster<count>_reference<TAB><reference.name>".
     If the file already exists it is left untouched and its name is
     returned.
     """
     # Single-argument parenthesised form prints identically on Python 2
     # and is valid syntax on Python 3.
     print("Creating reference sequence for Cluster #%s" % count)
     referenceFile = 'cluster%s_ref.fasta' % count
     reference_desc = 'cluster{0}_reference\t{1}'.format(
         count, reference.name)
     if os.path.exists(referenceFile):
         return referenceFile
     with FastaWriter(referenceFile) as handle:
         referenceFasta = FastaRecord(reference_desc, reference.sequence)
         handle.writeRecord(referenceFasta)
     return referenceFile
Пример #21
0
 def outputClusterFasta(self, reads, count):
     """
     Write the reads of cluster number count to "cluster<count>.fasta".

     Each read is re-wrapped as a FastaRecord carrying its name and
     sequence.  An existing file is left untouched.  Returns the file name.
     """
     fastaFile = 'cluster%s.fasta' % count
     if not os.path.exists(fastaFile):
         with FastaWriter(fastaFile) as clusterOut:
             for read in reads:
                 clusterOut.writeRecord(
                     FastaRecord(read.name, read.sequence))
     return fastaFile
Пример #22
0
    def convert_to_dazz_fasta(self):
        """
        Convert input fasta/fastq file to daligner-compatible fasta with ids:
        <prefix>/<index>/0_<seqlen>

        <prefix> is self.dazz_movie_name and <index> counts records from 1.
        The converted records go to self.dazz_filename.

        Also write out mappings to pickle: self.dazz_mapping maps each index
        to the original record id and is dumped to
        self.dazz_filename + ".pickle".
        """
        if self.filetype == "fasta":
            reader = FastaReader(self.input_filename)
        else:
            reader = FastqReader(self.input_filename)

        writer = FastaWriter(self.dazz_filename)
        try:
            for i, r in enumerate(reader, 1):
                writer.writeRecord("{p}/{i}/0_{len}".format(p=self.dazz_movie_name, i=i, len=len(r.sequence)), r.sequence)
                self.dazz_mapping[i] = r.id
        finally:
            # Close the FASTA even if a record fails to convert.
            writer.close()

        # Pickle data is binary: "wb" is required on Python 3 and avoids
        # newline translation on Windows.
        with open(self.dazz_filename + ".pickle", "wb") as handle:
            dump(self.dazz_mapping, handle)
Пример #23
0
def pick_longest_rep(fasta_filename, gff_filename, group_filename,
                     output_filename):
    """
    For each group, select the representative record to be the longest

    fasta_filename --- FASTA of member sequences, accessed by id through
                       LazyFastaReader
    gff_filename --- GFF whose "transcript" lines provide the coordinates
    group_filename --- one group per line: <pb_id> TAB <comma-separated member ids>
    output_filename --- FASTA to write, one record per group, named
                        "<pb_id>|<chr>:<start>-<end>(<strand>)|<member id>"

    When several members share the maximum length, the one listed last wins.
    Raises KeyError if a group id has no transcript line in the GFF.
    """
    fastad = LazyFastaReader(fasta_filename)

    coords = {}
    with open(gff_filename) as gff:
        for line in gff:
            # ex: chr1    PacBio  transcript      27567   29336   .       -       .       gene_id "PB.1"; transcript_id "PB.1.1";
            raw = line.strip().split('\t')
            # Lines with fewer than three columns (blank lines, headers)
            # cannot be transcript records; skip them instead of crashing.
            if len(raw) > 2 and raw[2] == 'transcript':
                # second attribute is transcript_id "<id>"; -> drop the
                # leading quote and the trailing '";'
                tid = raw[-1].split('; ')[1].split()[1][1:-2]
                coords[tid] = "{0}:{1}-{2}({3})".format(raw[0], raw[3], raw[4],
                                                        raw[6])

    fout = FastaWriter(output_filename)
    try:
        with open(group_filename) as groups:
            for line in groups:
                pb_id, members = line.strip().split('\t')
                best_id = None
                best_seq = None
                max_len = 0
                for x in members.split(','):
                    # Look the record up once; each lookup goes through the
                    # lazy reader.
                    seq = fastad[x].sequence
                    if len(seq) >= max_len:
                        best_id = x
                        best_seq = seq
                        max_len = len(seq)
                fout.writeRecord(
                    "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id),
                    best_seq)
    finally:
        # Close the writer even when a group fails, so records already
        # written are not lost.
        fout.close()
Пример #24
0
 def runBlasr(cls, fastqRecord, alignedRecord):
     """
     Align alignedRecord against fastqRecord with blasr and return the
     best hit.

     fastqRecord is converted to FASTA and written as the blasr reference;
     alignedRecord is written as the query.  blasr is run with
     "-m 1 -bestn 1" and its output is parsed by cls.readBestBlasrHit.
     The temporary files are created in the current directory and removed
     again, whether or not the alignment succeeds.
     """
     tempId = str(int(random() * 10000000))
     tempRef = 'temp_ref_%s.fasta' % tempId
     tempQuery = 'temp_query_%s.fasta' % tempId
     tempOut = 'temp_%s.m1' % tempId
     try:
         # Write the query and reference records to file
         with FastaWriter(tempRef) as handle:
             handle.writeRecord(cls.convertFastqToFasta(fastqRecord))
         with FastaWriter(tempQuery) as handle:
             handle.writeRecord(alignedRecord)
         # Create and run the command-line; an argument list avoids
         # re-splitting a formatted string.
         cline = ['blasr', tempQuery, tempRef,
                  '-m', '1', '-bestn', '1', '-out', tempOut]
         subprocess.Popen(cline).communicate()
         # Parse and return the best hit
         return cls.readBestBlasrHit(tempOut)
     finally:
         # Remove whichever temp files were actually created, even when
         # blasr or the parser failed.
         for tempFile in (tempRef, tempQuery, tempOut):
             if os.path.exists(tempFile):
                 os.remove(tempFile)
Пример #25
0
def combine_fasta(sequence_files, output_file):
    """
    Combine a series of sequence files into one Fasta

    sequence_files --- iterable of input FASTA paths
    output_file --- output FASTA path

    An input that fails while being read or copied is reported with a
    warning and skipped; records copied from it before the failure remain
    in the output.  The output is validated with check_output_file and its
    path is returned.
    """
    with FastaWriter(output_file) as handle:
        for filename in sequence_files:
            try:
                for record in FastaReader(filename):
                    handle.writeRecord(record)
            # Not a bare except: KeyboardInterrupt and SystemExit must
            # still be able to stop the program.
            except Exception:
                # Report the file that failed; the loop variable is
                # `filename`.
                log.warn('Could not open "%s" as Fasta' % filename)
    check_output_file(output_file)
    return output_file
Пример #26
0
 def output_final_sequences(self, finalSequenceList):
     """
     Merge the sequence files listed in finalSequenceList into one FASTA.

     finalSequenceList is a text file with one sequence file path per line.
     The output path comes from self.process_setup; if
     self.output_files_exist reports it as present, nothing is written.
     Otherwise each listed file is copied with copy_fasta_sequences and
     self.process_cleanup is called.  Returns the output path.
     """
     outputFile = self.process_setup(finalSequenceList,
                                     'SequenceWriter',
                                     suffix='fasta')
     if not self.output_files_exist(output_file=outputFile):
         with FastaWriter(outputFile) as writer, \
                 open(finalSequenceList) as listing:
             for entry in listing:
                 copy_fasta_sequences(entry.strip(), writer)
         self.process_cleanup(output_file=outputFile)
     return outputFile
Пример #27
0
def write_fasta(fasta_records, output_file):
    """
    Write a FastaRecord, or list of records, out to file

    fasta_records --- a single FastaRecord or a list of records
    output_file --- output FASTA path

    Raises TypeError (after logging the message) for any other input type.
    The input is validated before the output is opened, so a rejected call
    neither creates nor truncates output_file.
    """
    if isinstance(fasta_records, FastaRecord):
        to_write = [fasta_records]
    elif isinstance(fasta_records, list):
        to_write = fasta_records
    else:
        msg = "Input Record(s) type not recognized"
        log.error(msg)
        raise TypeError(msg)

    with FastaWriter(output_file) as handle:
        for record in to_write:
            handle.writeRecord(record)
    check_output_file(output_file)
Пример #28
0
def separate_sequences(fasta_file, dictionary, prefix=''):
    """
    Separate a fasta file into multiple groups based on some dict

    fasta_file --- input FASTA
    dictionary --- mapping of base sequence name -> group name; names
                   without an entry go to the "Unmapped" group
    prefix --- prepended to every output name, which has the form
               "<prefix>_<group>.fasta"

    A group's file is opened the first time one of its records is seen.
    Returns whatever closed_file_handles returns for the opened writers.
    """
    file_handles = {}
    for record in FastaReader(fasta_file):
        name = get_base_sequence_name(record.name)
        group = dictionary.get(name, "Unmapped")
        group_file = prefix + '_' + group + '.fasta'
        # Test for the handle explicitly rather than catching KeyError
        # around the write: a KeyError raised inside writeRecord would
        # otherwise reopen, and so truncate, the group's file.
        if group_file not in file_handles:
            file_handles[group_file] = FastaWriter(group_file)
        file_handles[group_file].writeRecord(record)
    return closed_file_handles(file_handles)
Пример #29
0
    def _updateChimeraInfo(self, suspicous_hits, in_read_fn, out_flnc_fn,
                           out_flc_fn, primer_report_fl_fn):
        """
        Flag every read of in_read_fn as chimeric or non-chimeric.

        suspicous_hits --- container of read ids considered chimeric
        in_read_fn --- input FASTA of full-length reads
        out_flnc_fn --- output FASTA receiving the non-chimeric reads
        out_flc_fn --- output FASTA receiving the chimeric reads
        primer_report_fl_fn --- report file: a tab-joined header line
                                followed by one row per read

        A read is chimeric when the first whitespace-delimited token of its
        name is in suspicous_hits.  The counters num_flnc, num_flnc_bases
        and num_flc of self.summary are updated along the way.
        """
        logging.info("Update chimera info to reads annotations "
                     "in the output FASTA file and the primer report.")

        with FastaReader(in_read_fn) as reader, \
             FastaWriter(out_flnc_fn) as writer, \
             FastaWriter(out_flc_fn) as writer_chimera, \
             open(primer_report_fl_fn, 'w') as reporter:
            reporter.write("\t".join(ReadAnnotation.fieldsNames()) + "\n")
            for r in reader:
                # e.g. r.name="movie/zmw/0_100_CCS fiveend=1;threeend=100;"
                readid = r.name.split()[0]
                annotation = ReadAnnotation.fromString(
                    r.name, ignore_polyA=self.ignore_polyA)
                if readid in suspicous_hits:  # chimeric reads
                    annotation.chimera = 1
                    self.summary.num_flc += 1
                    destination = writer_chimera
                else:  # Non-chimeric reads
                    annotation.chimera = 0
                    assert (annotation.isFullLength)
                    self.summary.num_flnc += 1
                    self.summary.num_flnc_bases += len(r.sequence)
                    destination = writer
                destination.writeRecord(annotation.toAnnotation(),
                                        r.sequence)

                reporter.write(annotation.toReportRecord() + "\n")
Пример #30
0
def extract_exons(afa_file, info_file):
    """
    Write one FASTA per selected exon and list the files in a FOFN.

    afa_file --- FASTA file; the text before its first '_' is used as the
                 locus name
    info_file --- parsed with _parse_info_file to obtain the regions

    For every exon chosen by _select_exons, the matching region of each
    record is written to the file named by _get_output_file, except for
    regions whose sequence consists of a single distinct character.  The
    absolute path of each exon file is appended to "<locus>_exons.fofn".
    """
    locus = afa_file.split('_')[0]
    output_fofn = '%s_exons.fofn' % locus
    records = list(FastaReader(afa_file))
    regions = list(_parse_info_file(info_file))

    with open(output_fofn, 'w') as fofn_handle:
        for exon in _select_exons(regions):
            exon_path = _get_output_file(locus, exon)
            with FastaWriter(exon_path) as exon_out:
                for piece in _extract_fasta_region(records, exon):
                    # Keep only regions with more or fewer than exactly
                    # one distinct character.
                    if len(set(piece.sequence)) != 1:
                        exon_out.writeRecord(piece)
            fofn_handle.write(os.path.abspath(exon_path) + '\n')
Пример #31
0
def rename_fasta( input_file, output_file, name_key ):
    """
    Copy input_file to output_file, giving every record a new name.

    name_key is loaded with read_dict_file into a mapping of old name ->
    new name; the old name is the first whitespace-delimited token of the
    record name.  A record whose old name is missing from the mapping is
    logged and raises KeyError.  The output is validated with
    check_output_file and its path is returned.
    """
    name_map = read_dict_file( name_key )
    with FastaWriter( output_file ) as fasta_out:
        for entry in FastaReader( input_file ):
            current = entry.name.split()[0]
            try:
                replacement = name_map[current]
            except KeyError:
                msg = "Sequence name not found!"
                log.error( msg )
                raise KeyError( msg )
            else:
                fasta_out.writeRecord(
                    FastaRecord( replacement, entry.sequence ) )
    check_output_file( output_file )
    return output_file
Пример #32
0
def extract_subreads(input_file,
                     output_file,
                     min_length,
                     max_length,
                     min_score,
                     min_snr,
                     max_count,
                     white_list=None):
    """
    Extract, filter and subset subreads from Bas/Bax/Fofn Files

    Subreads are collected from every input file yielded by
    _iterate_input_files (files with an unsupported extension contribute
    nothing), reduced with _subset_subreads when max_count is truthy, and
    written to output_file as FASTA.

    Returns output_file.
    """
    log.info('Extracting subreads from %s' % os.path.basename(input_file))
    log.debug('\tMinimum Length:\t%s' % min_length)
    log.debug('\tMaximum Length:\t%s' % max_length)
    log.debug('\tMinimum Score:\t%s' % min_score)
    log.debug('\tMinimum SNR:\t%s' % min_snr)
    log.debug('\tMax Count:\t%s' % max_count)
    log.debug('\tWhitelisted ZMWs:\t%s' % white_list)

    if white_list:
        white_list = set(_parse_white_list(white_list))

    subreads = []
    # Counted explicitly so the summary log also works when there are no
    # input files (the loop variable would be undefined in that case).
    file_count = 0
    for filename in _iterate_input_files(input_file):
        file_count += 1
        if filename.endswith(('.bas.h5', 'bax.h5')):
            subreads += _extract_from_bash5(filename, min_length, max_length,
                                            min_score, min_snr, white_list)
        elif filename.endswith(('.fa', '.fasta')):
            subreads += _extract_from_fasta(filename, min_length, max_length)
    log.info("Extracted %s subreads from %s files" % (len(subreads),
                                                      file_count))

    if max_count:
        subreads = _subset_subreads(subreads, max_count)

    with FastaWriter(output_file) as writer:
        for record in subreads:
            writer.writeRecord(record)

    log.info("Finished extracting subreads")
    return output_file