示例#1
0
    def create_renaming_key(self, raw_subreads, renamed_subreads):
        """
        Create a key for translating HBAR subread names to canonical PacBio names

        The key is a text file with one tab-separated line per subread:
        the renamed name first, then the raw name. Only the first
        whitespace-delimited token of each FASTA header is used. If a key
        file accepted by valid_file() already exists it is reused as-is.

        raw_subreads     -- FASTA file of subreads carrying their raw names
        renamed_subreads -- FASTA file of the renamed subreads; records are
                            paired with raw_subreads purely by position
        Returns the path of the renaming key file.
        Raises ValueError if the two files hold different record counts.
        """
        log.info("Looking for Raw<--->HBAR subread renaming key")
        renaming_key = self.get_filepath('subreads', 'renaming_key.txt')
        if valid_file(renaming_key):
            log.info('Using existing subread renaming key\n')
            return renaming_key

        log.info("No subread renaming key round, creating one...")
        # Records are paired by position below, so the two files must at
        # least agree on how many records they hold. This is an explicit
        # comparison rather than an assert so that the check still runs
        # when Python is started with -O (which strips assert statements).
        raw_count = fasta_size(raw_subreads)
        new_count = fasta_size(renamed_subreads)
        if raw_count != new_count:
            msg = 'The number of raw subreads (%s) does not ' % raw_count + \
                  'match the number of renamed reads (%s)' % new_count
            log.info(msg)
            raise ValueError(msg)
        # Write out the pairs of names to file
        with open(renaming_key, 'w') as handle:
            for raw, renamed in zip(FastaReader(raw_subreads),
                                    FastaReader(renamed_subreads)):
                raw_name = raw.name.split()[0]
                new_name = renamed.name.split()[0]
                handle.write('%s\t%s\n' % (new_name, raw_name))
        check_output_file(renaming_key)
        log.info("Finished creating subread renaming key\n")
        return renaming_key
示例#2
0
    def create_renaming_key(self, raw_subreads, renamed_subreads ):
        """
        Create a key for translating HBAR subread names to canonical PacBio names

        Writes one "<renamed name><TAB><raw name>" line per subread, using
        only the first whitespace-delimited token of each FASTA header.
        An existing key file accepted by valid_file() is returned untouched.

        raw_subreads     -- FASTA file of subreads carrying their raw names
        renamed_subreads -- FASTA file of the renamed subreads, in the same
                            record order as raw_subreads
        Returns the path of the renaming key file.
        Raises ValueError when the record counts of the two files differ.
        """
        log.info("Looking for Raw<--->HBAR subread renaming key")
        renaming_key = self.get_filepath( 'subreads', 'renaming_key.txt' )
        if valid_file( renaming_key ):
            log.info('Using existing subread renaming key\n')
            return renaming_key

        log.info("No subread renaming key round, creating one...")
        # The names are matched up positionally, so a count mismatch means
        # the key would be wrong. Compare explicitly instead of asserting:
        # assert statements are removed under "python -O", which would
        # silently disable this guard.
        raw_count = fasta_size( raw_subreads )
        new_count = fasta_size( renamed_subreads )
        if raw_count != new_count:
            msg = 'The number of raw subreads (%s) does not ' % raw_count + \
                  'match the number of renamed reads (%s)' % new_count
            log.info( msg )
            raise ValueError( msg )
        # Write out the pairs of names to file
        with open( renaming_key, 'w') as handle:
            for raw, renamed in zip( FastaReader(raw_subreads), FastaReader(renamed_subreads) ):
                raw_name = raw.name.split()[0]
                new_name = renamed.name.split()[0]
                handle.write('%s\t%s\n' % (new_name, raw_name))
        check_output_file( renaming_key )
        log.info("Finished creating subread renaming key\n")
        return renaming_key
示例#3
0
def align_by_identity(query, reference_fasta, output=None, format='1'):
    """
    Type sequences in a fasta file by finding the closest reference

    Every record read from query is written to its own temporary FASTA
    file and aligned against reference_fasta. The alignments are sorted
    and filtered, and the first remaining one is written to the output.
    Records without any alignment are logged and skipped.

    query           -- path of the sequence file to type
    reference_fasta -- path of the FASTA file holding the references
    output          -- path of the result file; when None it is derived
                       from query by replacing everything after the last
                       '.' with 'm<format>'
    format          -- '1' or '5'; forwarded to _align_fasta
    Returns the path of the output file.
    """
    # If output isn't specified, base it on the query
    assert format in ['1', '5']
    if output is None:
        basename = '.'.join(query.split('.')[:-1])
        output = '%s.m%s' % (basename, format)
    ref_count = fasta_size(reference_fasta)
    # Iterate over each Fasta, aligning individually.
    with BlasrWriter(output) as handle:
        # NOTE(review): the header is always 'm1', even for format '5' --
        # confirm this is intended.
        handle.write_header('m1')
        for record in read_sequences(query):
            log.info('Aligning %s by identity to %s references' %
                     (record.name, ref_count))
            temp = write_temp_fasta(record)
            try:
                alignments = _align_fasta(temp.name, reference_fasta, format)
            finally:
                # The temporary query is only needed while aligning.
                # Removing it here covers the "no hits" path and any
                # failure, both of which previously left the file behind.
                os.unlink(temp.name)
            if not alignments:
                log.info("No hits found for %s" % record.name)
                continue
            alignments = _sort_alignments(alignments)
            alignments = _filter_alignments(alignments)
            log.info(
                'Found %s alignments sharing maximum identity with the query' %
                len(alignments))
            handle.write(alignments[0])
    check_output_file(output)
    return output
示例#4
0
def align_by_identity( query, reference_fasta, output=None, format='1' ):
    """
    Type sequences in a fasta file by finding the closest reference

    Each record of query is aligned on its own (via a temporary FASTA
    file) against reference_fasta. After sorting and filtering, the first
    alignment is written to the output; records with no alignment are
    logged and skipped.

    query           -- path of the sequence file to type
    reference_fasta -- path of the FASTA file holding the references
    output          -- path of the result file; when None it is built from
                       query by replacing everything after the last '.'
                       with 'm<format>'
    format          -- '1' or '5'; forwarded to _align_fasta
    Returns the path of the output file.
    """
    # If output isn't specified, base it on the query
    assert format in ['1', '5']
    if output is None:
        basename = '.'.join( query.split('.')[:-1] )
        output = '%s.m%s' % (basename, format)
    ref_count = fasta_size(reference_fasta)
    # Iterate over each Fasta, aligning individually.
    with BlasrWriter( output ) as handle:
        # NOTE(review): the header is 'm1' regardless of format -- confirm.
        handle.write_header( 'm1' )
        for record in read_sequences( query ):
            log.info('Aligning %s by identity to %s references' % (record.name, ref_count))
            temp = write_temp_fasta( record )
            try:
                alignments = _align_fasta( temp.name, reference_fasta, format )
            finally:
                # Clean up the temporary query on every path; the old
                # placement at the end of the loop body was skipped by the
                # "no hits" continue and by any exception.
                os.unlink( temp.name )
            if not alignments:
                log.info("No hits found for %s" % record.name)
                continue
            alignments = _sort_alignments( alignments )
            alignments = _filter_alignments( alignments )
            log.info('Found %s alignments sharing maximum identity with the query' % len(alignments))
            handle.write( alignments[0] )
    check_output_file( output )
    return output
示例#5
0
def align_amplicons( filetype, sequence_5p, sequence_3p ):
    """
    Run Blasr with sequence_5p as the first input and sequence_3p as the second

    filetype    -- 'fastq' or 'fasta'
                   'fastq': both sequences are first written out with
                            write_temp_fasta and the temporary files are
                            aligned (with verbose=True)
                   'fasta': both arguments are used directly as FASTA
                            paths, each of which must hold exactly 2 records
    sequence_5p -- first positional argument handed to run_blasr
    sequence_3p -- second positional argument handed to run_blasr
    Returns whatever run_blasr returns.
    Raises ValueError for any other filetype.
    """
    # NOTE(review): the output name 'test.m5' is fixed and relative to the
    # current working directory.
    blasr_args = {'bestn': 1,
                  'out': 'test.m5',
                  'm': 5,
                  'noSplitSubreads': True}
    if filetype == 'fastq':
        temp_5p = write_temp_fasta( sequence_5p )
        temp_3p = write_temp_fasta( sequence_3p )
        align_left = run_blasr( temp_5p.name, temp_3p.name, blasr_args, verbose=True )
    elif filetype == 'fasta':
        assert fasta_size( sequence_5p ) == 2
        assert fasta_size( sequence_3p ) == 2
        align_left = run_blasr( sequence_5p, sequence_3p, blasr_args )
    else:
        # Same exception type as before, but now it says what was wrong.
        raise ValueError( "Unsupported filetype %r; expected 'fastq' or 'fasta'"
                          % (filetype,) )
    return align_left
示例#6
0
def _align_subreads( subread_fasta, reference_fasta, locus, nproc=8 ):
    """
    Align all locus-specific subreads against the appropriate references

    subread_fasta   -- path of the FASTA file of subreads to align
    reference_fasta -- path of the FASTA file of references
    locus           -- locus label, used only in the log message
    nproc           -- value passed to Blasr as 'nproc'; defaults to 8,
                       the value that used to be hard-coded
    Returns the path of the alignment file, which is always 'temp.m1' in
    the directory that contains subread_fasta.
    """
    location = os.path.dirname( subread_fasta )
    alignment_file = os.path.join(location, 'temp.m1')
    subread_count = fasta_size( subread_fasta )
    reference_count = fasta_size( reference_fasta )
    # nCandidates is set to the number of references while bestn stays 1.
    blasr_args = {'nproc': nproc,
                  'out': alignment_file,
                  'bestn': 1,
                  'nCandidates': reference_count,
                  'noSplitSubreads': True}
    log.info("Aligning %s reads against %s references for %s" % (subread_count, 
                                                                 reference_count,
                                                                 locus))
    run_blasr( subread_fasta, reference_fasta, blasr_args )
    check_output_file( alignment_file )
    return alignment_file
示例#7
0
def _align_subreads(subread_fasta, reference_fasta, locus, nproc=8):
    """
    Align all locus-specific subreads against the appropriate references

    subread_fasta   -- path of the FASTA file of subreads to align
    reference_fasta -- path of the FASTA file of references
    locus           -- locus label, used only in the log message
    nproc           -- value passed to Blasr as 'nproc'; the default of 8
                       matches the previously hard-coded setting
    Returns the path of the alignment file: 'temp.m1' placed next to
    subread_fasta.
    """
    location = os.path.dirname(subread_fasta)
    alignment_file = os.path.join(location, 'temp.m1')
    subread_count = fasta_size(subread_fasta)
    reference_count = fasta_size(reference_fasta)
    # nCandidates is set to the number of references while bestn stays 1.
    blasr_args = {
        'nproc': nproc,
        'out': alignment_file,
        'bestn': 1,
        'nCandidates': reference_count,
        'noSplitSubreads': True
    }
    log.info("Aligning %s reads against %s references for %s" %
             (subread_count, reference_count, locus))
    run_blasr(subread_fasta, reference_fasta, blasr_args)
    check_output_file(alignment_file)
    return alignment_file
示例#8
0
def align_amplicons(filetype, sequence_5p, sequence_3p):
    """
    Run Blasr with sequence_5p as the first input and sequence_3p as the second

    filetype    -- 'fastq' or 'fasta'
                   'fastq': both sequences are first written out with
                            write_temp_fasta and the temporary files are
                            aligned (with verbose=True)
                   'fasta': both arguments are used directly as FASTA
                            paths, each of which must hold exactly 2 records
    sequence_5p -- first positional argument handed to run_blasr
    sequence_3p -- second positional argument handed to run_blasr
    Returns whatever run_blasr returns.
    Raises ValueError for any other filetype.
    """
    # NOTE(review): the output name 'test.m5' is fixed and relative to the
    # current working directory.
    blasr_args = {
        'bestn': 1,
        'out': 'test.m5',
        'm': 5,
        'noSplitSubreads': True
    }
    if filetype == 'fastq':
        temp_5p = write_temp_fasta(sequence_5p)
        temp_3p = write_temp_fasta(sequence_3p)
        align_left = run_blasr(temp_5p.name,
                               temp_3p.name,
                               blasr_args,
                               verbose=True)
    elif filetype == 'fasta':
        assert fasta_size(sequence_5p) == 2
        assert fasta_size(sequence_3p) == 2
        align_left = run_blasr(sequence_5p, sequence_3p, blasr_args)
    else:
        # Same exception type as before, but now it names the bad value.
        raise ValueError(
            "Unsupported filetype %r; expected 'fastq' or 'fasta'" %
            (filetype,))
    return align_left
示例#9
0
def _parse_subread_counts(subread_fofn):
    """
    Count the number of subreads assocated with each consensus
    """
    sizes = {}
    with open(subread_fofn) as handle:
        for filepath in handle:
            filepath = filepath.strip()
            filename = os.path.basename(filepath)
            contig_name = filename.split('.')[0]
            if contig_name.startswith('Allele_'):
                contig_name = '_'.join(contig_name.split('_')[1:])
            if contig_name.startswith('Resequenced_'):
                contig_name = '_'.join(contig_name.split('_')[1:])
            sizes[contig_name] = fasta_size(filepath)
    return sizes
示例#10
0
def _parse_subread_counts( subread_fofn ):
    """
    Count the number of subreads assocated with each consensus
    """
    sizes = {}
    with open( subread_fofn ) as handle:
        for filepath in handle:
            filepath = filepath.strip()
            filename = os.path.basename( filepath )
            contig_name = filename.split('.')[0]
            if contig_name.startswith('Allele_'):
                contig_name = '_'.join( contig_name.split('_')[1:] )
            if contig_name.startswith('Resequenced_'):
                contig_name = '_'.join( contig_name.split('_')[1:] )
            sizes[contig_name] = fasta_size( filepath )
    return sizes
 def align_subreads( self, white_list, reference_file ):
     """
     Run Blasr on the white-listed subreads against reference_file

     white_list     -- passed to run_blasr as the query
     reference_file -- path of the reference FASTA; the alignment is
                       written beside it, with everything after the last
                       '.' replaced by 'm1'
     Returns the path of the alignment file.
     """
     # Everything before the final '.' becomes the stem of the output name
     name_parts = reference_file.split('.')
     stem = '.'.join( name_parts[:-1] )
     alignment_file = '%s.m1' % stem
     n_references = fasta_size( reference_file )
     # nCandidates is set to the reference count; only one hit is kept
     blasr_args = {
         'nproc': self._nproc,
         'out': alignment_file,
         'bestn': 1,
         'nCandidates': n_references,
         'noSplitSubreads': True,
     }
     run_blasr( white_list, reference_file, blasr_args )
     check_output_file( alignment_file )
     return alignment_file
 def separate_alleles( self, white_list ):
     """
     Recursively split a set of subreads until each run yields one cluster

     One call is one "iteration": run_analysis is executed on white_list
     (or its existing output is reused), and the number of sequences in
     the result is counted. A single sequence ends the recursion for this
     branch and records the output file. Otherwise the subreads are
     aligned to the result, grouped per reference, and the method calls
     itself once per group.

     white_list -- the subreads to analyse; forwarded to run_analysis and
                   align_subreads
     Returns None. Results accumulate in self.output_filelist, and
     self._count is incremented once per call (shared across the whole
     recursion, so it numbers iterations globally).
     """
     log.info("Beginning iteration #%s" % self._count)
     # Debug output straight to stdout (Python 2 print statements).
     # NOTE(review): this reads self._output_filelist while the append
     # further down uses self.output_filelist -- confirm both names exist.
     print
     print self._count, self._output_filelist
     print
     curr_output = os.path.join( self._output, 'Iteration_%s' % self._count )
     # A truthy return value is used below as the path of existing output.
     output_file = amp_assem_output_exists( curr_output )
     if output_file:
         log.info('Existing output detected, skipping...')
     else:
         log.info('No existing output detected, proceeding ...')
         if self._count == 0: # For the first pass we enable clustering
             output_file = self.run_analysis( curr_output,
                                              white_list,
                                              cluster=True )
         else: # For all other iterations, we disable clustering
             output_file = self.run_analysis( curr_output,
                                              white_list,
                                              cluster=False )
     check_output_file( output_file )
     # Outputs of a single Fasta File are returned as is:
     log.info("Finished iteration #%s" % self._count)
     self._count += 1
     fasta_count = fasta_size( output_file )
     if fasta_count == 1:
         log.info('AmpliconAnalysis generated 1 cluster, exiting...')
         self.output_filelist.append( output_file )
         return 
     log.info('Amplicon Analysis generated %s clusters, continuing splitting' % fasta_count)
     # Otherwise we partition the reads and run the process on each partition
     alignment = self.align_subreads( white_list, output_file )
     groups = group_subreads( alignment )
     output_dir = os.path.dirname( output_file )
     sub_lists = []
     for reference, group in groups.iteritems():
         # Each group is written as '<reference>.ids' beside the output file
         group_file = '%s.ids' % reference
         group_path = os.path.join( output_dir, group_file )
         write_whitelist( group, group_path )
         white_list_seqs = self.extract_whitelist_reads( group_path )
         sub_lists.append( white_list_seqs )
         # NOTE(review): this check runs after the group has already been
         # written and appended, and 'continue' is the last statement of
         # the loop body, so it has no effect. Presumably groups smaller
         # than MIN_SIZE were meant to be skipped -- confirm the intent.
         if len(group) < MIN_SIZE:
             log.info('')
             continue
     # Recurse only after all groups are written.
     for sub_list in sub_lists:
         self.separate_alleles( sub_list )
示例#13
0
def _align_fasta( query, reference, format ):
    """
    Align a single query sequence to all valid references

    query     -- path of the FASTA file holding the query
    reference -- path of the FASTA file holding the references
    format    -- Blasr 'm' output format; also used as the suffix
                 ('.m<format>') of the temporary alignment file
    Returns a list of the records BlasrReader yields from the alignment.
    The temporary alignment file is removed before returning, including
    when the alignment or the parsing fails.
    """
    suffix = '.m%s' % format
    temp_align = tempfile.NamedTemporaryFile( suffix=suffix, delete=False )
    # Only the reserved path is needed: it is handed to run_blasr as 'out'.
    # Close our own handle now so it is not left open for the whole call.
    temp_align.close()
    try:
        reference_count = fasta_size( reference )
        # bestn == nCandidates == number of references
        blasr_args = {'nproc': NPROC,
                      'out': temp_align.name,
                      'bestn': reference_count,
                      'nCandidates': reference_count,
                      'm': format,
                      'noSplitSubreads': True}
        run_blasr( query, reference, blasr_args )
        # Parse the output for return
        alignments = list( BlasrReader( temp_align.name ))
    finally:
        # delete=False means nobody else removes the file for us
        os.unlink( temp_align.name )
    return alignments
示例#14
0
def _align_fasta(query, reference, format):
    """
    Align a single query sequence to all valid references

    query     -- path of the FASTA file holding the query
    reference -- path of the FASTA file holding the references
    format    -- Blasr 'm' output format; also used as the suffix
                 ('.m<format>') of the temporary alignment file
    Returns a list of the records BlasrReader yields from the alignment.
    The temporary alignment file is always removed before returning, even
    if running Blasr or parsing its output raises.
    """
    suffix = '.m%s' % format
    temp_align = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    # We only want the reserved path (given to run_blasr as 'out'), so
    # release the open handle immediately instead of leaking it.
    temp_align.close()
    try:
        reference_count = fasta_size(reference)
        # bestn == nCandidates == number of references
        blasr_args = {
            'nproc': NPROC,
            'out': temp_align.name,
            'bestn': reference_count,
            'nCandidates': reference_count,
            'm': format,
            'noSplitSubreads': True
        }
        run_blasr(query, reference, blasr_args)
        # Parse the output for return
        alignments = list(BlasrReader(temp_align.name))
    finally:
        # delete=False means the file stays until we remove it ourselves
        os.unlink(temp_align.name)
    return alignments
示例#15
0
def align_best_reference(query, reference, output=None):
    """
    Align the query against the references, keeping one hit per sequence

    query     -- path of the sequences to align
    reference -- path of the reference FASTA
    output    -- desired output path; resolved through _get_output_file
                 with the 'm1' suffix
    Returns the output path when valid_file() accepts it, otherwise None.
    """
    output = _get_output_file(query, output, 'm1')
    n_references = fasta_size(reference)
    log.info("Aligning %s sequences to %s references" % (query, n_references))
    # Blasr options: nCandidates is the reference count, bestn stays 1
    blasr_args = {
        'nproc': nproc,
        'out': output,
        'bestn': 1,
        'nCandidates': n_references,
        'noSplitSubreads': True,
    }
    # Pass '<reference>.sa' as 'sa' when the reference reports an index
    if reference_has_index(reference):
        blasr_args['sa'] = reference + '.sa'
    run_blasr(query, reference, blasr_args)
    # Hand back the path only if the alignment file checks out
    return output if valid_file(output) else None
示例#16
0
def split_results(amp_analysis):
    """Split the output of an Amplicon Analysis job by Barcode

    amp_analysis -- path of an existing Amplicon Analysis output directory
                    containing 'amplicon_analysis.fasta'
    Writes one '<barcode>.fasta' file per barcode into the 'by_barcode'
    sub-directory, and returns a dict mapping barcode -> path of that file.
    """
    assert os.path.isdir(amp_analysis)
    sequence_path = os.path.join(amp_analysis, "amplicon_analysis.fasta")
    check_output_file(sequence_path)
    # A parenthesised single argument prints the same under Python 2 and 3
    print("Analyzing %s output sequences" % fasta_size(sequence_path))
    barcode_path = os.path.join(amp_analysis, "by_barcode")
    create_directory(barcode_path)

    # Group the records in a single pass: get_barcode runs once per record
    # and no throw-away list is built just for its side effects.
    records_by_barcode = {}
    for record in FastaReader(sequence_path):
        records_by_barcode.setdefault(get_barcode(record), []).append(record)

    barcode_files = {}
    for barcode, barcode_records in records_by_barcode.items():
        barcode_file = barcode + ".fasta"
        sample_path = os.path.join(barcode_path, barcode_file)
        with FastaWriter(sample_path) as handle:
            for record in barcode_records:
                handle.writeRecord(record)
        barcode_files[barcode] = sample_path
    return barcode_files
示例#17
0
def split_results(amp_analysis):
    """Split the output of an Amplicon Analysis job by Barcode

    amp_analysis -- path of an existing Amplicon Analysis output directory
                    containing 'amplicon_analysis.fasta'
    Writes one '<barcode>.fasta' file per barcode into the 'by_barcode'
    sub-directory, and returns a dict mapping barcode -> path of that file.
    """
    assert os.path.isdir(amp_analysis)
    sequence_path = os.path.join(amp_analysis, 'amplicon_analysis.fasta')
    check_output_file(sequence_path)
    # A parenthesised single argument prints the same under Python 2 and 3
    print("Analyzing %s output sequences" % fasta_size(sequence_path))
    barcode_path = os.path.join(amp_analysis, 'by_barcode')
    create_directory(barcode_path)

    # Bucket the records by barcode in one pass, calling get_barcode once
    # per record rather than twice.
    records_by_barcode = {}
    for record in FastaReader(sequence_path):
        records_by_barcode.setdefault(get_barcode(record), []).append(record)

    barcode_files = {}
    for barcode, barcode_records in records_by_barcode.items():
        barcode_file = barcode + '.fasta'
        sample_path = os.path.join(barcode_path, barcode_file)
        with FastaWriter(sample_path) as handle:
            for record in barcode_records:
                handle.writeRecord(record)
        barcode_files[barcode] = sample_path
    return barcode_files
示例#18
0
def full_align_best_reference(query, reference, output=None):
    """
    Align the query against the references and write the result in m5 format

    query     -- path of the sequences to align
    reference -- path of the reference FASTA
    output    -- desired output path; resolved through _get_output_file
                 with the 'm5' suffix
    Returns the output path once check_output_file() has been run on it.
    """
    # Resolve the output location
    # (presumably this also clears a stale file -- verify in _get_output_file)
    output = _get_output_file(query, output, 'm5')
    n_references = fasta_size(reference)
    log.info("Aligning %s sequences to %s references" % (query, n_references))
    # Blasr options: m5 output, nCandidates is the reference count, bestn 1
    blasr_args = {
        'nproc': nproc,
        'out': output,
        'm': 5,
        'bestn': 1,
        'nCandidates': n_references,
        'noSplitSubreads': True,
    }
    # Pass '<reference>.sa' as 'sa' when the reference reports an index
    if reference_has_index(reference):
        blasr_args['sa'] = reference + '.sa'
    run_blasr(query, reference, blasr_args)
    check_output_file(output)
    return output