예제 #1
0
파일: merge.py 프로젝트: la0hu2006/HlaTools
def write_sequences(filetype, merged, output):
    """
    Write merged to output using the writer selected by filetype.

    'fasta' calls write_fasta(merged, output) and 'fastq' calls
    write_fastq(merged, output).

    Raises ValueError if filetype is neither 'fasta' nor 'fastq'.
    """
    if filetype == 'fasta':
        write_fasta(merged, output)
    elif filetype == 'fastq':
        write_fastq(merged, output)
    else:
        # Name the offending value so the failure can be diagnosed
        raise ValueError("Unsupported sequence filetype: %r" % (filetype,))
예제 #2
0
def create_chimeras(input_file,
                    output=None,
                    reference_file=None,
                    alignment_file=None):
    """
    Create chimeric sequences from a Fasta file and write them to Fasta.

    The alignments are grouped by locus and filtered, then passed along
    with the sequences read from input_file to _create_chimeras; the
    results are written to output with write_fasta.

    input_file     -- Fasta file of input sequences
    output         -- destination Fasta; defaults to the input path with
                      its extension replaced by '.chimeras.fasta'
    reference_file -- reference used to align input_file when no
                      alignment_file is supplied
    alignment_file -- existing alignment, read with BlasrReader

    Returns the path of the output file.
    Raises IOError if neither an alignment nor a reference is supplied.
    """
    import os

    # Check the input files, and align the input file if needed
    if alignment_file is None:
        if not reference_file:
            # Covers None as well as an empty reference path, which would
            # otherwise fall through with no alignment to read
            msg = ("create_chimeras requires either an Alignment "
                   "or a Reference!")
            log.error(msg)
            raise IOError(msg)
        alignment_file = align_best_reference(input_file, reference_file)
    # Set the output file if not specified
    if output is None:
        # splitext only strips an extension from the final path component,
        # so extension-less names and dotted directories are preserved
        basename = os.path.splitext(input_file)[0]
        output = '%s.chimeras.fasta' % basename
    # Parse the alignment data and extract the target sequences
    alignments = list(BlasrReader(alignment_file))
    groups = _group_by_locus(alignments)
    groups = _filter_groups(groups)
    sequences = list(FastaReader(input_file))
    chimeras = list(_create_chimeras(groups, sequences))
    write_fasta(chimeras, output)
    return output
예제 #3
0
def write_sequences( filetype, merged, output ):
    """
    Write merged to output using the writer selected by filetype.

    'fasta' calls write_fasta( merged, output ) and 'fastq' calls
    write_fastq( merged, output ).

    Raises ValueError if filetype is neither 'fasta' nor 'fastq'.
    """
    if filetype == 'fasta':
        write_fasta( merged, output )
    elif filetype == 'fastq':
        write_fastq( merged, output )
    else:
        # Name the offending value so the failure can be diagnosed
        raise ValueError( "Unsupported sequence filetype: %r" % (filetype,) )
예제 #4
0
def _write_output( records, output_file, output_type ):
    """
    Write records to output_file in the format named by output_type

    'fasta' is handed to write_fasta.  Any other output_type is written
    record-by-record through a FastqWriter, after which the output file
    is passed to check_output_file.
    """
    # Fasta output is delegated entirely to write_fasta
    if output_type == 'fasta':
        write_fasta( records, output_file )
        return
    # Everything else is treated as Fastq
    with FastqWriter( output_file ) as fastq_handle:
        for entry in records:
            fastq_handle.writeRecord( entry )
    check_output_file( output_file )
예제 #5
0
파일: trim.py 프로젝트: la0hu2006/HlaTools
def _write_output(records, output_file, output_type):
    """
    Write records to output_file in the format named by output_type.

    Only 'fasta' goes to write_fasta; every other output_type is written
    through a FastqWriter and then passed to check_output_file.
    """
    wants_fasta = (output_type == 'fasta')
    if not wants_fasta:
        # Non-Fasta output is streamed one record at a time
        with FastqWriter(output_file) as fastq_out:
            for item in records:
                fastq_out.writeRecord(item)
        check_output_file(output_file)
    else:
        write_fasta(records, output_file)
예제 #6
0
def pair_exon_files( fofn_file, overlap_exon ):
    """
    Pair the 5' and 3' amplicons of a gene based on 1 overlapping exon

    NOTE(review): as written this function cannot run -- it reads the
    name `exon_fasta`, which is never assigned in this function, and it
    never uses `exon_files` or the `overlap_exon` argument.  Presumably
    the body was meant to operate on the files listed in the FOFN;
    confirm the intended behaviour before relying on it.
    """
    # Files parsed from the FOFN; currently unused below
    exon_files = list( _parse_fofn( fofn_file ))
    # NOTE(review): `exon_fasta` is not defined in this scope -- this
    # raises NameError unless a module-level global of that name exists
    output_file = _get_output_file( exon_fasta )
    fasta_records = list( FastaReader( exon_fasta ))
    sorted_records = _sort_fasta_records( fasta_records )
    # The sorted records are combined into a single record ...
    cDNA_record = _combine_records( sorted_records )
    # ... which is written out as a one-record Fasta file
    write_fasta( [cDNA_record], output_file )
예제 #7
0
def from_assembly( contig_file, reference_fofn, output_file='output.fasta' ):
    """
    Extract genes from the contigs in contig_file and write them to Fasta

    Each (locus, reference) pair returned by read_reference_fofn is
    aligned against contig_file; the resulting hits are read, picked, and
    used to extract genes from the contigs.  The genes from every locus
    are collected and written with write_fasta.

    contig_file    -- Fasta file of contigs
    reference_fofn -- FOFN describing the references, one per locus
    output_file    -- destination Fasta (default 'output.fasta', the
                      path that was previously hard-coded)
    """
    contigs = read_fasta_dict( contig_file )
    references = read_reference_fofn( reference_fofn )
    all_genes = []
    # .items() works under both Python 2 and 3, unlike .iteritems()
    for locus, reference in references.items():
        alignment = align_reference_to_contigs( locus, reference, contig_file )
        hits = read_blasr_hits( alignment )
        hits = pick_blasr_hits( hits )
        genes = extract_genes( contigs, hits )
        all_genes.extend( genes )
    write_fasta( all_genes, output_file )
예제 #8
0
 def parse_reference(self):
     """
     Read the configured HLA reference FOFN and write its sequences to Fasta

     Returns a tuple of (path of the written Fasta file, metadata, loci),
     where metadata and loci are passed through from parse_reference_fofn.
     """
     log.info("Parsing the supplied FOFN of HLA reference data")
     # Destination for the combined reference sequences
     output_fasta = self.get_filepath( "references", "HLA_references.fasta" )
     parsed = parse_reference_fofn( self.hla_reference )
     sequences, metadata, loci = parsed
     log.info("Writing collected HLA reference sequences to file")
     write_fasta( sequences, output_fasta )
     check_output_file( output_fasta )
     log.info("Finished parsing the HLA reference data\n")
     return (output_fasta, metadata, loci)
예제 #9
0
def trim_fasta( fasta_file, blasr_file, output_file, locus_dict, window=WINDOW, loci=LOCI ):
    """
    Trim the sequences in fasta_file and write the results to output_file

    Trims are parsed from blasr_file using window, filtered with
    locus_dict and loci, and then applied to the records read from
    fasta_file.  The trimmed records are written with write_fasta.

    Returns None.
    """
    log.info('Trimming sequences in "%s"' % fasta_file)
    log.debug("\tWindow Size:\t%s" % window)

    records = list( FastaReader( fasta_file ) )
    trims = parse_trims( blasr_file, window )
    trims = filter_trims_on_loci( trims, locus_dict, loci )
    trimmed_records = apply_trims( records, trims )
    write_fasta( trimmed_records, output_file )

    log.info('Finished trimming the supplied sequences\n')
예제 #10
0
 def parse_reference(self):
     """
     Collect the HLA reference data named by the configured FOFN.

     The sequences returned by parse_reference_fofn are written to a
     Fasta file, which is then passed to check_output_file.  Returns the
     Fasta path together with the metadata and loci from the parser.
     """
     log.info("Parsing the supplied FOFN of HLA reference data")
     fasta_path = self.get_filepath("references", "HLA_references.fasta")
     sequences, metadata, loci = parse_reference_fofn(self.hla_reference)

     log.info("Writing collected HLA reference sequences to file")
     write_fasta(sequences, fasta_path)
     check_output_file(fasta_path)

     log.info("Finished parsing the HLA reference data\n")
     result = (fasta_path, metadata, loci)
     return result
예제 #11
0
def _write_temp_fasta( record ):
    """
    Write a sequence record out to a temporary Fasta file

    A FastaRecord is written as-is; a FastqRecord is first converted to a
    FastaRecord built from its name and sequence.

    Returns the path of the temporary file, which is created with
    delete=False and so is left on disk for the caller.
    Raises TypeError if record is neither a FastaRecord nor a FastqRecord.
    """
    # Validate before touching the filesystem, so that a rejected record
    # does not leave a stray temporary file behind
    if isinstance( record, FastaRecord ):
        fasta_record = record
    elif isinstance( record, FastqRecord ):
        fasta_record = FastaRecord(record.name, record.sequence)
    else:
        msg = 'Record must be either FastaRecord or FastqRecord'
        log.error( msg )
        raise TypeError( msg )
    temp = tempfile.NamedTemporaryFile( suffix='.fasta', delete=False )
    # Only the reserved name is needed; close our handle so it is not
    # leaked while write_fasta writes to the path
    temp.close()
    write_fasta( [fasta_record], temp.name )
    return temp.name
예제 #12
0
def _write_temp_fasta(record):
    """
    Write a sequence record out to a temporary Fasta file.

    A FastaRecord is written as-is; a FastqRecord is first converted to a
    FastaRecord built from its name and sequence.

    Returns the path of the temporary file, which is created with
    delete=False and so is left on disk for the caller.
    Raises TypeError if record is neither a FastaRecord nor a FastqRecord.
    """
    # Validate before touching the filesystem, so that a rejected record
    # does not leave a stray temporary file behind
    if isinstance(record, FastaRecord):
        fasta_record = record
    elif isinstance(record, FastqRecord):
        fasta_record = FastaRecord(record.name, record.sequence)
    else:
        msg = 'Record must be either FastaRecord or FastqRecord'
        log.error(msg)
        raise TypeError(msg)
    temp = tempfile.NamedTemporaryFile(suffix='.fasta', delete=False)
    # Only the reserved name is needed; close our handle so it is not
    # leaked while write_fasta writes to the path
    temp.close()
    write_fasta([fasta_record], temp.name)
    return temp.name
예제 #13
0
def create_chimeras( input_file, output=None, reference_file=None, alignment_file=None ):
    """
    Create chimeric sequences from a Fasta file and write them to Fasta

    The alignments are grouped by locus and filtered, then passed along
    with the sequences read from input_file to _create_chimeras; the
    results are written to output with write_fasta.

    input_file     -- Fasta file of input sequences
    output         -- destination Fasta; defaults to the input path with
                      its extension replaced by '.chimeras.fasta'
    reference_file -- reference used to align input_file when no
                      alignment_file is supplied
    alignment_file -- existing alignment, read with BlasrReader

    Returns the path of the output file.
    Raises IOError if neither an alignment nor a reference is supplied.
    """
    import os

    # Check the input files, and align the input file if needed
    if alignment_file is None:
        if not reference_file:
            # Covers None as well as an empty reference path, which would
            # otherwise fall through with no alignment to read
            msg = ( "create_chimeras requires either an Alignment "
                    "or a Reference!" )
            log.error( msg )
            raise IOError( msg )
        alignment_file = align_best_reference( input_file, reference_file )
    # Set the output file if not specified
    if output is None:
        # splitext only strips an extension from the final path component,
        # so extension-less names and dotted directories are preserved
        basename = os.path.splitext( input_file )[0]
        output = '%s.chimeras.fasta' % basename
    # Parse the alignment data and extract the target sequences
    alignments = list( BlasrReader( alignment_file ))
    groups = _group_by_locus( alignments )
    groups = _filter_groups( groups )
    sequences = list( FastaReader( input_file ))
    chimeras = list( _create_chimeras( groups, sequences ))
    write_fasta( chimeras, output )
    return output
예제 #14
0
def extract_best_reads(input_file,
                       output_file=None,
                       min_length=MIN_LENGTH,
                       min_score=MIN_SCORE):
    """
    Extract, filter and subset subreads from Bas/Bax/Fofn Files

    input_file  -- input path, expanded into one or more files by
                   _iterate_input_files
    output_file -- destination Fasta; defaults to the input path with its
                   extension replaced by '.best.fasta'
    min_length  -- passed through to _extract_from_bash5
    min_score   -- passed through to _extract_from_bash5

    Returns the path of the output file.
    """
    if output_file is None:
        # splitext only strips an extension from the final path component,
        # so extension-less names and dotted directories are preserved
        basename = os.path.splitext( input_file )[0]
        output_file = '%s.best.fasta' % basename
    log.info('Extracting subreads from %s' % os.path.basename(input_file))
    log.debug('\tMinimum Length:\t%s' % min_length)
    log.debug('\tMinimum Score:\t%s' % min_score)

    reads = []
    # Counted explicitly so the summary below still works when the input
    # expands to zero files (the loop variable would otherwise be unbound)
    file_count = 0
    for filename in _iterate_input_files( input_file ):
        reads.extend( _extract_from_bash5( filename, min_length, min_score ))
        file_count += 1
    log.info("Extracted %s subreads from %s files" % (len(reads), file_count))

    write_fasta( reads, output_file )
    check_output_file( output_file )
    log.info("Finished extracting subreads")
    return output_file
예제 #15
0
def subset_sequences( fasta_file, summary_file, output_file ):
    """
    Write to output_file the subset of fasta_file selected by summary_file

    The ids come from identify_sequences( summary_file ); the matching
    records come from subset_sequence_records and go to write_fasta.
    """
    selected_ids = identify_sequences( summary_file )
    write_fasta( subset_sequence_records( fasta_file, selected_ids ), output_file )