Python get_file_type 예제들, pbhla.filenames.get_file_type Python 예제들

예제 #1

0

파일 보기

def extract_alleles(input_file,
                    output_file=None,
                    reference_file=None,
                    alignment_file=None,
                    method=METHOD,
                    sort=SORT,
                    loci=LOCI):
    """Select the top consensus sequences per group and write them out.

    The alignments obtained via ``get_alignment_file`` are grouped according
    to ``method``, each group is ordered with the ``sort`` metric, and the
    sequences chosen by ``_select_sequences`` are written to ``output_file``.

    Args:
        input_file: Path of the sequence file to select from.
        output_file: Destination path. When falsy, a name is derived from
            ``input_file`` with ``_get_output_file``.
        reference_file: Forwarded to ``get_alignment_file``.
        alignment_file: Forwarded to ``get_alignment_file``.
        method: Grouping policy, one of 'locus', 'barcode', 'both' or 'all'.
            A falsy value falls back to ``METHOD``.
        sort: Sorting metric, either 'num_reads' or 'accuracy'. The
            'accuracy' metric requires ``input_file`` to be of type 'fastq'.
        loci: Forwarded to the locus-based groupings. A falsy value falls
            back to ``LOCI``.

    Returns:
        The path of the file the selected sequences were written to.

    Raises:
        ValueError: If ``method`` or ``sort`` is not a recognised value, or
            if 'accuracy' sorting is requested for a non-fastq input.
    """
    method = method or METHOD
    loci = loci or LOCI

    # Set the output file if not specified
    output_file = output_file or _get_output_file(input_file)
    output_type = get_file_type(output_file)

    # Get the alignments used to group the sequences
    alignment_file = get_alignment_file(input_file, reference_file,
                                        alignment_file)
    alignments = list(BlasrReader(alignment_file))

    # Run the appropriate grouping
    if method == 'locus':
        groups = _group_by_locus(alignments, loci)
    elif method == 'barcode':
        groups = _group_by_barcode(alignments)
    elif method == 'both':
        groups = _group_by_both(alignments, loci)
    elif method == 'all':
        # Every alignment forms its own single-member group
        groups = {a.qname: [a] for a in alignments}
    else:
        msg = "Invalid Selection Metric: %s" % method
        log.error(msg)
        raise ValueError(msg)

    # Read the input sequences and use them to generate our sorting data
    sequences = read_sequences(input_file)
    if sort == 'num_reads':
        sorting_data = {s.name: consensus_size(s) for s in sequences}
    elif sort == 'accuracy':
        # Checked explicitly rather than with ``assert`` so the validation
        # is still performed when Python runs with optimisations (-O)
        if get_file_type(input_file) != 'fastq':
            msg = ('Sorting by "accuracy" requires Fastq input, got: %s'
                   % input_file)
            log.error(msg)
            raise ValueError(msg)
        sorting_data = {s.name: record_accuracy(s) for s in sequences}
    else:
        msg = "Invalid Sorting Metric: %s" % sort
        log.error(msg)
        raise ValueError(msg)

    log.info('Sorting sequences for selection according to "%s"' % sort)
    ordered = _sort_groups(groups, sorting_data)

    log.info('Selecting top sequences from %s according to the "%s" policy' %
             (input_file, method))
    selected = list(_select_sequences(ordered))
    log.info('Selected %s sequences from %s total for further analysis' %
             (len(selected), len(sequences)))

    log.info('Writing the selected sequences out to %s' % output_file)
    subset = list(_subset_sequences(sequences, selected))
    _write_output(subset, output_file, output_type)
    return output_file

예제 #2

0

파일 보기

파일: extract.py 프로젝트: WenchaoLin/HlaTools

def extract_alleles( input_file, output_file=None, reference_file=None,
                                                   alignment_file=None,
                                                   method=METHOD,
                                                   sort=SORT,
                                                   loci=LOCI ):
    """
    Select the top consensus sequences per group and write them out

    The alignments obtained via get_alignment_file are grouped according to
    `method`, each group is ordered with the `sort` metric, and the sequences
    chosen by _select_sequences are written to `output_file`.

    Arguments:
        input_file     -- path of the sequence file to select from
        output_file    -- destination path; when falsy a name is derived from
                          input_file with _get_output_file
        reference_file -- forwarded to get_alignment_file
        alignment_file -- forwarded to get_alignment_file
        method         -- 'locus', 'barcode', 'both' or 'all'; a falsy value
                          falls back to METHOD
        sort           -- 'num_reads' or 'accuracy'; 'accuracy' requires the
                          input file to be of type 'fastq'
        loci           -- forwarded to the locus-based groupings; a falsy
                          value falls back to LOCI

    Returns the path of the file the selected sequences were written to.

    Raises ValueError if `method` or `sort` is not a recognised value, or if
    'accuracy' sorting is requested for a non-fastq input.
    """
    method = method or METHOD
    loci = loci or LOCI

    # Set the output file if not specified
    output_file = output_file or _get_output_file( input_file )
    output_type = get_file_type( output_file )

    # Get the alignments used to group the sequences
    alignment_file = get_alignment_file( input_file, reference_file, alignment_file )
    alignments = list( BlasrReader( alignment_file ))

    # Run the appropriate grouping
    if method == 'locus':
        groups = _group_by_locus( alignments, loci )
    elif method == 'barcode':
        groups = _group_by_barcode( alignments )
    elif method == 'both':
        groups = _group_by_both( alignments, loci )
    elif method == 'all':
        # Every alignment forms its own single-member group
        groups = {a.qname: [a] for a in alignments}
    else:
        msg = "Invalid Selection Metric: %s" % method
        log.error( msg )
        raise ValueError( msg )

    # Read the input sequences and use them to generate our sorting data
    sequences = read_sequences( input_file )
    if sort == 'num_reads':
        sorting_data = {s.name: consensus_size(s) for s in sequences}
    elif sort == 'accuracy':
        # Checked explicitly rather than with `assert` so the validation is
        # still performed when Python runs with optimisations (-O)
        if get_file_type( input_file ) != 'fastq':
            msg = 'Sorting by "accuracy" requires Fastq input, got: %s' % input_file
            log.error( msg )
            raise ValueError( msg )
        sorting_data = {s.name: record_accuracy(s) for s in sequences}
    else:
        msg = "Invalid Sorting Metric: %s" % sort
        log.error( msg )
        raise ValueError( msg )

    log.info('Sorting sequences for selection according to "%s"' % sort)
    ordered = _sort_groups( groups, sorting_data )

    log.info('Selecting top sequences from %s according to the "%s" policy' % (input_file, method))
    selected = list( _select_sequences( ordered ))
    log.info('Selected %s sequences from %s total for further analysis' % (len(selected), len(sequences)))

    log.info('Writing the selected sequences out to %s' % output_file)
    subset = list( _subset_sequences( sequences, selected ))
    _write_output( subset, output_file, output_type )
    return output_file

예제 #3

0

파일 보기

파일: utils.py 프로젝트: WenchaoLin/HlaTools

def get_output_file( input_file, modifier ):
    """
    Build an output file name by inserting a modifier before the file type

    Everything before the last '.' of the input name is kept as the stem and
    the result has the form '<stem>.<modifier>.<type>', where <type> is the
    value get_file_type reports for the input file.
    """
    # rpartition yields an empty stem when the name contains no '.'
    stem, _, _ = input_file.rpartition('.')
    suffix = get_file_type( input_file )
    return '%s.%s.%s' % (stem, modifier, suffix)

예제 #4

0

파일 보기

def _get_output_file(input_file):
    """
    Derive the name of the 'oriented' output file from the input filename

    The text after the last '.' of the input name is dropped and replaced by
    'oriented.' followed by the value get_file_type reports for the input.
    """
    name_parts = input_file.split('.')
    stem = '.'.join(name_parts[:-1])
    file_type = get_file_type(input_file)
    return '%s.oriented.%s' % (stem, file_type)

예제 #5

0

파일 보기

파일: utils.py 프로젝트: la0hu2006/HlaTools

def get_output_file(input_file, modifier):
    """
    Return '<stem>.<modifier>.<type>' for the given input file name

    <stem> is the input name up to (not including) its last '.', and <type>
    is the value get_file_type reports for the input file.
    """
    pieces = input_file.split('.')
    # split() always yields at least one piece, so dropping the last is safe
    pieces.pop()
    prefix = '.'.join(pieces)
    extension = get_file_type(input_file)
    return '%s.%s.%s' % (prefix, modifier, extension)

예제 #6

0

파일 보기

파일: orientation.py 프로젝트: WenchaoLin/HlaTools

def _get_output_file( input_file ):
    """
    Derive the name of the 'oriented' output file from the input filename

    The result is '<stem>.oriented.<type>', where <stem> is the input name up
    to its last '.' and <type> is the value get_file_type reports for it.
    """
    # Keep only what precedes the final '.' (empty if there is no '.')
    stem = input_file.rpartition('.')[0]
    file_type = get_file_type( input_file )
    return '%s.oriented.%s' % (stem, file_type)

예제 #7

0

파일 보기

파일: orientation.py 프로젝트: WenchaoLin/HlaTools

def _get_output_type( output_file ):
    """
    Return the file type of the output file, rejecting unsupported formats

    Raises a TypeError (after logging the message) unless get_file_type
    reports either 'fasta' or 'fastq' for the output file.
    """
    file_type = get_file_type( output_file )
    # Guard clause: bail out on anything that is not a supported format
    if file_type not in ('fasta', 'fastq'):
        msg = "Output file must be either Fasta or Fastq format"
        log.error( msg )
        raise TypeError( msg )
    return file_type

예제 #8

0

파일 보기

파일: orientation.py 프로젝트: la0hu2006/HlaTools

def _get_output_type(output_file):
    """
    Look up the output file type and check that it is a supported format

    Returns the type reported by get_file_type when it is 'fasta' or 'fastq';
    any other value is logged as an error and raised as a TypeError.
    """
    detected = get_file_type(output_file)
    is_supported = detected == 'fasta' or detected == 'fastq'
    if not is_supported:
        msg = "Output file must be either Fasta or Fastq format"
        log.error(msg)
        raise TypeError(msg)
    return detected

예제 #9

0

파일 보기

파일: extract_cDNA.py 프로젝트: Akado2009/HlaTools

def _parse_input_records( input_file ):
    """
    Parse the input sequence records with the appropriate pbcore Reader

    Returns a list of the records read by FastaReader or FastqReader,
    depending on the type get_file_type reports for the input file.

    Raises a TypeError (after logging the message) for any other file type,
    instead of silently returning None to the caller.
    """
    input_type = get_file_type( input_file )
    if input_type == 'fasta':
        return list( FastaReader( input_file ))
    elif input_type == 'fastq':
        return list( FastqReader( input_file ))
    else:
        msg = 'Input file must be either Fasta or Fastq'
        log.error( msg )
        # Fail loudly here; previously execution fell through and returned
        # None, deferring the failure to whatever consumed the records
        raise TypeError( msg )

예제 #10

0

파일 보기

파일: exons_to_cDNA.py 프로젝트: WenchaoLin/HlaTools

def exons_to_cDNA( exon_file ):
    """
    Combine a multi-Fasta of Exon sequences into a mock cDNA

    The exon records parsed from `exon_file` are sorted, combined into a
    single record and written to the path returned by _get_output_file.
    Nothing is written when no exon records were parsed.  Returns None.
    """
    output_type = get_file_type( exon_file )
    output_file = _get_output_file( exon_file, output_type )
    records = _parse_exon_records( exon_file, output_type )
    log.info("Combining %s exon sequences to cDNA" % len(records))
    # Without any exon records there is nothing to combine or write
    if not records:
        return
    sorted_records = _sort_records( records )
    cDNA_record = _combine_records( sorted_records )
    log.info("Writing cDNA sequence out to %s" % output_file)
    write_sequences( cDNA_record, output_file )

예제 #11

0

파일 보기

파일: exons_to_cDNA.py 프로젝트: la0hu2006/HlaTools

def exons_to_cDNA(exon_file):
    """
    Combine a multi-Fasta of Exon sequences into a mock cDNA

    The exon records parsed from `exon_file` are sorted, combined into a
    single record and written to the path returned by _get_output_file.
    Nothing is written when no exon records were parsed.  Returns None.
    """
    output_type = get_file_type(exon_file)
    output_file = _get_output_file(exon_file, output_type)
    records = _parse_exon_records(exon_file, output_type)
    log.info("Combining %s exon sequences to cDNA" % len(records))
    # Without any exon records there is nothing to combine or write
    if not records:
        return
    sorted_records = _sort_records(records)
    cDNA_record = _combine_records(sorted_records)
    log.info("Writing cDNA sequence out to %s" % output_file)
    write_sequences(cDNA_record, output_file)

예제 #12

0

파일 보기

파일: filter.py 프로젝트: la0hu2006/HlaTools

def extract_alleles( input_file, min_reads, min_length, output_file=None ):
    """
    Pass the input records through the length and read-count filters

    The records parsed from `input_file` are handed to _filter_on_length
    (with `min_length`) and then to _filter_on_numreads (with `min_reads`);
    whatever remains is written to `output_file`, whose path is returned.
    """
    # Fall back to a name derived from the input when no output is given
    if not output_file:
        output_file = _get_output_file( input_file )
    output_type = get_file_type( output_file )

    # Parse the records, then apply the two filters in turn
    records = _parse_input_records( input_file )
    length_filtered = _filter_on_length( records, min_length )
    read_filtered = _filter_on_numreads( length_filtered, min_reads )

    _write_output( read_filtered, output_file, output_type )
    return output_file

예제 #13

0

파일 보기

def extract_exons(input_record, exon_fofn, directory=None):
    """
    Resolve the input record and its output type, then extract its exons

    `input_record` may be a filename (str), a FastaRecord or a FastqRecord.
    A filename is read with _read_fasta_record and its output type is taken
    from get_file_type; record objects map to 'fasta' or 'fastq'.  The actual
    extraction is delegated to _extract_exons, whose result is returned.

    Raises a TypeError (after logging the message) for any other input type.
    """
    # Filename: the output type follows the file, and the record is loaded
    if isinstance(input_record, str):
        file_type = get_file_type(input_record)
        record = _read_fasta_record(input_record)
        return _extract_exons(record, exon_fofn, file_type, directory)

    # Record objects: the output type follows the record class
    if isinstance(input_record, FastaRecord):
        return _extract_exons(input_record, exon_fofn, 'fasta', directory)
    if isinstance(input_record, FastqRecord):
        return _extract_exons(input_record, exon_fofn, 'fastq', directory)

    msg = 'Input record must be Filename, FastaRecord or FastqRecord'
    log.error(msg)
    raise TypeError(msg)

예제 #14

0

파일 보기

파일: extract_exons.py 프로젝트: WenchaoLin/HlaTools

def extract_exons( input_record, exon_fofn, directory=None ):
    """
    Resolve the input record and its output type, then extract its exons

    `input_record` may be a filename (str), a FastaRecord or a FastqRecord.
    A filename is read with _read_fasta_record and its output type is taken
    from get_file_type; record objects map to 'fasta' or 'fastq'.  The actual
    extraction is delegated to _extract_exons, whose result is returned.

    Raises a TypeError (after logging the message) for any other input type.
    """
    # Filename: the output type follows the file, and the record is loaded
    if isinstance( input_record, str ):
        file_type = get_file_type( input_record )
        record = _read_fasta_record( input_record )
        return _extract_exons( record, exon_fofn, file_type, directory )

    # Record objects: the output type follows the record class
    if isinstance( input_record, FastaRecord ):
        return _extract_exons( input_record, exon_fofn, 'fasta', directory )
    if isinstance( input_record, FastqRecord ):
        return _extract_exons( input_record, exon_fofn, 'fastq', directory )

    msg = 'Input record must be Filename, FastaRecord or FastqRecord'
    log.error( msg )
    raise TypeError( msg )

예제 #15

0

파일 보기

파일: trim.py 프로젝트: bnbowman/HlaTools

def trim_alleles(input_file, output_file=None, trim=0):
    """Trim the sequences of a file and write the result to a new file.

    When `trim` is 0 and no output file is given there is nothing to do and
    the input path is returned unchanged.  Otherwise the sequences read from
    `input_file` are passed to _trim_sequences with `trim`, written to the
    output file (derived from the input name when not given), and the output
    path is returned.
    """
    # Short-circuit: no trimming requested and no explicit destination
    nothing_to_do = (trim == 0 and output_file is None)
    if nothing_to_do:
        log.info('No trimming necessary for "%s", skipping...' % input_file)
        return input_file

    # Resolve the destination and its file type
    if not output_file:
        output_file = _get_output_file(input_file)
    output_type = get_file_type(output_file)

    # Load the records and trim them
    records = read_sequences(input_file)
    log.info("Trimming sequences by %s bp from each end" % trim)
    trimmed_records = _trim_sequences(records, trim)

    log.info("Writing the trimmed sequences out to %s" % output_file)
    _write_output(trimmed_records, output_file, output_type)
    return output_file

예제 #16

0

파일 보기

파일: trim.py 프로젝트: la0hu2006/HlaTools

def trim_alleles(input_file, output_file=None, trim=0):
    """Trim the sequences of a file and write the result to a new file.

    When `trim` is 0 and no output file is given there is nothing to do and
    the input path is returned unchanged.  Otherwise the sequences read from
    `input_file` are passed to _trim_sequences with `trim`, written to the
    output file (derived from the input name when not given), and the output
    path is returned.
    """
    # Without a trim amount or an explicit destination this step is a no-op
    if trim == 0:
        if output_file is None:
            log.info('No trimming necessary for "%s", skipping...' % input_file)
            return input_file

    # Work out where the trimmed sequences go and in which format
    destination = output_file or _get_output_file(input_file)
    destination_type = get_file_type(destination)

    # Load the input sequences and trim every one of them
    originals = read_sequences(input_file)
    log.info('Trimming sequences by %s bp from each end' % trim)
    shortened = _trim_sequences(originals, trim)

    log.info('Writing the trimmed sequences out to %s' % destination)
    _write_output(shortened, destination, destination_type)
    return destination

예제 #17

0

파일 보기

파일: filter.py 프로젝트: la0hu2006/HlaTools

def _get_output_file( input_file ):
    """
    Derive the name of the 'filtered' output file from the input filename

    The result is '<stem>.filtered.<type>', where <stem> is the input name up
    to its last '.' and <type> is the value get_file_type reports for it.
    """
    # Keep only what precedes the final '.' (empty if there is no '.')
    stem, _, _ = input_file.rpartition('.')
    detected_type = get_file_type( input_file )
    return '%s.filtered.%s' % (stem, detected_type)

예제 #18

0

파일 보기

파일: extract.py 프로젝트: WenchaoLin/HlaTools

def _get_output_file( input_file ):
    """
    Derive the name of the 'selected' output file from the input filename

    The result is '<stem>.selected.<type>', where <stem> is the input name up
    to its last '.' and <type> is the value get_file_type reports for it.
    """
    name_parts = input_file.split('.')
    # Drop the trailing piece (the text after the last '.')
    stem = '.'.join( name_parts[:-1] )
    detected_type = get_file_type( input_file )
    return '%s.selected.%s' % (stem, detected_type)

예제 #19

0

파일 보기

파일: trim.py 프로젝트: bnbowman/HlaTools

def _get_output_file(input_file):
    """
    Derive the name of the 'trimmed' output file from the input filename

    The result is '<stem>.trimmed.<type>', where <stem> is the input name up
    to its last '.' and <type> is the value get_file_type reports for it.
    """
    # Keep only what precedes the final "." (empty if there is no ".")
    stem = input_file.rpartition(".")[0]
    detected_type = get_file_type(input_file)
    return "%s.trimmed.%s" % (stem, detected_type)