示例#1
0
def orient_sequences(input_file,
                     reference_file=None,
                     alignment_file=None,
                     output_file=None):
    """
    Orient the sequences of a file to match the direction of their reference.

    If output_file is not given it is derived from input_file via
    _get_output_file.  When a valid output file already exists the work is
    skipped and that path is returned as-is.  Otherwise the alignment is
    resolved with get_alignment_file, the sequences flagged by
    _identify_reversed_sequences are handed to _reverse_records, and the
    result is written with _write_output.

    Returns the path of the output file.
    """
    start_msg = ("Reorienting all sequences in %s to the direction of "
                 "their reference" % input_file)
    log.info(start_msg)

    # Resolve the destination and its type before anything else
    if not output_file:
        output_file = _get_output_file(input_file)
    output_type = _get_output_type(output_file)

    # Nothing to do when a previous run already produced the output
    if valid_file(output_file):
        skip_msg = ("Found existing output file %s, skipping orientation step"
                    % output_file)
        log.info(skip_msg)
        return output_file

    # Make sure an alignment is available, aligning the input if required
    alignment = get_alignment_file(input_file, reference_file, alignment_file)
    to_reverse = _identify_reversed_sequences(alignment)
    count_msg = ("Identified %s sequences needing Reverse Complementation"
                 % len(to_reverse))
    log.info(count_msg)

    records = _parse_input_records(input_file)
    oriented = _reverse_records(records, to_reverse)

    log.info("Writing out sequences to %s" % output_file)
    _write_output(oriented, output_file, output_type)
    return output_file
示例#2
0
def get_cDNA_reference():
    """
    Return the path of the cDNA reference fasta, creating it if needed.

    Returns CDNA_REF when it is already a valid file.  Otherwise
    create_cDNA_reference() is called once and the file is checked again.

    Raises:
        IOError: if CDNA_REF is still not a valid file after creation.  The
            earlier implementation called itself again in that situation
            and would recurse without bound.
    """
    if valid_file(CDNA_REF):
        logging.info('Using existing cDNA Reference fasta')
        return CDNA_REF

    logging.info('Creating new cDNA Reference fasta')
    create_cDNA_reference()

    # Re-check exactly once instead of recursing, so a failed creation
    # surfaces as a clear error rather than a RecursionError.
    if not valid_file(CDNA_REF):
        msg = 'Failed to create a valid cDNA Reference fasta: %s' % (CDNA_REF,)
        logging.error(msg)
        raise IOError(msg)

    # Same message the recursive call used to emit on success
    logging.info('Using existing cDNA Reference fasta')
    return CDNA_REF
示例#3
0
def get_genomic_reference():
    """
    Return the path of the genomic reference fasta, creating it if needed.

    Returns GEN_REF when it is already a valid file.  Otherwise
    create_genomic_reference() is called once and the file is checked again.

    Raises:
        IOError: if GEN_REF is still not a valid file after creation.  The
            earlier implementation called itself again in that situation
            and would recurse without bound.
    """
    if valid_file(GEN_REF):
        logging.info('Using existing Genomic Reference fasta')
        return GEN_REF

    logging.info('Creating new Genomic Reference fasta')
    create_genomic_reference()

    # Re-check exactly once instead of recursing, so a failed creation
    # surfaces as a clear error rather than a RecursionError.
    if not valid_file(GEN_REF):
        msg = 'Failed to create a valid Genomic Reference fasta: %s' % (GEN_REF,)
        logging.error(msg)
        raise IOError(msg)

    # Same message the recursive call used to emit on success
    logging.info('Using existing Genomic Reference fasta')
    return GEN_REF
示例#4
0
def get_exon_reference():
    """
    Return the path of the exon reference fofn, creating it if needed.

    Returns EXON_REF when it is already a valid file.  Otherwise
    create_exon_fofns() and create_exon_reference() are each called once,
    in that order, and the file is checked again.

    Raises:
        IOError: if EXON_REF is still not a valid file after creation.  The
            earlier implementation called itself again in that situation
            and would recurse without bound.
    """
    if valid_file(EXON_REF):
        logging.info('Using existing Exon Reference fofn')
        return EXON_REF

    logging.info('Creating new Exon Reference fofn')
    create_exon_fofns()
    create_exon_reference()

    # Re-check exactly once instead of recursing, so a failed creation
    # surfaces as a clear error rather than a RecursionError.
    if not valid_file(EXON_REF):
        msg = 'Failed to create a valid Exon Reference fofn: %s' % (EXON_REF,)
        logging.error(msg)
        raise IOError(msg)

    # Same message the recursive call used to emit on success
    logging.info('Using existing Exon Reference fofn')
    return EXON_REF
示例#5
0
def sort_subreads( subread_file, reference_file ):
    """
    Map each aligned query name to the name of the target it hit.

    The alignment is read from the relative path 'temp2.m1'; when that file
    is not already valid, align_best_reference is run first to produce it.
    Returns a dict of hit.qname -> hit.tname built from the hits yielded by
    BlasrReader.
    """
    log.info("Aligning subreads to the two best references")
    alignment = 'temp2.m1'
    # Reuse an existing alignment, otherwise generate it now
    if not valid_file( alignment ):
        align_best_reference( subread_file, reference_file, alignment )
    assignments = {}
    for hit in BlasrReader( alignment ):
        assignments[hit.qname] = hit.tname
    return assignments
示例#6
0
def order_references( subread_file, reference_file ):
    """
    Rank target names by the number of alignment hits they received.

    The alignment is read from the relative path 'temp.m1'; when that file
    is not already valid, align_best_reference is run first to produce it.
    Returns the list of hit.tname values in Counter.most_common() order.
    """
    log.info("Selecting the best references sequences to use")
    alignment = 'temp.m1'
    # Reuse an existing alignment, otherwise generate it now
    if not valid_file( alignment ):
        align_best_reference( subread_file, reference_file, alignment )
    hit_counts = Counter( hit.tname for hit in BlasrReader( alignment ) )
    return [name for name, _ in hit_counts.most_common()]
示例#7
0
def sort_subreads(subread_file, reference_file):
    """
    Build a lookup from query name to target name for the aligned subreads.

    Hits are read with BlasrReader from the relative path 'temp2.m1'.  If
    that file is not valid yet, align_best_reference is invoked to create
    it before reading.  The result is a dict of hit.qname -> hit.tname.
    """
    log.info("Aligning subreads to the two best references")
    m1_path = 'temp2.m1'
    needs_alignment = not valid_file(m1_path)
    if needs_alignment:
        align_best_reference(subread_file, reference_file, m1_path)
    return dict((hit.qname, hit.tname) for hit in BlasrReader(m1_path))
示例#8
0
def order_references(subread_file, reference_file):
    """
    List target names ordered by how often they appear among the hits.

    Hits are read with BlasrReader from the relative path 'temp.m1'.  If
    that file is not valid yet, align_best_reference is invoked to create
    it before reading.  Names are returned in Counter.most_common() order.
    """
    log.info("Selecting the best references sequences to use")
    m1_path = 'temp.m1'
    needs_alignment = not valid_file(m1_path)
    if needs_alignment:
        align_best_reference(subread_file, reference_file, m1_path)
    tally = Counter()
    tally.update([hit.tname for hit in BlasrReader(m1_path)])
    ranked = tally.most_common()
    return [pair[0] for pair in ranked]
示例#9
0
def _align_sequences(query, reference):
    """
    Align one fasta file of sequences to another and return the hits.

    Runs align_best_reference with a temporary '.m1' output file and reads
    the result back with BlasrReader.  The temporary file is always removed
    before returning, including when alignment or parsing raises.

    Returns:
        A list of the records yielded by BlasrReader, or None when the
        alignment output is not a valid file.
    """
    # Only the path is needed: close our handle right away so the file
    # descriptor is not leaked while the aligner writes to that path.
    temp = NamedTemporaryFile(suffix='.m1', delete=False)
    temp.close()
    try:
        align_best_reference(query, reference, output=temp.name)
        if valid_file(temp.name):
            return list(BlasrReader(temp.name))
        return None
    finally:
        # Clean up on every exit path; tolerate the file already being gone
        # so cleanup never masks the real outcome.
        if os.path.exists(temp.name):
            os.unlink(temp.name)
示例#10
0
def get_input_file( input ):
    """
    Resolve the supplied input to a sequence file.

    A directory is handed to get_amplicon_analysis_output and its result is
    returned.  A valid file is returned unchanged when is_fasta or is_fastq
    accepts it.

    Raises IOError when the input is a valid file that is neither Fasta nor
    Fastq, or when it is neither a directory nor a valid file.
    """
    # Directories are searched for sequence files
    if os.path.isdir( input ):
        log.info("Input appears to be a directory, looking for sequence files")
        return get_amplicon_analysis_output( input )

    # Anything else has to be a usable file
    if not valid_file( input ):
        msg = "Supplied input does not appear to be a valid AmpliconAnalysis file or directory"
        log.error( msg )
        raise IOError( msg )

    if is_fasta( input ):
        log.info("Input appears to be a valid Fasta file")
    elif is_fastq( input ):
        log.info("Input appears to be a valid Fastq file")
    else:
        msg = "Input is not a valid Fasta or Fastq file!"
        log.error( msg )
        raise IOError( msg )
    return input
示例#11
0
def get_input_file(input):
    """
    Turn the supplied input into the path of a sequence file.

    Directories are delegated to get_amplicon_analysis_output.  A valid
    file is returned as given once is_fasta or is_fastq recognises it.

    Raises IOError for a valid file that is neither Fasta nor Fastq, and
    for input that is neither a directory nor a valid file.
    """
    if os.path.isdir(input):
        log.info("Input appears to be a directory, looking for sequence files")
        return get_amplicon_analysis_output(input)

    if valid_file(input):
        if is_fasta(input):
            log.info("Input appears to be a valid Fasta file")
            return input
        if is_fastq(input):
            log.info("Input appears to be a valid Fastq file")
            return input
        # A real file, but not in a format we can use
        problem = "Input is not a valid Fasta or Fastq file!"
    else:
        problem = "Supplied input does not appear to be a valid AmpliconAnalysis file or directory"

    log.error(problem)
    raise IOError(problem)
示例#12
0
def align_best_reference(query, reference, output=None):
    """
    Align query against reference with Blasr and return the output path.

    The output path is resolved through _get_output_file(query, output,
    'm1').  Blasr is run via run_blasr with bestn=1, noSplitSubreads
    enabled, nCandidates set to fasta_size(reference) and nproc taken from
    the module-level nproc.  When reference_has_index reports an index, the
    '<reference>.sa' path is passed as the 'sa' option.

    Returns the output path if it is a valid file afterwards, else None.
    """
    output = _get_output_file(query, output, 'm1')

    # Assemble the Blasr options
    ref_count = fasta_size(reference)
    log.info("Aligning %s sequences to %s references" % (query, ref_count))
    blasr_args = dict(nproc=nproc,
                      out=output,
                      bestn=1,
                      nCandidates=ref_count,
                      noSplitSubreads=True)
    if reference_has_index(reference):
        blasr_args['sa'] = reference + '.sa'

    run_blasr(query, reference, blasr_args)

    # Only report the output when Blasr actually produced something usable
    return output if valid_file(output) else None
示例#13
0
def orient_sequences( input_file, reference_file=None, alignment_file=None, output_file=None ):
    """
    Produce a copy of input_file whose sequences all match their reference's direction.

    The destination defaults to _get_output_file(input_file).  If it is
    already a valid file, it is returned immediately without redoing the
    work.  Otherwise get_alignment_file supplies the alignment,
    _identify_reversed_sequences picks the sequences to flip,
    _reverse_records applies that to the parsed input records, and
    _write_output stores the result.

    Returns the path of the output file.
    """
    log.info("Reorienting all sequences in %s to the direction of their reference" % input_file)

    # Decide where, and in which format, the result goes
    if not output_file:
        output_file = _get_output_file( input_file )
    output_type = _get_output_type( output_file )

    already_done = valid_file( output_file )
    if already_done:
        log.info("Found existing output file %s, skipping orientation step" % output_file)
        return output_file

    # Obtain an alignment (running one if necessary) and find what to flip
    alignment = get_alignment_file( input_file, reference_file, alignment_file )
    flagged = _identify_reversed_sequences( alignment )
    flagged_count = len(flagged)
    log.info("Identified %s sequences needing Reverse Complementation" % flagged_count)

    oriented = _reverse_records( _parse_input_records( input_file ), flagged )

    log.info("Writing out sequences to %s" % output_file)
    _write_output( oriented, output_file, output_type )
    return output_file
示例#14
0
def reference_has_index( reference ):
    """
    Look for a '.sa' index file next to the given reference.

    Returns the index path (reference + '.sa') when it is a valid file,
    otherwise False.
    """
    candidate = reference + '.sa'
    return candidate if valid_file( candidate ) else False