Exemplo n.º 1
0
def seq_guess_and_write(seqs, filename):
    """Write ``seqs`` to ``filename`` using the format inferred from its name.

    The sequence format and compression flag are obtained from
    ``utilities.guessFileFormat``. If no format can be inferred, a message is
    printed and nothing is written.
    """
    fmt, is_compressed = utilities.guessFileFormat(filename)
    if fmt is None:
        print("Cannot infer sequence format for file: " + filename)
        return
    with utilities.flexible_handle(filename, is_compressed, 'wt') as out_handle:
        SeqIO.write(seqs, out_handle, fmt)
Exemplo n.º 2
0
def cleanupAndWrite(assembly_file,output_file,length=None,coverage=None,image_file=None,base_ID=None):
    """Filter contigs of a SKESA assembly with ``cleanup_SKESA`` and save the result.

    Note: no sanity checks are performed on the arguments.

    Args:
        assembly_file: Path of the input assembly; its format and compression
            are guessed from the filename via ``utilities.guessFileFormat``.
        output_file: Path of the cleaned assembly; format and compression are
            guessed from the filename in the same way.
        length: Minimum length forwarded to ``cleanup_SKESA`` (``None`` -> 0).
        coverage: Minimum coverage forwarded to ``cleanup_SKESA`` (``None`` -> 0).
        image_file: Forwarded to ``cleanup_SKESA`` as ``export_contig_graph``.
        base_ID: If not ``None``, contigs are first renamed with
            ``seq_utilities.standardize_contig_names``.

    Returns:
        0 if the cleaned sequences were written, 1 if ``cleanup_SKESA``
        returned ``None``.
    """
    ## Load the assemblies
    assembly_format, assembly_compressed = utilities.guessFileFormat(assembly_file)
    output_format, output_compressed = utilities.guessFileFormat(output_file)
    if assembly_format != output_format:
        print("Warning on cleanup: input and output formats do not match ({} and {})".format(assembly_format,output_format))
    with utilities.flexible_handle(assembly_file, assembly_compressed, 'rt') as fin:
        seqs = list(SeqIO.parse(fin, assembly_format))
    if base_ID is not None:
        # Only the renamed contigs are needed; the second return value is unused here.
        seqs, _ = seq_utilities.standardize_contig_names(seqs, base_ID)
    if length is None:
        length = 0
    if coverage is None:
        coverage = 0
    ## This variant always treats the input as a SKESA assembly
    print("Removing low quality contigs from SKESA assembly. Length < {}; coverage < {}".format(length,coverage))
    discard_file = utilities.appendToFilename(output_file, '_discarded') ##ext is same as output file
    updated_seqs = cleanup_SKESA(seqs,
                                 minimum_length=length,
                                 minimum_coverage=coverage,
                                 discard_file=discard_file,
                                 export_contig_graph=image_file)
    if updated_seqs is None:
        print("Unable to clean and orient the assembly: \n\t"+assembly_file)
        return 1
    print("Retained {} of {} contigs.".format(len(updated_seqs),len(seqs)))
    # Write through flexible_handle so that the content matches the (possibly
    # compressed) file extension, instead of always writing plain text.
    with utilities.flexible_handle(output_file, output_compressed, 'wt') as fout:
        SeqIO.write(updated_seqs, fout, output_format)
    print('Saved reoriented assembly at {}'.format(output_file))
    return 0
Exemplo n.º 3
0
def seqs_guess_and_parse2list(filename):
    """Parse the sequence file ``filename`` into a list of records.

    The format and compression flag are inferred from the filename with
    ``utilities.guessFileFormat``. Returns ``None`` (after printing a message)
    when the format cannot be inferred.
    """
    fmt, is_compressed = utilities.guessFileFormat(filename)
    if fmt is None:
        print("Cannot infer sequence format for file: " + filename)
        return None
    with utilities.flexible_handle(filename, is_compressed, 'rt') as handle:
        return list(SeqIO.parse(handle, fmt))
Exemplo n.º 4
0
def seqs_guess_and_parse2dict(filename):
    """Parse the sequence file ``filename`` into a dict built by ``SeqIO.to_dict``.

    The format and compression flag are inferred from the filename with
    ``utilities.guessFileFormat``. Returns ``None`` (after printing a message)
    when the format cannot be inferred.

    Raises:
        TypeError: if ``filename`` is not a ``str``.
    """
    if not isinstance(filename, str):
        message = "Filename must be string, is {}".format(type(filename))
        raise TypeError(message)
    fmt, is_compressed = utilities.guessFileFormat(filename)
    if fmt is None:
        print("Cannot infer sequence format for file: " + filename)
        return None
    with utilities.flexible_handle(filename, is_compressed, 'rt') as handle:
        return SeqIO.to_dict(SeqIO.parse(handle, fmt))
Exemplo n.º 5
0
def describeSequences(sequenceFile):
    """Collect basic statistics about a sequence file.

    Returns a ``defaultdict(int)`` that always holds ``'FileSize'`` (bytes on
    disk) and, when at least one record is parsed, ``'Sequences'`` (record
    count) and ``'Nucleotides'`` (sum of record lengths). If the format cannot
    be inferred from the filename, a message is printed and only the file size
    is reported.
    """
    stats = defaultdict(int)
    stats['FileSize'] = os.path.getsize(sequenceFile)
    fmt, is_compressed = utilities.guessFileFormat(sequenceFile)
    if fmt is None:
        print("Cannot infer sequence format for file: " + sequenceFile)
        return stats
    with utilities.flexible_handle(sequenceFile, is_compressed, 'rt') as handle:
        for record in SeqIO.parse(handle, fmt):
            stats['Sequences'] += 1
            stats['Nucleotides'] += len(record)
    ##TODO: add Q30 and such?
    return stats
Exemplo n.º 6
0
def setupGenomeForBlastBasedExtraction(genome_name,genome_file,tempDir,file_format = '',is_compressed = None):
    """Prepare a genome for BLAST-based sequence extraction.

    A FASTA copy of the genome is exported to ``tempDir`` under a name derived
    from ``genome_name``, the original file is parsed into a dict of records,
    and a BLAST database is built from the FASTA copy.

    Args:
        genome_name: Name used for reporting and for the temporary filenames.
        genome_file: Path of the original genome file.
        tempDir: Directory receiving the FASTA copy and the BLAST database.
        file_format: Sequence format passed to the exporter and to ``SeqIO.parse``.
        is_compressed: Compression flag passed to the exporter and to
            ``utilities.flexible_handle``.

    Returns:
        dict with keys ``'name'``, ``'original'``, ``'fasta'``, ``'seqs'`` and ``'db'``.

    Raises:
        IOError: if the exported FASTA file does not exist.
        ValueError: if no sequences were parsed from the genome.
    """
    ##Genome information
    genomeInfo = dict()
    genomeInfo['name'] = genome_name
    genomeInfo['original'] = genome_file #just for reporting
    ##Some people use weird genome filenames, so copy it to something without special characters
    temp_genome = os.path.join(tempDir,genome_name + '.fasta')
    genomeOrganizer.exportGenomeFASTA(genome_file,temp_genome,file_format,is_compressed)
    genomeInfo['fasta'] = temp_genome
    if not os.path.isfile(genomeInfo['fasta']):
        raise IOError("Illegitimate file at "+genomeInfo['fasta'])
    # Parse the original genome file; the context manager guarantees the handle
    # is closed even if parsing fails or the emptiness check below raises.
    with utilities.flexible_handle(genomeInfo['original'], is_compressed, 'rt') as genome_handle:
        genomeInfo['seqs'] = SeqIO.to_dict(SeqIO.parse(genome_handle, file_format))
    print("{} bp in {} contig(s)".format(sum(len(c) for c in genomeInfo['seqs'].values()),len(genomeInfo['seqs'])))
    if len(genomeInfo['seqs']) == 0:
        raise ValueError("No sequences parsed from file {}".format(genomeInfo['fasta']))
    # make search database for genome
    db_base = os.path.basename(genomeInfo['fasta'])
    genomeInfo['db'] = os.path.join(tempDir,db_base)
    makeblastdb(genomeInfo['fasta'],genomeInfo['db'])
    return genomeInfo
Exemplo n.º 7
0
def cleanupAndWrite(assembly_file,
                    output_file,
                    circle_new_start=None,
                    reverse_contig=None,
                    closed_circle=None,
                    broken_circle=None,
                    circularize_with_Ns=0,
                    length=None,
                    coverage=None,
                    report_file=None,
                    reference=None,
                    assembler=None,
                    working_dir=None):
    """Clean and/or reorient an assembly and write the result to ``output_file``.

    Input and output formats are guessed from the filenames with
    ``utilities.guessFileFormat``. Two mutually exclusive modes exist:

    * Explicit shift -- taken when ``circle_new_start`` or ``reverse_contig``
      is truthy (note that ``circle_new_start=0`` is falsy and does not select
      this mode). More than one contig is an error. The single contig is
      passed to ``shiftCirclarChromosome`` with ``N_padding=0`` for
      ``closed_circle`` or ``N_padding=-1`` for ``broken_circle``. If neither
      flag is set, only a message is printed and the call fails.
    * Criteria-based cleanup -- otherwise. Non-circular assemblies are
      filtered (by length only when ``assembler`` is None, through
      ``cleanup_SPADES`` when it is "SPADES", case-insensitive; any other
      assembler is an error). If ``reference`` is given and is an existing
      file, the sequences are then reoriented against it.

    Args:
        assembly_file: Path of the input assembly.
        output_file: Path of the output file. It is always written as plain
            text, even if the filename suggests compression.
        circle_new_start, reverse_contig: Explicit reorientation instructions
            forwarded to ``shiftCirclarChromosome``.
        closed_circle, broken_circle: Flags marking the assembly as circular.
        circularize_with_Ns: Values > 0 request scaffolding, which is not
            implemented.
        length, coverage: Filtering thresholds; ``None`` is treated as 0.
        report_file: Forwarded to ``cleanup_SPADES`` as ``export_contig_data``.
            Its directory is also used to build the discard filename, so it
            must be a path when ``assembler`` is "SPADES".
        reference: Optional path of a reference file used for reorientation.
        assembler: ``None`` or the assembler name.
        working_dir: Working directory forwarded to the reorientation helpers.

    Returns:
        0 if sequences were written to ``output_file``, 1 otherwise. In
        particular 1 is returned whenever no step produced an updated sequence
        list (e.g. a circular assembly without a usable reference).
    """
    ##Note: no sanity checks
    ## Load the assemblies
    assembly_format, assembly_compressed = utilities.guessFileFormat(
        assembly_file)
    output_format, output_compressed = utilities.guessFileFormat(output_file)
    if assembly_format != output_format:
        print(
            "Warning on cleanup: input and output formats do not match ({} and {})"
            .format(assembly_format, output_format))
    with utilities.flexible_handle(assembly_file, assembly_compressed,
                                   'rt') as fin:
        seqs = [c for c in SeqIO.parse(fin, assembly_format)]
    #Precise manipulation of single contig
    # updated_seqs stays None unless some step below succeeds; None at the end
    # means failure.
    updated_seqs = None
    if circle_new_start or reverse_contig:
        if len(seqs) > 1:
            print(
                "Error: User provided explicit reorientation instructions for a contig, but multiple contigs are present in assembly: \n"
                + assembly_file)
            return 1
        elif closed_circle:
            print("Shifting closed circle...")
            updated_seqs = shiftCirclarChromosome(seqs[0],
                                                  circle_new_start,
                                                  reverse_contig,
                                                  N_padding=0)
        elif broken_circle:
            print("Shifting broken circle...")
            updated_seqs = shiftCirclarChromosome(seqs[0],
                                                  circle_new_start,
                                                  reverse_contig,
                                                  N_padding=-1)
        elif circularize_with_Ns > 0:
            # updated_seqs remains None, so the function returns 1 below.
            print('Scaffolding not implemented')
        else:
            # updated_seqs remains None, so the function returns 1 below.
            print(
                'To shift a chromosome, you must specify whether the circle is closed or broken'
            )
    else:  ## Complex criteria for manipulation
        if closed_circle and len(seqs) > 1:
            print(
                "Warning: Untested parameters. User specified 'closed circle' but multiple contigs are present in assembly"
            )

        ## Remove the low-quality contigs:
        ##TODO: consider if another parameter should be passed. At least specify if  it came from SPAdes
        circular = closed_circle or broken_circle  ##Circles imply high-quality sequence, so no filtering is applied to them
        if not circular:
            if length is None:
                length = 0
            if coverage is None:
                coverage = 0
            if assembler is None:
                print("Removing short contigs from assembly.")
                # Strict comparison: contigs of exactly `length` are dropped.
                updated_seqs = [x for x in seqs if len(x) > length]


                # NOTE: coverage is not applied when no assembler is specified.
            elif assembler.upper() == 'SPADES':
                print(
                    "Removing low quality contigs from SPADES assembly. Length < {}; coverage < {}"
                    .format(length, coverage))
                # Discarded contigs are saved next to the report file, under
                # the assembly's basename.
                raw_filename = os.path.join(os.path.dirname(report_file),
                                            os.path.basename(assembly_file))
                image_file = None  # utilities.setExt(raw_filename, 'png') ##Note: this has been moved to the calculateStats routine
                discard_file = utilities.appendToFilename(
                    raw_filename, '_discarded')  ##ext is same as assembly file
                updated_seqs = cleanup_SPADES(seqs,
                                              minimum_length=length,
                                              minimum_coverage=coverage,
                                              export_contig_data=report_file,
                                              discard_file=discard_file,
                                              export_contig_graph=image_file)
            else:
                print(
                    "Error: assembler ({}) unknown for non-circular assembly. Not attempting to cleanup contigs in file: \n{}"
                    .format(assembler, assembly_file))
                return 1
        ## Reorient to reference if requested
        if reference:
            # Use the filtered contigs when filtering ran, otherwise the raw ones.
            input_seqs = updated_seqs if updated_seqs is not None else seqs
            if os.path.isfile(reference):
                if circular:
                    if len(input_seqs) > 1:
                        print(
                            'Warning: multiple contigs in "circular" assembly. Only one contig will be reoriented and I cannot tell you which one. Untested.'
                        )
                    if len(input_seqs) > 0:
                        N_padding = -1  ##Do not religate
                        if closed_circle:
                            N_padding = 0
                        elif circularize_with_Ns > 0:
                            print('Scaffolding not implemented')
                            return 1
                        print(
                            "Reorienting circular chromosome to reference...")
                        updated_seqs = reorientClosedChromosome(
                            input_seqs,
                            reference,
                            N_padding=N_padding,
                            working_dir=working_dir
                        )  #Note: only treated as closed if N_padding >= 0
                    else:  ## Len == 0
                        print(
                            "None of {} contigs passed your exclusion criteria. Exiting "
                            .format(len(seqs)))
                        return 1
                else:
                    # Default working directory: the output path without its
                    # extension.
                    if working_dir is None:
                        working_dir = os.path.splitext(output_file)[0]
                    draft_name = os.path.splitext(
                        os.path.basename(assembly_file))[0]
                    print("Reorienting contigs")
                    reorder_stats = reorientContigs(
                        input_seqs,
                        reference,
                        working_dir,
                        name=draft_name,
                        input_format=assembly_format)  ##Will be genbank format
                    # On success the reordered draft is reloaded from the file
                    # named in the returned stats; anything else counts as failure.
                    if isinstance(reorder_stats, dict) and ('ReorderedDraft'
                                                            in reorder_stats):
                        updated_seqs = seq_utilities.seqs_guess_and_parse2list(
                            reorder_stats['ReorderedDraft']
                        )  ##Excessive to reload... but it fits in this flow
                    else:
                        updated_seqs = None

            else:
                # Missing reference file: keep whatever the filtering step
                # produced (possibly None).
                print(
                    "Unable to realign to reference because there is no refernce file: {}"
                    .format(reference))
    if updated_seqs is None:
        print("Unable to clean and orient the assembly: \n\t" + assembly_file)
        return 1
    else:
        # Output is written as plain text regardless of output_compressed;
        # the warning below flags the mismatch to the user.
        with open(output_file, 'wt') as fout:
            SeqIO.write(updated_seqs, fout, output_format)
        print('Saved cleaned assembly at {}'.format(output_file))
        if output_compressed:
            print(
                "Warning. Compression not implemented. The file extension is misleading"
            )
        return 0