Example #1
def parse_SPADES(contigs,oldVersion=False,export_contig_data=None,export_contig_graph=None):
    result = []
    for c in contigs: 
        spades_stats = parse_SPADES_contig(c,oldVersion=oldVersion)
        if spades_stats is not None:
            result.append(spades_stats)
        else:
            print("Failure to parse SPADEs contig")
    contig_table = pd.DataFrame(result)  
    ### TODO: remove this to a higher level          
    if has_plt:
        try:
            if isinstance(export_contig_graph,str):
                ax = contig_table.plot(kind='scatter', x='Contig_Size',y='Coverage',logx=True,logy=True)
                fig = ax.get_figure()
                fig.savefig(export_contig_graph)
        except Exception as e:
            print('Failed to save contig stats scatterplot at '+export_contig_graph)
            utilities.printExceptionDetails(e)
    try:
        if isinstance(export_contig_data,str):
            contig_table[ContigHeaders[0:-1]].to_csv(export_contig_data,index=False)
    except:
        print('Failed to save contig stats table at '+export_contig_data)
        raise
    return contig_table
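
A minimal usage sketch (Biopython and the paths here are illustrative assumptions; parse_SPADES_contig is expected to read size and coverage from SPAdes-style contig headers):

from Bio import SeqIO  # assumed dependency; any iterable of SPAdes contig records should work

contigs = list(SeqIO.parse('spades_output/contigs.fasta', 'fasta'))
contig_table = parse_SPADES(contigs,
                            export_contig_data='contig_stats.csv',     # per-contig table (CSV)
                            export_contig_graph='contig_scatter.png')  # size-vs-coverage scatterplot
print(contig_table.head())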
Example #2
def BeforeAndAfter(pre_stats,post_stats):
    rename_raw = {x:x+'_raw' for x in pre_stats.columns}
    merged_stats = pd.merge(pre_stats.rename(columns=rename_raw),post_stats,left_index=True,right_index=True,how='outer')#Each should have a single index.
    try:
        if ('Bases_In_Contigs' in merged_stats) and ('Bases_In_Contigs_raw' in merged_stats): ##Should always be integers
            ##Coerce to numeric before arithmetic; rows missing from either frame become NaN instead of raising
            raw_bases = pd.to_numeric(merged_stats.Bases_In_Contigs_raw, errors='coerce')
            new_bases = pd.to_numeric(merged_stats.Bases_In_Contigs, errors='coerce')
            merged_stats['Discarded_Bases'] = raw_bases - new_bases
            merged_stats['Discarded_Percent'] = 100*merged_stats.Discarded_Bases/raw_bases
            if ('HalfCov_Contig_Bases' in merged_stats):
                merged_stats['HalfCov_Percent'] = 100*merged_stats.HalfCov_Contig_Bases/new_bases
    except Exception as e:
        utilities.printExceptionDetails(e)
    merged_stats.fillna('N/A', inplace=True) ##Fill after the arithmetic so the numeric casts never hit 'N/A' strings
    return merged_stats
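
A brief sketch of the expected inputs, assuming both frames share an index (e.g. the cleaned filename) and a numeric Bases_In_Contigs column; the values are illustrative:

import pandas as pd

pre = pd.DataFrame({'Bases_In_Contigs': [5000000]}, index=['sampleA'])
post = pd.DataFrame({'Bases_In_Contigs': [4800000]}, index=['sampleA'])
merged = BeforeAndAfter(pre, post)
# Adds Discarded_Bases (200000) and Discarded_Percent (4.0) alongside the *_raw columns
print(merged[['Discarded_Bases', 'Discarded_Percent']])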
Example #3
def multiple(multi_args):
    if multi_args.force and multi_args.resume:
        print(
            "Exiting: the options 'force' and 'resume' are incompatible. Use only 'force' if you want to overwrite prior files."
        )
        return 1
    output_dir = multi_args.output if multi_args.output else utilities.safeMakeOutputFolder(_outputBase)
    utilities.safeMakeDir(output_dir)
    logFile = os.path.join(output_dir, "AssemblyCleanup.log")
    resultFile = os.path.join(output_dir, "AssemblyCleanupTable.tab")
    tempFile = utilities.appendToFilename(resultFile, '_temp')
    sys.stdout = utilities.Logger(logFile)
    assembler_name = None if multi_args.assembler is None else multi_args.assembler.lower()
    print("Parameters:")
    for k, v in vars(multi_args).items():
        print('{} : {}'.format(k, v))
    draft_location = multi_args.draft_location
    if os.path.isfile(draft_location):
        guideFrame = pd.read_table(draft_location)
        print('Loaded guide table from ' + draft_location)
        print("\t table contains {} records".format(len(guideFrame)))
    elif os.path.isdir(draft_location):
        print("Searching for files in " + os.path.abspath(draft_location))
        deep_search = not multi_args.shallow_search_assemblies
        guideFrame = NGS_data_utilities.listGenomeFilesWithNames(
            draft_location,
            deep_search=deep_search,
            extension=multi_args.extension)
        if guideFrame is None or len(guideFrame) == 0:
            print("Exiting. Failed to retrieve any files")
            return 1
        ##Exclude reads
        size_limit = multi_args.size_limit
        if size_limit > 0:
            guideFrame['filesize'] = guideFrame.Filename.apply(os.path.getsize)
            small_enough = (guideFrame.filesize <= size_limit)
            if sum(small_enough) < len(guideFrame):
                print('Only {} of {} files pass the upper size limit of {}'.format(
                    sum(small_enough), len(guideFrame), size_limit))
                guideFrame = guideFrame[small_enough].copy()
        guideFrame = guideFrame[NGS_data_utilities.dfHeaders].copy()
        if assembler_name:
            guideFrame['assembler'] = assembler_name
            print('assigned assembler to be ' + assembler_name)
        else:  #This is not passed to AssemblyStats
            for i in guideFrame.index:
                if 'spades' in guideFrame.loc[i, 'Filename'].lower():
                    guideFrame.loc[i, 'assembler'] = 'spades'
                    print('assigned assembler to be spades for {}'.format(
                        guideFrame.loc[i, 'Lab_ID']))
        print('Calculating raw stats...')
        assemblyStats = AssemblyStats.calculateStats(
            guideFrame.Filename.tolist(),
            ass_format=assembler_name,
            image_dir=output_dir)  ##This will independently infer assembler from name unless given
        if assemblyStats is None or len(assemblyStats) == 0:
            print("Exiting: failed to calculate assembly stats on input")
            return 1
        assemblyStats['Contig_Count'] = assemblyStats['Contig_Count'].astype(int)
        guideFrame = pd.merge(
            guideFrame, assemblyStats, how='left'
        )  ##Should merge on Filename. Don't want confusion if they share other fields
        if multi_args.BCFB_PacBio_Name:
            print('interpreting BCFB PacBio names...')
            for i in guideFrame.index:
                guideFrame.loc[i, 'Gaps'] = '.ro1m.' not in guideFrame.loc[i, 'Filename']
        else:
            guideFrame['Gaps'] = True  ### Assume no closed genomes unless stated
    else:
        print("Exiting. Unable to find the location of draft files: {}".format(draft_location))
        return 1
    print('Loaded data...')
    process = None
    if multi_args.reorient:
        process = 'RO'
    elif multi_args.discard:
        process = 'DIS'
    elif multi_args.discard_then_reorient:
        process = 'DIS_RO'
    else:
        print("Exiting. No processing specified")
        return 1
    expectedArgs = {'working_dir', 'report_file', 'assembler'}
    # circle_new_start=None,reverse_contig=None,closed_circle=None,broken_circle=None,circularize_with_Ns=0,
    #                     length=250,coverage=10,report_file=None,reference=None,assembler=None
    if 'RO' in process:
        expectedArgs.update(RO_argset)
        if not multi_args.reference or not os.path.isfile(multi_args.reference):
            print("Cannot find reference file. Exiting")
            return 1
    if 'DIS' in process:
        expectedArgs.update(DIS_argset)

    tag = multi_args.tag if multi_args.tag else process
    print('Result files will have the tag "{}"'.format(tag))

    ##TODO test columns here

    permitted_fields = req_fields + list(expectedArgs)
    keep_fields = [x for x in guideFrame.columns if x in permitted_fields]
    parameterFrame = guideFrame[keep_fields].copy()
    if len(parameterFrame) == 0:
        return 1  ##Failure
    fail_list = []
    for i, row in parameterFrame.iterrows():  ##Row gets converted to keyword arguments; shares index with guideFrame
        assembly_file = row['Filename']
        if not os.path.isfile(assembly_file):
            print("Error: unable to find file: {}".format(assembly_file))
            output_file = 'error'
        else:
            print("Working on " + os.path.basename(assembly_file))
            print("\tat  {}".format(time.ctime()))
            del row['Filename']
            if 'Contig_Count' in row.index:
                if str(row['Contig_Count']) == '1':
                    gaps = row['Gaps']
                    gap_bool = True  ##Safest default (will introduce contig breaks). But should probably skip reorientation
                    if isinstance(gaps, str):
                        if gaps.upper() == 'TRUE':
                            gap_bool = True
                        elif gaps.upper() == 'FALSE':
                            gap_bool = False
                        else:
                            print("unable to interpret 'gaps' notation: {}".format(gaps))
                            continue
                    elif isinstance(gaps, bool):
                        gap_bool = gaps
                    else:
                        print("unable to interpret 'gaps' notation: {}".format(gaps))
                        continue
                    if gap_bool:
                        row['broken_circle'] = True  ##NOTE: with our bacteria, we assume circle
                    else:
                        row['closed_circle'] = True

                del row['Gaps']

            assembly_basename = utilities.appendToFilename(os.path.basename(assembly_file), '_' + tag)
            output_file = os.path.join(output_dir, assembly_basename)

            report_file = os.path.join(output_dir, os.path.basename(assembly_file)) + '.report.txt'
            has_out = os.path.isfile(output_file)
            has_rpt = os.path.isfile(report_file)
            if has_out or has_rpt:
                if multi_args.force:
                    if has_out:
                        print("Removing pre-existing file: {}".format(output_file))
                        os.remove(output_file)
                    if has_rpt:
                        print("Removing pre-existing file: {}".format(report_file))
                        os.remove(report_file)
                else:
                    if not multi_args.resume:
                        print("Error: Refusing to overwrite pre-existing output files: \n\t{}\n\t{}".format(
                            output_file, report_file))
                    continue
            try:
                open(output_file, 'a').close()
                os.remove(output_file)
            except IOError:
                print("Error. Do not have permission to write to output file \n\t{}".format(output_file))
                continue

            cleanup_args = vars(multi_args).copy()  ##TODO: put this up front?
            cleanup_args.update(row.to_dict())
            cleanup_args['working_dir'] = os.path.join(output_dir, 'work')
            cleanup_args = {k: v for k, v in cleanup_args.items() if k in expectedArgs}
            if 'Mean_Coverage' in row.index:
                proportion_cutoff = multi_args.coverage_proportion * row.loc['Mean_Coverage']
                cleanup_args['coverage'] = max(multi_args.coverage, proportion_cutoff)
                cleanup_args.pop('Mean_Coverage', None)  ##pop rather than del: the filter above usually drops it already
            else:
                cleanup_args['coverage'] = multi_args.coverage  ##This should actually be irrelevant
            try:
                print("Arguments:")
                print(cleanup_args)
                if cleanupAndWrite(assembly_file,
                                   output_file,
                                   report_file=report_file,
                                   **cleanup_args) != 0:  ##TODO: return stats
                    output_file = 'error'
                    fail_list.append(assembly_file)
            except Exception as e:
                fail_list.append(assembly_file)
                output_file = 'error'
                warn = "Exception on cleanupAndWrite:"
                utilities.printExceptionDetails(e, warn)
            print()  ##Blank line
        guideFrame.loc[i, 'CleanedFile'] = output_file
        guideFrame.to_csv(tempFile, index=False, sep='\t')
    print("Errors on {} files: ".format(len(fail_list)))
    print("\n\t".join(fail_list))
    if process in ['DIS', 'DIS_RO']:  ##recalculate stats for filtered contig sets
        assemblyStats2 = AssemblyStats.calculateStats(
            guideFrame.CleanedFile.tolist(), ass_format=assembler_name)
        if assemblyStats2 is not None:
            #             assemblyStats2.rename(columns={'Filename':'CleanedFile'},inplace=True)
            guideFrame = AssemblyStats.BeforeAndAfter(
                guideFrame.set_index("CleanedFile"),
                assemblyStats2.set_index('Filename'))
#             guideFrame = pd.merge(guideFrame,assemblyStats2,on='CleanedFile',suffixes=('_raw',''),how='outer')
    print("Reporting stats for {} genomes.".format(len(guideFrame)))
    guideFrame.fillna('N/A', inplace=True)
    utilities.safeOverwriteTable(resultFile, guideFrame, 'tab', index=False)
    return 0
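
A hypothetical invocation; in practice the Namespace comes from this script's CLI parser, and the attribute set below is inferred from the function body rather than from documentation:

from argparse import Namespace

args = Namespace(
    force=False, resume=False, output='cleanup_out',
    assembler='spades', draft_location='drafts/',
    shallow_search_assemblies=False, extension=None, size_limit=0,
    BCFB_PacBio_Name=False, reorient=False, discard=True, discard_then_reorient=False,
    reference=None, tag=None, coverage=10, coverage_proportion=0.1)
exit_code = multiple(args)  # returns 0 on success, 1 on any fatal setup error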
Example #4
def calculateStats(filelist_or_frame,out_file=None,ass_format=None,image_dir=None,save_details=False):
    if isinstance(filelist_or_frame,list):
        filelist = filelist_or_frame
        fileframe = None
    elif isinstance(filelist_or_frame,pd.DataFrame):
        filelist = filelist_or_frame.Filename
        fileframe = filelist_or_frame
    else:
        raise ValueError("can only calculate stats on a list of filenames or a DataFrame with a Filename field")
    if len(filelist) == 0:
        raise ValueError("AssemblyStats CalculateStats requires a list of files with length > 0. Contact developer")
    assFrame = None
    if isinstance(image_dir,str):
        utilities.safeMakeDir(image_dir)    
    if len(filelist) > 0:
        assemblyList = []
        for filename in filelist:
            if isinstance(ass_format,str):
                assembler = ass_format
            elif ('spades' in filename):
                assembler = 'spades'
                print("Guessing assembler as {}".format(assembler))
            elif ('skesa' in filename):
                assembler = 'skesa'
                print("Guessing assembler as {}".format(assembler))
            else:
                assembler = None
            genome_format,_ = utilities.guessFileFormat(filename)
            AssInfo = {'Filename':filename} ##This will report data for all files provided. Junk files will have 0 contigs and 0 size
            if genome_format is None:
                AssInfo['Note']='Could not identify genome format'  
            else:
                try:
                    contig_list = seq_utilities.seqs_guess_and_parse2list(filename)                                       
                    if isinstance(contig_list,list) and len(contig_list) > 0:
                        contigFrame = getContigStats(contig_list,hasQual = (genome_format == 'fastq'),assembler=assembler) 
                        if 'Coverage' in contigFrame.columns:
                            contigFrame['Coverage'] = contigFrame['Coverage'].astype(float) ##Note: Coverage is being cast to float in getSpadesStats, but somehow becomes string in this frame.
                        if 'Contig_Size' in contigFrame.columns:
                            contigFrame['Contig_Size'] = contigFrame['Contig_Size'].astype(int)
                        if save_details:
                            contig_file = utilities.setExt(utilities.appendToFilename(filename,'_contigs'),'.xlsx')
                            contigFrame.to_excel(contig_file)
                        assert len(contig_list) == len(contigFrame), "Not all contigs are in dataframe"  
                        if isinstance(image_dir,str) and os.path.isdir(image_dir):
                            if has_plt:
                                if ('Coverage' in contigFrame.columns) and ('Contig_Size' in contigFrame.columns):
                                    tempFrame = contigFrame[['Coverage','Contig_Size']].copy()
                                    raw_filename = os.path.join(image_dir, os.path.basename(filename))
                                    image_file = utilities.setExt(raw_filename, 'png')  ##Note: only reason to do
                                    try:
                                        if isinstance(image_file,str):
                                            ax = tempFrame.plot(kind='scatter', x='Contig_Size',y='Coverage',logx=True,logy=True)
                                            fig = ax.get_figure()
                                            fig.savefig(image_file)
                                    except Exception as e:
                                        print('Failed to save contig stats scatterplot at '+image_file)
                                        for c in tempFrame.columns:
                                            print(tempFrame[c])
                                        utilities.printExceptionDetails(e)
                                    else:
                                        try:
                                            plt.close(fig)
                                        except Exception:
                                            print("Failed to close image...")
                                elif assembler in ['skesa','spades']:
                                    print("Unable to produce contig stats scatterplot because necessary fields are not present ('Contig_Size' and 'Coverage')")                     
                        AssInfo['Contig_Count']=str(len(contig_list))
                        contigSizes = contigFrame['Contig_Size'].astype(int)
                        assemblySize = sum(contigSizes)                
                        AssInfo['Bases_In_Contigs'] = str(assemblySize)                   
                        largeContigs = contigSizes > 10000
                        AssInfo['Large_Contig_Count'] = str(sum(largeContigs))
                        AssInfo['Small_Contig_Count'] = str(sum(~largeContigs))
                        AssInfo['Bases_In_Large_Contigs'] = str(sum(contigSizes[largeContigs]))
                        AssInfo['Bases_In_Small_Contigs'] = str(sum(contigSizes[~largeContigs]))
                        emptyContigs = contigSizes == 0
                        if sum(emptyContigs) > 0:
                            print('\n#### WARNING #### EMPTY CONTIGS ########\n')                           
                            print('\n\t'.join(contigFrame[emptyContigs].Contig_Name.tolist()))
                            print('\n########################################\n')
                        if 'Coverage' in contigFrame.columns:
                            contigCoverage = contigFrame['Coverage'] ##should be float, but seems to get converted to a string with some versions
                            if len(contigCoverage[largeContigs]) > 0:
                                min_c = min(contigCoverage[largeContigs])
                                AssInfo['Min_Coverage_Large_Contigs'] = str(min_c) 
                                max_c = max(contigCoverage[largeContigs])
                                AssInfo['Max_Ratio_of_Coverage_Large_Contigs'] = '{:0.2f}'.format(max_c/min_c) 
                                lowC_contigs = contigFrame['Coverage'] < (min_c / 2)
                                AssInfo['Low_Coverage_Contig_Count'] = sum(lowC_contigs)
                                AssInfo['Low_Coverage_Contig_Bases'] = sum(contigFrame.loc[lowC_contigs,'Contig_Size'])
                            else:
                                AssInfo['Min_Coverage_Large_Contigs'] =  'N/A'
                                AssInfo['Max_Ratio_of_Coverage_Large_Contigs'] = 'N/A' 
                                AssInfo['Low_Coverage_Contig_Count'] = 'N/A'
                                AssInfo['Low_Coverage_Contig_Bases'] = 'N/A'                      
                            coverageProduct = contigFrame['Contig_Size'].astype(int) * contigFrame['Coverage']   
                            coverageProductSum = sum(coverageProduct)    
                            meanCoverage = coverageProductSum/assemblySize
                            AssInfo['Mean_Coverage'] = meanCoverage            
                            lowC_contigs = contigFrame['Coverage'] < (meanCoverage / 2)
                            AssInfo['HalfCov_Contig_Count'] = sum(lowC_contigs)
                            AssInfo['HalfCov_Contig_Bases'] = sum(contigFrame.loc[lowC_contigs,'Contig_Size'])
                        if feature_head in contigFrame:
                            featureCounts = contigFrame[feature_head].astype(int)
                            AssInfo[feature_head] = sum(featureCounts)
                        ### Sum ambiguous nucleotides
                        ambigCounts = contigFrame['Ambiguous_nucleotides'].astype(int)
                        AssInfo['Ambiguous_nucleotides']=sum(ambigCounts)
                        ## Import the quality scores
                        for c in contigFrame.columns:
                            if c.startswith(quality_head):
                                AssInfo[c] = str(sum(contigFrame[c]))            
                        ##Calculate N50, N75, and N90
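                        ## (N50/N75/N90: the largest contig length L such that contigs of length >= L
                        ##  together cover at least 50/75/90% of the assembled bases; calcN50_stats
                        ##  presumably sorts sizes in descending order and walks the cumulative sum.)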
                        N_stats = calcN50_stats(contigSizes.tolist(),thresholds=[50,75,90])
                        for n,size in N_stats.items():
                            header = "N{}".format(n)
                            AssInfo[header] = str(size)
#                         assemblyList.append(AssInfo)
                    else:
                        print("failed to parse file: "+filename)
                        AssInfo['Note'] = 'No sequences parsed from file'
                except Exception as e:
                    print("Warning: failed to assess file: " + filename)
                    print("Exception: {}".format(e))
                    raise
             
            if 'Bases_In_Contigs' not in AssInfo:
                AssInfo['Bases_In_Contigs'] = 0 
            if 'Contig_Count' not in AssInfo:
                AssInfo['Contig_Count'] = 0
            assemblyList.append(AssInfo)
        if len(assemblyList) > 0:
            print("Stats for {} assemblies.".format(len(assemblyList)))
            assFrame = pd.DataFrame(assemblyList)
            if isinstance(fileframe,pd.DataFrame):
                saveFrame = pd.merge(fileframe,assFrame,on='Filename')
            else:
                saveFrame = assFrame.set_index('Filename')
            if (out_file is not None):
                try:
                    saveFrame.to_csv(out_file)
                except Exception as e:
                    print(saveFrame.to_csv())
                    print()
                    print("Failed to print to target file {}. \nPrinted results to screen (above)".format(out_file))
                    utilities.printExceptionDetails(e)
        else:
            print("Failed to evaluate assemblies...")
            print("attempted to evaluate the following files:"+"\n".join(filelist))
    return assFrame
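
A minimal sketch, assuming the listed FASTA assemblies exist on disk and the module-level helpers (utilities, seq_utilities, getContigStats, calcN50_stats) are importable; filenames are illustrative:

stats = calculateStats(
    ['sampleA_spades_contigs.fasta', 'sampleB_skesa_contigs.fasta'],  # assembler guessed from names
    out_file='assembly_stats.csv',
    image_dir='qc_images')  # size-vs-coverage scatterplots per assembly, when matplotlib is available
if stats is not None:
    print(stats[['Filename', 'Contig_Count', 'N50']])  # N50 present only for files that parsed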
Example #5
def listReadFilesWithNames(directory,outfile = None,read_extension=None,verbose=False,doAssignReadSets=False, deep_search = True,read_codes=None,useLabID=True,target_path='/'):
    if read_codes is None:
        read_codes = df_read_codes
    if read_extension is None:
        read_extension = read_ext
    extRE = re.compile(re.escape(read_extension)) #All read files are compressed fastq
    fileList = []
    readDataFile = None
    ##Find all read files in this directory tree
    abs_dir = stayOnPath(directory,target_path)
    print("Directory is "+abs_dir)
    if deep_search:
        for rootdir, _, files in os.walk(abs_dir):
            if verbose: 
                print("Scanning {}".format(rootdir))
            for filename in files:
                if extRE.search(filename):
                    fileList.append(stayOnPath((os.path.join(rootdir,filename)),target_path))
                else:
                    if (rootdir == abs_dir) and (filename.endswith('.xlsx') and (not filename.startswith('~'))):
                        if readDataFile is None:
                            readDataFile = os.path.join(abs_dir,filename)
                        else:
                            print("Warning: found multiple excel files in top of directory. Not clear which is the demultiplexing file")
                    if verbose: 
                        print("ignoring file: "+filename)
    else:
        all_files = os.listdir(abs_dir)
        first_file = True
        for filename in all_files: ##TODO refactor
            if extRE.search(filename):
                if first_file:
                    print("\t Collecting files from directory: "+abs_dir) 
                    first_file = False
                fileList.append(os.path.join(abs_dir,filename))
            else:
                if verbose:
                    print("ignoring file: "+filename)   
                      

    if verbose: print("Identified {} files in {}".format(len(fileList),directory))
    #### Interpret the read filenames
    readFrame =  pairReads(fileList,read_codes=read_codes,useLabID=useLabID)    
    #### Append any additional information
    if readFrame is None:
        print("Failed to identify read files in {}".format(directory))
    else:
        try:
            readFrame['Date_Created'] = readFrame['Read1'].apply(lambda x : time.ctime(os.path.getctime(x)))
        except OSError as e:
            print("Failure to identify file creation times")
            utilities.printExceptionDetails(e)
        readFrame['Date_Ingested'] = time.ctime()
        if verbose: print("Returned {} read sets".format(len(readFrame)))
        if readDataFile is not None: ###Append data to frame if available; filename is identified during directory search,  so file exists
            readDataFrame = openReadDataFile(readDataFile)
            if isinstance(readDataFrame,pd.DataFrame):
                readFrame = pd.merge(readFrame,readDataFrame,how='left')
            else:
                print("Error reading read data file")
        if outfile is not None:
            if os.path.isfile(outfile):
                priorFrame = pd.read_table(outfile) 
                print("Appending read list to existing file: "+outfile)
                #~ genomeFrame.append(df,ignore_index=True)
                totalFrame = pd.concat([priorFrame,readFrame],ignore_index=True)
            else:
                totalFrame = readFrame
                print("Saving list to "+outfile)
                ##ToDo: I should validate that 
            totalFrame.to_csv(outfile,sep='\t',index=False)
    if doAssignReadSets and readFrame is not None:
        assignReadSets(readFrame)
    return readFrame
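
A minimal usage sketch, assuming a directory of compressed fastq files matching the default read_ext and the naming conventions that pairReads understands; the paths are illustrative:

reads = listReadFilesWithNames('runs/2024-01-15',
                               outfile='read_manifest.tab',  # appended to if it already exists
                               verbose=True,
                               deep_search=True)  # walk subdirectories
if reads is not None:
    print(reads[['Read1', 'Date_Ingested']].head())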