Exemplos de parseLibraryConfig em Python, exemplos de expt_config_parser.parseLibraryConfig em Python

Exemplo n.º 1

0

Exibir arquivo

def loadExperimentData(experimentFile, supportedLibraryPath, library, basePath = '.'):
    """Load and merge the saved output tables of every experiment listed in a config file.

    Each section name of ``experimentFile`` is passed to ``parseExptConfig`` as an
    experiment config. For every experiment the '_librarytable.txt',
    '_genetable.txt' and '_phenotypetable.txt' files are read from
    basePath/<output_folder>/<experiment_name>. Gene and phenotype columns are
    kept only when their first two column levels match one of the section's
    'condition_tuples' entries (one 'first:second' pair per line).

    Args:
        experimentFile: path of the config file read with SafeConfigParser.
        supportedLibraryPath: directory containing 'library_config.txt'.
        library: not used by this function.
        basePath: directory each experiment's output folder is joined onto.

    Returns:
        A tuple of (merged library table de-duplicated on 'gene' and 'sequence',
        merged phenotype table restricted to the de-duplicated library index,
        merged gene table).
    """
    # only the first returned value is needed here; the second is unpacked and ignored
    libDict, _librariesToTables = parseLibraryConfig(os.path.join(supportedLibraryPath, 'library_config.txt'))

    libraryTablesByExpt = dict()
    geneTablesByExpt = dict()
    phenotypeTablesByExpt = dict()

    exptListParser = SafeConfigParser()
    exptListParser.read(experimentFile)

    for sectionName in exptListParser.sections():
        exptSettings = parseExptConfig(sectionName, libDict)[0]
        exptName = exptSettings['experiment_name']
        # common prefix shared by all of this experiment's output tables
        tablePrefix = os.path.join(basePath, exptSettings['output_folder'], exptName)

        libraryTablesByExpt[exptName] = pd.read_csv(
            tablePrefix + '_librarytable.txt', sep='\t', index_col=range(1), header=0)

        exptGeneTable = pd.read_csv(
            tablePrefix + '_genetable.txt', sep='\t', index_col=range(2), header=range(3))
        exptPhenotypeTable = pd.read_csv(
            tablePrefix + '_phenotypetable.txt', sep='\t', index_col=range(1), header=range(2))

        # requested (first, second) column-level pairs for this section
        requestedPairs = set()
        for pairLine in exptListParser.get(sectionName, 'condition_tuples').strip().split('\n'):
            pairFields = pairLine.split(':')
            requestedPairs.add((pairFields[0], pairFields[1]))

        keptGeneCols = [col for col in exptGeneTable.columns
                        if (col[0], col[1]) in requestedPairs]
        keptPhenotypeCols = [col for col in exptPhenotypeTable.columns
                             if (col[0], col[1]) in requestedPairs]

        geneTablesByExpt[exptName] = exptGeneTable.loc[:, keptGeneCols]
        phenotypeTablesByExpt[exptName] = exptPhenotypeTable.loc[:, keptPhenotypeCols]

    # stack library rows from all experiments, then drop repeated gene/sequence pairs
    allLibraryRows = pd.concat(libraryTablesByExpt.values())
    dedupLibraryTable = allLibraryRows.drop_duplicates(['gene', 'sequence'])

    # gene and phenotype tables are joined side by side, keyed by experiment name
    combinedGeneTable = pd.concat(geneTablesByExpt.values(), keys=geneTablesByExpt.keys(), axis = 1)
    combinedPhenotypeTable = pd.concat(phenotypeTablesByExpt.values(), keys=phenotypeTablesByExpt.keys(), axis = 1)
    dedupPhenotypeTable = combinedPhenotypeTable.loc[dedupLibraryTable.index]

    return dedupLibraryTable, dedupPhenotypeTable, combinedGeneTable

Exemplo n.º 2

0

Exibir arquivo

Arquivo: process_experiments.py Projeto: mhorlbeck/ScreenProcessing

def processExperimentsFromConfig(configFile,
                                 libraryDirectory,
                                 generatePlots='png'):
    #load in the supported libraries and sublibraries
    try:
        librariesToSublibraries, librariesToTables = parseLibraryConfig(
            os.path.join(libraryDirectory, defaultLibConfigName))
    except ValueError as err:
        print ' '.join(err.args)
        return

    exptParameters, parseStatus, parseString = parseExptConfig(
        configFile, librariesToSublibraries)

    printNow(parseString)

    if parseStatus > 0:  #Critical errors in parsing
        print 'Exiting due to experiment config file errors\n'
        return

    makeDirectory(exptParameters['output_folder'])
    outbase = os.path.join(exptParameters['output_folder'],
                           exptParameters['experiment_name'])

    if generatePlots != 'off':
        plotDirectory = os.path.join(
            exptParameters['output_folder'],
            exptParameters['experiment_name'] + '_plots')
        makeDirectory(plotDirectory)

        screen_analysis.changeDisplayFigureSettings(
            newDirectory=plotDirectory,
            newImageExtension=generatePlots,
            newPlotWithPylab=False)

    #load in library table and filter to requested sublibraries
    printNow('Accessing library information')

    libraryTable = pd.read_csv(os.path.join(
        libraryDirectory, librariesToTables[exptParameters['library']]),
                               sep='\t',
                               tupleize_cols=False,
                               header=0,
                               index_col=0).sort_index()
    sublibColumn = libraryTable.apply(lambda row: row['sublibrary'].lower() in
                                      exptParameters['sublibraries'],
                                      axis=1)

    if sum(sublibColumn) == 0:
        print 'After limiting analysis to specified sublibraries, no elements are left'
        return

    libraryTable[sublibColumn].to_csv(outbase + '_librarytable.txt',
                                      sep='\t',
                                      tupleize_cols=False)

    #load in counts, create table of total counts in each and each file as a column
    printNow('Loading counts data')

    columnDict = dict()
    for tup in sorted(exptParameters['counts_file_list']):
        if tup in columnDict:
            print 'Asserting that tuples of condition, replicate, and count file should be unique; are the cases where this should not be enforced?'
            raise Exception(
                'condition, replicate, and count file combination already assigned'
            )

        countSeries = readCountsFile(tup[2]).reset_index(
        ).drop_duplicates('id').set_index(
            'id'
        )  #for now also dropping duplicate ids in counts for overlapping linc sublibraries
        countSeries = libraryTable[sublibColumn].align(
            countSeries, axis=0, join='left',
            fill_value=0)[1]  #expand series to fill 0 for every missing entry

        columnDict[tup] = countSeries[
            'counts']  #[sublibColumn] #then shrink series to only desired sublibraries

    # print columnDict
    countsTable = pd.DataFrame(
        columnDict)  #, index=libraryTable[sublibColumn].index)
    countsTable.to_csv(outbase + '_rawcountstable.txt',
                       sep='\t',
                       tupleize_cols=False)
    countsTable.sum().to_csv(outbase + '_rawcountstable_summary.txt', sep='\t')

    #merge counts for same conditions/replicates, and create summary table
    #save scatter plot before each merger, and histogram of counts post mergers
    printNow('Merging experiment counts split across lanes/indexes')

    exptGroups = countsTable.groupby(level=[0, 1], axis=1)
    mergedCountsTable = exptGroups.aggregate(np.sum)
    mergedCountsTable.to_csv(outbase + '_mergedcountstable.txt',
                             sep='\t',
                             tupleize_cols=False)
    mergedCountsTable.sum().to_csv(outbase + '_mergedcountstable_summary.txt',
                                   sep='\t')

    if generatePlots != 'off' and max(exptGroups.count().iloc[0]) > 1:
        printNow('-generating scatter plots of counts pre-merger')

        tempDataDict = {
            'library': libraryTable[sublibColumn],
            'premerged counts': countsTable,
            'counts': mergedCountsTable
        }

        for (phenotype, replicate), countsCols in exptGroups:
            if len(countsCols.columns) == 1:
                continue

            else:
                screen_analysis.premergedCountsScatterMatrix(
                    tempDataDict, phenotype, replicate)

    if generatePlots != 'off':
        printNow('-generating sgRNA read count histograms')

        tempDataDict = {
            'library': libraryTable[sublibColumn],
            'counts': mergedCountsTable
        }

        for (phenotype, replicate), countsCol in mergedCountsTable.iteritems():
            screen_analysis.countsHistogram(tempDataDict, phenotype, replicate)

    #create pairs of columns for each comparison, filter to na, then generate sgRNA phenotype score
    printNow('Computing sgRNA phenotype scores')

    growthValueDict = {(tup[0], tup[1]): tup[2]
                       for tup in exptParameters['growth_value_tuples']}
    phenotypeList = list(set(zip(*exptParameters['condition_tuples'])[0]))
    replicateList = sorted(
        list(set(zip(*exptParameters['counts_file_list'])[1])))

    phenotypeScoreDict = dict()
    for (phenotype, condition1,
         condition2) in exptParameters['condition_tuples']:
        for replicate in replicateList:
            column1 = mergedCountsTable[(condition1, replicate)]
            column2 = mergedCountsTable[(condition2, replicate)]
            filtCols = filterLowCounts(pd.concat(
                (column1, column2), axis=1), exptParameters['filter_type'],
                                       exptParameters['minimum_reads'])

            score = computePhenotypeScore(
                filtCols[(condition1, replicate)],
                filtCols[(condition2, replicate)], libraryTable[sublibColumn],
                growthValueDict[(phenotype, replicate)],
                exptParameters['pseudocount_behavior'],
                exptParameters['pseudocount'])

            phenotypeScoreDict[(phenotype, replicate)] = score

    if generatePlots != 'off':
        tempDataDict = {
            'library': libraryTable[sublibColumn],
            'counts': mergedCountsTable,
            'phenotypes': pd.DataFrame(phenotypeScoreDict)
        }

        printNow('-generating phenotype histograms and scatter plots')

        for (phenotype, condition1,
             condition2) in exptParameters['condition_tuples']:
            for replicate in replicateList:
                screen_analysis.countsScatter(
                    tempDataDict,
                    condition1,
                    replicate,
                    condition2,
                    replicate,
                    colorByPhenotype_condition=phenotype,
                    colorByPhenotype_replicate=replicate)

                screen_analysis.phenotypeHistogram(tempDataDict, phenotype,
                                                   replicate)
                screen_analysis.sgRNAsPassingFilterHist(
                    tempDataDict, phenotype, replicate)

    #scatterplot sgRNAs for all replicates, then average together and add columns to phenotype score table
    if len(replicateList) > 1:
        printNow('Averaging replicates')

        for phenotype in phenotypeList:
            repCols = pd.DataFrame({
                (phen, rep): col
                for (phen, rep), col in phenotypeScoreDict.iteritems()
                if phen == phenotype
            })
            phenotypeScoreDict[(
                phenotype, 'ave_' + '_'.join(replicateList)
            )] = repCols.mean(
                axis=1, skipna=False
            )  #average nan and real to nan; otherwise this could lead to data points with just one rep informing results

    phenotypeTable = pd.DataFrame(phenotypeScoreDict)
    phenotypeTable.to_csv(outbase + '_phenotypetable.txt',
                          sep='\t',
                          tupleize_cols=False)

    if len(replicateList) > 1 and generatePlots != 'off':
        tempDataDict = {
            'library': libraryTable[sublibColumn],
            'phenotypes': phenotypeTable
        }

        printNow(
            '-generating replicate phenotype histograms and scatter plots')

        for phenotype, phengroup in phenotypeTable.groupby(level=0, axis=1):
            for i, ((p, rep1), col1) in enumerate(phengroup.iteritems()):
                if rep1[:4] == 'ave_':
                    screen_analysis.phenotypeHistogram(tempDataDict, phenotype,
                                                       rep1)

                for j, ((p, rep2), col2) in enumerate(phengroup.iteritems()):
                    if rep2[:4] == 'ave_' or j <= i:
                        continue

                    else:
                        screen_analysis.phenotypeScatter(
                            tempDataDict, phenotype, rep1, phenotype, rep2)

    #generate pseudogenes
    negTable = phenotypeTable.loc[libraryTable[sublibColumn].loc[:, 'gene'] ==
                                  'negative_control', :]

    if exptParameters['generate_pseudogene_dist'] != 'off' and len(
            exptParameters['analyses']) > 0:
        print 'Generating a pseudogene distribution from negative controls'
        sys.stdout.flush()

        pseudoTableList = []
        pseudoLibTables = []
        negValues = negTable.values
        negColumns = negTable.columns

        if exptParameters['generate_pseudogene_dist'].lower() == 'manual':
            for pseudogene in range(exptParameters['num_pseudogenes']):
                randIndices = np.random.randint(
                    0, len(negTable), exptParameters['pseudogene_size'])
                pseudoTable = negValues[randIndices, :]
                pseudoIndex = [
                    'pseudo_%d_%d' % (pseudogene, i)
                    for i in range(exptParameters['pseudogene_size'])
                ]
                pseudoSeqs = [
                    'seq_%d_%d' % (pseudogene, i)
                    for i in range(exptParameters['pseudogene_size'])
                ]  #so pseudogenes aren't treated as duplicates
                pseudoTableList.append(
                    pd.DataFrame(pseudoTable,
                                 index=pseudoIndex,
                                 columns=negColumns))
                pseudoLib = pd.DataFrame(
                    {
                        'gene': ['pseudo_%d' % pseudogene] *
                        exptParameters['pseudogene_size'],
                        'transcripts':
                        ['na'] * exptParameters['pseudogene_size'],
                        'sequence':
                        pseudoSeqs
                    },
                    index=pseudoIndex)
                pseudoLibTables.append(pseudoLib)

        elif exptParameters['generate_pseudogene_dist'].lower() == 'auto':
            for pseudogene, (gene, group) in enumerate(
                    libraryTable[sublibColumn].drop_duplicates(
                        ['gene', 'sequence']).groupby('gene')):
                if gene == 'negative_control':
                    continue
                for transcript, (transcriptName, transcriptGroup) in enumerate(
                        group.groupby('transcripts')):
                    randIndices = np.random.randint(0, len(negTable),
                                                    len(transcriptGroup))
                    pseudoTable = negValues[randIndices, :]
                    pseudoIndex = [
                        'pseudo_%d_%d_%d' % (pseudogene, transcript, i)
                        for i in range(len(transcriptGroup))
                    ]
                    pseudoSeqs = [
                        'seq_%d_%d_%d' % (pseudogene, transcript, i)
                        for i in range(len(transcriptGroup))
                    ]
                    pseudoTableList.append(
                        pd.DataFrame(pseudoTable,
                                     index=pseudoIndex,
                                     columns=negColumns))
                    pseudoLib = pd.DataFrame(
                        {
                            'gene':
                            ['pseudo_%d' % pseudogene] * len(transcriptGroup),
                            'transcripts':
                            ['pseudo_transcript_%d' % transcript] *
                            len(transcriptGroup),
                            'sequence':
                            pseudoSeqs
                        },
                        index=pseudoIndex)
                    pseudoLibTables.append(pseudoLib)

        else:
            print 'generate_pseudogene_dist parameter not recognized, defaulting to off'

        phenotypeTable = phenotypeTable.append(pd.concat(pseudoTableList))
        libraryTableGeneAnalysis = libraryTable[sublibColumn].append(
            pd.concat(pseudoLibTables))
    else:
        libraryTableGeneAnalysis = libraryTable[sublibColumn]

    #compute gene scores for replicates, averaged reps, and pseudogenes
    if len(exptParameters['analyses']) > 0:
        print 'Computing gene scores'
        sys.stdout.flush()

        phenotypeTable_deduplicated = phenotypeTable.loc[
            libraryTableGeneAnalysis.drop_duplicates(['gene',
                                                      'sequence']).index]
        if exptParameters['collapse_to_transcripts'] == True:
            geneGroups = phenotypeTable_deduplicated.loc[
                libraryTableGeneAnalysis.
                loc[:, 'gene'] != 'negative_control', :].groupby([
                    libraryTableGeneAnalysis['gene'],
                    libraryTableGeneAnalysis['transcripts']
                ])
        else:
            geneGroups = phenotypeTable_deduplicated.loc[
                libraryTableGeneAnalysis.
                loc[:, 'gene'] != 'negative_control', :].groupby(
                    libraryTableGeneAnalysis['gene'])

        analysisTables = []
        for analysis in exptParameters['analyses']:
            print '--' + analysis
            sys.stdout.flush()

            analysisTables.append(
                applyGeneScoreFunction(geneGroups, negTable, analysis,
                                       exptParameters['analyses'][analysis]))

        geneTable = pd.concat(analysisTables,
                              axis=1).reorder_levels([1, 2, 0],
                                                     axis=1).sort_index(axis=1)
        geneTable.to_csv(outbase + '_genetable.txt',
                         sep='\t',
                         tupleize_cols=False)

        ### collapse the gene-transcript indices into a single score for a gene by best MW p-value, where applicable
        if exptParameters[
                'collapse_to_transcripts'] == True and 'calculate_mw' in exptParameters[
                    'analyses']:
            print 'Collapsing transcript scores to gene scores'
            sys.stdout.flush()

            geneTableCollapsed = scoreGeneByBestTranscript(geneTable)
            geneTableCollapsed.to_csv(outbase + '_genetable_collapsed.txt',
                                      sep='\t',
                                      tupleize_cols=False)

    if generatePlots != 'off':
        if 'calculate_ave' in exptParameters[
                'analyses'] and 'calculate_mw' in exptParameters['analyses']:
            tempDataDict = {
                'library':
                libraryTable[sublibColumn],
                'gene scores':
                geneTableCollapsed
                if exptParameters['collapse_to_transcripts'] else geneTable
            }

            for (phenotype,
                 replicate), gtable in geneTableCollapsed.groupby(level=[0, 1],
                                                                  axis=1):
                if len(
                        replicateList
                ) == 1 or replicate[:4] == 'ave_':  #just plot averaged reps where available
                    screen_analysis.volcanoPlot(tempDataDict,
                                                phenotype,
                                                replicate,
                                                labelHits=True)

    print 'Done!'

Exemplo n.º 3

0

Exibir arquivo

Arquivo: process_experiments.py Projeto: mhorlbeck/ScreenProcessing

def processExperimentsFromConfig(configFile, libraryDirectory, generatePlots='png'):
    #load in the supported libraries and sublibraries
    try:
        librariesToSublibraries, librariesToTables = parseLibraryConfig(os.path.join(libraryDirectory, defaultLibConfigName))
    except ValueError as err:
        print ' '.join(err.args)
        return

    exptParameters, parseStatus, parseString = parseExptConfig(configFile, librariesToSublibraries)

    printNow(parseString)

    if parseStatus > 0: #Critical errors in parsing
        print 'Exiting due to experiment config file errors\n'
        return

    makeDirectory(exptParameters['output_folder'])
    outbase = os.path.join(exptParameters['output_folder'],exptParameters['experiment_name'])
    
    if generatePlots != 'off':
        plotDirectory = os.path.join(exptParameters['output_folder'],exptParameters['experiment_name'] + '_plots')
        makeDirectory(plotDirectory)
    
        screen_analysis.changeDisplayFigureSettings(newDirectory=plotDirectory, newImageExtension = generatePlots, newPlotWithPylab = False)
    

    #load in library table and filter to requested sublibraries
    printNow('Accessing library information')

    libraryTable = pd.read_csv(os.path.join(libraryDirectory, librariesToTables[exptParameters['library']]), sep = '\t', tupleize_cols=False, header=0, index_col=0).sort_index()
    sublibColumn = libraryTable.apply(lambda row: row['sublibrary'].lower() in exptParameters['sublibraries'], axis=1)

    if sum(sublibColumn) == 0:
        print 'After limiting analysis to specified sublibraries, no elements are left'
        return

    libraryTable[sublibColumn].to_csv(outbase + '_librarytable.txt', sep='\t', tupleize_cols = False)

    #load in counts, create table of total counts in each and each file as a column
    printNow('Loading counts data')

    columnDict = dict()
    for tup in sorted(exptParameters['counts_file_list']):
        if tup in columnDict:
            print 'Asserting that tuples of condition, replicate, and count file should be unique; are the cases where this should not be enforced?'
            raise Exception('condition, replicate, and count file combination already assigned')
        
        countSeries = readCountsFile(tup[2]).reset_index().drop_duplicates('id').set_index('id') #for now also dropping duplicate ids in counts for overlapping linc sublibraries
        countSeries = libraryTable[sublibColumn].align(countSeries, axis=0, join='left', fill_value=0)[1] #expand series to fill 0 for every missing entry

        columnDict[tup] = countSeries['counts'] #[sublibColumn] #then shrink series to only desired sublibraries

    # print columnDict
    countsTable = pd.DataFrame(columnDict)#, index=libraryTable[sublibColumn].index)
    countsTable.to_csv(outbase + '_rawcountstable.txt', sep='\t', tupleize_cols = False)
    countsTable.sum().to_csv(outbase + '_rawcountstable_summary.txt', sep='\t')

    #merge counts for same conditions/replicates, and create summary table
    #save scatter plot before each merger, and histogram of counts post mergers
    printNow('Merging experiment counts split across lanes/indexes')
    
    exptGroups = countsTable.groupby(level=[0,1], axis=1)
    mergedCountsTable = exptGroups.aggregate(np.sum)
    mergedCountsTable.to_csv(outbase + '_mergedcountstable.txt', sep='\t', tupleize_cols = False)
    mergedCountsTable.sum().to_csv(outbase + '_mergedcountstable_summary.txt', sep='\t')
    
    if generatePlots != 'off' and max(exptGroups.count().iloc[0]) > 1:
        printNow('-generating scatter plots of counts pre-merger')
    
        tempDataDict = {'library': libraryTable[sublibColumn],
                        'premerged counts': countsTable,
                       'counts': mergedCountsTable}

        for (phenotype, replicate), countsCols in exptGroups:
            if len(countsCols.columns) == 1:
                continue
            
            else:
                screen_analysis.premergedCountsScatterMatrix(tempDataDict, phenotype, replicate)

    if generatePlots != 'off':
        printNow('-generating sgRNA read count histograms')
    
        tempDataDict = {'library': libraryTable[sublibColumn],
                        'counts': mergedCountsTable}
                    
        for (phenotype, replicate), countsCol in mergedCountsTable.iteritems():
            screen_analysis.countsHistogram(tempDataDict, phenotype, replicate)
    
    #create pairs of columns for each comparison, filter to na, then generate sgRNA phenotype score
    printNow('Computing sgRNA phenotype scores')

    growthValueDict = {(tup[0],tup[1]):tup[2] for tup in exptParameters['growth_value_tuples']}
    phenotypeList = list(set(zip(*exptParameters['condition_tuples'])[0]))
    replicateList = sorted(list(set(zip(*exptParameters['counts_file_list'])[1])))

    phenotypeScoreDict = dict()
    for (phenotype, condition1, condition2) in exptParameters['condition_tuples']:
        for replicate in replicateList:
            column1 = mergedCountsTable[(condition1,replicate)]
            column2 = mergedCountsTable[(condition2,replicate)]
            filtCols = filterLowCounts(pd.concat((column1, column2), axis = 1), exptParameters['filter_type'], exptParameters['minimum_reads'])
            

            score = computePhenotypeScore(filtCols[(condition1, replicate)], filtCols[(condition2,replicate)], 
                libraryTable[sublibColumn], growthValueDict[(phenotype,replicate)], 
                exptParameters['pseudocount_behavior'], exptParameters['pseudocount'])

            phenotypeScoreDict[(phenotype,replicate)] = score
    
    if generatePlots  != 'off':
        tempDataDict = {'library': libraryTable[sublibColumn],
                        'counts': mergedCountsTable,
                        'phenotypes': pd.DataFrame(phenotypeScoreDict)}
                        
        printNow('-generating phenotype histograms and scatter plots')
        
        for (phenotype, condition1, condition2) in exptParameters['condition_tuples']:
            for replicate in replicateList:
                screen_analysis.countsScatter(tempDataDict, condition1, replicate, condition2, replicate, 
                    colorByPhenotype_condition = phenotype, colorByPhenotype_replicate = replicate)
                    
                screen_analysis.phenotypeHistogram(tempDataDict, phenotype, replicate)
                screen_analysis.sgRNAsPassingFilterHist(tempDataDict, phenotype, replicate)
    
    #scatterplot sgRNAs for all replicates, then average together and add columns to phenotype score table
    if len(replicateList) > 1:
        printNow('Averaging replicates')

        for phenotype in phenotypeList:
            repCols = pd.DataFrame({(phen,rep):col for (phen,rep), col in phenotypeScoreDict.iteritems() if phen == phenotype})
            phenotypeScoreDict[(phenotype,'ave_' + '_'.join(replicateList))] = repCols.mean(axis=1,skipna=False) #average nan and real to nan; otherwise this could lead to data points with just one rep informing results

    phenotypeTable = pd.DataFrame(phenotypeScoreDict)
    phenotypeTable.to_csv(outbase + '_phenotypetable.txt', sep='\t', tupleize_cols = False)

    if len(replicateList) > 1 and generatePlots != 'off':
        tempDataDict = {'library': libraryTable[sublibColumn],
                        'phenotypes': phenotypeTable}
                    
        printNow('-generating replicate phenotype histograms and scatter plots')
    
        for phenotype, phengroup in phenotypeTable.groupby(level=0, axis=1):
            for i, ((p, rep1), col1) in enumerate(phengroup.iteritems()):
                if rep1[:4] == 'ave_':
                    screen_analysis.phenotypeHistogram(tempDataDict, phenotype, rep1)
            
                for j, ((p, rep2), col2) in enumerate(phengroup.iteritems()):
                    if rep2[:4] == 'ave_' or j<=i:
                        continue
                    
                    else:
                        screen_analysis.phenotypeScatter(tempDataDict, phenotype, rep1, phenotype, rep2)                    
                

    #generate pseudogenes
    negTable = phenotypeTable.loc[libraryTable[sublibColumn].loc[:,'gene'] == 'negative_control',:]

    if exptParameters['generate_pseudogene_dist'] != 'off' and len(exptParameters['analyses']) > 0:
        print 'Generating a pseudogene distribution from negative controls'
        sys.stdout.flush()

        pseudoTableList = []
        pseudoLibTables = []
        negValues = negTable.values
        negColumns = negTable.columns

        if exptParameters['generate_pseudogene_dist'].lower() == 'manual':
            for pseudogene in range(exptParameters['num_pseudogenes']):
                randIndices = np.random.randint(0, len(negTable), exptParameters['pseudogene_size'])
                pseudoTable = negValues[randIndices,:]
                pseudoIndex = ['pseudo_%d_%d' % (pseudogene,i) for i in range(exptParameters['pseudogene_size'])]
                pseudoSeqs = ['seq_%d_%d' % (pseudogene,i) for i in range(exptParameters['pseudogene_size'])] #so pseudogenes aren't treated as duplicates
                pseudoTableList.append(pd.DataFrame(pseudoTable,index=pseudoIndex,columns=negColumns))
                pseudoLib = pd.DataFrame({'gene':['pseudo_%d'%pseudogene]*exptParameters['pseudogene_size'],
                    'transcripts':['na']*exptParameters['pseudogene_size'],
                    'sequence':pseudoSeqs},index=pseudoIndex)
                pseudoLibTables.append(pseudoLib)

        elif exptParameters['generate_pseudogene_dist'].lower() == 'auto':
            for pseudogene, (gene, group) in enumerate(libraryTable[sublibColumn].drop_duplicates(['gene','sequence']).groupby('gene')):
                if gene == 'negative_control':
                    continue 
                for transcript, (transcriptName, transcriptGroup) in enumerate(group.groupby('transcripts')):
                    randIndices = np.random.randint(0, len(negTable), len(transcriptGroup))
                    pseudoTable = negValues[randIndices,:]
                    pseudoIndex = ['pseudo_%d_%d_%d' % (pseudogene, transcript, i) for i in range(len(transcriptGroup))]
                    pseudoSeqs = ['seq_%d_%d_%d' % (pseudogene, transcript, i) for i in range(len(transcriptGroup))]
                    pseudoTableList.append(pd.DataFrame(pseudoTable,index=pseudoIndex,columns=negColumns))
                    pseudoLib = pd.DataFrame({'gene':['pseudo_%d'%pseudogene]*len(transcriptGroup),
                        'transcripts':['pseudo_transcript_%d'%transcript]*len(transcriptGroup),
                        'sequence':pseudoSeqs},index=pseudoIndex)
                    pseudoLibTables.append(pseudoLib)

        else:
            print 'generate_pseudogene_dist parameter not recognized, defaulting to off'

        phenotypeTable = phenotypeTable.append(pd.concat(pseudoTableList))
        libraryTableGeneAnalysis = libraryTable[sublibColumn].append(pd.concat(pseudoLibTables))
    else:
        libraryTableGeneAnalysis = libraryTable[sublibColumn]

    #compute gene scores for replicates, averaged reps, and pseudogenes
    if len(exptParameters['analyses']) > 0:
        print 'Computing gene scores'
        sys.stdout.flush()

        phenotypeTable_deduplicated = phenotypeTable.loc[libraryTableGeneAnalysis.drop_duplicates(['gene','sequence']).index]
        if exptParameters['collapse_to_transcripts'] == True:
            geneGroups = phenotypeTable_deduplicated.loc[libraryTableGeneAnalysis.loc[:,'gene'] != 'negative_control',:].groupby([libraryTableGeneAnalysis['gene'],libraryTableGeneAnalysis['transcripts']])
        else:
            geneGroups = phenotypeTable_deduplicated.loc[libraryTableGeneAnalysis.loc[:,'gene'] != 'negative_control',:].groupby(libraryTableGeneAnalysis['gene'])

        analysisTables = []
        for analysis in exptParameters['analyses']:
            print '--' + analysis
            sys.stdout.flush()

            analysisTables.append(applyGeneScoreFunction(geneGroups, negTable, analysis, exptParameters['analyses'][analysis]))

        geneTable = pd.concat(analysisTables, axis=1).reorder_levels([1,2,0],axis=1).sort_index(axis=1)
        geneTable.to_csv(outbase + '_genetable.txt',sep='\t', tupleize_cols = False)

        ### collapse the gene-transcript indices into a single score for a gene by best MW p-value, where applicable
        if exptParameters['collapse_to_transcripts'] == True and 'calculate_mw' in exptParameters['analyses']:
            print 'Collapsing transcript scores to gene scores'
            sys.stdout.flush()

            geneTableCollapsed = scoreGeneByBestTranscript(geneTable)
            geneTableCollapsed.to_csv(outbase + '_genetable_collapsed.txt',sep='\t', tupleize_cols = False)
    
    if generatePlots != 'off':
        if 'calculate_ave' in exptParameters['analyses'] and 'calculate_mw' in exptParameters['analyses']:
            tempDataDict = {'library': libraryTable[sublibColumn],
                            'gene scores': geneTableCollapsed if exptParameters['collapse_to_transcripts'] else geneTable}
                            
            for (phenotype, replicate), gtable in geneTableCollapsed.groupby(level=[0,1], axis=1):
                if len(replicateList) == 1 or replicate[:4] == 'ave_': #just plot averaged reps where available
                    screen_analysis.volcanoPlot(tempDataDict, phenotype, replicate, labelHits=True)

    print 'Done!'

Exemplo n.º 4

0

Exibir arquivo

Arquivo: process_experiments.py Projeto: Minzhe/ScreenProcessing

def processExperimentsFromConfig(configFile, libraryDirectory):
    """Run the screen-processing pipeline described by an experiment config file.

    The steps, in order, are: parse the library and experiment configs, load the
    library table and per-sample counts, merge counts that share a
    (condition, replicate) pair, compute sgRNA phenotype scores, optionally
    average replicates, optionally build a pseudogene distribution from the
    negative controls, and finally compute gene-level scores.

    Every intermediate table is written as a tab-separated file whose path is
    ``os.path.join(output_folder, experiment_name)`` plus a suffix
    (``_librarytable.txt``, ``_rawcountstable.txt``, ``_mergedcountstable.txt``,
    ``_phenotypetable.txt``, ``_genetable.txt``, ...), and diagnostic plots are
    written as ``.svg`` files with the same prefix.

    Parameters:
        configFile: path of the experiment config, handed to ``parseExptConfig``.
        libraryDirectory: directory holding the library config file
            (``defaultLibConfigName``) and the library tables it refers to.

    Returns:
        None. If ``parseExptConfig`` reports a status greater than 0 the
        function prints a message and returns before writing anything.

    Raises:
        Exception: if the same (condition, replicate, counts file) tuple is
            encountered twice while loading counts.
    """
    #load in the supported libraries and sublibraries
    librariesToSublibraries, librariesToTables = parseLibraryConfig(os.path.join(libraryDirectory, defaultLibConfigName))

    exptParameters, parseStatus, parseString = parseExptConfig(configFile, librariesToSublibraries)

    print(parseString)
    sys.stdout.flush()

    if parseStatus > 0: #Critical errors in parsing
        print('Exiting due to parsing errors\n')
        return

    #common path prefix for every output file of this experiment
    outbase = os.path.join(exptParameters['output_folder'],exptParameters['experiment_name'])

    #load in library table and filter to requested sublibraries
    print('Accessing library information')
    sys.stdout.flush()

    libraryTable = pd.read_csv(os.path.join(libraryDirectory, librariesToTables[exptParameters['library']]), sep = '\t', tupleize_cols=False, header=0, index_col=0).sort_index()
    #boolean mask over library rows: True where the row belongs to a requested sublibrary
    sublibColumn = libraryTable.apply(lambda row: row['sublibrary'] in exptParameters['sublibraries'], axis=1)

    libraryTable[sublibColumn].to_csv(outbase + '_librarytable.txt', sep='\t', tupleize_cols = False)

    #load in counts, create table of total counts in each and each file as a column
    print('Loading counts data')
    sys.stdout.flush()

    columnDict = dict()
    for tup in sorted(exptParameters['counts_file_list']):
        if tup in columnDict:
            print('Asserting that tuples of condition, replicate, and count file should be unique; are the cases where this should not be enforced?')
            raise Exception('condition, replicate, and count file combination already assigned')
        
        countSeries = readCountsFile(tup[2]).reset_index().drop_duplicates('id').set_index('id') #for now also dropping duplicate ids in counts for overlapping linc sublibraries
        countSeries = libraryTable[sublibColumn].align(countSeries, axis=0, join='left', fill_value=0)[1] #expand series to fill 0 for every missing entry

        columnDict[tup] = countSeries['counts'] #[sublibColumn] #then shrink series to only desired sublibraries

    # print columnDict
    countsTable = pd.DataFrame(columnDict)#, index=libraryTable[sublibColumn].index)
    countsTable.to_csv(outbase + '_rawcountstable.txt', sep='\t', tupleize_cols = False)
    countsTable.sum().to_csv(outbase + '_rawcountstable_summary.txt', sep='\t')

    #merge counts for same conditions/replicates, and create summary table
    #save scatter plot before each merger, and histogram of counts post mergers
    print('Merging experiment counts split across lanes/indexes')
    print('Generating scatter plots of counts pre-merger and histograms of counts post-merger')
    sys.stdout.flush()

    exptGroups = countsTable.groupby(level=[0,1], axis=1)
    for (condition, replicate), countsCols in exptGroups:
        if len(countsCols.columns) == 1:
            continue #nothing to compare when only one counts file feeds this group

        for i, col1 in enumerate(countsCols):
            for j, col2 in enumerate(countsCols):
                if j > i: #enforce that each pair is compared once
                    rasteredScatter(countsCols[col1],countsCols[col2],'\n'.join(col1),
                        '\n'.join(col2),outbase + '_premergescatter_%s_%s_%dv%d.svg' % (condition,replicate,i,j))

    mergedCountsTable = exptGroups.aggregate(np.sum)
    mergedCountsTable.to_csv(outbase + '_mergedcountstable.txt', sep='\t', tupleize_cols = False)
    mergedCountsTable.sum().to_csv(outbase + '_mergedcountstable_summary.txt', sep='\t')

    for col in mergedCountsTable:
        #name each file after this column's own (condition, replicate) so the
        #histograms do not overwrite one another
        generateHistogram(mergedCountsTable[col],', '.join(col),outbase + '_postmergehist_%s_%s.svg' % (col[0],col[1]))

    #create pairs of columns for each comparison, filter to na, then generate sgRNA phenotype score
    print('Computing sgRNA phenotype scores')
    sys.stdout.flush()

    growthValueDict = {(tup[0],tup[1]):tup[2] for tup in exptParameters['growth_value_tuples']}
    phenotypeList = list(set(zip(*exptParameters['condition_tuples'])[0]))
    replicateList = list(set(zip(*exptParameters['counts_file_list'])[1]))

    phenotypeScoreDict = dict()
    for (phenotype, condition1, condition2) in exptParameters['condition_tuples']:
        for replicate in replicateList:
            column1 = mergedCountsTable[(condition1,replicate)]
            column2 = mergedCountsTable[(condition2,replicate)]
            filtCols = filterLowCounts(pd.concat((column1, column2), axis = 1), exptParameters['filter_type'], exptParameters['minimum_reads'])
            

            score = computePhenotypeScore(filtCols[(condition1, replicate)], filtCols[(condition2,replicate)], 
                libraryTable[sublibColumn], growthValueDict[(phenotype,replicate)], 
                exptParameters['pseudocount_behavior'], exptParameters['pseudocount'])

            phenotypeScoreDict[(phenotype,replicate)] = score

    
    #scatterplot sgRNAs for all replicates, then average together and add columns to phenotype score table
    if len(replicateList) > 1:
        print('Plotting and averaging replicates')
        sys.stdout.flush()

        for phenotype in phenotypeList:
            for i, rep1 in enumerate(replicateList):
                for j, rep2 in enumerate(replicateList):
                    if j > i: #each replicate pair is plotted once
                        #file name is keyed on the phenotype being plotted so that
                        #plots for different phenotypes land in different files
                        rasteredScatter(phenotypeScoreDict[(phenotype,rep1)],phenotypeScoreDict[(phenotype,rep2)],
                            ', '.join((phenotype,rep1)), ', '.join((phenotype,rep2)),
                            outbase + '_phenotypescatter_%s_%sv%s.svg' % (phenotype,rep1,rep2))

            repCols = pd.DataFrame({(phen,rep):col for (phen,rep), col in phenotypeScoreDict.iteritems() if phen == phenotype})
            phenotypeScoreDict[(phenotype,'ave_' + '_'.join(replicateList))] = repCols.mean(axis=1,skipna=False) #average nan and real to nan; otherwise this could lead to data points with just one rep informing results

    phenotypeTable = pd.DataFrame(phenotypeScoreDict)
    phenotypeTable.to_csv(outbase + '_phenotypetable.txt', sep='\t', tupleize_cols = False)

    #generate pseudogenes
    negTable = phenotypeTable.loc[libraryTable[sublibColumn].loc[:,'gene'] == 'negative_control',:]

    if exptParameters['generate_pseudogene_dist'] == True and len(exptParameters['analyses']) > 0:
        print('Generating a pseudogene distribution from negative controls')
        sys.stdout.flush()

        pseudoTableList = []
        pseudoLibTables = []
        negValues = negTable.values
        negColumns = negTable.columns
        for pseudogene in range(exptParameters['num_pseudogenes']):
            #draw negative-control rows at random (np.random.randint, so with replacement)
            randIndices = np.random.randint(0, len(negTable), exptParameters['pseudogene_size'])
            pseudoTable = negValues[randIndices,:]
            pseudoIndex = ['pseudo_%d_%d' % (pseudogene,i) for i in range(exptParameters['pseudogene_size'])]
            pseudoSeqs = ['seq_%d_%d' % (pseudogene,i) for i in range(exptParameters['pseudogene_size'])] #so pseudogenes aren't treated as duplicates
            pseudoTableList.append(pd.DataFrame(pseudoTable,index=pseudoIndex,columns=negColumns))
            pseudoLib = pd.DataFrame({'gene':['pseudo_%d'%pseudogene]*exptParameters['pseudogene_size'],
                'transcripts':['na']*exptParameters['pseudogene_size'],
                'sequence':pseudoSeqs},index=pseudoIndex)
            pseudoLibTables.append(pseudoLib)

        #append the pseudogene rows to both the scores and the matching library annotations
        phenotypeTable = phenotypeTable.append(pd.concat(pseudoTableList))
        libraryTableGeneAnalysis = libraryTable[sublibColumn].append(pd.concat(pseudoLibTables))
    else:
        libraryTableGeneAnalysis = libraryTable[sublibColumn]

    #return phenotypeTable, libraryTable

    #compute gene scores for replicates, averaged reps, and pseudogenes
    if len(exptParameters['analyses']) > 0:
        print('Computing gene scores')
        sys.stdout.flush()

        #keep one row per (gene, sequence) pair before grouping
        phenotypeTable_deduplicated = phenotypeTable.loc[libraryTableGeneAnalysis.drop_duplicates(['gene','sequence']).index]
        if exptParameters['collapse_to_transcripts'] == True:
            geneGroups = phenotypeTable_deduplicated.loc[libraryTableGeneAnalysis.loc[:,'gene'] != 'negative_control',:].groupby([libraryTableGeneAnalysis['gene'],libraryTableGeneAnalysis['transcripts']])
        else:
            geneGroups = phenotypeTable_deduplicated.loc[libraryTableGeneAnalysis.loc[:,'gene'] != 'negative_control',:].groupby(libraryTableGeneAnalysis['gene'])

        analysisTables = []
        for analysis in exptParameters['analyses']:
            print('--' + analysis)
            sys.stdout.flush()

            analysisTables.append(applyGeneScoreFunction(geneGroups, negTable, analysis, exptParameters['analyses'][analysis]))

        geneTable = pd.concat(analysisTables, axis=1).reorder_levels([1,2,0],axis=1).sort_index(axis=1)
        geneTable.to_csv(outbase + '_genetable.txt',sep='\t', tupleize_cols = False)

        ### collapse the gene-transcript indices into a single score for a gene by best MW p-value, where applicable
        if exptParameters['collapse_to_transcripts'] == True and 'calculate_mw' in exptParameters['analyses']:
            print('Collapsing transcript scores to gene scores')
            sys.stdout.flush()

            geneTableCollapsed = scoreGeneByBestTranscript(geneTable)
            geneTableCollapsed.to_csv(outbase + '_genetable_collapsed.txt',sep='\t', tupleize_cols = False)


    #generate summary graphs depending on which analyses were selected


    print('Done!')