Пример #1
0
    def readDiffRegs(self, args):
        """Validate, then parse, every file listed in ``args.diffreg``.

        All entries are checked with ``fileExists`` before anything is parsed,
        so a missing file is reported before any parsing work starts.  The
        parsed frames are not stored or returned.

        :param args: parsed arguments; ``args.diffreg`` is iterated twice.
        :raises PSToolException: for the first entry ``fileExists`` rejects.
        """
        # Pass 1: stop at the first input that does not exist.
        for candidate in args.diffreg:
            if fileExists(candidate):
                continue
            raise PSToolException("Diffreg file does not exist: " + str(candidate))

        # Pass 2: parse each file; the result is discarded.
        for diffPath in args.diffreg:
            EnrichmentDF.parseFromFile(diffPath)
Пример #2
0
    def makeResults(self, parallelResult, oEnvironment, args):
        """Plot, per method, a clustered heat map of mean absolute log2 fold changes.

        Every file in ``args.diffreg`` is parsed into an ``EnrichmentDF`` and
        its conditions (``getConditions()``) form the key of the comparison.
        For each method in ``args.methods`` the ``<method>_log2FC`` column is
        read, entries that are ``None`` or the string ``'None'`` are dropped,
        and the mean of the absolute values is stored for that comparison.
        The means are written symmetrically into a conditions x conditions
        matrix (cells without a comparison stay 0) which is passed to
        ``PorePlot.heat_map_cluster``.

        A method without any usable fold change in a file is skipped for that
        file instead of failing with a division by zero.

        :param parallelResult: unused.
        :param oEnvironment: unused.
        :param args: parsed arguments; reads ``diffreg``, ``methods`` and
            ``pltcfg``.
        """

        allDiffRegSims = defaultdict(dict)
        conditions = set()

        for diffRegFile in args.diffreg:

            thisData = EnrichmentDF(DataFrame.parseFromFile(diffRegFile))
            condPair = tuple(thisData.getConditions())
            conditions.update(condPair)

            for method in args.methods:

                methodFCs = [
                    abs(float(x))
                    for x in thisData.getColumn(method + "_log2FC")
                    if x is not None and x != 'None'
                ]

                if not methodFCs:
                    # The mean of nothing is undefined; leave the matrix cell
                    # at its default instead of dividing by zero.
                    print("No log2FC values for method " + str(method) +
                          " in " + str(diffRegFile) + ", skipping")
                    continue

                allDiffRegSims[method][condPair] = sum(methodFCs) / len(methodFCs)

        allConditions = sorted(conditions)
        # Position of every condition on both matrix axes.
        condPos = {cond: idx for idx, cond in enumerate(allConditions)}

        for method in allDiffRegSims:

            sims = np.zeros((len(allConditions), len(allConditions)))

            for condPair, average in allDiffRegSims[method].items():
                rowIdx = condPos[condPair[0]]
                colIdx = condPos[condPair[1]]

                # The comparison is undirected, so mirror the value.
                sims[rowIdx, colIdx] = average
                sims[colIdx, rowIdx] = average

            PorePlot.heat_map_cluster(sims, allConditions, allConditions, "Similarity: " + str(method), "", pltcfg=args.pltcfg)
Пример #3
0
    def __init__(self, args):
        """Forward ``args`` to the parent initialiser and reset per-run state."""
        super(FoldChangeAnalysis, self).__init__(args)

        # Fresh, empty enrichment frame; ``counts`` stays unset.
        self.condData = EnrichmentDF()
        self.counts = None
Пример #4
0
class FoldChangeAnalysis(ParallelPSTInterface):
    """Fold-change / differential-expression tool built on ``EnrichmentDF``.

    The parallel hooks (``prepareInputs``, ``execParallel``, ``joinParallel``)
    do nothing; all work happens in ``makeResults``.
    """

    def __init__(self, args):
        """Initialise the parent interface and start with empty state."""

        super(FoldChangeAnalysis, self).__init__(args)

        self.counts = None
        self.condData = EnrichmentDF()

    def _makePropDict(self):
        """Always returns ``None``."""

        return None

    def readCounts(self, args):
        """Load the count files of every group in ``args.counts``.

        Each entry of ``args.counts`` is an iterable of objects exposing a
        ``.name`` attribute.  A group is keyed by the name of its first file.

        :param args: parsed arguments; reads ``args.counts``.
        :return: plain ``dict`` mapping group name to the list of parsed data
            frames, each with its file path set to the absolute path.
        """

        counts = {}
        for countFiles in args.counts:

            groupName = None

            for countFile in countFiles:

                if groupName is None:
                    # The first file of a group names the whole group.
                    groupName = countFile.name
                    counts[groupName] = []

                df = DataFrame.parseFromFile(countFile.name, [
                    'gene', 'coverage', 'coverage_rank', 'read_counts',
                    'read_counts_rank', 'read_counts_sec',
                    'read_counts_sec_rank'
                ])

                df.setFilepath(os.path.abspath(countFile.name))
                counts[groupName].append(df)

        return counts

    def readDiffRegs(self, args):
        """Parse every file in ``args.diffreg``; the results are discarded."""

        # TODO what did I want to do with this argument?
        for diffFile in args.diffreg:
            EnrichmentDF.parseFromFile(diffFile)

    def prepareInputs(self, args):
        """Return an empty list: there is nothing to distribute."""
        return []

    def execParallel(self, data, environment):
        """No-op parallel step; always returns ``None``."""

        return None

    def joinParallel(self, existResult, newResult, oEnvironment):
        """No-op join step; always returns ``None``."""

        return None

    def makeResults(self, parallelResult, oEnvironment, args):
        """Run the analysis on count files and/or report existing result files.

        Two independent modes; both may run in one call:

        * ``args.counts`` is given: the files are loaded with ``readCounts``.
          For each value source (``'coverage'``, then ``'read_counts'``) a
          fresh ``EnrichmentDF`` receives one condition per sample and
          ``runDEanalysis`` is executed on it.
        * ``args.diffreg`` is given: each file is parsed and grouped by the
          value returned from ``getValueSource``.

        In both modes the collected comparisons go to ``prepareHTMLOut``.

        :param parallelResult: unused.
        :param oEnvironment: unused.
        :param args: parsed arguments; reads ``counts``, ``diffreg``,
            ``output``, ``rscript``, ``methods`` and ``noanalysis``.
        """

        if args.counts is not None:
            # dict: group name -> list of per-sample data frames
            counts = self.readCounts(args)

            vConds = sorted(counts)

            createdComparisons = defaultdict(list)

            for valueSource in ['coverage', 'read_counts']:
                self.condData = EnrichmentDF()

                replicates = {}

                for condition in vConds:

                    condReplicates = []
                    for condDataSample in counts[condition]:

                        condRow = condDataSample.toDataRow(
                            condDataSample.getColumnIndex('gene'),
                            condDataSample.getColumnIndex(valueSource))

                        # Samples are identified by their file path.
                        sampleName = condDataSample.filepath
                        condReplicates.append(sampleName)

                        self.condData.addCondition(condRow, sampleName)

                    replicates[condition] = condReplicates

                print("Running for conditions: " + str(vConds))

                createdComparisons[valueSource] += self.condData.runDEanalysis(
                    args.output,
                    prefix=valueSource,
                    rscriptPath=args.rscript.name,
                    methods=args.methods,
                    replicates=replicates,
                    noDErun=args.noanalysis)

            self.prepareHTMLOut(createdComparisons, replicates, args)

        if args.diffreg is not None:

            createdComparisons = defaultdict(list)
            conditions = set()

            for diffRegFile in args.diffreg:

                df = EnrichmentDF.parseFromFile(diffRegFile)
                valueSource = self.getValueSource(df)

                condPair = tuple(df.getConditions())
                # A set has no "+=" operator; update() adds the members.
                conditions.update(condPair)

                # prepareHTMLOut indexes each comparison as (x[0], x[1], x[2]).
                # NOTE(review): assumes getConditions() yields the two compared
                # conditions of the file - confirm against EnrichmentDF.
                createdComparisons[valueSource].append(
                    (condPair[0], condPair[1], diffRegFile))

            # NOTE(review): a set of conditions is passed where the counts
            # branch passes a condition -> samples dict; verify printResult
            # accepts both.
            self.prepareHTMLOut(createdComparisons, conditions, args)

    def getValueSource(self, df):
        """Return the second cell of the first data row of ``df``."""
        return df.data[0][1]

    def prepareHTMLOut(self, createdComparisons, replicates, args):
        """Print the comparisons and let ``self.condData`` write the result.

        :param createdComparisons: mapping value source -> list of entries
            indexable as ``(x[0], x[1], x[2])``; the first two items form the
            key and the third the value of ``conditionPair2File``.
        :param replicates: forwarded unchanged to ``printResult``.
        :param args: parsed arguments; reads ``args.output``.
        """

        for valueSource in createdComparisons:

            condPair2File = {
                (x[0], x[1]): x[2] for x in createdComparisons[valueSource]
            }

            print("Comparisons")
            for condPair in condPair2File:
                print(condPair, condPair2File[condPair])

            self.condData.printResult(
                args.output,
                prefix=valueSource,
                conditionPair2File=condPair2File,
                replicates=replicates
            )
Пример #5
0
    def makeResults(self, parallelResult, oEnvironment, args):
        """Run the analysis on count files and/or report existing result files.

        Two independent modes; both may run in one call:

        * ``args.counts`` is given: the files are loaded with ``readCounts``.
          For each value source (``'coverage'``, then ``'read_counts'``) a
          fresh ``EnrichmentDF`` receives one condition per sample and
          ``runDEanalysis`` is executed on it.
        * ``args.diffreg`` is given: each file is parsed and grouped by the
          value returned from ``getValueSource``.

        In both modes the collected comparisons go to ``prepareHTMLOut``.

        :param parallelResult: unused.
        :param oEnvironment: unused.
        :param args: parsed arguments; reads ``counts``, ``diffreg``,
            ``output``, ``rscript``, ``methods`` and ``noanalysis``.
        """

        if args.counts is not None:
            # Mapping: condition name -> list of per-sample data frames.
            counts = self.readCounts(args)

            vConds = sorted(counts)

            createdComparisons = defaultdict(list)

            for valueSource in ['coverage', 'read_counts']:
                self.condData = EnrichmentDF()

                replicates = {}

                for condition in vConds:

                    condReplicates = []
                    for condDataSample in counts[condition]:

                        condRow = condDataSample.toDataRow(
                            condDataSample.getColumnIndex('gene'),
                            condDataSample.getColumnIndex(valueSource))

                        # Samples are identified by their file path.
                        sampleName = condDataSample.filepath
                        condReplicates.append(sampleName)

                        self.condData.addCondition(condRow, sampleName)

                    replicates[condition] = condReplicates

                print("Running for conditions: " + str(vConds))

                createdComparisons[valueSource] += self.condData.runDEanalysis(
                    args.output,
                    prefix=valueSource,
                    rscriptPath=args.rscript.name,
                    methods=args.methods,
                    replicates=replicates,
                    noDErun=args.noanalysis)

            self.prepareHTMLOut(createdComparisons, replicates, args)

        if args.diffreg is not None:

            createdComparisons = defaultdict(list)
            conditions = set()

            for diffRegFile in args.diffreg:

                df = EnrichmentDF.parseFromFile(diffRegFile)
                valueSource = self.getValueSource(df)

                condPair = tuple(df.getConditions())
                # A set has no "+=" operator; update() adds the members.
                conditions.update(condPair)

                # Store (condition, condition, file) triples so each entry can
                # be indexed positionally by the consumer.
                # NOTE(review): assumes getConditions() yields the two compared
                # conditions of the file and that prepareHTMLOut expects such
                # triples - confirm against EnrichmentDF / prepareHTMLOut.
                createdComparisons[valueSource].append(
                    (condPair[0], condPair[1], diffRegFile))

            # NOTE(review): a set of conditions is passed where the counts
            # branch passes a condition -> samples dict; verify the consumer
            # accepts both.
            self.prepareHTMLOut(createdComparisons, conditions, args)
Пример #6
0
    def readDiffRegs(self, args):
        """Parse every file in ``args.diffreg``; the parsed frames are discarded."""

        # TODO what did I want to do with this argument?
        for diffPath in args.diffreg:
            EnrichmentDF.parseFromFile(diffPath)
Пример #7
0
    def makeResults(self, parallelResult, oEnvironment, args):
        """Count how often each gene ranks among the top p-values and export HTML.

        For every file in ``args.diffreg`` and every method in
        ``args.methods`` the ``id`` / ``<method>_RAW.PVA`` pairs are read,
        entries without a p-value are dropped, and the pairs are sorted by
        ascending p-value.  The first ``args.top`` genes (or fewer, if fewer
        have a p-value) each get one count.  The totals are exported, most
        frequent first, to ``args.output`` as an HTML table with a UniProt
        search link per gene.

        :param parallelResult: unused.
        :param oEnvironment: unused.
        :param args: parsed arguments; reads ``diffreg``, ``methods``, ``top``
            and ``output``.
        """

        def parseNones(row):
            # Replace the literal string 'None' with a real None, cell by cell.
            return [cell if cell != 'None' else None for cell in row]

        topGenes = Counter()

        for diffRegFile in args.diffreg:

            thisData = EnrichmentDF(DataFrame.parseFromFile(diffRegFile))
            thisData.applyToRow(parseNones)

            for method in args.methods:

                pvals = thisData.toDataRow(
                    thisData.getColumnIndex('id'),
                    thisData.getColumnIndex(method + "_RAW.PVA"))
                genepval = [(x[0], float(x[1])) for x in pvals.to_pairs()
                            if x[1] is not None]

                genepval.sort(key=lambda x: x[1])

                # Slicing tolerates files with fewer than args.top usable
                # genes; indexing up to args.top would raise IndexError.
                for gene, _pval in genepval[:max(args.top, 0)]:
                    topGenes[gene] += 1

        outputDF = DataFrame()

        outputDF.addColumn('gene_id')
        outputDF.addColumn('count')
        outputDF.addColumn('link')

        for (gene, count) in topGenes.most_common():

            geneRow = DataRow.fromDict({
                'gene_id':
                gene,
                'count':
                count,
                'link':
                "<a href='http://www.uniprot.org/uniprot/?query=" + gene +
                "&sort=score' target='_blank'>UniProt</a>",
            })

            outputDF.addRow(geneRow)

        outputDF.export(args.output, ExportTYPE.HTML)
Пример #8
0
    def makeResults(self, parallelResult, oEnvironment, args):
        """Run the differential-expression analysis on featureCounts input.

        Does nothing unless ``args.counts`` is given.  Otherwise the optional
        gene enhancement and gene length tables are loaded, the counts are
        read with ``readCounts`` and, for the single value source ``'count'``,
        every sample is added to a fresh ``EnrichmentDF`` as a list of row
        dicts (``id``, the count under the sample name, plus ``.LS`` /
        ``.FPKM`` / ``.TPM`` values when the matching flag is set).
        ``runDEanalysis`` is then executed and its comparisons are passed to
        ``prepareHTMLOut``.

        :param parallelResult: unused.
        :param oEnvironment: unused.
        :param args: parsed arguments; reads ``counts``, ``enhanced``,
            ``lengths``, ``libsize``, ``fpkm``, ``tpm``, ``output``,
            ``rscript``, ``methods`` and ``noanalysis``.
        """

        if args.counts is not None:

            geneEnhancement = self.loadEnhancement(args.enhanced)
            geneLengths = self.loadGeneLengths(args.lengths)

            # counts: condition name -> list of per-sample data frames
            counts, cond2samples = self.readCounts(args,
                                                   biotypes=geneEnhancement,
                                                   gene2length=geneLengths)

            # Deliberately unsorted: keep the order readCounts produced.
            vConds = list(counts)

            createdComparisons = defaultdict(list)

            for valueSource in ['count']:
                self.condData = EnrichmentDF()
                replicates = OrderedDict()

                for condition in vConds:

                    condReplicates = []
                    for condDataSample in counts[condition]:

                        sampleName = condDataSample.filepath
                        print(sampleName, len(condDataSample))

                        rowUpdates = []
                        for row in condDataSample:

                            rowData = {
                                "id": row["gene"],
                                sampleName: row[valueSource]
                            }

                            if args.libsize:
                                rowData[sampleName + ".LS"] = row["LS"]

                            if args.fpkm:
                                rowData[sampleName + ".FPKM"] = row["FPKM"]

                            if args.tpm:
                                rowData[sampleName + ".TPM"] = row["TPM"]

                            rowUpdates.append(rowData)

                        condReplicates.append(sampleName)

                        # A sample may have no rows; do not index into an
                        # empty list just for the progress message.
                        firstRow = rowUpdates[0] if rowUpdates else None
                        print("Add Condition", sampleName, firstRow)
                        self.condData.addConditions(rowUpdates, sampleName)

                    replicates[condition] = condReplicates

                print("Running for conditions: " + str(vConds))

                createdComparisons[valueSource] += self.condData.runDEanalysis(
                    args.output,
                    prefix=valueSource,
                    rscriptPath=args.rscript.name,
                    methods=args.methods,
                    replicates=replicates,
                    noDErun=args.noanalysis,
                    enhanceSymbol=geneEnhancement,
                    geneLengths=geneLengths)

            self.prepareHTMLOut(createdComparisons, replicates, args)
Пример #9
0
class FoldChangeFeatureCountsAnalysis(ParallelPSTInterface):
    """Fold-change / differential-expression tool for featureCounts tables.

    The parallel hooks (``prepareInputs``, ``execParallel``, ``joinParallel``)
    do nothing; all work happens in ``makeResults``.
    """

    def __init__(self, args):
        """Initialise the parent interface and start with empty state."""

        super(FoldChangeFeatureCountsAnalysis, self).__init__(args)

        self.counts = None
        self.condData = EnrichmentDF()

    def _makePropDict(self):
        """Always returns ``None``."""

        return None

    def readCounts(self, args, biotypes=None, gene2length=None):
        """Load count tables and derive per-sample frames for every condition.

        For each file in ``args.counts`` and each column named in
        ``args.conditions`` a two-column frame (``gene``, ``count``) is
        selected, optionally filtered by biotype and extended by ``LS``,
        ``FPKM`` and ``TPM`` columns.

        :param args: parsed arguments; reads ``counts``, ``prefixes``,
            ``conditions``, ``allow_nonexistant_cond``, ``removestable``,
            ``norrna``, ``removemtrna``, ``only_protein_coding``, ``libsize``,
            ``fpkm`` and ``tpm``.
        :param biotypes: mapping gene id -> ``(symbol, biotype)``; required
            (non-empty) for the biotype filters.
        :param gene2length: mapping gene id -> length; required for FPKM/TPM.
        :return: ``(counts, condition2samples)`` where ``counts`` maps the
            condition name to its list of frames and ``condition2samples``
            maps each sample column to its prefixed names.
        :raises SystemExit: via ``ArgumentParser.error`` when a requested
            option lacks its required table.
        """

        # loadEnhancement() yields an empty dict (not None) when --enhanced is
        # missing, so emptiness - not identity with None - marks "missing".
        if args.norrna and not biotypes:
            raise argparse.ArgumentParser().error(
                "removal of rRNA requires --enhanced!")
        if args.removemtrna and not biotypes:
            raise argparse.ArgumentParser().error(
                "removal of mtRNA requires --enhanced!")
        if args.only_protein_coding and not biotypes:
            raise argparse.ArgumentParser().error(
                "--only-protein-coding requires --enhanced!")

        if args.fpkm and gene2length is None:
            raise argparse.ArgumentParser().error(
                "calculation of FPKM requires --lengths!")
        if args.tpm and gene2length is None:
            raise argparse.ArgumentParser().error(
                "calculation of TPM requires --lengths!")

        # Annotation columns of the table; every other column is a sample.
        featureCountsColumns = [
            "Geneid", "Chr", "Start", "End", "Strand", "Length"
        ]

        counts = defaultdict(list)

        condition2samples = defaultdict(list)

        for idx, countFile in enumerate(args.counts):

            print("Loading File", idx, ":", countFile.name)

            countFilePrefix = args.prefixes[idx]
            df = DataFrame.parseFromFile(countFile.name, skipChar='#')

            allheaders = df.getHeader()

            for sample in allheaders:
                if sample not in featureCountsColumns:
                    condition2samples[sample].append(countFilePrefix + sample)

            for condGroup in args.conditions:
                # The first entry names the condition and is itself processed
                # as a sample column by the loop below.
                condName = condGroup[0]
                for condElement in condGroup:

                    print(condName, condElement, condElement in allheaders)
                    if args.allow_nonexistant_cond and condElement not in allheaders:
                        continue

                    subDf = df.selectColumns({
                        "Geneid": "gene",
                        condElement: "count"
                    })

                    if args.removestable:
                        # Keep only the part of the gene id before the first dot.
                        geneColIdx = subDf.getColumnIndex("gene")
                        subDf.applyByRow("gene",
                                         lambda x: x[geneColIdx].split(".")[0])

                    if biotypes and args.norrna:
                        geneColIdx = subDf.getColumnIndex("gene")
                        subDf.filterRows(
                            lambda x: x[geneColIdx] in biotypes and
                            "rRNA" not in biotypes[x[geneColIdx]][1])

                    if biotypes and args.removemtrna:
                        geneColIdx = subDf.getColumnIndex("gene")
                        subDf.filterRows(
                            lambda x: x[geneColIdx] in biotypes and
                            "Mt_" not in biotypes[x[geneColIdx]][1])

                    if biotypes and args.only_protein_coding:
                        # Look the index up here as well: none of the filters
                        # above may have run.
                        geneColIdx = subDf.getColumnIndex("gene")
                        subDf.filterRows(
                            lambda x: x[geneColIdx] in biotypes and
                            "protein_coding" in biotypes[x[geneColIdx]][1])

                    if os.path.isdir(condElement) or os.path.isfile(
                            condElement):
                        subDf.setFilepath(os.path.abspath(condElement))
                    else:
                        subDf.setFilepath(condElement)

                    if args.libsize:
                        countCol = subDf.getColumnIndex("count")

                        totalCounts = sum(x[countCol] for x in subDf.data)

                        libSizeIdx = subDf.addColumn("LS", 0)

                        def addLibSize(x):
                            # An all-zero sample keeps LS at 0 rather than
                            # dividing by zero.
                            if totalCounts == 0:
                                x[libSizeIdx] = 0
                            else:
                                x[libSizeIdx] = (x[countCol] / totalCounts) * 10000

                            return tuple(x)

                        subDf.applyToRow(addLibSize)

                    if args.fpkm:

                        countCol = subDf.getColumnIndex("count")
                        geneCol = subDf.getColumnIndex("gene")

                        totalCounts = sum(x[countCol] for x in subDf.data)

                        fpkmIdx = subDf.addColumn("FPKM", 0)

                        def addFPKM(x):

                            geneLength = gene2length.get(x[geneCol], 0)

                            # Unknown length or empty sample: FPKM stays 0.
                            if geneLength == 0 or totalCounts == 0:
                                x[fpkmIdx] = 0
                            else:
                                x[fpkmIdx] = x[countCol] / (
                                    totalCounts * geneLength) * pow(10, 9)

                            return tuple(x)

                        subDf.applyToRow(addFPKM)

                    if args.tpm:

                        countCol = subDf.getColumnIndex("count")
                        geneCol = subDf.getColumnIndex("gene")

                        # Sum of count/length over all genes of known length.
                        totalRatio = 0

                        for row in subDf:
                            geneLength = gene2length.get(row["gene"], 0)

                            if geneLength != 0:
                                totalRatio += row["count"] / geneLength

                        tpmIdx = subDf.addColumn("TPM", 0)

                        def addTPM(x):

                            geneLength = gene2length.get(x[geneCol], 0)

                            # Write the TPM column in both branches; the FPKM
                            # index may not even exist when --fpkm is off.
                            if geneLength == 0 or totalRatio == 0:
                                x[tpmIdx] = 0
                            else:
                                x[tpmIdx] = x[countCol] / (
                                    geneLength * totalRatio) * pow(10, 6)

                            return tuple(x)

                        subDf.applyToRow(addTPM)

                    counts[condName].append(subDf)

        return counts, condition2samples

    def readDiffRegs(self, args):
        """Parse every file in ``args.diffreg``; the results are discarded."""

        # TODO what did I want to do with this argument?
        for diffFile in args.diffreg:
            EnrichmentDF.parseFromFile(diffFile)

    def prepareInputs(self, args):
        """Return an empty list: there is nothing to distribute."""
        return []

    def execParallel(self, data, environment):
        """No-op parallel step; always returns ``None``."""

        return None

    def joinParallel(self, existResult, newResult, oEnvironment):
        """No-op join step; always returns ``None``."""

        return None

    def loadEnhancement(self, fileE):
        """Read the tab-separated gene enhancement table.

        The first line and lines starting with ``#`` are skipped.  Columns
        are: gene id, gene symbol, biotype.  Blank lines are ignored and
        missing trailing columns are treated as empty strings.

        :param fileE: open text file (must expose ``.name``) or ``None``.
        :return: dict gene id -> ``(symbol, biotype)``; empty when ``fileE``
            is ``None``.
        """

        if fileE is None:
            print("Not loading gene name enhancements")
            return {}

        print("Loading gene name enhancements", fileE.name)

        ens2sym = {}

        for lidx, line in enumerate(fileE):
            line = line.strip().split("\t")

            if lidx == 0 or line[0].startswith("#"):
                continue

            if len(line[0]) == 0:
                # Blank line: nothing to record.
                continue

            # strip() above also removes trailing empty columns; restore them
            # so a gene without symbol/biotype does not raise IndexError.
            line += [""] * (3 - len(line))

            ensemblID = line[0]
            geneSymbol = line[1]
            biotype = line[2]

            ens2sym[ensemblID] = (geneSymbol, biotype)

        return ens2sym

    def loadGeneLengths(self, fileE):
        """Read the tab-separated gene length table.

        The first column is the gene id, the second is parsed as the length.
        A first line whose second column is not an integer is treated as a
        header and skipped.  Lines with fewer than two columns or an empty
        id/length are ignored.

        :param fileE: open text file (must expose ``.name``) or ``None``.
        :return: dict gene id -> int length, or ``None`` when ``fileE`` is
            ``None``.
        :raises ValueError: when a non-header line has a non-integer length.
        """

        if fileE is None:
            print("Not loading gene lengths")
            return None

        print("Loading gene lengths", fileE.name)
        # Example input noted by the original author:
        #     Ensembl_gene_identifier GeneID  length
        #     ENSMUSG00000000001      14679   3262
        #     ENSMUSG00000000003      54192   902
        #     ENSMUSG00000000028      12544   2252
        # NOTE(review): in that layout the length is the THIRD column, while
        # the code below reads the second one - confirm the real file format.

        ens2gl = {}
        for lidx, line in enumerate(fileE):
            line = line.strip().split("\t")

            if len(line) < 2:
                # Blank or truncated line.
                continue

            if lidx == 0:
                try:
                    int(line[1])
                except ValueError:
                    # Header line.
                    continue

            ensemblID = line[0]
            geneLength = line[1]

            if len(ensemblID) == 0 or len(geneLength) == 0:
                continue

            ens2gl[ensemblID] = int(geneLength)

        return ens2gl

    def makeResults(self, parallelResult, oEnvironment, args):
        """Run the differential-expression analysis on featureCounts input.

        Does nothing unless ``args.counts`` is given.  Otherwise the optional
        gene enhancement and gene length tables are loaded, the counts are
        read with ``readCounts`` and, for the single value source ``'count'``,
        every sample is added to a fresh ``EnrichmentDF`` as a list of row
        dicts (``id``, the count under the sample name, plus ``.LS`` /
        ``.FPKM`` / ``.TPM`` values when the matching flag is set).
        ``runDEanalysis`` is then executed and its comparisons are passed to
        ``prepareHTMLOut``.

        :param parallelResult: unused.
        :param oEnvironment: unused.
        :param args: parsed arguments; reads ``counts``, ``enhanced``,
            ``lengths``, ``libsize``, ``fpkm``, ``tpm``, ``output``,
            ``rscript``, ``methods`` and ``noanalysis``.
        """

        if args.counts is not None:

            geneEnhancement = self.loadEnhancement(args.enhanced)
            geneLengths = self.loadGeneLengths(args.lengths)

            # counts: condition name -> list of per-sample data frames
            counts, cond2samples = self.readCounts(args,
                                                   biotypes=geneEnhancement,
                                                   gene2length=geneLengths)

            # Deliberately unsorted: keep the order readCounts produced.
            vConds = list(counts)

            createdComparisons = defaultdict(list)

            for valueSource in ['count']:
                self.condData = EnrichmentDF()
                replicates = OrderedDict()

                for condition in vConds:

                    condReplicates = []
                    for condDataSample in counts[condition]:

                        sampleName = condDataSample.filepath
                        print(sampleName, len(condDataSample))

                        rowUpdates = []
                        for row in condDataSample:

                            rowData = {
                                "id": row["gene"],
                                sampleName: row[valueSource]
                            }

                            if args.libsize:
                                rowData[sampleName + ".LS"] = row["LS"]

                            if args.fpkm:
                                rowData[sampleName + ".FPKM"] = row["FPKM"]

                            if args.tpm:
                                rowData[sampleName + ".TPM"] = row["TPM"]

                            rowUpdates.append(rowData)

                        condReplicates.append(sampleName)

                        # A sample may have no rows (e.g. after filtering); do
                        # not index into an empty list for the progress line.
                        firstRow = rowUpdates[0] if rowUpdates else None
                        print("Add Condition", sampleName, firstRow)
                        self.condData.addConditions(rowUpdates, sampleName)

                    replicates[condition] = condReplicates

                print("Running for conditions: " + str(vConds))

                createdComparisons[valueSource] += self.condData.runDEanalysis(
                    args.output,
                    prefix=valueSource,
                    rscriptPath=args.rscript.name,
                    methods=args.methods,
                    replicates=replicates,
                    noDErun=args.noanalysis,
                    enhanceSymbol=geneEnhancement,
                    geneLengths=geneLengths)

            self.prepareHTMLOut(createdComparisons, replicates, args)

    def getValueSource(self, df):
        """Return the second cell of the first data row of ``df``."""
        return df.data[0][1]

    def prepareHTMLOut(self, createdComparisons, replicates, args):
        """Print the comparisons and let ``self.condData`` write the result.

        :param createdComparisons: mapping value source -> list of entries
            indexable as ``(x[0], x[1], x[2])``; the first two items form the
            key and the third the value of ``conditionPair2File``.
        :param replicates: forwarded unchanged to ``printResult``.
        :param args: parsed arguments; reads ``args.output``.
        """

        for valueSource in createdComparisons:

            condPair2File = {
                (x[0], x[1]): x[2] for x in createdComparisons[valueSource]
            }

            print("Comparisons")
            for condPair in condPair2File:
                print(condPair, condPair2File[condPair])

            self.condData.printResult(
                args.output,
                prefix=valueSource,
                conditionPair2File=condPair2File,
                replicates=replicates
            )
Пример #10
0
    def __init__(self, args):
        """Forward ``args`` to the parent initialiser and reset per-run state."""
        super(FoldChangeSimilarity, self).__init__(args)

        # Fresh, empty enrichment frame; ``counts`` stays unset.
        self.condData = EnrichmentDF()
        self.counts = None