Exemplo n.º 1
0
        othrType = sys.argv[3]
        outFile = sys.argv[4]
        corrThresh = abs(float(sys.argv[5]))
        if (not othrType.startswith("N:")):
            othrType = "N:" + othrType

    ## corrThresh = 0.40
    ## corrThresh = 0.20

    print " "
    print " "
    print " ************* "
    print " in methCorr : "
    print " ***************************************************************** "
    print " calling readTSV ... ", methFile
    methD = tsvIO.readTSV(methFile)
    tsvIO.lookAtDataD(methD)

    print " calling readTSV ... ", othrFile
    othrD = tsvIO.readTSV(othrFile)
    tsvIO.lookAtDataD(othrD)

    try:
        methRowLabels = methD['rowLabels']
        methColLabels = methD['colLabels']
        methDataMatrix = methD['dataMatrix']
    except:
        print " no valid METH feature matrix "
        sys.exit(-1)

    numMethRow = len(methRowLabels)
Exemplo n.º 2
0
            outFile = sys.argv[2]
            do_summaryMeth = 1
        else:
            print " "
            print " Usage: %s <input TSV file> <output TSV file> "
            print " "
            print " ERROR -- bad command line arguments "
            sys.exit(-1)

    print " "
    print " Running : %s  %s  %s  " % (sys.argv[0], sys.argv[1], sys.argv[2])
    print " "

    # read in the input feature matrix first, just in case there
    # actually isn't one yet available ...
    testD = tsvIO.readTSV(inFile)
    try:
        print len(testD['rowLabels']), len(testD['colLabels'])
    except:
        print " --> invalid / missing input feature matrix "
        sys.exit(-1)

    # we want to "check" for "deleted" METH probes
    if (do_summaryMeth):
        newD = summaryMeth(testD)
        ## tsvIO.writeTSV_dataMatrix ( newD, 0, 0, outFile )
        testD = newD

    # and finally write it out ...
    tsvIO.writeTSV_dataMatrix(testD, 0, 0, outFile)
Exemplo n.º 3
0
    infFilename = bioinformaticsReferencesDir + "/ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info"

    print " "
    print " Running : %s %s %s %s " % (sys.argv[0], sys.argv[1], sys.argv[2], sys.argv[3])
    print "           %s " % gafFilename
    print "           %s " % gencodeFilename
    print "           %s " % refGeneFilename
    print "           %s " % cybFilename
    print "           %s " % infFilename
    print " "
    print " "

    # read in the input feature matrix first, just in case there
    # actually isn't one yet available ...
    print " --> calling tsvIO.readTSV ... "
    testD = tsvIO.readTSV(inFile)
    try:
        print len(testD['rowLabels']), len(testD['colLabels'])
        if (len(testD['rowLabels']) == 0 or len(testD['colLabels']) == 0):
            print " EXITING ... no data "
            sys.exit(-1)
    except:
        print " --> invalid / missing input feature matrix "
        sys.exit(-1)

    # read in the gene_info file ...
    # this was turned off ... looking into turning it back on (1/7/13)
    # turning it back off (1/17/14)
    if (0):
        print " --> calling readGeneInfoFile ... "
        (geneInfoDict, synMapDict) = refData.readGeneInfoFile(infFilename)
Exemplo n.º 4
0
        fhA.close()
        fhB = file(inFileB, 'r')
        fhB.close()
    except:
        print " one or the other file does not exist ??? "
        print inFileA
        print inFileB
        sys.exit(-1)

    print " "
    print " reading input files ... : "
    print "     file A : ", inFileA
    print "     file B : ", inFileB
    print " "

    dataA = tsvIO.readTSV(inFileA)
    dataB = tsvIO.readTSV(inFileB)

    if (len(dataA) == 0):
        print " input file A does not exist ??? "
        print inFileA
        sys.exit(-1)

    if (len(dataB) == 0):
        print inFileB
        print " input file B does not exist ??? "
        sys.exit(-1)

    # first take a look at the feature (row) labels ...
    rowLabelsA = makeBetterLabels(dataA['rowLabels'])
    rowLabelsB = makeBetterLabels(dataB['rowLabels'])
Exemplo n.º 5
0
    if (1):
        if (len(sys.argv) != 2):
            print " Usage : %s <input tsv file> " % sys.argv[0]
            print " ERROR -- bad command line arguments "
            sys.exit(-1)

        tsvFile = sys.argv[1]

    print " "
    print " "
    print " ************** "
    print " in methCorr2 : "
    print " ***************************************************************** "
    print " calling readTSV ... ", tsvFile
    methD = tsvIO.readTSV(tsvFile)
    tsvIO.lookAtDataD(methD)

    try:
        methRowLabels = methD['rowLabels']
        methColLabels = methD['colLabels']
        methDataMatrix = methD['dataMatrix']
    except:
        print " no valid METH feature matrix "
        sys.exit(-1)

    numMethRow = len(methRowLabels)
    numMethCol = len(methColLabels)

    dThresh = 10000
    minCount = 30
Exemplo n.º 6
0
            ii += 3
        if (len(listInfo) < 1):
            print " ERROR ??? no sample-list information provided ??? "
            sys.exit(-1)

    print " "
    print " in filterTSVbySampList.py ... "
    print "         input file  : ", inFile
    print "         output file : ", outFile
    print "         list info   : ", listInfo
    print " "

    # print " "
    # print " ***************************************************************** "
    # print " calling readTSV ... ", inFile
    dataD = tsvIO.readTSV(inFile)
    if (dataD == {}):
        sys.exit(-1)

    tsvIO.lookAtDataD(dataD)

    # print " "
    # print " reading sample list ... "
    numLists = len(listInfo)
    listDetails = [0] * numLists
    listBW = [0] * numLists
    listLS = [0] * numLists
    for iList in range(numLists):
        print " --> loading sample list #%d from <%s> " % ((iList + 1), listInfo[iList][0])
        listDetails[iList] = readSampleListFromFile(listInfo[iList][0])
        listBW[iList] = listInfo[iList][1]
Exemplo n.º 7
0
            0]
        print "         ", sys.argv
        print " ERROR -- bad command line arguments "
        sys.exit(-1)

    tsvName1 = sys.argv[1]
    tsvName2 = sys.argv[2]
    tsvName3 = sys.argv[3]

    # test out readTSV ...
    ## tsvName = "coad_read_clinical.27jan.tsv"
    print " "
    print " ****************************************************************** "
    print " IN add2clinTSV.py ... "
    print " reading input file <%s> " % tsvName1
    allClinDict = tsvIO.readTSV(tsvName1)

    # take a look ...
    (naCounts, otherCounts) = miscClin.lookAtClinDict(allClinDict)
    bestKeyOrder = miscClin.getBestKeyOrder(allClinDict, naCounts)

    # now we want to read in a new tsv file ...
    print " "
    print " ****************************************************************** "
    print " reading input file <%s> " % tsvName2
    tmpDict = tsvIO.readTSV(tsvName2)

    # check to make sure that we actually got something back ...
    if (len(tmpDict) == 0):
        print " WARNING ... no information found ... "
Exemplo n.º 8
0
    if (1):
        if (len(sys.argv) != 4):
            print " Usage : %s <input feature matrix> <output feature matrix> <featType> " % sys.argv[
                0]
            print "         to avoid filtering on featType, use ANY "
            print " ERROR -- bad command line arguments "
            sys.exit(-1)
        inFile = sys.argv[1]
        outFile = sys.argv[2]
        featType = sys.argv[3]

    print " "
    print " "
    print " ******************** "
    print "  in filterIdentFeat  "
    print " ******************** "

    inD = tsvIO.readTSV(inFile)
    rowLabels = inD['rowLabels']
    numRow = len(rowLabels)

    outD = removeIdenticalFeatures(inD, featType)

    tsvIO.writeTSV_dataMatrix(outD, 0, 0, outFile)

    print " FINISHED "
    print " "

# -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
Exemplo n.º 9
0
# -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#

if __name__ == "__main__":

    if (len(sys.argv) != 3):
        print " Usage : %s <input TSV> <output TSV> " % sys.argv[0]
        print " ERROR -- bad command line arguments "
        sys.exit(-1)

    tsvNameIn = sys.argv[1]
    tsvNameOut = sys.argv[2]

    print " "
    print " ****************************************************************** "
    print " reading input file <%s> " % tsvNameIn
    tmpD = tsvIO.readTSV(tsvNameIn)

    if (len(tmpD) == 0):
        print " in addIndicators ... no input data ... nothing to do here ... "
        sys.exit(-1)

    # automatically generate indicator features for categorical features
    tmpD = addIndicatorFeatures(tmpD)

    tsvIO.writeTSV_dataMatrix(tmpD, 0, 0, tsvNameOut)

    print " "
    print " "
    print " FINISHED "
    print " "
    print " "
Exemplo n.º 10
0
        print " Usage : %s <old input TSV> <new input TSV> <output merged TSV> " % sys.argv[0]
        print "         ", sys.argv
        print " ERROR -- bad command line arguments "
        sys.exit(-1)

    tsvName1 = sys.argv[1]
    tsvName2 = sys.argv[2]
    tsvName3 = sys.argv[3]

    # test out readTSV ...
    ## tsvName = "coad_read_clinical.27jan.tsv"
    print " "
    print " ****************************************************************** "
    print " IN add2clinTSV.py ... "
    print " reading input file <%s> " % tsvName1
    allClinDict = tsvIO.readTSV(tsvName1)

    # take a look ...
    (naCounts, otherCounts) = miscClin.lookAtClinDict(allClinDict)
    bestKeyOrder = miscClin.getBestKeyOrder(allClinDict, naCounts)

    # now we want to read in a new tsv file ...
    print " "
    print " ****************************************************************** "
    print " reading input file <%s> " % tsvName2
    tmpDict = tsvIO.readTSV(tsvName2)

    # check to make sure that we actually got something back ...
    if (len(tmpDict) == 0):
        print " WARNING ... no information found ... "
Exemplo n.º 11
0
if __name__ == "__main__":

    if (1):
        if (len(sys.argv) != 2):
            print " Usage : %s <input tsv file> " % sys.argv[0]
            sys.exit(-1)

        tsvFile = sys.argv[1]

    print " "
    print " "
    print " ************** "
    print " in methCorr2 : "
    print " ***************************************************************** "
    print " calling readTSV ... ", tsvFile
    methD = tsvIO.readTSV(tsvFile)
    tsvIO.lookAtDataD(methD)

    try:
        methRowLabels = methD['rowLabels']
        methColLabels = methD['colLabels']
        methDataMatrix = methD['dataMatrix']
    except:
        print " no valid METH feature matrix "
        sys.exit(-1)

    numMethRow = len(methRowLabels)
    numMethCol = len(methColLabels)

    dThresh = 10000
    minCount = 30
Exemplo n.º 12
0
        fhA.close()
        fhB = file(inFileB, 'r')
        fhB.close()
    except:
        print " one or the other file does not exist ??? "
        print inFileA
        print inFileB
        sys.exit(-1)

    print " "
    print " reading input files ... : "
    print "     file A : ", inFileA
    print "     file B : ", inFileB
    print " "

    dataA = tsvIO.readTSV(inFileA)
    dataB = tsvIO.readTSV(inFileB)

    if (len(dataA) == 0):
        print " input file A does not exist ??? "
        print inFileA
        sys.exit(-1)

    if (len(dataB) == 0):
        print inFileB
        print " input file B does not exist ??? "
        sys.exit(-1)

    # first take a look at the feature (row) labels ...
    rowLabelsA = makeBetterLabels ( dataA['rowLabels'] )
    rowLabelsB = makeBetterLabels ( dataB['rowLabels'] )
Exemplo n.º 13
0
            sys.exit(-1)

        tsvFile = sys.argv[1]
        outFile = sys.argv[2]
        dThresh = int ( sys.argv[3] )
        minCount = int ( sys.argv[4] )
        corrThresh = float ( sys.argv[5] )

    print " "
    print " "
    print " ************** "
    print " in methCorr3 : ", dThresh, minCount, corrThresh
    print " ***************************************************************** "
    print ' (a) TIME ', time.asctime(time.localtime(time.time()))
    print " calling readTSV ... ", tsvFile
    tsvD = tsvIO.readTSV(tsvFile)
    tsvIO.lookAtDataD(tsvD)

    try:
        tsvRowLabels = tsvD['rowLabels']
        tsvColLabels = tsvD['colLabels']
        tsvDataMatrix = tsvD['dataMatrix']
    except:
        print " no valid METH feature matrix "
        sys.exit(-1)

    numRow = len(tsvRowLabels)
    numCol = len(tsvColLabels)

    keepMeth = {}
    keepGexp = {}
Exemplo n.º 14
0
def handleOneFeature ( featString, typeString, tsvFilename, \
                       pathwaysFilename, numRandFactor=200, dirName='' ):
    """Run the pairwise and pathway-ranking scripts for a single feature.

    Reads the feature matrix in tsvFilename and looks for rows whose label
    starts with featString.  Only when exactly one row matches does the
    function go on to:
      1. make sure the output directory (root directory of tsvFilename
         plus dirName) exists, creating it with a shell 'mkdir' if needed;
      2. run main/run-pairwise-v2.py for that feature;
      3. run main/runPR.py once per sign/tag pair ('+'/pxP, '-'/pxN,
         'x'/pxA), redirecting its output to <base name><row>.<tag>,
         skipping any output file that prAlreadyDone reports as done.

    Parameters:
      featString       -- prefix used to select the feature row
      typeString       -- only echoed in the banner; not otherwise used
                          in this function
      tsvFilename      -- path to the feature matrix TSV
      pathwaysFilename -- passed to runPR.py as --pathways
      numRandFactor    -- passed to runPR.py as --nRand (formatted with %d)
      dirName          -- optional output sub-directory; a trailing '/' is
                          appended if missing

    Returns an empty tuple when zero or more than one row matches; no
    explicit return statement is visible after the main loop.  Calls
    sys.exit(-1) if the output directory cannot be created.
    """

    print " "
    print " ***************************************************************** "
    print " "
    print " in handleOneFeature <%s> <%s> <%s> <%s> " % \
        ( featString, typeString, tsvFilename, dirName )
    print " "

    ## NOTE(review): these two flags are set from the feature-type prefix
    ## but are never read again within this function
    isBinaryFeat = 0
    isNumericFeat = 0
    if ( featString.startswith("B:") ):
        isBinaryFeat = 1
    elif ( featString.startswith("N:") ):
        isNumericFeat = 1

    ## get the time-stamp for the TSV file
    ## (handed to prAlreadyDone below together with each output file name)
    tTSV = os.path.getmtime ( tsvFilename ) 

    ## first we need to read in the feature matrix ...
    dataD = tsvIO.readTSV ( tsvFilename )

    rowLabels = dataD['rowLabels']
    dataMatrix = dataD['dataMatrix']
    foundRows = []
    exactMatch = []
    ## collect the indices of all rows whose label begins with featString
    ## (despite the name 'exactMatch', this is a prefix test)
    for iRow in range(len(rowLabels)):

        if ( rowLabels[iRow].startswith(featString) ):
            exactMatch += [ iRow ]

    foundRows = exactMatch

    ## anything other than exactly one matching row is reported and skipped
    if ( len(foundRows) < 1 ):
        print " ERROR ... no features found ", featString, len(rowLabels)
        print tsvFilename
        print " --> SKIPPING ... "
        return()
        ## sys.exit(-1)
    elif ( len(foundRows) > 1 ):
        print " ERROR ... more than one feature found ", featString, len(rowLabels)
        print tsvFilename
        print foundRows
        print " --> SKIPPING ... "
        return ()
        ## sys.exit(-1)

    print " "
    print " "

    ## figure out the root directory name and then the base output file name
    ( rootDirName, justFileName ) = splitPath ( tsvFilename )
    print " rootDirName = <%s> " % rootDirName
    print " justFileName = <%s> " % justFileName

    ## make sure a non-empty dirName ends with a path separator so that it
    ## can simply be concatenated below
    if ( dirName != '' ):
        if ( dirName[-1] != '/' ): dirName += '/'

    ## test whether output directory already exists ...
    if ( not os.path.exists(rootDirName+dirName) ):
        ## create output directory
        cmdString = 'mkdir %s%s' % ( rootDirName, dirName )
        ## print cmdString
        ( status, output ) = commands.getstatusoutput ( cmdString )
        ## success is judged by re-testing the path, not by the exit status
        if ( not os.path.exists(rootDirName+dirName) ):
            print " ERROR ??? failed to create output directory ??? "
            print cmdString
            print status
            sys.exit(-1)
        else:
            print " output directory created %s " % (rootDirName+dirName)
    else:
        print " output directory already exists %s " % (rootDirName+dirName)


    ## base name for the output files: the last three characters of the
    ## file name are dropped (presumably the 'tsv' extension, leaving the
    ## trailing '.' -- TODO confirm)
    prBaseName = rootDirName + dirName + justFileName[:-3]
    print " prBaseName = <%s> " % prBaseName

    for iRow in foundRows:

        print " "
        print " "
        curLabel = rowLabels[iRow]
        print iRow, curLabel
        multiCatFlag = checkMultiCat ( curLabel, dataMatrix[iRow] )
        ## print dataMatrix[iRow]

        ## maybe we actually don't have to redo anything ...
        ## (only skipped when all three of the pxP, pxN and pxA outputs
        ## are reported as done)
        ## NOTE(review): when multiCatFlag is set only the pxA file is
        ## produced below, so this three-way test may never succeed for
        ## such features -- verify against prAlreadyDone
        prOutFile = prBaseName + "%d.pxP" % iRow
        if ( prAlreadyDone ( prOutFile, tTSV ) ):
            prOutFile = prBaseName + "%d.pxN" % iRow
            if ( prAlreadyDone ( prOutFile, tTSV ) ):
                prOutFile = prBaseName + "%d.pxA" % iRow
                if ( prAlreadyDone ( prOutFile, tTSV ) ):
                    print "  -->  already done !!! <%s> " % curLabel
                    continue

        print " "
        print " "
        print " ************************************************************* "
        print " TIME : ", time.asctime(time.localtime(time.time()))
        print " "

        ## run the pairwise script for this one feature through the shell
        cmdString = 'python %s/main/run-pairwise-v2.py ' % ( gidgetConfigVars['TCGAFMP_ROOT_DIR'] )
        cmdString += '--pvalue 2. --one "%s" --tsvFile %s' % ( curLabel, tsvFilename )
        print cmdString
        ( status, output ) = commands.getstatusoutput ( cmdString )
        print " status : ", status
        print " output : ", output

        print " "
        print " "
        print " ************************************************************* "
        print " "

        ## name of the sorted pairwise-results file handed to runPR.py
        ## (presumably written by the run-pairwise-v2.py call above -- TODO confirm)
        pwpvFile = tsvFilename[:-3] + "%d.all.pwpv.sort" % iRow

        ## ok, now we are going to loop over 3 different scoring options ...
        signList = [ '+', '-', 'x' ]
        tagList  = [ 'pxP', 'pxN', 'pxA' ]

        ## if this feature is a categorical feature with more than 2 categories,
        ## then we cannot test the sign of the correlation ...
        if ( multiCatFlag ):
            print " WARNING: will only create the pxA output file for this feature "
            signList = [ 'x' ]
            tagList  = [ 'pxA' ]

        for iTest in range(len(signList)):

            ## run the pathway-ranking for the current sign / tag pair ...
            prOutFile = prBaseName + "%d.%s" % ( iRow, tagList[iTest] )

            if ( not prAlreadyDone ( prOutFile, tTSV ) ):
                ## remove any stale output before regenerating it
                cmdString = 'rm -fr %s' % prOutFile
                print cmdString
                ( status, output ) = commands.getstatusoutput ( cmdString )
                print " "
                print " "
                ## build the runPR.py command line; its stdout is redirected
                ## into the output file
                cmdString = 'python %s/main/runPR.py ' % ( gidgetConfigVars['TCGAFMP_ROOT_DIR'] )
                cmdString += ' --tsvFile %s ' % tsvFilename
                cmdString += ' --pwpvFile %s ' % pwpvFile
                cmdString += ' --pathways %s ' % pathwaysFilename
                cmdString += ' --featName "%s" ' % curLabel
                cmdString += ' --sign "%s" ' % signList[iTest]
                cmdString += ' --nRand %d ' % numRandFactor
                cmdString += ' > %s ' % prOutFile
                print cmdString
                print " "
    
                ## hard-wired switches: as written, only the
                ## commands.getstatusoutput branch can ever execute
                if ( 1 ):
                    if ( 0 ):
                        print " just pretending ... "
                    else:
                        ( status, output ) = commands.getstatusoutput ( cmdString )
                        print " status : ", status
                        print " output : ", output
                else:
                    os.system ( cmdString )
    
                ## pause for 10 seconds after each runPR.py invocation
                time.sleep ( 10 )
    

        print " "
        print " "
        print " ************************************************************* "
        print " "
Exemplo n.º 15
0
if __name__ == "__main__":

    if (1):
        if (len(sys.argv) != 4):
            print " Usage : %s <input feature matrix> <output feature matrix> <featType> " % sys.argv[0]
            print "         to avoid filtering on featType, use ANY "
            sys.exit(-1)
        inFile = sys.argv[1]
        outFile = sys.argv[2]
        featType = sys.argv[3]

    print " "
    print " "
    print " ******************** "
    print "  in filterIdentFeat  "
    print " ******************** "

    inD = tsvIO.readTSV(inFile)
    rowLabels = inD['rowLabels']
    numRow = len(rowLabels)

    outD = removeIdenticalFeatures(inD, featType)

    tsvIO.writeTSV_dataMatrix(outD, 0, 0, outFile)

    print " FINISHED "
    print " "


# -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
Exemplo n.º 16
0
    dTypeList = []
    fTypeList = []

    # if trying to prune LOTS of rows and columns, then set the *MaxNAfrac
    # values to small values ... if trying not to prune ANYTHING, then set
    # the values near 1 ...

    ## colMaxNAfrac = 0.90
    ## rowMaxNAfrac = 0.90

    for aFile in inFileList:

        print " "
        print " ***************************************************************** "
        print " calling readTSV ... ", aFile
        testD = tsvIO.readTSV(aFile)

        if (len(testD) == 0):
            print " --> nothing found ??? continuing ... "
            continue

        tsvIO.lookAtDataD(testD)

        if (1):

            # TCGA-CJ-4635-01A-02R
            # the first 12 characters identify the patient
            # the first 15 characters identify the sample

            # now we check for duplicates at the sample level ...
            print " "
Exemplo n.º 17
0
    # for the input barcode lengths ...
    getBarcodeLength(inFileList)

    # if trying to prune LOTS of rows and columns, then set the *MaxNAfrac
    # values to small values ... if trying not to prune ANYTHING, then set
    # the values near 1 ...

    ## colMaxNAfrac = 0.90
    ## rowMaxNAfrac = 0.90

    for aFile in inFileList:

        print " "
        print " ***************************************************************** "
        print " calling readTSV ... ", aFile
        testD = tsvIO.readTSV(aFile)

        ## check to see if we actually have any data ...
        skipFile = 0
        try:
            if (len(testD) == 0): skipFile = 1
            if (len(testD['rowLabels']) == 0): skipFile = 1
            if (len(testD['colLabels']) == 0): skipFile = 1
        except:
            print " ERROR in looking at data from <%s> ??? " % (aFile)
            skipFile = 1

        if (skipFile):
            print " --> nothing found ??? continuing ... "
            continue
Exemplo n.º 18
0
# -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#

if __name__ == "__main__":

    if (len(sys.argv) != 3):
        print " Usage : %s <input TSV> <output TSV> " % sys.argv[0]
        sys.exit(-1)

    tsvNameIn = sys.argv[1]
    tsvNameOut = sys.argv[2]

    print " "
    print " ****************************************************************** "
    print " reading input file <%s> " % tsvNameIn
    tmpD = tsvIO.readTSV(tsvNameIn)

    if (len(tmpD) == 0):
        print " in addIndicators ... no input data ... nothing to do here ... "
        sys.exit(-1)

    # automatically generate indicator features for categorical features
    tmpD = addIndicatorFeatures(tmpD)

    tsvIO.writeTSV_dataMatrix(tmpD, 0, 0, tsvNameOut)

    print " "
    print " "
    print " FINISHED "
    print " "
    print " "
Exemplo n.º 19
0
# -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#

if __name__ == "__main__":

    if (1):
        if (len(sys.argv) == 3):
            inFile = sys.argv[1]
            outFile = sys.argv[2]
        else:
            print " "
            print " Usage: %s <input TSV file> <output TSV file> "
            print " "
            print " ERROR -- bad command line arguments "
            sys.exit(-1)

    print " "
    print " Running : %s %s %s " % (sys.argv[0], sys.argv[1], sys.argv[2])
    print " "
    print " "

    # now read in the input feature matrix ...
    dataD = tsvIO.readTSV(inFile)

    # add new custom features ...
    dataD = addCustomFeatures(dataD)

    # and write the matrix back out
    tsvIO.writeTSV_dataMatrix(dataD, 0, 0, outFile)

# -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
Exemplo n.º 20
0
            print " "
            print " ERROR -- bad command line arguments "
            sys.exit(-1)

    print " "
    print " Running : %s %s %s %s " % (sys.argv[0], sys.argv[1], sys.argv[2], sys.argv[3])
    print " "
    print " "

    listDict = {}

    # read in the current clinical file ...
    topDir = "%s/%s/%s" % (gidgetConfigVars['TCGAFMP_DATA_DIR'], tumorString, dateString)
    clin1name = topDir + "/" + "%s.clinical.%s.tsv" % ( tumorString, dateString )
    print clin1name
    allClinDict = tsvIO.readTSV ( clin1name )

    # find out which features are interesting ...

    # BUT IS THIS REALLY COMPLETELY NOT NECESSARY ??? 
    # was this just for debugging purposes ???
    fList = getFeatList ( featureList )

    for aF in fList:
        print aF

        for aKey in allClinDict.keys():

            if ( aKey[1] == ":" ):
                aTokens = aKey.split(':')
                tKey = aTokens[2]