예제 #1
0
# Splits the "<letters><digits>" part of a column header into its two pieces.
_HEADER_REFPOS_PATTERN = r"[^\W\d_]+|\d+"

# Tokenises one data cell into floats, integers and words.
_VARIANT_INFO_PATTERN = r"[-+]?\d*\.\d+|\d+|\w+"


def _parseColumnHeader(header):
    """Split one column header into (gene, reference, position).

    The header must contain exactly one underscore, and the part after it
    must tokenise into exactly two pieces (a run of letters and a run of
    digits); otherwise the tuple unpacking raises ValueError.  The position
    is returned as a string.
    """
    gene, refpos = header.split('_')
    reference, position = re.findall(_HEADER_REFPOS_PATTERN, refpos)
    return gene, reference, position


def _alleleFrequencies(reference, variantInfoTokens):
    """Turn one tokenised data cell into a list of (allele, frequency) pairs.

    The reference allele always comes first, with whatever frequency is left
    after subtracting the alternate allele(s).  Only the token positions read
    below are interpreted; the meaning of the remaining tokens is not visible
    from this function.

    Returns None when the token count matches none of the known layouts.
    """
    tokenCount = len(variantInfoTokens)

    if tokenCount == 3:
        #one alternate allele: token 1 is the allele, token 2 its frequency
        altFrequency = float(variantInfoTokens[2])
        return [(reference, 1.0 - altFrequency),
                (variantInfoTokens[1], altFrequency)]

    if tokenCount == 6:
        #two alternate alleles: tokens 1/2 are the alleles, 4/5 their frequencies
        firstFrequency = float(variantInfoTokens[4])
        secondFrequency = float(variantInfoTokens[5])
        return [(reference, 1.0 - firstFrequency - secondFrequency),
                (variantInfoTokens[1], firstFrequency),
                (variantInfoTokens[2], secondFrequency)]

    if tokenCount == 2:
        #deletion: there is no allele token, token 1 is the frequency
        deletionFrequency = float(variantInfoTokens[1])
        return [(reference, 1.0 - deletionFrequency),
                ('-', deletionFrequency)]

    return None


def loadData(matrixFile, outDir):
    """Parse a tab-separated data matrix into a feature matrix and label vector.

    The line starting with 'GeneName' supplies the column headers.  Every
    other non-blank line is one sample: the first column is the sample name
    and each remaining column is a variant cell.  Cells starting with
    'LOW_DEPTH' are skipped.  Each parsed allele is added both to the
    per-sample Sample object and to an aggregate Sample('all').

    One row is built per sample (in sorted sample-name order).  The columns
    are one frequency per variant known to the aggregate (0 when the sample
    lacks it) followed by one 0/1 presence flag per gene known to the
    aggregate.  The same table is written to '<outDir>/features.xlsx' with
    the sample name in column 0, the class label in column 1 and the
    features from column 2 onwards, under a header row.

    Args:
        matrixFile: path of the tab-separated matrix file to read.
        outDir: existing directory that receives 'features.xlsx'.

    Returns:
        (features, labels): a 2-D numpy array of shape
        (samples, variants + genes) and a 1-D numpy array holding each
        sample's getClassLabel() value (presumably numeric, since it is
        stored in a float array -- confirm against Sample).
    """
    logging.info('Parsing datamatrix')

    headers = []
    columnInfo = {}     #column index -> (gene, reference, position), filled lazily
    sampleSummary = Sample('all')
    samples = {}

    #the context manager guarantees the handle is released even if parsing raises
    with open(matrixFile, 'r') as matrixHandle:
        for line in matrixHandle:
            line = line.rstrip('\r\n')

            #a blank line (e.g. trailing newline at EOF) is not a sample
            if not line.strip():
                continue

            tokens = line.split('\t')

            if line.startswith('GeneName'):
                headers = tokens
                columnInfo = {}
                continue

            sampleName = tokens[0]
            sample = Sample(sampleName)

            for column, variantInfo in enumerate(tokens[1:], 1):
                #parse each header once instead of once per sample
                if column not in columnInfo:
                    columnInfo[column] = _parseColumnHeader(headers[column])
                gene, reference, position = columnInfo[column]

                if variantInfo.startswith('LOW_DEPTH'):
                    continue

                variantInfoTokens = re.findall(_VARIANT_INFO_PATTERN, variantInfo)
                alleles = _alleleFrequencies(reference, variantInfoTokens)

                if alleles is None:
                    #unexpected cell layout; report it and move on
                    print(variantInfoTokens)
                    logging.warning('Sample with more than two alternate alleles - %s %s' % (sampleName, variantInfo))
                    continue

                for allele, frequency in alleles:
                    sample.addVariant(gene, reference, allele, position, frequency)
                    sampleSummary.addVariant(gene, reference, allele, position, frequency)

            samples[sampleName] = sample

    logging.info('Building feature vectors')

    #dump the array to a spreadsheet for debugging
    spreadsheet = xlsxwriter.Workbook('%s/features.xlsx' % outDir)
    featureWorksheet = spreadsheet.add_worksheet('Features')

    #the feature layout is the same for every sample, so compute it once
    variants = sorted(sampleSummary.getVariants())
    genes = sorted(sampleSummary.getGenes())
    variantCount = len(variants)

    #variant keys are split as gene_position_reference_variant
    variantKeys = [variant.split('_') for variant in variants]

    #spreadsheet columns 0 and 1 hold the sample name and class label
    firstVariantColumn = 2
    firstGeneColumn = firstVariantColumn + variantCount

    #header row: gene headers now sit directly above their data columns
    #(they used to be written one column too far left, clobbering the
    #last variant header)
    for featureNumber, variant in enumerate(variants):
        featureWorksheet.write(0, firstVariantColumn + featureNumber, variant)
    for featureNumber, gene in enumerate(genes):
        featureWorksheet.write(0, firstGeneColumn + featureNumber, gene)

    features = np.zeros((len(samples), variantCount + len(genes)))
    labels = np.zeros((len(samples)))

    for sampleNumber, sampleName in enumerate(sorted(samples)):
        sample = samples[sampleName]
        row = sampleNumber + 1

        classLabel = sample.getClassLabel()
        labels[sampleNumber] = classLabel

        featureWorksheet.write(row, 0, sample.getSampleName())
        featureWorksheet.write(row, 1, classLabel)

        #variant-level features: allele frequency, 0 when absent
        for featureNumber, (geneName, position, referenceBase, variantBase) in enumerate(variantKeys):
            if sample.hasVariant(geneName, referenceBase, variantBase, position):
                frequency = sample.getVariantFrequency(geneName, referenceBase, variantBase, position)
                features[sampleNumber, featureNumber] = frequency
                featureWorksheet.write(row, firstVariantColumn + featureNumber, frequency)
            else:
                featureWorksheet.write(row, firstVariantColumn + featureNumber, 0)

        #gene-level features: 1 when the sample has the gene, else 0
        for featureNumber, gene in enumerate(genes):
            if sample.hasGene(gene):
                features[sampleNumber, variantCount + featureNumber] = 1
                featureWorksheet.write(row, firstGeneColumn + featureNumber, 1)
            else:
                featureWorksheet.write(row, firstGeneColumn + featureNumber, 0)

    spreadsheet.close()

    #debug output kept from the original implementation
    print(features)
    print(labels)
    return (features, labels)