def _altAlleleFrequencies(variantInfoTokens):
    """Return [(alternateAllele, frequency), ...] for one tokenized matrix cell.

    The token layout is selected purely by the number of tokens:
      * 3 tokens - one alternate allele: allele at [1], frequency at [2]
      * 6 tokens - two alternate alleles: alleles at [1] and [2],
                   frequencies at [4] and [5]
      * 2 tokens - a deletion (recorded as allele '-'): frequency at [1]

    Returns None when the token count matches none of these layouts.
    Raises ValueError if a frequency token cannot be converted to float.
    """
    tokenCount = len(variantInfoTokens)
    if tokenCount == 3:
        return [(variantInfoTokens[1], float(variantInfoTokens[2]))]
    if tokenCount == 6:
        return [(variantInfoTokens[1], float(variantInfoTokens[4])),
                (variantInfoTokens[2], float(variantInfoTokens[5]))]
    if tokenCount == 2:
        return [('-', float(variantInfoTokens[1]))]
    return None


def loadData(matrixFile, outDir):
    """Parse a tab-separated data matrix and build the feature matrix and labels.

    The header line is the one starting with 'GeneName'. Every header column
    after the first is expected to look like '<gene>_<reference><position>'
    (exactly one underscore, then a run of letters and a run of digits).
    Every other non-blank line is one sample: the sample name followed by one
    variant-info cell per header column. Cells starting with 'LOW_DEPTH' are
    skipped. Blank lines are ignored.

    Each parsed variant is added both to the per-sample Sample object and to a
    summary Sample named 'all'. The feature matrix has one row per sample
    (sorted by sample name) and one column per summary variant (sorted),
    holding the variant frequency, followed by one column per summary gene
    (sorted), holding 1 when the sample has that gene and 0 otherwise.

    Side effects: writes the same table to '<outDir>/features.xlsx' (sample
    name in column 0, class label in column 1, features from column 2 on) and
    prints the feature matrix and the label vector to stdout.

    Arguments:
        matrixFile -- path of the tab-separated matrix to read
        outDir     -- directory in which 'features.xlsx' is created

    Returns:
        (features, labels) -- 2-D and 1-D numpy arrays as described above.
    """
    logging.info('Parsing datamatrix')

    #compile once, these are applied to every cell of the matrix
    headerPattern = re.compile(r"[^\W\d_]+|\d+")
    variantPattern = re.compile(r"[-+]?\d*\.\d+|\d+|\w+")

    headers = []
    #column index -> (gene, reference, position); filled lazily so a header
    #column is parsed once and only if some sample row actually uses it
    headerCache = {}
    sampleSummary = Sample('all')
    samples = {}

    #the with-block guarantees the handle is closed even if parsing raises
    with open(matrixFile, 'r') as matrixHandle:
        for line in matrixHandle:
            line = line.rstrip('\n')

            #a blank line would otherwise become a sample with an empty name
            if not line.strip():
                continue

            tokens = line.split('\t')

            if line.startswith('GeneName'):
                headers = tokens
                headerCache = {}
                continue

            sampleName = tokens[0]
            sample = Sample(sampleName)

            for column, variantInfo in enumerate(tokens[1:], 1):
                if column not in headerCache:
                    gene, refpos = headers[column].split('_')
                    reference, position = headerPattern.findall(refpos)
                    headerCache[column] = (gene, reference, position)
                gene, reference, position = headerCache[column]

                if variantInfo.startswith('LOW_DEPTH'):
                    continue

                variantInfoTokens = variantPattern.findall(variantInfo)
                altAlleles = _altAlleleFrequencies(variantInfoTokens)
                if altAlleles is None:
                    #this really shouldn't happen
                    print(variantInfoTokens)
                    logging.warning('Sample with more than two alternate alleles - %s %s', sampleName, variantInfo)
                    continue

                #the reference allele gets whatever frequency the alternates leave over
                referenceFrequency = 1.0
                for _, frequency in altAlleles:
                    referenceFrequency -= frequency

                for allele, frequency in [(reference, referenceFrequency)] + altAlleles:
                    sample.addVariant(gene, reference, allele, position, frequency)
                    sampleSummary.addVariant(gene, reference, allele, position, frequency)

            samples[sampleName] = sample

    #now build the feature vectors: all the variants followed by all the genes
    logging.info('Building feature vectors')

    #dump the array to a spreadsheet for debugging
    spreadsheet = xlsxwriter.Workbook('%s/features.xlsx' % outDir)
    featureWorksheet = spreadsheet.add_worksheet('Features')

    #these do not depend on the sample, so compute them once
    variants = sorted(sampleSummary.getVariants())
    genes = sorted(sampleSummary.getGenes())
    geneOffset = len(variants)
    #spreadsheet columns 0 and 1 hold the sample name and the class label
    sheetOffset = 2

    features = np.zeros((len(samples), len(variants) + len(genes)))
    labels = np.zeros((len(samples)))

    #header row, written once; variant names are '<gene>_<pos>_<ref>_<alt>'
    variantKeys = []
    for featureNumber, variant in enumerate(variants):
        geneName, position, referenceBase, variantBase = variant.split('_')
        variantKeys.append((geneName, referenceBase, variantBase, position))
        featureWorksheet.write(0, featureNumber + sheetOffset, variant)
    for featureNumber, gene in enumerate(genes):
        #same column as the gene's data cells below (this used to be one
        #column to the left, clobbering the last variant header)
        featureWorksheet.write(0, featureNumber + geneOffset + sheetOffset, gene)

    #now populate the matrix, one row per sample
    for sampleNumber, sampleName in enumerate(sorted(samples)):
        sample = samples[sampleName]
        row = sampleNumber + 1
        classLabel = sample.getClassLabel()

        labels[sampleNumber] = classLabel
        featureWorksheet.write(row, 0, sample.getSampleName())
        featureWorksheet.write(row, 1, classLabel)

        #variant level: frequency, or 0 when the sample lacks the variant
        for featureNumber, variantKey in enumerate(variantKeys):
            frequency = 0
            if sample.hasVariant(*variantKey):
                frequency = sample.getVariantFrequency(*variantKey)
                features[sampleNumber][featureNumber] = frequency
            featureWorksheet.write(row, featureNumber + sheetOffset, frequency)

        #gene level: presence flag
        for featureNumber, gene in enumerate(genes):
            present = 0
            if sample.hasGene(gene):
                present = 1
                features[sampleNumber][featureNumber + geneOffset] = 1
            featureWorksheet.write(row, featureNumber + geneOffset + sheetOffset, present)

    spreadsheet.close()

    #debug output kept from the original implementation
    print(features)
    print(labels)
    return features, labels