Python Sample.getVariantFrequency 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: sample

클래스/타입: Sample

메소드/함수: getVariantFrequency

hotexamples.com에서의 예제들: 1

Python Sample.getVariantFrequency - 1개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python sample.Sample.getVariantFrequency의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질 향상에 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

Sample(30)

evaluate(6)

collect_sample(5)

addallhistory(5)

get_flywheel_change(3)

get_sample_flywheel_updates(3)

calc(2)

getAllLocOnPlate(2)

getAllOnPlate(2)

create_sample(2)

create_empty_database(2)

copy_from(2)

clearSystematics(2)

generate_x_y(2)

calc_speed(2)

from_wavfile(2)

bestCMu(2)

get_image(2)

afMeasurements(2)

__init__(2)

get_alphanum_percentage(1)

get(1)

hasVariant(1)

hasGene(1)

getClassLabel(1)

getData(1)

getGenes(1)

getSampleName(1)

getVariantFrequency(1)

getVariants(1)

get_alphanum_count(1)

get_sample_data(1)

get_avg_word_len(1)

get_id_and_sample(1)

get_sample_and_text(1)

get_char_appearances(1)

get_sample(1)

get_char_count(1)

get_pv_sample(1)

get_load_sample_rtp(1)

get_ev_sample(1)

get_fragment(1)

get_load_sample_nrtp(1)

get_fragment_with_interval(1)

A(1)

fromRaw(1)

from_values(1)

call(1)

Mu(1)

Type(1)

예제 #1

파일 보기

파일: matrix_analyzer.py 프로젝트: dyermd/legos

def _parseAlternateAlleles(variantInfo):
    """Tokenize one data-matrix cell into its alternate alleles.

    The cell is tokenized with a regex and interpreted by token count:

    * 3 tokens - one alternate allele: ``tokens[1]`` is the allele and
      ``tokens[2]`` its frequency.
    * 6 tokens - two alternate alleles: ``tokens[1]`` / ``tokens[2]`` are
      the alleles and ``tokens[4]`` / ``tokens[5]`` their frequencies.
    * 2 tokens - a deletion, recorded as allele ``'-'`` with frequency
      ``tokens[1]``.

    Returns:
        A ``(tokens, alleles)`` tuple. ``alleles`` is a list of
        ``(allele, frequency)`` pairs, or ``None`` when the token count
        matches none of the layouts above.

    Raises:
        ValueError: if a frequency token cannot be converted to float.
    """
    tokens = re.findall(r"[-+]?\d*\.\d+|\d+|\w+", variantInfo)
    count = len(tokens)

    if count == 3:
        alleles = [(tokens[1], float(tokens[2]))]
    elif count == 6:
        alleles = [(tokens[1], float(tokens[4])),
                   (tokens[2], float(tokens[5]))]
    elif count == 2:
        alleles = [('-', float(tokens[1]))]
    else:
        alleles = None

    return tokens, alleles


def loadData(matrixFile, outDir):
    """Parse a tab-separated data matrix into feature and label arrays.

    The line starting with ``GeneName`` is the header row; every other
    non-blank line is one sample whose first column is the sample name and
    whose remaining columns are variant cells. Header cells have the form
    ``<gene>_<reference><position>``. Cells starting with ``LOW_DEPTH``
    are skipped. For every other recognized cell the reference allele
    (with frequency ``1.0`` minus the alternate frequencies) and each
    alternate allele are added both to the sample and to an aggregate
    ``Sample('all')`` that defines the feature columns.

    Side effects: writes ``<outDir>/features.xlsx`` (worksheet
    ``Features``) mirroring the feature matrix, and prints both returned
    arrays to stdout.

    Args:
        matrixFile: path of the tab-separated data matrix to read.
        outDir: directory in which ``features.xlsx`` is created.

    Returns:
        ``(features, labels)``. ``features`` is a NumPy array with one row
        per sample (in sorted sample-name order) and one column per
        variant followed by one column per gene, both in sorted order;
        variant columns hold the variant frequency and gene columns hold
        1 when the sample has the gene. ``labels`` holds each sample's
        class label in the same row order.
    """
    logging.info('Parsing datamatrix')

    headers = []
    # column index -> (gene, reference, position); filled lazily so each
    # header cell is parsed once instead of once per sample row
    parsedHeaders = {}
    sampleSummary = Sample('all')
    samples = {}

    # the context manager closes the file even if parsing raises
    with open(matrixFile, 'r') as matrix:
        for line in matrix:
            line = line.rstrip('\r\n')

            # a blank line carries no sample; skipping it avoids creating
            # a bogus sample with an empty name
            if not line:
                continue

            tokens = line.split('\t')

            # the header line only needs to be remembered
            if line.startswith('GeneName'):
                headers = tokens
                parsedHeaders = {}
                continue

            sampleName = tokens[0]
            sample = Sample(sampleName)

            for column, variantInfo in enumerate(tokens[1:], 1):
                if column not in parsedHeaders:
                    gene, refpos = headers[column].split('_')
                    reference, position = re.findall(r"[^\W\d_]+|\d+", refpos)
                    parsedHeaders[column] = (gene, reference, position)
                gene, reference, position = parsedHeaders[column]

                if variantInfo.startswith('LOW_DEPTH'):
                    continue

                variantInfoTokens, alleles = _parseAlternateAlleles(variantInfo)
                if alleles is None:
                    # this really shouldn't happen
                    print(variantInfoTokens)
                    logging.warning('Sample with more than two alternate alleles - %s %s', sampleName, variantInfo)
                    continue

                # the reference allele takes whatever frequency is left
                referenceFrequency = 1.0
                for _, frequency in alleles:
                    referenceFrequency -= frequency

                # reference first, then each alternate, recorded on both
                # the sample and the aggregate summary
                for allele, frequency in [(reference, referenceFrequency)] + alleles:
                    sample.addVariant(gene, reference, allele, position, frequency)
                    sampleSummary.addVariant(gene, reference, allele, position, frequency)

            samples[sampleName] = sample

    # for now the features are all the variants followed by all the genes
    logging.info('Building feature vectors')

    # dump the array to a spreadsheet for debugging
    spreadsheet = xlsxwriter.Workbook('%s/features.xlsx' % outDir)
    featureWorksheet = spreadsheet.add_worksheet('Features')

    # loop-invariant column definitions, computed once
    variants = sorted(sampleSummary.getVariants())
    genes = sorted(sampleSummary.getGenes())
    variantCount = len(variants)
    # variant keys have the form gene_position_reference_variant
    variantKeys = [variant.split('_') for variant in variants]

    features = np.zeros((len(samples), variantCount + len(genes)))
    labels = np.zeros((len(samples)))

    # header row: columns 0 and 1 hold the sample name and class label, so
    # feature i lives in spreadsheet column i + 2
    for featureNumber, variant in enumerate(variants):
        featureWorksheet.write(0, featureNumber + 2, variant)
    for featureNumber, gene in enumerate(genes):
        # + 2 keeps gene headers aligned with the gene data cells below
        featureWorksheet.write(0, featureNumber + variantCount + 2, gene)

    for sampleNumber, sampleName in enumerate(sorted(samples)):
        sample = samples[sampleName]
        row = sampleNumber + 1
        classLabel = sample.getClassLabel()
        labels[sampleNumber] = classLabel

        featureWorksheet.write(row, 0, sample.getSampleName())
        featureWorksheet.write(row, 1, classLabel)

        # first the variant-level features
        for featureNumber, variantKey in enumerate(variantKeys):
            geneName, position, referenceBase, variantBase = variantKey
            frequency = 0
            if sample.hasVariant(geneName, referenceBase, variantBase, position):
                frequency = sample.getVariantFrequency(geneName, referenceBase, variantBase, position)
                features[sampleNumber][featureNumber] = frequency
            featureWorksheet.write(row, featureNumber + 2, frequency)

        # then the gene-level features
        for featureNumber, gene in enumerate(genes):
            present = 1 if sample.hasGene(gene) else 0
            if present:
                features[sampleNumber][featureNumber + variantCount] = 1
            featureWorksheet.write(row, featureNumber + variantCount + 2, present)

    spreadsheet.close()

    # debugging output, kept from the original implementation
    print(features)
    print(labels)
    return features, labels