示例#1
0
def get_format_data(data=None, fileName=None):
    '''
    Parse every record of *data* into a per-record result entry.

    When *data* is None it is loaded with getData3(fileName). For each
    index i, the second field of the record (data[i][1]) is passed to
    checkAllcancer, and the output of get_section for that same field is
    stored under the 'content' key of the entry.

    Returns the tuple (data, result); result is keyed by record index.
    '''
    if data is None:
        data = getData3(fileName)

    parsed = defaultdict(list)
    for idx in range(len(data)):
        note = data[idx][1]
        entry = checkAllcancer(note)
        parsed[idx] = entry
        entry['content'] = get_section(note)
    return data, parsed
示例#2
0
def baseDB(dbName=None):
    '''
    Rebuild the base value database from every CSV file under ./data.

    The database *dbName* (default "Valdb.data") is destroyed first. Each
    ./data/*.csv file is then loaded with getData3, converted with
    get_collection and added to the database with valdb_add.

    Returns the value of the last valdb_add call, or None when no CSV
    file was found.
    '''
    from glob import glob

    if dbName is None:
        dbName = "Valdb.data"
    valdb_destroy(dbName)

    # Bound before the loop so that an empty ./data directory returns None
    # instead of raising UnboundLocalError at the return statement.
    valdb = None
    for f in glob("./data/*.csv"):
        collection = get_collection(getData3(f))
        valdb = valdb_add(collection, dbName=dbName)
    return valdb
示例#3
0
                result[clean_key] = clean_val
                result = get_subcontent(result,info,sub_content)
        i=j
        #print 'result',result
    return result

def get_format_data(data=None, fileName=None):
    '''
    Build a structured result entry for each record in *data*.

    If *data* is None, getData3(fileName) supplies it. The second field of
    each record (data[i][1]) is handed to checkAllcancer; the value that
    get_section returns for the same field is then attached to the entry
    under the 'content' key.

    Returns (data, result), with result indexed by record position.
    '''
    if data is None:
        data = getData3(fileName)

    formatted = defaultdict(list)
    for position in range(len(data)):
        text = data[position][1]
        record = checkAllcancer(text)
        formatted[position] = record
        record['content'] = get_section(text)
    return data, formatted
    
if __name__ == '__main__':
    # Script entry point: load the ovarian data set and parse every record.
    # The former "if 'data' not in locals(): pass" guard did nothing (the
    # data was always reloaded on the next line), so it has been dropped.
    #from file_utilities import match_encounter_id
    data = getData3('./data/ovarian.csv')
    data,result = get_format_data(data)
    #new_result = match_encounter_id(data,result)
    
    
    
    
    
示例#4
0
                k_clean = keydb_clean_string(k)
                if len(k_clean)==0:
                    break
                k_clean = k_clean[0]
#                k_clean = k                          
                value = item[k][0]
                if value!='' and value!="_":
                    if collection.get(k_clean)==None:
                        collection[k_clean]=[]
                    value = value.replace("_","")
                    collection[k_clean].append(value.lower())
        i+=1
    return collection
    
if __name__ == '__main__':
    # Script entry point: compute the value score for selected keys.
    data = getData3()
    collection_score = {}
    collection = get_collection(data)
    # For testing every key instead of a hand-picked one, use:
    #     key = collection.keys()
    key = ['beyond pelvis']
    for k in key:
        # Both databases are destroyed before this key is scored.
        valdb_destroy('Valdb.data')
        valdb_destroy('Valdb_wordcount.data')
        v = collection[k]
        # The entry for k holds the collected values first, then the score.
        collection_score.setdefault(k, []).append(v)

        # This is where you get the value score
        dbVal,dbVal_wordcount,score = score_fromdb(v)
        collection_score[k].append(score)
示例#5
0
def keydb_build():
    '''
    Build the key databases from every CSV file in the ./data folder.

    When the build finishes there should be one *.data database per CSV,
    where * is the cancer name (get_name(f)); all of them are built in
    batch under the same directory (not the keydb directory).

    For each ./data/*.csv file the data is loaded with getData3 and passed
    through get_format_data. The statements further down the loop body add
    each note (value[1]) to the marginal database get_name(f)+'.data' and
    record elapsed times in `times`, which is printed at the end.

    Takes no arguments and has no return statement.

    NOTE(review): as written, the unconditional `continue` after the key
    scan skips the loading/adding/timing statements for every file, so no
    database is actually populated. See the notes inside the loop.
    '''
    from get_data_breast import get_format_data
    from file_utilities import getData3
    start = time.time()
    start0 = start
    times = []
    # times collects [label, elapsed_seconds] entries, one per timed step.

    # The string literal below is disabled code (database clean-up) kept
    # for reference; it is never executed.
    '''
    #clean all databases
    keydb_destroy()    
    keydb_marginal_destroy()
    elapsed = time.time()-start
    print 'destroy keydb and marginal db finished. elapsed time=',elapsed,'s'
    times.append((['destroy keydb and marginal db',elapsed]))    
    start = time.time()    
    '''
    
    
    #get file list
    from glob import glob
    files = glob('./data/*.csv')
    
    #lengths = []
    for f in files:
        #destroy the keydb for each data
        #keydb_marginal_destroy(get_name(f)+'.data')
        #continue
        #get data
        data = getData3(f)     
        data,result = get_format_data(data)
        # Scan the parsed result for keys containing at least two
        # underscores, skipping a top-level entry named 'content'.
        for cancer in result.keys():
            if cancer == 'content':
                continue
            for key in result[cancer].keys():
                if len(re.findall('\_',key))>=2:
                    # NOTE(review): `result` is used as a mapping above
                    # (.keys(), result[cancer]); if it is a dict it has no
                    # append() and this line raises AttributeError.
                    # Confirm which accumulator was intended.
                    result.append([cancer,result[cancer][key]])
        # NOTE(review): this unconditional `continue` moves on to the next
        # file, so everything below it in the loop body is unreachable.
        # It looks like leftover debugging -- confirm before removing.
        continue
        #lengths.append([f,len(data)])
        #continue
        elapsed = time.time()-start
        print 'laoding data finished. elapsed time=',elapsed,'s'   
        times.append((['loading data '+get_name(f),elapsed]))
        start = time.time()
        
    
        
        ###################test
        #testNote = '\nUTERINE CANCER STAGING SUMMARY\nd0 d1:data1\nd0 d1 d3:data3\nd1 d2: data2\nd1 d2: data3\n\nAmerican Joint Committee on Cancer (2009) Tumor-Node-Metastasis (TNM) staging for endometrial cancer:\nTumor (T):\t\tpT1a\nNodes (N):\t\tpN0\nMetastasis (M):\tpMX\n\n'
        #testResult = keydb_get_note(testNote)
        #keydb_marginal_add_note(testNote)
        #realResult = testResult.copy()
        #for key in testResult.keys():
        #    realResult[key]= keydb_marginal_newkey(key)
        ###################test over
        

        #load value  
        i=0
        for value in data:
            i+=1
            tempStart = time.time()
            #adding the note to db. 
            #keydb_marginal_add_note(value[1])
            #adding the note to specific db, namely breast.data etc
            keydb_marginal_add_note(value[1],dbName = get_name(f)+'.data' )

            # Progress line: record counter, total records, seconds spent
            # on this note.
            print i,'/',len(data), time.time() - tempStart            
            #valdb_add_note(value[1])
            
        
        
        elapsed = time.time()-start
        print 'add note to marginal db finished. elapsed time=',elapsed,'s'   
        times.append((['adding data ' +get_name(f),elapsed]))
        start = time.time()
        
        # The string literals below are disabled code (key/value scoring
        # and alternative ways of building the database); never executed.
        '''
        #detect keys, excluding those of content
        marginal_result=result.copy()
        marginaldb = keydb_marginal_load()
        valuedb = valuedb_load()
        for key1,value1 in result.items():
            for key2,value2 in value1.items():
                if key2 == 'content':
                    continue
                else:
                    for key in value2.keys():
                        marginal_result[key1][key2][key]['keyscore'] = keydb_marginal_newkey(key,marginaldb=marginaldb)
                        marginal_result[key1][key2][key]['valuescore'] = valuedb_newvalue(value2[key],valuedb = valuedb)
                        
        elapsed = time.time()-start
        print 'detecting key finished. elapsed time=',elapsed,'s'  
        times.append((['detecting data',elapsed]))
        start = time.time()                               
        '''
        
        
        '''
        keydb_add_result(result)
        keydb = keydb_load()
        keydb_marginal_destroy()
        keydb_marginal_add_db(keydb)
        marginaldb = keydb_marginal_load()
        cResult = {}
        test = result[1][result[1].keys()[1]].keys()[0]
        
        marginal = keydb_marginal_marginal(test)
        chained = keydb_marginal_chained(test)
        
        '''
        
        ''' ALTERNATIVE WAYS TO GET DB'''
        ''' USING GET_KEY_FREQ ROUTINE
        db = {}
        for record in result.values():
            db=dict_add(db,get_key_freq(record))
        keydb_add(db)
        ### USING ADD_NOTE_KEYDB ROUTINE
        db = {}
        for value[1] in data.values() as record:
            db=dict_add(db,keydb_get_note(record))
        keydb_add(db)
        '''
        # Total time is measured from start0, i.e. since the function began.
        elapsed = time.time()-start0
        print 'finished exectuing. elapsed time=',elapsed,'s'
        times.append(['total time '+get_name(f),elapsed])
    print(times)