def get_format_data(data=None, fileName=None):
    """Load note records (when not supplied) and parse each one.

    Parameters
    ----------
    data : list, optional
        Sequence of records; field ``[1]`` of each record holds the note
        text. When ``None``, records are loaded from ``fileName`` via
        ``getData3``.
    fileName : str, optional
        CSV path handed to ``getData3`` when ``data`` is ``None``.

    Returns
    -------
    tuple
        ``(data, result)`` where ``result[i]`` is the ``checkAllcancer``
        output for record ``i``, augmented with a ``'content'`` entry
        produced by ``get_section``.
    """
    if data is None:
        data = getData3(fileName)
    result = defaultdict(list)
    for idx, record in enumerate(data):
        note_text = record[1]
        parsed = checkAllcancer(note_text)
        parsed['content'] = get_section(note_text)
        result[idx] = parsed
    return data, result
def baseDB(dbName=None):
    """Create the base value database from every CSV under ./data.

    Rebuilds the ``valdb`` database that collects historical data to be
    used as a base database: the existing file is destroyed first, then
    each CSV's collection is added in turn.

    Parameters
    ----------
    dbName : str, optional
        Target database file name; defaults to ``"Valdb.data"``.

    Returns
    -------
    The handle returned by the last ``valdb_add`` call, or ``None`` when
    no CSV files were found.
    """
    from glob import glob

    if dbName is None:
        dbName = "Valdb.data"
    # Start from a clean database file before re-adding everything.
    valdb_destroy(dbName)
    # Bug fix: initialize so an empty ./data directory no longer raises
    # UnboundLocalError at the return below.
    valdb = None
    for f in glob("./data/*.csv"):
        data = getData3(f)
        # create baseDB
        collection = get_collection(data)
        valdb = valdb_add(collection, dbName=dbName)
    return valdb
result[clean_key] = clean_val result = get_subcontent(result,info,sub_content) i=j #print 'result',result return result def get_format_data(data = None,fileName=None): if data is None: data = getData3(fileName) result = defaultdict(list) i=0 while i<len(data): result[i] = checkAllcancer(data[i][1]) result[i]['content'] = get_section(data[i][1]) i+=1 return data,result if __name__ == '__main__': #from file_utilities import match_encounter_id if 'data' not in locals(): pass data = getData3('./data/ovarian.csv') data,result = get_format_data(data) #new_result = match_encounter_id(data,result)
k_clean = keydb_clean_string(k) if len(k_clean)==0: break k_clean = k_clean[0] # k_clean = k value = item[k][0] if value!='' and value!="_": if collection.get(k_clean)==None: collection[k_clean]=[] value = value.replace("_","") collection[k_clean].append(value.lower()) i+=1 return collection if __name__ == '__main__': data = getData3() collection_score = {} collection = get_collection(data) # key = collection.keys() # just for testing, we can get a list of key we want to find value score key = ['beyond pelvis'] for k in key: valdb_destroy('Valdb.data') valdb_destroy('Valdb_wordcount.data') v = collection[k] if collection_score.get(k)==None: collection_score[k] = [] collection_score[k].append(v) #This is where you get the value score dbVal,dbVal_wordcount,score = score_fromdb(v) collection_score[k].append(score)
def keydb_build(): ''' this function is for building all keydbs from all csvs from /data folder. when the build finishes, you will have all *.data where * is the cancer name. batch building all libraries under the same directory. (not keydb directory) ''' from get_data_breast import get_format_data from file_utilities import getData3 start = time.time() start0 = start times = [] #times is an array of 1 tuple, for each of the time there is an explanation. ''' #clean all databases keydb_destroy() keydb_marginal_destroy() elapsed = time.time()-start print 'destroy keydb and marginal db finished. elapsed time=',elapsed,'s' times.append((['destroy keydb and marginal db',elapsed])) start = time.time() ''' #get file list from glob import glob files = glob('./data/*.csv') #lengths = [] for f in files: #destroy the keydb for each data #keydb_marginal_destroy(get_name(f)+'.data') #continue #get data data = getData3(f) data,result = get_format_data(data) for cancer in result.keys(): if cancer == 'content': continue for key in result[cancer].keys(): if len(re.findall('\_',key))>=2: result.append([cancer,result[cancer][key]]) continue #lengths.append([f,len(data)]) #continue elapsed = time.time()-start print 'laoding data finished. elapsed time=',elapsed,'s' times.append((['loading data '+get_name(f),elapsed])) start = time.time() ###################test #testNote = '\nUTERINE CANCER STAGING SUMMARY\nd0 d1:data1\nd0 d1 d3:data3\nd1 d2: data2\nd1 d2: data3\n\nAmerican Joint Committee on Cancer (2009) Tumor-Node-Metastasis (TNM) staging for endometrial cancer:\nTumor (T):\t\tpT1a\nNodes (N):\t\tpN0\nMetastasis (M):\tpMX\n\n' #testResult = keydb_get_note(testNote) #keydb_marginal_add_note(testNote) #realResult = testResult.copy() #for key in testResult.keys(): # realResult[key]= keydb_marginal_newkey(key) ###################test over #load value i=0 for value in data: i+=1 tempStart = time.time() #adding the note to db. 
#keydb_marginal_add_note(value[1]) #adding the note to specific db, namely breast.data etc keydb_marginal_add_note(value[1],dbName = get_name(f)+'.data' ) print i,'/',len(data), time.time() - tempStart #valdb_add_note(value[1]) elapsed = time.time()-start print 'add note to marginal db finished. elapsed time=',elapsed,'s' times.append((['adding data ' +get_name(f),elapsed])) start = time.time() ''' #detect keys, excluding those of content marginal_result=result.copy() marginaldb = keydb_marginal_load() valuedb = valuedb_load() for key1,value1 in result.items(): for key2,value2 in value1.items(): if key2 == 'content': continue else: for key in value2.keys(): marginal_result[key1][key2][key]['keyscore'] = keydb_marginal_newkey(key,marginaldb=marginaldb) marginal_result[key1][key2][key]['valuescore'] = valuedb_newvalue(value2[key],valuedb = valuedb) elapsed = time.time()-start print 'detecting key finished. elapsed time=',elapsed,'s' times.append((['detecting data',elapsed])) start = time.time() ''' ''' keydb_add_result(result) keydb = keydb_load() keydb_marginal_destroy() keydb_marginal_add_db(keydb) marginaldb = keydb_marginal_load() cResult = {} test = result[1][result[1].keys()[1]].keys()[0] marginal = keydb_marginal_marginal(test) chained = keydb_marginal_chained(test) ''' ''' ALTERNATIVE WAYS TO GET DB''' ''' USING GET_KEY_FREQ ROUTINE db = {} for record in result.values(): db=dict_add(db,get_key_freq(record)) keydb_add(db) ### USING ADD_NOTE_KEYDB ROUTINE db = {} for value[1] in data.values() as record: db=dict_add(db,keydb_get_note(record)) keydb_add(db) ''' elapsed = time.time()-start0 print 'finished exectuing. elapsed time=',elapsed,'s' times.append(['total time '+get_name(f),elapsed]) print(times)