def conf_result():
    '''
    deprecated. returns the confidence result. Keyscore and value score.
    1. get arguments from parser
    2. use arguments to feed into our keydb_marginal_newkey and get_score from confidenceval
    3. return the result in json format.
    '''
    # NOTE(review): `parser`, `request`, and the keydb_* helpers are module
    # globals defined outside this view — presumably flask-restful's reqparse
    # and Flask's request proxy; confirm against the file header.
    args = parser.parse_args()
    key = args['key']
    value = args['value']
    cancername = args['cancer']
    # Fall back to the 'cancer_select' field (parsed args first, then the raw
    # form) when the cancer name came through blank.
    if cancername.strip()=='':
        cancername = args.get('cancer_select')
        if cancername == None:
            cancername = request.form.get('cancer_select')
    # The marginal key database file is named after the cancer type.
    marginaldbname = str(cancername)+'.data'
    # If the parsed args carried no key at all, retry everything from the raw
    # form fields and rebuild the database name.
    if key == None:
        key = request.form.get('key')
        value = request.form.get('value')
        cancername = request.form.get('cancername')
        if cancername==None:
            cancername = request.form.get('cancer_select')
        marginaldbname = str(cancername)+'.data'
    if key==None or value == None:
        # Still missing required input: echo back what was (not) received.
        return ' '.join([str(item) for item in ['No info', 'key',key,'value',value,'cancername',cancername]])
    else:
        # Key confidence score from the marginal key database (best effort:
        # a database failure only logs and leaves keyresult empty).
        keyresult = ''
        try:
            marginaldb = keydb_marginal_load(marginaldbname)
            keyresult = keydb_marginal_newkey(key,marginaldb)
        except Exception,err:
            print err
            print 'ERROR: key db error'
        # Value confidence score from the value database (best effort).
        # NOTE(review): getScore (defined below) returns a string, so the
        # .values() call raises AttributeError which this except swallows,
        # leaving stringValResult == ''. Looks like a latent bug; the route
        # is deprecated, so it is flagged rather than changed — TODO confirm.
        stringValResult = ''
        try:
            valresult = getScore(key,value,keydb_marginal_load('Valdb.data'))
            stringValResult = ' '.join([str(item) for item in valresult.values()])
        except Exception, err:
            print err
            print 'ERROR: val db error'
        # NOTE(review): no value is returned on this path — the function falls
        # through and returns None despite the docstring's promise of a JSON
        # result. Verify whether any caller depends on this before fixing.
def getScore(key, value, valdb=None, add=True):
    """Calculate the value-confidence score for a (key, value) pair.

    Four sub-scores are computed:

    # score_type : [0,1] among three types => (1) num, (2) text, (3) num_text
    #   proportion of the value's type with respect to the total frequency
    #   observed for this key. The larger, the higher confidence.
    #
    # score_length : only applies to num_text type (other types score 1)
    #   proportion of long or short text with respect to the total number of
    #   num_text entries. The larger, the higher confidence.
    #
    # score_wordcount : only applies to text type
    #   absolute value of (word count for value - median of c) / std.dev of c,
    #   where c is the vector of word counts seen for this key.
    #   The smaller, the higher confidence.
    #
    # score_token : only applies to text type (Note: the value can be negative)
    #   difference between the frequency of this token and an equal portion
    #   (1 / total number of tokens). The larger, the higher confidence.

    Sample use:
        input : key = 'tumor grade', value = '3'
        output: 'type 1.0 length 1 wordcount NA token NA'

    :param key: field name the value belongs to.
    :param value: candidate value string to score.
    :param valdb: value database mapping key -> list of observed values;
        loaded from "Valdb.data" when None.
    :param add: when True (default) the (key, value) pair is first added to
        the database, so the value always contributes to its own statistics.
    :returns: the string 'type <t> length <l> wordcount <w> token <k>'.
    """
    # load the "Valdb.data" database if the database is not specified
    if valdb is None:
        valdb = keydb_marginal_load("Valdb.data")
    # add new data to the database (default is to add a new value)
    # NOTE(review): valdb_add is an external helper — presumably it persists
    # and returns the updated db; confirm it takes {key: [value]} dicts.
    if add == True:
        dictInput = {key: [value]}
        valdb = valdb_add(dictInput)
    score = {}
    dbVal = {}
    dbVal_wordcount = []
    # get frequency indicators for the current value.
    # getCount is defined elsewhere; the code below reads 0/1 indicator keys
    # "num", "text", "num_text", "num_text_long", "num_text_short" from it.
    countdict = getCount(value)
    # accumulate frequencies over every value recorded under this key
    for k, v in valdb.iteritems():
        if k == key:
            for v2 in valdb[k]:
                countdict_current = getCount(v2)
                dbVal = dict_add(countdict_current, dbVal)
                # track word counts of text-type values for the median/std
                if countdict_current["text"] == 1:
                    dbVal_wordcount.append(len(v2.split(" ")))
    # Type feature: proportion of this value's type among all observed
    # entries. Only one of countdict's indicators is 1, so this selects the
    # matching dbVal frequency.
    score["Type"] = float(
        (
            countdict["num"] * dbVal["num"]
            + countdict["num_text"] * dbVal["num_text"]
            + countdict["text"] * dbVal["text"]
        )
    ) / float(dbVal["total"])
    # Length feature: only applies to num_text type (other types score 1):
    # proportion of long or short text within the num_text entries.
    if countdict["num_text"] == 1:
        score["Length"] = float(
            (
                countdict["num_text_long"] * dbVal["num_text_long"]
                + countdict["num_text_short"] * dbVal["num_text_short"]
            )
        ) / float(dbVal["num_text"])
    else:
        score["Length"] = 1
    # Wordcount feature and Token feature : only apply to text type
    if countdict["text"] == 1:
        # Wordcount feature
        # c is captured BEFORE the in-place sort; order does not affect std().
        c = np.array(dbVal_wordcount)  # c is a vector of word count
        dbVal_wordcount.sort()
        # Python 2 integer division picks the upper-median index.
        med = dbVal_wordcount[len(dbVal_wordcount) / 2]  # median of the word counts
        # If std.dev(c) (the denominator) is non-zero, score is
        # |wordcount(value) - med| / std.dev(c).
        if c.std() != 0:
            score["Wordcount"] = abs(float((len(value.split(" ")) - med)) / float(c.std()))
        # If std.dev(c) is 0: score 0 when the value's word count equals the
        # median (good case), otherwise 100 (bad case).
        else:
            # print 'value',value
            if len(value.split(" ")) == med:
                score["Wordcount"] = 0
            else:
                score["Wordcount"] = 100
        # Token feature
        # dbVal holds both the bookkeeping counters below and one entry per
        # observed token; subtracting `label` leaves just the tokens.
        label = ["total", "num", "num_text", "text", "num_text_short", "num_text_long"]
        # token of value
        token = list(set(dbVal.keys()) - set(label))
        token_combine = {}
        # combine synonym and antonym frequencies with the original word
        antonym = defaultdict(list)
        remove_item = {}
        for k in token:
            # if k was already absorbed as a synonym/antonym of another
            # token, it is not processed again
            if k in remove_item.keys():
                continue
            flag = 0
            # collect synonym and antonym in syn and an, respectively
            # NOTE(review): Syn_Ant is external — presumably WordNet-backed;
            # syn appears to be iterable, an a (possibly empty) dict. Confirm.
            syn, an = Syn_Ant(k)
            if an != {}:
                antonym[an.keys()[0]] = an.values()[0]
            # If any item in token is a synonym or antonym of k, combine the
            # frequencies and remember the word so it is skipped later.
            # Collect the new frequency in the token_combine dictionary.
            for s in syn:
                if str(s) in token and str(s) != k:
                    remove_item[s] = k
                    token_combine[k] = dbVal[k] + dbVal[str(s)]
                    flag = 1
            # NOTE(review): `key` here shadows the function parameter;
            # harmless today because the parameter is never read after this
            # point, but it deserves a rename.
            for key, val in an.iteritems():
                if str(val) in token:
                    remove_item[val] = k
                    token_combine[k] = dbVal[k] + dbVal[str(val)]
                    flag = 1
            # If no synonym/antonym of k appears in the token list, take the
            # frequency straight from dbVal.
            if flag == 0:
                token_combine[k] = dbVal[k]
        # Calculate the score for each element in the original token list
        for k in list(set(dbVal.keys()) - set(label)):
            # only the token equal to the value being scored matters
            if k != value:
                continue
            # use the combined frequency; if k was absorbed into another
            # token, follow the remove_item redirection.
            if token_combine.get(k) is not None:
                num_token = token_combine[k]
            else:
                num_token = token_combine[remove_item[k]]
            eq_portion = float(1) / float(len(token_combine))
            percentage = float(num_token) / float(dbVal["total"])
            score["Token"] = float(percentage - eq_portion)
    # if value is not text type, Wordcount and Token features are "NA"
    else:
        score["Wordcount"] = "NA"
        score["Token"] = "NA"
    # for a new data format (to be combined with Abstractor)
    score_type = score["Type"]
    score_length = score["Length"]
    score_wordcount = score["Wordcount"]
    score_token = score["Token"]
    # return {'type':score_type,'length':score_length,'wordcount':score_wordcount,'token':score_token}
    return " ".join(
        [
            str(item)
            for item in ["type", score_type, "length", score_length, "wordcount", score_wordcount, "token", score_token]
        ]
    )
def Extract(): ''' depracted. Use jsontest instead. this is a nice web interface for testing. user only needs to input text and a univerid, Returns: json formated result from given text and universe_id ''' args = parser.parse_args() note = args['text'] cancerName = args['universe_id'] print 'args:',args print 'form:',request.form #should ahve just check form first then the args. if note == None: note = request.form.get('data') cancerName = request.form.get('cancer') if cancerName is None: cancerName = '' if note == None or note.strip()=='': return 'No info' else: result = {} try: result = checkAllcancer(note) result_confidence= result.copy() for cancer in result.keys(): if cancerName.strip() != '': marginaldbname = cancerName.lower()+'.data' else: marginaldbname=None print 'marginaldbname: ',marginaldbname if marginaldbname is not None: marginaldb = keydb_marginal_load(marginaldbname) else: marginaldb = keydb_marginal_load() ######################################## # the below code is for getting confidence score. ######################################### for k,v in result[cancer].items(): #note that v is a list contains value and original value. value = v[0] #now we can do value processing. #put your code here #now we can do key confidence processing. #it needs a library indicating which unverse it belongs to. #in here we will just try to use our pre-existing libraries. #namely, if you have breast cancer as cancer, then try: result_confidence[cancer][k].append(keydb_marginal_newkey(k,value,marginaldb,marginaldbname,True)) except Exception, err: print 'ERROR: key_confidence failed' print err try: value_score = getScore(k,value,keydb_marginal_load('Valdb.data')) result_confidence[cancer][k].append(' '.join([str(item) for item in value_score.values()])) except Exception, err: print 'ERROR: value_confidence failed' print err result['specimens']=get_section(note)