Example #1
def conf_result():
    '''
    Deprecated.
    Returns the confidence result: the key score and the value score.
    1. Get the arguments from the parser.
    2. Feed the arguments into keydb_marginal_newkey and getScore (from confidenceval).
    3. Return the result as a JSON-formatted string.
    '''
    args = parser.parse_args()
    key = args['key']
    value = args['value']
    cancername = args['cancer']
    if cancername is None or cancername.strip() == '':
        cancername = args.get('cancer_select')
    if cancername is None:
        cancername = request.form.get('cancer_select')
        
    marginaldbname = str(cancername) + '.data'
    if key is None:
        key = request.form.get('key')
        value = request.form.get('value')
        cancername = request.form.get('cancername')
        if cancername is None:
            cancername = request.form.get('cancer_select')
        marginaldbname = str(cancername) + '.data'
    if key is None or value is None:
        return ' '.join([str(item) for item in ['No info', 'key', key, 'value', value, 'cancername', cancername]])
    else:
        keyresult = ''
        try:
            marginaldb = keydb_marginal_load(marginaldbname)
            keyresult = keydb_marginal_newkey(key, marginaldb)
        except Exception as err:
            print(err)
            print('ERROR: key db error')

        stringValResult = ''
        try:
            # getScore returns a space-separated score string (see Example #2)
            stringValResult = getScore(key, value, keydb_marginal_load('Valdb.data'))
        except Exception as err:
            print(err)
            print('ERROR: val db error')
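
The snippet above is cut off before the function's final return, so the two score strings are never sent back. A minimal sketch of one way to package them, assuming the standard-library json module and hypothetical field names key_score/value_score (neither appears in the original source):

import json

def build_conf_response(keyresult, stringValResult):
    # keyresult: key-confidence string from keydb_marginal_newkey
    # stringValResult: value-score string from getScore
    # field names are illustrative, not from the original code
    return json.dumps({'key_score': keyresult, 'value_score': stringValResult})

print(build_conf_response('0.8', 'type 1.0 length 1 wordcount NA token NA'))
# {"key_score": "0.8", "value_score": "type 1.0 length 1 wordcount NA token NA"}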
Example #2
import numpy as np
from collections import defaultdict


def getScore(key, value, valdb=None, add=True):
    """
    This function calculates the value score 
    # score_type : [0,1] among three types => (1) num, (2) text, (3) num_text
    # calculate proportion of a particular type of v with respect to the total frequency
    # The larger, the higher confidence
    #
    # score_length : only apply to num_text type (let score of other types to be 1)
    # calculate proportaion of long or short text with respect to total number of num_text type
    # The larger, the higher confidence
    #
    # score_wordcount : only apply to text type
    # calculate  absolute value of (word count for v - med of c and then divided by std.dev of c) where c is a vector of wordcount
    # The smaller, the higher confidence
    #
    # score_token : only apply to text type (Note: the value can be negative)
    # calculate the difference between frequency of particular token and equal portion (1/total number of tokens) where token is value
    # The larger, the higher confidence
    
    Sample use:
        input : key = 'tumor grade', value = '3'
        output: 'type 1.0 length 1 wordcount NA token NA'
    """
    # load the "Valdb.data" database if one is not supplied
    if valdb is None:
        valdb = keydb_marginal_load("Valdb.data")

    # by default, add the new key/value pair to the database
    if add:
        dictInput = {key: [value]}
        valdb = valdb_add(dictInput)

    score = {}
    dbVal = {}
    dbVal_wordcount = []

    # get the type counts for the current value
    countdict = getCount(value)

    # accumulate type counts over every value stored for this key
    for v2 in valdb.get(key, []):
        countdict_current = getCount(v2)
        dbVal = dict_add(countdict_current, dbVal)
        if countdict_current["text"] == 1:
            dbVal_wordcount.append(len(v2.split(" ")))

    # Type feature: calculate proportion of a particular type of v with respect to the total frequency

    score["Type"] = float(
        (
            countdict["num"] * dbVal["num"]
            + countdict["num_text"] * dbVal["num_text"]
            + countdict["text"] * dbVal["text"]
        )
    ) / float(dbVal["total"])

    # Length feature: only applies to the num_text type (score of other types is 1);
    # proportion of long or short text relative to the total number of num_text values
    if countdict["num_text"] == 1:
        score["Length"] = float(
            (
                countdict["num_text_long"] * dbVal["num_text_long"]
                + countdict["num_text_short"] * dbVal["num_text_short"]
            )
        ) / float(dbVal["num_text"])
    else:
        score["Length"] = 1

    # Wordcount feature and Token feature : only apply to text type
    if countdict["text"] == 1:
        # Wordcount feature

        c = np.array(dbVal_wordcount)  # c is the vector of word counts
        dbVal_wordcount.sort()
        med = dbVal_wordcount[len(dbVal_wordcount) // 2]  # (upper) median of the word counts
        # If std.dev(c) which is the denominator is not 0, calculate the score
        # score = absolute value of (word count for value - med and then divided by std.dev of c)
        if c.std() != 0:
            score["Wordcount"] = abs(float((len(value.split(" ")) - med)) / float(c.std()))
        # If std.dev(c) is 0:
        #   check if word count of value is equal to med then set score to 0 (good case)
        #   otherwise, set score to be 100 (bad case)
        else:
            if len(value.split(" ")) == med:
                score["Wordcount"] = 0
            else:
                score["Wordcount"] = 100

        # Token feature
        label = ["total", "num", "num_text", "text", "num_text_short", "num_text_long"]
        # token of value
        token = list(set(dbVal.keys()) - set(label))
        token_combine = {}  # combine synonym and antonym with original word
        antonym = defaultdict(list)
        remove_item = {}

        for k in token:
            # if k was already absorbed as a synonym or antonym of another token, skip it
            if k in remove_item:
                continue
            flag = 0
            # collect synonyms and antonyms in syn and an, respectively
            syn, an = Syn_Ant(k)
            if an != {}:
                antonym[next(iter(an))] = next(iter(an.values()))
            # If any item in token is a synonym or antonym of k, combine the frequencies
            # and remove that word from the token list; collect the combined frequency
            # in the token_combine dictionary.
            for s in syn:
                if str(s) in token and str(s) != k:
                    remove_item[s] = k
                    token_combine[k] = dbVal[k] + dbVal[str(s)]
                    flag = 1

            for ant in an.values():  # do not shadow the key parameter here
                if str(ant) in token:
                    remove_item[ant] = k
                    token_combine[k] = dbVal[k] + dbVal[str(ant)]
                    flag = 1

            # If there is no synonym or antonym of k contained in token list, collect the frequency from dbVal
            if flag == 0:
                token_combine[k] = dbVal[k]

        # Calculate the score for each element in the original token list
        for k in list(set(dbVal.keys()) - set(label)):
            # only score the token that equals the current value
            if k != value:
                continue
            if token_combine.get(k) is not None:
                num_token = token_combine[k]
            else:
                num_token = token_combine[remove_item[k]]

            eq_portion = float(1) / float(len(token_combine))
            percentage = float(num_token) / float(dbVal["total"])
            score["Token"] = float(percentage - eq_portion)

    # if value is not text type, Wordcount and Token features are "NA"
    else:
        score["Wordcount"] = "NA"
        score["Token"] = "NA"

    # for a new data format (to be combined with Abstractor)
    score_type = score["Type"]
    score_length = score["Length"]
    score_wordcount = score["Wordcount"]
    score_token = score["Token"]

    # return {'type':score_type,'length':score_length,'wordcount':score_wordcount,'token':score_token}
    return " ".join(
        [
            str(item)
            for item in ["type", score_type, "length", score_length, "wordcount", score_wordcount, "token", score_token]
        ]
    )
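
For intuition, here is a minimal, self-contained sketch of the Type feature. The real getCount, dict_add, valdb_add, and Syn_Ant helpers and the Valdb.data database are not shown on this page, so the toy classifier and the sample history below are illustrative only:

# Illustrative stand-in for the real getCount: classify a value as
# num, text, or num_text and return the same count layout.
def toy_getCount(v):
    has_num = any(ch.isdigit() for ch in v)
    has_alpha = any(ch.isalpha() for ch in v)
    return {
        "total": 1,
        "num": int(has_num and not has_alpha),
        "text": int(has_alpha and not has_num),
        "num_text": int(has_num and has_alpha),
    }

# Toy history of values previously stored for the key 'tumor grade'.
history = ["3", "2", "high grade", "3"]
dbVal = {"total": 0, "num": 0, "text": 0, "num_text": 0}
for v2 in history:
    counts = toy_getCount(v2)
    for label in dbVal:
        dbVal[label] += counts[label]

# Type score for a new value: the proportion of stored values that share
# its type, exactly as in the score["Type"] formula above.
countdict = toy_getCount("3")
score_type = float(
    countdict["num"] * dbVal["num"]
    + countdict["num_text"] * dbVal["num_text"]
    + countdict["text"] * dbVal["text"]
) / float(dbVal["total"])
print(score_type)  # 0.75 -- three of the four stored values are numeric

Because "3" is numeric and three of the four stored values are numeric, the Type score is 0.75; a num_text value such as "grade 3" would score 0.0 against this history.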
Example #3
def Extract():
    '''
    Deprecated. Use jsontest instead.
    A simple web interface for testing: the user only needs to supply
    text and a universe_id.
    Returns:
        JSON-formatted result built from the given text and universe_id
    '''
    args = parser.parse_args()
    note = args['text']
    cancerName = args['universe_id']
    print('args:', args)
    print('form:', request.form)
    # should have checked the form first, then the args
    if note is None:
        note = request.form.get('data')
        cancerName = request.form.get('cancer')

    if cancerName is None:
        cancerName = ''

    if note is None or note.strip() == '':
        return 'No info'
    else:
        result = {}
        try:
            result = checkAllcancer(note)
            result_confidence = result.copy()
            for cancer in result.keys():
                if cancerName.strip() != '':
                    marginaldbname = cancerName.lower() + '.data'
                else:
                    marginaldbname = None
                print('marginaldbname:', marginaldbname)
                if marginaldbname is not None:
                    marginaldb = keydb_marginal_load(marginaldbname)
                else:
                    marginaldb = keydb_marginal_load()
                ########################################
                # the code below computes the confidence scores
                ########################################
                for k, v in result[cancer].items():
                    # note that v is a list containing the value and the original value
                    value = v[0]
                    # value processing could go here

                    # key-confidence processing needs a library indicating which
                    # universe the key belongs to; here we simply reuse the
                    # pre-existing per-cancer libraries (e.g. breast.data for breast cancer)
                    try:
                        result_confidence[cancer][k].append(keydb_marginal_newkey(k, value, marginaldb, marginaldbname, True))
                    except Exception as err:
                        print('ERROR: key_confidence failed')
                        print(err)
                    try:
                        # getScore returns a space-separated score string (see Example #2)
                        value_score = getScore(k, value, keydb_marginal_load('Valdb.data'))
                        result_confidence[cancer][k].append(value_score)
                    except Exception as err:
                        print('ERROR: value_confidence failed')
                        print(err)

            result['specimens'] = get_section(note)
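
This snippet is also truncated: the outer try opened above never reaches its except clause or a return in what is shown. For completeness, a hypothetical client call; the route path and host are assumptions, since the Flask route decorator is not shown in the snippet:

import requests

# POST the note text and cancer name as form fields, mirroring the
# request.form.get('data') / request.form.get('cancer') lookups above.
resp = requests.post(
    'http://localhost:5000/extract',  # assumed URL, not from the source
    data={'data': 'Tumor grade: 3.', 'cancer': 'breast'},
)
print(resp.text)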