def sum_dx_and_bl_pos(row):
    """Combine the initial diagnosis with the baseline positivity flag.

    Reads ``row['Init_Diagnosis']`` and ``row['AV45_wcereb_BIN1.11']``.
    Returns ``np.nan`` when either is missing, otherwise the diagnosis
    followed by ``_BLNeg`` (flag equal to 0) or ``_BLPos`` (flag equal to 1).

    Raises:
        Exception: if the positivity flag is neither 0 nor 1.
    """
    diagnosis = row['Init_Diagnosis']
    positivity = row['AV45_wcereb_BIN1.11']
    if isnan(diagnosis) or isnan(positivity):
        return np.nan

    # Map the numeric flag onto the suffix appended to the diagnosis.
    suffix_by_flag = {0.0: 'BLNeg', 1.0: 'BLPos'}
    flag = float(positivity)
    if flag not in suffix_by_flag:
        raise Exception("Unknown positivity: %s" % positivity)
    return '%s_%s' % (diagnosis, suffix_by_flag[flag])
def check_missing_rate():
    """Print how widespread missing values are in the surgery-pain CSV.

    Loads ``DorCirurgiaCategNAReduzido.csv`` and prints, in order:
    the table shape, the fraction of rows that contain at least one missing
    value, and the fraction of columns that contain at least one missing
    value. Missingness is decided by ``utils.isnan``.
    """
    data_path = 'DorCirurgiaCategNAReduzido.csv'  #'Dados/risk_factors_cervical_cancer.csv'
    data = pd.read_csv(data_path,
                       header=0,
                       delimiter=",",
                       na_values=['N/A', 'None', 'nan', 'NAAI', 'NINA'],
                       quoting=0,
                       encoding='utf8',
                       mangle_dupe_cols=False)
    X = data
    print(X.shape)

    # Sets give O(1) membership; the original scanned ever-growing lists for
    # every missing cell, which was quadratic in the number of missing cells.
    patients_missing = set()
    features_missing = set()
    for ix, row in X.iterrows():
        for j in X.columns:
            if utils.isnan(row[j]):
                patients_missing.add(ix)
                features_missing.add(j)

    print(len(patients_missing) / X.shape[0])
    print(len(features_missing) / X.shape[1])
def get_lines_for_label(mdf, label, n, m=0, mins=None):
    '''Build the markdown table row(s) for one label.

    mdf: dataframe containing averages (row ``label`` is read positionally)
    label: label for which table lines are generated
    n: number of total elements for each label in mdf
    m: number of elements per row; 0 (the default) means "do not wrap",
       i.e. a single row holding exactly n cells
    mins: label to be bolded for each item

    Returns a list of strings, each one a markdown table row starting with
    the label. Missing values and padding cells are rendered as ``--``.
    '''
    lines = []
    # With m == 0 there is no wrapping, so no padding is needed either.
    # (The original evaluated ``n // m`` unconditionally and crashed with
    # ZeroDivisionError for the default m.) For m != 0 the cell count is
    # padded with '--' cells exactly as before.
    r = n if m == 0 else ((n // m) + 1) * m
    row_start = '| {} '.format(label)
    s = row_start
    for i in range(r):
        if i < n:
            x = mdf.loc[label].iloc[i]
            value = '--' if isnan(x) else '{x:.2f}'.format(x=x)
            if mins is not None and mins[i] == label:
                s += '| **{}** '.format(value)
            else:
                s += '| {} '.format(value)
        else:
            # padding cell past the last real element
            s += '| {} '.format('--')
        # Close the current row after every m-th cell, except for the very
        # last cell (the final row is closed after the loop).
        if m != 0 and i != 0 and i != (r - 1) and (i + 1) % m == 0:
            lines.append(s + '|')
            s = row_start
    lines.append(s + '|')
    return lines
def compare(self, v1, v2):
    """Compare two values, element-wise when either one is a numpy array.

    Returns ``v1 == v2`` for non-array inputs. For arrays, returns the
    boolean from ``np.array_equal``, or a ``(False, message)`` tuple when
    the shapes differ or when the arrays differ only because both hold NaNs
    at the same positions.
    """
    # np.array_equal also works on scalars, but it is not used systematically
    # because it does not work on lists of strings.
    if not (isinstance(v1, np.ndarray) or isinstance(v2, np.ndarray)):
        return v1 == v2

    left, right = np.asarray(v1), np.asarray(v2)
    if left.shape != right.shape:
        return False, ' (shape differ: %s vs %s)' % (left.shape, right.shape)

    equal = np.array_equal(left, right)
    left_nans, right_nans = isnan(left), isnan(right)
    if equal:
        return equal

    # Unequal arrays whose NaN masks coincide get a hint about NaN handling.
    same_nan_layout = (np.any(left_nans | right_nans)
                       and np.array_equal(left_nans, right_nans))
    if same_nan_layout:
        return False, (' but arrays contain NaNs, did you meant to '
                       'use assertNanEqual instead?')
    return equal
def plot_missing_rate():
    """Plot how many features fall into each missing-rate bucket.

    Loads ``RotEOmbroCirurgiaCategNAReduzido.csv`` through ``read.readData``,
    computes for every column the fraction of rows for which ``utils.isnan``
    is true, prints the names of the columns with no missing values, the
    mean missing rate over all columns and the bucket counts, and finally
    shows a pie chart of the bucket counts.

    Columns whose missing rate is exactly 1 are not counted in any bucket
    (unchanged from the original).
    """
    data_path = 'RotEOmbroCirurgiaCategNAReduzido.csv'  #'Dados/risk_factors_cervical_cancer.csv'
    class_name = 'Q92510_opcForca[RotEOmbro]'
    #class_name = 'Q92510_snDorPos'
    class_questionnaire = 'Q92510'
    missing_input = 'none'  #'mean'
    transform = False
    use_text = False
    dummy = False
    data, original_attributes, categories = read.readData(
        data_path=data_path,
        class_name=class_name,
        class_questionnaire=class_questionnaire,
        missing_input=missing_input,
        dummy=dummy,
        transform_numeric=transform,
        use_text=use_text,
        skip_class_questionnaire=True)
    X = data
    print(X.shape)

    n_rows, n_cols = X.shape[0], X.shape[1]
    # Buckets: [0%, (0, 25%], (25%, 50%], (50%, 75%], (75%, 100%)]
    features_missing = [0, 0, 0, 0, 0]
    m = 0  # running sum of per-column missing rates
    for j in range(n_cols):
        cj = sum(1 for i in range(n_rows) if utils.isnan(X[i][j]))
        rate = cj / n_rows
        if rate == 0:
            print(original_attributes[j])
            features_missing[0] += 1
        elif rate <= 0.25:
            features_missing[1] += 1
        elif rate <= 0.5:
            features_missing[2] += 1
        elif rate <= 0.75:
            features_missing[3] += 1
        elif rate < 1:
            features_missing[4] += 1
        m += rate
    print(m / n_cols)

    # NOTE(review): the original called exit() here, which terminated the
    # interpreter and left the plotting code below unreachable. It looked
    # like a debugging leftover and was removed so the function plots as its
    # name says; restore it if the early stop was intentional.
    print(features_missing)
    # `colors` is not defined in this function; presumably a module-level
    # palette — TODO confirm.
    plt.pie(
        features_missing[::-1],
        labels=['0%', '0.05% a 25%', '26% a 50%', '51% a 75%',
                '76% a 98%'][::-1],
        colors=colors,
        startangle=90,
        radius=1,
        autopct=lambda p: '{:.0f}'.format(p * sum(features_missing) / 100))
    plt.show()
def format_value_error(value, error):
    """Render a value together with its error as ``value (error)``.

    Returns ``'--'`` for a NaN value and the bare value when the error is
    exactly zero. Otherwise the error is reduced by ``process_error`` and the
    value is formatted by ``format_value`` with the number of significant
    figures that ``process_error`` reports.
    """
    if isnan(value):
        return '--'
    if error == 0.0:
        return f'{value}'

    truncated_error, sigfigs = process_error(error)
    formatted_value = format_value(value, sigfigs)
    # The parenthesised form is used rather than a "value ± error" layout.
    return f'{formatted_value} ({truncated_error})'
def interpret_real(s, context=None):
    """Convert a raw Real value to the float it represents.

    This is more lenient than the SGF spec: anything the built-in ``float``
    constructor accepts is accepted here, except infinities and NaNs.
    ``context`` is accepted but not used.

    Raises:
        ValueError: if ``s`` cannot be parsed, or parses to an infinity
            ("infinite") or to a NaN ("not a number").
    """
    value = float(s)
    # Checked in this order: infinities first, then NaNs.
    for is_rejected, reason in ((isinf, "infinite"), (isnan, "not a number")):
        if is_rejected(value):
            raise ValueError(reason)
    return value
def get_test_comparison_df(df,
                           l1,
                           l2,
                           l3=None,
                           suffix=None,
                           errors=True,
                           formatting=None):
    """Build a table comparing the per-evolution averages of two labels.

    Args:
        df: dataframe indexed by label; ``df.loc[label]`` is averaged with
            ``np.mean`` / ``np.std`` and the result is indexed by evolution.
        l1, l2: the two labels that are statistically compared via
            ``test_all_evolutions``.
        l3: optional third label, shown but not part of the comparison.
        suffix: optional text appended to each label in the column names.
        errors: when true, cells are rendered by ``format_value_error``
            (average and standard deviation); otherwise as the average with
            two decimals, or ``--`` for NaN.
        formatting: ``'markdown'`` or ``'latex'`` to embolden the cell of the
            label whose comparison result is ``'lt'`` (first label) or
            ``'gt'`` (second label); anything else leaves cells plain.

    Returns:
        A ``pd.DataFrame`` with one column per label plus ``'Effect Size'``.
    """

    def _column_name(label):
        return label if suffix is None else '{} {}'.format(label, suffix)

    def _cell(avg, std, i):
        if errors:
            return format_value_error(avg[i], std[i])
        return '--' if isnan(avg[i]) else '{:.2f}'.format(avg[i])

    tests = test_all_evolutions(df, l1, l2)
    # Keep a placeholder for missing tests so that comparisons[i] always
    # belongs to evolution i. The original dropped the None entries, which
    # shifted every later result onto the wrong row (effect_size below was
    # already aligned this way).
    comparisons = [
        test['result'] if test is not None else None for test in tests
    ]
    [avg1, avg2] = [np.mean(df.loc[l]) for l in [l1, l2]]
    [std1, std2] = [np.std(df.loc[l]) for l in [l1, l2]]
    effect_size = [
        process_effect_size(test['d']) if test else None for test in tests
    ]
    effect_size = [es if es else '--' for es in effect_size]

    label1, label2 = _column_name(l1), _column_name(l2)
    if l3:
        avg3 = np.mean(df.loc[l3])
        std3 = np.std(df.loc[l3])
        label3 = _column_name(l3)

    if formatting == 'markdown':
        bold = '**{}**'.format
    elif formatting == 'latex':
        bold = '\\textbf{{{}}}'.format
    else:
        bold = None

    lines1 = []
    lines2 = []
    lines3 = []
    n = get_num_evolutions(df)
    for i in range(0, n):
        line1 = _cell(avg1, std1, i)
        line2 = _cell(avg2, std2, i)
        if l3:
            # line3 is never emboldened since it is not in the comparison
            lines3.append(_cell(avg3, std3, i))
        if bold is not None and i < len(comparisons):
            if comparisons[i] == 'lt':
                line1 = bold(line1)
            if comparisons[i] == 'gt':
                line2 = bold(line2)
        lines1.append(line1)
        lines2.append(line2)

    if l3:
        data = {
            label1: lines1,
            label2: lines2,
            label3: lines3,
            'Effect Size': effect_size
        }
    else:
        data = {label1: lines1, label2: lines2, 'Effect Size': effect_size}
    return pd.DataFrame(data=data)
loss1 = F.nll_loss(score_s, target_s) + F.nll_loss(score_e, target_e) #print({'loss1:':loss1}) # P(c|Q), consider mention pure_Q=pure_Q.clone() pure_Q[Q_mask.data==0]=-float('inf') #pure_Q.data.masked_fill_(Q_mask.data==0,-float('inf')) B_max_Q=pure_Q.unsqueeze(1).expand_as(CQ_mask).clone() # B,max_Q --> B,max_c,max_Q (give all Q' original score to c) # B_max_Q_old1=B_max_Q.clone() B_max_Q[CQ_mask.data==0]=-float('inf') # B,max_c,max_Q mask, get each c's real Q B_max_Q=F.softmax(B_max_Q.view(-1,B_max_Q.size(2))).view(B_max_Q.size(0),-1,B_max_Q.size(2)) # B,max_c,max_Q , get each c's P(Q|c) some max_c 's, have no Q # have max_can line, no nan/ other line, all nan # B_max_Q.data.masked_fill_(isnan(B_max_Q.data.cpu()).cuda(),1) # log(p==1)==0, no loss # B_max_Q_old2=B_max_Q.clone() B_max_Q=B_max_Q.clone() B_max_Q[isnan(B_max_Q.data.cpu()).cuda()]=1 # print(B_max_Q) # P(Q) ans_in_can=to_var(np.array(ans_in_can,dtype='int64'),use_cuda=self.args.cuda) # B, ans_index=ans_in_can.unsqueeze(1).expand(B_max_Q.size(0),B_max_Q.size(2)).unsqueeze(1) # B,1,max_Q final_Q=B_max_Q.gather(1,ans_index).squeeze(1) # B,max_Q # print(final_Q) assert torch.sum(isnan(final_Q.data.cpu()))==0 # ans in Q answear=to_var(np.array(ans_in_Q,dtype='int64'),use_cuda=self.args.cuda) if self.args.db_softmax: loss2 =F.nll_loss(F.log_softmax(final_Q),answear) else: answear_index=answear.unsqueeze(1) # B,1 predict_prob=final_Q.gather(1,answear_index.long()) # B,1
def plot_followup_movements():
    """Show a bar chart of elbow-flexion strength, one value per participant.

    Reads the follow-up questionnaire responses (Q92510) and the admission
    table ``FlexCotoveloNew.csv``. For each participant one value is kept,
    taken from the right-side column when the admission column
    ``Q44071_opcLdLesao`` is 'D' for that participant and from the left-side
    column when it is 'E'. The chart counts participants per strength value
    0-5 plus a final bar for participants without a value; the counts are
    also printed.
    """
    data_path = '~/Faculdade/Mestrado/Projeto/scripts/Working Scripts/'
    data_path = data_path + 'EXPERIMENT_DOWNLOAD/Group_patients-with-brachial-plexus-injury/Per_questionnaire_data/'
    #data_path = data_path + 'Q61802_unified-surgical-evaluation/Responses_Q61802.csv'
    data_path = data_path + 'Q92510_unified-follow-up-assessment/Responses_Q92510.csv'
    data = pd.read_csv(data_path,
                       header=0,
                       delimiter=",",
                       na_values=['N/A', 'None', 'nan', 'NAAI', 'NINA'],
                       quoting=0,
                       encoding='utf8',
                       mangle_dupe_cols=False)
    admission_data = pd.read_csv(
        '~/Faculdade/Mestrado/Projeto/scripts/Working Scripts/FlexCotoveloNew.csv',
        header=0,
        delimiter=",",
        na_values=['N/A', 'None', 'nan', 'NAAI', 'NINA'],
        quoting=0,
        encoding='utf8',
        mangle_dupe_cols=False)
    outcome_right = 'opcForcaD[FlexCotovelo]'  #'snDorPos'
    outcome_left = 'opcForcaE[FlexCotovelo]'  #'snDorPos'
    #print(len(([int(a/30) for a in data['formTempoAval']])))
    # participant code -> 'formTempoAval' of the response currently kept
    patients_considered = {}
    # participant code -> strength value currently kept
    patient_outcomes = {}
    #return_periods = []
    for i, row in data.iterrows():
        if (row['participant code']) not in patients_considered:
            # first response seen for this participant: keep it as is
            patients_considered[row['participant code']] = row['formTempoAval']
            if (np.all(admission_data['Q44071_opcLdLesao']
                       [admission_data['participant code'] ==
                        row['participant code']] == 'D')):
                patient_outcomes[row['participant code']] = row[outcome_right]
            elif (np.all(admission_data['Q44071_opcLdLesao']
                         [admission_data['participant code'] ==
                          row['participant code']] == 'E')):
                patient_outcomes[row['participant code']] = row[outcome_left]
            else:
                # NOTE(review): this expression only builds a message; it is
                # neither printed nor raised, and no outcome is stored for
                # the participant in this case.
                'Preprocessing of side {0} not implemented'.format(
                    admission_data['Q44071_opcLdLesao'][
                        admission_data['participant code']])
            #return_periods.append(row['formTempoAval'])
        else:
            if (row['formTempoAval'] >
                    patients_considered[row['participant code']]):
                # later response than the one kept: replace the kept value,
                # but only when this response actually has a value
                # NOTE(review): 'NINA' is listed in na_values above, so the
                # "!= 'NINA'" tests are presumably always true — confirm.
                if (np.all(admission_data['Q44071_opcLdLesao']
                           [admission_data['participant code'] ==
                            row['participant code']] == 'D')):
                    if (row[outcome_right] != 'NINA'
                            and not utils.isnan(row[outcome_right])):
                        patient_outcomes[
                            row['participant code']] = row[outcome_right]
                        patients_considered[
                            row['participant code']] = row['formTempoAval']
                elif (np.all(admission_data['Q44071_opcLdLesao']
                             [admission_data['participant code'] ==
                              row['participant code']] == 'E')):
                    if (row[outcome_left] != 'NINA'
                            and not utils.isnan(row[outcome_left])):
                        patient_outcomes[
                            row['participant code']] = row[outcome_left]
                        patients_considered[
                            row['participant code']] = row['formTempoAval']
            else:
                # not a later response: only used to fill in a kept value
                # that is still missing
                if (utils.isnan(patient_outcomes[row['participant code']])):
                    if (np.all(admission_data['Q44071_opcLdLesao']
                               [admission_data['participant code'] ==
                                row['participant code']] == 'D')):
                        if (row[outcome_right] != 'NINA'
                                and not utils.isnan(row[outcome_right])):
                            patient_outcomes[
                                row['participant code']] = row[outcome_right]
                            patients_considered[
                                row['participant code']] = row['formTempoAval']
                    elif (np.all(admission_data['Q44071_opcLdLesao']
                                 [admission_data['participant code'] ==
                                  row['participant code']] == 'E')):
                        if (row[outcome_left] != 'NINA'
                                and not utils.isnan(row[outcome_left])):
                            patient_outcomes[
                                row['participant code']] = row[outcome_left]
                            patients_considered[
                                row['participant code']] = row['formTempoAval']
    #print(row['participant code'])
    #import pdb
    #pdb.set_trace()
    #labels = {'S':'Sim','N':'Não',np.nan:'Não informado'}
    # convert the kept assessment times to whole multiples of 30
    # (presumably days -> months; the result is not used below)
    for k in patients_considered.keys():
        patients_considered[k] = int(patients_considered[k] / 30)
    xlabels = list(np.arange(6)) + [np.nan]  #['N','S',np.nan]
    label = lambda x: 'Não informado' if utils.isnan(x) else x
    #labels = {'S':'Sim','N':'Não',np.nan:'Não informado'}
    # y[0..5] count participants per strength value, y[6] those without one
    y = [0] * 7
    for value in patient_outcomes.values():
        if (utils.isnan(value)):
            y[-1] += 1
        else:
            y[int(value)] += 1
    width = 0.8
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.bar(range(len(xlabels)), y, width=width)
    ax.set_xticks(np.arange(len(xlabels)) + width / 2)
    ax.set_yticks(range(0, 30, 5))
    ax.set_xticklabels([label(l) for l in xlabels])
    print(Counter(patient_outcomes.values()))
    plt.xlabel('Força muscular avaliada sobre flexão do cotovelo')
    plt.show()
def plot_event():
    """Show a grouped bar chart of trauma-event frequencies per side.

    Reads the admission questionnaire responses (Q44071), pairs every
    ``lisTpTraumaD...`` column with its ``lisTpTraumaE...`` counterpart and,
    for each pair that has at least one non-missing answer, draws a blue bar
    (left column) and a red bar (right column) with the number of
    non-missing answers. Pairs without any answer are left out of the chart.
    """
    data_path = '~/Faculdade/Mestrado/Projeto/scripts/Working Scripts/'
    data_path = data_path + 'EXPERIMENT_DOWNLOAD/Group_patients-with-brachial-plexus-injury/Per_questionnaire_data/'
    data_path = data_path + 'Q44071_unified-admission-assessment/Responses_Q44071.csv'
    data = pd.read_csv(data_path,
                       header=0,
                       delimiter=",",
                       na_values=['N/A', 'None', 'nan', 'NAAI', 'NINA'],
                       quoting=0,
                       encoding='utf8',
                       mangle_dupe_cols=False)
    events_right = data.filter(like='lisTpTraumaD')
    events_left = data.filter(like='lisTpTraumaE')
    events_description = {
        'lisTpTrauma[arma]': 'Arma de fogo',
        'lisTpTrauma[moto]': 'Acidente motociclístico',
        'lisTpTrauma[auto]': 'Acidente automobilístico',
        'lisTpTrauma[atropelamento]': 'Atropelamento',
        'lisTpTrauma[cirurgia]': 'Cirurgia',
        'lisTpTrauma[corte]': 'Objeto cortante',
        'lisTpTrauma[ocupacao]': 'Acidente ocupacional',
        'lisTpTrauma[other]': 'Outros'
    }
    # side-neutral event name -> [left column name, right column name]
    event_names = {}
    for c in events_right.columns:
        event_names[re.sub('D', '', c)] = [re.sub('D', 'E', c), c]

    def _answered(column):
        # number of cells in the column whose value is not missing
        return sum(count for answer, count in Counter(column).items()
                   if not utils.isnan(answer))

    width = 0.4
    fig = plt.figure()
    ax = fig.add_subplot(111)
    events_in_plot = []
    left_bars = right_bars = None
    for event in sorted(event_names.keys()):
        left_column, right_column = event_names[event]
        yleft = _answered(events_left[left_column])
        yright = _answered(events_right[right_column])
        if yleft == 0 and yright == 0:
            continue
        # bars are placed at consecutive integer slots, skipping empty events
        slot = len(events_in_plot)
        left_bars = plt.bar(slot, yleft, width, color='blue')
        right_bars = plt.bar(slot + width, yright, width, color='red')
        events_in_plot.append(event)

    ax.set_xticks(np.arange(len(events_in_plot)) + width)
    ax.set_xticklabels([events_description[e] for e in events_in_plot],
                       rotation=90)
    plt.ylabel('Frequência')
    # The bar handles only exist when something was plotted; the original
    # raised NameError here when every event was empty.
    if events_in_plot:
        ax.legend((left_bars, right_bars), ('Esquerdo', 'Direito'))
    plt.tight_layout()
    plt.show()
def transform_to_JSON(clf,
                      fcs,
                      out='FeatureContributions.json',
                      diffsur=True,
                      X=None,
                      addline=None):
    """Write feature contributions to a JSON file.

    Args:
        clf: fitted model exposing ``X``, ``y`` and ``attributes``.
        fcs: sequence indexed by instance number; each item maps a feature
            index to that feature's contribution for the instance.
        out: path of the JSON file to write.
        diffsur: when true, every feature collects one ``[contribution,
            category position]`` point per instance in one of four lists
            (``red``/``blue`` for ``clf.y[i] == 'INSATISFATORIO'`` or not,
            ``o``/``x`` for ``clf.X['Q44071_snCplexoAt'][i] == 'S'`` or not).
            When false, every feature stores its value from the first
            instance that mentions it and the contribution from the last.
        X: data to read feature values from; defaults to ``clf.X`` (wrapped
            in a DataFrame with ``clf.attributes`` as columns if needed).
        addline: optional value stored under the ``'classification'`` key.
    """
    import json
    import pandas as pd
    import utils

    if X is None:
        if not isinstance(clf.X, pd.DataFrame):
            X = pd.DataFrame(clf.X, columns=clf.attributes)
        else:
            X = clf.X
    #data = read.readData(data_path = data_path, class_name = class_name)
    #newcolumns = np.append(X.columns,['Q44071_snCplexoAt',class_name])
    #newX = pd.merge(data,X,how='inner',on='Q44071_participant_code')[newcolumns]

    # Only needed by the diffsur=False branch; computed once instead of
    # re-materialising X.values for every lookup.
    if not diffsur:
        raw_values = X.values if isinstance(X, pd.DataFrame) else X

    F = {}
    for i in range(len(fcs)):
        contributions = fcs[i]
        if diffsur:
            for feature_index in contributions.keys():
                column = X[X.columns[feature_index]]
                if feature_index not in F:
                    F[feature_index] = {
                        'name': clf.attributes[feature_index],
                        # observed categories, with 'nan' always last
                        'ycategs': sorted(
                            [a for a in set(column)
                             if not utils.isnan(a)]) + ['nan'],
                        'redopoints': [],
                        'redxpoints': [],
                        'blueopoints': [],
                        'bluexpoints': []
                    }
                entry = F[feature_index]
                marker = 'o' if clf.X['Q44071_snCplexoAt'][i] == 'S' else 'x'
                colour = 'red' if clf.y[i] == 'INSATISFATORIO' else 'blue'
                value = column[i]
                if not utils.isnan(value):
                    position = entry['ycategs'].index(value)
                else:
                    # missing values are plotted on the trailing 'nan' slot
                    position = len(entry['ycategs']) - 1
                entry[colour + marker + 'points'].append(
                    [round(contributions[feature_index], 5), position])
        else:
            for feature_index in contributions.keys():
                if feature_index not in F:
                    value = raw_values[i][feature_index]
                    F[feature_index] = {
                        'name': clf.attributes[feature_index],
                        'value': value if not utils.isnan(value) else 'nan',
                        'contribution': 0
                    }
                F[feature_index]['contribution'] = contributions[feature_index]

    if addline is not None:
        F['classification'] = addline
    jsonfile = json.dumps(F, ensure_ascii=False)
    # The context manager closes the handle (the original never closed it);
    # UTF-8 is set explicitly because ensure_ascii=False emits raw non-ASCII
    # characters.
    with open(out, 'w', encoding='utf-8') as file:
        file.write(jsonfile)
assert (m.predict(['RAIN', 80, 70, 'T']) == "DON'T PLAY") assert (m.predict(['SUNNY', 50, 50, 'T']) == 'PLAY') assert (m.predict(['SUNNY', 50, 91, 'T']) == "DON'T PLAY") assert (m.predict([np.nan, 50, 91, 'T']) == "DON'T PLAY") print('Testing Decision Tree with missing values (branch_nan = True)...') m = dt.DecisionTreeClassifier(missing_branch=True) # data, attributes, categories = read.readData(data_path = '../Dados/Test_with_nan.csv', class_name='Class', # dummy=dummy,transform_numeric=transform,use_text = use_text,missing_input='none') X[5][0] = np.nan # X = data[:,0:-1] # y = np.array(data[:,-1]) m.fit(X, y) m.to_pdf(original_attributes, out='out.pdf') outlook_index = np.where(original_attributes == 'Outlook')[0][0] not_nan_rows = [ a for a in range(X.shape[0]) if not utils.isnan(X[:, outlook_index][a]) ] Xnotnan = (X[not_nan_rows, :]) ynotnan = y[not_nan_rows] Xs, ys, d = utils.split_categ(Xnotnan, ynotnan, outlook_index, list(set(Xnotnan[:, outlook_index]))) assert (np.isclose( (len(ynotnan) / len(y)) * utils.information_gain(ynotnan, ys), 0.199, rtol=1e-2)) assert (np.isclose((len(ynotnan) / len(y)) * utils.gain_ratio(ynotnan, ys, y), 0.110, rtol=1e-2)) #outlook, temperature, humidity, windy assert (m.predict((['OVERCAST', 80, 90, 'T'])) == 'Play'.upper())
def build_tree(self, Xc, yc, feature_indices, depth, weights, pdist=None):  #,parent_fiv='root'):
    """Recursively build the (sub)tree for the instances listed in ``weights``.

    Args:
        Xc: complete feature matrix; rows are selected with the keys of
            ``weights``.
        yc: complete label array, indexed the same way as ``Xc``.
        feature_indices: candidate feature indices for splitting.
        depth: depth of the node being built; compared to ``self.max_depth``.
        weights: dict mapping a row index of ``Xc`` to the weight with which
            that instance is present at this node.
        pdist: class distribution of the parent node, used to break ties.

    Returns:
        A ``Node``: a class (leaf) node when a stopping condition holds,
        otherwise a test node with one branch per non-empty split subset and
        optionally a ``branch_nan`` for instances whose value is missing.
    """
    # only consider the instances that are at the node, partially or entirely
    rows_to_consider = sorted(weights.keys())
    X = Xc[rows_to_consider, :]
    y = yc[rows_to_consider]
    # calculate the class distribution at the node (absolute values)
    dist = {}
    for k in set(yc):
        dist[k] = 0
    for k in weights.keys():
        dist[yc[k]] += weights[k]
    # stop if the node is pure, if some class has a total weight below 1,
    # or if maximum tree depth was reached
    if (utils.entropy(y) == 0
            or (len([k for k in dist.keys() if dist[k] < 1]) > 0)
            or depth == self.max_depth):
        # in case of a tie of the class distributions, final class will be the most frequent
        # class at the parent node
        if (len(dist.keys()) > 1 and len(set(dist.values())) == 1
                and pdist is not None):
            #print('tie of class distributions. depth: %r distribution: %r' % (depth,dist))
            final_class = max(pdist.keys(), key=lambda k: pdist[k])
        # final class will be the most frequent class at the node
        else:
            final_class = max(dist.keys(), key=lambda k: dist[k])
        # return a decision node
        # (sample_size counts only the instances whose weight is exactly 1)
        return Node(feature_index=None,
                    values=None,
                    branches=None,
                    branch_nan=None,
                    sample_size=sum(
                        [k for k in weights.values() if k == 1]),
                    distr=dist,
                    is_class=True,
                    final_class=final_class
                    )  #,config=parent_fiv+'->'+str(final_class))
    # get the feature and its split value(s) that maximize the information gain;
    # self.mtry, when set, is called with the number of candidates to decide
    # how many of them are sampled
    if (self.random_subspace is False and self.mtry is not None):
        nfeature_indices = random.sample(
            list(feature_indices), int(self.mtry(len(feature_indices))))
    else:
        nfeature_indices = feature_indices
    feature_index, values = self.find_split(X, y, nfeature_indices, weights)
    #if the best split could not be found, returns a decision node
    if (feature_index == -1):
        #print('best split could not be found.')
        if (len(dist.keys()) > 1 and len(set(dist.values())) == 1
                and pdist is not None):
            #print('tie of class distributions. depth: %r distribution: %r' % (depth,dist))
            final_class = max(pdist.keys(), key=lambda k: pdist[k])
        # final class will be the most frequent class at the node
        else:
            final_class = max(dist.keys(), key=lambda k: dist[k])
        return Node(feature_index=None,
                    values=None,
                    branches=None,
                    branch_nan=None,
                    sample_size=sum(
                        [k for k in weights.values() if k == 1]),
                    distr=dist,
                    is_class=True,
                    final_class=final_class)
    # get rows where the values of X for the feature are not missing
    not_nan_rows = [
        a for a in range(X.shape[0])
        if (not utils.isnan(X[:, feature_index][a]))
    ]
    # get the rows where they are missing
    nan_rows = np.delete(list(range(X.shape[0])), not_nan_rows)
    Xnotnan = (X[not_nan_rows, :])
    ynotnan = y[not_nan_rows]
    ynan = y[nan_rows]
    # get the sets (and its weights) that result when the not missing data are split
    # based on the feature and its value(s)
    Xs, ys, dweights = utils.split(Xnotnan, ynotnan, feature_index, values)
    # if instances belong to only one subset, returns a decision node -- might be useless
    if (len(ys) < 2):
        #print('instances belong to only one subset.')
        if (len(dist.keys()) > 1 and len(set(dist.values())) == 1
                and pdist is not None):
            #print('tie of class distributions. depth: %r distribution: %r' % (depth,dist))
            final_class = max(pdist.keys(), key=lambda k: pdist[k])
        # final class will be the most frequent class at the node
        else:
            final_class = max(dist.keys(), key=lambda k: dist[k])
        # if(self.print):
        return Node(feature_index=None,
                    values=None,
                    branches=None,
                    branch_nan=None,
                    sample_size=sum(
                        [k for k in weights.values() if k == 1]),
                    distr=dist,
                    is_class=True,
                    final_class=final_class)
    branch_nan = None
    branches = []
    # translate the dweights indexes (positions within Xnotnan) to the
    # weights indexes (row indices of Xc), and carry over the weight the
    # instance already has at this node
    for i in range(len(dweights)):
        dweights[i] = dict(
            (rows_to_consider[not_nan_rows[j]], dweights[i][j])
            for j in dweights[i])
        for j in dweights[i].keys():
            if j in weights.keys():
                dweights[i][j] = weights[j]
    # sum of the weights of the instances in the node with known values
    s = (sum([sum(x.values()) for x in dweights]))
    # for each split set
    for i in range(len(ys)):
        # if it's not empty
        if len(ys[i]) != 0:
            # C4.5 approach
            if (self.missing_branch is False):
                # calculate probability of outcome values[i], estimated as the sum of the weights
                # of instances in the node known to have outcome values[i] divided by the sum of the
                # weights of the cases in the node with known outcomes
                prob_values_i = round(float(sum(dweights[i].values()) / s), 5)
                # for each instance with missing value, update its weight for the child node
                for j in nan_rows:
                    (dweights[i])[rows_to_consider[j]] = weights[
                        rows_to_consider[j]] * prob_values_i
            branches.append(
                self.build_tree(
                    Xc, yc, feature_indices, depth + 1, dweights[i],
                    dist))  #,parent_fiv=str(feature_index)+'->'+v))
    # nan branch approach
    if (self.missing_branch):
        # if there are samples whose value for the feature is missing
        if (ynan.shape[0] != 0):
            # continue building tree from the nan branch; every such
            # instance enters it with weight 1
            branch_nan = self.build_tree(
                Xc, yc, feature_indices, depth + 1,
                dict([[a, 1]
                      for a in np.array(rows_to_consider)[nan_rows]
                      ]), dist)  #,str(feature_index)+'->NAN')
        # if there aren't, then assign to the nan branch a decision node with no instances
        # (for future classification purposes).
        else:
            if (len(dist.keys()) > 1 and len(set(dist.values())) == 1
                    and pdist is not None):
                #print('tie of class distributions. depth: %r distribution: %r' % (depth,dist))
                final_class = max(pdist.keys(), key=lambda k: pdist[k])
            # final class will be the most frequent class at the node
            else:
                final_class = max(dist.keys(), key=lambda k: dist[k])
            # assign to the nan branch a decision node
            branch_nan = Node(feature_index=None,
                              values=None,
                              branches=None,
                              branch_nan=None,
                              sample_size=0,
                              distr={k: 0
                                     for k in set(y)},
                              is_class=True,
                              final_class=final_class
                              )  #,config=parent_fiv+'->'+str(final_class))
    # Check whether every child (and the nan branch, if any) is a class node
    # predicting the same class; if so the split is pointless and the node
    # collapses into a single class node.
    # NOTE(review): when there is exactly one branch the loop body never
    # runs, so same_class stays False; branches[0] also assumes at least one
    # non-empty subset — confirm both are intended.
    same_class = False
    fclass = branches[0].final_class
    import pdb
    if (fclass is not None):
        for child in range(1, len(branches)):
            if (branches[child].final_class != fclass):
                same_class = False
                break
            if (child == len(branches) - 1):
                if (branch_nan):
                    if (branch_nan.final_class != fclass):
                        same_class = False
                    else:
                        same_class = True
                else:
                    same_class = True
    if (same_class is True):
        #print('class node - all children nodes belong to the same class')
        if (len(dist.keys()) > 1 and len(set(dist.values())) == 1
                and pdist is not None):
            #print('tie of class distributions. depth: %r distribution: %r' % (depth,dist))
            final_class = max(pdist.keys(), key=lambda k: pdist[k])
        else:
            final_class = max(dist.keys(), key=lambda k: dist[k])
        return Node(feature_index=None,
                    values=None,
                    branches=None,
                    branch_nan=None,
                    sample_size=sum(
                        [k for k in weights.values() if k == 1]),
                    distr=dist,
                    is_class=True,
                    final_class=final_class
                    )  #,config=parent_fiv+'->'+str(final_class))
    # returns a test node with its feature index and values and its branches.
    return Node(feature_index=feature_index,
                values=values,
                branches=branches,
                branch_nan=branch_nan,
                sample_size=sum([k for k in weights.values() if k == 1]),
                distr=dist)
def feature_contribution(self,X=None):
    """Compute, per instance, the contribution of each feature to the control class.

    For every instance the trees of ``self.forest`` are walked from the root
    to a class node. At each test node the change in the control-class
    proportion between the node and the chosen child is added (times the
    path weight) to the contribution of the node's feature. The sums are
    divided by ``self.ntrees``.

    Args:
        X: data to explain; defaults to ``self.X``. Presumably a DataFrame
            when given (``.values`` is read from it) — TODO confirm.

    Returns:
        A list with one dict per instance, mapping feature index to its
        contribution.
    """
    print('calculating feature contribution')
    #C = set(self.y)
    if(X is None):
        if(isinstance(self.X,pd.DataFrame)):
            X = self.X.values
        else:
            X = self.X
    else:
        if(isinstance(X, pd.DataFrame) and X.shape[1] != self.X.shape[1]):
            # keep the columns known to the model, in the model's order, and
            # add an all-NaN column for every model column that X lacks
            X = X[X.columns[[np.where(a == X.columns)[0][0] for a in self.X.columns if a in X.columns]]]
            for f in range(len(self.X.columns)):
                if(self.X.columns[f] not in X.columns):
                    X.insert(f,self.X.columns[f],[np.nan]*X.shape[0])
        X = X.values
    if(self.control_class is None):
        # default control class: 'SUCESSO' if present, otherwise whichever
        # label set iteration yields first
        if('SUCESSO' in set(self.y)):
            control_class = 'SUCESSO'
        else:
            control_class = list(set(self.y))[0]
        print('Control class set as %r' % control_class)
    else:
        control_class = self.control_class
    fcs = []
    for i in range(X.shape[0]):
        FC = {}
        c = 0
        #for k in C:
        t_index = 0
        # if(i_index == 9):
        #     import pdb
        #     pdb.set_trace()
        for t in self.forest:
            # trees whose `oob` collection contains this instance are skipped
            if(i in self.forest[t_index].oob):
                #print(oob[t_index])
                t_index+=1
                continue
            t_index +=1
            # queue of [path weight, node] pairs still to be followed
            child_list = [[1,t.root]]
            while len(child_list) > 0:
                w, parent = child_list.pop(0)
                while parent.is_class is False:
                    f = parent.feature_index
                    #print(i[f])
                    #print(parent.values)
                    if(f not in FC.keys()):
                        FC[f] = 0
                        # FC[f] = {c:0 for c in C}
                    if(utils.isnan(X[i][f])):
                        if(parent.branch_nan is None):
                            # no nan branch: queue every child, weighted by
                            # its share of the parent's distribution, and
                            # continue with the first queued entry
                            sp = sum(parent.distr.values())
                            for c in parent.branches:
                                child_list.append([round(w*(sum(c.distr.values()))/sp,2),c])
                            w,child = child_list.pop(0)
                        else:
                            child = parent.branch_nan
                    else:
                        if(len(parent.values) == 1):
                            # single split value: threshold test
                            if X[i][f] <= parent.values[0]:
                                child = parent.branches[0]
                            else:
                                child = parent.branches[1]
                        else:
                            # several split values: match the value's string
                            # form; unknown values are treated like missing
                            if(str(X[i][f]) not in parent.values):
                                if(parent.branch_nan is None):
                                    sp = sum(parent.distr.values())
                                    for c in parent.branches:
                                        child_list.append([round(w*(sum(c.distr.values()))/sp,2),c])
                                    w,child = child_list.pop(0)
                                else:
                                    child = parent.branch_nan
                            else:
                                child = parent.branches[parent.values.index(str(X[i][f]))]
                    sc = sum(child.distr.values())
                    if(sc == 0):
                        # an empty child borrows the root's distribution
                        # (this overwrites the child's stored distribution)
                        child.distr = t.root.distr
                        sc = sum(child.distr.values())
                    sp = sum(parent.distr.values())
                    FC[f] = FC[f] + w*(child.distr[control_class]/sc - parent.distr[control_class]/sp)
                    parent = child
        for element in FC:
            FC[element] = FC[element] / self.ntrees
            #for el in FC[element]:
            #    FC[element][el] = FC[element][el] / self.ntrees
        fcs.append(FC)
    return fcs
def __call__(self, input):
    """Randomly rotate, translate, scale and crop a sample and its landmarks.

    ``input`` is a dict that is modified in place and returned. The keys
    read and written here are ``'img'``, ``'loc'``, ``'occ'`` and
    ``'mask'``; ``'tgt'`` is added from ``self.toHeatmaps``. A landmark at
    the image centre is appended before the transformation. When
    ``self.keep_landmarks_visible`` is set, the random transformation is
    retried (up to 100 times) until no masked landmark is occluded or NaN,
    and a plain resize of the original sample is used as a last resort.
    """
    def _just_resize():
        # Resize the image to (self.ix, self.iy) and rescale the landmark
        # coordinates accordingly, without any random transformation.
        # NOTE(review): Image.ANTIALIAS is not available in recent Pillow
        # releases — confirm the Pillow version this runs against.
        img = input['img']
        w, h = img.size
        # perform scaling
        input['img'] = img.resize((self.ix, self.iy), Image.ANTIALIAS)
        if np.sum(input['loc']) != 0:
            loc = input['loc']
            loc[0, :] = loc[0, :] * self.ix / w
            loc[1, :] = loc[1, :] * self.iy / h
            input['loc'] = loc

    def _transform():
        # Draw a random angle, scale and translation (each uniform in
        # +/- the configured range), apply them to the image and then apply
        # the matching matrices to the landmark coordinates.
        angle = self.rangle * (2 * torch.rand(1)[0] - 1)
        grad_angle = angle * math.pi / 180
        scale = 1 + self.rscale * (2 * torch.rand(1)[0] - 1)
        transx = self.rtrans * (2 * torch.rand(1)[0] - 1)
        transy = self.rtrans * (2 * torch.rand(1)[0] - 1)
        img = input['img']
        w, h = img.size
        centerX, centerY = w // 2, h // 2
        # perform rotation
        img = img.rotate(angle, Image.BICUBIC)
        # perform translation
        img = img.transform(img.size, Image.AFFINE, (1, 0, transx, 0, 1, transy))
        # perform scaling
        # NOTE(review): the size passed here is (scale*h, scale*w) while
        # img.size was unpacked as (w, h) — verify the axis order for
        # non-square images.
        img = img.resize((int(math.ceil(scale * h)), int(math.ceil(scale * w))), Image.ANTIALIAS)
        w, h = img.size
        # crop a (self.ix, self.iy) window from the centre
        x1 = round((w - self.ix) // 2)
        y1 = round((h - self.iy) // 2)
        input['img'] = img.crop((x1, y1, x1 + self.ix, y1 + self.iy))
        if np.sum(input['loc']) != 0:
            loc = input['loc']
            # homogeneous coordinates: a row of ones under the 2 x N points
            newloc = np.ones((3, loc.shape[1]))
            newloc[0:2, :] = loc
            trans_matrix = np.array([[1,0,-1*transx], [0,1,-1*transy], [0,0,1]])
            scale_matrix = np.array([[scale,0,0], [0,scale,0], [0,0,1]])
            angle_matrix = np.array([
                [math.cos(grad_angle),math.sin(grad_angle),0],
                [-math.sin(grad_angle),math.cos(grad_angle),0],
                [0,0,1]])
            # perform rotation (about the image centre)
            newloc[0,:] = newloc[0,:] - centerY
            newloc[1,:] = newloc[1,:] - centerX
            newloc = np.dot(angle_matrix, newloc)
            newloc[0,:] = newloc[0,:] + centerY
            newloc[1,:] = newloc[1,:] + centerX
            # perform translation
            newloc = np.dot(trans_matrix, newloc)
            # perform scaling
            newloc = np.dot(scale_matrix, newloc)
            # shift into the cropped window
            newloc[0,:] = newloc[0,:] - y1
            newloc[1,:] = newloc[1,:] - x1
            input['loc'] = newloc[0:2,:]
            # landmarks that left the cropped window become NaN and are
            # flagged in 'occ'
            for i in range(input['loc'].shape[1]):
                if not np.isnan(input['loc'][:, i]).any():
                    if np.any(input['loc'][:, i] < 0) or \
                            input['loc'][0,i] > self.iy or \
                            input['loc'][1,i] > self.ix:
                        input['loc'][:, i] = np.nan
                        # TODO: fill the surrounding with normal noise
                        input['occ'][0, i] = 0
        # FIXME: create multiple images for the same sample with different occluded blocks for testing purposes
        # input['im'][:, 10:40, 22:50] = 0

    # adding one more at the end for the center landmark
    # add the center of image as the last landmark
    # NOTE(review): here the size is unpacked as (h, w), elsewhere in this
    # method as (w, h) — confirm which order is intended.
    h, w = input['img'].size
    input['loc'] = np.hstack((input['loc'], np.array([[w // 2], [h // 2]])))
    input['occ'] = torch.cat((input['occ'], torch.ByteTensor([[1]])), 1)
    input['mask'] = torch.cat((input['mask'], torch.ByteTensor([[1]])), 1)
    # keep the untransformed sample so a failed attempt can be retried
    orig_img = input['img']
    orig_loc = input['loc']
    orig_occ = input['occ'].clone()
    orig_mask = input['mask'].clone()
    _transform()
    if self.keep_landmarks_visible:
        # train: making sure all landmarks are still visible, if not perform
        # another transformation
        mask = input['mask']
        mask2D = torch.cat((mask, mask), dim=0)
        landmarks = torch.from_numpy(input['loc'])
        limit = 100
        while not (mask == mask * input['occ']).all() or utils.isnan(landmarks[mask2D]).any():
            input['img'] = orig_img
            input['loc'] = orig_loc
            input['occ'] = orig_occ.clone()
            input['mask'] = orig_mask.clone()
            _transform()
            mask = input['mask']
            mask2D = torch.cat((mask, mask), dim=0)
            landmarks = torch.from_numpy(input['loc'])
            limit -= 1
            if limit == 0:
                # give up on random transformations and only resize
                input['img'] = orig_img
                input['loc'] = orig_loc
                input['occ'] = orig_occ.clone()
                input['mask'] = orig_mask.clone()
                _just_resize()
                print('using the orignal data because even after 100 transformation, there are still occluded landmarks!!!')
                break
    input['tgt'] = self.toHeatmaps(input['loc'], self.image_resolution)
    return input
def predict(self, ex, top_n=1, pool=None, normalize_ss=False,
            exp_final_Q=False):
    """Run the network on one batch in eval mode and decode its predictions.

    The behaviour depends on ``self.args.train_mode``:

    * ``'string_match'``: feeds ``ex[:5]`` to the network and returns
      ``self.decode(score_s, score_e, top_n, 15)``, or the result of
      ``pool.apply_async`` for that call when ``pool`` is given.
    * ``'string_match_base_dis'``, ``'contain'`` / ``'NER'`` and ``'span'``:
      return ``(final_score, final_index, Q_mask, ids)`` where the first two
      come from sorting the per-example scores over Q in descending order.

    Any other mode falls through and returns ``None``.

    Args:
        ex: the batch; its expected length and layout differ per mode (see
            the unpacking in each branch).
        top_n: forwarded to the decoder.
        pool: optional pool exposing ``apply_async``.
        normalize_ss: if true, re-normalise the masked token / span scores
            (``'contain'`` / ``'NER'`` / ``'span'`` only).
        exp_final_Q: if true, exponentiate the final Q scores
            (``'contain'`` / ``'NER'`` / ``'span'`` only).
    """

    def _to_gpu(tensor):
        # `async` has been a reserved word since Python 3.7, so the old
        # spelling `tensor.cuda(async=True)` no longer even parses.
        try:
            return tensor.cuda(non_blocking=True)
        except TypeError:
            # torch releases that predate `non_blocking` use the old keyword
            return tensor.cuda(**{'async': True})

    def _q_given_candidate(pure_Q, Q_mask, CQ_mask):
        # P(Q|c): B,max_Q --> B,max_c,max_Q (every c gets all Q scores, then
        # the Q that do not belong to c are masked out).
        pure_Q = pure_Q.clone()
        pure_Q[Q_mask.data == 0] = -float('inf')
        B_max_Q = pure_Q.unsqueeze(1).expand_as(CQ_mask).clone()
        B_max_Q[CQ_mask.data == 0] = -float('inf')
        if self.normalize_q:
            B_max_Q = F.softmax(B_max_Q.view(-1, B_max_Q.size(2))).view(
                B_max_Q.size(0), -1, B_max_Q.size(2))
        else:
            B_max_Q = torch.exp(B_max_Q)
        # Rows past the real number of candidates are all -inf and turn into
        # NaN under softmax; zero them.  The mask follows the tensor's device
        # instead of being sent to the GPU unconditionally.
        B_max_Q = B_max_Q.clone()
        nan_mask = isnan(B_max_Q.data.cpu())
        if B_max_Q.data.is_cuda:
            nan_mask = nan_mask.cuda()
        B_max_Q[nan_mask] = 0
        return B_max_Q

    def _rank_Q(B_max_c, B_max_Q, Q_mask, ids):
        # P(Q) = bmm(B,1,max_c P(c), B,max_c,max_Q P(Q|c)) --> B,max_Q
        final_Q = torch.bmm(B_max_c.unsqueeze(1), B_max_Q).squeeze(1)
        final_Q = torch.exp(final_Q) if exp_final_Q else final_Q
        assert torch.sum(isnan(final_Q.data.cpu())) == 0
        final_score, final_index = torch.sort(final_Q, -1, descending=True)
        return final_score, final_index, Q_mask, ids

    self.network.eval()
    train_mode = self.args.train_mode
    max_len = 15

    # old
    if train_mode == 'string_match':
        if self.use_cuda:
            inputs = [e if e is None else Variable(_to_gpu(e), volatile=True)
                      for e in ex[:5]]
        else:
            inputs = [e if e is None else Variable(e, volatile=True)
                      for e in ex[:5]]
        score_s, score_e = self.network(*inputs)  # no normalize, just exp

        # Decode predictions
        score_s = score_s.data.cpu()
        score_e = score_e.data.cpu()
        args = (score_s, score_e, top_n, max_len)
        # decode returns, per the original notes:
        #   pred_s / pred_e: B,top_n start / end token positions
        #   pred_score:      B,top_n span scores
        if pool:
            return pool.apply_async(self.decode, args)
        return self.decode(*args)

    if train_mode == 'string_match_base_dis':
        (dw, f, dw_mask, qw, qw_mask, Qw, Qw_mask, Q_mask, ex2Q, CQ_mask,
         Q_ids, Q_names, triples, cans, ids) = ex
        dw, f, dw_mask, qw, qw_mask, Qw, Qw_mask = to_vars_torch(
            ex[:7], self.use_cuda, evaluate=True)
        [Q_mask, CQ_mask] = to_vars([Q_mask, CQ_mask],
                                    use_cuda=self.use_cuda, evaluate=True)
        inputs = [dw, f, dw_mask, qw, qw_mask, Qw, Qw_mask, Q_mask, ex2Q]
        # score_s, score_e: B,T_d ; pure_Q: B,max_Q before Q_mask
        score_s, score_e, pure_Q = self.network(*inputs)

        # Decode predictions
        score_s = score_s.data.cpu()
        score_e = score_e.data.cpu()
        # cans: every candidate's token spans, per example
        args = (score_s, score_e, cans, top_n, max_len)
        if pool:
            handle = pool.apply_async(self.decode_candidates, args)
            ans_in_can, scores = handle.get()
        else:
            ans_in_can, scores = self.decode_candidates(*args)

        B_max_Q = _q_given_candidate(pure_Q, Q_mask, CQ_mask)

        # P(Q): pick the row of the decoded candidate for every example
        ans_in_can = to_var(np.array(ans_in_can, dtype='int64'),
                            use_cuda=self.args.cuda)  # B,
        ans_index = ans_in_can.unsqueeze(1).expand(
            B_max_Q.size(0), B_max_Q.size(2)).unsqueeze(1)  # B,1,max_Q
        final_Q = B_max_Q.gather(1, ans_index).squeeze(1)
        assert torch.sum(isnan(final_Q.data.cpu())) == 0
        final_score, final_index = torch.sort(final_Q, -1, descending=True)
        return final_score, final_index, Q_mask, ids

    if train_mode == 'contain' or train_mode == 'NER':
        (dw, f, dw_mask, qw, qw_mask, Qw, Qw_mask, Q_mask, C_pos, C_doc_mask,
         C_mask, ex2Q, CQ_mask, Q_ids, Q_names, triples, ids) = ex
        dw, f, dw_mask, qw, qw_mask, Qw, Qw_mask = to_vars_torch(
            ex[:7], self.use_cuda, evaluate=True)
        C_pos, C_doc_mask, C_mask, Q_mask, CQ_mask = to_vars(
            [C_pos, C_doc_mask, C_mask, Q_mask, CQ_mask],
            use_cuda=self.use_cuda, evaluate=True)
        inputs = [dw, f, dw_mask, qw, qw_mask, Qw, Qw_mask, Q_mask, ex2Q]
        # score: B,D ; pure_Q: B,max_Q before Q_mask
        score, pure_Q = self.network(*inputs)

        # P(c): only keep candidate tokens in the score
        s_masked = score * C_doc_mask.float()  # B,D
        if normalize_ss:
            s_masked += 0.00001
            # NOTE(review): relies on torch.sum(dim=1) returning a shape
            # that expand_as() accepts -- confirm for the torch in use.
            s_normal = s_masked / torch.sum(s_masked, dim=1).expand_as(s_masked)
        else:
            s_normal = s_masked
        # B,1,D * B,D,max_c --> B,max_c
        B_max_c = torch.bmm(s_normal.unsqueeze(1), C_pos.float()).squeeze(1)
        assert torch.sum(isnan(B_max_c.data.cpu())) == 0

        B_max_Q = _q_given_candidate(pure_Q, Q_mask, CQ_mask)
        return _rank_Q(B_max_c, B_max_Q, Q_mask, ids)

    if train_mode == 'span':
        (dw, f, dw_mask, qw, qw_mask, Qw, Qw_mask, Q_mask, start_indexs,
         end_indexs, span_mask, span2c, C_mask, ex2Q, CQ_mask, Q_ids,
         Q_names, triples, ids) = ex
        dw, f, dw_mask, qw, qw_mask, Qw, Qw_mask = to_vars_torch(
            ex[:7], self.use_cuda, evaluate=True)
        (start_indexs, end_indexs, span_mask, span2c, C_mask, Q_mask,
         CQ_mask) = to_vars(
            [start_indexs, end_indexs, span_mask, span2c, C_mask, Q_mask,
             CQ_mask],
            use_cuda=self.use_cuda, evaluate=True)
        inputs = [dw, f, dw_mask, qw, qw_mask, Qw, Qw_mask, Q_mask, ex2Q]
        # score_s, score_e: B,D ; pure_Q: B,max_Q before mask
        score_s, score_e, pure_Q = self.network(*inputs)

        # P(c): score every span as start * end, then pool spans per candidate
        span_start = score_s.gather(dim=1, index=start_indexs)  # B,max_span
        span_end = score_e.gather(dim=1, index=end_indexs)  # B,max_span
        span_s = span_start * span_end * span_mask.float()
        if normalize_ss:
            span_s += 0.00001
            # NOTE(review): same torch.sum / expand_as assumption as in the
            # 'contain' branch.
            span_normal = span_s / torch.sum(span_s, dim=1).expand_as(span_s)
        else:
            span_normal = span_s
        # B,1,max_span * B,max_span,max_c --> B,max_c
        B_max_c = torch.bmm(span_normal.unsqueeze(1),
                            span2c.float()).squeeze(1)
        assert torch.sum(isnan(B_max_c.data.cpu())) == 0

        B_max_Q = _q_given_candidate(pure_Q, Q_mask, CQ_mask)
        return _rank_Q(B_max_c, B_max_Q, Q_mask, ids)
def plot_feature_contributions(X, feature_index, fcs, attributes,
                               class_of_interest, title=None):
    """Plot one feature's values against its per-instance contributions.

    For every row ``i`` of ``X`` that has ``feature_index`` in ``fcs[i]``,
    the contribution ``fcs[i][feature_index][class_of_interest]`` goes on the
    x axis (blue if positive, red if negative, black if zero) and the
    feature value on the y axis.  Missing feature values get a dedicated
    'nan' row.  Per-value mean and variance of the contributions are printed
    and handed to ``boxplot`` before the scatter plot is drawn.

    The feature is treated as categorical when its first non-NaN entry is
    neither int nor float according to ``utils``; categorical values are
    plotted by their position in the list of distinct values.

    Args:
        X: 2-D array indexed as ``X[:, feature_index]`` / ``X[i][j]``.
        feature_index: column of ``X`` to plot.
        fcs: per-row mappings ``feature index -> per-class contributions``.
        attributes: feature names, indexed by ``feature_index``.
        class_of_interest: key/index into each contribution entry.
        title: if not ``None``, path the figure is saved to.
    """
    column = X[:, feature_index]
    first_valid = X[utils.firstNotNan(column)][feature_index]
    categorical = (not utils.isint(first_valid)
                   and not utils.isfloat(first_valid))

    if categorical:
        values = [v for v in set(column) if not utils.isnan(v)] + [np.nan]
        # NaN is looked up by position: nan != nan, so values.index() cannot
        # be trusted to find it.
        nan_slot = len(values) - 1
    else:
        values = sorted([round(v, 4) for v in set(column)
                         if not utils.isnan(v)])
        # The 'nan' row sits one step above the largest value.  The step is
        # taken between *distinct* values (rounding can create duplicates)
        # and falls back to 1 when there is only one of them.
        distinct = sorted(set(values))
        nan_index = distinct[-1] - distinct[-2] if len(distinct) > 1 else 1
        nan_slot = values[-1] + nan_index

    def y_position(value):
        if utils.isnan(value):
            return nan_slot
        return values.index(value) if categorical else value

    pos_fcs, pos_values = [], []
    neg_fcs, neg_values = [], []
    zero_fcs, zero_values = [], []
    contributions = {}
    for i in range(X.shape[0]):
        if feature_index not in fcs[i]:
            continue
        fc = fcs[i][feature_index][class_of_interest]
        value = X[i][feature_index]
        if fc > 0:
            pos_fcs.append(fc)
            pos_values.append(y_position(value))
        elif fc == 0:
            zero_fcs.append(0)
            zero_values.append(y_position(value))
        else:
            neg_fcs.append(fc)
            neg_values.append(y_position(value))

        # Numeric features collect all missing values under the 'nan' key;
        # categorical ones key on the raw value.
        if not categorical and utils.isnan(value):
            key = 'nan'
        else:
            key = value
        contributions.setdefault(key, []).append(fc)

    print('Contributions:')
    if categorical:
        summary = '\nMean: %r Variance: %r'
    else:
        summary = 'Mean: %r Variance: %r'
    for key, fc_list in contributions.items():
        print('Value %r' % key)
        # np.var in both branches: the label says "Variance".
        print(summary % (np.mean(fc_list), np.var(fc_list)))
    boxplot(list(contributions.values()), list(contributions.keys()),
            title=None)

    if categorical:
        ax = plt.subplot(111)
    else:
        fig, ax = plt.subplots()
    plt.plot(pos_fcs, pos_values, 'x', color='blue')
    plt.plot(neg_fcs, neg_values, 'x', color='red')
    plt.plot(zero_fcs, zero_values, 'x', color='black')

    if categorical:
        plt.xlabel('feature contribution')
        plt.ylabel('values of feature %r' % attributes[feature_index])
        # one empty tick below and above the categories
        ax.set_yticks(np.array(range(len(values) + 2)) - 1)
        ax.set_yticklabels([str('')] + values + [str('')])
    else:
        # draw once so the automatic tick labels exist and can be reused
        fig.canvas.draw()
        labels = ([''] + [item.get_text() for item in ax.get_yticklabels()]
                  + [''])
        if values[-1] + nan_index < ax.get_yticks()[-1]:
            plt.yticks([values[0] - nan_index] + sorted(
                list(ax.get_yticks()) + [values[-1] + nan_index]))
        else:
            plt.yticks([values[0] - nan_index] + sorted(
                list(ax.get_yticks())
                + [values[-1] + nan_index, values[-1] + 2 * nan_index]))
        # NOTE(review): the label list is not re-aligned with the ticks
        # added above -- kept exactly as in the original.
        labels[-2] = 'nan'
        plt.xlabel('feature contribution')
        plt.ylabel('values of feature %r' % attributes[feature_index])
        ax.set_yticklabels(labels)

    # Save before show(): with an interactive backend the figure is gone
    # once the window is closed, and savefig would write an empty image.
    if title is not None:
        plt.savefig(title)
    plt.show()
    plt.close()
def plot_feature_contributions_surgery_class(X, y, feature_index, fcs,
                                             attributes, class_of_interest,
                                             title=None):
    """Plot feature contributions, split by surgery status and true class.

    Like a plain contribution scatter plot (x = contribution of
    ``feature_index`` towards ``class_of_interest``, y = feature value), but
    the marker encodes the patient's value in the ``'Q44071_snCplexoAt'``
    column ('S' or 'Y' -> circle, NaN -> diamond, anything else -> cross)
    and the colour encodes whether ``y[i] == class_of_interest`` (blue) or
    not (red).

    Rows whose surgery status is NaN are always drawn on the dedicated
    'nan' row of the y axis, regardless of their feature value.

    Args:
        X: 2-D array indexed as ``X[:, j]`` / ``X[i][j]``.
        y: per-row class labels.
        feature_index: column of ``X`` to plot.
        fcs: per-row mappings ``feature index -> per-class contributions``.
        attributes: numpy array of feature names; must contain
            ``'Q44071_snCplexoAt'``.
        class_of_interest: key/index into each contribution entry.
        title: if not ``None``, path used for the saved figure and for the
            text dump of the plotted data.
    """
    surgery_index = np.where(attributes == 'Q44071_snCplexoAt')[0][0]
    column = X[:, feature_index]
    first_valid = X[utils.firstNotNan(column)][feature_index]
    categorical = (not utils.isint(first_valid)
                   and not utils.isfloat(first_valid))

    if categorical:
        values = [v for v in set(column) if not utils.isnan(v)] + [np.nan]
        nan_slot = len(values) - 1
    else:
        values = sorted([round(v, 4) for v in set(column)
                         if not utils.isnan(v)])
        print(values)
        # Step between distinct values; 1 when there is only one, so the
        # 'nan' row can always be placed.
        distinct = sorted(set(values))
        nan_index = distinct[-1] - distinct[-2] if len(distinct) > 1 else 1
        nan_slot = values[-1] + nan_index

    def y_position(value):
        # categorical values are plotted by their position in `values`
        return values.index(value) if categorical else value

    x_surgery, y_surgery, surgery_colors = [], [], []
    x_no_surgery, y_no_surgery, no_surgery_colors = [], [], []
    x_nan, y_nan, nan_colors = [], [], []
    for i in range(X.shape[0]):
        if feature_index not in fcs[i]:
            continue
        fc = fcs[i][feature_index][class_of_interest]
        color = 'blue' if y[i] == class_of_interest else 'red'
        surgery = X[i][surgery_index]
        if surgery == 'S' or surgery == 'Y':
            x_surgery.append(fc)
            y_surgery.append(y_position(X[i][feature_index]))
            surgery_colors.append(color)
        elif utils.isnan(surgery):
            x_nan.append(fc)
            y_nan.append(nan_slot)
            nan_colors.append(color)
        else:
            x_no_surgery.append(fc)
            y_no_surgery.append(y_position(X[i][feature_index]))
            no_surgery_colors.append(color)

    def _draw_legend():
        # Explicit handles: colour = class agreement, marker = surgery status.
        coi = str(class_of_interest)
        red_patch = mpatches.Patch(color='red')
        blue_patch = mpatches.Patch(color='blue')
        xmarker = mlines.Line2D([], [], color='black', marker='x',
                                markersize=10, linestyle='None')
        omarker = mlines.Line2D([], [], color='black', marker='o',
                                markersize=10, linestyle='None',
                                markerfacecolor='None',
                                markeredgecolor='black')
        plt.legend([red_patch, blue_patch, xmarker, omarker], [
            'Classe da instância ≠ ' + coi, 'Classe da instância = ' + coi,
            'Não passou por cirurgia', 'Passou por cirurgia'
        ], numpoints=1, fontsize='small')

    if categorical:
        ax = plt.subplot(111)
        ax.scatter(x_surgery, y_surgery, marker='o', s=60,
                   edgecolors=surgery_colors, facecolors='none')
        ax.scatter(x_no_surgery, y_no_surgery, marker='x', s=60,
                   edgecolors=no_surgery_colors, facecolors='none')
        ax.scatter(x_nan, y_nan, marker='d', s=60,
                   edgecolors=nan_colors, facecolors='none')
        plt.xlabel('feature contribution')
        plt.ylabel('values of feature %r' % attributes[feature_index])
        # one empty tick below and above the categories
        ax.set_yticks(np.array(range(len(values) + 2)) - 1)
        ax.set_yticklabels([str('')] + values + [str('')])
        _draw_legend()
    else:
        fig, ax = plt.subplots()
        ax.scatter(x_surgery, y_surgery, marker='o', s=60,
                   facecolors='none', edgecolors=surgery_colors)
        ax.scatter(x_no_surgery, y_no_surgery, marker='x', s=60,
                   edgecolors=no_surgery_colors)
        ax.scatter(x_nan, y_nan, marker='d', s=60,
                   facecolors='none', edgecolors=nan_colors)
        # draw once so the automatic tick labels exist and can be reused
        fig.canvas.draw()
        labels = ([''] + [item.get_text() for item in ax.get_yticklabels()]
                  + [''])
        if values[-1] + nan_index < ax.get_yticks()[-1]:
            plt.yticks([values[0] - nan_index] + sorted(
                list(ax.get_yticks()) + [values[-1] + nan_index]))
        else:
            plt.yticks([values[0] - nan_index] + sorted(
                list(ax.get_yticks())
                + [values[-1] + nan_index, values[-1] + 2 * nan_index]))
        # NOTE(review): the label list is not re-aligned with the ticks
        # added above -- kept exactly as in the original.
        labels[-2] = 'nan'
        plt.xlabel('feature contribution')
        plt.ylabel('values of feature %r' % attributes[feature_index])
        ax.set_yticklabels(labels)
        _draw_legend()

    # Save before show(): with an interactive backend the figure is gone
    # once the window is closed, and savefig would write an empty image.
    if title is not None:
        plt.savefig(title)
    plt.show()
    plt.close()

    if title is not None:
        # NOTE(review): the dump goes to the same path as the figure saved
        # above and therefore replaces it -- the path is kept as in the
        # original; confirm whether a separate file was intended.
        dump = [
            ('X=', X),
            ('\ny=', y),
            ('\nfcs=', fcs),
            ('\nfeatures=', attributes),
            ('\nfeature_index=', feature_index),
            ('\nvalues=', values),
            ('\nx_surgery=', x_surgery),
            ('\ny_surgery=', y_surgery),
            ('\nsurgery_colors=', surgery_colors),
            ('\nx_no_surgery=', x_no_surgery),
            ('\ny_no_surgery=', y_no_surgery),
            ('\nno_surgery_colors=', no_surgery_colors),
            ('\nx_nan=', x_nan),
            ('\ny_nan=', y_nan),
            ('\nnan_colors=', nan_colors),
        ]
        # `with` closes the handle, which the original never did
        with open(title, 'w') as f:
            f.writelines(label + str(obj) for label, obj in dump)
def refineDataFrame(self):
    """Fill in defaults in ``self.df`` and record the current metadata.

    For every row:

    * a missing ``mode`` becomes ``'audio'``;
    * if ``title`` is a string and the matching file
      (``audio/<title>.mp3`` or ``video/<title>.mp4``) exists, the metadata
      is read from that file with ``utils.get_metadata_file``; if the file
      does not exist, the row's own title / artist / album are used, with
      ``''`` standing in for a missing artist or album;
    * if ``title`` is not a string, the metadata comes from
      ``utils.get_metadata_link`` applied to the row's ``link``;
    * missing ``title`` / ``artist`` / ``album`` cells are filled in
      (title from the metadata, the other two with ``''``).

    The resulting metadata is stored in three new columns,
    ``default_title``, ``default_artist`` and ``default_album``, which are
    initialised to ``''``.

    Exits the program via ``sys.exit`` when a row with a string title has a
    mode other than ``'audio'`` or ``'video'``.

    Returns:
        None.  ``self.df`` is modified in place.
    """
    df = self.df
    df['default_title'] = ''
    df['default_artist'] = ''
    df['default_album'] = ''

    # mode -> (folder prefix, file extension) of an existing download
    media = {'audio': ('audio/', '.mp3'), 'video': ('video/', '.mp4')}

    # Cells are read and written with .at: chained assignment
    # (df[col][r] = x) may write to a temporary copy instead of the frame.
    for r in df.index:
        if utils.isnan(df.at[r, 'mode']):
            df.at[r, 'mode'] = 'audio'

        listed_title = df.at[r, 'title']
        if isinstance(listed_title, str):
            mode = df.at[r, 'mode']
            if mode not in media:
                sys.exit('Not a valid mode. Quitting program.')
            folder, extension = media[mode]
            path = folder + listed_title + extension
            if os.path.exists(path):
                title, artist, album = utils.get_metadata_file(path)
            else:
                title = listed_title
                # Always bind artist/album: previously a missing cell left
                # them undefined on the first row, or silently carried over
                # the previous row's value.
                artist = df.at[r, 'artist']
                if utils.isnan(artist):
                    artist = ''
                album = df.at[r, 'album']
                if utils.isnan(album):
                    album = ''
        else:
            title, artist, album = utils.get_metadata_link(df.at[r, 'link'])

        if utils.isnan(df.at[r, 'title']):
            df.at[r, 'title'] = title
        if utils.isnan(df.at[r, 'artist']):
            df.at[r, 'artist'] = ''
        if utils.isnan(df.at[r, 'album']):
            df.at[r, 'album'] = ''

        df.at[r, 'default_title'] = title
        df.at[r, 'default_artist'] = artist
        df.at[r, 'default_album'] = album
def forward(self, dw, f, dw_mask, qw, qw_mask, Qw, Qw_mask, Q_mask, ex2Q):
    """Encode document, question and Q token sequences and score the document.

    Shapes below are taken from the original inline notes (B = batch size,
    |D| / |Q| = padded document / question length) and are not checked here.

    Args:
        dw: document token ids, B x |D|.
        f: extra document token features, B x |D| x n_f (only used when
            ``self.args.num_features > 0``).
        dw_mask: document padding mask; masked positions are filled with
            ``-inf`` before normalisation.
        qw: question token ids, B x |Q|.
        qw_mask: question padding mask, same convention as ``dw_mask``.
        Qw: token ids of all Q of the batch, n_Q x |Q_tokens|.
        Qw_mask: padding mask for ``Qw``.
        Q_mask: B x max_Q mask; only its second dimension is read here.
        ex2Q: per example, the ``(start, end)`` row range of its Q in ``Qw``.

    Returns:
        Depends on ``self.args.train_mode``:

        * ``'string_match_base_dis'`` and ``'span'``:
          ``(score_s, score_e, pure_Q)``
        * ``'contain'`` / ``'NER'``: ``(score, pure_Q)``

        ``pure_Q`` is B x max_Q and has not been masked with ``Q_mask``.
        Any other mode falls through and returns ``None``.
    """

    def _merge_directions(h_n):
        # h_n: n_direction,B,h (for an LSTM: the (h_n, c_n) pair) --> B,n_direction*h
        if self.rnn_type != 'lstm':
            return torch.cat(h_n, -1)
        return torch.cat(h_n[0], -1)

    def _encode(layers, emb, mask):
        # Run one stack of RNN layers over `emb`; returns the per-token
        # outputs and the final hidden state, both after their dropout.
        hns = []
        resort = None
        if (self.training and not self.args.rnn_padding) or mask.data.sum() == 0:
            # no padding
            outputs = [emb]
            for layer in layers:
                # dropout on this layer
                inputs = F.dropout(outputs[-1], training=self.training,
                                   p=self.dropout_rnn)
                output, h_n = layer(inputs)
                outputs.append(output)
                hns.append(_merge_directions(h_n))
        else:
            # padding: pack the sequences, which needs them sorted by length
            lengths = torch.sum(mask.eq(0).long(), 1).squeeze(-1)  # B, real len
            sort_len, sort_idx = torch.sort(lengths, dim=0, descending=True)
            _, resort = torch.sort(sort_idx, dim=0)  # back to original order
            outputs = [emb[sort_idx.data]]
            for layer in layers:
                pack_inputs = torch.nn.utils.rnn.pack_padded_sequence(
                    outputs[-1], sort_len.data.cpu().numpy(),
                    batch_first=True)
                # dropout on the packed data, then repack
                dropped = F.dropout(pack_inputs.data, training=self.training,
                                    p=self.dropout_rnn)
                inputs = torch.nn.utils.rnn.PackedSequence(
                    dropped, pack_inputs.batch_sizes)
                output, h_n = layer(inputs)
                output, _ = torch.nn.utils.rnn.pad_packed_sequence(
                    output, batch_first=True)
                outputs.append(output)
                hns.append(_merge_directions(h_n))

        if self.concat:
            seq_out = torch.cat(outputs[1:], -1)  # B,T,n_lay*n_direct*h
            hidden = torch.cat(hns, -1)  # B,n_direc*n_layer*h
        else:
            seq_out = outputs[-1]  # B,T,n_direct*h
            hidden = hns[-1]  # B,n_direc*h

        if resort is not None:
            seq_out = seq_out[resort.data]
            # The hidden states are in length-sorted order too and must be
            # put back, otherwise every row is paired with another example.
            hidden = hidden[resort.data]
            # unpacking trims to the longest real sequence; pad back to the
            # mask's length
            if seq_out.size(1) != mask.size(1):
                padding = torch.zeros(
                    seq_out.size(0), mask.size(1) - seq_out.size(1),
                    seq_out.size(2)).type(seq_out.data.type())
                seq_out = torch.cat([seq_out, Variable(padding)], 1)

        seq_out = F.dropout(seq_out, training=self.training,
                            p=self.final_dropout)
        hidden = F.dropout(hidden, training=self.training, p=self.h_output)
        return seq_out, hidden

    def _masked_logits(project):
        # B,1,h_d * B,h_d,|D| --> B,|D|, padding positions set to -inf
        logits = torch.bmm(project(ques_final).unsqueeze(1),
                           doc_output.transpose(1, 2)).squeeze(1)
        logits = logits.clone()
        logits[dw_mask.data] = -float('inf')
        return logits

    # embeddings
    dw_emb = self.embedding(dw)  # B,|D|,h
    qw_emb = self.embedding(qw)  # B,|Q|,h
    Qw_emb = self.embedding(Qw)  # n_Q,|Q_tokens|,h
    B = len(dw_emb)

    # dropout on embeddings
    if self.args.dropout_emb > 0:
        dw_emb = F.dropout(dw_emb, p=self.args.dropout_emb,
                           training=self.training)
        qw_emb = F.dropout(qw_emb, p=self.args.dropout_emb,
                           training=self.training)
        Qw_emb = F.dropout(Qw_emb, p=self.args.dropout_emb,
                           training=self.training)

    # each doc token's attention-weighted sum over the question tokens,
    # used as a soft feature vector for that token
    doc_input = [dw_emb]
    if self.args.doc_use_qemb:
        if self.self_linear:
            dw_project = self.Linear_self(
                dw_emb.view(-1, self.embed_size)).view(B, -1, self.embed_size)
            dw_project = F.relu(dw_project)  # B,|D|,h
            qw_project = self.Linear_self(
                qw_emb.view(-1, self.embed_size)).view(B, -1, self.embed_size)
            qw_project = F.relu(qw_project)  # B,|Q|,h
        else:
            dw_project = dw_emb
            qw_project = qw_emb
        b2q_att = torch.bmm(dw_project, qw_project.transpose(2, 1))  # B,|D|,|Q|
        b2q_att = b2q_att.clone()
        # mask with the question's real length
        b2q_att[qw_mask.unsqueeze(1).expand_as(b2q_att).data] = -float('inf')
        b2q_att = F.softmax(b2q_att.view(-1, qw_emb.size(1))).view(
            B, dw_project.size(1), qw_project.size(1))  # B,|D|,|Q|
        b2q_each_vec = torch.bmm(b2q_att, qw_project)  # B,|D|,h
        doc_input.append(b2q_each_vec)
    if self.args.num_features > 0:
        doc_input.append(f)
    doc_input = torch.cat(doc_input, 2)

    # doc encoder
    doc_output, doc_h = _encode(self.doc_encoder, doc_input, dw_mask)

    # question encoder
    ques_output, ques_h = _encode(self.ques_encoder, qw_emb, qw_mask)

    # give different q tokens different weights
    if self.args.q_self_weight:
        self_score = self.q_self_Linear(
            ques_output.view(-1, self.ques_output_size)).squeeze(-1).view(
                B, -1)  # B,|Q|: each q token's self score
        self_score = self_score.clone()
        self_score[qw_mask.data] = -float('inf')
        self_score = F.softmax(self_score)  # B,|Q|
        # B,1,|Q| * B,|Q|,q_h --> B,q_h
        # NOTE(review): this value is overwritten by ques_h further down, so
        # the self-weighted summary is currently unused.
        ques_final = torch.bmm(self_score.unsqueeze(1),
                               ques_output).squeeze(1)

    # Q encoder
    Q_output, Q_h = _encode(self.Q_encoder, Qw_emb, Qw_mask)

    # Q2d: project every Q into document space and lay the Q of each
    # example out as one zero-padded row block
    wQ = self.Q2doc(Q_h)  # n_Q,h_Q * h_Q,h_d --> n_Q,h_d
    trans_Q = np.zeros([B, Q_mask.size(1), self.doc_output_size],
                       dtype='float32')
    trans_Q = to_var(trans_Q, self.args.cuda)  # B,max_Q,h_d
    for ex_id in range(B):
        start, end = ex2Q[ex_id]  # this example's row range in all Q
        trans_Q[ex_id, :end - start, :] = wQ[start:end, :].clone()
    # B,max_Q,h_d * B,h_d,1 --> B,max_Q
    pure_Q = torch.bmm(trans_Q, doc_h.unsqueeze(2)).squeeze(2)

    # question summary used to score every document token
    ques_final = ques_h  # ques_h / ques_final, B,h_q

    if self.args.train_mode == 'string_match_base_dis':
        score_s = _masked_logits(self.q2doc_s)
        score_e = _masked_logits(self.q2doc_e)
        if self.training:
            score_s = F.log_softmax(score_s)  # B,|D|
            score_e = F.log_softmax(score_e)  # B,|D|
        else:
            score_s = torch.exp(score_s)  # B,|D|
            score_e = torch.exp(score_e)  # B,|D|
        assert torch.sum(isnan(score_s.data.cpu())) == 0
        assert torch.sum(isnan(score_e.data.cpu())) == 0
        return score_s, score_e, pure_Q  # pure_Q: B,max_Q, before mask

    if self.args.train_mode == 'span':
        score_s = _masked_logits(self.q2doc_s)
        score_e = _masked_logits(self.q2doc_e)
        if self.training or self.normalize:
            score_s = F.softmax(score_s)
            score_e = F.softmax(score_e)
        else:
            score_s = torch.exp(score_s)  # B,|D|
            score_e = torch.exp(score_e)  # B,|D|
        return score_s, score_e, pure_Q

    if self.args.train_mode == 'contain' or self.args.train_mode == 'NER':
        score = _masked_logits(self.q2doc)
        if self.training or self.normalize:
            score = F.softmax(score)
        else:
            score = torch.exp(score)
        assert torch.sum(isnan(score.data.cpu())) == 0
        return score, pure_Q
def plot_followup_improvements():
    """Plot ``opcForca<side>[AbdOmbro]`` values over time, one line per patient.

    Loads the follow-up (Q92510) and admission (Q44071) questionnaire CSVs,
    inner-joins them on ``participant code`` and collects, per participant,
    the admission value followed by every non-missing follow-up value,
    together with the matching ``formTempoAval`` values.  Participants with
    at least three collected values are plotted: the ones recorded in
    ``surgery_patients`` (``snCplexoAt`` equal to ``'S'``) five per figure,
    all the others together in one last figure.

    The function only produces console output and matplotlib figures; it
    returns ``None``.

    NOTE(review): the original text of this function had lost its
    indentation.  The nesting used here is the reading in which every
    dictionary entry is created before it is appended to -- confirm against
    the author's layout, in particular the placement of the surgery check.
    """
    base_dir = ('~/Faculdade/Mestrado/Projeto/scripts/Working Scripts/'
                'EXPERIMENT_DOWNLOAD/Group_patients-with-brachial-plexus-injury/'
                'Per_questionnaire_data/')

    def load_responses(relative_path):
        # Both questionnaires are exported in the same CSV dialect.
        return pd.read_csv(
            base_dir + relative_path,
            header=0,
            delimiter=",",
            na_values=['N/A', 'None', 'nan', 'NAAI', 'NINA'],
            quoting=0,
            encoding='utf8',
            mangle_dupe_cols=False)

    followup_data = load_responses(
        'Q92510_unified-follow-up-assessment/Responses_Q92510.csv')
    admission_data = load_responses(
        'Q44071_unified-admission-assessment/Responses_Q44071.csv')
    print(admission_data.shape)
    print(followup_data.shape)

    return_value = {}   # participant code -> collected values
    return_period = {}  # participant code -> formTempoAval of each value
    surgery_patients = set()

    # The content of this column is spliced into the value column names.
    injury_side_column = admission_data.filter(like='opcLdLesao').columns[0]
    merged_data = admission_data.merge(followup_data,
                                       how='inner',
                                       on='participant code',
                                       suffixes=('_a', '_f'))

    for _, row in merged_data.iterrows():
        code = row['participant code']
        value_column = 'opcForca' + row[injury_side_column] + '[AbdOmbro]'

        if code in return_value:
            followup_value = row[value_column + '_f']
            if not utils.isnan(followup_value):
                followup_time = row['formTempoAval_f']
                if followup_time < return_period[code][-1]:
                    # Earlier than the most recent entry: slot it in just
                    # before that entry so the last two stay ordered.
                    return_value[code].insert(-1, followup_value)
                    return_period[code].insert(-1, followup_time)
                else:
                    return_value[code].append(followup_value)
                    return_period[code].append(followup_time)
        else:
            admission_value = row[value_column + '_a']
            if not utils.isnan(admission_value):
                # First usable row for this participant: start with the
                # admission assessment, then add this row's follow-up.
                return_value[code] = [admission_value]
                return_period[code] = [row['formTempoAval_a']]
                followup_value = row[value_column + '_f']
                if not utils.isnan(followup_value):
                    return_value[code].append(followup_value)
                    return_period[code].append(row['formTempoAval_f'])
            if row['snCplexoAt_a'] == 'S' or row['snCplexoAt_f'] == 'S':
                surgery_patients.add(code)

    spatients_to_plot = []
    speriods_to_plot = []
    nspatients_to_plot = []
    nsperiods_to_plot = []
    for patient, values in return_value.items():
        if len(values) < 3:
            continue
        if patient in surgery_patients:
            spatients_to_plot.append(values)
            speriods_to_plot.append(return_period[patient])
        else:
            nspatients_to_plot.append(values)
            nsperiods_to_plot.append(return_period[patient])

    # Range of the collected periods (helps to choose the x-axis limits).
    all_periods = [p for periods in return_period.values() for p in periods]
    if all_periods:
        print(min(all_periods))
        print(max(all_periods))
    # A stray ``exit()`` used to sit here; it terminated the interpreter and
    # made every plotting statement below unreachable.

    group_size = 5
    for start in range(0, len(spatients_to_plot), group_size):
        stop = start + group_size
        ax = plt.subplot(111)
        plt.axis((0, 3000, -1, 6))
        for periods, values in zip(speriods_to_plot[start:stop],
                                   spatients_to_plot[start:stop]):
            ax.plot(periods, values, 'x-')
        plt.show()

    ax = plt.subplot(111)
    plt.axis((0, 3000, -1, 6))
    for periods, values in zip(nsperiods_to_plot, nspatients_to_plot):
        ax.plot(periods, values, 'x-')
    plt.show()
def predict_rec(self, X, node, shuffle_attribute=None):
    """Recursively run instance ``X`` down the tree and return class probabilities.

    Parameters
    ----------
    X : indexable
        The instance; ``X[node.feature_index]`` is the value tested at a node.
    node : tree node
        Must expose ``is_class``, ``distr`` (class -> count), ``final_class``,
        ``feature_index``, ``values``, ``branches`` and ``branch_nan``.
    shuffle_attribute : optional
        When equal to ``node.feature_index`` the instance's own value is
        ignored and a branch is drawn at random, weighted by the share of the
        node's instances found in each branch.

    Returns
    -------
    dict
        Maps class to probability.  A class node that holds no instances
        returns ``{node.final_class: 1}``.
    """
    # Class (leaf) node: relative class distribution.
    if node.is_class:
        total = sum(node.distr.values())
        if total == 0:
            # No instances reached this node (e.g. it hangs off a nan
            # branch): vote for its most probable class.  An entirely empty
            # distribution keeps yielding an empty dict, as before.
            return {node.final_class: 1} if node.distr else {}
        return {k: count / total for k, count in node.distr.items()}

    # Permuted feature: choose the branch at random instead of looking at X.
    if (shuffle_attribute is not None
            and node.feature_index == shuffle_attribute):
        # Share of the node's instances in every branch but the last one,
        # rounded to 5 decimal digits.
        probs = [
            round(
                sum(node.branches[j].distr.values()) /
                sum(node.distr.values()), 5)
            for j in range(len(node.branches) - 1)
        ]
        if node.branch_nan is not None:
            # With a nan branch, the last regular branch gets an explicit
            # share and the nan branch takes the remainder.
            probs.append(
                round(
                    sum(node.branches[len(node.branches) - 1].distr.values())
                    / sum(node.distr.values()), 5))
        if 1 - sum(probs) < 0:
            # Rounding pushed the sum above 1: trim the last share and give
            # the remaining branch probability 0.
            probs[-1] = round(1 - sum(probs[:-1]), 5)
            probs.append(0)
        else:
            probs.append(1 - sum(probs))

        i = np.random.choice(range(len(probs)), p=probs)
        if i == len(node.branches) and node.branch_nan is not None:
            return self.predict_rec(X, node.branch_nan, shuffle_attribute)
        return self.predict_rec(X, node.branches[i], shuffle_attribute)

    value = X[node.feature_index]
    if not utils.isnan(value):
        # Numeric feature (a single threshold): left branch when value <=
        # threshold, right branch otherwise.
        if len(node.values) == 1:
            side = 0 if value <= node.values[0] else 1
            return self.predict_rec(X, node.branches[side],
                                    shuffle_attribute)
        # Categorical feature.  Only the lookup is guarded, so a ValueError
        # raised further down the recursion is no longer mistaken for an
        # unseen category at this node.
        try:
            position = node.values.index(str(value))
        except ValueError:
            # Value never seen during training: handle it as missing below.
            pass
        else:
            return self.predict_rec(X, node.branches[position],
                                    shuffle_attribute)

    # Missing (or unseen) value with a dedicated branch for missing values.
    if node.branch_nan is not None:
        return self.predict_rec(X, node.branch_nan, shuffle_attribute)

    # No nan branch (C4.5 approach): combine the predictions of all branches,
    # each weighted by its share of the node's instances.
    combined = {k: 0 for k in node.distr}
    for branch in node.branches:
        prediction = self.predict_rec(X, branch, shuffle_attribute)
        weight = sum(branch.distr.values()) / sum(node.distr.values())
        for k in combined:
            # A child may not report every class (an empty class node only
            # reports its final class); treat absent classes as 0.
            combined[k] += weight * prediction.get(k, 0)
    return combined
def plot_followup_pain():
    """Bar-plot the ``snDorPos`` answers of the follow-up questionnaire.

    Reads the follow-up (Q92510) CSV and keeps one answer per
    ``participant code``: the first row seen, replaced by any row that has a
    usable answer and either a larger ``formTempoAval`` or fills in an answer
    that was missing so far.  The number of participants answering ``'N'``,
    ``'S'`` or nothing at all is then shown as a bar chart.

    The function shows a matplotlib figure and returns ``None``.

    NOTE(review): the original text of this function had lost its
    indentation; the nesting of the update rules is reconstructed -- confirm.
    """
    data_path = ('~/Faculdade/Mestrado/Projeto/scripts/Working Scripts/'
                 'EXPERIMENT_DOWNLOAD/Group_patients-with-brachial-plexus-injury/'
                 'Per_questionnaire_data/'
                 'Q92510_unified-follow-up-assessment/Responses_Q92510.csv')
    data = pd.read_csv(data_path,
                       header=0,
                       delimiter=",",
                       na_values=['N/A', 'None', 'nan', 'NAAI', 'NINA'],
                       quoting=0,
                       encoding='utf8',
                       mangle_dupe_cols=False)

    outcome = 'snDorPos'
    patients_considered = {}  # participant code -> formTempoAval of kept row
    patient_outcomes = {}     # participant code -> kept answer

    for _, row in data.iterrows():
        code = row['participant code']
        answer = row[outcome]
        if code not in patients_considered:
            patients_considered[code] = row['formTempoAval']
            patient_outcomes[code] = answer
            continue

        is_later = row['formTempoAval'] > patients_considered[code]
        # An older row may still be used when nothing usable is stored yet.
        if is_later or utils.isnan(patient_outcomes[code]):
            if answer != 'NINA' and not utils.isnan(answer):
                patient_outcomes[code] = answer
                patients_considered[code] = row['formTempoAval']

    xlabels = ['N', 'S', np.nan]
    labels = {'S': 'Sim', 'N': 'Não', np.nan: 'Não informado'}

    # NaN never compares equal to itself, so looking up ``np.nan`` in a
    # Counter only matches values that are that very object.  Map every
    # missing answer onto ``np.nan`` first so all of them are counted.
    counts = Counter(np.nan if utils.isnan(answer) else answer
                     for answer in patient_outcomes.values())
    y = [counts[x] for x in xlabels]

    width = 0.8
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.bar(range(len(xlabels)), y, width=width)
    ax.set_xticks(np.arange(len(xlabels)) + width / 2)
    ax.set_xticklabels([labels[l] for l in xlabels])
    plt.xlabel('Sente dor após a lesão?')
    plt.show()
def find_split(self, X, y, feature_indices, weights):
    """Pick the feature (and split value) with the highest information gain.

    Features are visited in sorted index order.  For each one, only the rows
    whose value is not missing are used, and the resulting gain is scaled by
    the fraction of rows that had a known value, so features with many
    missing values are penalised.  On equal gain the candidate evaluated
    last wins.

    ``weights`` is accepted but not used here.

    Returns ``(feature_index, value)``.  ``value`` is ``[threshold]`` for a
    numeric feature and the sorted list of observed values for a categorical
    one.  ``(-1, [0])`` is returned when no candidate split was evaluated.
    """
    top_gain = float('-inf')
    top_feature = -1
    top_value = [0]

    for feature_index in sorted(feature_indices):
        # Restrict to the instances with a known value for this feature.
        column = X[:, feature_index]
        known = [r for r in range(X.shape[0]) if not utils.isnan(column[r])]
        X_known = X[known, :]
        y_known = y[known]
        if X_known.shape[0] == 0:
            # Nothing known for this feature; try the next one.
            continue

        observed = sorted(set(X_known[:, feature_index]))

        if not utils.isnum(X_known[0, feature_index]):
            # Categorical feature: a single multi-way split on all values.
            _, y_parts, _ = utils.split_categ(X_known, y_known,
                                              feature_index, observed)
            gain = (len(y_known) / len(y)) * utils.information_gain(
                y_known, y_parts)
            if gain >= top_gain:
                top_gain = gain
                top_feature = feature_index
                top_value = observed
            continue

        # Numeric feature: every observed value except the largest is tried
        # as a threshold, so thresholds always occur in the data (as in C4.5).
        for threshold in observed[:-1]:
            _, y_parts, _ = utils.split_num(X_known, y_known, feature_index,
                                            threshold)
            gain = (len(y_known) / len(y)) * utils.information_gain(
                y_known, y_parts)
            if gain >= top_gain:
                top_gain = gain
                top_feature = feature_index
                top_value = [threshold]

    return top_feature, top_value
def _sanity_check(self, ground_metric_matrix): assert not (ground_metric_matrix < 0).any() assert not (isnan(ground_metric_matrix).any())