def information_gain(array_source, array_children_list, criterion='gini'):
    """Compute the information gain of splitting a parent array into children.

    The gain is the impurity of ``array_source`` minus the sum, over every
    child, of ``len(child) / len(array_source) * impurity(child)``.

    Args:
        array_source: ``np.ndarray`` holding the parent node's values.
        array_children_list: ``np.ndarray`` whose first axis enumerates the
            child arrays produced by the split.
        criterion: ``'gini'`` or ``'entropy'``; selects the module-level
            impurity function of the same name.

    Returns:
        The information gain, or ``None`` (after printing a diagnostic) when
        an argument has the wrong type or the criterion is unknown.
    """
    if not (isinstance(array_source, np.ndarray)
            and isinstance(array_children_list, np.ndarray)):
        print("info_gain: error in type of array")
        return None

    # Resolve the impurity function lazily so that only the requested
    # criterion has to exist in the module.
    if criterion == "gini":
        impurity = gini
    elif criterion == "entropy":
        impurity = entropy
    else:
        print("info_gain: error in children list or criterion type")
        return None

    parent_impurity = impurity(array_source)
    parent_size = float(len(array_source))

    # Weight each child's impurity by its share of the parent's samples.
    # (The previous code used the whole children container on every
    # iteration and never looked at the individual children.)
    weighted_children = 0.0
    for child in array_children_list:
        weighted_children += (len(child) / parent_size) * impurity(child)

    return parent_impurity - weighted_children
def cal_diversity():
    """Compute per-paper reference diversity values and write them to disk.

    Reads ``data/subj_refnum.json`` and ``data/paper_year.json``, then streams
    ``data/paper_ref_attrs.json`` (one JSON object per line, mapping a paper
    id to a list of ``(year, c5, c10, subjs)`` reference attributes). For each
    paper with at least five references it stores
    ``[year_div, c5_div, c10_div, subj_div]`` and writes one JSON object per
    input line to ``data/pid_divs.txt``.

    All files are opened with context managers so the output is flushed and
    closed even if processing fails part-way through.
    """
    with open('data/subj_refnum.json') as subj_file:
        subj_refnum = json.loads(subj_file.read())
    subj_totalnum = len(subj_refnum.keys())

    with open('data/paper_year.json') as year_file:
        paper_year = json.loads(year_file.read())

    logging.info('year data loaded ....')

    # Total count accumulated per inner subject key across all outer subjects.
    citnum_total = defaultdict(int)
    for subj in subj_refnum.keys():
        for subj2 in subj_refnum[subj].keys():
            citnum_total[subj2] += subj_refnum[subj][subj2]

    progress = 0
    # The output file is opened first, as before, so it is created/truncated
    # before the input is touched.
    with open('data/pid_divs.txt', 'w') as of, \
            open('data/paper_ref_attrs.json') as attrs_file:
        for line in attrs_file:
            progress += 1
            logging.info('progress {} ...'.format(progress))

            pid_div_vs = {}
            line = line.strip()
            paper_ref_attrs = json.loads(line)

            for pid in paper_ref_attrs.keys():
                years = []
                c5s = []
                c10s = []
                all_subjs = []

                for ref_attr in paper_ref_attrs[pid]:
                    year, c5, c10, subjs = ref_attr
                    # Difference between the reference year and the paper year.
                    years.append(int(year) - int(paper_year[pid]))
                    c5s.append(c5)
                    c10s.append(c10)
                    all_subjs.append(subjs)

                # Papers with fewer than five references are skipped.
                if len(years) < 5:
                    continue

                year_div = gini(np.array(years))
                c5_div = gini(c5s)
                c10_div = gini(c10s)
                subj_div = cal_subj_div(all_subjs, subj_refnum,
                                        subj_totalnum, citnum_total)

                pid_div_vs[pid] = [year_div, c5_div, c10_div, subj_div]

            of.write(json.dumps(pid_div_vs) + "\n")

    logging.info('paper attr done.')
def fit(self, X, y):
    """Evaluate a mean-threshold split on every feature of the training set.

    For each column of ``X`` the column mean is used as the split threshold,
    the data is partitioned with ``self.split_`` and the resulting
    information gain is printed.

    Args:
        X: a pandas.DataFrame with the training input (m rows, n features).
        y: the labels (m x 1). Not used by the current implementation.

    Returns:
        ``self.root``. This method does not modify it.
    """
    # Iterate over all n features instead of a hard-coded count of four, so
    # inputs with fewer or more columns are handled.
    for feat in range(X.shape[1]):
        column = X.iloc[:, feat]
        threshold = np.mean(column)
        right, left = self.split_(X, feat, threshold)
        inf = information_gain(X, [right, left])
        print(inf)
    return self.root
def diversity_of_equal_percentile(pid_citnum, N):
    """Split papers into N groups of equal citation share and measure balance.

    Papers are visited in descending citation order. Each time the
    accumulated share of all citations has grown by at least ``1/N`` since
    the last cut, the fraction of papers consumed since that cut is recorded.

    Args:
        pid_citnum: dict mapping a paper id to its citation count.
        N: number of equal-share citation groups to cut.

    Returns:
        ``(percents, diversity)`` where ``percents`` lists the fraction of
        papers in each completed group and ``diversity`` is
        ``gini(percents)``.
    """
    # Materialise the dict view: on Python 3, NumPy does not reduce a
    # ``dict_values`` object element-wise, so ``np.sum`` needs a real list.
    cits = list(pid_citnum.values())
    total = np.sum(cits)
    num = len(cits)

    acc_total = 0
    c_p = 0  # cumulative citation share already assigned to finished groups
    num_of_p = 0  # papers consumed since the last cut
    percents = []
    for v in sorted(cits, key=lambda x: int(x), reverse=True):
        acc_total += v
        num_of_p += 1
        if acc_total / float(total) - c_p >= 1 / float(N):
            c_p += 1 / float(N)
            percents.append(num_of_p / float(num))
            num_of_p = 0

    # With the share of papers in each group known, compute the diversity
    # across the percentile groups.
    diversity = gini(percents)
    return percents, diversity
def main():
    """Compute GiniQC metrics for every cell listed in the input file.

    Command-line arguments (``sys.argv``):
        1. path of a text file listing one ``.cool`` file per line
        2. path of the output table
        3. minimum read count (int)
        4. minimum percentage of cis reads (float)
        5. maximum chromosomal aberration (float)
        6. minimum adjusted Gini value (float)

    A header and a "THRESHOLD" row are written first, followed by one row per
    cell that could be opened. Entries that cannot be opened are reported and
    skipped.
    """
    with open(sys.argv[1], 'r') as infile, open(sys.argv[2], 'w') as outfile:
        # parse arguments
        reads_threshold = int(sys.argv[3])
        cis_threshold = float(sys.argv[4])
        max_aberration = float(sys.argv[5])
        gini_threshold = float(sys.argv[6].strip())

        # print header lines
        outfile.write(OUTFILE_HEADER)
        outfile.write(LINE_STRUCTURE % ("THRESHOLD", reads_threshold,
                                        cis_threshold, 0.0, gini_threshold,
                                        max_aberration, "N/A"))

        # calculate GiniQC and other metrics for each cell, then write each
        # to outfile
        lines = infile.readlines()
        for line in tqdm(lines):
            # Entries without a "/" are resolved relative to the directory
            # of the listing file.
            if "/" in line:
                path = line.strip()
            else:
                path = ("/".join(sys.argv[1].split("/")[:-1]) + "/"
                        + line.strip())
            cell_name = line.split(".cool")[0]

            try:
                matrix = cooler.Cooler(path)
            except Exception:
                # Skip this entry; otherwise the metrics below would be
                # computed from the previous cell's matrix (or fail with a
                # NameError on the first entry).
                print("Files must be in cool format")
                continue

            normalized, reads, cis_reads, trans_reads = normalize_matrix(matrix)
            percent_cis = 100.0 * cis_reads / reads
            raw_gini = gini(normalized)
            adj_gini = adjust(raw_gini, reads)
            chrom_aberration = get_max_aberration(matrix)

            passed = ((reads > reads_threshold)
                      and (percent_cis > cis_threshold)
                      and (adj_gini > gini_threshold)
                      and (chrom_aberration < max_aberration))
            outfile.write(LINE_STRUCTURE % (cell_name, reads, percent_cis,
                                            raw_gini, adj_gini,
                                            chrom_aberration, passed))
def test_gini_val(self):
    """Check that the third element returned by gini.gini is 0.0.

    The data comes from ``read_data_class()`` and ``3`` is passed as the
    third argument of ``gini.gini``.
    """
    features, labels = read_data_class()
    expected = 0.0
    observed = gini.gini(features, labels, 3)[2]
    self.assertEqual(expected, observed)
def cal_subj_div(subj_totalnum, subj_nums, subjs, subj_subj_sim):
    """Return the subject diversity as variety * balance * disparity.

    Args:
        subj_totalnum: total number of subjects; denominator of the variety.
        subj_nums: values passed to ``gini`` to obtain the balance.
        subjs: subjects involved; their count is the numerator of the variety.
        subj_subj_sim: passed through to ``cal_disparsity`` with ``subjs``.

    Returns:
        The product of the three components.
    """
    # Components are evaluated in the same order as they are multiplied.
    components = (
        len(subjs) / float(subj_totalnum),
        gini(subj_nums),
        cal_disparsity(subjs, subj_subj_sim),
    )
    richness, evenness, distance = components
    return richness * evenness * distance
def get_threshold(combos, cull_by_cis, bedfile):
    """Compute read counts and Gini values for randomly merged cell pairs.

    For each pair of distinct cells in ``combos`` a random subset of read
    indices is drawn, a pixel table is built with ``fill_pixel_df`` and
    written to the temporary file ``temp.cool``, and the resulting cooler is
    normalised and scored.

    Args:
        combos: iterable of cell pairs.
        cull_by_cis: when truthy, pairs in which either cell's
            ``calculate_cistrans`` value is below ``int(sys.argv[3])`` are
            skipped.
        bedfile: passed to ``make_df`` to build the bins table.

    Returns:
        ``(reads, rawgini, adjustedgini)``: three dicts keyed by the pair
        tuple.
    """
    reads = {}
    rawgini = {}
    adjustedgini = {}
    bins_df = make_df(bedfile)
    if cull_by_cis:
        cis_threshold = int(sys.argv[3])

    for pair in tqdm(combos):
        if pair[0] == pair[1]:
            continue
        pair = tuple(pair)
        cool1, cool2 = get_cools(pair)

        # Pairs whose matrices cannot be read are skipped. Catch Exception
        # rather than everything so Ctrl-C / SystemExit still propagate.
        try:
            matrix1 = np.array(cool1.matrix(as_pixels=True, balance=False)[:])
            matrix2 = np.array(cool2.matrix(as_pixels=True, balance=False)[:])
        except Exception:
            continue

        # The last column of the pixel table is summed as the read count.
        numreads1 = sum(matrix1[:, -1])
        numreads2 = sum(matrix2[:, -1])
        totalreads = numreads1 + numreads2

        if cull_by_cis and (calculate_cistrans(matrix1) < cis_threshold
                            or calculate_cistrans(matrix2) < cis_threshold):
            continue
        if numreads1 == 0 or numreads2 == 0 or totalreads < 50000:
            continue

        # Number of reads to keep: normal around half of the total.
        numtoselect = int(
            abs(np.random.normal(totalreads / 2, totalreads / 20)))
        rands = np.random.choice(np.arange(1, totalreads), numtoselect,
                                 replace=False)
        rands.sort()
        pixel_df = fill_pixel_df(rands, matrix1, matrix2, numreads1,
                                 numreads2)

        try:
            cooler.create_cooler("temp.cool",
                                 bins=bins_df,
                                 pixels=pixel_df,
                                 dtypes={
                                     'bin1_id': int,
                                     'bin2_id': int,
                                     'count': int
                                 },
                                 ordered=True)
            newcool = cooler.Cooler("temp.cool")
            normalized, reads[pair], _cis, _trans = normalize_matrix(newcool)
            rawgini[pair] = gini(normalized)
            adjustedgini[pair] = adjust(rawgini[pair], reads[pair])
        finally:
            # Always remove the scratch file, including when a step above
            # raises, so a failed run does not leave temp.cool behind.
            if os.path.exists("temp.cool"):
                os.unlink("temp.cool")

    return reads, rawgini, adjustedgini
def plantclass(X, y, num):
    """
    Recursively build a decision tree.

    Input:
    X - matrix with the examples used to build the tree
    y - vector with the decisions
    num - number of features among which gini picks the split value

    The split value is chosen based on the Gini impurity. Building stops
    when the Gini value in a node equals 0.0 - at that point the leaves
    holding the decisions are created.
    """
    split = gini.gini(X, y, num)
    feature_index, split_value, impurity = split[0], split[1], split[2]

    # Both outcomes below partition the data with the same call.
    set1, set2, y1, y2 = Tree.divideset(X, feature_index, split_value, y)

    if impurity == 0:
        # Pure split: attach two leaves. When one side is empty its leaf
        # gets abs(other - 1) of the other side's first decision.
        if len(y1) == 0 and len(y2) > 0:
            fbval = float(y2[0])
            tbval = abs(fbval - 1)
        elif len(y2) == 0 and len(y1) > 0:
            tbval = float(y1[0])
            fbval = abs(tbval - 1)
        elif len(y1) > 0 and len(y2) > 0:
            tbval = y1[0]
            fbval = y2[0]
        # NOTE: if both y1 and y2 are empty, no leaf values are assigned and
        # the construction below raises UnboundLocalError.
        return node.Node(tb=leaf.Leaf(tbval),
                         fb=leaf.Leaf(fbval),
                         value=split_value,
                         index=feature_index,
                         gn=impurity)

    # Impure split: recurse into every non-empty side; an empty side becomes
    # a leaf with a random 0/1 decision.
    if len(set1) != 0:
        trueBranch = Tree.plantclass(set1, y1, num)
    else:
        trueBranch = leaf.Leaf(random.randint(0, 1))

    if len(set2) != 0:
        falseBranch = Tree.plantclass(set2, y2, num)
    else:
        falseBranch = leaf.Leaf(random.randint(0, 1))

    return node.Node(tb=trueBranch,
                     fb=falseBranch,
                     value=split_value,
                     index=feature_index,
                     gn=impurity)
def cal_subj_div(all_subjs, subj_refnum, subj_totalnum, citnum_total):
    """Compute subject diversity and its three components.

    Args:
        all_subjs: iterable of subject collections, one per reference.
        subj_refnum: passed through to ``cal_disparsity``.
        subj_totalnum: total number of subjects; denominator of the variety.
        citnum_total: passed through to ``cal_disparsity``.

    Returns:
        ``(variety * balance * disparsity, variety, balance, disparsity)``.
    """
    # Number of subjects attached to each reference.
    subj_num = [len(subjs) for subjs in all_subjs]

    # Distinct subjects over all references, flattened in encounter order
    # before de-duplication.
    flattened = [subj for subjs in all_subjs for subj in subjs]
    subj_set = list(set(flattened))

    # nc / N
    variety = len(subj_set) / float(subj_totalnum)
    balance = gini(subj_num)
    disparsity = cal_disparsity(subj_set, subj_refnum, citnum_total)

    combined = variety * balance * disparsity
    return combined, variety, balance, disparsity
def plantclass(X, y, num):
    """
    Recursively grow a decision tree from the examples.

    Input:
    X - matrix with the examples used to build the tree
    y - vector with the decisions
    num - number of features among which gini picks the split value

    The split value is selected using the Gini impurity. Growth stops once
    the Gini value of a node is 0.0; the leaves with the decisions are
    created at that point.
    """
    gini_tup = gini.gini(X, y, num)
    node_args = dict(value=gini_tup[1], index=gini_tup[0], gn=gini_tup[2])
    is_pure = gini_tup[2] == 0

    set1, set2, y1, y2 = Tree.divideset(X, gini_tup[0], gini_tup[1], y)

    if not is_pure:
        # Recurse into each non-empty subset; an empty subset is replaced by
        # a leaf holding a random 0/1 decision. The true branch is always
        # resolved before the false branch.
        trueBranch = (Tree.plantclass(set1, y1, num)
                      if len(set1) != 0 else leaf.Leaf(random.randint(0, 1)))
        falseBranch = (Tree.plantclass(set2, y2, num)
                       if len(set2) != 0 else leaf.Leaf(random.randint(0, 1)))
        return node.Node(tb=trueBranch, fb=falseBranch, **node_args)

    # Pure node: derive the two leaf decisions from the first label on each
    # side; an empty side receives abs(other - 1).
    if len(y1) == 0 and len(y2) > 0:
        fbval = float(y2[0])
        tbval = abs(fbval - 1)
    elif len(y2) == 0 and len(y1) > 0:
        tbval = float(y1[0])
        fbval = abs(tbval - 1)
    elif len(y1) > 0 and len(y2) > 0:
        tbval = y1[0]
        fbval = y2[0]
    # NOTE: with both label vectors empty neither value is bound, and the
    # next statement raises UnboundLocalError.
    true_leaf = leaf.Leaf(tbval)
    false_leaf = leaf.Leaf(fbval)
    return node.Node(tb=true_leaf, fb=false_leaf, **node_args)
def best_value(rows, col):
    """Find the threshold on column ``col`` that yields the largest gain.

    Candidate thresholds are the integers 2 through 10. For each one a
    ``Question(col, val)`` is built and ``rows`` is partitioned with it;
    candidates that leave either side empty are ignored.

    Args:
        rows: the data rows to split.
        col: the column the question is asked about.

    Returns:
        ``(best_gain, best_question)``; ``(0, None)`` when no candidate
        improves on a gain of 0.
    """
    # Impurity of the unsplit rows, handed to info_gain for every candidate.
    baseline = gini(rows)

    top_gain = 0
    top_question = None
    for threshold in range(2, 11):
        candidate = Question(col, threshold)
        matched, unmatched = partition(rows, candidate)

        # A split that does not separate the rows is not a valid candidate.
        if not (len(matched) and len(unmatched)):
            continue

        candidate_gain = info_gain(matched, unmatched, baseline)
        if candidate_gain > top_gain:
            top_gain = candidate_gain
            top_question = candidate

    return top_gain, top_question
# -*- coding: UTF-8 -*-
# Demo script: loads a tab-separated feature table and its classifications,
# prints them side by side and then prints the result of gini.gini on them.
import gini

# One example per line, features separated by tabs. The files are opened
# with context managers so the handles are closed after reading.
with open("gini_dane.txt", "r") as f:
    tabela = [i.strip().split("\t") for i in f]

# One classification per line, aligned with the rows of the table.
with open("gini_klasyfikacje.txt", "r") as g:
    y = [j.strip() for j in g]

# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the former print statements were a syntax error on Python 3.
print("Tabela X z wartościami y po prawej stronie:")
for element in zip(tabela, y):
    print(element[0] + [element[1]])

print("Wynik działania funkcji na zbiorze danych\n(indeks cechy / wartość cechy / wartość Gini impurity):")
a = gini.gini(tabela, y, 3)
print(a)
def run_sim(self):
    """Roll out the current policy in the environment and collect trajectories.

    Runs ``self.trajectory_len`` environment steps for a batch of
    ``self.n_trajectories`` trajectories with gradient tracking disabled.
    At each step a weight vector is sampled from the policy, every item is
    scored against it, the highest-scoring item is recommended, and the
    step's observation, action, reward and cost are appended to each
    trajectory.

    Side effects: appends one value each to ``self.pop_rate``,
    ``self.hit_rate`` and ``self.gini_coefficient``.

    Returns:
        A ``Memory`` built from the collected ``Trajectory`` objects.
    """
    self.policy.eval()
    with torch.no_grad():
        trajectories = np.asarray(
            [Trajectory() for i in range(self.n_trajectories)])
        # Used as the middle dimension of the item-weight tensors below.
        # presumably the number of items recommended per step - TODO confirm
        ra_length = 1
        # epsilon = 0.9
        item_embeds = torch.from_numpy(self.item_embeddings).to(
            self.device).float()
        # Running sums of `info` and `costs` over all steps and trajectories.
        ave_score = 0
        ave_cost = 0
        states = self.env.reset()
        # print(states.shape)
        # Marks (with 1) the items already recommended to each trajectory.
        recommended_item_onehot = torch.FloatTensor(
            self.n_trajectories, self.nb_item).zero_().to(self.device)
        recommendations = []
        for t in range(self.trajectory_len):
            # Flatten each trajectory's state into one row for the policy.
            policy_input = torch.FloatTensor(states).to(self.device).view(
                self.n_trajectories, -1)
            weight_dists = self.policy(policy_input)
            w = weight_dists.sample()
            # print(w.shape)
            # Score every item: sampled weights times the item embeddings.
            item_weights = torch.mm(w.view(-1, item_embeds.shape[1]),
                                    item_embeds.transpose(0, 1)).view(
                                        self.n_trajectories, ra_length, -1)
            # Multiply the scores of already recommended items by zero.
            # NOTE(review): this sets their score to 0 rather than excluding
            # them, so an already recommended item could still win the
            # argmax if every other score is negative - confirm intended.
            item_weights = torch.mul(item_weights.transpose(0, 1),
                                     1 - recommended_item_onehot).reshape(
                                         states.shape[0], ra_length, -1)
            item_idxes = torch.argmax(item_weights, dim=2)
            recommendations.append(item_idxes)
            # In-place update: flag the items chosen at this step.
            recommended_item_onehot = recommended_item_onehot.scatter_(
                1, item_idxes, 1)
            actions = item_embeds[item_idxes.cpu().detach()]
            states_prime, rewards, costs, info = self.env.step(
                actions, item_idxes)
            # Record this step in every trajectory of the batch.
            for i in range(len(trajectories)):
                trajectory = trajectories[i]
                trajectory.observations.append(policy_input[i].to(
                    self.device).squeeze())
                trajectory.actions.append(actions[i].to(
                    self.device).squeeze())
                trajectory.rewards.append(rewards[i].to(
                    self.device).squeeze())
                trajectory.costs.append(costs[i].to(self.device).squeeze())
            states = states_prime
            ave_score += torch.sum(info).detach().cpu()
            ave_cost += torch.sum(costs).detach().cpu()
        memory = Memory(trajectories)
        # print(ave_score.float()/(self.trajectory_len*self.n_trajectories), ave_cost/(self.trajectory_len*self.n_trajectories))
        # Mean cost per step and trajectory.
        self.pop_rate.append(ave_cost /
                             (self.trajectory_len * self.n_trajectories))
        # NOTE(review): recommendation_tensor and idx are not used afterwards.
        recommendation_tensor = torch.cat(recommendations, 1)
        # val holds how often each distinct recommended item was chosen.
        idx, val = torch.unique(torch.cat(recommendations),
                                return_counts=True)
        # Mean of `info` per step and trajectory.
        hr = (ave_score.float() /
              (self.trajectory_len * self.n_trajectories)).cpu().numpy()
        self.hit_rate.append(hr)
        # Pad the counts with zeros for items that were never recommended so
        # the Gini value is computed over all nb_item items.
        val_ = torch.cat(
            (val.float(),
             torch.zeros(self.nb_item - len(val)).to(self.device)))
        g = gini(val_.cpu().numpy())
        self.gini_coefficient.append(g)
        return memory
# Report the first-best allocation and collect its values by name.
print('First best output {}, capital {}, interest rate {}, hours {} and labour supply {}'.format(fb_Y, fb_K, fb_r, fb_H, fb_L))
# eval() is applied only to the fixed list of variable names given here.
results_FB = dict( (name, eval(name)) for name in ['fb_Y', 'fb_K', 'fb_r', 'fb_w', 'fb_H', 'fb_L'])

#==== Calculate Incomplete Market Results ===#
# Root of Gamma_IM in r on the bracket [-0.95 * delta, (1 - beta) / beta].
eqm_r_IM = brentq(Gamma_IM, -fp.delta*.95, (1-cp.beta)/cp.beta, xtol = tol_brent)
# NOTE(review): cp.r is only assigned from eqm_r_IM further below, so this
# call presumably relies on the state left in cp by the root finder's
# evaluations of Gamma_IM - confirm.
im_r, im_w, im_Lambda, im_K, im_L, im_H, im_coefvar, im_a, im_z_rlz, im_h_val, im_l_val, im_policy = compute_agg_prices(cp,z_seq, social =1)
im_Y = KL_to_Y(im_K, im_L, fp)
# Gini values of im_a and im_z_rlz.
im_gini_a = gini(im_a)
im_gini_i = gini(im_z_rlz)
# Collect the incomplete-market results by name (same fixed-name eval pattern).
results_IM = dict( (name, eval(name)) for name in ['im_r', 'im_w', 'im_Lambda', 'im_K', 'im_L',\
                                                   'im_H', 'im_coefvar', 'im_a', 'im_z_rlz', 'im_h_val',\
                                                   'im_l_val', 'im_policy', 'im_Y', 'im_gini_a', 'im_gini_i'])
print('Incomplete market capital {}, interest rate {}, hours {} and labour supply {} and Lambda {}'.format(im_K, im_r, im_H, im_L, im_Lambda))
# Store the equilibrium rate and the values derived from it on cp.
cp.r = eqm_r_IM
cp.R = 1+cp.r
cp.w = r_to_w(cp.r,fp, cp)
# ((r + delta) / alpha) ** (1 / (alpha - 1))
cap_lab_ratio = ((cp.r+fp.delta)/fp.alpha)**(1/(fp.alpha-1))
def selection():
    """Build the salary table for the currently selected widgets.

    Reads the selected variable, campus, college, department and exclusion
    count from the module-level widgets and filters ``salaries`` to match.

    Returns:
        A 3-tuple of
        - the filtered frame sorted by ascending ``value`` (NaN first), with
          the last ``excludeSlider.value`` rows dropped when that is positive,
        - the median of ``value_scaled`` over the whole filtered frame,
        - ``gini`` of ``value_scaled`` over the whole filtered frame.
    """
    if selectVariable.value == 'Previous Salary (AY 2016-2017)':
        var = 'cursalaryperfte'
    else:
        var = 'newsalaryperfte'

    campus = selectCampus.value
    college = selectCollege.value
    dept = selectDept.value

    # A positive slider value drops that many rows from the end of the
    # sorted frame; otherwise every row is kept.
    exclude = (slice(None, -excludeSlider.value)
               if excludeSlider.value > 0 else slice(None))

    row_mask = ((salaries.campus == campus)
                & (salaries.college == college)
                & (salaries.dept == dept))
    wanted_columns = ['empname', 'empdepttitle', var]
    df = salaries.loc[row_mask, wanted_columns].rename(columns={var: 'value'})

    def _label(row):
        # Axis label: rank followed by the employee name.
        return '{:g} {}'.format(row['Rank'], row['empname'])

    df['Rank'] = df['value'].rank(ascending=False)
    df['ylabel'] = df.apply(_label, axis=1)
    df['value_scaled'] = df['value'] / 1000

    ordered = df.sort_values('value', na_position='first', ascending=True)
    shown = ordered.iloc[exclude]
    median_scaled = df['value_scaled'].quantile(0.5)
    inequality = gini(df['value_scaled'])
    return shown, median_scaled, inequality
def cal_wos_paper_divs():
    """Compute reference-diversity statistics for every selected WOS paper.

    Papers published between 1980 and 2004 (inclusive) that have between 4
    and 100 references are selected. For each one the Gini value, mean and
    standard deviation of several per-reference quantities are computed
    (year difference, c2/c5/c10 values and their percentiles) together with
    a subject diversity, and stored as a list under the paper id.

    Outputs:
        data/pid_divs.json                the per-paper value lists
        fig/citation_distritbuion.png     log-log plot of how often
                                          references are cited
        fig/refnum_distribution.png       log-log plot of reference counts
    Summary counts are printed to stdout.
    """
    pid_pubyear, pid_subjects, pid_topsubjs, pid_teamsize = load_basic_data()

    # pid c2 / c5 / c10
    pid_c2 = _read_json_file('../WOS_data_processing/data/pid_c2.json')
    pid_c5 = _read_json_file('../WOS_data_processing/data/pid_c5.json')
    pid_c10 = _read_json_file('../WOS_data_processing/data/pid_c10.json')

    # subject-subject similarity
    subj_subj_sim = _read_json_file(
        '../WOS_data_processing/data/subj_subj_sim.json')

    # Percentiles of the c2, c5 and c10 values.
    c2_percentile = nums_to_percentile_dict(pid_c2.values())
    c5_percentile = nums_to_percentile_dict(pid_c5.values())
    c10_percentile = nums_to_percentile_dict(pid_c10.values())

    subj_totalnum = float(len(subj_subj_sim.keys()))

    pid_divs = {}

    progress = 0
    sub_progress = 0

    total_paper_num = 0
    total_cit_num = 0

    selected_paper_num = 0
    selected_cit_num = 0

    cn_dis = defaultdict(int)  # how often each reference id is cited
    ref_nums = defaultdict(int)  # number of papers per reference count

    with open('../WOS_data_processing/data/pid_refs.txt') as refs_file:
        for line in refs_file:
            line = line.strip()
            progress += 1

            pid_refs = json.loads(line)

            for pid in pid_refs.keys():
                sub_progress += 1

                if sub_progress % 1000000 == 0:
                    logging.info('progress:{},sub progress {} ...'.format(
                        progress, sub_progress))

                pubyear = int(pid_pubyear.get(pid, 9999))

                # Keep papers from 1980 on; drop those published after 2004.
                if pubyear > 2004 or pubyear < 1980:
                    continue

                total_paper_num += 1
                total_cit_num += len(pid_refs[pid])

                ref_nums[len(pid_refs[pid])] += 1

                if len(pid_refs[pid]) < 4 or len(pid_refs[pid]) > 100:
                    continue

                selected_cit_num += len(pid_refs[pid])
                selected_paper_num += 1

                # Per paper: year differences, subject diversity,
                # c2/c5/c10 diversity and their percentile counterparts.
                yds = []
                subjs = []
                subj_nums = []
                c2s = []
                c5s = []
                c10s = []

                c2ps = []
                c5ps = []
                c10ps = []

                for ref_id in pid_refs[pid]:
                    yds.append(abs(int(pid_pubyear[ref_id]) - pubyear))
                    c2s.append(pid_c2.get(ref_id, 0))
                    c5s.append(pid_c5.get(ref_id, 0))
                    c10s.append(pid_c10.get(ref_id, 0))

                    c2ps.append(c2_percentile[pid_c2.get(ref_id, 0)])
                    c5ps.append(c5_percentile[pid_c5.get(ref_id, 0)])
                    c10ps.append(c10_percentile[pid_c10.get(ref_id, 0)])

                    # Look the subjects up once with a default, so a
                    # reference without subject data counts as zero subjects
                    # instead of raising KeyError on the second access.
                    ref_subjs = pid_subjects.get(ref_id, [])
                    subj_nums.append(len(ref_subjs))
                    subjs.extend(ref_subjs)

                    cn_dis[ref_id] += 1

                # Diversity of the paper's references from the values above.
                yd_div = gini(yds)
                c2_div = gini(c2s)
                c5_div = gini(c5s)
                c10_div = gini(c10s)

                # Means and standard deviations.
                yd_mean = np.mean(yds)
                yd_std = np.std(yds)

                c2_mean = np.mean(c2s)
                c2_std = np.std(c2s)

                c5_mean = np.mean(c5s)
                c5_std = np.std(c5s)

                c10_mean = np.mean(c10s)
                c10_std = np.std(c10s)

                c2p_div = gini(c2ps)
                c5p_div = gini(c5ps)
                c10p_div = gini(c10ps)

                c2p_mean = np.mean(c2ps)
                c2p_std = np.std(c2ps)

                c5p_mean = np.mean(c5ps)
                c5p_std = np.std(c5ps)

                c10p_mean = np.mean(c10ps)
                c10p_std = np.std(c10ps)

                subjs = list(set(subjs))

                # With at most one distinct subject the diversity is 0.
                if len(subjs) <= 1:
                    subj_div = 0
                else:
                    subj_div = cal_subj_div(subj_totalnum, subj_nums, subjs,
                                            subj_subj_sim)

                pid_divs[pid] = [
                    yd_div, subj_div, c2_div, c5_div, c10_div, yd_mean,
                    yd_std, c2_mean, c2_std, c5_mean, c5_std, c10_mean,
                    c10_std, c2p_div, c5p_div, c10p_div, c2p_mean, c2p_std,
                    c5p_mean, c5p_std, c10p_mean, c10p_std
                ]

    with open('data/pid_divs.json', 'w') as divs_file:
        divs_file.write(json.dumps(pid_divs))

    logging.info('{} papers div data saved to data/pid_divs.json'.format(
        len(pid_divs.keys())))

    # Print the summary statistics collected so far.
    print('===============================')
    print('Total paper num:', total_paper_num, ',total num of citation links:',
          total_cit_num)
    print('reserved paper num:', selected_paper_num,
          ',reserved num of citation links:', selected_cit_num)

    # Plot the distribution of citation counts among the kept references.
    cc_counter = Counter(cn_dis.values())
    xs = []
    ys = []
    for cc in sorted(cc_counter.keys()):
        if cc == 100:
            print('Number of papers cited 100 times:', cc_counter[cc])
        xs.append(cc)
        ys.append(cc_counter[cc])

    plt.figure(figsize=(7, 5))
    plt.plot(xs, ys, 'o', fillstyle='none')
    plt.xscale('log')
    plt.yscale('log')
    plt.xlabel('number of citations')
    plt.ylabel('number of publications')
    plt.tight_layout()
    plt.savefig('fig/citation_distritbuion.png', dpi=400)

    # Plot the distribution of reference counts (at most 100 references).
    xs = []
    ys = []
    for rn in sorted(ref_nums.keys()):
        if rn > 100:
            continue
        xs.append(rn)
        ys.append(ref_nums[rn])

    plt.figure(figsize=(7, 5))
    plt.plot(xs, ys)
    plt.xlabel('number of references')
    plt.ylabel('number of publications')
    plt.xscale('log')
    plt.yscale('log')
    plt.tight_layout()
    plt.savefig('fig/refnum_distribution.png', dpi=400)
    print('DONE')


def _read_json_file(path):
    """Parse the JSON document stored at ``path`` and close the file."""
    with open(path) as handle:
        return json.loads(handle.read())
import numpy as np
from gini import gini

# Inputs noted as returning None (left disabled):
#   gini(np.array([]))
#   gini(np.array({1, 2}))
#   gini(np.array('bob'))

# Sample inputs, each followed by the output that was noted for it.
# NOTE(review): the noted values equal the base-2 Shannon entropy of each
# sample (e.g. 1.0 for two equally frequent labels), not the value a Gini
# impurity would give (0.5) - confirm what gini() is meant to return.
_SAMPLES = (
    [0, 0, 0, 0, 0, 0],  # 0.0
    [6],  # 0.0
    ['a', 'a', 'b', 'b'],  # 1.0
    ['0', '0', '1', '0', 'bob', '1'],  # 1.4591479170272448
    [0, 0, 1, 0, 2, 1],  # 1.4591479170272448
    ['0', 'bob', '1'],  # 1.584962500721156
    [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],  # 0.0
    [0., 1., 1., 1., 1., 1., 1., 1., 1., 1.],  # 0.4689955935892812
    [0, 0, 1],  # 0.9182958340544896
)

for _sample in _SAMPLES:
    print(gini(np.array(_sample)))
def find_measures(year, month=0, pnt=False):
    """Compute ownership-concentration measures per firm and save them as CSV.

    Reads ``Shareholder<year>.csv`` (or ``Shareholder<year>_<month>.csv``
    when ``month`` is non-zero), aggregates the shareholder rows per
    ``Symbol`` and derives the concentration measures. Power-index columns
    from an existing measures file are merged in when that file can be read,
    otherwise they are initialised to NaN; ``fill_shapley_banzhaf`` then
    fills them. The result is written to ``Measures<year>[_<month>].csv``.

    Args:
        year: year of the shareholder file.
        month: month of the shareholder file; ``0`` selects the yearly file.
        pnt: forwarded to ``fill_shapley_banzhaf`` as ``time_pnt``.

    Returns:
        A status string naming the measures file that was written.

    Note:
        The working directory is changed to the hard-coded data folder.
    """
    # Compare by value: ``month is 0`` tested object identity, which is not
    # a reliable equality check for integers.
    if month == 0:  # Yearly
        file_name_holderdata = "Shareholder" + str(year) + ".csv"
        file_name_measures = "Measures" + str(year) + ".csv"
        path = r"C:\Users\Mahdi\OneDrive\Master Thesis\Data"
    else:
        file_name_holderdata = "Shareholder" + str(year) + '_' + str(
            month) + ".csv"
        file_name_measures = "Measures" + str(year) + '_' + str(month) + ".csv"
        path = r"C:\Users\Mahdi\OneDrive\Master Thesis\Data\MonthlyShareholder"
    os.chdir(path)
    SDATA = pd.read_csv(file_name_holderdata, index_col=0).drop_duplicates()

    # Creating the dataframe that holds the concentration measures
    CMdf = SDATA.groupby('Symbol', as_index=False).agg({
        'Id_tse': 'first',
        'Industry': 'first',
        'percent': 'sum',
        'ShareHolder': 'count',
        'MarketCap': 'first'
    }).rename(columns={
        'ShareHolder': 'Num_holders',
        'percent': 'sum_over1'
    })

    CMdf.reset_index(drop=True, inplace=True)
    Orginal_Size = len(CMdf)
    print('Number of observed firms in year ', str(year), ' is : ',
          Orginal_Size)

    # Largest Owner
    temp = SDATA.groupby('Symbol', as_index=False).agg({
        'percent': 'max'
    }).rename(columns={'percent': 'Largest_Owner'})
    CMdf = pd.merge(CMdf,
                    temp,
                    left_on='Symbol',
                    right_on='Symbol',
                    how='left')

    # First/Second
    CMdf = _merge_percent_measure(
        CMdf, SDATA, 'First_Second',
        lambda x: max(x) / nth_max(x, nth=2, interval=False))

    # First/SumToFour
    CMdf = _merge_percent_measure(
        CMdf, SDATA, 'First_Sumtwofour',
        lambda x: max(x) / sum(nth_max(x, nth=[2, 4], interval=True)))

    # Sum of the largest five, four, three and two holdings
    CMdf = _merge_percent_measure(
        CMdf, SDATA, 'Sumfive',
        lambda x: sum(nth_max(x, nth=[1, 5], interval=True)))
    CMdf = _merge_percent_measure(
        CMdf, SDATA, 'Sumfour',
        lambda x: sum(nth_max(x, nth=[1, 4], interval=True)))
    CMdf = _merge_percent_measure(
        CMdf, SDATA, 'Sumthree',
        lambda x: sum(nth_max(x, nth=[1, 3], interval=True)))
    CMdf = _merge_percent_measure(
        CMdf, SDATA, 'Sumtwo',
        lambda x: sum(nth_max(x, nth=[1, 2], interval=True)))

    # Gini
    CMdf = _merge_percent_measure(CMdf, SDATA, 'Gini',
                                  lambda x: gini(list(x)))

    # Herfindahl
    CMdf = _merge_percent_measure(
        CMdf, SDATA, 'Herfindhal',
        lambda x: sum([(t / 100)**2 for t in list(x)]))

    # Shapley-Shubik
    # For refilling: reuse the power-index columns of an existing measures
    # file. Any ordinary failure (e.g. missing file or columns) means this is
    # a first run; KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        os.chdir(path)
        CMdf_load = pd.read_csv(file_name_measures)
        CMdf = pd.merge(CMdf,
                        CMdf_load[[
                            'Symbol', 'SSCL', 'SSCO', 'SSDL', 'SSDO', 'BZCL',
                            'BZCO', 'BZDL'
                        ]],
                        left_on='Symbol',
                        right_on='Symbol',
                        how='left')
        print('RE-FILL!')
    except Exception:
        print('NEW!')
        # For the first time
        # Initiating columns
        CMdf['SSCL'] = np.nan
        CMdf['SSCO'] = np.nan
        CMdf['SSDL'] = np.nan
        CMdf['SSDO'] = np.nan
        CMdf['BZCL'] = np.nan
        CMdf['BZCO'] = np.nan
        CMdf['BZDL'] = np.nan

    data = fill_shapley_banzhaf(data=CMdf,
                                SDATA=SDATA,
                                fast_mode=True,
                                time_pnt=pnt,
                                major_thr=10)

    CMdf = data['CMdf']
    print('len(Errors): ', len(data['Errors']))

    Output_Size = len(CMdf)
    print('Orginal Size is ', Orginal_Size, ' and output size is: ',
          Output_Size)
    os.chdir(path)
    CMdf.to_csv(file_name_measures)
    return (file_name_measures + ' is done!\n')


def _merge_percent_measure(cmdf, sdata, name, func):
    """Aggregate ``percent`` per Symbol with ``func`` and left-merge it as ``name``.

    The aggregation spec and both renames are the ones every measure used
    inline before.
    NOTE(review): the ``(name, '<lambda>')`` rename presumably targets the
    two-level column label produced by the set-valued aggregation spec -
    confirm against the installed pandas version.
    """
    temp = sdata.groupby('Symbol', as_index=False).agg({
        'percent': {func}
    }).rename(columns={'percent': name})
    return pd.merge(cmdf,
                    temp,
                    left_on='Symbol',
                    right_on='Symbol',
                    how='left').rename(columns={(name, '<lambda>'): name})