def lreshape(data: DataFrame, groups, dropna=True, label=None):
    """
    Reshape wide-format data to long. Generalized inverse of ``DataFrame.pivot``.

    Accepts a dictionary, `groups`, in which each key is a new column name and
    each value is a list of old column names that will be "melted" under the
    new column name as part of the reshape.

    Parameters
    ----------
    data : DataFrame
        The wide-format DataFrame.
    groups : dict
        Dictionary in the form: `{new_name : list_of_columns}`.
    dropna : bool, default: True
        If True, rows in the reshaped frame that contain NaN in the melted
        columns are dropped.
    label : optional
        Deprecated parameter.

    Returns
    -------
    DataFrame
        Reshaped DataFrame.
    """
    if not isinstance(data, DataFrame):
        raise ValueError(
            "can not lreshape with instance of type {}".format(type(data))
        )
    ErrorMessage.default_to_pandas("`lreshape`")
    return DataFrame(
        pandas.lreshape(to_pandas(data), groups, dropna=dropna, label=label)
    )
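A minimal usage sketch of the `{new_name: list_of_columns}` mapping described in the docstring above, written against plain `pandas.lreshape` (the call this wrapper defaults to); the two-visit frame below is hypothetical and not taken from the surrounding code:

import pandas as pd

wide = pd.DataFrame({
    'id': [101, 102],
    'visitdt1': ['11jan2009', '22dec2008'],
    'visitdt2': ['21jan2009', None],
    'wt1': [1823.0, 3338.0],
    'wt2': [2011.0, None],
})
# Each key becomes a new long-format column; each list names the wide columns
# melted under it. 'id' is not listed in any group, so it is repeated per melt.
long = pd.lreshape(wide, {'visitdt': ['visitdt1', 'visitdt2'],
                          'wt': ['wt1', 'wt2']})
# With dropna=True (the default), the second visit of id 102, whose melted
# values are all NaN, is dropped from the long frame.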
def import_table_doctopic(self):
    """Import data into doctopic table"""
    src_file = self.get_source_file('output-doc-topics')  # self.config['output-doc-topics']
    doctopic = pd.read_csv(src_file, sep='\t', header=None)
    doc = pd.DataFrame(doctopic.iloc[:, 1])
    doc.columns = ['doc_tmp']
    doc['src_doc_id'] = doc.doc_tmp.apply(lambda x: x.split(',')[0])
    doc['doc_label'] = doc.doc_tmp.apply(lambda x: x.split(',')[1])
    doc = doc[['src_doc_id', 'doc_label']]
    doc.index.name = 'doc_id'
    doctopic.drop(1, axis=1, inplace=True)
    doctopic.rename(columns={0: 'doc_id'}, inplace=True)
    y = [col for col in doctopic.columns[1:]]
    doctopic_narrow = pd.lreshape(doctopic, {'topic_weight': y})
    doctopic_narrow['topic_id'] = [i for i in range(self.config['num-topics'])
                                   for doc_id in doctopic['doc_id']]
    doctopic_narrow = doctopic_narrow[['doc_id', 'topic_id', 'topic_weight']]
    doctopic_narrow.set_index(['doc_id', 'topic_id'], inplace=True)
    doctopic_narrow['topic_weight_zscore'] = stats.zscore(doctopic_narrow.topic_weight)
    dtm = doctopic_narrow.reset_index()\
        .set_index(['doc_id', 'topic_id'])['topic_weight'].unstack()
    dtm.to_csv(self.tables_dir + 'DOCTOPIC.csv')
    doc.to_csv(self.tables_dir + 'DOC.csv')
    doctopic_narrow.to_csv(self.tables_dir + 'DOCTOPIC_NARROW.csv')
def lreshape(data: DataFrame, groups, dropna=True, label=None):
    if not isinstance(data, DataFrame):
        raise ValueError(
            "can not lreshape with instance of type {}".format(type(data))
        )
    ErrorMessage.default_to_pandas("`lreshape`")
    return DataFrame(
        pandas.lreshape(to_pandas(data), groups, dropna=dropna, label=label)
    )
def import_table_doctopic(self, src_file=None):
    """Import data into doctopic table"""
    if not src_file:
        src_file = self.mallet['train-topics']['output-doc-topics']
    if 'doc-topics-threshold' in self.mallet['train-topics']:
        DOC = []
        DOCTOPIC = []
        src = PoloFile(src_file)
        for line in src[1:]:
            row = line.split('\t')
            row.pop()  # Pretty sure this is right
            doc_id = row[0]
            src_doc_id = int(row[1].split(',')[0])
            doc_label = row[1].split(',')[1]
            DOC.append([doc_id, src_doc_id, doc_label])
            for i in range(2, len(row), 2):
                topic_id = row[i]
                topic_weight = row[i + 1]
                DOCTOPIC.append([doc_id, topic_id, topic_weight])
        doctopic = pd.DataFrame(DOCTOPIC, columns=['doc_id', 'topic_id', 'topic_weight'])
        doctopic.set_index(['doc_id', 'topic_id'], inplace=True)
        doctopic['topic_weight_zscore'] = stats.zscore(doctopic.topic_weight)
        self.computed_thresh = round(doctopic.topic_weight.quantile(self.cfg_tw_quantile), 3)
        doc = pd.DataFrame(DOC, columns=['doc_id', 'src_doc_id', 'doc_label'])
        doc.set_index('doc_id', inplace=True)
        self.put_table(doctopic, 'doctopic', index=True)
        self.put_table(doc, 'doc', index=True)
    else:
        doctopic = pd.read_csv(src_file, sep='\t', header=None)
        doc = pd.DataFrame(doctopic.iloc[:, 1])
        doc.columns = ['doc_tmp']
        doc['src_doc_id'] = doc.doc_tmp.apply(lambda x: int(x.split(',')[0]))
        doc['doc_label'] = doc.doc_tmp.apply(lambda x: x.split(',')[1])
        doc = doc[['src_doc_id', 'doc_label']]
        doc.index.name = 'doc_id'
        self.put_table(doc, 'doc', index=True)
        doctopic.drop(1, axis=1, inplace=True)
        doctopic.rename(columns={0: 'doc_id'}, inplace=True)
        y = [col for col in doctopic.columns[1:]]
        doctopic_narrow = pd.lreshape(doctopic, {'topic_weight': y})
        doctopic_narrow['topic_id'] = [i for i in range(self.cfg_num_topics)
                                       for doc_id in doctopic['doc_id']]
        doctopic_narrow = doctopic_narrow[['doc_id', 'topic_id', 'topic_weight']]
        doctopic_narrow.set_index(['doc_id', 'topic_id'], inplace=True)
        doctopic_narrow['topic_weight_zscore'] = stats.zscore(doctopic_narrow.topic_weight)
        self.computed_thresh = round(doctopic_narrow.topic_weight
                                     .quantile(self.cfg_tw_quantile), 3)
        self.put_table(doctopic_narrow, 'doctopic', index=True)
    # todo: Revisit this; is this the best place to do this?
    self.set_config_item('computed_thresh', self.computed_thresh)
def generalize_country_to_region(
        workdata,
        column: str,
        countries=pd.read_csv(
            'https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv',
            usecols=['name', 'alpha-2', 'alpha-3', 'region'])):
    """
    Replaces the country names and codes in the given column of the DataFrame
    with the name of the region the country belongs to.

    :param workdata: the WorkData instance that holds the DataFrame
    :param column: the name of the column
    :param countries: the file containing the countries, codes, and regions
    :return:
    """
    reshaped = pd.lreshape(countries, {
        'country': ['name', 'alpha-2', 'alpha-3'],
        'region': ['region', 'region', 'region']
    }, dropna=False)
    dictionary = dict(zip(reshaped['country'], reshaped['region']))
    workdata.df[column] = workdata.df[column].map(dictionary)
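A minimal sketch of the lreshape step above, using a tiny two-row stand-in for the ISO table instead of the full CSV, to show how the three identifier columns are melted into one lookup column:

import pandas as pd

countries = pd.DataFrame({
    'name': ['Hungary', 'Japan'],
    'alpha-2': ['HU', 'JP'],
    'alpha-3': ['HUN', 'JPN'],
    'region': ['Europe', 'Asia'],
})
# 'name', 'alpha-2' and 'alpha-3' are stacked into a single 'country' column,
# with the matching 'region' value repeated alongside each of them.
reshaped = pd.lreshape(countries, {
    'country': ['name', 'alpha-2', 'alpha-3'],
    'region': ['region', 'region', 'region'],
}, dropna=False)
dictionary = dict(zip(reshaped['country'], reshaped['region']))
# dictionary['Japan'], dictionary['JP'] and dictionary['JPN'] all map to 'Asia',
# so any spelling of the country in a column can be replaced via .map(dictionary).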
def count_unique_heros():
    # load the dataset
    features = pd.read_csv('features.csv', index_col='match_id')
    # get the number of records in the dataset
    rows_num = features.shape[0]
    # fill the gaps in the features
    for f, n in features.count().items():
        if n != rows_num:
            features[f].fillna(features[f].mean(), inplace=True)
    values = []
    # collect the names of the hero features
    for f in list(features.columns):
        if 'hero' in f:
            values.append(f)
    # reshape the matrix into a vector
    df1 = pd.lreshape(features, {'hero': values})
    # print the number of unique values (heroes)
    print(df1['hero'].value_counts().shape[0])
def test_pairs(self):
    data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', '21dec2008',
                        '11jan2009'],
            'birthwt': [1766, 3301, 1454, 3139, 4133],
            'id': [101, 102, 103, 104, 105],
            'sex': ['Male', 'Female', 'Female', 'Female', 'Female'],
            'visitdt1': ['11jan2009', '22dec2008', '04jan2009', '29dec2008',
                         '20jan2009'],
            'visitdt2': ['21jan2009', nan, '22jan2009', '31dec2008',
                         '03feb2009'],
            'visitdt3': ['05feb2009', nan, nan, '02jan2009', '15feb2009'],
            'wt1': [1823, 3338, 1549, 3298, 4306],
            'wt2': [2011.0, nan, 1892.0, 3338.0, 4575.0],
            'wt3': [2293.0, nan, nan, 3377.0, 4805.0]}

    df = DataFrame(data)

    spec = {'visitdt': ['visitdt%d' % i for i in range(1, 4)],
            'wt': ['wt%d' % i for i in range(1, 4)]}
    result = lreshape(df, spec)

    exp_data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008',
                            '21dec2008', '11jan2009', '08jan2009',
                            '30dec2008', '21dec2008', '11jan2009',
                            '08jan2009', '21dec2008', '11jan2009'],
                'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 1454, 3139,
                            4133, 1766, 3139, 4133],
                'id': [101, 102, 103, 104, 105, 101, 103, 104, 105, 101,
                       104, 105],
                'sex': ['Male', 'Female', 'Female', 'Female', 'Female',
                        'Male', 'Female', 'Female', 'Female', 'Male',
                        'Female', 'Female'],
                'visitdt': ['11jan2009', '22dec2008', '04jan2009',
                            '29dec2008', '20jan2009', '21jan2009',
                            '22jan2009', '31dec2008', '03feb2009',
                            '05feb2009', '02jan2009', '15feb2009'],
                'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0,
                       1892.0, 3338.0, 4575.0, 2293.0, 3377.0, 4805.0]}
    exp = DataFrame(exp_data, columns=result.columns)
    tm.assert_frame_equal(result, exp)

    result = lreshape(df, spec, dropna=False)
    exp_data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008',
                            '21dec2008', '11jan2009', '08jan2009',
                            '20dec2008', '30dec2008', '21dec2008',
                            '11jan2009', '08jan2009', '20dec2008',
                            '30dec2008', '21dec2008', '11jan2009'],
                'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 3301, 1454,
                            3139, 4133, 1766, 3301, 1454, 3139, 4133],
                'id': [101, 102, 103, 104, 105, 101, 102, 103, 104, 105,
                       101, 102, 103, 104, 105],
                'sex': ['Male', 'Female', 'Female', 'Female', 'Female',
                        'Male', 'Female', 'Female', 'Female', 'Female',
                        'Male', 'Female', 'Female', 'Female', 'Female'],
                'visitdt': ['11jan2009', '22dec2008', '04jan2009',
                            '29dec2008', '20jan2009', '21jan2009', nan,
                            '22jan2009', '31dec2008', '03feb2009',
                            '05feb2009', nan, nan, '02jan2009',
                            '15feb2009'],
                'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, nan,
                       1892.0, 3338.0, 4575.0, 2293.0, nan, nan, 3377.0,
                       4805.0]}
    exp = DataFrame(exp_data, columns=result.columns)
    tm.assert_frame_equal(result, exp)

    spec = {'visitdt': ['visitdt%d' % i for i in range(1, 3)],
            'wt': ['wt%d' % i for i in range(1, 4)]}
    pytest.raises(ValueError, lreshape, df, spec)
def test_pairs(self):
    data = {
        "birthdt": ["08jan2009", "20dec2008", "30dec2008", "21dec2008", "11jan2009"],
        "birthwt": [1766, 3301, 1454, 3139, 4133],
        "id": [101, 102, 103, 104, 105],
        "sex": ["Male", "Female", "Female", "Female", "Female"],
        "visitdt1": ["11jan2009", "22dec2008", "04jan2009", "29dec2008", "20jan2009"],
        "visitdt2": ["21jan2009", np.nan, "22jan2009", "31dec2008", "03feb2009"],
        "visitdt3": ["05feb2009", np.nan, np.nan, "02jan2009", "15feb2009"],
        "wt1": [1823, 3338, 1549, 3298, 4306],
        "wt2": [2011.0, np.nan, 1892.0, 3338.0, 4575.0],
        "wt3": [2293.0, np.nan, np.nan, 3377.0, 4805.0],
    }
    df = DataFrame(data)

    spec = {
        "visitdt": [f"visitdt{i:d}" for i in range(1, 4)],
        "wt": [f"wt{i:d}" for i in range(1, 4)],
    }
    result = lreshape(df, spec)

    exp_data = {
        "birthdt": ["08jan2009", "20dec2008", "30dec2008", "21dec2008", "11jan2009",
                    "08jan2009", "30dec2008", "21dec2008", "11jan2009",
                    "08jan2009", "21dec2008", "11jan2009"],
        "birthwt": [1766, 3301, 1454, 3139, 4133, 1766, 1454, 3139, 4133,
                    1766, 3139, 4133],
        "id": [101, 102, 103, 104, 105, 101, 103, 104, 105, 101, 104, 105],
        "sex": ["Male", "Female", "Female", "Female", "Female", "Male", "Female",
                "Female", "Female", "Male", "Female", "Female"],
        "visitdt": ["11jan2009", "22dec2008", "04jan2009", "29dec2008", "20jan2009",
                    "21jan2009", "22jan2009", "31dec2008", "03feb2009",
                    "05feb2009", "02jan2009", "15feb2009"],
        "wt": [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, 1892.0, 3338.0,
               4575.0, 2293.0, 3377.0, 4805.0],
    }
    exp = DataFrame(exp_data, columns=result.columns)
    tm.assert_frame_equal(result, exp)

    result = lreshape(df, spec, dropna=False)
    exp_data = {
        "birthdt": ["08jan2009", "20dec2008", "30dec2008", "21dec2008", "11jan2009",
                    "08jan2009", "20dec2008", "30dec2008", "21dec2008", "11jan2009",
                    "08jan2009", "20dec2008", "30dec2008", "21dec2008", "11jan2009"],
        "birthwt": [1766, 3301, 1454, 3139, 4133, 1766, 3301, 1454, 3139, 4133,
                    1766, 3301, 1454, 3139, 4133],
        "id": [101, 102, 103, 104, 105, 101, 102, 103, 104, 105,
               101, 102, 103, 104, 105],
        "sex": ["Male", "Female", "Female", "Female", "Female",
                "Male", "Female", "Female", "Female", "Female",
                "Male", "Female", "Female", "Female", "Female"],
        "visitdt": ["11jan2009", "22dec2008", "04jan2009", "29dec2008", "20jan2009",
                    "21jan2009", np.nan, "22jan2009", "31dec2008", "03feb2009",
                    "05feb2009", np.nan, np.nan, "02jan2009", "15feb2009"],
        "wt": [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, np.nan, 1892.0,
               3338.0, 4575.0, 2293.0, np.nan, np.nan, 3377.0, 4805.0],
    }
    exp = DataFrame(exp_data, columns=result.columns)
    tm.assert_frame_equal(result, exp)

    with tm.assert_produces_warning(FutureWarning):
        lreshape(df, spec, dropna=False, label="foo")

    spec = {
        "visitdt": [f"visitdt{i:d}" for i in range(1, 3)],
        "wt": [f"wt{i:d}" for i in range(1, 4)],
    }
    msg = "All column lists must be same length"
    with pytest.raises(ValueError, match=msg):
        lreshape(df, spec)
def main():
    evaluate_run = False
    results_folder = os.path.join(os.getcwd(), "results_" + walk + "/" + experiment)

    if not os.path.isdir(results_folder):
        print(colored("Error, " + results_folder + " does not exist", 'red'))
    else:
        print(colored("OK, " + results_folder + " exists", 'green'))

    rr = {}
    rb = {}
    br = {}
    bb = {}

    for timeout_folder in natsorted(os.listdir(os.path.join(results_folder))):
        print(colored("Timeout folder:", 'blue'), timeout_folder)
        if timeout_folder.endswith("pickle"):
            continue

        parameters = timeout_folder.split("_")
        for param in parameters:
            if param.startswith("timeout"):
                timeout = int(param.split("#")[-1]) * 10
                # print("\t timeoutR:",timeoutR)
        if timeout == -1:
            print(colored("\tWARNING: wrong timeout folder", 'red'))
            continue

        if os.path.isfile(os.path.join(results_folder,
                                       pickle_file_root + "_timeout#" + str(timeout) + "_.pickle")):
            run_memory_mean = pd.read_pickle(
                os.path.join(results_folder,
                             pickle_file_root + "_timeout#" + str(timeout) + "_.pickle"))
            print(colored(pickle_file_root + "_timeout#" + str(timeout) +
                          "_.pickle already exists for timeout:" + str(timeout), 'green'))
        else:
            # print(colored(
            #     os.path.join(results_folder, pickle_file_root"_timeout#"+str(timeout*10)+"_.pickle"),
            #     'red'))
            # sys.exit()
            for filename in natsorted(os.listdir(os.path.join(results_folder, timeout_folder))):
                filename_seed = filename.split("_")[0]
                # print(filename)
                if filename.endswith("areaLOG_client.tsv"):
                    if not os.path.getsize(os.path.join(results_folder, timeout_folder, filename)) > 0:
                        print(colored("\tWARNING, empty file at:" + filename, 'red'))
                        continue
                    # print('\tfilename: ', filename)
                    df_area_client = pd.read_csv(os.path.join(results_folder, timeout_folder, filename),
                                                 sep="\t", header=None)

                if filename.endswith("areaLOG_server.tsv"):
                    if not os.path.getsize(os.path.join(results_folder, timeout_folder, filename)) > 0:
                        print(colored("\tWARNING, empty file at:" + filename, 'red'))
                        continue
                    # print('\tfilename: ', filename)
                    df_area_server = pd.read_csv(os.path.join(results_folder, timeout_folder, filename),
                                                 sep="\t", header=None)

                # if filename.endswith("taskLOG_client.tsv"):
                #     if not os.path.getsize(os.path.join(results_folder, timeout_folder, filename)) > 0:
                #         print(colored("\tWARNING, empty file at:" + filename, 'red'))
                #         continue
                #     # print('\tfilename: ', filename)
                #     df_task_client = pd.read_csv(os.path.join(results_folder, timeout_folder, filename), sep="\t",
                #                                  header=None)
                #
                # if filename.endswith("taskLOG_server.tsv"):
                #     if not os.path.getsize(os.path.join(results_folder, timeout_folder, filename)) > 0:
                #         print(colored("\tWARNING, empty file at:" + filename, 'red'))
                #         continue
                #     # print('\tfilename: ', filename)
                #     df_task_server = pd.read_csv(os.path.join(results_folder, timeout_folder, filename), sep="\t",
                #                                  header=None)

                if filename.endswith("kiloLOG_client.tsv"):
                    if not os.path.getsize(os.path.join(results_folder, timeout_folder, filename)) > 0:
                        print(colored("\tWARNING, empty file at:" + filename, 'red'))
                        continue
                    # print('\tfilename: ', filename)
                    df_kilo_client = pd.read_csv(os.path.join(results_folder, timeout_folder, filename),
                                                 sep="\t", header=None)

                if filename.endswith("kiloLOG_server.tsv"):
                    if not os.path.getsize(os.path.join(results_folder, timeout_folder, filename)) > 0:
                        print(colored("\tWARNING, empty file at:" + filename, 'red'))
                        continue
                    # print('\tfilename: ', filename, end='\n')
                    df_kilo_server = pd.read_csv(os.path.join(results_folder, timeout_folder, filename),
                                                 sep="\t", header=None)
                    evaluate_run = True

                if evaluate_run:
                    '''Kilo log part'''
                    if len(df_kilo_client.columns) > 145:
                        # print("Cutting null elements in client kilo df")
                        df_kilo_client.drop(df_kilo_client.columns[len(df_kilo_client.columns) - 1],
                                            axis=1, inplace=True)
                    if len(df_kilo_server.columns) > 145:
                        # print("Cutting null elements in server kilo df")
                        df_kilo_server.drop(df_kilo_server.columns[len(df_kilo_server.columns) - 1],
                                            axis=1, inplace=True)

                    col_kilo_labels = ['time']
                    for i in range(0, len(df_kilo_server.columns) - 1, 6):
                        # print(i,end=", ")
                        col_kilo_labels += ['id' + str(i // 6), 'state' + str(i // 6), 'posx' + str(i // 6),
                                            'posy' + str(i // 6), 'ori' + str(i // 6), 'same_state' + str(i // 6)]
                    col_kilo_to_drop = []
                    for i in range((len(df_kilo_server.columns) - 1) // 6):
                        # print(i,end=", ")
                        col_kilo_to_drop += ['same_state' + str(i)]

                    df_kilo_server.columns = col_kilo_labels
                    df_kilo_client.columns = col_kilo_labels
                    df_kilo_server = df_kilo_server.drop(col_kilo_to_drop, axis=1)
                    df_kilo_client = df_kilo_client.drop(col_kilo_to_drop, axis=1)

                    '''Completed task LOG part'''
                    # task_label = ['time', 'id', 'creationTime', 'completitionTime', 'color', 'contained']
                    # df_task_client.columns = task_label

                    '''Area LOG part'''
                    col_area_labels = ['time']
                    for i in range(0, len(df_area_server.columns) - 2, 6):
                        # print(i, end=", ")
                        col_area_labels += ['id' + str(i // 6), 'posx' + str(i // 6), 'posy' + str(i // 6),
                                            'color' + str(i // 6), 'completed' + str(i // 6), 'contained' + str(i // 6)]

                    # Remove last empty col and assign labels to df_area_server
                    if len(df_area_server.columns) > 49:
                        # print("Cutting null elements in area server df")
                        df_area_server.drop(df_area_server.columns[len(df_area_server.columns) - 1],
                                            axis=1, inplace=True)
                    df_area_server.columns = col_area_labels

                    # First df_area_client row contains garbage
                    # so is substituted with the second row except for the time,
                    # then remove Nan values in [:,49:]
                    if len(df_area_client.columns) > 49:
                        # print("Cutting null elements in area client df")
                        df_area_client.loc[0, 1:] = df_area_client.loc[1, 1:]
                        df_area_client = df_area_client.drop(np.arange(49, len(df_area_client.columns)), axis=1)
                    df_area_client.columns = col_area_labels

                    area_color_label = []
                    for i in range(num_areas):
                        area_color_label += ["color" + str(i)]
                        # print("color"+str(i))
                    areas_client_color = df_area_client[area_color_label].iloc[0, :].values
                    areas_server_color = df_area_server[area_color_label].iloc[0, :].values
                    # print(areas_client_color)
                    # print(areas_server_color)

                    area_pos_label = []
                    for i in range(num_areas):
                        area_pos_label += ["posx" + str(i)]
                        area_pos_label += ["posy" + str(i)]
                    areas_pos = df_area_client[area_pos_label].iloc[0, :].values
                    # print(areas_pos)
                    areas_pos = areas_pos.reshape(-1, 2)

                    color_list = ["color" + str(i) for i in range(num_areas)]
                    df_area3_s = df_area_server.iloc[:1, :][color_list]
                    df_area3_c = df_area_client.iloc[:1, :][color_list]
                    for i, idx in enumerate(range(1, len(df_area3_c.columns) * 2, 2)):
                        # print(i, ' ', idx)
                        df_area3_c.insert(loc=idx, column='other_col' + str(i), value=df_area3_s.iloc[0][i])

                    client = [col for col in df_area3_c.columns if 'color' in col]
                    server = [col for col in df_area3_c.columns if 'other_col' in col]
                    df_area_colors = pd.lreshape(df_area3_c, {'color_client': client,
                                                              'color_server': server})

                    area_type = []
                    for area in df_area_colors.values:
                        if area[0] == 0 and area[1] == 0:
                            area_type += ['BB']
                        if area[0] == 0 and area[1] == 1:
                            area_type += ['BR']
                        if area[0] == 1 and area[1] == 0:
                            area_type += ['RB']
                        if area[0] == 1 and area[1] == 1:
                            area_type += ['RR']
                    df_area_colors.insert(loc=2, column='area_type', value=area_type)

                    '''Post process server'''
                    for i, kilo_id in enumerate(np.arange(1, len(df_kilo_server.columns), 5)):
                        # print(colored("kilo_id:" + str((kilo_id - 1) // 5), 'blue'))
                        # print(df_kilo_client.iloc[:20, kilo_id+2:kilo_id+4].values, end='\n\n')
                        kilo_pos = df_kilo_server.iloc[:, kilo_id + i + 2:kilo_id + i + 4].values
                        # print(kilo_pos)
                        in_area = np.empty(kilo_pos.shape[0], dtype=int)
                        in_area.fill(-1)
                        for area_idx, area_pos in enumerate(areas_pos):
                            # print(area_idx, ' ', area_pos)
                            dist = np.linalg.norm(kilo_pos - area_pos, axis=1)
                            # print(dist, end='\n\n')
                            in_area = np.where(dist < area_threshold,
                                               df_area_colors.iloc[area_idx][-1][::-1], in_area)
                            # in_area = np.where(in_area == -1, np.NaN, in_area)
                        # print(in_area)
                        df_kilo_server.insert(loc=int(kilo_id + i + 2), column='area_type' + str(i), value=in_area)

                    '''Post process client'''
                    for i, kilo_id in enumerate(np.arange(1, len(df_kilo_client.columns), 5)):
                        # print(colored("kilo_id:" + str((kilo_id - 1) // 5), 'blue'))
                        # print(df_kilo_client.iloc[:20, kilo_id+2:kilo_id+4].values, end='\n\n')
                        kilo_pos = df_kilo_client.iloc[:, kilo_id + i + 2:kilo_id + i + 4].values
                        # print(kilo_pos)
                        in_area = np.empty(kilo_pos.shape[0], dtype=int)
                        in_area.fill(-1)
                        for area_idx, area_pos in enumerate(areas_pos):
                            # print(area_idx,' ', area_pos)
                            dist = np.linalg.norm(kilo_pos - area_pos, axis=1)
                            # print(dist, end='\n\n')
                            in_area = np.where(dist < area_threshold,
                                               df_area_colors.iloc[area_idx][-1], in_area)
                            # in_area = np.where(in_area == -1, np.NaN, in_area)
                        # print(in_area)
                        df_kilo_client.insert(loc=int(kilo_id + i + 2), column='area_type' + str(i), value=in_area)

                    '''Here finally evaluated in which area the timeout elapses'''
                    kilo_resume = [["state" + str(i), "area_type" + str(i)] for i in range(num_robots)]
                    kilo_resume = np.reshape(kilo_resume, (-1))
                    server_kilo_resume = df_kilo_server.iloc[:][kilo_resume]
                    client_kilo_resume = df_kilo_client.iloc[:][kilo_resume]
                    total_exp_df = client_kilo_resume.join(server_kilo_resume, lsuffix='_c', rsuffix='_s')

                    if value_studied == "mean_timeout":
                        timeout_count = pd.DataFrame(columns=['RR', 'RB', 'BR', 'BB'])
                        for i in range(0, len(total_exp_df.columns), 2):
                            # print(total_exp_df.iloc[:50,i:i+2])
                            kilo_state = total_exp_df.iloc[:, i:i + 2]
                            kilo_state = kilo_state.replace(2, 3)
                            mask = (kilo_state[kilo_state.columns.values[0]].diff() == 2)
                            # print(kilo_state[mask])
                            # print(kilo_state[mask][kilo_state.columns.values[1]].value_counts(), end='\n\n')
                            robot_timeout = kilo_state[mask][kilo_state.columns.values[1]].value_counts().to_frame().T
                            # robot_timeout = pd.DataFrame(kilo_state[mask][kilo_state.columns.values[1]].value_counts(), columns=['RR, RB,BR,BB'])
                            # print(robot_timeout)
                            timeout_count = timeout_count.append(robot_timeout)
                            # print(robot_timeout, end='\n\n')
                        timeout_count = timeout_count.fillna(0)
                        single_run_mean = timeout_count.mean(axis=0)
                    else:
                        completed_area_count = pd.DataFrame(columns=['RR', 'RB', 'BR', 'BB'])
                        for i in range(0, len(total_exp_df.columns), 2):
                            # print(total_exp_df.iloc[:50,i:i+2])
                            kilo_state = total_exp_df.iloc[:, i:i + 2]
                            mask = (kilo_state[kilo_state.columns.values[0]].diff() == -1)
                            # print(kilo_state[mask])
                            # print(kilo_state[mask][kilo_state.columns.values[1]].value_counts(), end='\n\n')
                            robot_completed_area = kilo_state[mask][kilo_state.columns.values[1]].value_counts().to_frame().T
                            # robot_completed_area = pd.DataFrame(kilo_state[mask][kilo_state.columns.values[1]].value_counts(), columns=['RR, RB,BR,BB'])
                            # print(robot_completed_area)
                            completed_area_count = completed_area_count.append(robot_completed_area)
                            # print(robot_completed_area, end='\n\n')
                        completed_area_count = completed_area_count.fillna(0)
                        single_run_mean = completed_area_count.mean(axis=0)

                    single_df = single_run_mean.to_frame().T
                    single_df.index = [filename_seed]

                    if os.path.isfile(os.path.join(results_folder,
                                                   pickle_file_root + "_timeout#" + str(timeout) + "_.pickle")):
                        run_memory_mean = pd.read_pickle(
                            os.path.join(results_folder,
                                         pickle_file_root + "_timeout#" + str(timeout) + "_.pickle"))
                        run_memory_mean = run_memory_mean.append(single_df)
                        run_memory_mean.to_pickle(
                            os.path.join(results_folder,
                                         pickle_file_root + "_timeout#" + str(timeout) + "_.pickle"))
                        print("Timeout:", timeout, end=", ")
                        print("Appending mean run, file size: ", run_memory_mean.shape)
                    else:
                        print("Timeout:", timeout, end=", ")
                        print("Writing mean run")
                        single_df.to_pickle(
                            os.path.join(results_folder,
                                         pickle_file_root + "_timeout#" + str(timeout) + "_.pickle"))

                    evaluate_run = False

        rr[timeout] = run_memory_mean['RR'].values
        rb[timeout] = run_memory_mean['RB'].values
        br[timeout] = run_memory_mean['BR'].values
        bb[timeout] = run_memory_mean['BB'].values

    if value_studied == "mean_timeout":
        figureName = 'meanElapsedTimeout'
    else:
        figureName = 'meanCompletedAreas'
    figureName += '_groupsize' + groupSize + '_' + experiment + '_' + walk

    print("rr", rr)
    boxplots_utils.grouped_4_boxplot(rr, rb, br, bb, y_lim, figureName)
wc_full_data = wc_full_data.sort_values(by=['Type', 'SKU'])

is_french = wc_full_data['SKU'].astype(str).str.contains('_fr')
wc_data_french = wc_full_data[is_french]
wc_data = wc_full_data[is_french == False]

# Clean the invalid name
wc_data['Name'] = wc_data['Name'].astype(str)
wc_data = wc_data[wc_data['Name'].str.contains('#REF!') == False]

# wc_data contains English rows
# wc_data_french contains French rows

attribute_keys = wc_data.columns[wc_data.columns.str.endswith(' name')]
attribute_values = wc_data.columns[wc_data.columns.str.endswith('value(s)')]

wc_data_attributes = pd.lreshape(wc_data, {
    'key': attribute_keys,
    'value': attribute_values
})
wc_data_attributes = wc_data_attributes.pivot(index='ID', columns='key', values='value')

wc_data = pd.merge(wc_data, wc_data_attributes, on='ID')
wc_data['slug'] = wc_data['Name'].apply(lambda x: slugify(x))

# %%
slug_mask = wc_data['slug'].duplicated(keep=False)
wc_data.loc[slug_mask, ['slug', 'Name']].sort_values(by=['Name'])
wc_data.loc[slug_mask, 'slug'] += wc_data.groupby('slug').cumcount().add(1).astype(str)

wc_data['new_sku'] = wc_data['SKU'].fillna('').apply(lambda x: x.split('-')[0])
def make_link_and_node_df(sankey_df, num_steps: int, dropna=False):
    """Takes a df in the following format (output of make_sankey_df):

        |   0   |   1   | ... | num | step_0 | step_1 | ...
    ----+-------+-------+-----+-----+--------+--------+-----
     1  | cat_1 | cat_2 | ... |  2  |   0    |   8    | ...
     2  | cat_2 | None  | ... | 10  |   1    |   9    | ...
    ...

    Returns link_df:

        | source | target | num
    ----+--------+--------+-----
     0  |   0    |   8    | 114
     1  |   1    |   9    |  57
    ...

    Returns node_df:

        | source | label | step
    ----+--------+-------+--------
     0  |   0    | cat_1 | step_0
     1  |   1    | cat_2 | step_0
    ...
    """
    # reshape into source-target
    steps = range(num_steps)
    link_df = pd.lreshape(
        sankey_df,
        groups={
            'source': [f'step_{step}' for step in steps[:-1]],
            'target': [f'step_{step}' for step in steps[1:]]
        })[['source', 'target', 'num']]
    link_df = link_df.groupby(['source', 'target']).sum().reset_index()

    # get index labels
    node_df = pd.lreshape(
        sankey_df,
        groups={
            'source': [f'step_{step}' for step in steps],
            'label': steps
        })[['source', 'label']].drop_duplicates().sort_values('source').reset_index(drop=True)

    # link source indices to step
    step_source = sankey_df[[f'step_{step}' for step in steps]].to_dict(orient='list')
    step_source = {k: list(set(v)) for k, v in step_source.items()}
    source_step_dict = {}
    for k, v in step_source.items():
        for source in v:
            source_step_dict[source] = k
    node_df['step'] = node_df['source'].apply(lambda x: source_step_dict[x])

    if dropna is True:
        # generate new indices for link_df
        step_stack_df = pd.lreshape(link_df, {'step_stack': ['source', 'target']})[['step_stack']]
        step_stack_df['new_idx'] = step_stack_df['step_stack'].astype('category').cat.codes
        step_stack_df = step_stack_df.drop_duplicates()
        replace_dict = dict(zip(step_stack_df['step_stack'], step_stack_df['new_idx']))
        link_df.loc[:, ['source', 'target']] = link_df.loc[:, ['source', 'target']].replace(
            replace_dict)  # reassign missing keys
        # filter out missing keys from node_df
        node_df = node_df[(node_df['source'].isin(replace_dict.keys()))]
        node_df.loc[:, 'source'] = node_df.loc[:, 'source'].replace(replace_dict)  # reassign missing keys

    return link_df, node_df
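A hypothetical call matching the docstring format above (two steps, integer-named label columns 0 and 1, a 'num' flow count and 'step_0'/'step_1' index columns); the values are made up purely for illustration:

import pandas as pd

sankey_df = pd.DataFrame({
    0: ['cat_1', 'cat_2'],   # label of the node at step 0
    1: ['cat_2', 'cat_3'],   # label of the node at step 1
    'num': [2, 10],          # flow size between the two nodes
    'step_0': [0, 1],        # node index at step 0
    'step_1': [8, 9],        # node index at step 1
})
link_df, node_df = make_link_and_node_df(sankey_df, num_steps=2)
# link_df holds one source/target/num row per (step_0, step_1) pair;
# node_df maps each node index to its label and to the step it belongs to.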
def process():
    input_json = json.load(open('input.json', 'r'))
    m_i_df = pd.read_pickle(input_json['data_pickle'])  # input_df
    # result = current_app.
    print('proc till tolist')

    m_i_df["away_players"] = m_i_df[['away1_id', 'away2_id', 'away3_id', 'away4_id', "away5_id", "away6_id",
                                     "away7_id", 'away8_id', 'away9_id', 'away10_id', 'away11_id']].values.tolist()
    m_i_df["home_players"] = m_i_df.loc[:, ['home1_id', 'home2_id', 'home3_id', 'home4_id', 'home5_id', 'home6_id',
                                            'home7_id', 'home8_id', 'home9_id', 'home10_id', 'home11_id']].values.tolist()

    m_i_df["score_home"], m_i_df["score_away"] = m_i_df["score"].str.split('-', 1).str
    m_i_df["score_home"] = m_i_df["score_home"].apply(int)
    m_i_df["score_away"] = m_i_df["score_away"].apply(int)

    m_i_df["away_players"] = m_i_df[['away1_id', 'away2_id', 'away3_id', 'away4_id', "away5_id", "away6_id",
                                     "away7_id", 'away8_id', 'away9_id', 'away10_id', 'away11_id']].values.tolist()
    m_i_df["home_players"] = m_i_df.loc[:, ['home1_id', 'home2_id', 'home3_id', 'home4_id', 'home5_id', 'home6_id',
                                            'home7_id', 'home8_id', 'home9_id', 'home10_id', 'home11_id']].values.tolist()

    m_i_df["away_players_bd"] = m_i_df["away_players"].apply(eletkor_to_list)
    m_i_df["home_players_bd"] = m_i_df["home_players"].apply(eletkor_to_list)
    m_i_df["away_players_bd"] = m_i_df[['date', 'away_players_bd', ]].values.tolist()
    m_i_df["home_players_bd"] = m_i_df[['date', 'home_players_bd', ]].values.tolist()

    m_i_df["avg_home_age"] = m_i_df["home_players_bd"].apply(calculate_age)
    m_i_df["avg_away_age"] = m_i_df["away_players_bd"].apply(calculate_age)
    m_i_df["max_age"] = m_i_df["home_players_bd"].apply(calculate_max_age)
    m_i_df["max_age_place"] = m_i_df["home_players_bd"].apply(calculate_max_age_place)
    m_i_df["AVG_HEIGHT_H"] = m_i_df["home_players"].apply(avg_height)
    m_i_df["AVG_HEIGHT_A"] = m_i_df["away_players"].apply(avg_height)
    m_i_df["stad_cap_a"] = m_i_df.away_team.apply(stad_cap)
    m_i_df["stad_cap_h"] = m_i_df.home_team.apply(stad_cap)
    m_i_df["all_goals"] = m_i_df["score_away"] + m_i_df["score_home"]
    m_i_df["goal_difference"] = m_i_df["score_away"] - m_i_df["score_home"]

    m_i_df_proba = pd.concat([m_i_df[:],
                              m_i_df["away_players"].apply(height_to_list).apply(pd.Series)
                              .rename(columns=lambda x: 'heights_a_' + str(x + 1))[:]], axis=1)
    m_i_df_proba = pd.concat([m_i_df_proba[:],
                              m_i_df["home_players"].apply(height_to_list).apply(pd.Series)
                              .rename(columns=lambda x: 'heights_h_' + str(x + 1))[:]], axis=1)

    pos_height_df = pd.lreshape(
        m_i_df_proba,
        {'position': ['away1_pos', 'away2_pos', 'away3_pos', 'away4_pos', "away5_pos", "away6_pos", "away7_pos",
                      'away8_pos', 'away9_pos', 'away10_pos', 'away11_pos',
                      'home1_pos', 'home2_pos', 'home3_pos', 'home4_pos', 'home5_pos', 'home6_pos', 'home7_pos',
                      'home8_pos', 'home9_pos', 'home10_pos', 'home11_pos'],
         'heights': ['heights_a_1', 'heights_a_2', 'heights_a_3', 'heights_a_4', 'heights_a_5', 'heights_a_6',
                     "heights_a_7", "heights_a_8", "heights_a_9", 'heights_a_10', 'heights_a_11',
                     'heights_h_1', 'heights_h_2', 'heights_h_3', 'heights_h_4', 'heights_h_5', 'heights_h_6',
                     "heights_h_7", "heights_h_8", "heights_h_9", 'heights_h_10', 'heights_h_11']}
    ).pipe(lambda x: x[["position", "heights"]])
    pos_height_df["heights"] = pos_height_df["heights"].apply(try_height_2)

    m_i_df = pd.concat([m_i_df[:],
                        m_i_df["away_players_bd"].apply(calculate_valamelyik_age).apply(pd.Series)
                        .rename(columns=lambda x: 'ages_a_' + str(x + 1))[:]], axis=1)
    m_i_df = pd.concat([m_i_df[:],
                        m_i_df["home_players_bd"].apply(calculate_valamelyik_age).apply(pd.Series)
                        .rename(columns=lambda x: 'ages_h_' + str(x + 1))[:]], axis=1)

    pos_age_df = pd.lreshape(
        m_i_df,
        {'position': ['away1_pos', 'away2_pos', 'away3_pos', 'away4_pos', "away5_pos", "away6_pos", "away7_pos",
                      'away8_pos', 'away9_pos', 'away10_pos', 'away11_pos',
                      'home1_pos', 'home2_pos', 'home3_pos', 'home4_pos', 'home5_pos', 'home6_pos', 'home7_pos',
                      'home8_pos', 'home9_pos', 'home10_pos', 'home11_pos'],
         'ages': ['ages_a_1', 'ages_a_2', 'ages_a_3', 'ages_a_4', 'ages_a_5', 'ages_a_6', "ages_a_7", "ages_a_8",
                  "ages_a_9", 'ages_a_10', 'ages_a_11',
                  'ages_h_1', 'ages_h_2', 'ages_h_3', 'ages_h_4', 'ages_h_5', 'ages_h_6', "ages_h_7", "ages_h_8",
                  "ages_h_9", 'ages_h_10', 'ages_h_11']}
    ).pipe(lambda x: x)[["position", "ages"]]

    print("EDDIG ELJUT - PROCESS")

    new_df = pd.merge(
        m_i_df[["away_team", "home_team", "date", "avg_home_age", "avg_away_age", "result", "score"]],
        current_app.m_o_df[["odds", "odds1", "odds2", "oddsx", "team_id_at", "team_id_ht", "date", "score"]],
        how='left',
        left_on=["away_team", "home_team", "date", "score"],
        right_on=["team_id_at", "team_id_ht", "date", "score"])

    new_df[["odds", "odds1", "odds2", "oddsx"]] = new_df[["odds", "odds1", "odds2", "oddsx"]].applymap(oddsx_to_float)
    new_df = new_df.dropna(subset=['odds'])
    new_df["result"] = new_df["result"].apply(lambda x: float(x))
    new_df["win_as_udog_h"] = new_df.loc[:, ["odds1", "odds2", "result"]].apply(win_udog_h, axis=1)
    new_df["win_as_udog_a"] = new_df.loc[:, ["odds1", "odds2", "result"]].apply(win_udog_a, axis=1)
    new_df["lose_fav_h"] = new_df.loc[:, ["odds1", "odds2", "result"]].apply(lose_fav_h, axis=1)
    new_df["lose_fav_a"] = new_df.loc[:, ["odds1", "odds2", "result"]].apply(lose_fav_a, axis=1)
    new_df["looser"] = new_df.loc[:, ["team_id_ht", "team_id_at", "result"]].apply(looser, axis=1)
    new_df["looser_odds"] = new_df.loc[:, ["odds1", "odds2", "result"]].apply(looser_odds, axis=1)

    m_i_df["away_values"] = m_i_df.loc[:, ["date", "away_players"]].apply(player_value, axis=1)
    m_i_df["home_values"] = m_i_df.loc[:, ["date", "home_players"]].apply(player_value, axis=1)

    new_df = pd.merge(
        new_df,
        m_i_df[["away_values", "home_values", "away_team", "home_team", "date", "score"]],
        how='left',
        left_on=["away_team", "home_team", "date", "score"],
        right_on=["away_team", "home_team", "date", "score"])

    value_a = m_i_df['away_values'].apply(pd.Series)
    value_h = m_i_df['home_values'].apply(pd.Series)
    value_a = value_a.rename(columns=lambda x: 'value_a_' + str(x + 1))
    value_h = value_h.rename(columns=lambda x: 'value_h_' + str(x + 1))

    out = [
        {'most-used-formation': m_i_df[['home_formation', 'away_formation']].unstack().value_counts().index[0]},
        {'number-of-players-with-no-games': str(len(current_app.p_i_df) - len(current_app.p_i_df.loc[current_app.p_i_df.playerid.isin(pd.DataFrame(m_i_df[['away10_id', 'away11_id', 'away1_id', 'away2_id', 'away3_id', 'away4_id', "away5_id", "away6_id", "away7_id", 'away8_id', 'away9_id', 'home1_id', 'home2_id', 'home3_id', 'home4_id', 'home5_id', 'home6_id', 'home7_id', 'home8_id', 'home9_id', 'home10_id', 'home11_id']].unstack().unique())[0])]))},
        {'player-with-highest-number-of-games': str(m_i_df[['away10_id', 'away11_id', 'away1_id', 'away2_id', 'away3_id', 'away4_id', "away5_id", "away6_id", "away7_id", 'away8_id', 'away9_id', 'home1_id', 'home2_id', 'home3_id', 'home4_id', 'home5_id', 'home6_id', 'home7_id', 'home8_id', 'home9_id', 'home10_id', 'home11_id']].unstack().value_counts().index[0])},
        {'player-with-highest-number-of-games-where-his-team-didnt-concede': int(pd.concat([pd.DataFrame(m_i_df.loc[(m_i_df['score_home'] == 0)][['home1_id', 'home2_id', 'home3_id', 'home4_id', 'home5_id', 'home6_id', 'home7_id', 'home8_id', 'home9_id', 'home10_id', 'home11_id']]), pd.DataFrame(m_i_df.loc[(m_i_df['score_away'] == 0)][['away10_id', 'away11_id', 'away1_id', 'away2_id', 'away3_id', 'away4_id', "away5_id", "away6_id", "away7_id", 'away8_id', 'away9_id']])]).unstack().value_counts().index[0])},
        {'most-games-played-in-same-position-by-player': str(pd.lreshape(m_i_df, {'player_id': ['away10_id', 'away11_id', 'away1_id', 'away2_id', 'away3_id', 'away4_id', "away5_id", "away6_id", "away7_id", 'away8_id', 'away9_id', 'home1_id', 'home2_id', 'home3_id', 'home4_id', 'home5_id', 'home6_id', 'home7_id', 'home8_id', 'home9_id', 'home10_id', 'home11_id'], 'position': ['away10_pos', 'away11_pos', 'away1_pos', 'away2_pos', 'away3_pos', 'away4_pos', "away5_pos", "away6_pos", "away7_pos", 'away8_pos', 'away9_pos', 'home1_pos', 'home2_pos', 'home3_pos', 'home4_pos', 'home5_pos', 'home6_pos', 'home7_pos', 'home8_pos', 'home9_pos', 'home10_pos', 'home11_pos']}).pipe(lambda x: x)[["player_id", "position"]].groupby(pd.lreshape(m_i_df, {'player_id': ['away10_id', 'away11_id', 'away1_id', 'away2_id', 'away3_id', 'away4_id', "away5_id", "away6_id", "away7_id", 'away8_id', 'away9_id', 'home1_id', 'home2_id', 'home3_id', 'home4_id', 'home5_id', 'home6_id', 'home7_id', 'home8_id', 'home9_id', 'home10_id', 'home11_id'], 'position': ['away10_pos', 'away11_pos', 'away1_pos', 'away2_pos', 'away3_pos', 'away4_pos', "away5_pos", "away6_pos", "away7_pos", 'away8_pos', 'away9_pos', 'home1_pos', 'home2_pos', 'home3_pos', 'home4_pos', 'home5_pos', 'home6_pos', 'home7_pos', 'home8_pos', 'home9_pos', 'home10_pos', 'home11_pos']}).pipe(lambda x: x)[["player_id", "position"]].columns.tolist(), as_index=False).size().max())},
        {'most-different-positions-by-player': str(pd.lreshape(m_i_df, {'player_id': ['away10_id', 'away11_id', 'away1_id', 'away2_id', 'away3_id', 'away4_id', "away5_id", "away6_id", "away7_id", 'away8_id', 'away9_id', 'home1_id', 'home2_id', 'home3_id', 'home4_id', 'home5_id', 'home6_id', 'home7_id', 'home8_id', 'home9_id', 'home10_id', 'home11_id'], 'position': ['away10_pos', 'away11_pos', 'away1_pos', 'away2_pos', 'away3_pos', 'away4_pos', "away5_pos", "away6_pos", "away7_pos", 'away8_pos', 'away9_pos', 'home1_pos', 'home2_pos', 'home3_pos', 'home4_pos', 'home5_pos', 'home6_pos', 'home7_pos', 'home8_pos', 'home9_pos', 'home10_pos', 'home11_pos']}).pipe(lambda x: x)[["player_id", "position"]][["player_id", "position"]].groupby('player_id')["position"].nunique().max())},
        {'most-different-formations-by-player': str(pd.lreshape(m_i_df, {'player_id': ['away10_id', 'away11_id', 'away1_id', 'away2_id', 'away3_id', 'away4_id', "away5_id", "away6_id", "away7_id", 'away8_id', 'away9_id', 'home1_id', 'home2_id', 'home3_id', 'home4_id', 'home5_id', 'home6_id', 'home7_id', 'home8_id', 'home9_id', 'home10_id', 'home11_id'], 'formation': ['away_formation', 'away_formation', 'away_formation', 'away_formation', 'away_formation', 'away_formation', 'away_formation', 'away_formation', 'away_formation', 'away_formation', 'away_formation', "home_formation", "home_formation", "home_formation", "home_formation", "home_formation", "home_formation", "home_formation", "home_formation", "home_formation", "home_formation", "home_formation"]}).pipe(lambda x: x)[["player_id", "formation"]].groupby('player_id')["formation"].nunique().max())},
        {'largest-odds-overcome-in-game': new_df[new_df["result"] != 0.0]["odds"].max()},
        {'largest-height-difference-overcome-in-game': m_i_df.loc[:, ["result", "AVG_HEIGHT_H", "AVG_HEIGHT_A"]].apply(height_diff_OC, axis=1).max()},
        {'longest-time-in-days-between-two-games-for-player': None},
        {'biggest-value-difference': str(m_i_df.loc[:, ["away_values", "home_values"]].apply(values_diff, axis=1).max())},
        {'biggest-value-difference-upset': int(abs(m_i_df.loc[:, ["away_values", "home_values", "result"]].apply(values_diff_ups, axis=1).min()))},  # an upset means the unexpected team won
        {'biggest-value-difference-with-higher-odds': int(new_df.loc[:, ["away_values", "home_values", "odds1", "odds2"]].apply(values_diff_ups_odds, axis=1).max())},
        {'biggest-stadium-capacity-difference-upset': None},
        {'capacity-of-stadium-of-team-with-most-games': pd.DataFrame(pd.lreshape(m_i_df, {'team_id': ['away_team', 'home_team'], 'seats': ["stad_cap_a", 'stad_cap_h']}).groupby(pd.lreshape(m_i_df, {'team_id': ['away_team', 'home_team'], 'seats': ["stad_cap_a", 'stad_cap_h']})[["team_id", "seats"]].columns.tolist(), as_index=False).size()).idxmax()[0][1]},
        {'id-of-oldest-team-to-win-a-game': id_of_oldest_team_to_win_a_game(m_i_df)},
        {'biggest-age-difference-between-teams-match-id': int(m_i_df.iloc[abs(m_i_df["avg_away_age"] - m_i_df["avg_home_age"]).idxmax(), :]["mkey"])},
        {'median-of-winning-team-average-age': (m_i_df.loc[:, ["result", "avg_away_age", "avg_home_age"]].apply(gyoztes_kor, axis=1)).median()},
        {'median-of-favorite-team-average-age': int(new_df.loc[:, ["odds1", "odds2", "avg_home_age", "avg_away_age"]].apply(fav_age, axis=1).median())},  # favorite means has lower odds of winning
        {'median-of-underdog-team-average-age': int(new_df.loc[:, ["odds1", "odds2", "avg_home_age", "avg_away_age"]].apply(udog_age, axis=1).median())},  # underdog means has higher odds of winning
        {'team-with-most-wins-as-underdog': pd.lreshape(new_df, {'team_id': ['team_id_at', 'team_id_ht'], 'wins_as_udog': ["win_as_udog_a", 'win_as_udog_h']}).pipe(lambda x: x[["team_id", "wins_as_udog"]]).groupby("team_id").agg({"wins_as_udog": "sum"})["wins_as_udog"].idxmax()},
        {'team-with-most-losses-as-favorite': pd.lreshape(new_df, {'team_id': ['team_id_at', 'team_id_ht'], 'lose_as_fav': ["lose_fav_a", 'lose_fav_h']}).pipe(lambda x: x[["team_id", "lose_as_fav"]]).groupby("team_id").agg({"lose_as_fav": "sum"})["lose_as_fav"].idxmax()},
        {'team-with-lowest-average-odds-of-draw': pd.lreshape(new_df, {'team_id': ['team_id_at', 'team_id_ht'], 'oddsx': ["oddsx", 'oddsx']}).pipe(lambda x: x[["team_id", "oddsx"]]).groupby("team_id").agg({"oddsx": "mean"})["oddsx"].idxmin()},
        {'position-with-highest-average-value': None},
        {'position-with-largest-average-height': pos_height_df.groupby("position").agg({"heights": "mean"})["heights"].idxmax()},
        {'position-with-youngest-average-age': pos_age_df.groupby("position").agg({"ages": "mean"})["ages"].idxmin()},
        {'goalkeeper-with-most-clean-sheets': None},  # birth date of the goalkeeper who concedes the most goals on average
        {'stadium-capactiy-of-team-with-most-avg-goals-in-a-game': None},  # stadium capacity of the team whose matches have the most goals on average
        {'team-with-highest-profit-for-losing': int(new_df[["looser_odds", "looser"]].groupby("looser")["looser_odds"].sum().idxmax())},  # the team that yields the largest total profit if you bet the same stake against them in every one of their matches
        {'largest-std-in-goal-difference-team': int(pd.lreshape(m_i_df, {'team_id': ['away_team', 'home_team'], 'goal_difference': ["goal_difference", 'goal_difference']}).pipe(lambda x: x[["team_id", "goal_difference"]]).groupby("team_id").agg({"goal_difference": "std"})["goal_difference"].idxmax())},  # the team with the largest standard deviation in goal difference
        {'player-with-most-different-teams': int(pd.lreshape(m_i_df, {'player_id': ['away10_id', 'away11_id', 'away1_id', 'away2_id', 'away3_id', 'away4_id', "away5_id", "away6_id", "away7_id", 'away8_id', 'away9_id', 'home1_id', 'home2_id', 'home3_id', 'home4_id', 'home5_id', 'home6_id', 'home7_id', 'home8_id', 'home9_id', 'home10_id', 'home11_id'], 'team_id': ['away_team', 'away_team', 'away_team', 'away_team', 'away_team', 'away_team', 'away_team', 'away_team', 'away_team', 'away_team', 'away_team', 'home_team', 'home_team', 'home_team', 'home_team', 'home_team', 'home_team', 'home_team', 'home_team', 'home_team', 'home_team', 'home_team']}).groupby("player_id")["team_id"].nunique().idxmax())},  # the player who has appeared for the most different teams
        {'longest-losing-streak-team': None},  # the longest losing streak by team
        {'longest-home-winning-streak-stadium-capacity': None},  # capacity of the venue of the longest home winning streak
        {'win-ratio-of-actual-highest-rated-player': None},  # average win ratio of the player who is most valuable at the given moment
        {'oldest-player-to-win-a-home-game': m_i_df.iloc[m_i_df[m_i_df["result"] == 1]["max_age"].idxmax(), :]["home_players"][m_i_df.iloc[m_i_df[m_i_df["result"] == 1]["max_age"].idxmax(), :]["max_age_place"][0]]}  # the oldest player to play in a winning home match
    ]

    json.dump(out, open('output.json', 'w'))
    return 'FING'
def compute(hi,d_hrs,d_divo,period=3,transform=1): #compute moments, period #says how many years correspond to one #period #Get Date at Interview hi.insert(0, 'IDN', range(0, len(hi))) hi['res']=hi['NUMUNION']+hi['NUMCOHMR'] #Get Duration bins bins_d=np.linspace(0,1200,int((100/period)+1)) bins_d_label=np.linspace(1,len(bins_d)-1,len(bins_d)-1) ########################## #Gen cohabitation Dataset ######################### #Get date at interview hi['int']=hi['IDATMM']+(hi['IDATYY']-1900)*12 #Gen age at interview hi['ageint']=round((((hi['IDATYY']-1900)*12+hi['IDATMM'])-hi['birth_month'])/12,0) #Take only if cohabitations coh=hi[(hi['NUMUNION']-hi['NUMMAR']>0) | (hi['NUMCOHMR']>0)].copy() #Create number of cohabitations coh['num']=0.0 for i in range(9): if(np.any(coh['HOWBEG0'+str(i+1)])=='coh'): coh.loc[coh['HOWBEG0'+str(i+1)]=='coh','num']=coh.loc[coh['HOWBEG0'+str(i+1)]=='coh','num']+1.0 #Expand the data cohe=coh.loc[coh.index.repeat(np.array(coh.num,dtype=np.int32))] #Link each cohabitation to relationship number cohe['rell'] = cohe.groupby(['IDN']).cumcount()+1 cohe['cou']=1 cohe['rel']=None for i in range(9): if(np.any(coh['HOWBEG0'+str(i+1)])=='coh'): cohe.loc[(cohe['HOWBEG0'+str(i+1)]=='coh') & (cohe['rell']==cohe['cou']),'rel']=i+1 cohe.loc[cohe['HOWBEG0'+str(i+1)]=='coh','cou']= cohe.loc[cohe['HOWBEG0'+str(i+1)]=='coh','cou']+1 #Get beginning and end of relationhip cohe['beg']=-1 cohe['endd']=-1 cohe['how']=-1 cohe['mar']=-1 for i in range(9): cohe.loc[(i+1==cohe['rel']),'beg']=cohe.loc[(i+1==cohe['rel']),'BEGDAT0'+str(i+1)] cohe.loc[(i+1==cohe['rel']),'endd']=cohe.loc[(i+1==cohe['rel']),'ENDDAT0'+str(i+1)] cohe.loc[(i+1==cohe['rel']),'how']=cohe.loc[(i+1==cohe['rel']),'HOWEND0'+str(i+1)] cohe.loc[(i+1==cohe['rel']),'mar']=cohe.loc[(i+1==cohe['rel']),'MARDAT0'+str(i+1)] #add here an indicator of whether it should be unilateral duvorce scenario #Get how relationship end cohe['fine']='censored' cohe.loc[cohe['how']=='sep','fine']='sep' cohe.loc[cohe['how']=='div','fine']='mar' cohe.loc[(cohe['how']=='intact') & (cohe['mar']>1),'fine']='mar' #Replace censored date if still together cohe['end']=-1 cohe.loc[cohe['fine']=='sep','end']=cohe.loc[cohe['fine']=='sep','endd'] cohe.loc[cohe['fine']=='mar','end']=cohe.loc[cohe['fine']=='mar','mar'] cohe.loc[cohe['fine']=='censored','end']=cohe.loc[cohe['fine']=='censored','int'] #Duration cohe['dur']=cohe['end']-cohe['beg'] #Keep if no error for duration cohe=cohe[(cohe['dur']>0) & (cohe['dur']<2000)] #Transform Duration in Years cohe['dury'] = pd.cut(x=cohe['dur'], bins=bins_d,labels=bins_d_label) cohe['dury']=cohe['dury'].astype(float) #Eliminate non useful things del coh ########################## #Gen marriage Dataset ######################### #Take only if marriages mar=hi[hi['NUMMAR']>0].copy() #Create number of cohabitations mar['num']=0 for i in range(9): mar.loc[mar['MARDAT0'+str(i+1)]>0,'num']=mar.loc[mar['MARDAT0'+str(i+1)]>0,'num']+1 #Expand the data mare=mar.loc[mar.index.repeat(mar.num)] #Link each marriage to relationship number mare['rell'] = mare.groupby(['IDN']).cumcount()+1 mare['cou']=1 mare['rel']=None for i in range(9): mare.loc[(mare['MARDAT0'+str(i+1)]>0) & (mare['rell']==mare['cou']),'rel']=i+1 mare.loc[mare['MARDAT0'+str(i+1)]>0,'cou']= mare.loc[mare['MARDAT0'+str(i+1)]>0,'cou']+1 #Get beginning and end of relationhip mare['beg']=-1 mare['endd']=-1 mare['how']=-1 mare['mar']=-1 for i in range(9): mare.loc[(i+1==mare['rel']),'beg']=mare.loc[(i+1==mare['rel']),'MARDAT0'+str(i+1)] 
mare.loc[(i+1==mare['rel']),'endd']=mare.loc[(i+1==mare['rel']),'ENDDAT0'+str(i+1)] mare.loc[(i+1==mare['rel']),'how']=mare.loc[(i+1==mare['rel']),'HOWEND0'+str(i+1)] #Get how relationship end mare['fine']='censored' mare.loc[mare['how']=='div','fine']='div' #Replace censored date if still together mare['end']=-1 mare.loc[mare['fine']=='div','end']=mare.loc[mare['fine']=='div','endd'] mare.loc[mare['fine']=='censored','end']=mare.loc[mare['fine']=='censored','int'] #Duration mare['dur']=mare['end']-mare['beg'] #Keep if no error for duration mare=mare[(mare['dur']>0) & (mare['dur']<2000)] #Transform Duration in Years mare['dury'] = pd.cut(x=mare['dur'], bins=bins_d,labels=bins_d_label) mare['dury']=mare['dury'].astype(float) del mar ############################# #Build relationship by month ############################## #Eliminate observation if info on beg-end not complete #for i in range(9): # hi=hi[(np.isfinite(hi['BEGDAT0'+str(i+1)])) & (hi['BEGDAT0'+str(i+1)]<3999)] #Get date in time at which the guy is 20,25...,50 (9) for j in range(7): hi['time_'+str(20+(j)*5)]=hi['DOBY']*12+hi['DOBM']+(20+(j)*5)*12 #Get the status for j in range(7): #Create the variable of Status hi['status_'+str(20+(j)*5)]='single' for i in range(9): if(np.any(hi['HOWBEG0'+str(i+1)])!=None): #Get if in couple hi.loc[(hi['time_'+str(20+(j)*5)]>=hi['BEGDAT0'+str(i+1)]) & (hi['BEGDAT0'+str(i+1)]<3999) & (((hi['time_'+str(20+(j)*5)]<=hi['ENDDAT0'+str(i+1)]) & (hi['ENDDAT0'+str(i+1)]>0)) | (hi['ENDDAT0'+str(i+1)]==0) | (hi['WIDDAT0'+str(i+1)]>0) ) ,'status_'+str(20+(j)*5)]='mar' if(np.any(hi['HOWBEG0'+str(i+1)])=='coh'): #Substitute if actually cohabitation hi.loc[(hi['time_'+str(20+(j)*5)]>=hi['BEGDAT0'+str(i+1)]) & (hi['BEGDAT0'+str(i+1)]<3999) & (((hi['time_'+str(20+(j)*5)]<=hi['ENDDAT0'+str(i+1)]) & (hi['ENDDAT0'+str(i+1)]>0)) | (hi['ENDDAT0'+str(i+1)]==0) | (hi['WIDDAT0'+str(i+1)]>0) ) & (hi['status_'+str(20+(j)*5)]=='mar') & (hi['HOWBEG0'+str(i+1)]=='coh') & ((hi['MARDAT0'+str(i+1)]==0) | (hi['MARDAT0'+str(i+1)]>hi['time_'+str(20+(j)*5)])) ,'status_'+str(20+(j)*5)]='coh' #Create the variables ever cohabited and ever married for j in range(7): #Create the variable of ever married or cohabit hi['everm_'+str(20+(j)*5)]=0.0 hi['everc_'+str(20+(j)*5)]=0.0 for i in range(9): #if(np.any(hi['HOWBEG0'+str(i+1)])=='coh'): #Get if ever cohabited #hi.loc[((hi['everc_'+str(20+(max(j-1,0))*5)]>=0.1) | ((hi['HOWBEG0'+str(i+1)]=='coh') & (hi['time_'+str(20+(j)*5)]>=hi['BEGDAT0'+str(i+1)]))),'everc_'+str(20+(j)*5)]=1.0 hi.loc[(hi['everc_'+str(20+(max(j-1,0))*5)]>=0.1),'everc_'+str(20+(j)*5)]=1.0 try: hi.loc[((hi['HOWBEG0'+str(i+1)]=='coh') & (hi['time_'+str(20+(j)*5)]>=hi['BEGDAT0'+str(i+1)])),'everc_'+str(20+(j)*5)]=1.0 except: pass #Get if ever married hi.loc[((hi['everm_'+str(20+(max(j-1,0))*5)]>=0.1) | (hi['time_'+str(20+(j)*5)]>=hi['MARDAT0'+str(i+1)])),'everm_'+str(20+(j)*5)]=1.0 ###################################### #Build employment by status in 1986 ###################################### empl=hi[(hi['M2DP01']=='FEMALE') & (hi['weeks']<99)].copy() empl['stat']='single' empl['dist']=99999 for j in range(7): empl.loc[np.abs(empl['time_'+str(20+(j)*5)]-86*12)<empl['dist'],'stat']=hi['status_'+str(20+(j)*5)] ########################## #BUILD HAZARD RATES ######################### #Hazard of Separation hazs=list() hazs=hazards(cohe,'sep','dury','fine',hazs,int(6/period),'SAMWT') #Hazard of Marriage hazm=list() hazm=hazards(cohe,'mar','dury','fine',hazm,int(6/period),'SAMWT') #Hazard of Divorce hazd=list() 
hazd=hazards(mare,'div','dury','fine',hazd,int(12/period),'SAMWT') #Eventually transform Hazards pooling more years together if transform>1: #Divorce hazdp=list() pop=1 for i in range(int(12/(period*transform))): haz1=hazd[transform*i]*pop haz2=hazd[transform*i+1]*(pop-haz1) hazdp=[(haz1+haz2)/pop]+hazdp pop=pop-(haz1+haz2) hazdp.reverse() hazdp=np.array(hazdp).T hazd=hazdp #Separation and Marriage hazsp=list() hazmp=list() pop=1 for i in range(int(6/(period*transform))): hazs1=hazs[transform*i]*pop hazm1=hazm[transform*i]*pop hazs2=hazs[transform*i+1]*(pop-hazs1-hazm1) hazm2=hazm[transform*i+1]*(pop-hazs1-hazm1) hazsp=[(hazs1+hazs2)/pop]+hazsp hazmp=[(hazm1+hazm2)/pop]+hazmp pop=pop-(hazs1+hazs2+hazm1+hazm2) hazsp.reverse() hazsp=np.array(hazsp).T hazs=hazsp hazmp.reverse() hazmp=np.array(hazmp).T hazm=hazmp ######################################## #Construct share of each relationship ####################################### mar=np.zeros(6) coh=np.zeros(6) emar=np.zeros(6) ecoh=np.zeros(6) for j in range(6): mar[j]=np.average(hi['status_'+str(20+(j)*5)]=='mar', weights=np.array(hi['SAMWT'])) coh[j]=np.average(hi['status_'+str(20+(j)*5)]=='coh', weights=np.array(hi['SAMWT'])) emar[j]=np.average(hi['everm_'+str(20+(j)*5)], weights=np.array(hi['SAMWT'])) ecoh[j]=np.average(hi['everc_'+str(20+(j)*5)], weights=np.array(hi['SAMWT'])) ######################################### #Create the age at unilateral divorce+ #regression on the effect of unilateral divorce ########################################### #Number of relationships for the person hi['numerl']=0.0 #List of variables to keep keep_var=list() keep_var=keep_var+['numerl']+['state']+['SAMWT'] for i in range(9): #Make sure that some relationship of order i exist if (np.any(hi['BEGDAT0'+str(i+1)])): #Add relationship order hi['order'+str(i+1)]=np.nan hi.loc[np.isnan(hi['BEGDAT0'+str(i+1)])==False,'order'+str(i+1)]=i+1 #Add number of relationships hi.loc[np.isnan(hi['BEGDAT0'+str(i+1)])==False,'numerl']+=1.0 #Get whether the relationship started in marriage or cohabitation hi['imar'+str(i+1)]=np.nan hi.loc[hi['HOWBEG0'+str(i+1)]=='coh','imar'+str(i+1)]=0.0 hi.loc[hi['HOWBEG0'+str(i+1)]=='mar','imar'+str(i+1)]=1.0 #Get age at relationship hi['iage'+str(i+1)]=np.nan hi.loc[np.isnan(hi['BEGDAT0'+str(i+1)])==False,'iage'+str(i+1)]=round((hi['BEGDAT0'+str(i+1)]-hi['birth_month'])/12) #Get if unilateral divorce when relationship started hi['unid'+str(i+1)]=np.nan hi.loc[np.isnan(hi['BEGDAT0'+str(i+1)])==False,'unid'+str(i+1)]=0.0 hi.loc[(round(hi['BEGDAT0'+str(i+1)]/12+1900)>=hi['unil']) & (hi['unil']>0.1),'unid'+str(i+1)]=1.0 #Year Realationship Started hi['year'+str(i+1)]=np.nan hi.loc[np.isnan(hi['BEGDAT0'+str(i+1)])==False,'year'+str(i+1)]=round(hi['BEGDAT0'+str(i+1)]/12+1900) #Keep variables keep_var=keep_var+['year'+str(i+1)]+['unid'+str(i+1)]+['iage'+str(i+1)]+['imar'+str(i+1)]+['order'+str(i+1)] #New Dataset to reshape hi2=hi[keep_var] #Reahspe Dataset years = ([col for col in hi2.columns if col.startswith('year')]) unids = ([col for col in hi2.columns if col.startswith('unid')]) iages = ([col for col in hi2.columns if col.startswith('iage')]) imars = ([col for col in hi2.columns if col.startswith('imar')]) order = ([col for col in hi2.columns if col.startswith('order')]) hi3 = pd.lreshape(hi2, {'year' : years,'unid' : unids,'iage' : iages,'imar' : imars,'order' : order}) #Eliminate if missing hi3.replace([np.inf, -np.inf], np.nan) hi3.dropna(subset=['imar','unid']) #Regression FE_ols = smf.wls(formula='imar ~ 
unid+C(iage)+C(state)+C(year)',weights=hi3['SAMWT'], data = hi3.dropna()).fit() #FE_ols = smf.ols(formula='imar ~ unid+C(iage)+C(state)+C(year)', data = hi3.dropna()).fit() beta_unid=FE_ols.params['unid'] #Get age at which unilateral divorced was introduced hi['age_unid']=0.0 hi.loc[hi['unil']==0,'age_unid']=1000.0 hi.loc[hi['unil']!=0,'age_unid']=hi['unil']-hi['birth'] #Get age in the second survey date_age=pd.read_csv('age_drop.csv') #From hi make '-1' if law changed before the guy starts hi.loc[hi['age_unid']<0,'age_unid']=-1 ############################## #Compute hours using the psid ################################ #Account for the survey to be retrospective d_hrs['age']=d_hrs['age']-1.0 #Trim if hrs>2000 d_hrs.loc[d_hrs['wls']>=2000,'wls']=2000 #First keep the right birth cohorts d_hrs['birth']=d_hrs['year']-d_hrs['age'] d_hrs=d_hrs[(d_hrs['birth']>=1940) & (d_hrs['birth']<1955)] #Generate variables of interest d_hrs['mar']=-1.0 d_hrs.loc[(d_hrs['mls']==1),'mar']=1.0 d_hrs.loc[(d_hrs['mls']>1) & (d_hrs['mls']<100),'mar']=0.0 #Get mean labor supply mean_fls=np.average(d_hrs.loc[(d_hrs['age']>=20) & (d_hrs['age']<=60),'wls'])/2000 #New dataset d_hrs2=d_hrs[(d_hrs['mar']>=0) & (d_hrs['year']>=1977)] #Get Ratio of Female to Male FLP #23-38-53 fls_ratio=np.zeros((2)) fls_ratio[0]=np.average(d_hrs2.loc[(d_hrs2['mar']==1.0) & (d_hrs['age']>=23) & (d_hrs['age']<=38),'wls'])/np.average(d_hrs2.loc[(d_hrs2['mar']==0.0) & (d_hrs['age']>=23) & (d_hrs['age']<=38),'wls']) fls_ratio[1]=np.average(d_hrs2.loc[(d_hrs2['mar']==1.0) & (d_hrs['age']>=38) & (d_hrs['age']<=53),'wls'])/np.average(d_hrs2.loc[(d_hrs2['mar']==0.0) & (d_hrs['age']>=38) & (d_hrs['age']<=53),'wls']) #Get difference in male wages in marriage and cohabitation weightm=d_hrs2.loc[(d_hrs2['mar']==1.0) & (np.isnan(d_hrs2['ln_ly'])==False),'wls'] weightc=d_hrs2.loc[(d_hrs2['mar']==0.0) & (np.isnan(d_hrs2['ln_ly'])==False),'wls'] wage_ratio=np.average(d_hrs2.loc[(d_hrs2['mar']==1.0) & (np.isnan(d_hrs2['ln_ly'])==False),'ln_ly'],weights=weightm)-np.average(d_hrs2.loc[(d_hrs2['mar']==0.0) & (np.isnan(d_hrs2['ln_ly'])==False),'ln_ly'],weights=weightc) ####################################### #Get divorce by income using PSID ######################################## divR=np.average(d_divo.loc[(d_divo['ln_ly']>d_divo['wtmedian']),'div']) divP=np.average(d_divo.loc[(d_divo['ln_ly']<d_divo['wtmedian']),'div']) marR=np.average(d_divo.loc[(d_divo['ln_ly']>d_divo['wtmedian']),'mar']) marP=np.average(d_divo.loc[(d_divo['ln_ly']<d_divo['wtmedian']),'mar']) div_ratio=(divR/marR)/(divP/marP) ######################################## #FREQENCIES ####################################### def CountFrequency(my_list): # Creating an empty dictionary freq = {} for item in my_list: if (item in freq): freq[item] += 1 else: freq[item] = 1 #for key, value in freq.items(): # print ("% d : % d"%(key, value)) return freq #Modify age unid freq_pc=dict() freq_pc['male'] = CountFrequency(hi.loc[hi['M2DP01']=='MALE','age_unid'].tolist()) freq_pc['female'] = CountFrequency(hi.loc[hi['M2DP01']=='FEMALE','age_unid'].tolist()) freq_pc['share_female']=np.mean(hi['M2DP01']=='FEMALE') #Frequencies for age in the second wave freq_i= CountFrequency(date_age['age'].tolist()) #Frequencies for age at intervire freq_ai=CountFrequency(hi['ageint'].tolist()) #Frequencies of agents by age at unid and gender freq_nsfh = hi[['M2DP01','age_unid','SAMWT']]#hi.groupby(['M2DP01','age_unid'])['SAMWT'].count() #Get distribution of types using the psid freq_psid_tot=d_hrs[['age','unid']] 
    freq_psid_par = d_hrs2[['age', 'unid', 'mar']]
    freq_psid_div = d_divo[['age', 'unid']]

    # Create a dictionary for saving simulated moments
    listofTuples = [("hazs", hazs), ("hazm", hazm), ("hazd", hazd), ("emar", emar), ("ecoh", ecoh),
                    ("fls_ratio", fls_ratio), ("wage_ratio", wage_ratio), ("div_ratio", div_ratio),
                    ("mean_fls", mean_fls), ("mar", mar), ("coh", coh), ("freq_pc", freq_pc),
                    ("freq_i", freq_i), ("beta_unid", beta_unid), ("freq_ai", freq_ai),
                    ("freq_nsfh", freq_nsfh), ("freq_psid_tot", freq_psid_tot),
                    ("freq_psid_par", freq_psid_par), ("freq_psid_div", freq_psid_div)]
    dic_mom = dict(listofTuples)

    del hi, hi2, hi3
    return dic_mom
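# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original pipeline): a minimal example
# of the wide-to-long reshape used above, where per-relationship columns
# (year1..yearK, imar1..imarK, ...) are melted with pd.lreshape into one row
# per person-relationship. The toy frame, its column names and values are
# assumptions made up purely for illustration.
import pandas as pd

wide = pd.DataFrame({'SAMWT': [1.0, 2.0],
                     'year1': [1975, 1980], 'year2': [1983, None],
                     'imar1': [1.0, 0.0], 'imar2': [0.0, None]})
long_df = pd.lreshape(wide, {'year': ['year1', 'year2'],
                             'imar': ['imar1', 'imar2']})
# With the default dropna=True the stacked row whose melted values are all
# missing (the second person has no second relationship) is dropped, leaving
# three person-relationship rows with columns SAMWT, year and imar.
print(long_df)
# ---------------------------------------------------------------------------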
def main():
    evaluate_run = False

    results_folder = os.path.join(os.getcwd(), "results_"+walk+"/" + experiment)
    if not os.path.isdir(results_folder):
        print(colored("Error, " + results_folder + " does not exist", 'red'))
    else:
        print(colored("OK, " + results_folder + " exists", 'green'))

    for timeout_folder in natsorted(os.listdir(os.path.join(results_folder))):
        if timeout_folder.endswith("pickle"):
            continue
        print(colored("Timeout folder:", 'blue'), timeout_folder)

        df_kilo_timeout = pd.DataFrame()

        timeout = -1
        parameters = timeout_folder.split("_")
        for param in parameters:
            if param.startswith("timeout"):
                timeout = int(param.split("#")[-1]) * 10
                # print("\t timeoutR:", timeoutR)
        if timeout == -1:
            print(colored("\tWARNING: wrong timeout folder", 'red'))
            continue

        if os.path.isfile(os.path.join(results_folder, timeout_folder, "kiloLOG_timeout#" + str(timeout) + "_.pickle")):
            print("Already exists ", os.path.join(results_folder, timeout_folder, "kiloLOG_timeout#" + str(timeout) + "_.pickle"))
        else:
            # print(colored(
            #     os.path.join(results_folder, pickle_file_root"_timeout#"+str(timeout*10)+"_.pickle"),
            #     'red'))
            # sys.exit()
            for filename in natsorted(os.listdir(os.path.join(results_folder, timeout_folder))):
                filename_seed = filename.split("_")[0].split("#")[-1]
                # print(filename)

                if filename.endswith("areaLOG_client.tsv"):
                    if not os.path.getsize(os.path.join(results_folder, timeout_folder, filename)) > 0:
                        print(colored("\tWARNING, empty file at:" + filename, 'red'))
                        continue
                    # print('\tfilename: ', filename)
                    df_area_client = pd.read_csv(os.path.join(results_folder, timeout_folder, filename), sep="\t", header=None)

                if filename.endswith("areaLOG_server.tsv"):
                    if not os.path.getsize(os.path.join(results_folder, timeout_folder, filename)) > 0:
                        print(colored("\tWARNING, empty file at:" + filename, 'red'))
                        continue
                    # print('\tfilename: ', filename)
                    df_area_server = pd.read_csv(os.path.join(results_folder, timeout_folder, filename), sep="\t", header=None)

                if filename.endswith("kiloLOG_client.tsv"):
                    if not os.path.getsize(os.path.join(results_folder, timeout_folder, filename)) > 0:
                        print(colored("\tWARNING, empty file at:" + filename, 'red'))
                        continue
                    # print('\tfilename: ', filename)
                    df_kilo_client = pd.read_csv(os.path.join(results_folder, timeout_folder, filename), sep="\t", header=None)

                if filename.endswith("kiloLOG_server.tsv"):
                    if not os.path.getsize(os.path.join(results_folder, timeout_folder, filename)) > 0:
                        print(colored("\tWARNING, empty file at:" + filename, 'red'))
                        continue
                    # print('\tfilename: ', filename, end='\n')
                    df_kilo_server = pd.read_csv(os.path.join(results_folder, timeout_folder, filename), sep="\t", header=None)
                    evaluate_run = True

                if evaluate_run:
                    print(colored("\tEvaluating run:" + filename_seed, 'blue'))

                    '''Kilo log part'''
                    if len(df_kilo_client.columns) > 145:
                        # print("Cutting null elements in client kilo df")
                        df_kilo_client.drop(df_kilo_client.columns[len(df_kilo_client.columns) - 1], axis=1, inplace=True)
                    if len(df_kilo_server.columns) > 145:
                        # print("Cutting null elements in server kilo df")
                        df_kilo_server.drop(df_kilo_server.columns[len(df_kilo_server.columns) - 1], axis=1, inplace=True)

                    col_kilo_labels = ['time']
                    for i in range(0, len(df_kilo_server.columns) - 1, 6):
                        # print(i, end=", ")
                        col_kilo_labels += ['id' + str(i // 6), 'state' + str(i // 6), 'posx' + str(i // 6),
                                            'posy' + str(i // 6), 'ori' + str(i // 6), 'same_state' + str(i // 6)]

                    col_kilo_to_drop = []
                    for i in range((len(df_kilo_server.columns) - 1) // 6):
                        # print(i, end=", ")
                        col_kilo_to_drop += ['same_state' + str(i)]
                    df_kilo_server.columns = col_kilo_labels
                    df_kilo_client.columns = col_kilo_labels
                    df_kilo_server = df_kilo_server.drop(col_kilo_to_drop, axis=1)
                    df_kilo_client = df_kilo_client.drop(col_kilo_to_drop, axis=1)

                    '''Area LOG part'''
                    col_area_labels = ['time']
                    for i in range(0, len(df_area_server.columns) - 2, 6):
                        # print(i, end=", ")
                        col_area_labels += ['id' + str(i // 6), 'posx' + str(i // 6), 'posy' + str(i // 6),
                                            'color' + str(i // 6), 'completed' + str(i // 6), 'contained' + str(i // 6)]

                    # Remove last empty col and assign labels to df_area_server
                    if len(df_area_server.columns) > 49:
                        # print("Cutting null elements in area server df")
                        df_area_server.drop(df_area_server.columns[len(df_area_server.columns) - 1], axis=1, inplace=True)
                    df_area_server.columns = col_area_labels

                    # The first df_area_client row contains garbage, so it is replaced with the second
                    # row (except for the time); then the NaN values in [:, 49:] are removed
                    if len(df_area_client.columns) > 49:
                        # print("Cutting null elements in area client df")
                        df_area_client.loc[0, 1:] = df_area_client.loc[1, 1:]
                        df_area_client = df_area_client.drop(np.arange(49, len(df_area_client.columns)), axis=1)
                    df_area_client.columns = col_area_labels

                    area_pos_label = []
                    for i in range(num_areas):
                        area_pos_label += ["posx" + str(i)]
                        area_pos_label += ["posy" + str(i)]
                    areas_pos = df_area_client[area_pos_label].iloc[0, :].values
                    # print(areas_pos)
                    areas_pos = areas_pos.reshape(-1, 2)

                    color_list = ["color" + str(i) for i in range(num_areas)]
                    df_area3_s = df_area_server.iloc[:1, :][color_list]
                    df_area3_c = df_area_client.iloc[:1, :][color_list]
                    for i, idx in enumerate(range(1, len(df_area3_c.columns) * 2, 2)):
                        # print(i, ' ', idx)
                        df_area3_c.insert(loc=idx, column='other_col' + str(i), value=df_area3_s.iloc[0][i])

                    client = [col for col in df_area3_c.columns if 'color' in col]
                    server = [col for col in df_area3_c.columns if 'other_col' in col]
                    df_area_colors = pd.lreshape(df_area3_c, {'color_client': client, 'color_server': server})

                    area_type = []
                    for area in df_area_colors.values:
                        if area[0] == 0 and area[1] == 0:
                            area_type += ['BB']
                        if area[0] == 0 and area[1] == 1:
                            area_type += ['BR']
                        if area[0] == 1 and area[1] == 0:
                            area_type += ['RB']
                        if area[0] == 1 and area[1] == 1:
                            area_type += ['RR']
                    df_area_colors.insert(loc=2, column='area_type', value=area_type)

                    '''Post process server'''
                    for i_c, kilo_id in enumerate(np.arange(1, len(df_kilo_server.columns), 5)):
                        # print(colored("kilo_id:" + str((kilo_id - 1) // 5), 'blue'))
                        # print(df_kilo_client.iloc[:20, kilo_id+2:kilo_id+4].values, end='\n\n')
                        kilo_pos = df_kilo_server.iloc[:, kilo_id + i_c + 2:kilo_id + i_c + 4].values
                        # print(kilo_pos)
                        in_area = np.empty(kilo_pos.shape[0], dtype=int)
                        in_area.fill(-1)
                        for area_idx, area_pos in enumerate(areas_pos):
                            # print(area_idx, ' ', area_pos)
                            dist = np.linalg.norm(kilo_pos - area_pos, axis=1)
                            # print(dist, end='\n\n')
                            in_area = np.where(dist < area_threshold, df_area_colors.iloc[area_idx][-1][::-1], in_area)
                            # in_area = np.where(in_area == -1, np.NaN, in_area)
                        # print(in_area)
                        df_kilo_server.insert(loc=int(kilo_id + i_c + 2), column='area_type' + str(i_c), value=in_area)

                    '''Post process client'''
                    for i_s, kilo_id in enumerate(np.arange(1, len(df_kilo_client.columns), 5)):
                        # print(colored("kilo_id:" + str((kilo_id - 1) // 5), 'blue'))
                        # print(df_kilo_client.iloc[:20, kilo_id+2:kilo_id+4].values, end='\n\n')
                        kilo_pos = df_kilo_client.iloc[:, kilo_id + i_s + 2:kilo_id + i_s + 4].values
                        # print(kilo_pos)
                        in_area = np.empty(kilo_pos.shape[0], dtype=int)
                        in_area.fill(-1)
                        for area_idx, area_pos in enumerate(areas_pos):
                            # print(area_idx, ' ', area_pos)
                            dist = np.linalg.norm(kilo_pos - area_pos, axis=1)
                            # print(dist, end='\n\n')
                            in_area = np.where(dist < area_threshold, df_area_colors.iloc[area_idx][-1], in_area)
                            # in_area = np.where(in_area == -1, np.NaN, in_area)
                        # print(in_area)
                        df_kilo_client.insert(loc=int(kilo_id + i_s + 2), column='area_type' + str(i_s), value=in_area)

                    df_kilo_single_run = df_kilo_client.join(df_kilo_server, lsuffix='_c', rsuffix='_s')
                    df_kilo_single_run = df_kilo_single_run.set_index(df_kilo_single_run.index.astype(str) + '_' + filename_seed)
                    df_kilo_timeout = df_kilo_timeout.append(df_kilo_single_run)
                    evaluate_run = False

            '''Save pickle file'''
            df_kilo_timeout.to_pickle(os.path.join(results_folder, timeout_folder, "kiloLOG_timeout#" + str(timeout) + "_.pickle"))
            print("Saving at: ", os.path.join(results_folder, timeout_folder, "kiloLOG_timeout#" + str(timeout) + "_.pickle"))

        print("Changing dir")
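# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original post-processing): the
# distance-based tagging used in the loops above, reduced to plain NumPy.
# For every logged robot position the Euclidean distance to each area centre
# is computed, and positions closer than `area_threshold` to a centre inherit
# that area's type; everything else keeps the default label -1. The centres,
# positions, labels and threshold below are made-up values, not experiment data.
import numpy as np

area_threshold = 0.1                      # assumed detection radius
areas_pos = np.array([[0.0, 0.0],         # two toy area centres
                      [1.0, 1.0]])
area_types = ['BR', 'RB']                 # one colour-pair label per area
kilo_pos = np.array([[0.05, 0.02],        # robot inside area 0
                     [0.50, 0.50],        # robot outside every area
                     [0.98, 1.01]])       # robot inside area 1

in_area = np.full(kilo_pos.shape[0], -1, dtype=object)
for area_idx, area_pos in enumerate(areas_pos):
    dist = np.linalg.norm(kilo_pos - area_pos, axis=1)
    in_area = np.where(dist < area_threshold, area_types[area_idx], in_area)
print(in_area)                            # ['BR' -1 'RB']
# ---------------------------------------------------------------------------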