Example #1
File: general.py  Project: prutskov/modin
def lreshape(data: DataFrame, groups, dropna=True, label=None):
    """
    Reshape wide-format data to long. Generalized inverse of ``DataFrame.pivot``.

    Accepts a dictionary, `groups`, in which each key is a new column name
    and each value is a list of old column names that will be "melted" under
    the new column name as part of the reshape.

    Parameters
    ----------
    data : DataFrame
        The wide-format DataFrame.
    groups : dict
        Dictionary in the form: `{new_name : list_of_columns}`.
    dropna : bool, default: True
        Whether to include columns whose entries are all NaN.
    label : optional
        Deprecated parameter.

    Returns
    -------
    DataFrame
        Reshaped DataFrame.
    """
    if not isinstance(data, DataFrame):
        raise ValueError("can not lreshape with instance of type {}".format(
            type(data)))
    ErrorMessage.default_to_pandas("`lreshape`")
    return DataFrame(
        pandas.lreshape(to_pandas(data), groups, dropna=dropna, label=label))
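A minimal usage sketch of the wide-to-long reshape described in the docstring above; the frame and column names are made up for illustration, and it uses plain pandas rather than modin:

import pandas as pd

# Hypothetical wide-format frame: one row per subject, one column per visit.
wide = pd.DataFrame({
    'id': [1, 2],
    'visit1': ['a', 'b'],
    'visit2': ['c', None],
    'wt1': [10.0, 20.0],
    'wt2': [11.0, None],
})

# Each key of `groups` becomes a new column; the listed old columns are
# melted under it. With dropna=True (the default) the row whose melted
# values are all NaN is dropped.
long_df = pd.lreshape(wide, {'visit': ['visit1', 'visit2'], 'wt': ['wt1', 'wt2']})
print(long_df)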
Example #2
    def import_table_doctopic(self):
        """Import data into doctopic table"""
        src_file = self.get_source_file(
            'output-doc-topics')  # self.config['output-doc-topics']
        doctopic = pd.read_csv(src_file, sep='\t', header=None)
        doc = pd.DataFrame(doctopic.iloc[:, 1])
        doc.columns = ['doc_tmp']
        doc['src_doc_id'] = doc.doc_tmp.apply(lambda x: x.split(',')[0])
        doc['doc_label'] = doc.doc_tmp.apply(lambda x: x.split(',')[1])
        doc = doc[['src_doc_id', 'doc_label']]
        doc.index.name = 'doc_id'
        doctopic.drop(1, axis=1, inplace=True)
        doctopic.rename(columns={0: 'doc_id'}, inplace=True)
        y = [col for col in doctopic.columns[1:]]
        doctopic_narrow = pd.lreshape(doctopic, {'topic_weight': y})
        doctopic_narrow['topic_id'] = [
            i for i in range(self.config['num-topics'])
            for doc_id in doctopic['doc_id']
        ]
        doctopic_narrow = doctopic_narrow[[
            'doc_id', 'topic_id', 'topic_weight'
        ]]
        doctopic_narrow.set_index(['doc_id', 'topic_id'], inplace=True)
        doctopic_narrow['topic_weight_zscore'] = stats.zscore(
            doctopic_narrow.topic_weight)
        dtm = doctopic_narrow.reset_index()\
            .set_index(['doc_id', 'topic_id'])['topic_weight'].unstack()
        dtm.to_csv(self.tables_dir + 'DOCTOPIC.csv')
        doc.to_csv(self.tables_dir + 'DOC.csv')
        doctopic_narrow.to_csv(self.tables_dir + 'DOCTOPIC_NARROW.csv')
Example #3
File: general.py  Project: ddi-zrl/modin
def lreshape(data: DataFrame, groups, dropna=True, label=None):
    if not isinstance(data, DataFrame):
        raise ValueError("can not lreshape with instance of type {}".format(
            type(data)))
    ErrorMessage.default_to_pandas("`lreshape`")
    return DataFrame(
        pandas.lreshape(to_pandas(data), groups, dropna=dropna, label=label))
Example #4
    def import_table_doctopic(self, src_file=None):
        """Import data into doctopic table"""
        if not src_file: src_file = self.mallet['train-topics']['output-doc-topics']
        if 'doc-topics-threshold' in self.mallet['train-topics']:
            DOC = []
            DOCTOPIC = []
            src = PoloFile(src_file)
            for line in src[1:]:
                row = line.split('\t')
                row.pop()  # Pretty sure this is right
                doc_id = row[0]
                src_doc_id = int(row[1].split(',')[0])
                doc_label = row[1].split(',')[1]
                DOC.append([doc_id, src_doc_id, doc_label])
                for i in range(2, len(row), 2):
                    topic_id = row[i]
                    topic_weight = row[i + 1]
                    DOCTOPIC.append([doc_id, topic_id, topic_weight])
            doctopic = pd.DataFrame(DOCTOPIC, columns=['doc_id', 'topic_id', 'topic_weight'])
            doctopic.set_index(['doc_id', 'topic_id'], inplace=True)
            doctopic['topic_weight_zscore'] = stats.zscore(doctopic.topic_weight)
            self.computed_thresh = round(doctopic.topic_weight.quantile(self.cfg_tw_quantile), 3)
            doc = pd.DataFrame(DOC, columns=['doc_id', 'src_doc_id', 'doc_label'])
            doc.set_index('doc_id', inplace=True)
            self.put_table(doctopic, 'doctopic', index=True)
            self.put_table(doc, 'doc', index=True)
        else:
            doctopic = pd.read_csv(src_file, sep='\t', header=None)
            doc = pd.DataFrame(doctopic.iloc[:, 1])
            doc.columns = ['doc_tmp']
            doc['src_doc_id'] = doc.doc_tmp.apply(lambda x: int(x.split(',')[0]))
            doc['doc_label'] = doc.doc_tmp.apply(lambda x: x.split(',')[1])
            doc = doc[['src_doc_id', 'doc_label']]
            doc.index.name = 'doc_id'
            self.put_table(doc, 'doc', index=True)
            doctopic.drop(1, axis=1, inplace=True)
            doctopic.rename(columns={0: 'doc_id'}, inplace=True)
            y = [col for col in doctopic.columns[1:]]
            doctopic_narrow = pd.lreshape(doctopic, {'topic_weight': y})
            doctopic_narrow['topic_id'] = [i for i in range(self.cfg_num_topics)
                                           for doc_id in doctopic['doc_id']]
            doctopic_narrow = doctopic_narrow[['doc_id', 'topic_id', 'topic_weight']]
            doctopic_narrow.set_index(['doc_id', 'topic_id'], inplace=True)
            doctopic_narrow['topic_weight_zscore'] = stats.zscore(doctopic_narrow.topic_weight)
            self.computed_thresh = round(doctopic_narrow.topic_weight\
                                         .quantile(self.cfg_tw_quantile), 3)
            self.put_table(doctopic_narrow, 'doctopic', index=True)

        # todo: Revisit this; in the best place to do this?
        self.set_config_item('computed_thresh', self.computed_thresh)
Example #5
def generalize_country_to_region(
    workdata,
    column: str,
    countries=pd.read_csv(
        'https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv',
        usecols=['name', 'alpha-2', 'alpha-3', 'region'])):
    """
    A DataFrame adott oszlopában lévő országneveket és kódokat cseréli le annak a régiónak a nevére,
    ahol az ország található.
    :param workdata: a WorkData példány, ami a DataFramet tartalmazza
    :param column: az oszlop neve
    :param countries: az országokat, kódokat, és régiókat tartalmazó file
    :return:
    """
    reshaped = pd.lreshape(countries, {
        'country': ['name', 'alpha-2', 'alpha-3'],
        'region': ['region', 'region', 'region']
    },
                           dropna=False)
    dictionary = dict(zip(reshaped['country'], reshaped['region']))
    workdata.df[column] = workdata.df[column].map(dictionary)
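A minimal sketch of the lookup built above, using a hand-made two-row countries table instead of the downloaded CSV (the sample rows are assumptions) so the mapping step can be run standalone:

import pandas as pd

# Assumed miniature stand-in for the ISO-3166 table.
countries = pd.DataFrame({
    'name': ['Hungary', 'Japan'],
    'alpha-2': ['HU', 'JP'],
    'alpha-3': ['HUN', 'JPN'],
    'region': ['Europe', 'Asia'],
})

# Stack name/alpha-2/alpha-3 into one 'country' column, repeating 'region'
# alongside, then build a dict mapping any of the three spellings to a region.
reshaped = pd.lreshape(
    countries,
    {'country': ['name', 'alpha-2', 'alpha-3'],
     'region': ['region', 'region', 'region']},
    dropna=False,
)
lookup = dict(zip(reshaped['country'], reshaped['region']))
print(pd.Series(['HU', 'Japan', 'HUN']).map(lookup))  # Europe, Asia, Europe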
Example #6
def count_unique_heros():
    # load the dataset
    features = pd.read_csv('features.csv', index_col='match_id')

    # get the number of rows in the dataset
    rows_num = features.shape[0]

    # fill missing values in the features
    for f, n in features.count().items():
        if n != rows_num:
            features[f].fillna(features[f].mean(), inplace=True)

    values = []
    # fill the list with the hero feature names
    for f in list(features.columns):
        if 'hero' in f:
            values.append(f)

    # reshape the matrix into a single vector
    df1 = pd.lreshape(features, {'hero': values})

    # print the number of unique values (heroes)
    print(df1['hero'].value_counts().shape[0])
Example #7
    def test_pairs(self):
        data = {
            'birthdt':
            ['08jan2009', '20dec2008', '30dec2008', '21dec2008', '11jan2009'],
            'birthwt': [1766, 3301, 1454, 3139, 4133],
            'id': [101, 102, 103, 104, 105],
            'sex': ['Male', 'Female', 'Female', 'Female', 'Female'],
            'visitdt1':
            ['11jan2009', '22dec2008', '04jan2009', '29dec2008', '20jan2009'],
            'visitdt2':
            ['21jan2009', nan, '22jan2009', '31dec2008', '03feb2009'],
            'visitdt3': ['05feb2009', nan, nan, '02jan2009', '15feb2009'],
            'wt1': [1823, 3338, 1549, 3298, 4306],
            'wt2': [2011.0, nan, 1892.0, 3338.0, 4575.0],
            'wt3': [2293.0, nan, nan, 3377.0, 4805.0]
        }

        df = DataFrame(data)

        spec = {
            'visitdt': ['visitdt%d' % i for i in range(1, 4)],
            'wt': ['wt%d' % i for i in range(1, 4)]
        }
        result = lreshape(df, spec)

        exp_data = {
            'birthdt': [
                '08jan2009', '20dec2008', '30dec2008', '21dec2008',
                '11jan2009', '08jan2009', '30dec2008', '21dec2008',
                '11jan2009', '08jan2009', '21dec2008', '11jan2009'
            ],
            'birthwt': [
                1766, 3301, 1454, 3139, 4133, 1766, 1454, 3139, 4133, 1766,
                3139, 4133
            ],
            'id': [101, 102, 103, 104, 105, 101, 103, 104, 105, 101, 104, 105],
            'sex': [
                'Male', 'Female', 'Female', 'Female', 'Female', 'Male',
                'Female', 'Female', 'Female', 'Male', 'Female', 'Female'
            ],
            'visitdt': [
                '11jan2009', '22dec2008', '04jan2009', '29dec2008',
                '20jan2009', '21jan2009', '22jan2009', '31dec2008',
                '03feb2009', '05feb2009', '02jan2009', '15feb2009'
            ],
            'wt': [
                1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, 1892.0, 3338.0,
                4575.0, 2293.0, 3377.0, 4805.0
            ]
        }
        exp = DataFrame(exp_data, columns=result.columns)
        tm.assert_frame_equal(result, exp)

        result = lreshape(df, spec, dropna=False)
        exp_data = {
            'birthdt': [
                '08jan2009', '20dec2008', '30dec2008', '21dec2008',
                '11jan2009', '08jan2009', '20dec2008', '30dec2008',
                '21dec2008', '11jan2009', '08jan2009', '20dec2008',
                '30dec2008', '21dec2008', '11jan2009'
            ],
            'birthwt': [
                1766, 3301, 1454, 3139, 4133, 1766, 3301, 1454, 3139, 4133,
                1766, 3301, 1454, 3139, 4133
            ],
            'id': [
                101, 102, 103, 104, 105, 101, 102, 103, 104, 105, 101, 102,
                103, 104, 105
            ],
            'sex': [
                'Male', 'Female', 'Female', 'Female', 'Female', 'Male',
                'Female', 'Female', 'Female', 'Female', 'Male', 'Female',
                'Female', 'Female', 'Female'
            ],
            'visitdt': [
                '11jan2009', '22dec2008', '04jan2009', '29dec2008',
                '20jan2009', '21jan2009', nan, '22jan2009', '31dec2008',
                '03feb2009', '05feb2009', nan, nan, '02jan2009', '15feb2009'
            ],
            'wt': [
                1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, nan, 1892.0,
                3338.0, 4575.0, 2293.0, nan, nan, 3377.0, 4805.0
            ]
        }
        exp = DataFrame(exp_data, columns=result.columns)
        tm.assert_frame_equal(result, exp)

        spec = {
            'visitdt': ['visitdt%d' % i for i in range(1, 3)],
            'wt': ['wt%d' % i for i in range(1, 4)]
        }
        pytest.raises(ValueError, lreshape, df, spec)
Example #8
    def test_pairs(self):
        data = {
            "birthdt": [
                "08jan2009",
                "20dec2008",
                "30dec2008",
                "21dec2008",
                "11jan2009",
            ],
            "birthwt": [1766, 3301, 1454, 3139, 4133],
            "id": [101, 102, 103, 104, 105],
            "sex": ["Male", "Female", "Female", "Female", "Female"],
            "visitdt1": [
                "11jan2009",
                "22dec2008",
                "04jan2009",
                "29dec2008",
                "20jan2009",
            ],
            "visitdt2":
            ["21jan2009", np.nan, "22jan2009", "31dec2008", "03feb2009"],
            "visitdt3":
            ["05feb2009", np.nan, np.nan, "02jan2009", "15feb2009"],
            "wt1": [1823, 3338, 1549, 3298, 4306],
            "wt2": [2011.0, np.nan, 1892.0, 3338.0, 4575.0],
            "wt3": [2293.0, np.nan, np.nan, 3377.0, 4805.0],
        }

        df = DataFrame(data)

        spec = {
            "visitdt": [f"visitdt{i:d}" for i in range(1, 4)],
            "wt": [f"wt{i:d}" for i in range(1, 4)],
        }
        result = lreshape(df, spec)

        exp_data = {
            "birthdt": [
                "08jan2009",
                "20dec2008",
                "30dec2008",
                "21dec2008",
                "11jan2009",
                "08jan2009",
                "30dec2008",
                "21dec2008",
                "11jan2009",
                "08jan2009",
                "21dec2008",
                "11jan2009",
            ],
            "birthwt": [
                1766,
                3301,
                1454,
                3139,
                4133,
                1766,
                1454,
                3139,
                4133,
                1766,
                3139,
                4133,
            ],
            "id": [101, 102, 103, 104, 105, 101, 103, 104, 105, 101, 104, 105],
            "sex": [
                "Male",
                "Female",
                "Female",
                "Female",
                "Female",
                "Male",
                "Female",
                "Female",
                "Female",
                "Male",
                "Female",
                "Female",
            ],
            "visitdt": [
                "11jan2009",
                "22dec2008",
                "04jan2009",
                "29dec2008",
                "20jan2009",
                "21jan2009",
                "22jan2009",
                "31dec2008",
                "03feb2009",
                "05feb2009",
                "02jan2009",
                "15feb2009",
            ],
            "wt": [
                1823.0,
                3338.0,
                1549.0,
                3298.0,
                4306.0,
                2011.0,
                1892.0,
                3338.0,
                4575.0,
                2293.0,
                3377.0,
                4805.0,
            ],
        }
        exp = DataFrame(exp_data, columns=result.columns)
        tm.assert_frame_equal(result, exp)

        result = lreshape(df, spec, dropna=False)
        exp_data = {
            "birthdt": [
                "08jan2009",
                "20dec2008",
                "30dec2008",
                "21dec2008",
                "11jan2009",
                "08jan2009",
                "20dec2008",
                "30dec2008",
                "21dec2008",
                "11jan2009",
                "08jan2009",
                "20dec2008",
                "30dec2008",
                "21dec2008",
                "11jan2009",
            ],
            "birthwt": [
                1766,
                3301,
                1454,
                3139,
                4133,
                1766,
                3301,
                1454,
                3139,
                4133,
                1766,
                3301,
                1454,
                3139,
                4133,
            ],
            "id": [
                101,
                102,
                103,
                104,
                105,
                101,
                102,
                103,
                104,
                105,
                101,
                102,
                103,
                104,
                105,
            ],
            "sex": [
                "Male",
                "Female",
                "Female",
                "Female",
                "Female",
                "Male",
                "Female",
                "Female",
                "Female",
                "Female",
                "Male",
                "Female",
                "Female",
                "Female",
                "Female",
            ],
            "visitdt": [
                "11jan2009",
                "22dec2008",
                "04jan2009",
                "29dec2008",
                "20jan2009",
                "21jan2009",
                np.nan,
                "22jan2009",
                "31dec2008",
                "03feb2009",
                "05feb2009",
                np.nan,
                np.nan,
                "02jan2009",
                "15feb2009",
            ],
            "wt": [
                1823.0,
                3338.0,
                1549.0,
                3298.0,
                4306.0,
                2011.0,
                np.nan,
                1892.0,
                3338.0,
                4575.0,
                2293.0,
                np.nan,
                np.nan,
                3377.0,
                4805.0,
            ],
        }
        exp = DataFrame(exp_data, columns=result.columns)
        tm.assert_frame_equal(result, exp)

        with tm.assert_produces_warning(FutureWarning):
            lreshape(df, spec, dropna=False, label="foo")

        spec = {
            "visitdt": [f"visitdt{i:d}" for i in range(1, 3)],
            "wt": [f"wt{i:d}" for i in range(1, 4)],
        }
        msg = "All column lists must be same length"
        with pytest.raises(ValueError, match=msg):
            lreshape(df, spec)
Example #9
    def test_pairs(self):
        data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', '21dec2008',
                            '11jan2009'],
                'birthwt': [1766, 3301, 1454, 3139, 4133],
                'id': [101, 102, 103, 104, 105],
                'sex': ['Male', 'Female', 'Female', 'Female', 'Female'],
                'visitdt1': ['11jan2009', '22dec2008', '04jan2009',
                             '29dec2008', '20jan2009'],
                'visitdt2':
                ['21jan2009', nan, '22jan2009', '31dec2008', '03feb2009'],
                'visitdt3': ['05feb2009', nan, nan, '02jan2009', '15feb2009'],
                'wt1': [1823, 3338, 1549, 3298, 4306],
                'wt2': [2011.0, nan, 1892.0, 3338.0, 4575.0],
                'wt3': [2293.0, nan, nan, 3377.0, 4805.0]}

        df = DataFrame(data)

        spec = {'visitdt': ['visitdt%d' % i for i in range(1, 4)],
                'wt': ['wt%d' % i for i in range(1, 4)]}
        result = lreshape(df, spec)

        exp_data = {'birthdt':
                    ['08jan2009', '20dec2008', '30dec2008', '21dec2008',
                     '11jan2009', '08jan2009', '30dec2008', '21dec2008',
                     '11jan2009', '08jan2009', '21dec2008', '11jan2009'],
                    'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 1454, 3139,
                                4133, 1766, 3139, 4133],
                    'id': [101, 102, 103, 104, 105, 101, 103, 104, 105, 101,
                           104, 105],
                    'sex': ['Male', 'Female', 'Female', 'Female', 'Female',
                            'Male', 'Female', 'Female', 'Female', 'Male',
                            'Female', 'Female'],
                    'visitdt': ['11jan2009', '22dec2008', '04jan2009',
                                '29dec2008', '20jan2009', '21jan2009',
                                '22jan2009', '31dec2008', '03feb2009',
                                '05feb2009', '02jan2009', '15feb2009'],
                    'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0,
                           1892.0, 3338.0, 4575.0, 2293.0, 3377.0, 4805.0]}
        exp = DataFrame(exp_data, columns=result.columns)
        tm.assert_frame_equal(result, exp)

        result = lreshape(df, spec, dropna=False)
        exp_data = {'birthdt':
                    ['08jan2009', '20dec2008', '30dec2008', '21dec2008',
                     '11jan2009', '08jan2009', '20dec2008', '30dec2008',
                     '21dec2008', '11jan2009', '08jan2009', '20dec2008',
                     '30dec2008', '21dec2008', '11jan2009'],
                    'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 3301, 1454,
                                3139, 4133, 1766, 3301, 1454, 3139, 4133],
                    'id': [101, 102, 103, 104, 105, 101, 102, 103, 104, 105,
                           101, 102, 103, 104, 105],
                    'sex': ['Male', 'Female', 'Female', 'Female', 'Female',
                            'Male', 'Female', 'Female', 'Female', 'Female',
                            'Male', 'Female', 'Female', 'Female', 'Female'],
                    'visitdt': ['11jan2009', '22dec2008', '04jan2009',
                                '29dec2008', '20jan2009', '21jan2009', nan,
                                '22jan2009', '31dec2008', '03feb2009',
                                '05feb2009', nan, nan, '02jan2009',
                                '15feb2009'],
                    'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, nan,
                           1892.0, 3338.0, 4575.0, 2293.0, nan, nan, 3377.0,
                           4805.0]}
        exp = DataFrame(exp_data, columns=result.columns)
        tm.assert_frame_equal(result, exp)

        spec = {'visitdt': ['visitdt%d' % i for i in range(1, 3)],
                'wt': ['wt%d' % i for i in range(1, 4)]}
        pytest.raises(ValueError, lreshape, df, spec)
Example #10
def main():
    evaluate_run = False

    results_folder = os.path.join(os.getcwd(),
                                  "results_" + walk + "/" + experiment)
    if not os.path.isdir(results_folder):
        print(colored("Error, " + results_folder + " does not exist", 'red'))
    else:
        print(colored("OK, " + results_folder + " exists", 'green'))

    rr = {}
    rb = {}
    br = {}
    bb = {}

    for timeout_folder in natsorted(os.listdir(os.path.join(results_folder))):
        print(colored("Timeout folder:", 'blue'), timeout_folder)

        if timeout_folder.endswith("pickle"):
            continue

        parameters = timeout_folder.split("_")
        for param in parameters:
            if param.startswith("timeout"):
                timeout = int(param.split("#")[-1]) * 10
                # print("\t timeoutR:",timeoutR)

        if timeout == -1:
            print(colored("\tWARNING: wrong timeout folder", 'red'))
            continue

        if os.path.isfile(
                os.path.join(
                    results_folder, pickle_file_root + "_timeout#" +
                    str(timeout) + "_.pickle")):
            run_memory_mean = pd.read_pickle(
                os.path.join(
                    results_folder, pickle_file_root + "_timeout#" +
                    str(timeout) + "_.pickle"))
            print(
                colored(
                    pickle_file_root + "_timeout#" + str(timeout) +
                    "_.pickle already exists for timeout:" + str(timeout),
                    'green'))
        else:
            # print(colored(
            #     os.path.join(results_folder, pickle_file_root"_timeout#"+str(timeout*10)+"_.pickle"),
            #     'red'))
            # sys.exit()
            for filename in natsorted(
                    os.listdir(os.path.join(results_folder, timeout_folder))):
                filename_seed = filename.split("_")[0]
                # print(filename)
                if filename.endswith("areaLOG_client.tsv"):
                    if not os.path.getsize(
                            os.path.join(results_folder, timeout_folder,
                                         filename)) > 0:
                        print(
                            colored("\tWARNING, empty file at:" + filename,
                                    'red'))
                        continue
                    # print('\tfilename: ', filename)
                    df_area_client = pd.read_csv(os.path.join(
                        results_folder, timeout_folder, filename),
                                                 sep="\t",
                                                 header=None)

                if filename.endswith("areaLOG_server.tsv"):
                    if not os.path.getsize(
                            os.path.join(results_folder, timeout_folder,
                                         filename)) > 0:
                        print(
                            colored("\tWARNING, empty file at:" + filename,
                                    'red'))
                        continue
                    # print('\tfilename: ', filename)
                    df_area_server = pd.read_csv(os.path.join(
                        results_folder, timeout_folder, filename),
                                                 sep="\t",
                                                 header=None)

                # if filename.endswith("taskLOG_client.tsv"):
                #     if not os.path.getsize(os.path.join(results_folder, timeout_folder, filename)) > 0:
                #         print(colored("\tWARNING, empty file at:" + filename, 'red'))
                #         continue
                #     # print('\tfilename: ', filename)
                #     df_task_client = pd.read_csv(os.path.join(results_folder, timeout_folder, filename), sep="\t",
                #                                  header=None)
                #
                # if filename.endswith("taskLOG_server.tsv"):
                #     if not os.path.getsize(os.path.join(results_folder, timeout_folder, filename)) > 0:
                #         print(colored("\tWARNING, empty file at:" + filename, 'red'))
                #         continue
                #     # print('\tfilename: ', filename)
                #     df_task_server = pd.read_csv(os.path.join(results_folder, timeout_folder, filename), sep="\t",
                #                                  header=None)

                if filename.endswith("kiloLOG_client.tsv"):
                    if not os.path.getsize(
                            os.path.join(results_folder, timeout_folder,
                                         filename)) > 0:
                        print(
                            colored("\tWARNING, empty file at:" + filename,
                                    'red'))
                        continue
                    # print('\tfilename: ', filename)
                    df_kilo_client = pd.read_csv(os.path.join(
                        results_folder, timeout_folder, filename),
                                                 sep="\t",
                                                 header=None)

                if filename.endswith("kiloLOG_server.tsv"):
                    if not os.path.getsize(
                            os.path.join(results_folder, timeout_folder,
                                         filename)) > 0:
                        print(
                            colored("\tWARNING, empty file at:" + filename,
                                    'red'))
                        continue
                    # print('\tfilename: ', filename, end='\n')
                    df_kilo_server = pd.read_csv(os.path.join(
                        results_folder, timeout_folder, filename),
                                                 sep="\t",
                                                 header=None)
                    evaluate_run = True

                if evaluate_run:
                    '''Kilo log part'''
                    if len(df_kilo_client.columns) > 145:
                        # print("Cutting null elements in client kilo df")
                        df_kilo_client.drop(
                            df_kilo_client.columns[len(df_kilo_client.columns)
                                                   - 1],
                            axis=1,
                            inplace=True)

                    if len(df_kilo_server.columns) > 145:
                        # print("Cutting null elements in server kilo df")
                        df_kilo_server.drop(
                            df_kilo_server.columns[len(df_kilo_server.columns)
                                                   - 1],
                            axis=1,
                            inplace=True)

                    col_kilo_labels = ['time']
                    for i in range(0, len(df_kilo_server.columns) - 1, 6):
                        #     print(i,end=", ")
                        col_kilo_labels += [
                            'id' + str(i // 6), 'state' + str(i // 6),
                            'posx' + str(i // 6), 'posy' + str(i // 6),
                            'ori' + str(i // 6), 'same_state' + str(i // 6)
                        ]

                    col_kilo_to_drop = []
                    for i in range((len(df_kilo_server.columns) - 1) // 6):
                        #     print(i,end=", ")
                        col_kilo_to_drop += ['same_state' + str(i)]

                    df_kilo_server.columns = col_kilo_labels
                    df_kilo_client.columns = col_kilo_labels
                    df_kilo_server = df_kilo_server.drop(col_kilo_to_drop,
                                                         axis=1)
                    df_kilo_client = df_kilo_client.drop(col_kilo_to_drop,
                                                         axis=1)
                    '''Completed task LOG part'''
                    # task_label = ['time', 'id', 'creationTime', 'completitionTime', 'color', 'contained']
                    # df_task_client.columns = task_label
                    '''Area LOG part'''
                    col_area_labels = ['time']
                    for i in range(0, len(df_area_server.columns) - 2, 6):
                        # print(i, end=", ")
                        col_area_labels += [
                            'id' + str(i // 6), 'posx' + str(i // 6),
                            'posy' + str(i // 6), 'color' + str(i // 6),
                            'completed' + str(i // 6),
                            'contained' + str(i // 6)
                        ]

                    # Remove last empty col and assign labels to df_area_server
                    if len(df_area_server.columns) > 49:
                        # print("Cutting null elements in area server df")
                        df_area_server.drop(
                            df_area_server.columns[len(df_area_server.columns)
                                                   - 1],
                            axis=1,
                            inplace=True)
                    df_area_server.columns = col_area_labels

                    # First df_area_client row contains garbage
                    # so is substituted with the second row except for the time,
                    # then remove Nan values in [:,49:]
                    if len(df_area_client.columns) > 49:
                        # print("Cutting null elements in area client df")
                        df_area_client.loc[0, 1:] = df_area_client.loc[1, 1:]
                        df_area_client = df_area_client.drop(np.arange(
                            49, len(df_area_client.columns)),
                                                             axis=1)
                    df_area_client.columns = col_area_labels

                    area_color_label = []
                    for i in range(num_areas):
                        area_color_label += ["color" + str(i)]
                    #     print("color"+str(i))
                    areas_client_color = df_area_client[area_color_label].iloc[
                        0, :].values
                    areas_server_color = df_area_server[area_color_label].iloc[
                        0, :].values
                    # print(areas_client_color)
                    # print(areas_server_color)

                    area_pos_label = []
                    for i in range(num_areas):
                        area_pos_label += ["posx" + str(i)]
                        area_pos_label += ["posy" + str(i)]
                    areas_pos = df_area_client[area_pos_label].iloc[
                        0, :].values
                    # print(areas_pos)
                    areas_pos = areas_pos.reshape(-1, 2)

                    color_list = ["color" + str(i) for i in range(num_areas)]
                    df_area3_s = df_area_server.iloc[:1, :][color_list]
                    df_area3_c = df_area_client.iloc[:1, :][color_list]
                    for i, idx in enumerate(
                            range(1,
                                  len(df_area3_c.columns) * 2, 2)):
                        #     print(i, ' ', idx)
                        df_area3_c.insert(loc=idx,
                                          column='other_col' + str(i),
                                          value=df_area3_s.iloc[0][i])
                    client = [
                        col for col in df_area3_c.columns if 'color' in col
                    ]
                    server = [
                        col for col in df_area3_c.columns if 'other_col' in col
                    ]
                    df_area_colors = pd.lreshape(df_area3_c, {
                        'color_client': client,
                        'color_server': server
                    })
                    area_type = []
                    for area in df_area_colors.values:
                        if area[0] == 0 and area[1] == 0:
                            area_type += ['BB']
                        if area[0] == 0 and area[1] == 1:
                            area_type += ['BR']
                        if area[0] == 1 and area[1] == 0:
                            area_type += ['RB']
                        if area[0] == 1 and area[1] == 1:
                            area_type += ['RR']
                    df_area_colors.insert(loc=2,
                                          column='area_type',
                                          value=area_type)
                    '''Post process server'''
                    for i, kilo_id in enumerate(
                            np.arange(1, len(df_kilo_server.columns), 5)):
                        # print(colored("kilo_id:" + str((kilo_id - 1) // 5), 'blue'))
                        #     print(df_kilo_client.iloc[:20, kilo_id+2:kilo_id+4].values, end='\n\n')
                        kilo_pos = df_kilo_server.iloc[:, kilo_id + i +
                                                       2:kilo_id + i +
                                                       4].values
                        #     print(kilo_pos)
                        in_area = np.empty(kilo_pos.shape[0], dtype=int)
                        in_area.fill(-1)
                        for area_idx, area_pos in enumerate(areas_pos):
                            # print(area_idx, ' ', area_pos)
                            dist = np.linalg.norm(kilo_pos - area_pos, axis=1)
                            #     print(dist, end='\n\n')
                            in_area = np.where(
                                dist < area_threshold,
                                df_area_colors.iloc[area_idx][-1][::-1],
                                in_area)
                        #     in_area = np.where(in_area == -1, np.NaN, in_area)
                        #     print(in_area)
                        df_kilo_server.insert(loc=int(kilo_id + i + 2),
                                              column='area_type' + str(i),
                                              value=in_area)
                    '''Post process client'''
                    for i, kilo_id in enumerate(
                            np.arange(1, len(df_kilo_client.columns), 5)):
                        # print(colored("kilo_id:" + str((kilo_id - 1) // 5), 'blue'))
                        #     print(df_kilo_client.iloc[:20, kilo_id+2:kilo_id+4].values, end='\n\n')
                        kilo_pos = df_kilo_client.iloc[:, kilo_id + i +
                                                       2:kilo_id + i +
                                                       4].values
                        #     print(kilo_pos)
                        in_area = np.empty(kilo_pos.shape[0], dtype=int)
                        in_area.fill(-1)
                        for area_idx, area_pos in enumerate(areas_pos):
                            #     print(area_idx,' ', area_pos)
                            dist = np.linalg.norm(kilo_pos - area_pos, axis=1)
                            #     print(dist, end='\n\n')
                            in_area = np.where(
                                dist < area_threshold,
                                df_area_colors.iloc[area_idx][-1], in_area)
                        #     in_area = np.where(in_area == -1, np.NaN, in_area)
                        #     print(in_area)
                        df_kilo_client.insert(loc=int(kilo_id + i + 2),
                                              column='area_type' + str(i),
                                              value=in_area)
                    '''Here finally evaluated in which area the timeout elapses'''
                    kilo_resume = [["state" + str(i), "area_type" + str(i)]
                                   for i in range(num_robots)]
                    kilo_resume = np.reshape(kilo_resume, (-1))
                    server_kilo_resume = df_kilo_server.iloc[:][kilo_resume]
                    client_kilo_resume = df_kilo_client.iloc[:][kilo_resume]
                    total_exp_df = client_kilo_resume.join(server_kilo_resume,
                                                           lsuffix='_c',
                                                           rsuffix='_s')

                    if value_studied == "mean_timeout":
                        timeout_count = pd.DataFrame(
                            columns=['RR', 'RB', 'BR', 'BB'])
                        for i in range(0, len(total_exp_df.columns), 2):
                            #     print(total_exp_df.iloc[:50,i:i+2])
                            kilo_state = total_exp_df.iloc[:, i:i + 2]
                            kilo_state = kilo_state.replace(2, 3)
                            mask = (kilo_state[
                                kilo_state.columns.values[0]].diff() == 2)
                            #     print(kilo_state[mask])
                            #     print(kilo_state[mask][kilo_state.columns.values[1]].value_counts(), end='\n\n')
                            robot_timeout = kilo_state[mask][
                                kilo_state.columns.values[1]].value_counts(
                                ).to_frame().T
                            #     robot_timeout = pd.DataFrame(kilo_state[mask][kilo_state.columns.values[1]].value_counts(), columns=['RR, RB,BR,BB'])
                            #     print(robot_timeout)
                            timeout_count = timeout_count.append(robot_timeout)
                            # print(robot_timeout, end='\n\n')
                        timeout_count = timeout_count.fillna(0)
                        single_run_mean = timeout_count.mean(axis=0)

                    else:
                        completed_area_count = pd.DataFrame(
                            columns=['RR', 'RB', 'BR', 'BB'])
                        for i in range(0, len(total_exp_df.columns), 2):
                            #     print(total_exp_df.iloc[:50,i:i+2])
                            kilo_state = total_exp_df.iloc[:, i:i + 2]
                            mask = (kilo_state[
                                kilo_state.columns.values[0]].diff() == -1)
                            # print(kilo_state[mask])
                            # print(kilo_state[mask][kilo_state.columns.values[1]].value_counts(), end='\n\n')
                            robot_completed_area = kilo_state[mask][
                                kilo_state.columns.values[1]].value_counts(
                                ).to_frame().T
                            #     robot_completed_area = pd.DataFrame(kilo_state[mask][kilo_state.columns.values[1]].value_counts(), columns=['RR, RB,BR,BB'])
                            #     print(robot_completed_area)
                            completed_area_count = completed_area_count.append(
                                robot_completed_area)
                            # print(robot_completed_area, end='\n\n')

                        completed_area_count = completed_area_count.fillna(0)
                        single_run_mean = completed_area_count.mean(axis=0)

                    single_df = single_run_mean.to_frame().T
                    single_df.index = [filename_seed]

                    if os.path.isfile(
                            os.path.join(
                                results_folder, pickle_file_root +
                                "_timeout#" + str(timeout) + "_.pickle")):
                        run_memory_mean = pd.read_pickle(
                            os.path.join(
                                results_folder, pickle_file_root +
                                "_timeout#" + str(timeout) + "_.pickle"))
                        run_memory_mean = run_memory_mean.append(single_df)
                        run_memory_mean.to_pickle(
                            os.path.join(
                                results_folder, pickle_file_root +
                                "_timeout#" + str(timeout) + "_.pickle"))
                        print("Timeout:", timeout, end=", ")
                        print("Appending mean run, file size: ",
                              run_memory_mean.shape)
                    else:
                        print("Timeout:", timeout, end=", ")
                        print("Writing mean run")
                        single_df.to_pickle(
                            os.path.join(
                                results_folder, pickle_file_root +
                                "_timeout#" + str(timeout) + "_.pickle"))
                    evaluate_run = False

        rr[timeout] = run_memory_mean['RR'].values
        rb[timeout] = run_memory_mean['RB'].values
        br[timeout] = run_memory_mean['BR'].values
        bb[timeout] = run_memory_mean['BB'].values

    if value_studied == "mean_timeout":
        figureName = 'meanElapsedTimeout'
    else:
        figureName = 'meanCompletedAreas'

    figureName += '_groupsize' + groupSize + '_' + experiment + '_' + walk
    print("rr", rr)
    boxplots_utils.grouped_4_boxplot(rr, rb, br, bb, y_lim, figureName)
Example #11
wc_full_data = wc_full_data.sort_values(by=['Type', 'SKU'])
is_french = wc_full_data['SKU'].astype(str).str.contains('_fr')
wc_data_french = wc_full_data[is_french]
wc_data = wc_full_data[is_french == False]
# Clean the invalid name
wc_data['Name'] = wc_data['Name'].astype(str)
wc_data = wc_data[wc_data['Name'].str.contains('#REF!') == False]

# wc_data contains English rows
# wc_data_french contains French Rows

attribute_keys = wc_data.columns[wc_data.columns.str.endswith(' name')]
attribute_values = wc_data.columns[wc_data.columns.str.endswith('value(s)')]

wc_data_attributes = pd.lreshape(wc_data, {
    'key': attribute_keys,
    'value': attribute_values
})
wc_data_attributes = wc_data_attributes.pivot(index='ID',
                                              columns='key',
                                              values='value')

wc_data = pd.merge(wc_data, wc_data_attributes, on='ID')

wc_data['slug'] = wc_data['Name'].apply(lambda x: slugify(x))

# %%
slug_mask = wc_data['slug'].duplicated(keep=False)
wc_data.loc[slug_mask, ['slug', 'Name']].sort_values(by=['Name'])
wc_data.loc[slug_mask,
            'slug'] += wc_data.groupby('slug').cumcount().add(1).astype(str)
wc_data['new_sku'] = wc_data['SKU'].fillna('').apply(lambda x: x.split('-')[0])
Example #12
def make_link_and_node_df(sankey_df, num_steps: int, dropna=False):
    """Takes a df in the following format (output of make_sankey_df):
        |   0   |   1   | ... | num | step_0 | step_1 | ...
    ----+-------+-------+-----+-----+--------+--------+-----
     1  | cat_1 | cat_2 | ... |  2  |   0    |   8    | ...
     2  | cat_2 | None  | ... | 10  |   1    |   9    | ...
    ...
    
    Returns link_df:
       | source | target | num
    ---+--------+--------+-----
     0 |   0    |   8    | 114
     1 |   1    |   9    |  57
    ...
    
    Returns node_df:
       | source | label | step
    ---+--------+-------+--------
     0 |   0    | cat_1 | step_0
     1 |   1    | cat_2 | step_0
    ...
    """
    # reshape into source-target
    steps = range(num_steps)
    link_df = pd.lreshape(sankey_df,
                          groups={
                              'source':
                              [f'step_{step}' for step in steps[:-1]],
                              'target': [f'step_{step}' for step in steps[1:]]
                          })[['source', 'target', 'num']]
    link_df = link_df.groupby(['source', 'target']).sum().reset_index()

    # get index labels
    node_df = pd.lreshape(
        sankey_df,
        groups={
            'source': [f'step_{step}' for step in steps],
            'label': steps
        })[['source', 'label'
            ]].drop_duplicates().sort_values('source').reset_index(drop=True)

    # link source indices to step
    step_source = sankey_df[[f'step_{step}'
                             for step in steps]].to_dict(orient='list')
    step_source = {k: list(set(v)) for k, v in step_source.items()}
    source_step_dict = {}
    for k, v in step_source.items():
        for source in v:
            source_step_dict[source] = k
    node_df['step'] = node_df['source'].apply(lambda x: source_step_dict[x])

    if dropna is True:
        # generate new indices for link_df
        step_stack_df = pd.lreshape(
            link_df, {'step_stack': ['source', 'target']})[['step_stack']]
        step_stack_df['new_idx'] = step_stack_df['step_stack'].astype(
            'category').cat.codes
        step_stack_df = step_stack_df.drop_duplicates()
        replace_dict = dict(
            zip(step_stack_df['step_stack'], step_stack_df['new_idx']))
        link_df.loc[:, ['source', 'target'
                        ]] = link_df.loc[:, ['source', 'target']].replace(
                            replace_dict)  # reassign missing keys

        # filter out missing keys from node_df
        node_df = node_df[(node_df['source'].isin(replace_dict.keys()))]
        node_df.loc[:, 'source'] = node_df.loc[:, 'source'].replace(
            replace_dict)  # reassign missing keys

    return link_df, node_df
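A minimal sketch of the source/target pairing that make_link_and_node_df performs on sankey_df, with a hypothetical three-step frame (the step values and counts are invented):

import pandas as pd

# Hypothetical output of make_sankey_df: node indices per step plus a flow count.
sankey_df = pd.DataFrame({
    'step_0': [0, 1],
    'step_1': [2, 2],
    'step_2': [3, 4],
    'num': [5, 7],
})

# Pair each step with the next: (step_0, step_1) and (step_1, step_2) are
# stacked into source/target rows, and 'num' is repeated alongside.
links = pd.lreshape(
    sankey_df,
    groups={'source': ['step_0', 'step_1'], 'target': ['step_1', 'step_2']},
)[['source', 'target', 'num']]
print(links.groupby(['source', 'target']).sum().reset_index())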
Example #13
def process():
    input_json = json.load(open('input.json', 'r'))
    m_i_df = pd.read_pickle(input_json['data_pickle'])

    #input_df

    #result = current_app.

    print('proc till tolist')
    m_i_df["away_players"] = m_i_df[[
        'away1_id', 'away2_id', 'away3_id', 'away4_id', "away5_id", "away6_id",
        "away7_id", 'away8_id', 'away9_id', 'away10_id', 'away11_id'
    ]].values.tolist()
    m_i_df["home_players"] = m_i_df.loc[:, [
        'home1_id', 'home2_id', 'home3_id', 'home4_id', 'home5_id', 'home6_id',
        'home7_id', 'home8_id', 'home9_id', 'home10_id', 'home11_id'
    ]].values.tolist()
    m_i_df["score_home"], m_i_df["score_away"] = m_i_df["score"].str.split(
        '-', 1).str
    m_i_df["score_home"] = m_i_df["score_home"].apply(int)
    m_i_df["score_away"] = m_i_df["score_away"].apply(int)
    m_i_df["away_players"] = m_i_df[[
        'away1_id', 'away2_id', 'away3_id', 'away4_id', "away5_id", "away6_id",
        "away7_id", 'away8_id', 'away9_id', 'away10_id', 'away11_id'
    ]].values.tolist()
    m_i_df["home_players"] = m_i_df.loc[:, [
        'home1_id', 'home2_id', 'home3_id', 'home4_id', 'home5_id', 'home6_id',
        'home7_id', 'home8_id', 'home9_id', 'home10_id', 'home11_id'
    ]].values.tolist()
    m_i_df["away_players_bd"] = m_i_df["away_players"].apply(eletkor_to_list)
    m_i_df["home_players_bd"] = m_i_df["home_players"].apply(eletkor_to_list)
    m_i_df["away_players_bd"] = m_i_df[[
        'date',
        'away_players_bd',
    ]].values.tolist()
    m_i_df["home_players_bd"] = m_i_df[[
        'date',
        'home_players_bd',
    ]].values.tolist()
    m_i_df["avg_home_age"] = m_i_df["home_players_bd"].apply(calculate_age)
    m_i_df["avg_away_age"] = m_i_df["away_players_bd"].apply(calculate_age)
    m_i_df["max_age"] = m_i_df["home_players_bd"].apply(calculate_max_age)
    m_i_df["max_age_place"] = m_i_df["home_players_bd"].apply(
        calculate_max_age_place)
    m_i_df["AVG_HEIGHT_H"] = m_i_df["home_players"].apply(avg_height)
    m_i_df["AVG_HEIGHT_A"] = m_i_df["away_players"].apply(avg_height)
    m_i_df["stad_cap_a"] = m_i_df.away_team.apply(stad_cap)
    m_i_df["stad_cap_h"] = m_i_df.home_team.apply(stad_cap)
    m_i_df["all_goals"] = m_i_df["score_away"] + m_i_df["score_home"]
    m_i_df["goal_difference"] = m_i_df["score_away"] - m_i_df["score_home"]
    m_i_df_proba = pd.concat([
        m_i_df[:], m_i_df["away_players"].apply(height_to_list).apply(
            pd.Series).rename(columns=lambda x: 'heights_a_' + str(x + 1))[:]
    ],
                             axis=1)
    m_i_df_proba = pd.concat([
        m_i_df_proba[:], m_i_df["home_players"].apply(height_to_list).apply(
            pd.Series).rename(columns=lambda x: 'heights_h_' + str(x + 1))[:]
    ],
                             axis=1)
    pos_height_df = pd.lreshape(
        m_i_df_proba, {
            'position': [
                'away1_pos', 'away2_pos', 'away3_pos', 'away4_pos',
                "away5_pos", "away6_pos", "away7_pos", 'away8_pos',
                'away9_pos', 'away10_pos', 'away11_pos', 'home1_pos',
                'home2_pos', 'home3_pos', 'home4_pos', 'home5_pos',
                'home6_pos', 'home7_pos', 'home8_pos', 'home9_pos',
                'home10_pos', 'home11_pos'
            ],
            'heights': [
                'heights_a_1', 'heights_a_2', 'heights_a_3', 'heights_a_4',
                'heights_a_5', 'heights_a_6', "heights_a_7", "heights_a_8",
                "heights_a_9", 'heights_a_10', 'heights_a_11', 'heights_h_1',
                'heights_h_2', 'heights_h_3', 'heights_h_4', 'heights_h_5',
                'heights_h_6', "heights_h_7", "heights_h_8", "heights_h_9",
                'heights_h_10', 'heights_h_11'
            ]
        }).pipe(lambda x: x[["position", "heights"]])
    pos_height_df["heights"] = pos_height_df["heights"].apply(try_height_2)
    m_i_df = pd.concat([
        m_i_df[:],
        m_i_df["away_players_bd"].apply(calculate_valamelyik_age).apply(
            pd.Series).rename(columns=lambda x: 'ages_a_' + str(x + 1))[:]
    ],
                       axis=1)
    m_i_df = pd.concat([
        m_i_df[:],
        m_i_df["home_players_bd"].apply(calculate_valamelyik_age).apply(
            pd.Series).rename(columns=lambda x: 'ages_h_' + str(x + 1))[:]
    ],
                       axis=1)
    pos_age_df = pd.lreshape(
        m_i_df, {
            'position': [
                'away1_pos', 'away2_pos', 'away3_pos', 'away4_pos',
                "away5_pos", "away6_pos", "away7_pos", 'away8_pos',
                'away9_pos', 'away10_pos', 'away11_pos', 'home1_pos',
                'home2_pos', 'home3_pos', 'home4_pos', 'home5_pos',
                'home6_pos', 'home7_pos', 'home8_pos', 'home9_pos',
                'home10_pos', 'home11_pos'
            ],
            'ages': [
                'ages_a_1', 'ages_a_2', 'ages_a_3', 'ages_a_4', 'ages_a_5',
                'ages_a_6', "ages_a_7", "ages_a_8", "ages_a_9", 'ages_a_10',
                'ages_a_11', 'ages_h_1', 'ages_h_2', 'ages_h_3', 'ages_h_4',
                'ages_h_5', 'ages_h_6', "ages_h_7", "ages_h_8", "ages_h_9",
                'ages_h_10', 'ages_h_11'
            ]
        }).pipe(lambda x: x)[["position", "ages"]]

    print("EDDIG ELJUT - PROCESS")


    new_df = pd.merge(m_i_df[["away_team","home_team","date","avg_home_age","avg_away_age","result","score"]]\
    ,current_app.m_o_df[["odds","odds1" ,"odds2" ,"oddsx","team_id_at","team_id_ht","date","score"]]\
    ,how='left', left_on=["away_team","home_team","date","score"], right_on = ["team_id_at","team_id_ht","date","score"])

    new_df[["odds", "odds1", "odds2",
            "oddsx"]] = new_df[["odds", "odds1", "odds2",
                                "oddsx"]].applymap(oddsx_to_float)
    new_df = new_df.dropna(subset=['odds'])

    new_df["result"] = new_df["result"].apply(lambda x: float(x))

    new_df["win_as_udog_h"] = new_df.loc[:,
                                         ["odds1", "odds2", "result"]].apply(
                                             win_udog_h, axis=1)
    new_df["win_as_udog_a"] = new_df.loc[:,
                                         ["odds1", "odds2", "result"]].apply(
                                             win_udog_a, axis=1)

    new_df["lose_fav_h"] = new_df.loc[:, ["odds1", "odds2", "result"]].apply(
        lose_fav_h, axis=1)
    new_df["lose_fav_a"] = new_df.loc[:, ["odds1", "odds2", "result"]].apply(
        lose_fav_a, axis=1)

    new_df[
        "looser"] = new_df.loc[:,
                               ["team_id_ht", "team_id_at", "result"]].apply(
                                   looser, axis=1)
    new_df["looser_odds"] = new_df.loc[:, ["odds1", "odds2", "result"]].apply(
        looser_odds, axis=1)

    m_i_df["away_values"] = m_i_df.loc[:, ["date", "away_players"]].apply(
        player_value, axis=1)

    m_i_df["home_values"] = m_i_df.loc[:, ["date", "home_players"]].apply(
        player_value, axis=1)

    new_df = pd.merge(new_df,
                      m_i_df[[
                          "away_values", "home_values", "away_team",
                          "home_team", "date", "score"
                      ]],
                      how='left',
                      left_on=["away_team", "home_team", "date", "score"],
                      right_on=["away_team", "home_team", "date", "score"])

    value_a = m_i_df['away_values'].apply(pd.Series)
    value_h = m_i_df['home_values'].apply(pd.Series)

    value_a = value_a.rename(columns=lambda x: 'value_a_' + str(x + 1))
    value_h = value_h.rename(columns=lambda x: 'value_h_' + str(x + 1))

    # Column-name helpers: the 22 per-match player id / position columns, plus a
    # long-format (player_id, position) frame reused by several of the stats below.
    away_id_cols = ['away{}_id'.format(i) for i in range(1, 12)]
    home_id_cols = ['home{}_id'.format(i) for i in range(1, 12)]
    player_id_cols = away_id_cols + home_id_cols
    player_pos_cols = (['away{}_pos'.format(i) for i in range(1, 12)]
                       + ['home{}_pos'.format(i) for i in range(1, 12)])
    player_positions = pd.lreshape(m_i_df, {'player_id': player_id_cols,
                                            'position': player_pos_cols})[['player_id', 'position']]

    out = [
          {'most-used-formation': m_i_df[['home_formation', 'away_formation']].unstack().value_counts().index[0]},
          {'number-of-players-with-no-games': str(len(current_app.p_i_df) - len(
              current_app.p_i_df.loc[current_app.p_i_df.playerid.isin(
                  pd.DataFrame(m_i_df[player_id_cols].unstack().unique())[0])]))},

          {'player-with-highest-number-of-games': str(m_i_df[player_id_cols].unstack().value_counts().index[0])},
          {'player-with-highest-number-of-games-where-his-team-didnt-concede': int(pd.concat([
              pd.DataFrame(m_i_df.loc[(m_i_df['score_home'] == 0)][home_id_cols]),
              pd.DataFrame(m_i_df.loc[(m_i_df['score_away'] == 0)][away_id_cols])
          ]).unstack().value_counts().index[0])},
          {'most-games-played-in-same-position-by-player': str(
              player_positions.groupby(['player_id', 'position'], as_index=False).size().max())},

          {'most-different-positions-by-player': str(
              player_positions.groupby('player_id')['position'].nunique().max())},
          {'most-different-formations-by-player': str(pd.lreshape(m_i_df, {
              'player_id': player_id_cols,
              'formation': ['away_formation'] * 11 + ['home_formation'] * 11
          })[['player_id', 'formation']].groupby('player_id')['formation'].nunique().max())},

          {'largest-odds-overcome-in-game':new_df[new_df["result"]!=0.0]["odds"].max()},
          {'largest-height-difference-overcome-in-game':m_i_df.loc[:,["result","AVG_HEIGHT_H","AVG_HEIGHT_A"]].apply(height_diff_OC,axis=1).max()},

          {'longest-time-in-days-between-two-games-for-player':None},

          {'biggest-value-difference': str(m_i_df.loc[:, ["away_values", "home_values"]].apply(values_diff, axis=1).max())},

          {'biggest-value-difference-upset': int(abs(m_i_df.loc[:, ["away_values", "home_values", "result"]].apply(values_diff_ups, axis=1).min()))},  # an upset means the unexpected team won

          {'biggest-value-difference-with-higher-odds': int(new_df.loc[:, ["away_values", "home_values", "odds1", "odds2"]].apply(values_diff_ups_odds, axis=1).max())},

          {'biggest-stadium-capacity-difference-upset':None},

          {'capacity-of-stadium-of-team-with-most-games': pd.DataFrame(
              pd.lreshape(m_i_df, {'team_id': ['away_team', 'home_team'],
                                   'seats': ['stad_cap_a', 'stad_cap_h']})
              .groupby(['team_id', 'seats'], as_index=False).size()).idxmax()[0][1]},

          {'id-of-oldest-team-to-win-a-game':id_of_oldest_team_to_win_a_game(m_i_df)},

          {'biggest-age-difference-between-teams-match-id':int(m_i_df.iloc[abs(m_i_df["avg_away_age"]-m_i_df["avg_home_age"]).idxmax(),:]["mkey"])},

          {'median-of-winning-team-average-age':(m_i_df.loc[:,["result","avg_away_age","avg_home_age"]].apply(gyoztes_kor,axis=1)).median()},

          {'median-of-favorite-team-average-age':int(new_df.loc[:,["odds1" ,"odds2","avg_home_age","avg_away_age"]].apply(fav_age,axis=1).median())}, # favorite means has lower odds of winning

          {'median-of-underdog-team-average-age':int(new_df.loc[:,["odds1" ,"odds2","avg_home_age","avg_away_age"]].apply(udog_age,axis=1).median())}, # underdog means has higher odds of winning

          {'team-with-most-wins-as-underdog':pd.lreshape(new_df, {'team_id':['team_id_at','team_id_ht'],'wins_as_udog':["win_as_udog_a",'win_as_udog_h']}).pipe(lambda x: x[["team_id","wins_as_udog"]])\
           .groupby("team_id").agg({"wins_as_udog":"sum"})["wins_as_udog"].idxmax()},

          {'team-with-most-losses-as-favorite':pd.lreshape(new_df, {'team_id':['team_id_at','team_id_ht'],'lose_as_fav':["lose_fav_a",'lose_fav_h']}).pipe(lambda x: x[["team_id","lose_as_fav"]])\
           .groupby("team_id").agg({"lose_as_fav":"sum"})["lose_as_fav"].idxmax()},

          {'team-with-lowest-average-odds-of-draw':pd.lreshape(new_df, {'team_id':['team_id_at','team_id_ht'],'oddsx':["oddsx",'oddsx']}).pipe(lambda x: x[["team_id","oddsx"]])\
           .groupby("team_id").agg({"oddsx":"mean"})["oddsx"].idxmin()},

          {'position-with-highest-average-value':None},

          {'position-with-largest-average-height':pos_height_df.groupby("position").agg({"heights":"mean"})["heights"].idxmax()},

          {'position-with-youngest-average-age':pos_age_df.groupby("position").agg({"ages":"mean"})["ages"].idxmin()},

          {'goalkeeper-with-most-clean-sheets':None},# birth date of the goalkeeper who concedes the most goals on average

          {'stadium-capactiy-of-team-with-most-avg-goals-in-a-game':None},# stadium capacity of the team whose matches have the most goals on average

          {'team-with-highest-profit-for-losing':int(new_df[["looser_odds","looser"]].groupby("looser")["looser_odds"].sum().idxmax())},# the team that yields the largest total profit if you bet against them in every match (always with the same stake)

          {'largest-std-in-goal-difference-team': int(pd.lreshape(m_i_df, {
              'team_id': ['away_team', 'home_team'],
              'goal_difference': ['goal_difference', 'goal_difference']})
              [['team_id', 'goal_difference']]
              .groupby('team_id').agg({'goal_difference': 'std'})['goal_difference'].idxmax())},  # the team with the largest standard deviation in goal difference

          {'player-with-most-different-teams': int(pd.lreshape(m_i_df, {
              'player_id': player_id_cols,
              'team_id': ['away_team'] * 11 + ['home_team'] * 11})
              .groupby('player_id')['team_id'].nunique().idxmax())},  # the player who appeared for the most different teams

          {'longest-losing-streak-team':None},# the longest losing streak by team

          {'longest-home-winning-streak-stadium-capacity':None},# capacity of the stadium hosting the longest home winning streak

          {'win-ratio-of-actual-highest-rated-player':None},# average win ratio of the currently most valuable player

          {'oldest-player-to-win-a-home-game':  # the oldest player on the pitch in a winning home match
           m_i_df.iloc[m_i_df[m_i_df["result"] == 1]["max_age"].idxmax(), :]["home_players"][
               m_i_df.iloc[m_i_df[m_i_df["result"] == 1]["max_age"].idxmax(), :]["max_age_place"][0]]}
          ]

    with open('output.json', 'w') as fp:
        json.dump(out, fp)
    return 'FING'
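
# A minimal, self-contained sketch (hypothetical data, unrelated to the app above) of the
# pd.lreshape pattern this snippet relies on: paired wide columns are melted into long
# format, with the i-th column name of each list landing in the same output row.
import pandas as pd

_matches = pd.DataFrame({
    'home_team': [1, 2],
    'away_team': [3, 4],
    'stad_cap_h': [50000, 30000],
    'stad_cap_a': [20000, 45000],
})
_long = pd.lreshape(_matches, {'team_id': ['home_team', 'away_team'],
                               'seats': ['stad_cap_h', 'stad_cap_a']})
# _long has one row per (match, side): team_id 1, 2, 3, 4 with seats 50000, 30000, 20000, 45000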
示例#14
0
def compute(hi,d_hrs,d_divo,period=3,transform=1):   
    #compute moments; `period` says how many years correspond to one period
   
    #Get Date at Interview   
    hi.insert(0, 'IDN', range(0,  len(hi)))   
    hi['res']=hi['NUMUNION']+hi['NUMCOHMR']   
       
    #Get Duration bins   
    bins_d=np.linspace(0,1200,int((100/period)+1))   
    bins_d_label=np.linspace(1,len(bins_d)-1,len(bins_d)-1)   
       
    ##########################   
    #Gen cohabitation Dataset   
    #########################   
       
    #Get date at interview   
    hi['int']=hi['IDATMM']+(hi['IDATYY']-1900)*12   
      
    #Gen age at interview  
    hi['ageint']=round((((hi['IDATYY']-1900)*12+hi['IDATMM'])-hi['birth_month'])/12,0)  
       
    #Take only if cohabitations   
    coh=hi[(hi['NUMUNION']-hi['NUMMAR']>0) |  (hi['NUMCOHMR']>0)].copy()   
       
       
    #Create number of cohabitations   
    coh['num']=0.0  
    for i in range(9):   
        if np.any(coh['HOWBEG0'+str(i+1)]=='coh'):
            coh.loc[coh['HOWBEG0'+str(i+1)]=='coh','num']=coh.loc[coh['HOWBEG0'+str(i+1)]=='coh','num']+1.0   
               
    #Expand the data       
    cohe=coh.loc[coh.index.repeat(np.array(coh.num,dtype=np.int32))]   
       
       
    #Link each cohabitation to relationship number   
    cohe['rell'] = cohe.groupby(['IDN']).cumcount()+1   
    cohe['cou']=1   
    cohe['rel']=None   
    for i in range(9):   
        if np.any(coh['HOWBEG0'+str(i+1)]=='coh'):
            cohe.loc[(cohe['HOWBEG0'+str(i+1)]=='coh') & (cohe['rell']==cohe['cou']),'rel']=i+1   
            cohe.loc[cohe['HOWBEG0'+str(i+1)]=='coh','cou']= cohe.loc[cohe['HOWBEG0'+str(i+1)]=='coh','cou']+1   
           
    #Get beginning and end of relationship
    cohe['beg']=-1   
    cohe['endd']=-1   
    cohe['how']=-1   
    cohe['mar']=-1       
    for i in range(9):   
        cohe.loc[(i+1==cohe['rel']),'beg']=cohe.loc[(i+1==cohe['rel']),'BEGDAT0'+str(i+1)]   
        cohe.loc[(i+1==cohe['rel']),'endd']=cohe.loc[(i+1==cohe['rel']),'ENDDAT0'+str(i+1)]   
        cohe.loc[(i+1==cohe['rel']),'how']=cohe.loc[(i+1==cohe['rel']),'HOWEND0'+str(i+1)]   
        cohe.loc[(i+1==cohe['rel']),'mar']=cohe.loc[(i+1==cohe['rel']),'MARDAT0'+str(i+1)]   
        #add here an indicator of whether it should be the unilateral divorce scenario
           
    #Get how relationship end   
    cohe['fine']='censored'   
    cohe.loc[cohe['how']=='sep','fine']='sep'   
    cohe.loc[cohe['how']=='div','fine']='mar'   
    cohe.loc[(cohe['how']=='intact') & (cohe['mar']>1),'fine']='mar'   
       
    #Replace censored date if still together   
    cohe['end']=-1   
    cohe.loc[cohe['fine']=='sep','end']=cohe.loc[cohe['fine']=='sep','endd']   
    cohe.loc[cohe['fine']=='mar','end']=cohe.loc[cohe['fine']=='mar','mar']   
    cohe.loc[cohe['fine']=='censored','end']=cohe.loc[cohe['fine']=='censored','int']   
       
    #Duration   
    cohe['dur']=cohe['end']-cohe['beg']   
       
    #Keep if no error for duration   
    cohe=cohe[(cohe['dur']>0) & (cohe['dur']<2000)]   
       
    #Transform Duration in Years   
    cohe['dury'] = pd.cut(x=cohe['dur'], bins=bins_d,labels=bins_d_label)    
       
    cohe['dury']=cohe['dury'].astype(float)     
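
    # Illustration (hypothetical durations, period=3): bins_d spans 0-1200 months in
    # ~36-month steps, so pd.cut maps a duration in months onto a 3-year bin label, e.g.
    # pd.cut(pd.Series([5, 40, 110]), bins=np.linspace(0, 1200, 34),
    #        labels=np.linspace(1, 33, 33)).astype(float)  ->  1.0, 2.0, 4.0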
       
    #Eliminate non useful things   
    del coh   
       
    ##########################   
    #Gen marriage Dataset   
    #########################   
       
    #Take only if marriages   
    mar=hi[hi['NUMMAR']>0].copy()   
       
    #Create number of cohabitations   
    mar['num']=0   
    for i in range(9):   
        mar.loc[mar['MARDAT0'+str(i+1)]>0,'num']=mar.loc[mar['MARDAT0'+str(i+1)]>0,'num']+1   
               
    #Expand the data       
    mare=mar.loc[mar.index.repeat(mar.num)]   
       
       
    #Link each marriage to relationship number   
    mare['rell'] = mare.groupby(['IDN']).cumcount()+1   
    mare['cou']=1   
    mare['rel']=None   
    for i in range(9):   
        mare.loc[(mare['MARDAT0'+str(i+1)]>0) & (mare['rell']==mare['cou']),'rel']=i+1   
        mare.loc[mare['MARDAT0'+str(i+1)]>0,'cou']= mare.loc[mare['MARDAT0'+str(i+1)]>0,'cou']+1   
           
    #Get beginning and end of relationship
    mare['beg']=-1   
    mare['endd']=-1   
    mare['how']=-1   
    mare['mar']=-1       
    for i in range(9):   
        mare.loc[(i+1==mare['rel']),'beg']=mare.loc[(i+1==mare['rel']),'MARDAT0'+str(i+1)]   
        mare.loc[(i+1==mare['rel']),'endd']=mare.loc[(i+1==mare['rel']),'ENDDAT0'+str(i+1)]   
        mare.loc[(i+1==mare['rel']),'how']=mare.loc[(i+1==mare['rel']),'HOWEND0'+str(i+1)]   
       
           
    #Get how relationship end   
    mare['fine']='censored'   
    mare.loc[mare['how']=='div','fine']='div'   
       
       
    #Replace censored date if still together   
    mare['end']=-1   
    mare.loc[mare['fine']=='div','end']=mare.loc[mare['fine']=='div','endd']   
    mare.loc[mare['fine']=='censored','end']=mare.loc[mare['fine']=='censored','int']   
       
    #Duration   
    mare['dur']=mare['end']-mare['beg']   
       
    #Keep if no error for duration   
    mare=mare[(mare['dur']>0) & (mare['dur']<2000)]   
       
    #Transform Duration in Years   
    mare['dury'] = pd.cut(x=mare['dur'], bins=bins_d,labels=bins_d_label)    
       
    mare['dury']=mare['dury'].astype(float)    
       
    del mar   
       
    #############################   
    #Build relationship by month   
    ##############################   
       
    #Eliminate observation if info on beg-end not complete   
    #for i in range(9):   
     #   hi=hi[(np.isfinite(hi['BEGDAT0'+str(i+1)])) & (hi['BEGDAT0'+str(i+1)]<3999)]   
           
    #Get the dates at which the respondent is 20, 25, ..., 50 years old (7 ages)
    for j in range(7):   
        hi['time_'+str(20+(j)*5)]=hi['DOBY']*12+hi['DOBM']+(20+(j)*5)*12   
           
    #Get the status   
    for j in range(7):   
           
        #Create the variable of Status   
        hi['status_'+str(20+(j)*5)]='single'   
           
   
           
        for i in range(9):   
            if np.any(hi['HOWBEG0'+str(i+1)]!=None):
               
                #Get if in couple   
                hi.loc[(hi['time_'+str(20+(j)*5)]>=hi['BEGDAT0'+str(i+1)]) & (hi['BEGDAT0'+str(i+1)]<3999) &   
                       (((hi['time_'+str(20+(j)*5)]<=hi['ENDDAT0'+str(i+1)]) & (hi['ENDDAT0'+str(i+1)]>0))  |   
                        (hi['ENDDAT0'+str(i+1)]==0) | (hi['WIDDAT0'+str(i+1)]>0) )  
                       ,'status_'+str(20+(j)*5)]='mar'   
                          
            if np.any(hi['HOWBEG0'+str(i+1)]=='coh'):
                #Substitute if actually cohabitation    
                hi.loc[(hi['time_'+str(20+(j)*5)]>=hi['BEGDAT0'+str(i+1)]) & (hi['BEGDAT0'+str(i+1)]<3999) &   
                       (((hi['time_'+str(20+(j)*5)]<=hi['ENDDAT0'+str(i+1)]) & (hi['ENDDAT0'+str(i+1)]>0))  |    
                        (hi['ENDDAT0'+str(i+1)]==0) | (hi['WIDDAT0'+str(i+1)]>0) ) &    
                        (hi['status_'+str(20+(j)*5)]=='mar') &    
                       (hi['HOWBEG0'+str(i+1)]=='coh')    &    
                       ((hi['MARDAT0'+str(i+1)]==0) | (hi['MARDAT0'+str(i+1)]>hi['time_'+str(20+(j)*5)]))        
                       ,'status_'+str(20+(j)*5)]='coh'    
                      
    #Create the variables ever cohabited and ever married   
    for j in range(7):   
           
        #Create the variable of ever married or cohabit   
        hi['everm_'+str(20+(j)*5)]=0.0   
        hi['everc_'+str(20+(j)*5)]=0.0   
           
        for i in range(9):   
              
            #if(np.any(hi['HOWBEG0'+str(i+1)])=='coh'):  
                #Get if ever cohabited    
                #hi.loc[((hi['everc_'+str(20+(max(j-1,0))*5)]>=0.1) | ((hi['HOWBEG0'+str(i+1)]=='coh') & (hi['time_'+str(20+(j)*5)]>=hi['BEGDAT0'+str(i+1)]))),'everc_'+str(20+(j)*5)]=1.0   
            hi.loc[(hi['everc_'+str(20+(max(j-1,0))*5)]>=0.1),'everc_'+str(20+(j)*5)]=1.0  
            try: 
                hi.loc[((hi['HOWBEG0'+str(i+1)]=='coh') & (hi['time_'+str(20+(j)*5)]>=hi['BEGDAT0'+str(i+1)])),'everc_'+str(20+(j)*5)]=1.0   
            except: 
                pass 
                  
            #Get if ever married
            hi.loc[((hi['everm_'+str(20+(max(j-1,0))*5)]>=0.1) |  (hi['time_'+str(20+(j)*5)]>=hi['MARDAT0'+str(i+1)])),'everm_'+str(20+(j)*5)]=1.0   
                   
    ######################################   
    #Build employment by status in 1986   
    ######################################   
    empl=hi[(hi['M2DP01']=='FEMALE') & (hi['weeks']<99)].copy()   
    empl['stat']='single'   
    empl['dist']=99999   
    for j in range(7):   
        empl.loc[np.abs(empl['time_'+str(20+(j)*5)]-86*12)<empl['dist'],'stat']=hi['status_'+str(20+(j)*5)]   
               
    ##########################   
    #BUILD HAZARD RATES   
    #########################    
       
    #Hazard of Separation   
    hazs=list()   
    hazs=hazards(cohe,'sep','dury','fine',hazs,int(6/period),'SAMWT')   
       
    #Hazard of Marriage   
    hazm=list()   
    hazm=hazards(cohe,'mar','dury','fine',hazm,int(6/period),'SAMWT')   
       
    #Hazard of Divorce   
    hazd=list()   
    hazd=hazards(mare,'div','dury','fine',hazd,int(12/period),'SAMWT')   
    
    #Eventually transform Hazards pooling more years together
    if transform>1:
        
        #Divorce
        hazdp=list()
        pop=1
        for i in range(int(12/(period*transform))):
            haz1=hazd[transform*i]*pop
            haz2=hazd[transform*i+1]*(pop-haz1)
            hazdp=[(haz1+haz2)/pop]+hazdp 
            pop=pop-(haz1+haz2)
        hazdp.reverse()   
        hazdp=np.array(hazdp).T 
        hazd=hazdp
            
        #Separation and Marriage
        hazsp=list()
        hazmp=list()
        pop=1
        for i in range(int(6/(period*transform))):
            hazs1=hazs[transform*i]*pop
            hazm1=hazm[transform*i]*pop
            
            hazs2=hazs[transform*i+1]*(pop-hazs1-hazm1)
            hazm2=hazm[transform*i+1]*(pop-hazs1-hazm1)
            hazsp=[(hazs1+hazs2)/pop]+hazsp
            hazmp=[(hazm1+hazm2)/pop]+hazmp
            pop=pop-(hazs1+hazs2+hazm1+hazm2)
            
        hazsp.reverse()   
        hazsp=np.array(hazsp).T 
        hazs=hazsp
        
        hazmp.reverse()   
        hazmp=np.array(hazmp).T 
        hazm=hazmp
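
        # Worked example of the pooling above (transform=2, starting pop=1): with yearly
        # hazards h1=0.10 and h2=0.08, the pooled two-period hazard is
        # (0.10*1 + 0.08*(1-0.10)) / 1 = 0.172, and the population still at risk for the
        # next pair of periods becomes 1 - 0.172 = 0.828.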
       
    ########################################   
    #Construct share of each relationship   
    #######################################   
    mar=np.zeros(6)   
    coh=np.zeros(6)   
    emar=np.zeros(6)   
    ecoh=np.zeros(6)   
       
    for j in range(6):   
        mar[j]=np.average(hi['status_'+str(20+(j)*5)]=='mar', weights=np.array(hi['SAMWT']))   
        coh[j]=np.average(hi['status_'+str(20+(j)*5)]=='coh', weights=np.array(hi['SAMWT']))   
        emar[j]=np.average(hi['everm_'+str(20+(j)*5)], weights=np.array(hi['SAMWT']))   
        ecoh[j]=np.average(hi['everc_'+str(20+(j)*5)], weights=np.array(hi['SAMWT']))   
           
           

    #########################################   
    #Create the age at unilateral divorce+   
    #regression on the effect of unilateral divorce   
    ###########################################   
       
    #Number of relationships for the person   
    hi['numerl']=0.0   
       
    #List of variables to keep   
    keep_var=list()   
    keep_var=keep_var+['numerl']+['state']+['SAMWT']   
       
    for i in range(9):   
           
        #Make sure that some relationship of order i exist   
        if (np.any(hi['BEGDAT0'+str(i+1)])):   
               
            #Add relationship order   
            hi['order'+str(i+1)]=np.nan   
            hi.loc[np.isnan(hi['BEGDAT0'+str(i+1)])==False,'order'+str(i+1)]=i+1   
               
            #Add number of relationships   
            hi.loc[np.isnan(hi['BEGDAT0'+str(i+1)])==False,'numerl']+=1.0   
               
            #Get whether the relationship started in marriage or cohabitation   
            hi['imar'+str(i+1)]=np.nan   
            hi.loc[hi['HOWBEG0'+str(i+1)]=='coh','imar'+str(i+1)]=0.0   
            hi.loc[hi['HOWBEG0'+str(i+1)]=='mar','imar'+str(i+1)]=1.0   
               
            #Get age at relationship   
            hi['iage'+str(i+1)]=np.nan   
            hi.loc[np.isnan(hi['BEGDAT0'+str(i+1)])==False,'iage'+str(i+1)]=round((hi['BEGDAT0'+str(i+1)]-hi['birth_month'])/12)   
               
            #Get if unilateral divorce when relationship started   
            hi['unid'+str(i+1)]=np.nan   
            hi.loc[np.isnan(hi['BEGDAT0'+str(i+1)])==False,'unid'+str(i+1)]=0.0   
            hi.loc[(round(hi['BEGDAT0'+str(i+1)]/12+1900)>=hi['unil']) & (hi['unil']>0.1),'unid'+str(i+1)]=1.0   
               
            #Year Relationship Started
            hi['year'+str(i+1)]=np.nan   
            hi.loc[np.isnan(hi['BEGDAT0'+str(i+1)])==False,'year'+str(i+1)]=round(hi['BEGDAT0'+str(i+1)]/12+1900)   
                          
            #Keep variables   
            keep_var=keep_var+['year'+str(i+1)]+['unid'+str(i+1)]+['iage'+str(i+1)]+['imar'+str(i+1)]+['order'+str(i+1)]   
       
           
           
    #New Dataset to reshape   
    hi2=hi[keep_var]   
       
    #Reshape Dataset
    years = ([col for col in hi2.columns if col.startswith('year')])   
    unids = ([col for col in hi2.columns if col.startswith('unid')])   
    iages = ([col for col in hi2.columns if col.startswith('iage')])   
    imars = ([col for col in hi2.columns if col.startswith('imar')])   
    order = ([col for col in hi2.columns if col.startswith('order')])   
       
    hi3 = pd.lreshape(hi2, {'year' : years,'unid' : unids,'iage' : iages,'imar' : imars,'order' : order})    
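
    # Note on the reshape above: pd.lreshape melts the five lists jointly, so each
    # respondent's i-th relationship gets its year/unid/iage/imar/order in the same
    # long-format row, while unlisted columns (e.g. SAMWT, state) are repeated as id variables.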
       
    #Eliminate if missing   
    hi3 = hi3.replace([np.inf, -np.inf], np.nan)
    hi3 = hi3.dropna(subset=['imar','unid'])
       
    #Regression   
    FE_ols = smf.wls(formula='imar ~ unid+C(iage)+C(state)+C(year)',weights=hi3['SAMWT'], data = hi3.dropna()).fit()   
    #FE_ols = smf.ols(formula='imar ~ unid+C(iage)+C(state)+C(year)', data = hi3.dropna()).fit()   
    beta_unid=FE_ols.params['unid']   
       
    #Get age at which unilateral divorced was introduced   
    hi['age_unid']=0.0   
    hi.loc[hi['unil']==0,'age_unid']=1000.0   
    hi.loc[hi['unil']!=0,'age_unid']=hi['unil']-hi['birth']     
         
       
    #Get age in the second survey   
    date_age=pd.read_csv('age_drop.csv')   
       
    #In hi, set age_unid to -1 if unilateral divorce was introduced before the respondent was born
    hi.loc[hi['age_unid']<0,'age_unid']=-1   
       
       

    ############################## 
    #Compute hours using the psid 
    ################################ 
     
    #Account for the survey to be retrospective 
    d_hrs['age']=d_hrs['age']-1.0 
     
    #Trim if hrs>2000 
    d_hrs.loc[d_hrs['wls']>=2000,'wls']=2000 
     
    #First keep the right birth cohorts 
    d_hrs['birth']=d_hrs['year']-d_hrs['age'] 
    d_hrs=d_hrs[(d_hrs['birth']>=1940) & (d_hrs['birth']<1955)] 
  
    #Generate variables of interest 
    d_hrs['mar']=-1.0 
    d_hrs.loc[(d_hrs['mls']==1),'mar']=1.0 
    d_hrs.loc[(d_hrs['mls']>1) & (d_hrs['mls']<100),'mar']=0.0 
     
    #Get mean labor supply 
    mean_fls=np.average(d_hrs.loc[(d_hrs['age']>=20) & (d_hrs['age']<=60),'wls'])/2000 
       
    #New dataset  
    d_hrs2=d_hrs[(d_hrs['mar']>=0) & (d_hrs['year']>=1977)] 
    
    #Get ratio of average labor supply of married vs. unmarried, for ages 23-38 and 38-53
    fls_ratio=np.zeros((2))   
    fls_ratio[0]=np.average(d_hrs2.loc[(d_hrs2['mar']==1.0) & (d_hrs['age']>=23) &
                                    (d_hrs['age']<=38),'wls'])/np.average(d_hrs2.loc[(d_hrs2['mar']==0.0) &
                                    (d_hrs['age']>=23) & (d_hrs['age']<=38),'wls'])   
                
    fls_ratio[1]=np.average(d_hrs2.loc[(d_hrs2['mar']==1.0) & (d_hrs['age']>=38) &
                                    (d_hrs['age']<=53),'wls'])/np.average(d_hrs2.loc[(d_hrs2['mar']==0.0) &
                                    (d_hrs['age']>=38) & (d_hrs['age']<=53),'wls'])   
                    
                    
    #Get difference in male wages in marriage and cohabitation
    weightm=d_hrs2.loc[(d_hrs2['mar']==1.0) & (np.isnan(d_hrs2['ln_ly'])==False),'wls']
    weightc=d_hrs2.loc[(d_hrs2['mar']==0.0) & (np.isnan(d_hrs2['ln_ly'])==False),'wls']
    wage_ratio=np.average(d_hrs2.loc[(d_hrs2['mar']==1.0) & (np.isnan(d_hrs2['ln_ly'])==False),'ln_ly'],weights=weightm)-np.average(d_hrs2.loc[(d_hrs2['mar']==0.0) & (np.isnan(d_hrs2['ln_ly'])==False),'ln_ly'],weights=weightc)
               
    #######################################
    #Get divorce by income using PSID
    ########################################
    divR=np.average(d_divo.loc[(d_divo['ln_ly']>d_divo['wtmedian']),'div'])
    divP=np.average(d_divo.loc[(d_divo['ln_ly']<d_divo['wtmedian']),'div'])
    marR=np.average(d_divo.loc[(d_divo['ln_ly']>d_divo['wtmedian']),'mar'])
    marP=np.average(d_divo.loc[(d_divo['ln_ly']<d_divo['wtmedian']),'mar'])
    div_ratio=(divR/marR)/(divP/marP)
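    # div_ratio compares the divorce-to-marriage ratio of above-median earners with that of
    # below-median earners (a value above 1 means relatively more divorce among higher earners)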
     
    
    ########################################
    #FREQUENCIES
    #######################################
        
    def CountFrequency(my_list):    
     
        # Creating an empty dictionary     
        freq = {}    
        for item in my_list:    
            if (item in freq):    
                freq[item] += 1   
            else:    
                freq[item] = 1   
         
        #for key, value in freq.items():    
         #   print ("% d : % d"%(key, value))    
           
        return freq   
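
    # Note: collections.Counter(my_list) would produce the same item -> count mapping.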
           
    
    #Frequencies of age at introduction of unilateral divorce, by gender
    freq_pc=dict()
    freq_pc['male'] = CountFrequency(hi.loc[hi['M2DP01']=='MALE','age_unid'].tolist()) 
    freq_pc['female'] = CountFrequency(hi.loc[hi['M2DP01']=='FEMALE','age_unid'].tolist()) 
    freq_pc['share_female']=np.mean(hi['M2DP01']=='FEMALE')
       
    #Frequencies for age in the second wave   
    freq_i= CountFrequency(date_age['age'].tolist())   
      
    #Frequencies for age at interview
    freq_ai=CountFrequency(hi['ageint'].tolist())   
    
    #Frequencies of agents by age at unid and gender
    freq_nsfh = hi[['M2DP01','age_unid','SAMWT']]#hi.groupby(['M2DP01','age_unid'])['SAMWT'].count()
       
    #Get distribution of types using the psid
    freq_psid_tot=d_hrs[['age','unid']]
    freq_psid_par=d_hrs2[['age','unid','mar']]
    freq_psid_div=d_divo[['age','unid']]
        
       
    #Create a dictionary for saving simulated moments   
    listofTuples = [("hazs" , hazs), ("hazm" , hazm),("hazd" , hazd),("emar" , emar),  
                    ("ecoh" , ecoh), ("fls_ratio" , fls_ratio),("wage_ratio" , wage_ratio),("div_ratio" , div_ratio),
                    ("mean_fls" , mean_fls),("mar" , mar),("coh" , coh),  
                    ("freq_pc" , freq_pc), ("freq_i" , freq_i),("beta_unid" , beta_unid),("freq_ai" , freq_ai),
                    ("freq_nsfh" , freq_nsfh),("freq_psid_tot" , freq_psid_tot),("freq_psid_par" , freq_psid_par),("freq_psid_div" , freq_psid_div)]   
    dic_mom=dict(listofTuples)   
       
    del hi,hi2,hi3   
    return dic_mom   
示例#15
0
def main():
    evaluate_run = False

    results_folder = os.path.join(os.getcwd(), "results_"+walk+"/" + experiment)
    if not os.path.isdir(results_folder):
        print(colored("Error, " + results_folder + " does not exist", 'red'))
    else:
        print(colored("OK, " + results_folder + " exists", 'green'))


    for timeout_folder in natsorted(os.listdir(os.path.join(results_folder))):
        if timeout_folder.endswith("pickle"):
            continue

        print(colored("Timeout folder:", 'blue'), timeout_folder)
        df_kilo_timeout = pd.DataFrame()

        timeout = -1
        parameters = timeout_folder.split("_")
        for param in parameters:
            if param.startswith("timeout"):
                timeout = int(param.split("#")[-1]) * 10
                # print("\t timeoutR:",timeoutR)

        if timeout == -1:
            print(colored("\tWARNING: wrong timeout folder", 'red'))
            continue

        if os.path.isfile(os.path.join(results_folder, timeout_folder, "kiloLOG_timeout#" + str(timeout) + "_.pickle")):
            print("Already exists ",
                  os.path.join(results_folder, timeout_folder, "kiloLOG_timeout#" + str(timeout) + "_.pickle"))

        else:
            # print(colored(
            #     os.path.join(results_folder, pickle_file_root"_timeout#"+str(timeout*10)+"_.pickle"),
            #     'red'))
            # sys.exit()
            for filename in natsorted(os.listdir(os.path.join(results_folder, timeout_folder))):
                filename_seed = filename.split("_")[0].split("#")[-1]
                # print(filename)
                if filename.endswith("areaLOG_client.tsv"):
                    if not os.path.getsize(os.path.join(results_folder, timeout_folder, filename)) > 0:
                        print(colored("\tWARNING, empty file at:" + filename, 'red'))
                        continue
                    # print('\tfilename: ', filename)
                    df_area_client = pd.read_csv(os.path.join(results_folder, timeout_folder, filename), sep="\t",
                                                 header=None)

                if filename.endswith("areaLOG_server.tsv"):
                    if not os.path.getsize(os.path.join(results_folder, timeout_folder, filename)) > 0:
                        print(colored("\tWARNING, empty file at:" + filename, 'red'))
                        continue
                    # print('\tfilename: ', filename)
                    df_area_server = pd.read_csv(os.path.join(results_folder, timeout_folder, filename), sep="\t",
                                                 header=None)

                if filename.endswith("kiloLOG_client.tsv"):
                    if not os.path.getsize(os.path.join(results_folder, timeout_folder, filename)) > 0:
                        print(colored("\tWARNING, empty file at:" + filename, 'red'))
                        continue
                    # print('\tfilename: ', filename)
                    df_kilo_client = pd.read_csv(os.path.join(results_folder, timeout_folder, filename), sep="\t",
                                                 header=None)

                if filename.endswith("kiloLOG_server.tsv"):
                    if not os.path.getsize(os.path.join(results_folder, timeout_folder, filename)) > 0:
                        print(colored("\tWARNING, empty file at:" + filename, 'red'))
                        continue
                    # print('\tfilename: ', filename, end='\n')
                    df_kilo_server = pd.read_csv(os.path.join(results_folder, timeout_folder, filename), sep="\t",
                                                 header=None)
                    evaluate_run = True

                if evaluate_run:
                    print(colored("\tEvaluating run:" + filename_seed, 'blue'))


                    '''Kilo log part'''
                    if len(df_kilo_client.columns) > 145:
                        # print("Cutting null elements in client kilo df")
                        df_kilo_client.drop(df_kilo_client.columns[len(df_kilo_client.columns) - 1], axis=1, inplace=True)

                    if len(df_kilo_server.columns) > 145:
                        # print("Cutting null elements in server kilo df")
                        df_kilo_server.drop(df_kilo_server.columns[len(df_kilo_server.columns) - 1], axis=1, inplace=True)

                    col_kilo_labels = ['time']
                    for i in range(0, len(df_kilo_server.columns) - 1, 6):
                        #     print(i,end=", ")
                        col_kilo_labels += ['id' + str(i // 6), 'state' + str(i // 6), 'posx' + str(i // 6),
                                            'posy' + str(i // 6),
                                            'ori' + str(i // 6), 'same_state' + str(i // 6)]
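                    # e.g. col_kilo_labels = ['time', 'id0', 'state0', 'posx0', 'posy0', 'ori0',
                    #                         'same_state0', 'id1', 'state1', ...]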

                    col_kilo_to_drop = []
                    for i in range((len(df_kilo_server.columns) - 1) // 6):
                        #     print(i,end=", ")
                        col_kilo_to_drop += ['same_state' + str(i)]

                    df_kilo_server.columns = col_kilo_labels
                    df_kilo_client.columns = col_kilo_labels
                    df_kilo_server = df_kilo_server.drop(col_kilo_to_drop, axis=1)
                    df_kilo_client = df_kilo_client.drop(col_kilo_to_drop, axis=1)


                    '''Area LOG part'''
                    col_area_labels = ['time']
                    for i in range(0, len(df_area_server.columns) - 2, 6):
                        # print(i, end=", ")
                        col_area_labels += ['id' + str(i // 6), 'posx' + str(i // 6), 'posy' + str(i // 6),
                                            'color' + str(i // 6),
                                            'completed' + str(i // 6), 'contained' + str(i // 6)]

                    # Remove last empty col and assign labels to df_area_server
                    if len(df_area_server.columns) > 49:
                        # print("Cutting null elements in area server df")
                        df_area_server.drop(df_area_server.columns[len(df_area_server.columns) - 1], axis=1, inplace=True)
                    df_area_server.columns = col_area_labels

                    # The first df_area_client row contains garbage, so it is replaced
                    # with the second row (except for the time column); then the NaN
                    # columns in [:, 49:] are dropped
                    if len(df_area_client.columns) > 49:
                        # print("Cutting null elements in area client df")
                        df_area_client.loc[0, 1:] = df_area_client.loc[1, 1:]
                        df_area_client = df_area_client.drop(np.arange(49, len(df_area_client.columns)), axis=1)
                    df_area_client.columns = col_area_labels

                    area_pos_label = []
                    for i in range(num_areas):
                        area_pos_label += ["posx" + str(i)]
                        area_pos_label += ["posy" + str(i)]
                    areas_pos = df_area_client[area_pos_label].iloc[0, :].values
                    # print(areas_pos)
                    areas_pos = areas_pos.reshape(-1, 2)
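                    # the flat [x0, y0, x1, y1, ...] vector becomes a (num_areas, 2) array of
                    # (posx, posy) coordinates, one row per area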


                    color_list = ["color" + str(i) for i in range(num_areas)]
                    df_area3_s = df_area_server.iloc[:1, :][color_list]
                    df_area3_c = df_area_client.iloc[:1, :][color_list]
                    for i, idx in enumerate(range(1, len(df_area3_c.columns) * 2, 2)):
                        #     print(i, ' ', idx)
                        df_area3_c.insert(loc=idx, column='other_col' + str(i), value=df_area3_s.iloc[0][i])
                    client = [col for col in df_area3_c.columns if 'color' in col]
                    server = [col for col in df_area3_c.columns if 'other_col' in col]
                    df_area_colors = pd.lreshape(df_area3_c, {'color_client': client, 'color_server': server})
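                    # Illustration (hypothetical two-area frame): lreshape pairs the i-th 'color*'
                    # column with the i-th 'other_col*' column, yielding one row per area, e.g.
                    # pd.lreshape(pd.DataFrame({'color0': [0], 'other_col0': [1],
                    #                           'color1': [1], 'other_col1': [0]}),
                    #             {'color_client': ['color0', 'color1'],
                    #              'color_server': ['other_col0', 'other_col1']})
                    # -> rows (color_client, color_server) = (0, 1) and (1, 0)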
                    area_type = []
                    for area in df_area_colors.values:
                        if area[0] == 0 and area[1] == 0:
                            area_type += ['BB']
                        if area[0] == 0 and area[1] == 1:
                            area_type += ['BR']
                        if area[0] == 1 and area[1] == 0:
                            area_type += ['RB']
                        if area[0] == 1 and area[1] == 1:
                            area_type += ['RR']
                    df_area_colors.insert(loc=2, column='area_type', value=area_type)
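                    # area_type concatenates the client code and the server code
                    # (0 -> 'B', 1 -> 'R'), so e.g. 'BR' means 0 on the client and 1 on the server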

                    '''Post process server'''
                    for i_c, kilo_id in enumerate(np.arange(1, len(df_kilo_server.columns), 5)):
                        # print(colored("kilo_id:" + str((kilo_id - 1) // 5), 'blue'))
                        #     print(df_kilo_client.iloc[:20, kilo_id+2:kilo_id+4].values, end='\n\n')
                        kilo_pos = df_kilo_server.iloc[:, kilo_id + i_c + 2:kilo_id + i_c + 4].values
                        #     print(kilo_pos)
                        in_area = np.empty(kilo_pos.shape[0], dtype=int)
                        in_area.fill(-1)
                        for area_idx, area_pos in enumerate(areas_pos):
                            # print(area_idx, ' ', area_pos)
                            dist = np.linalg.norm(kilo_pos - area_pos, axis=1)
                            #     print(dist, end='\n\n')
                            in_area = np.where(dist < area_threshold, df_area_colors.iloc[area_idx][-1][::-1], in_area)
                        #     in_area = np.where(in_area == -1, np.NaN, in_area)
                        #     print(in_area)
                        df_kilo_server.insert(loc=int(kilo_id + i_c + 2), column='area_type' + str(i_c), value=in_area)

                    '''Post process client'''
                    for i_s, kilo_id in enumerate(np.arange(1, len(df_kilo_client.columns), 5)):
                        # print(colored("kilo_id:" + str((kilo_id - 1) // 5), 'blue'))
                        #     print(df_kilo_client.iloc[:20, kilo_id+2:kilo_id+4].values, end='\n\n')
                        kilo_pos = df_kilo_client.iloc[:, kilo_id + i_s + 2:kilo_id + i_s + 4].values
                        #     print(kilo_pos)
                        in_area = np.empty(kilo_pos.shape[0], dtype=int)
                        in_area.fill(-1)
                        for area_idx, area_pos in enumerate(areas_pos):
                            #     print(area_idx,' ', area_pos)
                            dist = np.linalg.norm(kilo_pos - area_pos, axis=1)
                            #     print(dist, end='\n\n')
                            in_area = np.where(dist < area_threshold, df_area_colors.iloc[area_idx][-1], in_area)
                        #     in_area = np.where(in_area == -1, np.NaN, in_area)
                        #     print(in_area)
                        df_kilo_client.insert(loc=int(kilo_id + i_s + 2), column='area_type' + str(i_s), value=in_area)

                    df_kilo_single_run = df_kilo_client.join(df_kilo_server, lsuffix='_c', rsuffix='_s')
                    df_kilo_single_run = df_kilo_single_run.set_index(df_kilo_single_run.index.astype(str) + '_' + filename_seed)

                    df_kilo_timeout = df_kilo_timeout.append(df_kilo_single_run)


                    evaluate_run = False

            '''Save pickle file'''
            df_kilo_timeout.to_pickle(os.path.join(results_folder, timeout_folder, "kiloLOG_timeout#"+str(timeout)+"_.pickle"))
            print("Saving at: ", os.path.join(results_folder, timeout_folder, "kiloLOG_timeout#"+str(timeout)+"_.pickle"))
            print("Changing dir")