Example #1
    def getUserReport_hourly(cluster, start='', stop='', top=5, account=None):
        # get the top users (default top=5) for each resource (TRES)
        fname = "{}/{}_{}".format(CSV_DIR, cluster,
                                  "assoc_usage_day_table.csv")
        df = pandas.read_csv(
            fname,
            usecols=['id', 'id_tres', 'alloc_secs', 'time_start'],
            dtype={'time_start': int})
        st, stp, df = MyTool.getDFBetween(df, 'time_start', start,
                                          stop)  # constrain to the requested time window
        sumDf = df.groupby(['id_tres', 'id']).sum()  # total usage per (TRES, user) over the window
        fname1 = "{}/{}_{}".format(CSV_DIR, cluster, "assoc_table.csv")
        userDf = pandas.read_csv(fname1,
                                 usecols=['id_assoc', 'user', 'acct'],
                                 index_col=0)
        sumDf = sumDf.join(userDf, on='id')
        if account:
            sumDf = sumDf[sumDf['acct'] == account]
        cpuIdx = sumDf.loc[(1, )].nlargest(top, 'alloc_secs').index  # TRES 1 = CPU
        memIdx = sumDf.loc[(2, )].nlargest(top, 'alloc_secs').index  # TRES 2 = memory
        nodeIdx = sumDf.loc[(4, )].nlargest(top, 'alloc_secs').index  # TRES 4 = node
        gpuIdx = sumDf.loc[(1001, )].nlargest(top, 'alloc_secs').index  # TRES 1001 = GPU

        # refine the top users' data at hourly resolution using hour_table
        fname2 = "{}/{}_{}".format(CSV_DIR, cluster,
                                   "assoc_usage_hour_table.csv")
        df = pandas.read_csv(
            fname2, usecols=['id', 'id_tres', 'time_start', 'alloc_secs'])
        st, stp, df = MyTool.getDFBetween(df, 'time_start', start, stop)
        # get top users data only
        dfg = df.groupby(['id_tres', 'id'])
        tresSer = {
            1: [],
            2: [],
            4: [],
            1001: []
        }  # {tres_id: [{'data': [[ms, value], ...], 'name': 'user(acct)'}, ...], ...}
        idxSer = {1: cpuIdx, 2: memIdx, 4: nodeIdx, 1001: gpuIdx}
        for tres in [1, 2, 4, 1001]:
            for uid in idxSer[tres]:
                topDf = dfg.get_group((tres, uid)).copy()  # copy to avoid SettingWithCopyWarning
                topDf['ts_ms'] = topDf['time_start'] * 1000  # epoch seconds -> ms for charting
                topDf['alloc_ratio'] = topDf['alloc_secs'] / 3600  # seconds per hour -> average unit count
                topLst = topDf[['ts_ms', 'alloc_ratio']].values.tolist()
                tresSer[tres].append({
                    'data': topLst,
                    'name': "{}({})".format(userDf.loc[uid, 'user'],
                                            userDf.loc[uid, 'acct'])
                })

        return st, stp, tresSer
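
These examples all rely on MyTool.getDFBetween, which is not shown. A minimal sketch of what it plausibly does, judging from the call sites (how '' and None bounds are resolved is an assumption, not confirmed by the source):

    def getDFBetween(df, col, start=None, stop=None):
        # Hypothetical helper: keep rows with start <= df[col] <= stop and
        # return the resolved bounds so callers can report the actual window.
        if start:
            df = df[df[col] >= int(start)]
        if stop:
            df = df[df[col] <= int(stop)]
        if df.empty:
            return 0, 0, df
        return int(df[col].min()), int(df[col].max()), df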
Example #2
    def sum_assoc_usage_day(cluster):
        # read in one year's usage table
        fname = "{}/{}_{}".format(CSV_DIR, cluster,
                                  "assoc_usage_day_table.csv")
        df = pandas.read_csv(fname, dtype={'time_start': int})
        start = int(time.time()) - 365 * 24 * 3600  # one year of history
        start, stop, df = MyTool.getDFBetween(df, 'time_start', start, None)

        # join with user
        fname1 = "{}/{}_{}".format(CSV_DIR, cluster, "assoc_table.csv")
        userDf = pandas.read_csv(fname1,
                                 usecols=['id_assoc', 'user', 'acct'],
                                 index_col=0)
        rlt = df.join(userDf, on='id')
        rlt.to_csv("{}/{}_{}".format(
            CSV_DIR, cluster, "assoc_usage_day_1year_combine_table.csv"),
                   index=False)

        # get summary data
        rlt = rlt[['id_tres', 'user', 'alloc_secs']]
        dfg = rlt.groupby(['id_tres', 'user'])
        sum_df = dfg.sum()
        df_lst = []
        for idx in [1, 2, 4, 1001]:  # CPU, mem, node, GPU
            tres_df = sum_df.loc[idx]
            tres_df = tres_df.sort_values('alloc_secs', ascending=False)
            tres_df = tres_df.reset_index('user')
            tres_df['id_tres'] = idx
            tres_df['rank'] = tres_df.index + 1
            df_lst.append(tres_df)
        sum_df = pandas.concat(df_lst, ignore_index=True)
        sum_df.to_csv("{}/{}_{}".format(CSV_DIR, cluster,
                                        "assoc_usage_day_1year_sum_table.csv"),
                      index=False)
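
The summary file written above can be read straight back; for instance, to look up one user's CPU rank (the cluster name 'slurm' and user 'alice' are illustrative):

    sum_df = pandas.read_csv("{}/{}_{}".format(
        CSV_DIR, 'slurm', "assoc_usage_day_1year_sum_table.csv"))
    cpu_rank = sum_df[(sum_df['id_tres'] == 1)
                      & (sum_df['user'] == 'alice')]['rank']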
Example #3
    def sum_job_step(cluster, days=30):
        start = int(time.time()) - days * ONE_DAY_SECS
        step_df = SlurmDBQuery.readClusterTable(cluster, 'step_table', [
            'job_db_inx', 'id_step', 'user_sec', 'user_usec', 'sys_sec',
            'sys_usec', 'time_start'
        ])
        s1, s2, step_df = MyTool.getDFBetween(step_df, 'time_start', start)
        dfg = step_df.groupby('job_db_inx')
        sum_df = dfg.sum()
        # total CPU time = whole seconds plus the microsecond remainders
        sum_df.insert(
            0, 'total_cpu', sum_df.user_sec + sum_df.sys_sec +
            (sum_df.user_usec + sum_df.sys_usec) / 1000000)
        sum_df = sum_df[['total_cpu']]
        #sum_df = sum_df.astype(int)  # would lose int dtype after the join because of missing rows
        #print("sum_df={}".format(sum_df))

        job_df = SlurmDBQuery.readClusterTable(cluster, 'job_table')
        s1, s2, job_df = MyTool.getDFBetween(job_df, 'time_start', start)
        comb_df = job_df.join(sum_df, on='job_db_inx')
        comb_df.to_csv("{}/{}_{}".format(CSV_DIR, cluster,
                                         "job_step_sum_table.csv"),
                       index=False)  # left join: jobs without steps keep NaN total_cpu
Example #4
    def readJobTable(cluster,
                     start=None,
                     stop=None,
                     fld_lst=None,
                     index_col=None,
                     time_col='time_submit'):
        f_name = "{}/{}_{}".format(CSV_DIR, cluster, "job_table.csv")
        df = pandas.read_csv(f_name, usecols=fld_lst, index_col=index_col)
        if time_col and (start or stop):
            logger.debug("start={},stop={}".format(start, stop))
            start, stop, df = MyTool.getDFBetween(df, time_col, start, stop)

        return start, stop, df
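
SlurmDBQuery.readClusterTable itself is not among these examples; by analogy with readJobTable above, a plausible sketch (the "<CSV_DIR>/<cluster>_<table>.csv" naming convention appears throughout the examples, the rest is an assumption):

    def readClusterTable(cluster, table_name, fld_lst=None, index_col=None):
        # Hypothetical sketch: load the per-cluster CSV dump of a Slurm DB table.
        f_name = "{}/{}_{}".format(CSV_DIR, cluster, table_name + ".csv")
        return pandas.read_csv(f_name, usecols=fld_lst, index_col=index_col)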
Example #5
    def getUserDoneJobReport(
        uid,
        cluster='slurm',
        days=3,
        output='JobID,JobIDRaw,JobName,AllocCPUS,AllocTRES,State,ExitCode,User,NodeList,Start,End'
    ):
        if days > 30:  # only 30 days of history is saved
            return None
        job_df = SlurmDBQuery.readClusterTable(cluster, 'job_step_sum_table')
        start = int(time.time()) - days * ONE_DAY_SECS
        s1, s2, job_df = MyTool.getDFBetween(job_df, 'time_start', start)

        user_job_df = job_df[job_df['id_user'] == uid]
        return user_job_df
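
Note that this reads the job_step_sum_table.csv produced by sum_job_step in Example #3, so the total_cpu column is available alongside the job fields. Usage is a one-liner (uid 1234 is illustrative):

    user_jobs = getUserDoneJobReport(1234, cluster='slurm', days=3)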
Example #6
    def readClusterTableBetween(cluster,
                                part_table_name,
                                fld_lst,
                                start=None,
                                stop=None,
                                index_col=None,
                                ts_col=None):
        df = SlurmDBQuery.readClusterTable(cluster, part_table_name, fld_lst,
                                           index_col)
        if ts_col:
            start, stop, df = MyTool.getDFBetween(df, ts_col, start, stop)
            return start, stop, df
        else:
            return 0, 0, df
Example #7
    def getNodeRunJobs(self, node, start, stop):
        df = pandas.read_csv(CSV_DIR + "slurm_cluster_job_table.csv",
                             usecols=[
                                 'id_job', 'id_user', 'nodelist',
                                 'nodes_alloc', 'state', 'time_start',
                                 'time_end', 'time_suspended'
                             ])
        start, stop, df = MyTool.getDFBetween(df, 'time_start', start, stop)
        df = df[df['nodes_alloc'] > 0]

        # jobs that ran on the given node
        if node:
            criterion = df['nodelist'].map(lambda x: node in MyTool.nl2flat(x))
            df = df[criterion]
        df = df.copy()  # avoid SettingWithCopyWarning on the filtered slice
        df['user'] = df['id_user'].map(MyTool.getUser)  # always needed: 'user' is returned below

        return df[[
            'id_job', 'user', 'time_start', 'time_end', 'time_suspended'
        ]]
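
MyTool.nl2flat expands a Slurm nodelist expression into a flat list of host names. A self-contained sketch covering the common prefix[a-b,c] form (real Slurm nodelists have more corner cases, so treat this as an approximation, not the actual implementation):

    import re

    def nl2flat(nodelist):
        # e.g. 'worker[01-03],worker10' -> ['worker01', 'worker02', 'worker03', 'worker10']
        flat = []
        for prefix, body in re.findall(r'([^,\[]+)(?:\[([^\]]+)\])?,?', nodelist):
            if not body:  # plain host name without a bracket expression
                flat.append(prefix)
                continue
            for part in body.split(','):
                if '-' in part:  # numeric range, zero-padded to the width of the lower bound
                    lo, hi = part.split('-')
                    flat.extend('{}{:0{}d}'.format(prefix, i, len(lo))
                                for i in range(int(lo), int(hi) + 1))
                else:
                    flat.append(prefix + part)
        return flat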
Example #8
    def getAccountUsage_hourly(cluster, start='', stop=''):
        # per-account usage at hourly resolution
        fname = "{}/{}_{}".format(CSV_DIR, cluster,
                                  "assoc_usage_hour_table.csv")
        df = pandas.read_csv(
            fname, usecols=['id', 'id_tres', 'time_start', 'alloc_secs'])
        st, stp, df = MyTool.getDFBetween(df, 'time_start', start, stop)

        # map each id_assoc (user association) to its account
        fname1 = "{}/{}_{}".format(CSV_DIR, cluster, "assoc_table.csv")
        userDf = pandas.read_csv(fname1,
                                 usecols=['id_assoc', 'acct'],
                                 index_col=0)
        # add acct to df
        df['acct'] = df['id'].map(userDf['acct'])
        df.drop('id', axis=1, inplace=True)

        # sum over users that share the same (id_tres, acct, time_start)
        sumDf = df.groupby(['id_tres', 'acct', 'time_start']).sum()
        sumDf['ts_ms'] = sumDf.index.get_level_values('time_start') * 1000
        # seconds per hour -> average unit count, e.g. 1 sec on node1 plus 1 sec on node2 = 2/3600 node
        sumDf['alloc_ratio'] = sumDf['alloc_secs'] / 3600

        return st, stp, sumDf
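
A sketch of how the returned frame might be consumed, e.g. pulling one account's hourly CPU series out of the (id_tres, acct, time_start) index for charting (the account name 'physics' is illustrative):

    st, stp, sumDf = getAccountUsage_hourly('slurm')
    cpuDf = sumDf.loc[(1, 'physics')]  # TRES 1 = CPU, all hours for one account
    series = cpuDf[['ts_ms', 'alloc_ratio']].values.tolist()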
Example #9
    def getClusterUsage_hourly(cluster, start, stop):
        # read from csv; TODO: assumes deleted=0 for all rows for now
        fname = "{}/{}_{}".format(CSV_DIR, cluster, "usage_hour_table.csv")
        df = pandas.read_csv(fname,
                             usecols=[
                                 'id_tres', 'time_start', 'count',
                                 'alloc_secs', 'down_secs', 'pdown_secs',
                                 'idle_secs', 'resv_secs', 'over_secs'
                             ])
        start, stop, df = MyTool.getDFBetween(df, 'time_start', start, stop)
        df['total_secs'] = (df['alloc_secs'] + df['down_secs'] +
                            df['pdown_secs'] + df['idle_secs'] +
                            df['resv_secs'])
        df['tdown_secs'] = df['down_secs'] + df['pdown_secs']
        # keep only self-consistent rows; 'count' is presumably the number of cores
        df = df[df['count'] * 3600 == df['total_secs']]
        df['ts_ms'] = df['time_start'] * 1000
        dfg = df.groupby('id_tres')

        cpuDf = dfg.get_group(1)
        memDf = dfg.get_group(2)
        #eneDf  = dfg.get_group(3)  # energy
        #nodeDf = dfg.get_group(4)  # not available

        return start, stop, cpuDf, memDf
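
The two frames give hourly cluster-level totals per TRES, from which a utilization ratio follows directly (an illustrative sketch):

    start, stop, cpuDf, memDf = getClusterUsage_hourly('slurm', None, None)
    cpuDf = cpuDf.copy()  # avoid SettingWithCopyWarning when adding a column
    cpuDf['util'] = cpuDf['alloc_secs'] / cpuDf['total_secs']  # fraction of core-hours allocated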