if start == True: data[-1] = row.CompleteTimestamp else: data = list() data += [ temp_caseid, row.Activity, row.Resource, '', row.Amount, row.CompleteTimestamp ] start = False table.append(data) headers = [ 'CASE_ID', 'Activity', 'Resource', 'StartTimestamp', 'Amount', 'CompleteTimestamp' ] df = pd.DataFrame(table, columns=headers) eventlog = Eventlog(df) eventlog = eventlog.assign_timestamp(name='StartTimestamp', new_name='StartTimestamp', _format='%Y.%m.%d %H:%M:%S', errors='raise') eventlog = eventlog.assign_timestamp(name='CompleteTimestamp', new_name='CompleteTimestamp', _format='%Y/%m/%d %H:%M:%S', errors='raise') eventlog['Duration'] = (eventlog['CompleteTimestamp'] - eventlog['StartTimestamp']).apply(to_minute) eventlog.dropna(subset=['Resource', 'StartTimestamp', 'CompleteTimestamp'], inplace=True) eventlog.to_csv('../sample_data/BPIC2012.csv')
# Build the per-day scheduling input from the event log:
#   1. keep only events handled by an allowed resource,
#   2. drop cases that have more than 20 events,
#   3. keep only cases with at least one event starting on the target date,
#      truncated to events starting on or before that date,
#   4. bucket 'Amount' into 5 equal-width weight classes (1..5),
#   5. write the result to CSV and print a short summary.
# `eventlog` and `valid_resource_list` must already be defined above.

# Keep only events performed by resources in the allowed list.
eventlog = eventlog.loc[eventlog['Resource'].isin(valid_resource_list)]

# Exclude cases that contain more than 20 events.
event_count = eventlog.groupby('CASE_ID').Activity.count()
invalid_case = event_count[event_count > 20].astype(int)
invalid_case_list = list(invalid_case.index)
# .copy() so the column assignments below write to an independent frame
# instead of a filtered view (avoids pandas' SettingWithCopyWarning).
eventlog = eventlog.loc[~eventlog['CASE_ID'].isin(invalid_case_list)].copy()

# Extract every case (instance) that has an event on the target date,
# i.e. the date being scheduled.
eventlog['StartDate'] = eventlog['StartTimestamp'].dt.date
eventlog['CompleteDate'] = eventlog['CompleteTimestamp'].dt.date
d = '2012-03-10'
target_date = pd.to_datetime(d).date()
valid_case = eventlog.loc[eventlog['StartDate'] == target_date, 'CASE_ID'].unique()
eventlog = eventlog.loc[eventlog['CASE_ID'].isin(valid_case)]

# Drop events that start after the target date (keep up to that day only).
eventlog = eventlog.loc[eventlog['StartDate'] <= target_date].copy()

# Assign weights: split 'Amount' into 5 equal-width bins labelled 1..5.
max_amount = eventlog['Amount'].max()
print(max_amount)
# An explicit-edges variant was previously sketched here:
#   custom_bucket_array = np.linspace(0, max_amount, 10)
labels = list(range(1, 6))
if eventlog.empty:
    # pd.cut raises "Cannot cut empty array"; keep the column and its dtype.
    eventlog['weight'] = pd.Categorical([], categories=labels, ordered=True)
else:
    # Bin the filtered log itself. The previous code binned `df['Amount']`,
    # a different frame, so the bin edges came from unfiltered data and the
    # result depended on index alignment between the two frames.
    eventlog['weight'] = pd.cut(eventlog['Amount'], len(labels), labels=labels)

# NOTE(review): the file name says "0301" while the target date above is
# 2012-03-10 - confirm which one is intended.
eventlog.to_csv('../result/modi_BPI_2012_0301.csv')
print(eventlog)
print(len(eventlog['CASE_ID'].unique()))