def validation(self, dataloader, use_cuda, verbose=True, prob=1.0, accuracy=False, parameters=None):
    """
    Compute the averaged loss per event of a generalized Hawkes process
    given observed sequences and the current model.
    :param dataloader: a pytorch batch-based data loader
    :param use_cuda: use cuda (true) or not (false)
    :param verbose: print the validation progress (true) or not (false)
    :param prob: a float number; the intensity and its integral are divided by this value
    :param accuracy: if True, return the estimation error of the model parameters
        with respect to the given ground-truth parameters instead of the loss
    :param parameters: None or a numpy array of ground-truth parameters used when accuracy is True
    """
    device = torch.device('cuda:0' if use_cuda else 'cpu')
    self.lambda_model_validation.to(device)
    self.lambda_model_validation.eval()

    Cs = torch.LongTensor(list(range(len(dataloader.dataset.database['type2idx']))))
    Cs = Cs.view(-1, 1)
    Cs = Cs.to(device)

    if dataloader.dataset.database['event_features'] is not None:
        all_event_feature = torch.from_numpy(dataloader.dataset.database['event_features'])
        FCs = all_event_feature.type(torch.FloatTensor)
        FCs = torch.t(FCs)  # (num_type, dim_features)
        FCs = FCs.to(device)
    else:
        FCs = None

    if not accuracy:
        start = time.time()
        loss = 0
        prob = np.array([prob])
        prob_tensor = torch.from_numpy(prob).type(torch.FloatTensor)
        prob_tensor = prob_tensor.to(device)  # keep the scaling factor on the same device as the intensities
        with torch.no_grad():
            for batch_idx, samples in enumerate(dataloader):
                ci, batch_dict = samples2dict(samples, device, Cs, FCs)
                lambda_t, Lambda_t = self.lambda_model_validation(batch_dict)
                lambda_t /= prob_tensor
                Lambda_t /= prob_tensor
                loss += self.loss_function(lambda_t, Lambda_t, ci)

                # display validation process
                if verbose and batch_idx % 100 == 0:
                    logger.info('Validation [{}/{} ({:.0f}%)]\t Time={:.2f}sec.'.format(
                        batch_idx * ci.size(0), len(dataloader.dataset),
                        100. * batch_idx / len(dataloader), time.time() - start))
        return loss / len(dataloader.dataset)
    else:
        with torch.no_grad():
            # estimation error of the learned parameters with respect to the ground truth
            loss = np.linalg.norm(
                list(self.lambda_model_validation.parameters())[1].data.cpu().numpy()
                - parameters) / self.num_type ** 2
        return loss
def print_info(self): """ Print basic information of the model. """ logger.info(self.model_name) self.lambda_model.print_info() logger.info("The loss function is {}.".format(self.loss_function))
def print_info(self): """ Print basic information of the kernel model. """ logger.info('The type of decay kernel: {}.'.format(self.kernel_type)) logger.info('The number of basis = {}.'.format( self.parameters.size(1)))
def print_info(self): """ Print basic information of the exogenous intensity function. """ logger.info('Exogenous intensity function: mu(t) = {}.'.format( self.exogenous_intensity_type)) logger.info('The number of event types = {}.'.format(self.num_type))
def print_info(self): """ Print basic information of the exogenous intensity function. """ logger.info("Endogenous impact function: phi_(kk')(t) = {}.".format( self.endogenous_impact_type)) logger.info('The number of event types = {}.'.format(self.num_type)) self.decay_kernel.print_info()
def aggregating(database, dt): """ Count the number of events in predefined time bins, and convert event sequences to aggregate time series :param database: the observed event sequences :param dt: a float number indicating the length of time bin. :return: the output's format is shown as follows: output = {'event_features': None or (De, C) float array of event's static features, C is the number of event types. 'type2idx': a Dict = {'event_name': event_index} 'idx2type': a Dict = {event_index: 'event_name'} 'seq2idx': a Dict = {'seq_name': seq_index} 'idx2seq': a Dict = {seq_index: 'seq_name'} 'sequences': a List = {seq_1, seq_2, ..., seq_N}. } For the i-th sequence: seq_i = {'times': (N,) float array of discrete timestamps, N = [(t_stop - t_start)/dt] is the number of bins. 'events': (N, C) int array of event types, events[n, c] counts the number of type-c events in the n-th bin 'seq_feature': None or (Ds,) float array of sequence's static feature. 't_start': a float number indicating the start timestamp of the sequence. 't_stop': a float number indicating the stop timestamp of the sequence. 'label': None or int/float number indicating the labels of the sequence} """ start = time.time() output = copy.deepcopy(database) num_types = len(database['type2idx']) logger.info('aggregation of event sequences is applied...') for i in range(len(database['sequences'])): seq_i = database['sequences'][i] num_bins = round((seq_i['t_stop'] - seq_i['t_start']) / dt) + 1 times = np.zeros((num_bins, )) events = np.zeros((num_bins, num_types)) for n in range(num_bins): times[n] = seq_i['t_start'] + (n + 1) * dt for k in range(seq_i['times'].shape[0]): n = int(round((seq_i['times'][k] - seq_i['t_start']) / dt)) c = seq_i['events'][k] events[n, c] += 1 output['sequences'][i]['times'] = times output['sequences'][i]['events'] = events if i % 1000 == 0: logger.info( '{} sequences have been aggregated... Time={}ms.'.format( i, round(1000 * (time.time() - start)))) return output
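# Usage sketch (illustrative, not part of the library): calling `aggregating` on a
# toy database that follows the documented schema. The toy data and dt=1.0 are
# assumptions for demonstration; the call relies on this module's imports
# (numpy as np, copy, time) and its `logger`.
import numpy as np

toy_database = {
    'event_features': None,
    'type2idx': {'A': 0, 'B': 1},
    'idx2type': {0: 'A', 1: 'B'},
    'seq2idx': {'seq_0': 0},
    'idx2seq': {0: 'seq_0'},
    'sequences': [{
        'times': np.array([0.2, 0.9, 1.7]),
        'events': np.array([0, 1, 0]),
        'seq_feature': None,
        't_start': 0.0,
        't_stop': 2.0,
        'label': None,
    }],
}

# Bin the events into intervals of length dt; the result replaces 'times' with
# bin end-points and 'events' with per-bin counts of each event type.
aggregated = aggregating(toy_database, dt=1.0)
print(aggregated['sequences'][0]['events'])  # (num_bins, num_types) count matrix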
def __init__(self, database, memorysize): """ :param database: the observed event sequences database = {'event_features': None or (C, De) float array of event's static features, C is the number of event types. 'type2idx': a Dict = {'event_name': event_index} 'idx2type': a Dict = {event_index: 'event_name'} 'seq2idx': a Dict = {'seq_name': seq_index} 'idx2seq': a Dict = {seq_index: 'seq_name'} 'sequences': a List = {seq_1, seq_2, ..., seq_N}. } For the i-th sequence: seq_i = {'times': (N,) float array of timestamps, N is the number of events. 'events': (N,) int array of event types. 'seq_feature': None or (Ds,) float array of sequence's static feature. 't_start': a float number indicating the start timestamp of the sequence. 't_stop': a float number indicating the stop timestamp of the sequence. 'label': None or int/float number indicating the labels of the sequence} :param memorysize: how many historical events remembered by each event """ self.event_cell = [] self.time_cell = [] self.database = database self.memory_size = memorysize for i in range(len(database['sequences'])): seq_i = database['sequences'][i] times = seq_i['times'] events = seq_i['events'] t_start = seq_i['t_start'] print(events.shape) for j in range(len(events)): target = events[j] # former = np.zeros((memorysize,), dtype=np.int) # former = np.random.permutation(len(self.database['type2idx'])) # former = former[:memorysize] former = np.random.choice(len(self.database['type2idx']), memorysize) target_t = times[j] former_t = t_start * np.ones((memorysize, )) if 0 < j < memorysize: former[-j:] = events[:j] former_t[-j:] = times[:j] elif j >= memorysize: former = events[j - memorysize:j] former_t = times[j - memorysize:j] self.event_cell.append((target, former, i)) self.time_cell.append((target_t, former_t)) logger.info('In this dataset, the number of events = {}.'.format( len(self.event_cell))) logger.info( 'Each event is influenced by its last {} historical events.'. format(self.memory_size))
def plot_and_save(self, infect: torch.Tensor, output_name: str = None): """ Plot endogenous impact function for all event types Args: :param infect: a (num_type, num_type+1, M) FloatTensor containing all endogenous impact :param output_name: the name of the output png file """ impact = infect.sum(2).data.cpu().numpy() plt.figure(figsize=(5, 5)) plt.imshow(impact) plt.colorbar() if output_name is None: plt.savefig('endogenous_impact.png') else: plt.savefig(output_name) plt.close("all") logger.info("Done!")
def validation(self, dataloader, use_cuda):
    """
    Compute the averaged loss per event of a generalized Hawkes process
    given observed sequences and the current model.
    :param dataloader: a pytorch batch-based data loader
    :param use_cuda: use cuda (true) or not (false)
    """
    device = torch.device('cuda:0' if use_cuda else 'cpu')
    self.lambda_model.to(device)
    self.lambda_model.eval()

    Cs = torch.LongTensor(list(range(len(dataloader.dataset.database['type2idx']))))
    Cs = Cs.view(-1, 1)
    Cs = Cs.to(device)

    if dataloader.dataset.database['event_features'] is not None:
        all_event_feature = torch.from_numpy(dataloader.dataset.database['event_features'])
        FCs = all_event_feature.type(torch.FloatTensor)
        FCs = torch.t(FCs)  # (num_type, dim_features)
        FCs = FCs.to(device)
    else:
        FCs = None

    start = time.time()
    loss = 0
    for batch_idx, samples in enumerate(dataloader):
        ci, batch_dict = samples2dict(samples, device, Cs, FCs)
        # accumulate the responsibility-weighted loss over all clusters and batches
        for m in range(self.num_cluster):
            weight = self.responsibility[batch_dict['sn'][:, 0], m]  # (batch_size, )
            lambda_t, Lambda_t = self.lambda_model[m](batch_dict)
            loss_m = self.loss_function(lambda_t, Lambda_t, ci)  # (batch_size, )
            loss += (weight * loss_m).sum() / loss_m.size(0)

        # display validation process
        if batch_idx % 100 == 0:
            logger.info('Validation [{}/{} ({:.0f}%)]\t Time={:.2f}sec.'.format(
                batch_idx * ci.size(0), len(dataloader.dataset),
                100. * batch_idx / len(dataloader), time.time() - start))
    return loss / len(dataloader.dataset)
def plot_and_save(self, mu_all: torch.Tensor, output_name: str = None): """ Plot the stem plot of exogenous intensity functions for all event types Args: :param mu_all: a (num_type, 1) FloatTensor containing all exogenous intensity functions :param output_name: the name of the output png file """ mu_all = mu_all.squeeze(1) # (C,) mu_all = mu_all.data.cpu().numpy() plt.figure(figsize=(5, 5)) plt.stem(range(mu_all.shape[0]), mu_all, '-') plt.ylabel('Exogenous intensity') plt.xlabel('Index of event type') if output_name is None: plt.savefig('exogenous_intensity.png') else: plt.savefig(output_name) plt.close("all") logger.info("Done!")
def plot_and_save(self, t_stop: float = 5.0, output_name: str = None): """ Plot decay function and its integration and save the figure as a png file Args: t_stop (float): the end of timestamp output_name (str): the name of the output png file """ dt = np.arange(0.0, t_stop, 0.01) dt = np.tile(dt, (1, 1)) dt = torch.from_numpy(dt) dt = dt.type(torch.FloatTensor) gt = self.values(dt) # t_start = torch.zeros(dt.size()) igt = self.integrations(dt) # print(gt.shape) plt.figure(figsize=(5, 5)) for k in range(gt.shape[2]): plt.plot(dt[0, :].cpu().numpy(), gt[0, :, k].cpu().numpy(), label='g_{}(t)'.format(k), c='r') plt.plot(dt[0, :].cpu().numpy(), igt[0, :, k].cpu().numpy(), label='G_{}(t)'.format(k), c='b') leg = plt.legend(loc='upper left', ncol=1, shadow=True, fancybox=True) leg.get_frame().set_alpha(0.5) plt.title('{} decay kernel and its integration'.format( self.kernel_type)) if output_name is None: plt.savefig('{}_decay_kernel.png'.format(self.kernel_type)) else: plt.savefig(output_name) plt.close("all") logger.info("Done!")
def print_info(self): """ Print basic information of the model. """ logger.info(self.model_name) for m in range(self.num_cluster): logger.info('Component {}, probability = {:.6f}'.format( m, self.prob_cluster[m])) self.lambda_model[m].print_info() logger.info("The loss function is {}.".format(self.loss_function))
def save_model(self, full_path, mode: str = 'entire'):
    """
    Save the trained model.
    :param full_path: the path of the output file
    :param mode: 'parameter' for saving only the parameters of the model,
        'entire' for saving the entire model
    """
    if mode == 'entire':
        torch.save(self.lambda_model, full_path)
        logger.info('The entire model is saved in {}.'.format(full_path))
    elif mode == 'parameter':
        torch.save(self.lambda_model.state_dict(), full_path)
        logger.info('The parameters of the model are saved in {}.'.format(full_path))
    else:
        logger.warning("'{}' is an undefined mode; the 'entire' mode is used instead.".format(mode))
        torch.save(self.lambda_model, full_path)
        logger.info('The entire model is saved in {}.'.format(full_path))
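# Usage sketch (illustrative): the save/restore round trip implied by `save_model`.
# The variable `estimator` and the file paths are assumptions; restoring parameters
# uses the standard PyTorch load_state_dict pattern rather than a method of this class.
import torch

# Save only the learnable parameters (the architecture must be rebuilt before loading).
estimator.save_model('hawkes_params.pt', mode='parameter')
state_dict = torch.load('hawkes_params.pt')
estimator.lambda_model.load_state_dict(state_dict)

# Alternatively, persist and reload the entire module object.
estimator.save_model('hawkes_full.pt', mode='entire')
estimator.lambda_model = torch.load('hawkes_full.pt')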
def fit(self, dataloader, optimizer, epochs: int, scheduler=None, sparsity: float = None, nonnegative=None, use_cuda: bool = False, validation_set=None, track_diagnostics=False): """ Learn parameters of a generalized Hawkes process given observed sequences :param dataloader: a pytorch batch-based data loader :param optimizer: the sgd optimization method defined by PyTorch :param epochs: the number of training epochs :param scheduler: the method adjusting the learning rate of SGD defined by PyTorch :param sparsity: None or a float weight of L1 regularizer :param nonnegative: None or a float lower bound, typically the lower bound = 0 :param use_cuda: use cuda (true) or not (false) :param validation_set: None or a validation dataloader :param track_diagnostics: Set to True to return historical loss values and weights. """ device = torch.device('cuda:0' if use_cuda else 'cpu') self.lambda_model.to(device) best_model = None self.lambda_model.train() if nonnegative is not None: clipper = LowerBoundClipper(nonnegative) Cs = torch.LongTensor( list(range(len(dataloader.dataset.database['type2idx'])))) Cs = Cs.view(-1, 1) Cs = Cs.to(device) if dataloader.dataset.database['event_features'] is not None: all_event_feature = torch.from_numpy( dataloader.dataset.database['event_features']) FCs = all_event_feature.type(torch.FloatTensor) FCs = torch.t(FCs) # (num_type, dim_features) FCs = FCs.to(device) else: FCs = None if validation_set is not None: validation_loss = self.validation(validation_set, use_cuda) logger.info( 'In the beginning, validation loss per event: {:.6f}.\n'. format(validation_loss)) best_loss = validation_loss else: best_loss = np.inf if track_diagnostics: self.diagnostics = Diagnostics() for epoch in range(epochs): if scheduler is not None: scheduler.step() start = time.time() for batch_idx, samples in enumerate(dataloader): ci, batch_dict = samples2dict(samples, device, Cs, FCs) optimizer.zero_grad() lambda_t, Lambda_t = self.lambda_model(batch_dict) loss = self.loss_function(lambda_t, Lambda_t, ci) / lambda_t.size(0) reg = 0 if sparsity is not None: for parameter in self.lambda_model.parameters(): reg += sparsity * torch.sum(torch.abs(parameter)) loss_total = loss + reg loss_total.backward() optimizer.step() if nonnegative is not None: self.lambda_model.apply(clipper) if track_diagnostics: self.diagnostics.loss.append(loss.data.item()) self.diagnostics.mu.append( self.lambda_model.exogenous_intensity.emb.weight. squeeze().tolist()) self.diagnostics.alpha.append( self.lambda_model.endogenous_intensity.basis[0].weight. squeeze().tolist()) # display training processes if batch_idx % 100 == 0: logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]'.format( epoch, batch_idx * ci.size(0), len(dataloader.dataset), 100. * batch_idx / len(dataloader))) if sparsity is not None: logger.info( 'Loss per event: {:.6f}, Regularizer: {:.6f} Time={:.2f}sec' .format(loss.data, reg.data, time.time() - start)) else: logger.info( 'Loss per event: {:.6f}, Regularizer: {:.6f} Time={:.2f}sec' .format(loss.data, 0, time.time() - start)) if validation_set is not None: validation_loss = self.validation(validation_set, use_cuda) logger.info( 'After Epoch: {}, validation loss per event: {:.6f}.\n'. format(epoch, validation_loss)) if validation_loss < best_loss: best_model = copy.deepcopy(self.lambda_model) best_loss = validation_loss if best_model is not None: self.lambda_model = copy.deepcopy(best_model)
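# Usage sketch (illustrative) built around the `fit` signature above. The estimator
# object, the dataset objects `train_set`/`valid_set` (assumed to be the event-sequence
# samplers defined in this package), and the hyper-parameter values are assumptions;
# the optimizer and scheduler are standard PyTorch objects.
import torch
from torch.utils.data import DataLoader

train_loader = DataLoader(train_set, batch_size=128, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=128, shuffle=False)

optimizer = torch.optim.Adam(estimator.lambda_model.parameters(), lr=1e-2)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

estimator.fit(train_loader,
              optimizer,
              epochs=10,
              scheduler=scheduler,
              sparsity=1e-2,        # weight of the L1 regularizer
              nonnegative=0.0,      # clip parameters at zero after each step
              use_cuda=torch.cuda.is_available(),
              validation_set=valid_loader)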
def fit(self, dataloader, optimizer, epochs: int, scheduler=None, sparsity: float = None, nonnegative=None, use_cuda: bool = False, validation_set=None, verbose=True, prob: float = 1.0, accuracy=False, parameters=None): """ Learn parameters of a generalized Hawkes process given observed sequences :param dataloader: a pytorch batch-based data loader :param optimizer: the sgd optimization method defined by PyTorch :param epochs: the number of training epochs :param scheduler: the method adjusting the learning rate of SGD defined by PyTorch :param sparsity: None or a float weight of L1 regularizer :param nonnegative: None or a float lower bound, typically the lower bound = 0 :param use_cuda: use cuda (true) or not (false) :param validation_set: None or a validation dataloader """ device = torch.device('cuda:0' if use_cuda else 'cpu') self.lambda_model.to(device) best_model = None self.lambda_model.train() self.mu_path.append( copy.deepcopy(list(self.lambda_model.parameters())[0].data)) self.alpha_path.append( copy.deepcopy(list(self.lambda_model.parameters())[1].data)) if nonnegative is not None: clipper = LowerBoundClipper(nonnegative) Cs = torch.LongTensor( list(range(len(dataloader.dataset.database['type2idx'])))) Cs = Cs.view(-1, 1) Cs = Cs.to(device) if dataloader.dataset.database['event_features'] is not None: all_event_feature = torch.from_numpy( dataloader.dataset.database['event_features']) FCs = all_event_feature.type(torch.FloatTensor) FCs = torch.t(FCs) # (num_type, dim_features) FCs = FCs.to(device) else: FCs = None if validation_set is not None: validation_loss = self.validation(validation_set, use_cuda, verbose, prob, accuracy, parameters) logger.info( 'In the beginning, validation loss per event: {:.6f}.\n'. format(validation_loss)) best_loss = validation_loss self.learning_path.append(validation_loss) else: best_loss = np.inf start0 = time.time() self.training_time.append(time.time() - start0) for epoch in range(epochs): if scheduler is not None: scheduler.step() start = time.time() for batch_idx, samples in enumerate(dataloader): ci, batch_dict = samples2dict(samples, device, Cs, FCs) optimizer.zero_grad() lambda_t, Lambda_t = self.lambda_model(batch_dict) loss = self.loss_function(lambda_t, Lambda_t, ci) / lambda_t.size(0) reg = 0 if sparsity is not None: for parameter in self.lambda_model.parameters(): reg += sparsity * torch.sum(torch.abs(parameter)) loss_total = loss + reg loss_total.backward() optimizer.step() if nonnegative is not None: self.lambda_model.apply(clipper) if validation_set is not None: validation_loss = self.validation(validation_set, use_cuda, verbose, prob, accuracy, parameters) if verbose: logger.info( 'After Epoch: {}, validation loss per event: {:.6f}.\n' .format(epoch, validation_loss)) if validation_loss < best_loss: best_model = copy.deepcopy(self.lambda_model) best_loss = validation_loss esti_loss = loss_total.data # display training processes if verbose: if batch_idx % 100 == 0: logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]'.format( epoch, batch_idx * ci.size(0), len(dataloader.dataset), 100. 
* batch_idx / len(dataloader))) if sparsity is not None: logger.info( 'Loss per event: {:.3f}, Regularizer: {:.3f}, Validate Loss: {:.3f}, Time={:.2f}sec' .format(esti_loss.data, reg.data, validation_loss, time.time() - start)) else: logger.info( 'Loss per event: {:.3f}, Regularizer: {:.3f}, Loss: {:.6f}, Time={:.2f}sec' .format(esti_loss.data, 0, validation_loss, time.time() - start)) self.learning_path.append(loss_total) self.validation_path.append(validation_loss) self.training_time.append(time.time() - start0) self.mu_path.append( copy.deepcopy( list(self.lambda_model.parameters())[0].data)) self.alpha_path.append( copy.deepcopy( list(self.lambda_model.parameters())[1].data)) self.lambda_path.append(lambda_t) self.Lambda_path.append(Lambda_t) logger.info( 'Epoch : {}/{}, Used time: {: .2f} min, Estimated Time to finish: {: .2f} min, train loss: {: .3f}, validation loss: {: .3f}' .format((epoch + 1), epochs, self.training_time[-1] / 60, self.training_time[-1] / 60 / (epoch + 1) * (epochs - epoch - 1), loss_total, validation_loss)) if best_model is not None: self.lambda_model = copy.deepcopy(best_model)
def simulate(self, history, memory_size: int = 10, time_window: float = 1.0, interval: float = 1.0, max_number: int = 100, use_cuda: bool = False): """ Simulate one or more event sequences from given model. :param history: historical observations history = {'event_features': None or (C, De) float array of event's static features, C is the number of event types. 'type2idx': a Dict = {'event_name': event_index} 'idx2type': a Dict = {event_index: 'event_name'} 'seq2idx': a Dict = {'seq_name': seq_index} 'idx2seq': a Dict = {seq_index: 'seq_name'} 'sequences': a List = {seq_1, seq_2, ..., seq_N}. } For the i-th sequence: seq_i = {'times': (N,) float array of timestamps, N is the number of events. 'events': (N,) int array of event types. N can be "0" (i.e., no observations) 'seq_feature': None or (Ds,) float array of sequence's static feature. 't_start': a float number indicating the start timestamp of the sequence. 't_stop': a float number indicating the stop timestamp of the sequence. 'label': None or int/float number indicating the labels of the sequence} :param memory_size: the number of historical events used for simulation :param time_window: duration of simulation process. :param interval: the interval size calculating the supremum of intensity :param max_number: the maximum number of simulated events :param use_cuda: use cuda (true) or not (false) :return: new_data: having the same format as history counts: a list of (C,) ndarray, which counts the number of simulated events for each type """ device = torch.device('cuda:0' if use_cuda else 'cpu') self.lambda_model.to(device) self.lambda_model.eval() Cs = torch.LongTensor(list(range(len(history['type2idx'])))) Cs = Cs.view(-1, 1) Cs = Cs.to(device) if history['event_features'] is not None: all_event_feature = torch.from_numpy(history['event_features']) FCs = all_event_feature.type(torch.FloatTensor) FCs = torch.t(FCs) # (num_type, dim_features) FCs = FCs.to(device) else: FCs = None t_start = time.time() new_data = copy.deepcopy(history) # the number of new synthetic events for each type counts = np.zeros((self.num_type, len(new_data['sequences']))) for i in range(len(new_data['sequences'])): times_tmp = [] events_tmp = [] # initial point new_data['sequences'][i]['t_start'] = history['sequences'][i][ 't_stop'] new_data['sequences'][i][ 't_stop'] = history['sequences'][i]['t_stop'] + time_window t_now = new_data['sequences'][i]['t_start'] + 0.01 # initialize the input of intensity function ci = Cs # print(ci) ci = ci.to(device) ti = torch.FloatTensor([t_now]) ti = ti.to(device) ti = ti.view(1, 1) ti = ti.repeat(ci.size(0), 1) events = history['sequences'][i]['events'] times = history['sequences'][i]['times'] if times is None: tjs = torch.FloatTensor([new_data['sequences'][i]['t_start']]) tjs = tjs.to(device) cjs = torch.LongTensor( [np.random.permutation(self.num_type)[0]]) cjs = cjs.to(device) else: if memory_size > times.shape[0]: tjs = torch.from_numpy(times) tjs = tjs.type(torch.FloatTensor) tjs = tjs.to(device) cjs = torch.from_numpy(events) cjs = cjs.type(torch.LongTensor) cjs = cjs.to(device) else: tjs = torch.from_numpy(times[-memory_size:]) tjs = tjs.type(torch.FloatTensor) tjs = tjs.to(device) cjs = torch.from_numpy(events[-memory_size:]) cjs = cjs.type(torch.LongTensor) cjs = cjs.to(device) tjs = tjs.to(device) tjs = tjs.view(1, -1) tjs = tjs.repeat(ci.size(0), 1) cjs = cjs.to(device) cjs = cjs.view(1, -1) cjs = cjs.repeat(ci.size(0), 1) sn = torch.LongTensor([i]) sn = sn.to(device) sn = sn.view(1, 1) sn = sn.repeat(ci.size(0), 
1) if history['sequences'][i]['seq_feature'] is not None: fsn = history['sequences'][i]['seq_feature'] fsn = torch.from_numpy(fsn) fsn = fsn.type(torch.FloatTensor) fsn = fsn.view(1, -1).repeat(ci.size(0), 1) fsn = fsn.to(device) else: fsn = None if FCs is None: fci = None fcjs = None else: fci = FCs[ci[:, 0], :] fcjs = FCs[cjs, :] fcjs = torch.transpose(fcjs, 1, 2) fcjs = fcjs.to(device) sample_dict = { 'ti': ti, 'tjs': tjs, 'ci': ci, 'cjs': cjs, 'sn': sn, 'fsn': fsn, 'fci': fci, 'fcjs': fcjs, 'Cs': Cs, 'FCs': FCs } while t_now < new_data['sequences'][i]['t_stop'] and len( times_tmp) < max_number: lambda_t = self.lambda_model.intensity(sample_dict) sample_dict['ti'] = sample_dict['ti'] + interval lambda_t2 = self.lambda_model.intensity(sample_dict) mt = max([float(lambda_t.sum()), float(lambda_t2.sum())]) s = np.random.exponential(1 / mt) if s < interval: sample_dict['ti'] = sample_dict['ti'] + s - interval ti = sample_dict['ti'].cpu().numpy() t_now = ti[0, 0] # float lambda_s = self.lambda_model.intensity(sample_dict) ms = float(lambda_s.sum()) u = np.random.rand() ratio = ms / mt if ratio > u: # generate a new event prob = lambda_s.data.cpu().numpy() / ms prob = prob[:, 0] # print(prob.shape) # print(self.num_type) ci = np.random.choice(self.num_type, p=prob) # int # add to new sequence times_tmp.append(t_now) events_tmp.append(ci) counts[ci, i] += 1 # update batch_dict ti = torch.FloatTensor([t_now]) ti = ti.to(device) ti = ti.view(1, 1).repeat(self.num_type, 1) ci = torch.LongTensor([ci]) ci = ci.to(device) ci = ci.view(1, 1).repeat(self.num_type, 1) if memory_size > sample_dict['cjs'].size(1): # print(sample_dict['cjs'].size()) # print(ci.size()) sample_dict['cjs'] = torch.cat( [sample_dict['cjs'], ci], dim=1) sample_dict['tjs'] = torch.cat( [sample_dict['tjs'], ti], dim=1) else: sample_dict['cjs'] = torch.cat( [sample_dict['cjs'][:, -memory_size + 1:], ci], dim=1) sample_dict['tjs'] = torch.cat( [sample_dict['tjs'][:, -memory_size + 1:], ti], dim=1) if FCs is not None: sample_dict['fcjs'] = FCs[sample_dict['cjs'], :] sample_dict['fcjs'] = torch.transpose( sample_dict['fcjs'], 1, 2) else: ti = sample_dict['ti'].cpu().numpy() t_now = ti[0, 0] # float if i % 500 == 0: logger.info( 'Sequence {}/{} has been generated... Time={:.2}sec.'. format(i, len(new_data['sequences']), time.time() - t_start)) times_tmp = np.asarray(times_tmp) events_tmp = np.asarray(events_tmp) new_data['sequences'][i]['times'] = times_tmp new_data['sequences'][i]['events'] = events_tmp return new_data, counts
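# Usage sketch (illustrative): calling `simulate` from a fitted estimator. The
# `history` dictionary is assumed to follow the schema documented in the docstring
# (for example, a database built by the CSV loader below); keyword values are
# illustrative only.
new_data, counts = estimator.simulate(history,
                                      memory_size=10,   # events kept as context
                                      time_window=5.0,  # duration appended per sequence
                                      interval=1.0,     # step for bounding the intensity
                                      max_number=100,   # cap on simulated events
                                      use_cuda=False)

# counts[c, i] is the number of simulated type-c events appended to the i-th sequence.
print(counts.sum(axis=0))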
def build_dict_mimic3(diagnose_dict_path: str, diagnose_adm_path: str, procedure_dict_path: str, procedure_adm_path: str, min_count: int): """ This function builds the icd code database Args: diagnose_dict_path: the path of diagnose icd code list (csv) diagnose_adm_path: diagnose_adm_path: the full path of admission diagnose csv file procedure_dict_path: procedure_dict_path: the path of procedure icd code list (csv) procedure_adm_path: procedure_adm_path: the full path of admission procedure csv file min_count: the minimum counts of ICD code Returns: database = {src_index: the dictionary mapping diagnose ICD code to index src_title: the dictionary mapping diagnose ICD code to its description tar_index: the dictionary mapping procedure ICD code to index tar_title: the dictionary mapping procedure ICD code to its description src_interactions: the diagnose pairs tar_interactions: the procedure pairs mutual_interactions: the list containing the admission with diseases and procedures } """ df_diagnose = pd.read_csv( diagnose_adm_path) # , encoding="ISO-8859-1")#"utf8") diag_counts = df_diagnose['ICD9_CODE'].value_counts() diag2idx = {} idx = 0 for icd in diag_counts.keys(): if diag_counts[icd] > min_count: diag2idx[str(icd)] = idx idx += 1 df_procedure = pd.read_csv( procedure_adm_path) # , encoding="ISO-8859-1")#"utf8") proc_counts = df_procedure['ICD9_CODE'].value_counts() proc2idx = {} idx = 0 for icd in proc_counts.keys(): if proc_counts[icd] > min_count: proc2idx[str(icd)] = idx idx += 1 diag2title = {} df_diagnose = pd.read_csv( diagnose_dict_path) # , encoding="ISO-8859-1")#"utf8") idx = 0 for i, row in df_diagnose.iterrows(): icd = str(row['ICD9_CODE']) des = str(row['LONG_TITLE']) if icd in diag2idx.keys(): diag2title[icd] = des idx += 1 logger.info('{} kinds of diagnoses are found.'.format(len(diag2idx))) proc2title = {} df_procedure = pd.read_csv( procedure_dict_path) # , encoding="ISO-8859-1")#"utf8") idx = 0 for i, row in df_procedure.iterrows(): icd = str(row['ICD9_CODE']) des = str(row['LONG_TITLE']) if icd in proc2idx.keys(): proc2title[icd] = des idx += 1 logger.info('{} kinds of procedures are found.'.format(len(proc2idx))) diag_adm = {} df_diagnose = pd.read_csv( diagnose_adm_path) # , encoding="ISO-8859-1")#"utf8") for i, row in df_diagnose.iterrows(): adm = str(row['HADM_ID']) icd = str(row['ICD9_CODE']) if icd in diag2idx.keys(): if adm not in diag_adm.keys(): diag_adm[adm] = [diag2idx[icd]] else: diag_adm[adm].append(diag2idx[icd]) if i % 10000 == 0: logger.info('{}/{} rows are processed.'.format( i, len(df_diagnose))) logger.info('{} diagnose admissions are found.'.format(len(diag_adm))) proc_adm = {} df_procedure = pd.read_csv( procedure_adm_path) # , encoding="ISO-8859-1")#"utf8") for i, row in df_procedure.iterrows(): adm = str(row['HADM_ID']) icd = str(row['ICD9_CODE']) if icd in proc2idx.keys(): if adm not in proc_adm.keys(): proc_adm[adm] = [proc2idx[icd]] else: proc_adm[adm].append(proc2idx[icd]) if i % 10000 == 0: logger.info('{}/{} rows are processed.'.format( i, len(df_procedure))) logger.info('{} procedure admissions are found.'.format(len(proc_adm))) diag_w_proc = [] for adm in diag_adm.keys(): if adm in proc_adm.keys(): diag_w_proc.append([diag_adm[adm], proc_adm[adm]]) database = { 'src_index': diag2idx, 'src_title': diag2title, 'tar_index': proc2idx, 'tar_title': proc2title, 'src_interactions': diag_adm, 'tar_interactions': proc_adm, 'mutual_interactions': diag_w_proc } return database
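# Usage sketch (illustrative): building the MIMIC-III code dictionaries. The file
# names follow the standard MIMIC-III CSV naming, but the local paths and the
# min_count threshold are assumptions.
database = build_dict_mimic3(
    diagnose_dict_path='data/D_ICD_DIAGNOSES.csv',
    diagnose_adm_path='data/DIAGNOSES_ICD.csv',
    procedure_dict_path='data/D_ICD_PROCEDURES.csv',
    procedure_adm_path='data/PROCEDURES_ICD.csv',
    min_count=50)  # keep ICD codes appearing more than 50 times

print(len(database['src_index']), 'diagnoses;',
      len(database['tar_index']), 'procedures')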
def load_seq_labels_csv(file_name: str, seq_domain: str, domain_dict: Dict, database: Dict): """ load sequences' features from a csv file :param file_name: the path and the name of the csv file :param seq_domain: the name of the key column corresponding to sequence index. :param domain_dict: a dictionary containing the name of the key column corresponding to the labels. The format should be domain_dict = {'domain_name': domain's feature type} The dictionary should only contain one key. If multiple keys are provided, only the first one is considered. Two types are considered: 1) 'numerical': each element (row) in the corresponding domain should be a string containing D numbers separated by spaces, and D should be the same for various elements. D-dimensional real-value labels will be generated for this domain. If each sequence has multiple rows, the average of the labels will be recorded. 2) 'categorical': each element (row) in the corresponding domain should be a strong containing N keywords. N-dimensional categorical label will be generated for this domain. If each sequence has multiple rows, the aggregation of the categories will be recorded. :param database: a dictionary of data generated by the function "load_sequences_csv()" :return: a database having sequences' labels """ df = pd.read_csv(file_name) num_seq = len(database['seq2idx']) # initialize features keys = list(domain_dict.keys()) label_domain = keys[0] if len(keys) > 1: logger.warning( "{} label domains are found. Only the first domain '{}' is used to generate labels." .format(len(keys), label_domain)) features = {} counts = {} features[label_domain] = None logger.info('Start to generate sequence labels...') start = time.time() for i, row in df.iterrows(): seq_name = str(row[seq_domain]) if seq_name not in database['seq2idx'].keys(): logger.warning( "'{}' is a new sequence not appearing in current database.". format(seq_name)) logger.warning("It will be ignored in the process.") else: seq_idx = database['seq2idx'][seq_name] elements = str(row[label_domain]) if domain_dict[label_domain] == 'numerical': elements = np.asarray(list(map(float, elements.split()))) dim = elements.shape[0] if features[label_domain] is None: features[label_domain] = np.zeros((dim, num_seq)) features[label_domain][:, seq_idx] = elements counts[label_domain] = np.zeros((1, num_seq)) counts[label_domain][0, seq_idx] = 1 else: features[label_domain][:, seq_idx] += elements counts[label_domain][0, seq_idx] += 1 elif domain_dict[label_domain] == 'categorical': elements = elements.split() if features[label_domain] is None: features[label_domain] = {} features[label_domain][seq_idx] = elements counts[label_domain] = {} element_idx = 0 else: if seq_idx not in features[label_domain].keys(): features[label_domain][seq_idx] = elements else: features[label_domain][seq_idx].extend(elements) for element in elements: if element not in counts[label_domain].keys(): counts[label_domain][element] = element_idx element_idx += 1 else: logger.warning( 'Undefined feature type for the domain {}.'.format( label_domain)) logger.warning("It will be ignored in the process.") if i % 1000 == 0: logger.info('{} rows have been processed... 
Time={}ms.'.format( i, round(1000 * (time.time() - start)))) # post-process of features start = time.time() if domain_dict[label_domain] == 'numerical': features_tmp = features[label_domain] features_tmp = features_tmp / np.tile( counts[label_domain], (features[label_domain].shape[0], 1)) for seq_idx in range(features_tmp.shape[1]): database['sequences'][seq_idx]['label'] = features_tmp[:, seq_idx] elif domain_dict[label_domain] == 'categorical': for seq_idx in features[label_domain].keys(): elements = list(set(features[label_domain][seq_idx])) feature_tmp = [] for element in elements: element_idx = counts[label_domain][element] feature_tmp.append(element_idx) feature_tmp = np.asarray(feature_tmp, dtype=np.int) database['sequences'][seq_idx]['label'] = feature_tmp else: logger.warning('Undefined label type for the domain {}.'.format( domain_dict[label_domain])) logger.warning("It will be ignored in the process.") logger.info("Labels of domain '{}' is generated... Time={}ms.".format( domain_dict[label_domain], round(1000 * (time.time() - start)))) return database
def fit_ot(self, dataloader, optimizer, epochs: int, trans: torch.Tensor, mu_t: torch.Tensor, A_t: torch.Tensor, p_s: torch.Tensor, p_t: torch.Tensor, sample_dict1, sample_dict2, gamma, alpha, scheduler=None, sparsity: float=None, nonnegative=None, use_cuda: bool=False, validation_set=None): """ Learn parameters of a generalized Hawkes process given observed sequences :param dataloader: a pytorch batch-based data loader :param optimizer: the sgd optimization method defined by PyTorch :param epochs: the number of training epochs :param trans: fixed optimal transport :param mu_t: base intensity of target Hawkes process :param A_t: infectivity of target Hawkes process :param p_s: the distribution of event types in source Hawkes process :param p_t: the distribution of event types in target Hawkes process :param scheduler: the method adjusting the learning rate of SGD defined by PyTorch :param sparsity: None or a float weight of L1 regularizer :param nonnegative: None or a float lower bound, typically the lower bound = 0 :param use_cuda: use cuda (true) or not (false) :param validation_set: None or a validation dataloader """ device = torch.device('cuda:0' if use_cuda else 'cpu') self.lambda_model.to(device) best_model = None self.lambda_model.train() if nonnegative is not None: clipper = LowerBoundClipper(nonnegative) Cs = torch.LongTensor(list(range(len(dataloader.dataset.database['type2idx'])))) Cs = Cs.view(-1, 1) Cs = Cs.to(device) if dataloader.dataset.database['event_features'] is not None: all_event_feature = torch.from_numpy(dataloader.dataset.database['event_features']) FCs = all_event_feature.type(torch.FloatTensor) FCs = torch.t(FCs) # (num_type, dim_features) FCs = FCs.to(device) else: FCs = None if validation_set is not None: validation_loss = self.validation(validation_set, use_cuda) logger.info('In the beginning, validation loss per event: {:.6f}.\n'.format(validation_loss)) best_loss = validation_loss else: best_loss = np.inf for epoch in range(epochs): if scheduler is not None: scheduler.step() start = time.time() for batch_idx, samples in enumerate(dataloader): ci, batch_dict = samples2dict(samples, device, Cs, FCs) optimizer.zero_grad() lambda_t, Lambda_t = self.lambda_model(batch_dict) loss = self.loss_function(lambda_t, Lambda_t, ci) / lambda_t.size(0) reg = 0 if sparsity is not None: for parameter in self.lambda_model.parameters(): reg += sparsity * torch.sum(torch.abs(parameter)) base_intensity = self.lambda_model.exogenous_intensity.intensity(sample_dict1) infectivity = self.lambda_model.endogenous_intensity.granger_causality(sample_dict2).squeeze(2) d_gw = self.dgw(infectivity, A_t, trans, p_s, p_t) d_w = self.dw(base_intensity, mu_t, trans, p_s, p_t) loss_total = loss + reg + gamma * (alpha*d_w + (1-alpha)*d_gw) loss_total.backward() optimizer.step() if nonnegative is not None: self.lambda_model.apply(clipper) # display training processes if batch_idx % 100 == 0: logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]'.format( epoch, batch_idx * ci.size(0), len(dataloader.dataset), 100. 
* batch_idx / len(dataloader))) if sparsity is not None: logger.info('Loss per event: {:.6f}, Regularizer: {:.6f} Time={:.2f}sec'.format( loss.data, reg.data, time.time() - start)) else: logger.info('Loss per event: {:.6f}, Regularizer: {:.6f} Time={:.2f}sec'.format( loss.data, 0, time.time() - start)) if validation_set is not None: validation_loss = self.validation(validation_set, use_cuda) logger.info('After Epoch: {}, validation loss per event: {:.6f}.\n'.format(epoch, validation_loss)) if validation_loss < best_loss: best_model = copy.deepcopy(self.lambda_model) best_loss = validation_loss if best_model is not None: self.lambda_model = copy.deepcopy(best_model)
def fit(self, dataloader, optimizer, epochs: int, scheduler=None, sparsity: float = None, nonnegative=None, use_cuda: bool = False, validation_set=None): """ Learn parameters of a generalized Hawkes process given observed sequences :param dataloader: a pytorch batch-based data loader :param optimizer: the sgd optimization method :param epochs: the number of training epochs :param scheduler: the method adjusting the learning rate of SGD :param sparsity: None or a float weight of L1 regularizer :param nonnegative: None or a float lower bound :param use_cuda: use cuda (true) or not (false) :param validation_set: None or a validation dataloader """ device = torch.device('cuda:0' if use_cuda else 'cpu') self.lambda_model.to(device) self.responsibility = self.responsibility.to(device) self.prob_cluster = self.prob_cluster.to(device) best_model = None self.lambda_model.train() if nonnegative is not None: clipper = LowerBoundClipper(nonnegative) Cs = torch.LongTensor( list(range(len(dataloader.dataset.database['type2idx'])))) Cs = Cs.view(-1, 1) Cs = Cs.to(device) if dataloader.dataset.database['event_features'] is not None: all_event_feature = torch.from_numpy( dataloader.dataset.database['event_features']) FCs = all_event_feature.type(torch.FloatTensor) FCs = torch.t(FCs) # (num_type, dim_features) FCs = FCs.to(device) else: FCs = None if validation_set is not None: validation_loss = self.validation(validation_set, use_cuda) logger.info( 'In the beginning, validation loss per event: {:.6f}.\n'. format(validation_loss)) best_loss = validation_loss else: best_loss = np.inf # EM algorithm for epoch in range(epochs): if scheduler is not None: scheduler.step() start = time.time() log_weight = self.prob_cluster.log().view(1, self.num_cluster).repeat( self.num_sequence, 1) log_responsibility = 0 * self.responsibility num_responsibllity = 0 * self.responsibility log_responsibility = log_responsibility.to(device) num_responsibllity = num_responsibllity.to(device) for batch_idx, samples in enumerate(dataloader): ci, batch_dict = samples2dict(samples, device, Cs, FCs) optimizer.zero_grad() loss = 0 for m in range(self.num_cluster): weight = self.responsibility[batch_dict['sn'][:, 0], m] # (batch_size, ) lambda_t, Lambda_t = self.lambda_model[m](batch_dict) loss_m = self.loss_function(lambda_t, Lambda_t, ci) # (batch_size, ) loss += (weight * loss_m).sum() / loss_m.size(0) for i in range(loss_m.size(0)): sn = batch_dict['sn'][i, 0] log_responsibility[sn, m] += loss_m.data[i] num_responsibllity[sn, m] += 1 reg = 0 if sparsity is not None: for parameter in self.lambda_model.parameters(): reg += sparsity * torch.sum(torch.abs(parameter)) loss_total = loss + reg loss_total.backward() optimizer.step() if nonnegative is not None: self.lambda_model.apply(clipper) # display training processes if batch_idx % 100 == 0: logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]'.format( epoch, batch_idx * ci.size(0), len(dataloader.dataset), 100. 
* batch_idx / len(dataloader))) if sparsity is not None: logger.info( 'Loss per event: {:.6f}, Regularizer: {:.6f} Time={:.2f}sec' .format(loss.data, reg.data, time.time() - start)) else: logger.info( 'Loss per event: {:.6f}, Regularizer: {:.6f} Time={:.2f}sec' .format(loss.data, 0, time.time() - start)) logger.info('Distribution of clusters') for m in range(self.num_cluster): logger.info('Cluster {}, prob={:.6f}'.format( m, self.prob_cluster[m])) # update responsibility log_responsibility /= (num_responsibllity + 1e-7) self.responsibility = F.softmax(log_responsibility + log_weight, dim=1) self.prob_cluster = self.responsibility.sum(0) self.prob_cluster = self.prob_cluster / self.prob_cluster.sum() if validation_set is not None: validation_loss = self.validation(validation_set, use_cuda) logger.info( 'After Epoch: {}, validation loss per event: {:.6f}.\n'. format(epoch, validation_loss)) if validation_loss < best_loss: best_model = copy.deepcopy(self.lambda_model) best_loss = validation_loss if best_model is not None: self.lambda_model = copy.deepcopy(best_model)
def data_info(database):
    """
    Print basic information of the proposed database.
    :param database: the database with the format mentioned above
    """
    logger.info('** Statistics of Target Database **')
    logger.info('- The number of event types = {}.'.format(len(database['type2idx'])))
    logger.info('- The number of sequences = {}.'.format(len(database['seq2idx'])))
    if database['event_features'] is not None:
        logger.info('- Each event has a feature vector with dimension {}.'.format(
            database['event_features'].shape[1]))
    else:
        logger.info('- Event feature is None.')
    if database['sequences'][0]['seq_feature'] is not None:
        logger.info('- Each sequence has a feature vector with dimension {}.'.format(
            database['sequences'][0]['seq_feature'].shape[0]))
    else:
        logger.info('- Sequence feature is None.')

    N_max = 0
    N_min = np.inf
    N_mean = 0
    for i in range(len(database['sequences'])):
        num_event = database['sequences'][i]['events'].shape[0]
        N_mean += num_event
        if num_event < N_min:
            N_min = num_event
        if num_event > N_max:
            N_max = num_event
    N_mean /= len(database['sequences'])
    logger.info('- The longest sequence has {} events.'.format(N_max))
    logger.info('- The shortest sequence has {} events.'.format(N_min))
    logger.info('- The average number of events per sequence is {:.2f}.'.format(N_mean))
def __init__(self, database, memorysize: int = None): """ :param database: the observed event sequences database = {'event_features': None or (C, De) float array of event's static features, C is the number of event types. 'type2idx': a Dict = {'event_name': event_index} 'idx2type': a Dict = {event_index: 'event_name'} 'seq2idx': a Dict = {'seq_name': seq_index} 'idx2seq': a Dict = {seq_index: 'seq_name'} 'sequences': a List = {seq_1, seq_2, ..., seq_N}. } For the i-th sequence: seq_i = {'times': (N,) float array of timestamps, N is the number of events. 'events': (N,) int array of event types. 'seq_feature': None or (Ds,) float array of sequence's static feature. 't_start': a float number indicating the start timestamp of the sequence. 't_stop': a float number indicating the stop timestamp of the sequence. 'label': None or int number indicating the labels of the sequence} :param memorysize: how many historical events remembered by each event When memorysize = None All events in a sequence will be considered. In that case, each batch can only contain one sequence because different sequences may have different length. When memorysize = K We only memory the last K events of each sequence. For the sequence with <K events, we fill virtual event "0" to the beginning of the sequence. """ self.event_cell = [] self.time_cell = [] self.database = database self.memory_size = memorysize if self.memory_size is None: logger.warning( "Because memory size is not given, the sampler can only sample 1 sequence per batch." ) logger.warning("Please set batch size = 1 in your code.") for i in range(len(database['sequences'])): seq_i = database['sequences'][i] times = seq_i['times'] events = seq_i['events'] t_start = seq_i['t_start'] target = seq_i['label'] target_t = seq_i['t_stop'] if self.memory_size is None: former = events former_t = times else: # former = np.zeros((memorysize,), dtype=np.int) # former = np.random.permutation(len(self.database['type2idx'])) # former = former[:memorysize] former = np.random.choice(len(self.database['type2idx']), memorysize) former_t = t_start * np.ones((memorysize, )) if 0 < times.shape[0] < memorysize: former[-memorysize:] = events former_t[-memorysize:] = times else: former = events[-memorysize:] former_t = times[-memorysize:] self.event_cell.append((target, former, i)) self.time_cell.append((target_t, former_t)) logger.info('In this dataset, the number of sequences = {}.'.format( len(self.event_cell)))
def stitching(database1: Dict, database2: Dict, method: str = 'random') -> Dict: """ Stitch each sequence in database2 to the end of one sequence of database1 :param database1: the observed event sequences :param database2: another observed event sequences database = {'event_features': None or (De, C) float array of event's static features, C is the number of event types. 'type2idx': a Dict = {'event_name': event_index} 'idx2type': a Dict = {event_index: 'event_name'} 'seq2idx': a Dict = {'seq_name': seq_index} 'idx2seq': a Dict = {seq_index: 'seq_name'} 'sequences': a List = {seq_1, seq_2, ..., seq_N}. } For the i-th sequence: seq_i = {'times': (N,) float array of timestamps, N is the number of events. 'events': (N,) int array of event types. 'seq_feature': None or (Ds,) float array of sequence's static feature. 't_start': a float number indicating the start timestamp of the sequence. 't_stop': a float number indicating the stop timestamp of the sequence. 'label': None or int/float number indicating the labels of the sequence} :param method: a string indicates stitching method: "random": stitch the seq_j in sequences2 to the seq_i in sequences1 for j ~ {1,...,N}, i=1,...,N and time-shifting is applied to sequences2. This method is suitable for the sequences generated by a same stationary point process. "feature": stitch the seq_j in sequences2 to the seq_i in sequences1 for j ~{1,...,N}, i=1,...,N and j is sampled according to the similarity between two sequences. The similarity is calculated by the Gaussian kernel of seq_features, labels and times. When seq_features/labels are not available, only timestamp information are taken into account. :return: the output sequences are with the same format as database1. """ start = time.time() output = copy.deepcopy(database1) if database1['type2idx'] == database2['type2idx']: if method is None or method == 'random': logger.info('random stitching is applied...') index = np.random.permutation( len(database2['sequences'] )) # random permutation of the index of sequences for i in range(len(database1['sequences'])): seq_i = database1['sequences'][i] j = i % len(database2['sequences']) seq_j = database2['sequences'][index[j]] # concatenate two timestamp arrays with time shifting times1 = seq_i['times'] times2 = seq_j['times'] - seq_j['t_start'] + seq_i['t_stop'] output['sequences'][i]['times'] = np.concatenate( (times1, times2), axis=0) # concatenate two event arrays output['sequences'][i]['events'] = np.concatenate( (seq_i['events'], seq_j['events']), axis=0) # update stop timestamp output['sequences'][i]['t_stop'] = seq_i['t_stop'] + seq_j[ 't_stop'] - seq_j['t_start'] # update features if seq_i['seq_feature'] is not None and seq_j[ 'seq_feature'] is not None: output['sequences'][i]['seq_feature'] = ( seq_i['seq_feature'] + seq_j['seq_feature']) / 2 if i % 1000 == 0: logger.info( '{} sequences have been stitched... 
Time={}ms.'.format( i, round(1000 * (time.time() - start)))) elif method == 'feature': logger.info('feature-based stitching is applied...') for i in range(len(database1['sequences'])): prob = np.zeros((len(database2['sequences']), )) seq_i = database1['sequences'][i] for j in range(len(database2['sequences'])): seq_j = database2['sequences'][j] if seq_j['t_start'] > seq_i['t_stop']: # consider temporal order weight = np.exp(-(seq_j['t_start'] - seq_i['t_stop'])**2) # consider feature similarity if seq_i['seq_feature'] is not None and seq_j[ 'seq_feature'] is not None: weight *= np.exp( -np.linalg.norm(seq_i['seq_feature'] - seq_j['seq_feature'])**2) # consider label consistency if seq_i['label'] is not None and seq_j[ 'label'] is not None: if seq_i['label'] != seq_j['label']: weight = 0 else: weight = 0 prob[j] = weight # sampling a sequence from database2 if np.sum(prob) > 0: prob = prob / np.sum(prob) else: prob = np.ones((len(database2['sequences']), )) / len( database2['sequences']) j = np.random.choice(len(database2['sequences']), p=prob) seq_j = database2['sequences'][j] # concatenate two timestamp arrays with time shifting times1 = seq_i['times'] times2 = seq_j['times'] - seq_j['t_start'] + seq_i['t_stop'] output['sequences'][i]['times'] = np.concatenate( (times1, times2), axis=0) # concatenate two event arrays output['sequences'][i]['events'] = np.concatenate( (seq_i['events'], seq_j['events']), axis=0) # update stop timestamp output['sequences'][i]['t_stop'] = seq_i['t_stop'] + seq_j[ 't_stop'] - seq_j['t_start'] # update features if seq_i['seq_feature'] is not None and seq_j[ 'seq_feature'] is not None: output['sequences'][i]['seq_feature'] = ( seq_i['seq_feature'] + seq_j['seq_feature']) / 2 if i % 1000 == 0: logger.info( '{} sequences have been stitched... Time={}ms.'.format( i, round(1000 * (time.time() - start)))) else: logger.warning('You need to define your own stitching method... ' 'The function returns the first database.') else: logger.warning('The two databases do not have the same event types... ' 'The function returns the first database.') return output
def load_sequences_csv(file_name: str, domain_names: Dict, upperlimit=None): """ Load event sequences from a csv file :param file_name: the path and name of the target csv file :param domain_names: a dictionary contains the names of the key columns corresponding to {'seq_id', 'time', 'event'} The format should be domain_names = {'seq_id': the column name of sequence name, 'time': the column name of timestamps, 'event': the column name of events} :return: database: a dictionary containing observed event sequences database = {'event_features': None, 'type2idx': a Dict = {'event_name': event_index} 'idx2type': a Dict = {event_index: 'event_name'} 'seq2idx': a Dict = {'seq_name': seq_index} 'idx2seq': a Dict = {seq_index: 'seq_name'} 'sequences': a List = [seq_1, seq_2, ..., seq_N]. } For the i-th sequence: seq_i = {'times': (N,) float array of timestamps, N is the number of events. 'events': (N,) int array of event types. 'seq_feature': None. 't_start': a float number, the start timestamp of the sequence. 't_stop': a float number, the stop timestamp of the sequence. 'label': None } """ database = { 'event_features': None, 'type2idx': None, 'idx2type': None, 'seq2idx': None, 'idx2seq': None, 'sequences': [] } if upperlimit is not None: df = pd.read_csv(file_name).iloc[:upperlimit] else: df = pd.read_csv(file_name) type2idx = {} idx2type = {} seq2idx = {} idx2seq = {} logger.info('Count the number of sequences...') start = time.time() seq_idx = 0 type_idx = 0 for i, row in df.iterrows(): seq_name = str(row[domain_names['seq_id']]) event_type = str(row[domain_names['event']]) if seq_name not in seq2idx.keys(): seq2idx[seq_name] = seq_idx seq = { 'times': [], 'events': [], 'seq_feature': None, 't_start': 0.0, 't_stop': 0.0, 'label': None } database['sequences'].append(seq) seq_idx += 1 if event_type not in type2idx.keys(): type2idx[event_type] = type_idx type_idx += 1 if i % 10000 == 0: logger.info('{} events have been processed... Time={}ms.'.format( i, round(1000 * (time.time() - start)))) logger.info( 'Done! {} sequences with {} event types are found in {}ms'.format( seq_idx + 1, type_idx + 1, round(1000 * (time.time() - start)))) logger.info('Build proposed database for the sequences...') start2 = time.time() for seq_name in seq2idx.keys(): seq_idx = seq2idx[seq_name] idx2seq[seq_idx] = seq_name for event_type in type2idx.keys(): type_idx = type2idx[event_type] idx2type[type_idx] = event_type database['type2idx'] = type2idx database['idx2type'] = idx2type database['seq2idx'] = seq2idx database['idx2seq'] = idx2seq for i, row in df.iterrows(): seq_name = str(row[domain_names['seq_id']]) timestamp = float(row[domain_names['time']]) event_type = str(row[domain_names['event']]) seq_idx = database['seq2idx'][seq_name] type_idx = database['type2idx'][event_type] database['sequences'][seq_idx]['times'].append(timestamp) database['sequences'][seq_idx]['events'].append(type_idx) if i % 10000 == 0: logger.info('{} events have been processed... Time={}ms.'.format( i, round(1000 * (time.time() - start2)))) logger.info('Done! 
{} sequences are built in {}ms'.format( len(database['seq2idx']), round(1000 * (time.time() - start2)))) logger.info('Format transformation...') for n in range(len(database['sequences'])): database['sequences'][n]['t_start'] = database['sequences'][n][ 'times'][0] database['sequences'][n][ 't_stop'] = database['sequences'][n]['times'][-1] + 1e-2 database['sequences'][n]['times'] = np.asarray( database['sequences'][n]['times']) database['sequences'][n]['events'] = np.asarray( database['sequences'][n]['events']) if n % 1000 == 0: logger.info( '{} sequences have been processed... Time={}ms.'.format( n, round(1000 * (time.time() - start)))) logger.info('Done! The database has been built in {}ms'.format( round(1000 * (time.time() - start)))) return database
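# Usage sketch (illustrative): loading event sequences from a CSV file. The file
# name and the column names ('id', 'timestamp', 'event') are hypothetical and
# should be replaced by the columns of the actual file.
domain_names = {'seq_id': 'id',       # column holding the sequence name
                'time': 'timestamp',  # column holding event timestamps
                'event': 'event'}     # column holding event types

database = load_sequences_csv('events.csv', domain_names)
data_info(database)  # print basic statistics of the loaded database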
def print_info(self):
    logger.info('A generalized Hawkes process intensity:')
    logger.info('Intensity function lambda(t) = {}'.format(self.intensity_type))
    self.exogenous_intensity.print_info()
    self.endogenous_intensity.print_info()
def load_event_features_csv(file_name: str, event_domain: str, domain_dict: Dict, database: Dict, normalize: int = 0): """ load events' features from a csv file :param file_name: the path and the name of the csv file :param event_domain: the name of the key column corresponding to event index. :param domain_dict: a dictionary containing the names of the key columns corresponding to the features. The format should be domain_dict = {'domain_name': domain's feature type} Two types are considered: 1) 'numerical': each element (row) in the corresponding domain should be a string containing D numbers separated by spaces, and D should be the same for various elements. D-dimensional real-value features will be generated for this domain. If each event type has multiple rows, the average of the features will be recorded. 2) 'categorical': each element (row) in the corresponding domain should be a strong containing N keywords separated by spaces, but N can be different for various elements. D-dimensional binary features will be generated for this domain. Here D is the number of distinguished keywords (vocabulary size). If each event type has multiple rows, the aggregation of the binary features will be recorded. :param database: a dictionary of data generated by the function "load_sequences_csv()" :param normalize: 0 = no normalization, 1 = normalization across features, 2 = normalization across event types :return: a database having events' features """ df = pd.read_csv(file_name) num_event = len(database['type2idx']) # initialize features features = {} counts = {} for key in domain_dict.keys(): features[key] = None counts[key] = None logger.info('Start to generate sequence features...') start = time.time() for i, row in df.iterrows(): event_name = str(row[event_domain]) if event_name not in database['type2idx'].keys(): logger.warning( "'{}' is a new event type not appearing in current database.". format(event_name)) logger.warning("It will be ignored in the process.") else: event_idx = database['type2idx'][event_name] for key in domain_dict.keys(): elements = str(row[key]) if domain_dict[key] == 'numerical': elements = np.asarray(list(map(float, elements.split()))) dim = elements.shape[0] if features[key] is None: features[key] = np.zeros((dim, num_event)) features[key][:, event_idx] = elements counts[key] = np.zeros((1, num_event)) counts[key][0, event_idx] = 1 counts[key][0, 0] = 1 else: features[key][:, event_idx] += elements counts[key][0, event_idx] += 1 elif domain_dict[key] == 'categorical': elements = elements.split() if features[key] is None: features[key] = {} features[key][event_idx] = elements counts[key] = {} element_idx = 0 else: if event_idx not in features[key].keys(): features[key][event_idx] = elements else: features[key][event_idx].extend(elements) for element in elements: if element not in counts[key].keys(): counts[key][element] = element_idx element_idx += 1 else: logger.warning( 'Undefined feature type for the domain {}.'.format( key)) logger.warning("It will be ignored in the process.") if i % 1000 == 0: logger.info('{} rows have been processed... 
Time={}ms'.format( i, round(1000 * (time.time() - start)))) # post-process of features features_all = None start = time.time() for key in domain_dict.keys(): if domain_dict[key] == 'numerical': features_tmp = features[key] features_tmp = features_tmp / np.tile(counts[key], (features[key].shape[0], 1)) if features_all is None: features_all = features_tmp else: features_all = np.concatenate((features_all, features_tmp), axis=0) elif domain_dict[key] == 'categorical': features_tmp = np.zeros((len(counts[key]), num_event)) for event_idx in features[key].keys(): for element in features[key][event_idx]: element_idx = counts[key][element] features_tmp[element_idx, event_idx] += 1 if features_all is None: features_all = features_tmp else: features_all = np.concatenate((features_all, features_tmp), axis=0) else: logger.warning( 'Undefined feature type for the domain {}.'.format(key)) logger.warning("It will be ignored in the process.") logger.info( "features of domain '{}' is generated... Time={}ms.".format( key, round(1000 * (time.time() - start)))) if normalize == 1: features_all = features_all / \ np.tile(np.sum(features_all, axis=0)+1e-8, (features_all.shape[0], 1)) if normalize == 2: features_all = features_all / \ np.transpose(np.tile(np.sum(features_all, axis=1)+1e-8, (features_all.shape[1], 1))) database['event_features'] = features_all return database
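# Usage sketch (illustrative): attaching event-level features with
# `load_event_features_csv`. The CSV path and the column names ('event',
# 'embedding', 'category') are assumptions; the feature-type labels follow the
# two cases described in the docstring.
domain_dict = {'embedding': 'numerical',   # D space-separated numbers per row
               'category': 'categorical'}  # space-separated keywords per row

database = load_event_features_csv('event_features.csv',
                                   event_domain='event',
                                   domain_dict=domain_dict,
                                   database=database,
                                   normalize=1)  # normalize across features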