def _test():
    tp = TextProcessor(10, 2192, 100)

    corpus = [
        '你好 中国',
        '打印 每类 文本 for',
        'for',
        '遍历 所有 文本 第二个 for 便利 某一类 文本 下 的 词语 权重'
    ]
    print(tp.w2v_transform('\n'.join(corpus))['你好'])

    corpus = [
        '你好 中国'.split(),
        '打印 每类 文本 for'.split(),
        'for'.split(),
        '遍历 所有 文本 第二个 for 便利 某一类 文本 下 的 词语 权重'.split()
    ]
    ctf = tp.tf_idf_transform(corpus)
    clsi = tp.lda_transform(ctf)
    for i in clsi:
        print(i)

    aa = tp.ldaModel.print_topics(num_topics=500, num_words=50)
    for i in aa:
        print(i)

    path = conf.get_filename_via_tpl('model', model_type='lsi', n_users=conf.N_USERS,
                                     n_samples=conf.N_SAMPLES, n_dims=conf.N_DIMS)
    tp.load_model('lsi')
    tp.w2v_transform([['你好啊', 'hell0'], ['123', 'forfor']])
def tf_idf_transform(self, doc):
    """ Perform the TF-IDF transformation on doc. """
    self.dictionary = corpora.Dictionary(doc)
    corpus = [self.dictionary.doc2bow(text) for text in doc]
    self.tfIdfModel = TfidfModel(corpus)

    conf.mk_dir(self.tfIdfPath)

    self.dictionary.save(self.dictPath)
    logger.info('Dictionary has been saved in %s.' % self.dictPath)

    self.tfIdfModel.save(self.tfIdfPath)
    logger.info('TF-IDF model has been saved in %s.' % self.tfIdfPath)

    tfidf_corpus = self.tfIdfModel[corpus]
    tfidf_corpus_path = conf.get_filename_via_tpl('tfidf', n_users=self.nUsers, postfix='mm',
                                                  n_samples=self.nSamples)
    corpora.MmCorpus.serialize(tfidf_corpus_path, tfidf_corpus)
    logger.info('TF-IDF corpus with a shape of %s has been saved in %s.'
                % (np.array(tfidf_corpus).shape, tfidf_corpus_path))

    return tfidf_corpus
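# A minimal usage sketch, assuming the TextProcessor class defined in this module, a
# working utils.config_util setup and gensim; the helper name _tfidf_usage_example is
# illustrative only. It shows the input format tf_idf_transform expects: one list of
# tokens per document.
def _tfidf_usage_example():
    corpus = [
        '你好 中国'.split(),
        '打印 每类 文本 for'.split(),
        'for'.split(),
    ]
    tp = TextProcessor(10, 2192, 100)
    tfidf_corpus = tp.tf_idf_transform(corpus)
    # Each row is a sparse vector: a list of (term_id, tf-idf weight) pairs.
    for row in tfidf_corpus:
        print(row)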
def lsi_transform(self, corpus_tf_idf):
    logger.info('Training lsi model with %d dims...' % self.nDims)
    if self.dictionary is None and os.path.exists(self.dictPath):
        self.dictionary = corpora.Dictionary.load(self.dictPath)

    self.lsiModel = LsiModel(corpus=corpus_tf_idf, num_topics=self.nDims, id2word=self.dictionary)
    # print(self.lsiModel[corpus])

    conf.mk_dir(self.lsiPath)
    self.lsiModel.save(self.lsiPath)
    logger.info('Lsi model has been saved in %s.' % self.lsiPath)

    lsi_corpus = self.lsiModel[corpus_tf_idf]
    lsi_corpus_path = conf.get_filename_via_tpl('lsi', n_users=self.nUsers, n_samples=self.nSamples,
                                                n_dims=self.nDims, postfix='mm')
    conf.mk_dir(lsi_corpus_path)
    corpora.MmCorpus.serialize(lsi_corpus_path, lsi_corpus)
    logger.info('Lsi corpus with a shape of %s has been saved in %s.'
                % (np.array(lsi_corpus).shape, lsi_corpus_path))

    return lsi_corpus
def save_sequences(self, new_mblog_info=None):
    mblog_info = new_mblog_info if new_mblog_info else self.mblogInfo
    uid_fname = conf.get_filename_via_tpl('uid', n_users=len(self.uidList), n_samples=mblog_info.seqLen)
    seq_fname = conf.get_filename_via_tpl('seq', n_users=len(self.uidList), n_samples=mblog_info.seqLen)

    # Save user ids.
    with open(uid_fname, 'w') as fp:
        csv_writer = csv.writer(fp)
        csv_writer.writerow(self.uidList)
        logger.info('User ids are saved in %s. ' % uid_fname)

    # Save sequences.
    with open(seq_fname, 'w') as fp:
        csv_writer = csv.writer(fp)
        csv_writer.writerows(self.sequences)
        logger.info('Sequence data is saved in file %s. ' % seq_fname)
def calculate_te(data, vec_type, lag=1, normalised=True):
    """ Estimate transfer entropy between each pair of nodes to uncover causal relationships. """
    data = np.array(data)
    n_nodes, n_samples, n_dims = data.shape
    cn = np.zeros((n_nodes, n_nodes))
    te_mat = np.zeros((n_nodes, n_nodes))

    # if normalised:
    H_0 = np.zeros(n_nodes)
    for i in range(n_nodes):
        max_min = np.max(data[i], 0) - np.min(data[i], 0)
        H_0[i] = np.sum(np.log2(max_min))
    # for i in range(n_nodes):
    #     H_0[i] = entropy(data[i])

    logger.info('Calculating te...')
    # Calculate te and fill in the causal network.
    for i in range(n_nodes):
        sample_i = data[i]
        for j in range(i, n_nodes):
            sample_j = data[j]
            # Construct variables XP, YP and XF/YF for the te estimator.
            sample_i_p = sample_i[lag:]
            sample_i_f = sample_i[:-lag]
            sample_j_p = sample_j[lag:]

            te_j_i = cmi(sample_i_f, sample_j_p, sample_i_p)
            if normalised:
                te_j_i = te_j_i / (H_0[i] - cond_entropy(sample_i_f, np.concatenate((sample_i_p, sample_j_p), 1)))
                # te_j_i = te_j_i / H_0[i]
            te_mat[j][i] = te_j_i

            if i != j:
                sample_j_f = sample_j[:-lag]
                te_i_j = cmi(sample_j_f, sample_i_p, sample_j_p)
                if normalised:
                    te_i_j = te_i_j / (H_0[j] - cond_entropy(sample_j_f, np.concatenate((sample_i_p, sample_j_p), 1)))
                    # te_i_j = te_i_j / H_0[j]
                te_mat[i][j] = te_i_j

    te_path = conf.get_filename_via_tpl('te_' + vec_type, n_users=n_nodes, n_samples=n_samples,
                                        n_dims=n_dims, lag=lag)
    np.savetxt(te_path, te_mat, delimiter=',', fmt='%f')
    logger.info('Te result has been saved in %s. ' % te_path)

    return cn, te_mat
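# A minimal illustration of how calculate_te aligns its lagged slices, assuming the
# conventions of this repository: the sequences built by construct_time_series_data are
# indexed backwards from the end time (index 0 is the newest slot), so sample[:-lag]
# holds the later values and sample[lag:] the earlier ones, and cmi(future_i, past_j,
# past_i) estimates TE(j -> i). The helper name _te_slice_example is illustrative only.
def _te_slice_example(lag=1):
    sample_i = np.arange(6).reshape(6, 1)        # toy per-node series, newest first
    sample_j = np.arange(6).reshape(6, 1) * 10
    sample_i_f = sample_i[:-lag]                 # "future" values of node i
    sample_i_p = sample_i[lag:]                  # "past" values of node i
    sample_j_p = sample_j[lag:]                  # "past" values of node j
    # Row t pairs the future value of i with the past values of i and j.
    print(np.hstack((sample_i_f, sample_i_p, sample_j_p)))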
def lda_transform(self, corpus_tf_idf, train_separated=False, is_update=False):
    """
    Init an LDA model with n_topics (default 500), fit it with corpus_tf_idf and transform the corpus.
    :param corpus_tf_idf: Corpus that has already been transformed into a tf-idf matrix.
    :param train_separated: Whether the model is trained on the whole corpus at once or on parts of it separately.
    :param is_update: Whether training constructs a new model or updates an existing one.
    :return: Lda corpus.
    """
    logger.info('Training lda model with %d dims...' % self.nDims)
    if self.dictionary is None and os.path.exists(self.dictPath):
        self.dictionary = corpora.Dictionary.load(self.dictPath)

    if is_update:
        # An ldaModel has been trained before; update it with the new corpus.
        if self.ldaModel is None:
            self.load_model('lda')
        self.ldaModel.update(corpus_tf_idf)
        logger.info('Lda model has been updated successfully.')
        return self.ldaModel[corpus_tf_idf]

    if train_separated:
        # corpus = []
        # spacing = 10000
        # for i in range(int(len(corpus_tf_idf)/spacing)):
        #     corpus.append(corpus_tf_idf[i*spacing: i])
        # self.ldaModel = LdaModel()
        pass

    self.ldaModel = LdaModel(corpus=corpus_tf_idf, num_topics=self.nDims, id2word=self.dictionary)
    conf.mk_dir(self.ldaPath)
    self.ldaModel.save(self.ldaPath)
    logger.info('Lda model has been saved in %s.' % self.ldaPath)

    lda_corpus = self.ldaModel[corpus_tf_idf]
    lda_corpus_path = conf.get_filename_via_tpl('lda', n_users=self.nUsers, n_samples=self.nSamples,
                                                n_dims=self.nDims, postfix='mm')
    conf.mk_dir(lda_corpus_path)
    corpora.MmCorpus.serialize(lda_corpus_path, lda_corpus)
    logger.info('Lda corpus with a shape of %s has been saved in %s.'
                % (np.array(lda_corpus).shape, lda_corpus_path))

    return lda_corpus
def load_corpus(self, model_type, dense=False):
    corpus = None
    try:
        if model_type == 'tfidf':
            corpus = corpora.MmCorpus(conf.get_filename_via_tpl(
                'tfidf', n_users=self.nUsers, postfix='mm', n_samples=self.nSamples))
        elif model_type in ['lsi', 'lda']:
            corpus = corpora.MmCorpus(conf.get_filename_via_tpl(
                model_type, n_users=self.nUsers, n_samples=self.nSamples, n_dims=self.nDims, postfix='mm'))
        elif model_type == 'w2v':
            corpus = np.loadtxt(conf.get_filename_via_tpl(
                model_type, n_users=self.nUsers, n_samples=self.nSamples, n_dims=self.nDims),
                dtype=np.float, delimiter=',')

        logger.info('%s corpus with a shape of %s has been loaded. ' % (model_type, np.array(corpus).shape))

        if dense and model_type in ['tfidf', 'lsi', 'lda']:
            corpus = matutils.corpus2dense(corpus, self.nDims, self.nSamples * self.nUsers, dtype=np.float).T
        else:
            corpus = np.array(corpus)
    except Exception as e:
        raise e
    return corpus
def recover_text_list(n_users, n_samples, debug=False):
    text_list = []
    with open(conf.get_filename_via_tpl('uid', n_users=n_users, n_samples=n_samples)) as fp:
        uid_list = [int(i) for i in fp.readline().split(',')]

    debug_flag = 0
    for uid in uid_list:
        if debug and debug_flag > 0:
            break
        csv.field_size_limit(sys.maxsize)
        with open(conf.get_filename_via_tpl('text', user_id=uid, n_samples=n_samples), encoding='utf-8') as fp:
            csv_reader = csv.reader(fp)
            for line in csv_reader:
                if not len(line) or line[0] == '':
                    text = []
                else:
                    text = line[0].strip().split(' ')
                    while '' in text:
                        text.remove('')
                text_list.append(text)
        logger.info('Successfully recovered user %d\'s data with %d samples. ' % (uid, n_samples))
        debug_flag += 1

    # for i in range(10):
    #     print(text_list[i])
    if debug:
        print(text_list[:100])
    return text_list
def __init__(self, n_users, n_samples, n_dims):
    self.nUsers, self.nSamples, self.nDims = n_users, n_samples, n_dims
    self.tfIdfModel = self.lsiModel = self.ldaModel = self.w2vModel = self.dictionary = None

    self.dictPath = conf.get_filename_via_tpl('model', model_type='tfidf', n_users=n_users,
                                              n_samples=n_samples, model_filename='dict')
    self.tfIdfPath = conf.get_filename_via_tpl('model', model_type='tfidf', n_users=n_users,
                                               n_samples=n_samples, model_filename='tfidf')
    self.lsiPath = conf.get_filename_via_tpl('model', model_type='lsi', n_users=n_users,
                                             n_samples=n_samples, n_dims=n_dims, model_filename='lsi_model')
    self.ldaPath = conf.get_filename_via_tpl('model', model_type='lda', n_users=n_users,
                                             n_samples=n_samples, n_dims=n_dims, model_filename='lda_model')
    self.w2vPath = conf.get_filename_via_tpl('model', model_type='w2v', n_users=n_users,
                                             n_samples=n_samples, n_dims=n_dims, model_filename='w2vmodel')
    self.w2vVecPath = conf.get_filename_via_tpl('model', model_type='w2v', n_users=n_users,
                                                n_samples=n_samples, n_dims=n_dims, model_filename='vec.txt')
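# A minimal end-to-end sketch of the flow through the paths initialised above, assuming
# texts were produced beforehand by construct_time_series_data and recovered with
# recover_text_list; the helper name _pipeline_example is illustrative only.
def _pipeline_example(n_users, n_samples, n_dims):
    text_list = recover_text_list(n_users, n_samples)
    tp = TextProcessor(n_users, n_samples, n_dims)
    tfidf_corpus = tp.tf_idf_transform(text_list)   # also saves the dictionary and model
    tp.lsi_transform(tfidf_corpus)                  # topic vectors via LSI
    tp.lda_transform(tfidf_corpus)                  # topic distributions via LDA
    tp.w2v_transform(text_list)                     # summed word vectors per text
    # Later runs can reload any serialised corpus, e.g. the dense LSI matrix:
    return tp.load_corpus('lsi', dense=True)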
def w2v_transform(self, sentences):
    """
    Perform word2vec on texts and obtain a w2v model.
    :param sentences: Sentences, each of which is the list of words of one text.
    :return: W2v corpus, one summed word vector per sentence.
    """
    logger.info('Training w2v model with a dim of %d...' % self.nDims)
    # file = open(infile_path, 'r', encoding='utf-8') if infile_path.find('\n') < 0 else StringIO(infile_path)
    # sentences = []
    # for sen in file.readlines():
    #     sentences.append(sen.strip().split(' '))
    # print(sentences)
    self.w2vModel = Word2Vec(sentences, size=self.nDims, min_count=0)

    conf.mk_dir(self.w2vPath)
    self.w2vModel.save(self.w2vPath)
    self.w2vModel.wv.save_word2vec_format(self.w2vVecPath, binary=False)
    # print(model['['])

    # Construct the w2v corpus: each text is the element-wise sum of its word vectors.
    w2v_corpus = []
    for sen in sentences:
        vec = [0] * self.nDims
        if len(sen) > 0:
            for word in sen:
                vec = list(map(lambda m, n: m + n, vec, self.w2vModel[word]))
                # vec += self.w2vModel[word]
        w2v_corpus.append(vec)

    w2v_corpus_path = conf.get_filename_via_tpl('w2v', n_users=self.nUsers, n_samples=self.nSamples,
                                                n_dims=self.nDims)
    conf.mk_dir(w2v_corpus_path)
    with open(w2v_corpus_path, 'w') as fp:
        csv_writer = csv.writer(fp)
        for line in w2v_corpus:
            csv_writer.writerow(line)
        logger.info('W2v corpus has been saved in %s. ' % w2v_corpus_path)

    return w2v_corpus
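# A minimal illustration of the document representation built above: every text becomes
# the element-wise sum of its word vectors. Equivalent vectorised form with numpy,
# assuming a trained gensim Word2Vec model is passed in; the helper name _doc_vector is
# illustrative only.
def _doc_vector(w2v_model, sentence, n_dims):
    if not sentence:
        return np.zeros(n_dims)
    return np.sum([w2v_model.wv[word] for word in sentence], axis=0)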
# coding: utf-8
import datetime
import numpy as np
import csv
import utils.config_util as conf
from utils import model_object as mo
from utils.log import get_console_logger

np.set_printoptions(threshold=np.inf)
logger = get_console_logger(__name__)

n_users, n_samples = 12, 2192
original_seq = np.loadtxt(conf.get_filename_via_tpl('seq', n_users=12, n_samples=2192), delimiter=',')
print(original_seq.shape)


def segment_ts_enum():
    """ Segment the time series with time steps constructed by inserting time points one by one into the time steps list. """
    times_steps = {original_seq.shape[1]}
    results = [['obj_func', 'h_seq', 'it', 'penalty']]
    while len(times_steps) < original_seq.shape[1]:
        max_obj = max_idx = -1
        max_line = None
        for i in range(1, original_seq.shape[1]):
            if i not in times_steps:
                temp_steps = [i] + list(times_steps)
def segment_ts_bottom_up(pre_compute=True):
    """ Segment the time series using a bottom-up method; candidate_obj values are calculated as inner products of every two neighbouring time points. """
    start_time = datetime.datetime.now()
    # seq = original_seq.copy()
    seq = original_seq[:, :100].copy()
    # print(seq)
    results = [['time_steps', 'obj_func', 'h_vars', 'h_tps', 'it', 'regularization']]
    # lamb = -0.01
    lamb = -100 / seq.shape[1]

    # Initial
    last_time_steps, last_idx = np.array(range(1, seq.shape[1] + 1)), -1
    last_obj, last_seq, last_obj_details = mo.object_function(seq, last_time_steps, lamb, True)
    results.append([last_time_steps, last_obj] + last_obj_details)
    print('result: ', last_obj)
    print('details:', last_obj_details)
    # return

    if pre_compute:
        mo.pre_compute(seq, last_time_steps)

    # Maximize the object function iteratively.
    while True:
        max_idx, max_time_steps, max_candidate_obj, max_candidate_seq, max_candidate_obj_details = -1, None, -1, None, None
        next_length = last_time_steps.shape[0] - 1

        # Find the maximum obj of the next level.
        for i in range(next_length):
            new_time_steps = np.delete(last_time_steps, i)
            temp_candidate_obj, temp_candidate_seq, temp_candidate_obj_details = \
                mo.object_function(seq, new_time_steps, lamb, True, i, last_seq)
            # print('----')
            # print('time_steps:', new_time_steps)
            # print('temp result: ', temp_candidate_obj)
            # print('temp result terms:', temp_candidate_obj_details)
            if temp_candidate_obj > max_candidate_obj:
                max_idx, max_time_steps, max_candidate_obj, max_candidate_seq, max_candidate_obj_details = \
                    i, new_time_steps, temp_candidate_obj, temp_candidate_seq, temp_candidate_obj_details

        # Loop ending condition: none of the next level's obj results is greater than the last level's.
        if max_idx == -1 or max_candidate_obj < last_obj or last_seq.shape[1] < 4:
            break

        # Merge time points i and (i+1) to boost the seq's obj result.
        if pre_compute:
            mo.pre_compute(seq, max_time_steps, max_idx, last_seq)
        last_idx, last_time_steps, last_obj, last_seq, last_obj_details = \
            max_idx, max_time_steps, max_candidate_obj, max_candidate_seq, max_candidate_obj_details
        results.append([last_time_steps, last_obj] + last_obj_details)

        # print('time steps: ', last_time_steps)
        print('----\nlevel %d' % (seq.shape[1] - last_seq.shape[1]))
        print('result: ', last_obj)
        print('details:', last_obj_details)
        # break
        # return

    finish_time = datetime.datetime.now()
    print('%s elapsed: %f\n--------' % ('precompute' if pre_compute else 'not-precompute',
                                        (finish_time - start_time).total_seconds()))

    # Save results
    filename = conf.get_filename_via_tpl(
        'Obj', n_users=seq.shape[0], n_samples=seq.shape[1],
        date=datetime.datetime.now().strftime('%y%m%d%H%M%S'))
    with open(filename, 'w', newline='') as fp:
        csv_writer = csv.writer(fp)
        csv_writer.writerow(['lambda: %f' % lamb])
        csv_writer.writerows(results)
    return results
def construct_time_series_data(self):
    mblog_info = self.mblogInfo

    def sequence_idx(time_str):
        # Index 0 corresponds to mblog_info.endTime; older posts get larger indices.
        if isinstance(mblog_info.timeStep, int):
            return int((mblog_info.endTime - datetime.strptime(time_str, mblog_info.timeFormat)).total_seconds()
                       / mblog_info.timeStep)
        else:
            steps_count = len(mblog_info.timeStep)
            total_seconds = (mblog_info.endTime - datetime.strptime(time_str, mblog_info.timeFormat)).total_seconds()
            index = int(total_seconds / mblog_info.timeStep[-1])
            rest = total_seconds - index * mblog_info.timeStep[-1]
            for i in range(steps_count):
                if rest < mblog_info.timeStep[i]:
                    return index * steps_count + i
            return (index + 1) * steps_count

    with open(conf.get_absolute_path('data') + 'default_users.txt', encoding='utf-8') as fp:
        lines = fp.readlines()

    user_mblogs_dir = conf.get_absolute_path('data') + 'user_mblogs/'
    self.uidList = []
    for line in lines:
        try:
            uid, count = int(line.split('|')[0]), int(line.split('|')[1])
        except ValueError as e:
            logger.error('Invalid uid or count. %s' % e)
            continue

        filename = '{}-{}.csv'.format(uid, count)
        self.uidList.append(uid)
        absolute_mblogs_fname = user_mblogs_dir + filename

        sequence = [0] * mblog_info.seqLen
        text_list = [''] * mblog_info.seqLen
        with open(absolute_mblogs_fname) as fp:
            csv_reader = csv.reader(fp)
            next(csv_reader)
            try:
                while True:
                    line = next(csv_reader)
                    pub_time = line[2]
                    text = content_filter(line[3]).strip() + ' '
                    idx = sequence_idx(pub_time)
                    try:
                        sequence[idx] = 1
                        text_list[idx] += text
                    except IndexError:
                        # logger.info('Index(%d) out of range. Cause: pub_time(%s) is out of range. ' % (idx, pub_time))
                        print('%d, %d' % (len(sequence), idx))
            except StopIteration:
                self.sequences.append(sequence)
                text_list = tokenize(text_list)
                # self.textList.extend(text_list)
                logger.info('Successfully generated user %d\'s data. ' % uid)

        # Text of one user is saved to one file.
        with open(conf.get_filename_via_tpl('text', user_id=uid, n_samples=mblog_info.seqLen), 'w') as fp:
            csv_writer = csv.writer(fp)
            for row in text_list:
                csv_writer.writerow([row])

    # print(self.uidList)
    self.save_sequences()
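# A minimal illustration of the index arithmetic used by sequence_idx above for a fixed
# integer time step: slot 0 is the interval ending at mblog_info.endTime, and older
# posts get larger indices. The dates and the helper name _sequence_idx_example are
# illustrative only; it assumes the module's `from datetime import datetime` import.
def _sequence_idx_example():
    end_time = datetime(2018, 1, 10)      # stands in for mblog_info.endTime
    time_step = 24 * 3600                 # one-day step, in seconds
    pub_time = datetime(2018, 1, 7, 12, 0)
    idx = int((end_time - pub_time).total_seconds() / time_step)
    print(idx)                            # -> 2, i.e. the third-newest day slot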
# except ValueError as msg:
#     logger.error('Not a valid file. Skip it. ' + str(msg))

# uid_pairs = []
# for uid in uid_list:
#     for uid_2 in uid_list:
#         if uid != uid_2:
#             uid_pairs.append((uid, uid_2))
# users_retweet = check_retweet(uid_pairs)
# with open(conf.get_absolute_path('DATA_ROOT') + '/users_retweet.csv', 'w', encoding='utf-8') as fp:
#     csv_writer = csv.writer(fp)
#     for idx in range(len(uid_pairs)):
#         csv_writer.writerow([str(uid_pairs[idx][0]) + '-->' + str(uid_pairs[idx][1]), str(users_retweet[idx])])

# Make the transfer matrix
uid_list = np.loadtxt(conf.get_filename_via_tpl('uid', n_users=conf.N_USERS, n_samples=conf.N_SAMPLES),
                      delimiter=',', dtype=np.int)
uid_dict = {}
idx = 0
for uid in uid_list:
    uid_dict[uid] = idx
    idx += 1
print(uid_list)

transfer = np.zeros((conf.N_USERS, conf.N_USERS))
with open(conf.get_absolute_path('data') + 'users_retweet.csv', encoding='utf-8') as fp:
    csv_reader = csv.reader(fp)
    for r in csv_reader:
import os
import time

import numpy as np

import utils.config_util as conf
from temp.te import cmidd

os.system('cls')

lagmax = 6
tmax = conf.N_SAMPLES
nnode = conf.N_USERS
resampleTime = 100
length = tmax - lagmax - 1
lag_max_pre = np.eye(nnode, dtype=int)
lag_max_late = np.zeros((nnode, nnode), dtype=np.int)
lag_te = np.zeros((nnode, nnode))

sample = np.loadtxt(conf.get_filename_via_tpl('seq', n_users=nnode, n_samples=conf.N_SAMPLES), delimiter=',')
# sample = sample.T

t = time.time()
for n in range(nnode):  # xrange(0):
    con = {}
    con[(n, 1)] = sample[n, lagmax:tmax - 1]
    x = sample[n, lagmax + 1:tmax]
    redundance = 0
    nodeset = range(nnode)
    for j in range(lagmax):
        n_teList = []
        for i in nodeset:
def evaluate(n_users, n_samples, n_dims):
    """
    Evaluate the result via precision, recall and F1.
    Prints accuracy, precision, recall, F1 and ROC-AUC of the inferred network against the ground-truth transfer matrix.
    """
    result = np.loadtxt(conf.get_filename_via_tpl('te_text', n_users=n_users, n_samples=n_samples, n_dims=n_dims),
                        delimiter=',')
    # print(result)

    new_result = np.zeros(result.shape)
    new_result_state = np.zeros(result.shape).astype(int)
    for i in range(n_users):
        for j in range(n_users):
            if result[i][j] > 0.1:
                new_result[i][j] = result[i][j]
                new_result_state[i][j] = 1
    # for i in range(n_users):
    #     for j in range(i, n_users):
    #         if abs(result[i][j]-result[j][i]) < 0.1:
    #             new_result[i][j] = new_result[j][i] = 0
    # print(new_result)

    with open(conf.get_filename_via_tpl('re', n_users=n_users, n_samples=n_samples, n_dims=n_dims), 'w') as fp:
        csv_writer = csv.writer(fp)
        csv_writer.writerows(new_result)

    comparison = np.loadtxt(conf.RESULT_DIR + '/transfer', delimiter=',', dtype=int)
    # print(comparison)

    print('----Evaluation of %d users, %d samples, %d dims. ----' % (n_users, n_samples, n_dims))
    acc_rate = np.sum(new_result_state == comparison) * 1. / np.power(n_users, 2)

    predict_result = np.zeros((2, 2)).astype(int)
    for i in range(n_users):
        for j in range(n_users):
            predict_result[~comparison[i][j]][~new_result_state[i][j]] += 1
    print(predict_result)

    p = 1. * predict_result[0][0] / (np.sum(predict_result[:, 0]))
    r = 1. * predict_result[0][0] / (np.sum(predict_result[0, :]))
    f1 = (2 * p * r) / (p + r)
    print('Accuracy: %.3f' % acc_rate)
    print('Precision: %.3f' % p)
    print('Recall: %.3f' % r)
    print('F1: %.3f' % f1)

    print('---ROC-AUC---')
    fpr, tpr, thresholds = roc_curve(comparison.reshape(n_users * n_users), new_result.reshape(n_users * n_users))
    print('fpr: ')
    print(fpr)
    print('tpr: ')
    print(tpr)
    print('thresholds: ')
    print(thresholds)
    roc_auc = auc(fpr, tpr)
    print('roc-auc:')
    print(roc_auc)
    print('\n\n')
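# A minimal illustration of the bitwise-not indexing used in evaluate above. Since
# ~1 == -2 and ~0 == -1, a label of 1 selects row/column 0 and a label of 0 selects
# row/column 1, so predict_result ends up laid out as [[TP, FN], [FP, TN]], which makes
# the precision and recall formulas above TP/(TP+FP) and TP/(TP+FN). The helper name
# _confusion_index_example is illustrative only.
def _confusion_index_example():
    m = np.zeros((2, 2), dtype=int)
    for truth, pred in [(1, 1), (1, 0), (0, 1), (0, 0)]:
        m[~truth][~pred] += 1
    print(m)   # one count lands in each of TP, FN, FP, TN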
def segment_ts_bottom_up_test(pre_compute=True):
    start_time = datetime.datetime.now()
    # seq = original_seq.copy()
    seq = original_seq[:, :100].copy()
    results = [['time_steps', 'obj_func', 'h_vars', 'h_tps', 'it', 'regularization']]
    # lamb = -0.01
    lamb = -100 / seq.shape[1]

    # Initial
    last_time_steps, last_idx = np.array(range(1, seq.shape[1] + 1)), -1
    last_obj, last_seq, last_obj_details = mo.object_function(seq, last_time_steps, lamb, True)
    results.append([last_time_steps, last_obj] + last_obj_details)
    print('result: ', last_obj)
    print('details:', last_obj_details)
    # return

    if pre_compute:
        mo.pre_compute(seq, last_time_steps)

    levels = []
    stop_level = 97
    levels_idx = 0
    max_rst = 0.
    while True:
        next_length = last_time_steps.shape[0] - 1
        # Find the maximum obj of the next level.
        for i in range(last_idx + 1, next_length):
            new_time_steps = np.delete(last_time_steps, i)
            temp_candidate_obj, temp_candidate_seq, temp_candidate_obj_details = \
                mo.object_function(seq, new_time_steps, lamb, True, i, last_seq)
            if temp_candidate_obj > max_rst:
                max_rst = temp_candidate_obj
            if next_length > stop_level:
                levels.append([i, new_time_steps, temp_candidate_obj, temp_candidate_seq,
                               temp_candidate_obj_details, last_seq])
            results.append([new_time_steps, temp_candidate_obj] + temp_candidate_obj_details)
        results.append([''])

        if len(levels) <= levels_idx:
            break
        curr_level = levels[levels_idx]
        levels_idx += 1
        last_idx, last_time_steps, last_obj, last_seq, last_obj_details, ll_seq = \
            curr_level[0], curr_level[1], curr_level[2], curr_level[3], curr_level[4], curr_level[5]

        # Merge time points i and (i+1) to boost the seq's obj result.
        if pre_compute:
            mo.pre_compute(seq, last_time_steps)
        # break
        # return

    finish_time = datetime.datetime.now()
    print('%s elapsed: %f\n--------' % ('precompute' if pre_compute else 'not-precompute',
                                        (finish_time - start_time).total_seconds()))
    print('maximum: ', max_rst)

    # Save results
    filename = conf.get_filename_via_tpl(
        'Obj', n_users=seq.shape[0], n_samples=seq.shape[1],
        date=datetime.datetime.now().strftime('%y%m%d%H%M%S'))
    with open(filename, 'w', newline='') as fp:
        csv_writer = csv.writer(fp)
        csv_writer.writerow(['lambda: %f' % lamb])
        csv_writer.writerows(results)
    return results
# coding: utf-8
import numpy as np
import utils.config_util as conf
import utils.entropy_estimators as ee
from gen_data import DataGenerator, MblogInfo

# Read the sequences of all users, which have been processed with the smallest time step.
data_info = {'n_users': 12, 'n_samples': 2192, 'n_dims': 100}
seq_filename = conf.get_filename_via_tpl('seq', n_users=data_info['n_users'], n_samples=data_info['n_samples'])
sequences = np.loadtxt(seq_filename, np.int, delimiter=',')
# print(sequences)
# print(sequences.shape)

# Set an active rate and find the optimal time steps that maximise the joint entropy.
active_rate = .4
active_count = active_rate * data_info['n_users']
hist = sequences.sum(0)
active_status = np.zeros(sequences.shape[1])
active_status[np.where(hist > active_count)] = 1
# print(active_status)
# print(len(active_status))
# print(sum(active_status))

last_joint_entropy = None


def merge_sub_sequence(s, e, seqs):
    global last_joint_entropy
def testTE(lagmax, tmax, resampleTime, nnode, sample, indegreeaverage):
    """ Function of TE (transfer entropy). """
    length = tmax - lagmax - 1
    lag_max_pre = np.eye(nnode, dtype=int)
    lag_max_late = np.zeros((nnode, nnode), dtype=np.int)
    lag_te = np.zeros((nnode, nnode))
    # sample = sample.T

    for n in range(nnode):
        con = {}
        con[(n, 1)] = sample[n, lagmax:tmax - 1]
        x = sample[n, lagmax + 1:tmax]
        nodeset = range(nnode)

        ####find lag&pc####
        for i in nodeset:
            if n == i:
                temp_te = cmidd(x, sample[i, (lagmax - 1):(tmax - 2)], con)
                temp_te_2 = np.array([
                    cmidd(x, random.sample(sample[i, (lagmax - 1):(tmax - 2)].tolist(), length), con)
                    for m in range(resampleTime)
                ])
                if len(temp_te_2[temp_te_2 > temp_te]) / 100.00 < 0.01:
                    lag_max_late[i, n] += 1
                    lag_max_pre[i, n] += 1
                    lag_te[n, i] = temp_te
            else:
                temp_te = cmidd(x, sample[i, (lagmax + 1):(tmax)], con)
                temp_te_2 = np.array([
                    cmidd(x, random.sample(sample[i, (lagmax + 1):(tmax)].tolist(), length), con)
                    for m in range(resampleTime)
                ])
                if len(temp_te_2[temp_te_2 > temp_te]) / 100.00 < 0.01:
                    lag_max_late[i, n] += 1
                    lag_max_pre[i, n] += 1
                    lag_te[n, i] = temp_te

    for n in range(nnode):
        for i in range(nnode):
            if n != i:
                if lag_te[n, i] >= lag_te[i, n]:
                    lag_max_late[i, n] = 0
                elif lag_te[i, n]:
                    lag_max_late[n, i] = 0
            else:
                break

    np.savetxt(conf.get_filename_via_tpl('te_lag_max_pre', n_users=nnode, n_samples=tmax),
               lag_max_pre, fmt='%d', delimiter=',')
    np.savetxt(conf.get_filename_via_tpl('te_lag_max_late', n_users=nnode, n_samples=tmax),
               lag_max_late, fmt='%d', delimiter=',')
    np.savetxt(conf.get_filename_via_tpl('te_lag_te', n_users=nnode, n_samples=tmax),
               lag_te, delimiter=',')
    logger.info('TE results have been saved in result folder.')

    return lag_max_pre, lag_max_late, lag_te
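# A minimal usage sketch of running testTE on a small random binary sample. It assumes
# the module-level imports testTE relies on (numpy, random, cmidd, conf, logger) and
# that conf's 'te_*' filename templates accept these arguments; the helper name
# _test_te_example is illustrative only.
def _test_te_example():
    toy_sample = (np.random.rand(4, 200) > 0.5).astype(int)   # 4 nodes, 200 time points
    pre, late, te = testTE(lagmax=3, tmax=200, resampleTime=20,
                           nnode=4, sample=toy_sample, indegreeaverage=None)
    print(te)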
def testMNR(lagmax, tmax, resampleTime, nnode, sample, indegreeaverage):
    """ Function of MNR. """
    length = tmax - lagmax - 1
    lag_max_pre = np.eye(nnode, dtype=int)
    lag_max_late = np.zeros((nnode, nnode), dtype=np.int)
    lag_te = np.zeros((nnode, nnode))
    # sample = sample.T

    for n in range(nnode):
        con = {}
        con[(n, 1)] = sample[n, lagmax:tmax - 1]
        x = sample[n, lagmax + 1:tmax]
        nodeset = range(nnode)

        ####find lag&pc####
        for j in range(lagmax):
            n_teList = []
            for i in nodeset:
                if n == i:
                    temp_te = cmidd(x, sample[i, (lagmax - j - 1):(tmax - j - 2)], con)
                    temp_te_2 = np.array([
                        cmidd(x, random.sample(sample[i, (lagmax - j - 1):(tmax - j - 2)].tolist(), length), con)
                        for m in range(resampleTime)
                    ])
                    if len(temp_te_2[temp_te_2 > temp_te]) / 100.00 < 0.01:
                        n_teList.append(i)
                        con[(i, j + 2)] = sample[i, (lagmax - j - 1):(tmax - j - 2)]
                else:
                    temp_te = cmidd(x, sample[i, (lagmax - j + 1):(tmax - j)], con)
                    temp_te_2 = np.array([
                        cmidd(x, random.sample(sample[i, (lagmax - j + 1):(tmax - j)].tolist(), length), con)
                        for m in range(resampleTime)
                    ])
                    if len(temp_te_2[temp_te_2 > temp_te]) / 100.00 < 0.01:
                        n_teList.append(i)
                        con[(i, j + 1)] = sample[i, (lagmax - j + 1):(tmax - j)]
            lag_max_pre[n, n_teList] += 1
            if len(n_teList):
                nodeset = n_teList[:]
            else:
                break

        ####remove lag&pc####
        nodeindex = lag_max_pre[n].nonzero()[0]
        for i in nodeindex:
            tem_con = deepcopy(con)
            j = lag_max_pre[n, i]
            while (j > 0) and bool(len(con)):
                tem_con = deepcopy(con)
                y_next = tem_con.pop((i, j))
                temp_te = cmidd(x, y_next, tem_con)
                temp_te_2 = np.array([
                    cmidd(x, random.sample(list(y_next), length), tem_con)
                    for m in range(resampleTime)
                ])
                if len(temp_te_2[temp_te_2 > temp_te]) / 100.00 < 0.01:
                    break
                else:
                    con = tem_con
                    j -= 1
            if j and len(con):
                for l in range(1, j + 1):
                    tem_con = deepcopy(con)
                    lag_te[n, i] += cmidd(x, tem_con.pop((i, l)), tem_con)
                lag_max_late[n, i] = j

    for n in range(nnode):
        for i in range(nnode):
            if n != i:
                if lag_te[n, i] >= lag_te[i, n]:
                    lag_max_late[i, n] = 0
                elif lag_te[i, n]:
                    lag_max_late[n, i] = 0
            else:
                break

    np.savetxt(conf.get_filename_via_tpl('mnr_lag_max_pre', n_users=nnode, n_samples=tmax),
               lag_max_pre, fmt='%d', delimiter=',')
    np.savetxt(conf.get_filename_via_tpl('mnr_lag_max_late', n_users=nnode, n_samples=tmax),
               lag_max_late, fmt='%d', delimiter=',')
    np.savetxt(conf.get_filename_via_tpl('mnr_lag_te', n_users=nnode, n_samples=tmax),
               lag_te, delimiter=',')
    logger.info('MNR results have been saved in result folder.')

    return lag_max_pre, lag_max_late, lag_te
def construct_with_diff_ts(self, new_mblog_info):
    n_users, n_samples = 12, 2192
    uid_list = np.loadtxt(conf.get_filename_via_tpl('uid', n_users=n_users, n_samples=n_samples),
                          delimiter=',', dtype=np.int)
    original_seq = np.loadtxt(conf.get_filename_via_tpl('seq', n_users=n_users, n_samples=n_samples),
                              delimiter=',', dtype=np.int)

    original_text_list = []
    for uid in uid_list:
        with open(conf.get_filename_via_tpl('text', user_id=uid, n_samples=n_samples), encoding='utf-8') as fp:
            csv_reader = csv.reader(fp)
            text = [line[0].strip() if len(line) > 0 else '' for line in csv_reader]
            original_text_list.append(text)
            assert len(text) == 2192, 'Texts of user %d are not enough. ' % uid

    time_steps = new_mblog_info.timeStep
    if n_samples == len(new_mblog_info.timeStep):
        return original_seq, original_text_list

    new_seq = np.zeros((original_seq.shape[0], len(time_steps)), np.int)
    # Build one independent list per row; multiplying the outer list would alias every
    # row to the same inner list and the per-user texts would overwrite each other.
    new_text_list = [[''] * len(time_steps) for _ in range(original_seq.shape[0])]

    nidx = oidx = 0
    for time_point in time_steps:
        step = int(time_point / (24 * 3600)) - oidx
        # if step == 1:
        #     new_seq[:, nidx] = original_seq[:, oidx]
        #     new_text_list[:, nidx] = original_text_list[:, oidx]
        # else:
        for r in range(original_seq.shape[0]):
            new_seq[r, nidx] = sum(original_seq[r, oidx:oidx + step])
            new_text_list[r][nidx] = original_text_list[r][oidx]
            for c in range(1, step):
                new_text_list[r][nidx] = new_text_list[r][nidx] + ' ' + original_text_list[r][oidx + c]
                # print(original_text_list[r][oidx + c])
        new_seq[new_seq[:, nidx] > 0, nidx] = 1
        oidx += step
        nidx += 1

    assert nidx == len(time_steps), 'nidx != len(time_steps)'
    assert oidx == original_seq.shape[1], 'Total amount of time steps is smaller than sample length.'

    self.uidList = uid_list
    self.sequences = new_seq
    self.save_sequences(new_mblog_info)
    for uid, texts in zip(uid_list, new_text_list):
        with open(conf.get_filename_via_tpl('text', user_id=uid, n_samples=new_mblog_info.seqLen), 'w') as fp:
            csv_writer = csv.writer(fp)
            for row in texts:
                csv_writer.writerow([row])

    return new_seq, new_text_list