class SparseSigniContainer:

    _THRESHOLD_FOR_CLEANING = eval(exp_config.get('detection', 'threshold_for_cleaning'))
    _CAPACITY_FOR_CLEANING = eval(exp_config.get('detection', 'capacity_for_cleaning'))

    def __init__(self):
        self.container = {}

    def _clean(self, _timestamp):
        to_be_cleaned_up = []
        for key, value in self.container.iteritems():
            value.observe(_timestamp, 0.)
            if value.ewma <= self._THRESHOLD_FOR_CLEANING:
                to_be_cleaned_up.append(key)
        print 'cleaning', len(to_be_cleaned_up), 'items...'
        for key in to_be_cleaned_up:
            self.container.pop(key)

    def get(self, _id, _timestamp):
        # check for cleaning
        if len(self.container) > self._CAPACITY_FOR_CLEANING:
            self._clean(_timestamp)
        # return
        if _id in self.container:
            return self.container[_id]
        else:
            sig_scorer = fast_signi.SignificanceScorer()
            self.container[_id] = sig_scorer
            return sig_scorer
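# A minimal usage sketch of the container above (illustrative, not part of the original
# source; it assumes fast_signi and the [detection] config entries are available, and that
# timestamps are datetime objects as elsewhere in this project).
import datetime

container = SparseSigniContainer()
t = datetime.datetime(2016, 3, 10, 12, 0)
scorer = container.get('haze', t)  # first access creates and caches a SignificanceScorer
scorer.observe(t, 1.)              # observe(timestamp, count), the same call used in _clean()
# Once more than _CAPACITY_FOR_CLEANING ids are cached, the next get() triggers _clean(),
# which decays every scorer and drops those whose EWMA is at or below _THRESHOLD_FOR_CLEANING.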
def __init__(self, start, end):
    self.deq = deque([])
    self.start = start
    self.end = end
    self.delta = td(minutes=5)
    self.lag = td(minutes=60 * 3)
    self.host = exp_config.get('database', 'host')
    self.user = exp_config.get('database', 'user')
    self.db = exp_config.get('database', 'db')
    self.charset = exp_config.get('database', 'charset')
    self.connection = MySQLdb.connect(host=self.host,
                                      user=self.user,
                                      db=self.db,
                                      charset=self.charset,
                                      passwd='123456')
    cursor = self.connection.cursor()
    cursor.execute("desc timelines2")
    id = 0
    self.id_map = {}
    for column in cursor.fetchall():
        self.id_map[column[0]] = id
        id += 1
    cursor.close()
def __init__(self, _ptw_stream):
    self.ptw_stream = _ptw_stream
    _wz = eval(exp_config.get('significance', 'window_size'))
    _cycle = eval(exp_config.get('significance', 'cycle'))
    _average = eval(exp_config.get('significance', 'average'))
    print 'significance # set parameters:' + str(
        fast_signi.SignificanceScorer.set_window_size(_wz, _cycle, _average))
    _uz = eval(exp_config.get('acceleration', 'unit_size'))
    print 'acceleration # set unit size ' + str(fast_smoother.set_unit_size(_uz))
    _wz1 = eval(exp_config.get('acceleration', 'window_size1'))
    _wz2 = eval(exp_config.get('acceleration', 'window_size2'))
    print 'acceleration # set windows ' + str(
        fast_smoother.EWMASmoother.set_window_size(_wz1, _wz2))
    self.processor = signi_processor.SigProcessor()
    self.threads = list()
    _start_t = exp_config.get('detection', 'start_t')
    _end_t = exp_config.get('detection', 'end_t')
    self._start_t = datetime.datetime.strptime(_start_t, '%Y-%m-%d %H:%M:%S')
    self._end_t = datetime.datetime.strptime(_end_t, '%Y-%m-%d %H:%M:%S')
def name2sim(input_names, prefix='name', return_matrix=False):
    print('prefix in name2sim', prefix)  # debugging
    PATH = exp_config.get('data', 'path')
    method = exp_config.get('predicate_name', 'method')
    ip_ad = exp_config.get('dispy', 'ip')
    port = int(exp_config.get('dispy', 'port'))
    remote_path = exp_config.get('dispy', 'remote_path')
    print('In name2sim() method ', method)
    assert method in ['jaro_winkler', 'tfidf']
    if os.path.isfile(PATH + prefix + '_list_' + method + '.txt'):
        names = list()
        fin = codecs.open(PATH + prefix + '_list_' + method + '.txt', 'r', 'utf-8')
        for line in fin:
            names.append(line[:-1])
        fin.close()
    else:
        names = input_names
        fout = codecs.open(PATH + prefix + '_list_' + method + '.txt', 'w', 'utf-8')
        for name in names:
            fout.write(name)
            fout.write('\n')
        fout.close()
    sim = None
    if method == 'jaro_winkler':
        pass
    if method == 'tfidf':
        sim = dis_ngram_sim.distributed_name2sim_tfidf(names,
                                                       '.',
                                                       remote_path,
                                                       32,
                                                       nodes=[(ip_ad, port)],
                                                       return_sim=return_matrix)
    name2eid = dict(zip(names, range(len(names))))
    name2eid[None] = len(names)
    return name2eid, sim
def process(self, sig_instance, sig_list=None):
    _t, _count, _ewma, _ewmvar, _sig, _keywords = sig_instance
    if _t < self._start_t or _t > self._end_t:
        return 0.
    if eval(exp_config.get('output', 'debug_info')):
        if sig_list:
            # for debugging
            print '-----------------------'
            for sig_ in sig_list:
                print '__sig__', sig_
            print '-----------------------'
    create_new = True
    for thread in self.threads:
        if thread.add_to_thread(sig_instance):
            create_new = False
            break
    if create_new:
        thread = Slice()
        thread.new_thread(sig_instance)
        self.threads.append(thread)
        return _sig
    return 0.
def __init__(self):
    _start_y = int(exp_config.get('stream', 'start_y'))
    _start_m = int(exp_config.get('stream', 'start_m'))
    _start_d = int(exp_config.get('stream', 'start_d'))
    _end_y = int(exp_config.get('stream', 'end_y'))
    _end_m = int(exp_config.get('stream', 'end_m'))
    _end_d = int(exp_config.get('stream', 'end_d'))
    self.dy_start = date(_start_y, _start_m, _start_d)
    self.dy_end = date(_end_y, _end_m, _end_d)
    self.connection = MySQLdb.connect(host='?', user='******', db='?', charset='utf8')
    cursor = self.connection.cursor()
    cursor.execute("desc weibo_timelines")
    id = 0
    self.id_map = {}
    for column in cursor.fetchall():
        self.id_map[column[0]] = id
        id += 1
    self.cursor = self.connection.cursor()
    _time0 = self.dy_start.strftime("%Y-%m-%d")
    _time1 = (self.dy_start + td(days=1)).strftime("%Y-%m-%d")
    sql_str = 'select * from ' + 'weibo_timelines' + \
        ' where created_at >= "%s" and created_at < "%s" order by created_at' % (_time0, _time1)
    print sql_str
    self.cursor.execute(sql_str)
def next(self):
    ptweet = self.ptw_stream.next()
    if ptweet is ts_stream.End_Of_Stream:
        return ts_stream.End_Of_Stream
    if ptweet is None:
        return None
    sig_instance, sig_list = self.processor.process(ptweet)
    if sig_instance is not None:
        output = self.process(sig_instance, sig_list)
        if eval(exp_config.get('output', 'debug_info')):
            print sig_instance
        return ptweet, output
    return ptweet, 0.0
def get_top_sig():
    terms = []
    _f = open(exp_config.get('process_detection_log', 'log_file'), 'r')  #!!!
    for line in _f:
        if not line.startswith('201'):
            continue
        line = line.rstrip('\n')
        term = line.split('\t')
        terms.append(term)
    terms.sort(key=lambda x: terms_to_values(x)[4], reverse=True)
    for term in terms:
        _t, _count, _ewma, _ewmvar, _sig, _keywords = terms_to_values(term)
        print _t, _count, _ewma, _ewmvar, _sig, _keywords
def __init__(self, _ptw_stream):
    self.ptw_stream = _ptw_stream
    _wz = eval(exp_config.get('detection', 'window_size'))
    _cycle = eval(exp_config.get('detection', 'cycle'))
    _average = eval(exp_config.get('detection', 'average'))
    print 'set signi parameters:' + str(
        fast_signi.SignificanceScorer.set_window_size(_wz, _cycle, _average))
    self.processor = signi_processor.SigProcessor()
    self.threads = list()
    _start_y = int(exp_config.get('detection', 'start_y'))
    _start_m = int(exp_config.get('detection', 'start_m'))
    _start_d = int(exp_config.get('detection', 'start_d'))
    self._start_t = datetime.datetime(_start_y, _start_m, _start_d)
    self.stop_tokens = {
        'instagram', 'facebook', 'twitter', 'update', 'pic', 'picture', 'cr',
        'damn', 'hashtag'
    }
__affiliation__ = 'Living Analytics Research Centre, Singapore Management University'
__website__ = 'http://mysmu.edu/phdis2012/wei.xie.2012'

import threading
from collections import deque
import tensorflow as tf
import numpy as np
from array import array
import time
from functools import partial

import exp_config

DEBUG_FLAG = eval(exp_config.get('debug', 'flag'))
BATCH_SIZE = eval(exp_config.get('cosine_embedding', 'batch_size'))
LEARNING_RATE = eval(exp_config.get('cosine_embedding', 'learning_rate'))
PARTITION_PATH = exp_config.get('cosine_embedding', 'partition_path')

output_info = open('cosine_embedding_for_sparse_input_files_output.txt', 'wt')


def _variable(name, shape, dtype, initializer):
    var = tf.get_variable(name, shape, initializer=initializer, dtype=dtype)
    return var


def _step(i_partition,
__author__ = 'Wei Xie'
__email__ = '*****@*****.**'
__affiliation__ = 'Pinnacle Lab for Analytics, Singapore Management University'
__website__ = 'http://mysmu.edu/phdis2012/wei.xie.2012'

import datetime
import exp_config
from datetime import datetime as date

_THREAD_GAP = eval(exp_config.get('process_detection_log', 'thread_gap'))


def terms_to_values(terms):
    _t = datetime.datetime.strptime(terms[0], '%Y-%m-%d %H:%M:%S')
    _count = float(terms[1])
    _ewma = float(terms[2])
    _ewmvar = float(terms[3])
    _sig = float(terms[4])
    _keywords = terms[5]
    return _t, _count, _ewma, _ewmvar, _sig, _keywords


class Slice:

    def __init__(self):
        self.start = 0.0
        self.end = 0.0
        self.keywords = None
        self.sig = 0.0
        self.first_sig = 0.0
        self.thread = []
        self.first_keywords = None
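# Illustrative only (not in the original file): parsing one tab-separated detection-log line
# with terms_to_values(); the sample line below is made up.
_sample_line = '2016-03-10 12:00:00\t5\t0.1\t0.01\t3.2\thaze psi'
print(terms_to_values(_sample_line.split('\t')))
# -> (datetime.datetime(2016, 3, 10, 12, 0), 5.0, 0.1, 0.01, 3.2, 'haze psi')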
__author__ = 'Wei Xie'
__email__ = '*****@*****.**'
__affiliation__ = 'Living Analytics Research Centre, Singapore Management University'
__website__ = 'http://mysmu.edu/phdis2012/wei.xie.2012'

import topicsketch.stemmer as stemmer
import fast_signi
import fast_smoother
import exp_config

_SIGNI_THRESHOLD = eval(exp_config.get('detection', 'detection_threshold'))
_SIGNI_TYPE = exp_config.get('detection', 'detection_signi_type')


class SparseSmootherContainer:

    _THRESHOLD_FOR_CLEANING = eval(exp_config.get('detection', 'threshold_for_cleaning'))
    _CAPACITY_FOR_CLEANING = eval(exp_config.get('detection', 'capacity_for_cleaning'))

    def __init__(self):
        self.container = {}

    def _clean(self, _timestamp):
        to_be_cleaned_up = []
        max_v = 0.
        for key, value in self.container.iteritems():
            value.observe(_timestamp, 0.)
            if _SIGNI_TYPE == 's':
__author__ = 'Wei Xie'
__email__ = '*****@*****.**'
__affiliation__ = 'Pinnacle Lab for Analytics, Singapore Management University'
__website__ = 'http://mysmu.edu/phdis2012/wei.xie.2012'

import redis
import exp_config

_KEY_MONITOR_CHANNEL = 'twitter:sg:event:python:monitor_ab_test:a'
_HOST = exp_config.get('stream', 'host')

print '_KEY_MONITOR_CHANNEL', _KEY_MONITOR_CHANNEL

### clear memory ########
db = redis.StrictRedis(_HOST, port=8181)
db.delete(_KEY_MONITOR_CHANNEL)
#########################


def report_rate(t, r):
    if not _KEY_MONITOR_CHANNEL:
        return
    db = redis.StrictRedis(_HOST, port=8181)
    db.lpush(_KEY_MONITOR_CHANNEL, (t, r))
import event_output
import topic_sketch.preprocessor as preprocessor
import search
import imagecrawler
import twokenize
import tweet_stream
import user_info
import exp_config

_THREAD_GAP = eval(exp_config.get('detection', 'thread_gap'))

ic = imagecrawler.ImageCrawler()


class Keyword:

    def __init__(self, _word):
        self.word = _word
        self.records = list()

    def appear(self, _t, _sig):
        self.records.append((_t, _sig))

    def span(self):
        return (self.records[-1][0] - self.records[0][0]).total_seconds() / (60 * 60)  # hours
def name2sim(input_names, n_dim, prefix='name', with_embedding=True):
    PATH = exp_config.get('data', 'path')
    method = exp_config.get('predicate_name', 'method')
    embedding_iters = eval(exp_config.get('cosine_embedding', 'n_iter'))
    print 'In name2sim() method ', method, ' embedding_iters ', embedding_iters
    assert method in ['jaro_winkler', 'tfidf']
    if os.path.isfile(PATH + prefix + '_list_' + method + '.txt'):
        names = list()
        fin = codecs.open(PATH + prefix + '_list_' + method + '.txt', 'r', 'utf-8')
        for line in fin:
            names.append(line[:-1])
        fin.close()
    else:
        names = set()
        for name in input_names:
            if name is not None:
                names.add(name)
        names = list(names)
        fout = codecs.open(PATH + prefix + '_list_' + method + '.txt', 'w', 'utf-8')
        for name in names:
            fout.write(name)
            fout.write('\n')
        fout.close()
    sim = None
    if method == 'jaro_winkler':
        if os.path.isfile(PATH + prefix + '_sim_' + method + '.npz'):
            sim = scipy.sparse.load_npz(PATH + prefix + '_sim_' + method + '.npz')
        else:
            sim = ngram_sim.name2sim_jw(names)
            scipy.sparse.save_npz(PATH + prefix + '_sim_' + method + '.npz', sim)
    if method == 'tfidf':
        if os.path.isfile(PATH + prefix + '_sim_' + method + '.npz'):
            sim = scipy.sparse.load_npz(PATH + prefix + '_sim_' + method + '.npz')
        else:
            sim = ngram_sim.name2sim_tfidf(names)
            scipy.sparse.save_npz(PATH + prefix + '_sim_' + method + '.npz', sim)
    if with_embedding:
        if os.path.isfile(PATH + prefix + '_embeddings_' + method + '_' + str(n_dim) + '.npy'):
            embeddings = np.load(PATH + prefix + '_embeddings_' + method + '_' + str(n_dim) + '.npy')
        else:
            embeddings = cosine_embedding_for_sparse_input.embed(sim, n_dim, embedding_iters)
            np.save(PATH + prefix + '_embeddings_' + method + '_' + str(n_dim) + '.npy',
                    embeddings)
        embeddings = np.append(embeddings, np.zeros((1, n_dim), dtype=np.float32), axis=0)
    else:
        embeddings = None
    name2eid = dict(zip(names, range(len(names))))
    name2eid[None] = len(names)
    return name2eid, sim, embeddings
exp_config.set('triplet_embedding', 'nce_sampling', args.nce_sampling[0])
if args.triplet_embedding_batch_size is not None:
    exp_config.set('triplet_embedding', 'batch_size', args.triplet_embedding_batch_size[0])
if args.triplet_embedding_learning_rate_f is not None:
    exp_config.set('triplet_embedding', 'learning_rate_f', args.triplet_embedding_learning_rate_f[0])
if args.triplet_embedding_learning_rate_a is not None:
    exp_config.set('triplet_embedding', 'learning_rate_a', args.triplet_embedding_learning_rate_a[0])
if args.stratified_attribute is not None:
    exp_config.set('evaluation', 'stratified_attribute', args.stratified_attribute[0])

PATH = exp_config.get('data', 'path')
SOURCE_PREFIX = exp_config.get('data', 'source_prefix')
TARGET_PREFIX = exp_config.get('data', 'target_prefix')
SOURCE_COL = int(exp_config.get('data', 'source_col'))
TARGET_COL = int(exp_config.get('data', 'target_col'))
USER_DIM = exp_config.get('triplet_embedding', 'user_dim')
ITER_NUM = exp_config.get('triplet_embedding', 'n_iter')
WARM_UP_NUM = exp_config.get('triplet_embedding', 'warm_up_iter')
NCE_NUM = exp_config.get('triplet_embedding', 'nce_sampling')
SUPERVISED_FLAG = eval(exp_config.get('triplet_embedding', 'supervised'))
BIAS_FLAG = eval(exp_config.get('triplet_embedding', 'bias'))
screen_name_exist = eval(exp_config.get('predicate_name', 'screen_name_exist'))
def image2sim(input_images, prefix='image'):
    PATH = exp_config.get('data', 'path')
    IDENTICAL_T = eval(exp_config.get('predicate_image', 'identical_threshold'))
    method = exp_config.get('predicate_image', 'method')
    embedding_iters = eval(exp_config.get('cosine_embedding', 'n_iter'))
    assert method in [
        'identical', 'vgg16', 'vgg19', 'xception', 'inception_resnet_v2', 'vggface'
    ]
    print('input_images', len(input_images))
    if os.path.isfile(PATH + prefix + '_list' + '.txt'):
        images = list()
        fin = open(PATH + prefix + '_list' + '.txt', 'r')
        for line in fin:
            images.append(line[:-1])
        fin.close()
    else:
        images = list()
        for image in input_images:
            if image is not None:
                images.append(image)
        fout = open(PATH + prefix + '_list' + '.txt', 'w')
        for image in images:
            fout.write(image)
            fout.write('\n')
        fout.close()
    if method == 'identical':
        if os.path.isfile(PATH + prefix + '_sim_' + method + '.npz'):
            sim = scipy.sparse.load_npz(PATH + prefix + '_sim_' + method + '.npz')
        else:
            funs = [imagehash.average_hash, imagehash.phash, imagehash.dhash, imagehash.whash]
            im_objs = list()
            for image in images:
                im_objs.append(Image.open(PATH + image))
            print('images', len(images), 'im_objs', len(im_objs))
            vs = list()
            for i in xrange(len(im_objs)):
                obj_i = im_objs[i]
                v_i = np.array([fun(obj_i) for fun in funs])
                vs.append(v_i)
            sim = dok_matrix((len(images), len(images)), dtype=np.float32)
            for i in xrange(len(images)):
                current_t = time.time()
                v_i = vs[i]
                for j in xrange(len(images)):
                    v_j = vs[j]
                    s = np.median(v_i - v_j)
                    if s < IDENTICAL_T:
                        sim[i, j] = (IDENTICAL_T - s) / IDENTICAL_T
                print('processing images ', i, 100 * i // len(images),
                      time.time() - current_t, 's')
            sim = sim.asformat('csr')
            scipy.sparse.save_npz(PATH + prefix + '_sim_' + method + '.npz', sim)
    if method in ['vgg16', 'vgg19', 'xception', 'inception_resnet_v2', 'vggface']:
        if method == 'vgg16':
            from keras.applications.vgg16 import VGG16
            from keras.preprocessing import image as keras_image
            from keras.applications.vgg16 import preprocess_input
            model = VGG16(weights='imagenet', include_top=False)
        if method == 'vgg19':
            from keras.applications.vgg19 import VGG19
            from keras.preprocessing import image as keras_image
            from keras.applications.vgg19 import preprocess_input
            model = VGG19(weights='imagenet', include_top=False)
        if method == 'xception':
            from keras.applications.xception import Xception
            from keras.preprocessing import image as keras_image
            from keras.applications.xception import preprocess_input
            model = Xception(weights='imagenet', include_top=False)
        if method == 'inception_resnet_v2':
            from keras.applications.inception_resnet_v2 import InceptionResNetV2
            from keras.preprocessing import image as keras_image
            from keras.applications.inception_resnet_v2 import preprocess_input
            model = InceptionResNetV2(weights='imagenet', include_top=False)
        if method == 'vggface':
            print('vggface')
            from keras_vggface.vggface import VGGFace
            from keras.preprocessing import image as keras_image
            from keras_vggface.utils import preprocess_input
            model = VGGFace(include_top=False)

        def get_feature(img_path):
            img = keras_image.load_img(img_path, target_size=(224, 224))
            x = keras_image.img_to_array(img)
            x = np.expand_dims(x, axis=0)
            x = preprocess_input(x)
            feature = model.predict(x)
            return feature

        if os.path.isfile(PATH + prefix + '_embeddings_' + method + '.npy'):
            embeddings = np.load(PATH + prefix + '_embeddings_' + method + '.npy')
        else:
            print('get image features')  # debug
            embeddings = list()
            for image in images:
                embeddings.append(get_feature(PATH + image).flatten())
                print('process', image)
            embeddings = np.array(embeddings, dtype=np.float32)
            np.save(PATH + prefix + '_embeddings_' + method + '.npy', embeddings)
        if os.path.isfile(PATH + prefix + '_sim_' + method + '.npz'):
            sim = scipy.sparse.load_npz(PATH + prefix + '_sim_' + method + '.npz')
        else:
            lsh_instance = lsh.LSH(8, 5)
            indices = lsh_instance.load(embeddings)
            sim = dok_matrix((len(images), len(images)), dtype=np.float32)
            for i in range(len(images)):
                v_i = embeddings[i]
                for j in lsh_instance.query(indices[i]):
                    v_j = embeddings[j]
                    sim[i, j] = similarity(v_i, v_j)
                sys.stdout.write("\r%d%%" % (100 * i // len(images)))
                sys.stdout.flush()
            sim = sim.asformat('csr')
            scipy.sparse.save_npz(PATH + prefix + '_sim_' + method + '.npz', sim)
    image2eid = dict(zip(images, range(len(images))))
    image2eid[None] = len(images)
    return image2eid, sim
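# The similarity() helper used in the LSH loop above is not shown in this excerpt. A plausible
# cosine-similarity implementation is sketched here as an assumption, not the project's code.
import numpy as np


def similarity(v_i, v_j):
    # cosine similarity between two flattened feature vectors; 0 if either is all zeros
    denom = np.linalg.norm(v_i) * np.linalg.norm(v_j)
    return float(np.dot(v_i, v_j) / denom) if denom > 0 else 0.0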
def process():
    ###
    _start_y = int(exp_config.get('process_detection_log', 'start_y'))
    _start_m = int(exp_config.get('process_detection_log', 'start_m'))
    _start_d = int(exp_config.get('process_detection_log', 'start_d'))
    _end_y = int(exp_config.get('process_detection_log', 'end_y'))
    _end_m = int(exp_config.get('process_detection_log', 'end_m'))
    _end_d = int(exp_config.get('process_detection_log', 'end_d'))
    _start_t = date(_start_y, _start_m, _start_d)
    _end_t = date(_end_y, _end_m, _end_d)
    _threshold = eval(exp_config.get('process_detection_log', 'signi_threshold'))
    ###
    threads = []
    _f = open(exp_config.get('process_detection_log', 'log_file'), 'r')  #!!!
    for line in _f:
        if not line.startswith('201'):
            continue
        line = line.rstrip('\n')
        terms = line.split('\t')
        _t, _count, _ewma, _ewmvar, _sig, _keywords = terms_to_values(terms)
        if _t < _start_t or _t > _end_t:
            continue
        if _sig < _threshold:
            continue
        create_new = True
        for thread in threads:
            if thread.add_to_thread(terms):
                create_new = False
                break
        if create_new:
            thread = Slice()
            thread.new_thread(terms)
            threads.append(thread)
    _f.close()
    #threads.sort(key = lambda x: x.sig, reverse = True)
    threads.sort(key=lambda x: x.start)
    _s = set()  # for debugging
    count = 0
    for thread in threads:
        print thread.start, thread.end, thread.sig, thread.keywords
        '''
        print '\hline'
        topic = ''
        for word in thread.keywords:
            topic = topic + word + ', '
        print thread.start.strftime('%Y-%m-%d') + '&' + topic + '&\\\\'
        count += 1
        _s.add(thread.start.strftime('%Y-%m-%d'))
        #print count, len(_s)
        if count == 50:
            break
        '''
    return threads
def embed(triplets, attribute_embeddings, testing_ids=None, bias=False):
    # parameters
    BATCH_SIZE = eval(exp_config.get('triplet_embedding', 'batch_size'))
    USER_DIM = eval(exp_config.get('triplet_embedding', 'user_dim'))
    NCE_SAM_NUM = eval(exp_config.get('triplet_embedding', 'nce_sampling'))
    SNAPSHOT_FLAG = eval(exp_config.get('triplet_embedding', 'snapshot'))
    SNAPSHOT_GAP = eval(exp_config.get('triplet_embedding', 'snapshot_gap'))
    LEARNING_RATE_FOLLOW = eval(exp_config.get('triplet_embedding', 'learning_rate_f'))
    LEARNING_RATE_ATTRIBUTE = eval(exp_config.get('triplet_embedding', 'learning_rate_a'))
    DEBUG_FLAG = eval(exp_config.get('debug', 'flag'))

    # process triplets
    net_degrees = dict()
    max_user_id = 0
    triplets_attribute = list()
    triplets_follow = list()
    predicates = set()
    for trip in triplets:
        s_, p_, o_ = trip
        predicates.add(p_)
        max_user_id = max(max_user_id, s_)
        if p_ == 'a':
            triplets_attribute.append((s_, o_))
        if p_ == 'f':
            triplets_follow.append((s_, o_))
            max_user_id = max(max_user_id, o_)
            if s_ in net_degrees:
                net_degrees[s_] += 1
            else:
                net_degrees[s_] = 1
    num_users = max_user_id + 1
    print('num_users', num_users)
    print('predicates', predicates)
    triplets_attribute = np.array(triplets_attribute, dtype=np.int32)
    triplets_follow = np.array(triplets_follow, dtype=np.int32)
    n_triplets_follow = len(triplets_follow)
    n_triplets_attribute = len(triplets_attribute)

    def follow_data_generator():
        follow_batch_start_id = 0
        follow_permutation = np.random.permutation(n_triplets_follow)
        while True:
            if follow_batch_start_id + BATCH_SIZE > n_triplets_follow:
                follow_batch_start_id = 0
                follow_permutation = np.random.permutation(n_triplets_follow)
            choice = follow_permutation[follow_batch_start_id:follow_batch_start_id + BATCH_SIZE]
            selected_triplets_follow = triplets_follow[choice, :]
            _source, _target = np.expand_dims(selected_triplets_follow[:, 0], axis=1), \
                selected_triplets_follow[:, 1]
            follow_batch_start_id += BATCH_SIZE
            yield _source, _target

    def attribute_data_generator():
        attribute_batch_start_id = 0
        attribute_permutation = np.random.permutation(n_triplets_attribute)
        while True:
            if attribute_batch_start_id + BATCH_SIZE > n_triplets_attribute:
                attribute_batch_start_id = 0
                attribute_permutation = np.random.permutation(n_triplets_attribute)
            choice = attribute_permutation[attribute_batch_start_id:attribute_batch_start_id + BATCH_SIZE]
            selected_triplets_attribute = triplets_attribute[choice, :]
            attribute_s = selected_triplets_attribute[:, 0:1]
            attribute_s = np.concatenate(
                (attribute_s, np.arange(BATCH_SIZE).reshape(BATCH_SIZE, 1)), axis=1)
            attribute_o = selected_triplets_attribute[:, 1]
            attribute_batch_start_id += BATCH_SIZE
            yield attribute_s, attribute_o

    net_probs = map(lambda x: net_degrees[x]**0.75 if x in net_degrees else 0.,
                    range(num_users))
    net_probs = list(net_probs)
    net_probs /= np.sum(net_probs)

    P_limit = 2.
    # build model
    graph = tf.Graph()
    with graph.as_default():
        with tf.device('/gpu:0'):
            follow_ds = tf.data.Dataset.from_generator(
                follow_data_generator, (tf.int32, tf.int32),
                ([BATCH_SIZE, 1], [BATCH_SIZE])).prefetch(3).repeat()
            follow_itr = follow_ds.make_one_shot_iterator()
            trip_follow_source, trip_follow_target = follow_itr.get_next()

            attribute_ds = tf.data.Dataset.from_generator(
                attribute_data_generator, (tf.int32, tf.int32),
                ([BATCH_SIZE, 2], [BATCH_SIZE])).prefetch(3).repeat()
            attribute_itr = attribute_ds.make_one_shot_iterator()
            trip_attribute_subject, trip_attribute_object = attribute_itr.get_next()

            user_embeddings = tf.Variable(
                tf.truncated_normal([num_users, USER_DIM], stddev=1e-2))

            if testing_ids is not None:
                normalized_user_embeddings = tf.nn.l2_normalize(user_embeddings, dim=1)
                testing_ids_ph = tf.placeholder(tf.int32, shape=[len(testing_ids)])
                dist = tf.tensordot(
                    tf.nn.embedding_lookup(normalized_user_embeddings, testing_ids_ph),
                    normalized_user_embeddings,
                    axes=[[1], [1]])

            P_follow = tf.Variable(
                tf.truncated_normal([USER_DIM, USER_DIM], stddev=1.0 / USER_DIM))
            P_follow_norm = tf.norm(P_follow)
            update_P = tf.assign(P_follow, P_limit * tf.nn.l2_normalize(P_follow, [0, 1])).op

            bias_var = tf.Variable(tf.zeros(USER_DIM))
            bias_norm = tf.norm(bias_var)

            f_loss = 0
            a_loss = 0
            for p_ in predicates:
                if p_ == 'f':
                    target = tf.tensordot(
                        tf.nn.embedding_lookup(user_embeddings, trip_follow_target),
                        P_follow, [[1], [1]])
                    if bias:
                        target += bias_var
                    f_loss += tf.reduce_mean(
                        tf.nn.nce_loss(
                            user_embeddings,
                            tf.zeros(num_users),
                            trip_follow_source,
                            target,
                            NCE_SAM_NUM,
                            num_users,
                            num_true=1,
                            sampled_values=(np.random.choice(num_users, NCE_SAM_NUM, False, net_probs),
                                            tf.ones(BATCH_SIZE, dtype=tf.float32),
                                            tf.ones(NCE_SAM_NUM, dtype=tf.float32))))
                if p_ == 'a':
                    attr_embeddings = tf.nn.embedding_lookup(attribute_embeddings,
                                                             trip_attribute_object)
                    dot = tf.tensordot(user_embeddings, attr_embeddings, [[1], [1]])
                    softm = tf.nn.softmax(dot, dim=0)
                    softm = tf.gather_nd(softm, trip_attribute_subject)
                    a_loss -= tf.reduce_mean(tf.log(softm))

            f_global_step = tf.Variable(0, trainable=False)
            a_global_step = tf.Variable(0, trainable=False)
            f_learning_rate = tf.train.exponential_decay(LEARNING_RATE_FOLLOW,
                                                         f_global_step,
                                                         10000,
                                                         0.96,
                                                         staircase=True)
            a_learning_rate = tf.train.exponential_decay(LEARNING_RATE_ATTRIBUTE,
                                                         a_global_step,
                                                         10000,
                                                         0.96,
                                                         staircase=True)
            if f_loss != 0:
                f_optimizer = tf.train.GradientDescentOptimizer(f_learning_rate).minimize(
                    f_loss, var_list=[user_embeddings])
                f_optimizer_P = tf.train.GradientDescentOptimizer(1e-4).minimize(
                    f_loss, var_list=[P_follow, bias_var])
            if a_loss != 0:
                a_optimizer = tf.train.GradientDescentOptimizer(a_learning_rate).minimize(a_loss)

    with tf.Session(graph=graph,
                    config=tf.ConfigProto(log_device_placement=False,
                                          allow_soft_placement=True)) as session:
        f_losses = list()
        a_losses = list()
        P_norms = list()
        bias_norms = list()
        tf.global_variables_initializer().run()
        warm_up_iter = eval(exp_config.get('triplet_embedding', 'warm_up_iter')) * len(triplets) // BATCH_SIZE
        n_iter = eval(exp_config.get('triplet_embedding', 'n_iter')) * len(triplets) // BATCH_SIZE
        ue_s = list()
        dist_s = list()
        for i in range(n_iter):
            if SNAPSHOT_FLAG and i % (SNAPSHOT_GAP * 1000) == 0:
                ue = user_embeddings.eval(session)
                dist_values = None if testing_ids is None else session.run(
                    dist, feed_dict={testing_ids_ph: testing_ids})
                ue_s.append(ue)
                dist_s.append(dist_values)
            if a_loss != 0:
                start_time = time.time()
                _, a_loss_val = session.run([a_optimizer, a_loss])
                a_losses.append(a_loss_val)
                if DEBUG_FLAG and np.random.rand() < 0.001:
                    print(i, 'a_loss_val', a_loss_val, time.time() - start_time)
            if i >= warm_up_iter and f_loss != 0:
                # follow
                start_time = time.time()
                if i % 5 == 0:
                    _, f_loss_val, P_norm_, bias_norm_ = session.run(
                        [f_optimizer_P, f_loss, P_follow_norm, bias_norm])
                    P_norms.append(P_norm_)
                    bias_norms.append(bias_norm_)
                    if P_norm_ > P_limit:
                        session.run(update_P)
                else:
                    _, f_loss_val = session.run([f_optimizer, f_loss])
                f_losses.append(f_loss_val)
                if DEBUG_FLAG and np.random.rand() < 0.001:
                    print(i, 'f_loss_val', f_loss_val, time.time() - start_time)

        ue = user_embeddings.eval(session)
        dist_values = None if testing_ids is None else session.run(
            dist, feed_dict={testing_ids_ph: testing_ids})
        ue_s.append(ue)
        dist_s.append(dist_values)

    return ue_s, dist_s, (f_losses, a_losses, (P_norms, bias_norms))
def __init__(self):
    _start_y = int(exp_config.get('stream', 'start_y'))
    _start_m = int(exp_config.get('stream', 'start_m'))
    _start_d = int(exp_config.get('stream', 'start_d'))
    _end_y = int(exp_config.get('stream', 'end_y'))
    _end_m = int(exp_config.get('stream', 'end_m'))
    _end_d = int(exp_config.get('stream', 'end_d'))
    self.table = exp_config.get('stream', 'table')
    self.dy_start = date(_start_y, _start_m, _start_d)
    self.dy_end = date(_end_y, _end_m, _end_d)
    self.host = exp_config.get('stream', 'host')
    self.user = exp_config.get('stream', 'user')
    self.passwd = exp_config.get('stream', 'passwd')
    self.db = exp_config.get('stream', 'db')
    self.connection = MySQLdb.connect(host=self.host,
                                      user=self.user,
                                      passwd=self.passwd,
                                      db=self.db,
                                      charset='utf8')
    self.cursor = self.connection.cursor()
    _time0 = self.dy_start.strftime("%Y-%m-%d")
    _time1 = (self.dy_start + td(days=1)).strftime("%Y-%m-%d")
    sql_str = 'select * from ' + self.table + \
        ' where t >= "%s" and t < "%s" order by t' % (_time0, _time1)
    print sql_str
    self.cursor.execute(sql_str)
__author__ = 'Wei Xie'
__email__ = '*****@*****.**'
__affiliation__ = 'Pinnacle Lab for Analytics, Singapore Management University'
__website__ = 'http://mysmu.edu/phdis2012/wei.xie.2012'

import numpy as np
import MySQLdb
import twokenize
from timeout import timeout
import exp_config
import copy
import datetime
import user_info

### configuration ########
_host = exp_config.get('database', 'host')
_user = exp_config.get('database', 'user')
_db = exp_config.get('database', 'db')
_charset = exp_config.get('database', 'charset')

### for id mapping #########
_connection = MySQLdb.connect(host=_host,
                              user=_user,
                              passwd='123456',
                              db=_db,
                              charset=_charset)
_cursor = _connection.cursor()
_cursor.execute("desc timelines2")

_id = 0
_id_map = {}
def embed2(X, n_dim, n_iter):
    '''
    X is a sparse matrix (CSR); the diagonal elements are ignored.
    '''
    DEBUG_FLAG = eval(exp_config.get('debug', 'flag'))
    BATCH_SIZE = eval(exp_config.get('cosine_embedding', 'batch_size'))
    LEARNING_RATE = eval(exp_config.get('cosine_embedding', 'learning_rate'))

    # checking
    if DEBUG_FLAG:
        print('X', X.shape)
        print('X[0,0]', X[0, 0])
        print('X[1,1]', X[1, 1])
        print('min X', np.min(X))
        print('max X', np.max(X))

    # focused on neighbour pairs
    start_time = time.time()
    pairs_i, pairs_j = X.nonzero()
    x_list = X.data
    assert len(x_list) == len(pairs_i)
    elements = list()
    for i, j, x in zip(pairs_i, pairs_j, x_list):
        if i != j:
            elements.append((i, j, x))
    pairs_i, pairs_j, x_list = zip(*elements)
    pairs_i = np.array(pairs_i, dtype=np.int32)
    pairs_j = np.array(pairs_j, dtype=np.int32)
    x_list = np.array(x_list, dtype=np.float32)
    print('take ', time.time() - start_time, ' seconds')
    print('pairs_i', len(pairs_i), 'pairs_j', len(pairs_j), 'x_list', len(x_list))

    n_W = len(x_list)
    if DEBUG_FLAG:
        print('n_pairs', n_W, (n_W + 0.) / (X.shape[0]**2))

    batch_start_id = 0
    permutation = np.random.permutation(n_W)

    # building model
    graph = tf.Graph()
    with graph.as_default():
        global_step = tf.Variable(0, trainable=False)
        input_pairs_i = tf.placeholder(tf.int32, shape=[BATCH_SIZE])
        input_pairs_j = tf.placeholder(tf.int32, shape=[BATCH_SIZE])
        input_xs = tf.placeholder(tf.float32, shape=[BATCH_SIZE])
        #attribute_embeddings = tf.Variable(tf.random_uniform([X.shape[0], n_dim], -2.0/np.sqrt(n_dim), 2.0/np.sqrt(n_dim)),
        #                                   name='attribute_embeddings')
        attribute_embeddings = tf.Variable(
            tf.truncated_normal([X.shape[0], n_dim], stddev=1e-5))
        mt = tf.nn.embedding_lookup(attribute_embeddings, input_pairs_i) * \
            tf.nn.embedding_lookup(attribute_embeddings, input_pairs_j)
        mt = tf.reduce_sum(mt, axis=1)
        loss = tf.nn.l2_loss(mt - input_xs, name='loss') * 2 / BATCH_SIZE
        learning_rate = tf.train.exponential_decay(10., global_step,
                                                   25 * n_W / BATCH_SIZE, 0.96,
                                                   staircase=True)  #!!!
        optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

    with tf.Session(graph=graph,
                    config=tf.ConfigProto(log_device_placement=False,
                                          allow_soft_placement=True)) as session:
        tf.global_variables_initializer().run()
        real_iters = max(n_iter * n_W / BATCH_SIZE, 10000)
        print('real iters', real_iters)
        for i in range(real_iters):
            if batch_start_id + BATCH_SIZE > n_W:
                batch_start_id = 0
                permutation = np.random.permutation(n_W)
            choice = permutation[batch_start_id:batch_start_id + BATCH_SIZE]
            _input_pairs_i, _input_pairs_j, _input_xs = pairs_i[choice], pairs_j[choice], x_list[choice]
            batch_start_id += BATCH_SIZE
            start_time = time.time()
            _, loss_val = session.run([optimizer, loss],
                                      feed_dict={
                                          input_pairs_i: _input_pairs_i,
                                          input_pairs_j: _input_pairs_j,
                                          input_xs: _input_xs
                                      })
            if DEBUG_FLAG and np.random.rand() < 0.001 or i == real_iters - 1:
                print(i, i / (n_W / BATCH_SIZE), 'f_loss_val', loss_val,
                      time.time() - start_time, 's')
        attribute_embeddings = attribute_embeddings.eval()

    return attribute_embeddings
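# Illustrative usage sketch for embed2() (not part of the original source). It assumes the
# exp_config entries read inside embed2 exist and that the configured batch_size is smaller
# than the number of off-diagonal non-zeros in the toy matrix below.
import numpy as np
import scipy.sparse

rng = np.random.RandomState(0)
n = 2000
rows = rng.randint(0, n, size=20000)
cols = rng.randint(0, n, size=20000)
vals = rng.rand(20000).astype(np.float32)
toy_sim = scipy.sparse.csr_matrix((vals, (rows, cols)), shape=(n, n))
vectors = embed2(toy_sim, n_dim=16, n_iter=25)  # one 16-d embedding per row of toy_sim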
__author__ = 'Wei Xie'
__email__ = '*****@*****.**'
__affiliation__ = 'Pinnacle Lab for Analytics, Singapore Management University'
__website__ = 'http://mysmu.edu/phdis2012/wei.xie.2012'

import topic_sketch.stemmer as stemmer
import fast_signi
import exp_config

_SIGNI_THRESHOLD = eval(exp_config.get('detection', 'signi_threshold'))


class SparseSigniContainer:

    _THRESHOLD_FOR_CLEANING = eval(exp_config.get('detection', 'threshold_for_cleaning'))
    _CAPACITY_FOR_CLEANING = eval(exp_config.get('detection', 'capacity_for_cleaning'))

    def __init__(self):
        self.container = {}

    def _clean(self, _timestamp):
        to_be_cleaned_up = []
        for key, value in self.container.iteritems():
            value.observe(_timestamp, 0.)
            if value.ewma <= self._THRESHOLD_FOR_CLEANING:
                to_be_cleaned_up.append(key)