class SparseSigniContainer():
    _THRESHOLD_FOR_CLEANING = eval(exp_config.get('detection', 'threshold_for_cleaning'))
    _CAPACITY_FOR_CLEANING = eval(exp_config.get('detection', 'capacity_for_cleaning'))


    def __init__(self):
        self.container = {}

    def _clean(self, _timestamp):
        to_be_cleaned_up = []
        for key, value in self.container.iteritems():
            value.observe(_timestamp, 0.)
            if value.ewma <= self._THRESHOLD_FOR_CLEANING:
                to_be_cleaned_up.append(key)

        print 'cleaning', len(to_be_cleaned_up), 'items...'
        for key in to_be_cleaned_up:
            self.container.pop(key)

    def get(self, _id, _timestamp):
        # check for cleaning
        if len(self.container) > self._CAPACITY_FOR_CLEANING:
            self._clean(_timestamp)

        # return
        if _id in self.container:
            return self.container[_id]
        else:
            sig_scorer = fast_signi.SignificanceScorer()

            self.container[_id] = sig_scorer
            return sig_scorer
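Every snippet on this page reads its settings through an exp_config module whose implementation is not shown here. The sketch below is a hypothetical stand-in, assuming an INI-style file; the file name exp.config, the get_literal helper, and the use of ast.literal_eval instead of eval are assumptions, not part of the original project.

# exp_config_sketch.py -- hypothetical minimal stand-in for the exp_config
# module used by the snippets on this page; names below are assumptions.
import ast
try:
    import configparser                      # Python 3
except ImportError:
    import ConfigParser as configparser      # Python 2

_parser = configparser.ConfigParser()
_parser.read('exp.config')                   # assumed config file name


def get(section, option):
    # Same call shape as exp_config.get('detection', 'threshold_for_cleaning').
    return _parser.get(section, option)


def set(section, option, value):
    # Same call shape as the exp_config.set(...) calls further down the page.
    _parser.set(section, option, value)


def get_literal(section, option):
    # Optional safer alternative to eval() for numeric or boolean options.
    return ast.literal_eval(_parser.get(section, option))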
Example #2
    def __init__(self, start, end):
        self.deq = deque([])

        self.start = start

        self.end = end

        self.delta = td(minutes=5)

        self.lag = td(minutes=60 * 3)

        self.host = exp_config.get('database', 'host')
        self.user = exp_config.get('database', 'user')
        self.db = exp_config.get('database', 'db')
        self.charset = exp_config.get('database', 'charset')

        self.connection = MySQLdb.connect(host=self.host,
                                          user=self.user,
                                          db=self.db,
                                          charset=self.charset,
                                          passwd='123456')
        cursor = self.connection.cursor()
        cursor.execute("desc timelines2")
        id = 0
        self.id_map = {}
        for column in cursor.fetchall():
            self.id_map[column[0]] = id
            id += 1
        cursor.close()
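The desc-based loop above builds a map from column name to column position, an idiom repeated in several snippets below. A made-up illustration of the resulting id_map:

# Illustration only; the column names are made up, and the tuples mimic the
# shape of the rows returned by cursor.fetchall() after a DESC query.
columns = [('id',), ('created_at',), ('text',)]
id_map = {}
for idx, column in enumerate(columns):
    id_map[column[0]] = idx
print(id_map)    # maps each column name to its position in a row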
Example #3
    def __init__(self, _ptw_stream):
        self.ptw_stream = _ptw_stream

        _wz = eval(exp_config.get('significance', 'window_size'))
        _cycle = eval(exp_config.get('significance', 'cycle'))
        _average = eval(exp_config.get('significance', 'average'))
        print 'significance # set parameters:' + str(
            fast_signi.SignificanceScorer.set_window_size(
                _wz, _cycle, _average))

        _uz = eval(exp_config.get('acceleration', 'unit_size'))
        print 'acceleration # set unit size ' + str(
            fast_smoother.set_unit_size(_uz))

        _wz1 = eval(exp_config.get('acceleration', 'window_size1'))
        _wz2 = eval(exp_config.get('acceleration', 'window_size2'))
        print 'acceleration # set windows ' + str(
            fast_smoother.EWMASmoother.set_window_size(_wz1, _wz2))

        self.processor = signi_processor.SigProcessor()

        self.threads = list()

        _start_t = exp_config.get('detection', 'start_t')
        _end_t = exp_config.get('detection', 'end_t')

        self._start_t = datetime.datetime.strptime(_start_t,
                                                   '%Y-%m-%d %H:%M:%S')
        self._end_t = datetime.datetime.strptime(_end_t, '%Y-%m-%d %H:%M:%S')
Example #4
def name2sim(input_names, prefix='name', return_matrix=False):
    print('prefix in name2sim', prefix)  #debugging
    PATH = exp_config.get('data', 'path')
    method = exp_config.get('predicate_name', 'method')

    ip_ad = exp_config.get('dispy', 'ip')
    port = int(exp_config.get('dispy', 'port'))
    remote_path = exp_config.get('dispy', 'remote_path')

    print('In name2sim() method ', method)

    assert method in ['jaro_winkler', 'tfidf']

    if os.path.isfile(PATH + prefix + '_list_' + method + '.txt'):
        names = list()
        fin = codecs.open(PATH + prefix + '_list_' + method + '.txt', 'r',
                          'utf-8')
        for line in fin:
            names.append(line[:-1])
        fin.close()
    else:
        names = input_names
        fout = codecs.open(PATH + prefix + '_list_' + method + '.txt', 'w',
                           'utf-8')
        for name in names:
            fout.write(name)
            fout.write('\n')
        fout.close()

    sim = None
    if method == 'jaro_winkler':
        pass

    if method == 'tfidf':

        sim = dis_ngram_sim.distributed_name2sim_tfidf(
            names,
            '.',
            remote_path,
            32,
            nodes=[(ip_ad, port)],
            return_sim=return_matrix)

    name2eid = dict(zip(names, range(len(names))))

    name2eid[None] = len(names)

    return name2eid, sim
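Both name2sim variants on this page return a name2eid map in which None is assigned the one-past-the-end id, so records with a missing name still get a valid sentinel index. A tiny illustration of that convention with made-up names:

# Illustration only; the names are made up.
names = ['alice', 'bob', 'carol']
name2eid = dict(zip(names, range(len(names))))
name2eid[None] = len(names)       # sentinel id for missing names
print(name2eid[None])             # 3, one past the last real id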
Example #5
    def process(self, sig_instance, sig_list=None):

        _t, _count, _ewma, _ewmvar, _sig, _keywords = sig_instance

        if _t < self._start_t or _t > self._end_t:
            return 0.

        if eval(exp_config.get('output', 'debug_info')):
            if sig_list:  # for debugging
                print '-----------------------'
                for sig_ in sig_list:
                    print '__sig__', sig_
                print '-----------------------'

        create_new = True

        for thread in self.threads:
            if thread.add_to_thread(sig_instance):
                create_new = False
                break

        if create_new:
            thread = Slice()
            thread.new_thread(sig_instance)

            self.threads.append(thread)

            return _sig

        return 0.
Example #6
    def __init__(self):
        _start_y = int(exp_config.get('stream', 'start_y'))
        _start_m = int(exp_config.get('stream', 'start_m'))
        _start_d = int(exp_config.get('stream', 'start_d'))

        _end_y = int(exp_config.get('stream', 'end_y'))
        _end_m = int(exp_config.get('stream', 'end_m'))
        _end_d = int(exp_config.get('stream', 'end_d'))

        self.dy_start = date(_start_y, _start_m, _start_d)
        self.dy_end = date(_end_y, _end_m, _end_d)

        self.connection = MySQLdb.connect(host='?',
                                          user='******',
                                          db='?',
                                          charset='utf8')

        cursor = self.connection.cursor()
        cursor.execute("desc weibo_timelines")
        id = 0
        self.id_map = {}
        for column in cursor.fetchall():
            self.id_map[column[0]] = id
            id += 1

        self.cursor = self.connection.cursor()

        _time0 = self.dy_start.strftime("%Y-%m-%d")
        _time1 = (self.dy_start + td(days=1)).strftime("%Y-%m-%d")

        sql_str = 'select * from ' + 'weibo_timelines' + ' where created_at >= "%s" and created_at < "%s" order by created_at' % (
            _time0, _time1)
        print sql_str

        self.cursor.execute(sql_str)
Example #7
    def next(self):
        ptweet = self.ptw_stream.next()

        if ptweet is ts_stream.End_Of_Stream:
            return ts_stream.End_Of_Stream

        if ptweet is None:
            return None

        sig_instance, sig_list = self.processor.process(ptweet)

        if sig_instance is not None:
            output = self.process(sig_instance, sig_list)

            if eval(exp_config.get('output', 'debug_info')):
                print sig_instance
            return ptweet, output

        return ptweet, 0.0
Example #8
def get_top_sig():
    terms = []

    _f = open(exp_config.get('process_detection_log', 'log_file'), 'r')  #!!!

    for line in _f:
        if not line.startswith('201'):
            continue

        line = line.rstrip('\n')

        term = line.split('\t')

        terms.append(term)

    terms.sort(key=lambda x: terms_to_values(x)[4], reverse=True)

    for term in terms:
        _t, _count, _ewma, _ewmvar, _sig, _keywords = terms_to_values(term)
        print _t, _count, _ewma, _ewmvar, _sig, _keywords
Example #9
    def __init__(self, _ptw_stream):
        self.ptw_stream = _ptw_stream

        _wz = eval(exp_config.get('detection', 'window_size'))
        _cycle = eval(exp_config.get('detection', 'cycle'))
        _average = eval(exp_config.get('detection', 'average'))
        print 'set signi parameters:' + str(
            fast_signi.SignificanceScorer.set_window_size(
                _wz, _cycle, _average))
        self.processor = signi_processor.SigProcessor()

        self.threads = list()

        _start_y = int(exp_config.get('detection', 'start_y'))
        _start_m = int(exp_config.get('detection', 'start_m'))
        _start_d = int(exp_config.get('detection', 'start_d'))

        self._start_t = datetime.datetime(_start_y, _start_m, _start_d)

        self.stop_tokens = {
            'instagram', 'facebook', 'twitter', 'update', 'pic', 'picture',
            'cr', 'damn', 'hashtag'
        }
Example #10
__affiliation__ = 'Living Analytics Research Centre, Singapore Management University'
__website__ = 'http://mysmu.edu/phdis2012/wei.xie.2012'

import threading
from collections import deque

import tensorflow as tf
import numpy as np
from array import array
import time

from functools import partial

import exp_config

DEBUG_FLAG = eval(exp_config.get('debug', 'flag'))
BATCH_SIZE = eval(exp_config.get('cosine_embedding', 'batch_size'))
LEARNING_RATE = eval(exp_config.get('cosine_embedding', 'learning_rate'))
PARTITION_PATH = exp_config.get('cosine_embedding', 'partition_path')

output_info = open('cosine_embedding_for_sparse_input_files_output.txt', 'wt')


def _variable(name, shape, dtype, initializer):

    var = tf.get_variable(name, shape, initializer=initializer, dtype=dtype)

    return var


def _step(i_partition,
Example #11
__author__ = 'Wei Xie'
__email__ = '*****@*****.**'
__affiliation__ = 'Pinnacle Lab for Analytics, Singapore Management University'
__website__ = 'http://mysmu.edu/phdis2012/wei.xie.2012'

import datetime
import exp_config
from datetime import datetime as date

_THREAD_GAP = eval(exp_config.get('process_detection_log', 'thread_gap'))


def terms_to_values(terms):
    _t = datetime.datetime.strptime(terms[0], '%Y-%m-%d %H:%M:%S')
    _count = float(terms[1])
    _ewma = float(terms[2])
    _ewmvar = float(terms[3])
    _sig = float(terms[4])
    _keywords = terms[5]
    return _t, _count, _ewma, _ewmvar, _sig, _keywords


class Slice:
    def __init__(self):
        self.start = 0.0
        self.end = 0.0
        self.keywords = None
        self.sig = 0.0
        self.first_sig = 0.0
        self.thread = []
        self.first_keywords = None
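For reference, terms_to_values above expects the tab-separated fields written to the detection log: timestamp, count, EWMA, EWM variance, significance, keywords. A minimal sketch of parsing one such line; the sample line is invented, not taken from a real log.

# Invented sample line in the tab-separated layout terms_to_values expects.
sample = '2015-03-01 12:00:00\t42\t3.5\t1.2\t7.8\tearthquake jakarta'
_t, _count, _ewma, _ewmvar, _sig, _keywords = terms_to_values(sample.split('\t'))
print(_keywords)    # 'earthquake jakarta'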
Example #12
__author__ = 'Wei Xie'
__email__ = '*****@*****.**'
__affiliation__ = 'Living Analytics Research Centre, Singapore Management University'
__website__ = 'http://mysmu.edu/phdis2012/wei.xie.2012'

import topicsketch.stemmer as stemmer
import fast_signi
import fast_smoother

import exp_config

_SIGNI_THRESHOLD = eval(exp_config.get('detection', 'detection_threshold'))
_SIGNI_TYPE = exp_config.get('detection', 'detection_signi_type')


class SparseSmootherContainer:
    _THRESHOLD_FOR_CLEANING = eval(
        exp_config.get('detection', 'threshold_for_cleaning'))
    _CAPACITY_FOR_CLEANING = eval(
        exp_config.get('detection', 'capacity_for_cleaning'))

    def __init__(self):
        self.container = {}

    def _clean(self, _timestamp):
        to_be_cleaned_up = []
        max_v = 0.
        for key, value in self.container.iteritems():
            value.observe(_timestamp, 0.)

            if _SIGNI_TYPE == 's':
Example #13
__author__ = 'Wei Xie'
__email__ = '*****@*****.**'
__affiliation__ = 'Pinnacle Lab for Analytics, Singapore Management University'
__website__ = 'http://mysmu.edu/phdis2012/wei.xie.2012'

import redis
import exp_config

_KEY_MONITOR_CHANNEL = 'twitter:sg:event:python:monitor_ab_test:a'
_HOST = exp_config.get('stream', 'host')

print '_KEY_MONITOR_CHANNEL', _KEY_MONITOR_CHANNEL

### clear memory ########
db = redis.StrictRedis(_HOST, port=8181)
db.delete(_KEY_MONITOR_CHANNEL)
#########################


def report_rate(t, r):
    if not _KEY_MONITOR_CHANNEL:
        return
    db = redis.StrictRedis(_HOST, port=8181)
    db.lpush(_KEY_MONITOR_CHANNEL, (t, r))
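report_rate simply left-pushes (timestamp, rate) tuples onto the monitoring list; a hypothetical call and read-back, assuming a Redis server is reachable at the configured host and the same redis-py version the snippet targets (the rate value is made up):

import datetime

report_rate(datetime.datetime.now(), 850)        # e.g. 850 tweets per minute
print(db.lrange(_KEY_MONITOR_CHANNEL, 0, 4))     # five most recent samples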
Example #14
import event_output

import topic_sketch.preprocessor as preprocessor

import search

import imagecrawler

import twokenize

import tweet_stream

import user_info

_THREAD_GAP = eval(exp_config.get('detection', 'thread_gap'))

ic = imagecrawler.ImageCrawler()


class Keyword:
    def __init__(self, _word):
        self.word = _word
        self.records = list()

    def appear(self, _t, _sig):
        self.records.append((_t, _sig))

    def span(self):
        return (self.records[-1][0] - self.records[0][0]).total_seconds() / (
            60 * 60)  #hours
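A short sketch of exercising the Keyword bookkeeping above; the word, timestamps, and significance scores are made up.

import datetime

kw = Keyword('flood')
kw.appear(datetime.datetime(2015, 3, 1, 10, 0, 0), 5.2)
kw.appear(datetime.datetime(2015, 3, 1, 13, 30, 0), 6.1)
print(kw.span())    # 3.5 -- hours between first and last appearance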
Example #15
def name2sim(input_names, n_dim, prefix='name', with_embedding=True):
    PATH = exp_config.get('data', 'path')
    method = exp_config.get('predicate_name', 'method')
    embedding_iters = eval(exp_config.get('cosine_embedding', 'n_iter'))

    print 'In name2sim() method ', method, ' embedding_iters ', embedding_iters

    assert method in ['jaro_winkler', 'tfidf']

    if os.path.isfile(PATH + prefix + '_list_' + method + '.txt'):
        names = list()
        fin = codecs.open(PATH + prefix + '_list_' + method + '.txt', 'r',
                          'utf-8')
        for line in fin:
            names.append(line[:-1])
        fin.close()
    else:
        names = set()
        for name in input_names:
            if name is not None:
                names.add(name)
        names = list(names)
        fout = codecs.open(PATH + prefix + '_list_' + method + '.txt', 'w',
                           'utf-8')
        for name in names:
            fout.write(name)
            fout.write('\n')
        fout.close()

    sim = None
    if method == 'jaro_winkler':
        if os.path.isfile(PATH + prefix + '_sim_' + method + '.npz'):
            sim = scipy.sparse.load_npz(PATH + prefix + '_sim_' + method +
                                        '.npz')
        else:
            sim = ngram_sim.name2sim_jw(names)
            scipy.sparse.save_npz(PATH + prefix + '_sim_' + method + '.npz',
                                  sim)

    if method == 'tfidf':
        if os.path.isfile(PATH + prefix + '_sim_' + method + '.npz'):
            sim = scipy.sparse.load_npz(PATH + prefix + '_sim_' + method +
                                        '.npz')
        else:
            sim = ngram_sim.name2sim_tfidf(names)
            scipy.sparse.save_npz(PATH + prefix + '_sim_' + method + '.npz',
                                  sim)

    if with_embedding:
        if os.path.isfile(PATH + prefix + '_embeddings_' + method + '_' +
                          str(n_dim) + '.npy'):
            embeddings = np.load(PATH + prefix + '_embeddings_' + method +
                                 '_' + str(n_dim) + '.npy')
        else:
            embeddings = cosine_embedding_for_sparse_input.embed(
                sim, n_dim, embedding_iters)
            np.save(
                PATH + prefix + '_embeddings_' + method + '_' + str(n_dim) +
                '.npy', embeddings)

        embeddings = np.append(embeddings,
                               np.zeros((1, n_dim), dtype=np.float32),
                               axis=0)
    else:
        embeddings = None

    name2eid = dict(zip(names, range(len(names))))

    name2eid[None] = len(names)

    return name2eid, sim, embeddings
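When with_embedding is set, the variant above appends an all-zero row so that the sentinel id used for None has an embedding too. A tiny illustration with a made-up embedding table:

import numpy as np

n_dim = 4
embeddings = np.random.rand(3, n_dim).astype(np.float32)    # 3 real names, made up
embeddings = np.append(embeddings,
                       np.zeros((1, n_dim), dtype=np.float32),
                       axis=0)
print(embeddings.shape)    # (4, 4); the last row is the zero vector for None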
Example #16
    exp_config.set('triplet_embedding', 'nce_sampling', args.nce_sampling[0])
if args.triplet_embedding_batch_size is not None:
    exp_config.set('triplet_embedding', 'batch_size',
                   args.triplet_embedding_batch_size[0])
if args.triplet_embedding_learning_rate_f is not None:
    exp_config.set('triplet_embedding', 'learning_rate_f',
                   args.triplet_embedding_learning_rate_f[0])
if args.triplet_embedding_learning_rate_a is not None:
    exp_config.set('triplet_embedding', 'learning_rate_a',
                   args.triplet_embedding_learning_rate_a[0])

if args.stratified_attribute is not None:
    exp_config.set('evaluation', 'stratified_attribute',
                   args.stratified_attribute[0])

PATH = exp_config.get('data', 'path')

SOURCE_PREFIX = exp_config.get('data', 'source_prefix')
TARGET_PREFIX = exp_config.get('data', 'target_prefix')
SOURCE_COL = int(exp_config.get('data', 'source_col'))
TARGET_COL = int(exp_config.get('data', 'target_col'))

USER_DIM = exp_config.get('triplet_embedding', 'user_dim')
ITER_NUM = exp_config.get('triplet_embedding', 'n_iter')
WARM_UP_NUM = exp_config.get('triplet_embedding', 'warm_up_iter')
NCE_NUM = exp_config.get('triplet_embedding', 'nce_sampling')

SUPERVISED_FLAG = eval(exp_config.get('triplet_embedding', 'supervised'))
BIAS_FLAG = eval(exp_config.get('triplet_embedding', 'bias'))

screen_name_exist = eval(exp_config.get('predicate_name', 'screen_name_exist'))
Example #17
def image2sim(input_images, prefix='image'):

    PATH = exp_config.get('data', 'path')
    IDENTICAL_T = eval(exp_config.get('predicate_image',
                                      'identical_threshold'))
    method = exp_config.get('predicate_image', 'method')
    embedding_iters = eval(exp_config.get('cosine_embedding', 'n_iter'))

    assert method in [
        'identical', 'vgg16', 'vgg19', 'xception', 'inception_resnet_v2',
        'vggface'
    ]
    print('input_images', len(input_images))

    if os.path.isfile(PATH + prefix + '_list' + '.txt'):
        images = list()
        fin = open(PATH + prefix + '_list' + '.txt', 'r')
        for line in fin:
            images.append(line[:-1])
        fin.close()
    else:
        images = list()
        for image in input_images:
            if image is not None:
                images.append(image)
        fout = open(PATH + prefix + '_list' + '.txt', 'w')
        for image in images:
            fout.write(image)
            fout.write('\n')
        fout.close()

    if method == 'identical':
        if os.path.isfile(PATH + prefix + '_sim_' + method + '.npz'):
            sim = scipy.sparse.load_npz(PATH + prefix + '_sim_' + method +
                                        '.npz')
        else:
            funs = [
                imagehash.average_hash, imagehash.phash, imagehash.dhash,
                imagehash.whash
            ]

            im_objs = list()
            for image in images:
                im_objs.append(Image.open(PATH + image))

            print('images', len(images), 'im_objs', len(im_objs))

            vs = list()
            for i in xrange(len(im_objs)):
                obj_i = im_objs[i]
                v_i = np.array([fun(obj_i) for fun in funs])
                vs.append(v_i)

            sim = dok_matrix((len(images), len(images)), dtype=np.float32)
            for i in xrange(len(images)):

                current_t = time.time()

                v_i = vs[i]
                for j in xrange(len(images)):
                    v_j = vs[j]
                    s = np.median(v_i - v_j)
                    if s < IDENTICAL_T:
                        sim[i, j] = (IDENTICAL_T - s) / IDENTICAL_T

                print('processing images ', i, 100 * i // len(images),
                      time.time() - current_t, 's')

            sim = sim.asformat('csr')
            scipy.sparse.save_npz(PATH + prefix + '_sim_' + method + '.npz',
                                  sim)

    if method in [
            'vgg16', 'vgg19', 'xception', 'inception_resnet_v2', 'vggface'
    ]:
        if method == 'vgg16':
            from keras.applications.vgg16 import VGG16
            from keras.preprocessing import image as keras_image
            from keras.applications.vgg16 import preprocess_input
            model = VGG16(weights='imagenet', include_top=False)
        if method == 'vgg19':
            from keras.applications.vgg19 import VGG19
            from keras.preprocessing import image as keras_image
            from keras.applications.vgg19 import preprocess_input
            model = VGG19(weights='imagenet', include_top=False)
        if method == 'xception':
            from keras.applications.xception import Xception
            from keras.preprocessing import image as keras_image
            from keras.applications.xception import preprocess_input
            model = Xception(weights='imagenet', include_top=False)
        if method == 'inception_resnet_v2':
            from keras.applications.inception_resnet_v2 import InceptionResNetV2
            from keras.preprocessing import image as keras_image
            from keras.applications.inception_resnet_v2 import preprocess_input
            model = InceptionResNetV2(weights='imagenet', include_top=False)
        if method == 'vggface':
            print('vggface')
            from keras_vggface.vggface import VGGFace
            from keras.preprocessing import image as keras_image
            from keras_vggface.utils import preprocess_input
            model = VGGFace(include_top=False)

        def get_feature(img_path):
            img = keras_image.load_img(img_path, target_size=(224, 224))
            x = keras_image.img_to_array(img)
            x = np.expand_dims(x, axis=0)
            x = preprocess_input(x)
            feature = model.predict(x)
            return feature

        if os.path.isfile(PATH + prefix + '_embeddings_' + method + '.npy'):
            embeddings = np.load(PATH + prefix + '_embeddings_' + method +
                                 '.npy')
        else:
            print('get image features')  #debug
            embeddings = list()
            for image in images:
                embeddings.append(get_feature(PATH + image).flatten())
                print('process', image)
            embeddings = np.array(embeddings, dtype=np.float32)

            np.save(PATH + prefix + '_embeddings_' + method + '.npy',
                    embeddings)

        if os.path.isfile(PATH + prefix + '_sim_' + method + '.npz'):
            sim = scipy.sparse.load_npz(PATH + prefix + '_sim_' + method +
                                        '.npz')
        else:
            lsh_instance = lsh.LSH(8, 5)
            indices = lsh_instance.load(embeddings)
            sim = dok_matrix((len(images), len(images)), dtype=np.float32)
            for i in range(len(images)):

                v_i = embeddings[i]
                for j in lsh_instance.query(indices[i]):
                    v_j = embeddings[j]
                    sim[i, j] = similarity(v_i, v_j)

                sys.stdout.write("\r%d%%" % (100 * i // len(images)))
                sys.stdout.flush()

            sim = sim.asformat('csr')
            scipy.sparse.save_npz(PATH + prefix + '_sim_' + method + '.npz',
                                  sim)

    image2eid = dict(zip(images, range(len(images))))
    image2eid[None] = len(images)

    return image2eid, sim
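The 'identical' branch above scores an image pair by the median of several perceptual-hash distances. A standalone sketch of that idea using the same imagehash functions; the file paths are placeholders.

# Placeholder paths; the median of per-hash Hamming distances mirrors the
# 'identical' branch above.
from PIL import Image
import imagehash
import numpy as np

funs = [imagehash.average_hash, imagehash.phash, imagehash.dhash, imagehash.whash]
img_a = Image.open('a.jpg')
img_b = Image.open('b.jpg')
dists = np.array([fun(img_a) - fun(img_b) for fun in funs])   # Hamming distances
print(np.median(dists))    # a small value suggests near-identical images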
Example #18
def process():
    ###
    _start_y = int(exp_config.get('process_detection_log', 'start_y'))
    _start_m = int(exp_config.get('process_detection_log', 'start_m'))
    _start_d = int(exp_config.get('process_detection_log', 'start_d'))

    _end_y = int(exp_config.get('process_detection_log', 'end_y'))
    _end_m = int(exp_config.get('process_detection_log', 'end_m'))
    _end_d = int(exp_config.get('process_detection_log', 'end_d'))

    _start_t = date(_start_y, _start_m, _start_d)
    _end_t = date(_end_y, _end_m, _end_d)

    _threshold = eval(
        exp_config.get('process_detection_log', 'signi_threshold'))

    ###

    threads = []

    _f = open(exp_config.get('process_detection_log', 'log_file'), 'r')  #!!!

    for line in _f:
        if not line.startswith('201'):
            continue

        line = line.rstrip('\n')

        terms = line.split('\t')
        _t, _count, _ewma, _ewmvar, _sig, _keywords = terms_to_values(terms)

        if _t < _start_t or _t > _end_t:
            continue

        if _sig < _threshold:
            continue

        create_new = True

        for thread in threads:
            if thread.add_to_thread(terms):
                create_new = False
                break

        if create_new:
            thread = Slice()
            thread.new_thread(terms)

            threads.append(thread)

    _f.close()

    #threads.sort(key = lambda x: x.sig, reverse = True)
    threads.sort(key=lambda x: x.start)

    _s = set()  # for debugging

    count = 0
    for thread in threads:
        print thread.start, thread.end, thread.sig, thread.keywords
        '''
        print '\hline'
        topic = ''
        for word in thread.keywords:
            topic = topic + word + ', '
        print thread.start.strftime('%Y-%m-%d') + '&' + topic + '&\\\\'

        count += 1

        _s.add(thread.start.strftime('%Y-%m-%d'))

        #print count, len(_s)

        if count == 50:
            break
        '''
    return threads
Example #19
def embed(triplets, attribute_embeddings, testing_ids=None, bias=False):

    # parameters
    BATCH_SIZE = eval(exp_config.get('triplet_embedding', 'batch_size'))
    USER_DIM = eval(exp_config.get('triplet_embedding', 'user_dim'))
    NCE_SAM_NUM = eval(exp_config.get('triplet_embedding', 'nce_sampling'))
    SNAPSHOT_FLAG = eval(exp_config.get('triplet_embedding', 'snapshot'))
    SNAPSHOT_GAP = eval(exp_config.get('triplet_embedding', 'snapshot_gap'))
    LEARNING_RATE_FOLLOW = eval(
        exp_config.get('triplet_embedding', 'learning_rate_f'))
    LEARNING_RATE_ATTRIBUTE = eval(
        exp_config.get('triplet_embedding', 'learning_rate_a'))
    DEBUG_FLAG = eval(exp_config.get('debug', 'flag'))

    # process triplets

    net_degrees = dict()
    max_user_id = 0
    triplets_attribute = list()
    triplets_follow = list()
    predicates = set()
    for trip in triplets:
        s_, p_, o_ = trip
        predicates.add(p_)
        max_user_id = max(max_user_id, s_)

        if p_ == 'a':
            triplets_attribute.append((s_, o_))
        if p_ == 'f':
            triplets_follow.append((s_, o_))
            max_user_id = max(max_user_id, o_)
            if s_ in net_degrees:
                net_degrees[s_] += 1
            else:
                net_degrees[s_] = 1

    num_users = max_user_id + 1
    print('num_users', num_users)
    print('predicates', predicates)

    triplets_attribute = np.array(triplets_attribute, dtype=np.int32)
    triplets_follow = np.array(triplets_follow, dtype=np.int32)

    n_triplets_follow = len(triplets_follow)
    n_triplets_attribute = len(triplets_attribute)

    def follow_data_generator():

        follow_batch_start_id = 0
        follow_permutation = np.random.permutation(n_triplets_follow)

        while True:
            if follow_batch_start_id + BATCH_SIZE > n_triplets_follow:
                follow_batch_start_id = 0
                follow_permutation = np.random.permutation(n_triplets_follow)
            choice = follow_permutation[
                follow_batch_start_id:follow_batch_start_id + BATCH_SIZE]
            selected_triplets_follow = triplets_follow[choice, :]
            _source, _target = np.expand_dims(
                selected_triplets_follow[:, 0],
                axis=1), selected_triplets_follow[:, 1]
            follow_batch_start_id += BATCH_SIZE

            yield _source, _target

    def attribute_data_generator():

        attribute_batch_start_id = 0
        attribute_permutation = np.random.permutation(n_triplets_attribute)

        while True:
            if attribute_batch_start_id + BATCH_SIZE > n_triplets_attribute:
                attribute_batch_start_id = 0
                attribute_permutation = np.random.permutation(
                    n_triplets_attribute)
            choice = attribute_permutation[
                attribute_batch_start_id:attribute_batch_start_id + BATCH_SIZE]
            selected_triplets_attribute = triplets_attribute[choice, :]

            attribute_s = selected_triplets_attribute[:, 0:1]
            attribute_s = np.concatenate(
                (attribute_s, np.arange(BATCH_SIZE).reshape(BATCH_SIZE, 1)),
                axis=1)
            attribute_o = selected_triplets_attribute[:, 1]

            attribute_batch_start_id += BATCH_SIZE

            yield attribute_s, attribute_o

    net_probs = map(lambda x: net_degrees[x]**0.75 if x in net_degrees else 0.,
                    range(num_users))
    net_probs = list(net_probs)
    net_probs /= np.sum(net_probs)

    P_limit = 2.

    # build model

    graph = tf.Graph()

    with graph.as_default():

        with tf.device('/gpu:0'):
            follow_ds = tf.data.Dataset.from_generator(
                follow_data_generator, (tf.int32, tf.int32),
                ([BATCH_SIZE, 1], [BATCH_SIZE])).prefetch(3).repeat()

            follow_itr = follow_ds.make_one_shot_iterator()
            trip_follow_source, trip_follow_target = follow_itr.get_next()

            attribute_ds = tf.data.Dataset.from_generator(
                attribute_data_generator, (tf.int32, tf.int32),
                ([BATCH_SIZE, 2], [BATCH_SIZE])).prefetch(3).repeat()

            attribute_itr = attribute_ds.make_one_shot_iterator()
            trip_attribute_subject, trip_attribute_object = attribute_itr.get_next(
            )

            user_embeddings = tf.Variable(
                tf.truncated_normal([num_users, USER_DIM], stddev=1e-2))

            if testing_ids is not None:
                normalized_user_embeddings = tf.nn.l2_normalize(
                    user_embeddings, dim=1)
                testing_ids_ph = tf.placeholder(tf.int32,
                                                shape=[len(testing_ids)])
                dist = tf.tensordot(tf.nn.embedding_lookup(
                    normalized_user_embeddings, testing_ids_ph),
                                    normalized_user_embeddings,
                                    axes=[[1], [1]])

            P_follow = tf.Variable(
                tf.truncated_normal([USER_DIM, USER_DIM],
                                    stddev=1.0 / USER_DIM))
            P_follow_norm = tf.norm(P_follow)

            update_P = tf.assign(
                P_follow, P_limit * tf.nn.l2_normalize(P_follow, [0, 1])).op

            bias_var = tf.Variable(tf.zeros(USER_DIM))
            bias_norm = tf.norm(bias_var)

            f_loss = 0
            a_loss = 0

            for p_ in predicates:
                if p_ == 'f':
                    target = tf.tensordot(
                        tf.nn.embedding_lookup(user_embeddings,
                                               trip_follow_target), P_follow,
                        [[1], [1]])
                    if bias:
                        target += bias_var

                    f_loss += tf.reduce_mean(
                        tf.nn.nce_loss(
                            user_embeddings,
                            tf.zeros(num_users),
                            trip_follow_source,
                            target,
                            NCE_SAM_NUM,
                            num_users,
                            num_true=1,
                            sampled_values=(np.random.choice(
                                num_users, NCE_SAM_NUM, False, net_probs),
                                            tf.ones(BATCH_SIZE,
                                                    dtype=tf.float32),
                                            tf.ones(NCE_SAM_NUM,
                                                    dtype=tf.float32))))

                if p_ == 'a':
                    attr_embeddings = tf.nn.embedding_lookup(
                        attribute_embeddings, trip_attribute_object)

                    dot = tf.tensordot(user_embeddings, attr_embeddings,
                                       [[1], [1]])
                    softm = tf.nn.softmax(dot, dim=0)
                    softm = tf.gather_nd(softm, trip_attribute_subject)
                    a_loss -= tf.reduce_mean(tf.log(softm))

            f_global_step = tf.Variable(0, trainable=False)
            a_global_step = tf.Variable(0, trainable=False)
            f_learning_rate = tf.train.exponential_decay(LEARNING_RATE_FOLLOW,
                                                         f_global_step,
                                                         10000,
                                                         0.96,
                                                         staircase=True)
            a_learning_rate = tf.train.exponential_decay(
                LEARNING_RATE_ATTRIBUTE,
                a_global_step,
                10000,
                0.96,
                staircase=True)
            if f_loss != 0:
                f_optimizer = tf.train.GradientDescentOptimizer(
                    f_learning_rate).minimize(f_loss,
                                              var_list=[user_embeddings])
                f_optimizer_P = tf.train.GradientDescentOptimizer(
                    1e-4).minimize(f_loss, var_list=[P_follow, bias_var])
            if a_loss != 0:
                a_optimizer = tf.train.GradientDescentOptimizer(
                    a_learning_rate).minimize(a_loss)

    with tf.Session(
            graph=graph,
            config=tf.ConfigProto(log_device_placement=False,
                                  allow_soft_placement=True)) as session:

        f_losses = list()
        a_losses = list()
        P_norms = list()
        bias_norms = list()

        tf.global_variables_initializer().run()

        warm_up_iter = eval(exp_config.get(
            'triplet_embedding', 'warm_up_iter')) * len(triplets) // BATCH_SIZE
        n_iter = eval(exp_config.get('triplet_embedding',
                                     'n_iter')) * len(triplets) // BATCH_SIZE

        ue_s = list()
        dist_s = list()

        for i in range(n_iter):
            if SNAPSHOT_FLAG and i % (SNAPSHOT_GAP * 1000) == 0:
                ue = user_embeddings.eval(session)
                dist_values = None if testing_ids is None else session.run(
                    dist, feed_dict={testing_ids_ph: testing_ids})
                ue_s.append(ue)
                dist_s.append(dist_values)

            if a_loss != 0:
                start_time = time.time()
                _, a_loss_val = session.run([a_optimizer, a_loss])
                a_losses.append(a_loss_val)
                if DEBUG_FLAG and np.random.rand() < 0.001:
                    print(i, 'a_loss_val', a_loss_val,
                          time.time() - start_time)

            if i >= warm_up_iter and f_loss != 0:
                # follow
                start_time = time.time()
                if i % 5 == 0:
                    _, f_loss_val, P_norm_, bias_norm_ = session.run(
                        [f_optimizer_P, f_loss, P_follow_norm, bias_norm])
                    P_norms.append(P_norm_)
                    bias_norms.append(bias_norm_)
                    if P_norm_ > P_limit:
                        session.run(update_P)
                else:
                    _, f_loss_val = session.run([f_optimizer, f_loss])

                f_losses.append(f_loss_val)
                if DEBUG_FLAG and np.random.rand() < 0.001:
                    print(i, 'f_loss_val', f_loss_val,
                          time.time() - start_time)

        ue = user_embeddings.eval(session)
        dist_values = None if testing_ids is None else session.run(
            dist, feed_dict={testing_ids_ph: testing_ids})
        ue_s.append(ue)
        dist_s.append(dist_values)

    return ue_s, dist_s, (f_losses, a_losses, (P_norms, bias_norms))
Example #20
    def __init__(self):
        _start_y = int(exp_config.get('stream', 'start_y'))
        _start_m = int(exp_config.get('stream', 'start_m'))
        _start_d = int(exp_config.get('stream', 'start_d'))

        _end_y = int(exp_config.get('stream', 'end_y'))
        _end_m = int(exp_config.get('stream', 'end_m'))
        _end_d = int(exp_config.get('stream', 'end_d'))

        self.table = exp_config.get('stream', 'table')

        self.dy_start = date(_start_y, _start_m, _start_d)
        self.dy_end = date(_end_y, _end_m, _end_d)

        self.host = exp_config.get('stream', 'host')
        self.user = exp_config.get('stream', 'user')
        self.passwd = exp_config.get('stream', 'passwd')
        self.db = exp_config.get('stream', 'db')

        self.connection = MySQLdb.connect(host=self.host,
                                          user=self.user,
                                          passwd=self.passwd,
                                          db=self.db,
                                          charset='utf8')

        self.cursor = self.connection.cursor()

        _time0 = self.dy_start.strftime("%Y-%m-%d")
        _time1 = (self.dy_start + td(days=1)).strftime("%Y-%m-%d")

        sql_str = 'select * from ' + self.table + ' where  t >= "%s" and t < "%s" order by t' % (
            _time0, _time1)
        print sql_str

        self.cursor.execute(sql_str)
Example #21
__author__ = 'Wei Xie'
__email__ = '*****@*****.**'
__affiliation__ = 'Pinnacle Lab for Analytics, Singapore Management University'
__website__ = 'http://mysmu.edu/phdis2012/wei.xie.2012'

import numpy as np
import MySQLdb
import twokenize
from timeout import timeout
import exp_config
import copy
import datetime
import user_info

### configuration ########
_host = exp_config.get('database', 'host')
_user = exp_config.get('database', 'user')
_db = exp_config.get('database', 'db')
_charset = exp_config.get('database', 'charset')

### for id mapping #########
_connection = MySQLdb.connect(host=_host,
                              user=_user,
                              passwd='123456',
                              db=_db,
                              charset=_charset)

_cursor = _connection.cursor()
_cursor.execute("desc timelines2")
_id = 0
_id_map = {}
Example #22
def embed2(X, n_dim, n_iter):
    '''
    X is a sparse matrix, (CSR), ignore the diagonal elements.
    '''

    DEBUG_FLAG = eval(exp_config.get('debug', 'flag'))
    BATCH_SIZE = eval(exp_config.get('cosine_embedding', 'batch_size'))
    LEARNING_RATE = eval(exp_config.get('cosine_embedding', 'learning_rate'))

    # checking
    if DEBUG_FLAG:
        print('X', X.shape)

        print('X[0,0]', X[0, 0])
        print('X[1,1]', X[1, 1])

        print('min X', np.min(X))
        print('max X', np.max(X))

    # focused on neighbour pairs
    start_time = time.time()

    pairs_i, pairs_j = X.nonzero()
    x_list = X.data

    assert len(x_list) == len(pairs_i)

    elements = list()
    for i, j, x in zip(pairs_i, pairs_j, x_list):
        if i != j:
            elements.append((i, j, x))

    pairs_i, pairs_j, x_list = zip(*elements)

    pairs_i = np.array(pairs_i, dtype=np.int32)
    pairs_j = np.array(pairs_j, dtype=np.int32)
    x_list = np.array(x_list, dtype=np.float32)

    print('take ', time.time() - start_time, ' seconds')

    print('pairs_i', len(pairs_i), 'pairs_j', len(pairs_j), 'x_list',
          len(x_list))

    n_W = len(x_list)
    if DEBUG_FLAG:
        print('n_pairs', n_W, (n_W + 0.) / (X.shape[0]**2))

    batch_start_id = 0
    permutation = np.random.permutation(n_W)

    # building model

    graph = tf.Graph()

    with graph.as_default():

        global_step = tf.Variable(0, trainable=False)

        input_pairs_i = tf.placeholder(tf.int32, shape=[BATCH_SIZE])
        input_pairs_j = tf.placeholder(tf.int32, shape=[BATCH_SIZE])
        input_xs = tf.placeholder(tf.float32, shape=[BATCH_SIZE])

        #attribute_embeddings = tf.Variable(tf.random_uniform([X.shape[0], n_dim], -2.0/np.sqrt(n_dim), 2.0/np.sqrt(n_dim)),
        #                              name='attribute_embeddings')
        attribute_embeddings = tf.Variable(
            tf.truncated_normal([X.shape[0], n_dim], stddev=1e-5))

        mt = tf.nn.embedding_lookup(attribute_embeddings,
                                    input_pairs_i) * tf.nn.embedding_lookup(
                                        attribute_embeddings, input_pairs_j)
        mt = tf.reduce_sum(mt, axis=1)
        loss = tf.nn.l2_loss(mt - input_xs, name='loss') * 2 / BATCH_SIZE

        learning_rate = tf.train.exponential_decay(10.,
                                                   global_step,
                                                   25 * n_W / BATCH_SIZE,
                                                   0.96,
                                                   staircase=True)  #!!!

        optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(
            loss)

    with tf.Session(
            graph=graph,
            config=tf.ConfigProto(log_device_placement=False,
                                  allow_soft_placement=True)) as session:

        tf.global_variables_initializer().run()

        real_iters = max(n_iter * n_W / BATCH_SIZE, 10000)
        print('real iters', real_iters)
        for i in range(real_iters):
            if batch_start_id + BATCH_SIZE > n_W:
                batch_start_id = 0
                permutation = np.random.permutation(n_W)
            choice = permutation[batch_start_id:batch_start_id + BATCH_SIZE]

            _input_pairs_i, _input_pairs_j, _input_xs = pairs_i[
                choice], pairs_j[choice], x_list[choice]
            batch_start_id += BATCH_SIZE

            start_time = time.time()
            _, loss_val = session.run(
                [optimizer, loss],
                feed_dict={
                    input_pairs_i: _input_pairs_i,
                    input_pairs_j: _input_pairs_j,
                    input_xs: _input_xs
                })
            if DEBUG_FLAG and np.random.rand() < 0.001 or i == real_iters - 1:
                print(i, i / (n_W / BATCH_SIZE), 'f_loss_val', loss_val,
                      time.time() - start_time, 's')

        attribute_embeddings = attribute_embeddings.eval()

    return attribute_embeddings
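A minimal sketch of driving embed2 with a small symmetric similarity matrix. The matrix is made up, and it assumes that cosine_embedding/batch_size in the configuration is no larger than the number of off-diagonal non-zeros (6 here):

import numpy as np
import scipy.sparse

# Made-up 4x4 similarity matrix; embed2 ignores the diagonal entries.
X = scipy.sparse.csr_matrix(np.array([[1.0, 0.8, 0.0, 0.0],
                                      [0.8, 1.0, 0.3, 0.0],
                                      [0.0, 0.3, 1.0, 0.5],
                                      [0.0, 0.0, 0.5, 1.0]], dtype=np.float32))

embeddings = embed2(X, n_dim=8, n_iter=5)
print(embeddings.shape)    # (4, 8)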
Example #23
__author__ = 'Wei Xie'
__email__ = '*****@*****.**'
__affiliation__ = 'Pinnacle Lab for Analytics, Singapore Management University'
__website__ = 'http://mysmu.edu/phdis2012/wei.xie.2012'


import topic_sketch.stemmer as stemmer
import fast_signi


import exp_config


_SIGNI_THRESHOLD = eval(exp_config.get('detection', 'signi_threshold'))


class SparseSigniContainer():
    _THRESHOLD_FOR_CLEANING = eval(exp_config.get('detection', 'threshold_for_cleaning'))
    _CAPACITY_FOR_CLEANING = eval(exp_config.get('detection', 'capacity_for_cleaning'))


    def __init__(self):
        self.container = {}

    def _clean(self, _timestamp):
        to_be_cleaned_up = []
        for key, value in self.container.iteritems():
            value.observe(_timestamp, 0.)
            if value.ewma <= self._THRESHOLD_FOR_CLEANING:
                to_be_cleaned_up.append(key)