Example #1
def main():
    [Xs, vects, DT, ind2obj] = loadPickle(sys.argv[1])
    [inits, labels, centers] = loadPickle(sys.argv[2])
    tweetPre = sys.argv[3]
    if len(sys.argv) > 4:
        outPickle = sys.argv[4]

### params move to argv ###
    eventID = 0
    #    Xinds = [0,1,2,3]
    Xinds = [0, 5]
    ww = 1
    wp = 1
    wl = 1
    wo = 1
    #    ws = [ww,wp,wl,wo]
    ws = [ww, 1]
    selectTime = 1  #
    wt = 0.5
    lambdaB = 0.5
    Learn = (1, 10)  #
    params = NodeParams(Xinds, ws, lambdaB, selectTime, wt, Learn)
    ### run ###
    rootNode = EventNode(Xs, DT, params, inits)
    rootNode.run()
    ########## pickle #######
    if len(sys.argv) > 4:
        sys.stderr.write('saving pickle...\n')
        with open(outPickle, 'w') as f:
            pickle.dump([params, rootNode.descriptor], f)
    ########### print #######
    sys.stderr.write('After pickle, printing...\n')
    if len(sys.argv) > 5:
        rootNode.printCluster(vects, ind2obj, tweetPre=tweetPre)
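Every example on this page assumes a project-local `utils` module that wraps the pickle API. A minimal sketch of what `loadPickle` and `savePickle` plausibly look like (hypothetical; the real signatures vary between the repos sampled here, and Example #3 even passes an explicit file mode and expects None on failure):

import pickle

def loadPickle(path, mode='rb'):
    # Return the unpickled object, or None if the file is missing/corrupt.
    try:
        with open(path, mode) as f:
            return pickle.load(f)
    except (IOError, pickle.UnpicklingError):
        return None

def savePickle(obj, path):
    # Serialize obj with the highest protocol available.
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)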
Example #2
def main():
    [resDocInd, tweetsObj, tweetsObjDedup,
     tweetsScore] = loadPickle(sys.argv[1])
    [Xs, vects, DT, ind2obj] = loadPickle(sys.argv[2])
    rootParams, rootNodeDescriptor = loadPickle(sys.argv[3])

    topK = int(sys.argv[4])  # number of top entities used
    kSummary = int(sys.argv[5])  # summary sentences
    i = int(sys.argv[6])
    window = 5

    t0 = time()
    Pw_zs = rootNodeDescriptor.Pw_zs
    Pe_z = Pw_zs[1][:, i]
    evocab = vects[5].get_feature_names()
    vocab = vects[0].get_feature_names()

    ent_ind, ents = getEntInd(evocab, Pe_z,
                              topK)  # ents: the order in which EN comes in

    print("entscore in " + str(time() - t0))
    t0 = time()

    newsObj = [ind2obj[n] for n in resDocInd]
    XN, XEn, NEb, sentencesIn, sentencesInObj, ent_text_n = getNewsContext(
        newsObj, ent_ind, ents, vocab, window)
    print("get news Context in " + str(time() - t0))
    print(len(newsObj), len(sentencesIn), len(set(sentencesIn)))
    t0 = time()

    XT, XEt, TEb, tweetsIn, tweetsInObj, ent_text_t = getTweetContext(
        tweetsObjDedup, ent_ind, ents, vocab, window)
    print("get tweet Context in " + str(time() - t0))

    print(len(tweetsObjDedup), len(tweetsIn), len(set(tweetsIn)))
    t0 = time()

    newsScore = XN.dot(Pw_zs[0][:, i])
    tweetsScore = XT.dot(Pw_zs[0][:, i])

    print("init score in " + str(time() - t0))
    t0 = time()

    NE_ = XN.dot(XEn.T)  #.multiply(NEb)
    TE_ = XT.dot(XEt.T)  #.multiply(TEb)
    NE, EN = normBypartite(NE_)
    TE, ET = normBypartite(TE_)
    print("graph constr in " + str(time() - t0))
    t0 = time()

    nScore, tScore = triHits(newsScore, tweetsScore, NE, EN, TE, ET, 0.2, 0.2,
                             5)
    print("trihits in " + str(time() - t0))
    t0 = time()

    printSummary(newsScore, tweetsScore, sentencesIn, sentencesInObj, tweetsIn,
                 tweetsInObj, kSummary)
    print "*****"
    printSummary(nScore, tScore, sentencesIn, sentencesInObj, tweetsIn,
                 tweetsInObj, kSummary)
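`normBypartite` and `triHits` are project-internal helpers that are not shown on this page. A minimal sketch of the propagation they plausibly implement, assuming dense NumPy arrays (names and update rules here are inferred, not authoritative):

import numpy as np

def normBypartite(W):
    # Normalize the bipartite doc/entity weights both ways: by rows for
    # the doc->entity walk, by columns (transposed) for entity->doc.
    W = np.asarray(W, dtype=float)
    row = W.sum(axis=1, keepdims=True)
    row[row == 0] = 1.0
    col = W.sum(axis=0, keepdims=True)
    col[col == 0] = 1.0
    return W / row, (W / col).T

def triHits(nScore, tScore, NE, EN, TE, ET, alpha, beta, iters):
    # HITS-style mutual reinforcement: news and tweet scores meet in the
    # shared entity layer and are mixed back with the initial scores.
    n, t = nScore.copy(), tScore.copy()
    for _ in range(iters):
        e = EN.dot(n) + ET.dot(t)  # entity scores from both document sides
        n = (1 - alpha) * nScore + alpha * NE.dot(e)
        t = (1 - beta) * tScore + beta * TE.dot(e)
    return n, t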
Example #3
    def _loadFuncs(self):
        self.log('Start files loading...')
        filesMap = self.filesMap
        for key in filesMap:
            filePath = filesMap[key]
            self.log(f'Try to load {key} file: {filePath}')

            func = None
            if filePath.endswith('.pickle'):
                grid = utils.loadPickle(filePath, 'rb')
                if grid is None:
                    self.error(
                        f'Unable to load {key} function as pickle:\n{filePath}.'
                    )
                func = num_methods.interpolation.SplineInterpolation(grid)
            elif filePath.endswith('.json'):
                jsn = utils.loadJson(filePath, 'r')
                if jsn is None:
                    self.error(
                        f'Unable to load {key} function interpolation as json:\n{filePath}.'
                    )
                func = num_methods.interpolation.SplineInterpolation(
                    None).load_from_dict(jsn)
            else:
                grid = utils.loadCSV(filePath)
                if grid is None:
                    self.error(
                        f'Unable to load {key} function as csv:\n{filePath}.')
                func = num_methods.interpolation.SplineInterpolation(grid)

            self.funcDict[key] = func
            self.log('Successful')
        self.log('All files loaded')
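A hypothetical `filesMap` showing the three formats `_loadFuncs` dispatches on (object name, keys, and paths are invented for illustration):

loader.filesMap = {
    'density': 'data/density.pickle',    # pre-built interpolation grid
    'pressure': 'data/pressure.json',    # spline serialized as a dict
    'viscosity': 'data/viscosity.csv',   # raw sample grid
}
loader._loadFuncs()
density = loader.funcDict['density']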
Example #4
from time import time
from sklearn.cluster import spectral_clustering
from utils import loadPickle, savePickle  # assumed project helpers, as elsewhere on this page

def spectral_cluster():
    t0 = time()
    S = spectral_clustering(
        loadPickle('./models/trump_sample_affinity.pickle'), n_clusters=100)
    savePickle(S, './models/trump_sample_spectral.pickle')
    print(S)
    print("Spectral clustering took {}s".format(time() - t0))
Example #5
from time import time
from sklearn.cluster import k_means
from utils import loadPickle, savePickle  # assumed project helpers, as elsewhere on this page

def kmeans():
    t0 = time()
    K = k_means(loadPickle('./models/trump_sample_vectors.pickle'),
                n_clusters=100,
                n_jobs=-1)
    savePickle(K, './models/trump_sample_kmeans.pickle')
    print(K)
    print("K-means took {}s".format(time() - t0))
Example #6
from utils import countLines
# loadPickle and loadWord2Vec are called below but were never imported in
# the original snippet; they presumably live in utils alongside countLines.
from utils import loadPickle, loadWord2Vec
import sys
import os  # needed for the os.path.exists checks below
from warnings import warn
import string
import numpy as np
from itertools import cycle
from itertools import repeat

# Try to load the word2vec model and the multilabelbinarizer
w2vfile = './models/w2v'
mlbfile = './models/mlb.pickle'
w2v = False

# Loading pickle files is faster, so check that one first
if os.path.exists(w2vfile + '.pickle'):
    w2v = loadPickle(w2vfile + '.pickle')
elif os.path.exists(w2vfile + '.bin'):
    w2v = loadWord2Vec(w2vfile + '.bin')
else:
    warn(
        "{} not found, will not be able to sub or create word matrices".format(
            w2vfile))

if w2v:
    word_d = w2v.layer1_size

prepare_mode = '-p' in sys.argv or '--prepare' in sys.argv or '-m' in sys.argv or '--make' in sys.argv

if os.path.exists(mlbfile) and not prepare_mode:
    mlb = loadPickle(mlbfile)
    valid_hashtags = set(mlb.classes_)
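For reference, `MultiLabelBinarizer` maps each set of hashtags to a binary indicator row, and `classes_` holds the learned tag vocabulary:

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y = mlb.fit_transform([{'maga', 'news'}, {'news'}])  # shape (2, n_tags)
print(mlb.classes_)  # ['maga' 'news']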
Example #7
def loadCoeff(self, path):
    self.coeff = loadPickle(path)
Example #8
def loadLoc(self, path):
    self.loc = loadPickle(path)
    self.N = len(self.loc)
    self.width = 2 * int(
        np.abs(self.loc[:, :2]).max() + self.loc[:, 6].max() / 2.0)
Example #9
import sys, os
import pickle
from eknot_utils import init_all, EventNode
from utils import loadPickle

if __name__ == "__main__":
    # input args: K tweetPre dataPickle outPickle mini [n_init init_size batch_size]
    K = int(sys.argv[1])
    tweetPre = sys.argv[2]
    [Xs, vects, DT, ind2obj] = loadPickle(sys.argv[3])
    outPickle = sys.argv[4]
    mini = int(sys.argv[5])
    if mini:
        n_init = int(sys.argv[6])
        init_size = int(sys.argv[7])
        batch_size = int(sys.argv[8])

    # inits
    sys.stderr.write("begin initiating... \n")
    if mini:
        inits, labels, centers = init_all(K, Xs, DT, mini, n_init, init_size,
                                          batch_size)
    else:
        inits, labels, centers = init_all(K, Xs, DT)
    # write
    if outPickle != 'null':
        with open(outPickle, 'w') as f:
            pickle.dump([inits, labels, centers], f)

    sys.stderr.write("Pickle saved. Begin printing... \n")
    ####################### print #######################
    rootNode = EventNode(Xs, initsDescriptor=inits)
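These eknot scripts are Python 2 era; under Python 3 a pickle file must be opened in binary mode, otherwise `pickle.dump` raises a TypeError:

with open(outPickle, 'wb') as f:  # 'w' (text mode) only works on Python 2
    pickle.dump([inits, labels, centers], f)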
Example #10
import sys, os
import pickle
from eknot_utils import nextData, weightX, subRun, EventNode, NodeParams
from utils import loadPickle

if __name__ == "__main__":
    # data_pickle eventNode_pickle tweetPre switchText eventID Kevent [outPickle]
    [_, vects, _, ind2obj] = loadPickle(sys.argv[1])
    rootNode = loadPickle(sys.argv[2])
    tweetPre = sys.argv[3]
    switch = sys.argv[4]
    eventID = int(sys.argv[5])  # sub event number
    K = int(sys.argv[6])
    if len(sys.argv) > 7:
        outPickle = sys.argv[7]

    ######### sub ###########
    sys.stderr.write('Running sub...\n')
    n_wdxPz_wds, XsWeighted = nextData(rootNode)
    ## params
    numX = len(XsWeighted)
    Xinds = range(numX)  # can be customized
    ws = [1 for i in Xinds]  # can be customized
    selectTime = 0  #
    wt = 0.5
    lambdaB = 0.5
    Learn = (1, 10)
    ##
    params = NodeParams(Xinds, ws, lambdaB, selectTime, wt, Learn, eventID)
    eventNode = subRun(XsWeighted, n_wdxPz_wds, K, params, rootNode.DT,
                       rootNode.dID)
Example #11
from preprocess import text2mat
from utils import loadPickle
import numpy as np
import os
from warnings import warn
from numpy.linalg import norm
from utils import saveTweet2Vec
from utils import loadTweet2Vec
import matplotlib.pyplot as plt
from keras.callbacks import ModelCheckpoint
from keras.callbacks import CSVLogger
from sklearn.metrics.pairwise import euclidean_distances

mlb_file = './models/mlb.pickle'
if os.path.exists(mlb_file):
    mlb = loadPickle(mlb_file)
else:
    warn(
        "{} doesn't exist - need this to generate labels for training: run `./preprocess.py --prepare input.txt` first"
        .format(mlb_file))


class Tweet2Vec:
    def __init__(self,
                 model=None,
                 char=True,
                 chrd=True,
                 word=True,
                 normalize=False):
        '''
        Initialize stuff
Example #12
from utils import loadPickle
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter
celebs, celeb_encodings = loadPickle(
    'Show-Segmentation-2020/final_celeb_detection/final_pickles/anchors-with-TV-encodings.pickle'
)
celeb_encodings = np.array([np.array(x) for x in celeb_encodings])

# Populating KNN space with labelled encodings
X = []
Y = []
for i in range(len(celeb_encodings)):  # prepare dataset
    for celeb_encoding in celeb_encodings[i]:
        X.append(celeb_encoding)
        Y.append(celebs[i])

neigh = KNeighborsClassifier(n_neighbors=30)
neigh.fit(X, Y)


def encoding2name(f_encodings):
    return neigh.predict(f_encodings)
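`Counter` is imported above but unused in this excerpt; presumably it majority-votes the per-frame predictions into a single name per face track. A plausible (hypothetical) helper:

def track2name(track_encodings):
    # Predict a name per frame, then keep the most common one.
    names = encoding2name(track_encodings)
    return Counter(names).most_common(1)[0][0]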


def findHostNames(shows, face_encodings):
    for show in shows:
        hosts = show.hosts.split('&')  # list of the show's hosts
        hosts = sorted(
            hosts, key=lambda x: len(face_encodings[int(x)]),
            reverse=True)  # the most frequently occurring anchor is the main anchor
Example #13
import os
import numpy as np  # used for np.zeros below but missing from the original imports
import pandas as pd
import matplotlib.pyplot as plt
from plotly.offline import plot
import plotly.express as px
from sklearn.manifold import TSNE
import seaborn as sns

import utils as ut
import configs as cf

#%% Read text2vec pickle
shape_run_id = '0209-0306'
run_root_dir = os.path.join(cf.SHAPE_RUN_DIR, shape_run_id)

shape2vec = ut.loadPickle(os.path.join(run_root_dir, "shape2vec.pkl"))
shape2loss = ut.loadPickle(os.path.join(run_root_dir, "shape2loss.pkl"))

bright = [
    "#023EFF", "#FF7C00", "#1AC938", "#E8000B", "#8B2BE2", "#9F4800",
    "#F14CC1", "#A3A3A3", "#000099", "#00D7FF", "#222A2A"
]

#%% Run TSNE on the latent vectors
latent_dim = shape2vec.get('52255064fb4396f1b129901f80d24b7b').shape[0]
latent_vects = np.zeros((len(shape2vec), latent_dim))
for i, key in enumerate(shape2vec.keys()):
    latent_vects[i, :] = shape2vec[key]

perp, lr = 40, 200
tsne = TSNE(n_components=2,
Example #14
from utils import loadPickle
from smart_open import smart_open
from preprocess import TweetHashtagIterator

w2vfile = './models/w2v_1day.pickle'
w2v = loadPickle(w2vfile)


class TweetSubIterator(TweetHashtagIterator):
    def __init__(self, source):
        TweetHashtagIterator.__init__(self, source, 'tweet', True)

    def __iter__(self):
        pass


def sub(tweet, thresh=.9):
    # TODO cache "most_similar" for speed?
    words = tweet.split()
    most_sims = []
    for word in words:
        if word in w2v:
            most_sim = w2v.most_similar(word)[0]
            if most_sim[1] > thresh:
                most_sims.append(most_sim[0])
            else:
                most_sims.append(word)
        else:
            most_sims.append(word)

    return ' '.join(most_sims)
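Hypothetical usage (the output depends entirely on the loaded w2v model):

print(sub('great rally tonight', thresh=0.85))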
Example #15
import sys, os
import pickle
from eknot_utils import nextData, weightX, subRun, EventNode, NodeParams
from utils import loadPickle

if __name__ == "__main__":
    # data_pickle plsa_pickle tweetPre switchText eventID Kevent [outPickle]
    [Xs, vects, DT, ind2obj] = loadPickle(sys.argv[1])
    rootParams, rootNodeDescriptor = loadPickle(sys.argv[2])
    tweetPre = sys.argv[3]
    switch = sys.argv[4]
    eventID = int(sys.argv[5])  # event number
    K = int(sys.argv[6])
    if len(sys.argv) > 7:
        outPickle = sys.argv[7]

    rootNode = EventNode(Xs, params=rootParams, descriptor=rootNodeDescriptor)

    ######### sub ###########
    sys.stderr.write('Running sub...\n')
    n_wdxPz_wds, XsWeighted = nextData(rootNode)
    ## params
    numX = len(XsWeighted)
    Xinds = range(numX)  # can be customized
    ws = [1 for i in Xinds]  # can be customized
    selectTime = 0  #
    wt = 0.5
    lambdaB = 0.5
    Learn = (1, 10)
    ##
    params = NodeParams(Xinds, ws, lambdaB, selectTime, wt, Learn, eventID)
Example #16
import sys
from eknot_utils import EventNode
from utils import loadPickle
import pickle

if __name__ == "__main__":
    # input args: data_pickle inits/plsa_pickle tweetPre switchtext i outpickle
    [Xs, vects, DT, ind2obj] = loadPickle(sys.argv[1])
    picklename = sys.argv[2]
    tweetPre = sys.argv[3]
    switch = sys.argv[4]
    i = int(sys.argv[5])
    outPickle = sys.argv[6]
    if 'inits_' in picklename:
        [inits, labels, centers] = loadPickle(picklename)
        rootNode = EventNode(Xs, initsDescriptor=inits)
        sys.stderr.write('Printing...\n')
        rootNode.printCluster(vects, ind2obj, tweetPre=tweetPre, switch=switch,
                              fromPlsa=0)
    elif 'plsa_' in picklename:
        rootParams, rootNodeDescriptor = loadPickle(picklename)
        rootNode = EventNode(Xs, params=rootParams,
                             descriptor=rootNodeDescriptor)
        sys.stderr.write('Printing...\n')
        resDocInd, tweetsObj, tweetsObjDedup, tweetsScore = rootNode.printCluster_i(
            vects, ind2obj, i, tweetPre=tweetPre, switch=switch, fromPlsa=1)
        sys.stderr.write('saving pickle...\n')
        with open(outPickle, 'w') as f:
            pickle.dump([resDocInd, tweetsObj, tweetsObjDedup, tweetsScore], f)
    else:
        sys.stderr.write("wrong plsa/inits pickle name\n")
        exit(-1)
Example #17
from time import time
from sklearn.metrics.pairwise import rbf_kernel
from utils import loadPickle, savePickle  # assumed project helpers, as elsewhere on this page

def get_affinity():
    t0 = time()
    A = rbf_kernel(loadPickle('./models/trump_sample_vectors.pickle'))
    savePickle(A, './models/trump_sample_affinity.pickle')
    print(A.shape)
    print("Spectral clustering took {}s".format(time() - t0))