def extendAS(self, ext_vocab=None):
    # Avoid a mutable default argument.
    if ext_vocab is None:
        ext_vocab = []
    ext_vocab = [x.lower() for x in ext_vocab]
    # This is just a restart, so the labels collected so far are still valid.
    labels = {key: value for key, value in enumerate(self.activeSearch.labels.tolist()) if value > -1}
    self.extendedVocabulary.update(set(ext_vocab))
    if len(ext_vocab) == 0:
        return
    # (500, 0) is a sentinel (min, max) that the loop below narrows to the
    # shortest/longest phrase length in the new vocabulary.
    ngram_range = (500, 0)
    for x in ext_vocab:
        l = len(x.split())
        ngram_range = (min(l, ngram_range[0]), max(l, ngram_range[1]))
    # Attach binary indicator features for the new phrases only.
    tempvectorizer = CountVectorizer(analyzer='word', vocabulary=ext_vocab, binary=True,
                                     ngram_range=ngram_range, decode_error=u'ignore')
    addX = tempvectorizer.fit_transform(self.text)
    # scale by mean distance and some factor
    # some_factor = 2
    # addX.multiply(self.scalefactor*float(some_factor))
    # Append the new columns to the existing feature matrix.
    self.Xsparse = sparse.hstack((self.Xsparse, addX))
    if self.dimred:
        print self.Xsparse.shape
        svd = TruncatedSVD(n_components=self.n_components)
        X = svd.fit_transform(self.Xsparse)
        print("dimensionality reduction leads to explained variance ratio sum of "
              + str(svd.explained_variance_ratio_.sum()))
        self.sparse = False
    else:
        X = self.Xsparse
    params = asI.Parameters(pi=self.prevalence, verbose=False, sparse=self.sparse, eta=self.eta)
    self.activeSearch = asI.kernelAS(params=params)  # fast
    self.activeSearch.initialize(X.transpose(), init_labels=labels)
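
# Hedged usage sketch for extendAS (not part of the original module; `searcher`
# stands for any instance of this wrapper class with a search already running).
# It illustrates the behavior above: phrases are lowercased, the n-gram range is
# derived from the shortest and longest phrase, and existing labels survive the
# restart because they are re-fed through init_labels.
def _demo_extendAS(searcher):
    # Both phrases are two words, so the derived ngram_range is (2, 2).
    searcher.extendAS(ext_vocab=["Wire Transfer", "western union"])
    # Querying resumes on the augmented feature matrix with labels intact.
    return searcher.activeSearch.getNextMessage()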
def test_nan():
    import cPickle as pickle
    import os, os.path as osp

    with open(osp.join(os.getenv('HOME'), 'Research/Data/ActiveSearch/ben/forumthreadsSparseMatrix.pkl'), 'r') as fl:
        X = pickle.load(fl)
    X = X.T

    # Drop all-zero rows/columns; done twice because removing empty columns
    # can create newly empty rows (and vice versa).
    X = X[np.squeeze(np.array(np.nonzero(X.sum(axis=1))[0])), :]
    X = X[:, np.squeeze(np.array(np.nonzero(X.sum(axis=0))[1]))]
    X = X[np.squeeze(np.array(np.nonzero(X.sum(axis=1))[0])), :]
    X = X[:, np.squeeze(np.array(np.nonzero(X.sum(axis=0))[1]))]
    print X.shape

    r, n = X.shape
    nt = int(0.05*n)
    num_eval = 50
    Y = np.array([1]*nt + [0]*(n-nt), dtype=int)
    nr.shuffle(Y)
    pi = float(sum(Y))/len(Y)  # force true division under Python 2
    init_pt = 537

    t1 = time.time()
    verbose = True
    prms = ASI.Parameters(pi=pi, sparse=True, verbose=verbose)
    kAS = ASI.kernelAS(prms)
    kAS.initialize(X)

    init_lbls = {init_pt: 1}
    kAS.firstMessage(init_pt)

    import IPython
    IPython.embed()

    for i in range(num_eval):
        idx1 = kAS.getNextMessage()
        kAS.setLabelCurrent(Y[idx1])
        init_lbls[idx1] = Y[idx1]

    IPython.embed()
def test_CC():
    nac = np.allclose
    n = 1000
    r = 100
    nt = 200
    rcross = 0
    X, Y = createFakeData3(n, r, nt, rcross)

    num_eval = 50
    pi = float(sum(Y))/len(Y)  # force true division under Python 2
    init_pt = 5

    A = X.T.dot(X)
    t1 = time.time()

    verbose = True
    prms = ASI.Parameters(pi=pi, sparse=False, verbose=verbose)
    kAS = ASI.kernelAS(prms)
    kAS.initialize(X)
    sAS = ASI.shariAS(prms)
    sAS.initialize(A)
    # sAS2 = ASI.naiveShariAS(prms)

    kAS.firstMessage(init_pt)
    sAS.firstMessage(init_pt)

    # Both implementations should walk through the same points and agree on f.
    for i in range(num_eval):
        idx1 = kAS.getNextMessage()
        kAS.setLabelCurrent(Y[idx1])
        idx2 = sAS.getNextMessage()
        sAS.setLabelCurrent(Y[idx2])
        print('NEXT')
        print idx1 == idx2
        print nac(kAS.f, sAS.f)

    import IPython
    IPython.embed()
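
# createFakeData3 is defined elsewhere in this repo. The stand-in below is a
# hedged sketch that only documents the shape contract test_CC relies on:
# X is an r x n feature matrix whose first nt columns are the positives, Y is
# the matching 0/1 label vector. How rcross mixes features across the two
# clusters is an assumption; it is ignored here for simplicity.
def _createFakeData3_sketch(n, r, nt, rcross=0):
    import numpy as np
    import numpy.random as nr
    X = np.zeros((r, n))
    X[:r//2, :nt] = nr.rand(r//2, nt)          # positives load on the first half of features
    X[r//2:, nt:] = nr.rand(r - r//2, n - nt)  # negatives load on the second half
    Y = np.array([1]*nt + [0]*(n-nt), dtype=int)
    return X, Y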
def test_interface():
    verbose = False
    Xfull = load_sparse_csr('Xfull1.npz')
    r, n = Xfull.shape
    nt = int(0.05*n)
    num_eval = 1000
    # num_eval = nt*2
    Y = np.array([1]*nt + [0]*(n-nt), dtype=int)
    pi = float(sum(Y))/len(Y)  # force true division under Python 2
    init_pt = 100

    t1 = time.time()
    prms = ASI.Parameters(pi=pi, sparse=True, verbose=verbose)
    kAS = ASI.kernelAS(prms)
    kAS.initialize(Xfull)
    kAS.firstMessage(init_pt)
    fs2 = [kAS.f]
    for i in range(num_eval):
        idx = kAS.getNextMessage()
        kAS.setLabelCurrent(Y[idx])
        fs2.append(kAS.f)
    t2 = time.time()

    # Compare against the reference implementation step by step.
    f1, h1, s1, fs1, dtinv1 = AS.kernel_AS(Xfull, Y, pi=pi, num_eval=num_eval, init_pt=init_pt,
                                           verbose=verbose, all_fs=True, tinv=True, sparse=True)
    t3 = time.time()

    checks = [np.allclose(fs1[i], fs2[i]) for i in range(len(fs1))]

    import IPython
    IPython.embed()
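
# load_sparse_csr is defined elsewhere in this repo. A common implementation of
# the save_sparse_csr/load_sparse_csr idiom is sketched below; it assumes the
# .npz file was written with the matrix's data/indices/indptr/shape arrays
# under exactly those key names (treat that as an assumption).
def _load_sparse_csr_sketch(filename):
    import numpy as np
    from scipy import sparse
    loader = np.load(filename)
    return sparse.csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                             shape=loader['shape'])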
    dataConn = mysql_conn.flatfileDataConnect()
    message_count = dataConn.connect(args.JSON_path)
else:
    dataConn = mysql_conn.mysqlDataConnect()
    message_count = dataConn.connect(args.database, args.database_hostname,
                                     args.database_user, args.database_pass)

activeSearch = None

# When firstMessage is called we reinitialize the kernel algorithm. Calling
# initialize again requires inverting C, so we could be smarter and cache that
# inverse. For now the inversion only takes a couple of seconds, so we leave
# that as future work.
restart_save = None
first_run = True

if args.method == "kernel":
    print "Using kernelAS"
    activeSearch = asI.kernelAS()
    wMat = dataConn.getFinalFeatureMatrix(args.wordlimit, args.skip_stemmer, args.num_cpus,
                                          message_count, args.out_to_database,
                                          args.in_from_database, 0, 0)
    restart_save = wMat.copy()
    activeSearch.initialize(wMat)
elif args.method == "shari":
    print "Using shariAS"
    activeSearch = asI.shariAS()
    A = dataConn.getAffinityMatrix(args.wordlimit, args.skip_stemmer, args.num_cpus,
                                   message_count, args.out_to_database,
                                   args.in_from_database, 0, 0)
    # Feed the dense version to shari's code because the sparse version is not implemented.
    activeSearch.initialize(np.array(A.todense()))
elif args.method == "naiveshari":
    print "Using naiveShariAS"
    activeSearch = asI.naiveShariAS()
    A = dataConn.getAffinityMatrix(args.wordlimit, args.skip_stemmer, args.num_cpus,
                                   message_count, args.out_to_database,
                                   args.in_from_database, 0, 0)
    # Feed the dense version to shari's code because the sparse version is not implemented.
    activeSearch.initialize(np.array(A.todense()))
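
# Hedged example invocation of this daemon (the flag spellings are inferred
# from the args.* attributes used above; the actual argparse definitions live
# elsewhere in this script, so treat the exact flags as assumptions):
#
#   python <this_script>.py --method kernel --database <db_name> \
#       --database_hostname localhost --database_user <user> --database_pass <pass>
#
# With --method shari or --method naiveshari, the dense affinity matrix A is
# built instead of the sparse feature matrix, as the branches above show.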
def test_warm_start():
    verbose = True
    nac = np.allclose
    Xfull = load_sparse_csr('Xfull1.npz')

    # Take a small submatrix, then drop points/features that become all-zero;
    # done twice because removing columns can create newly empty rows.
    n = 300
    r = 600
    X = Xfull[:, :n]
    X = X[np.squeeze(np.array(np.nonzero(X.sum(axis=1))[0])), :]
    X = X[:, np.squeeze(np.array(np.nonzero(X.sum(axis=0))[1]))]
    X = X[:r, :]
    X = X[np.squeeze(np.array(np.nonzero(X.sum(axis=1))[0])), :]
    X = X[:, np.squeeze(np.array(np.nonzero(X.sum(axis=0))[1]))]
    print X.shape

    r, n = X.shape
    nt = int(0.05*n)
    num_eval = 50
    Y = np.array([1]*nt + [0]*(n-nt), dtype=int)
    nr.shuffle(Y)
    pi = float(sum(Y))/len(Y)  # force true division under Python 2
    init_pt = 5

    A = np.array((X.T.dot(X)).todense())
    t1 = time.time()

    prms = ASI.Parameters(pi=pi, sparse=True, verbose=verbose)
    kAS = ASI.kernelAS(prms)
    kAS.initialize(X)
    kAS2 = ASI.kernelAS(prms)
    sAS = ASI.shariAS(prms)
    sAS2 = ASI.naiveShariAS(prms)

    # Run kernelAS interactively, remembering every label it collects.
    init_lbls = {init_pt: 1}
    kAS.firstMessage(init_pt)
    fs2 = [kAS.f]
    for i in range(num_eval):
        idx1 = kAS.getNextMessage()
        kAS.setLabelCurrent(Y[idx1])
        init_lbls[idx1] = Y[idx1]

    # Warm-start the other implementations in one batch from the collected labels.
    print("Batch initializing:")
    print("Kernel AS:")
    kAS2.initialize(X, init_lbls)
    print("Shari AS:")
    sAS.initialize(A, init_lbls)
    print("Naive Shari AS:")
    sAS2.initialize(A, init_lbls)

    import IPython
    IPython.embed()
def test_interface3():
    verbose = True
    nac = np.allclose
    Xfull = load_sparse_csr('Xfull1.npz')
    print Xfull.shape

    # Drop all-zero rows/columns; done twice because removing empty columns
    # can create newly empty rows (and vice versa).
    Xfull = Xfull[np.squeeze(np.asarray(np.nonzero(Xfull.sum(axis=1))[0])), :]
    Xfull = Xfull[:, np.squeeze(np.asarray(np.nonzero(Xfull.sum(axis=0))[1]))]
    print Xfull.shape
    Xfull = Xfull[np.squeeze(np.asarray(np.nonzero(Xfull.sum(axis=1))[0])), :]
    Xfull = Xfull[:, np.squeeze(np.asarray(np.nonzero(Xfull.sum(axis=0))[1]))]

    X = Xfull  # (subsetting to n = 300, r = 600 is disabled here)
    r, n = X.shape
    nt = int(0.05*n)
    num_eval = 50
    Y = np.array([1]*nt + [0]*(n-nt), dtype=int)
    pi = float(sum(Y))/len(Y)  # force true division under Python 2
    init_pt = 5

    A = np.array((X.T.dot(X)).todense())
    t1 = time.time()

    prms = ASI.Parameters(pi=pi, sparse=True, verbose=verbose)
    kAS = ASI.kernelAS(prms)
    kAS.initialize(X)
    sAS = ASI.shariAS(prms)
    sAS.initialize(A)

    kAS.firstMessage(init_pt)
    fs2 = [kAS.f]
    sAS.firstMessage(init_pt)
    fs3 = [sAS.f]

    # Scratch work for checking the rank-one label update against a direct inverse:
    # lbl = 1
    # idx = 5
    # B = np.ones(n)/(1+prms.w0)
    # D = A.sum(axis=1)
    # BDinv = np.diag(np.squeeze(B*1./D))
    # IA = np.eye(n) - BDinv.dot(A)
    # IAi = np.matrix(nlg.inv(IA))
    # IAk = nlg.inv(np.eye(n) + kAS.BDinv.dot(X.T.dot(nlg.inv(np.eye(r) - X.dot(kAS.BDinv.dot(X.T))))).dot(X.todense()))
    # IAki = nlg.inv(IAk)
    # t = (1+prms.w0)*(1-prms.eta)
    # e = np.zeros((n,1))
    # e[idx] = 1
    # IA2 = IA + (1-t)*e.dot(e.T).dot(BDinv.dot(A))
    # ai = (1./D)[idx]/(1+prms.w0)*A[idx,:]
    # Ad = (1-t)*IAi[:,idx].dot(ai.dot(IAi))/(1 + (1-t)*ai.dot(IAi[:,idx]))
    # IA2i = IAi - Ad

    # Both implementations should walk through the same points and agree on f.
    for i in range(num_eval):
        idx1 = kAS.getNextMessage()
        idx2 = sAS.getNextMessage()
        print('NEXT')
        print idx1 == idx2
        print nac(kAS.f, sAS.f)
        kAS.setLabelCurrent(Y[idx1])
        sAS.setLabelCurrent(Y[idx2])
        fs2.append(kAS.f)
        fs3.append(sAS.f)
    t2 = time.time()

    # f1,h1,s1,fs1,dtinv1 = AS.kernel_AS(Xfull, Y, pi=pi, num_eval=num_eval, init_pt=init_pt,
    #                                    verbose=verbose, all_fs=True, tinv=True, sparse=True)
    t3 = time.time()
    # checks = [np.allclose(fs1[i], fs2[i]) for i in range(len(fs1))]

    import IPython
    IPython.embed()
def test_interface2():
    verbose = True
    nac = np.allclose
    Xfull = load_sparse_csr('Xfull1.npz')
    print Xfull.shape

    # Take a small submatrix, then drop points/features that become all-zero;
    # done twice because removing columns can create newly empty rows.
    n = 300
    r = 600
    X = Xfull[:, :n]
    X = X[np.squeeze(np.array(np.nonzero(X.sum(axis=1))[0])), :]
    X = X[:, np.squeeze(np.array(np.nonzero(X.sum(axis=0))[1]))]
    X = X[:r, :]
    X = X[np.squeeze(np.array(np.nonzero(X.sum(axis=1))[0])), :]
    X = X[:, np.squeeze(np.array(np.nonzero(X.sum(axis=0))[1]))]
    print X.shape

    r, n = X.shape
    nt = int(0.05*n)
    num_eval = 50
    Y = np.array([1]*nt + [0]*(n-nt), dtype=int)
    pi = float(sum(Y))/len(Y)  # force true division under Python 2
    init_pt = 5

    A = np.array((X.T.dot(X)).todense())
    t1 = time.time()

    prms = ASI.Parameters(pi=pi, sparse=True, verbose=verbose)
    kAS = ASI.kernelAS(prms)
    kAS.initialize(X)
    sAS = ASI.naiveShariAS(prms)
    sAS.initialize(A)

    import IPython
    IPython.embed()

    kAS.firstMessage(init_pt)
    fs2 = [kAS.f]
    sAS.firstMessage(init_pt)
    fs3 = [sAS.f]

    IPython.embed()

    # Both implementations should walk through the same points and agree on f.
    for i in range(num_eval):
        idx1 = kAS.getNextMessage()
        idx2 = sAS.getNextMessage()
        print('NEXT')
        print idx1 == idx2
        print nac(kAS.f, sAS.f)
        kAS.setLabelCurrent(Y[idx1])
        sAS.setLabelCurrent(Y[idx2])
        fs2.append(kAS.f)
        fs3.append(sAS.f)
    t2 = time.time()

    # f1,h1,s1,fs1,dtinv1 = AS.kernel_AS(Xfull, Y, pi=pi, num_eval=num_eval, init_pt=init_pt,
    #                                    verbose=verbose, all_fs=True, tinv=True, sparse=True)
    t3 = time.time()
    # checks = [np.allclose(fs1[i], fs2[i]) for i in range(len(fs1))]

    IPython.embed()
def startAS(self, corpus, labeled_corpus=None, labels=None, starting_points=None):
    """
    corpus --> list of tuples (id, text) where id is the external id
    """
    # Avoid mutable default arguments.
    labeled_corpus = labeled_corpus if labeled_corpus is not None else []
    labels = labels if labels is not None else []
    starting_points = starting_points if starting_points is not None else []

    num_labels = len(labels)
    if num_labels != len(labeled_corpus):
        raise Exception("Number of labels and number of previously labeled objects do not match")
    if num_labels > 0:
        self.prev_corpus.extend(labeled_corpus)
        self.prev_labels.extend(labels)

    # Initialize with previous information.
    self.start_idx = len(self.prev_labels)

    # Map external id (e.g. AdId) to internal index for the new corpus.
    self.id_to_idx = {}
    for i, el in enumerate(corpus):
        self.id_to_idx[el[0]] = i + self.start_idx

    # Do not include indices pointing to already labeled objects from a previous AS run.
    self.curr_corpus = corpus
    self.num_messages = len(corpus)
    self.unlabeled_idxs = set(xrange(self.start_idx, self.num_messages))

    self.hashlookup = {}
    if self.dedupe:
        # Calculate all minhash values. For now, save collisions in a dictionary;
        # replace with locality-sensitive hashing later.
        self.hashed = [self.hashing(tup[1].lower()) for i, tup in enumerate(corpus)]
        for i, h in enumerate(self.hashed):
            if h in self.hashlookup:
                self.hashlookup[h].append(i)
            else:
                self.hashlookup[h] = [i]

    # Save the text so that a restart is possible.
    text = [x[1] for x in self.prev_corpus] + [y[1] for y in corpus]
    self.text = text

    # Featurize. (500, 0) is a sentinel (min, max) narrowed to the
    # shortest/longest phrase in the extended vocabulary.
    ngram_range = (500, 0)
    if len(self.extendedVocabulary) == 0:
        ngram_range = (1, 1)
    for x in self.extendedVocabulary:
        l = len(x.split())
        ngram_range = (min(l, ngram_range[0]), max(l, ngram_range[1]))

    if self.vocab is None:
        vocabulary = self.getVocabulary(text, extendedVoc=list(self.extendedVocabulary))
    else:
        vocabulary = self.vocab + list(self.extendedVocabulary)
    if self.tfidf:
        self.setTfidf(vocab=vocabulary, ngram_range=ngram_range)
    else:
        self.setCountVectorizer(vocab=vocabulary, ngram_range=ngram_range)
    self.Xsparse = self.vectorizer.fit_transform(text)

    # Add a column with ones for empty rows. Detecting empty rows via the dot
    # product works because the features are non-negative.
    a = self.Xsparse.dot(np.ones(self.Xsparse.shape[1]))
    anonz = a.nonzero()[0]
    if anonz.shape[0] != self.Xsparse.shape[0]:  # the matrix contains empty rows
        b = np.ones(self.Xsparse.shape[0])
        b[anonz] = 0
        self.Xsparse = sparse.hstack((self.Xsparse, sparse.csr_matrix(b).T))

    if self.dimred:
        print self.Xsparse.shape
        svd = TruncatedSVD(n_components=self.n_components)
        X = svd.fit_transform(self.Xsparse)
        print("dimensionality reduction leads to explained variance ratio sum of "
              + str(svd.explained_variance_ratio_.sum()))
        self.sparse = False
        # b = np.array([len(x) for x in text], ndmin=2).transpose()
        # X = np.hstack((X, b))
    else:
        # b = np.array([len(x) for x in text], ndmin=2).transpose()
        # self.Xsparse = sparse.hstack((X, b))
        X = self.Xsparse

    # Get scale: estimate pairwise distances through random sampling.
    # pairwise_dists = squareform(pdist(X[np.random.choice(X.shape[0], 1000, replace=False), :], 'euclidean'))
    # self.scalefactor = np.mean(pairwise_dists)

    params = asI.Parameters(pi=self.prevalence, verbose=False, sparse=self.sparse, eta=self.eta)
    self.activeSearch = asI.kernelAS(params=params)  # fast
    if len(starting_points) == 0:
        if len(self.prev_labels) == 0:
            raise Exception("No start point and no labels provided")
    init_labels = {key: value for key, value in enumerate(self.prev_labels)}
    for x in starting_points:
        idx = self.id_to_idx[x]
        self.unlabeled_idxs.remove(idx)
        init_labels[idx] = 1
    self.activeSearch.initialize(X.transpose(), init_labels=init_labels)
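
# Hedged end-to-end sketch (not part of the original module) of how startAS,
# the wrapped kernelAS object, and extendAS fit together. The corpus format
# follows the startAS docstring (a list of (external_id, text) tuples), the
# getNextMessage/setLabelCurrent calls match the test functions earlier in this
# file, and `searcher` stands for any instance of this wrapper class.
def _demo_start_and_extend(searcher):
    corpus = [(101, "first ad text"), (102, "second ad text"), (103, "third ad text")]
    searcher.startAS(corpus, starting_points=[101])  # seed with one known positive
    for _ in range(2):
        idx = searcher.activeSearch.getNextMessage()  # internal index of the next candidate
        label = 1  # in real use a human supplies this; 1 = relevant, 0 = not
        searcher.activeSearch.setLabelCurrent(label)
    # Later, fold newly discovered phrases into the feature space and restart
    # without losing the labels gathered so far:
    searcher.extendAS(ext_vocab=["wire transfer"])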