def vectorizer(tokens, w2v_db):
    db_path = w2v_db
    # POS TAGGING
    tagger = POSTagger('tagger/english-left3words-distsim.tagger',
                       'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)
    unsorted_kw = OrderedDict()
    for (w, t) in tagged_tokens:
        if t in ['NNP', 'NNPS', 'FW']:
            label = 1.5
        elif t in ['NN', 'NNS']:
            label = 1
        else:
            continue
        w = w.lower()
        try:
            unsorted_kw[w] += label
        except KeyError:
            unsorted_kw[w] = label
    # Get the vectors of words. Maintain order as in document.
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    words = (word.lower() for word in unsorted_kw)
    for word in words:
        if word in token_vecs:  # Vector already fetched for this word.
            continue
        v = conn.read(word)
        if v is not None:
            token_vecs[word] = list(v)
    # Output for debugging; total vs unique words.
    print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs)))
    conn.close()
    return unsorted_kw, token_vecs
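# A minimal usage sketch for vectorizer(), assuming the Stanford tagger files and a
# word2vec-backed SQLite database are available. The token list and the
# 'db/word2vec.db' path below are hypothetical; real tokens would come from the
# page tokenizer upstream of this function.
if __name__ == '__main__':
    sample_tokens = ['Mount', 'Everest', 'is', 'the', 'highest', 'mountain']  # hypothetical input
    keywords, vectors = vectorizer(sample_tokens, 'db/word2vec.db')           # hypothetical db path
    print(list(keywords.items())[:5])  # weighted noun/proper-noun counts, in document order
    print(len(vectors))                # number of keywords that had a vector in the database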
def search(self, search_str, search_dir, depth=2):
    # Depth -> number of top nodes to search. Default 2 (arbitrary) has been sufficient so far.
    results = Counter()  # {url: score}
    # {nid: dist} Similarity of each neuron to the aggregated search query.
    dist_neu_searchq = Counter()
    neuron_lookup = self._neu_index
    neuron_labels = [[k for (k, n) in Counter(neuron.get_keylist()).most_common()[:10]]
                     for neuron in self._neurons]
    # Global matrix. Contains similarity of each search term in the query to all
    # neurons. Something like a cache.
    glomat = self._glomat
    conn = SQLCon()
    # Obtain (word, vec) pairs for the search terms that exist in the database.
    searchvecs = [(x, list(conn.read(x))) for x in search_str.split()
                  if conn.read(x) is not None]
    search_len = len(searchvecs)
    for (w, v) in searchvecs:  # For colour coding the map.
        try:
            for nid in glomat[w]:
                if glomat[w][nid] > dist_neu_searchq[nid]:
                    dist_neu_searchq[nid] += glomat[w][nid] / search_len
        except KeyError:
            glomat[w] = {}
            for nid, neuron in enumerate(self._neurons):
                # Cosine similarity, hence 1 is best, 0 is unrelated, -1 is opposite.
                glomat[w][nid] = distance(neuron.get_weights(), v)
                if glomat[w][nid] > dist_neu_searchq[nid]:
                    dist_neu_searchq[nid] += glomat[w][nid] / search_len
    # Union of all doclists with minimum dist_from_neuron.
    doclist = {}
    for nid, _ in dist_neu_searchq.most_common()[:depth]:
        neuron = neuron_lookup[nid]
        doclist.update(neuron.get_top_docs(30))
    files = (open(doc) for doc in doclist)
    for json_file in files:
        data = json.load(json_file)
        centroids = data['centroids']
        url = data['url']
        json_file.close()
        # NOTE: v here is the vector of the last search term from the loop above.
        wc_sim = [distance(v, c) for c in centroids]
        max_wc_sim = max(wc_sim)
        results[url] += max_wc_sim / search_len
    results = OrderedDict(results.most_common(20))
    htmlVars = {'query': search_str, 'results': results}
    htmlCode = template.render(htmlVars)
    result_path = os.path.join(search_dir, search_str + '.html')
    map_path = os.path.join(search_dir, search_str + '_map.html')
    with open(result_path, 'w') as f:
        f.write(htmlCode)
    self.draw_SOM(search_str, dist_neu_searchq, neuron_labels, map_path)
    result_path = "file://{}".format(pathname2url(result_path))
    map_path = "file://{}".format(pathname2url(map_path))
    webbrowser.open(result_path)
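# The distance() helper used above is not shown in this snippet; the inline comment
# says it behaves like cosine similarity (1 = same direction, 0 = unrelated,
# -1 = opposite). A minimal sketch of such a helper, assuming plain sequences of
# floats as input; the name cosine_similarity is illustrative, not the project's.
import math

def cosine_similarity(a, b):
    """Return the cosine similarity between two equal-length vectors."""
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    if norm_a == 0 or norm_b == 0:
        return 0.0  # Degenerate vectors carry no directional information.
    return dot / (norm_a * norm_b)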
def processor(name, url, tokens, db_path, json_dir, USE_TITLE_WORDS=False):
    # POS TAGGING
    tagger = POSTagger('tagger/english-left3words-distsim.tagger',
                       'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)
    unsorted_kw = OrderedDict()
    for (w, t) in tagged_tokens:
        if t in ['NNP', 'NNPS', 'FW']:
            label = 1.5
        elif t in ['NN', 'NNS']:
            label = 1
        else:
            continue
        w = w.lower()
        try:
            unsorted_kw[w] += label
        except KeyError:
            unsorted_kw[w] = label
    # Get the vectors list.
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    words = (word.lower() for word in unsorted_kw)
    for word in words:
        if word in token_vecs:  # Vector already fetched for this word.
            continue
        v = conn.read(word)
        if v is not None:
            token_vecs[word] = list(v)
    print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs)))
    conn.close()
    # Compute cluster centers.
    nk = round(len(token_vecs) / 4)
    data = numpy.array(list(token_vecs.values()))
    cent, _ = kmeans2(data, nk, iter=20, minit='points')
    centroids = cent.tolist()
    # Create the JSON object for this webpage.
    if not os.path.exists(json_dir):
        os.makedirs(json_dir)
    json_path = os.path.join(json_dir, name + '.json')
    with open(json_path, 'w') as file_dest:
        json.dump({'url': url,
                   'vectors': token_vecs,
                   'keyword_frequency': unsorted_kw,
                   'centroids': centroids}, file_dest)
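# The clustering step above relies on scipy.cluster.vq.kmeans2, which returns the
# centroid matrix and a per-sample label array; processor() keeps only the centroids.
# A standalone sketch of that call on dummy data; the 100x300 shape is hypothetical
# and simply mimics "100 word vectors of dimension 300".
import numpy
from scipy.cluster.vq import kmeans2

dummy_vecs = numpy.random.rand(100, 300)
cent, labels = kmeans2(dummy_vecs, 25, iter=20, minit='points')
print(cent.shape)    # (25, 300): one centroid per cluster
print(labels.shape)  # (100,): cluster index assigned to each vector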