Example #1
# Assumed imports (not shown in this snippet): OrderedDict from collections,
# POSTagger (NLTK's Stanford tagger wrapper; StanfordPOSTagger in newer NLTK),
# and SQLCon, the project's word2vec database helper.
def vectorizer(tokens, w2v_db):
    db_path = w2v_db
    # POS TAGGING
    tagger = POSTagger('tagger/english-left3words-distsim.tagger', 'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)
    unsorted_kw = OrderedDict()
    for (w,t) in tagged_tokens:
        if t in ['NNP', 'NNPS', 'FW']:   # proper nouns and foreign words get extra weight
            label = 1.5
        elif t in ['NN', 'NNS']:         # common nouns
            label = 1
        else:
            continue
        w = w.lower()
        try:
            unsorted_kw[w] += label
        except KeyError:
            unsorted_kw[w] = label
    # Get the vectors of words. Maintain order as in document.
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    words = (word.lower() for word in unsorted_kw)
    for word in words:
        if word in token_vecs:  # vector already fetched for this word
            continue
        v = conn.read(word)
        if v is not None:
            token_vecs[word] = list(v)
    print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs))) #Output for debugging; total vs unique words.
    conn.close()
    return unsorted_kw, token_vecs
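
A minimal usage sketch (not part of the original snippet): the sample text, the NLTK tokenizer, and the 'word2vec.db' path are placeholder assumptions; vectorizer only needs a list of word tokens and a database path that SQLCon accepts.

from nltk.tokenize import word_tokenize

text = "Stanford researchers published a new word embedding model."
tokens = word_tokenize(text)                            # any list of word tokens will do
keywords, vectors = vectorizer(tokens, 'word2vec.db')   # placeholder DB path
for word, weight in keywords.items():
    print(word, weight)                                 # proper nouns weighted 1.5, common nouns 1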
Example #2
    def search(self, search_str, search_dir, depth=2):
        # Depth -> number of top nodes to search. Default 2 (arbitrary) has been sufficient so far.
        results = Counter()  # {url:score}
        dist_neu_searchq = Counter()  # {nid:dist} similarity of each neuron to the aggregated search query
        neuron_lookup = self._neu_index
        neuron_labels = [[k for (k, n) in Counter(neuron.get_keylist()).most_common(10)]
                         for neuron in self._neurons]
        glomat = self._glomat  # Global matrix. Contains similarity of each search term in the query to all neurons. Something like a cache.

        conn = SQLCon()
        # Obtain (word, vec) pairs for the search terms present in the word2vec DB
        searchvecs = [(x, list(v)) for (x, v) in ((x, conn.read(x)) for x in search_str.split())
                      if v is not None]
        search_len = len(searchvecs)
        for (w, v) in searchvecs:  # For colour coding the map
            try:
                for nid in glomat[w]:
                    if glomat[w][nid] > dist_neu_searchq[nid]:
                        dist_neu_searchq[nid] += glomat[w][nid] / search_len
            except KeyError:
                glomat[w] = {}
                for nid, neuron in enumerate(self._neurons):
                    glomat[w][nid] = distance(
                        neuron.get_weights(), v
                    )  # cosine similarity: 1 is best, 0 is unrelated, -1 is opposite.
                    if glomat[w][nid] > dist_neu_searchq[nid]:
                        dist_neu_searchq[nid] += glomat[w][nid] / search_len

            # Union of all doclists with minimum dist_from_neuron.
            doclist = {}
            for nid, _ in dist_neu_searchq.most_common(depth):
                neuron = neuron_lookup[nid]
                doclist.update(neuron.get_top_docs(30))
            files = (open(doc) for doc in doclist)
            for json_file in files:
                data = json.load(json_file)
                centroids = data['centroids']
                url = data['url']
                json_file.close()
                wc_sim = [distance(v, c) for c in centroids]
                max_wc_sim = max(wc_sim)
                results[url] += max_wc_sim / len(searchvecs)

        results = OrderedDict(results.most_common(20))
        htmlVars = {'query': search_str, 'results': results}
        htmlCode = template.render(htmlVars)
        result_path = os.path.join(search_dir, search_str + '.html')
        map_path = os.path.join(search_dir, search_str + '_map.html')

        with open(result_path, 'w') as f:
            f.write(htmlCode)
        self.draw_SOM(search_str, dist_neu_searchq, neuron_labels, map_path)

        result_path = "file://{}".format(pathname2url(result_path))
        map_path = "file://{}".format(pathname2url(map_path))
        webbrowser.open(result_path)
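
The comments above treat distance as cosine similarity (1 = best match, -1 = opposite). Below is a minimal sketch of such a helper, assuming numpy and plain list/array vectors; the project's actual implementation may differ.

import numpy as np

def distance(u, v):
    # Cosine similarity: 1 = same direction, 0 = orthogonal, -1 = opposite.
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    return float(np.dot(u, v) / denom) if denom else 0.0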
Example #3
# Assumes numpy and scipy.cluster.vq.kmeans2 in addition to the dependencies noted for Example #1.
def processor(name, url, tokens, db_path, json_dir, USE_TITLE_WORDS=False):
    # POS TAGGING
    tagger = POSTagger('tagger/english-left3words-distsim.tagger',
                       'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)

    unsorted_kw = OrderedDict()
    for (w, t) in tagged_tokens:
        if t in ['NNP', 'NNPS', 'FW']:
            label = 1.5
        elif t in ['NN', 'NNS']:
            label = 1
        else:
            continue
        w = w.lower()
        try:
            unsorted_kw[w] += label
        except KeyError:
            unsorted_kw[w] = label

    # Get the vectors list
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    words = (word.lower() for word in unsorted_kw)
    for word in words:
        if word in token_vecs:  # vector already fetched for this word
            continue
        v = conn.read(word)
        if v is not None:
            token_vecs[word] = list(v)
    print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs)))
    conn.close()

    # Compute cluster centers: roughly one centroid per four keyword vectors.
    nk = round(len(token_vecs) / 4)
    data = numpy.array(list(token_vecs.values()))
    cent, _ = kmeans2(data, nk, iter=20, minit='points')
    centroids = cent.tolist()

    # Create the JSON object for this webpage.

    if not os.path.exists(json_dir):
        os.makedirs(json_dir)
    json_path = os.path.join(json_dir, name + '.json')
    with open(json_path, 'w') as file_dest:
        json.dump(
            {
                'url': url,
                'vectors': token_vecs,
                'keyword_frequency': unsorted_kw,
                'centroids': centroids
            }, file_dest)
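
A hedged usage sketch for processor: the page text, name, URL, database path, and output directory below are placeholders, not values from the original project.

from nltk.tokenize import word_tokenize

page_text = "OpenCV is a library of programming functions for computer vision."
processor(
    name='example_page',                 # output file becomes json_out/example_page.json
    url='http://example.com/page',       # stored in the JSON object
    tokens=word_tokenize(page_text),
    db_path='word2vec.db',               # placeholder word2vec database for SQLCon
    json_dir='json_out',                 # created if it does not exist
)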