Example #1
def score_pods(query, query_dist, query_freqs):
    '''Score pods for a query'''
    pod_scores = {}
    score_sum = 0.0
    pods = db.session.query(Pods).filter_by(registered=True).all()
    for p in pods:
        DS_score = cosine_similarity(convert_to_array(p.DS_vector), query_dist)
        term_score, coverage = term_cosine.run(query, query_freqs,
                                               p.word_vector)
        score = DS_score + term_score + 2 * coverage
        if math.isnan(score):
            score = 0
        pod_scores[p.name] = score
        score_sum += score
    print(pod_scores)
    # If all scores are rubbish, search the entire pod collection (we're desperate!)
    if score_sum < 1:
        return list(pod_scores.keys())
    else:
        best_pods = []
        # Keep only the single highest-scoring pod
        for k in sorted(pod_scores, key=pod_scores.get, reverse=True):
            if len(best_pods) < 1:
                best_pods.append(k)
            else:
                break
        return best_pods
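These snippets rely on module-level imports (math, numpy, the db session and models) and on helpers such as convert_to_array and cosine_similarity from the surrounding code. A minimal sketch of those two helpers, assuming vectors are stored as space-separated strings of floats (an assumption about the storage format, not the project's actual serialisation):

import numpy as np

def convert_to_array(vector_string):
    # Assumed storage format: space-separated floats in a single string
    return np.array([float(x) for x in vector_string.split()])

def cosine_similarity(v1, v2):
    # Standard cosine: dot product divided by the product of the norms
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0:
        return 0.0
    return float(np.dot(v1, v2) / denom)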
Example #2
def score(query, query_dist, query_freqs):
    """ Get distributional and term-based scores for unregistered pods """
    DS_scores = {}
    term_scores = {}
    for p in db.session.query(Pods).filter_by(registered=False).all():
        DS_scores[p.url] = cosine_similarity(convert_to_array(p.DS_vector), query_dist)
        # term_cosine.run also returns a coverage value, which is not used here
        term_scores[p.url], _ = term_cosine.run(query, query_freqs, p.word_vector)
    return DS_scores, term_scores
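The caller is left to combine the two returned dictionaries. A possible combination step, loosely following the additive scoring used in score_pods above but without the coverage term, since it is not returned here (rank_pods is a hypothetical name, not part of the example):

def rank_pods(DS_scores, term_scores):
    # Sum the per-pod scores and sort best-first
    combined = {url: DS_scores[url] + term_scores[url] for url in DS_scores}
    return sorted(combined, key=combined.get, reverse=True)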
Example #3
def sim_to_matrix_url(target_url, n):
    '''Return the n nearest neighbours of target_url, by cosine over document vectors'''
    cosines = {}
    target = convert_to_array(get_db_url_vector(target_url))
    for u in Urls.query.all():
        cos = cosine_similarity(target, convert_to_array(u.vector))
        cosines[u.url] = cos
    c = 0
    neighbours = []
    for url in sorted(cosines, key=cosines.get, reverse=True):
        if c < n:
            # Fetch the record once instead of issuing three identical queries
            record = Urls.query.filter(Urls.url == url).first()
            neighbours.append([record.url, record.title, record.snippet])
            c += 1
        else:
            break
    return neighbours, len(cosines)
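A hypothetical call, assuming a populated Urls table and an active application context; the URL and the value of n are placeholders:

# Hypothetical usage: five nearest neighbours of a stored page
neighbours, total = sim_to_matrix_url('https://example.org/some-page', 5)
for url, title, snippet in neighbours:
    print(url, '-', title)
print(total, 'documents compared')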
Example #4
def mk_matrix_from_db():
    print("Making URL matrix from database...")
    urls = []
    DS_M = []
    url_to_mat = {}
    mat_to_url = {}
    try:
        urls = Urls.query.all()
        print("Found", len(urls), "records...")
    except Exception:
        print("Database empty")
    if len(urls) > 0:
        c = 0
        DS_M = convert_to_array(urls[0].vector).reshape(1, 400)
        url_to_mat[urls[0].url] = c
        mat_to_url[c] = urls[0].url
        c += 1
        for u in urls[1:]:
            # Stack each remaining 400-dimensional vector as a new row
            DS_M = np.vstack((DS_M, convert_to_array(u.vector).reshape(1, 400)))
            url_to_mat[u.url] = c
            mat_to_url[c] = u.url
            c += 1
    return DS_M, url_to_mat, mat_to_url
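Calling np.vstack inside the loop copies the growing matrix on every iteration. A sketch of an equivalent construction that stacks once, assuming the same convert_to_array helper and 400-dimensional vectors (mk_matrix_from_records is a hypothetical name; it takes already-fetched records rather than querying the database itself):

import numpy as np

def mk_matrix_from_records(urls):
    # Collect all vectors first, then stack them in a single call
    vectors = [convert_to_array(u.vector).reshape(1, 400) for u in urls]
    DS_M = np.vstack(vectors) if vectors else np.empty((0, 400))
    url_to_mat = {u.url: i for i, u in enumerate(urls)}
    mat_to_url = {i: u.url for i, u in enumerate(urls)}
    return DS_M, url_to_mat, mat_to_url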
Example #5
def score(query, query_dist, query_freqs, pod):
    """ Get various scores -- This is slow, slow, slow. Add code for vec to matrix calculations """
    DS_scores = {}
    URL_scores = {}
    title_scores = {}
    term_scores = {}
    coverages = {}
    # cosines = cosine_to_matrix(query_dist, DS_M)  # vector-to-matrix cosine calculation -- work in progress
    for u in db.session.query(Urls).filter_by(pod=pod).all():
        DS_scores[u.url] = cosine_similarity(convert_to_array(u.vector),
                                             query_dist)
        #DS_scores[u.url] = cosines[url_to_mat[u.url]]
        URL_scores[u.url] = score_url_overlap(query, u.url)
        title_scores[u.url] = generic_overlap(query, u.title)
        term_scores[u.url], coverages[u.url] = term_cosine.run(
            query, query_freqs, u.freqs)
    return DS_scores, URL_scores, title_scores, term_scores, coverages
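The commented-out lines refer to a cosine_to_matrix helper that the in-code comment flags as work in progress. A sketch of what such a vector-to-matrix cosine could look like, assuming DS_M holds one document vector per row (this implementation is an illustration, not the project's own code):

import numpy as np

def cosine_to_matrix(query_dist, DS_M):
    # Cosine of the query vector against every row of the matrix at once
    norms = np.linalg.norm(DS_M, axis=1) * np.linalg.norm(query_dist)
    norms[norms == 0] = 1e-10  # guard against division by zero
    return DS_M.dot(query_dist) / norms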
Example #6
def compute_pod_summary(name):
    '''This function is very similar to 'self' in PeARS-pod'''
    DS_vector = np.zeros(400)
    word_vector = ""
    freqs = {}
    for u in db.session.query(Urls).filter_by(pod=name).all():
        DS_vector += convert_to_array(u.vector)
        for k, v in convert_string_to_dict(u.freqs).items():
            if k in freqs:
                freqs[k] += int(v)
            else:
                freqs[k] = int(v)
    DS_vector = convert_to_string(normalise(DS_vector))
    c = 0
    # Keep only the 300 most frequent terms for the pod's word vector
    for w in sorted(freqs, key=freqs.get, reverse=True):
        word_vector += w + ':' + str(freqs[w]) + ' '
        c += 1
        if c == 300:
            break
    return DS_vector, word_vector
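compute_pod_summary also depends on convert_string_to_dict, convert_to_string and normalise. A minimal sketch of those helpers, assuming frequencies are serialised as space-separated word:count pairs (the same format the word_vector string above is built in) and vectors as space-separated floats; both formats are assumptions:

import numpy as np

def convert_string_to_dict(freq_string):
    # Assumed format: 'word:count word:count ...'
    pairs = (item.rsplit(':', 1) for item in freq_string.split())
    return {word: count for word, count in pairs}

def convert_to_string(vector):
    # Serialise a numpy vector to a space-separated string
    return ' '.join(str(x) for x in vector)

def normalise(vector):
    # L2-normalise; leave an all-zero vector unchanged
    norm = np.linalg.norm(vector)
    return vector / norm if norm > 0 else vector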