Exemplo n.º 1
0
def score_pods(query, query_dist, query_freqs):
    """Choose which pod(s) a query should be searched in.

    Every registered pod is scored as
    ``DS_score + term_score + 2 * coverage``, where ``DS_score`` is the
    cosine similarity between the pod's ``DS_vector`` and ``query_dist``,
    and ``term_score`` / ``coverage`` are returned by ``term_cosine.run``
    for the pod's ``word_vector``. A NaN score is replaced by 0.

    Args:
        query: Query forwarded to ``term_cosine.run``.
        query_dist: Query vector compared against each pod's ``DS_vector``.
        query_freqs: Query term frequencies forwarded to ``term_cosine.run``.

    Returns:
        A list of pod names. When the scores of all pods sum to less than
        1, every scored pod is returned; otherwise the list holds only the
        single best-scoring pod.
    """
    pod_scores = {}
    score_sum = 0.0
    for p in db.session.query(Pods).filter_by(registered=True).all():
        DS_score = cosine_similarity(convert_to_array(p.DS_vector), query_dist)
        term_score, coverage = term_cosine.run(query, query_freqs,
                                               p.word_vector)
        score = DS_score + term_score + 2 * coverage
        # A NaN component would poison both the ranking and the running sum.
        if math.isnan(score):
            score = 0
        pod_scores[p.name] = score
        score_sum += score
    # Diagnostic output of the per-pod scores, kept from the original.
    print(pod_scores)

    # If all scores are rubbish, search the entire pod collection
    # (we're desperate!).
    if score_sum < 1:
        return list(pod_scores)

    # Only the top pod is wanted, so a full sort is unnecessary. max()
    # returns the first maximal key in insertion order, which is the same
    # key a stable descending sort would put first.
    return [max(pod_scores, key=pod_scores.get)]
Exemplo n.º 2
0
def score(query, query_dist, query_freqs):
    """Score every unregistered pod against a query.

    Args:
        query: Query forwarded to ``term_cosine.run``.
        query_dist: Query vector compared against each pod's ``DS_vector``.
        query_freqs: Query term frequencies forwarded to ``term_cosine.run``.

    Returns:
        A ``(DS_scores, term_scores)`` tuple of dicts keyed by pod URL:
        ``DS_scores`` holds the cosine similarity between the pod's
        ``DS_vector`` and ``query_dist``; ``term_scores`` holds the first
        value returned by ``term_cosine.run`` for the pod's ``word_vector``.
    """
    DS_scores = {}
    term_scores = {}
    for p in db.session.query(Pods).filter_by(registered=False).all():
        DS_scores[p.url] = cosine_similarity(convert_to_array(p.DS_vector), query_dist)
        # term_cosine.run also yields a coverage value, but it is not part
        # of this function's return value, so it is discarded here instead
        # of being collected into a dict nobody reads.
        term_scores[p.url], _ = term_cosine.run(query, query_freqs, p.word_vector)
    return DS_scores, term_scores
Exemplo n.º 3
0
def run(q, d1, d2_s):
    """Compare two term dictionaries and measure query coverage.

    Args:
        q: Query string; the number of whitespace-separated tokens in it is
            the denominator of the coverage value.
        d1: First term dictionary (presumably the query's term
            frequencies -- confirm against callers).
        d2_s: String form of the second term dictionary, parsed with
            ``convert_string_to_dict``.

    Returns:
        A ``(cosine, coverage)`` tuple. ``cosine`` is the cosine similarity
        of the two normalised vectors built over the dimensions given by
        ``return_keys(d1, d2)``. ``coverage`` is the sum of the element-wise
        product of the binarised vectors divided by the number of tokens in
        ``q``, or 0.0 when ``q`` contains no tokens.
    """
    d2 = convert_string_to_dict(d2_s)

    dimensions = return_keys(d1, d2)
    v1 = mk_vector(d1, dimensions)
    v2 = mk_vector(d2, dimensions)
    v1_bin = binarise(v1)
    v2_bin = binarise(v2)

    # An empty or whitespace-only query has no tokens; dividing by zero
    # would raise or produce inf/NaN, so report zero coverage instead.
    n_tokens = len(q.split())
    coverage = sum(v1_bin * v2_bin) / n_tokens if n_tokens else 0.0

    d1_vec = normalise(v1)
    d2_vec = normalise(v2)

    return cosine_similarity(d1_vec, d2_vec), coverage
Exemplo n.º 4
0
def score(query, query_dist, query_freqs, pod):
    """Compute the per-URL scores for every URL stored under ``pod``.

    Note from the original author: this is slow; a vector-to-matrix cosine
    computation (``cosine_to_matrix``) was sketched as a replacement for
    the per-row cosine but is still work in progress.

    Args:
        query: Query used for the URL/title overlap scores and forwarded
            to ``term_cosine.run``.
        query_dist: Query vector compared against each row's ``vector``.
        query_freqs: Query term frequencies forwarded to ``term_cosine.run``.
        pod: Value the ``Urls.pod`` column is filtered on.

    Returns:
        A ``(DS_scores, URL_scores, title_scores, term_scores, coverages)``
        tuple of dicts, each keyed by URL.
    """
    DS_scores, URL_scores, title_scores = {}, {}, {}
    term_scores, coverages = {}, {}

    entries = db.session.query(Urls).filter_by(pod=pod).all()
    for entry in entries:
        key = entry.url
        row_vector = convert_to_array(entry.vector)
        DS_scores[key] = cosine_similarity(row_vector, query_dist)
        URL_scores[key] = score_url_overlap(query, key)
        title_scores[key] = generic_overlap(query, entry.title)
        term_result = term_cosine.run(query, query_freqs, entry.freqs)
        term_scores[key], coverages[key] = term_result

    return DS_scores, URL_scores, title_scores, term_scores, coverages
Exemplo n.º 5
0
def sim_to_matrix_url(target_url, n):
    """Return the ``n`` stored URLs most similar to ``target_url``.

    The vector obtained from ``get_db_url_vector(target_url)`` is compared,
    by cosine similarity, with the ``vector`` of every row in ``Urls``.

    Args:
        target_url: URL whose stored vector is the comparison target.
        n: Maximum number of neighbours to return.

    Returns:
        A ``(neighbours, total)`` tuple. ``neighbours`` is a list of
        ``[url, title, snippet]`` lists in descending order of cosine
        similarity, at most ``n`` long. ``total`` is the number of distinct
        URLs that were scored.
    """
    target = convert_to_array(get_db_url_vector(target_url))

    cosines = {}
    rows = {}
    for u in Urls.query.all():
        cosines[u.url] = cosine_similarity(target, convert_to_array(u.vector))
        # Keep the row that was already loaded so the ranking step below
        # does not have to query the database again for each neighbour.
        # The first row seen per URL is kept, mirroring the `.first()`
        # lookup this replaces (assumes the same row order -- TODO confirm
        # if duplicate URLs can exist).
        rows.setdefault(u.url, u)

    neighbours = []
    ranked = sorted(cosines, key=cosines.get, reverse=True)
    for rank, url in enumerate(ranked):
        if rank >= n:
            break
        row = rows[url]
        neighbours.append([row.url, row.title, row.snippet])
    return neighbours, len(cosines)