Exemplo n.º 1
0
def descAboutmeSimilarity(cur, sims):
    g_users = loadGithubProjectDescription(cur, "labeled_data_test") 
    ### Load user info of Stack Overflow
    cur.execute('''
		select distinct l.s_id, u.about_me
		from so_users u, labeled_data_test l
		where u.about_me != '' and u.id = l.s_id
	''')
    s_users = {}
    for c in cur.fetchall():
        s_users[c[0]] = c[1]

    if len(g_users) == 0 or len(s_users) == 0:
        return
    distances, g_key_indices, s_key_indices = tfidfSimilarities(g_users, s_users)

    cur.execute('''
		select distinct l.g_id, l.s_id
		from user_project_description g, labeled_data_test l, so_users s
		where g.description != '' and g.user_id = l.g_id
			and s.about_me != '' and s.id = l.s_id
	''')
    for p in cur.fetchall():
        g_ind = g_key_indices.get(p[0])
        s_ind = s_key_indices.get(p[1])

        if g_ind is not None and s_ind is not None:
            distance = distances[g_ind][s_ind]
        else:
            continue
        sims['desc_aboutme'][(p[0], p[1])] = 1 - distance

    cur.close()
Exemplo n.º 2
0
def descPTagsSimilarity(cur, sims):
    g_keys, g_values = loadGithubProjectDescription(cur, "labeled_data_test") 

    ### Load user info of Stack Overflow
    cur.execute('''
		select distinct l.s_id, u.tags
		from so_posts u, labeled_data_test l
		where u.tags != '' and u.owner_user_id = l.s_id
	''')
    s_users = {}
    for c in cur.fetchall():
        if c[0] in s_users:
            s_users[c[0]] += " " + c[1]
        else:
            s_users[c[0]] = c[1]
    s_keys = s_users.keys()

    if len(g_values) == 0 or len(s_keys) == 0:
        return
    sims_tmp = tfidfSimilarities(g_values, s_users.values())

    cur.execute('''
		select distinct l.g_id, l.s_id
		from user_project_description g, labeled_data_test l, so_posts s
		where g.description != '' and g.user_id = l.g_id
			and s.tags != '' and s.owner_user_id = l.s_id
	''')
    for p in cur.fetchall():
        sims['desc_ptags'][(p[0], p[1])] = 1 - sims_tmp[g_keys.index(p[0])][s_keys.index(p[1])]
    cur.close()
Exemplo n.º 3
0
def generateDescCommentSimilarity(cfg, redoSimilarity=False):
    print("\n===========\nRUNNING generateDescCommentSimilarity()\n===========\n")
    con, cur = getDbConnection(cfg)
    
    # check if done before
    if redoSimilarity:
        cur.execute('delete from similarities_among_desc_comment')
        con.commit()
    else:
        cur.execute('select g_id from similarities_among_desc_comment limit 1')
        existing = [r[0] for r in cur.fetchall()]
        if len(existing) > 0:
            print("similarities_among_desc_comment has already been generated")
            return

    print("created table similarities_among_desc_comment")

    ### Load user info of GitHub
    g_users = loadGithubProjectDescription(cur, "labeled_data")

    ### Load user info of Stack Overflow
    cur.execute('''
		select distinct l.s_id, u.text
		from so_comments u, labeled_data l
		where u.text != '' and u.user_id = l.s_id
	''')
    s_users = {}
    for c in cur.fetchall():
        if c[0] in s_users:
            s_users[c[0]] += " " + c[1]
        else:
            s_users[c[0]] = c[1]

    ### TF-IDF computation
    distances, g_key_indices, s_key_indices = tfidfSimilarities(g_users, s_users)

    print("shape - distances: {}.\n".format(distances.shape))

    ### store similarities
    cur.execute('''
		select distinct l.g_id, l.s_id
		from user_project_description g, labeled_data l, so_comments s
		where g.description != '' and g.user_id = l.g_id
			and s.text != '' and s.user_id = l.s_id
	''')
    good = 0
    bad = 0
    for p in cur.fetchall():
        g_ind = g_key_indices.get(p[0])
        s_ind = s_key_indices.get(p[1])

        if g_ind is not None and s_ind is not None:
            distance = distances[g_ind][s_ind]
            # print("\t1-similarity_val: {}".format(1 - distance))
            good += 1
        else:
            # print("p[0]: {}, p[1]: {}".format(p[0], p[1]))
            # print("\tg_ind: {}, s_ind: {}".format(g_ind, s_ind))
            bad += 1
            continue

        cur.execute('''
			insert into similarities_among_desc_comment
			values (%s, %s, %s)
		''', (p[0], p[1], 1 - distance))

    print("Close connection")
    con.commit()
    cur.close()
    con.close()
    print("\nAll done. #good ones: {}, #bad ones: {}".format(good, bad))
    print("=======End generateDescCommentSimilarity()=======")