def clf(filename="clf_1558_origin.txt"):
    """Estimate formation probabilities, run random walks, and save predictions.

    Steps:
      1. Load the sample data with ``getSampleData()`` and call
         ``getFormProb`` once per fold of a shuffled 5-fold split.
      2. Collect ``(inst[1], inst[2], inst[-1])`` for every instance whose
         last field is non-zero and build the walk matrix with ``getMatrix``.
      3. For every id in ``gids``, start ``randomWalk`` from a one-hot vector
         on that node and record ``[gid, tid, 1]`` for the best-scoring entry
         of ``p_final[len(gids):]``.
      4. Write the collected predictions under ``"../prediction/"``.

    Args:
        filename: Name of the output file inside ``"../prediction/"``.
    """
    c = 0.1        # passed to randomWalk
    alpha = 0.6    # passed to getMatrix

    # 1. Build formation probability (for social and anchor).
    data = getSampleData()
    kf = cv.KFold(n=len(data), n_folds=5, shuffle=True)
    for train_index, test_index in kf:
        # Return value is not used here; presumably getFormProb updates
        # `data` in place -- TODO confirm.
        getFormProb(train_index, test_index, data)

    links_anchor = [
        (inst[1], inst[2], inst[-1]) for inst in data if inst[-1] != 0
    ]

    # 2. Use formation probability to random walk,
    #    alpha s=0.6, alpha a=0.6, c = 0.1
    matrix, nodes, gids, tids = getMatrix(links_anchor, alpha)
    print("matrix over")

    preds = []
    for gid in gids:
        print(gid)
        # One-hot start distribution on the current node.
        p = np.zeros(len(nodes))
        p[nodes.index(gid)] = 1
        p_final = randomWalk(matrix, p, c)
        # NOTE(review): argmax() is relative to the slice that starts at
        # len(gids), but the result indexes `nodes` without that offset.
        # If `nodes` is laid out as gids followed by tids this probably
        # wants `tids[...]` or `nodes[len(gids) + ...]` -- confirm against
        # getMatrix before changing.
        tid = nodes[p_final[len(gids):].argmax()]
        preds.append([gid, tid, 1])

    # The original call omitted the rows to write, so `preds` was computed
    # and then dropped; the other callers of this helper pass
    # (directory, file name, rows).
    ut.writeList2CommaLine("../prediction/", filename, preds)
def statNameScore():
    """Score name similarity for each strict ground-truth pair and save it.

    For every row of the strict ground-truth file, the Twitter name is looked
    up by Twitter id, both profile JSON files are loaded, and
    ``ft.calNameScore`` / ``ft.calDisplayNameScore`` are evaluated. One row
    ``[googleId, twitterId, nameScore, displaynameScore, totalScore]`` (scores
    converted with ``str``) is written per pair to ``"name_score"`` under
    ``interPath``.
    """
    # The loose list and the name->id map are loaded but not used below.
    gtsLoose = ut.readCommaLine2List(interPath, gtLooseFileName)
    strictPairs = ut.readCommaLine2List(interPath, gtStrictFileName)
    idToName = ut.readJson2Dict(interPath, twitterIdNameFileName)
    twitterNameId = ut.readJson2Dict(interPath, twitterNameIdFileName)

    rows = []
    for pair in strictPairs:
        gid, tid = pair[0], pair[1]
        screenName = idToName[tid]
        print(gid)
        print(screenName)

        gProfile = ut.readJson2Dict(interPath + "google/profile/", gid)
        tProfile = ut.readJson2Dict(interPath + "twitter/profile/", screenName)

        byName = ft.calNameScore(gProfile, tProfile)
        byDisplayName = ft.calDisplayNameScore(gProfile, tProfile)
        combined = byName + byDisplayName

        rows.append(
            [gid, tid] + [str(score) for score in (byName, byDisplayName, combined)]
        )

    ut.writeList2CommaLine(interPath, "name_score", rows)
def output_topics():
    """Write one label-to-title mapping CSV per topic category.

    Topics come from ``read_topics()``. Within a category, the i-th
    sub-category is labelled with the i-th uppercase ASCII letter and its
    j-th topic (1-based) with that letter followed by j, e.g. ``A1``. Each
    output row holds the label and the topic title, both wrapped in double
    quotes, and is written to ``mapping_<category>.csv`` under
    ``output_path``.
    """
    letters = string.ascii_uppercase
    all_topics = read_topics()

    for category in ("information management", "marketing", "transportation", "om&or"):
        rows = []
        for group_idx, group in enumerate(all_topics[category]):
            rows.extend(
                ['"' + letters[group_idx] + str(number) + '"',
                 '"' + entry["title"] + '"']
                for number, entry in enumerate(group["topics"], 1)
            )
        ut.writeList2CommaLine(output_path, "mapping_" + category + ".csv", rows)
def getGroundTruth():
    """Build the Google-id -> Twitter-id ground truth from the raw mapping file.

    Each row of the mapping file supplies a Google id (column 0) and a
    Twitter URL (column 1). The Twitter name is the last "/"-separated
    segment of the URL, or the one before it when the last segment is empty.
    Rows whose name is empty, ``"#%21"``, ``"twitter"`` or contains
    ``"twitter.com"`` are skipped. For the remaining rows the profile file
    ``inputPath + "twitter/profile/" + name`` is parsed as JSON and its
    ``"id_str"`` value is taken as the Twitter id; rows whose profile is
    missing, unreadable, not valid JSON, or lacks ``"id_str"`` are skipped.

    Writes the ``[googleId, twitterId]`` pairs to ``"gt"`` and the
    name->id / id->name dictionaries to their JSON files under ``interPath``.
    """
    mapping = ut.readCommaLine2List(inputPath, mappingFileName)
    mappingId = []
    twitterNameId = {}
    twitterIdName = {}

    for m in mapping:
        googleId = m[0]
        twitterUrl = m[1]

        segments = twitterUrl.split("/")
        twitterName = segments[-1].strip()
        # A trailing "/" leaves the last segment empty; fall back to the
        # previous one, but only if it exists (a URL without any "/" used
        # to raise IndexError here).
        if twitterName == "" and len(segments) > 1:
            twitterName = segments[-2]
        if (twitterName == "" or twitterName == "#%21"
                or "twitter.com" in twitterName or twitterName == "twitter"):
            continue

        # NOTE: a check that the Google+ profile's objectType is "person"
        # was sketched here at one point but is not performed.

        # Check that the Twitter profile exists and carries an id.
        location = inputPath + "twitter/profile/" + twitterName
        try:
            with open(location, "r") as fi:
                jresult = json.loads(fi.read())
        except (IOError, OSError, ValueError):
            # Missing/unreadable file or malformed JSON: best-effort skip.
            continue
        if not isinstance(jresult, dict):
            continue

        twitterId = jresult.get("id_str", 0)
        if twitterId != 0:
            mappingId.append([googleId, twitterId])
            twitterNameId[twitterName] = twitterId
            twitterIdName[twitterId] = twitterName

    ut.writeList2CommaLine(interPath, "gt", mappingId)
    ut.writeDict2Json(interPath, twitterNameIdFileName, twitterNameId)
    ut.writeDict2Json(interPath, twitterIdNameFileName, twitterIdName)
def _twitter_name_from_url(twitterUrl):
    """Return the Twitter name found in *twitterUrl*, or "" if none is usable."""
    segments = twitterUrl.split("/")
    name = segments[-1].strip()
    if name == "" and len(segments) >= 2:
        # URL ended with "/": use the segment before it. The length check
        # avoids the IndexError a slash-free (e.g. empty) URL used to cause.
        name = segments[-2]
    if name == "#%21" or name == "twitter" or "twitter.com" in name:
        return ""
    return name


def _load_twitter_id(twitterName):
    """Return "id_str" from the stored profile of *twitterName*, or 0 if unavailable."""
    location = inputPath + "twitter/profile/" + twitterName
    try:
        with open(location, "r") as fi:
            profile = json.loads(fi.read())
    except (IOError, OSError, ValueError):
        # Missing/unreadable file or malformed JSON.
        return 0
    if not isinstance(profile, dict):
        return 0
    return profile.get("id_str", 0)


def getGroundTruth():
    """Build the Google-id -> Twitter-id ground truth from the raw mapping file.

    Column 0 of each mapping row is used as the Google id and column 1 as a
    Twitter URL. The Twitter name is extracted from the URL, then the
    profile file under ``inputPath + "twitter/profile/"`` is read to obtain
    the ``"id_str"`` value. Rows without a usable name or without a readable
    profile containing ``"id_str"`` are skipped.

    Writes the ``[googleId, twitterId]`` pairs to ``"gt"`` and the
    name->id / id->name dictionaries to their JSON files under ``interPath``.
    """
    mapping = ut.readCommaLine2List(inputPath, mappingFileName)
    mappingId = []
    twitterNameId = {}
    twitterIdName = {}

    for m in mapping:
        googleId = m[0]
        twitterName = _twitter_name_from_url(m[1])
        if not twitterName:
            continue

        twitterId = _load_twitter_id(twitterName)
        if twitterId == 0:
            continue

        mappingId.append([googleId, twitterId])
        twitterNameId[twitterName] = twitterId
        twitterIdName[twitterId] = twitterName

    ut.writeList2CommaLine(interPath, "gt", mappingId)
    ut.writeDict2Json(interPath, twitterNameIdFileName, twitterNameId)
    ut.writeDict2Json(interPath, twitterIdNameFileName, twitterIdName)
def statNameScore():
    """Compute and store name-based scores for the strict ground-truth pairs.

    Reads the strict ground-truth list and the Twitter id->name dictionary
    from ``interPath``. For each pair it loads the Google and Twitter profile
    JSON files, calls ``ft.calNameScore`` and ``ft.calDisplayNameScore``, and
    sums the two. The rows ``[googleId, twitterId, name, displayname, total]``
    (scores stringified) are written to ``"name_score"`` under ``interPath``.
    """
    # Loaded for parity with the strict list; not referenced afterwards.
    gtsLoose = ut.readCommaLine2List(interPath, gtLooseFileName)
    gts = ut.readCommaLine2List(interPath, gtStrictFileName)
    twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
    # Also loaded but not referenced afterwards.
    twitterNameId = ut.readJson2Dict(interPath, twitterNameIdFileName)

    googleProfileDir = interPath + "google/profile/"
    twitterProfileDir = interPath + "twitter/profile/"

    scored = []
    for entry in gts:
        googleId = entry[0]
        twitterId = entry[1]
        handle = twitterIdName[twitterId]
        print(googleId)
        print(handle)

        profiles = (
            ut.readJson2Dict(googleProfileDir, googleId),
            ut.readJson2Dict(twitterProfileDir, handle),
        )
        nameScore = ft.calNameScore(*profiles)
        displaynameScore = ft.calDisplayNameScore(*profiles)

        row = [googleId, twitterId]
        row.append(str(nameScore))
        row.append(str(displaynameScore))
        row.append(str(nameScore + displaynameScore))
        scored.append(row)

    ut.writeList2CommaLine(interPath, "name_score", scored)
def getGroundTruth():
    """Build loose and strict ground-truth lists from the raw mapping file.

    Column 0 of each mapping row is used as the Google id and column 1 as a
    Twitter URL, from which ``getTwitterUsername`` extracts the name; rows
    with an empty name are ignored. ``checkGoogleData`` and
    ``checkTwitterData`` report whether profile and posts data are available
    (assumed to be real booleans -- TODO confirm):

    * either profile missing -> the raw row goes to the "loss" list;
    * both profiles present  -> ``[googleId, twitterId]`` goes to the loose
      list, and also to the strict list when both posts flags are set.

    Writes the loose list, strict list, name->id / id->name dictionaries and
    the loss list to their files under ``interPath``.
    """
    mapping = ut.readCommaLine2List(inputPath, mappingFileName)
    mappingIdLoose = []
    mappingIdStrict = []
    twitterNameId = {}
    twitterIdName = {}
    mappingLoss = []

    for m in mapping:
        twitterUrl = m[1]
        twitterName = getTwitterUsername(twitterUrl)
        googleId = m[0]
        if twitterName == "":
            continue

        google_profile_ok, google_posts_ok = checkGoogleData(googleId)
        twitter_profile_ok, twitter_posts_ok, twitter_profile = checkTwitterData(twitterName)

        if not (google_profile_ok and twitter_profile_ok):
            mappingLoss.append(m)
            continue

        # NOTE(review): a profile without "id_str" yields the id 0 here and
        # is still recorded -- confirm whether such rows should be dropped.
        twitterId = twitter_profile.get("id_str", 0)

        # Every pair with both profiles is "loose"; it is additionally
        # "strict" only when posts exist on both sides.
        mappingIdLoose.append([googleId, twitterId])
        if google_posts_ok and twitter_posts_ok:
            mappingIdStrict.append([googleId, twitterId])

        twitterIdName[twitterId] = twitterName
        twitterNameId[twitterName] = twitterId

    ut.writeList2CommaLine(interPath, gtLooseFileName, mappingIdLoose)
    ut.writeList2CommaLine(interPath, gtStrictFileName, mappingIdStrict)
    ut.writeDict2Json(interPath, twitterNameIdFileName, twitterNameId)
    ut.writeDict2Json(interPath, twitterIdNameFileName, twitterIdName)
    ut.writeList2CommaLine(interPath, mappingLossFileName, mappingLoss)
def createSNMapping():
    """Split the social-network list into per-network mapping files.

    Reads ``snFile`` from ``"../data/"``. For every row, column 0 is the
    user id and columns 1, 2 and 3 are paired with it in the YouTube,
    Facebook and Twitter mappings respectively, whenever the column is not
    the empty string. The three mappings are written back to the same
    directory as ``youtubeMapping``, ``fbMapping`` and ``twitterMapping``.
    Prints the number of input rows and the number of Twitter entries.
    """
    path = "../data/"
    snLists = ut.readCommaLine2List(path, snFile)
    print(len(snLists))

    youtubeMapping = []
    fbMapping = []
    twitterMapping = []
    for snList in snLists:
        uid = snList[0]
        if snList[1] != "":
            youtubeMapping.append([uid, snList[1]])
        if snList[2] != "":
            fbMapping.append([uid, snList[2]])
        if snList[3] != "":
            twitterMapping.append([uid, snList[3]])
        # NOTE: a Google+ mapping (rows whose last column contains
        # "plus.google") was sketched here but is not produced.
    print(len(twitterMapping))

    # Write with the same directory string used for reading. The writes
    # previously passed "../data" without the trailing slash, unlike the
    # read above.
    ut.writeList2CommaLine(path, "youtubeMapping", youtubeMapping)
    ut.writeList2CommaLine(path, "fbMapping", fbMapping)
    ut.writeList2CommaLine(path, "twitterMapping", twitterMapping)
def createSNMapping():
    """Split the social-network list into per-network mapping files.

    Reads ``snFile`` from ``"../data/"``. Column 0 of each row is the user
    id; columns 1, 2 and 3 feed the ``youtubeMapping``, ``fbMapping`` and
    ``twitterMapping`` outputs, each as ``[uid, value]`` whenever the value
    is not the empty string. The outputs are written to the same directory.
    Prints the number of input rows and the number of Twitter entries.
    """
    path = "../data/"
    # (output file name, source column) in the order the files are written.
    targets = (
        ("youtubeMapping", 1),
        ("fbMapping", 2),
        ("twitterMapping", 3),
    )

    snLists = ut.readCommaLine2List(path, snFile)
    print(len(snLists))

    mappings = {name: [] for name, _ in targets}
    for snList in snLists:
        uid = snList[0]
        for name, column in targets:
            if snList[column] != "":
                mappings[name].append([uid, snList[column]])
    print(len(mappings["twitterMapping"]))

    # Use `path` (with trailing slash) for the writes as well; they used to
    # pass "../data", which differs from the directory string used to read.
    for name, _ in targets:
        ut.writeList2CommaLine(path, name, mappings[name])