Exemplo n.º 1
0
def clf(filename="clf_1558_origin.txt"):
    """Predict one target node per source id with a random walk and save it.

    Steps, as performed by this function:
      1. Load the sample data and call ``getFormProb`` once per fold of a
         5-fold shuffled split. Its return value is discarded, so it
         presumably updates ``data`` in place (confirm against getFormProb).
      2. Collect ``(inst[1], inst[2], inst[-1])`` for every instance whose
         last field is non-zero and hand these links to ``getMatrix``.
      3. For every id in ``gids``, run ``randomWalk`` from a one-hot start
         vector and pick the best-scoring node located after the first
         ``len(gids)`` positions of ``nodes``.
      4. Write the ``[gid, tid, 1]`` rows to ``../prediction/<filename>``.

    Args:
        filename: Name of the output file inside ``../prediction/``.
    """
    c = 0.1      # passed to randomWalk (presumably the restart probability)
    alpha = 0.6  # passed to getMatrix (link weighting, per the original note)

    # 1. Build formation probability (for social and anchor links).
    data = getSampleData()
    kf = cv.KFold(n=len(data), n_folds=5, shuffle=True)
    for train_index, test_index in kf:
        getFormProb(train_index, test_index, data)
    links_anchor = [
        (inst[1], inst[2], inst[-1]) for inst in data if inst[-1] != 0
    ]

    # 2. Use the formation probability in a random walk.
    matrix, nodes, gids, tids = getMatrix(links_anchor, alpha)
    print("matrix over")
    n_gids = len(gids)
    preds = []
    for gid in gids:
        print(gid)
        p = np.zeros(len(nodes))
        p[nodes.index(gid)] = 1
        p_final = randomWalk(matrix, p, c)
        # argmax() is relative to the slice that starts at n_gids, so the
        # offset has to be added back before indexing into ``nodes``.
        tid = nodes[n_gids + p_final[n_gids:].argmax()]
        preds.append([gid, tid, 1])

    # The predictions must be passed explicitly; without them nothing
    # computed above would reach the output file.
    ut.writeList2CommaLine("../prediction/", filename, preds)
Exemplo n.º 2
0
def statNameScore():
    """Score name similarity for every strict ground-truth pair and save it.

    For each ``[googleId, twitterId]`` row of the strict ground-truth file,
    both profiles are loaded, ``ft.calNameScore`` and
    ``ft.calDisplayNameScore`` are evaluated, and a row
    ``[googleId, twitterId, name, displayname, name + displayname]`` (scores
    as strings) is collected. All rows are written to ``name_score`` under
    ``interPath``.
    """
    # The loose list and the name->id map are loaded but not used below.
    loose_pairs = ut.readCommaLine2List(interPath, gtLooseFileName)
    strict_pairs = ut.readCommaLine2List(interPath, gtStrictFileName)
    id_to_name = ut.readJson2Dict(interPath, twitterIdNameFileName)
    name_to_id = ut.readJson2Dict(interPath, twitterNameIdFileName)

    rows = []
    for pair in strict_pairs:
        g_id, t_id = pair[0], pair[1]
        # Twitter profiles are stored under the screen name, not the id.
        t_name = id_to_name[t_id]
        print(g_id)
        print(t_name)
        g_profile = ut.readJson2Dict(interPath + "google/profile/", g_id)
        t_profile = ut.readJson2Dict(interPath + "twitter/profile/", t_name)
        scores = [
            ft.calNameScore(g_profile, t_profile),
            ft.calDisplayNameScore(g_profile, t_profile),
        ]
        scores.append(scores[0] + scores[1])
        rows.append([g_id, t_id] + [str(score) for score in scores])
    ut.writeList2CommaLine(interPath, "name_score", rows)
Exemplo n.º 3
0
def output_topics():
    """Write one ``mapping_<category>.csv`` file per topic category.

    Every row pairs a double-quoted code with a double-quoted topic title.
    The code is an uppercase letter chosen by the sub-category's position,
    followed by the 1-based position of the topic inside that sub-category.
    """
    categories = ("information management", "marketing", "transportation", "om&or")
    letters = string.ascii_uppercase
    all_topics = read_topics()
    for category in categories:
        rows = [
            [
                '"' + letters[group_no] + str(topic_no) + '"',
                '"' + topic["title"] + '"',
            ]
            for group_no, group in enumerate(all_topics[category])
            for topic_no, topic in enumerate(group["topics"], 1)
        ]
        ut.writeList2CommaLine(output_path, "mapping_" + category + ".csv", rows)
Exemplo n.º 4
0
def getGroundTruth():
    """Build the Google id -> Twitter id ground truth from the mapping file.

    Each mapping row is read as ``[googleId, twitterUrl, ...]``. The Twitter
    user name is the last path segment of the URL (or the one before it when
    the URL ends with "/"). Rows whose name is empty, is "#%21", is
    "twitter" or contains "twitter.com" are skipped, as are rows whose
    stored Twitter profile is missing, unreadable, not valid JSON or has no
    ``id_str``.

    Outputs written under ``interPath``:
      * ``gt``: the ``[googleId, twitterId]`` pairs,
      * ``twitterNameIdFileName``: user name -> id,
      * ``twitterIdNameFileName``: id -> user name.
    """
    mapping = ut.readCommaLine2List(inputPath, mappingFileName)
    mappingId = []
    twitterNameId = {}
    twitterIdName = {}
    for m in mapping:
        googleId = m[0]
        twitterUrl = m[1]

        segments = twitterUrl.split("/")
        twitterName = segments[-1].strip()
        if twitterName == "" and len(segments) > 1:
            # URL ended with "/": the user name is the segment before it.
            twitterName = segments[-2]
        if (twitterName == "" or twitterName == "#%21"
                or "twitter.com" in twitterName or twitterName == "twitter"):
            # Not a usable user name (empty, hash-bang fragment or bare host).
            continue

        # A profile file must exist for the name; only I/O and JSON decoding
        # failures are treated as "no profile" so other errors stay visible.
        location = inputPath + "twitter/profile/" + twitterName
        try:
            with open(location, "r") as fi:
                jresult = json.loads(fi.read())
        except (IOError, OSError, ValueError):
            continue
        if not isinstance(jresult, dict):
            continue

        twitterId = jresult.get("id_str", 0)
        if twitterId != 0:
            mappingId.append([googleId, twitterId])
            twitterNameId[twitterName] = twitterId
            twitterIdName[twitterId] = twitterName

    ut.writeList2CommaLine(interPath, "gt", mappingId)
    ut.writeDict2Json(interPath, twitterNameIdFileName, twitterNameId)
    ut.writeDict2Json(interPath, twitterIdNameFileName, twitterIdName)
Exemplo n.º 5
0
def getGroundTruth():
    """Build the Google id -> Twitter id ground truth from the mapping file.

    Each mapping row is read as ``[googleId, twitterUrl, ...]``. The Twitter
    user name is the last path segment of the URL (or the one before it when
    the URL ends with "/"). Rows whose name is empty, is "#%21", is
    "twitter" or contains "twitter.com" are skipped, as are rows whose
    stored Twitter profile is missing, unreadable, not valid JSON or has no
    ``id_str``.

    Outputs written under ``interPath``:
      * ``gt``: the ``[googleId, twitterId]`` pairs,
      * ``twitterNameIdFileName``: user name -> id,
      * ``twitterIdNameFileName``: id -> user name.
    """
    mapping = ut.readCommaLine2List(inputPath, mappingFileName)
    mappingId = []
    twitterNameId = {}
    twitterIdName = {}
    for m in mapping:
        googleId = m[0]
        twitterUrl = m[1]

        segments = twitterUrl.split("/")
        twitterName = segments[-1].strip()
        if twitterName == "" and len(segments) > 1:
            # URL ended with "/": the user name is the segment before it.
            twitterName = segments[-2]
        if (twitterName == "" or twitterName == "#%21"
                or "twitter.com" in twitterName or twitterName == "twitter"):
            # Not a usable user name (empty, hash-bang fragment or bare host).
            continue

        # A profile file must exist for the name; only I/O and JSON decoding
        # failures are treated as "no profile" so other errors stay visible.
        location = inputPath + "twitter/profile/" + twitterName
        try:
            with open(location, "r") as fi:
                jresult = json.loads(fi.read())
        except (IOError, OSError, ValueError):
            continue
        if not isinstance(jresult, dict):
            continue

        twitterId = jresult.get("id_str", 0)
        if twitterId != 0:
            mappingId.append([googleId, twitterId])
            twitterNameId[twitterName] = twitterId
            twitterIdName[twitterId] = twitterName

    ut.writeList2CommaLine(interPath, "gt", mappingId)
    ut.writeDict2Json(interPath, twitterNameIdFileName, twitterNameId)
    ut.writeDict2Json(interPath, twitterIdNameFileName, twitterIdName)
Exemplo n.º 6
0
def statNameScore():
    """Compute and store name scores for the strict ground-truth pairs.

    Each strict ground-truth row supplies a Google id and a Twitter id. The
    Twitter id is translated to its user name, both profiles are loaded, and
    the two scores from ``ft`` plus their sum are stored (as strings) next to
    the two ids. The resulting table is written to ``name_score`` under
    ``interPath``.
    """
    # Loaded for parity with the other inputs; not referenced afterwards.
    unused_loose = ut.readCommaLine2List(interPath, gtLooseFileName)
    ground_truth = ut.readCommaLine2List(interPath, gtStrictFileName)
    names_by_id = ut.readJson2Dict(interPath, twitterIdNameFileName)
    unused_ids_by_name = ut.readJson2Dict(interPath, twitterNameIdFileName)

    table = []
    for entry in ground_truth:
        gplus_id = entry[0]
        tw_id = entry[1]
        tw_name = names_by_id[tw_id]
        print(gplus_id)
        print(tw_name)
        gplus_profile = ut.readJson2Dict(interPath + "google/profile/", gplus_id)
        tw_profile = ut.readJson2Dict(interPath + "twitter/profile/", tw_name)
        name_part = ft.calNameScore(gplus_profile, tw_profile)
        display_part = ft.calDisplayNameScore(gplus_profile, tw_profile)
        line = [gplus_id, tw_id]
        line.extend(map(str, (name_part, display_part, name_part + display_part)))
        table.append(line)
    ut.writeList2CommaLine(interPath, "name_score", table)
Exemplo n.º 7
0
def getGroundTruth():
    """Split the raw mapping into loose, strict and lost ground-truth lists.

    For every mapping row ``[googleId, twitterUrl, ...]``:
      * rows whose extracted Twitter user name is empty are ignored;
      * rows where either profile check returns False go to the loss list;
      * all remaining rows go to the loose list, and additionally to the
        strict list when neither posts check returns False.

    The loose/strict pair lists, the name<->id dictionaries and the loss
    list are written under ``interPath``.
    """
    mapping = ut.readCommaLine2List(inputPath, mappingFileName)
    loose_pairs, strict_pairs, lost_rows = [], [], []
    name_to_id, id_to_name = {}, {}

    for row in mapping:
        twitter_name = getTwitterUsername(row[1])
        google_id = row[0]
        if twitter_name == "":
            continue

        google_profile_ok, google_posts_ok = checkGoogleData(google_id)
        twitter_profile_ok, twitter_posts_ok, twitter_profile = checkTwitterData(twitter_name)

        # Explicit comparison with False is kept on purpose: only a literal
        # False-equal flag counts as "missing".
        if google_profile_ok == False or twitter_profile_ok == False:
            lost_rows.append(row)
            continue

        twitter_id = twitter_profile.get("id_str", 0)
        loose_pairs.append([google_id, twitter_id])
        if not (google_posts_ok == False or twitter_posts_ok == False):
            strict_pairs.append([google_id, twitter_id])
        id_to_name[twitter_id] = twitter_name
        name_to_id[twitter_name] = twitter_id

    ut.writeList2CommaLine(interPath, gtLooseFileName, loose_pairs)
    ut.writeList2CommaLine(interPath, gtStrictFileName, strict_pairs)
    ut.writeDict2Json(interPath, twitterNameIdFileName, name_to_id)
    ut.writeDict2Json(interPath, twitterIdNameFileName, id_to_name)
    ut.writeList2CommaLine(interPath, mappingLossFileName, lost_rows)
Exemplo n.º 8
0
def getGroundTruth():
    """Classify mapping rows into loose / strict ground truth or "loss".

    A row ``[googleId, twitterUrl, ...]`` is dropped when no Twitter user
    name can be extracted from the URL. It is recorded as lost when either
    profile flag equals False. Otherwise the ``[googleId, twitterId]`` pair
    is always added to the loose list, and to the strict list too when
    neither posts flag equals False. The name/id lookup tables are filled
    for every non-lost row.

    Five files are written under ``interPath``: loose pairs, strict pairs,
    name -> id, id -> name, and the lost rows.
    """
    mapping = ut.readCommaLine2List(inputPath, mappingFileName)
    loose = []
    strict = []
    lost = []
    ids_by_name = {}
    names_by_id = {}

    for record in mapping:
        url = record[1]
        user_name = getTwitterUsername(url)
        g_id = record[0]
        if user_name == "":
            continue

        g_flags = checkGoogleData(g_id)
        t_flags = checkTwitterData(user_name)
        (g_has_profile, g_has_posts) = g_flags
        (t_has_profile, t_has_posts, t_profile) = t_flags

        profile_missing = g_has_profile == False or t_has_profile == False
        if profile_missing:
            lost.append(record)
        else:
            t_id = t_profile.get("id_str", 0)
            posts_missing = g_has_posts == False or t_has_posts == False
            loose.append([g_id, t_id])
            if not posts_missing:
                strict.append([g_id, t_id])
            names_by_id[t_id] = user_name
            ids_by_name[user_name] = t_id

    ut.writeList2CommaLine(interPath, gtLooseFileName, loose)
    ut.writeList2CommaLine(interPath, gtStrictFileName, strict)
    ut.writeDict2Json(interPath, twitterNameIdFileName, ids_by_name)
    ut.writeDict2Json(interPath, twitterIdNameFileName, names_by_id)
    ut.writeList2CommaLine(interPath, mappingLossFileName, lost)
Exemplo n.º 9
0
def createSNMapping():
    """Split the social-network list into one mapping file per network.

    Every row of ``snFile`` is read as a list whose element 0 is the user
    id. A non-empty element 1 yields a ``youtubeMapping`` row, element 2 an
    ``fbMapping`` row and element 3 a ``twitterMapping`` row, each of the
    form ``[uid, value]``. The number of input rows and the number of
    Twitter rows are printed.
    """
    path = "../data/"
    snLists = ut.readCommaLine2List(path, snFile)
    print(len(snLists))

    youtubeMapping = []
    fbMapping = []
    twitterMapping = []
    for snList in snLists:
        uid = snList[0]
        if snList[1] != "":
            youtubeMapping.append([uid, snList[1]])
        if snList[2] != "":
            fbMapping.append([uid, snList[2]])
        if snList[3] != "":
            twitterMapping.append([uid, snList[3]])
    print(len(twitterMapping))

    # Write to the same slash-terminated directory used for reading. The
    # previous "../data" (no trailing separator) was inconsistent with it
    # and would produce "../datayoutubeMapping" if the helper simply
    # concatenates directory and file name (helper not visible here).
    ut.writeList2CommaLine(path, "youtubeMapping", youtubeMapping)
    ut.writeList2CommaLine(path, "fbMapping", fbMapping)
    ut.writeList2CommaLine(path, "twitterMapping", twitterMapping)
Exemplo n.º 10
0
def createSNMapping():
    """Split the social-network list into one mapping file per network.

    Every row of ``snFile`` is read as a list whose element 0 is the user
    id. A non-empty element 1 yields a ``youtubeMapping`` row, element 2 an
    ``fbMapping`` row and element 3 a ``twitterMapping`` row, each of the
    form ``[uid, value]``. The number of input rows and the number of
    Twitter rows are printed.
    """
    path = "../data/"
    snLists = ut.readCommaLine2List(path, snFile)
    print(len(snLists))

    youtubeMapping = []
    fbMapping = []
    twitterMapping = []
    for snList in snLists:
        uid = snList[0]
        if snList[1] != "":
            youtubeMapping.append([uid, snList[1]])
        if snList[2] != "":
            fbMapping.append([uid, snList[2]])
        if snList[3] != "":
            twitterMapping.append([uid, snList[3]])
    print(len(twitterMapping))

    # Write to the same slash-terminated directory used for reading. The
    # previous "../data" (no trailing separator) was inconsistent with it
    # and would produce "../datayoutubeMapping" if the helper simply
    # concatenates directory and file name (helper not visible here).
    ut.writeList2CommaLine(path, "youtubeMapping", youtubeMapping)
    ut.writeList2CommaLine(path, "fbMapping", fbMapping)
    ut.writeList2CommaLine(path, "twitterMapping", twitterMapping)