예제 #1
0
def structUserData(sn, uid):
    """Normalize one user's profile and wall, store them, and summarize the wall.

    The raw profile and wall of ``uid`` on network ``sn`` are read from
    ``inputPath``, normalized with ``normProfile`` / ``normWall`` and written
    under ``interPath``.

    Returns:
        A tuple ``(userTf, langDistri, sentiment_score, userTopicDistri)``
        aggregated over the normalized posts.
    """
    print(uid)
    profile_dir = sn + "/profile/"
    wall_dir = sn + "/wall/"

    # Load the raw inputs (profile first, then wall).
    raw_profile = ut.readJson2Dict(inputPath + profile_dir, uid)
    raw_posts = ut.readJson2Dict(inputPath + wall_dir, uid)

    # Normalize, logging the destination of each artifact beforehand.
    print("profile:" + interPath + profile_dir + uid)
    norm_profile = normProfile(sn, raw_profile)
    print("wall:" + interPath + wall_dir + uid)
    norm_posts = normWall(sn, raw_posts)

    # Persist the normalized data.
    ut.writeDict2Json(interPath + profile_dir, uid, norm_profile)
    ut.writeDict2Json(interPath + wall_dir, uid, norm_posts)

    # Language distribution over the wall.
    langDistri = ut.getDistri([entry["lang"] for entry in norm_posts])

    # Mean sentiment polarity; 0 when the wall has no posts.
    polarities = [entry["sentiment"]["polarity"] for entry in norm_posts]
    if len(polarities) > 0:
        sentiment_score = sum(polarities) / len(polarities)
    else:
        sentiment_score = 0

    # Merge the per-post topic distributions, then normalize the result.
    merged_topics = ut.mergeDict([entry["topic_distri"] for entry in norm_posts])
    userTopicDistri = ut.normVector(merged_topics)

    # Merge the per-post term frequencies.
    userTf = ut.mergeDict([entry["tf"] for entry in norm_posts])
    return (userTf, langDistri, sentiment_score, userTopicDistri)
예제 #2
0
def getBehaviorScore(sn1, user1, sn2, user2):
	"""Load the stored wall and text data of both users and score them.

	Returns whatever ``calBehaviorScore`` yields for the four loaded inputs.
	"""
	def load(sn, kind, user):
		# Stored data lives under <interPath><network>/<kind>/.
		return ut.readJson2Dict(interPath+sn+"/"+kind+"/", user)

	wall_a = load(sn1, "wall", user1)
	wall_b = load(sn2, "wall", user2)
	text_a = load(sn1, "text", user1)
	text_b = load(sn2, "text", user2)
	return calBehaviorScore(wall_a, wall_b, text_a, text_b)
예제 #3
0
def structUserData(sn, uid):
	"""Normalize a user's profile and wall, write them out, and return wall statistics.

	Input is read from ``inputPath``; normalized output is written under
	``interPath``.

	Returns:
		``(userTf, langDistri, sentiment_score, userTopicDistri)`` computed
		from the normalized posts.
	"""
	print(uid)
	profile_sub = sn+"/profile/"
	wall_sub = sn+"/wall/"

	# Raw data: profile first, then wall.
	profile_raw = ut.readJson2Dict(inputPath+profile_sub, uid)
	posts_raw = ut.readJson2Dict(inputPath+wall_sub, uid)

	print("profile:"+interPath+profile_sub+uid)
	profile_norm = normProfile(sn, profile_raw)
	print("wall:"+interPath+wall_sub+uid)
	posts_norm = normWall(sn, posts_raw)

	ut.writeDict2Json(interPath+profile_sub, uid, profile_norm)
	ut.writeDict2Json(interPath+wall_sub, uid, posts_norm)

	# Wall statistics: language distribution.
	langDistri = ut.getDistri([item["lang"] for item in posts_norm])

	# Average polarity, falling back to 0 for an empty wall.
	scores = [item["sentiment"]["polarity"] for item in posts_norm]
	sentiment_score = 0
	if len(scores) > 0:
		sentiment_score = sum(scores)/len(scores)

	# Topic distribution: merge per-post distributions, then normalize.
	userTopicDistri = ut.normVector(ut.mergeDict([item["topic_distri"] for item in posts_norm]))

	# Term frequencies merged over all posts.
	userTf = ut.mergeDict([item["tf"] for item in posts_norm])
	return (userTf, langDistri, sentiment_score, userTopicDistri)
예제 #4
0
def statNameScore():
    """Compute name and display-name scores for every strict ground-truth pair.

    One row ``[googleId, twitterId, nameScore, displaynameScore, totalScore]``
    (scores as strings) is collected per pair and the rows are written to
    ``name_score`` under ``interPath``.
    """
    # The loose ground truth and the name->id mapping are loaded but not used
    # below; only the strict list drives the loop.
    gtsLoose = ut.readCommaLine2List(interPath, gtLooseFileName)
    gtsStrict = ut.readCommaLine2List(interPath, gtStrictFileName)
    twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
    twitterNameId = ut.readJson2Dict(interPath, twitterNameIdFileName)

    rows = []
    for pair in gtsStrict:
        googleId, twitterId = pair[0], pair[1]
        # Twitter profiles are stored by screen name rather than by id.
        twitterName = twitterIdName[twitterId]
        print(googleId)
        print(twitterName)

        googleProfile = ut.readJson2Dict(interPath + "google/profile/", googleId)
        twitterProfile = ut.readJson2Dict(interPath + "twitter/profile/", twitterName)

        nameScore = ft.calNameScore(googleProfile, twitterProfile)
        displaynameScore = ft.calDisplayNameScore(googleProfile, twitterProfile)
        row = [googleId, twitterId]
        row.extend(str(value) for value in
                   (nameScore, displaynameScore, nameScore + displaynameScore))
        rows.append(row)
    ut.writeList2CommaLine(interPath, "name_score", rows)
예제 #5
0
def getBehaviorScore(sn1, user1, sn2, user2):
    """Score two users from their stored wall and text data.

    Loads, in order, wall(user1), wall(user2), text(user1), text(user2) from
    ``interPath`` and passes them to ``calBehaviorScore``.
    """
    accounts = ((sn1, user1), (sn2, user2))
    loaded = [
        ut.readJson2Dict(interPath + sn + "/" + kind + "/", user)
        for kind in ("wall", "text")
        for sn, user in accounts
    ]
    return calBehaviorScore(*loaded)
예제 #6
0
def checkTwitterData(uname):
	"""Check whether the raw Twitter profile and wall of ``uname`` are usable.

	Returns:
		``(profile_bool, posts_bool, profile)``. The profile flag is False when
		the profile is empty or its "errors" entry is a list; the posts flag
		is False when the wall is empty.
	"""
	profile = ut.readJson2Dict(inputPath+"twitter/profile/", uname)
	posts = ut.readJson2Dict(inputPath+"twitter/wall/", uname)
	profile_bool = not (len(profile) == 0 or type(profile.get("errors", 0)) is list)
	posts_bool = len(posts) != 0
	return (profile_bool, posts_bool, profile)
예제 #7
0
def checkGoogleData(uid):
	"""Check whether the raw Google profile and wall of ``uid`` are usable.

	Returns:
		``(profile_bool, posts_bool)``. The profile flag is False when the
		profile's "status" is "error" or the profile is empty; the posts flag
		is False when the wall is a dict or is empty.
	"""
	profile = ut.readJson2Dict(inputPath+"google/profile/", uid)
	posts = ut.readJson2Dict(inputPath+"google/wall/", uid)
	profile_bool = not (profile.get("status", 0) == "error" or len(profile) == 0)
	posts_bool = not (type(posts) is dict or len(posts) == 0)
	return (profile_bool, posts_bool)
예제 #8
0
def checkTwitterData(uname):
    """Validate the raw Twitter data stored for ``uname``.

    Returns:
        ``(profile_bool, posts_bool, profile)`` where ``profile_bool`` is False
        for an empty profile or one whose "errors" entry is a list, and
        ``posts_bool`` is False for an empty wall.
    """
    base = inputPath + "twitter/"
    profile = ut.readJson2Dict(base + "profile/", uname)
    posts = ut.readJson2Dict(base + "wall/", uname)

    profile_broken = len(profile) == 0 or type(profile.get("errors", 0)) is list
    wall_empty = len(posts) == 0
    return (not profile_broken, not wall_empty, profile)
예제 #9
0
def checkGoogleData(uid):
    """Validate the raw Google data stored for ``uid``.

    Returns:
        ``(profile_bool, posts_bool)`` where ``profile_bool`` is False for a
        profile with "status" == "error" or an empty profile, and
        ``posts_bool`` is False when the wall is a dict or is empty.
    """
    base = inputPath + "google/"
    profile = ut.readJson2Dict(base + "profile/", uid)
    posts = ut.readJson2Dict(base + "wall/", uid)

    profile_broken = profile.get("status", 0) == "error" or len(profile) == 0
    wall_broken = type(posts) is dict or len(posts) == 0
    return (not profile_broken, not wall_broken)
예제 #10
0
def stat_others(input_path="../prediction/input/"):
	"""Export papers whose final label mentions relevance, one CSV per category.

	Papers from ``data_labeled.json`` are grouped by category and final label.
	For each category a file ``../problem/<category>.csv`` is written: a
	header row, then for every label the label text in the first column of
	its first paper row, followed by the quoted paper fields.

	Args:
		input_path: Directory containing ``data_labeled.json``.
	"""
	cates = ["information management","marketing", "transportation", "om&or"]
	fields = ["title","author","journal","volume","number","pages","year","month","keyword","keyword-plus","abstract"]
	data = ut.readJson2Dict(input_path, "data_labeled.json")
	paper_cate = {cate: dict() for cate in cates}
	for paper in data:
		label = paper["fields"]["label_final"].strip()
		lowered = label.lower()
		# "relevent" is matched as well; presumably a misspelling present in the data.
		if "relevant" in lowered or "relevent" in lowered:
			cate = paper["fields"]["category"]
			paper_cate[cate].setdefault(label, list()).append(paper)
	for cate in cates:
		# The context manager closes each per-category file, including on error.
		with open("../problem/"+cate+".csv", "w") as fo:
			fo.write(","+",".join(fields)+"\n")
			for cate_topic, papers in paper_cate[cate].items():
				fo.write(cate_topic)
				for paper in papers:
					for field in fields:
						fo.write(",\""+paper["fields"].get(field,"")+"\"")
					fo.write("\n")
예제 #11
0
def stat_top_label(input_path="../result/"):
	"""Write the ten most frequent label combinations per category and year.

	For every paper in ``data_final.json`` the label set is chosen by
	``phased3`` (3: union of label3 and label4, 2: label4, otherwise label3),
	expanded with ``get_combination`` and counted per category and year. The
	result goes to ``../output/stat_top_label.csv`` (big5 encoded): one row
	with the upper-cased category, then one row per year with up to ten
	combinations ordered by descending ``(count, combination)``.

	Args:
		input_path: Directory containing ``data_final.json``.
	"""
	cates = ["information management","marketing", "transportation", "om&or"]
	# The file is opened first, as before, but is now closed even on errors.
	with codecs.open("../output/stat_top_label.csv", "w", encoding="big5") as fo:
		w = csv.writer(fo)
		data = ut.readJson2Dict(input_path, "data_final.json")
		data_labeled = {cate: dict() for cate in cates}
		for paper in data:
			fields = paper["fields"]
			year = fields["year"]
			cate = fields["category"]
			if fields["phased3"]==3:
				s = set(fields["label3"].split(";")).union(set(fields["label4"].split(";")))
			elif fields["phased3"]==2:
				s = set(fields["label4"].split(";"))
			else:
				s = set(fields["label3"].split(";"))
			combinations = get_combination(s)
			year_counts = data_labeled[cate].setdefault(year, dict())
			for c in combinations:
				year_counts[c] = year_counts.get(c, 0) + 1
		for cate in cates:
			w.writerow([cate.upper()])
			for year, sets in sorted(data_labeled[cate].items()):
				topn = sorted([(count, s) for s, count in sets.items()], reverse=True)[:10]
				w.writerow([year]+[s for count, s in topn])
예제 #12
0
def stat_journal(input_path="../result/", fname="data_final.json"):
	"""Count papers per category, journal, volume and issue number.

	Writes ``../output/stat_journal.csv`` with one line
	``CATEGORY,journal,volume,number,count`` per issue. Journals, volumes and
	numbers are sorted within their parent; categories keep first-seen order.
	Commas are stripped from the issue number before counting.

	Args:
		input_path: Directory containing the input JSON.
		fname: Name of the input JSON file.
	"""
	# The file is opened first, as before, but is now closed even on errors.
	with open("../output/stat_journal.csv", "w") as fo:
		papers = ut.readJson2Dict(input_path, fname)
		print(len(papers))
		paper_cate_journal = dict()
		for p in papers:
			fields = p["fields"]
			cate = fields["category"].upper()
			journal = fields["journal"]
			vol = fields["volume"]
			no = fields["number"].replace(",","")
			# Walk/create the nested category -> journal -> volume mapping.
			issue_counts = paper_cate_journal.setdefault(cate, dict()).setdefault(journal, dict()).setdefault(vol, dict())
			issue_counts[no] = issue_counts.get(no, 0) + 1
		for cate, journals in paper_cate_journal.items():
			for j, vols in sorted(journals.items()):
				for v, nos in sorted(vols.items()):
					for no, count in sorted(nos.items()):
						fo.write(cate+","+j+","+v+","+no+","+str(count)+"\n")
예제 #13
0
def readData(users_google, users_twitter, twitterIdName):
	"""Load the stored profiles for the given Google and Twitter users.

	Args:
		users_google: Iterable of Google user ids.
		users_twitter: Iterable of Twitter user ids.
		twitterIdName: Mapping from Twitter id to the name the profile is
			stored under.

	Returns:
		``(profileGoogle, profileTwitter, wallGoogle, wallTwitter,
		textGoogle, textTwitter)``. Only the two profile dicts are filled
		(keyed by the ids passed in); wall and text loading is disabled, so
		those four dicts are returned empty.

	Raises:
		KeyError: If a Twitter id is missing from ``twitterIdName``.
	"""
	profileGoogle = dict()
	profileTwitter = dict()
	wallGoogle = dict()
	wallTwitter = dict()
	textGoogle = dict()
	textTwitter = dict()
	for user in users_google:
		profileGoogle[user] = ut.readJson2Dict(interPath+"google/profile/", user)
	for user in users_twitter:
		twitterName = twitterIdName[user]
		profileTwitter[user] = ut.readJson2Dict(interPath+"twitter/profile/", twitterName)
	# Previously placed after the return statement and therefore never executed.
	print("load file over")
	return profileGoogle, profileTwitter, wallGoogle, wallTwitter, textGoogle, textTwitter
예제 #14
0
def stat_user2(input_path="../result/"):
	"""Write the average ``time3`` / ``time4`` values per category.

	Only values strictly between 0 and 1200 enter the averages. For each
	category ``../output/stat_user2.txt`` (big5 encoded) receives the
	upper-cased category name and a line
	``<user0> <avg_time3> <user1> <avg_time4>`` using the first two users of
	that category from ``data_user1.json``.

	Args:
		input_path: Directory containing ``data_final.json``.

	Raises:
		IndexError: If a category has fewer than two users.
	"""
	cates = ["information management","marketing", "transportation", "om&or"]
	data = ut.readJson2Dict(input_path, "data_final.json")
	users = ut.readJson2Dict("../website/public/", "data_user1.json")
	data_labeled = {cate: list() for cate in cates}
	for paper in data:
		data_labeled[paper["fields"]["category"]].append(paper)
	with codecs.open("../output/stat_user2.txt", "w", encoding="big5") as fo:
		for cate in cates:
			fo.write(cate.upper()+"\n")
			papers = data_labeled[cate]
			print(len(papers))
			time3s = [paper["fields"]["time3"] for paper in papers if 0 < paper["fields"]["time3"] < 1200]
			time4s = [paper["fields"]["time4"] for paper in papers if 0 < paper["fields"]["time4"] < 1200]
			# Fall back to 0 instead of dividing by zero when nothing passes the filter.
			avg_time3 = sum(time3s)/len(time3s) if time3s else 0
			avg_time4 = sum(time4s)/len(time4s) if time4s else 0
			users_cate = [user["fields"]["name"] for user in users if user["fields"]["category"]==cate]
			fo.write(" ".join([users_cate[0], str(avg_time3), users_cate[1], str(avg_time4)])+"\n")
0
def output_edit_im():
	"""Dump ``parsed_edit_im.json`` as ``papers_im_editorial.csv`` in ``output_path``.

	Each row holds a 1-based index followed by the paper fields, every field
	wrapped in double quotes; missing fields are written as empty strings.
	Quotes inside field values are not escaped.
	"""
	data = ut.readJson2Dict("../output/", "parsed_edit_im.json")
	fields = ["title","author","journal","volume","number","pages","year","month","keyword","keyword-plus","abstract"]
	# The context manager closes the file even if a row fails to serialize.
	with open(output_path+"papers_im_editorial"+".csv", "w") as fo:
		fo.write("index,"+",".join(fields)+"\n")
		for index, paper in enumerate(data):
			fo.write(str(index+1))
			for field in fields:
				fo.write(",\""+paper.get(field,"")+"\"")
			fo.write("\n")
예제 #16
0
def output_data(input_path="../result/", output_path="../result/"):
	"""Export the topics and then the labels of every category.

	Calls ``output_topics()`` once, groups the papers of ``data_final.json``
	by category and calls ``output_label`` for each category in a fixed order.

	Args:
		input_path: Directory containing ``data_final.json``.
		output_path: Directory handed to ``output_label``.
	"""
	data = ut.readJson2Dict(input_path, "data_final.json")
	cates = ("information management", "marketing", "transportation", "om&or")
	by_category = {name: list() for name in cates}
	output_topics()
	for entry in data:
		by_category[entry["fields"]["category"]].append(entry)
	for name in cates:
		output_label(by_category[name], output_path, name)
예제 #17
0
def readData(users_google, users_twitter, twitterIdName):
    """Load the stored profiles for the given Google and Twitter users.

    Args:
        users_google: Iterable of Google user ids.
        users_twitter: Iterable of Twitter user ids.
        twitterIdName: Mapping from Twitter id to the name the profile is
            stored under.

    Returns:
        ``(profileGoogle, profileTwitter, wallGoogle, wallTwitter,
        textGoogle, textTwitter)``. Only the two profile dicts are filled
        (keyed by the ids passed in); wall and text loading is disabled, so
        those four dicts are returned empty.

    Raises:
        KeyError: If a Twitter id is missing from ``twitterIdName``.
    """
    profileGoogle = dict()
    profileTwitter = dict()
    wallGoogle = dict()
    wallTwitter = dict()
    textGoogle = dict()
    textTwitter = dict()
    for user in users_google:
        profileGoogle[user] = ut.readJson2Dict(interPath + "google/profile/",
                                               user)
    for user in users_twitter:
        twitterName = twitterIdName[user]
        profileTwitter[user] = ut.readJson2Dict(interPath + "twitter/profile/",
                                                twitterName)
    # Previously placed after the return statement and therefore never executed.
    print("load file over")
    return profileGoogle, profileTwitter, wallGoogle, wallTwitter, textGoogle, textTwitter
예제 #18
0
def statNameScore():
	"""Score name and display-name similarity for each strict ground-truth pair.

	Writes rows ``[googleId, twitterId, nameScore, displaynameScore,
	totalScore]`` (scores as strings) to ``name_score`` under ``interPath``.
	"""
	# gtsLoose and twitterNameId are read but not used further down.
	gtsLoose = ut.readCommaLine2List(interPath, gtLooseFileName)
	gtsStrict = ut.readCommaLine2List(interPath, gtStrictFileName)
	twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
	twitterNameId = ut.readJson2Dict(interPath, twitterNameIdFileName)

	rows = []
	for pair in gtsStrict:
		googleId, twitterId = pair[0], pair[1]
		# Twitter profiles are looked up by name, not by id.
		twitterName = twitterIdName[twitterId]
		print(googleId)
		print(twitterName)
		googleProfile = ut.readJson2Dict(interPath+"google/profile/", googleId)
		twitterProfile = ut.readJson2Dict(interPath+"twitter/profile/", twitterName)
		nameScore = ft.calNameScore(googleProfile, twitterProfile)
		displaynameScore = ft.calDisplayNameScore(googleProfile, twitterProfile)
		scores = (nameScore, displaynameScore, nameScore + displaynameScore)
		rows.append([googleId, twitterId] + [str(value) for value in scores])
	ut.writeList2CommaLine(interPath, "name_score", rows)
예제 #19
0
def structData():
    """Normalize all strict ground-truth user pairs and write wall statistics.

    Uses the module-level names ``sn1`` and ``sn2`` as the two network names.
    For every pair both users are processed with ``structUserData`` and the
    per-user results are collected and passed to ``writeStatWalls``. Pairs
    whose Twitter id has no entry in the id->name mapping are skipped. The
    elapsed time in seconds is printed at the end.
    """
    s = time.time()
    usersTf1 = dict()
    usersTf2 = dict()
    usersLangDistri1 = dict()
    usersLangDistri2 = dict()
    usersSentimentScore1 = dict()
    usersSentimentScore2 = dict()
    usersTopicDistri1 = dict()
    usersTopicDistri2 = dict()
    twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
    gts = ut.readCommaLine2List(interPath, gtStrictFileName)

    # Ensure every output directory exists. The previous check only looked at
    # the sn1 directory, so a missing sn2 tree was never created.
    for sn in (sn1, sn2):
        for sub in ("profile", "wall", "text"):
            os.makedirs(interPath + sn + "/" + sub, exist_ok=True)

    # norm profile and wall
    for gt in gts:
        uid1 = gt[0]
        uid2 = gt[1]
        try:
            # Twitter data is stored by name, so translate ids first.
            if sn1 == "twitter":
                uid1 = twitterIdName[uid1]
            if sn2 == "twitter":
                uid2 = twitterIdName[uid2]
        except KeyError:
            # No name known for this Twitter id: skip the pair.
            continue
        (userTf1, langDistri1, userSentimentScore1,
         userTopicDistri1) = structUserData(sn1, uid1)
        (userTf2, langDistri2, userSentimentScore2,
         userTopicDistri2) = structUserData(sn2, uid2)
        usersTf1[uid1] = userTf1
        usersTf2[uid2] = userTf2
        usersLangDistri1[uid1] = langDistri1
        usersLangDistri2[uid2] = langDistri2
        usersSentimentScore1[uid1] = userSentimentScore1
        usersSentimentScore2[uid2] = userSentimentScore2
        usersTopicDistri1[uid1] = userTopicDistri1
        usersTopicDistri2[uid2] = userTopicDistri2
    # build dictionary and idf
    writeStatWalls(usersTf1, usersTf2, usersLangDistri1, usersLangDistri2,
                   usersSentimentScore1, usersSentimentScore2,
                   usersTopicDistri1, usersTopicDistri2)
    e = time.time()
    print(e - s)
예제 #20
0
def getScores(sn1, user1, sn2, user2, g1, g2, g0):
	"""Combine the profile, social and behavior scores of a user pair.

	Profile and behavior scoring use the Twitter name in place of the id for
	the Twitter-side user; social scoring always receives the original ids.
	If ``sn2`` is "twitter" only ``user2`` is translated, otherwise if ``sn1``
	is "twitter" only ``user1`` is.

	Returns:
		The three score results combined with ``+`` in the order profile,
		social, behavior.
	"""
	# read the twitter name id mapping file
	twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
	stored1, stored2 = user1, user2
	if sn2 == "twitter":
		stored2 = twitterIdName[user2]
	elif sn1 == "twitter":
		stored1 = twitterIdName[user1]
	profile_part = getProfileScore(sn1, stored1, sn2, stored2)
	social_part = getSocialScore(sn1, user1, sn2, user2, g1, g2, g0)
	behavior_part = getBehaviorScore(sn1, stored1, sn2, stored2)
	return profile_part+social_part+behavior_part
예제 #21
0
def reviseTwitterRelationship():
    """Rewrite the Twitter relationship file with ids in place of user names.

    Each input line is split on single spaces; the first token is the user
    name and the second is written back unchanged after the user's id. Users
    without an entry in ``twitterNameId`` are printed instead of written.
    """
    # NOTE(review): `names` is never filled, so the membership test below is
    # always passed; possibly a de-duplication step was intended.
    names = list()
    twitterNameId = ut.readJson2Dict(interPath, "twitterNameId")
    target = interPath + "twitter/relationship_file_revise"
    source = interPath + "twitter/relationship_file"
    with open(target, "w") as fo, open(source, "r") as fi:
        for line in fi:
            tokens = line.split(" ")
            user = tokens[0]
            friends = tokens[1]
            if user in names or twitterNameId.get(user, 0) == 0:
                print(user)
                continue
            fo.write(twitterNameId[user] + " " + friends)
예제 #22
0
def reviseTwitterRelationship():
	"""Replace user names by ids in the Twitter relationship file.

	Reads ``twitter/relationship_file`` and writes
	``twitter/relationship_file_revise`` (both under ``interPath``). A line is
	emitted as ``<id> <second token>`` when the name has an id in
	``twitterNameId``; otherwise the name is printed.
	"""
	# NOTE(review): nothing is ever appended to `names`, so `user in names`
	# is always False; possibly a de-duplication step was intended.
	names = list()
	twitterNameId = ut.readJson2Dict(interPath, "twitterNameId")
	out_path = interPath+"twitter/relationship_file_revise"
	in_path = interPath+"twitter/relationship_file"
	with open(out_path, "w") as fo, open(in_path, "r") as fi:
		for line in fi:
			pieces = line.split(" ")
			user, friends = pieces[0], pieces[1]
			writable = user not in names and twitterNameId.get(user, 0) != 0
			if not writable:
				print(user)
				continue
			fo.write(twitterNameId[user]+" "+friends)
예제 #23
0
def structData():
	"""Normalize all strict ground-truth user pairs and write wall statistics.

	Uses the module-level names ``sn1`` and ``sn2`` as the two network names.
	For every pair both users are processed with ``structUserData`` and the
	per-user results are collected and passed to ``writeStatWalls``. Pairs
	whose Twitter id has no entry in the id->name mapping are skipped. The
	elapsed time in seconds is printed at the end.
	"""
	s = time.time()
	usersTf1 = dict()
	usersTf2 = dict()
	usersLangDistri1 = dict()
	usersLangDistri2 = dict()
	usersSentimentScore1 = dict()
	usersSentimentScore2 = dict()
	usersTopicDistri1 = dict()
	usersTopicDistri2 = dict()
	twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
	gts = ut.readCommaLine2List(interPath, gtStrictFileName)

	# Ensure every output directory exists. The previous check only looked at
	# the sn1 directory, so a missing sn2 tree was never created.
	for sn in (sn1, sn2):
		for sub in ("profile", "wall", "text"):
			os.makedirs(interPath+sn+"/"+sub, exist_ok=True)

	# norm profile and wall
	for gt in gts:
		uid1 = gt[0]
		uid2 = gt[1]
		try:
			# Twitter data is stored by name, so translate ids first.
			if sn1 == "twitter":
				uid1 = twitterIdName[uid1]
			if sn2 == "twitter":
				uid2 = twitterIdName[uid2]
		except KeyError:
			# No name known for this Twitter id: skip the pair.
			continue
		(userTf1, langDistri1, userSentimentScore1, userTopicDistri1) = structUserData(sn1, uid1)
		(userTf2, langDistri2, userSentimentScore2, userTopicDistri2) = structUserData(sn2, uid2)
		usersTf1[uid1] = userTf1
		usersTf2[uid2] = userTf2
		usersLangDistri1[uid1] = langDistri1
		usersLangDistri2[uid2] = langDistri2
		usersSentimentScore1[uid1] = userSentimentScore1
		usersSentimentScore2[uid2] = userSentimentScore2
		usersTopicDistri1[uid1] = userTopicDistri1
		usersTopicDistri2[uid2] = userTopicDistri2
	# build dictionary and idf
	writeStatWalls(usersTf1, usersTf2, usersLangDistri1, usersLangDistri2, usersSentimentScore1, usersSentimentScore2, usersTopicDistri1, usersTopicDistri2)
	e = time.time()
	print(e-s)
예제 #24
0
def stat_user(input_path="../prediction/input/"):
	"""Write per-category, per-phase timing figures for the two labelling users.

	Papers from ``data_labeled.json`` are split by category and by phase
	(``is_phased1`` takes precedence over ``is_phased2``; papers in neither
	phase are ignored). For each category ``../output/stat_user.txt`` (big5
	encoded) receives the upper-cased category name and, per phase, a line
	``Phase<i> <user0> <time1> <user1> <time2>``.

	Args:
		input_path: Directory containing ``data_labeled.json``.

	Raises:
		IndexError: If a category has fewer than two users.
	"""
	cates = ["information management","marketing", "transportation", "om&or"]
	data = ut.readJson2Dict(input_path, "data_labeled.json")
	users = ut.readJson2Dict("../website/public/", "data_user1.json")
	data_labeled = {cate: {"1": list(), "2": list()} for cate in cates}
	# four cate, two step
	for paper in data:
		if paper["fields"]["is_phased1"]:
			data_labeled[paper["fields"]["category"]]["1"].append(paper)
		elif paper["fields"]["is_phased2"]:
			data_labeled[paper["fields"]["category"]]["2"].append(paper)
	with codecs.open("../output/stat_user.txt", "w", encoding="big5") as fo:
		for cate in cates:
			fo.write(cate.upper()+"\n")
			# The user names only depend on the category, not on the phase.
			users_cate = [user["fields"]["name"] for user in users if user["fields"]["category"]==cate]
			for i in range(1,3):
				papers = data_labeled[cate][str(i)]
				count = len(papers)
				# NOTE(review): values >= 1200 are left out of the sums but still
				# counted in the divisor - confirm this is the intended average.
				# An empty phase now yields 0 instead of raising ZeroDivisionError.
				time1 = sum([paper["fields"]["time1"] for paper in papers if paper["fields"]["time1"]<1200])/count if count else 0
				time2 = sum([paper["fields"]["time2"] for paper in papers if paper["fields"]["time2"]<1200])/count if count else 0
				fo.write(" ".join(["Phase"+str(i), users_cate[0], str(time1), users_cate[1], str(time2)])+"\n")
예제 #25
0
def output_err_data():
	"""Export the papers behind two spreadsheet rows (428 and 657) to CSV.

	Matching papers have their title printed and are then handed to
	``output_paper`` together with the target file name.
	"""
	data = ut.readJson2Dict(result_path, "data_final.json")

	def collect(journal, volume, number=None):
		# Gather papers of one journal volume (and issue, when given),
		# printing each matching title as it is found.
		found = list()
		for p in data:
			if p["fields"]["journal"] != journal or p["fields"]["volume"] != volume:
				continue
			if number is not None and p["fields"]["number"] != number:
				continue
			print(p["fields"]["title"])
			found.append(p)
		return found

	# 2. excel row 428
	papers_428 = collect("JOURNAL OF MANAGEMENT INFORMATION SYSTEMS", "31", "4")
	output_paper("../output/stat_error_428.csv", papers_428)
	# 3. excel row 657
	papers_657 = collect("TRANSPORTATION RESEARCH PART C-EMERGING TECHNOLOGIES", "47")
	output_paper("../output/stat_error_657.csv", papers_657)
예제 #26
0
def getScores(sn1, user1, sn2, user2, g1, g2, g0):
    """Return the combined profile, social and behavior scores of a user pair.

    The Twitter-side user is translated from id to name for the profile and
    behavior scores (``user2`` when ``sn2`` is "twitter", else ``user1`` when
    ``sn1`` is "twitter"); the social score always gets the original ids.
    The three results are combined with ``+``.
    """
    # read the twitter name id mapping file
    twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)

    name1 = user1
    name2 = user2
    if sn2 == "twitter":
        name2 = twitterIdName[user2]
    elif sn1 == "twitter":
        name1 = twitterIdName[user1]

    scores = getProfileScore(sn1, name1, sn2, name2)
    scores = scores + getSocialScore(sn1, user1, sn2, user2, g1, g2, g0)
    scores = scores + getBehaviorScore(sn1, name1, sn2, name2)
    return scores
예제 #27
0
def stat_label2(input_path="../result/"):
	"""Print the mean Jaccard and alignment ratio between label3 and label4.

	Only papers with ``phased3 == 3`` are considered. For every category the
	upper-cased name is printed, followed by the average ``jaccard`` and
	``align_ratio(..., 3)`` of the two ";"-separated label sets. A category
	without such papers reports 0 for both values.

	Args:
		input_path: Directory containing ``data_final.json``.
	"""
	cates = ["information management","marketing", "transportation", "om&or"]
	data = ut.readJson2Dict(input_path, "data_final.json")
	data_labeled = {cate: list() for cate in cates}
	for paper in data:
		if paper["fields"]["phased3"]==3:
			data_labeled[paper["fields"]["category"]].append(paper)
	for cate in cates:
		print(cate.upper())
		jaccards = list()
		aligns = list()
		for paper in data_labeled[cate]:
			s1 = set(paper["fields"]["label3"].split(";"))
			s2 = set(paper["fields"]["label4"].split(";"))
			jaccards.append(jaccard(s1,s2))
			aligns.append(align_ratio(s1,s2,3))
		# Guard against categories with no doubly-labelled papers.
		print("Jaccards:", sum(jaccards)/len(jaccards) if jaccards else 0)
		print("Alignment Ratio:", sum(aligns)/len(aligns) if aligns else 0)
예제 #28
0
def stat_other_label(input_path="../result/"):
	"""Export, per category, the papers labelled "Others but relevant to <field>".

	The label set of a paper is chosen by ``phased3`` (3: label3 and label4
	combined, 2: label4, otherwise label3). Papers whose set contains the
	category's "Others ..." label are written with ``output_paper`` to
	``../output/stat_label_others_<category>.csv``.

	Args:
		input_path: Directory containing ``data_final.json``.
	"""
	mapping = {
		'information management': 'Others but relevant to IM',
		'marketing': 'Others but relevant to Marketing',
		'transportation': 'Others but relevant to Transportation',
		'om&or': 'Others but relevant to OM&OR',
	}
	data = ut.readJson2Dict(input_path, "data_final.json")
	data_labeled = {
		"information management": list(),
		"marketing": list(),
		"transportation": list(),
		"om&or": list(),
	}

	# read data
	for entry in data:
		info = entry["fields"]
		cate = info["category"]
		if info["phased3"] == 3:
			labels = set(info["label3"].split(";")) | set(info["label4"].split(";"))
		elif info["phased3"] == 2:
			labels = set(info["label4"].split(";"))
		else:
			labels = set(info["label3"].split(";"))
		if mapping[cate] in labels:
			data_labeled[cate].append(entry)

	# output data
	for cate, selected in data_labeled.items():
		output_paper("../output/stat_label_others_"+cate+".csv", selected)
예제 #29
0
def stat_label_distri(input_path="../result/"):
	"""Print, per category, the share of papers for each label-set size.

	The label set of a paper is chosen by ``phased3`` (3: label3 and label4
	combined, 2: label4, otherwise label3). For each category the upper-cased
	name is printed, then one line ``<size> <fraction>`` per set size in
	ascending order.

	Args:
		input_path: Directory containing ``data_final.json``.
	"""
	cates = ["information management", "marketing", "transportation", "om&or"]
	data = ut.readJson2Dict(input_path, "data_final.json")
	data_labeled = {name: list() for name in cates}

	for entry in data:
		info = entry["fields"]
		if info["phased3"] == 3:
			labels = set(info["label3"].split(";")) | set(info["label4"].split(";"))
		elif info["phased3"] == 2:
			labels = set(info["label4"].split(";"))
		else:
			labels = set(info["label3"].split(";"))
		data_labeled[info["category"]].append(labels)

	for cate in cates:
		print(cate.upper())
		label_sets = data_labeled[cate]
		size_counts = dict()
		for labels in label_sets:
			size = len(labels)
			size_counts[size] = size_counts.get(size, 0) + 1
		for size in sorted(size_counts):
			print(size, size_counts[size]/len(label_sets))
예제 #30
0
def output_im_word_data():
	"""Export information-management papers whose abstract mentions a watch word.

	Papers of other categories and papers labelled "Others but relevant to IM"
	are skipped. The remaining papers are kept when their abstract contains
	at least one of the words (case-sensitive substring match) and are written
	with ``output_paper`` to ``../output/stat_im_new_topic.csv``.
	"""
	data = ut.readJson2Dict(result_path, "data_final.json")
	words = ["Neuro IS", "NeuroIS", "Neuro-IS", "Virtual World", "Online game", "Ethics", "Open Source"]
	selected = list()
	for entry in data:
		info = entry["fields"]
		if info["category"] != "information management":
			continue
		# Label set by phase: 3 -> label3 and label4, 2 -> label4, else label3.
		if info["phased3"] == 3:
			labels = set(info["label3"].split(";")) | set(info["label4"].split(";"))
		elif info["phased3"] == 2:
			labels = set(info["label4"].split(";"))
		else:
			labels = set(info["label3"].split(";"))
		if "Others but relevant to IM" in labels:
			continue
		if any(word in info["abstract"] for word in words):
			selected.append(entry)
	output_paper("../output/stat_im_new_topic.csv", selected)
예제 #31
0
def stat_label(input_path="../prediction/input/"):
	"""Print pairwise ``kappa`` values between label1, label2 and label_final.

	Papers from ``data_labeled.json`` are split by category and phase
	(``is_phased1`` first, then ``is_phased2``; others are ignored). After a
	header line, each category prints its upper-cased name and one line per
	phase with kappa(label1, label2), kappa(label1, label_final) and
	kappa(label2, label_final).

	Args:
		input_path: Directory containing ``data_labeled.json``.
	"""
	cates = ["information management", "marketing", "transportation", "om&or"]
	data = ut.readJson2Dict(input_path, "data_labeled.json")
	data_labeled = {name: {"1": list(), "2": list()} for name in cates}
	# four cate, two step
	for entry in data:
		info = entry["fields"]
		if info["is_phased1"]:
			phase = "1"
		elif info["is_phased2"]:
			phase = "2"
		else:
			continue
		data_labeled[info["category"]][phase].append(entry)
	print(",label1 vs label2, label1 vs label_final, label_final vs label2")
	for cate in cates:
		print(cate.upper())
		for phase in ("1", "2"):
			group = data_labeled[cate][phase]
			first = [entry["fields"]["label1"] for entry in group]
			second = [entry["fields"]["label2"] for entry in group]
			final = [entry["fields"]["label_final"] for entry in group]
			print("Phase"+phase, kappa(first, second), kappa(first, final), kappa(second, final))
예제 #32
0
def getUsersFeatures(procNum=10):
    """Score all Google/Twitter user pairs and write the feature file.

    Steps: load the strict ground truth, build the graphs, write the
    popularity counts, score every index pair ``(a, b)`` with ``b >= a``
    using ``procNum`` workers running ``getScoresWorker``, and finally write
    one feature line per ``(i, j)`` combination to
    ``outputPath + featureFileName``.

    Args:
        procNum: Number of workers the pair list is split across.
    """
    # init user pair by mapping
    gts = ut.readCommaLine2List(interPath, gtStrictFileName)
    sn1 = "google"
    sn2 = "twitter"
    users_sn1 = list()
    users_sn2 = list()
    # scoresMatrix = lil_matrix((len(gts), len(gts)))
    # Maps (row index, column index) -> scores for the pairs computed below.
    scoresMatrix = dict()
    for gt in gts:
        users_sn1.append(gt[0])
        users_sn2.append(gt[1])
    # build graph
    print("build graph")
    s = time.time()
    g1, g2, g0 = buildGraphs(users_sn1, users_sn2)
    e = time.time()
    print("build graph over cost: " + str(e - s))
    # for profile using

    print("popular count")
    s = time.time()
    writeMostPopularCount(g1, sn1, users_sn1, g2, sn2, users_sn2)
    e = time.time()
    print("popular count over cost: " + str(e - s))

    print("calculate features start")
    # calculate features
    s = time.time()
    # Only pairs with b >= a are scored; the output loop reuses (j, i) for i > j.
    pairs = [(a, b) for a in range(len(gts)) for b in range(len(gts))
             if b >= a]
    twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
    # NOTE(review): twitterNameId is loaded but not used in this function.
    twitterNameId = ut.readJson2Dict(interPath, twitterNameIdFileName)

    profileGoogle, profileTwitter, wallGoogle, wallTwitter, textGoogle, textTwitter = readData(
        users_sn1, users_sn2, twitterIdName)

    #
    # for pair in pairs:
    # 	print(pair)
    # 	scores = getScores(sn1, users_sn1[pair[0]], sn2, users_sn2[pair[1]], g1, g2, g0)
    # 	scoresMatrix[(pair[0], pair[1])] = scores

    # parallel
    # Batch size per worker; the last worker takes whatever remains.
    batchNum = round(len(pairs) / procNum)
    procs = list()
    q = mp.Queue()

    for i in range(procNum):
        batchPairs = list()
        if i == procNum - 1:
            batchPairs = pairs[i * batchNum:]
        else:
            batchPairs = pairs[i * batchNum:(i + 1) * batchNum]
        # p = td.Thread(target=getScoresWorker, args=(batchPairs, sn1, users_sn1, sn2, users_sn2, g1, g2, g0, q))
        # Workers are td.Thread objects (presumably threading.Thread - confirm
        # against the imports) that report their results through q.
        p = td.Thread(target=getScoresWorker,
                      args=(batchPairs, sn1, users_sn1, sn2, users_sn2, g1, g2,
                            g0, q, profileGoogle, profileTwitter, wallGoogle,
                            wallTwitter, textGoogle, textTwitter))
        p.start()
        procs.append(p)
    print("update start")
    # Exactly one queue item is consumed per pair; each item is indexed by
    # "key" and "value". NOTE(review): if a worker delivers fewer items than
    # its batch size, q.get() presumably blocks here - confirm.
    for i in range(len(pairs)):
        print(i)
        result = q.get()
        # scoresMatrix.update(result)
        scoresMatrix[result["key"]] = result["value"]
    print("update over")
    print(len(scoresMatrix))
    for proc in procs:
        proc.join()

    # output feature
    with open(outputPath + featureFileName, "w") as fo:
        for i in range(len(gts)):
            for j in range(len(gts)):
                # rank is 1 only when both users come from the same ground-truth row.
                if i == j:
                    rank = 1
                else:
                    rank = 0
                # Scores were stored with the smaller index first.
                if i > j:
                    scores = scoresMatrix[(j, i)]
                else:
                    scores = scoresMatrix[(i, j)]
                outputStr = getFeatureStr(rank, users_sn1[i], users_sn2[j],
                                          scores)
                fo.write(outputStr)

    # with open(outputPath+featureFileName, "w") as fo:
    # 	for i in range(len(gts)):
    # 		print(users_sn1[i])
    # 		print(i)
    # 		for j in range(len(gts)):
    # 			print(j)
    # 			if i == j:
    # 				rank = 1
    # 			else:
    # 				rank = 0
    # 			scores = getScores(sn1, users_sn1[i], sn2, users_sn2[j], g1, g2, g0)
    # 			outputStr = getFeatureStr(rank, users_sn1[i], users_sn2[j], scores)
    # 			fo.write(outputStr)
    e = time.time()
    # The reported time spans feature calculation and file writing.
    print("write feature costs:" + str(e - s))
예제 #33
0
def getUsersFeatures(procNum = 10):
	"""Score all Google/Twitter user pairs and write the feature file.

	Steps: load the strict ground truth, build the graphs, write the
	popularity counts, score every index pair ``(a, b)`` with ``b >= a``
	using ``procNum`` workers running ``getScoresWorker``, and finally write
	one feature line per ``(i, j)`` combination to
	``outputPath + featureFileName``.

	Args:
		procNum: Number of workers the pair list is split across.
	"""
	# init user pair by mapping
	gts = ut.readCommaLine2List(interPath, gtStrictFileName)
	sn1 = "google"
	sn2 = "twitter"
	users_sn1 = list()
	users_sn2 = list()
	# scoresMatrix = lil_matrix((len(gts), len(gts)))
	# Maps (row index, column index) -> scores for the pairs computed below.
	scoresMatrix = dict()
	for gt in gts:
		users_sn1.append(gt[0])
		users_sn2.append(gt[1])
	# build graph
	print("build graph")
	s = time.time()
	g1, g2, g0 = buildGraphs(users_sn1, users_sn2)
	e = time.time()
	print("build graph over cost: "+str(e-s))
	# for profile using

	print("popular count")
	s = time.time()
	writeMostPopularCount(g1, sn1, users_sn1, g2, sn2, users_sn2)
	e = time.time()
	print("popular count over cost: "+str(e-s))

	print("calculate features start")
	# calculate features
	s = time.time()
	# Only pairs with b >= a are scored; the output loop reuses (j, i) for i > j.
	pairs = [(a,b) for a in range(len(gts)) for b in range(len(gts)) if b>=a]
	twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
	# NOTE(review): twitterNameId is loaded but not used in this function.
	twitterNameId = ut.readJson2Dict(interPath, twitterNameIdFileName)

	profileGoogle, profileTwitter, wallGoogle, wallTwitter, textGoogle, textTwitter = readData(users_sn1, users_sn2, twitterIdName)

	#
	# for pair in pairs:
	# 	print(pair)
	# 	scores = getScores(sn1, users_sn1[pair[0]], sn2, users_sn2[pair[1]], g1, g2, g0)
	# 	scoresMatrix[(pair[0], pair[1])] = scores

	# parallel
	# Batch size per worker; the last worker takes whatever remains.
	batchNum = round(len(pairs)/procNum)
	procs = list()
	q = mp.Queue()

	for i in range(procNum):
		batchPairs = list()
		if i == procNum-1:
			batchPairs = pairs[i*batchNum:]
		else:
			batchPairs = pairs[i*batchNum:(i+1)*batchNum]
		# p = td.Thread(target=getScoresWorker, args=(batchPairs, sn1, users_sn1, sn2, users_sn2, g1, g2, g0, q))
		# Workers are td.Thread objects (presumably threading.Thread - confirm
		# against the imports) that report their results through q.
		p = td.Thread(target=getScoresWorker, args=(batchPairs, sn1, users_sn1, sn2, users_sn2, g1, g2, g0, q, profileGoogle, profileTwitter, wallGoogle, wallTwitter, textGoogle, textTwitter))
		p.start()
		procs.append(p)
	print("update start")
	# Exactly one queue item is consumed per pair; each item is indexed by
	# "key" and "value". NOTE(review): if a worker delivers fewer items than
	# its batch size, q.get() presumably blocks here - confirm.
	for i in range(len(pairs)):
		print(i)
		result = q.get()
		# scoresMatrix.update(result)
		scoresMatrix[result["key"]] = result["value"]
	print("update over")
	print(len(scoresMatrix))
	for proc in procs:
		proc.join()

	# output feature
	with open(outputPath+featureFileName, "w") as fo:
		for i in range(len(gts)):
			for j in range(len(gts)):
				# rank is 1 only when both users come from the same ground-truth row.
				if i == j:
					rank = 1
				else:
					rank = 0
				# Scores were stored with the smaller index first.
				if i > j:
					scores = scoresMatrix[(j, i)]
				else:
					scores = scoresMatrix[(i, j)]
				outputStr = getFeatureStr(rank, users_sn1[i], users_sn2[j], scores)
				fo.write(outputStr)

	# with open(outputPath+featureFileName, "w") as fo:
	# 	for i in range(len(gts)):
	# 		print(users_sn1[i])
	# 		print(i)
	# 		for j in range(len(gts)):
	# 			print(j)
	# 			if i == j:
	# 				rank = 1
	# 			else:
	# 				rank = 0
	# 			scores = getScores(sn1, users_sn1[i], sn2, users_sn2[j], g1, g2, g0)
	# 			outputStr = getFeatureStr(rank, users_sn1[i], users_sn2[j], scores)
	# 			fo.write(outputStr)
	e = time.time()
	# The reported time spans feature calculation and file writing.
	print("write feature costs:" + str(e-s))