def structUserData(sn, uid):
    """Normalize one user's profile and wall for social network `sn`,
    persist the normalized copies under interPath, and return aggregated
    wall statistics as (userTf, langDistri, sentiment_score, userTopicDistri).
    """
    print(uid)
    # Normalize profile and wall, writing results to the intermediate tree.
    profile = ut.readJson2Dict(inputPath + sn + "/profile/", uid)
    posts = ut.readJson2Dict(inputPath + sn + "/wall/", uid)
    print("profile:" + interPath + sn + "/profile/" + uid)
    newProfile = normProfile(sn, profile)
    print("wall:" + interPath + sn + "/wall/" + uid)
    newPosts = normWall(sn, posts)
    ut.writeDict2Json(interPath + sn + "/profile/", uid, newProfile)
    ut.writeDict2Json(interPath + sn + "/wall/", uid, newPosts)
    # Language distribution over the normalized posts.
    langDistri = ut.getDistri([post["lang"] for post in newPosts])
    # Mean sentiment polarity; 0 for an empty wall.
    sentiments = [post["sentiment"]["polarity"] for post in newPosts]
    sentiment_score = sum(sentiments) / len(sentiments) if sentiments else 0
    # Merged, normalized topic distribution across all posts.
    userTopicDistri = ut.normVector(
        ut.mergeDict([post["topic_distri"] for post in newPosts]))
    # Merged per-post term frequencies.
    userTf = ut.mergeDict([post["tf"] for post in newPosts])
    return (userTf, langDistri, sentiment_score, userTopicDistri)
def getBehaviorScore(sn1, user1, sn2, user2):
    """Compute behavior-similarity scores for two users from their
    normalized walls and extracted texts."""
    wall_a = ut.readJson2Dict(interPath + sn1 + "/wall/", user1)
    wall_b = ut.readJson2Dict(interPath + sn2 + "/wall/", user2)
    txt_a = ut.readJson2Dict(interPath + sn1 + "/text/", user1)
    txt_b = ut.readJson2Dict(interPath + sn2 + "/text/", user2)
    return calBehaviorScore(wall_a, wall_b, txt_a, txt_b)
def structUserData(sn, uid):
    """Normalize a user's raw profile/wall for network `sn`, write the
    normalized data under interPath, and return
    (userTf, langDistri, sentiment_score, userTopicDistri)."""
    print(uid)
    # Load raw crawl data.
    raw_profile = ut.readJson2Dict(inputPath + sn + "/profile/", uid)
    raw_posts = ut.readJson2Dict(inputPath + sn + "/wall/", uid)
    print("profile:" + interPath + sn + "/profile/" + uid)
    norm_prof = normProfile(sn, raw_profile)
    print("wall:" + interPath + sn + "/wall/" + uid)
    norm_posts = normWall(sn, raw_posts)
    # Persist normalized copies.
    ut.writeDict2Json(interPath + sn + "/profile/", uid, norm_prof)
    ut.writeDict2Json(interPath + sn + "/wall/", uid, norm_posts)
    # Wall statistics: language distribution, mean sentiment (0 when empty),
    # merged+normalized topic distribution, merged term frequencies.
    lang_distri = ut.getDistri([p["lang"] for p in norm_posts])
    polarities = [p["sentiment"]["polarity"] for p in norm_posts]
    if len(polarities) > 0:
        sentiment_score = sum(polarities) / len(polarities)
    else:
        sentiment_score = 0
    topic_distri = ut.mergeDict([p["topic_distri"] for p in norm_posts])
    topic_distri = ut.normVector(topic_distri)
    user_tf = ut.mergeDict([p["tf"] for p in norm_posts])
    return (user_tf, lang_distri, sentiment_score, topic_distri)
def statNameScore():
    """Compute name and display-name similarity for every strict
    ground-truth (google, twitter) pair and write the scores to
    `name_score` under interPath."""
    gtsLoose = ut.readCommaLine2List(interPath, gtLooseFileName)
    gtsStrict = ut.readCommaLine2List(interPath, gtStrictFileName)
    gts = gtsStrict  # only the strict mapping is scored
    twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
    twitterNameId = ut.readJson2Dict(interPath, twitterNameIdFileName)
    results = []
    for gt in gts:
        googleId, twitterId = gt[0], gt[1]
        # Twitter profiles are stored by screen name, not numeric id.
        twitterName = twitterIdName[twitterId]
        print(googleId)
        print(twitterName)
        gProfile = ut.readJson2Dict(interPath + "google/profile/", googleId)
        tProfile = ut.readJson2Dict(interPath + "twitter/profile/", twitterName)
        nScore = ft.calNameScore(gProfile, tProfile)
        dScore = ft.calDisplayNameScore(gProfile, tProfile)
        results.append([googleId, twitterId, str(nScore), str(dScore),
                        str(nScore + dScore)])
    ut.writeList2CommaLine(interPath, "name_score", results)
def getBehaviorScore(sn1, user1, sn2, user2):
    """Return the behavior-similarity score vector for a user pair,
    built from each user's normalized wall and text files."""
    base1 = interPath + sn1
    base2 = interPath + sn2
    wall1 = ut.readJson2Dict(base1 + "/wall/", user1)
    wall2 = ut.readJson2Dict(base2 + "/wall/", user2)
    text1 = ut.readJson2Dict(base1 + "/text/", user1)
    text2 = ut.readJson2Dict(base2 + "/text/", user2)
    scoresBehavior = calBehaviorScore(wall1, wall2, text1, text2)
    return scoresBehavior
def checkTwitterData(uname):
    """Validate the raw Twitter crawl for screen name `uname`.

    Returns (profile_bool, posts_bool, profile):
      profile_bool -- False when the profile is empty or carries a Twitter
                      API "errors" list; True otherwise.
      posts_bool   -- False when the wall is empty.
      profile      -- the raw profile dict, returned for reuse by callers.

    Fix: `isinstance()` replaces the `type(x) == list` comparison and
    truthiness replaces `len(x) == 0` (PEP 8 recommendations).
    """
    profile = ut.readJson2Dict(inputPath + "twitter/profile/", uname)
    posts = ut.readJson2Dict(inputPath + "twitter/wall/", uname)
    # An "errors" list in the payload marks a failed API crawl.
    profile_bool = bool(profile) and not isinstance(profile.get("errors"), list)
    posts_bool = bool(posts)
    return (profile_bool, posts_bool, profile)
def checkGoogleData(uid):
    """Validate the raw Google crawl for user `uid`.

    Returns (profile_bool, posts_bool):
      profile_bool -- False when the profile reports status "error" or is empty.
      posts_bool   -- False when the wall is a dict (an error payload rather
                      than a post list) or is empty.

    Fix: `isinstance()` replaces the `type(x) == dict` comparison and
    truthiness replaces `len(x) == 0` (PEP 8 recommendations).
    """
    profile = ut.readJson2Dict(inputPath + "google/profile/", uid)
    posts = ut.readJson2Dict(inputPath + "google/wall/", uid)
    profile_bool = profile.get("status", 0) != "error" and bool(profile)
    posts_bool = not isinstance(posts, dict) and bool(posts)
    return (profile_bool, posts_bool)
def checkTwitterData(uname):
    """Sanity-check the crawled Twitter data for screen name `uname`.

    Returns (profile_bool, posts_bool, profile): profile_bool is False for
    an empty profile or one containing an API "errors" list; posts_bool is
    False for an empty wall; profile is the raw dict for reuse.

    Fix: replaces `type(x) == list` with `isinstance()` and `len(x) == 0`
    with truthiness, per PEP 8.
    """
    profile = ut.readJson2Dict(inputPath + "twitter/profile/", uname)
    posts = ut.readJson2Dict(inputPath + "twitter/wall/", uname)
    errors = profile.get("errors")
    profile_bool = bool(profile) and not isinstance(errors, list)
    posts_bool = bool(posts)
    return (profile_bool, posts_bool, profile)
def checkGoogleData(uid):
    """Sanity-check the crawled Google data for user `uid`.

    Returns (profile_bool, posts_bool): profile_bool is False for an
    error-status or empty profile; posts_bool is False when the wall is a
    dict error payload or empty.

    Fix: replaces `type(x) == dict` with `isinstance()` and `len(x) == 0`
    with truthiness, per PEP 8.
    """
    profile = ut.readJson2Dict(inputPath + "google/profile/", uid)
    posts = ut.readJson2Dict(inputPath + "google/wall/", uid)
    profile_bool = profile.get("status", 0) != "error" and bool(profile)
    # A healthy wall is a non-empty list; a dict is an error payload.
    posts_bool = not isinstance(posts, dict) and bool(posts)
    return (profile_bool, posts_bool)
def stat_others(input_path="../prediction/input/"):
    """Group papers whose final label mentions "relevant" (or the common
    misspelling "relevent") by category and label, and dump each category
    to ../problem/<category>.csv.

    Fix: the output files were opened without `close()`/`with`, risking
    lost buffered rows; each file is now a context manager. An unused
    `topics` list was removed.
    """
    cates = ["information management", "marketing", "transportation", "om&or"]
    data = ut.readJson2Dict(input_path, "data_labeled.json")
    paper_cate = {cate: dict() for cate in cates}
    for paper in data:
        label = paper["fields"]["label_final"].strip()
        lowered = label.lower()
        # Match both spellings seen in the labels.
        if "relevant" in lowered or "relevent" in lowered:
            cate = paper["fields"]["category"]
            paper_cate[cate].setdefault(label, list()).append(paper)
    fields = ["title", "author", "journal", "volume", "number", "pages",
              "year", "month", "keyword", "keyword-plus", "abstract"]
    for cate in cates:
        with open("../problem/" + cate + ".csv", "w") as fo:
            fo.write("," + ",".join(fields) + "\n")
            for cate_topic, papers in paper_cate[cate].items():
                # The topic label starts the first row of its group.
                fo.write(cate_topic)
                for paper in papers:
                    for field in fields:
                        # NOTE(review): naive CSV quoting — an embedded '"' or
                        # ',' in a field will break the row (original behavior).
                        fo.write(",\"" + paper["fields"].get(field, "") + "\"")
                    fo.write("\n")
def stat_top_label(input_path="../result/"):
    """Count label-combination frequencies per category and year, then write
    each year's top-10 combinations to ../output/stat_top_label.csv (big5).

    Fix: the csv writer's file handle is now managed with `with`, so it is
    closed even if an exception occurs mid-write.
    """
    cates = ["information management", "marketing", "transportation", "om&or"]
    data = ut.readJson2Dict(input_path, "data_final.json")
    data_labeled = {cate: dict() for cate in cates}
    for paper in data:
        fields = paper["fields"]
        year = fields["year"]
        cate = fields["category"]
        # phased3 == 3: both annotators labeled -> union of their label sets;
        # phased3 == 2: only label4; otherwise only label3.
        if fields["phased3"] == 3:
            s = set(fields["label3"].split(";")) | set(fields["label4"].split(";"))
        elif fields["phased3"] == 2:
            s = set(fields["label4"].split(";"))
        else:
            s = set(fields["label3"].split(";"))
        combinations = get_combination(s)
        if data_labeled[cate].get(year, 0) == 0:
            data_labeled[cate][year] = dict()
        for c in combinations:
            data_labeled[cate][year][c] = data_labeled[cate][year].get(c, 0) + 1
    with codecs.open("../output/stat_top_label.csv", "w", encoding="big5") as fo:
        w = csv.writer(fo)
        for cate in cates:
            w.writerow([cate.upper()])
            for year, sets in sorted(data_labeled[cate].items()):
                # Ten most frequent combinations for this year.
                topn = sorted(((count, s) for s, count in sets.items()),
                              reverse=True)[:10]
                w.writerow([year] + [s for count, s in topn])
def stat_journal(input_path="../result/", fname="data_final.json"):
    """Count papers per (category, journal, volume, number) and write the
    counts to ../output/stat_journal.csv.

    Fixes: removed a no-op bare `len(paper_cate_journal)` statement, and
    the output file is now managed with `with` instead of open()/close().
    """
    papers = ut.readJson2Dict(input_path, fname)
    print(len(papers))
    paper_cate_journal = dict()
    for p in papers:
        f = p["fields"]
        cate = f["category"].upper()
        journal = f["journal"]
        vol = f["volume"]
        no = f["number"].replace(",", "")  # commas would corrupt the CSV row
        node = (paper_cate_journal.setdefault(cate, dict())
                .setdefault(journal, dict())
                .setdefault(vol, dict()))
        node[no] = node.get(no, 0) + 1
    with open("../output/stat_journal.csv", "w") as fo:
        for cate, journals in paper_cate_journal.items():
            for j, vols in sorted(journals.items()):
                for v, nos in sorted(vols.items()):
                    for no, count in sorted(nos.items()):
                        fo.write(cate + "," + j + "," + v + "," + no + ","
                                 + str(count) + "\n")
def readData(users_google, users_twitter, twitterIdName):
    """Load normalized profiles for the given Google and Twitter users.

    Wall/text loading is currently disabled (commented out), so those four
    dicts are returned empty.

    Fix: the original had `print("load file over")` *after* the return
    statement — unreachable dead code; the message is now printed before
    returning.
    """
    profileGoogle = dict()
    profileTwitter = dict()
    wallGoogle = dict()
    wallTwitter = dict()
    textGoogle = dict()
    textTwitter = dict()
    for user in users_google:
        profileGoogle[user] = ut.readJson2Dict(interPath + "google/profile/", user)
        # wallGoogle[user] = ut.readJson2Dict(interPath+"google/wall/", user)
        # textGoogle[user] = ut.readJson2Dict(interPath+"google/text/", user)
    for user in users_twitter:
        # Twitter data is stored by screen name, not numeric id.
        twitterName = twitterIdName[user]
        profileTwitter[user] = ut.readJson2Dict(interPath + "twitter/profile/", twitterName)
        # wallTwitter[user] = ut.readJson2Dict(interPath+"twitter/wall/", twitterName)
        # textTwitter[user] = ut.readJson2Dict(interPath+"twitter/text/", twitterName)
    print("load file over")
    return (profileGoogle, profileTwitter, wallGoogle, wallTwitter,
            textGoogle, textTwitter)
def stat_user2(input_path="../result/"):
    """Write per-category average labeling times (time3/time4) for the two
    annotators to ../output/stat_user2.txt (big5).

    Fix: guards against ZeroDivisionError when no paper in a category has a
    time inside the (0, 1200) window.
    """
    cates = ["information management", "marketing", "transportation", "om&or"]
    data = ut.readJson2Dict(input_path, "data_final.json")
    users = ut.readJson2Dict("../website/public/", "data_user1.json")
    data_labeled = {cate: list() for cate in cates}
    for paper in data:
        data_labeled[paper["fields"]["category"]].append(paper)
    with codecs.open("../output/stat_user2.txt", "w", encoding="big5") as fo:
        for cate in cates:
            fo.write(cate.upper() + "\n")
            papers = data_labeled[cate]
            print(len(papers))
            # Outlier filter: only times strictly inside (0, 1200) count.
            time3s = [p["fields"]["time3"] for p in papers
                      if 0 < p["fields"]["time3"] < 1200]
            time4s = [p["fields"]["time4"] for p in papers
                      if 0 < p["fields"]["time4"] < 1200]
            avg_time3 = sum(time3s) / len(time3s) if time3s else 0
            avg_time4 = sum(time4s) / len(time4s) if time4s else 0
            # NOTE(review): assumes each category has at least two users in
            # data_user1.json — confirm.
            users_cate = [u["fields"]["name"] for u in users
                          if u["fields"]["category"] == cate]
            fo.write(" ".join([users_cate[0], str(avg_time3),
                               users_cate[1], str(avg_time4)]) + "\n")
def output_edit_im():
    """Dump parsed IM editorial papers to a CSV (1-based index plus the
    bibliographic fields).

    Fix: the output file is now opened with a context manager so it is
    closed even when a write raises.
    """
    data = ut.readJson2Dict("../output/", "parsed_edit_im.json")
    fields = ["title", "author", "journal", "volume", "number", "pages",
              "year", "month", "keyword", "keyword-plus", "abstract"]
    with open(output_path + "papers_im_editorial" + ".csv", "w") as fo:
        fo.write("index," + ",".join(fields) + "\n")
        for index, paper in enumerate(data):
            fo.write(str(index + 1))
            for field in fields:
                # NOTE(review): naive quoting — an embedded '"' in a field
                # breaks the row (original behavior).
                fo.write(",\"" + paper.get(field, "") + "\"")
            fo.write("\n")
def output_data(input_path="../result/", output_path="../result/"):
    """Group final papers by category and emit the per-category label
    output files (plus the topics file)."""
    data = ut.readJson2Dict(input_path, "data_final.json")
    cates = ["information management", "marketing", "transportation", "om&or"]
    buckets = {cate: list() for cate in cates}
    output_topics()
    for paper in data:
        buckets[paper["fields"]["category"]].append(paper)
    for cate in cates:
        # output_article(buckets[cate], output_path, cate)
        output_label(buckets[cate], output_path, cate)
def readData(users_google, users_twitter, twitterIdName):
    """Read normalized profile data for all Google and Twitter users.

    Only profiles are loaded; the wall/text reads are commented out, so the
    corresponding dicts come back empty.

    Fix: `print("load file over")` sat after the `return` and could never
    execute; it now runs before returning.
    """
    profileGoogle = dict()
    profileTwitter = dict()
    wallGoogle = dict()
    wallTwitter = dict()
    textGoogle = dict()
    textTwitter = dict()
    for user in users_google:
        profileGoogle[user] = ut.readJson2Dict(interPath + "google/profile/", user)
        # wallGoogle[user] = ut.readJson2Dict(interPath+"google/wall/", user)
        # textGoogle[user] = ut.readJson2Dict(interPath+"google/text/", user)
    for user in users_twitter:
        # Twitter files are keyed by screen name.
        twitterName = twitterIdName[user]
        profileTwitter[user] = ut.readJson2Dict(interPath + "twitter/profile/", twitterName)
        # wallTwitter[user] = ut.readJson2Dict(interPath+"twitter/wall/", twitterName)
        # textTwitter[user] = ut.readJson2Dict(interPath+"twitter/text/", twitterName)
    print("load file over")
    return (profileGoogle, profileTwitter, wallGoogle, wallTwitter,
            textGoogle, textTwitter)
def statNameScore():
    """Score name and display-name similarity for each strict ground-truth
    (google, twitter) pair and persist the rows as `name_score`."""
    gtsLoose = ut.readCommaLine2List(interPath, gtLooseFileName)
    gtsStrict = ut.readCommaLine2List(interPath, gtStrictFileName)
    gts = gtsStrict  # the strict mapping is the one actually scored
    twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
    twitterNameId = ut.readJson2Dict(interPath, twitterNameIdFileName)
    rows = list()
    for pair in gts:
        gid = pair[0]
        tid = pair[1]
        # Profiles on the twitter side are filed under the screen name.
        tname = twitterIdName[tid]
        print(gid)
        print(tname)
        profile_g = ut.readJson2Dict(interPath + "google/profile/", gid)
        profile_t = ut.readJson2Dict(interPath + "twitter/profile/", tname)
        score_name = ft.calNameScore(profile_g, profile_t)
        score_display = ft.calDisplayNameScore(profile_g, profile_t)
        rows.append([gid, tid, str(score_name), str(score_display),
                     str(score_name + score_display)])
    ut.writeList2CommaLine(interPath, "name_score", rows)
def structData():
    """Normalize profile/wall data for every strict ground-truth user pair
    and write the aggregated wall statistics.

    Fix: the bare `except:` guarding the twitter id->name lookups now
    catches only KeyError (a missing mapping), so unrelated errors are no
    longer silently swallowed.
    """
    # init
    s = time.time()
    usersTf1 = dict()
    usersTf2 = dict()
    usersLangDistri1 = dict()
    usersLangDistri2 = dict()
    usersSentimentScore1 = dict()
    usersSentimentScore2 = dict()
    usersTopicDistir1 = dict()
    usersTopicDistri2 = dict()
    twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
    gts_loose = ut.readCommaLine2List(interPath, gtLooseFileName)
    gts_strict = ut.readCommaLine2List(interPath, gtStrictFileName)
    gts = gts_strict
    # Create the intermediate directory tree on first run.
    if not os.path.isdir(interPath + sn1):
        os.makedirs(interPath + sn1 + "/profile")
        os.makedirs(interPath + sn1 + "/wall")
        os.makedirs(interPath + sn1 + "/text")
        os.makedirs(interPath + sn2 + "/profile")
        os.makedirs(interPath + sn2 + "/wall")
        os.makedirs(interPath + sn2 + "/text")
    # norm profile and wall
    for gt in gts:
        uid1 = gt[0]
        uid2 = gt[1]
        try:
            # Twitter data files are keyed by screen name, not numeric id.
            if sn1 == "twitter":
                uid1 = twitterIdName[uid1]
            if sn2 == "twitter":
                uid2 = twitterIdName[uid2]
        except KeyError:
            continue  # no id->name mapping for this user: skip the pair
        (userTf1, langDistri1, userSentimentScore1,
         userTopicDistri1) = structUserData(sn1, uid1)
        (userTf2, langDistri2, userSentimentScore2,
         userTopicDistri2) = structUserData(sn2, uid2)
        usersTf1[uid1] = userTf1
        usersTf2[uid2] = userTf2
        usersLangDistri1[uid1] = langDistri1
        usersLangDistri2[uid2] = langDistri2
        usersSentimentScore1[uid1] = userSentimentScore1
        usersSentimentScore2[uid2] = userSentimentScore2
        usersTopicDistir1[uid1] = userTopicDistri1
        usersTopicDistri2[uid2] = userTopicDistri2
    # build dictionary and idf
    writeStatWalls(usersTf1, usersTf2, usersLangDistri1, usersLangDistri2,
                   usersSentimentScore1, usersSentimentScore2,
                   usersTopicDistir1, usersTopicDistri2)
    e = time.time()
    print(e - s)
def getScores(sn1, user1, sn2, user2, g1, g2, g0):
    """Concatenate profile, social and behavior feature scores for one
    candidate user pair.

    Twitter profile/wall files are keyed by screen name, so the twitter-side
    id is translated via the id->name mapping before the profile and
    behavior lookups; the social (graph) scores use the raw ids.
    """
    twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
    if sn2 == "twitter":
        name2 = twitterIdName[user2]
        scores = (getProfileScore(sn1, user1, sn2, name2)
                  + getSocialScore(sn1, user1, sn2, user2, g1, g2, g0)
                  + getBehaviorScore(sn1, user1, sn2, name2))
    elif sn1 == "twitter":
        name1 = twitterIdName[user1]
        scores = (getProfileScore(sn1, name1, sn2, user2)
                  + getSocialScore(sn1, user1, sn2, user2, g1, g2, g0)
                  + getBehaviorScore(sn1, name1, sn2, user2))
    else:
        scores = (getProfileScore(sn1, user1, sn2, user2)
                  + getSocialScore(sn1, user1, sn2, user2, g1, g2, g0)
                  + getBehaviorScore(sn1, user1, sn2, user2))
    return scores
def reviseTwitterRelationship():
    """Rewrite the twitter relationship file with numeric user ids.

    Lines whose screen name has no id mapping are reported on stdout and
    dropped from the revised file.
    """
    names = list()  # NOTE(review): never populated, so the membership test below is always True
    twitterNameId = ut.readJson2Dict(interPath, "twitterNameId")
    dst = interPath + "twitter/relationship_file_revise"
    src = interPath + "twitter/relationship_file"
    with open(dst, "w") as fo, open(src, "r") as fi:
        for line in fi:
            tokens = line.split(" ")
            user = tokens[0]
            friends = tokens[1]
            if user not in names and twitterNameId.get(user, 0) != 0:
                fo.write(twitterNameId[user] + " " + friends)
            else:
                print(user)
def reviseTwitterRelationship():
    """Translate screen names to numeric ids in the twitter relationship
    file, writing the result to relationship_file_revise; unmapped names
    are printed and skipped."""
    names = list()  # NOTE(review): stays empty — the `not in` check is vestigial
    twitterNameId = ut.readJson2Dict(interPath, "twitterNameId")
    out_path = interPath + "twitter/relationship_file_revise"
    in_path = interPath + "twitter/relationship_file"
    with open(out_path, "w") as fo, open(in_path, "r") as fi:
        for line in fi:
            parts = line.split(" ")
            user, friends = parts[0], parts[1]
            mapped = twitterNameId.get(user, 0)
            if user not in names and mapped != 0:
                fo.write(mapped + " " + friends)
            else:
                print(user)
def structData():
    """Normalize profile/wall data for each strict ground-truth pair and
    write aggregate wall statistics via writeStatWalls.

    Fix: replaced the bare `except:` (which hid every error, not just a
    missing twitter id->name mapping) with `except KeyError`.
    """
    # init
    s = time.time()
    usersTf1 = dict()
    usersTf2 = dict()
    usersLangDistri1 = dict()
    usersLangDistri2 = dict()
    usersSentimentScore1 = dict()
    usersSentimentScore2 = dict()
    usersTopicDistir1 = dict()
    usersTopicDistri2 = dict()
    twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
    gts_loose = ut.readCommaLine2List(interPath, gtLooseFileName)
    gts_strict = ut.readCommaLine2List(interPath, gtStrictFileName)
    gts = gts_strict
    # First run: build the intermediate output directories.
    if not os.path.isdir(interPath + sn1):
        os.makedirs(interPath + sn1 + "/profile")
        os.makedirs(interPath + sn1 + "/wall")
        os.makedirs(interPath + sn1 + "/text")
        os.makedirs(interPath + sn2 + "/profile")
        os.makedirs(interPath + sn2 + "/wall")
        os.makedirs(interPath + sn2 + "/text")
    # norm profile and wall
    for gt in gts:
        uid1 = gt[0]
        uid2 = gt[1]
        try:
            # Twitter files are stored by screen name.
            if sn1 == "twitter":
                uid1 = twitterIdName[uid1]
            if sn2 == "twitter":
                uid2 = twitterIdName[uid2]
        except KeyError:
            continue  # pair has no mapping: skip it
        (userTf1, langDistri1, userSentimentScore1,
         userTopicDistri1) = structUserData(sn1, uid1)
        (userTf2, langDistri2, userSentimentScore2,
         userTopicDistri2) = structUserData(sn2, uid2)
        usersTf1[uid1] = userTf1
        usersTf2[uid2] = userTf2
        usersLangDistri1[uid1] = langDistri1
        usersLangDistri2[uid2] = langDistri2
        usersSentimentScore1[uid1] = userSentimentScore1
        usersSentimentScore2[uid2] = userSentimentScore2
        usersTopicDistir1[uid1] = userTopicDistri1
        usersTopicDistri2[uid2] = userTopicDistri2
    # build dictionary and idf
    writeStatWalls(usersTf1, usersTf2, usersLangDistri1, usersLangDistri2,
                   usersSentimentScore1, usersSentimentScore2,
                   usersTopicDistir1, usersTopicDistri2)
    e = time.time()
    print(e - s)
def stat_user(input_path="../prediction/input/"):
    """Write per-category, per-phase average labeling times for both
    annotators to ../output/stat_user.txt (big5)."""
    cates = ["information management", "marketing", "transportation", "om&or"]
    data = ut.readJson2Dict(input_path, "data_labeled.json")
    users = ut.readJson2Dict("../website/public/", "data_user1.json")
    data_labeled = {cate: {"1": list(), "2": list()} for cate in cates}
    results = dict()
    # Bucket papers by category and labeling phase.
    for paper in data:
        f = paper["fields"]
        if f["is_phased1"]:
            data_labeled[f["category"]]["1"].append(paper)
        elif f["is_phased2"]:
            data_labeled[f["category"]]["2"].append(paper)
    with codecs.open("../output/stat_user.txt", "w", encoding="big5") as fo:
        for cate in cates:
            fo.write(cate.upper() + "\n")
            for i in (1, 2):
                papers = data_labeled[cate][str(i)]
                # NOTE(review): the mean divides by len(papers), not by the
                # count of papers under the 1200s cap — confirm intended.
                time1 = sum(p["fields"]["time1"] for p in papers
                            if p["fields"]["time1"] < 1200) / len(papers)
                time2 = sum(p["fields"]["time2"] for p in papers
                            if p["fields"]["time2"] < 1200) / len(papers)
                users_cate = [u["fields"]["name"] for u in users
                              if u["fields"]["category"] == cate]
                fo.write(" ".join(["Phase" + str(i), users_cate[0], str(time1),
                                   users_cate[1], str(time2)]) + "\n")
def output_err_data():
    """Dump two known-problematic journal issues (excel rows 428 and 657)
    to per-issue CSV files for manual inspection."""
    data = ut.readJson2Dict(result_path, "data_final.json")
    # 2. excel row 428: JMIS vol 31, number 4
    papers_428 = [p for p in data
                  if p["fields"]["journal"] == "JOURNAL OF MANAGEMENT INFORMATION SYSTEMS"
                  and p["fields"]["volume"] == "31"
                  and p["fields"]["number"] == "4"]
    for p in papers_428:
        print(p["fields"]["title"])
    output_paper("../output/stat_error_428.csv", papers_428)
    # 3. excel row 657: TR Part C vol 47
    papers_657 = [p for p in data
                  if p["fields"]["journal"] == "TRANSPORTATION RESEARCH PART C-EMERGING TECHNOLOGIES"
                  and p["fields"]["volume"] == "47"]
    for p in papers_657:
        print(p["fields"]["title"])
    output_paper("../output/stat_error_657.csv", papers_657)
def getScores(sn1, user1, sn2, user2, g1, g2, g0):
    """Build the full feature-score vector (profile + social + behavior)
    for one candidate user pair; twitter-side ids are translated to screen
    names for the profile/behavior lookups."""
    # read the twitter name id mapping file
    twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
    if sn2 == "twitter":
        name2 = twitterIdName[user2]
        scores = (getProfileScore(sn1, user1, sn2, name2)
                  + getSocialScore(sn1, user1, sn2, user2, g1, g2, g0)
                  + getBehaviorScore(sn1, user1, sn2, name2))
    elif sn1 == "twitter":
        name1 = twitterIdName[user1]
        scores = (getProfileScore(sn1, name1, sn2, user2)
                  + getSocialScore(sn1, user1, sn2, user2, g1, g2, g0)
                  + getBehaviorScore(sn1, name1, sn2, user2))
    else:
        scores = (getProfileScore(sn1, user1, sn2, user2)
                  + getSocialScore(sn1, user1, sn2, user2, g1, g2, g0)
                  + getBehaviorScore(sn1, user1, sn2, user2))
    return scores
def stat_label2(input_path="../result/"):
    """Print inter-annotator agreement (Jaccard and alignment ratio) over
    the doubly-labeled papers (phased3 == 3) of each category."""
    cates = ["information management", "marketing", "transportation", "om&or"]
    data = ut.readJson2Dict(input_path, "data_final.json")
    buckets = {cate: list() for cate in cates}
    for paper in data:
        if paper["fields"]["phased3"] == 3:
            buckets[paper["fields"]["category"]].append(paper)
    for cate in cates:
        print(cate.upper())
        jaccards = list()
        aligns = list()
        for paper in buckets[cate]:
            labels_a = set(paper["fields"]["label3"].split(";"))
            labels_b = set(paper["fields"]["label4"].split(";"))
            jaccards.append(jaccard(labels_a, labels_b))
            aligns.append(align_ratio(labels_a, labels_b, 3))
        print("Jaccards:", sum(jaccards) / len(jaccards))
        print("Alignment Ratio:", sum(aligns) / len(aligns))
def stat_other_label(input_path="../result/"):
    """Collect, per category, the papers whose label set contains that
    category's "Others but relevant to ..." label, and dump each group to
    a CSV under ../output/.

    Fix: removed an unused local `cols` list that was never passed to
    anything.
    """
    mapping = {'information management': 'Others but relevant to IM',
               'marketing': 'Others but relevant to Marketing',
               'transportation': 'Others but relevant to Transportation',
               # NOTE(review): other code in this file uses the label
               # "Others but relevant to OM" — confirm the "OM&OR" spelling.
               'om&or': 'Others but relevant to OM&OR'}
    data = ut.readJson2Dict(input_path, "data_final.json")
    data_labeled = {"information management": list(), "marketing": list(),
                    "transportation": list(), "om&or": list()}
    # read data
    for paper in data:
        fields = paper["fields"]
        cate = fields["category"]
        # phased3 == 3: union of both annotators' labels; == 2: label4 only;
        # otherwise label3 only.
        if fields["phased3"] == 3:
            s = set(fields["label3"].split(";")) | set(fields["label4"].split(";"))
        elif fields["phased3"] == 2:
            s = set(fields["label4"].split(";"))
        else:
            s = set(fields["label3"].split(";"))
        if mapping[cate] in s:
            data_labeled[cate].append(paper)
    # output data
    for cate, papers in data_labeled.items():
        output_paper("../output/stat_label_others_" + cate + ".csv", papers)
def stat_label_distri(input_path="../result/"):
    """Print, per category, the distribution of label-set sizes over the
    final labeled papers."""
    cates = ["information management", "marketing", "transportation", "om&or"]
    data = ut.readJson2Dict(input_path, "data_final.json")
    label_sets = {cate: list() for cate in cates}
    for paper in data:
        fields = paper["fields"]
        # phased3 selects which annotator label sets are in force.
        if fields["phased3"] == 3:
            s = set(fields["label3"].split(";")) | set(fields["label4"].split(";"))
        elif fields["phased3"] == 2:
            s = set(fields["label4"].split(";"))
        else:
            s = set(fields["label3"].split(";"))
        label_sets[fields["category"]].append(s)
    for cate in cates:
        print(cate.upper())
        size_counts = dict()
        for s in label_sets[cate]:
            size_counts[len(s)] = size_counts.get(len(s), 0) + 1
        total = len(label_sets[cate])
        for size, count in sorted(size_counts.items()):
            print(size, count / total)
def output_im_word_data():
    """Dump IM papers whose abstract mentions one of a fixed set of
    emerging topics, skipping papers labeled "Others but relevant to IM"."""
    data = ut.readJson2Dict(result_path, "data_final.json")
    words = ["Neuro IS", "NeuroIS", "Neuro-IS", "Virtual World",
             "Online game", "Ethics", "Open Source"]
    papers = list()
    for paper in data:
        fields = paper["fields"]
        if fields["category"] != "information management":
            continue
        # Effective label set depends on which annotation phase applied.
        if fields["phased3"] == 3:
            labels = set(fields["label3"].split(";")) | set(fields["label4"].split(";"))
        elif fields["phased3"] == 2:
            labels = set(fields["label4"].split(";"))
        else:
            labels = set(fields["label3"].split(";"))
        if "Others but relevant to IM" in labels:
            continue
        # any() short-circuits exactly like the original break-on-match loop.
        if any(w in fields["abstract"] for w in words):
            papers.append(paper)
    output_paper("../output/stat_im_new_topic.csv", papers)
def stat_label(input_path="../prediction/input/"):
    """Print pairwise kappa agreement between label1, label2 and
    label_final, per category and labeling phase."""
    cates = ["information management", "marketing", "transportation", "om&or"]
    data = ut.readJson2Dict(input_path, "data_labeled.json")
    data_labeled = {cate: {"1": list(), "2": list()} for cate in cates}
    # four cate, two step
    for paper in data:
        fields = paper["fields"]
        if fields["is_phased1"]:
            data_labeled[fields["category"]]["1"].append(paper)
        elif fields["is_phased2"]:
            data_labeled[fields["category"]]["2"].append(paper)
    print(",label1 vs label2, label1 vs label_final, label_final vs label2")
    for cate in cates:
        print(cate.upper())
        for i in (1, 2):
            papers = data_labeled[cate][str(i)]
            a1 = [p["fields"]["label1"] for p in papers]
            a2 = [p["fields"]["label2"] for p in papers]
            af = [p["fields"]["label_final"] for p in papers]
            print("Phase" + str(i), kappa(a1, a2), kappa(a1, af), kappa(a2, af))
def getUsersFeatures(procNum=10):
    """Compute matching features for every (google, twitter) user pair from
    the strict ground truth and write one feature line per pair.

    procNum -- number of worker threads that score pair batches in parallel.

    NOTE(review): workers are threading.Thread objects fed through a
    multiprocessing.Queue and the loop variable is named `proc` — presumably
    this once used processes; confirm threads are intended.
    """
    # init user pair by mapping
    gts = ut.readCommaLine2List(interPath, gtStrictFileName)
    sn1 = "google"
    sn2 = "twitter"
    users_sn1 = list()
    users_sn2 = list()
    # scoresMatrix = lil_matrix((len(gts), len(gts)))
    scoresMatrix = dict()  # (i, j) with i <= j -> score vector for that pair
    for gt in gts:
        users_sn1.append(gt[0])
        users_sn2.append(gt[1])
    # build graph
    print("build graph")
    s = time.time()
    g1, g2, g0 = buildGraphs(users_sn1, users_sn2)
    e = time.time()
    print("build graph over cost: " + str(e - s))
    # for profile using
    print("popular count")
    s = time.time()
    writeMostPopularCount(g1, sn1, users_sn1, g2, sn2, users_sn2)
    e = time.time()
    print("popular count over cost: " + str(e - s))
    print("calculate features start")
    # calculate features
    s = time.time()
    # Only the upper triangle (b >= a) is scored; the mirror (j, i) serves
    # lookups for the lower triangle when writing output.
    pairs = [(a, b) for a in range(len(gts)) for b in range(len(gts)) if b >= a]
    twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
    twitterNameId = ut.readJson2Dict(interPath, twitterNameIdFileName)
    profileGoogle, profileTwitter, wallGoogle, wallTwitter, textGoogle, textTwitter = readData(
        users_sn1, users_sn2, twitterIdName)
    # # for pair in pairs:
    #     print(pair)
    #     scores = getScores(sn1, users_sn1[pair[0]], sn2, users_sn2[pair[1]], g1, g2, g0)
    #     scoresMatrix[(pair[0], pair[1])] = scores
    # parallel
    batchNum = round(len(pairs) / procNum)
    procs = list()
    q = mp.Queue()
    for i in range(procNum):
        batchPairs = list()
        if i == procNum - 1:
            # Last worker takes the remainder of the pair list.
            batchPairs = pairs[i * batchNum:]
        else:
            batchPairs = pairs[i * batchNum:(i + 1) * batchNum]
        # p = td.Thread(target=getScoresWorker, args=(batchPairs, sn1, users_sn1, sn2, users_sn2, g1, g2, g0, q))
        p = td.Thread(target=getScoresWorker,
                      args=(batchPairs, sn1, users_sn1, sn2, users_sn2, g1, g2,
                            g0, q, profileGoogle, profileTwitter, wallGoogle,
                            wallTwitter, textGoogle, textTwitter))
        p.start()
        procs.append(p)
    print("update start")
    # Drain exactly one result per pair from the shared queue.
    for i in range(len(pairs)):
        print(i)
        result = q.get()
        # scoresMatrix.update(result)
        scoresMatrix[result["key"]] = result["value"]
    print("update over")
    print(len(scoresMatrix))
    for proc in procs:
        proc.join()
    # output feature
    with open(outputPath + featureFileName, "w") as fo:
        for i in range(len(gts)):
            for j in range(len(gts)):
                if i == j:
                    rank = 1  # diagonal = ground-truth match
                else:
                    rank = 0
                if i > j:
                    scores = scoresMatrix[(j, i)]  # mirror of the stored upper triangle
                else:
                    scores = scoresMatrix[(i, j)]
                outputStr = getFeatureStr(rank, users_sn1[i], users_sn2[j], scores)
                fo.write(outputStr)
    # (previous serial implementation kept for reference)
    # with open(outputPath+featureFileName, "w") as fo:
    #     for i in range(len(gts)):
    #         print(users_sn1[i])
    #         print(i)
    #         for j in range(len(gts)):
    #             print(j)
    #             if i == j:
    #                 rank = 1
    #             else:
    #                 rank = 0
    #             scores = getScores(sn1, users_sn1[i], sn2, users_sn2[j], g1, g2, g0)
    #             outputStr = getFeatureStr(rank, users_sn1[i], users_sn2[j], scores)
    #             fo.write(outputStr)
    e = time.time()
    print("write feature costs:" + str(e - s))
def getUsersFeatures(procNum = 10):
    """Score every (google, twitter) ground-truth user pair and write the
    resulting feature lines to the feature file.

    procNum -- number of parallel workers scoring pair batches.

    NOTE(review): the workers are threading.Thread instances communicating
    over a multiprocessing.Queue, yet the collection is named `procs` —
    confirm threads (not processes) are intended.
    """
    # init user pair by mapping
    gts = ut.readCommaLine2List(interPath, gtStrictFileName)
    sn1 = "google"
    sn2 = "twitter"
    users_sn1 = list()
    users_sn2 = list()
    # scoresMatrix = lil_matrix((len(gts), len(gts)))
    scoresMatrix = dict()  # (i, j), i <= j -> feature scores for the pair
    for gt in gts:
        users_sn1.append(gt[0])
        users_sn2.append(gt[1])
    # build graph
    print("build graph")
    s = time.time()
    g1, g2, g0 = buildGraphs(users_sn1, users_sn2)
    e = time.time()
    print("build graph over cost: "+str(e-s))
    # for profile using
    print("popular count")
    s = time.time()
    writeMostPopularCount(g1, sn1, users_sn1, g2, sn2, users_sn2)
    e = time.time()
    print("popular count over cost: "+str(e-s))
    print("calculate features start")
    # calculate features
    s = time.time()
    # Upper-triangular pairs only; lower-triangle lookups mirror via (j, i).
    pairs = [(a,b) for a in range(len(gts)) for b in range(len(gts)) if b>=a]
    twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
    twitterNameId = ut.readJson2Dict(interPath, twitterNameIdFileName)
    profileGoogle, profileTwitter, wallGoogle, wallTwitter, textGoogle, textTwitter = readData(users_sn1, users_sn2, twitterIdName)
    # # for pair in pairs:
    #     print(pair)
    #     scores = getScores(sn1, users_sn1[pair[0]], sn2, users_sn2[pair[1]], g1, g2, g0)
    #     scoresMatrix[(pair[0], pair[1])] = scores
    # parallel
    batchNum = round(len(pairs)/procNum)
    procs = list()
    q = mp.Queue()
    for i in range(procNum):
        batchPairs = list()
        if i == procNum-1:
            # Final worker absorbs the remainder of the pair list.
            batchPairs = pairs[i*batchNum:]
        else:
            batchPairs = pairs[i*batchNum:(i+1)*batchNum]
        # p = td.Thread(target=getScoresWorker, args=(batchPairs, sn1, users_sn1, sn2, users_sn2, g1, g2, g0, q))
        p = td.Thread(target=getScoresWorker,
                      args=(batchPairs, sn1, users_sn1, sn2, users_sn2, g1, g2,
                            g0, q, profileGoogle, profileTwitter, wallGoogle,
                            wallTwitter, textGoogle, textTwitter))
        p.start()
        procs.append(p)
    print("update start")
    # One queued result per scored pair.
    for i in range(len(pairs)):
        print(i)
        result = q.get()
        # scoresMatrix.update(result)
        scoresMatrix[result["key"]] = result["value"]
    print("update over")
    print(len(scoresMatrix))
    for proc in procs:
        proc.join()
    # output feature
    with open(outputPath+featureFileName, "w") as fo:
        for i in range(len(gts)):
            for j in range(len(gts)):
                if i == j:
                    rank = 1  # ground-truth match on the diagonal
                else:
                    rank = 0
                if i > j:
                    scores = scoresMatrix[(j, i)]  # stored upper-triangular
                else:
                    scores = scoresMatrix[(i, j)]
                outputStr = getFeatureStr(rank, users_sn1[i], users_sn2[j], scores)
                fo.write(outputStr)
    # (earlier serial implementation kept for reference)
    # with open(outputPath+featureFileName, "w") as fo:
    #     for i in range(len(gts)):
    #         print(users_sn1[i])
    #         print(i)
    #         for j in range(len(gts)):
    #             print(j)
    #             if i == j:
    #                 rank = 1
    #             else:
    #                 rank = 0
    #             scores = getScores(sn1, users_sn1[i], sn2, users_sn2[j], g1, g2, g0)
    #             outputStr = getFeatureStr(rank, users_sn1[i], users_sn2[j], scores)
    #             fo.write(outputStr)
    e = time.time()
    print("write feature costs:" + str(e-s))