def getUsersProfile():
    print("get users profile")
    userids = ut.readLine2List(path, "ids_mapping")
    useridsCrawled = ut.readLine2List(path, idProfileFileName)
    useridsLeft = list(set(userids) - set(useridsCrawled))
    fi = open(path + idProfileFileName, 'a')
    for userid in useridsLeft:
        # for userid in userids[len(useridsCrawled):]:
        time.sleep(8)
        print(userid)
        profile = getUserProfile(userid)
        with codecs.open(path + "profile/" + userid, "w", encoding="utf-8") as fo:
            fo.write(json.dumps(profile, indent=4, ensure_ascii=False))
        fi.write(userid + '\n')
    fi.close()
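# ut.readLine2List / ut.writeList2Line are used throughout this module but are not
# defined in this section. The two functions below are only a minimal sketch of the
# assumed behaviour (read one item per line with the newline stripped / write one
# item per line), added for readability; the real utility module may differ.
import codecs


def readLine2List_sketch(folder, filename):
    # hypothetical helper: read a file into a list of stripped, non-empty lines
    try:
        with codecs.open(folder + filename, "r", encoding="utf-8") as fi:
            return [line.strip() for line in fi if line.strip()]
    except IOError:
        return []


def writeList2Line_sketch(folder, filename, items):
    # hypothetical helper: write one item per line
    with codecs.open(folder + filename, "w", encoding="utf-8") as fo:
        fo.write("\n".join(str(item) for item in items) + "\n")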
def getUsersTweets():
    usernames = ut.readLine2List(path, namesMappingFileName)
    usernamesCrawled = ut.readLine2List(path, "id_post_file")
    usernamesLeft = list(set(usernames) - set(usernamesCrawled))
    fi = open(path + idPostFileName, 'a')
    for username in usernamesLeft:
        print(username)
        time.sleep(5)
        tweets = getUserTweets("", username)
        # some usernames are not valid file names, so the output file cannot be opened
        try:
            with codecs.open(path + "wall/" + username, "w", encoding="utf-8") as fo:
                fo.write(json.dumps(tweets, indent=4, ensure_ascii=False))
        except OSError:
            pass
        fi.write(username + '\n')
    fi.close()
def getUsersProfile():
    # total users minus the users crawled before
    usernames = ut.readLine2List(path, namesMappingFileName)
    usernamesCrawled = ut.readLine2List(path, idProfileFileName)
    usernamesLeft = list(set(usernames) - set(usernamesCrawled))
    fi = open(path + idProfileFileName, 'a')
    for username in usernamesLeft:
        # for username in usernames[len(usernamesCrawled):]:
        print(username)
        time.sleep(5)
        profile = getUserProfile("", username)
        # some usernames are not valid file names, so the output file cannot be opened
        try:
            with codecs.open(path + "profile/" + username, "w", encoding="utf-8") as fo:
                fo.write(json.dumps(profile, indent=4, ensure_ascii=False))
        except OSError:
            pass
        fi.write(username + '\n')
    fi.close()
def getUsersPost():
    print("get users post")
    userids = ut.readLine2List(path, "ids_mapping")
    useridsCrawled = ut.readLine2List(path, idPostFileName)
    useridsError = ut.readLine2List(statPath, "google_ids_post_errors")
    useridsLeft = list(set(userids) - set(useridsCrawled))
    # for userid in useridsError:  # alternative: re-crawl only the error ids
    for userid in useridsLeft:
        # mark the id as attempted before crawling
        with open(path + idPostFileName, "a") as fi:
            fi.write(userid + "\n")
        print(userid)
        posts = getUserPost(userid)
        time.sleep(8)
        with codecs.open(path + "wall/" + userid, "w", encoding="utf-8") as fo:
            fo.write(json.dumps(posts, indent=4, ensure_ascii=False))
def ranking(n=1558, filename="ranking_origin_1558.txt"):
    scores = ut.readLine2List(predPath, filename)
    preds = list()
    for i in range(n):
        # row i holds the n matching scores of source user i
        scores_i = scores[i * n:(i + 1) * n]
        max_index = max(enumerate(scores_i), key=lambda k: float(k[1]))[0]
        # predict "1" only for the highest-scoring candidate in this row
        preds_i = ["0"] * n
        preds_i[max_index] = "1"
        preds += preds_i
    ut.writeList2Line(predPath, predictionRankFilename, preds)
    return preds
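# Toy illustration of the ranking step above (an added example, not part of the
# original pipeline). It assumes the score file is a flattened n x n matrix where
# row i holds the matching scores of source user i against all n candidates, and
# that ranking() marks the per-row argmax with "1".
def toy_ranking(scores, n):
    preds = []
    for i in range(n):
        row = scores[i * n:(i + 1) * n]
        best = max(range(n), key=lambda j: float(row[j]))
        preds += ["1" if j == best else "0" for j in range(n)]
    return preds

# toy_ranking(["0.1", "0.9", "0.0",
#              "0.8", "0.2", "0.3",
#              "0.4", "0.4", "0.7"], 3)
# -> ['0', '1', '0', '1', '0', '0', '0', '0', '1']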
def getUsersFriendship():
    usernames = ut.readLine2List(path, namesMappingFileName)
    # resume from where the previous run stopped
    with open(path + relationshipFileName, "r") as fi:
        count = len(fi.readlines())
    id_post_writer = open(path + "friends_over1page", "a")
    with open(path + relationshipFileName, "a", encoding="utf-8") as fo:
        for username in usernames[count:]:
            print(username)
            time.sleep(60)
            friends = getUserFriendship(id_post_writer, "", username)
            friends = [str(a) for a in friends]
            fo.write(username + " " + ",".join(friends) + "\n")
    id_post_writer.close()
def getGoogleUsers(sn="google"):
    driver = getDriver()
    loginGoogle(driver)
    # init variables
    snFolder = path + sn + "/"
    ids_visited = ut.readLine2List(snFolder, idsVisitedFileName)
    ids_saw = ut.readLine2List(snFolder, idsSawFileName)
    ids_error = ut.readLine2List(snFolder, idsErrorFileName)
    nextids = list(set(ids_saw) - set(ids_visited) - set(ids_error))
    print(len(nextids))
    ids_error_writer = open(snFolder + idsErrorFileName, "a")
    if len(ids_saw) == 0:
        ids_saw.append(root)
    # build the social network graph here
    g = initGraph(ids_saw, ids_visited)
    for uid in nextids:
        error = 0
        print(uid)
        # retry until the page is parsed successfully, giving up after 5 failures
        while True:
            try:
                if error == 5:
                    ids_error_writer.write(str(uid) + "\n")
                    break
                if parseGoogleUser(driver, g, snFolder, uid, ids_visited, ids_saw, nextids):
                    break
            except Exception:
                error = error + 1
    # only add new ids here, don't delete the user id
    ids_error_writer.close()
    driver.close()
def reviseIdFile():
    ids_visited = ut.readLine2List(snFolder, idsVisitedFileName + "2")
    ids_saw = ut.readLine2List(snFolder, idsSawFileName)
    loss = ut.readLine2List(snFolder, "tmp_ids")
    # fix the duplicate-id problem in the id file: the graph is used only as a
    # membership test to spot ids that appear more than once
    g = nx.Graph()
    dup = list()
    num = list()
    for i in range(len(ids_saw)):
        uid = ids_saw[i]
        if uid in g:
            dup.append(uid)
            num.append(i)
        else:
            g.add_node(uid)
    print(len(dup))
    # delete duplicates from the back so the earlier indices stay valid
    for i in range(len(num) - 1, -1, -1):
        pos = num[i]
        del ids_saw[pos]
    # append the ids that were lost earlier
    for l in loss:
        ids_saw.append(l)
    ut.writeList2Line("../data/google/", "ids_saw2", ids_saw)
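# Design note (added): reviseIdFile() only needs a membership test, so a plain set
# gives the same duplicate positions without building a graph. A minimal equivalent
# sketch, not part of the original code:
def find_duplicate_positions(ids_saw):
    seen, positions = set(), []
    for i, uid in enumerate(ids_saw):
        if uid in seen:
            positions.append(i)
        else:
            seen.add(uid)
    return positions

# find_duplicate_positions(["a", "b", "a", "c", "b"]) -> [2, 4]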
def writeMissingGooglePosts():
    ids = ut.readLine2List("../data/google/", "ids_mapping")
    ids_parsed = list()
    ids_errors = list()
    for root, folders, filenames in os.walk("../data/google/wall"):
        ids_parsed = filenames
        # ids without a wall file are missing entirely
        ids_errors = list(set(ids) - set(ids_parsed))
        for filename in filenames:
            with open(os.path.join(root, filename), "r", errors="ignore") as fi:
                try:
                    result = json.loads(fi.read())
                    # a dict instead of a list of posts means the crawl returned an error response
                    if type(result) == dict:
                        ids_errors.append(filename)
                except ValueError:
                    pass
    ut.writeList2Line("../data/stat/", "google_ids_post_errors", ids_errors)
def main():
    urls = list()
    ids = ut.readLine2List("../data/google/", "id_file")
    for i in range(10):
        uid = ids[i]
        urlPrefix = "https://plus.google.com/"
        urlAbout = urlPrefix + uid + "/about"
        urls.append(urlAbout)
    s = time()
    nprocs = 4
    procList = list()
    result = list()
    q = mp.Queue()
    index = 0
    # spawn worker processes, each fetching a small slice of the url list
    while index < len(urls):
        for i in range(nprocs):
            print(index)
            scope = 2
            urls_short = urls[index:index + scope]
            p = mp.Process(target=f, args=([3], q, urls_short))
            p.start()
            procList.append(p)
            index = index + scope
    print(q.qsize())
    for i in range(q.qsize()):
        result += q.get()
    for p in procList:
        p.join()
    e = time()
    print(result)
    print(e - s)
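# The worker f() started by main() is not shown in this section. The sketch below is
# only an assumption about its shape: it ignores the first argument, fetches each url
# in its slice, and puts the collected page titles back on the queue as one list
# (matching result += q.get() above). The real worker may differ.
def f_sketch(_unused, q, urls_short):
    import urllib.request
    titles = []
    for url in urls_short:
        try:
            with urllib.request.urlopen(url, timeout=10) as resp:
                html = resp.read().decode("utf-8", errors="ignore")
            start = html.find("<title>")
            end = html.find("</title>", start)
            titles.append(html[start + len("<title>"):end] if start != -1 and end != -1 else "")
        except Exception:
            titles.append("")
    q.put(titles)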
def getGoogleUsersParellel():
    ids_visited = ut.readLine2List(snFolder, idsVisitedFileName)
    ids_saw = ut.readLine2List(snFolder, idsSawFileName)
    ids_error = ut.readLine2List(snFolder, idsErrorFileName)
    nextids = list(set(ids_saw) - set(ids_visited) - set(ids_error))
    # output writers
    ids_error_writer = open(snFolder + idsErrorFileName, "a")
    ids_visited_writer = open(snFolder + idsVisitedFileName, 'a', encoding="utf8")
    ids_saw_writer = open(snFolder + idsSawFileName, 'a', encoding="utf8")
    ids_recorded_writer = open(snFolder + idsRecordedFileName, 'a', encoding="utf8")
    sn_writer = open(path + "sn_file", 'a', encoding="utf8")
    profile_writer = open(snFolder + "profile_file", 'a', encoding="utf8")
    rela_writer = open(snFolder + "relationship_file", 'a', encoding="utf8")
    # initialize the graph
    if len(ids_saw) == 0:
        ids_saw.append(root)
    g = initGraph(ids_saw, ids_visited)
    index = 0
    # crawl the user info with multiple processes
    procNum = 3
    batchNum = 100
    while index < len(nextids):
        result = list()
        q = mp.Queue()
        roundNum = procNum * batchNum
        procs = list()
        if index + roundNum < len(nextids):
            # full round: procNum workers, batchNum ids each
            for i in range(procNum):
                batchids = nextids[index + i * batchNum:index + (i + 1) * batchNum]
                p = mp.Process(target=worker_p, args=(batchids, q))
                p.start()
                procs.append(p)
            for i in range(roundNum):
                result += q.get()
            for proc in procs:
                proc.join()
        else:
            # last round: a single worker handles the remaining ids
            batchids = nextids[index:]
            p = mp.Process(target=worker_p, args=(batchids, q))
            p.start()
            for i in range(len(batchids)):
                result += q.get()
            p.join()
        # process the returned data; each item is a dictionary:
        # {id: uid, status: True/False, infos: infos, friends: friends,
        #  friend_bool: True/False, sns: sns, sn_bool: True/False}
        for userData in result:
            uid = userData["id"]
            infos = userData["infos"]
            friends = userData["friends"]
            sns = userData["sns"]
            sn_bool = userData["sn_bool"]
            friend_bool = userData["friend_bool"]
            status = userData["status"]
            if g.node[uid]["status"] == 1:
                # already in the graph
                continue
            elif not status:
                # the page could not be parsed
                ids_error_writer.write(uid + "\n")
            else:
                # new user
                if infos is not None:
                    sn_writer.write(uid + ',' + ','.join(sns) + '\n')
                    profile_writer.write(uid + ',\t' + ',\t'.join(infos) + '\n')
                    rela_writer.write(uid + ' ' + ','.join(friends) + '\n')
                    ids_recorded_writer.write(uid)
                    ids_recorded_writer.write("," + ("1" if sn_bool else "0"))
                    ids_recorded_writer.write("," + ("1" if friend_bool else "0") + "\n")
                    addFriend(g, friends, ids_saw, ids_saw_writer, nextids)
                    g.node[uid]["status"] = 1
                    ids_visited.append(uid)
                    ids_visited_writer.write(uid + "\n")
        ut.removeWinSpace()
        index = index + procNum * batchNum
def rankingConstraint(n=1558):
    scores = [float(i) for i in ut.readLine2List(predPath, predictionRankOriginFilename)]
    oneMapping(scores, predictionRankConstraintFilename, n)
def evalNm(filename="nm_1558.txt"):
    print("Evaluation: Name Matching")
    preds = ut.readLine2List(predPath, filename)
    gts = ut.readLine2List(predPath, gtFilename)
    return evaluate(gts, preds)
def evalMnaConstraint(filename="mna_constraint_1558.txt"):
    print("Evaluation: MNA Constraint")
    preds = ut.readLine2List(predPath, filename)
    gts = ut.readLine2List(predPath, gtFilename)
    return evaluate(gts, preds)
def evalRankingConstraint(filename="ranking_constraint_1558.txt"):
    print("Evaluation: Ranking constraint")
    preds = ut.readLine2List(predPath, filename)
    gts = ut.readLine2List(predPath, gtFilename)
    return evaluate(gts, preds)
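# evaluate() is called by the eval* functions above but is not defined in this
# section. A minimal sketch of the assumed behaviour: compare the "0"/"1" ground
# truth and prediction lists element-wise and report precision, recall and F1.
# The real implementation may compute different or additional metrics.
def evaluate_sketch(gts, preds):
    tp = sum(1 for g, p in zip(gts, preds) if g == "1" and p == "1")
    fp = sum(1 for g, p in zip(gts, preds) if g == "0" and p == "1")
    fn = sum(1 for g, p in zip(gts, preds) if g == "1" and p == "0")
    precision = tp / float(tp + fp) if tp + fp else 0.0
    recall = tp / float(tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1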