Пример #1
0
def getUsersProfile():
	"""Fetch and store the profile of every user id that is not crawled yet.

	The full id list is read from ``ids_mapping`` and the already-crawled ids
	from ``idProfileFileName`` (both under ``path``).  For each remaining id
	the profile returned by ``getUserProfile`` is dumped as JSON to
	``path + "profile/" + userid`` and the id is appended to the crawled-id
	file so a later run can skip it.
	"""
	print("get users profile")
	userids = ut.readLine2List(path, "ids_mapping")
	useridsCrawled = ut.readLine2List(path, idProfileFileName)
	# Set difference: the order in which the remaining ids are visited is
	# arbitrary.
	useridsLeft = list(set(userids)-set(useridsCrawled))
	# The context manager guarantees the progress file is flushed and closed
	# even when a fetch raises part-way through the crawl.
	with open(path+idProfileFileName, 'a') as fi:
		for userid in useridsLeft:
			time.sleep(8)  # pause before every request
			print(userid)
			profile = getUserProfile(userid)
			with codecs.open(path+"profile/"+userid, "w", encoding="utf-8") as fo:
				fo.write(json.dumps(profile, indent=4, ensure_ascii=False))
			# Mark the id as done only after its profile file was written.
			fi.write(userid+'\n')
Пример #2
0
def getUsersTweets():
	"""Fetch and store the tweets of every user name that is not crawled yet.

	User names come from ``namesMappingFileName``; names listed in
	``id_post_file`` are skipped.  For each remaining name the result of
	``getUserTweets("", username)`` is dumped as JSON to
	``path + "wall/" + username`` and the name is appended to
	``idPostFileName``, whether or not the dump succeeded.
	"""
	usernames = ut.readLine2List(path, namesMappingFileName)
	# NOTE(review): crawled names are read from the literal "id_post_file" but
	# appended to idPostFileName below - presumably the same file; confirm.
	usernamesCrawled = ut.readLine2List(path, "id_post_file")
	usernamesLeft = list(set(usernames)-set(usernamesCrawled))
	# Context manager so the progress file is closed even if a fetch raises.
	with open(path+idPostFileName, 'a') as fi:
		for username in usernamesLeft:
			print(username)
			time.sleep(5)
			tweets = getUserTweets("",username)
			try:
				with codecs.open(path+"wall/"+username, "w", encoding="utf-8") as fo:
					fo.write(json.dumps(tweets, indent=4, ensure_ascii=False))
			except (OSError, ValueError) as err:
				# Some user names are not valid file names.  Skip the dump but
				# report it; anything else (including Ctrl-C) now propagates
				# instead of being silently swallowed.
				print("could not save tweets for "+username+": "+str(err))
			fi.write(username+'\n')
Пример #3
0
def getUsersTweets():
    """Fetch and store the tweets of every user name that is not crawled yet.

    User names come from ``namesMappingFileName``; names listed in
    ``id_post_file`` are skipped.  For each remaining name the result of
    ``getUserTweets("", username)`` is dumped as JSON to
    ``path + "wall/" + username`` and the name is appended to
    ``idPostFileName``, whether or not the dump succeeded.
    """
    usernames = ut.readLine2List(path, namesMappingFileName)
    # NOTE(review): crawled names are read from the literal "id_post_file" but
    # appended to idPostFileName below - presumably the same file; confirm.
    usernamesCrawled = ut.readLine2List(path, "id_post_file")
    usernamesLeft = list(set(usernames) - set(usernamesCrawled))
    # Context manager so the progress file is closed even if a fetch raises.
    with open(path + idPostFileName, 'a') as fi:
        for username in usernamesLeft:
            print(username)
            time.sleep(5)
            tweets = getUserTweets("", username)
            try:
                with codecs.open(path + "wall/" + username, "w",
                                 encoding="utf-8") as fo:
                    fo.write(json.dumps(tweets, indent=4, ensure_ascii=False))
            except (OSError, ValueError) as err:
                # Some user names are not valid file names.  Skip the dump but
                # report it; anything else (including Ctrl-C) now propagates
                # instead of being silently swallowed.
                print("could not save tweets for " + username + ": " +
                      str(err))
            fi.write(username + '\n')
Пример #4
0
def getUsersProfile():
	"""Fetch and store the profile of every user name that is not crawled yet.

	Work list = names in ``namesMappingFileName`` minus names already listed
	in ``idProfileFileName``.  For each remaining name the result of
	``getUserProfile("", username)`` is dumped as JSON to
	``path + "profile/" + username`` and the name is appended to
	``idProfileFileName``, whether or not the dump succeeded.
	"""
	# total users - users crawled before
	usernames = ut.readLine2List(path, namesMappingFileName)
	usernamesCrawled = ut.readLine2List(path, idProfileFileName)
	usernamesLeft = list(set(usernames)-set(usernamesCrawled))
	# Context manager so the progress file is closed even if a fetch raises.
	with open(path+idProfileFileName, 'a') as fi:
		for username in usernamesLeft:
			print(username)
			time.sleep(5)
			profile = getUserProfile("",username)
			try:
				with codecs.open(path+"profile/"+username, "w", encoding="utf-8") as fo:
					fo.write(json.dumps(profile, indent=4, ensure_ascii=False))
			except (OSError, ValueError) as err:
				# Some user names are not valid file names.  Skip the dump but
				# report it; anything else (including Ctrl-C) now propagates
				# instead of being silently swallowed.
				print("could not save profile for "+username+": "+str(err))
			fi.write(username+'\n')
Пример #5
0
def getUsersPost():
	"""Download the posts of every user id that is not yet in the progress file.

	Each pending id is appended to ``idPostFileName`` *before* its posts are
	requested, so an id whose fetch fails is still listed there.  The posts
	returned by ``getUserPost`` are dumped as JSON to
	``path + "wall/" + userid``.
	"""
	print("get users post")
	allIds = ut.readLine2List(path, "ids_mapping")
	doneIds = ut.readLine2List(path, idPostFileName)
	# Loaded as in the original code; the loop below does not use it.
	errorIds = ut.readLine2List(statPath, "google_ids_post_errors")
	pendingIds = list(set(allIds)-set(doneIds))

	progressPath = path+idPostFileName
	wallFolder = path+"wall/"
	for uid in pendingIds:
		# Record the id first, re-opening the file so the line hits disk now.
		with open(progressPath, "a") as progress:
			progress.write(uid+"\n")
		print(uid)
		userPosts = getUserPost(uid)
		time.sleep(8)
		with codecs.open(wallFolder+uid, "w", encoding="utf-8") as out:
			out.write(json.dumps(userPosts, indent=4, ensure_ascii=False))
Пример #6
0
def getUsersProfile():
    """Fetch and store the profile of every user name that is not crawled yet.

    Work list = names in ``namesMappingFileName`` minus names already listed
    in ``idProfileFileName``.  For each remaining name the result of
    ``getUserProfile("", username)`` is dumped as JSON to
    ``path + "profile/" + username`` and the name is appended to
    ``idProfileFileName``, whether or not the dump succeeded.
    """
    # total users - users crawled before
    usernames = ut.readLine2List(path, namesMappingFileName)
    usernamesCrawled = ut.readLine2List(path, idProfileFileName)
    usernamesLeft = list(set(usernames) - set(usernamesCrawled))
    # Context manager so the progress file is closed even if a fetch raises.
    with open(path + idProfileFileName, 'a') as fi:
        for username in usernamesLeft:
            print(username)
            time.sleep(5)
            profile = getUserProfile("", username)
            try:
                with codecs.open(path + "profile/" + username,
                                 "w",
                                 encoding="utf-8") as fo:
                    fo.write(
                        json.dumps(profile, indent=4, ensure_ascii=False))
            except (OSError, ValueError) as err:
                # Some user names are not valid file names.  Skip the dump but
                # report it; anything else (including Ctrl-C) now propagates
                # instead of being silently swallowed.
                print("could not save profile for " + username + ": " +
                      str(err))
            fi.write(username + '\n')
Пример #7
0
def ranking(n=1558, filename="ranking_origin_1558.txt"):
	"""Convert a flat list of n*n scores into one-hot rows of predictions.

	The scores read from ``filename`` (under ``predPath``) are treated as
	``n`` consecutive rows of ``n`` values.  In every row the position with
	the highest score (first one on ties) becomes "1", all others "0".

	Args:
		n: number of rows and row width.
		filename: score file, one value per line.

	Returns:
		The flat list of ``n * n`` "0"/"1" strings, which is also written to
		``predictionRankFilename``.
	"""
	scores = ut.readLine2List(predPath, filename)
	preds = list()
	for i in range(n):
		scores_i = scores[i*n:(i+1)*n]
		max_index = max(enumerate(scores_i), key=lambda k: float(k[1]))[0]
		# Row width follows n; it used to be hard-coded to 1558, which produced
		# misaligned output for any other n.
		preds_i = ["0"]*n
		preds_i[max_index] = "1"
		preds += preds_i
	ut.writeList2Line(predPath, predictionRankFilename, preds)
	return preds
Пример #8
0
def getUsersFriendship():
	"""Fetch the friend list of every user that has no line in the output yet.

	One line ``"<username> <friend>,<friend>,..."`` is appended to
	``relationshipFileName`` per processed user, so the number of lines
	already in that file is used as the resume offset into the user list.
	"""
	usernames = ut.readLine2List(path, namesMappingFileName)
	# from where to start: a missing output file means nothing was done yet
	try:
		# Same encoding as the writer below, so the count cannot fail on
		# content this function wrote itself.
		with open(path + relationshipFileName, "r", encoding="utf-8") as fi:
			count = sum(1 for _ in fi)
	except FileNotFoundError:
		count = 0
	# Both writers are managed so they are flushed and closed on any exit.
	with open(path+"friends_over1page", "a") as id_post_writer, \
			open(path+relationshipFileName, "a", encoding="utf-8") as fo:
		for username in usernames[count:]:
			print(username)
			time.sleep(60)
			friends = getUserFriendship(id_post_writer, "", username, )
			friends = [str(a) for a in friends]
			fo.write(username+" "+",".join(friends)+"\n")
Пример #9
0
def getGoogleUsers(sn="google"):
    """Parse every seen-but-unvisited user id with a logged-in driver.

    Pending ids are the ids in ``idsSawFileName`` minus those in
    ``idsVisitedFileName`` and ``idsErrorFileName`` (all under
    ``path + sn + "/"``).  Each id is handed to ``parseGoogleUser`` until it
    returns a truthy value; after five exceptions the id is appended to the
    error file and skipped.

    Args:
        sn: sub-folder name of ``path`` that holds the id files.
    """
    driver = getDriver()
    try:
        loginGoogle(driver)
        # init variable
        snFolder = path + sn + "/"
        ids_visited = ut.readLine2List(snFolder, idsVisitedFileName)
        ids_saw = ut.readLine2List(snFolder, idsSawFileName)
        ids_error = ut.readLine2List(snFolder, idsErrorFileName)
        nextids = list(set(ids_saw) - set(ids_visited) - set(ids_error))
        print(len(nextids))

        # Managed so the error log is flushed and closed on any exit.
        with open(snFolder + idsErrorFileName, "a") as ids_error_writer:
            if len(ids_saw) == 0:
                ids_saw.append(root)
            # build social network here
            g = initGraph(ids_saw, ids_visited)

            # nextids is passed to parseGoogleUser, which may extend it while
            # we iterate; ids are only added, never removed.
            for uid in nextids:
                error = 0
                print(uid)
                # iterate until parse successfully
                # NOTE(review): a falsy return without an exception is retried
                # without counting towards the limit - confirm this is wanted.
                while True:
                    if error == 5:
                        ids_error_writer.write(str(uid) + "\n")
                        break
                    try:
                        if parseGoogleUser(driver, g, snFolder, uid,
                                           ids_visited, ids_saw, nextids):
                            break
                    except Exception:
                        # Not a bare except: Ctrl-C must be able to stop the
                        # retry loop instead of being counted as a failure.
                        error = error + 1
    finally:
        # Release the browser even when the crawl aborts.
        driver.close()
Пример #10
0
def getGoogleUsers(sn = "google"):
	"""Parse every seen-but-unvisited user id with a logged-in driver.

	Pending ids are the ids in ``idsSawFileName`` minus those in
	``idsVisitedFileName`` and ``idsErrorFileName`` (all under
	``path + sn + "/"``).  Each id is handed to ``parseGoogleUser`` until it
	returns a truthy value; after five exceptions the id is appended to the
	error file and skipped.

	Args:
		sn: sub-folder name of ``path`` that holds the id files.
	"""
	driver = getDriver()
	try:
		loginGoogle(driver)
		# init variable
		snFolder = path+sn+"/"
		ids_visited = ut.readLine2List(snFolder, idsVisitedFileName)
		ids_saw = ut.readLine2List(snFolder, idsSawFileName)
		ids_error = ut.readLine2List(snFolder, idsErrorFileName)
		nextids = list(set(ids_saw)-set(ids_visited)-set(ids_error))
		print(len(nextids))

		# Managed so the error log is flushed and closed on any exit.
		with open(snFolder+idsErrorFileName, "a") as ids_error_writer:
			if len(ids_saw)==0:
				ids_saw.append(root)
			# build social network here
			g = initGraph(ids_saw, ids_visited)

			# nextids is passed to parseGoogleUser, which may extend it while
			# we iterate; ids are only added, never removed.
			for uid in nextids:
				error = 0
				print(uid)
				# iterate until parse successfully
				# NOTE(review): a falsy return without an exception is retried
				# without counting towards the limit - confirm this is wanted.
				while True:
					if error == 5:
						ids_error_writer.write(str(uid)+"\n")
						break
					try:
						if parseGoogleUser(driver, g, snFolder, uid, ids_visited, ids_saw, nextids):
							break
					except Exception:
						# Not a bare except: Ctrl-C must be able to stop the
						# retry loop instead of being counted as a failure.
						error = error+1
	finally:
		# Release the browser even when the crawl aborts.
		driver.close()
Пример #11
0
def reviseIdFile():
	"""Write a de-duplicated copy of the seen-id file plus the ids in tmp_ids.

	Only the first occurrence of every id in ``idsSawFileName`` is kept, in
	the original order; the number of dropped duplicates is printed.  The ids
	from ``tmp_ids`` are appended unchanged and the result is written to
	``../data/google/ids_saw2``.
	"""
	ids_saw = ut.readLine2List(snFolder, idsSawFileName)
	loss = ut.readLine2List(snFolder, "tmp_ids")

	# revise id file duplicate problem
	# A plain set replaces the former ``g.node[id]`` probe inside a bare
	# except: on networkx releases without ``Graph.node`` that probe always
	# failed, so no duplicate was ever detected.
	seen = set()
	unique_ids = list()
	dup_count = 0
	for uid in ids_saw:
		if uid in seen:
			dup_count += 1
		else:
			seen.add(uid)
			unique_ids.append(uid)
	print(dup_count)
	unique_ids.extend(loss)
	ut.writeList2Line("../data/google/", "ids_saw2", unique_ids)
Пример #12
0
def reviseIdFile():
    """Write a de-duplicated copy of the seen-id file plus the ids in tmp_ids.

    Only the first occurrence of every id in ``idsSawFileName`` is kept, in
    the original order; the number of dropped duplicates is printed.  The ids
    from ``tmp_ids`` are appended unchanged and the result is written to
    ``../data/google/ids_saw2``.
    """
    ids_saw = ut.readLine2List(snFolder, idsSawFileName)
    loss = ut.readLine2List(snFolder, "tmp_ids")

    # revise id file duplicate problem
    # A plain set replaces the former ``g.node[id]`` probe inside a bare
    # except: on networkx releases without ``Graph.node`` that probe always
    # failed, so no duplicate was ever detected.
    seen = set()
    unique_ids = list()
    dup_count = 0
    for uid in ids_saw:
        if uid in seen:
            dup_count += 1
        else:
            seen.add(uid)
            unique_ids.append(uid)
    print(dup_count)
    unique_ids.extend(loss)
    ut.writeList2Line("../data/google/", "ids_saw2", unique_ids)
Пример #13
0
def writeMissingGooglePosts():
	"""List the ids whose post file is missing or holds a JSON object.

	An id from ``ids_mapping`` counts as an error when no file of that name
	exists anywhere under ``../data/google/wall``.  A file name is also
	reported when its content parses as a JSON object (dict); presumably a
	successful crawl stores a list - TODO confirm.  Files that are not valid
	JSON are ignored.  The result is written to
	``../data/stat/google_ids_post_errors``.
	"""
	ids = ut.readLine2List("../data/google/", "ids_mapping")
	ids_parsed = set()
	ids_with_dict = list()
	# Accumulate over every directory of the walk; previously the result was
	# reset per directory, so only the last visited one counted.
	for dirpath, _dirnames, filenames in os.walk("../data/google/wall"):
		ids_parsed.update(filenames)
		for filename in filenames:
			with open(os.path.join(dirpath, filename), "r", errors="ignore") as fi:
				try:
					result = json.loads(fi.read())
				except ValueError:
					# json.JSONDecodeError is a ValueError: not parseable, skip.
					continue
			if isinstance(result, dict):
				ids_with_dict.append(filename)
	ids_errors = list(set(ids)-ids_parsed) + ids_with_dict
	ut.writeList2Line("../data/stat/", "google_ids_post_errors", ids_errors)
Пример #14
0
def writeMissingGooglePosts():
    """List the ids whose post file is missing or holds a JSON object.

    An id from ``ids_mapping`` counts as an error when no file of that name
    exists anywhere under ``../data/google/wall``.  A file name is also
    reported when its content parses as a JSON object (dict); presumably a
    successful crawl stores a list - TODO confirm.  Files that are not valid
    JSON are ignored.  The result is written to
    ``../data/stat/google_ids_post_errors``.
    """
    ids = ut.readLine2List("../data/google/", "ids_mapping")
    ids_parsed = set()
    ids_with_dict = list()
    # Accumulate over every directory of the walk; previously the result was
    # reset per directory, so only the last visited one counted.
    for dirpath, _dirnames, filenames in os.walk("../data/google/wall"):
        ids_parsed.update(filenames)
        for filename in filenames:
            with open(os.path.join(dirpath, filename), "r",
                      errors="ignore") as fi:
                try:
                    result = json.loads(fi.read())
                except ValueError:
                    # json.JSONDecodeError is a ValueError: not parseable.
                    continue
            if isinstance(result, dict):
                ids_with_dict.append(filename)
    ids_errors = list(set(ids) - ids_parsed) + ids_with_dict
    ut.writeList2Line("../data/stat/", "google_ids_post_errors", ids_errors)
Пример #15
0
def getUsersFriendship():
    """Fetch the friend list of every user that has no line in the output yet.

    One line ``"<username> <friend>,<friend>,..."`` is appended to
    ``relationshipFileName`` per processed user, so the number of lines
    already in that file is used as the resume offset into the user list.
    """
    usernames = ut.readLine2List(path, namesMappingFileName)
    # from where to start: a missing output file means nothing was done yet
    try:
        # Same encoding as the writer below, so the count cannot fail on
        # content this function wrote itself.
        with open(path + relationshipFileName, "r", encoding="utf-8") as fi:
            count = sum(1 for _ in fi)
    except FileNotFoundError:
        count = 0
    # Both writers are managed so they are flushed and closed on any exit.
    with open(path + "friends_over1page", "a") as id_post_writer, \
            open(path + relationshipFileName, "a", encoding="utf-8") as fo:
        for username in usernames[count:]:
            print(username)
            time.sleep(60)
            friends = getUserFriendship(
                id_post_writer,
                "",
                username,
            )
            friends = [str(a) for a in friends]
            fo.write(username + " " + ",".join(friends) + "\n")
Пример #16
0
def main():
	"""Process the /about URLs of the first ten ids in worker processes.

	The URLs are split into chunks of ``scope`` and at most ``nprocs``
	workers run ``f([3], q, chunk)`` per round.  Whatever the workers put on
	the queue is concatenated onto ``result`` (so each item is assumed to be
	a list - TODO confirm against ``f``).  The collected result and the
	elapsed time are printed at the end.
	"""
	import queue  # local import: only queue.Empty is needed

	ids = ut.readLine2List("../data/google/", "id_file")
	urlPrefix = "https://plus.google.com/"
	# Slicing instead of indexing 0..9 so fewer than ten ids do not raise.
	urls = [urlPrefix+uid+"/about" for uid in ids[:10]]
	s = time()
	nprocs = 4
	scope = 2
	result = list()
	q = mp.Queue()
	index = 0
	while index < len(urls):
		procList = list()
		for i in range(nprocs):
			if index >= len(urls):
				# No URLs left: do not start workers on empty chunks.
				break
			print(index)
			urls_short = urls[index:index+scope]
			p = mp.Process(target=f, args=([3], q, urls_short))
			p.start()
			procList.append(p)
			index = index + scope
		# Drain while workers are still running and until the queue is empty.
		# Reading qsize() once right after start() raced with the workers and
		# dropped results; joining before draining can deadlock.
		received = 0
		while any(proc.is_alive() for proc in procList) or not q.empty():
			try:
				result += q.get(timeout=0.1)
				received += 1
			except queue.Empty:
				pass
		print(received)
		for p in procList:
			p.join()
	e = time()
	print (result)
	print (e-s)
Пример #17
0
def main():
    """Process the /about URLs of the first ten ids in worker processes.

    The URLs are split into chunks of ``scope`` and at most ``nprocs``
    workers run ``f([3], q, chunk)`` per round.  Whatever the workers put on
    the queue is concatenated onto ``result`` (so each item is assumed to be
    a list - TODO confirm against ``f``).  The collected result and the
    elapsed time are printed at the end.
    """
    import queue  # local import: only queue.Empty is needed

    ids = ut.readLine2List("../data/google/", "id_file")
    urlPrefix = "https://plus.google.com/"
    # Slicing instead of indexing 0..9 so fewer than ten ids do not raise.
    urls = [urlPrefix + uid + "/about" for uid in ids[:10]]
    s = time()
    nprocs = 4
    scope = 2
    result = list()
    q = mp.Queue()
    index = 0
    while index < len(urls):
        procList = list()
        for i in range(nprocs):
            if index >= len(urls):
                # No URLs left: do not start workers on empty chunks.
                break
            print(index)
            urls_short = urls[index:index + scope]
            p = mp.Process(target=f, args=([3], q, urls_short))
            p.start()
            procList.append(p)
            index = index + scope
        # Drain while workers are still running and until the queue is empty.
        # Reading qsize() once right after start() raced with the workers and
        # dropped results; joining before draining can deadlock.
        received = 0
        while any(proc.is_alive() for proc in procList) or not q.empty():
            try:
                result += q.get(timeout=0.1)
                received += 1
            except queue.Empty:
                pass
        print(received)
        for p in procList:
            p.join()
    e = time()
    print(result)
    print(e - s)
Пример #18
0
def getGoogleUsersParellel():
	"""Fetch pending user ids in worker processes and record the results.

	Pending ids are the seen ids minus the visited and error ids.  They are
	handed to ``worker_p`` in rounds: three processes with 100 ids each while
	more than one full round remains, otherwise one process with the rest.
	One queue item is read per dispatched id and concatenated onto the round's
	result list.  Every returned user dictionary is then written to the
	sn/profile/relationship/recorded/visited/error files and the graph is
	updated.
	"""
	from contextlib import ExitStack  # local import: manages all the writers

	ids_visited = ut.readLine2List(snFolder, idsVisitedFileName)
	ids_saw = ut.readLine2List(snFolder, idsSawFileName)
	ids_error = ut.readLine2List(snFolder, idsErrorFileName)
	nextids = list(set(ids_saw)-set(ids_visited)-set(ids_error))

	# multiprocess to get the user info
	procNum = 3
	batchNum = 100
	roundNum = procNum * batchNum

	# ExitStack closes (and thereby flushes) every writer on any exit path;
	# the handles used to be left open for the lifetime of the process.
	with ExitStack() as stack:
		ids_error_writer = stack.enter_context(open(snFolder+idsErrorFileName, "a"))
		ids_visited_writer = stack.enter_context(open(snFolder+idsVisitedFileName, 'a', encoding="utf8"))
		ids_saw_writer = stack.enter_context(open(snFolder+idsSawFileName, 'a', encoding="utf8"))
		ids_recorded_writer = stack.enter_context(open(snFolder+idsRecordedFileName, 'a', encoding="utf8"))

		sn_writer = stack.enter_context(open(path+"sn_file", 'a', encoding="utf8"))
		profile_writer = stack.enter_context(open(snFolder+"profile_file", 'a', encoding="utf8"))
		rela_writer = stack.enter_context(open(snFolder+"relationship_file", 'a', encoding="utf8"))

		# initialize graph
		if len(ids_saw) == 0:
			ids_saw.append(root)
		g = initGraph(ids_saw, ids_visited)
		index = 0

		while index < len(nextids):
			result = list()
			q = mp.Queue()

			if index+roundNum < len(nextids):
				procs = list()
				for i in range(procNum):
					batchids = nextids[index+i*batchNum:index+((i+1)*batchNum)]
					p = mp.Process(target=worker_p, args=(batchids,q))
					p.start()
					procs.append(p)
				# Drain the queue before joining the workers.
				for i in range(roundNum):
					result += q.get()
				for proc in procs:
					proc.join()
				dispatched = roundNum
			else:
				batchids = nextids[index:]
				p = mp.Process(target=worker_p, args=(batchids,q))
				p.start()
				for i in range(len(batchids)):
					result += q.get()
				p.join()
				dispatched = len(batchids)

			# process back data
			for userData in result:
				# dictionary: {id: uid, status: false or true, infos: infos, friends: friends, friend_bool: true, sns: sns, sn_bool: true false}
				uid = userData["id"]
				infos = userData["infos"]
				friends = userData["friends"]
				sns = userData["sns"]
				sn_bool = userData["sn_bool"]
				friend_bool = userData["friend_bool"]
				status = userData["status"]

				# NOTE(review): Graph.node was removed in networkx 2.4; this
				# needs g.nodes there - confirm the pinned version.
				if g.node[uid]["status"] == 1:
					# already in graph
					continue
				# `== False` kept on purpose: other falsy values (e.g. None)
				# are still treated as parsed, exactly as before.
				if status == False:
					ids_error_writer.write(uid+"\n")
					continue
				# new user
				if infos is not None:
					sn_writer.write(uid+','+','.join(sns)+'\n')
					profile_writer.write(uid+',\t'+',\t'.join(infos)+'\n')
					rela_writer.write(uid+' '+','.join(friends)+'\n')
					# "<uid>,<sn flag>,<friend flag>" with flags as 1/0
					ids_recorded_writer.write(
						uid+","+("1" if sn_bool else "0")
						+","+("1" if friend_bool else "0")+"\n")
					# nextids is passed along, so it may grow during a round
					addFriend(g, friends, ids_saw, ids_saw_writer, nextids)
					g.node[uid]["status"] = 1
					ids_visited.append(uid)
				ids_visited_writer.write(uid+"\n")
			ut.removeWinSpace()
			# Advance by what was actually dispatched.  Adding a full round
			# after a short final batch skipped ids appended during that round.
			index = index + dispatched
Пример #19
0
def rankingConstraint(n=1558):
	"""Load the original ranking scores as floats and pass them to oneMapping.

	Args:
		n: forwarded unchanged as the last argument of ``oneMapping``.
	"""
	rawScores = ut.readLine2List(predPath, predictionRankOriginFilename)
	numericScores = list(map(float, rawScores))
	oneMapping(numericScores, predictionRankConstraintFilename, n)
Пример #20
0
def evalNm(filename="nm_1558.txt"):
	"""Evaluate the name-matching predictions stored in ``filename``.

	Args:
		filename: prediction file under ``predPath``.

	Returns:
		The value of ``evaluate(ground_truth, predictions)``.
	"""
	print("Evaluation: Name Matching")
	predicted = ut.readLine2List(predPath, filename)
	groundTruth = ut.readLine2List(predPath, gtFilename)
	outcome = evaluate(groundTruth, predicted)
	return outcome
Пример #21
0
def getGoogleUsersParellel():
    """Fetch pending user ids in worker processes and record the results.

    Pending ids are the seen ids minus the visited and error ids.  They are
    handed to ``worker_p`` in rounds: three processes with 100 ids each while
    more than one full round remains, otherwise one process with the rest.
    One queue item is read per dispatched id and concatenated onto the round's
    result list.  Every returned user dictionary is then written to the
    sn/profile/relationship/recorded/visited/error files and the graph is
    updated.
    """
    from contextlib import ExitStack  # local import: manages all the writers

    ids_visited = ut.readLine2List(snFolder, idsVisitedFileName)
    ids_saw = ut.readLine2List(snFolder, idsSawFileName)
    ids_error = ut.readLine2List(snFolder, idsErrorFileName)
    nextids = list(set(ids_saw) - set(ids_visited) - set(ids_error))

    # multiprocess to get the user info
    procNum = 3
    batchNum = 100
    roundNum = procNum * batchNum

    # ExitStack closes (and thereby flushes) every writer on any exit path;
    # the handles used to be left open for the lifetime of the process.
    with ExitStack() as stack:
        ids_error_writer = stack.enter_context(
            open(snFolder + idsErrorFileName, "a"))
        ids_visited_writer = stack.enter_context(
            open(snFolder + idsVisitedFileName, 'a', encoding="utf8"))
        ids_saw_writer = stack.enter_context(
            open(snFolder + idsSawFileName, 'a', encoding="utf8"))
        ids_recorded_writer = stack.enter_context(
            open(snFolder + idsRecordedFileName, 'a', encoding="utf8"))

        sn_writer = stack.enter_context(
            open(path + "sn_file", 'a', encoding="utf8"))
        profile_writer = stack.enter_context(
            open(snFolder + "profile_file", 'a', encoding="utf8"))
        rela_writer = stack.enter_context(
            open(snFolder + "relationship_file", 'a', encoding="utf8"))

        # initialize graph
        if len(ids_saw) == 0:
            ids_saw.append(root)
        g = initGraph(ids_saw, ids_visited)
        index = 0

        while index < len(nextids):
            result = list()
            q = mp.Queue()

            if index + roundNum < len(nextids):
                procs = list()
                for i in range(procNum):
                    batchids = nextids[index + i * batchNum:index +
                                       ((i + 1) * batchNum)]
                    p = mp.Process(target=worker_p, args=(batchids, q))
                    p.start()
                    procs.append(p)
                # Drain the queue before joining the workers.
                for i in range(roundNum):
                    result += q.get()
                for proc in procs:
                    proc.join()
                dispatched = roundNum
            else:
                batchids = nextids[index:]
                p = mp.Process(target=worker_p, args=(batchids, q))
                p.start()
                for i in range(len(batchids)):
                    result += q.get()
                p.join()
                dispatched = len(batchids)

            # process back data
            for userData in result:
                # dictionary: {id: uid, status: false or true, infos: infos, friends: friends, friend_bool: true, sns: sns, sn_bool: true false}
                uid = userData["id"]
                infos = userData["infos"]
                friends = userData["friends"]
                sns = userData["sns"]
                sn_bool = userData["sn_bool"]
                friend_bool = userData["friend_bool"]
                status = userData["status"]

                # NOTE(review): Graph.node was removed in networkx 2.4; this
                # needs g.nodes there - confirm the pinned version.
                if g.node[uid]["status"] == 1:
                    # already in graph
                    continue
                # `== False` kept on purpose: other falsy values (e.g. None)
                # are still treated as parsed, exactly as before.
                if status == False:
                    ids_error_writer.write(uid + "\n")
                    continue
                # new user
                if infos is not None:
                    sn_writer.write(uid + ',' + ','.join(sns) + '\n')
                    profile_writer.write(uid + ',\t' + ',\t'.join(infos) +
                                         '\n')
                    rela_writer.write(uid + ' ' + ','.join(friends) + '\n')
                    # "<uid>,<sn flag>,<friend flag>" with flags as 1/0
                    ids_recorded_writer.write(
                        uid + "," + ("1" if sn_bool else "0") + "," +
                        ("1" if friend_bool else "0") + "\n")
                    # nextids is passed along, so it may grow during a round
                    addFriend(g, friends, ids_saw, ids_saw_writer, nextids)
                    g.node[uid]["status"] = 1
                    ids_visited.append(uid)
                ids_visited_writer.write(uid + "\n")
            ut.removeWinSpace()
            # Advance by what was actually dispatched.  Adding a full round
            # after a short final batch skipped ids appended during that round.
            index = index + dispatched
Пример #22
0
def evalMnaConstraint(filename="mna_constraint_1558.txt"):
	"""Evaluate the MNA-constraint predictions stored in ``filename``.

	Args:
		filename: prediction file under ``predPath``.

	Returns:
		The value of ``evaluate(ground_truth, predictions)``.
	"""
	print("Evaluation: MNA Constraint")
	predicted = ut.readLine2List(predPath, filename)
	groundTruth = ut.readLine2List(predPath, gtFilename)
	outcome = evaluate(groundTruth, predicted)
	return outcome
Пример #23
0
def evalRankingConstraint(filename="ranking_constraint_1558.txt"):
	"""Evaluate the ranking-constraint predictions stored in ``filename``.

	Args:
		filename: prediction file under ``predPath``.

	Returns:
		The value of ``evaluate(ground_truth, predictions)``.
	"""
	print("Evaluation: Ranking constraint")
	predicted = ut.readLine2List(predPath, filename)
	groundTruth = ut.readLine2List(predPath, gtFilename)
	outcome = evaluate(groundTruth, predicted)
	return outcome