class TwitterTweetCrawler:
    """Fetches user timelines from the Twitter v1 API and stores tweets via MySQLDataStore."""

    # Characters treated as invalid inside tweet text.
    BAD_CHAR_SET = ['\n']
    # Largest possible tweet id (tweet ids are signed 64-bit integers).
    MAX_TWEET_ID = (2 ** 63) - 1
    # Tweets requested per timeline page.
    tweetCount = 200
    urlGetTweet = "https://api.twitter.com/1/statuses/user_timeline.json?user_id=%d&max_id=%d&count=%d"
    # Cap on JSON-decode retries so a persistently malformed response cannot loop forever.
    MAX_JSON_RETRIES = 5
    urlHandler = None
    mySQLDataStore = None

    def __init__(self):
        # URLHandler / MySQLDataStore are project-level collaborators.
        self.urlHandler = URLHandler()
        self.mySQLDataStore = MySQLDataStore()

    def dump_resp(self, url):
        """Fetch *url* and decode the response body as JSON.

        Returns the decoded object, or None when the fetch fails or the body
        is still undecodable after MAX_JSON_RETRIES attempts.
        """
        for _ in range(self.MAX_JSON_RETRIES):
            rawData = self.urlHandler.open_url(url)
            if not rawData:
                # BUG FIX: open_url can return None (the TwitterUserCrawler twin
                # of this method guards for it); previously rawData.read() crashed.
                return None
            try:
                return json.loads(rawData.read())
            except ValueError as e:
                # Truncated / garbled body: report and retry.
                print("ValueError: ", str(e))
        # BUG FIX: bounded retries -- the old loop could retry forever.
        return None
def __init__(self, logName):
    """Load the seed-id file into idSet, then open the crawl log and backends.

    Reads self.idFile (one integer id per line) into self.idSet, opens
    *logName* for writing, and constructs the ApolloSQL / URLHandler helpers.
    """
    # One numeric id per line in the seed file.
    with open(self.idFile, "r") as seed_file:
        for raw_line in seed_file:
            self.idSet.add(int(raw_line))
    self.logFile = open(logName, "w")
    self.db = ApolloSQL()
    self.urlHandler = URLHandler()
def __init__(self, logName):
    """Open the crawl log (truncating any previous run) and build the backends."""
    self.logFile = open(logName, "w")
    # MySQLDataStore / URLHandler are project-level collaborators.
    self.db = MySQLDataStore()
    self.urlHandler = URLHandler()
class Crawler:
    """Crawls Twitter follower ids (v1 API, keyed by screen name) and persists them via MySQLDataStore."""

    logFile = None
    db = None
    urlGetFollowerID = "https://api.twitter.com/1/followers/ids.json?cursor=%d&screen_name=%s"
    urlCheckLimit = "https://api.twitter.com/1/account/rate_limit_status.json"
    # for 1 user: id, screen name, name
    urlSingleUserInfo = "https://api.twitter.com/1/users/show.json?screen_name=%s&include_entities=true"
    # up to 100 users: returns a list, data[0]['name'] include_entities = true?
    urlUserInfo = "https://api.twitter.com/1/users/lookup.json?include_entities=true&screen_name=%s"
    urlHandler = None

    def __init__(self, logName):
        self.logFile = open(logName, "w")
        self.db = MySQLDataStore()
        self.urlHandler = URLHandler()

    def check_limit(self):
        """Return (remaining_hits, reset_time_in_seconds) from the rate-limit endpoint."""
        res = self.urlHandler.open_url(self.urlCheckLimit)
        data = json.loads(res.read())
        return (data['remaining_hits'], data['reset_time_in_seconds'])

    def create_file(self, screenName, i):
        """Ensure ./<screenName>/ exists and return a writable followerID<i>.txt handle."""
        dirName = "./" + screenName + "/"
        if not os.path.isdir(dirName):
            os.mkdir(dirName)
        return open("./%s/followerID%d.txt" % (screenName, i), "w")

    def get_screen_name(self, in_filename, out_filename):
        """Extract the text between the first '(' and ')' of each input line into *out_filename*."""
        with open(in_filename, "r") as inputFile, open(out_filename, "w") as outputFile:
            for line in inputFile:
                name = re.split(r'[()]', line)
                outputFile.write(name[1] + '\n')

    def get_follower_id(self, screenName, userID, offset, cursor):
        """Page through *screenName*'s follower ids starting at *cursor*,
        honoring rate limits, storing each page via db.store_follower_piece."""
        screenName = screenName.split('\n')[0]  # works for sample.txt
        while cursor != 0:
            offset += 1
            (limit, wakeup) = self.check_limit()
            while limit == 0:
                # Sleep until the rate window resets (plus slack), then re-check.
                time.sleep(wakeup - time.time())
                time.sleep(30)
                (limit, wakeup) = self.check_limit()
            (pCursor, nCursor, ids) = self.get_one_page_id(screenName, cursor)
            print(screenName, userID, offset, pCursor, nCursor)
            if ids == 0 and pCursor == 0 and nCursor == 0:
                # get_one_page_id signals a fatal error with an all-zero triple.
                return
            self.db.store_follower_piece(userID, offset, pCursor, nCursor, ids)
            cursor = nCursor

    def get_one_page_id(self, screenName, cursor):
        """Fetch one follower-id page; return (previous_cursor, next_cursor, ids), or (0, 0, 0) on error."""
        print("Screen Name", screenName, "cursor", cursor)
        url = self.urlGetFollowerID % (cursor, screenName)
        print(url)
        res = self.urlHandler.open_url(url)
        if res is None:
            print("Fatal Errors: follower id page return None!!!")
            # BUG FIX: the message must be %-formatted into one string;
            # the old code passed a second positional arg to write().
            self.logFile.write("Fatal Errors in requesting %s: %s\n" % (screenName, url))
            return (0, 0, 0)
        data = json.loads(res.read())
        if 'errors' in data:
            print("Fatal Errors: follower id page return None!!!")
            self.logFile.write("Fatal Errors in requesting %s: %s\n" % (screenName, url))
            return (0, 0, 0)
        # the cursor is int64, I have used big int in the follower_id table -- Shen Li
        return (data['previous_cursor'], data['next_cursor'], data['ids'])

    def get_all_follower_id(self, filename):
        """Crawl follower ids for every screen name listed (one per line) in *filename*."""
        with open(filename, "r") as inputFile:
            for line in inputFile:
                screenName = line.split('\n')[0]
                userID = self.db.get_one_id(screenName)
                if not userID:
                    continue  # unknown user: nothing to crawl
                (offset, cursor) = self.db.get_next_cursor(userID)
                self.get_follower_id(screenName, userID, offset, cursor)

    def clean_up(self):
        """Close the log file and database connection."""
        self.logFile.close()
        self.db.close()
def __init__(self):
    """Wire up the datastore, rate limiter, and URL helper (all project classes)."""
    self.dataStore = MySQLDataStore()
    self.rateLimit = RateLimit()
    self.urlHandler = URLHandler()
class TwitterUserCrawler:
    """Looks up Twitter user profiles in batches of 100 and stores them via MySQLDataStore."""

    parameters = {'user_id': 'user_id', 'screen_name': 'screen_name'}
    urlUserLookup = "https://api.twitter.com/1/users/lookup.json?%s=%s"
    # Cap on JSON-decode retries so a persistently malformed response cannot loop forever.
    MAX_JSON_RETRIES = 5
    dataStore = None
    rateLimit = None
    urlHandler = None

    def __init__(self):
        self.dataStore = MySQLDataStore()
        self.rateLimit = RateLimit()
        self.urlHandler = URLHandler()

    def get_user_info(self, screenNameArr, parameter='screen_name'):
        """Fetch profiles for *screenNameArr* in batches of 100, skipping ids already stored.

        *parameter* selects the lookup key ('screen_name' or 'user_id').
        """
        print("get_user_info: ", parameter)
        batch = []
        skipped = 0
        for name in screenNameArr:
            # NOTE(review): only 'user_id' lookups are deduplicated against the
            # datastore; in 'screen_name' mode every name takes the skip branch.
            # Confirm this asymmetry is intended.
            if 'user_id' == parameter and not self.dataStore.check_user_by_id(name):
                batch.append(name)
            else:
                skipped += 1
            if len(batch) >= 100:
                self._fetch_and_store(batch, parameter)
                batch = []
        # BUG FIX: flush the final partial batch; previously any trailing
        # group of fewer than 100 users was silently dropped.
        if batch:
            self._fetch_and_store(batch, parameter)
        print("removed", skipped, "users")

    def _fetch_and_store(self, batch, parameter):
        """Fetch one batch of users; on failure, dump the failed batch to a log file."""
        res = self.get_100_user_info(batch, parameter)
        if res:
            self.store_users(res)
        else:
            # BUG FIX: log the batch that actually failed (the old code always
            # logged screenNameArr[0:100] because cur/next were never advanced).
            f = open("log/%f" % (time.time()), "w")
            f.write(str(batch))
            f.write("\n")
            f.close()

    def store_users(self, dictData):
        """Persist every user record in *dictData* (screen name -> attribute dict)."""
        for screenName, rec in dictData.items():
            self.dataStore.store_user(rec['id'], screenName,
                                      rec['followerNum'], rec['followeeNum'],
                                      rec['statusNum'], rec['favorNum'],
                                      rec['verified'], rec['createdAt'],
                                      rec['location'])

    def dump_resp(self, url):
        """Fetch *url* and decode the response body as JSON.

        Returns the decoded object, or None when the fetch fails or the body
        is still undecodable after MAX_JSON_RETRIES attempts.
        """
        for _ in range(self.MAX_JSON_RETRIES):
            rawData = self.urlHandler.open_url(url)
            if not rawData:
                return None
            try:
                return json.loads(rawData.read())
            except ValueError as e:
                # Truncated / garbled body: report and retry.
                print("ValueError: ", str(e))
        # BUG FIX: bounded retries -- the old loop could retry forever.
        return None
def __init__(self):
    """Construct the URL helper and datastore (both project classes)."""
    self.urlHandler = URLHandler()
    self.mySQLDataStore = MySQLDataStore()
class Crawler:
    """Crawls follower ids (v1 API, keyed by numeric user id), keeping only
    followers that appear in a fixed seed-id set loaded from idFile."""

    # File of seed user ids, one per line; followers outside this set are discarded.
    idFile = "/home/tarek/shenli3/project/Centaur/src/crawler/gastweets-sources.txt"
    logFile = None
    db = None
    urlGetFollowerID = "https://api.twitter.com/1/followers/ids.json?cursor=%d&id=%s"
    urlCheckLimit = "https://api.twitter.com/1/account/rate_limit_status.json"
    # for 1 user: id, screen name, name
    urlSingleUserInfo = "https://api.twitter.com/1/users/show.json?screen_name=%s&include_entities=true"
    # up to 100 users: returns a list, data[0]['name'] include_entities = true?
    urlUserInfo = "https://api.twitter.com/1/users/lookup.json?include_entities=true&screen_name=%s"
    urlHandler = None

    def __init__(self, logName):
        # BUG FIX: idSet was a class-level sets.Set() (deprecated module, and
        # shared by every instance); use a per-instance builtin set instead.
        self.idSet = set()
        with open(self.idFile, "r") as fin:
            for line in fin:
                self.idSet.add(int(line))
        self.logFile = open(logName, "w")
        self.db = ApolloSQL()
        self.urlHandler = URLHandler()

    def check_limit(self):
        """Return (remaining_hits, reset_time_in_seconds) from the rate-limit endpoint."""
        res = self.urlHandler.open_url(self.urlCheckLimit)
        data = json.loads(res.read())
        return (data['remaining_hits'], data['reset_time_in_seconds'])

    def create_file(self, screenName, i):
        """Ensure ./<screenName>/ exists and return a writable followerID<i>.txt handle."""
        dirName = "./" + screenName + "/"
        if not os.path.isdir(dirName):
            os.mkdir(dirName)
        return open("./%s/followerID%d.txt" % (screenName, i), "w")

    def get_link(self, followee_id):
        """Resume crawling *followee_id*'s followers from the cursor stored in the db,
        keeping only follower ids present in idSet; pages go to db.store_page_link."""
        followee_id = int(followee_id)
        cursor = self.db.select_ncursor(followee_id)
        while cursor != 0:
            (limit, wakeup) = self.check_limit()
            while limit == 0:
                # Sleep until the rate window resets (plus slack), then re-check.
                time.sleep(wakeup - time.time())
                time.sleep(30)
                (limit, wakeup) = self.check_limit()
            (pCursor, nCursor, ids) = self.get_one_page_id(followee_id, cursor)
            print(followee_id, pCursor, nCursor)
            if ids == 0 and pCursor == 0 and nCursor == 0:
                # Fatal fetch error: record an empty terminal page and stop.
                self.db.store_page_link(followee_id, [], 0)
                return
            # Keep only followers that belong to the seed-id set.
            newIds = [fid for fid in ids if fid in self.idSet]
            self.db.store_page_link(followee_id, newIds, nCursor)
            cursor = nCursor

    def get_follower_id(self, screenName, userID, offset, cursor):
        """Page through *screenName*'s follower ids starting at *cursor*,
        honoring rate limits, storing each page via db.store_follower_piece."""
        screenName = screenName.split('\n')[0]  # works for sample.txt
        while cursor != 0:
            offset += 1
            (limit, wakeup) = self.check_limit()
            while limit == 0:
                time.sleep(wakeup - time.time())
                time.sleep(30)
                (limit, wakeup) = self.check_limit()
            (pCursor, nCursor, ids) = self.get_one_page_id(screenName, cursor)
            print(screenName, userID, offset, pCursor, nCursor)
            if ids == 0 and pCursor == 0 and nCursor == 0:
                # get_one_page_id signals a fatal error with an all-zero triple.
                return
            self.db.store_follower_piece(userID, offset, pCursor, nCursor, ids)
            cursor = nCursor

    def get_one_page_id(self, screenName, cursor):
        """Fetch one follower-id page; return (previous_cursor, next_cursor, ids), or (0, 0, 0) on error."""
        print("Screen Name", screenName, "cursor", cursor)
        url = self.urlGetFollowerID % (cursor, screenName)
        print(url)
        res = self.urlHandler.open_url(url)
        if res is None:
            print("Fatal Errors: follower id page return None!!!")
            self.logFile.write("Fatal Errors in requesting %s: %s\n" % (screenName, url))
            return (0, 0, 0)
        data = json.loads(res.read())
        if 'errors' in data:
            print("Fatal Errors: follower id page return None!!!")
            self.logFile.write("Fatal Errors in requesting %s: %s\n" % (screenName, url))
            return (0, 0, 0)
        # the cursor is int64, I have used big int in the follower_id table -- Shen Li
        return (data['previous_cursor'], data['next_cursor'], data['ids'])

    def get_all_links(self, filename):
        """Run get_link for every followee id listed (one per line) in *filename*."""
        with open(filename, "r") as fin:
            for line in fin:
                followee_id = line.split('\n')[0]
                self.get_link(followee_id)

    def get_all_follower_id(self, filename):
        """Crawl follower ids for every screen name listed (one per line) in *filename*."""
        with open(filename, "r") as inputFile:
            for line in inputFile:
                screenName = line.split('\n')[0]
                userID = self.db.get_one_id(screenName)
                if not userID:
                    continue  # unknown user: nothing to crawl
                (offset, cursor) = self.db.get_next_cursor(userID)
                self.get_follower_id(screenName, userID, offset, cursor)

    def clean_up(self):
        """Close the log file and database connection."""
        self.logFile.close()
        self.db.close()