class CoorCrawler:
    INFTY = 99999
    filename = "prev_loc.txt"
    googleGeo = None
    dataStore = None
    loc = None

    def __init__(self):
        # resume from the location checkpointed in prev_loc.txt
        f = open(self.filename, 'r')
        self.loc = ''.join(f.readlines())
        f.close()
        self.googleGeo = GoogleGeo()
        self.dataStore = MySQLDataStore()

    def get_address(self):
        #cnt = self.dataStore.select_user_count()
        #print ("count: ", cnt)
        while True:
            # checkpoint the current location so an interrupted crawl can resume
            f = open(self.filename, 'w')
            f.write(self.loc)
            f.close()
            self.loc = self.dataStore.select_user_location_offset(self.loc)
            print
            if not self.loc:
                print ("in not loc")
                print ("done with current locations")
                break
            # skip locations that already have an address row
            tmp = self.dataStore.select_addr_location(self.loc)
            if tmp:
                print ("in address", tmp)
                continue
            res = self.googleGeo.get_coordination(self.loc)
            if res:
                (lati, longi, formatted, types) = res
            else:
                # geocoding failed: store sentinel coordinates
                lati = -self.INFTY
                longi = -self.INFTY
                formatted = None
                types = "None"
            print (self.loc, lati, longi, formatted, types)
            self.dataStore.insert_address(self.loc, lati, longi, formatted, types)
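

# Usage sketch (illustrative, not from the original source): geocode every
# distinct user location stored in MySQL, resuming from prev_loc.txt. Assumes
# prev_loc.txt exists (it may be empty) and that GoogleGeo and MySQLDataStore
# are importable from this project.
def run_coor_crawler_example():
    crawler = CoorCrawler()
    crawler.get_address()  # loops until select_user_location_offset() returns nothing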
import json


class TwitterFollowerCrawler:
    rateLimit = None
    mySQLDataStore = None
    userCrawler = None

    def __init__(self):
        self.rateLimit = RateLimit()
        self.mySQLDataStore = MySQLDataStore()
        self.userCrawler = TwitterUserCrawler()

    def remove_duplication(self, followerIDList):
        # keep only the follower ids that are not yet in the users table
        res = []
        for id in followerIDList:
            if not self.mySQLDataStore.check_user_by_id(id):
                res.append(id)
        print "*******************removed %d duplicate users" % (len(followerIDList) - len(res))
        return res

    def handle_one_followee(self, screenName):
        # get id from users table for the screenName
        id = self.mySQLDataStore.get_one_id(screenName)
        # get current offset in tmp_offset table
        curOffset = self.mySQLDataStore.select_cur_offset(id)
        # get max offset in follower_id table
        maxOffset = self.mySQLDataStore.select_max_offset(id)
        if maxOffset <= 0:
            print "User %s has not started yet!" % (screenName)
            return
        # if curOffset < maxOffset: pull data from curOffset
        print "before while"
        while curOffset < maxOffset:
            print "In while"
            curOffset += 1
            strFollowers = self.mySQLDataStore.select_follower_piece(id, curOffset)
            if not strFollowers:
                print "Piece %d %d is missing!" % (id, curOffset)
                return
            listFollowers = json.loads(strFollowers)
            # listFollowers = self.remove_duplication(listFollowers)
            print ("++++++++++++++", screenName, curOffset, maxOffset, len(listFollowers))
            self.userCrawler.get_user_info(listFollowers, parameter='user_id')
            self.mySQLDataStore.update_cur_offset(id, curOffset)

    def handle_all_followee(self, screenNameArr):
        for screenName in screenNameArr:
            self.handle_one_followee(screenName)
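

# Usage sketch (illustrative, not from the original source): for each seed
# account whose follower-id pages are already stored in the follower_id table,
# walk the stored pieces and hydrate the follower profiles. The screen names
# below are placeholders, not accounts from the original project.
def run_follower_crawler_example():
    crawler = TwitterFollowerCrawler()
    crawler.handle_all_followee(["exampleUserA", "exampleUserB"])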
import json
import os
import re
import time


class Crawler:
    logFile = None
    db = None
    urlGetFollowerID = "https://api.twitter.com/1/followers/ids.json?cursor=%d&screen_name=%s"
    urlCheckLimit = "https://api.twitter.com/1/account/rate_limit_status.json"
    # for 1 user: id, screen name, name
    urlSingleUserInfo = "https://api.twitter.com/1/users/show.json?screen_name=%s&include_entities=true"
    # up to 100 users: returns a list, data[0]['name'] include_entities = true?
    urlUserInfo = "https://api.twitter.com/1/users/lookup.json?include_entities=true&screen_name=%s"
    urlHandler = None

    def __init__(self, logName):
        self.logFile = open(logName, "w")
        self.db = MySQLDataStore()
        self.urlHandler = URLHandler()

    """
    def open_url_followerID(self, url, screenName):
        count = 1
        while (count):
            if (count == 10):
                self.logFile.write("URL exceptions occur in %s: %s\n" % (screenName, url))
                return None
            try:
                res = urllib2.urlopen(url)
                return res
            except urllib2.HTTPError, e:
                self.logFile.write(str(e.strerror, e.message))
                count = count + 1
                time.sleep(5)
            except urllib2.URLError, e:
                self.logFile.write(e.reason)
                #self.logFile.write(e.strerror)
                count = count + 1
                time.sleep(5)
    """

    def check_limit(self):
        # returns (remaining requests, epoch second at which the limit resets)
        url = self.urlCheckLimit
        res = self.urlHandler.open_url(url)
        data = json.loads(res.read())
        limit = data['remaining_hits']
        wakeup = data['reset_time_in_seconds']
        return (limit, wakeup)

    """
    def get_user_info(self, follower_sname_list):
        # construct sname-list separated by ,
        url = self.urlUserInfo
        # check rate limit
        res = self.open_url(url)
        return json.loads(res.read())

    def get_follower_location(self, follower_sname_list):
        locations = []
        data = self.get_user_info(follower_sname_list)
        for i in range(len(follower_sname_list)):
            locations.append(data[i]['location'])
        return locations
    """

    def create_file(self, screenName, i):
        if not os.path.isdir("./" + screenName + "/"):
            os.mkdir("./" + screenName + "/")
        outputFile = open("./%s/followerID%d.txt" % (screenName, i), "w")
        return outputFile

    def get_screen_name(self, in_filename, out_filename):
        inputFile = open(in_filename, "r")
        outputFile = open(out_filename, "w")
        for line in inputFile:
            name = re.split(r'[()]', line)
            outputFile.write(name[1] + '\n')

    def get_follower_id(self, screenName, userID, offset, cursor):
        screenName = screenName.split('\n')[0]  # works for sample.txt
        while cursor != 0:
            offset += 1
            (limit, wakeup) = self.check_limit()
            while (limit == 0):
                # sleep until the rate-limit window resets, then re-check
                interval = wakeup - time.time()
                time.sleep(interval)
                time.sleep(30)
                (limit, wakeup) = self.check_limit()
            (pCursor, nCursor, ids) = self.get_one_page_id(screenName, cursor)
            print (screenName, userID, offset, pCursor, nCursor)
            if ids == 0 and pCursor == 0 and nCursor == 0:
                return
            self.db.store_follower_piece(userID, offset, pCursor, nCursor, ids)
            cursor = nCursor

    def get_one_page_id(self, screenName, cursor):
        print ("Screen Name", screenName, "cursor", cursor)
        url = self.urlGetFollowerID % (cursor, screenName)
        print url
        res = self.urlHandler.open_url(url)
        if res is None:
            print "Fatal Errors: follower id page return None!!!"
            self.logFile.write("Fatal Errors in requesting %s: %s\n" % (screenName, url))
            return (0, 0, 0)
        strData = res.read()
        data = json.loads(strData)
        if 'errors' in data.keys():
            print "Fatal Errors: follower id page return None!!!"
            self.logFile.write("Fatal Errors in requesting %s: %s\n" % (screenName, url))
            return (0, 0, 0)
        ids = data['ids']
        # the cursor is int64, I have used big int in the follower_id table -- Shen Li
        nCursor = data['next_cursor']
        pCursor = data['previous_cursor']
        return (pCursor, nCursor, ids)

    def get_all_follower_id(self, filename):
        inputFile = open(filename, "r")
        for line in inputFile:
            screenName = line.split('\n')[0]
            userID = self.db.get_one_id(screenName)
            if not userID:
                continue
            (offset, cursor) = self.db.get_next_cursor(userID)
            self.get_follower_id(screenName, userID, offset, cursor)
        inputFile.close()

    def clean_up(self):
        self.logFile.close()
        self.db.close()
import json
import time


class TwitterUserCrawler:
    parameters = {'user_id': 'user_id', 'screen_name': 'screen_name'}
    urlUserLookup = "https://api.twitter.com/1/users/lookup.json?%s=%s"
    dataStore = None
    rateLimit = None
    urlHandler = None

    def __init__(self):
        self.dataStore = MySQLDataStore()
        self.rateLimit = RateLimit()
        self.urlHandler = URLHandler()

    def get_user_info(self, screenNameArr, parameter='screen_name'):
        cur = 0
        next = 100
        print ("get_user_info: ", parameter)
        curList = []
        cnt = 0
        for name in screenNameArr:
            # skip user ids that are already in the users table
            if 'user_id' == parameter and not self.dataStore.check_user_by_id(name):
                curList.append(name)
            else:
                cnt += 1
            if len(curList) >= 100:
                res = self.get_100_user_info(curList, parameter)
                if res:
                    self.store_users(res)
                else:
                    # log the failed batch so it can be replayed later
                    f = open("log/%f" % (time.time()), "w")
                    f.write(str(curList))
                    f.write("\n")
                    f.close()
                curList = []
        # flush the final partial batch (fewer than 100 names)
        if curList:
            res = self.get_100_user_info(curList, parameter)
            if res:
                self.store_users(res)
        print ("removed", cnt, "users")
        """
        while next < len(screenNameArr):
            res = self.get_100_user_info(screenNameArr[cur:next], parameter)
            if res:
                self.store_users(res)
            else:
                f = open("log/%f" % (time.time()), "w")
                f.write(str(screenNameArr[cur:next]))
                f.write("\n")
                f.close()
            cur = next
            next += 100
        if cur < len(screenNameArr):
            res = self.get_100_user_info(screenNameArr[cur:len(screenNameArr)], parameter)
            if res:
                self.store_users(res)
        """

    def store_users(self, dictData):
        for screenName in dictData.keys():
            id = dictData[screenName]['id']
            loc = dictData[screenName]['location']
            followerNum = dictData[screenName]['followerNum']
            followeeNum = dictData[screenName]['followeeNum']
            statusNum = dictData[screenName]['statusNum']
            favorNum = dictData[screenName]['favorNum']
            createdAt = dictData[screenName]['createdAt']
            verified = dictData[screenName]['verified']
            #self.dataStore.store_user(id, screenName, folNum, loc)
            self.dataStore.store_user(id, screenName, followerNum, followeeNum,
                                      statusNum, favorNum, verified, createdAt, loc)

    def dump_resp(self, url):
        retry = True
        while retry:
            try:
                retry = False
                rawData = self.urlHandler.open_url(url)
                if not rawData:
                    return
                data = json.loads(rawData.read())
                return data
            except ValueError, e:
                # malformed JSON in the response: retry the request
                print ("ValueError: ", e.message)
                retry = True
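

# Usage sketch (illustrative, not from the original source): look up profile
# metadata for a list of user ids in batches of up to 100 and store each
# profile via MySQLDataStore. The ids below are placeholders; get_100_user_info
# is assumed to be defined on this class (it is not shown in this excerpt).
def run_user_crawler_example():
    crawler = TwitterUserCrawler()
    crawler.get_user_info(["12345", "67890"], parameter='user_id')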