class MeetupWriter(object):
    """Read MUG data from the Meetup API and write it to MongoDB.

    Data is fetched through a ``MeetupAPI`` instance and stored in the
    collections exposed by the ``mdb`` object passed to the constructor
    (groups, members, attendees, past events and upcoming events).

    Every document is passed through a processing function before it is
    stored; all the ``write_*`` helpers use ``_addTimestamp`` which stamps
    the document with the current time and the batch ID of this run.
    """

    # Number of documents write() buffers before each insert_many() call.
    INSERT_SIZE = 1000

    # Number of member documents update_members() buffers per insert_many().
    MEMBERS_INSERT_SIZE = 500

    def __init__(self, apikey, batch_ID, mdb, reshape=True, unordered=True):
        """Set up the API client and the target collections.

        :param apikey: key handed to ``MeetupAPI``
        :param batch_ID: identifier stamped on every document written
        :param mdb: object providing the ``*Collection()`` accessors
        :param reshape: forwarded to ``MeetupAPI``
        :param unordered: stored on the instance; not used by this class
        """
        self._mdb = mdb
        self._meetup_api = MeetupAPI(apikey, reshape=reshape)
        self._batch_ID = batch_ID
        self._groups = self._mdb.groupsCollection()
        self._members = self._mdb.membersCollection()
        self._attendees = self._mdb.attendeesCollection()
        self._pastEvents = self._mdb.pastEventsCollection()
        self._upcomingEvents = self._mdb.upcomingEventsCollection()
        self._unordered = unordered
        self._members_set = set()
        self._logger = logging.getLogger(__programName__)

    def _addTimestamp(self, doc):
        """Stamp ``doc`` in place with ``timestamp`` and ``batchID``.

        :param doc: the document (dict) to stamp
        :return: the same ``doc`` object, now carrying both fields
        :raises ValueError: if ``doc`` already has either field
        """
        for field in ("timestamp", "batchID"):
            if field in doc:
                raise ValueError(
                    "cannot add %s, '%s' field already exists"
                    % (field, field))

        # Naive UTC timestamp, kept as-is so stored values do not change.
        doc["timestamp"] = datetime.datetime.utcnow()
        doc["batchID"] = self._batch_ID
        return doc

    def update_members(self, retrievalGenerator, processFunc):
        """Insert each member at most once for the current batch.

        A member may belong to several groups and therefore be produced
        more than once by ``retrievalGenerator``.  A member is skipped when
        a document with the same ``id`` and this run's ``batchID`` is
        already stored, or is already waiting in the unflushed buffer.

        :param retrievalGenerator: yields ``(url, member_doc)`` pairs
        :param processFunc: takes a member doc and returns the doc to store
        """
        pending = []
        # ids of buffered docs: the find_one() lookup below cannot see
        # documents that have not been flushed to the collection yet.
        pending_ids = set()

        for _url, member in retrievalGenerator:
            member_id = member["id"]
            if member_id in pending_ids:
                continue
            already_stored = self._members.find_one(
                {"batchID": self._batch_ID, "id": member_id})
            if already_stored:
                continue

            pending_ids.add(member_id)
            pending.append(processFunc(member))
            if len(pending) >= self.MEMBERS_INSERT_SIZE:
                self._members.insert_many(pending)
                pending = []
                pending_ids = set()

        if pending:
            self._members.insert_many(pending)

    def write(self, collection, retrievalGenerator, processFunc):
        """Transform documents and insert them into ``collection`` in batches.

        Each document produced by ``retrievalGenerator`` is passed through
        ``processFunc``; the results are buffered and flushed with
        ``insert_many`` every ``INSERT_SIZE`` documents, with a final flush
        for any remainder.

        :param collection: the collection to write to
        :param retrievalGenerator: yields ``(url, doc)`` pairs
        :param processFunc: takes a doc and returns the doc to store
        :return: the number of docs written
        """
        written = 0
        batch = []
        for _url, doc in retrievalGenerator:
            batch.append(processFunc(doc))
            if len(batch) == self.INSERT_SIZE:
                collection.insert_many(batch)
                written += len(batch)
                batch = []

        if batch:
            collection.insert_many(batch)
            written += len(batch)
        return written

    def write_Attendees(self, group):
        """Fetch the attendees for ``group`` and store them."""
        attendees = self._meetup_api.get_attendees(group)
        self.write(self._attendees, attendees, self._addTimestamp)

    def write_nopro_groups(self, mug_list):
        """Fetch the groups named in ``mug_list`` and store them."""
        groups = self._meetup_api.get_groups_by_url(mug_list)
        self.write(self._groups, groups, self._addTimestamp)

    def select_groups(self, groups, urls):
        """Yield only the ``(url, group)`` pairs whose urlname is in ``urls``.

        :param groups: iterable of ``(url, group_doc)`` pairs
        :param urls: container of urlnames to keep
        """
        for url, group in groups:
            if group["urlname"] in urls:
                yield url, group

    def write_groups(self, urls):
        """Fetch and store group info for each urlname.

        The old pro API has been disabled, so pro and non-pro groups are
        both fetched with the same ``get_groups_by_url`` call.

        :param urls: list of urlnames to get group info for
        :return: number of groups written
        """
        groups = self._meetup_api.get_groups_by_url(urls)
        return self.write(self._groups, groups, self._addTimestamp)

    def write_PastEvents(self, url_name):
        """Fetch the past events of ``url_name`` and store them."""
        pastEvents = self._meetup_api.get_past_events(url_name)
        self.write(self._pastEvents, pastEvents, self._addTimestamp)

    def write_UpcomingEvents(self, url_name):
        """Fetch the upcoming events of ``url_name`` and store them."""
        upcomingEvents = self._meetup_api.get_upcoming_events(url_name)
        self.write(self._upcomingEvents, upcomingEvents, self._addTimestamp)

    def write_members(self, urls):
        """Fetch the members of the groups in ``urls`` and store each once."""
        members = self._meetup_api.get_members(urls)
        self.update_members(members, self._addTimestamp)

    def capture_snapshot(self, url_name, admin_arg, phases):
        """Run the requested capture phases for one group.

        Recognised phases are ``"pastevents"``, ``"upcomingevents"`` and
        ``"attendees"``; the latter is only executed when ``admin_arg`` is
        truthy.  Unknown phases are logged and skipped.

        :param url_name: urlname of the group to capture
        :param admin_arg: truthy when the caller asked for admin-only data
        :param phases: iterable of phase names, processed in order
        :raises HTTPError: logged, then re-raised to the caller
        """
        try:
            for phase in phases:
                if phase == "pastevents":
                    self._logger.info(
                        "process past events for : '%s'", url_name)
                    self.write_PastEvents(url_name)
                elif phase == "upcomingevents":
                    self._logger.info(
                        "process upcoming events for : '%s'", url_name)
                    self.write_UpcomingEvents(url_name)
                elif phase == "attendees":
                    if admin_arg:
                        self._logger.info(
                            "process attendees : '%s'", url_name)
                        self.write_Attendees(url_name)
                    else:
                        self._logger.warning(
                            "You have not specified the admin arg")
                        self._logger.warning(
                            "You must be a meetup admin user to request attendees")
                        self._logger.warning("Ignoring phase 'attendees'")
                else:
                    # Logger.warn is a deprecated alias; use warning().
                    self._logger.warning(
                        "ignoring phase '%s': not a valid execution phase",
                        phase)
        except HTTPError as e:
            self._logger.critical("Stopped processing: %s", e)
            raise
class MeetupWriter(object):
    """Read MUG data from the Meetup API and write it to MongoDB.

    Data is fetched through a ``MeetupAPI`` instance and written to the
    collections exposed by ``mdb``.  Supports both the pro and the non-pro
    APIs.
    """

    def __init__(self, audit, mdb, urls, apikey=None, unordered=True):
        """Set up the API client and the target collections.

        :param audit: object whose ``addTimestamp`` stamps documents
        :param mdb: object providing the ``*Collection()`` accessors
        :param urls: group urlnames used by the non-pro code paths
        :param apikey: Meetup API key; when ``None`` it is obtained from
            ``get_meetup_key()`` at call time
        :param unordered: forwarded to ``BatchWriter`` by ``process``
        """
        # Resolve the default lazily: a default of ``get_meetup_key()`` in
        # the signature would run once, when the class is defined.
        if apikey is None:
            apikey = get_meetup_key()

        self._mdb = mdb
        self._meetup_api = MeetupAPI(apikey)
        self._audit = audit
        self._groups = self._mdb.groupsCollection()
        self._members = self._mdb.membersCollection()
        self._attendees = self._mdb.attendeesCollection()
        self._pastEvents = self._mdb.pastEventsCollection()
        self._upcomingEvents = self._mdb.upcomingEventsCollection()
        self._mugs = []
        self._unordered = unordered
        self._urls = urls

    def process(self, collection, retrievalGenerator, processFunc,
                newFieldName):
        """Feed every retrieved document to a ``BatchWriter`` for ``collection``.

        :param collection: the collection handed to ``BatchWriter``
        :param retrievalGenerator: iterable producing the source documents
        :param processFunc: transformation handed to ``BatchWriter``
        :param newFieldName: field name handed to ``BatchWriter``
        """
        # NOTE(review): the "unordered" flag is passed as ``orderedWrites``;
        # this looks inverted - confirm against BatchWriter before changing.
        bw = BatchWriter(collection, processFunc, newFieldName,
                         orderedWrites=self._unordered)
        writer = bw.bulkWrite(writeLimit=1)
        for doc in retrievalGenerator:
            writer.send(doc)

    def processAttendees(self, group):
        """Fetch the attendees of ``group``, merge events and store them."""
        attendees = self._meetup_api.get_attendees(group)
        merged = mergeEvents(attendees)
        self.process(self._attendees, merged,
                     self._audit.addTimestamp, "info")

    def processGroup(self, url_name, groupName="group"):
        """Fetch one group, stamp it, insert it and return the stored doc.

        :param url_name: urlname of the group to fetch
        :param groupName: name passed to ``addTimestamp``
        :return: the document that was inserted
        """
        group = self._meetup_api.get_group(url_name)
        newDoc = self._audit.addTimestamp(groupName, group)
        self._groups.insert_one(newDoc)
        return newDoc

    def updateGroup(self, groupName, doc):
        """Record the group's urlname and return the stamped document.

        :param groupName: name passed to ``addTimestamp``
        :param doc: group document; must contain ``"urlname"``
        :return: the result of ``addTimestamp(groupName, doc)``
        """
        self._mugs.append(doc["urlname"])
        return self._audit.addTimestamp(groupName, doc)

    def processGroups(self, nopro):
        """Store all groups, from the configured urls or from the pro API.

        :param nopro: truthy to fetch the groups named in ``urls``;
            falsy to use ``get_pro_groups``
        """
        if nopro:
            groups = self.get_groups()
        else:
            groups = self._meetup_api.get_pro_groups()
        self.process(self._groups, groups, self.updateGroup, "group")

    def get_groups(self):
        """Yield the group document for each configured urlname."""
        for url in self._urls:
            yield self._meetup_api.get_group(url)

    def processPastEvents(self, url_name):
        """Fetch the past events of ``url_name`` and store them."""
        pastEvents = self._meetup_api.get_past_events(url_name)
        self.process(self._pastEvents, pastEvents,
                     self._audit.addTimestamp, "event")

    def processUpcomingEvents(self, url_name):
        """Fetch the upcoming events of ``url_name`` and store them."""
        upcomingEvents = self._meetup_api.get_upcoming_events(url_name)
        self.process(self._upcomingEvents, upcomingEvents,
                     self._audit.addTimestamp, "event")

    def processMembers(self, nopro=True):
        """Store all members, from the configured urls or from the pro API.

        :param nopro: truthy to fetch members of the groups named in
            ``urls``; falsy to use ``get_pro_members``
        """
        if nopro:
            members = self.get_members()
        else:
            members = self._meetup_api.get_pro_members()
        self.process(self._members, members,
                     self._audit.addTimestamp, "member")

    def get_members(self):
        """Yield every member of every configured group, group by group."""
        for url in self._urls:
            for member in self._meetup_api.get_members(url):
                yield member

    def mug_list(self):
        """Return the urlnames recorded by ``updateGroup`` so far."""
        return self._mugs

    def capture_snapshot(self, url_name, admin_arg, phases):
        """Run the requested capture phases for one group.

        Recognised phases are ``"pastevents"``, ``"upcomingevents"`` and
        ``"attendees"``; the latter is only executed when ``admin_arg`` is
        truthy.  Unknown phases are logged and skipped.

        :param url_name: urlname of the group to capture
        :param admin_arg: truthy when the caller asked for admin-only data
        :param phases: iterable of phase names, processed in order
        :raises HTTPError: logged, then re-raised to the caller
        """
        try:
            for phase in phases:
                if phase == "pastevents":
                    logging.info("process past events for : '%s'", url_name)
                    self.processPastEvents(url_name)
                elif phase == "upcomingevents":
                    logging.info(
                        "process upcoming events for : '%s'", url_name)
                    self.processUpcomingEvents(url_name)
                elif phase == "attendees":
                    if admin_arg:
                        logging.info("process attendees : '%s'", url_name)
                        self.processAttendees(url_name)
                    else:
                        logging.warning(
                            "You have not specified the admin arg")
                        logging.warning(
                            "You must be a meetup admin user to request attendees")
                        logging.warning("Ignoring phase 'attendees'")
                else:
                    logging.warning(
                        "ignoring phase '%s': not a valid execution phase",
                        phase)
        # "except X as e" is valid on Python 2.6+ and Python 3; the old
        # "except X, e" form is a syntax error on Python 3.
        except HTTPError as e:
            logging.fatal("Stopped processing: %s", e)
            raise