def test___iter___w_more(self):
    from google.cloud.datastore.query import _pb_from_query

    connection = _Connection()
    client = self._makeClient(connection)
    query = _Query(client, self._KIND, self._PROJECT, self._NAMESPACE)
    self._addQueryResults(connection, cursor=self._END, more=True)
    self._addQueryResults(connection)
    iterator = self._makeOne(query, client)

    entities = list(iterator)

    self.assertFalse(iterator._more_results)
    self.assertEqual(len(entities), 2)
    for entity in entities:
        self.assertEqual(
            entity.key.path,
            [{'kind': self._KIND, 'id': self._ID}])
    self.assertEqual(entities[1]['foo'], u'Foo')

    qpb1 = _pb_from_query(query)
    qpb2 = _pb_from_query(query)
    qpb2.start_cursor = self._END
    EXPECTED1 = {
        'project': self._PROJECT,
        'query_pb': qpb1,
        'namespace': self._NAMESPACE,
        'transaction_id': None,
    }
    EXPECTED2 = {
        'project': self._PROJECT,
        'query_pb': qpb2,
        'namespace': self._NAMESPACE,
        'transaction_id': None,
    }
    self.assertEqual(len(connection._called_with), 2)
    self.assertEqual(connection._called_with[0], EXPECTED1)
    self.assertEqual(connection._called_with[1], EXPECTED2)
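# Note: _Connection, _Query, _makeClient, _makeOne, and _addQueryResults are
# fixtures defined elsewhere in this test module. As a rough sketch of the
# contract these tests rely on (names and shapes here are assumptions, not the
# real fixture): the fake connection records every run_query call for the
# _called_with assertions and replays one canned result batch per call.

class _Connection(object):
    """Hypothetical minimal fake of the datastore connection."""

    def __init__(self):
        self._results = []       # canned result batches, one per expected call
        self._called_with = []   # kwargs of each run_query call, asserted above

    def run_query(self, **kw):
        self._called_with.append(kw)
        batch, self._results = self._results[0], self._results[1:]
        return batch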
def test___iter___w_limit(self):
    from google.cloud.datastore.query import _pb_from_query

    connection = _Connection()
    client = self._makeClient(connection)
    query = _Query(client, self._KIND, self._PROJECT, self._NAMESPACE)
    skip1 = 4
    skip2 = 9
    self._addQueryResults(
        connection, more=True, skipped_results=skip1, no_entity=True)
    self._addQueryResults(connection, more=True, skipped_results=skip2)
    self._addQueryResults(connection)
    offset = skip1 + skip2
    iterator = self._makeOne(query, client, limit=2, offset=offset)

    entities = list(iterator)

    self.assertFalse(iterator._more_results)
    self.assertEqual(len(entities), 2)
    for entity in entities:
        self.assertEqual(
            entity.key.path,
            [{'kind': self._KIND, 'id': self._ID}])

    qpb1 = _pb_from_query(query)
    qpb1.limit.value = 2
    qpb1.offset = offset

    qpb2 = _pb_from_query(query)
    qpb2.start_cursor = self._END
    qpb2.limit.value = 2
    qpb2.offset = offset - skip1

    qpb3 = _pb_from_query(query)
    qpb3.start_cursor = self._END
    qpb3.limit.value = 1

    EXPECTED1 = {
        'project': self._PROJECT,
        'query_pb': qpb1,
        'namespace': self._NAMESPACE,
        'transaction_id': None,
    }
    EXPECTED2 = {
        'project': self._PROJECT,
        'query_pb': qpb2,
        'namespace': self._NAMESPACE,
        'transaction_id': None,
    }
    EXPECTED3 = {
        'project': self._PROJECT,
        'query_pb': qpb3,
        'namespace': self._NAMESPACE,
        'transaction_id': None,
    }
    self.assertEqual(len(connection._called_with), 3)
    self.assertEqual(connection._called_with[0], EXPECTED1)
    self.assertEqual(connection._called_with[1], EXPECTED2)
    self.assertEqual(connection._called_with[2], EXPECTED3)
def delete_from_datastore(project, pipeline_options, run_locally):
    """Creates a pipeline that deletes old ranking entities from Cloud Datastore."""
    p = beam.Pipeline(options=pipeline_options)

    # Create a query to read entities from datastore.
    client = datastore.Client()
    if run_locally:
        # q.add_filter('category', '=', 'BEBOP')
        pass

    # Find the newest PRDebugAttendee, so we can delete everything older than it.
    q = client.query(kind='PRDebugAttendee')
    q.order = ['-created_date']
    results = list(q.fetch(1))
    if not results:
        logging.error('No PRDebugAttendee objects found')
        return
    newest_date = results[0]['created_date']
    logging.info('Deleting elements older than %s', newest_date)

    q1 = client.query(kind='PRDebugAttendee')
    q2 = client.query(kind='PRCityCategory')
    datastore_1 = p | 'read PRDebugAttendee from datastore' >> ReadFromDatastore(
        project, query._pb_from_query(q1), num_splits=400)
    datastore_2 = p | 'read PRCityCategory from datastore' >> ReadFromDatastore(
        project, query._pb_from_query(q2), num_splits=400)

    # Set up our map/reduce pipeline
    output = (
        (datastore_1, datastore_2)
        | beam.Flatten()
        | 'convert to entity' >> beam.Map(ConvertToEntity)
        # Find the events we want to count, and expand all the admins/attendees
        | 'find old rankings' >> beam.FlatMap(OldPRRecord, newest_date)
        # And save it all back to the database
    )
    if not run_locally:
        output | 'delete from datastore' >> beam.ParDo(DeleteFromDatastore())
    """
    (output
        | 'convert from entity' >> beam.Map(ConvertFromEntity)
        | 'write to datastore' >> WriteToDatastore(client.project)
    )
    """

    # Actually run the pipeline (all operations above are deferred).
    result = p.run()
    # Wait until completion; the main thread may access post-completion job results.
    result.wait_until_finish()
    return result
def test_next_page_w_cursors_w_more(self):
    from base64 import urlsafe_b64decode
    from base64 import urlsafe_b64encode
    from google.cloud.datastore.query import _pb_from_query

    connection = _Connection()
    client = self._makeClient(connection)
    query = _Query(client, self._KIND, self._PROJECT, self._NAMESPACE)
    self._addQueryResults(connection, cursor=self._END, more=True)
    iterator = self._makeOne(query, client)
    iterator._start_cursor = self._START
    iterator._end_cursor = self._END

    entities, more_results, cursor = iterator.next_page()

    self.assertEqual(cursor, urlsafe_b64encode(self._END))
    self.assertTrue(more_results)
    self.assertTrue(iterator._more_results)
    self.assertEqual(iterator._skipped_results, None)
    self.assertEqual(iterator._end_cursor, None)
    self.assertEqual(urlsafe_b64decode(iterator._start_cursor), self._END)
    self.assertEqual(len(entities), 1)
    self.assertEqual(
        entities[0].key.path,
        [{'kind': self._KIND, 'id': self._ID}])
    self.assertEqual(entities[0]['foo'], u'Foo')

    qpb = _pb_from_query(query)
    qpb.offset = 0
    qpb.start_cursor = urlsafe_b64decode(self._START)
    qpb.end_cursor = urlsafe_b64decode(self._END)
    EXPECTED = {
        'project': self._PROJECT,
        'query_pb': qpb,
        'namespace': self._NAMESPACE,
        'transaction_id': None,
    }
    self.assertEqual(connection._called_with, [EXPECTED])
def test_next_page_no_cursors_no_more_w_offset_and_limit(self):
    from google.cloud.datastore.query import _pb_from_query

    connection = _Connection()
    client = self._makeClient(connection)
    query = _Query(client, self._KIND, self._PROJECT, self._NAMESPACE)
    skipped_results = object()
    self._addQueryResults(
        connection, cursor=b'', skipped_results=skipped_results)
    iterator = self._makeOne(query, client, 13, 29)

    entities, more_results, cursor = iterator.next_page()

    self.assertEqual(cursor, None)
    self.assertFalse(more_results)
    self.assertFalse(iterator._more_results)
    self.assertEqual(iterator._skipped_results, skipped_results)
    self.assertEqual(len(entities), 1)
    self.assertEqual(
        entities[0].key.path,
        [{'kind': self._KIND, 'id': self._ID}])
    self.assertEqual(entities[0]['foo'], u'Foo')

    qpb = _pb_from_query(query)
    qpb.limit.value = 13
    qpb.offset = 29
    EXPECTED = {
        'project': self._PROJECT,
        'query_pb': qpb,
        'namespace': self._NAMESPACE,
        'transaction_id': None,
    }
    self.assertEqual(connection._called_with, [EXPECTED])
def test___iter___no_more(self):
    from google.cloud.datastore.query import _pb_from_query

    connection = _Connection()
    client = self._makeClient(connection)
    query = _Query(client, self._KIND, self._PROJECT, self._NAMESPACE)
    self._addQueryResults(connection)
    iterator = self._makeOne(query, client)

    entities = list(iterator)

    self.assertFalse(iterator._more_results)
    self.assertEqual(len(entities), 1)
    self.assertEqual(
        entities[0].key.path,
        [{'kind': self._KIND, 'id': self._ID}])
    self.assertEqual(entities[0]['foo'], u'Foo')

    qpb = _pb_from_query(query)
    qpb.offset = 0
    EXPECTED = {
        'project': self._PROJECT,
        'query_pb': qpb,
        'namespace': self._NAMESPACE,
        'transaction_id': None,
    }
    self.assertEqual(connection._called_with, [EXPECTED])
def _callFUT(self, query):
    from google.cloud.datastore.query import _pb_from_query

    return _pb_from_query(query)
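# _pb_from_query is the private helper under test: it converts a
# google.cloud.datastore Query into its protobuf representation, which is
# also what the Beam pipelines below hand to ReadFromDatastore. A minimal
# usage sketch, assuming a version of google-cloud-datastore where the helper
# is importable as in these tests; the kind name is illustrative only:

from google.cloud import datastore
from google.cloud.datastore.query import _pb_from_query

client = datastore.Client()
q = client.query(kind='DBEvent')
query_pb = _pb_from_query(q)

# The protobuf exposes the fields the tests assert on:
query_pb.offset = 0         # number of results to skip
query_pb.limit.value = 13   # limit is a wrapped Int32Value message
print(query_pb.kind)        # kind expressions derived from the Query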
def run_pipeline(project, pipeline_options, args):
    """Creates a pipeline that reads DBEvent entities from Cloud Datastore and writes people rankings."""
    run_locally = args.run_locally
    run_on_fraction = args.run_on_fraction
    ground_truth_events = args.ground_truth_events
    debug_attendees = args.debug_attendees
    want_top_attendees = args.want_top_attendees
    person_locations = args.person_locations

    p = beam.Pipeline(options=pipeline_options)

    # Create a query to read entities from datastore.
    client = datastore.Client()
    q = client.query(kind='DBEvent')

    if run_locally:
        q.key_filter(client.key('DBEvent', '999'), '>')
        q.key_filter(client.key('DBEvent', 'A'), '<')

    # Let's build a timestamp to save all our objects with
    timestamp = datetime.datetime.now()

    # Set up our map/reduce pipeline
    produce_attendees = (
        p
        | 'read from datastore' >> ReadFromDatastore(project, query._pb_from_query(q), num_splits=400)
        | 'convert to entity' >> beam.Map(ConvertToEntity)
        # Find the events we want to count, and expand all the admins/attendees
        | 'filter events' >> beam.FlatMap(CountableEvent, ground_truth_events, run_on_fraction)
        | 'load fb attending' >> beam.ParDo(GetEventAndAttending())
        | 'export attendees' >> beam.FlatMap(ExportPeople)
    )  # yapf: disable

    if want_top_attendees or debug_attendees:
        top_attendee_lists = (
            produce_attendees
            | 'map category -> person' >> beam.FlatMap(GroupPeopleByCategory)
            | 'group by category' >> beam.GroupByKey()
            | 'build top-people lists' >> beam.FlatMap(CountPeopleInfos)
        )  # yapf: disable

    if want_top_attendees:
        (
            top_attendee_lists
            | 'convert dict to json' >> beam.ParDo(ConvertDictToText)
            | 'write json' >> WriteToText(
                'gs://dancedeets-hrd.appspot.com/people-ranking-outputs/city-category/%s/data' % timestamp,
                file_name_suffix='.txt')
            # 'generate PRCityCategory database record' >> beam.ParDo(BuildPRCityCategory(), timestamp, 'PRCityCategory', TOP_CITY_N) |
            # 'write PRCityCategory to datastore (unbatched)' >> beam.ParDo(WriteToDatastoreSingle(), actually_save=not run_locally)
        )  # yapf: disable

    if debug_attendees:
        attendee_event_debugging = (
            produce_attendees
            | 'map city-attendee -> event' >> beam.FlatMap(DebugExportEventPeopleForGrouping)
            | 'group by city-attendee' >> beam.GroupByKey()
            | 'within city-attendee, group event_ids by admin_hash' >> beam.FlatMap(DebugGroupEventIds)
        )  # yapf: disable

        exploded_top_attendees = (
            top_attendee_lists
            | 'explode the top attendees into a mapping: category-attendee -> YES' >> beam.FlatMap(DebugExplodeAttendeeList)
            # We don't deal with duplicates, since it requires the objects (ie our dicts) to be hashable
            # Instead, we rely on DebugFilterForTopAttendee to filter out duplicates created by the above
            # | 'remove duplicates from multiple overlapping attendee-lists' >> beam.RemoveDuplicates()
        )  # yapf: disable

        (
            # These both have the same keys:
            # keys are {city, person_id}
            (attendee_event_debugging, exploded_top_attendees)
            | beam.Flatten()
            | 'group the attendee-debug info with the is-it-a-top-attendee info' >> beam.GroupByKey()
            | 'filter for TOP_ATTENDEE' >> beam.FlatMap(DebugFilterForTopAttendee)
            | 'build PRDebugAttendee' >> beam.ParDo(DebugBuildPRDebugAttendee(), timestamp)
            | 'write PRDebugAttendee to datastore (unbatched)' >> beam.ParDo(
                WriteToDatastoreSingle(), actually_save=not run_locally and not run_on_fraction)
        )  # yapf: disable

    if person_locations:
        build_person_cities = (
            produce_attendees
            | 'map attendee -> city' >> beam.FlatMap(GroupAttendenceByPerson)
            | 'group by attendee' >> beam.GroupByKey()
            | 'build top-cities per-person' >> beam.FlatMap(CountPersonTopCities)
            | 'convert dict to json' >> beam.ParDo(ConvertDictToText)
            | 'write json' >> WriteToText(
                'gs://dancedeets-hrd.appspot.com/people-ranking-outputs/people-city/%s/data' % timestamp,
                file_name_suffix='.txt')
            # 'build PRPersonCity' >> beam.ParDo(BuildPRPersonCity(), timestamp) |
            # 'write PRPersonCity to datastore (unbatched)' >> beam.ParDo(WriteToDatastoreSingle(), actually_save=not run_locally)
        )  # yapf: disable

    """
    (output
        | 'convert from entity' >> beam.Map(ConvertFromEntity)
        | 'write to datastore' >> WriteToDatastore(client.project)
    )
    """

    # Actually run the pipeline (all operations above are deferred).
    result = p.run()
    # Wait until completion; the main thread may access post-completion job results.
    result.wait_until_finish()
    return result
def run_pipeline(project, pipeline_options, run_locally, debug_attendees):
    """Creates a pipeline that reads DBEvent entities from Cloud Datastore and writes people rankings."""
    p = beam.Pipeline(options=pipeline_options)

    # Create a query to read entities from datastore.
    client = datastore.Client()
    q = client.query(kind='DBEvent')

    if run_locally:
        q.key_filter(client.key('DBEvent', '999'), '>')
        q.key_filter(client.key('DBEvent', 'A'), '<')

    # Let's build a timestamp to save all our objects with
    timestamp = datetime.datetime.now()

    # Set up our map/reduce pipeline
    produce_attendees = (
        p
        | 'read from datastore' >> ReadFromDatastore(project, query._pb_from_query(q), num_splits=400)
        | 'convert to entity' >> beam.Map(ConvertToEntity)
        # Find the events we want to count, and expand all the admins/attendees
        | 'filter events' >> beam.FlatMap(CountableEvent)
        | 'load fb attending' >> beam.ParDo(GetEventAndAttending())
        | 'export attendees' >> beam.FlatMap(ExportPeople)
    )

    top_attendee_lists = (
        produce_attendees
        | 'map category -> person' >> beam.FlatMap(GroupPeopleByCategory)
        | 'group by category' >> beam.GroupByKey()
        | 'build top-people lists' >> beam.FlatMap(CountPeopleInfos)
    )

    if debug_attendees:
        attendee_event_debugging = (
            produce_attendees
            | 'map city-attendee -> event' >> beam.FlatMap(DebugExportEventPeopleForGrouping)
            | 'group by city-attendee' >> beam.GroupByKey()
            | 'within city-attendee, group event_ids by admin_hash' >> beam.FlatMap(DebugGroupEventIds)
        )

        exploded_top_attendees = (
            top_attendee_lists
            | 'explode the top attendees into a mapping: category-attendee -> YES' >> beam.FlatMap(DebugExplodeAttendeeList)
            # We don't deal with duplicates, since it requires the objects (ie our dicts) to be hashable
            # Instead, we rely on DebugFilterForTopAttendee to filter out duplicates created by the above
            # | 'remove duplicates from multiple overlapping attendee-lists' >> beam.RemoveDuplicates()
        )

        (
            # These both have the same keys:
            # key contains {person_type, city, category, person_id}
            (attendee_event_debugging, exploded_top_attendees)
            | beam.Flatten()
            # keys are {city, person_id}
            | 'group the attendee-debug info with the is-it-a-top-attendee info' >> beam.GroupByKey()
            | 'filter for TOP_ATTENDEE' >> beam.FlatMap(DebugFilterForTopAttendee)
            | 'build PRDebugAttendee' >> beam.ParDo(DebugBuildPRDebugAttendee(), timestamp)
            | 'write PRDebugAttendee to datastore (unbatched)' >> beam.ParDo(WriteToDatastoreSingle(), actually_save=not run_locally)
        )

    (
        top_attendee_lists
        | 'generate PRCityCategory database record' >> beam.ParDo(BuildPRCityCategory(), timestamp, 'PRCityCategory', TOP_ALL_N)
        | 'write PRCityCategory to datastore (unbatched)' >> beam.ParDo(WriteToDatastoreSingle(), actually_save=not run_locally)
    )

    """
    (output
        | 'convert from entity' >> beam.Map(ConvertFromEntity)
        | 'write to datastore' >> WriteToDatastore(client.project)
    )
    """

    # Actually run the pipeline (all operations above are deferred).
    result = p.run()
    # Wait until completion; the main thread may access post-completion job results.
    result.wait_until_finish()
    return result
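# A minimal driver sketch for the pipelines above. The argparse flags and the
# way the project is derived are illustrative assumptions; only the
# beam.Pipeline / PipelineOptions usage mirrors what run_pipeline expects.

import argparse
from apache_beam.options.pipeline_options import GoogleCloudOptions, PipelineOptions

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--run_locally', action='store_true')
    parser.add_argument('--debug_attendees', action='store_true')
    # Flags Beam itself understands (--runner, --project, ...) pass through.
    known_args, pipeline_args = parser.parse_known_args()

    pipeline_options = PipelineOptions(pipeline_args)
    project = pipeline_options.view_as(GoogleCloudOptions).project

    run_pipeline(project, pipeline_options,
                 known_args.run_locally, known_args.debug_attendees)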