def creep(self):
    """Run the creep step for every vertex recorded under this propagation.

    The driving, extracted and propagation identifier stems are cached on the
    instance: each one is parsed from the first entry seen while the
    corresponding attribute is still falsy, and reused afterwards.

    :return: ``False`` as soon as ``_creep`` raises
        ``InsufficientOperationTimeException``; otherwise a dict carrying the
        ``propagation_id`` and ``id_source`` of this run.
    """
    # (instance attribute, key in the propagated entry), in resolution order
    stem_sources = (
        ('_driving_identifier_stem', 'driving_identifier_stem'),
        ('_extracted_identifier_stem', 'extracted_identifier_stem'),
        ('_propagation_identifier_stem', 'identifier_stem'),
    )
    with CredibleFrontEndDriver(self._id_source) as driver:
        propagated = self._leech_driver.get_propagated_vertexes(
            self._propagation_id)
        for entry in propagated:
            for attribute, entry_key in stem_sources:
                if not getattr(self, attribute):
                    setattr(self, attribute,
                            IdentifierStem.from_raw(entry[entry_key]))
            try:
                self._creep(
                    entry,
                    identifier_stem=self._extracted_identifier_stem,
                    driving_identifier_stem=self._driving_identifier_stem,
                    context=self._context,
                    driver=driver)
            except InsufficientOperationTimeException:
                # Out of operating time: signal the caller to stop here.
                return False
    return {
        'propagation_id': self._propagation_id,
        'id_source': self._id_source
    }
def work_remote_id_change_type(**kwargs):
    """Fetch the remote change logs for one category and group them by action.

    Reads ``driving_identifier_stem``, ``identifier_stem``,
    ``changelog_types``, ``category_id``, ``id_value`` and
    ``local_max_value`` from ``kwargs``.

    :return: ``{'change_actions': {action: [remote_change, ...]}}`` where
        ``action`` is each remote change's ``'Action'`` value.
    """
    from toll_booth.alg_obj.forge.extractors.credible_fe import CredibleFrontEndDriver
    from toll_booth.alg_obj.graph.ogm.regulators import IdentifierStem

    driving_stem = IdentifierStem.from_raw(kwargs['driving_identifier_stem'])
    target_stem = IdentifierStem.from_raw(kwargs['identifier_stem'])
    category = kwargs['changelog_types'].categories[kwargs['category_id']]
    changes_by_action = {}
    with CredibleFrontEndDriver(driving_stem.get('id_source')) as driver:
        remote_changes = driver.get_change_logs(
            driving_id_type=driving_stem.get('id_type'),
            driving_id_name=driving_stem.get('id_name'),
            driving_id_value=kwargs['id_value'],
            local_max_value=kwargs['local_max_value'],
            category_id=category.category_id,
            driving_identifier_stem=driving_stem,
            identifier_stem=target_stem,
            category=category)
        # Bucket the changes per action, keeping the order they arrived in.
        for remote_change in remote_changes:
            changes_by_action.setdefault(
                remote_change['Action'], []).append(remote_change)
    return {'change_actions': changes_by_action}
def _populate_common_fields(self, entry):
    """Lazily fill the instance fields shared by every entry.

    Each field is only computed while it is still falsy, so the first entry
    processed decides the values and later calls leave them untouched.

    :param entry: mapping providing ``extracted_identifier_stem``,
        ``driving_identifier_stem`` and ``local_max_value``.
    """
    if not self._extracted_identifier_stem:
        raw_extracted = entry['extracted_identifier_stem']
        self._extracted_identifier_stem = IdentifierStem.from_raw(raw_extracted)

    if not self._driving_identifier_stem:
        raw_driving = entry['driving_identifier_stem']
        self._driving_identifier_stem = IdentifierStem.from_raw(raw_driving)

    if not self._schema_entry:
        # The schema entry is looked up by the extracted stem's object type.
        extracted_type = self._extracted_identifier_stem.object_type
        self._schema_entry = SchemaVertexEntry.retrieve(extracted_type)

    if not self._mapping:
        self._mapping = self._generate_mapping()

    if not self._local_max_value:
        self._local_max_value = entry['local_max_value']
def correct():
    """Print the change-type schema entries patched with their action ids.

    Loads ``starters/change_types.json`` from the parent of this module's
    directory and indexes its entries by ``change_action``. Every
    ``<option>`` parsed out of the module-level ``test_html`` whose text
    matches an indexed entry contributes its numeric ``value`` as the
    ``action_id``; the entry's identifier stem and ``sid_value`` are rebuilt
    to include it. The patched entries are printed as one JSON list.
    """
    module_directory = os.path.dirname(__file__)
    parent_directory = os.path.dirname(module_directory)
    schema_path = os.path.join(parent_directory, 'starters',
                               'change_types.json')
    with open(schema_path) as schema_file:
        entries_by_action = {
            entry['change_action']: entry for entry in json.load(schema_file)
        }

    patched_entries = []
    parsed_html = bs4.BeautifulSoup(test_html, features='html.parser')
    for option in parsed_html.find_all('option'):
        raw_action_id = option.attrs.get('value')
        action_name = option.text
        if not (raw_action_id and action_name):
            continue
        action_id = int(raw_action_id)
        schema_entry = entries_by_action.get(action_name)
        if not schema_entry:
            continue
        schema_entry['action_id'] = action_id
        original_stem = IdentifierStem.from_raw(schema_entry['identifier_stem'])
        stem_pairs = original_stem.paired_identifiers
        stem_pairs['action_id'] = action_id
        patched_stem = IdentifierStem('vertex', 'ChangeLogType', stem_pairs)
        stem_for_dynamo = patched_stem.for_dynamo
        schema_entry['identifier_stem'] = str(patched_stem)
        schema_entry['sid_value'] = stem_for_dynamo
        patched_entries.append(schema_entry)
    print(json.dumps(patched_entries))
def _extract_change_logs(cls, driver, id_value, local_max_values, **kwargs):
    """Extract and format the change logs of one driving id, per category.

    :param driver: front-end driver providing ``get_change_logs`` and
        ``get_emp_ids``.
    :param id_value: value of the driving id whose change logs are pulled.
    :param local_max_values: mapping of ``category_id`` to the local max
        value passed to the driver for that category.
    :param kwargs: must contain ``identifier_stem`` (which itself carries the
        raw driving stem under ``'identifier_stem'``), ``id_source`` and
        ``mapping``.
    :return: list with the formatted change logs of every category, in the
        iteration order of ``local_max_values``.
    :raises KeyError: if a required keyword, the ``'default'`` mapping (when
        it is needed) or the driving id type's mapping is missing.
    """
    change_logs = []
    identifier_stem = kwargs['identifier_stem']
    driving_stem = IdentifierStem.from_raw(
        identifier_stem.get('identifier_stem'))
    driving_id_type = driving_stem.get('id_type')
    driving_id_name = driving_stem.get('id_name')
    id_source = kwargs['id_source']
    mapping = kwargs['mapping']
    # Source-specific mapping when there is one, the default otherwise.
    id_source_mapping = mapping.get(id_source, mapping['default'])
    object_mapping = id_source_mapping[driving_id_type]
    for category_id, local_max_value in local_max_values.items():
        extraction_args = {
            'driving_id_type': driving_id_type,
            'driving_id_name': driving_id_name,
            'driving_id_value': id_value,
            'category_id': category_id,
            'local_max_value': local_max_value
        }
        source_extraction = driver.get_change_logs(**extraction_args)
        # NOTE(review): the emp ids are fetched but not yet merged into the
        # extraction; the call is kept so the driver sees the same requests
        # as before.
        driver.get_emp_ids(**extraction_args)
        formatted_extraction = cls._format_change_log_data(
            identifier_stem,
            source_extraction,
            object_mapping=object_mapping,
            driver=driver)
        change_logs.extend(formatted_extraction)
    # Hand the accumulated logs back; callers return this function's result.
    return change_logs
def graph_links(**kwargs):
    """Graph the IdSource vertex together with the supplied link changes.

    Reads ``identifier_stem`` (required) plus the optional
    ``new_link_histories``, ``new_links`` and ``new_unlinks`` collections
    from ``kwargs``, then hands the resulting vertexes and edges to an
    ``Ogm`` built from the same ``kwargs``. Returns nothing.
    """
    from toll_booth.alg_obj.graph.ogm.ogm import Ogm
    from toll_booth.alg_obj.graph.ogm.regulators import IdentifierStem
    from toll_booth.alg_obj.graph import InternalId
    from toll_booth.alg_obj.graph.ogm.regulators import PotentialVertex

    driving_stem = IdentifierStem.from_raw(kwargs['identifier_stem'])
    id_source = driving_stem.get('id_source')
    source_properties = {'id_source': id_source}
    internal_id = InternalId(''.join(['IdSource', id_source])).id_value
    source_stem = IdentifierStem('vertex', 'IdSource', {'id_source': id_source})
    source_vertex = PotentialVertex('IdSource', internal_id, source_properties,
                                    source_stem, id_source, 'id_source')

    vertexes = [source_vertex]
    edges = []
    histories = kwargs.get('new_link_histories', [])
    links = kwargs.get('new_links', [])
    unlinks = kwargs.get('new_unlinks', [])
    # Each history adds its own vertex plus an edge for its latest link.
    for history in histories:
        vertexes.append(history.potential_vertex)
        edges.append(history.generate_edge(history.most_recent_link))
    # Links and unlinks are pairs: element 0 generates the edge for element 1.
    for pair in links:
        edges.append(pair[0].generate_edge(pair[1]))
    for pair in unlinks:
        edges.append(pair[0].generate_edge(pair[1]))

    Ogm(**kwargs).graph_objects(vertexes, edges)
def get_enrichment_for_change_action(**kwargs):
    """Collect the enrichment data needed for one change action.

    Reads ``driving_identifier_stem``, ``changelog_types``, ``action_id``,
    ``id_value`` and ``local_max_value`` from ``kwargs``.

    :return: ``{'enriched_data': ...}``. When the action is static, has no
        details and no entity type, the payload is an empty structure and no
        mule team is created; otherwise it is whatever
        ``CredibleMuleTeam.enrich_data`` returns.
    """
    from toll_booth.alg_obj.forge.extractors.credible_fe.mule_team import CredibleMuleTeam
    from toll_booth.alg_obj.graph.ogm.regulators import IdentifierStem

    driving_stem = IdentifierStem.from_raw(kwargs['driving_identifier_stem'])
    id_source = driving_stem.get('id_source')
    changelog_types = kwargs['changelog_types']
    action_id = kwargs['action_id']
    action_key = str(action_id)
    change_action = changelog_types[action_key]
    category_id = changelog_types.get_category_id_from_action_id(action_key)

    nothing_to_enrich = (
        change_action.is_static
        and change_action.has_details is False
        and not change_action.entity_type
    )
    if nothing_to_enrich:
        return {
            'enriched_data': {
                'change_detail': {},
                'by_emp_ids': {},
                'entity_ids': {}
            }
        }

    mule_team = CredibleMuleTeam(id_source)
    enriched_data = mule_team.enrich_data(
        driving_id_type=driving_stem.get('id_type'),
        driving_id_name=driving_stem.get('id_name'),
        driving_id_value=kwargs['id_value'],
        local_max_value=kwargs['local_max_value'],
        category_id=category_id,
        action_id=int(action_id),
        get_details=change_action.has_details is True,
        get_by_emp_ids=change_action.is_static is False,
        get_entity_ids=change_action.entity_type,
        checked_emp_ids=None)
    return {'enriched_data': enriched_data}
def __init__(self, identifier_stem, driving_identifier_stem, **kwargs):
    """Set up the instance for an extracted stem and its driving stem.

    :param identifier_stem: stem of the extracted objects, in any form
        ``IdentifierStem.from_raw`` accepts.
    :param driving_identifier_stem: stem of the driving objects, same forms.
    :param kwargs: optional ``sample_size`` (defaults to 1000).
    """
    parsed_stem = IdentifierStem.from_raw(identifier_stem)
    parsed_driving_stem = IdentifierStem.from_raw(driving_identifier_stem)
    driving_object_type_source = parsed_driving_stem

    self._spore_id = uuid.uuid4().hex
    self._identifier_stem = parsed_stem
    self._driving_identifier_stem = parsed_driving_stem
    self._leech_driver = LeechDriver(table_name='VdGraphObjects')
    self._extractor_setup = self._leech_driver.get_extractor_setup(
        parsed_driving_stem)
    self._schema_entry = SchemaVertexEntry.retrieve(
        driving_object_type_source.object_type)
    self._sample_size = kwargs.get('sample_size', 1000)
    # Built only after the attributes above have been assigned.
    self._extraction_profile = self._generate_extraction_profile()
    self._driving_vertex_regulator = VertexRegulator.get_for_object_type(
        driving_object_type_source.object_type)
def _creep(self, entry, **kwargs):
    """Pull the remote change logs for one propagated entry and mark them.

    :param entry: propagated record providing ``identifier_stem`` and
        ``driving_id_value``.
    :param kwargs: must contain ``driving_identifier_stem`` and ``driver``;
        the extracted stem is taken from ``entry``, not from ``kwargs``.
    """
    driving_identifier_stem = kwargs['driving_identifier_stem']
    driver = kwargs['driver']
    entry_stem = IdentifierStem.from_raw(entry['identifier_stem'])
    id_value = entry['driving_id_value']
    # The entry's stem names the change category by its name.
    change_category = self._change_types.get_category_by_name(
        entry_stem.get('category'))
    logging.info(
        f'started the extraction for id_value: {id_value}, change_category_id: {change_category.category_id}'
    )
    extraction_args = dict(
        driving_id_type=driving_identifier_stem.get('id_type'),
        driving_id_name=driving_identifier_stem.get('id_name'),
        driving_id_value=id_value,
        local_max_value=self._get_local_max_value(id_value, change_category),
        category_id=change_category.category_id,
        driving_identifier_stem=driving_identifier_stem,
        identifier_stem=entry_stem,
        category=change_category,
    )
    remote_changes = driver.get_change_logs(**extraction_args)
    logging.info(
        f'completed the extraction for id_value: {id_value}, change_category_id: {change_category.category_id}'
    )
    self._mark_creep_vertexes(remote_changes, **extraction_args)
def pull_schema_entry(**kwargs):
    """Retrieve the schema and the entry matching the identifier stem's type.

    :param kwargs: must contain ``identifier_stem``; all of ``kwargs`` is
        forwarded to ``Schema.retrieve``.
    :return: dict with ``schema_entry`` (the schema item for the stem's
        object type) and the ``schema`` itself.
    """
    from toll_booth.alg_obj.graph.schemata.schema import Schema
    from toll_booth.alg_obj.graph.ogm.regulators import IdentifierStem

    stem = IdentifierStem.from_raw(kwargs['identifier_stem'])
    schema = Schema.retrieve(**kwargs)
    return {'schema_entry': schema[stem.object_type], 'schema': schema}
def __init__(self, monitor_order, **kwargs):
    """Configure the instance from a monitor order.

    :param monitor_order: object exposing ``identifier_stem`` and
        ``id_source``.
    :param kwargs: forwarded to ``LeechDriver``; an optional ``sample_size``
        (defaults to 1000) is also read from it.
    """
    self._identifier_stem = IdentifierStem.from_raw(
        monitor_order.identifier_stem)
    self._id_source = monitor_order.id_source
    leech_driver = LeechDriver(**kwargs)
    self._leech_driver = leech_driver
    self._local_setup = leech_driver.get_field_value_setup(
        self._identifier_stem)
    self._sample_size = kwargs.get('sample_size', 1000)
def get_local_ids(**kwargs):
    """Look up the locally indexed id values for a driving identifier stem.

    :param kwargs: must contain ``driving_identifier_stem`` and ``schema``;
        all of ``kwargs`` is also forwarded to
        ``IndexManager.from_graph_schema``.
    :return: ``{'local_id_values': ...}`` as reported by the index manager.
    """
    from toll_booth.alg_obj.graph.index_manager.index_manager import IndexManager
    from toll_booth.alg_obj.graph.ogm.regulators import IdentifierStem

    raw_driving_stem = kwargs['driving_identifier_stem']
    driving_stem = IdentifierStem.from_raw(raw_driving_stem)
    index_manager = IndexManager.from_graph_schema(kwargs['schema'], **kwargs)
    return {
        'local_id_values': index_manager.get_local_id_values(driving_stem)
    }
def __init__(self, identifier_stem, id_value, **kwargs):
    """Store the identity and properties of an object.

    :param identifier_stem: stem of the object; when truthy it is parsed
        with ``IdentifierStem.from_raw`` and supplies the object type.
    :param id_value: id value of the object.
    :param kwargs: optional ``object_type`` (used only when no stem is
        given) and ``object_properties`` (defaults to an empty dict).
    """
    if identifier_stem:
        identifier_stem = IdentifierStem.from_raw(identifier_stem)
        resolved_object_type = identifier_stem.object_type
    else:
        # Without a stem the caller may still name the type explicitly.
        resolved_object_type = kwargs.get('object_type', None)
    self._identifier_stem = identifier_stem
    self._id_value = id_value
    self._dynamo_parameters = DynamoParameters(identifier_stem, id_value)
    self._object_properties = kwargs.get('object_properties', {})
    self._object_type = resolved_object_type
def test_monitor_extraction(self, specified_identifier_stem):
    """Run the CredibleFrontEndExtractor for a stem with no id value.

    The extraction profile starts from the schema's extraction properties,
    is extended with the driving stem's extractor values, and finally with
    the list of identifier stems and the id source.
    """
    entry = SchemaVertexEntry.retrieve(specified_identifier_stem.object_type)
    raw_driving_stem = specified_identifier_stem.retrieve('identifier_stem')
    driving_stem = IdentifierStem.from_raw(raw_driving_stem)
    extractor_config = entry.extract['CredibleFrontEndExtractor']
    profile = extractor_config.extraction_properties
    profile.update(driving_stem.for_extractor)
    stems_to_extract = [
        {'identifier_stem': specified_identifier_stem, 'id_value': None}
    ]
    profile.update({
        'identifier_stems': stems_to_extract,
        'id_source': specified_identifier_stem.retrieve('id_source')
    })
    results = CredibleFrontEndExtractor.extract(**profile)
    print()
def __init__(self, *, identifier_stem, **kwargs):
    """Set up the instance for the objects described by ``identifier_stem``.

    :param identifier_stem: stem of the objects, in any form accepted by
        ``IdentifierStem.from_raw``.
    :param kwargs: optional ``extraction_queue`` (a ready queue to use) and
        ``sample_size`` (defaults to 1000). When no queue is supplied, one
        is built with ``ForgeQueue.get_for_extraction_queue(**kwargs)``.
    """
    identifier_stem = IdentifierStem.from_raw(identifier_stem)
    self._identifier_stem = identifier_stem
    self._object_type = identifier_stem.object_type
    self._schema_entry = SchemaVertexEntry.retrieve(self._object_type)
    self._leech_driver = LeechDriver()
    self._extractor_setup = self._leech_driver.get_extractor_setup(
        identifier_stem)
    self._extraction_profile = self._generate_extraction_profile()
    # Build the default queue only when the caller did not pass one; a
    # default written inside ``kwargs.get`` would be constructed every time.
    if 'extraction_queue' in kwargs:
        self._extraction_queue = kwargs['extraction_queue']
    else:
        self._extraction_queue = ForgeQueue.get_for_extraction_queue(**kwargs)
    self._sample_size = kwargs.get('sample_size', 1000)
def get_extractor_function_names(self, identifier_stem):
    """Return the extractor function names stored for an identifier stem.

    :param identifier_stem: stem to look up, in any form accepted by
        ``IdentifierStem.from_raw``.
    :return: the ``extractor_function_names`` value of the stored item.
    :raises MissingExtractionInformation: if the table returns no item or
        the item lacks ``extractor_function_names``.
    """
    identifier_stem = IdentifierStem.from_raw(identifier_stem)
    key_parameters = DynamoParameters(identifier_stem.for_dynamo,
                                      identifier_stem)
    response = self._table.get_item(Key=key_parameters.as_key)
    try:
        function_names = response['Item']['extractor_function_names']
    except KeyError:
        raise MissingExtractionInformation(
            'could not find extractor information for identifier stem %s' %
            identifier_stem)
    return function_names
def generate_remote_id_change_data(**kwargs):
    """Assemble the graphable data for a single remote change-log record.

    Reads ``driving_identifier_stem``, ``remote_change``,
    ``changelog_types``, ``action_id``, ``enriched_data``, ``mapping`` and
    ``id_value`` from ``kwargs``.

    :return: ``{'change_data': {...}}`` holding the ``source`` record, the
        ``by_emp_id_target`` employee reference, and the ``change_target`` /
        ``changed_target`` lists.
    """
    from toll_booth.alg_obj.graph.ogm.regulators import IdentifierStem

    driving_identifier_stem = IdentifierStem.from_raw(
        kwargs['driving_identifier_stem'])
    remote_change = kwargs['remote_change']
    changelog_types = kwargs['changelog_types']
    action_id = kwargs['action_id']
    change_action = changelog_types[str(action_id)]
    enriched_data = kwargs['enriched_data']
    change_date_utc = remote_change['UTCDate']
    extracted_data = _build_change_log_extracted_data(remote_change,
                                                      kwargs['mapping'])
    id_source = driving_identifier_stem.get('id_source')
    # The enrichment payload may lack 'emp_ids' (the empty payload returned
    # for actions that need no enrichment only has 'by_emp_ids'), so do not
    # index it directly.
    # NOTE(review): 'by_emp_ids' is assumed to be keyed by UTC change date
    # like 'emp_ids' - confirm against the enrichment producer.
    emp_ids_by_date = enriched_data.get('emp_ids')
    if emp_ids_by_date is None:
        emp_ids_by_date = enriched_data.get('by_emp_ids', {})
    # Fall back to the driving id value when no employee id is known.
    by_emp_id = emp_ids_by_date.get(change_date_utc, kwargs['id_value'])
    fungal_stem = FungalStem.from_identifier_stem(driving_identifier_stem,
                                                  kwargs['id_value'],
                                                  change_action.category)
    source_data = {
        'change_date_utc': extracted_data['change_date_utc'],
        'change_description': extracted_data['change_description'],
        'change_date': extracted_data['change_date'],
        'fungal_stem': str(fungal_stem),
        'action': extracted_data['action'],
        'action_id': str(action_id),
        'id_source': id_source,
        'id_type': 'ChangeLog',
        'id_name': 'change_date_utc',
        'by_emp_id': by_emp_id
    }
    returned_data = {
        'source': source_data,
        'by_emp_id_target': [{
            'id_source': id_source,
            'id_type': 'Employees',
            'id_value': by_emp_id
        }],
        'change_target': [],
        'changed_target': []
    }
    changed_targets = _build_changed_targets(id_source, extracted_data,
                                             change_action)
    if changed_targets:
        returned_data['changed_target'].extend(changed_targets)
    change_details = enriched_data.get('change_detail', {})
    change_detail_target = change_details.get(change_date_utc, None)
    if change_detail_target is not None:
        returned_data['change_target'].extend(change_detail_target)
    return {'change_data': returned_data}
def fungus(execution_id, **kwargs):
    """Run the ``command_fungi`` subtask and complete the work once it ends.

    Registers the fixed ChangeLog and ExternalId identifier stems as the
    subtask's arguments, invokes the subtask signature, and appends a
    ``CompleteWork`` decision only when the signature returns a truthy
    result.

    :param execution_id: used to build the subtask identifier.
    :param kwargs: must contain ``decisions`` and ``task_args``; all of
        ``kwargs`` is forwarded to the signature.
    """
    from toll_booth.alg_obj.graph.ogm.regulators import IdentifierStem

    subtask_name = 'command_fungi'
    decisions = kwargs['decisions']
    subtask_identifier = f'f-{execution_id}'
    task_args = kwargs['task_args']
    stems = {
        'identifier_stem': IdentifierStem.from_raw(
            "#vertex#ChangeLog#{\"id_source\": \"MBI\"}#"),
        'driving_identifier_stem': IdentifierStem.from_raw(
            "#vertex#ExternalId#{\"id_source\": \"MBI\", \"id_type\": \"Employees\", \"id_name\": \"emp_id\"}#"
        )
    }
    task_args.add_argument_value(subtask_name, stems)
    signature = SubtaskSignature(subtask_identifier, subtask_name, **kwargs)
    if signature(**kwargs):
        decisions.append(CompleteWork())
def _build_remote_id_extractor(**kwargs):
    """Build the extractor setup for the driving identifier stem.

    The setup starts with the stem's ``id_type`` and the extractor type,
    then is overlaid with the stem's extractor values and finally with the
    schema's extraction properties for that extractor type, so later
    sources win on key clashes.

    :param kwargs: must contain ``driving_identifier_stem`` and ``schema``.
    :return: the assembled extractor setup dict.
    """
    from toll_booth.alg_obj.graph.ogm.regulators import IdentifierStem

    raw_driving_stem = kwargs['driving_identifier_stem']
    schema = kwargs['schema']
    driving_stem = IdentifierStem.from_raw(raw_driving_stem)
    schema_entry = schema[driving_stem.object_type]
    setup = {
        'id_type': driving_stem.get('id_type'),
        'type': 'CredibleFrontEndExtractor'
    }
    setup.update(driving_stem.for_extractor)
    # 'type' is re-read here because the stem's values may have replaced it.
    extractor_config = schema_entry.extract[setup['type']]
    setup.update(extractor_config.extraction_properties)
    return setup
def _run_single_extract(cls, identifier, **kwargs):
    """Extract one identifier through the front-end driver.

    :param identifier: mapping with ``id_value`` and ``identifier_stem``
        (plus ``local_max_values`` for ChangeLog stems).
    :param kwargs: must contain ``id_source``; the parsed stem is stored
        back into it under ``identifier_stem`` before being forwarded.
    :return: the result of ``_extract_change_logs`` for ChangeLog stems;
        ``None`` for every other object type (the ExternalId lookup result
        is currently not returned).
    """
    id_source = kwargs['id_source']
    id_value = identifier['id_value']
    parsed_stem = IdentifierStem.from_raw(identifier['identifier_stem'])
    kwargs['identifier_stem'] = parsed_stem
    object_type = parsed_stem.object_type
    with CredibleFrontEndDriver(id_source) as driver:
        if object_type == 'ExternalId':
            source_extraction = driver.get_ext_id(parsed_stem)
        elif object_type == 'ChangeLog':
            return cls._extract_change_logs(driver, id_value,
                                            identifier['local_max_values'],
                                            **kwargs)
def test_get_full_change_logs(self, monitored_object_identifier_stem):
    """Run the CredibleFrontEndExtractor for one stem/id pair and print it.

    ``monitored_object_identifier_stem`` is indexed as
    ``(raw identifier stem, id value)``.
    """
    stem = IdentifierStem.from_raw(monitored_object_identifier_stem[0])
    id_value = monitored_object_identifier_stem[1]
    schema_entry = SchemaVertexEntry.retrieve(stem.object_type)
    extract_kwargs = {
        'identifier_stems': [{'identifier_stem': stem, 'id_value': id_value}],
        'id_source': stem.retrieve('id_source')
    }
    extractor_config = schema_entry.extract['CredibleFrontEndExtractor']
    # Schema properties first, then the stem's own extractor values on top.
    extract_kwargs.update(extractor_config.extraction_properties)
    extract_kwargs.update(stem.for_extractor)
    print(CredibleFrontEndExtractor.extract(**extract_kwargs))
def build_mapping(**kwargs):
    """Select the field mapping for the driving stem's id source and id type.

    The mapping comes from the ``CredibleFrontEndExtractor`` extraction
    properties of the schema entry for the stem's object type. A mapping
    specific to the id source is preferred; otherwise ``'default'`` is used.

    :param kwargs: must contain ``driving_identifier_stem`` and ``schema``.
    :return: ``{'mapping': ...}`` for the stem's ``id_type``.
    """
    from toll_booth.alg_obj.graph.ogm.regulators import IdentifierStem

    driving_stem = IdentifierStem.from_raw(kwargs['driving_identifier_stem'])
    id_source = driving_stem.get('id_source')
    schema_entry = kwargs['schema'][driving_stem.object_type]
    extractor_config = schema_entry.extract['CredibleFrontEndExtractor']
    all_mappings = extractor_config.extraction_properties['mapping']
    source_mappings = all_mappings.get(id_source, all_mappings['default'])
    return {'mapping': source_mappings[driving_stem.get('id_type')]}
def _set_changed_ids(change_type, **kwargs):
    """Record each changed id value in the leech table.

    :param change_type: ``'new'`` stores an assimilated vertex; ``'link'``
        and ``'unlink'`` store a link object (with the unlink flag ``False``
        and ``True`` respectively).
    :param kwargs: must contain ``id_values`` and
        ``driving_identifier_stem``; optional ``table_name`` defaults to
        ``'VdGraphObjects'``.
    :raises NotImplementedError: for any other ``change_type`` (raised while
        processing the first id value).

    A ``ClientError`` with code ``ConditionalCheckFailedException`` is
    ignored; any other ``ClientError`` propagates.
    """
    from toll_booth.alg_obj.graph.ogm.regulators import IdentifierStem
    from toll_booth.alg_obj.graph.ogm.regulators import VertexRegulator
    from toll_booth.alg_obj.aws.sapper.leech_driver import LeechDriver
    from botocore.exceptions import ClientError

    id_values = kwargs['id_values']
    driving_stem = IdentifierStem.from_raw(kwargs['driving_identifier_stem'])
    regulator = VertexRegulator.get_for_object_type(driving_stem.object_type)
    leech_driver = LeechDriver(
        table_name=kwargs.get('table_name', 'VdGraphObjects'))
    for id_value in id_values:
        object_data = driving_stem.for_extractor
        object_data['id_value'] = id_value
        potential_vertex = regulator.create_potential_vertex(object_data)
        try:
            if change_type == 'new':
                leech_driver.set_assimilated_vertex(
                    potential_vertex,
                    False,
                    identifier_stem=driving_stem,
                    id_value=id_value)
            elif change_type in ('link', 'unlink'):
                # The third argument is True only when unlinking.
                leech_driver.set_link_object(
                    potential_vertex.internal_id,
                    driving_stem.get('id_source'),
                    change_type == 'unlink',
                    identifier_stem=driving_stem,
                    id_value=id_value)
            else:
                raise NotImplementedError(
                    'could not find operation to perform for changed_ids type: %s'
                    % change_type)
        except ClientError as e:
            error_code = e.response['Error']['Code']
            if error_code != 'ConditionalCheckFailedException':
                raise
def post_process_get_encounters(**kwargs):
    """Retrieve a client encounter through the front-end driver.

    :param kwargs: must contain ``id_value`` (the encounter id), ``id_type``
        and ``driving_identifier_stem``.
    :return: ``{'encounter_results': ...}`` with the driver's result.
    :raises NotImplementedError: when ``id_type`` is not ``'ClientVisit'``.
    """
    from toll_booth.alg_obj.forge.extractors.credible_fe import CredibleFrontEndDriver
    from toll_booth.alg_obj.graph.ogm.regulators import IdentifierStem

    encounter_id = kwargs['id_value']
    id_type = kwargs['id_type']
    if not id_type == 'ClientVisit':
        raise NotImplementedError(
            f'cannot post process an encounter off id_type: {id_type}')
    driving_stem = IdentifierStem.from_raw(kwargs['driving_identifier_stem'])
    with CredibleFrontEndDriver(driving_stem.get('id_source')) as driver:
        encounter = driver.retrieve_client_encounter(encounter_id)
    return {'encounter_results': encounter}
def mark_fruited_vertex(self, propagation_id, creep_identifier_stem, extracted_data, leech_record):
    """Store the extracted data for a vertex and drop its propagation record.

    :param propagation_id: written as the ``sid_value`` of the key deleted.
    :param creep_identifier_stem: stem of the propagation record, in any
        form accepted by ``IdentifierStem.from_raw``.
    :param extracted_data: passed to ``leech_record.for_vertex_driven_seed``.
    :param leech_record: builds the ``update_item`` arguments.
    :return: ``True`` when the update was rejected with
        ``ConditionalCheckFailedException``, ``False`` when it went through.
    :raises ClientError: for any other client error; the propagation record
        is not deleted in that case.
    """
    creep_stem = IdentifierStem.from_raw(creep_identifier_stem)
    update_arguments = leech_record.for_vertex_driven_seed(extracted_data)
    try:
        self._table.update_item(**update_arguments)
    except ClientError as e:
        if e.response['Error']['Code'] != 'ConditionalCheckFailedException':
            raise
        working = True
    else:
        working = False
    propagation_key = {
        'sid_value': str(propagation_id),
        'identifier_stem': str(creep_stem)
    }
    self._table.delete_item(Key=propagation_key)
    return working
def extraction_order(request):
    """Build an ``ExtractObjectOrder`` from the request's parameters.

    ``request.param`` is indexed as: 0 - schema entry name, 1 - raw
    identifier stem, 2 - id value, 3 - key into the schema entry's
    ``extract`` mapping, 4 - extractor function name.
    """
    from toll_booth.alg_obj.graph.schemata.schema_entry import SchemaEntry
    from toll_booth.alg_obj.forge.comms.orders import ExtractObjectOrder
    from toll_booth.alg_obj.graph.ogm.regulators import IdentifierStem

    params = request.param
    stem = IdentifierStem.from_raw(params[1])
    schema_entry = SchemaEntry.retrieve(params[0])
    # Stem values first, overlaid with the schema's extraction properties.
    properties = stem.for_extractor
    properties.update(schema_entry.extract[params[3]].extraction_properties)
    return ExtractObjectOrder(stem, params[2], params[4], properties,
                              schema_entry)
def get_field_value_setup(self, identifier_stem):
    """Return the field values and extractor names stored for a stem.

    :param identifier_stem: stem to look up, in any form accepted by
        ``IdentifierStem.from_raw``.
    :return: dict with ``field_values`` and ``extractor_names``.
    :raises MissingFieldValuesException: if the table returns no item or the
        item lacks ``field_values``.
    :raises MissingExtractionInformation: if the item lacks
        ``extractor_function_names``.
    """
    identifier_stem = IdentifierStem.from_raw(identifier_stem)
    key_parameters = DynamoParameters(identifier_stem.for_dynamo,
                                      identifier_stem)
    response = self._table.get_item(Key=key_parameters.as_key)

    # Field values are checked first, so their absence is reported first.
    try:
        stored_field_values = response['Item']['field_values']
    except KeyError:
        raise MissingFieldValuesException(
            'could not find field values for identifier stem %s' % identifier_stem)

    try:
        stored_function_names = response['Item']['extractor_function_names']
    except KeyError:
        raise MissingExtractionInformation(
            'could not find extractor names for identifier stem %s' % identifier_stem
        )

    return {
        'field_values': stored_field_values,
        'extractor_names': stored_function_names
    }
def _extract(self, stalled_object):
    """Queue an extraction order for a stalled object.

    :param stalled_object: mapping with ``identifier_stem``, ``object_type``
        and ``id_value``.
    """
    stem = IdentifierStem.from_raw(stalled_object['identifier_stem'])
    extractor_names = self._driver.get_extractor_function_names(stem)
    schema_entry = SchemaVertexEntry.retrieve(stalled_object['object_type'])
    extractor_config = schema_entry.extract[extractor_names['type']]
    # Stem values first, overlaid with the schema's extraction properties.
    properties = stem.for_extractor
    properties.update(extractor_config.extraction_properties)
    order = ExtractObjectOrder(stem, stalled_object['id_value'],
                               extractor_names['extraction'], properties,
                               schema_entry)
    self._extraction_queue.add_order(order)
def get_local_max_change_type_value(**kwargs):
    """Find the local max value indexed for one id value and change category.

    :param kwargs: must contain ``driving_identifier_stem``, ``id_value``,
        ``category_id``, ``changelog_types`` and ``schema``; all of
        ``kwargs`` is also forwarded to ``IndexManager.from_graph_schema``.
    :return: ``{'local_max_value': ...}``; the value is ``None`` when the
        index query raises ``EmptyIndexException``.
    """
    from toll_booth.alg_obj.graph.ogm.regulators import IdentifierStem
    from toll_booth.alg_obj.graph.index_manager.index_manager import IndexManager

    raw_driving_stem = kwargs['driving_identifier_stem']
    id_value = kwargs['id_value']
    category_id = kwargs['category_id']
    changelog_types = kwargs['changelog_types']
    driving_stem = IdentifierStem.from_raw(raw_driving_stem)
    id_source = driving_stem.get('id_source')
    id_type = driving_stem.get('id_type')
    change_category = changelog_types.categories[category_id]
    change_stem = f'#{id_source}#{id_type}#{id_value}#{change_category}'
    index_manager = IndexManager.from_graph_schema(kwargs['schema'], **kwargs)
    try:
        return {'local_max_value': index_manager.query_object_max(change_stem)}
    except EmptyIndexException:
        # Nothing indexed yet for this stem.
        return {'local_max_value': None}
def mark_propagated_vertexes(self, propagation_id, identifier_stem, driving_identifier_stem, driving_id_values, **kwargs):
    """Write one propagation record per driving id value and change category.

    :param propagation_id: stored as both ``sid_value`` and
        ``propagation_id`` of every record.
    :param identifier_stem: stem of the extracted objects, in any form
        accepted by ``IdentifierStem.from_raw``; its object type names the
        propagation stem.
    :param driving_identifier_stem: stem of the driving objects, same forms;
        its paired identifiers seed every propagation stem.
    :param driving_id_values: iterable of driving id values to mark.
    :param kwargs: must contain ``change_types``, whose ``categories``
        mapping supplies the change categories.
    :raises KeyError: if ``change_types`` is missing from ``kwargs``.
    """
    # Normalise both stems so that raw and already-parsed values are
    # accepted alike; ``object_type`` is read from the extracted stem below.
    identifier_stem = IdentifierStem.from_raw(identifier_stem)
    driving_identifier_stem = IdentifierStem.from_raw(driving_identifier_stem)
    driving_pairs = driving_identifier_stem.paired_identifiers
    change_types = kwargs['change_types']
    # These strings are the same for every record, so build them once.
    propagation_key = str(propagation_id)
    driving_stem_text = str(driving_identifier_stem)
    extracted_stem_text = str(identifier_stem)
    with self._table.batch_writer() as writer:
        for id_value in driving_id_values:
            for change_category in change_types.categories.values():
                # Copy so each record gets its own id value and category.
                change_pairs = driving_pairs.copy()
                change_pairs['id_value'] = id_value
                change_pairs['category'] = str(change_category)
                change_identifier_stem = IdentifierStem(
                    'propagation', identifier_stem.object_type, change_pairs)
                change = {
                    'identifier_stem': str(change_identifier_stem),
                    'sid_value': propagation_key,
                    'propagation_id': propagation_key,
                    'driving_identifier_stem': driving_stem_text,
                    'extracted_identifier_stem': extracted_stem_text,
                    'driving_id_value': id_value
                }
                writer.put_item(Item=change)