def work_remote_id_change_type(**kwargs):
    """Fetch remote change logs for one change category and group them by action.

    Reads ``driving_identifier_stem``, ``identifier_stem``,
    ``changelog_types``, ``category_id``, ``id_value`` and
    ``local_max_value`` from ``kwargs``.

    Returns:
        ``{'change_actions': grouped}`` where ``grouped`` maps each
        ``'Action'`` value found in the remote changes to the list of
        changes carrying it, in the order the driver yielded them.
    """
    from toll_booth.alg_obj.forge.extractors.credible_fe import CredibleFrontEndDriver
    from toll_booth.alg_obj.graph.ogm.regulators import IdentifierStem

    driving_stem = IdentifierStem.from_raw(kwargs['driving_identifier_stem'])
    extracted_stem = IdentifierStem.from_raw(kwargs['identifier_stem'])
    category = kwargs['changelog_types'].categories[kwargs['category_id']]

    grouped = {}
    with CredibleFrontEndDriver(driving_stem.get('id_source')) as driver:
        remote_changes = driver.get_change_logs(
            driving_id_type=driving_stem.get('id_type'),
            driving_id_name=driving_stem.get('id_name'),
            driving_id_value=kwargs['id_value'],
            local_max_value=kwargs['local_max_value'],
            category_id=category.category_id,
            driving_identifier_stem=driving_stem,
            identifier_stem=extracted_stem,
            category=category,
        )
        # Bucket every change under its action name.
        for change in remote_changes:
            grouped.setdefault(change['Action'], []).append(change)
    return {'change_actions': grouped}
def creep(self):
    """Run ``_creep`` for every vertex propagated under ``self._propagation_id``.

    For each entry, any of the driving / extracted / propagation identifier
    stems that is still falsy on the instance is populated from that entry
    before ``_creep`` is invoked.

    Returns:
        ``False`` as soon as ``_creep`` raises
        ``InsufficientOperationTimeException``; otherwise a dict holding
        ``propagation_id`` and ``id_source``.
    """
    # (instance attribute, key in the propagated entry) pairs, checked in order.
    lazy_stems = (
        ('_driving_identifier_stem', 'driving_identifier_stem'),
        ('_extracted_identifier_stem', 'extracted_identifier_stem'),
        ('_propagation_identifier_stem', 'identifier_stem'),
    )
    with CredibleFrontEndDriver(self._id_source) as driver:
        for entry in self._leech_driver.get_propagated_vertexes(self._propagation_id):
            for attr_name, entry_key in lazy_stems:
                if not getattr(self, attr_name):
                    setattr(self, attr_name, IdentifierStem.from_raw(entry[entry_key]))
            try:
                self._creep(
                    entry,
                    identifier_stem=self._extracted_identifier_stem,
                    driving_identifier_stem=self._driving_identifier_stem,
                    context=self._context,
                    driver=driver,
                )
            except InsufficientOperationTimeException:
                return False
        # NOTE(review): the source layout was collapsed; this return is
        # assumed to follow the loop rather than sit inside it - confirm.
        return {
            'propagation_id': self._propagation_id,
            'id_source': self._id_source,
        }
def graph_links(**kwargs):
    """Graph the IdSource vertex together with link / unlink edges.

    Reads ``identifier_stem`` (required) plus the optional
    ``new_link_histories``, ``new_links`` and ``new_unlinks`` sequences from
    ``kwargs``; the full ``kwargs`` mapping is also forwarded to ``Ogm``.
    Nothing is returned.
    """
    from toll_booth.alg_obj.graph.ogm.ogm import Ogm
    from toll_booth.alg_obj.graph.ogm.regulators import IdentifierStem
    from toll_booth.alg_obj.graph import InternalId
    from toll_booth.alg_obj.graph.ogm.regulators import PotentialVertex

    collected_edges = []
    source_stem = IdentifierStem.from_raw(kwargs['identifier_stem'])
    id_source = source_stem.get('id_source')
    internal_id = InternalId(''.join(['IdSource', id_source])).id_value
    id_source_stem = IdentifierStem('vertex', 'IdSource', {'id_source': id_source})
    id_source_vertex = PotentialVertex(
        'IdSource', internal_id, {'id_source': id_source},
        id_source_stem, id_source, 'id_source')
    collected_vertexes = [id_source_vertex]

    histories = kwargs.get('new_link_histories', [])
    links = kwargs.get('new_links', [])
    unlinks = kwargs.get('new_unlinks', [])

    # Each history contributes its vertex and an edge for its latest link.
    for history in histories:
        collected_vertexes.append(history.potential_vertex)
        collected_edges.append(history.generate_edge(history.most_recent_link))
    # Links first, then unlinks; each item is indexed as (generator, argument).
    for batch in (links, unlinks):
        for pair in batch:
            collected_edges.append(pair[0].generate_edge(pair[1]))

    Ogm(**kwargs).graph_objects(collected_vertexes, collected_edges)
def correct():
    """Print the change-type schema entries re-keyed with their action ids.

    Loads ``starters/change_types.json`` (located one directory above this
    file's directory), parses the ``<option>`` elements of the module-level
    ``test_html``, and for every option whose text matches a schema
    ``change_action`` stamps the option's integer value into the entry and
    its identifier stem. The corrected entries are printed as JSON.
    """
    this_directory = os.path.dirname(__file__)
    parent_directory = os.path.dirname(this_directory)
    starter_path = os.path.join(parent_directory, 'starters', 'change_types.json')
    with open(starter_path) as starter_file:
        raw_entries = json.load(starter_file)
        by_action = {item['change_action']: item for item in raw_entries}

    fixed_entries = []
    parsed_html = bs4.BeautifulSoup(test_html, features='html.parser')
    for option in parsed_html.find_all('option'):
        raw_id = option.attrs.get('value')
        action_name = option.text
        # Options lacking either a value or a label carry nothing to map.
        if not (raw_id and action_name):
            continue
        action_id = int(raw_id)
        matched = by_action.get(action_name)
        if not matched:
            continue
        matched['action_id'] = action_id
        original_stem = IdentifierStem.from_raw(matched['identifier_stem'])
        stem_pairs = original_stem.paired_identifiers
        stem_pairs['action_id'] = action_id
        rebuilt_stem = IdentifierStem('vertex', 'ChangeLogType', stem_pairs)
        dynamo_sid = rebuilt_stem.for_dynamo
        matched['identifier_stem'] = str(rebuilt_stem)
        matched['sid_value'] = dynamo_sid
        fixed_entries.append(matched)

    print(json.dumps(fixed_entries))
def specified_identifier_stem(request):
    """Build a specified identifier stem from ``request.param``.

    ``request.param`` is indexed as
    ``(id_source, object_type, id_type, id_name, specifier)``; the vertex
    stem for ``object_type`` is specified with an ``ExternalId`` driving
    stem and the fifth element.
    """
    values = request.param
    id_source = values[0]
    driving_pairs = {
        'id_source': id_source,
        'id_type': values[2],
        'id_name': values[3],
    }
    base_stem = IdentifierStem('vertex', values[1], {'id_source': id_source})
    external_stem = IdentifierStem('vertex', 'ExternalId', driving_pairs)
    return base_stem.specify(external_stem, values[4])
def propagated_identifier_stem(request):
    """Return a source stem and its ``ExternalId`` driving stem.

    ``request.param`` is indexed as
    ``(object_type, id_source, id_type, id_name)``.

    Returns:
        A dict with keys ``identifier_stem`` and ``driving_identifier_stem``.
    """
    values = request.param
    id_source = values[1]
    extracted = IdentifierStem('vertex', values[0], {'id_source': id_source})
    driving_pairs = {
        'id_source': id_source,
        'id_type': values[2],
        'id_name': values[3],
    }
    driving = IdentifierStem('vertex', 'ExternalId', driving_pairs)
    return {'identifier_stem': extracted, 'driving_identifier_stem': driving}
def _populate_common_fields(self, entry):
    """Fill in any still-falsy shared attributes from ``entry``.

    Attributes are handled in a fixed order because the schema entry is
    looked up from the extracted identifier stem's ``object_type``. Each
    value is only computed when the attribute is currently falsy.
    """
    # (attribute name, zero-argument factory) pairs, applied in order.
    lazy_fields = (
        ('_extracted_identifier_stem',
         lambda: IdentifierStem.from_raw(entry['extracted_identifier_stem'])),
        ('_driving_identifier_stem',
         lambda: IdentifierStem.from_raw(entry['driving_identifier_stem'])),
        ('_schema_entry',
         lambda: SchemaVertexEntry.retrieve(self._extracted_identifier_stem.object_type)),
        ('_mapping', self._generate_mapping),
        ('_local_max_value', lambda: entry['local_max_value']),
    )
    for attr_name, build_value in lazy_fields:
        if not getattr(self, attr_name):
            setattr(self, attr_name, build_value())
def _creep(self, entry, **kwargs):
    """Extract the remote change logs for one propagated entry and mark them.

    Args:
        entry: mapping providing ``identifier_stem`` and ``driving_id_value``.
        **kwargs: must contain ``driving_identifier_stem`` and ``driver``.
    """
    driving_stem = kwargs['driving_identifier_stem']
    remote_driver = kwargs['driver']
    entry_stem = IdentifierStem.from_raw(entry['identifier_stem'])
    id_value = entry['driving_id_value']
    # The change category is named inside the entry's own identifier stem.
    change_category = self._change_types.get_category_by_name(entry_stem.get('category'))
    logging.info(
        f'started the extraction for id_value: {id_value}, change_category_id: {change_category.category_id}'
    )
    extraction_args = dict(
        driving_id_type=driving_stem.get('id_type'),
        driving_id_name=driving_stem.get('id_name'),
        driving_id_value=id_value,
        local_max_value=self._get_local_max_value(id_value, change_category),
        category_id=change_category.category_id,
        driving_identifier_stem=driving_stem,
        identifier_stem=entry_stem,
        category=change_category,
    )
    fetched_changes = remote_driver.get_change_logs(**extraction_args)
    logging.info(
        f'completed the extraction for id_value: {id_value}, change_category_id: {change_category.category_id}'
    )
    self._mark_creep_vertexes(fetched_changes, **extraction_args)
def __init__(self, identifier_stem, driving_identifier_stem, **kwargs):
    """Set up the instance for an extracted stem and its driving stem.

    Args:
        identifier_stem: stem (or raw form) of the extracted object.
        driving_identifier_stem: stem (or raw form) that drives extraction.
        **kwargs: optional ``sample_size`` (defaults to 1000).
    """
    parsed_stem = IdentifierStem.from_raw(identifier_stem)
    parsed_driving_stem = IdentifierStem.from_raw(driving_identifier_stem)
    driving_object_type_source = parsed_driving_stem

    self._spore_id = uuid.uuid4().hex
    self._identifier_stem = parsed_stem
    self._driving_identifier_stem = parsed_driving_stem
    self._leech_driver = LeechDriver(table_name='VdGraphObjects')
    self._extractor_setup = self._leech_driver.get_extractor_setup(parsed_driving_stem)
    self._schema_entry = SchemaVertexEntry.retrieve(driving_object_type_source.object_type)
    self._sample_size = kwargs.get('sample_size', 1000)
    # Called only after the attributes above are assigned, as in the original order.
    self._extraction_profile = self._generate_extraction_profile()
    self._driving_vertex_regulator = VertexRegulator.get_for_object_type(
        driving_object_type_source.object_type)
def get_enrichment_for_change_action(**kwargs):
    """Collect enrichment data for a single change action.

    Reads ``driving_identifier_stem``, ``changelog_types``, ``action_id``,
    ``id_value`` and ``local_max_value`` from ``kwargs``.

    Returns:
        ``{'enriched_data': ...}``. When the action is static, has
        ``has_details is False`` and no ``entity_type``, the payload is a
        set of empty dicts and no mule team is created; otherwise it is
        whatever ``CredibleMuleTeam.enrich_data`` returns.
    """
    from toll_booth.alg_obj.forge.extractors.credible_fe.mule_team import CredibleMuleTeam
    from toll_booth.alg_obj.graph.ogm.regulators import IdentifierStem

    driving_stem = IdentifierStem.from_raw(kwargs['driving_identifier_stem'])
    id_source = driving_stem.get('id_source')
    changelog_types = kwargs['changelog_types']
    action_id = kwargs['action_id']
    action_key = str(action_id)
    change_action = changelog_types[action_key]
    category_id = changelog_types.get_category_id_from_action_id(str(action_id))

    nothing_to_enrich = (
        change_action.is_static
        and change_action.has_details is False
        and not change_action.entity_type
    )
    if nothing_to_enrich:
        return {'enriched_data': {'change_detail': {}, 'by_emp_ids': {}, 'entity_ids': {}}}

    team = CredibleMuleTeam(id_source)
    enriched = team.enrich_data(
        driving_id_type=driving_stem.get('id_type'),
        driving_id_name=driving_stem.get('id_name'),
        driving_id_value=kwargs['id_value'],
        local_max_value=kwargs['local_max_value'],
        category_id=category_id,
        action_id=int(action_id),
        get_details=change_action.has_details is True,
        get_by_emp_ids=change_action.is_static is False,
        get_entity_ids=change_action.entity_type,
        checked_emp_ids=None,
    )
    return {'enriched_data': enriched}
def for_link_object(self, linked_internal_id, id_source, is_unlink):
    """Build the update arguments that record a link (or unlink) object.

    Starts from ``self._for_update('linking', is_initial=True)`` and adds
    the key plus the link-specific expression names and values. Note that
    ``id_value`` (``#idv``) is bound to the same ``:lt`` timestamp value as
    ``utc_link_time``.

    Returns:
        The augmented update-arguments dict.
    """
    link_time = self._get_decimal_timestamp()
    update_args = self._for_update('linking', is_initial=True)
    link_stem = IdentifierStem('vertex', 'link', {
        'linked_id_source': id_source,
        'internal_id': linked_internal_id,
    })
    update_args['Key'] = DynamoParameters(link_time, link_stem).as_key

    link_clause = ', #d=:d, #ids=:ids, #lt=:lt, #iul=:iul, #idv=:lt, #ot=:ot, #li=:li'
    update_args['UpdateExpression'] = update_args['UpdateExpression'] + link_clause

    attribute_names = {
        '#ids': 'linked_id_source',
        '#lt': 'utc_link_time',
        '#iul': 'is_unlink',
        '#d': 'disposition',
        '#idv': 'id_value',
        '#ot': 'object_type',
        '#li': 'linked_internal_id',
    }
    attribute_values = {
        ':ids': id_source,
        ':lt': link_time,
        ':iul': is_unlink,
        ':d': 'graphing',
        ':ot': 'link',
        ':li': linked_internal_id,
    }
    update_args['ExpressionAttributeNames'].update(attribute_names)
    update_args['ExpressionAttributeValues'].update(attribute_values)
    return update_args
def spike_tables():
    """Write one ``CredibleTable`` item per table reported by the key query.

    Runs a primary-key lookup against ``INFORMATION_SCHEMA`` through
    ``CredibleReport.from_sql('MBI', ...)`` and stores, for every table in
    the report, its name and primary-key column names in the
    ``VdGraphObjects`` DynamoDB table. ``ClientError`` raised by a put is
    printed and the loop continues.
    """
    graph_table = boto3.resource('dynamodb').Table('VdGraphObjects')
    primary_key_query = (
        " SELECT Col.Table_Name, Col.Column_Name"
        " FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS Tab,"
        " INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE Col"
        " WHERE Col.Constraint_Name = Tab.Constraint_Name"
        " AND Col.Table_Name = Tab.Table_Name"
        " AND Constraint_Type = 'PRIMARY KEY' "
    )
    report = CredibleReport.from_sql('MBI', primary_key_query)
    with graph_table.batch_writer() as batch:
        for table_name, rows in report.items():
            try:
                key_columns = [row['Column_Name'] for row in rows]
            except TypeError:
                # Fallback taken when indexing the iterated items fails;
                # the entry is then treated as a single row.
                key_columns = [rows['Column_Name']]
            table_stem = IdentifierStem('vertex', 'CredibleTable', {'table_name': table_name})
            item = {
                'sid_value': table_stem.for_dynamo,
                'identifier_stem': str(table_stem),
                'table_name': table_name,
                'column_names': key_columns,
            }
            try:
                batch.put_item(Item=item)
            except ClientError as e:
                print(e)
def _extract_change_logs(cls, driver, id_value, local_max_values, **kwargs):
    """Extract and format the change logs for one driving id value.

    Args:
        driver: object exposing ``get_change_logs`` and ``get_emp_ids``.
        id_value: the driving id value to extract for.
        local_max_values: mapping of ``category_id`` to the local max value
            passed to the driver for that category.
        **kwargs: must contain ``identifier_stem``, ``id_source`` and
            ``mapping`` (``mapping`` needs a ``'default'`` entry, used when
            ``id_source`` has no mapping of its own).

    Returns:
        The list of formatted change-log records accumulated across all
        categories. The original built this list but never returned it.
    """
    identifier_stem = kwargs['identifier_stem']
    # The driving stem is stored as a paired identifier of the extracted stem.
    driving_stem = IdentifierStem.from_raw(identifier_stem.get('identifier_stem'))
    driving_id_type = driving_stem.get('id_type')
    driving_id_name = driving_stem.get('id_name')
    id_source = kwargs['id_source']
    mapping = kwargs['mapping']
    id_source_mapping = mapping.get(id_source, mapping['default'])
    object_mapping = id_source_mapping[driving_id_type]

    change_logs = []
    for category_id, local_max_value in local_max_values.items():
        extraction_args = {
            'driving_id_type': driving_id_type,
            'driving_id_name': driving_id_name,
            'driving_id_value': id_value,
            'category_id': category_id,
            'local_max_value': local_max_value,
        }
        source_extraction = driver.get_change_logs(**extraction_args)
        # NOTE(review): the emp-id merge into each entry ('User') is
        # disabled upstream, so this result is unused; the call is kept in
        # case the driver relies on it - confirm before removing.
        driver.get_emp_ids(**extraction_args)
        formatted_extraction = cls._format_change_log_data(
            identifier_stem,
            source_extraction,
            object_mapping=object_mapping,
            driver=driver,
        )
        change_logs.extend(formatted_extraction)
    return change_logs
def vd_identifier_stem(request):
    """Build a vertex stem with a fixed ``'Algernon'`` id source.

    ``request.param`` is indexed as ``(object_type, id_type, id_name)``.
    """
    values = request.param
    stem_pairs = {
        'id_source': 'Algernon',
        'id_type': values[1],
        'id_name': values[2],
    }
    return IdentifierStem('vertex', values[0], stem_pairs)
def get_credible_id_name(self, id_type):
    """Return the first stored column name for the ``CredibleTable`` ``id_type``.

    Raises:
        MissingObjectException: when the lookup response lacks ``'Item'``
            or the item lacks ``'column_names'``.
    """
    table_stem = IdentifierStem('vertex', 'CredibleTable', {'table_name': id_type})
    lookup_key = {
        'identifier_stem': str(table_stem),
        'sid_value': table_stem.for_dynamo,
    }
    response = self._table.get_item(Key=lookup_key)
    try:
        column_names = response['Item']['column_names']
        return column_names[0]
    except KeyError:
        raise MissingObjectException
def pull_schema_entry(**kwargs):
    """Retrieve the schema and the entry for the stem's object type.

    ``kwargs`` must contain ``identifier_stem``; the whole mapping is
    forwarded to ``Schema.retrieve``.

    Returns:
        A dict with keys ``schema_entry`` and ``schema``.
    """
    from toll_booth.alg_obj.graph.schemata.schema import Schema
    from toll_booth.alg_obj.graph.ogm.regulators import IdentifierStem

    parsed_stem = IdentifierStem.from_raw(kwargs['identifier_stem'])
    retrieved_schema = Schema.retrieve(**kwargs)
    entry = retrieved_schema[parsed_stem.object_type]
    return {'schema_entry': entry, 'schema': retrieved_schema}
def _calculate_change_log_identifier_stem(cls, extracted_data):
    """Build the ``ChangeLog`` vertex stem from ``extracted_data['source']``.

    Only the ``id_source``, ``id_type`` and ``id_name`` fields of the
    source record are copied into the stem.
    """
    stem_fields = ('id_source', 'id_type', 'id_name')
    stem_pairs = {field: extracted_data['source'][field] for field in stem_fields}
    return IdentifierStem('vertex', 'ChangeLog', stem_pairs)
def __init__(self, monitor_order, **kwargs):
    """Initialise from a monitor order.

    Args:
        monitor_order: object exposing ``identifier_stem`` and ``id_source``.
        **kwargs: forwarded unchanged to ``LeechDriver``; ``sample_size``
            is also read from it (default 1000).
    """
    self._identifier_stem = IdentifierStem.from_raw(monitor_order.identifier_stem)
    self._id_source = monitor_order.id_source
    leech_driver = LeechDriver(**kwargs)
    self._leech_driver = leech_driver
    self._local_setup = leech_driver.get_field_value_setup(self._identifier_stem)
    self._sample_size = kwargs.get('sample_size', 1000)
def get_local_ids(**kwargs):
    """Look up the locally indexed id values for a driving identifier stem.

    ``kwargs`` must contain ``driving_identifier_stem`` and ``schema``; the
    whole mapping is also forwarded to ``IndexManager.from_graph_schema``.

    Returns:
        ``{'local_id_values': ...}`` as produced by the index manager.
    """
    from toll_booth.alg_obj.graph.index_manager.index_manager import IndexManager
    from toll_booth.alg_obj.graph.ogm.regulators import IdentifierStem

    driving_stem = IdentifierStem.from_raw(kwargs['driving_identifier_stem'])
    index_manager = IndexManager.from_graph_schema(kwargs['schema'], **kwargs)
    return {'local_id_values': index_manager.get_local_id_values(driving_stem)}
def _calculate_change_log_identifier_stem(extracted_data):
    """Build the ``ChangeLog`` vertex stem from ``extracted_data['source']``.

    Only the ``id_source``, ``id_type`` and ``id_name`` fields of the
    source record are copied into the stem.
    """
    from toll_booth.alg_obj.graph.ogm.regulators import IdentifierStem

    stem_fields = ('id_source', 'id_type', 'id_name')
    stem_pairs = {field: extracted_data['source'][field] for field in stem_fields}
    return IdentifierStem('vertex', 'ChangeLog', stem_pairs)
def monitored_object_identifier_stem(request):
    """Build a monitored-object stem and pair it with ``request.param[6]``.

    ``request.param`` is indexed as ``(object_type, id_source, id_type,
    id_name, id_value, data_dict_id, extra)``; the paired identifiers are
    kept in that insertion order.

    Returns:
        A ``(identifier_stem, extra)`` tuple.
    """
    values = request.param
    ordered_pairs = OrderedDict([
        ('id_source', values[1]),
        ('id_type', values[2]),
        ('id_name', values[3]),
        ('id_value', values[4]),
        ('data_dict_id', values[5]),
    ])
    stem = IdentifierStem('vertex', values[0], ordered_pairs)
    return stem, values[6]
def add_data_source_vertex(id_source, **kwargs):
    """Graph the ``IdSource`` vertex for ``id_source``.

    ``kwargs`` are forwarded to ``Ogm``. The result of ``graph_objects`` is
    not used.

    Returns:
        The internal id computed for the vertex.
    """
    vertex_internal_id = InternalId(''.join(['IdSource', id_source])).id_value
    source_stem = IdentifierStem('vertex', 'IdSource', {'id_source': id_source})
    source_vertex = PotentialVertex(
        'IdSource', vertex_internal_id, {'id_source': id_source},
        source_stem, id_source, 'id_source')
    Ogm(**kwargs).graph_objects(vertexes=[source_vertex])
    return vertex_internal_id
def __init__(self, identifier_stem, id_value, **kwargs):
    """Initialise from an identifier stem and id value.

    Args:
        identifier_stem: stem (or raw form); when truthy it is parsed and
            its ``object_type`` is used.
        id_value: the object's id value.
        **kwargs: optional ``object_type`` (used only when
            ``identifier_stem`` is falsy) and ``object_properties``
            (defaults to an empty dict).
    """
    if identifier_stem:
        identifier_stem = IdentifierStem.from_raw(identifier_stem)
        resolved_type = identifier_stem.object_type
    else:
        resolved_type = kwargs.get('object_type', None)
    self._identifier_stem = identifier_stem
    self._id_value = id_value
    self._dynamo_parameters = DynamoParameters(identifier_stem, id_value)
    self._object_properties = kwargs.get('object_properties', {})
    self._object_type = resolved_type
def mark_propagated_vertexes(self, propagation_id, identifier_stem, driving_identifier_stem, driving_id_values, **kwargs):
    """Write one propagation item per (driving id value, change category).

    Args:
        propagation_id: stored (as a string) as both ``sid_value`` and
            ``propagation_id`` of every item.
        identifier_stem: extracted stem; its ``object_type`` names the
            propagation stem.
        driving_identifier_stem: stem (or raw form) of the driving object.
        driving_id_values: iterable of driving id values.
        **kwargs: must contain ``change_types`` exposing ``categories``.
    """
    # NOTE(review): only the driving stem is normalised with from_raw;
    # identifier_stem is presumably already an IdentifierStem - confirm.
    driving_stem = IdentifierStem.from_raw(driving_identifier_stem)
    base_pairs = driving_stem.paired_identifiers
    change_types = kwargs['change_types']
    with self._table.batch_writer() as batch:
        for driving_id_value in driving_id_values:
            for category in change_types.categories.values():
                propagation_pairs = base_pairs.copy()
                propagation_pairs.update({
                    'id_value': driving_id_value,
                    'category': str(category),
                })
                propagation_stem = IdentifierStem(
                    'propagation', identifier_stem.object_type, propagation_pairs)
                batch.put_item(Item={
                    'identifier_stem': str(propagation_stem),
                    'sid_value': str(propagation_id),
                    'propagation_id': str(propagation_id),
                    'driving_identifier_stem': str(driving_stem),
                    'extracted_identifier_stem': str(identifier_stem),
                    'driving_id_value': driving_id_value,
                })
def test_monitor_extraction(self, specified_identifier_stem):
    """Run ``CredibleFrontEndExtractor.extract`` for a specified stem.

    The extraction profile comes from the schema entry's
    ``CredibleFrontEndExtractor`` properties, updated in place with the
    driving stem's extractor values and the stem under test. No assertion
    is made on the result.
    """
    stem = specified_identifier_stem
    entry = SchemaVertexEntry.retrieve(stem.object_type)
    driving = IdentifierStem.from_raw(stem.retrieve('identifier_stem'))
    profile = entry.extract['CredibleFrontEndExtractor'].extraction_properties
    profile.update(driving.for_extractor)
    stem_targets = [{'identifier_stem': stem, 'id_value': None}]
    profile.update({
        'identifier_stems': stem_targets,
        'id_source': stem.retrieve('id_source'),
    })
    CredibleFrontEndExtractor.extract(**profile)
    print()
def __init__(self, *, identifier_stem, **kwargs):
    """Initialise from a keyword-only identifier stem.

    Args:
        identifier_stem: stem (or raw form) of the object to extract.
        **kwargs: optional ``extraction_queue`` and ``sample_size``
            (default 1000); the whole mapping is also passed to
            ``ForgeQueue.get_for_extraction_queue``.
    """
    parsed_stem = IdentifierStem.from_raw(identifier_stem)
    self._identifier_stem = parsed_stem
    self._object_type = parsed_stem.object_type
    self._schema_entry = SchemaVertexEntry.retrieve(self._object_type)
    self._leech_driver = LeechDriver()
    self._extractor_setup = self._leech_driver.get_extractor_setup(parsed_stem)
    self._extraction_profile = self._generate_extraction_profile()
    # The fallback queue is built even when 'extraction_queue' is supplied,
    # exactly as the original eager default argument did.
    fallback_queue = ForgeQueue.get_for_extraction_queue(**kwargs)
    self._extraction_queue = kwargs.get('extraction_queue', fallback_queue)
    self._sample_size = kwargs.get('sample_size', 1000)
def get_extractor_function_names(self, identifier_stem):
    """Return the ``extractor_function_names`` stored for ``identifier_stem``.

    Raises:
        MissingExtractionInformation: when the lookup response lacks
            ``'Item'`` or the item lacks ``'extractor_function_names'``.
    """
    identifier_stem = IdentifierStem.from_raw(identifier_stem)
    lookup = DynamoParameters(identifier_stem.for_dynamo, identifier_stem)
    response = self._table.get_item(Key=lookup.as_key)
    try:
        return response['Item']['extractor_function_names']
    except KeyError:
        raise MissingExtractionInformation(
            'could not find extractor information for identifier stem %s' % identifier_stem)
def generate_remote_id_change_data(**kwargs):
    """Assemble the change-log payload for one remote change.

    Reads ``driving_identifier_stem``, ``remote_change``,
    ``changelog_types``, ``action_id``, ``enriched_data``, ``mapping`` and
    ``id_value`` from ``kwargs``.

    Returns:
        ``{'change_data': payload}`` where ``payload`` has the keys
        ``source``, ``by_emp_id_target``, ``change_target`` and
        ``changed_target``.
    """
    from toll_booth.alg_obj.graph.ogm.regulators import IdentifierStem

    driving_stem = IdentifierStem.from_raw(kwargs['driving_identifier_stem'])
    remote_change = kwargs['remote_change']
    changelog_types = kwargs['changelog_types']
    action_id = kwargs['action_id']
    change_action = changelog_types[str(action_id)]
    enriched_data = kwargs['enriched_data']
    utc_change_date = remote_change['UTCDate']
    extracted = _build_change_log_extracted_data(remote_change, kwargs['mapping'])
    id_source = driving_stem.get('id_source')
    # NOTE(review): this reads 'emp_ids', while the empty enrichment payload
    # built elsewhere in this file uses 'by_emp_ids' - confirm the key.
    # Falls back to the driving id value when no employee is recorded.
    by_emp_id = enriched_data['emp_ids'].get(utc_change_date, kwargs['id_value'])
    fungal_stem = FungalStem.from_identifier_stem(
        driving_stem, kwargs['id_value'], change_action.category)

    payload = {
        'source': {
            'change_date_utc': extracted['change_date_utc'],
            'change_description': extracted['change_description'],
            'change_date': extracted['change_date'],
            'fungal_stem': str(fungal_stem),
            'action': extracted['action'],
            'action_id': str(action_id),
            'id_source': id_source,
            'id_type': 'ChangeLog',
            'id_name': 'change_date_utc',
            'by_emp_id': by_emp_id,
        },
        'by_emp_id_target': [{
            'id_source': id_source,
            'id_type': 'Employees',
            'id_value': by_emp_id,
        }],
        'change_target': [],
        'changed_target': [],
    }

    targets = _build_changed_targets(id_source, extracted, change_action)
    if targets:
        payload['changed_target'].extend(targets)

    detail_target = enriched_data.get('change_detail', {}).get(utc_change_date)
    if detail_target is not None:
        payload['change_target'].extend(detail_target)
    return {'change_data': payload}
def _derive_value_field_stems(self):
    """Return one ``FieldValue`` stem per configured field value.

    Each stem copies ``id_source``, ``id_type`` and ``id_name`` from this
    instance's identifier stem (in that order) and adds the field value as
    ``data_dict_id``.
    """
    base_pairs = self._identifier_stem.paired_identifiers
    configured_values = self._local_setup['field_values']
    copied_fields = ('id_source', 'id_type', 'id_name')

    derived = []
    for data_dict_id in configured_values:
        field_pairs = OrderedDict((name, base_pairs[name]) for name in copied_fields)
        field_pairs['data_dict_id'] = data_dict_id
        derived.append(IdentifierStem('vertex', 'FieldValue', field_pairs))
    return derived
def fungus(execution_id, **kwargs):
    """Run the ``command_fungi`` subtask and complete the work once it yields.

    Registers the hard-coded MBI ``ChangeLog`` stem and Employees
    ``ExternalId`` driving stem as the subtask's arguments, invokes the
    subtask signature, and appends ``CompleteWork()`` to
    ``kwargs['decisions']`` only when the signature returns a truthy result.
    """
    from toll_booth.alg_obj.graph.ogm.regulators import IdentifierStem

    subtask_name = 'command_fungi'
    decisions = kwargs['decisions']
    subtask_identifier = f'f-{execution_id}'
    task_args = kwargs['task_args']

    change_log_stem = IdentifierStem.from_raw(
        '#vertex#ChangeLog#{"id_source": "MBI"}#')
    employee_stem = IdentifierStem.from_raw(
        '#vertex#ExternalId#{"id_source": "MBI", "id_type": "Employees", "id_name": "emp_id"}#')
    task_args.add_argument_value(subtask_name, {
        'identifier_stem': change_log_stem,
        'driving_identifier_stem': employee_stem,
    })

    signature = SubtaskSignature(subtask_identifier, subtask_name, **kwargs)
    outcome = signature(**kwargs)
    if outcome:
        decisions.append(CompleteWork())