def get_description_annotations(self):
    """Gets 'describes' annotations for this import source and builds
    self.des_rels, keyed by the described (subject) field_num, where each
    value holds the subject field object and the list of fields that
    describe it.

    Side effects: sets self.description_annotations, self.count_active_fields,
    and self.des_rels (only when at least one annotation exists).
    """
    # All PRED_DESCRIBES annotations for this source, ordered by field number.
    self.description_annotations = ImportFieldAnnotation.objects\
        .filter(source_id=self.source_id,
                predicate=ImportFieldAnnotation.PRED_DESCRIBES)\
        .order_by('field_num')
    if len(self.description_annotations) > 0:
        self.count_active_fields = len(self.description_annotations)
        self.des_rels = LastUpdatedOrderedDict()
        for des_anno in self.description_annotations:
            add_descriptor_field = False
            if des_anno.object_field_num not in self.des_rels:
                # entities being described are in the field identified by object_field_num
                pg = ProcessGeneral(self.source_id)
                field_obj = pg.get_field_obj(des_anno.object_field_num)
                if field_obj is not False:
                    # Only subject-like field types get an entry; other types
                    # are silently skipped (and so are their descriptors).
                    if field_obj.field_type in ImportProfile.DEFAULT_SUBJECT_TYPE_FIELDS:
                        self.des_rels[des_anno.object_field_num] = LastUpdatedOrderedDict()
                        self.des_rels[des_anno.object_field_num]['field'] = field_obj
                        self.des_rels[des_anno.object_field_num]['des_by_fields'] = []
                        add_descriptor_field = True
            else:
                # Subject field already registered; just add this descriptor.
                add_descriptor_field = True
            if add_descriptor_field:
                # the descriptive field is identified by the field_num
                pg = ProcessGeneral(self.source_id)
                des_field_obj = pg.get_field_obj(des_anno.field_num)
                if des_field_obj is not False:
                    self.des_rels[des_anno.object_field_num]['des_by_fields'].append(des_field_obj)
def __init__(self, source_id):
    """Sets up state for profiling the fields of an import source.

    :param source_id: identifier for the import source being profiled
    """
    self.source_id = source_id
    pg = ProcessGeneral(source_id)
    pg.get_source()
    self.project_uuid = pg.project_uuid
    self.fields = []
    self.raw_field_annotations = []
    self.label = False
    self.has_subjects = False
    self.get_examples = False
    self.field_annotations = False
    # aliases for predicate constants, exposed as instance attributes
    self.PREDICATE_CONTAINS = Assertion.PREDICATES_CONTAINS
    self.PREDICATE_LINK = Assertion.PREDICATES_LINK
    self.PREDICATE_CONTAINED_IN = ImportFieldAnnotation.PRED_CONTAINED_IN
    self.PRED_DESCRIBES = ImportFieldAnnotation.PRED_DESCRIBES
    self.PRED_VALUE_OF = ImportFieldAnnotation.PRED_VALUE_OF
    self.PRED_MEDIA_PART_OF = ImportFieldAnnotation.PRED_MEDIA_PART_OF
    self.PRED_DOC_Text = ImportFieldAnnotation.PRED_DOC_Text
    self.PRED_GEO_LOCATION = ImportFieldAnnotation.PRED_GEO_LOCATION
    self.PRED_DATE_EVENT = ImportFieldAnnotation.PRED_DATE_EVENT
    self.PRED_OBS_NUM = ImportFieldAnnotation.PRED_OBS_NUM
    self.PRED_METADATA = ImportFieldAnnotation.PRED_METADATA
    self.PRED_DRAFT_CONTAINS = ImportFieldAnnotation.PRED_DRAFT_CONTAINS
    self.PRED_COMPLEX_DES = ImportFieldAnnotation.PRED_COMPLEX_DES
    self.PRED_COMPLEX_LABEL = ImportFieldAnnotation.PRED_COMPLEX_LABEL
    self.nav = False
    self.has_media_field = False
    # fix: has_doc_field was redundantly assigned False twice (once near the
    # top of __init__, once here); a single assignment with its sibling flags
    self.has_doc_field = False
    self.has_complex_des_field = False
def reconcile_item(self, imp_cell_obj):
    """Checks to see if the (person) item exists; creates it if not.

    :param imp_cell_obj: an ImportCell-like object with .record, .row_num,
        and .field_num attributes
    Side effects: may create a new person + manifest record and update the
    import cell's uuid.
    """
    self.imp_cell_obj = imp_cell_obj
    if len(imp_cell_obj.record) > 0:
        self.combined_name = imp_cell_obj.record
        self.label = imp_cell_obj.record
    else:
        # Blank cell: only assign a default label if this field requires a
        # value even when the cell is blank.
        pg = ProcessGeneral(self.source_id)
        if self.import_rows is not False:
            check_list = self.import_rows
        else:
            check_list = [imp_cell_obj.row_num]
        self.evenif_blank = pg.check_blank_required(imp_cell_obj.field_num,
                                                    check_list)
        if self.evenif_blank:
            self.combined_name = self.DEFAULT_BLANK
            self.label = self.DEFAULT_BLANK
    if isinstance(self.label, str):
        if len(self.label) > 0:
            match_found = self.match_against_persons(self.combined_name)
            if match_found is False:
                # create new subject, manifest objects. Need new UUID, since we can't assume
                # the fl_uuid for the ImportCell reflects unique entities in a field, since
                # uniqueness depends on context (values in other cells)
                self.new_entity = True
                self.uuid = GenUUID.uuid4()
                self.create_person_item()
                self.update_import_cell_uuid()
def reconcile_item(self, imp_cell_obj):
    """Checks to see if the (person) item exists; creates it if not.

    NOTE(review): this method appears to be an exact duplicate of another
    reconcile_item definition in this file — confirm whether both copies
    are needed.

    :param imp_cell_obj: an ImportCell-like object with .record, .row_num,
        and .field_num attributes
    """
    self.imp_cell_obj = imp_cell_obj
    if len(imp_cell_obj.record) > 0:
        self.combined_name = imp_cell_obj.record
        self.label = imp_cell_obj.record
    else:
        # Blank cell: only assign a default label if this field requires a
        # value even when the cell is blank.
        pg = ProcessGeneral(self.source_id)
        if self.import_rows is not False:
            check_list = self.import_rows
        else:
            check_list = [imp_cell_obj.row_num]
        self.evenif_blank = pg.check_blank_required(imp_cell_obj.field_num,
                                                    check_list)
        if self.evenif_blank:
            self.combined_name = self.DEFAULT_BLANK
            self.label = self.DEFAULT_BLANK
    if isinstance(self.label, str):
        if len(self.label) > 0:
            match_found = self.match_against_persons(self.combined_name)
            if match_found is False:
                # create new subject, manifest objects. Need new UUID, since we can't assume
                # the fl_uuid for the ImportCell reflects unique entities in a field, since
                # uniqueness depends on context (values in other cells)
                self.new_entity = True
                self.uuid = GenUUID.uuid4()
                self.create_person_item()
                self.update_import_cell_uuid()
def get_link_annotations(self):
    """Gets linking annotations between subject-type fields and builds
    self.link_rels keyed by the subject field_num; each entry holds the
    subject field object plus a list of predicate/object pairs.

    Side effects: sets self.count_active_fields and self.link_rels when
    at least one qualifying annotation exists.
    """
    # All annotations for this source except excluded predicate kinds.
    link_annotations = ImportFieldAnnotation.objects\
        .filter(source_id=self.source_id)\
        .exclude(predicate__in=self.DEFAULT_EXCLUSION_PREDS)\
        .order_by('field_num', 'object_field_num')
    if len(link_annotations) > 0:
        self.count_active_fields = len(link_annotations)
        self.link_rels = LastUpdatedOrderedDict()
        for link_anno in link_annotations:
            pg = ProcessGeneral(self.source_id)
            subj_field = pg.get_field_obj(link_anno.field_num)
            obj_field = pg.get_field_obj(link_anno.object_field_num)
            if subj_field is not False and obj_field is not False:
                # print('Found subject, object')
                # Both ends of the link must be subject-like field types.
                if subj_field.field_type in ImportProfile.DEFAULT_SUBJECT_TYPE_FIELDS \
                   and obj_field.field_type in ImportProfile.DEFAULT_SUBJECT_TYPE_FIELDS:
                    # print('Valid subject, object')
                    if link_anno.field_num not in self.link_rels:
                        rels = {'sub_field_obj': subj_field,
                                'pred_objs': []}
                    else:
                        rels = self.link_rels[link_anno.field_num]
                    pred_obj = {'predicate_uuid': False,
                                'pred_field_obj': False,
                                'obj_field_obj': obj_field}
                    # The predicate is either in another field, or a fixed uuid.
                    if link_anno.predicate_field_num > 0:
                        pred_obj['pred_field_obj'] = pg.get_field_obj(link_anno.predicate_field_num)
                    else:
                        pred_obj['predicate_uuid'] = link_anno.predicate
                    rels['pred_objs'].append(pred_obj)
                    self.link_rels[link_anno.field_num] = rels
def __init__(self, source_id):
    """Initialize state for processing types fields of an import source."""
    self.source_id = source_id
    general = ProcessGeneral(source_id)
    general.get_source()
    self.project_uuid = general.project_uuid
    # field discovery state
    self.types_fields = False
    self.start_field = False
    self.stop_field = False
    # batching / example state
    self.start_row = 1
    self.batch_size = 250
    self.end_row = self.batch_size
    self.example_size = 5
def __init__(self, source_id):
    """Initialize state for processing complex-description fields."""
    self.source_id = source_id
    general = ProcessGeneral(source_id)
    general.get_source()
    self.project_uuid = general.project_uuid
    self.complex_des_fields = []
    # batching state
    self.start_row = 1
    self.batch_size = settings.IMPORT_BATCH_SIZE
    self.end_row = self.batch_size
    # counters
    self.count_active_fields = 0
    self.count_new_assertions = 0
    # observation number used for complex-description label assertions
    self.obs_num_complex_description_assertions = 1
def __init__(self, source_id):
    """Initialize state for processing linking relations in an import."""
    self.source_id = source_id
    general = ProcessGeneral(source_id)
    general.get_source()
    self.project_uuid = general.project_uuid
    # batching / example state
    self.start_row = 1
    self.batch_size = settings.IMPORT_BATCH_SIZE
    self.end_row = self.batch_size
    self.example_size = 5
    # linking-relations state and counters
    self.link_rels = False
    self.count_active_fields = 0
    self.count_new_assertions = 0
def __init__(self, source_id):
    """Initialize state for processing media fields of an import source."""
    self.source_id = source_id
    general = ProcessGeneral(source_id)
    general.get_source()
    self.project_uuid = general.project_uuid
    self.media_fields = []
    # batching state
    self.start_row = 1
    self.batch_size = settings.IMPORT_BATCH_SIZE
    self.end_row = self.batch_size
    self.count_active_fields = 0
    # entity reconciliation bookkeeping
    self.new_entities = []
    self.reconciled_entities = []
    self.not_reconciled_entities = []
def __init__(self, source_id):
    """Initialize state for processing persons fields of an import source."""
    self.source_id = source_id
    general = ProcessGeneral(source_id)
    general.get_source()
    self.project_uuid = general.project_uuid
    self.persons_fields = []
    # batching state
    self.start_row = 1
    self.batch_size = settings.IMPORT_BATCH_SIZE
    self.end_row = self.batch_size
    self.count_active_fields = 0
    # entity reconciliation bookkeeping
    self.new_entities = []
    self.reconciled_entities = []
    self.not_reconciled_entities = []
def get_obs_num_field_num(self, field_obj):
    """Return the field_num of the observation-number field annotated as
    describing field_obj, or False when no such field exists.
    """
    annos = ImportFieldAnnotation.objects\
        .filter(source_id=self.source_id,
                predicate=ImportFieldAnnotation.PRED_OBS_NUM,
                object_field_num=field_obj.field_num)[:1]
    if len(annos) < 1:
        # no obs-num annotation points at this field
        return False
    general = ProcessGeneral(self.source_id)
    candidate = general.get_field_obj(annos[0].field_num)
    if candidate and candidate.field_type == 'obs-num':
        return candidate.field_num
    return False
def __init__(self, source_id):
    """Initialize state for processing documents fields of an import source."""
    self.source_id = source_id
    general = ProcessGeneral(source_id)
    general.get_source()
    self.project_uuid = general.project_uuid
    # object for associated metadata to new manifest objects
    self.metadata_obj = ManifestMetadata(self.source_id,
                                         self.project_uuid)
    self.documents_fields = []
    # batching state
    self.start_row = 1
    self.batch_size = settings.IMPORT_BATCH_SIZE
    self.end_row = self.batch_size
    self.count_active_fields = 0
    # entity reconciliation bookkeeping
    self.new_entities = []
    self.reconciled_entities = []
    self.not_reconciled_entities = []
def __init__(self, source_id):
    """Initialize state for processing descriptive fields of an import."""
    self.source_id = source_id
    general = ProcessGeneral(source_id)
    general.get_source()
    self.project_uuid = general.project_uuid
    # description-annotation state
    self.description_annotations = False
    self.des_rels = False
    # batching / example state
    self.start_row = 1
    self.batch_size = settings.IMPORT_BATCH_SIZE
    self.end_row = self.batch_size
    self.example_size = 5
    # caches for reconciled predicates/types and value-of lookups
    self.reconciled_predicates = {}
    self.reconciled_types = {}
    self.field_valueofs = {}
    # counters
    self.count_active_fields = 0
    self.count_new_assertions = 0
def __init__(self, source_id):
    """Initialize state for processing media fields, with manifest metadata."""
    self.source_id = source_id
    general = ProcessGeneral(source_id)
    general.get_source()
    self.project_uuid = general.project_uuid
    # object for associated metadata to new manifest objects
    self.metadata_obj = ManifestMetadata(self.source_id,
                                         self.project_uuid)
    self.media_fields = []
    # batching state
    self.start_row = 1
    self.batch_size = settings.IMPORT_BATCH_SIZE
    self.end_row = self.batch_size
    self.count_active_fields = 0
    # entity reconciliation bookkeeping
    self.new_entities = []
    self.reconciled_entities = []
    self.not_reconciled_entities = []
def get_field_valueofs(self, variable_field_num):
    """Gets the 'value' fields annotated as value-of a variable field.

    :param variable_field_num: field_num of the variable field
    :return: list of value field objects (possibly empty); also cached in
        self.field_valueofs[variable_field_num]
    """
    valueof_fields = []
    val_annos = ImportFieldAnnotation.objects\
        .filter(source_id=self.source_id,
                predicate=ImportFieldAnnotation.PRED_VALUE_OF,
                object_field_num=variable_field_num)\
        .order_by('field_num')
    if len(val_annos) > 0:
        # ProcessGeneral is loop-invariant; construct it once
        # (original rebuilt it on every iteration)
        pg = ProcessGeneral(self.source_id)
        for val_anno in val_annos:
            val_obj = pg.get_field_obj(val_anno.field_num)
            if val_obj is not False:
                if val_obj.field_type == 'value':
                    valueof_fields.append(val_obj)
    self.field_valueofs[variable_field_num] = valueof_fields
    return valueof_fields
def __init__(self, source_id):
    """Initialize state for processing subjects (context) fields."""
    self.source_id = source_id
    general = ProcessGeneral(source_id)
    general.get_source()
    self.project_uuid = general.project_uuid
    # containment-hierarchy state
    self.subjects_fields = False
    self.contain_ordered_subjects = {}
    self.non_contain_subjects = []
    self.root_subject_field = False  # field_num for the root subject field
    self.field_parent_entities = {}  # Parent entities named for a given field
    # batching / example state
    self.start_row = 1
    self.batch_size = settings.IMPORT_BATCH_SIZE
    self.end_row = self.batch_size
    self.example_size = 5
    self.count_active_fields = 0
    # entity reconciliation bookkeeping
    self.new_entities = []
    self.reconciled_entities = []
    self.not_reconciled_entities = []
def __init__(self, source_id):
    """Initialize state for profiling an import source's fields."""
    self.source_id = source_id
    general = ProcessGeneral(source_id)
    general.get_source()
    self.project_uuid = general.project_uuid
    self.fields = []
    self.has_media_field = False
    self.raw_field_annotations = []
    self.label = False
    self.has_subjects = False
    self.get_examples = False
    self.field_annotations = False
    # aliases for predicate constants, exposed as instance attributes
    self.PREDICATE_CONTAINS = Assertion.PREDICATES_CONTAINS
    self.PREDICATE_LINK = Assertion.PREDICATES_LINK
    self.PREDICATE_CONTAINED_IN = ImportFieldAnnotation.PRED_CONTAINED_IN
    self.PRED_DESCRIBES = ImportFieldAnnotation.PRED_DESCRIBES
    self.PRED_VALUE_OF = ImportFieldAnnotation.PRED_VALUE_OF
    self.PRED_MEDIA_PART_OF = ImportFieldAnnotation.PRED_MEDIA_PART_OF
    self.nav = False
def __init__(self, source_id):
    """Initialize processing-stage state for an import source; loads the
    refine source metadata and the currently active stage row.
    """
    self.source_id = source_id
    general = ProcessGeneral(source_id)
    general.get_source()
    self.project_uuid = general.project_uuid
    self.imp_source_obj = False
    self.row_count = False
    self.imp_status = False
    # batching state
    self.start_row = False
    self.batch_size = settings.IMPORT_BATCH_SIZE
    self.end_row = self.batch_size
    # stage / process bookkeeping
    self.act_process_num = False
    self.next_process_num = False
    self.done = False
    self.error = False
    self.ok = True
    self.active_processes = self.DEFAULT_PROCESS_STAGES
    # load source metadata, then determine the active stage row
    self.get_refine_source_meta()
    self.get_active_stage_row()
def __init__(self, source_id):
    """Initialize state for processing subjects fields, including the
    geospatial, date, and geojson field relations.
    """
    self.source_id = source_id
    general = ProcessGeneral(source_id)
    general.get_source()
    self.project_uuid = general.project_uuid
    self.subjects_fields = False
    # geo / chrono relations, keyed by subject field num
    self.geospace_fields = {}  # subject field num is key, dict has valid lat + lon fields
    self.date_fields = {}  # subject field num is key, dict has early late fields
    self.geojson_rels = {}  # subject field_num is key, integer value is geojson field_num
    # containment-hierarchy state
    self.contain_ordered_subjects = {}
    self.non_contain_subjects = []
    self.root_subject_field = False  # field_num for the root subject field
    self.field_parent_entities = {}  # Parent entities named for a given field
    # batching / example state
    self.start_row = 1
    self.batch_size = settings.IMPORT_BATCH_SIZE
    self.end_row = self.batch_size
    self.example_size = 5
    self.count_active_fields = 0
    # entity reconciliation bookkeeping
    self.new_entities = []
    self.reconciled_entities = []
    self.not_reconciled_entities = []
def get_variable_valueof(self, des_field_obj):
    """Checks to see if the des_by_field is a variable that has
    designated value fields; results are cached in self.field_valueofs.

    :param des_field_obj: a field object; only 'variable' fields can have
        value-of fields
    :return: list of value field objects (possibly empty)
    """
    valueof_fields = []
    if des_field_obj.field_type == 'variable':
        if des_field_obj.field_num in self.field_valueofs:
            # already computed for this field
            valueof_fields = self.field_valueofs[des_field_obj.field_num]
        else:
            # get list of field_nums that have the des_by_field as their object
            val_annos = ImportFieldAnnotation.objects\
                .filter(source_id=self.source_id,
                        predicate=ImportFieldAnnotation.PRED_VALUE_OF,
                        object_field_num=des_field_obj.field_num)\
                .order_by('field_num')
            # fixes: original had .order_by(field_num) — an undefined name,
            # raising NameError — and tested len(val_annos) > 1, which
            # silently skipped the common single-annotation case (the
            # sibling get_field_valueofs uses > 0)
            if len(val_annos) > 0:
                pg = ProcessGeneral(self.source_id)  # loop-invariant, build once
                for val_anno in val_annos:
                    val_obj = pg.get_field_obj(val_anno.field_num)
                    if val_obj is not False:
                        if val_obj.field_type == 'value':
                            valueof_fields.append(val_obj)
            self.field_valueofs[des_field_obj.field_num] = valueof_fields
    return valueof_fields
def reconcile_item(self, imp_cell_obj):
    """Checks to see if the item exists in the subjects table; creates a
    new subject when allowed and no match is found, otherwise tries to
    match against the manifest.

    :param imp_cell_obj: an ImportCell-like object with .record, .row_num,
        and .field_num attributes
    Side effects: may create subject/manifest records, updates the import
    cell's uuid, and adds a containment assertion.
    """
    self.imp_cell_obj = imp_cell_obj
    if len(imp_cell_obj.record) > 0:
        self.label = self.label_prefix + imp_cell_obj.record
    else:
        # Blank cell: only give it a default label if this field requires
        # a value even when blank.
        pg = ProcessGeneral(self.source_id)
        if self.import_rows is not False:
            check_list = self.import_rows
        else:
            check_list = [imp_cell_obj.row_num]
        self.evenif_blank = pg.check_blank_required(imp_cell_obj.field_num,
                                                    check_list)
        if self.evenif_blank:
            self.label = self.label_prefix + self.DEFAULT_BLANK
    if self.allow_new and self.label is not False:
        # Only create a new item if it is allowed and if the label is not false
        if len(self.parent_context) > 0:
            self.context = self.parent_context + Subject.HIEARCHY_DELIM + self.label
        else:
            # no parent: this label is the full context path
            self.context = self.label
        match_found = self.match_against_subjects(self.context)
        if match_found is False:
            # create new subject, manifest objects. Need new UUID, since we can't assume
            # the fl_uuid for the ImportCell reflects unique entities in a field, since
            # uniqueness depends on context (values in other cells)
            self.uuid = GenUUID.uuid4()
            self.create_subject_item()
            self.is_new = True
    else:
        if self.label is not False:
            # only allow matches on non-blank items when not creating a record
            match_found = self.match_against_manifest(self.label,
                                                      self.class_uri)
    self.update_import_cell_uuid()
    self.add_contain_assertion()
def reconcile_item(self, imp_cell_obj):
    """Checks to see if the (document) item exists; creates document and
    manifest records when no match is found, and repairs/refreshes the
    document content when a match is found.

    :param imp_cell_obj: an ImportCell-like object with .record, .row_num,
        and .field_num attributes
    """
    self.imp_cell_obj = imp_cell_obj
    if len(imp_cell_obj.record) > 0:
        self.label = imp_cell_obj.record
    else:
        # NOTE(review): check_list is computed here but never used — the
        # blank-required check done by sibling reconcile_item methods
        # appears to be missing; confirm whether that is intentional.
        pg = ProcessGeneral(self.source_id)
        if self.import_rows is not False:
            check_list = self.import_rows
        else:
            check_list = [imp_cell_obj.row_num]
    if self.label is not False:
        match_found = self.match_against_documents(self.label)
        if match_found is False:
            # create new document, manifest objects.
            self.new_entity = True
            sup_metadata = None
            self.uuid = GenUUID.uuid4()
            if self.metadata_obj is not None:
                sup_metadata = self.metadata_obj.get_metadata(imp_cell_obj.field_num,
                                                              imp_cell_obj.row_num)
                meta_uuid = self.metadata_obj.get_uuid_from_metadata_dict(sup_metadata)
                if isinstance(meta_uuid, str):
                    # use the uuid in the metadata!
                    self.uuid = meta_uuid
            self.create_document_item(sup_metadata)
        else:
            act_doc = None
            exist_docs = OCdocument.objects\
                .filter(uuid=self.uuid)[:1]
            if len(exist_docs) < 1:
                # problem! We have a manifest record for the document, but no document record,
                # so make one
                act_doc = OCdocument()
                act_doc.uuid = self.uuid  # use the previously assigned temporary UUID
                act_doc.project_uuid = self.project_uuid
                act_doc.source_id = self.source_id
                act_doc.content = self.content
                act_doc.save()
            else:
                act_doc = exist_docs[0]
            if act_doc is not None:
                if act_doc.content != self.content and self.content != self.DEFAULT_NO_CONTENT:
                    # update the document content with the latest content
                    act_doc.content = self.content
                    act_doc.save()
    self.update_import_cell_uuid()
def process_complex_batch(self):
    """Processes complex-description fields starting with a given row
    number: assigns an fl_uuid per complex description, links each
    described subject to its complex description, and makes label
    assertions for the complex-description records.

    Side effects: clears prior imports for this source, saves Assertion
    and ImportCell records, and increments self.count_new_assertions.
    """
    self.clear_source()  # clear prior import for this source
    self.end_row = self.start_row + self.batch_size
    self.get_complex_description_fields()
    label_str_uuids = {}  # cache: label text -> string content uuid
    if len(self.complex_des_fields) > 0:
        print('Number of Complex Description Fields: ' + str(len(self.complex_des_fields)))
        cp_id_number = 0
        for cp_field in self.complex_des_fields:
            cp_id_number += 1
            pc = ProcessCells(self.source_id, self.start_row)
            distinct_records = pc.get_field_records_by_fl_uuid(cp_field.describes_field.field_num,
                                                               False)
            if distinct_records is not False:
                # sort the list in row_order from the import table
                pg = ProcessGeneral(self.source_id)
                distinct_records = pg.order_distinct_records(distinct_records)
                for row_key, dist_rec in distinct_records.items():
                    if cp_field.obs_num < 1:
                        obs_num = 1
                    else:
                        obs_num = cp_field.obs_num
                    obs_node = '#obs-' + str(obs_num)
                    subject_uuid = dist_rec['imp_cell_obj'].fl_uuid
                    subject_type = cp_field.describes_field.field_type
                    subject_ok = dist_rec['imp_cell_obj'].cell_ok
                    subject_record = dist_rec['imp_cell_obj'].record
                    if subject_uuid is False or\
                       len(subject_record) < 1:
                        subject_ok = False
                    if subject_uuid == 'False':
                        subject_ok = False
                    sort = 0
                    in_rows = dist_rec['rows']
                    print('Look for complex description labels in rows: ' + str(in_rows))
                    if subject_ok is not False:
                        # OK! we have the subjects of complex descriptions
                        # with uuids, so now we can make an fl_uuid for each
                        # of the complex description fields.
                        complex_uuid = subject_uuid + self.FRAG_ID_PREFIX + str(cp_id_number)
                        complex_recs = ImportCell.objects\
                            .filter(source_id=self.source_id,
                                    field_num=cp_field.field_num,
                                    row_num__in=in_rows)\
                            .exclude(record='')
                        if len(complex_recs) > 0:
                            # we have records in the complex description field that are not blank
                            # and are associated with the subject of the complex description.
                            # so now, let's record this association.
                            save_ok = False
                            new_ass = Assertion()
                            new_ass.uuid = subject_uuid
                            new_ass.subject_type = subject_type
                            new_ass.project_uuid = self.project_uuid
                            # adding a source_id suffix keeps this from being
                            # deleted as descriptions get processed
                            new_ass.source_id = self.source_id + ProcessGeneral.COMPLEX_DESCRIPTION_SOURCE_SUFFIX
                            new_ass.obs_node = obs_node
                            new_ass.obs_num = obs_num
                            new_ass.sort = 100 + cp_id_number
                            new_ass.visibility = 1
                            new_ass.predicate_uuid = ComplexDescription.PREDICATE_COMPLEX_DES
                            new_ass.object_type = 'complex-description'
                            new_ass.object_uuid = complex_uuid
                            # fix: original called new_ass.save() here, outside
                            # the try block, before saving AGAIN inside it —
                            # the first save both duplicated work and escaped
                            # the exception guard; save only inside try
                            try:
                                new_ass.save()
                                print('Saved complex-description: ' + complex_uuid)
                                save_ok = True
                            except:
                                save_ok = False
                            if save_ok:
                                self.count_new_assertions += 1
                            # now look through the complex description records and make labels
                            for comp_rec in complex_recs:
                                # first save the fl_uuid for the complex description
                                comp_rec.fl_uuid = complex_uuid
                                comp_rec.save()
                                if isinstance(cp_field.value_prefix, str):
                                    cp_label = cp_field.value_prefix + comp_rec.record
                                else:
                                    cp_label = comp_rec.record
                                if cp_label not in label_str_uuids:
                                    # make a uuid for the record value
                                    # adding a source_id suffix keeps this from being deleted as descriptions get processed
                                    sm = StringManagement()
                                    sm.project_uuid = self.project_uuid
                                    sm.source_id = self.source_id + ProcessGeneral.COMPLEX_DESCRIPTION_SOURCE_SUFFIX
                                    oc_string = sm.get_make_string(cp_label)
                                    content_uuid = oc_string.uuid
                                    label_str_uuids[cp_label] = content_uuid
                                content_uuid = label_str_uuids[cp_label]
                                save_ok = False
                                new_ass = Assertion()
                                new_ass.uuid = complex_uuid
                                new_ass.subject_type = 'complex-description'
                                new_ass.project_uuid = self.project_uuid
                                # adding a source_id suffix keeps this from being deleted as descriptions get processed
                                new_ass.source_id = self.source_id + ProcessGeneral.COMPLEX_DESCRIPTION_SOURCE_SUFFIX
                                new_ass.obs_node = '#obs-' + str(self.obs_num_complex_description_assertions)
                                new_ass.obs_num = self.obs_num_complex_description_assertions
                                new_ass.sort = 1
                                new_ass.visibility = 1
                                new_ass.predicate_uuid = ComplexDescription.PREDICATE_COMPLEX_DES_LABEL
                                new_ass.object_type = 'xsd:string'
                                new_ass.object_uuid = content_uuid
                                try:
                                    new_ass.save()
                                    save_ok = True
                                except:
                                    save_ok = False
                                if save_ok:
                                    self.count_new_assertions += 1
def get_link_examples(self):
    """Gets example entities with linking relations, for preview display.

    :return: list of example entity dicts, each with 'label', 'id', and
        a 'links' list of predicate/object pairs
    """
    example_entities = []
    self.get_link_annotations()
    if self.link_rels is not False:
        for subj_field_num, rels in self.link_rels.items():
            # get some example records
            pc = ProcessCells(self.source_id, self.start_row)
            distinct_records = pc.get_field_records(subj_field_num, False)
            if distinct_records is not False:
                entity_example_count = 0
                # sort the list in row_order from the import table
                pg = ProcessGeneral(self.source_id)
                distinct_records = pg.order_distinct_records(distinct_records)
                for row_key, dist_rec in distinct_records.items():
                    if entity_example_count < self.example_size:
                        # if we're less than the example size, make
                        # an example object
                        entity_example_count += 1
                        entity = LastUpdatedOrderedDict()
                        entity_label = dist_rec['imp_cell_obj'].record
                        if len(entity_label) < 1:
                            entity_label = '[BLANK]'
                        entity_label = rels['sub_field_obj'].value_prefix + entity_label
                        entity['label'] = entity_label
                        entity['id'] = str(subj_field_num) + '-' + str(row_key)
                        entity['links'] = []
                        example_rows = []
                        example_rows.append(dist_rec['rows'][0])
                        in_rows = [dist_rec['rows'][0]]
                        for pred_obj in rels['pred_objs']:
                            act_preds = []
                            if pred_obj['predicate_uuid'] is not False:
                                # fixed predicate: dereference its label
                                pred_item = LastUpdatedOrderedDict()
                                pred_item['id'] = pred_obj['predicate_uuid']
                                ent = Entity()
                                found = ent.dereference(pred_obj['predicate_uuid'])
                                if found:
                                    pred_item['label'] = ent.label
                                else:
                                    pred_item['label'] = '[Missing predicate!]'
                                act_preds.append(pred_item)
                            elif pred_obj['pred_field_obj'] is not False:
                                # linking predicate is in a field
                                pc = ProcessCells(self.source_id, self.start_row)
                                predicate_records = pc.get_field_records(pred_obj['pred_field_obj'].field_num,
                                                                         in_rows)
                                for pred_row_key, pred_rec in predicate_records.items():
                                    pred_item = LastUpdatedOrderedDict()
                                    pred_item['id'] = str(pred_obj['pred_field_obj'].field_num)
                                    pred_item['id'] += '-' + str(pred_rec['rows'][0])
                                    pred_item['label'] = pred_rec['imp_cell_obj'].record
                                    if len(pred_item['label']) < 1:
                                        pred_item['label'] = '[BLANK]'
                                    # fix: original tested len(act_precs) — an
                                    # undefined name (NameError); must be act_preds
                                    if len(act_preds) < self.example_size:
                                        act_preds.append(pred_item)
                            for pred_item in act_preds:
                                link_item = LastUpdatedOrderedDict()
                                link_item['predicate'] = pred_item
                                # values are in a list, to keep consistent with descriptions
                                link_item['object'] = False
                                obj_field_obj = pred_obj['obj_field_obj']
                                # now get a value for the object from the imported cells
                                pc = ProcessCells(self.source_id, self.start_row)
                                obj_recs = pc.get_field_records(obj_field_obj.field_num, in_rows)
                                pg = ProcessGeneral(self.source_id)
                                obj_rec = pg.get_first_distinct_record(obj_recs)
                                if obj_rec is not False:
                                    object_val = LastUpdatedOrderedDict()
                                    object_label = obj_field_obj.value_prefix
                                    if len(obj_rec['imp_cell_obj'].record) > 1:
                                        object_label += obj_rec['imp_cell_obj'].record
                                    else:
                                        object_label += '[BLANK]'
                                    object_val['label'] = object_label
                                    object_val['id'] = str(obj_rec['imp_cell_obj'].field_num)
                                    object_val['id'] += '-' + str(obj_rec['rows'][0])
                                    link_item['object'] = object_val
                                if len(entity['links']) < self.example_size:
                                    entity['links'].append(link_item)
                        example_entities.append(entity)
    return example_entities
def process_complex_batch(self):
    """Processes complex-description fields starting with a given row
    number: assigns an fl_uuid per complex description, links each
    described subject to its complex description, and makes label
    assertions for the complex-description records.

    NOTE(review): this appears to be a duplicate of another
    process_complex_batch definition in this file — confirm which copy
    is live.

    Side effects: clears prior imports for this source, saves Assertion
    and ImportCell records, and increments self.count_new_assertions.
    """
    self.clear_source()  # clear prior import for this source
    self.end_row = self.start_row + self.batch_size
    self.get_complex_description_fields()
    label_str_uuids = {}  # cache: label text -> string content uuid
    if len(self.complex_des_fields) > 0:
        print('Number of Complex Description Fields: ' + str(len(self.complex_des_fields)))
        cp_id_number = 0
        for cp_field in self.complex_des_fields:
            cp_id_number += 1
            pc = ProcessCells(self.source_id, self.start_row)
            distinct_records = pc.get_field_records_by_fl_uuid(cp_field.describes_field.field_num,
                                                               False)
            if distinct_records is not False:
                # sort the list in row_order from the import table
                pg = ProcessGeneral(self.source_id)
                distinct_records = pg.order_distinct_records(distinct_records)
                for row_key, dist_rec in distinct_records.items():
                    if cp_field.obs_num < 1:
                        obs_num = 1
                    else:
                        obs_num = cp_field.obs_num
                    obs_node = '#obs-' + str(obs_num)
                    subject_uuid = dist_rec['imp_cell_obj'].fl_uuid
                    subject_type = cp_field.describes_field.field_type
                    subject_ok = dist_rec['imp_cell_obj'].cell_ok
                    subject_record = dist_rec['imp_cell_obj'].record
                    if subject_uuid is False or\
                       len(subject_record) < 1:
                        subject_ok = False
                    if subject_uuid == 'False':
                        subject_ok = False
                    sort = 0
                    in_rows = dist_rec['rows']
                    print('Look for complex description labels in rows: ' + str(in_rows))
                    if subject_ok is not False:
                        # OK! we have the subjects of complex descriptions
                        # with uuids, so now we can make an fl_uuid for each
                        # of the complex description fields.
                        complex_uuid = subject_uuid + self.FRAG_ID_PREFIX + str(cp_id_number)
                        complex_recs = ImportCell.objects\
                            .filter(source_id=self.source_id,
                                    field_num=cp_field.field_num,
                                    row_num__in=in_rows)\
                            .exclude(record='')
                        if len(complex_recs) > 0:
                            # we have records in the complex description field that are not blank
                            # and are associated with the subject of the complex description.
                            # so now, let's record this association.
                            save_ok = False
                            new_ass = Assertion()
                            new_ass.uuid = subject_uuid
                            new_ass.subject_type = subject_type
                            new_ass.project_uuid = self.project_uuid
                            # adding a source_id suffix keeps this from being
                            # deleted as descriptions get processed
                            new_ass.source_id = self.source_id + ProcessGeneral.COMPLEX_DESCRIPTION_SOURCE_SUFFIX
                            new_ass.obs_node = obs_node
                            new_ass.obs_num = obs_num
                            new_ass.sort = 100 + cp_id_number
                            new_ass.visibility = 1
                            new_ass.predicate_uuid = ComplexDescription.PREDICATE_COMPLEX_DES
                            new_ass.object_type = 'complex-description'
                            new_ass.object_uuid = complex_uuid
                            # fix: original called new_ass.save() here, outside
                            # the try block, before saving AGAIN inside it —
                            # the first save both duplicated work and escaped
                            # the exception guard; save only inside try
                            try:
                                new_ass.save()
                                print('Saved complex-description: ' + complex_uuid)
                                save_ok = True
                            except:
                                save_ok = False
                            if save_ok:
                                self.count_new_assertions += 1
                            # now look through the complex description records and make labels
                            for comp_rec in complex_recs:
                                # first save the fl_uuid for the complex description
                                comp_rec.fl_uuid = complex_uuid
                                comp_rec.save()
                                if isinstance(cp_field.value_prefix, str):
                                    cp_label = cp_field.value_prefix + comp_rec.record
                                else:
                                    cp_label = comp_rec.record
                                if cp_label not in label_str_uuids:
                                    # make a uuid for the record value
                                    # adding a source_id suffix keeps this from being deleted as descriptions get processed
                                    sm = StringManagement()
                                    sm.project_uuid = self.project_uuid
                                    sm.source_id = self.source_id + ProcessGeneral.COMPLEX_DESCRIPTION_SOURCE_SUFFIX
                                    oc_string = sm.get_make_string(cp_label)
                                    content_uuid = oc_string.uuid
                                    label_str_uuids[cp_label] = content_uuid
                                content_uuid = label_str_uuids[cp_label]
                                save_ok = False
                                new_ass = Assertion()
                                new_ass.uuid = complex_uuid
                                new_ass.subject_type = 'complex-description'
                                new_ass.project_uuid = self.project_uuid
                                # adding a source_id suffix keeps this from being deleted as descriptions get processed
                                new_ass.source_id = self.source_id + ProcessGeneral.COMPLEX_DESCRIPTION_SOURCE_SUFFIX
                                new_ass.obs_node = '#obs-' + str(self.obs_num_complex_description_assertions)
                                new_ass.obs_num = self.obs_num_complex_description_assertions
                                new_ass.sort = 1
                                new_ass.visibility = 1
                                new_ass.predicate_uuid = ComplexDescription.PREDICATE_COMPLEX_DES_LABEL
                                new_ass.object_type = 'xsd:string'
                                new_ass.object_uuid = content_uuid
                                try:
                                    new_ass.save()
                                    save_ok = True
                                except:
                                    save_ok = False
                                if save_ok:
                                    self.count_new_assertions += 1
def get_description_examples(self):
    """Gets example entities described by other fields, for preview display.

    :return: list of example entity dicts, each with 'label', 'id', and a
        'descriptions' list of predicate/objects items
    """
    example_entities = []
    self.get_description_annotations()
    if self.des_rels is not False:
        for subj_field_num, ent_obj in self.des_rels.items():
            # get some example records
            pc = ProcessCells(self.source_id, self.start_row)
            distinct_records = pc.get_field_records(subj_field_num, False)
            if distinct_records is not False:
                entity_example_count = 0
                # sort the list in row_order from the import table
                pg = ProcessGeneral(self.source_id)
                distinct_records = pg.order_distinct_records(distinct_records)
                for row_key, dist_rec in distinct_records.items():
                    if entity_example_count < self.example_size:
                        # if we're less than the example size, make
                        # an example object
                        entity_example_count += 1
                        entity = LastUpdatedOrderedDict()
                        entity_label = dist_rec['imp_cell_obj'].record
                        if len(entity_label) < 1:
                            entity_label = '[BLANK]'
                        entity_label = ent_obj['field'].value_prefix + entity_label
                        entity['label'] = entity_label
                        entity['id'] = str(subj_field_num) + '-' + str(row_key)
                        entity['descriptions'] = []
                        example_rows = []
                        example_rows.append(dist_rec['rows'][0])
                        for des_field_obj in ent_obj['des_by_fields']:
                            des_item = LastUpdatedOrderedDict()
                            des_item['predicate'] = LastUpdatedOrderedDict()
                            # values are in a list, in case there are more than 1 (variable-value)
                            des_item['objects'] = []
                            des_item['predicate']['type'] = des_field_obj.field_type
                            if des_field_obj.field_type == 'description':
                                # set the predicate for this description
                                des_item['predicate']['label'] = des_field_obj.label
                                des_item['predicate']['id'] = des_field_obj.field_num
                                # now get a value for this description from the imported cells
                                pc = ProcessCells(self.source_id, self.start_row)
                                val_recs = pc.get_field_records(des_field_obj.field_num,
                                                                example_rows)
                                pg = ProcessGeneral(self.source_id)
                                val_rec = pg.get_first_distinct_record(val_recs)
                                if val_rec is not False:
                                    object_val = LastUpdatedOrderedDict()
                                    object_val['record'] = val_rec['imp_cell_obj'].record
                                    object_val['id'] = val_rec['rows'][0]
                                    des_item['objects'].append(object_val)
                            elif des_field_obj.field_type == 'variable':
                                # need to get the predicate from the imported cells
                                pc = ProcessCells(self.source_id, self.start_row)
                                var_recs = pc.get_field_records(des_field_obj.field_num,
                                                                example_rows)
                                pg = ProcessGeneral(self.source_id)
                                # fix: original assigned val_rec from val_recs
                                # (the wrong, possibly-unbound queryset) and
                                # then tested the undefined name var_rec,
                                # raising NameError; use the var_recs result
                                var_rec = pg.get_first_distinct_record(var_recs)
                                if var_rec is not False:
                                    des_item['predicate']['label'] = var_rec['imp_cell_obj'].record
                                    pid = str(des_field_obj.field_num) + '-' + str(var_rec['rows'][0])
                                    des_item['predicate']['id'] = pid
                                    # now need to get fields that have object values for the predicate
                                    valueof_fields = self.get_variable_valueof(des_field_obj)
                                    for val_field_obj in valueof_fields:
                                        pc = ProcessCells(self.source_id, self.start_row)
                                        val_recs = pc.get_field_records(val_field_obj.field_num,
                                                                        example_rows)
                                        pg = ProcessGeneral(self.source_id)
                                        val_rec = pg.get_first_distinct_record(val_recs)
                                        if val_rec is not False:
                                            object_val = LastUpdatedOrderedDict()
                                            object_val['record'] = val_rec['imp_cell_obj'].record
                                            oid = str(val_field_obj.field_num) + '-' + str(val_rec['rows'][0])
                                            object_val['id'] = oid
                                            des_item['objects'].append(object_val)
                            entity['descriptions'].append(des_item)
                        example_entities.append(entity)
    return example_entities
# NOTE(review): `process_link_batch` is defined three times in this file; this
# earlier definition is shadowed by the later one(s) — confirm which is intended
# and remove the dead copies.
def process_link_batch(self):
    """ processes fields describing linking relations between
        subjects, media, documents, persons, projects entities.
        If start_row is 1, then previous imports of this source are cleared

        Side effects: deletes prior imports for this source (clear_source),
        creates link assertions via CandidateLinkAssertion, and increments
        self.count_new_assertions for each valid new link.
    """
    self.clear_source()  # clear prior import for this source
    self.end_row = self.start_row + self.batch_size
    self.get_link_annotations()
    if self.link_rels is not False:
        for subj_field_num, rels in self.link_rels.items():
            # get some example records
            sub_field_obj = rels['sub_field_obj']
            pc = ProcessCells(self.source_id, self.start_row)
            distinct_records = pc.get_field_records(subj_field_num, False)
            if distinct_records is not False:
                # sort the list in row_order from the import table
                pg = ProcessGeneral(self.source_id)
                distinct_records = pg.order_distinct_records(
                    distinct_records)
                for row_key, dist_rec in distinct_records.items():
                    # the subject of the link assertion comes from the
                    # field identified by subj_field_num
                    subject_uuid = dist_rec['imp_cell_obj'].fl_uuid
                    subject_type = sub_field_obj.field_type
                    subject_ok = dist_rec['imp_cell_obj'].cell_ok
                    if subject_uuid is False:
                        subject_ok = False
                    sort = 0
                    in_rows = dist_rec['rows']
                    for pred_obj in rels['pred_objs']:
                        # act_preds maps predicate_uuid -> rows where it applies
                        act_preds = {}
                        obs_num = 1  # default observation number
                        if pred_obj['predicate_uuid'] is not False:
                            # predicate is fixed by the annotation itself
                            act_preds[pred_obj['predicate_uuid']] = in_rows
                        elif pred_obj['pred_field_obj'] is not False:
                            # linking predicate is in a field
                            if pred_obj['pred_field_obj'].obs_num > 0:
                                obs_num = pred_obj[
                                    'pred_field_obj'].obs_num
                            sort = pred_obj['pred_field_obj'].field_num
                            pc = ProcessCells(self.source_id,
                                              self.start_row)
                            pred_recs = pc.get_field_records(
                                pred_obj['pred_field_obj'].field_num,
                                in_rows)
                            for pred_rec in pred_recs:
                                # reconcile each distinct cell value into a
                                # link predicate entity
                                clp = CandidateLinkPredicate()
                                clp.source_id = self.source_id
                                clp.project_uuid = self.project_uuid
                                clp.make_reconcile_link_pred(
                                    pred_rec['imp_cell_obj'].record)
                                if clp.uuid is not False:
                                    act_preds[clp.uuid] = pred_rec['rows']
                        obs_node = '#obs-' + str(obs_num)
                        for predicate_uuid, act_in_rows in act_preds.items():
                            obj_field_obj = pred_obj['obj_field_obj']
                            # now get a value for the object from the imported cells
                            pc = ProcessCells(self.source_id,
                                              self.start_row)
                            obj_recs = pc.get_field_records(
                                obj_field_obj.field_num, act_in_rows)
                            if sort < 1:
                                # fall back to the object field for sorting
                                sort = obj_field_obj.field_num
                            for hash_key, obj_rec in obj_recs.items():
                                # print('Worry about: ' + str(obj_rec['imp_cell_obj'].record))
                                object_uuid = obj_rec[
                                    'imp_cell_obj'].fl_uuid
                                object_type = obj_field_obj.field_type
                                object_ok = obj_rec['imp_cell_obj'].cell_ok
                                cla = CandidateLinkAssertion()
                                cla.project_uuid = self.project_uuid
                                cla.source_id = self.source_id
                                cla.subject_uuid = subject_uuid
                                cla.subject_type = subject_type
                                cla.obs_node = obs_node
                                cla.obs_num = obs_num
                                cla.sort = sort
                                cla.predicate_uuid = predicate_uuid
                                cla.object_uuid = object_uuid
                                cla.object_type = object_type
                                if (subject_ok and object_ok) and predicate_uuid is not False:
                                    # print('Link ok: ' + str(obj_rec['imp_cell_obj'].record))
                                    cla.create_link()
                                    if cla.is_valid:
                                        self.count_new_assertions += 1
                                        print(
                                            'Count OK: ' + str(self.count_new_assertions))
def process_description_batch(self):
    """ processes fields describing a subject (subjects, media,
        documents, persons, projects) entity field.
        if start_row is 1, then previous imports of this source are cleared

        Side effects: clears prior imports for this source, reconciles
        descriptive predicates / types / strings, creates description
        records via CandidateDescription, and increments
        self.count_new_assertions for each valid new description.
    """
    self.clear_source()  # clear prior import for this source
    self.end_row = self.start_row + self.batch_size
    self.get_description_annotations()
    if self.des_rels is not False:
        for subj_field_num, ent_obj in self.des_rels.items():
            # loop through the fields that describe the subj_field_num
            self.reconcile_descriptive_predicates(ent_obj['des_by_fields'])
        # --------
        # reconciles types and strings by looping through reconciled predicate fields
        self.reconcile_types_strings()
        # --------
        for subj_field_num, ent_obj in self.des_rels.items():
            subj_field_type = ent_obj['field'].field_type
            # get records for the subject of the description
            pc = ProcessCells(self.source_id, self.start_row)
            distinct_records = pc.get_field_records_by_fl_uuid(subj_field_num, False)
            if distinct_records is not False:
                # sort the list in row_order from the import table
                pg = ProcessGeneral(self.source_id)
                distinct_records = pg.order_distinct_records(distinct_records)
                # print(str(distinct_records))
                for row_key, dist_rec in distinct_records.items():
                    if dist_rec['imp_cell_obj'].cell_ok:
                        subject_uuid = dist_rec['imp_cell_obj'].fl_uuid
                        # the subject record is OK to use for creating
                        # description records
                        for des_field_obj in ent_obj['des_by_fields']:
                            des_field_num = des_field_obj.field_num
                            if des_field_obj.obs_num < 1:
                                obs_num = 1  # default observation number
                            else:
                                obs_num = des_field_obj.obs_num
                            obs_node = '#obs-' + str(obs_num)
                            # get the 'value-of' import cell objects for the current
                            # 'descriptive' or 'variable' field_num
                            # 'variable' field_nums may make multiple 'value-of' import_cell_objs
                            object_imp_cell_objs = self.get_assertion_object_values(des_field_num,
                                                                                    dist_rec['rows'])
                            for imp_cell_obj in object_imp_cell_objs:
                                row_num = imp_cell_obj.row_num
                                predicate = self.look_up_predicate(des_field_num,
                                                                   row_num)
                                if predicate is not False:
                                    # create a description assertion for this
                                    # subject / predicate / record value
                                    cd = CandidateDescription()
                                    cd.source_id = self.source_id
                                    cd.project_uuid = self.project_uuid
                                    cd.subject_uuid = subject_uuid
                                    cd.subject_type = subj_field_type
                                    cd.obs_num = obs_num
                                    cd.obs_node = obs_node
                                    cd.sort = des_field_num
                                    cd.predicate_uuid = str(predicate.uuid)
                                    cd.data_type = predicate.data_type
                                    cd.record = str(imp_cell_obj.record)
                                    cd.fl_uuid = imp_cell_obj.fl_uuid
                                    cd.l_uuid = imp_cell_obj.l_uuid
                                    cd.create_description()
                                    if cd.is_valid:
                                        self.count_new_assertions += 1
def get_link_examples(self):
    """ Gets example entities with linking relations.

    Returns a list of example entity dicts, each with 'label', 'id',
    and a 'links' list of predicate/object items, sampled from the
    imported cells (at most self.example_size examples per field).
    """
    example_entities = []
    self.get_link_annotations()
    if self.link_rels is not False:
        for subj_field_num, rels in self.link_rels.items():
            # get some example records
            pc = ProcessCells(self.source_id, self.start_row)
            distinct_records = pc.get_field_records(subj_field_num, False)
            if distinct_records is not False:
                entity_example_count = 0
                # sort the list in row_order from the import table
                pg = ProcessGeneral(self.source_id)
                distinct_records = pg.order_distinct_records(distinct_records)
                for row_key, dist_rec in distinct_records.items():
                    if entity_example_count < self.example_size:
                        # if we're less than the example size, make
                        # an example object
                        entity_example_count += 1
                        entity = LastUpdatedOrderedDict()
                        entity_label = dist_rec['imp_cell_obj'].record
                        if len(entity_label) < 1:
                            entity_label = '[BLANK]'
                        entity_label = rels['sub_field_obj'].value_prefix + entity_label
                        entity['label'] = entity_label
                        entity['id'] = str(subj_field_num) + '-' + str(row_key)
                        entity['links'] = []
                        example_rows = []
                        example_rows.append(dist_rec['rows'][0])
                        in_rows = [dist_rec['rows'][0]]
                        for pred_obj in rels['pred_objs']:
                            act_preds = []
                            if pred_obj['predicate_uuid'] is not False:
                                # predicate fixed by the annotation; dereference
                                # it to get a human-readable label
                                pred_item = LastUpdatedOrderedDict()
                                pred_item['id'] = pred_obj['predicate_uuid']
                                ent = Entity()
                                found = ent.dereference(pred_obj['predicate_uuid'])
                                if found:
                                    pred_item['label'] = ent.label
                                else:
                                    pred_item['label'] = '[Missing predicate!]'
                                act_preds.append(pred_item)
                            elif pred_obj['pred_field_obj'] is not False:
                                # linking predicate is in a field
                                pc = ProcessCells(self.source_id, self.start_row)
                                pred_recs = pc.get_field_records(
                                    pred_obj['pred_field_obj'].field_num,
                                    in_rows)
                                for pred_rec in pred_recs:
                                    pred_item = LastUpdatedOrderedDict()
                                    pred_item['id'] = str(pred_obj['pred_field_obj'].field_num)
                                    pred_item['id'] += '-' + str(pred_rec['rows'][0])
                                    pred_item['label'] = pred_rec['imp_cell_obj'].record
                                    if len(pred_item['label']) < 1:
                                        pred_item['label'] = '[BLANK]'
                                    # FIX: was `len(act_precs)` — an undefined name
                                    # (typo for act_preds) that raised NameError
                                    # whenever the predicate came from a field.
                                    if len(act_preds) < self.example_size:
                                        act_preds.append(pred_item)
                            for pred_item in act_preds:
                                link_item = LastUpdatedOrderedDict()
                                link_item['predicate'] = pred_item
                                # values are in a list, to keep consistent with descriptions
                                link_item['object'] = False
                                obj_field_obj = pred_obj['obj_field_obj']
                                # now get a value for the object from the imported cells
                                pc = ProcessCells(self.source_id, self.start_row)
                                obj_recs = pc.get_field_records(obj_field_obj.field_num,
                                                                in_rows)
                                pg = ProcessGeneral(self.source_id)
                                obj_rec = pg.get_first_distinct_record(obj_recs)
                                if obj_rec is not False:
                                    object_val = LastUpdatedOrderedDict()
                                    object_label = obj_field_obj.value_prefix
                                    if len(obj_rec['imp_cell_obj'].record) > 1:
                                        object_label += obj_rec['imp_cell_obj'].record
                                    else:
                                        object_label += '[BLANK]'
                                    object_val['label'] = object_label
                                    object_val['id'] = str(obj_rec['imp_cell_obj'].field_num)
                                    object_val['id'] += '-' + str(obj_rec['rows'][0])
                                    link_item['object'] = object_val
                                if len(entity['links']) < self.example_size:
                                    entity['links'].append(link_item)
                        example_entities.append(entity)
    return example_entities
# NOTE(review): duplicate definition — this is the second of three
# `process_link_batch` definitions in this file; it shadows the one above and
# is itself shadowed by the later fl_uuid-based version. Confirm and delete
# the dead copies.
def process_link_batch(self):
    """ processes fields describing linking relations between
        subjects, media, documents, persons, projects entities.
        If start_row is 1, then previous imports of this source are cleared

        Side effects: deletes prior imports for this source (clear_source),
        creates link assertions via CandidateLinkAssertion, and increments
        self.count_new_assertions for each valid new link.
    """
    self.clear_source()  # clear prior import for this source
    self.end_row = self.start_row + self.batch_size
    self.get_link_annotations()
    if self.link_rels is not False:
        for subj_field_num, rels in self.link_rels.items():
            # get some example records
            sub_field_obj = rels['sub_field_obj']
            pc = ProcessCells(self.source_id, self.start_row)
            distinct_records = pc.get_field_records(subj_field_num, False)
            if distinct_records is not False:
                # sort the list in row_order from the import table
                pg = ProcessGeneral(self.source_id)
                distinct_records = pg.order_distinct_records(distinct_records)
                for row_key, dist_rec in distinct_records.items():
                    # the subject of the link assertion comes from the
                    # field identified by subj_field_num
                    subject_uuid = dist_rec['imp_cell_obj'].fl_uuid
                    subject_type = sub_field_obj.field_type
                    subject_ok = dist_rec['imp_cell_obj'].cell_ok
                    if subject_uuid is False:
                        subject_ok = False
                    sort = 0
                    in_rows = dist_rec['rows']
                    for pred_obj in rels['pred_objs']:
                        # act_preds maps predicate_uuid -> rows where it applies
                        act_preds = {}
                        obs_num = 1  # default observation number
                        if pred_obj['predicate_uuid'] is not False:
                            # predicate is fixed by the annotation itself
                            act_preds[pred_obj['predicate_uuid']] = in_rows
                        elif pred_obj['pred_field_obj'] is not False:
                            # linking predicate is in a field
                            if pred_obj['pred_field_obj'].obs_num > 0:
                                obs_num = pred_obj['pred_field_obj'].obs_num
                            sort = pred_obj['pred_field_obj'].field_num
                            pc = ProcessCells(self.source_id, self.start_row)
                            pred_recs = pc.get_field_records(pred_obj['pred_field_obj'].field_num,
                                                             in_rows)
                            for pred_rec in pred_recs:
                                # reconcile each distinct cell value into a
                                # link predicate entity
                                clp = CandidateLinkPredicate()
                                clp.source_id = self.source_id
                                clp.project_uuid = self.project_uuid
                                clp.make_reconcile_link_pred(pred_rec['imp_cell_obj'].record)
                                if clp.uuid is not False:
                                    act_preds[clp.uuid] = pred_rec['rows']
                        obs_node = '#obs-' + str(obs_num)
                        for predicate_uuid, act_in_rows in act_preds.items():
                            obj_field_obj = pred_obj['obj_field_obj']
                            # now get a value for the object from the imported cells
                            pc = ProcessCells(self.source_id, self.start_row)
                            obj_recs = pc.get_field_records(obj_field_obj.field_num,
                                                            act_in_rows)
                            if sort < 1:
                                # fall back to the object field for sorting
                                sort = obj_field_obj.field_num
                            for hash_key, obj_rec in obj_recs.items():
                                # print('Worry about: ' + str(obj_rec['imp_cell_obj'].record))
                                object_uuid = obj_rec['imp_cell_obj'].fl_uuid
                                object_type = obj_field_obj.field_type
                                object_ok = obj_rec['imp_cell_obj'].cell_ok
                                cla = CandidateLinkAssertion()
                                cla.project_uuid = self.project_uuid
                                cla.source_id = self.source_id
                                cla.subject_uuid = subject_uuid
                                cla.subject_type = subject_type
                                cla.obs_node = obs_node
                                cla.obs_num = obs_num
                                cla.sort = sort
                                cla.predicate_uuid = predicate_uuid
                                cla.object_uuid = object_uuid
                                cla.object_type = object_type
                                if (subject_ok and object_ok) and predicate_uuid is not False:
                                    # print('Link ok: ' + str(obj_rec['imp_cell_obj'].record))
                                    cla.create_link()
                                    if cla.is_valid:
                                        self.count_new_assertions += 1
                                        print('Count OK: ' + str(self.count_new_assertions))
# NOTE(review): third definition of `process_link_batch` in this file; being
# last, this is the one Python actually binds. It differs from the earlier
# copies by using get_field_records_by_fl_uuid (distinct records grouped by
# assigned uuid) and stricter subject/object validation. Delete the earlier
# shadowed copies once confirmed.
def process_link_batch(self):
    """ processes fields describing linking relations between
        subjects, media, documents, persons, projects entities.
        If start_row is 1, then previous imports of this source are cleared

        Side effects: deletes prior imports for this source (clear_source),
        creates link assertions via CandidateLinkAssertion, and increments
        self.count_new_assertions for each valid new link.
    """
    self.clear_source()  # clear prior import for this source
    self.end_row = self.start_row + self.batch_size
    self.get_link_annotations()
    if self.link_rels is not False:
        for subj_field_num, rels in self.link_rels.items():
            # get some example records
            sub_field_obj = rels['sub_field_obj']
            pc = ProcessCells(self.source_id, self.start_row)
            # now get distinct records as determined by having the same assigned
            # uuid
            distinct_records = pc.get_field_records_by_fl_uuid(subj_field_num, False)
            if distinct_records is not False:
                # sort the list in row_order from the import table
                pg = ProcessGeneral(self.source_id)
                distinct_records = pg.order_distinct_records(distinct_records)
                for row_key, dist_rec in distinct_records.items():
                    # the subject of the link assertion comes from the
                    # field identified by subj_field_num
                    subject_uuid = dist_rec['imp_cell_obj'].fl_uuid
                    subject_type = sub_field_obj.field_type
                    subject_ok = dist_rec['imp_cell_obj'].cell_ok
                    subject_record = dist_rec['imp_cell_obj'].record
                    if subject_uuid is False or\
                       len(subject_record) < 1:
                        subject_ok = False
                    if subject_uuid == 'False':
                        # guard against the string 'False' stored as a uuid
                        subject_ok = False
                    sort = 0
                    in_rows = dist_rec['rows']
                    if subject_ok is False:
                        # use an impossible row number so no cells match
                        in_rows = [-1]
                    for pred_obj in rels['pred_objs']:
                        # act_preds maps predicate_uuid -> rows where it applies
                        act_preds = {}
                        obs_num = 1  # default observation number
                        if pred_obj['predicate_uuid'] is not False:
                            # limit to the 'in rows' for the current item
                            act_preds[pred_obj['predicate_uuid']] = in_rows
                        elif pred_obj['pred_field_obj'] is not False:
                            # linking predicate is in a field
                            if pred_obj['pred_field_obj'].obs_num > 0:
                                obs_num = pred_obj['pred_field_obj'].obs_num
                            sort = pred_obj['pred_field_obj'].field_num
                            pc = ProcessCells(self.source_id, self.start_row)
                            predicate_records = pc.get_field_records(pred_obj['pred_field_obj'].field_num,
                                                                     in_rows)
                            for pred_row_key, pred_rec in predicate_records.items():
                                # reconcile each distinct cell value into a
                                # link predicate entity
                                clp = CandidateLinkPredicate()
                                clp.source_id = self.source_id
                                clp.project_uuid = self.project_uuid
                                clp.make_reconcile_link_pred(pred_rec['imp_cell_obj'].record)
                                if clp.uuid is not False:
                                    act_preds[clp.uuid] = pred_rec['rows']
                        obs_node = '#obs-' + str(obs_num)
                        for predicate_uuid, act_in_rows in act_preds.items():
                            obj_field_obj = pred_obj['obj_field_obj']
                            # now get a value for the object from the imported cells
                            pc = ProcessCells(self.source_id, self.start_row)
                            obj_recs = pc.get_field_records_by_fl_uuid(obj_field_obj.field_num,
                                                                       act_in_rows)
                            if sort < 1:
                                # fall back to the object field for sorting
                                sort = obj_field_obj.field_num
                            if obj_recs is not False:
                                for hash_key, obj_rec in obj_recs.items():
                                    object_uuid = obj_rec['imp_cell_obj'].fl_uuid
                                    object_type = obj_field_obj.field_type
                                    object_ok = obj_rec['imp_cell_obj'].cell_ok
                                    object_record = obj_rec['imp_cell_obj'].record
                                    if len(object_record) < 1:
                                        # blank record, don't make a link
                                        object_ok = False
                                    if object_uuid is False or\
                                       len(object_uuid) < 1:
                                        object_ok = False
                                    if object_uuid == 'False':
                                        # guard against the string 'False' stored as a uuid
                                        object_ok = False
                                    if object_ok and subject_ok:
                                        message = 'Attempt link: ' + subject_record + ' (' + subject_uuid + ') -> '
                                        message += predicate_uuid + ' -> ' + object_record + ' (' + object_uuid + ')'
                                        message += 'in rows: ' + str(act_in_rows)
                                        # print(message)
                                        cla = CandidateLinkAssertion()
                                        cla.project_uuid = self.project_uuid
                                        cla.source_id = self.source_id
                                        cla.subject_uuid = subject_uuid
                                        cla.subject_type = subject_type
                                        cla.obs_node = obs_node
                                        cla.obs_num = obs_num
                                        cla.sort = sort
                                        cla.predicate_uuid = predicate_uuid
                                        cla.object_uuid = object_uuid
                                        cla.object_type = object_type
                                        if (subject_ok and object_ok) and predicate_uuid is not False:
                                            # print('Link ok: ' + str(obj_rec['imp_cell_obj'].record))
                                            cla.create_link()
                                            if cla.is_valid:
                                                self.count_new_assertions += 1
                                                print('Link Count OK: ' + str(self.count_new_assertions))