def process_single_media_label_field(self):
    """Reconcile a lone media field against already-imported media.

    Only acts when the import source has exactly one field of type
    'media'.  New media entities are never minted here
    (mint_new_entity_ok is False); distinct record values are only
    matched to existing manifest items.

    :return: True when exactly one media field was found, else False.

    NOTE(review): this method is redefined later in this file; the
    later definition (which also loads metadata fields) wins at
    class-creation time, making this one dead code.
    """
    single_media_field = False
    media_fields = ImportField.objects\
        .filter(source_id=self.source_id,
                field_type='media')
    if len(media_fields) == 1:
        # only for the 1 media field in an import source
        single_media_field = True
        print('yes we have 1 media field')
        field_obj = media_fields[0]
        pc = ProcessCells(self.source_id, self.start_row)
        distinct_records = pc.get_field_records(field_obj.field_num, False)
        if distinct_records is not False:
            print('Found Media Records: ' + str(len(distinct_records)))
            for rec_hash, dist_rec in distinct_records.items():
                # print('Checking on: ' + dist_rec['imp_cell_obj'].record)
                cm = CandidateMedia()
                cm.mint_new_entity_ok = False  # DO NOT create new entities!
                cm.project_uuid = self.project_uuid
                cm.source_id = self.source_id
                cm.class_uri = field_obj.field_value_cat
                cm.import_rows = dist_rec['rows']  # list of rows where this record value is found
                cm.reconcile_manifest_item(dist_rec['imp_cell_obj'])
                if cm.uuid is not False:
                    # matched an existing media manifest item
                    self.reconciled_entities.append({'id': str(cm.uuid),
                                                     'label': cm.label})
    return single_media_field
def process_persons_batch(self):
    """Reconcile person / organization entities for a batch of rows.

    Clears any prior import for this source, then iterates over
    self.persons_fields, pulls the distinct record values for each
    field, and reconciles each value into a person entity.  Outcomes
    are recorded on self.new_entities, self.reconciled_entities, or
    self.not_reconciled_entities.

    NOTE(review): this method is redefined later in this file; the
    later definition (which also calls get_persons_fields) wins at
    class-creation time.
    """
    self.clear_source()  # clear prior import for this source
    self.end_row = self.start_row + self.batch_size
    # FIX: was "> 1", which silently skipped processing when the
    # source has exactly one persons field; the duplicate definition
    # later in this file correctly uses "> 0".
    if len(self.persons_fields) > 0:
        for field_obj in self.persons_fields:
            pc = ProcessCells(self.source_id, self.start_row)
            distinct_records = pc.get_field_records(field_obj.field_num, False)
            if distinct_records is not False:
                for rec_hash, dist_rec in distinct_records.items():
                    cp = CandidatePerson()
                    cp.project_uuid = self.project_uuid
                    cp.source_id = self.source_id
                    cp.foaf_type = field_obj.field_value_cat
                    cp.import_rows = dist_rec['rows']  # list of rows where this record value is found
                    cp.reconcile_item(dist_rec['imp_cell_obj'])
                    if cp.uuid is not False:
                        if cp.new_entity:
                            self.new_entities.append({'id': cp.uuid,
                                                      'label': cp.label})
                        else:
                            self.reconciled_entities.append({'id': cp.uuid,
                                                             'label': cp.label})
                    else:
                        # could not reconcile; report a composite
                        # "field_num-row_num" identifier
                        bad_id = str(dist_rec['imp_cell_obj'].field_num)
                        bad_id += '-' + str(dist_rec['imp_cell_obj'].row_num)
                        self.not_reconciled_entities.append({'id': bad_id,
                                                             'label': dist_rec['imp_cell_obj'].record})
def look_up_predicate(self, field_num, row_num):
    """Look up the predicate for a field, optionally row-specific.

    When the field's predicate was fixed at the field level, return
    it directly.  Otherwise (a 'variable' field) the predicate varies
    row by row: first consult the cached per-row mapping in
    self.reconciled_predicates, then fall back to reconciling the
    predicate from the import cell record for that row.

    :param field_num: import field number to look up
    :param row_num: import row number, used for per-row predicates
    :return: a predicate object, or False when nothing is found.
    """
    predicate = False
    if field_num in self.reconciled_predicates:
        act_field = self.reconciled_predicates[field_num]
        predicate = act_field['predicate']
        if predicate is False:
            if row_num in act_field['rows']:
                # cached predicate for this specific row
                predicate = act_field['rows'][row_num]
            else:
                # look up the predicate the hard way
                # we don't have a predicate for this row, so
                # look it up through reconciliation
                des_field_obj = act_field['field_obj']
                pc = ProcessCells(self.source_id, row_num)
                distinct_records = pc.get_field_records(field_num, [row_num])
                for row_key, var_dist_rec in distinct_records.items():
                    if len(var_dist_rec['imp_cell_obj'].record) > 0:
                        # non-blank record: reconcile it as a
                        # descriptive predicate variable
                        cdp = CandidateDescriptivePredicate()
                        cdp.label = var_dist_rec['imp_cell_obj'].record
                        cdp.des_import_cell = var_dist_rec['imp_cell_obj']
                        cdp.data_type = des_field_obj.field_data_type
                        cdp.reconcile_predicate_var(des_field_obj)
                        predicate = cdp.predicate
    return predicate
def reconcile_descriptive_predicates(self, des_by_fields):
    """Reconcile descriptive predicate fields.

    For 'description' fields the predicate label comes straight from
    the import field definition.  For 'variable' fields the predicate
    labels live in the import cell records, so each distinct record is
    reconciled individually and a row_num -> predicate mapping is
    cached for later lookup (see look_up_predicate).

    :param des_by_fields: iterable of descriptive ImportField objects
    """
    for des_field_obj in des_by_fields:
        field_num = des_field_obj.field_num
        if field_num not in self.reconciled_predicates:
            recon_predicate = {'predicate': False,
                               'field_obj': des_field_obj,
                               'valueof_fields': [],
                               'rows': False}
            if des_field_obj.field_type == 'description':
                # straight forward. Predicate label from the Import Field label
                cdp = CandidateDescriptivePredicate()
                cdp.reconcile_predicate_var(des_field_obj)
                # store information about where to get values
                self.field_valueofs[field_num] = [field_num]
                recon_predicate['predicate'] = cdp.predicate
            elif des_field_obj.field_type == 'variable':
                # Predicate label in Records of Import cells
                pc = ProcessCells(self.source_id, self.start_row)
                distinct_records = pc.get_field_records(des_field_obj.field_num, False)
                # FIX: pred_rows must accumulate across ALL distinct
                # records; it was previously re-initialized inside the
                # loop, discarding earlier rows.
                pred_rows = {}
                for row_key, dist_rec in distinct_records.items():
                    cdp = CandidateDescriptivePredicate()
                    # checks to see if we need to use even a blank label
                    # because of dependencies with value-of fields
                    cdp.label = self.make_var_label_evenif_blank(des_field_obj,
                                                                 dist_rec)
                    cdp.des_import_cell = dist_rec['imp_cell_obj']
                    cdp.reconcile_predicate_var(des_field_obj)
                    for imp_cell_row in dist_rec['rows']:
                        # FIX: was "cpd.predicate" — a typo that raised
                        # NameError; the candidate object is "cdp".
                        pred_rows[imp_cell_row] = cdp.predicate
                recon_predicate['rows'] = pred_rows
            self.reconciled_predicates[des_field_obj.field_num] = recon_predicate
def reconcile_types_strings(self):
    """Reconcile type items and strings for reconciled predicate fields.

    Loops through self.reconciled_predicates.  For fields whose
    data_type is 'id' or 'xsd:string', finds the value-of fields that
    hold the actual values, reconciles each non-blank cell into a
    string entity, and — for 'id' fields — also reconciles a type
    entity linking the predicate to the string content.
    """
    for field_num, recon_predicate in self.reconciled_predicates.items():
        data_type = recon_predicate['field_obj'].field_data_type
        if data_type == 'id' or data_type == 'xsd:string':
            # we have a field with an id data_type, which becomes a types entity
            if recon_predicate['rows'] is not False:
                # variable-style field: values come from separate
                # value-of fields
                valueof_fields = []
                valueof_fields_objs = self.get_variable_valueof(recon_predicate['field_obj'])
                for valueof_field in valueof_fields_objs:
                    if isinstance(valueof_field, ImportField):
                        valueof_fields.append(valueof_field.field_num)
                    elif isinstance(valueof_field, int):
                        valueof_fields.append(valueof_field)
            elif recon_predicate['predicate'] is not False:
                # description-style field: the field itself holds values
                valueof_fields = [field_num]
            else:
                valueof_fields = []
            for valueof_field in valueof_fields:
                pc = ProcessCells(self.source_id, self.start_row)
                # print('Check value of field: ' + str(valueof_field))
                distinct_records = pc.get_field_records(valueof_field, False)
                if distinct_records is not False:
                    for row_key, val_dist_rec in distinct_records.items():
                        if len(val_dist_rec['imp_cell_obj'].record) > 0:
                            # found a non-blank type item
                            cs = CandidateString()
                            cs.source_id = self.source_id
                            cs.project_uuid = self.project_uuid
                            cs.reconcile_string_cell(val_dist_rec['imp_cell_obj'])
                            content_uuid = cs.uuid  # string content uuid
                            if data_type == 'id':
                                if recon_predicate['rows'] is not False:
                                    # need to create types row by row, because the predicate
                                    # comes from import cell records, not the import field
                                    for row_num in val_dist_rec['rows']:
                                        predicate = self.look_up_predicate(field_num,
                                                                           row_num)
                                        if predicate is not False:
                                            # NOTE(review): unlike the branch
                                            # below, source_id / project_uuid
                                            # are not set on this ct — confirm
                                            # whether that is intentional.
                                            ct = CandidateType()
                                            ct.reconcile_type_cell(predicate.uuid,
                                                                   content_uuid,
                                                                   val_dist_rec['imp_cell_obj'],
                                                                   row_num)
                                elif recon_predicate['predicate'] is not False:
                                    # predicate comes from the import field
                                    # no need to worry about individual rows
                                    predicate = recon_predicate['predicate']
                                    ct = CandidateType()
                                    ct.source_id = self.source_id
                                    ct.project_uuid = self.project_uuid
                                    ct.reconcile_type_cell(predicate.uuid,
                                                           content_uuid,
                                                           val_dist_rec['imp_cell_obj'],
                                                           False)
def make_type_ld_annotations(self, sub_type_pred_uuid, sub_type_f_num, rel_pred, obj_le_f_num):
    """Make linked data annotations for types in an import.

    Groups the cells of the subject-type field into distinct records,
    then for each non-blank type label looks up linked-data URIs in
    the object field (restricted to the same rows).  Valid http(s)
    URIs yield LinkAnnotation rows relating the (created or found)
    type to the URI via rel_pred.

    :param sub_type_pred_uuid: predicate uuid the types belong to
    :param sub_type_f_num: field number holding the type labels
    :param rel_pred: predicate URI for the link annotations
    :param obj_le_f_num: field number holding linked-entity URIs
    """
    rels = []
    sub_type_list = ImportCell.objects\
        .filter(source_id=self.source_id,
                field_num=sub_type_f_num)
    if len(sub_type_list) > 0:
        # group cells by rec_hash so each distinct value is handled once
        distinct_records = {}
        for cell in sub_type_list:
            if cell.rec_hash not in distinct_records:
                distinct_records[cell.rec_hash] = {}
                distinct_records[cell.rec_hash]['rows'] = []
                distinct_records[cell.rec_hash]['imp_cell_obj'] = cell
            distinct_records[cell.rec_hash]['rows'].append(cell.row_num)
        for rec_hash_key, distinct_type in distinct_records.items():
            # iterate through the distinct types and get associated linked data
            type_label = distinct_type['imp_cell_obj'].record
            rows = distinct_type['rows']
            if len(type_label) > 0:
                # the type isn't blank, so we can use it
                pc = ProcessCells(self.source_id, 0)
                ld_entities = pc.get_field_records(obj_le_f_num, rows)
                for ld_hash_key, distinct_ld in ld_entities.items():
                    obj_uri = distinct_ld['imp_cell_obj'].record
                    if len(obj_uri) > 8:
                        if obj_uri[:7] == 'http://'\
                           or obj_uri[:8] == 'https://':
                            # we have a valid linked data entity
                            #
                            # now get the UUID for the type
                            tm = TypeManagement()
                            tm.project_uuid = self.project_uuid
                            tm.source_id = self.source_id
                            sub_type = tm.get_make_type_within_pred_uuid(sub_type_pred_uuid,
                                                                         type_label)
                            rel = {'subject_label': type_label,
                                   'subject': sub_type.uuid,
                                   'object_uri': obj_uri}
                            rels.append(rel)
    if len(rels) > 0:
        # persist one LinkAnnotation per collected relation
        for rel in rels:
            new_la = LinkAnnotation()
            new_la.subject = rel['subject']
            new_la.subject_type = 'types'
            new_la.project_uuid = self.project_uuid
            new_la.source_id = self.source_id
            new_la.predicate_uri = rel_pred
            new_la.object_uri = rel['object_uri']
            new_la.creator_uuid = ''
            new_la.save()
            # make sure the linked entity itself is known
            web_le = WebLinkEntity()
            web_le.check_add_link_entity(rel['object_uri'])
def process_multiple_media_fields(self):
    """Process multiple media fields, if they exist.

    For every media field, reconciles each distinct record value into
    a media entity (new or existing), then reconciles the media-file
    "part" fields (thumbnail / preview / full etc.) for rows belonging
    to that entity.  Only http(s) URIs are accepted as file parts.
    """
    self.get_media_fields()
    self.get_metadata_fields()
    if len(self.media_fields) > 0:
        print('yes we have media')
        for field_obj in self.media_fields:
            pc = ProcessCells(self.source_id, self.start_row)
            distinct_records = pc.get_field_records(field_obj.field_num, False)
            if distinct_records is not False:
                print('Found Media Records: ' + str(len(distinct_records)))
                for rec_hash, dist_rec in distinct_records.items():
                    # print('Checking on: ' + dist_rec['imp_cell_obj'].record)
                    cm = CandidateMedia()
                    cm.project_uuid = self.project_uuid
                    cm.source_id = self.source_id
                    cm.class_uri = field_obj.field_value_cat
                    cm.import_rows = dist_rec['rows']  # list of rows where this record value is found
                    cm.metadata_obj = self.metadata_obj
                    cm.reconcile_manifest_item(dist_rec['imp_cell_obj'])
                    if cm.uuid is not False:
                        if cm.new_entity:
                            self.new_entities.append({'id': str(cm.uuid),
                                                      'label': cm.label})
                        else:
                            self.reconciled_entities.append({'id': str(cm.uuid),
                                                             'label': cm.label})
                        # we have a media item! Now we can add files to it
                        for part_field_obj in field_obj.parts:
                            pc = ProcessCells(self.source_id, self.start_row)
                            part_dist_records = pc.get_field_records(part_field_obj.field_num,
                                                                     cm.import_rows)
                            if part_dist_records is not False:
                                for rec_hash, part_dist_rec in part_dist_records.items():
                                    # distinct records for the media file parts of a media item
                                    cmf = CandidateMediaFile(cm.uuid)
                                    cmf.imp_cell_obj = part_dist_rec['imp_cell_obj']
                                    cmf.project_uuid = self.project_uuid
                                    cmf.source_id = self.source_id
                                    # file type is in the field_value_cat
                                    cmf.file_type = part_field_obj.field_value_cat
                                    file_uri = part_dist_rec['imp_cell_obj'].record
                                    if file_uri[:7] == 'http://' \
                                       or file_uri[:8] == 'https://':
                                        # its a URI part
                                        cmf.reconcile_media_file(file_uri)
                    else:
                        # reconciliation failed; record "field_num-row_num"
                        bad_id = str(dist_rec['imp_cell_obj'].field_num)
                        bad_id += '-' + str(dist_rec['imp_cell_obj'].row_num)
                        self.not_reconciled_entities.append({'id': bad_id,
                                                             'label': dist_rec['imp_cell_obj'].record})
def process_documents_batch(self):
    """Process fields for document entities for a batch of rows.

    Clears prior imports for this source, then for each documents
    field reconciles every distinct record into a document entity.
    When the field references a related text-content field
    (doc_text_field_num), the content from the first matching row is
    attached to the document.
    """
    self.clear_source()  # clear prior import for this source
    self.end_row = self.start_row + self.batch_size
    self.get_documents_fields()
    self.get_metadata_fields()
    if len(self.documents_fields) > 0:
        print('Number of Document Fields: ' + str(len(self.documents_fields)))
        for field_obj in self.documents_fields:
            pc = ProcessCells(self.source_id, self.start_row)
            distinct_records = pc.get_field_records(field_obj.field_num, False)
            if distinct_records is not False:
                print('Distinct document recs: ' + str(len(distinct_records)))
                for rec_hash, dist_rec in distinct_records.items():
                    content = None
                    if isinstance(field_obj.doc_text_field_num, int):
                        # we have a related document text content field
                        # get the text for the document in the first row
                        doc_text_rows = ImportCell.objects\
                            .filter(source_id=self.source_id,
                                    field_num=field_obj.doc_text_field_num,
                                    row_num=dist_rec['rows'][0])[:1]
                        if len(doc_text_rows) > 0:
                            # we found text content associated with this set
                            content = doc_text_rows[0].record
                    cd = CandidateDocument()
                    cd.project_uuid = self.project_uuid
                    cd.source_id = self.source_id
                    # NOTE(review): the label is taken from
                    # field_value_cat, not from the cell record —
                    # confirm this is the intended source.
                    cd.label = field_obj.field_value_cat
                    if isinstance(content, str):
                        # we found content to add to the document.
                        cd.content = content
                    cd.import_rows = dist_rec['rows']  # list of rows where this record value is found
                    cd.metadata_obj = self.metadata_obj
                    cd.reconcile_item(dist_rec['imp_cell_obj'])
                    if cd.uuid is not False:
                        if cd.new_entity:
                            self.new_entities.append({'id': str(cd.uuid),
                                                     'label': cd.label})
                        else:
                            self.reconciled_entities.append({'id': str(cd.uuid),
                                                             'label': cd.label})
                    else:
                        # reconciliation failed; record "field_num-row_num"
                        bad_id = str(dist_rec['imp_cell_obj'].field_num)
                        bad_id += '-' + str(dist_rec['imp_cell_obj'].row_num)
                        self.not_reconciled_entities.append({'id': str(bad_id),
                                                             'label': dist_rec['imp_cell_obj'].record})
def get_assertion_object_values(self, field_num, in_rows):
    """Collect import cell objects holding the values for a field.

    Looks up the value-of fields registered for field_num in
    self.field_valueofs and gathers their cell records limited to
    in_rows.

    NOTE(review): redefined later in this file; the later definition
    (which can build the value-of mapping on demand) wins.

    :param field_num: field whose value-of fields are consulted
    :param in_rows: row-number constraint passed to the cell lookup
    :return: list of import cell objects (possibly empty)
    """
    collected = []
    if field_num not in self.field_valueofs:
        return collected
    for source_field in self.field_valueofs[field_num]:
        cell_proc = ProcessCells(self.source_id, self.start_row)
        collected.extend(cell_proc.get_field_row_records(source_field,
                                                         in_rows))
    return collected
def process_media_batch(self):
    """Process media items for a batch of rows.

    Clears prior imports, reconciles each distinct record of every
    media field into a media entity, then reconciles the media-file
    "part" fields for rows belonging to that entity.

    NOTE(review): this method is redefined later in this file; the
    later definition wins at class-creation time.
    """
    self.clear_source()  # clear prior import for this source
    self.end_row = self.start_row + self.batch_size
    self.get_media_fields()
    if len(self.media_fields) > 0:
        for field_obj in self.media_fields:
            pc = ProcessCells(self.source_id, self.start_row)
            distinct_records = pc.get_field_records(field_obj.field_num, False)
            if distinct_records is not False:
                for rec_hash, dist_rec in distinct_records.items():
                    # print('Checking on: ' + dist_rec['imp_cell_obj'].record)
                    cm = CandidateMedia()
                    cm.project_uuid = self.project_uuid
                    cm.source_id = self.source_id
                    cm.class_uri = field_obj.field_value_cat
                    cm.import_rows = dist_rec['rows']  # list of rows where this record value is found
                    cm.reconcile_manifest_item(dist_rec['imp_cell_obj'])
                    if cm.uuid is not False:
                        if cm.new_entity:
                            self.new_entities.append({'id': str(cm.uuid),
                                                      'label': cm.label})
                        else:
                            self.reconciled_entities.append({'id': str(cm.uuid),
                                                             'label': cm.label})
                        # we have a media item! Now we can add files to it
                        for part_field_obj in field_obj.parts:
                            pc = ProcessCells(self.source_id, self.start_row)
                            part_dist_records = pc.get_field_records(part_field_obj.field_num,
                                                                     cm.import_rows)
                            if part_dist_records is not False:
                                for rec_hash, part_dist_rec in part_dist_records.items():
                                    # distinct records for the media file parts of a media item
                                    cmf = CandidateMediaFile(cm.uuid)
                                    cmf.project_uuid = self.project_uuid
                                    cmf.source_id = self.source_id
                                    # file type is in the field_value_cat
                                    cmf.file_type = part_field_obj.field_value_cat
                                    file_uri = part_dist_rec['imp_cell_obj'].record
                                    cmf.reconcile_media_file(file_uri)
                    else:
                        # reconciliation failed; record "field_num-row_num"
                        bad_id = str(dist_rec['imp_cell_obj'].field_num)
                        bad_id += '-' + str(dist_rec['imp_cell_obj'].row_num)
                        self.not_reconciled_entities.append({'id': bad_id,
                                                             'label': dist_rec['imp_cell_obj'].record})
def process_persons_batch(self):
    """Process person / organization fields for a batch of rows.

    Clears prior imports for this source, loads the persons fields,
    and reconciles each distinct record value into a person entity.
    Results are logged to self.new_entities, self.reconciled_entities,
    or self.not_reconciled_entities.
    """
    self.clear_source()  # clear prior import for this source
    self.end_row = self.start_row + self.batch_size
    self.get_persons_fields()
    if len(self.persons_fields) > 0:
        print('Number of Person Fields: ' + str(len(self.persons_fields)))
        for field_obj in self.persons_fields:
            pc = ProcessCells(self.source_id, self.start_row)
            distinct_records = pc.get_field_records(field_obj.field_num,
                                                    False)
            if distinct_records is not False:
                print('Distinct person recs: ' + str(len(distinct_records)))
                for rec_hash, dist_rec in distinct_records.items():
                    cp = CandidatePerson()
                    cp.project_uuid = self.project_uuid
                    cp.source_id = self.source_id
                    cp.foaf_type = field_obj.field_value_cat
                    cp.import_rows = dist_rec['rows']  # list of rows where this record value is found
                    cp.reconcile_item(dist_rec['imp_cell_obj'])
                    if cp.uuid is not False:
                        if cp.new_entity:
                            self.new_entities.append({'id': str(cp.uuid),
                                                      'label': cp.label})
                        else:
                            self.reconciled_entities.append({'id': str(cp.uuid),
                                                             'label': cp.label})
                    else:
                        # reconciliation failed; record "field_num-row_num"
                        bad_id = str(dist_rec['imp_cell_obj'].field_num)
                        bad_id += '-' + str(dist_rec['imp_cell_obj'].row_num)
                        self.not_reconciled_entities.append({'id': str(bad_id),
                                                             'label': dist_rec['imp_cell_obj'].record})
def process_non_contain_subjects(self):
    """Process subject entities that are not in containment relations.

    Only allows reconciliation of subjects against existing labels;
    new subjects are never created here (allow_new is False), since
    subjects may only be created within a spatial hierarchy.
    Successfully reconciled subjects also get their geospatial,
    geojson, and date data processed.
    """
    if len(self.non_contain_subjects) > 0:
        print('Non-contain process')
        for field_num in self.non_contain_subjects:
            pc = ProcessCells(self.source_id, self.start_row)
            distinct_records = pc.get_field_records(field_num, False)
            if distinct_records is not False:
                field_obj = self.subjects_fields[field_num]
                for rec_hash, dist_rec in distinct_records.items():
                    cs = CandidateSubject()
                    cs.project_uuid = self.project_uuid
                    cs.source_id = self.source_id
                    cs.obs_node = 'obs-' + str(field_obj.obs_num)
                    cs.obs_num = field_obj.obs_num
                    # no parent: these subjects are outside containment
                    cs.parent_context = False
                    cs.parent_uuid = False
                    cs.label_prefix = field_obj.value_prefix
                    cs.allow_new = False  # do not allow new, not in a hierarchy
                    cs.class_uri = field_obj.field_value_cat
                    cs.import_rows = dist_rec['rows']  # list of rows where this record value is found
                    cs.reconcile_item(dist_rec['imp_cell_obj'])
                    if cs.uuid is not False:
                        # attach location / date data for the matched subject
                        self.process_geospace_item(field_num,
                                                   cs.import_rows,
                                                   cs.uuid)
                        self.process_geojson_item(field_num,
                                                  cs.import_rows,
                                                  cs.uuid)
                        self.process_date_item(field_num,
                                               cs.import_rows,
                                               cs.uuid)
                        self.reconciled_entities.append({'id': cs.uuid,
                                                         'label': cs.label})
                    else:
                        # reconciliation failed; record "field_num-row_num"
                        bad_id = str(dist_rec['imp_cell_obj'].field_num)
                        bad_id += '-' + str(dist_rec['imp_cell_obj'].row_num)
                        self.not_reconciled_entities.append({'id': bad_id,
                                                             'label': dist_rec['imp_cell_obj'].record})
def make_var_label_evenif_blank(self, des_field_obj, dist_rec):
    """Return a label for a 'variable' field, even when blank.

    When the import cell record for a variable field is blank but one
    of its value-of fields has a non-blank value in the same rows, a
    placeholder label (DEFAULT_BLANK plus the field number) is
    produced so the predicate-variable can still be created.

    :param des_field_obj: the descriptive ImportField ('variable')
    :param dist_rec: distinct record dict with 'imp_cell_obj', 'rows'
    :return: the record's own label, a placeholder, or '' if no
        value-of field has data for these rows
    """
    label = dist_rec['imp_cell_obj'].record
    if len(label) > 0:
        # non-blank record: use it directly
        return label
    for candidate_field in self.get_variable_valueof(des_field_obj):
        cell_proc = ProcessCells(self.source_id, self.start_row)
        value_recs = cell_proc.get_field_records(candidate_field,
                                                 dist_rec['rows'])
        has_value = any(len(rec['imp_cell_obj'].record) > 0
                        for rec in value_recs.values())
        if has_value:
            # dependent values exist, so fabricate a placeholder label
            label = CandidateDescriptivePredicate.DEFAULT_BLANK
            label += '[Field: ' + str(des_field_obj.field_num) + ']'
            break
    return label
def get_assertion_object_values(self, field_num, in_rows):
    """Collect import cell objects holding the values for a field.

    Builds the value-of field mapping on demand when it is missing,
    normalizes ImportField entries to their field numbers, and
    gathers the cell records limited to in_rows.

    :param field_num: field whose value-of fields are consulted
    :param in_rows: row-number constraint passed to the cell lookup
    :return: list of import cell objects (possibly empty)
    """
    object_imp_cell_objs = []
    if field_num not in self.field_valueofs:
        # for some reason we don't have the value of fields yet
        self.get_field_valueofs(field_num)
    if field_num not in self.field_valueofs:
        # still missing after the rebuild attempt; give up
        print('cannot find field_valueofs for ' + str(field_num))
        return object_imp_cell_objs
    for valueof_field in self.field_valueofs[field_num]:
        if isinstance(valueof_field, ImportField):
            # it is not an integer, but an ImportField object
            valueof_field = valueof_field.field_num
        print('Value of field: ' + str(valueof_field))
        cell_proc = ProcessCells(self.source_id, self.start_row)
        object_imp_cell_objs.extend(cell_proc.get_field_row_records(valueof_field,
                                                                    in_rows))
    return object_imp_cell_objs
def process_single_media_label_field(self):
    """Reconcile a lone media field against already-imported media.

    Only acts when the import source has exactly one field of type
    'media'.  Loads metadata fields for that media field, then matches
    each distinct record value to existing media manifest items; new
    media entities are never minted (mint_new_entity_ok is False).

    :return: True when exactly one media field was found, else False.
    """
    single_media_field = False
    media_fields = ImportField.objects\
        .filter(source_id=self.source_id,
                field_type='media')
    if len(media_fields) == 1:
        # only for the 1 media field in an import source
        single_media_field = True
        print('yes we have 1 media field')
        field_obj = media_fields[0]
        # make the metadata fields for this one media field
        media_field_nums = [field_obj.field_num]
        self.get_metadata_fields(media_field_nums)
        pc = ProcessCells(self.source_id, self.start_row)
        distinct_records = pc.get_field_records(field_obj.field_num, False)
        if distinct_records is not False:
            print('Found Media Records: ' + str(len(distinct_records)))
            for rec_hash, dist_rec in distinct_records.items():
                # print('Checking on: ' + dist_rec['imp_cell_obj'].record)
                cm = CandidateMedia()
                cm.mint_new_entity_ok = False  # DO NOT create new entities!
                cm.project_uuid = self.project_uuid
                cm.source_id = self.source_id
                cm.class_uri = field_obj.field_value_cat
                cm.import_rows = dist_rec['rows']  # list of rows where this record value is found
                cm.metadata_obj = self.metadata_obj
                cm.reconcile_manifest_item(dist_rec['imp_cell_obj'])
                if cm.uuid is not False:
                    # matched an existing media manifest item
                    self.reconciled_entities.append({'id': str(cm.uuid),
                                                     'label': cm.label})
    return single_media_field
def process_field_hierarchy(self, field_num, parent_uuid=False, parent_context='', in_rows=False):
    """Process subject entities from a given field, recursively.

    Takes arguments about:
    1. field_num (the field to find candidate subject entities)
    2. parent_uuid (the uuid for the parent / containing subject entity)
    3. parent_context (the context path of the parent entity)
    4. in_rows (a list of row numbers to search within. this insures
       that entities are reconciled within contexts so that a Bone 1
       in a Locus 1 is noted as different from a Bone 1 in Locus 2)

    Note: this function is recursive and calls itself if the
    field_num has child fields.
    """
    pc = ProcessCells(self.source_id, self.start_row)
    distinct_records = pc.get_field_records(field_num, in_rows)
    if distinct_records is not False:
        field_obj = self.subjects_fields[field_num]
        if field_num == self.root_subject_field and parent_uuid is False:
            # at the root of the hierarchy: pick up the configured
            # parent entity for this field, if any
            if field_num in self.field_parent_entities:
                if self.field_parent_entities[field_num] is not False:
                    parent_uuid = self.field_parent_entities[field_num].uuid
                    parent_context = self.field_parent_entities[field_num].context
        for rec_hash, dist_rec in distinct_records.items():
            cs = CandidateSubject()
            cs.project_uuid = self.project_uuid
            cs.source_id = self.source_id
            cs.obs_node = 'obs-' + str(field_obj.obs_num)
            cs.obs_num = field_obj.obs_num
            cs.parent_context = parent_context
            cs.parent_uuid = parent_uuid
            cs.label_prefix = field_obj.value_prefix
            cs.allow_new = True  # allow new because it is a hierarchic field
            cs.class_uri = field_obj.field_value_cat
            cs.import_rows = dist_rec['rows']  # list of rows where this record value is found
            cs.reconcile_item(dist_rec['imp_cell_obj'])
            # show_item = str(unidecode(dist_rec['imp_cell_obj'].record))
            # print('Reconciled item: ' + show_item)
            # print('--- Has uuid: ' + str(cs.uuid))
            if cs.uuid is not False:
                if cs.is_new:
                    self.new_entities.append({'id': str(cs.uuid),
                                              'label': cs.context})
                else:
                    self.reconciled_entities.append({'id': str(cs.uuid),
                                                     'label': cs.context})
                if field_num in self.contain_ordered_subjects:
                    if self.contain_ordered_subjects[field_num] is not False:
                        # subject entity successfully reconciled or created
                        # now process next level down in hierarchy, if it exists
                        for child_field in self.contain_ordered_subjects[field_num]:
                            # recurse, constrained to this record's rows
                            self.process_field_hierarchy(child_field,
                                                         cs.uuid,
                                                         cs.context,
                                                         dist_rec['rows'])
            else:
                # reconciliation failed; record "field_num-row_num"
                bad_id = str(dist_rec['imp_cell_obj'].field_num) + '-' + str(dist_rec['imp_cell_obj'].row_num)
                self.not_reconciled_entities.append({'id': str(bad_id),
                                                     'label': dist_rec['imp_cell_obj'].record})
def process_media_batch(self):
    """Process media items for a batch of rows.

    Clears prior imports, reconciles each distinct record of every
    media field into a media entity, then reconciles the media-file
    "part" fields for rows belonging to that entity.
    """
    self.clear_source()  # clear prior import for this source
    self.end_row = self.start_row + self.batch_size
    self.get_media_fields()
    if len(self.media_fields) > 0:
        for field_obj in self.media_fields:
            pc = ProcessCells(self.source_id, self.start_row)
            distinct_records = pc.get_field_records(field_obj.field_num,
                                                    False)
            if distinct_records is not False:
                for rec_hash, dist_rec in distinct_records.items():
                    # print('Checking on: ' + dist_rec['imp_cell_obj'].record)
                    cm = CandidateMedia()
                    cm.project_uuid = self.project_uuid
                    cm.source_id = self.source_id
                    cm.class_uri = field_obj.field_value_cat
                    cm.import_rows = dist_rec['rows']  # list of rows where this record value is found
                    cm.reconcile_manifest_item(dist_rec['imp_cell_obj'])
                    if cm.uuid is not False:
                        if cm.new_entity:
                            self.new_entities.append({'id': str(cm.uuid),
                                                      'label': cm.label})
                        else:
                            self.reconciled_entities.append({'id': str(cm.uuid),
                                                             'label': cm.label})
                        # we have a media item! Now we can add files to it
                        for part_field_obj in field_obj.parts:
                            pc = ProcessCells(self.source_id, self.start_row)
                            part_dist_records = pc.get_field_records(part_field_obj.field_num,
                                                                     cm.import_rows)
                            if part_dist_records is not False:
                                for rec_hash, part_dist_rec in part_dist_records.items():
                                    # distinct records for the media file parts of a media item
                                    cmf = CandidateMediaFile(cm.uuid)
                                    cmf.project_uuid = self.project_uuid
                                    cmf.source_id = self.source_id
                                    # file type is in the field_value_cat
                                    cmf.file_type = part_field_obj.field_value_cat
                                    file_uri = part_dist_rec['imp_cell_obj'].record
                                    cmf.reconcile_media_file(file_uri)
                    else:
                        # reconciliation failed; record "field_num-row_num"
                        bad_id = str(dist_rec['imp_cell_obj'].field_num)
                        bad_id += '-' + str(dist_rec['imp_cell_obj'].row_num)
                        self.not_reconciled_entities.append({'id': bad_id,
                                                             'label': dist_rec['imp_cell_obj'].record})
def get_description_examples(self):
    """Get example entities described by other fields.

    For each subject field in self.des_rels, builds up to
    self.example_size example objects showing the entity label and, for
    every descriptive field, the predicate and example object values
    drawn from the first row where the entity occurs.

    :return: list of LastUpdatedOrderedDict example entities
    """
    example_entities = []
    self.get_description_annotations()
    if self.des_rels is not False:
        for subj_field_num, ent_obj in self.des_rels.items():
            # get some example records
            pc = ProcessCells(self.source_id, self.start_row)
            distinct_records = pc.get_field_records(subj_field_num, False)
            if distinct_records is not False:
                entity_example_count = 0
                # sort the list in row_order from the import table
                pg = ProcessGeneral(self.source_id)
                distinct_records = pg.order_distinct_records(distinct_records)
                for row_key, dist_rec in distinct_records.items():
                    if entity_example_count < self.example_size:
                        # if we're less than the example size, make
                        # an example object
                        entity_example_count += 1
                        entity = LastUpdatedOrderedDict()
                        entity_label = dist_rec['imp_cell_obj'].record
                        if len(entity_label) < 1:
                            entity_label = '[BLANK]'
                        entity_label = ent_obj['field'].value_prefix + entity_label
                        entity['label'] = entity_label
                        entity['id'] = str(subj_field_num) + '-' + str(row_key)
                        entity['descriptions'] = []
                        # only use the first row the value occurs in
                        example_rows = []
                        example_rows.append(dist_rec['rows'][0])
                        for des_field_obj in ent_obj['des_by_fields']:
                            des_item = LastUpdatedOrderedDict()
                            des_item['predicate'] = LastUpdatedOrderedDict()
                            # values are in a list, in case there are more than 1 (variable-value)
                            des_item['objects'] = []
                            des_item['predicate']['type'] = des_field_obj.field_type
                            if des_field_obj.field_type == 'description':
                                # set the predicate for this description
                                des_item['predicate']['label'] = des_field_obj.label
                                des_item['predicate']['id'] = des_field_obj.field_num
                                # now get a value for this description from the imported cells
                                pc = ProcessCells(self.source_id, self.start_row)
                                val_recs = pc.get_field_records(des_field_obj.field_num,
                                                                example_rows)
                                pg = ProcessGeneral(self.source_id)
                                val_rec = pg.get_first_distinct_record(val_recs)
                                if val_rec is not False:
                                    object_val = LastUpdatedOrderedDict()
                                    object_val['record'] = val_rec['imp_cell_obj'].record
                                    object_val['id'] = val_rec['rows'][0]
                                    des_item['objects'].append(object_val)
                            elif des_field_obj.field_type == 'variable':
                                # need to get the predicate from the imported cells
                                pc = ProcessCells(self.source_id, self.start_row)
                                var_recs = pc.get_field_records(des_field_obj.field_num,
                                                                example_rows)
                                pg = ProcessGeneral(self.source_id)
                                # FIX: was "val_rec = pg.get_first_distinct_record(val_recs)",
                                # which read the wrong records and left var_rec
                                # undefined (NameError on the check below).
                                var_rec = pg.get_first_distinct_record(var_recs)
                                if var_rec is not False:
                                    des_item['predicate']['label'] = var_rec['imp_cell_obj'].record
                                    pid = str(des_field_obj.field_num) + '-' + str(var_rec['rows'][0])
                                    des_item['predicate']['id'] = pid
                                    # now need to get fields that have object values for the predicate
                                    valueof_fields = self.get_variable_valueof(des_field_obj)
                                    for val_field_obj in valueof_fields:
                                        pc = ProcessCells(self.source_id, self.start_row)
                                        val_recs = pc.get_field_records(val_field_obj.field_num,
                                                                        example_rows)
                                        pg = ProcessGeneral(self.source_id)
                                        val_rec = pg.get_first_distinct_record(val_recs)
                                        if val_rec is not False:
                                            object_val = LastUpdatedOrderedDict()
                                            object_val['record'] = val_rec['imp_cell_obj'].record
                                            oid = str(val_field_obj.field_num) + '-' + str(val_rec['rows'][0])
                                            object_val['id'] = oid
                                            des_item['objects'].append(object_val)
                            entity['descriptions'].append(des_item)
                        example_entities.append(entity)
    return example_entities
def get_link_examples(self):
    """Get example entities with linking relations.

    For each subject field in self.link_rels, builds up to
    self.example_size example objects showing the entity label and its
    linking predicate / object values drawn from the first row where
    the entity occurs.

    :return: list of LastUpdatedOrderedDict example entities
    """
    example_entities = []
    self.get_link_annotations()
    if self.link_rels is not False:
        for subj_field_num, rels in self.link_rels.items():
            # get some example records
            pc = ProcessCells(self.source_id, self.start_row)
            distinct_records = pc.get_field_records(subj_field_num, False)
            if distinct_records is not False:
                entity_example_count = 0
                # sort the list in row_order from the import table
                pg = ProcessGeneral(self.source_id)
                distinct_records = pg.order_distinct_records(distinct_records)
                for row_key, dist_rec in distinct_records.items():
                    if entity_example_count < self.example_size:
                        # if we're less than the example size, make
                        # an example object
                        entity_example_count += 1
                        entity = LastUpdatedOrderedDict()
                        entity_label = dist_rec['imp_cell_obj'].record
                        if len(entity_label) < 1:
                            entity_label = '[BLANK]'
                        entity_label = rels['sub_field_obj'].value_prefix + entity_label
                        entity['label'] = entity_label
                        entity['id'] = str(subj_field_num) + '-' + str(row_key)
                        entity['links'] = []
                        # only use the first row the value occurs in
                        example_rows = []
                        example_rows.append(dist_rec['rows'][0])
                        in_rows = [dist_rec['rows'][0]]
                        for pred_obj in rels['pred_objs']:
                            act_preds = []
                            if pred_obj['predicate_uuid'] is not False:
                                # fixed predicate: dereference its label
                                pred_item = LastUpdatedOrderedDict()
                                pred_item['id'] = pred_obj['predicate_uuid']
                                ent = Entity()
                                found = ent.dereference(pred_obj['predicate_uuid'])
                                if found:
                                    pred_item['label'] = ent.label
                                else:
                                    pred_item['label'] = '[Missing predicate!]'
                                act_preds.append(pred_item)
                            elif pred_obj['pred_field_obj'] is not False:
                                # linking predicate is in a field
                                pc = ProcessCells(self.source_id, self.start_row)
                                predicate_records = pc.get_field_records(pred_obj['pred_field_obj'].field_num,
                                                                         in_rows)
                                for pred_row_key, pred_rec in predicate_records.items():
                                    pred_item = LastUpdatedOrderedDict()
                                    pred_item['id'] = str(pred_obj['pred_field_obj'].field_num)
                                    pred_item['id'] += '-' + str(pred_rec['rows'][0])
                                    pred_item['label'] = pred_rec['imp_cell_obj'].record
                                    if len(pred_item['label']) < 1:
                                        pred_item['label'] = '[BLANK]'
                                    # FIX: was "len(act_precs)" — a typo
                                    # that raised NameError at runtime.
                                    if len(act_preds) < self.example_size:
                                        act_preds.append(pred_item)
                            for pred_item in act_preds:
                                link_item = LastUpdatedOrderedDict()
                                link_item['predicate'] = pred_item
                                # values are in a list, to keep consistent with descriptions
                                link_item['object'] = False
                                obj_field_obj = pred_obj['obj_field_obj']
                                # now get a value for the object from the imported cells
                                pc = ProcessCells(self.source_id, self.start_row)
                                obj_recs = pc.get_field_records(obj_field_obj.field_num,
                                                                in_rows)
                                pg = ProcessGeneral(self.source_id)
                                obj_rec = pg.get_first_distinct_record(obj_recs)
                                if obj_rec is not False:
                                    object_val = LastUpdatedOrderedDict()
                                    object_label = obj_field_obj.value_prefix
                                    if len(obj_rec['imp_cell_obj'].record) > 1:
                                        object_label += obj_rec['imp_cell_obj'].record
                                    else:
                                        object_label += '[BLANK]'
                                    object_val['label'] = object_label
                                    object_val['id'] = str(obj_rec['imp_cell_obj'].field_num)
                                    object_val['id'] += '-' + str(obj_rec['rows'][0])
                                    link_item['object'] = object_val
                                if len(entity['links']) < self.example_size:
                                    entity['links'].append(link_item)
                        example_entities.append(entity)
    return example_entities
def process_link_batch(self):
    """Process fields describing linking relations between subjects, media,
    documents, persons, and projects entities.

    If start_row is 1, then previous imports of this source are cleared.
    For each subject field with link annotations, distinct subject records
    (grouped by assigned fl_uuid) are walked; for each, the set of active
    predicates is gathered (either a fixed predicate_uuid or predicates
    reconciled from a predicate field), and a CandidateLinkAssertion is
    created for each (subject, predicate, object) combination that passes
    validation.
    """
    self.clear_source()  # clear prior import for this source
    self.end_row = self.start_row + self.batch_size
    self.get_link_annotations()
    if self.link_rels is not False:
        for subj_field_num, rels in self.link_rels.items():
            # get some example records
            sub_field_obj = rels['sub_field_obj']
            pc = ProcessCells(self.source_id, self.start_row)
            # now get distinct records as determined by having the same
            # assigned uuid
            distinct_records = pc.get_field_records_by_fl_uuid(subj_field_num, False)
            if distinct_records is not False:
                # sort the list in row_order from the import table
                pg = ProcessGeneral(self.source_id)
                distinct_records = pg.order_distinct_records(distinct_records)
                for row_key, dist_rec in distinct_records.items():
                    subject_uuid = dist_rec['imp_cell_obj'].fl_uuid
                    subject_type = sub_field_obj.field_type
                    subject_ok = dist_rec['imp_cell_obj'].cell_ok
                    subject_record = dist_rec['imp_cell_obj'].record
                    # a subject with no uuid or a blank record cannot anchor a link
                    if subject_uuid is False or\
                       len(subject_record) < 1:
                        subject_ok = False
                    # guard against the string 'False' stored as a uuid value
                    if subject_uuid == 'False':
                        subject_ok = False
                    sort = 0
                    in_rows = dist_rec['rows']
                    if subject_ok is False:
                        # use an impossible row number so no cells match below
                        in_rows = [-1]
                    for pred_obj in rels['pred_objs']:
                        act_preds = {}  # maps predicate_uuid -> rows where it applies
                        obs_num = 1  # default observation number
                        if pred_obj['predicate_uuid'] is not False:
                            # limit to the 'in rows' for the current item
                            act_preds[pred_obj['predicate_uuid']] = in_rows
                        elif pred_obj['pred_field_obj'] is not False:
                            # linking predicate is in a field
                            if pred_obj['pred_field_obj'].obs_num > 0:
                                obs_num = pred_obj['pred_field_obj'].obs_num
                            sort = pred_obj['pred_field_obj'].field_num
                            pc = ProcessCells(self.source_id, self.start_row)
                            predicate_records = pc.get_field_records(pred_obj['pred_field_obj'].field_num,
                                                                     in_rows)
                            for pred_row_key, pred_rec in predicate_records.items():
                                # reconcile each predicate record to a predicate uuid
                                clp = CandidateLinkPredicate()
                                clp.source_id = self.source_id
                                clp.project_uuid = self.project_uuid
                                clp.make_reconcile_link_pred(pred_rec['imp_cell_obj'].record)
                                if clp.uuid is not False:
                                    act_preds[clp.uuid] = pred_rec['rows']
                        obs_node = '#obs-' + str(obs_num)
                        for predicate_uuid, act_in_rows in act_preds.items():
                            obj_field_obj = pred_obj['obj_field_obj']
                            # now get a value for the object from the imported cells
                            pc = ProcessCells(self.source_id, self.start_row)
                            obj_recs = pc.get_field_records_by_fl_uuid(obj_field_obj.field_num,
                                                                       act_in_rows)
                            if sort < 1:
                                sort = obj_field_obj.field_num
                            if obj_recs is not False:
                                for hash_key, obj_rec in obj_recs.items():
                                    object_uuid = obj_rec['imp_cell_obj'].fl_uuid
                                    object_type = obj_field_obj.field_type
                                    object_ok = obj_rec['imp_cell_obj'].cell_ok
                                    object_record = obj_rec['imp_cell_obj'].record
                                    if len(object_record) < 1:
                                        # blank record, don't make a link
                                        object_ok = False
                                    if object_uuid is False or\
                                       len(object_uuid) < 1:
                                        object_ok = False
                                    if object_uuid == 'False':
                                        object_ok = False
                                    if object_ok and subject_ok:
                                        # assemble a debug message (printing currently disabled)
                                        message = 'Attempt link: ' + subject_record + ' ('+ subject_uuid + ') -> '
                                        message += predicate_uuid + ' -> ' + object_record + ' ('+ object_uuid + ')'
                                        message += 'in rows: ' + str(act_in_rows)
                                        # print(message)
                                    cla = CandidateLinkAssertion()
                                    cla.project_uuid = self.project_uuid
                                    cla.source_id = self.source_id
                                    cla.subject_uuid = subject_uuid
                                    cla.subject_type = subject_type
                                    cla.obs_node = obs_node
                                    cla.obs_num = obs_num
                                    cla.sort = sort
                                    cla.predicate_uuid = predicate_uuid
                                    cla.object_uuid = object_uuid
                                    cla.object_type = object_type
                                    if (subject_ok and object_ok) and predicate_uuid is not False:
                                        # print('Link ok: ' + str(obj_rec['imp_cell_obj'].record))
                                        cla.create_link()
                                        if cla.is_valid:
                                            self.count_new_assertions += 1
                                            print('Link Count OK: ' + str(self.count_new_assertions))
def process_description_batch(self):
    """Process fields describing a subject (subjects, media, documents,
    persons, projects) entity field.

    If start_row is 1, then previous imports of this source are cleared.
    First reconciles the descriptive predicate fields and their
    types/strings, then creates a CandidateDescription for every
    (subject record, descriptive field, value) combination whose subject
    cell and predicate are valid.
    """
    self.clear_source()  # clear prior import for this source
    self.end_row = self.start_row + self.batch_size
    self.get_description_annotations()
    if self.des_rels is not False:
        for subj_field_num, ent_obj in self.des_rels.items():
            # loop through the fields that describe the subj_field_num
            self.reconcile_descriptive_predicates(ent_obj['des_by_fields'])
        # --------
        # reconciles types and strings by looping through reconciled
        # predicate fields
        self.reconcile_types_strings()
        # --------
        for subj_field_num, ent_obj in self.des_rels.items():
            subj_field_type = ent_obj['field'].field_type
            # get records for the subject of the description
            pc = ProcessCells(self.source_id, self.start_row)
            distinct_records = pc.get_field_records_by_fl_uuid(subj_field_num, False)
            if distinct_records is not False:
                pg = ProcessGeneral(self.source_id)
                distinct_records = pg.order_distinct_records(distinct_records)
                # print(str(distinct_records))
                for row_key, dist_rec in distinct_records.items():
                    if dist_rec['imp_cell_obj'].cell_ok:
                        subject_uuid = dist_rec['imp_cell_obj'].fl_uuid
                        # the subject record is OK to use for creating
                        # description records
                        for des_field_obj in ent_obj['des_by_fields']:
                            des_field_num = des_field_obj.field_num
                            if des_field_obj.obs_num < 1:
                                obs_num = 1
                            else:
                                obs_num = des_field_obj.obs_num
                            obs_node = '#obs-' + str(obs_num)
                            # get the 'value-of' import cell objects for the current
                            # 'descriptive' or 'variable' field_num
                            # 'variable' field_nums may make multiple 'value-of' import_cell_objs
                            object_imp_cell_objs = self.get_assertion_object_values(des_field_num,
                                                                                   dist_rec['rows'])
                            for imp_cell_obj in object_imp_cell_objs:
                                row_num = imp_cell_obj.row_num
                                predicate = self.look_up_predicate(des_field_num, row_num)
                                if predicate is not False:
                                    cd = CandidateDescription()
                                    cd.source_id = self.source_id
                                    cd.project_uuid = self.project_uuid
                                    cd.subject_uuid = subject_uuid
                                    cd.subject_type = subj_field_type
                                    cd.obs_num = obs_num
                                    cd.obs_node = obs_node
                                    cd.sort = des_field_num
                                    cd.predicate_uuid = str(predicate.uuid)
                                    cd.data_type = predicate.data_type
                                    cd.record = str(imp_cell_obj.record)
                                    cd.fl_uuid = imp_cell_obj.fl_uuid
                                    cd.l_uuid = imp_cell_obj.l_uuid
                                    cd.create_description()
                                    if cd.is_valid:
                                        self.count_new_assertions += 1
def process_complex_batch(self):
    """Process complex-description fields for entities, starting with a
    given row number.

    For each complex-description field, finds the distinct described
    subjects (by fl_uuid), makes a fragment uuid for the complex
    description, links subject -> complex description with an Assertion,
    assigns the fragment uuid to each non-blank complex-description cell,
    and creates a label string + label Assertion for each distinct label.
    """
    self.clear_source()  # clear prior import for this source
    self.end_row = self.start_row + self.batch_size
    self.get_complex_description_fields()
    label_str_uuids = {}  # cache: complex-description label -> string content uuid
    if len(self.complex_des_fields) > 0:
        print('Number of Complex Description Fields: ' + str(len(self.complex_des_fields)))
        cp_id_number = 0
        for cp_field in self.complex_des_fields:
            cp_id_number += 1
            pc = ProcessCells(self.source_id, self.start_row)
            distinct_records = pc.get_field_records_by_fl_uuid(cp_field.describes_field.field_num,
                                                               False)
            if distinct_records is not False:
                # sort the list in row_order from the import table
                pg = ProcessGeneral(self.source_id)
                distinct_records = pg.order_distinct_records(distinct_records)
                for row_key, dist_rec in distinct_records.items():
                    if cp_field.obs_num < 1:
                        obs_num = 1
                    else:
                        obs_num = cp_field.obs_num
                    obs_node = '#obs-' + str(obs_num)
                    subject_uuid = dist_rec['imp_cell_obj'].fl_uuid
                    subject_type = cp_field.describes_field.field_type
                    subject_ok = dist_rec['imp_cell_obj'].cell_ok
                    subject_record = dist_rec['imp_cell_obj'].record
                    if subject_uuid is False or\
                       len(subject_record) < 1:
                        subject_ok = False
                    if subject_uuid == 'False':
                        subject_ok = False
                    sort = 0
                    in_rows = dist_rec['rows']
                    print('Look for complex description labels in rows: ' + str(in_rows))
                    if subject_ok is not False:
                        # OK! we have the subjects of complex descriptions
                        # with uuids, so now we can make an fl_uuid for each
                        # of the complex description fields.
                        complex_uuid = subject_uuid + self.FRAG_ID_PREFIX + str(cp_id_number)
                        complex_recs = ImportCell.objects\
                                                 .filter(source_id=self.source_id,
                                                         field_num=cp_field.field_num,
                                                         row_num__in=in_rows)\
                                                 .exclude(record='')
                        if len(complex_recs) > 0:
                            # we have records in the complex description field that
                            # are not blank and are associated with the subject of
                            # the complex description, so record this association.
                            save_ok = False
                            new_ass = Assertion()
                            new_ass.uuid = subject_uuid
                            new_ass.subject_type = subject_type
                            new_ass.project_uuid = self.project_uuid
                            # adding a source_id suffix keeps this from being
                            # deleted as descriptions get processed
                            new_ass.source_id = self.source_id + ProcessGeneral.COMPLEX_DESCRIPTION_SOURCE_SUFFIX
                            new_ass.obs_node = obs_node
                            new_ass.obs_num = obs_num
                            new_ass.sort = 100 + cp_id_number
                            new_ass.visibility = 1
                            new_ass.predicate_uuid = ComplexDescription.PREDICATE_COMPLEX_DES
                            new_ass.object_type = 'complex-description'
                            new_ass.object_uuid = complex_uuid
                            # BUGFIX: the original called new_ass.save() once
                            # unprotected and then again inside the try block,
                            # saving twice and letting DB errors escape the guard.
                            try:
                                print('Saved complex-description: ' + complex_uuid)
                                new_ass.save()
                                save_ok = True
                            except Exception:
                                save_ok = False
                            if save_ok:
                                self.count_new_assertions += 1
                            # now look through the complex description records
                            # and make labels
                            for comp_rec in complex_recs:
                                # first save the fl_uuid for the complex description
                                comp_rec.fl_uuid = complex_uuid
                                comp_rec.save()
                                if isinstance(cp_field.value_prefix, str):
                                    cp_label = cp_field.value_prefix + comp_rec.record
                                else:
                                    cp_label = comp_rec.record
                                if cp_label not in label_str_uuids:
                                    # make a uuid for the record value
                                    # adding a source_id suffix keeps this from being
                                    # deleted as descriptions get processed
                                    sm = StringManagement()
                                    sm.project_uuid = self.project_uuid
                                    sm.source_id = self.source_id + ProcessGeneral.COMPLEX_DESCRIPTION_SOURCE_SUFFIX
                                    oc_string = sm.get_make_string(cp_label)
                                    content_uuid = oc_string.uuid
                                    label_str_uuids[cp_label] = content_uuid
                                content_uuid = label_str_uuids[cp_label]
                                save_ok = False
                                new_ass = Assertion()
                                new_ass.uuid = complex_uuid
                                new_ass.subject_type = 'complex-description'
                                new_ass.project_uuid = self.project_uuid
                                # adding a source_id suffix keeps this from being
                                # deleted as descriptions get processed
                                new_ass.source_id = self.source_id + ProcessGeneral.COMPLEX_DESCRIPTION_SOURCE_SUFFIX
                                new_ass.obs_node = '#obs-' + str(self.obs_num_complex_description_assertions)
                                new_ass.obs_num = self.obs_num_complex_description_assertions
                                new_ass.sort = 1
                                new_ass.visibility = 1
                                new_ass.predicate_uuid = ComplexDescription.PREDICATE_COMPLEX_DES_LABEL
                                new_ass.object_type = 'xsd:string'
                                new_ass.object_uuid = content_uuid
                                try:
                                    new_ass.save()
                                    save_ok = True
                                except Exception:
                                    save_ok = False
                                if save_ok:
                                    self.count_new_assertions += 1
def get_contained_field_exp(self, field_num, in_rows=False, check_parent_entity=False):
    """Get examples of entities in containment fields; does recursive
    lookups to get a whole tree, limited to a maximum of a few examples.

    :param field_num: import field number to pull example records from
    :param in_rows: optional list of row numbers to restrict the lookup,
        or False for no restriction
    :param check_parent_entity: when True and field_num is the root
        subject field, wrap the examples inside the named parent entity
        (if one is configured)
    :return: a list of containment-node dicts, or False if no records
        were found
    """
    contain_nodes = False
    add_field_examples = True
    if field_num == self.root_subject_field and check_parent_entity:
        # Check to see if the root is contained in a named entity
        if self.field_parent_entities[field_num] is not False:
            # Root is in a named entity, so add it.
            contain_nodes = []
            add_field_examples = False
            parent_uuid = self.field_parent_entities[field_num].uuid
            parent_context = self.field_parent_entities[field_num].context
            contain_node = LastUpdatedOrderedDict()
            contain_node['label'] = parent_context
            contain_node['type'] = 'subjects'
            contain_node['field_label'] = 'Parent of field: ' + self.subjects_fields[field_num].label
            contain_node['field_num'] = 0
            contain_node['id'] = parent_uuid
            # now look for children of the root entity.
            contain_node['children'] = self.get_contained_field_exp(field_num)
            contain_nodes.append(contain_node)
    if add_field_examples:
        pc = ProcessCells(self.source_id, self.start_row)
        distinct_records = pc.get_field_records(field_num, in_rows)
        if distinct_records is not False:
            contain_nodes = []
            unique_labels = []
            field_obj = self.subjects_fields[field_num]
            for rec_hash, dist_rec in distinct_records.items():
                if len(contain_nodes) <= self.example_size:
                    # only add examples while at or below the total example size
                    contain_node = LastUpdatedOrderedDict()
                    entity_label = dist_rec['imp_cell_obj'].record
                    if len(entity_label) < 1:
                        entity_label = '[BLANK]'
                    entity_label = field_obj.value_prefix + entity_label
                    contain_node['label'] = entity_label
                    contain_node['type'] = 'import-record'
                    contain_node['field_label'] = field_obj.label
                    contain_node['field_num'] = field_num
                    contain_node['id'] = dist_rec['rows'][0]
                    contain_node['children'] = False
                    if field_num in self.contain_ordered_subjects:
                        if self.contain_ordered_subjects[field_num] is not False:
                            unique_child_labels = []
                            for child_field in self.contain_ordered_subjects[field_num]:
                                # recurse into the child containment field,
                                # restricted to this record's rows
                                act_children = self.get_contained_field_exp(child_field,
                                                                            dist_rec['rows'])
                                if act_children is not False:
                                    if contain_node['children'] is False:
                                        contain_node['children'] = []
                                    for act_child in act_children:
                                        if act_child['label'] not in unique_child_labels:
                                            # so we only list the same entity once
                                            contain_node['children'].append(act_child)
                                            unique_child_labels.append(act_child['label'])
                    if entity_label not in unique_labels:
                        # so we only list the same entity once
                        contain_nodes.append(contain_node)
                        unique_labels.append(entity_label)
    return contain_nodes
def get_link_examples(self):
    """Get example entities with their linking relations.

    Builds a list of example subject entities (up to ``self.example_size``)
    from the import table, each with a ``links`` list pairing a predicate
    (either a fixed ``predicate_uuid`` or one read from a predicate field)
    with an example object value drawn from the linked object field.

    Returns:
        list: example entity dicts with 'label', 'id', and 'links' keys.
    """
    example_entities = []
    self.get_link_annotations()
    if self.link_rels is not False:
        for subj_field_num, rels in self.link_rels.items():
            # get some example records for the subject field
            pc = ProcessCells(self.source_id, self.start_row)
            distinct_records = pc.get_field_records(subj_field_num, False)
            if distinct_records is not False:
                entity_example_count = 0
                # sort the list in row_order from the import table
                pg = ProcessGeneral(self.source_id)
                distinct_records = pg.order_distinct_records(distinct_records)
                for row_key, dist_rec in distinct_records.items():
                    if entity_example_count < self.example_size:
                        # if we're less than the example size, make
                        # an example object
                        entity_example_count += 1
                        entity = LastUpdatedOrderedDict()
                        entity_label = dist_rec['imp_cell_obj'].record
                        if len(entity_label) < 1:
                            entity_label = '[BLANK]'
                        entity_label = rels['sub_field_obj'].value_prefix + entity_label
                        entity['label'] = entity_label
                        entity['id'] = str(subj_field_num) + '-' + str(row_key)
                        entity['links'] = []
                        example_rows = []
                        example_rows.append(dist_rec['rows'][0])
                        in_rows = [dist_rec['rows'][0]]
                        for pred_obj in rels['pred_objs']:
                            act_preds = []
                            if pred_obj['predicate_uuid'] is not False:
                                # predicate is a fixed uuid; dereference it for a label
                                pred_item = LastUpdatedOrderedDict()
                                pred_item['id'] = pred_obj['predicate_uuid']
                                ent = Entity()
                                found = ent.dereference(pred_obj['predicate_uuid'])
                                if found:
                                    pred_item['label'] = ent.label
                                else:
                                    pred_item['label'] = '[Missing predicate!]'
                                act_preds.append(pred_item)
                            elif pred_obj['pred_field_obj'] is not False:
                                # linking predicate is in a field
                                pc = ProcessCells(self.source_id, self.start_row)
                                pred_recs = pc.get_field_records(pred_obj['pred_field_obj'].field_num,
                                                                 in_rows)
                                # BUGFIX: iterate the record dicts, not the dict keys;
                                # iterating `pred_recs` directly yields hash keys,
                                # so pred_rec['rows'] raised TypeError
                                for pred_rec in pred_recs.values():
                                    pred_item = LastUpdatedOrderedDict()
                                    pred_item['id'] = str(pred_obj['pred_field_obj'].field_num)
                                    pred_item['id'] += '-' + str(pred_rec['rows'][0])
                                    pred_item['label'] = pred_rec['imp_cell_obj'].record
                                    if len(pred_item['label']) < 1:
                                        pred_item['label'] = '[BLANK]'
                                    # BUGFIX: was `act_precs` (undefined name -> NameError)
                                    if len(act_preds) < self.example_size:
                                        act_preds.append(pred_item)
                            for pred_item in act_preds:
                                link_item = LastUpdatedOrderedDict()
                                link_item['predicate'] = pred_item
                                # values are in a list, to keep consistent with descriptions
                                link_item['object'] = False
                                obj_field_obj = pred_obj['obj_field_obj']
                                # now get a value for the object from the imported cells
                                pc = ProcessCells(self.source_id, self.start_row)
                                obj_recs = pc.get_field_records(obj_field_obj.field_num, in_rows)
                                pg = ProcessGeneral(self.source_id)
                                obj_rec = pg.get_first_distinct_record(obj_recs)
                                if obj_rec is not False:
                                    object_val = LastUpdatedOrderedDict()
                                    object_label = obj_field_obj.value_prefix
                                    if len(obj_rec['imp_cell_obj'].record) > 1:
                                        object_label += obj_rec['imp_cell_obj'].record
                                    else:
                                        object_label += '[BLANK]'
                                    object_val['label'] = object_label
                                    object_val['id'] = str(obj_rec['imp_cell_obj'].field_num)
                                    object_val['id'] += '-' + str(obj_rec['rows'][0])
                                    link_item['object'] = object_val
                                if len(entity['links']) < self.example_size:
                                    entity['links'].append(link_item)
                        example_entities.append(entity)
    return example_entities
def process_complex_batch(self):
    """Process complex-description fields for entities, starting with a
    given row number.

    Finds distinct described subjects (by fl_uuid) for each
    complex-description field, mints a fragment uuid per complex
    description, asserts the subject -> complex-description link, writes
    the fragment uuid back to the non-blank cells, and creates a label
    string plus a label Assertion for every distinct label value.
    """
    self.clear_source()  # clear prior import for this source
    self.end_row = self.start_row + self.batch_size
    self.get_complex_description_fields()
    label_str_uuids = {}  # cache: complex-description label -> string content uuid
    if len(self.complex_des_fields) > 0:
        print('Number of Complex Description Fields: ' + str(len(self.complex_des_fields)))
        cp_id_number = 0
        for cp_field in self.complex_des_fields:
            cp_id_number += 1
            pc = ProcessCells(self.source_id, self.start_row)
            distinct_records = pc.get_field_records_by_fl_uuid(cp_field.describes_field.field_num,
                                                               False)
            if distinct_records is not False:
                # sort the list in row_order from the import table
                pg = ProcessGeneral(self.source_id)
                distinct_records = pg.order_distinct_records(distinct_records)
                for row_key, dist_rec in distinct_records.items():
                    if cp_field.obs_num < 1:
                        obs_num = 1
                    else:
                        obs_num = cp_field.obs_num
                    obs_node = '#obs-' + str(obs_num)
                    subject_uuid = dist_rec['imp_cell_obj'].fl_uuid
                    subject_type = cp_field.describes_field.field_type
                    subject_ok = dist_rec['imp_cell_obj'].cell_ok
                    subject_record = dist_rec['imp_cell_obj'].record
                    if subject_uuid is False or\
                       len(subject_record) < 1:
                        subject_ok = False
                    if subject_uuid == 'False':
                        subject_ok = False
                    sort = 0
                    in_rows = dist_rec['rows']
                    print('Look for complex description labels in rows: ' + str(in_rows))
                    if subject_ok is not False:
                        # OK! we have the subjects of complex descriptions
                        # with uuids, so now we can make an fl_uuid for each
                        # of the complex description fields.
                        complex_uuid = subject_uuid + self.FRAG_ID_PREFIX + str(cp_id_number)
                        complex_recs = ImportCell.objects\
                                                 .filter(source_id=self.source_id,
                                                         field_num=cp_field.field_num,
                                                         row_num__in=in_rows)\
                                                 .exclude(record='')
                        if len(complex_recs) > 0:
                            # non-blank records exist in the complex description
                            # field for this subject, so record the association
                            save_ok = False
                            new_ass = Assertion()
                            new_ass.uuid = subject_uuid
                            new_ass.subject_type = subject_type
                            new_ass.project_uuid = self.project_uuid
                            # adding a source_id suffix keeps this from being
                            # deleted as descriptions get processed
                            new_ass.source_id = self.source_id + ProcessGeneral.COMPLEX_DESCRIPTION_SOURCE_SUFFIX
                            new_ass.obs_node = obs_node
                            new_ass.obs_num = obs_num
                            new_ass.sort = 100 + cp_id_number
                            new_ass.visibility = 1
                            new_ass.predicate_uuid = ComplexDescription.PREDICATE_COMPLEX_DES
                            new_ass.object_type = 'complex-description'
                            new_ass.object_uuid = complex_uuid
                            # BUGFIX: removed a duplicate unguarded new_ass.save()
                            # that ran before this try block — it saved the row
                            # twice and let DB errors escape the exception guard.
                            try:
                                print('Saved complex-description: ' + complex_uuid)
                                new_ass.save()
                                save_ok = True
                            except Exception:
                                save_ok = False
                            if save_ok:
                                self.count_new_assertions += 1
                            # now look through the complex description records
                            # and make labels
                            for comp_rec in complex_recs:
                                # first save the fl_uuid for the complex description
                                comp_rec.fl_uuid = complex_uuid
                                comp_rec.save()
                                if isinstance(cp_field.value_prefix, str):
                                    cp_label = cp_field.value_prefix + comp_rec.record
                                else:
                                    cp_label = comp_rec.record
                                if cp_label not in label_str_uuids:
                                    # make a uuid for the record value
                                    # adding a source_id suffix keeps this from being
                                    # deleted as descriptions get processed
                                    sm = StringManagement()
                                    sm.project_uuid = self.project_uuid
                                    sm.source_id = self.source_id + ProcessGeneral.COMPLEX_DESCRIPTION_SOURCE_SUFFIX
                                    oc_string = sm.get_make_string(cp_label)
                                    content_uuid = oc_string.uuid
                                    label_str_uuids[cp_label] = content_uuid
                                content_uuid = label_str_uuids[cp_label]
                                save_ok = False
                                new_ass = Assertion()
                                new_ass.uuid = complex_uuid
                                new_ass.subject_type = 'complex-description'
                                new_ass.project_uuid = self.project_uuid
                                # adding a source_id suffix keeps this from being
                                # deleted as descriptions get processed
                                new_ass.source_id = self.source_id + ProcessGeneral.COMPLEX_DESCRIPTION_SOURCE_SUFFIX
                                new_ass.obs_node = '#obs-' + str(self.obs_num_complex_description_assertions)
                                new_ass.obs_num = self.obs_num_complex_description_assertions
                                new_ass.sort = 1
                                new_ass.visibility = 1
                                new_ass.predicate_uuid = ComplexDescription.PREDICATE_COMPLEX_DES_LABEL
                                new_ass.object_type = 'xsd:string'
                                new_ass.object_uuid = content_uuid
                                try:
                                    new_ass.save()
                                    save_ok = True
                                except Exception:
                                    save_ok = False
                                if save_ok:
                                    self.count_new_assertions += 1
def process_link_batch(self):
    """Process fields describing linking relations between subjects, media,
    documents, persons, and projects entities.

    If start_row is 1, then previous imports of this source are cleared.
    For each annotated subject field, walks its distinct records, gathers
    the active predicates (a fixed predicate_uuid or predicates reconciled
    from a predicate field), and creates a CandidateLinkAssertion for each
    valid (subject, predicate, object) combination.
    """
    self.clear_source()  # clear prior import for this source
    self.end_row = self.start_row + self.batch_size
    self.get_link_annotations()
    if self.link_rels is not False:
        for subj_field_num, rels in self.link_rels.items():
            sub_field_obj = rels['sub_field_obj']
            pc = ProcessCells(self.source_id, self.start_row)
            distinct_records = pc.get_field_records(subj_field_num, False)
            if distinct_records is not False:
                # sort the list in row_order from the import table
                pg = ProcessGeneral(self.source_id)
                distinct_records = pg.order_distinct_records(distinct_records)
                for row_key, dist_rec in distinct_records.items():
                    subject_uuid = dist_rec['imp_cell_obj'].fl_uuid
                    subject_type = sub_field_obj.field_type
                    subject_ok = dist_rec['imp_cell_obj'].cell_ok
                    if subject_uuid is False:
                        subject_ok = False
                    sort = 0
                    in_rows = dist_rec['rows']
                    for pred_obj in rels['pred_objs']:
                        act_preds = {}  # maps predicate_uuid -> rows where it applies
                        obs_num = 1  # default observation number
                        if pred_obj['predicate_uuid'] is not False:
                            act_preds[pred_obj['predicate_uuid']] = in_rows
                        elif pred_obj['pred_field_obj'] is not False:
                            # linking predicate is in a field
                            if pred_obj['pred_field_obj'].obs_num > 0:
                                obs_num = pred_obj['pred_field_obj'].obs_num
                            sort = pred_obj['pred_field_obj'].field_num
                            pc = ProcessCells(self.source_id, self.start_row)
                            pred_recs = pc.get_field_records(pred_obj['pred_field_obj'].field_num,
                                                             in_rows)
                            # BUGFIX: iterate the record dicts, not the dict keys;
                            # iterating `pred_recs` directly yields hash keys, so
                            # pred_rec['imp_cell_obj'] raised TypeError (the sibling
                            # implementation uses .items() on the same call)
                            for pred_rec in pred_recs.values():
                                clp = CandidateLinkPredicate()
                                clp.source_id = self.source_id
                                clp.project_uuid = self.project_uuid
                                clp.make_reconcile_link_pred(pred_rec['imp_cell_obj'].record)
                                if clp.uuid is not False:
                                    act_preds[clp.uuid] = pred_rec['rows']
                        obs_node = '#obs-' + str(obs_num)
                        for predicate_uuid, act_in_rows in act_preds.items():
                            obj_field_obj = pred_obj['obj_field_obj']
                            # now get a value for the object from the imported cells
                            pc = ProcessCells(self.source_id, self.start_row)
                            obj_recs = pc.get_field_records(obj_field_obj.field_num, act_in_rows)
                            if sort < 1:
                                sort = obj_field_obj.field_num
                            for hash_key, obj_rec in obj_recs.items():
                                # print('Worry about: ' + str(obj_rec['imp_cell_obj'].record))
                                object_uuid = obj_rec['imp_cell_obj'].fl_uuid
                                object_type = obj_field_obj.field_type
                                object_ok = obj_rec['imp_cell_obj'].cell_ok
                                cla = CandidateLinkAssertion()
                                cla.project_uuid = self.project_uuid
                                cla.source_id = self.source_id
                                cla.subject_uuid = subject_uuid
                                cla.subject_type = subject_type
                                cla.obs_node = obs_node
                                cla.obs_num = obs_num
                                cla.sort = sort
                                cla.predicate_uuid = predicate_uuid
                                cla.object_uuid = object_uuid
                                cla.object_type = object_type
                                if (subject_ok and object_ok) and predicate_uuid is not False:
                                    # print('Link ok: ' + str(obj_rec['imp_cell_obj'].record))
                                    cla.create_link()
                                    if cla.is_valid:
                                        self.count_new_assertions += 1
                                        print('Count OK: ' + str(self.count_new_assertions))
def process_link_batch(self):
    """Process fields describing linking relations between subjects, media,
    documents, persons, and projects entities.

    If start_row is 1, then previous imports of this source are cleared.
    For each annotated subject field, walks its distinct records, gathers
    the active predicates (a fixed predicate_uuid or predicates reconciled
    from a predicate field), and creates a CandidateLinkAssertion for each
    valid (subject, predicate, object) combination.
    """
    self.clear_source()  # clear prior import for this source
    self.end_row = self.start_row + self.batch_size
    self.get_link_annotations()
    if self.link_rels is not False:
        for subj_field_num, rels in self.link_rels.items():
            sub_field_obj = rels['sub_field_obj']
            pc = ProcessCells(self.source_id, self.start_row)
            distinct_records = pc.get_field_records(subj_field_num, False)
            if distinct_records is not False:
                # sort the list in row_order from the import table
                pg = ProcessGeneral(self.source_id)
                distinct_records = pg.order_distinct_records(distinct_records)
                for row_key, dist_rec in distinct_records.items():
                    subject_uuid = dist_rec['imp_cell_obj'].fl_uuid
                    subject_type = sub_field_obj.field_type
                    subject_ok = dist_rec['imp_cell_obj'].cell_ok
                    if subject_uuid is False:
                        subject_ok = False
                    sort = 0
                    in_rows = dist_rec['rows']
                    for pred_obj in rels['pred_objs']:
                        act_preds = {}  # maps predicate_uuid -> rows where it applies
                        obs_num = 1  # default observation number
                        if pred_obj['predicate_uuid'] is not False:
                            act_preds[pred_obj['predicate_uuid']] = in_rows
                        elif pred_obj['pred_field_obj'] is not False:
                            # linking predicate is in a field
                            if pred_obj['pred_field_obj'].obs_num > 0:
                                obs_num = pred_obj['pred_field_obj'].obs_num
                            sort = pred_obj['pred_field_obj'].field_num
                            pc = ProcessCells(self.source_id, self.start_row)
                            pred_recs = pc.get_field_records(pred_obj['pred_field_obj'].field_num,
                                                             in_rows)
                            # BUGFIX: iterate the record dicts, not the dict keys;
                            # iterating `pred_recs` directly yields hash keys, so
                            # pred_rec['imp_cell_obj'] raised TypeError (the sibling
                            # implementation uses .items() on the same call)
                            for pred_rec in pred_recs.values():
                                clp = CandidateLinkPredicate()
                                clp.source_id = self.source_id
                                clp.project_uuid = self.project_uuid
                                clp.make_reconcile_link_pred(pred_rec['imp_cell_obj'].record)
                                if clp.uuid is not False:
                                    act_preds[clp.uuid] = pred_rec['rows']
                        obs_node = '#obs-' + str(obs_num)
                        for predicate_uuid, act_in_rows in act_preds.items():
                            obj_field_obj = pred_obj['obj_field_obj']
                            # now get a value for the object from the imported cells
                            pc = ProcessCells(self.source_id, self.start_row)
                            obj_recs = pc.get_field_records(obj_field_obj.field_num, act_in_rows)
                            if sort < 1:
                                sort = obj_field_obj.field_num
                            for hash_key, obj_rec in obj_recs.items():
                                # print('Worry about: ' + str(obj_rec['imp_cell_obj'].record))
                                object_uuid = obj_rec['imp_cell_obj'].fl_uuid
                                object_type = obj_field_obj.field_type
                                object_ok = obj_rec['imp_cell_obj'].cell_ok
                                cla = CandidateLinkAssertion()
                                cla.project_uuid = self.project_uuid
                                cla.source_id = self.source_id
                                cla.subject_uuid = subject_uuid
                                cla.subject_type = subject_type
                                cla.obs_node = obs_node
                                cla.obs_num = obs_num
                                cla.sort = sort
                                cla.predicate_uuid = predicate_uuid
                                cla.object_uuid = object_uuid
                                cla.object_type = object_type
                                if (subject_ok and object_ok) and predicate_uuid is not False:
                                    # print('Link ok: ' + str(obj_rec['imp_cell_obj'].record))
                                    cla.create_link()
                                    if cla.is_valid:
                                        self.count_new_assertions += 1
                                        print('Count OK: ' + str(self.count_new_assertions))