Пример #1
0
 def process_multiple_media_fields(self):
     """Reconcile every media field of the source and attach file parts.

     Each distinct record of each media field is reconciled into a media
     manifest item through ``CandidateMedia``. Successful items are logged
     in ``self.new_entities`` or ``self.reconciled_entities``; records that
     could not be reconciled are logged in ``self.not_reconciled_entities``.
     For every reconciled item, the ``parts`` fields of the media field are
     read for the same import rows and each value that starts with
     ``http://`` or ``https://`` is reconciled as a media file. Part values
     that are not such URIs are skipped.
     """
     self.get_media_fields()
     self.get_metadata_fields()
     if len(self.media_fields) < 1:
         # nothing to do without media fields
         return
     print('yes we have media')
     for media_field in self.media_fields:
         cells = ProcessCells(self.source_id, self.start_row)
         media_recs = cells.get_field_records(media_field.field_num, False)
         if media_recs is False:
             continue
         print('Found Media Records: ' + str(len(media_recs)))
         for media_rec in media_recs.values():
             cell = media_rec['imp_cell_obj']
             cand = CandidateMedia()
             cand.project_uuid = self.project_uuid
             cand.source_id = self.source_id
             cand.class_uri = media_field.field_value_cat
             # rows in which this record value occurs
             cand.import_rows = media_rec['rows']
             cand.metadata_obj = self.metadata_obj
             cand.reconcile_manifest_item(cell)
             if cand.uuid is False:
                 # reconciliation failed; log it under a field-row id
                 self.not_reconciled_entities.append({
                     'id': str(cell.field_num) + '-' + str(cell.row_num),
                     'label': cell.record,
                 })
                 continue
             summary = {'id': str(cand.uuid), 'label': cand.label}
             if cand.new_entity:
                 self.new_entities.append(summary)
             else:
                 self.reconciled_entities.append(summary)
             # the media item exists, so files can now be added to it
             for part_field in media_field.parts:
                 part_cells = ProcessCells(self.source_id, self.start_row)
                 part_recs = part_cells.get_field_records(
                     part_field.field_num, cand.import_rows)
                 if part_recs is False:
                     continue
                 for part_rec in part_recs.values():
                     # one candidate per distinct file part of the item
                     media_file = CandidateMediaFile(cand.uuid)
                     media_file.imp_cell_obj = part_rec['imp_cell_obj']
                     media_file.project_uuid = self.project_uuid
                     media_file.source_id = self.source_id
                     # the file type is stored in field_value_cat
                     media_file.file_type = part_field.field_value_cat
                     file_uri = part_rec['imp_cell_obj'].record
                     if file_uri.startswith(('http://', 'https://')):
                         # only web URIs are reconciled as media files
                         media_file.reconcile_media_file(file_uri)
Пример #2
0
 def make_type_ld_annotations(self,
                              sub_type_pred_uuid,
                              sub_type_f_num,
                              rel_pred,
                              obj_le_f_num):
     """Annotate imported types with linked data URIs.

     Cells of field ``sub_type_f_num`` are grouped by ``rec_hash``. For
     each distinct, non-blank type label the cells of field
     ``obj_le_f_num`` in the same rows are read. Every value that is an
     http(s) URI longer than 8 characters produces a ``LinkAnnotation``
     with predicate ``rel_pred`` whose subject is the type returned by
     ``TypeManagement.get_make_type_within_pred_uuid``; the URI is also
     passed to ``WebLinkEntity.check_add_link_entity``.

     Args:
         sub_type_pred_uuid: predicate uuid within which the type is
             looked up or made.
         sub_type_f_num: field number holding the type labels.
         rel_pred: predicate URI stored on each new annotation.
         obj_le_f_num: field number holding the linked data URIs.
     """
     sub_type_cells = ImportCell.objects\
                                .filter(source_id=self.source_id,
                                        field_num=sub_type_f_num)
     # group the type cells by record hash, keeping the first cell seen
     # and every row number where the value occurs
     distinct_records = {}
     for cell in sub_type_cells:
         rec = distinct_records.setdefault(cell.rec_hash,
                                           {'rows': [],
                                            'imp_cell_obj': cell})
         rec['rows'].append(cell.row_num)
     rels = []
     for distinct_type in distinct_records.values():
         type_label = distinct_type['imp_cell_obj'].record
         if len(type_label) < 1:
             # blank types cannot be annotated
             continue
         pc = ProcessCells(self.source_id, 0)
         ld_entities = pc.get_field_records(obj_le_f_num,
                                            distinct_type['rows'])
         if ld_entities is False:
             # other callers in this file treat False as "no records";
             # without this guard .values() would raise AttributeError
             continue
         for distinct_ld in ld_entities.values():
             obj_uri = distinct_ld['imp_cell_obj'].record
             if len(obj_uri) <= 8 \
                or not obj_uri.startswith(('http://', 'https://')):
                 # not a usable linked data URI
                 continue
             # get (or make) the type so its uuid can be the subject
             tm = TypeManagement()
             tm.project_uuid = self.project_uuid
             tm.source_id = self.source_id
             sub_type = tm.get_make_type_within_pred_uuid(sub_type_pred_uuid,
                                                          type_label)
             rels.append({'subject_label': type_label,
                          'subject': sub_type.uuid,
                          'object_uri': obj_uri})
     # save the annotations only after all relations were collected
     for rel in rels:
         new_la = LinkAnnotation()
         new_la.subject = rel['subject']
         new_la.subject_type = 'types'
         new_la.project_uuid = self.project_uuid
         new_la.source_id = self.source_id
         new_la.predicate_uri = rel_pred
         new_la.object_uri = rel['object_uri']
         new_la.creator_uuid = ''
         new_la.save()
         web_le = WebLinkEntity()
         web_le.check_add_link_entity(rel['object_uri'])
Пример #3
0
 def process_documents_batch(self):
     """Reconcile document entities for the batch starting at start_row.

     Calls ``clear_source`` and sets ``end_row`` to ``start_row`` plus
     ``batch_size``. Each distinct record of every documents field is
     then reconciled through ``CandidateDocument``. When the field has
     an integer ``doc_text_field_num``, the text cell in the first row of
     the record is used as document content. Results are logged in
     ``self.new_entities``, ``self.reconciled_entities`` or
     ``self.not_reconciled_entities``.
     """
     self.clear_source()  # clear prior import for this source
     self.end_row = self.start_row + self.batch_size
     self.get_documents_fields()
     self.get_metadata_fields()
     if len(self.documents_fields) < 1:
         return
     print('Number of Document Fields: ' + str(len(self.documents_fields)))
     for doc_field in self.documents_fields:
         cells = ProcessCells(self.source_id, self.start_row)
         doc_recs = cells.get_field_records(doc_field.field_num, False)
         if doc_recs is False:
             continue
         print('Distinct document recs: ' + str(len(doc_recs)))
         for doc_rec in doc_recs.values():
             text = None
             if isinstance(doc_field.doc_text_field_num, int):
                 # a related text content field exists; read the text
                 # from the first row of this record
                 text_cells = ImportCell.objects.filter(
                     source_id=self.source_id,
                     field_num=doc_field.doc_text_field_num,
                     row_num=doc_rec['rows'][0])[:1]
                 if len(text_cells) > 0:
                     text = text_cells[0].record
             cand = CandidateDocument()
             cand.project_uuid = self.project_uuid
             cand.source_id = self.source_id
             cand.label = doc_field.field_value_cat
             if isinstance(text, str):
                 # only set content when text was actually found
                 cand.content = text
             # rows in which this record value occurs
             cand.import_rows = doc_rec['rows']
             cand.metadata_obj = self.metadata_obj
             cell = doc_rec['imp_cell_obj']
             cand.reconcile_item(cell)
             if cand.uuid is False:
                 # reconciliation failed; log it under a field-row id
                 self.not_reconciled_entities.append({
                     'id': str(cell.field_num) + '-' + str(cell.row_num),
                     'label': cell.record,
                 })
                 continue
             summary = {'id': str(cand.uuid), 'label': cand.label}
             if cand.new_entity:
                 self.new_entities.append(summary)
             else:
                 self.reconciled_entities.append(summary)
Пример #4
0
 def process_persons_batch(self):
     """Reconcile person entities for the batch starting at start_row.

     Calls ``clear_source`` and sets ``end_row`` to ``start_row`` plus
     ``batch_size``. Each distinct record of every persons field is then
     reconciled through ``CandidatePerson``, using the field's
     ``field_value_cat`` as the foaf type. Results are logged in
     ``self.new_entities``, ``self.reconciled_entities`` or
     ``self.not_reconciled_entities``.
     """
     self.clear_source()  # clear prior import for this source
     self.end_row = self.start_row + self.batch_size
     self.get_persons_fields()
     if len(self.persons_fields) < 1:
         return
     print('Number of Person Fields: ' + str(len(self.persons_fields)))
     for person_field in self.persons_fields:
         cells = ProcessCells(self.source_id, self.start_row)
         person_recs = cells.get_field_records(person_field.field_num, False)
         if person_recs is False:
             continue
         print('Distinct person recs: ' + str(len(person_recs)))
         for person_rec in person_recs.values():
             cell = person_rec['imp_cell_obj']
             cand = CandidatePerson()
             cand.project_uuid = self.project_uuid
             cand.source_id = self.source_id
             cand.foaf_type = person_field.field_value_cat
             # rows in which this record value occurs
             cand.import_rows = person_rec['rows']
             cand.reconcile_item(cell)
             if cand.uuid is False:
                 # reconciliation failed; log it under a field-row id
                 self.not_reconciled_entities.append({
                     'id': str(cell.field_num) + '-' + str(cell.row_num),
                     'label': cell.record,
                 })
                 continue
             summary = {'id': str(cand.uuid), 'label': cand.label}
             if cand.new_entity:
                 self.new_entities.append(summary)
             else:
                 self.reconciled_entities.append(summary)
Пример #5
0
 def process_single_media_label_field(self):
     """Reconcile records of a source's only media field, minting nothing.

     Applies only when the import source has exactly one field of type
     'media'. Each distinct record of that field is reconciled through
     ``CandidateMedia`` with ``mint_new_entity_ok`` set to False, and
     every item that gets a uuid is logged in
     ``self.reconciled_entities``.

     Returns:
         bool: True when the source has exactly one media field,
         otherwise False (and nothing is processed).
     """
     media_fields = ImportField.objects.filter(source_id=self.source_id,
                                               field_type='media')
     if len(media_fields) != 1:
         # only handled for the 1 media field in an import source
         return False
     print('yes we have 1 media field')
     media_field = media_fields[0]
     # make the metadata fields for this one media field
     self.get_metadata_fields([media_field.field_num])
     cells = ProcessCells(self.source_id, self.start_row)
     media_recs = cells.get_field_records(media_field.field_num, False)
     if media_recs is not False:
         print('Found Media Records: ' + str(len(media_recs)))
         for media_rec in media_recs.values():
             cand = CandidateMedia()
             cand.mint_new_entity_ok = False  # DO NOT create new entities!
             cand.project_uuid = self.project_uuid
             cand.source_id = self.source_id
             cand.class_uri = media_field.field_value_cat
             # rows in which this record value occurs
             cand.import_rows = media_rec['rows']
             cand.metadata_obj = self.metadata_obj
             cand.reconcile_manifest_item(media_rec['imp_cell_obj'])
             if cand.uuid is not False:
                 self.reconciled_entities.append({'id': str(cand.uuid),
                                                  'label': cand.label})
     return True
Пример #6
0
 def get_link_examples(self):
     """Build example entities that illustrate the linking relations.

     For each subject field in ``self.link_rels`` up to
     ``self.example_size`` distinct records (in the order returned by
     ``ProcessGeneral.order_distinct_records``) become example entities.
     Each example lists links made of a predicate (a fixed predicate uuid
     or a value read from a predicate field) and an object value read
     from the object field in the first row of the subject record.
     Blank values are shown as '[BLANK]'.

     Returns:
         list: example entities, each with 'label', 'id' and 'links'
         keys; empty when ``self.link_rels`` is False.
     """
     example_entities = []
     self.get_link_annotations()
     if self.link_rels is False:
         return example_entities
     for subj_field_num, rels in self.link_rels.items():
         # get some example records
         pc = ProcessCells(self.source_id, self.start_row)
         distinct_records = pc.get_field_records(subj_field_num, False)
         if distinct_records is False:
             continue
         # sort the list in row_order from the import table
         pg = ProcessGeneral(self.source_id)
         distinct_records = pg.order_distinct_records(distinct_records)
         entity_example_count = 0
         for row_key, dist_rec in distinct_records.items():
             if entity_example_count >= self.example_size:
                 # enough examples for this subject field
                 break
             entity_example_count += 1
             entity = LastUpdatedOrderedDict()
             entity_label = dist_rec['imp_cell_obj'].record
             if len(entity_label) < 1:
                 entity_label = '[BLANK]'
             entity['label'] = rels['sub_field_obj'].value_prefix \
                 + entity_label
             entity['id'] = str(subj_field_num) + '-' + str(row_key)
             entity['links'] = []
             # examples only look at the first row of the subject record
             in_rows = [dist_rec['rows'][0]]
             for pred_obj in rels['pred_objs']:
                 act_preds = []
                 if pred_obj['predicate_uuid'] is not False:
                     # the linking predicate is a fixed entity
                     pred_item = LastUpdatedOrderedDict()
                     pred_item['id'] = pred_obj['predicate_uuid']
                     ent = Entity()
                     if ent.dereference(pred_obj['predicate_uuid']):
                         pred_item['label'] = ent.label
                     else:
                         pred_item['label'] = '[Missing predicate!]'
                     act_preds.append(pred_item)
                 elif pred_obj['pred_field_obj'] is not False:
                     # linking predicate is in a field
                     pred_field_num = pred_obj['pred_field_obj'].field_num
                     pc = ProcessCells(self.source_id, self.start_row)
                     pred_recs = pc.get_field_records(pred_field_num,
                                                      in_rows)
                     if pred_recs is False:
                         # other callers treat False as "no records"
                         pred_recs = {}
                     # iterate the record dicts, not the mapping keys
                     for pred_rec in pred_recs.values():
                         if len(act_preds) >= self.example_size:
                             break
                         pred_item = LastUpdatedOrderedDict()
                         pred_item['id'] = str(pred_field_num) + '-' \
                             + str(pred_rec['rows'][0])
                         pred_item['label'] = pred_rec['imp_cell_obj'].record
                         if len(pred_item['label']) < 1:
                             pred_item['label'] = '[BLANK]'
                         act_preds.append(pred_item)
                 for pred_item in act_preds:
                     link_item = LastUpdatedOrderedDict()
                     link_item['predicate'] = pred_item
                     link_item['object'] = False
                     obj_field_obj = pred_obj['obj_field_obj']
                     # now get a value for the object from the imported cells
                     pc = ProcessCells(self.source_id, self.start_row)
                     obj_recs = pc.get_field_records(obj_field_obj.field_num,
                                                     in_rows)
                     pg = ProcessGeneral(self.source_id)
                     obj_rec = pg.get_first_distinct_record(obj_recs)
                     if obj_rec is False:
                         # no object value, so there is no link to show
                         continue
                     object_val = LastUpdatedOrderedDict()
                     object_label = obj_field_obj.value_prefix
                     if len(obj_rec['imp_cell_obj'].record) > 0:
                         # any non-empty value is shown, matching the
                         # blank checks used for subjects and predicates
                         object_label += obj_rec['imp_cell_obj'].record
                     else:
                         object_label += '[BLANK]'
                     object_val['label'] = object_label
                     object_val['id'] = str(obj_rec['imp_cell_obj'].field_num) \
                         + '-' + str(obj_rec['rows'][0])
                     link_item['object'] = object_val
                     if len(entity['links']) < self.example_size:
                         entity['links'].append(link_item)
             example_entities.append(entity)
     return example_entities
Пример #7
0
 def process_link_batch(self):
     """Create link assertions for the batch starting at start_row.

     Processes fields describing linking relations between subjects,
     media, documents, persons and projects entities. ``clear_source``
     is called first and ``end_row`` is set to ``start_row`` plus
     ``batch_size``. For every distinct subject record, each predicate
     (a fixed predicate uuid, or values reconciled from a predicate
     field) is paired with the object records found in the matching
     rows. A link is created only when the subject and object cells are
     both OK; ``self.count_new_assertions`` counts the valid links.
     """
     self.clear_source()  # clear prior import for this source
     self.end_row = self.start_row + self.batch_size
     self.get_link_annotations()
     if self.link_rels is False:
         return
     for subj_field_num, rels in self.link_rels.items():
         # get the distinct subject records of this field
         sub_field_obj = rels['sub_field_obj']
         pc = ProcessCells(self.source_id, self.start_row)
         distinct_records = pc.get_field_records(subj_field_num, False)
         if distinct_records is False:
             continue
         # sort the list in row_order from the import table
         pg = ProcessGeneral(self.source_id)
         distinct_records = pg.order_distinct_records(distinct_records)
         for row_key, dist_rec in distinct_records.items():
             subject_uuid = dist_rec['imp_cell_obj'].fl_uuid
             subject_type = sub_field_obj.field_type
             subject_ok = dist_rec['imp_cell_obj'].cell_ok
             if subject_uuid is False:
                 subject_ok = False
             # sort is kept across the predicates of one subject record
             sort = 0
             in_rows = dist_rec['rows']
             for pred_obj in rels['pred_objs']:
                 # maps predicate uuid -> rows where that predicate applies
                 act_preds = {}
                 obs_num = 1  # default observation number
                 if pred_obj['predicate_uuid'] is not False:
                     act_preds[pred_obj['predicate_uuid']] = in_rows
                 elif pred_obj['pred_field_obj'] is not False:
                     # linking predicate is in a field
                     pred_field_obj = pred_obj['pred_field_obj']
                     if pred_field_obj.obs_num > 0:
                         obs_num = pred_field_obj.obs_num
                     sort = pred_field_obj.field_num
                     pc = ProcessCells(self.source_id, self.start_row)
                     pred_recs = pc.get_field_records(
                         pred_field_obj.field_num, in_rows)
                     if pred_recs is False:
                         # other callers treat False as "no records"
                         pred_recs = {}
                     # iterate the record dicts, not the mapping keys
                     for pred_rec in pred_recs.values():
                         clp = CandidateLinkPredicate()
                         clp.source_id = self.source_id
                         clp.project_uuid = self.project_uuid
                         clp.make_reconcile_link_pred(
                             pred_rec['imp_cell_obj'].record)
                         if clp.uuid is not False:
                             act_preds[clp.uuid] = pred_rec['rows']
                 obs_node = '#obs-' + str(obs_num)
                 for predicate_uuid, act_in_rows in act_preds.items():
                     obj_field_obj = pred_obj['obj_field_obj']
                     # now get a value for the object from the imported cells
                     pc = ProcessCells(self.source_id, self.start_row)
                     obj_recs = pc.get_field_records(obj_field_obj.field_num,
                                                     act_in_rows)
                     if sort < 1:
                         sort = obj_field_obj.field_num
                     if obj_recs is False:
                         # no object cells in these rows, nothing to link
                         continue
                     for hash_key, obj_rec in obj_recs.items():
                         object_uuid = obj_rec['imp_cell_obj'].fl_uuid
                         object_type = obj_field_obj.field_type
                         object_ok = obj_rec['imp_cell_obj'].cell_ok
                         cla = CandidateLinkAssertion()
                         cla.project_uuid = self.project_uuid
                         cla.source_id = self.source_id
                         cla.subject_uuid = subject_uuid
                         cla.subject_type = subject_type
                         cla.obs_node = obs_node
                         cla.obs_num = obs_num
                         cla.sort = sort
                         cla.predicate_uuid = predicate_uuid
                         cla.object_uuid = object_uuid
                         cla.object_type = object_type
                         if (subject_ok and object_ok) \
                            and predicate_uuid is not False:
                             cla.create_link()
                             if cla.is_valid:
                                 self.count_new_assertions += 1
                                 print('Count OK: ' +
                                       str(self.count_new_assertions))
Пример #8
0
 def process_link_batch(self):
     """Create link assertions for the batch starting at start_row.

     Processes fields describing linking relations between subjects,
     media, documents, persons and projects entities. ``clear_source``
     is called first and ``end_row`` is set to ``start_row`` plus
     ``batch_size``. Subject and object records are made distinct by
     their assigned uuid (``get_field_records_by_fl_uuid``). A link is
     attempted only when both the subject and the object have a
     non-blank record, a usable uuid and an OK cell;
     ``self.count_new_assertions`` counts the valid links.
     """
     self.clear_source()  # clear prior import for this source
     self.end_row = self.start_row + self.batch_size
     self.get_link_annotations()
     if self.link_rels is False:
         return
     for subj_field_num, rels in self.link_rels.items():
         sub_field_obj = rels['sub_field_obj']
         pc = ProcessCells(self.source_id,
                           self.start_row)
         # now get distinct records as determined by having the same assigned
         # uuid
         distinct_records = pc.get_field_records_by_fl_uuid(subj_field_num,
                                                            False)
         if distinct_records is False:
             continue
         # sort the list in row_order from the import table
         pg = ProcessGeneral(self.source_id)
         distinct_records = pg.order_distinct_records(distinct_records)
         for row_key, dist_rec in distinct_records.items():
             subject_uuid = dist_rec['imp_cell_obj'].fl_uuid
             subject_type = sub_field_obj.field_type
             subject_ok = dist_rec['imp_cell_obj'].cell_ok
             subject_record = dist_rec['imp_cell_obj'].record
             if subject_uuid is False or\
                len(subject_record) < 1:
                 subject_ok = False
             if subject_uuid == 'False':
                 # the uuid may be stored as the string 'False'
                 subject_ok = False
             # sort is kept across the predicates of one subject record
             sort = 0
             in_rows = dist_rec['rows']
             if subject_ok is False:
                 # row -1 is used so the lookups below match no cells
                 in_rows = [-1]
             for pred_obj in rels['pred_objs']:
                 # maps predicate uuid -> rows where that predicate applies
                 act_preds = {}
                 obs_num = 1  # default observation number
                 if pred_obj['predicate_uuid'] is not False:
                     # limit to the 'in rows' for the current item
                     act_preds[pred_obj['predicate_uuid']] = in_rows
                 elif pred_obj['pred_field_obj'] is not False:
                     # linking predicate is in a field
                     pred_field_obj = pred_obj['pred_field_obj']
                     if pred_field_obj.obs_num > 0:
                         obs_num = pred_field_obj.obs_num
                     sort = pred_field_obj.field_num
                     pc = ProcessCells(self.source_id,
                                       self.start_row)
                     predicate_records = pc.get_field_records(
                         pred_field_obj.field_num, in_rows)
                     if predicate_records is False:
                         # other callers treat False as "no records";
                         # without this guard .items() would fail
                         predicate_records = {}
                     for pred_row_key, pred_rec in predicate_records.items():
                         clp = CandidateLinkPredicate()
                         clp.source_id = self.source_id
                         clp.project_uuid = self.project_uuid
                         clp.make_reconcile_link_pred(
                             pred_rec['imp_cell_obj'].record)
                         if clp.uuid is not False:
                             act_preds[clp.uuid] = pred_rec['rows']
                 obs_node = '#obs-' + str(obs_num)
                 for predicate_uuid, act_in_rows in act_preds.items():
                     obj_field_obj = pred_obj['obj_field_obj']
                     # now get a value for the object from the imported cells
                     pc = ProcessCells(self.source_id,
                                       self.start_row)
                     obj_recs = pc.get_field_records_by_fl_uuid(
                         obj_field_obj.field_num, act_in_rows)
                     if sort < 1:
                         sort = obj_field_obj.field_num
                     if obj_recs is False:
                         continue
                     for hash_key, obj_rec in obj_recs.items():
                         object_uuid = obj_rec['imp_cell_obj'].fl_uuid
                         object_type = obj_field_obj.field_type
                         object_ok = obj_rec['imp_cell_obj'].cell_ok
                         object_record = obj_rec['imp_cell_obj'].record
                         if len(object_record) < 1:
                             # blank record, don't make a link
                             object_ok = False
                         if object_uuid is False or\
                            len(object_uuid) < 1:
                             object_ok = False
                         if object_uuid == 'False':
                             object_ok = False
                         if not (object_ok and subject_ok):
                             continue
                         cla = CandidateLinkAssertion()
                         cla.project_uuid = self.project_uuid
                         cla.source_id = self.source_id
                         cla.subject_uuid = subject_uuid
                         cla.subject_type = subject_type
                         cla.obs_node = obs_node
                         cla.obs_num = obs_num
                         cla.sort = sort
                         cla.predicate_uuid = predicate_uuid
                         cla.object_uuid = object_uuid
                         cla.object_type = object_type
                         if predicate_uuid is not False:
                             cla.create_link()
                             if cla.is_valid:
                                 self.count_new_assertions += 1
                                 print('Link Count OK: ' + str(self.count_new_assertions))
Пример #9
0
 def process_media_batch(self):
     """ Reconciles media items, and their files, for a batch of rows.

         Every distinct record of each media field is reconciled as a
         media manifest item. An item that reconciles is listed as either
         a new or an already reconciled entity, and the records of the
         field's parts (limited to the item's import rows) are then
         reconciled as its media files. An item that does not reconcile
         is listed in self.not_reconciled_entities instead.
     """
     self.clear_source()  # clear prior import for this source
     self.end_row = self.start_row + self.batch_size
     self.get_media_fields()
     if len(self.media_fields) < 1:
         # no media fields, so there is nothing to reconcile
         return
     for media_field in self.media_fields:
         media_cells = ProcessCells(self.source_id, self.start_row)
         media_records = media_cells.get_field_records(media_field.field_num,
                                                       False)
         if media_records is False:
             continue
         for _, media_rec in media_records.items():
             cm = CandidateMedia()
             cm.project_uuid = self.project_uuid
             cm.source_id = self.source_id
             cm.class_uri = media_field.field_value_cat
             # list of rows where this record value is found
             cm.import_rows = media_rec['rows']
             imp_cell = media_rec['imp_cell_obj']
             cm.reconcile_manifest_item(imp_cell)
             if cm.uuid is False:
                 # no media item, so note the cell that failed to reconcile
                 unreconciled_id = '-'.join([str(imp_cell.field_num),
                                             str(imp_cell.row_num)])
                 self.not_reconciled_entities.append(
                     {'id': unreconciled_id, 'label': imp_cell.record})
                 continue
             if cm.new_entity:
                 entity_list = self.new_entities
             else:
                 entity_list = self.reconciled_entities
             entity_list.append({'id': str(cm.uuid), 'label': cm.label})
             # we have a media item! Now we can add files to it
             for part_field in media_field.parts:
                 part_cells = ProcessCells(self.source_id, self.start_row)
                 part_records = part_cells.get_field_records(
                     part_field.field_num, cm.import_rows)
                 if part_records is False:
                     continue
                 # distinct records for the media file parts of a media item
                 for _, part_rec in part_records.items():
                     cmf = CandidateMediaFile(cm.uuid)
                     cmf.project_uuid = self.project_uuid
                     cmf.source_id = self.source_id
                     # file type is in the field_value_cat
                     cmf.file_type = part_field.field_value_cat
                     cmf.reconcile_media_file(part_rec['imp_cell_obj'].record)
Пример #10
0
 def process_complex_batch(self):
     """ Processes complex description fields for a batch of import rows.

         For every complex description field, each distinct record of
         the field it describes is treated as a subject. A subject with
         a usable uuid and non-blank complex description cells in its
         rows gets:
           1. an assertion linking the subject to a complex-description
              item (uuid is the subject uuid + FRAG_ID_PREFIX + the
              1-based number of the complex description field), and
           2. one string label assertion on that complex-description
              for each non-blank cell.
         Assertion saves are best-effort: a failed save is skipped, and
         only successful saves are tallied in self.count_new_assertions.
     """
     self.clear_source()  # clear prior import for this source
     self.end_row = self.start_row + self.batch_size
     self.get_complex_description_fields()
     if len(self.complex_des_fields) < 1:
         # no complex description fields, so nothing to do
         return
     print('Number of Complex Description Fields: ' +
           str(len(self.complex_des_fields)))
     # adding a source_id suffix keeps these records from being deleted
     # as descriptions get processed
     complex_source_id = self.source_id + ProcessGeneral.COMPLEX_DESCRIPTION_SOURCE_SUFFIX
     # maps a label string to the uuid of its string record, so a label
     # that repeats in the batch only needs one get_make_string call
     label_str_uuids = {}
     for cp_id_number, cp_field in enumerate(self.complex_des_fields, 1):
         pc = ProcessCells(self.source_id, self.start_row)
         distinct_records = pc.get_field_records_by_fl_uuid(
             cp_field.describes_field.field_num, False)
         if distinct_records is False:
             continue
         # sort the list in row_order from the import table
         pg = ProcessGeneral(self.source_id)
         distinct_records = pg.order_distinct_records(distinct_records)
         for _row_key, dist_rec in distinct_records.items():
             # observation numbers below 1 fall back to 1
             obs_num = 1 if cp_field.obs_num < 1 else cp_field.obs_num
             obs_node = '#obs-' + str(obs_num)
             imp_cell = dist_rec['imp_cell_obj']
             subject_uuid = imp_cell.fl_uuid
             subject_type = cp_field.describes_field.field_type
             subject_ok = imp_cell.cell_ok
             subject_record = imp_cell.record
             # a missing uuid (False, or the string 'False') or a blank
             # record means there is no subject to describe
             if subject_uuid is False or\
                len(subject_record) < 1 or\
                subject_uuid == 'False':
                 subject_ok = False
             in_rows = dist_rec['rows']
             print('Look for complex description labels in rows: ' +
                   str(in_rows))
             if subject_ok is False:
                 continue
             # OK! we have the subjects of complex descriptions
             # with uuids, so now we can make an fl_uuid for each
             # of the complex description fields.
             complex_uuid = subject_uuid + self.FRAG_ID_PREFIX + str(
                 cp_id_number)
             complex_recs = ImportCell.objects\
                                      .filter(source_id=self.source_id,
                                              field_num=cp_field.field_num,
                                              row_num__in=in_rows)\
                                      .exclude(record='')
             if len(complex_recs) < 1:
                 continue
             # we have records in the complex description field that are not blank
             # and are associated with the subject of the complex description.
             # so now, let's record this association.
             new_ass = Assertion()
             new_ass.uuid = subject_uuid
             new_ass.subject_type = subject_type
             new_ass.project_uuid = self.project_uuid
             new_ass.source_id = complex_source_id
             new_ass.obs_node = obs_node
             new_ass.obs_num = obs_num
             new_ass.sort = 100 + cp_id_number
             new_ass.visibility = 1
             new_ass.predicate_uuid = ComplexDescription.PREDICATE_COMPLEX_DES
             new_ass.object_type = 'complex-description'
             new_ass.object_uuid = complex_uuid
             # save exactly once, inside the guard, so that a failed save
             # is skipped instead of stopping the rest of the batch
             try:
                 new_ass.save()
             except Exception:
                 pass
             else:
                 print('Saved complex-description: ' + complex_uuid)
                 self.count_new_assertions += 1
             # now look through the complex description records and make labels
             for comp_rec in complex_recs:
                 # first save the fl_uuid for the complex description
                 comp_rec.fl_uuid = complex_uuid
                 comp_rec.save()
                 if isinstance(cp_field.value_prefix, str):
                     cp_label = cp_field.value_prefix + comp_rec.record
                 else:
                     cp_label = comp_rec.record
                 if cp_label not in label_str_uuids:
                     # make a uuid for the record value
                     sm = StringManagement()
                     sm.project_uuid = self.project_uuid
                     sm.source_id = complex_source_id
                     oc_string = sm.get_make_string(cp_label)
                     label_str_uuids[cp_label] = oc_string.uuid
                 content_uuid = label_str_uuids[cp_label]
                 new_ass = Assertion()
                 new_ass.uuid = complex_uuid
                 new_ass.subject_type = 'complex-description'
                 new_ass.project_uuid = self.project_uuid
                 new_ass.source_id = complex_source_id
                 new_ass.obs_node = '#obs-' + str(
                     self.obs_num_complex_description_assertions)
                 new_ass.obs_num = self.obs_num_complex_description_assertions
                 new_ass.sort = 1
                 new_ass.visibility = 1
                 new_ass.predicate_uuid = ComplexDescription.PREDICATE_COMPLEX_DES_LABEL
                 new_ass.object_type = 'xsd:string'
                 new_ass.object_uuid = content_uuid
                 # best-effort save, same as for the link assertion above
                 try:
                     new_ass.save()
                 except Exception:
                     pass
                 else:
                     self.count_new_assertions += 1