def kvp_writer_udf(row, fm_config):
    """
    Converts XML to kvpjson

    Builds an XML2kvp instance from fm_config, maps row.document with it,
    flattens multivalued fields to a single delimited string, adds the
    row's identifiers, and serializes the result as one JSON line.

    Args:
        row: record row; the attributes read here are document, record_id
            and combine_id
        fm_config (dict): keyword arguments passed to XML2kvp()

    Returns:
        (str): JSON serialization of the mapped key/value dictionary
    """

    # get handler, that includes defaults
    xml2kvp_defaults = XML2kvp(**fm_config)

    # convert XML to kvp
    xml2kvp_handler = XML2kvp.xml_to_kvp(
        row.document, return_handler=True, handler=xml2kvp_defaults)

    # loop through and convert lists/tuples to multivalue_delim;
    # only values of existing keys are replaced, so mutating during
    # iteration is safe. isinstance() also covers list/tuple subclasses.
    kvp_dict = xml2kvp_handler.kvp_dict
    delim = xml2kvp_handler.multivalue_delim
    for k, v in kvp_dict.items():
        if isinstance(v, (list, tuple)):
            kvp_dict[k] = delim.join(v)

    # mixin other row attributes to kvp_dict
    kvp_dict.update({
        'record_id': row.record_id,
        'combine_id': row.combine_id
    })

    # return JSON line
    return json.dumps(kvp_dict)
def test_field_mapper(request):
    """
    View to live test field mapper configurations

    GET: renders 'core/test_field_mapper.html' with all FieldMapper
    objects, the optional 'q' and 'fmid' query parameters, a default
    XML2kvp instance and breadcrumbs.

    POST: looks up the Record whose id is the POSTed 'db_id', parses the
    POSTed 'fm_config_json' and maps the record's document with XML2kvp.
    The mapped dictionary is returned as JSON; any failure along the way
    (missing record, invalid JSON, mapping error) is returned as
    {'error': <message>}.

    Any other method: JSON error with HTTP status 405.

    Args:
        request: incoming HTTP request

    Returns:
        rendered template response for GET, JsonResponse otherwise
    """

    if request.method == 'GET':
        return render(request, 'core/test_field_mapper.html', {
            # optional: limit to one, pre-existing record
            'q': request.GET.get('q', None),
            # optional: pre-requested field mapper
            'fmid': request.GET.get('fmid', None),
            'field_mappers': FieldMapper.objects.all(),
            'xml2kvp_handle': XML2kvp(),
            'breadcrumbs': breadcrumb_parser(request)
        })

    # If POST, provide mapping of record
    if request.method == 'POST':

        LOGGER.debug('running test field mapping')
        LOGGER.debug(request.POST)

        try:
            # get record; kept inside the try so that a missing or invalid
            # db_id yields the JSON error payload rather than a server error
            record = Record.objects.get(id=request.POST.get('db_id'))

            # parse record with XML2kvp
            fm_config = json.loads(request.POST.get('fm_config_json'))
            kvp_dict = XML2kvp.xml_to_kvp(record.document, **fm_config)

            # return as JSON
            return JsonResponse(kvp_dict)

        except Exception as err:
            LOGGER.debug('field mapper was unsuccessful')
            LOGGER.debug('%s', err)
            return JsonResponse({'error': str(err)})

    # a view must always return a response; reject every other HTTP method
    return JsonResponse(
        {'error': 'method %s not allowed' % request.method}, status=405)
def _write_tabular_csv(spark, kvp_batch_rdd, base_path, folder_name, fm_config):
    """
    Write a batch of kvp JSON records to CSV

    Args:
        spark: object whose read.json() turns the rdd into a DataFrame
        kvp_batch_rdd: rdd of JSON records to write
        base_path (str): parent location of the output
        folder_name (str): name appended to base_path for this batch
        fm_config (dict): keyword arguments passed to XML2kvp()

    Returns:
        None
    """

    # parse the JSON rdd into a DataFrame
    batch_frame = spark.read.json(kvp_batch_rdd)

    # instantiate XML2kvp from the config; the instance itself is not
    # used any further in this function
    _ = XML2kvp(**fm_config)

    # output goes to <base_path>/<folder_name>, with a header row
    destination = '%s/%s' % (base_path, folder_name)
    batch_frame.write.csv(destination, header=True)
def map_record(self, record_string=None, db_id=None, combine_id=None, record_id=None, publish_set_id=None, fingerprint=None):
    '''
    Map a single record to a key/value dictionary with XML2kvp

    The record's identifiers are merged into the 'add_literals' entry of
    self.field_mapper_config (created if absent) before mapping, so that
    dictionary is modified in place on every call.

    Args:
        record_string (str): string of record document
        db_id (str): mongo db id
        combine_id (str): combine_id id
        record_id (str): record id
        publish_set_id (str): core.models.RecordGroup.published_set_id,
            used to build publish identifier
        fingerprint (str): fingerprint

    Returns:
        (tuple):
            0 (str): ['success','fail']
            1 (dict): details from mapping process, success or failure
    '''

    # identifying values added to every mapped record as literals
    record_literals = {
        # temporary id field
        'temp_id': db_id,
        'combine_id': combine_id,
        'record_id': record_id,
        'publish_set_id': publish_set_id,
        # record's Combine DB id
        'db_id': db_id,
        # record's crc32 document hash, aka "fingerprint"
        'fingerprint': fingerprint,
    }

    try:
        # make sure the literals container exists, then merge into it
        self.field_mapper_config.setdefault(
            'add_literals', {}).update(record_literals)

        # map with XML2kvp
        mapped = XML2kvp.xml_to_kvp(record_string, **self.field_mapper_config)

    except Exception as err:
        failure = {
            'db_id': db_id,
            'record_id': record_id,
            'mapping_error': str(err)
        }
        return ('fail', failure)

    return ('success', mapped)
def _transform_openrefine(self, row):
    """
    Apply the operations stored in self.payload to a record's XML

    self.payload is parsed as JSON and iterated as a list of events; two
    kinds of event are handled, all others are skipped:

        - op 'core/mass-edit': for every element matched by the event's
          column, text found in an edit's 'from' is replaced by its 'to'
        - op 'core/text-transform' with an expression that starts with
          'jython:': the code after the prefix is wrapped in a function
          and called with each matched element's text

    Args:
        row: record row, wrapped in PythonUDFRecord for XML access

    Returns:
        (str): the re-serialized XML on success; if anything raises, the
            exception message is returned instead
    """

    try:

        # parse or_actions
        or_actions = json.loads(self.payload)

        # load record as parsed_record
        parsed_record = PythonUDFRecord(row)

        def find_elements(column_name):
            # translate the column name to an xpath and run it on the record
            xpath = XML2kvp.k_to_xpath(column_name)
            LOGGER.debug("using xpath value: %s", xpath)
            return parsed_record.xml.xpath(
                xpath, namespaces=parsed_record.nsmap)

        # loop through actions
        for event in or_actions:

            # handle core/mass-edit
            if event['op'] == 'core/mass-edit':

                for elem in find_elements(event['columnName']):
                    # NOTE(review): edits are applied one after another, so
                    # a later edit can match text written by an earlier one
                    # - confirm this chaining is intended
                    for edit in event['edits']:
                        # check if element text in from, change
                        if elem.text in edit['from']:
                            elem.text = edit['to']

            # handle jython
            elif (event['op'] == 'core/text-transform'
                  and event['expression'].startswith('jython:')):

                # fire up temp module
                temp_pyts = ModuleType('temp_pyts')

                # strip only the leading prefix; splitting on 'jython:'
                # would truncate code that contains that text itself
                code = event['expression'][len('jython:'):]

                # wrap in function and write to temp module
                code = 'def temp_func(value):\n%s' % textwrap.indent(
                    code, prefix=' ')

                # SECURITY: this executes code taken from the payload;
                # the payload must come from a trusted source
                exec(code, temp_pyts.__dict__)

                for elem in find_elements(event['columnName']):
                    elem.text = temp_pyts.temp_func(elem.text)

        # re-serialize as trans_result
        return etree.tostring(parsed_record.xml).decode('utf-8')

    except Exception as err:
        # the error message is returned in place of the transformed XML
        return str(err)