예제 #1
0
    def kvp_writer_udf(row, fm_config):
        """
        Convert a row's XML document to a flat key/value JSON line.

        The document is mapped with XML2kvp using a handler built from
        ``fm_config``; any multivalued field is collapsed into a single
        string joined on the handler's ``multivalue_delim``, and the row's
        ``record_id`` / ``combine_id`` are mixed into the result.

        Args:
            row: row exposing ``document``, ``record_id`` and ``combine_id``
                attributes
            fm_config (dict): keyword arguments passed to the XML2kvp
                constructor

        Returns:
            (str): JSON serialization of the flattened key/value dictionary
        """

        # build a handler from the supplied config, then map the document
        # with it, asking for the handler back rather than the bare dict
        configured_handler = XML2kvp(**fm_config)
        handler = XML2kvp.xml_to_kvp(row.document,
                                     return_handler=True,
                                     handler=configured_handler)

        kvp = handler.kvp_dict
        delim = handler.multivalue_delim

        # collapse multivalued fields to a single delimited string;
        # only values of existing keys are reassigned, so the dictionary's
        # size never changes while it is being iterated
        for key, value in kvp.items():
            if isinstance(value, (list, tuple)):
                kvp[key] = delim.join(value)

        # mixin other row attributes to kvp_dict
        kvp.update({
            'record_id': row.record_id,
            'combine_id': row.combine_id
        })

        # return JSON line
        return json.dumps(kvp)
예제 #2
0
def test_field_mapper(request):

    """
    View to live test field mapper configurations.

    GET: renders ``core/test_field_mapper.html`` with all FieldMapper
    objects, a default XML2kvp instance, and the optional ``q`` / ``fmid``
    query parameters passed through to the template.

    POST: maps the Record whose id is given in ``db_id`` using the XML2kvp
    configuration supplied as JSON in ``fm_config_json``, and returns the
    mapped dictionary as JSON. If parsing the configuration or mapping the
    record raises, ``{'error': <message>}`` is returned as JSON instead.

    NOTE(review): any other HTTP method falls through and returns None.
    """

    if request.method == 'GET':
        # get field mapper
        field_mappers = FieldMapper.objects.all()

        # check if limiting to one, pre-existing record
        get_q = request.GET.get('q', None)

        # check for pre-requested transformation scenario
        fmid = request.GET.get('fmid', None)

        # return
        return render(request, 'core/test_field_mapper.html', {
            'q': get_q,
            'fmid': fmid,
            'field_mappers': field_mappers,
            'xml2kvp_handle': XML2kvp(),
            'breadcrumbs': breadcrumb_parser(request)
        })

    # If POST, provide mapping of record
    if request.method == 'POST':

        LOGGER.debug('running test field mapping')
        LOGGER.debug(request.POST)

        # get record; a failed lookup is not caught here and propagates
        record = Record.objects.get(id=request.POST.get('db_id'))

        # get field mapper configuration, as a JSON string
        fm_config_json = request.POST.get('fm_config_json')

        try:

            # parse record with XML2kvp
            fm_config = json.loads(fm_config_json)
            kvp_dict = XML2kvp.xml_to_kvp(record.document, **fm_config)

            # return as JSON
            return JsonResponse(kvp_dict)

        except Exception as err:

            # report the failure to the client rather than raising, and
            # keep the cause in the log as well
            LOGGER.debug('field mapper was unsuccessful: %s', err)
            return JsonResponse({'error': str(err)})
예제 #3
0
def _write_tabular_csv(spark, kvp_batch_rdd, base_path, folder_name,
                       fm_config):
    """
    Write a batch of key/value JSON records out as CSV, with a header row.

    Args:
        spark: session object whose ``read.json`` loads the RDD
        kvp_batch_rdd: RDD of JSON records to load
        base_path (str): parent location of the output
        folder_name (str): name of the output folder under ``base_path``
        fm_config (dict): keyword arguments passed to the XML2kvp
            constructor
    """

    # load the JSON records into a DataFrame
    batch_frame = spark.read.json(kvp_batch_rdd)

    # instantiate XML2kvp from the config; the instance itself is not used
    XML2kvp(**fm_config)

    # write out as CSV, including column headers
    destination = '%s/%s' % (base_path, folder_name)
    batch_frame.write.csv(destination, header=True)
예제 #4
0
파일: es.py 프로젝트: mlibrary/combine
    def map_record(self,
                   record_string=None,
                   db_id=None,
                   combine_id=None,
                   record_id=None,
                   publish_set_id=None,
                   fingerprint=None):
        '''
        Map record

        The identifiers passed in are written into the ``add_literals``
        entry of ``self.field_mapper_config`` (created if missing) before
        mapping. The config is modified in place, so these literals remain
        on the instance and are overwritten by the next call.

        Args:
            record_string (str): string of record document
            db_id (str): mongo db id
            combine_id (str): combine_id id
            record_id (str): record id
            publish_set_id (str): core.models.RecordGroup.published_set_id, used to build publish identifier
            fingerprint (str): fingerprint

        Returns:
            (tuple):
                0 (str): 'success' or 'fail'
                1 (dict): mapped key/value pairs on success; on failure,
                    ``db_id``, ``record_id`` and ``mapping_error``
        '''

        try:

            # fetch the literals dictionary, creating it on first use
            literals = self.field_mapper_config.setdefault('add_literals', {})

            # add literals
            literals.update({
                # temporary id field
                'temp_id': db_id,
                # combine_id field
                'combine_id': combine_id,
                # record_id field
                'record_id': record_id,
                # publish set id
                'publish_set_id': publish_set_id,
                # record's Combine DB id
                'db_id': db_id,
                # record's crc32 document hash, aka "fingerprint"
                'fingerprint': fingerprint,
            })

            # map with XML2kvp
            kvp_dict = XML2kvp.xml_to_kvp(record_string,
                                          **self.field_mapper_config)

            return ('success', kvp_dict)

        except Exception as err:

            # any failure is reported to the caller rather than raised
            return ('fail', {
                'db_id': db_id,
                'record_id': record_id,
                'mapping_error': str(err)
            })
예제 #5
0
    def _transform_openrefine(self, row):
        """
        Apply the actions stored as JSON in ``self.payload`` to a row's XML.

        Two kinds of action are handled; any other action is ignored:

            - ``core/mass-edit``: for each element matched by the action's
              column, text found in an edit's ``from`` is replaced by that
              edit's ``to``
            - ``core/text-transform`` whose expression starts with
              ``jython:``: the expression body is compiled into a function
              of ``value`` and applied to the text of each matched element

        Args:
            row: row passed to PythonUDFRecord to obtain the parsed XML

        Returns:
            (str): the re-serialized XML document. If anything raises, the
                error message string is returned instead, so the two cases
                are not distinguishable by type.
        """

        try:

            # parse or_actions
            or_actions = json.loads(self.payload)

            # load record as parsed_record
            parsed_record = PythonUDFRecord(row)

            def _elements_for(column_name):
                # resolve a column name to the XML elements it addresses
                xpath = XML2kvp.k_to_xpath(column_name)
                LOGGER.debug("using xpath value: %s", xpath)
                return parsed_record.xml.xpath(
                    xpath, namespaces=parsed_record.nsmap)

            # loop through actions
            for event in or_actions:

                # handle core/mass-edit
                if event['op'] == 'core/mass-edit':

                    for elem in _elements_for(event['columnName']):

                        # edits are applied in order, so a later edit sees
                        # the text written by an earlier one
                        for edit in event['edits']:

                            # check if element text in from, change
                            if elem.text in edit['from']:
                                elem.text = edit['to']

                # handle jython
                elif event['op'] == 'core/text-transform' and event[
                        'expression'].startswith('jython:'):

                    # fire up temp module
                    temp_pyts = ModuleType('temp_pyts')

                    # strip only the leading marker: splitting on 'jython:'
                    # would cut the code short wherever that text appeared
                    # again inside the expression
                    code = event['expression'][len('jython:'):]

                    # wrap in function and write to temp module
                    # SECURITY: this executes code taken from the payload;
                    # it is only safe when the payload source is trusted
                    code = 'def temp_func(value):\n%s' % textwrap.indent(
                        code, prefix='       ')
                    exec(code, temp_pyts.__dict__)

                    # replace each element's text with the function result
                    for elem in _elements_for(event['columnName']):
                        elem.text = temp_pyts.temp_func(elem.text)

            # re-serialize the modified document
            return etree.tostring(parsed_record.xml).decode('utf-8')

        except Exception as err:
            # return the error message in place of the document
            return str(err)