def generate_sample_task_without_check(label_config, mode='upload'):
    """Build a sample task dict from a label config, without validating it.

    Every element carrying a ``value`` attribute that starts with ``$``
    contributes one entry to the task: the key is the attribute text without
    its first character, the value is an example looked up first by the full
    ``$name`` and, failing that, by the element's tag name (falling back to
    ``'Something'``).

    :param label_config: XML label config, passed to ``etree.fromstring``
    :param mode: forwarded to ``data_examples(mode=...)``
    :return: dict mapping data keys to example values
    :raises etree.XMLSchemaParseError: if parsing yields ``None``
    """
    root = etree.fromstring(label_config, etree.XMLParser())
    if root is None:
        raise etree.XMLSchemaParseError('Project config is empty or incorrect')

    # presumably a dict of example payloads keyed by tag name / "$name";
    # it is read with .get() and patched in place below
    examples = data_examples(mode=mode)

    task = {}
    for node in root.findall('.//*[@value]'):
        if node.tag == 'List':
            # List examples are rebuilt so their items use the key named by
            # the tag's elementValue attribute (with every '$' removed)
            item_key = node.get('elementValue').replace('$', '')
            examples['List'] = [
                {item_key: 'Hello world'},
                {item_key: 'Goodbye world'},
            ]

        placeholder = node.get('value')
        if not placeholder or placeholder[0] != '$':
            # only "$variable" values refer to task data
            continue

        # prefer an example registered under the variable name itself,
        # otherwise fall back to an example for this tag type
        sample = examples.get(placeholder, None)
        if sample is None:
            sample = examples.get(node.tag, 'Something')
        task[placeholder[1:]] = sample

    return task
def parse_config_to_json(config_string):
    """Parse an XML config string and convert it with ``xmljson.badgerfish``.

    :param config_string: XML text, parsed with a parser created with
        ``recover=False``
    :return: whatever ``xmljson.badgerfish.data`` returns for the parsed root
    :raises etree.XMLSchemaParseError: if parsing yields ``None``
    """
    strict_parser = etree.XMLParser(recover=False)
    root = etree.fromstring(config_string, strict_parser)
    if root is None:
        raise etree.XMLSchemaParseError('xml is empty or incorrect')
    return xmljson.badgerfish.data(root)
def supported_formats(self):
    """Return the input file formats supported for this project's config.

    Reads ``self.label_config`` (XML text) and ``self.data_types``.
    ``json``, ``csv`` and ``tsv`` are always candidates; ``txt`` is added
    only when ``self.data_types`` has exactly one key. If the config contains
    any ``List`` tag, only ``json`` remains.

    :return: set of supported file type names
    :raises etree.XMLSchemaParseError: if parsing yields ``None``
    """
    # load config
    parser = etree.XMLParser()
    xml = etree.fromstring(self.label_config, parser)
    if xml is None:
        raise etree.XMLSchemaParseError(
            "Project config is empty or incorrect")

    supported = {"json", "csv", "tsv"}
    if len(self.data_types.keys()) == 1:
        supported.add("txt")

    # if any of Lists are presented there is only json allowed
    if xml.findall(".//List"):
        # "txt" is only present when there is exactly one data key, so a
        # plain remove("txt") would raise KeyError for multi-key configs;
        # a set difference drops whichever of these are present.
        supported -= {"csv", "tsv", "txt"}

    return supported
def loadSchema(uri, base_uri=None):
    """Load an XSD XML document (specified by filename or URL), and
    return a :class:`lxml.etree.XMLSchema`.

    Schemas are cached in ``_loaded_schemas`` keyed by ``uri``; a cached
    schema is returned without re-parsing.

    :param uri: filename or URL of the schema, passed to ``etree.parse``
    :param base_uri: optional base URL, passed to ``etree.parse`` as
        ``base_url`` and included in error messages
    :raises IOError: if loading raises ``IOError`` (re-raised with the
        schema location added to the message)
    :raises etree.XMLSchemaParseError: if the schema fails to parse
        (re-raised with the schema location added to the message)
    """
    if uri in _loaded_schemas:
        return _loaded_schemas[uri]

    # uri to use for reporting errors - include base uri if any
    if base_uri is None:
        error_uri = uri
    else:
        error_uri = uri + ' (base URI %s)' % base_uri

    try:
        logger.debug('Loading schema %s' % uri)
        document = etree.parse(uri, parser=_get_xmlparser(),
                               base_url=base_uri)
        schema = etree.XMLSchema(document)
    except IOError as io_err:
        # add a little more detail to the error message - but should still
        # be an IO error
        raise IOError('Failed to load schema %s : %s' % (error_uri, io_err))
    except etree.XMLSchemaParseError as parse_err:
        # re-raise as a schema parse error, but ensure includes details
        # about schema being loaded
        raise etree.XMLSchemaParseError('Failed to parse schema %s -- %s'
                                        % (error_uri, parse_err))

    _loaded_schemas[uri] = schema
    return schema
def loadSchema(uri, base_uri=None, override_proxy_requirement=False):
    """Load an XSD XML document (specified by filename or URL), and
    return a :class:`lxml.etree.XMLSchema`.

    Schemas are cached in ``_loaded_schemas`` keyed by ``uri``; a cached
    schema is returned without any further checks.

    Frequently loading a schema without a web proxy may introduce
    significant network resource usage as well as instability if the schema
    becomes unavailable. So when ``HTTP_PROXY`` is absent from
    ``os.environ`` and ``_http_uri(uri)`` is true, this function emits a
    ``UserWarning`` and returns ``None`` instead of a schema, unless
    ``override_proxy_requirement`` is set, in which case the warning is
    logged and loading proceeds.

    :param uri: filename or URL of the schema, passed to ``etree.parse``
    :param base_uri: optional base URL, passed to ``etree.parse`` as
        ``base_url`` and included in warning/error messages
    :param override_proxy_requirement: load over HTTP even without a proxy
    :return: the loaded schema, or ``None`` when loading was skipped
    :raises IOError: if loading raises ``IOError`` (re-raised with the
        schema location added to the message)
    :raises etree.XMLSchemaParseError: if the schema fails to parse
        (re-raised with the schema location added to the message)
    """
    if uri in _loaded_schemas:
        return _loaded_schemas[uri]

    # uri to use for reporting errors - include base uri if any
    if base_uri is None:
        error_uri = uri
    else:
        error_uri = uri + ' (base URI %s)' % base_uri

    # typical reliable use should include a proxy. warn if they're not
    # using one.
    if 'HTTP_PROXY' not in os.environ and _http_uri(uri):
        message = (
            'Loading schema %s without a web proxy may introduce '
            'significant network resource usage as well as '
            'instability if that server becomes inaccessible. '
            'The HTTP_PROXY environment variable is required '
            'for loading schemas. Schema validation will be disabled.'
        ) % (error_uri,)

        if not override_proxy_requirement:
            warnings.warn(message, UserWarning)
            # bail out and return None instead of a schema, so methods
            # that rely on a loaded schema can detect its absence and
            # proceed accordingly.
            return None

        message += (' (overridden: Requesting without proxy. Please '
                    'set HTTP_PROXY as soon as possible.)')
        logger.warning(message)

    try:
        logger.debug('Loading schema %s' % uri)
        document = etree.parse(uri, parser=_get_xmlparser(),
                               base_url=base_uri)
        schema = etree.XMLSchema(document)
    except IOError as io_err:
        # add a little more detail to the error message - but should still
        # be an IO error
        raise IOError('Failed to load schema %s : %s' % (error_uri, io_err))
    except etree.XMLSchemaParseError as parse_err:
        # re-raise as a schema parse error, but ensure includes details
        # about schema being loaded
        raise etree.XMLSchemaParseError('Failed to parse schema %s -- %s'
                                        % (error_uri, parse_err))

    _loaded_schemas[uri] = schema
    return schema
def extract_data_types(cls, label_config):
    """Map each ``$variable`` used in a label config to the tag that uses it.

    Every element with a ``value`` attribute of the form ``$name`` (a ``$``
    followed by at least one more character) contributes ``name -> tag``.
    When several elements share a name, the last one found wins.

    :param label_config: XML label config, passed to ``etree.fromstring``
    :return: dict mapping variable names (without ``$``) to tag names
    :raises etree.XMLSchemaParseError: if parsing yields ``None``
    """
    root = etree.fromstring(label_config, etree.XMLParser())
    if root is None:
        raise etree.XMLSchemaParseError('Project config is empty or incorrect')

    data_type = {}
    for node in root.findall('.//*[@value]'):
        raw_value = node.get('value')
        # a bare "$" or a literal (non-variable) value is not a data key
        if raw_value.startswith('$') and len(raw_value) > 1:
            data_type[raw_value[1:]] = node.tag
    return data_type
def generate_sample_task_without_check(label_config, mode='upload'):
    """Generate sample task only (no validation of the label config).

    Every element carrying a ``value`` attribute that starts with ``$``
    contributes one task entry keyed by the attribute text without its first
    character. The example is looked up by the full ``$name`` first, then by
    tag name, falling back to ``'Something'``. Entries that belong to
    ``TimeSeries`` tags are then replaced with either a sample CSV URL (when
    the chosen example is a string) or generated JSON (when it is a dict).

    :param label_config: XML label config, passed to ``etree.fromstring``
    :param mode: forwarded to ``data_examples(mode=...)``
    :return: dict mapping data keys to example values
    :raises etree.XMLSchemaParseError: if parsing yields ``None``
    """
    # load config
    parser = etree.XMLParser()
    xml = etree.fromstring(label_config, parser)
    if xml is None:
        raise etree.XMLSchemaParseError('Project config is empty or incorrect')

    # make examples pretty
    examples = data_examples(mode=mode)

    # iterate over xml tree and find values with '$'
    task = {}
    for p in xml.findall('.//*[@value]'):  # take all tags with value attribute
        value = p.get('value')
        value_type = p.get('valueType', p.get('valuetype', None))

        # process List: items use the key named by elementValue
        if p.tag == 'List':
            key = p.get('elementValue').replace('$', '')
            examples['List'] = [{key: 'Hello world'}, {key: 'Goodbye world'}]

        # valueType="url": pick URL or raw flavour of the examples for this
        # tag. TimeSeries defaults to the URL flavour when valueType is absent.
        if value_type == 'url':
            examples['Text'] = examples['TextUrl']
        else:
            examples['Text'] = examples['TextRaw']
        if value_type == 'url' or value_type is None:
            examples['TimeSeries'] = examples['TimeSeriesUrl']
        else:
            examples['TimeSeries'] = examples['TimeSeriesRaw']

        if value and value[0] == '$':
            # try get example by variable name
            by_name = examples.get(value, None)
            # not found by name, try get example by type
            if by_name is None:
                by_name = examples.get(p.tag, 'Something')
            task[value[1:]] = by_name

    # TimeSeries special case
    for ts_tag in xml.findall('.//TimeSeries'):
        # Derive the task key exactly as the loop above does (drop only the
        # first '$'). Tags without a "$variable" value never produced a task
        # entry, so skip them instead of failing with KeyError.
        raw_value = ts_tag.get('value')
        if not raw_value or raw_value[0] != '$':
            continue
        tag_value = raw_value[1:]
        ts_task = task.get(tag_value)

        time_column = ts_tag.get('timeColumn')
        value_columns = [
            ts_child.get('column')
            for ts_child in ts_tag
            if ts_child.tag == 'Channel'
        ]
        sep = ts_tag.get('sep')
        time_format = ts_tag.get('timeFormat')

        if isinstance(ts_task, str):
            # data is URL
            params = {'time': time_column, 'values': ','.join(value_columns)}
            if sep:
                params['sep'] = sep
            if time_format:
                params['tf'] = time_format
            task[tag_value] = '/samples/time-series.csv?' + urlencode(params)
        elif isinstance(ts_task, dict):
            # data is JSON
            task[tag_value] = generate_time_series_json(
                time_column, value_columns, time_format)

    return task
def generate_sample_task_without_check(label_config, mode='upload',
                                       secure_mode=False):
    """Generate sample task only (no validation of the label config).

    Every element whose ``value`` attribute starts with ``$`` contributes
    one task entry keyed by the attribute text without the leading ``$``.
    The example is chosen in this order: a truthy example registered under
    the full ``$name``; a ``Paragraphs`` list with remapped keys; a
    ``TimeSeries`` sample (CSV URL or generated JSON); otherwise the example
    for the tag name, falling back to ``'Something'``.

    :param label_config: XML label config, passed to ``etree.fromstring``
    :param mode: forwarded to ``data_examples(mode=...)``
    :param secure_mode: when truthy, treat every object as served by URL,
        as if it declared ``valueType="url"``
    :return: dict mapping data keys to example values
    :raises etree.XMLSchemaParseError: if parsing yields ``None``
    """
    root = etree.fromstring(label_config, etree.XMLParser())
    if root is None:
        raise etree.XMLSchemaParseError('Project config is empty or incorrect')

    # make examples pretty
    examples = data_examples(mode=mode)

    task = {}
    for node in root.findall('.//*[@value]'):
        # Make sure it is a real object tag, extract data placeholder key
        placeholder = node.get('value')
        if not (placeholder and placeholder.startswith('$')):
            continue
        field = placeholder[1:]

        # detect secured mode - objects served as URLs
        declared_type = node.get('valueType') or node.get('valuetype')
        serve_as_url = secure_mode or declared_type == 'url'

        # an example registered under the variable name wins over everything
        named_example = examples.get(placeholder)
        if named_example:
            task[field] = named_example
            continue

        if node.tag == 'Paragraphs':
            # Paragraphs special case - replace nameKey/textKey if presented
            author_key = node.get('nameKey') or node.get('namekey') or 'author'
            body_key = node.get('textKey') or node.get('textkey') or 'text'
            task[field] = [
                {author_key: entry['author'], body_key: entry['text']}
                for entry in examples[node.tag]
            ]
        elif node.tag == 'TimeSeries':
            # TimeSeries special case - generate signals on-the-fly
            time_column = node.get('timeColumn')
            channel_columns = [
                child.get('column')
                for child in node
                if child.tag == 'Channel'
            ]
            separator = node.get('sep')
            time_format = node.get('timeFormat')

            if serve_as_url:
                # data is URL
                query = {
                    'time': time_column,
                    'values': ','.join(channel_columns),
                }
                if separator:
                    query['sep'] = separator
                if time_format:
                    query['tf'] = time_format
                task[field] = '/samples/time-series.csv?' + urlencode(query)
            else:
                # data is JSON
                task[field] = generate_time_series_json(
                    time_column, channel_columns, time_format)
        else:
            # patch for valueType="url"
            if serve_as_url:
                examples['Text'] = examples['TextUrl']
            else:
                examples['Text'] = examples['TextRaw']
            # not found by name, try get example by type
            task[field] = examples.get(node.tag, 'Something')

    return task