Пример #1
0
    def get_data(self, corpus, corpus_id, token_id, kwic_len):
        """
        Load a sentence, repair nodes which failed to parse and export
        the resulting tree.

        The tree configuration used is the first one from the display
        list of the corpus.

        Args:
            corpus: a corpus object passed through to the sentence loaders
            corpus_id: an identifier used to look up tree configuration
            token_id: passed through to the sentence loaders
            kwic_len: passed through to the sentence loaders

        Returns (tuple):
            exported tree template data and the mbk.TreeNodeEncoder class

        Raises:
            BackendDataParseException: if a node is broken in both
                the regular and the fallback parse
        """
        all_tree_confs = self._conf.get_trees(corpus_id)
        tree_id = self._conf.get_tree_display_list(corpus_id)[0]
        conf = all_tree_confs[tree_id]

        raw_data = self._load_raw_sent(corpus, corpus_id, token_id, kwic_len, conf.all_attrs)
        sentence_items = raw_data['data']
        placeholders = self._conf.get_empty_value_placeholders(corpus_id)
        nodes = self._parse_raw_sent(sentence_items, conf.all_attrs, placeholders)

        # The fallback data are fetched lazily - only once the first
        # broken node is encountered.
        fallback_nodes = None
        for idx, node in enumerate(nodes):
            if not self.is_error_node(node):
                continue
            repaired = dict(node.result.items())
            if fallback_nodes is None:
                fallback_nodes = self._fetch_fallback_info(
                    corpus, corpus_id, token_id, kwic_len, conf.parent_attr, conf.attr_refs)
            fallback_node = fallback_nodes[idx]
            if self.is_error_node(fallback_node):
                # even fallback is broken - nothing we can do
                raise BackendDataParseException('Failed to parse sentence')
            for attr, value in node.result.items():
                is_reference = attr == conf.parent_attr or attr in conf.attr_refs
                if is_reference:
                    # structural attributes are taken from the fallback parse
                    repaired[attr] = fallback_node[attr]
                elif value is None:
                    repaired[attr] = 'N/A'
            # replace the error wrapper with a plain node dict
            nodes[idx] = repaired

        if conf.root_node:
            nodes = [conf.root_node] + nodes
        self._decode_tree_data(nodes, conf.parent_attr, conf.attr_refs)
        tree_data = mbk.TreeBuilder().process(conf, nodes)
        template = UcnkTreeTemplate(tree_id, tree_data, raw_data['kwic_pos'], all_tree_confs)
        return template.export(), mbk.TreeNodeEncoder
Пример #2
0
    def _parse_raw_sent(in_data, tree_attrs, empty_val_placeholders):
        """
        Args:
            in_data (list of str): a string-encoded sentence and required attribute metadata (see _load_raw_sent())
            tree_attrs (list of str): a list of attributes used by nodes/edges of the tree
            empty_val_placeholders (list of str): a list of values which may represent an empty
                value in a raw sentence data

        Returns (list of dict):
            a list of dict items representing tree nodes
        """
        def import_raw_val(v):
            return None if v in empty_val_placeholders or v == '' else v

        data = []
        for i in range(0, len(in_data), 4):
            parsed = [import_raw_val(x) for x in in_data[i + 2].split('/')]
            if len(parsed) > len(tree_attrs):
                item = dict(zip(tree_attrs, len(tree_attrs) * [None]))
                item['word'] = in_data[i]
                # In case of a parsing error we wrap a partial result into
                # an error and try later to fetch essential data only (= parent
                # and other references to other values).
                data.append(BackendDataParseException(result=item))
            else:
                item = dict(zip(tree_attrs, parsed))
                item['word'] = in_data[i]
                data.append(item)
        return data
Пример #3
0
    def _parse_raw_sent(in_data,
                        tree_attrs,
                        empty_val_placeholders,
                        multival_separ=None):
        """
        Args:
            in_data (list of str): a string-encoded sentence and required attribute metadata (see _load_raw_sent())
            tree_attrs (list of str): a list of attributes used by nodes/edges of the tree
            empty_val_placeholders (list of str): a list of values which may represent an empty
                value in a raw sentence data

        Returns (list of dict):
            a list of dict items representing tree nodes
        """
        def import_raw_val(v):
            return None if v in empty_val_placeholders or v == '' else v

        def expand_multivals(values):
            if multival_separ:
                expanded = []
                for v in values:
                    expanded.append(
                        v.split(multival_separ) if v is not None else [None])
                ans = []
                for i in range(0, max(len(x) for x in expanded)):
                    row = []
                    for v in expanded:
                        if len(v) > i:
                            row.append(v[i])
                        else:
                            row.append(v[0])
                    ans.append(row)
                return ans
            return [values]

        data = []
        for i in range(0, len(in_data), 4):
            parsed_m = expand_multivals(
                [import_raw_val(x) for x in in_data[i + 2].split('/')])
            for j, parsed in enumerate(parsed_m):
                if len(parsed) > len(tree_attrs):
                    item = dict(list(zip(tree_attrs,
                                         len(tree_attrs) * [None])))
                    item['word'] = in_data[i]
                    item['multival_flag'] = None
                    # In case of a parsing error we wrap a partial result into
                    # an error and try later to fetch essential data only (= parent
                    # and other references to other values).
                    data.append(BackendDataParseException(result=item))
                else:
                    item = dict(list(zip(tree_attrs, parsed)))
                    item['word'] = in_data[i]
                    if len(parsed_m) > 1:
                        if j == 0:
                            item['multival_flag'] = 'start'
                        elif j == len(parsed_m) - 1:
                            item['multival_flag'] = 'end'
                        else:
                            item['multival_flag'] = None
                    data.append(item)
        return data