示例#1
0
def get_lemma_map(data):
    """Build a map from a surface token to the list of its distinct lemmas.

    Iterates every analysed item in ``data`` (a dict whose values are lists of
    morphological items), skips punctuation (``'interp'`` tag) and Polish
    stopword lemmas, and collects the lemma head (text before the first
    ``':'``) under the lowercased, stripped token.

    :param data: mapping of anything -> list of items exposing ``.token``,
        ``.lemma`` and ``.morphosyntactic_tags``
    :return: dict mapping cleaned token -> list of distinct cleaned lemmas
    """
    _lemma_map = {}
    stopwords = get_polish_stopwords()
    for _data_item_list in data.values():
        for _data_item in _data_item_list:
            _entity = strip_string(_data_item.token.lower())
            # 'interp' marks punctuation in the tagset — skip it
            if _data_item.morphosyntactic_tags != 'interp':
                _value = strip_string(_data_item.lemma.split(':')[0].lower())
                if _value not in stopwords:
                    # BUG FIX: the original reset _lemma_map[_entity] to
                    # [_value] whenever the lemma was already present,
                    # discarding every other lemma collected so far.
                    # setdefault + membership check accumulates correctly.
                    _lemmas = _lemma_map.setdefault(_entity, [])
                    if _value not in _lemmas:
                        _lemmas.append(_value)
    return _lemma_map
示例#2
0
def get_mapping_classes(jsons, types):
    """Assign root categories to wiki JSON entries and build lookup tables.

    For each entry, the root type is the last subtype listed under ``P31``
    (or, only when ``P31`` is absent, under ``P279``) that appears in
    ``types``. Entries with a Polish wiki title longer than one character
    become ``EntityTuple`` records and are registered via ``add_to_map``.

    :param jsons: iterable of wikidata-style dicts
    :param types: collection of accepted root type ids
    :return: (category_map, tuples, pl_map, en_map, disambiguation)
        note: ``en_map`` is returned but never populated here
    """
    _category_map = {}
    _tuples = []
    _pl_map = {}
    _en_map = {}
    _disambiguation = {}
    _disambiguation_helper = {}
    for item in jsons:
        root_type = ''
        # P31 ("instance of") takes precedence; P279 ("subclass of") is
        # consulted only when P31 is missing entirely.
        if 'P31' in item:
            candidates = item['P31']
        elif 'P279' in item:
            candidates = item['P279']
        else:
            candidates = ()
        for subtype in candidates:
            if subtype in types:
                # last match wins, as in the original nested loops
                _category_map[item['id']] = subtype
                root_type = subtype

        pl_title = item['wiki']['pl']
        if pl_title and len(pl_title) > 1:
            cleaned_title = strip_string(pl_title)
            record = EntityTuple(item['labels']['en'], item['labels']['pl'],
                                 item['id'], root_type, pl_title,
                                 cleaned_title)
            _tuples.append(record)
            add_to_map(_pl_map, _disambiguation, _disambiguation_helper,
                       pl_title, record, root_type)
    return _category_map, _tuples, _pl_map, _en_map, _disambiguation
示例#3
0
def validate(_list_dict, _entity_tuples, _pl_map, _prefix_map, _lemma_map,
             _ply):
    """Check labelled word groups against the entity-linking pipeline.

    Each (label, words) pair is joined, deduplicated, and resolved through
    ``get_label``. Results are partitioned into matches (``_found``),
    mismatches (``_errors``) and unresolved entries (``_not_found``).

    :param _list_dict: list of dicts mapping expected type id -> word list
    :param _ply: expected total count, used only for progress reporting
    :return: (_found, _not_found, _errors)
    """
    _found = []
    _not_found = []
    _errors = []
    cnt = 0
    _current = set()
    _stopwords = get_polish_stopwords()
    # Report progress roughly ten times over the run.
    # BUG FIX: the original computed `cnt % int(_ply / 10)` inline, which
    # raises ZeroDivisionError whenever _ply < 10; clamp the step to >= 1.
    _step = max(1, int(_ply / 10))
    for _dict in _list_dict:
        cnt += 1
        if cnt % _step == 0:
            print("Progress {}".format(cnt))
        for _label, _words in _dict.items():
            joined = ' '.join(map(lambda x: strip_string(x).lower(), _words))
            joined2 = ''.join(map(lambda x: strip_string(x).lower(), _words))
            # skip duplicates already validated in this run
            _key = joined2 + _label
            if _key in _current:
                continue
            _current.add(_key)
            _t = get_label(joined, _entity_tuples, _pl_map, _prefix_map,
                           _lemma_map, _stopwords)
            if not _t:
                _not_found.append((_label, _words))
            elif len(_t) == 1:
                _single = _t[0]
                # preserved quirk: a falsy single result is silently dropped
                if _single:
                    if _single.original_type_id == _label:
                        _found.append((_label, _words, _single))
                    else:
                        _errors.append((_label, _words, _single))
            else:
                # BUG FIX (perf): the original evaluated the same filter()
                # up to twice per entry; compute the matches once.
                _matches = [x for x in _t if x.original_type_id == _label]
                if _matches:
                    # kept as in the original: the match list is appended,
                    # not a (label, words, match) tuple
                    _found.append(_matches)
                else:
                    _errors.append((_label, _words, _t))

    return _found, _not_found, _errors
示例#4
0
def _get_lemma_map(data):
    """Build a token -> lemma map using the external stemmer pipeline.

    Tokens containing digits map to themselves. Remaining words are pushed
    through ``prepare_data``/``preprocess``/``decode_prepare_data``; a
    stemmer result is accepted when its tag contains ``'sup'`` or its first
    two characters match the original word, otherwise the analyser lemma
    collected in ``_handy_map`` is used as a fallback.

    NOTE(review): ``_words`` comes from ``list(set(...))``, so its order is
    not deterministic; the positional zip against ``_corpuse[0]`` assumes
    the stemmer preserves input order — confirm upstream.

    :param data: mapping of anything -> list of items with ``.token``/``.lemma``
    :return: dict mapping cleaned token -> chosen lemma
    """
    stopwords = get_polish_stopwords()
    _lemma_map = {}
    for _data_item_list in data.values():
        # keep items whose cleaned token is longer than one character and
        # whose raw token is not a stopword
        _filtered = list(
            filter(
                lambda y: len(strip_string(y.token).lower()) > 1 and y.token.
                lower() not in stopwords, _data_item_list))
        # fallback lemma per token, taken straight from the analyser output
        _handy_map = {}
        for di in _filtered:
            _handy_map[strip_string(di.token).lower()] = di.lemma.lower()
        _tokens = list(
            filter(lambda y: y and len(y) > 1,
                   map(lambda x: strip_string(x.token).lower(), _filtered)))
        # tokens containing digits are their own lemma
        _numbers = list(filter(lambda y: y and contains_digit(y), _tokens))
        for _number in _numbers:
            _lemma_map[_number] = _number
        _words = list(filter(lambda y: y and not contains_digit(y), _tokens))
        _words = list(set(_words))
        _text = [' '.join(_words)]
        _corpuse = prepare_data(_text)
        _corpuse = preprocess(_corpuse, stemmer, parser)
        _corpuse = decode_prepare_data(_corpuse)
        _tuples = []
        # pair each word with its stemmer result by position
        # (removed dead debug leftover `if 'sup' in ...: i = 1` — the local
        # was never read)
        for _i in range(0, min(len(_words), len(_corpuse[0]))):
            if 'sup' in _corpuse[0][_i][1] or _words[_i][0:2] == _corpuse[0][
                    _i][0][0:2]:
                _tuples.append((_words[_i], _corpuse[0][_i]))

        for _word, _lemma in _tuples:
            if _lemma[1] != 'NaN':
                _lemma_map[_word] = _lemma[0]
            else:
                # stemmer failed for this word — use the analyser fallback
                _lemma_map[_word] = _handy_map[_word]
    return _lemma_map
示例#5
0
def merge_lists(_list1, _lemma_map, _stopwords, _merged_map):
    """Register every lemma combination of each labelled word group.

    For each (label, words) pair, each word is resolved to a list of
    candidate lemmas; every element of the cartesian product of those lists
    (for 1 to 4 resolved words) is keyed into ``_merged_map`` via
    ``key_by_entity``. Groups with more than four resolved words keep the
    original fallback of passing the raw list-of-lists once.

    :param _list1: iterable of (label, words) pairs
    :param _merged_map: mutated in place by ``key_by_entity``
    """
    # local import keeps this block self-contained for the file as-is
    from itertools import product

    for _label, _words in _list1:
        _words = strip_dangling_keywords(
            list(map(lambda x: strip_string(x).lower(), _words)))
        _lemma_entities = []
        for _e in _words:
            _entity_lemma = get_entity(_e, _lemma_map, _stopwords)
            if _entity_lemma:
                _lemma_entities.append(_entity_lemma)
        if 1 <= len(_lemma_entities) <= 4:
            # IDIOM FIX: the original spelled out four copies of hand-rolled
            # nested loops for sizes 1..4; itertools.product yields the same
            # combinations in the same index order.
            for _combo in product(*_lemma_entities):
                key_by_entity(list(_combo), _merged_map, _label)
        elif len(_lemma_entities) > 4:
            # preserved original behavior: for >4 words the list of lemma
            # lists is passed through as-is (no cartesian expansion)
            key_by_entity(_lemma_entities, _merged_map, _label)