示例#1
0
def get_lemma_map(data):
    """Build a map from a surface token to the list of its distinct lemmas.

    Iterates every data item, skips punctuation items (morphosyntactic tag
    'interp') and lemmas that are Polish stopwords, and accumulates — per
    lowercased, stripped token — each distinct lowercased base lemma
    (the part of ``lemma`` before the first ``:``).

    BUG FIX: the original `if _entity in map and value not in map[_entity]`
    / `else: map[_entity] = [_value]` structure overwrote the accumulated
    lemma list whenever the value was already present, losing earlier
    lemmas. Rewritten as create-if-absent / append-if-new.

    :param data: mapping whose values are lists of items exposing
        ``token``, ``lemma`` and ``morphosyntactic_tags`` attributes
        (project-specific type — structure assumed from usage).
    :return: dict mapping token string -> list of unique lemma strings.
    """
    _lemma_map = {}
    stopwords = get_polish_stopwords()
    for _data_item_list in data.values():
        for _data_item in _data_item_list:
            if _data_item.morphosyntactic_tags == 'interp':
                continue  # punctuation carries no useful lemma
            _entity = strip_string(_data_item.token.lower())
            # Lemma may carry tag suffixes after ':' — keep the base form.
            _value = strip_string(_data_item.lemma.split(':')[0].lower())
            if _value in stopwords:
                continue
            if _entity not in _lemma_map:
                _lemma_map[_entity] = [_value]
            elif _value not in _lemma_map[_entity]:
                _lemma_map[_entity].append(_value)
    return _lemma_map
示例#2
0
def validate(_list_dict, _entity_tuples, _pl_map, _prefix_map, _lemma_map,
             _ply):
    """Check labelled word groups against the entity-labelling pipeline.

    For each ``{label: words}`` dict, joins the stripped/lowercased words
    and asks :func:`get_label` for candidate entities, then buckets the
    outcome:

    - ``_found``     — a candidate's ``original_type_id`` matches the label
                       (for multi-candidate results the matching sublist is
                       appended, mirroring the original behavior);
    - ``_errors``    — candidates exist but none match the label;
    - ``_not_found`` — no candidate at all.

    Duplicate (words, label) pairs are processed only once.

    :param _list_dict: iterable of dicts mapping label -> list of words.
    :param _ply: expected item count, used only to pace progress output.
    :return: tuple ``(_found, _not_found, _errors)``.
    """
    _found = []
    _not_found = []
    _errors = []
    cnt = 0
    _current = set()
    _stopwords = get_polish_stopwords()
    # Print progress roughly every 10% of _ply; max(1, ...) guards the
    # modulo against ZeroDivisionError when _ply < 10 (original crashed).
    _step = max(1, int(_ply / 10))
    for _dict in _list_dict:
        cnt += 1
        if cnt % _step == 0:
            print("Progress {}".format(cnt))
        for _label, _words in _dict.items():
            joined = ' '.join(strip_string(w).lower() for w in _words)
            joined2 = ''.join(strip_string(w).lower() for w in _words)
            _key = joined2 + _label
            if _key in _current:
                continue  # this (words, label) pair was already checked
            _current.add(_key)
            _t = get_label(joined, _entity_tuples, _pl_map, _prefix_map,
                           _lemma_map, _stopwords)
            if _t and len(_t) == 1:
                _t = _t[0]
                if _t and _t.original_type_id == _label:
                    _found.append((_label, _words, _t))
                elif _t:
                    _errors.append((_label, _words, _t))
            elif _t and len(_t) > 1:
                # Filter once instead of materializing the same filter
                # three times as the original did.
                _matching = [x for x in _t if x.original_type_id == _label]
                if _matching:
                    # NOTE(review): appends the matching list itself rather
                    # than a (_label, _words, ...) tuple — inconsistent with
                    # the single-candidate branch, but preserved so existing
                    # consumers of _found keep working.
                    _found.append(_matching)
                else:
                    _errors.append((_label, _words, _t))
            else:
                _not_found.append((_label, _words))

    return _found, _not_found, _errors
示例#3
0
def _get_lemma_map(data):
    """Build a token -> lemma map using an external stemmer/parser pipeline.

    Per data-item list: filters out one-character tokens and stopwords,
    records each item's own lemma as a fallback, maps digit-containing
    tokens to themselves, runs the remaining unique words through
    ``prepare_data`` / ``preprocess`` / ``decode_prepare_data``, and pairs
    each word with its pipeline output when the tags contain 'sup' or the
    first two characters agree. A pipeline lemma tagged ``'NaN'`` falls
    back to the item's own lemma.

    FIX: removed the dead debug leftover ``if 'sup' in ...: i = 1`` —
    the local ``i`` was never read (breakpoint residue).

    :param data: mapping whose values are lists of items exposing
        ``token`` and ``lemma`` (project-specific type — assumed from use).
    :return: dict mapping word -> lemma string.
    """
    stopwords = get_polish_stopwords()
    _lemma_map = {}
    for _data_item_list in data.values():
        # Keep items whose stripped token is longer than one character and
        # whose raw lowercased token is not a stopword.
        _filtered = list(
            filter(
                lambda y: len(strip_string(y.token).lower()) > 1 and y.token.
                lower() not in stopwords, _data_item_list))
        # Fallback lemma per token, taken from the item itself.
        _handy_map = {}
        for di in _filtered:
            _handy_map[strip_string(di.token).lower()] = di.lemma.lower()
        _tokens = list(
            filter(lambda y: y and len(y) > 1,
                   map(lambda x: strip_string(x.token).lower(), _filtered)))
        # Tokens containing digits map to themselves — no lemmatization.
        _numbers = list(filter(lambda y: y and contains_digit(y), _tokens))
        for _number in _numbers:
            _lemma_map[_number] = _number
        _words = list(filter(lambda y: y and not contains_digit(y), _tokens))
        _words = list(set(_words))
        _text = [' '.join(_words)]
        # Run the unique words through the external lemmatization pipeline.
        _corpuse = prepare_data(_text)
        _corpuse = preprocess(_corpuse, stemmer, parser)
        _corpuse = decode_prepare_data(_corpuse)
        _tuples = []
        # NOTE(review): pairs words with pipeline output positionally;
        # assumes both sequences stay aligned — confirm upstream.
        for _i in range(0, min(len(_words), len(_corpuse[0]))):
            if 'sup' in _corpuse[0][_i][1] or _words[_i][0:2] == _corpuse[0][
                    _i][0][0:2]:
                _tuples.append((_words[_i], _corpuse[0][_i]))

        for _word, _lemma in _tuples:
            if _lemma[1] != 'NaN':
                _lemma_map[_word] = _lemma[0]
            else:
                # Pipeline gave no usable lemma — use the item's own.
                _lemma_map[_word] = _handy_map[_word]
    return _lemma_map
示例#4
0
# NOTE(review): the commented-out lines throughout this script are leftover
# experiments from earlier runs; kept as-is pending confirmation that they
# are no longer needed.

# data = get_pickled(saved_data_file.format(5))
#
# test_tuples = get_test_data(data)
# save_to_file("test_tuples", test_tuples)
# test_tuples = get_pickled("test_tuples")

# _found, _not_found, _errors = get_pickled("results")

# w2vec_model = Word2Vec.load(dir + "all-sentences-word2vec-m3.model")

# Load the precomputed token -> lemma map from its pickle file.
lemma_map = get_pickled("lemma_map_ext")

# Load the pickled mapping objects used by the labelling pipeline.
(category_map, entity_tuples, pl_map, en_map, disambiguation,
 prefix_map) = get_pickled("mapping-objects_ext")
# Smoke-test the labelling pipeline on a single word ("rzymie").
_t1 = get_label("rzymie", entity_tuples, pl_map, prefix_map, lemma_map,
                get_polish_stopwords())
# _t1 = get_label("system operacyjny", entity_tuples, pl_map, prefix_map, lemma_map,
#                 get_polish_stopwords())
# _t1 = get_label("system operacyjny systemów operacyjnych", entity_tuples, pl_map, prefix_map, lemma_map,
#                 get_polish_stopwords())
#
# _t1 = get_label("asocjacyjny", entity_tuples, pl_map, prefix_map, lemma_map,
#                 get_polish_stopwords())
# _t2 = get_label("energię", entity_tuples, pl_map, prefix_map, lemma_map)
# _found, _not_found, _errors = get_pickled("errors")
# validate_debug(_errors, entity_tuples, pl_map, prefix_map, lemma_map, 1000)

# Batch count was originally taken from the CLI; hard-coded to 95 here.
# multi = int(sys.argv[1])
multi = 95
process_batches(multi)