def get_lemma_map(data):
    """Map each surface token to the list of lemmas observed for it."""
    _lemma_map = {}
    stopwords = get_polish_stopwords()
    for _data_item_list in data.values():
        for _data_item in _data_item_list:
            _entity = strip_string(_data_item.token.lower())
            # Skip punctuation ('interp') and stopword lemmas.
            if _data_item.morphosyntactic_tags != 'interp':
                _value = strip_string(_data_item.lemma.split(':')[0].lower())
                if _value not in stopwords:
                    if _entity in _lemma_map:
                        # Append only lemmas not yet recorded for this token,
                        # leaving already-collected lemmas intact.
                        if _value not in _lemma_map[_entity]:
                            _lemma_map[_entity].append(_value)
                    else:
                        _lemma_map[_entity] = [_value]
    return _lemma_map
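# Minimal usage sketch for get_lemma_map (not part of the original pipeline):
# `data` maps keys to lists of items exposing .token, .lemma and
# .morphosyntactic_tags. The DataItem namedtuple is hypothetical and only
# illustrates the expected shape; the output assumes strip_string keeps plain
# words unchanged and that 'rzym' is not a stopword.
#
# from collections import namedtuple
# DataItem = namedtuple('DataItem', ['token', 'lemma', 'morphosyntactic_tags'])
# _sample = {'doc': [DataItem('Rzymie', 'Rzym:subst', 'subst'),
#                    DataItem(',', ',:interp', 'interp')]}
# print(get_lemma_map(_sample))  # roughly: {'rzymie': ['rzym']}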
def validate(_list_dict, _entity_tuples, _pl_map, _prefix_map, _lemma_map, _ply):
    _found = []
    _not_found = []
    _errors = []
    cnt = 0
    _current = set()
    _stopwords = get_polish_stopwords()
    for _dict in _list_dict:
        cnt += 1
        if cnt % int(_ply / 10) == 0:
            print("Progress {}".format(cnt))
        for _label, _words in _dict.items():
            joined = ' '.join(strip_string(x).lower() for x in _words)
            joined2 = ''.join(strip_string(x).lower() for x in _words)
            # Skip (words, label) pairs that were already checked.
            if joined2 + _label not in _current:
                _current.add(joined2 + _label)
                _t = get_label(joined, _entity_tuples, _pl_map, _prefix_map,
                               _lemma_map, _stopwords)
                if _t and len(_t) == 1:
                    _single = _t[0]
                    if _single.original_type_id == _label:
                        _found.append((_label, _words, _single))
                    else:
                        _errors.append((_label, _words, _single))
                elif _t and len(_t) > 1:
                    _matching = [x for x in _t if x.original_type_id == _label]
                    if _matching:
                        _found.append(_matching)
                    else:
                        _errors.append((_label, _words, _t))
                else:
                    _not_found.append((_label, _words))
    return _found, _not_found, _errors
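# Hedged usage sketch for validate (an assumption, not from the original run):
# each element of _list_dict maps a gold label to its surface words, and
# get_label returns matches exposing original_type_id, as the function above
# relies on. The _gold value below is hypothetical.
#
# _gold = [{'Q220': ['w', 'Rzymie']}]
# _found, _not_found, _errors = validate(_gold, entity_tuples, pl_map,
#                                        prefix_map, lemma_map, _ply=10)
# print(len(_found), len(_not_found), len(_errors))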
def _get_lemma_map(data):
    """Build a lemma map with the stemmer, falling back to the raw lemmas."""
    stopwords = get_polish_stopwords()
    _lemma_map = {}
    for _data_item_list in data.values():
        _filtered = [di for di in _data_item_list
                     if len(strip_string(di.token).lower()) > 1
                     and di.token.lower() not in stopwords]
        # Raw token -> lemma fallback for words the stemmer cannot handle.
        _handy_map = {strip_string(di.token).lower(): di.lemma.lower()
                      for di in _filtered}
        _tokens = [t for t in (strip_string(di.token).lower() for di in _filtered)
                   if t and len(t) > 1]
        # Tokens containing digits are their own lemmas.
        for _number in (t for t in _tokens if contains_digit(t)):
            _lemma_map[_number] = _number
        _words = list(set(t for t in _tokens if not contains_digit(t)))
        _text = [' '.join(_words)]
        _corpuse = prepare_data(_text)
        _corpuse = preprocess(_corpuse, stemmer, parser)
        _corpuse = decode_prepare_data(_corpuse)
        _tuples = []
        for _i in range(min(len(_words), len(_corpuse[0]))):
            # Keep the pair when the stemmer marked it 'sup' or the first two
            # characters still agree after stemming.
            if 'sup' in _corpuse[0][_i][1] or _words[_i][:2] == _corpuse[0][_i][0][:2]:
                _tuples.append((_words[_i], _corpuse[0][_i]))
        for _word, _lemma in _tuples:
            if _lemma[1] != 'NaN':
                _lemma_map[_word] = _lemma[0]
            else:
                _lemma_map[_word] = _handy_map[_word]
    return _lemma_map
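# Sketch of how this stemmer-based variant could be exercised (assumptions:
# prepare_data/preprocess/decode_prepare_data and the global stemmer/parser
# come from the surrounding project, and saved_data_file/save_to_file behave
# as in the commented-out driver lines below):
#
# lemma_map = _get_lemma_map(get_pickled(saved_data_file.format(5)))
# save_to_file("lemma_map_ext", lemma_map)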
# data = get_pickled(saved_data_file.format(5))
#
# test_tuples = get_test_data(data)
# save_to_file("test_tuples", test_tuples)
# test_tuples = get_pickled("test_tuples")

# _found, _not_found, _errors = get_pickled("results")

# w2vec_model = Word2Vec.load(dir + "all-sentences-word2vec-m3.model")

lemma_map = get_pickled("lemma_map_ext")
(category_map, entity_tuples, pl_map, en_map, disambiguation,
 prefix_map) = get_pickled("mapping-objects_ext")

_t1 = get_label("rzymie", entity_tuples, pl_map, prefix_map, lemma_map,
                get_polish_stopwords())

# _t1 = get_label("system operacyjny", entity_tuples, pl_map, prefix_map, lemma_map,
#                 get_polish_stopwords())
# _t1 = get_label("system operacyjny systemów operacyjnych", entity_tuples, pl_map,
#                 prefix_map, lemma_map, get_polish_stopwords())
# _t1 = get_label("asocjacyjny", entity_tuples, pl_map, prefix_map, lemma_map,
#                 get_polish_stopwords())
# _t2 = get_label("energię", entity_tuples, pl_map, prefix_map, lemma_map)

# _found, _not_found, _errors = get_pickled("errors")
# validate_debug(_errors, entity_tuples, pl_map, prefix_map, lemma_map, 1000)

# multi = int(sys.argv[1])
multi = 95
process_batches(multi)
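# Hedged inspection sketch for the single lookup above (assumes get_label
# returns a list of entity tuples exposing original_type_id, as validate
# expects):
#
# for _match in _t1 or []:
#     print(_match.original_type_id)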