def get_lemma_map(data):
    """Build a mapping from surface tokens to their distinct lemmas.

    Iterates over every data item, skips punctuation entries (tag
    'interp') and Polish stopwords, and collects for each cleaned,
    lowercased token the list of distinct cleaned lemmas observed for it.

    Args:
        data: mapping whose values are lists of items exposing
            ``.token``, ``.lemma`` and ``.morphosyntactic_tags``
            (schema assumed from usage — confirm against the caller).

    Returns:
        dict mapping cleaned lowercase token -> list of distinct lemma strings.
    """
    _lemma_map = {}
    stopwords = get_polish_stopwords()
    for _data_item_list in data.values():
        for _data_item in _data_item_list:
            # Skip punctuation entries entirely.
            if _data_item.morphosyntactic_tags == 'interp':
                continue
            _entity = strip_string(_data_item.token.lower())
            # The lemma may carry annotations after ':'; keep only the base form.
            _value = strip_string(_data_item.lemma.split(':')[0].lower())
            if _value in stopwords:
                continue
            # BUG FIX: the original `else` branch reset the list to [_value]
            # whenever the value was already present, discarding every other
            # lemma accumulated for this entity. Accumulate without resetting.
            _lemmas = _lemma_map.setdefault(_entity, [])
            if _value not in _lemmas:
                _lemmas.append(_value)
    return _lemma_map
def get_mapping_classes(jsons, types):
    """Scan wikidata-style records and build category and entity lookup maps.

    For each record, the first property list found among 'P31' then 'P279'
    is scanned for subtypes present in ``types``; the last match becomes the
    record's root type. Records with a Polish wiki title longer than one
    character are turned into EntityTuple entries and registered via
    ``add_to_map``.

    Returns:
        (_category_map, _tuples, _pl_map, _en_map, _disambiguation) —
        note that ``_en_map`` is never populated here and is returned empty.
    """
    category_map = {}
    entity_tuples = []
    pl_map = {}
    en_map = {}
    disambiguation = {}
    disambiguation_helper = {}
    for record in jsons:
        root_type = ''
        # 'P31' takes precedence; 'P279' is consulted only when 'P31' is absent.
        if 'P31' in record:
            candidates = record['P31']
        elif 'P279' in record:
            candidates = record['P279']
        else:
            candidates = []
        for subtype in candidates:
            if subtype in types:
                category_map[record['id']] = subtype
                root_type = subtype  # last match wins, as in the original scan
        pl_title = record['wiki']['pl']
        if pl_title and len(pl_title) > 1:
            cleaned_entity = strip_string(pl_title)
            entry = EntityTuple(record['labels']['en'], record['labels']['pl'],
                                record['id'], root_type, pl_title,
                                cleaned_entity)
            entity_tuples.append(entry)
            add_to_map(pl_map, disambiguation, disambiguation_helper,
                       pl_title, entry, root_type)
    return category_map, entity_tuples, pl_map, en_map, disambiguation
def validate(_list_dict, _entity_tuples, _pl_map, _prefix_map, _lemma_map,
             _ply):
    """Validate labelled word groups against the entity lookup tables.

    Each (label, words) pair is normalized, deduplicated, and resolved via
    ``get_label``. Pairs whose resolved entity matches the label go to
    ``_found``; mismatches go to ``_errors``; unresolved pairs go to
    ``_not_found``.

    Args:
        _list_dict: iterable of dicts mapping label -> list of words.
        _ply: total item count, used only to pace progress printing.

    Returns:
        (_found, _not_found, _errors) lists.
    """
    _found = []
    _not_found = []
    _errors = []
    cnt = 0
    _current = set()
    _stopwords = get_polish_stopwords()
    # BUG FIX: int(_ply / 10) is 0 for _ply < 10, which made the modulo
    # below raise ZeroDivisionError. Clamp the progress step to >= 1.
    _step = max(int(_ply / 10), 1)
    for _dict in _list_dict:
        cnt += 1
        if cnt % _step == 0:
            print("Progress {}".format(cnt))
        for _label, _words in _dict.items():
            joined = ' '.join(map(lambda x: strip_string(x).lower(), _words))
            joined2 = ''.join(map(lambda x: strip_string(x).lower(), _words))
            # Skip (words, label) pairs already validated.
            if joined2 + _label in _current:
                continue
            _current.add(joined2 + _label)
            _t = get_label(joined, _entity_tuples, _pl_map, _prefix_map,
                           _lemma_map, _stopwords)
            if _t and len(_t) == 1:
                _t = _t[0]
                if _t and _t.original_type_id == _label:
                    _found.append((_label, _words, _t))
                elif _t:
                    _errors.append((_label, _words, _t))
            elif _t:  # more than one candidate entity
                # Hoisted: the original recomputed this filter up to 3 times.
                _matching = [x for x in _t if x.original_type_id == _label]
                if _matching:
                    # NOTE(review): kept the original shape — this appends the
                    # bare list of matching tuples, not a (_label, _words, _t)
                    # triple like the single-candidate branch. Confirm callers
                    # expect this asymmetry before normalizing it.
                    _found.append(_matching)
                else:
                    _errors.append((_label, _words, _t))
            else:
                _not_found.append((_label, _words))
    return _found, _not_found, _errors
def _get_lemma_map(data):
    """Build a token -> lemma map using the external stemming pipeline.

    For every item list in ``data``:
      * tokens containing digits map to themselves,
      * remaining words are lemmatized via prepare_data / preprocess /
        decode_prepare_data, and
      * when the stemmer yields no usable tag ('NaN'), the lemma recorded
        in the source data itself is used as a fallback.

    Args:
        data: mapping whose values are lists of items with ``.token`` and
            ``.lemma`` (schema assumed from usage — confirm against callers).

    Returns:
        dict mapping cleaned lowercase token -> lemma string.
    """
    stopwords = get_polish_stopwords()
    _lemma_map = {}
    for _data_item_list in data.values():
        # Keep tokens longer than one character that are not stopwords.
        _filtered = list(
            filter(
                lambda y: len(strip_string(y.token).lower()) > 1 and
                y.token.lower() not in stopwords, _data_item_list))
        # Fallback lemmas taken directly from the source data.
        _handy_map = {}
        for di in _filtered:
            _handy_map[strip_string(di.token).lower()] = di.lemma.lower()
        _tokens = list(
            filter(lambda y: y and len(y) > 1,
                   map(lambda x: strip_string(x.token).lower(), _filtered)))
        # Numeric tokens are their own lemma.
        _numbers = list(filter(lambda y: y and contains_digit(y), _tokens))
        for _number in _numbers:
            _lemma_map[_number] = _number
        _words = list(filter(lambda y: y and not contains_digit(y), _tokens))
        # NOTE(review): set() deduplicates but its ordering is hash-dependent
        # across runs; the index pairing below still aligns because _text is
        # built from this same list. Confirm determinism is not required.
        _words = list(set(_words))
        _text = [' '.join(_words)]
        _corpuse = prepare_data(_text)
        _corpuse = preprocess(_corpuse, stemmer, parser)
        _corpuse = decode_prepare_data(_corpuse)
        _tuples = []
        # Pair each word with the stemmer output when the tag contains 'sup'
        # or the first two characters agree.
        # BUG FIX: removed a leftover debug statement (`i = 1` under a
        # duplicate 'sup' check) that assigned an unused variable.
        for _i in range(0, min(len(_words), len(_corpuse[0]))):
            if 'sup' in _corpuse[0][_i][1] or _words[_i][0:2] == _corpuse[0][
                    _i][0][0:2]:
                _tuples.append((_words[_i], _corpuse[0][_i]))
        for _word, _lemma in _tuples:
            if _lemma[1] != 'NaN':
                _lemma_map[_word] = _lemma[0]
            else:
                _lemma_map[_word] = _handy_map[_word]
    return _lemma_map
def merge_lists(_list1, _lemma_map, _stopwords, _merged_map):
    """Register every (label, words) pair under all lemma combinations.

    Each word is expanded to its candidate lemmas via ``get_entity``; for up
    to four words, every element of the Cartesian product of the candidate
    lists is keyed into ``_merged_map`` via ``key_by_entity``. For more than
    four words the raw list of candidate lists is passed through as a single
    key, matching the original fallback behaviour.

    Args:
        _list1: iterable of (label, words) pairs.
        _lemma_map: token -> lemma lookup used by ``get_entity``.
        _stopwords: stopword set used by ``get_entity``.
        _merged_map: output map mutated in place via ``key_by_entity``.
    """
    # Local import: only needed for the combinatorial expansion below.
    from itertools import product
    for _label, _words in _list1:
        _words = strip_dangling_keywords(
            list(map(lambda x: strip_string(x).lower(), _words)))
        _lemma_entities = []
        for _word in _words:
            _candidates = get_entity(_word, _lemma_map, _stopwords)
            if _candidates:
                _lemma_entities.append(_candidates)
        if not _lemma_entities:
            continue
        if len(_lemma_entities) <= 4:
            # One key per combination of candidate lemmas. This replaces the
            # four hand-unrolled nested-loop blocks of the original with the
            # equivalent Cartesian product (identical call sequence).
            for _combo in product(*_lemma_entities):
                key_by_entity(list(_combo), _merged_map, _label)
        else:
            # >4 words: keep the original degenerate fallback — the list of
            # candidate lists itself is passed, not its product.
            key_by_entity(_lemma_entities, _merged_map, _label)