def cluster(self):
    clusters = super(ADMappingMixin, self).cluster()
    word = self.m['word']
    ad_descr = get_ad_word(word, self.m['ad_root'])
    ad_centers = get_ad_centers(word, ad_descr, self.m['ad_root'])
    self.mapping = {}
    # Map each cluster to the AD sense whose center is closest to it.
    for ci, center in enumerate(self._c.centres):
        self.mapping[ci] = max(
            ((int(mid), v_closeness(center, m_center))
             for mid, m_center in ad_centers.items()),
            key=itemgetter(1))[0]
    return clusters
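# A minimal sketch of the mapping step above, assuming v_closeness is cosine
# similarity (an assumption: the real v_closeness is defined elsewhere in
# this codebase). Each cluster center gets the id of the closest AD sense
# center; map_clusters_to_senses is a hypothetical name for illustration.
import numpy as np

def map_clusters_to_senses(cluster_centers, ad_centers):
    # ad_centers: {sense_id: vector}; returns {cluster_index: sense_id}.
    def cos(a, b):
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
    return {ci: max(ad_centers, key=lambda mid: cos(center, ad_centers[mid]))
            for ci, center in enumerate(cluster_centers)}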
def get(self, ctx_path, word):
    ctx = self.load(word, ctx_path)
    contexts = ctx['contexts']
    parsed = get_ad_word(word, self.ad_root)
    sense_by_id = {m['id']: m for m in parsed['meanings']}
    counts = Counter(ans for _, ans in contexts)
    self.render(
        'templates/word.html',
        word=parsed['word'],
        senses=sorted(
            (sid, sense_by_id[sid], count / len(contexts))
            for sid, count in counts.items()),
        contexts=contexts)
def build_senses(word, ad_root, out=None):
    """ Build sense vectors for one word and save them in ``out``.
    """
    ad_word_data = get_ad_word(word, ad_root)
    weights = load_weights(word, root=ad_root)
    train_data = get_ad_train_data(word, ad_word_data)
    senses = {s['id']: {'name': s['name'], 'meaning': s['meaning']}
              for s in ad_word_data['meanings']}
    model = SphericalModel(train_data, weights=weights, senses=senses)
    # Not needed after training
    del model.context_vectors
    del model.train_data
    model.save(word, folder=out)
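# Hypothetical usage of build_senses (the word and directory names are
# illustrative, not from the repo): build and save sense vectors for a
# single word under models/.
build_senses('пример', ad_root='ad-data/', out='models/')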
def cluster(self):
    word = self.m['word']
    ad_descr = get_ad_word(word)
    ad_centers = get_ad_centers(word, ad_descr)
    self.mapping = {
        i: int(meaning['id'])
        for i, meaning in enumerate(ad_descr['meanings'])}
    # Note that the clusters can drift to quite different positions.
    centers = np.array([ad_centers[m['id']] for m in ad_descr['meanings']])
    self._c = kmeans.KMeans(
        self.features, centres=centers, metric='cosine', verbose=0)
    return self._cluster()
def run_on_word(ctx_filename, ctx_dir, ad_root, **params):
    max_contexts = params.get('max_contexts')
    min_contexts = params.get('min_contexts')
    word = ctx_filename.split('.')[0]
    if word[-1].isdigit():
        return
    result_filename = os.path.join(ctx_dir, word + '.json')
    if os.path.exists(result_filename):
        print(result_filename, 'already exists, skipping', file=sys.stderr)
        return True
    with open(os.path.join(ctx_dir, ctx_filename), 'r') as f:
        contexts = [line.split('\t') for line in f]
    if max_contexts and len(contexts) > max_contexts:
        contexts = random.sample(contexts, max_contexts)
    elif not contexts or (min_contexts and len(contexts) < min_contexts):
        return
    ad_word_data = get_ad_word(word, ad_root)
    if ad_word_data is None:
        return
    train_data = get_ad_train_data(word, ad_word_data)
    model = train_model(word, train_data, ad_root, **params)
    if model is None:
        return
    result = []
    confidences = []
    for x in contexts:
        model_ans, confidence = model(x, with_confidence=True)
        result.append((x, model_ans))
        confidences.append(confidence)
    with open(result_filename, 'w') as f:
        json.dump({
            'word': word,
            'contexts': result,
            'estimate': get_accuracy_estimate(
                confidences, model.confidence_threshold),
        }, f, ensure_ascii=False)
    return True
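# A hypothetical driver loop (not from the repo): apply run_on_word to every
# context file in a directory, collecting the words that were processed.
# The .txt suffix for context files is an assumption based on the other
# scripts in this section.
import os

def run_all(ctx_dir, ad_root, **params):
    done = []
    for ctx_filename in sorted(os.listdir(ctx_dir)):
        if not ctx_filename.endswith('.txt'):
            continue
        if run_on_word(ctx_filename, ctx_dir, ad_root, **params):
            done.append(ctx_filename.split('.')[0])
    return done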
def get(self, pos):
    name_re = re.compile(r'(\w.*?)\d?\.json', re.U)
    only = self.get_argument('only', None)
    if only:
        words = only.split(',')
    else:
        words = {
            m.groups()[0] for m in (
                name_re.match(filename)
                for filename in os.listdir(os.path.join(self.ad_root, 'ad')))
            if m is not None}
    words_info = []
    # Russian POS labels: ГЛАГ = verb, СУЩ = noun.
    only_pos = {'ГЛАГ': 'v', 'СУЩ': 's'}[pos]
    ipm = load_ipm(self.ad_root, only_pos=only_pos)
    for w in sorted(words):
        w_info = get_ad_word(w, self.ad_root, with_contexts=False)
        if (w_info is not None and w_info['pos'] == pos
                and 2 <= len(w_info['meanings']) <= 10):
            w_info['ipm'] = ipm.get(w_info['word'].lower())
            words_info.append(w_info)
    self.render('templates/pos_list.html', pos=pos, words_info=words_info)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('ad_root')
    parser.add_argument('contexts_root')
    parser.add_argument('word')
    parser.add_argument('--n', type=int, default=100)
    args = parser.parse_args()
    w_info = get_ad_word(args.word, args.ad_root, with_contexts=False)
    with open(os.path.join(args.contexts_root, args.word + '.txt'), 'r') as f:
        contexts = list(f)
    random.seed(1)
    random.shuffle(contexts)
    contexts = contexts[:args.n]
    # Extra pseudo-senses for annotation: 'Другое' ("Other") and
    # 'Не могу определить' ("Cannot determine").
    for m in w_info['meanings'] + [
            dict(name='Другое', id=len(w_info['meanings']) + 1),
            dict(name='Не могу определить', id=0)]:
        print('\t\t%s: %s\t\t%s' % (m['name'], m.get('meaning', ''), m['id']))
    for ctx in contexts:
        print(ctx, end='')
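# Hypothetical invocation (the script name and word are illustrative): print
# the sense inventory plus a random sample of up to 100 contexts for manual
# annotation, e.g.
#   python print_annotation_sample.py ad-data/ contexts/ пример --n 100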
def summary(ad_root, ctx_dir):
    all_freqs = {}
    word_ipm = load_ipm(ad_root)
    for filename in os.listdir(ctx_dir):
        if not filename.endswith('.json') or filename == 'summary.json':
            continue
        with open(os.path.join(ctx_dir, filename), 'r') as f:
            result = json.load(f)
        word = result['word']
        w_meta = get_ad_word(word, ad_root)
        meaning_by_id = {m['id']: m['meaning'] for m in w_meta['meanings']}
        counts = Counter(ans for _, ans in result['contexts'])
        all_freqs[word] = {
            'senses': {
                ans: dict(meaning=meaning_by_id[ans],
                          freq=cnt / len(result['contexts']))
                for ans, cnt in counts.items()},
            'estimate': result.get('estimate'),
            'is_homonym': w_meta.get('is_homonym', False),
            'ipm': word_ipm.get(word, 0.0),
        }
    with open(os.path.join(ctx_dir, 'summary.json'), 'w') as f:
        json.dump(all_freqs, f, ensure_ascii=False)
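# A hypothetical consumer of summary.json (its structure is inferred from
# the summary function above; print_summary is an illustrative name): print
# per-word sense frequencies as tab-separated values.
import json
import os

def print_summary(ctx_dir):
    with open(os.path.join(ctx_dir, 'summary.json')) as f:
        all_freqs = json.load(f)
    for word, info in sorted(all_freqs.items()):
        for sense_id, sense in sorted(info['senses'].items()):
            print('{}\t{}\t{:.2f}'.format(word, sense_id, sense['freq']))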
def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('ad_root')
    arg('contexts_root', type=Path)
    arg('words', type=Path)
    arg('output')
    arg('--limit', type=int, default=100)
    args = parser.parse_args()
    wb = Workbook()
    right_align = Alignment(horizontal='right')
    center_align = Alignment(horizontal='center')
    words = [l.strip()
             for l in args.words.read_text(encoding='utf8').split('\n')
             if l.strip()]
    for i, word in enumerate(words):
        contexts_path = args.contexts_root / '{}.txt'.format(word)
        if not contexts_path.exists():
            print('Contexts for word "{}" not found, skipping'.format(word))
            continue
        contexts = [
            l.split('\t')
            for l in contexts_path.read_text(encoding='utf8').split('\n')]
        contexts = [ctx for ctx in contexts if len(ctx) == 3]
        if not contexts:
            print('No contexts for word "{}", skipping'.format(word))
            continue
        if len(contexts) > args.limit:
            random.seed(42)
            contexts = random.sample(contexts, args.limit)
        else:
            print('Warning: only {} contexts for word "{}"'.format(
                len(contexts), word))
        ad_word = get_ad_word(word, args.ad_root, with_contexts=False)
        if not ad_word:
            print('Word "{}" not found in AD'.format(word))
            continue
        if i == 0:
            ws = wb.active
            ws.title = word
        else:
            ws = wb.create_sheet(word)
        # Sense list with ids, then 'Другое' ("Other") and
        # 'Не могу определить' ("Cannot determine") pseudo-senses.
        for row, m in enumerate(ad_word['meanings'], 1):
            ws.cell(row=row, column=3, value='{name}: {meaning}'.format(**m))
            ws.cell(row=row, column=4, value=row)
        n_senses = len(ad_word['meanings'])
        ws.cell(row=n_senses + 1, column=3, value='Другое:')
        ws.cell(row=n_senses + 1, column=4, value=n_senses + 1)
        ws.cell(row=n_senses + 2, column=3, value='Не могу определить:')
        ws.cell(row=n_senses + 2, column=4, value=0)
        for row, (left, center, right) in enumerate(contexts, n_senses + 3):
            ws.cell(row=row, column=1, value=left).alignment = right_align
            ws.cell(row=row, column=2, value=center).alignment = center_align
            ws.cell(row=row, column=3, value=right)
            ws.cell(row=row, column=4, value='-').alignment = right_align
        ws.column_dimensions['A'].width = 80
        ws.column_dimensions['B'].width = \
            2 + max(len(center) for _, center, _ in contexts)
        ws.column_dimensions['C'].width = 80
    wb.save(args.output)
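# Hypothetical invocation (the script name is illustrative): build an .xlsx
# annotation workbook with one sheet per word, sense ids in column D, and
# contexts split into left/center/right columns, e.g.
#   python export_annotation_sheets.py ad-data/ contexts/ words.txt out.xlsx --limit 100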
def evaluate_word(word, ad_root, labeled_root, print_errors=False,
                  tsne=False, coarse=False, alt_root=None, alt_senses=False,
                  **model_params):
    word_path = labeled_root.joinpath(word + '.json')
    if not word_path.exists():
        word_path = labeled_root.joinpath(word + '.txt')
    senses, test_data = get_labeled_ctx(str(word_path))
    ad_word_data = get_ad_word(word, ad_root)
    if not ad_word_data:
        print(word, 'no AD data', sep='\t')
        return
    ad_senses = {str(i): m['name']
                 for i, m in enumerate(ad_word_data['meanings'], 1)}
    if set(ad_senses) != set(senses):
        print(word, 'AD/labeled sense mismatch', sep='\t')
        return
    train_data = get_ad_train_data(word, ad_word_data)
    if alt_root:
        senses, test_data, train_data = get_alt_senses_test_train_data(
            alt_root=alt_root, word=word, test_data=test_data,
            train_data=train_data, alt_senses=alt_senses)
    if coarse:
        # Merge fine-grained senses into coarse groups and relabel both sets.
        sense_mapping = get_coarse_sense_mapping(ad_senses)
        inverse_mapping = defaultdict(list)
        for old_id, new_id in sense_mapping.items():
            inverse_mapping[new_id].append(old_id)
        senses = {new_id: '; '.join(senses[old_id] for old_id in old_ids)
                  for new_id, old_ids in inverse_mapping.items()}
        train_data = [(ctx, sense_mapping[old_id])
                      for ctx, old_id in train_data]
        test_data = [(ctx, sense_mapping[old_id])
                     for ctx, old_id in test_data]
    mfs_baseline = get_mfs_baseline(train_data, test_data)
    fs_baseline = get_fs_baseline(test_data)
    random_baseline = 1 / len(senses)
    model = train_model(word, train_data, ad_root, **model_params)
    if not model:
        print(word, 'no model', sep='\t')
        return
    test_accuracy, max_freq_error, js_div, estimate, answers = \
        evaluate(model, test_data)
    if tsne:
        show_tsne(model, answers, senses, word)
        # train_data = get_ad_train_data(word, ad_word_data)
        # show_tsne(model, [(x, ans, ans) for x, ans in train_data],
        #           senses, word)
    if print_errors:
        _print_errors(test_accuracy, answers, ad_word_data, senses)
    examples_per_sense = len(train_data) / len(senses)
    words_per_sense = sum(
        len(tokenize_s(left) + tokenize_s(right))
        for (left, _, right), _ in train_data) / len(senses)
    return (len(senses), mfs_baseline, fs_baseline, random_baseline,
            model.get_train_accuracy(verbose=False), test_accuracy,
            max_freq_error, examples_per_sense, words_per_sense,
            js_div, estimate)
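# A hypothetical evaluation loop over several words (evaluate_all and the
# argument names are illustrative). evaluate_word returns None when data or
# a model is missing, otherwise the metrics tuple documented by its return
# statement; labeled_root must be a Path since joinpath is called on it.
from pathlib import Path

def evaluate_all(words, ad_root, labeled_root, **model_params):
    rows = []
    for word in words:
        metrics = evaluate_word(word, ad_root, Path(labeled_root),
                                **model_params)
        if metrics is not None:
            rows.append((word,) + metrics)
    return rows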