def _cluster(context_vectors_filename, labeled_dir, n_senses, method,
             print_clusters, **_):
    """ Cluster context vectors for one word and, if hand-labeled contexts
    are available for it, return clustering quality metrics.
    """
    m = load(context_vectors_filename)
    word = m['word']
    # ``method`` is the name of a class defined in the cluster_methods module.
    classifier = getattr(cluster_methods, method)(m, n_senses)
    clusters = classifier.cluster()
    n_contexts = len(m['context_vectors'])
    if print_clusters:
        print()
        print(word)
        weights = load_weights(word)
        centers = classifier._c.centres
        _print_clusters(word, clusters, n_contexts, weights)
        _print_cluster_sim(centers)
        # Map each original cluster id to a merged cluster id, joining
        # clusters whose centers pass the 0.75 similarity threshold.
        mapping = _merge_clusters(centers, 0.75)
        print(mapping)
        merged_clusters = defaultdict(list)
        for sense_id, elements in clusters.items():
            merged_clusters[mapping[sense_id]].extend(elements)
        _print_clusters(word, merged_clusters, n_contexts, weights)
    labeled_filename = os.path.join(labeled_dir, word + '.txt')
    mt = {}
    if os.path.isfile(labeled_filename):
        mt = _get_metrics(word, classifier, labeled_filename)
    return mt
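
# Usage sketch (the paths and the 'KMeans' method name below are
# assumptions -- pass any class name defined in cluster_methods):
#
#     metrics = _cluster('context-vectors/alpha.pkl', 'labeled',
#                        n_senses=6, method='KMeans', print_clusters=True)
#     print(metrics)  # {} if labeled/<word>.txt does not exist
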
def build_context_vectors(contexts_filename, word, out_filename, **_):
    if os.path.isdir(contexts_filename):
        # Batch mode: ``contexts_filename`` is a directory of per-word
        # context files, ``word`` is a file listing one word per line,
        # and ``out_filename`` is the output directory.
        assert os.path.isfile(word)
        assert os.path.isdir(out_filename)
        with open(word, 'r') as f:
            for w in f:
                w = w.strip()
                build_context_vectors(
                    os.path.join(contexts_filename, w + '.txt'),
                    w,
                    os.path.join(out_filename, w + '.pkl'))
    else:
        if os.path.exists(out_filename):
            print('Output file {} exists, skipping'.format(out_filename))
            return
        print(word)
        weights = load_weights(word)
        vectors = get_context_vectors(word, contexts_filename, weights)
        # JSON output needs plain float lists; pickle stores vectors as-is.
        to_json = out_filename.endswith('.json')
        to_lst = (lambda x: list(map(float, x))) if to_json else (lambda x: x)
        vectors = [(ctx, to_lst(v)) for ctx, v in vectors]
        print(len(vectors), 'contexts')
        save({'word': word, 'context_vectors': vectors}, out_filename,
             serializer=json.dump if to_json else None)
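
# Usage sketch (hypothetical file layout). Single-word mode reads one
# contexts file; batch mode takes a contexts directory, a word-list
# file, and an output directory, and saves one .pkl per word:
#
#     build_context_vectors('contexts/alpha.txt', 'alpha',
#                           'context-vectors/alpha.pkl')
#     build_context_vectors('contexts', 'words.txt', 'context-vectors')
#
# A *.json output name switches the vectors to plain float lists so
# that json.dump can serialize them.
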
def build_senses(word, ad_root, out=None):
    """ Build sense vectors for one word and save them in ``out``.
    """
    ad_word_data = get_ad_word(word, ad_root)
    weights = load_weights(word, root=ad_root)
    train_data = get_ad_train_data(word, ad_word_data)
    senses = {s['id']: {'name': s['name'], 'meaning': s['meaning']}
              for s in ad_word_data['meanings']}
    model = SphericalModel(train_data, weights=weights, senses=senses)
    # Not needed after training
    del model.context_vectors
    del model.train_data
    model.save(word, folder=out)
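
# Usage sketch: build and save sense vectors for several words (the
# word list and output folder below are hypothetical):
#
#     for w in ['alpha', 'beta']:
#         build_senses(w, ad_root='.', out='models')
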
def get_ad_centers(word, ad_descr, ad_root='.'):
    """ Build a center vector for each dictionary meaning by summing the
    vectors of its example contexts and normalizing to unit length.
    """
    centers = {}
    weights = load_weights(word, root=ad_root)
    for meaning in ad_descr['meanings']:
        center = None
        for ctx in meaning['contexts']:
            ctx = [w for w in lemmatize_s(ctx.lower()) if word_re.match(w)]
            vector = context_vector(word, ctx, weights=weights)
            if vector is not None:
                if center is None:
                    center = vector
                else:
                    center += vector
        if center is not None:
            centers[meaning['id']] = unitvec(center)
    return centers
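
# Sketch of how the returned centers could be used (an assumption, not
# part of the original module): with unit center vectors, the nearest
# sense for a unit context vector is the one with the largest dot
# product, since for unit vectors that equals cosine similarity.
def _nearest_sense(vector, centers):
    """ Return the sense id whose center is closest to ``vector``
    (both assumed unit-normalized, e.g. with unitvec). """
    import numpy as np  # assumed available
    return max(centers, key=lambda sense_id: np.dot(centers[sense_id], vector))
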
def _get_metrics(word, classifier, labeled_filename):
    __, w_d = get_labeled_ctx(labeled_filename)
    contexts = [lemmatize_s(u' '.join(c)) for c, __ in w_d]
    weights = load_weights(word)
    vectors = [context_vector(word, ctx, weights=weights)
               for ctx in contexts]
    true_labels = [int(ans) for __, ans in w_d]
    pred_labels = classifier.predict(vectors)
    metrics = dict(
        ARI=adjusted_rand_score(true_labels, pred_labels),
        VM=v_measure_score(true_labels, pred_labels),
        oracle_accuracy=_oracle_accuracy(true_labels, pred_labels),
    )
    # Accuracy metrics are defined only for classifiers that carry a
    # mapping from cluster ids to sense labels.
    mapping = getattr(classifier, 'mapping', None)
    if mapping:
        metrics['accuracy'] = _mapping_accuracy(
            true_labels, pred_labels, mapping)
        metrics['max_freq_error'] = _max_freq_error(
            true_labels, pred_labels, mapping)
    return metrics
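
# Usage sketch (hypothetical paths): evaluate a trained classifier on
# hand-labeled contexts and print the scores.
#
#     mt = _get_metrics('alpha', classifier, 'labeled/alpha.txt')
#     print(' '.join('{}={:.3f}'.format(k, v)
#                    for k, v in sorted(mt.items())))
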
def train_model(word, train_data, ad_root, method=None, **model_params):
    weights = None if model_params.pop('no_weights', None) else load_weights(
        word, root=ad_root, lemmatize=model_params.get('lemmatize'))
    model = None
    if train_data:
        # Resolve ``method`` by name: first in the supervised module,
        # then in cluster_methods.
        method = getattr(
            supervised, method, getattr(cluster_methods, method, None))
        if issubclass(method, cluster_methods.Method):
            # Clustering methods are trained on unlabeled contexts and
            # wrapped to expose the supervised model interface.
            context_vectors = get_context_vectors(
                word, os.path.join(ad_root, 'contexts-100k', word + '.txt'),
                weights)
            cluster_model = method(
                dict(word=word, ad_root=ad_root,
                     context_vectors=context_vectors),
                n_senses=12)
            cluster_model.cluster()
            model = SupervisedWrapper(
                cluster_model, weights=weights, **model_params)
        else:
            model = method(train_data, weights=weights, **model_params)
    return model
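
# Usage sketch: ``method`` is resolved by name. 'SphericalModel' (used
# in build_senses above) is one plausible value; whether it lives in
# the supervised module is an assumption here.
#
#     model = train_model('alpha', train_data, ad_root='.',
#                         method='SphericalModel')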