예제 #1
0
def _cluster(context_vectors_filename, labeled_dir, n_senses, method,
             print_clusters, **_):
    """Cluster pre-built context vectors for one word and return metrics.

    Loads the saved context vectors, instantiates the clustering method
    named by ``method`` (looked up on ``cluster_methods``), clusters the
    contexts, optionally prints the raw and merged clusters, and — when a
    labeled contexts file for the word exists in ``labeled_dir`` — returns
    evaluation metrics for the clustering (empty dict otherwise).
    """
    data = load(context_vectors_filename)
    word = data['word']
    classifier = getattr(cluster_methods, method)(data, n_senses)
    clusters = classifier.cluster()
    n_contexts = len(data['context_vectors'])
    if print_clusters:
        print()
        print(word)
        weights = load_weights(word)
        # NOTE(review): reaches into the classifier's private ``_c``
        # attribute for the cluster centres.
        centers = classifier._c.centres
        _print_clusters(word, clusters, n_contexts, weights)
        _print_cluster_sim(centers)
        mapping = _merge_clusters(centers, 0.75)
        print(mapping)
        # Re-group contexts under their merged sense ids and print again.
        merged = defaultdict(list)
        for sense_id, members in clusters.items():
            merged[mapping[sense_id]].extend(members)
        _print_clusters(word, merged, n_contexts, weights)
    metrics = {}
    labeled_filename = os.path.join(labeled_dir, word + '.txt')
    if os.path.isfile(labeled_filename):
        metrics = _get_metrics(word, classifier, labeled_filename)
    return metrics
예제 #2
0
def build_context_vectors(contexts_filename, word, out_filename, **_):
    """Build and save context vectors for ``word``.

    Two modes, chosen by whether ``contexts_filename`` is a directory:

    * file mode: read contexts from ``contexts_filename`` and save the
      vectors to ``out_filename`` (pickle by default, JSON when the name
      ends with ``.json``); skips work if the output already exists.
    * directory mode: ``word`` is then a path to a word-list file and
      ``out_filename`` an output directory — recurse once per listed word,
      reading ``<word>.txt`` and writing ``<word>.pkl``.
    """
    if not os.path.isdir(contexts_filename):
        if os.path.exists(out_filename):
            print('Output file {} exists, skipping'.format(out_filename))
            return
        print(word)
        weights = load_weights(word)
        vectors = get_context_vectors(word, contexts_filename, weights)
        to_json = out_filename.endswith('.json')
        # JSON cannot hold numpy values, so coerce each vector to a
        # plain list of floats when serializing to JSON.
        if to_json:
            convert = lambda v: list(map(float, v))
        else:
            convert = lambda v: v
        vectors = [(ctx, convert(v)) for ctx, v in vectors]
        print(len(vectors), 'contexts')
        save({'word': word, 'context_vectors': vectors},
             out_filename,
             serializer=json.dump if to_json else None)
    else:
        assert os.path.isfile(word)
        assert os.path.isdir(out_filename)
        with open(word, 'r') as f:
            for line in f:
                w = line.strip()
                build_context_vectors(
                    os.path.join(contexts_filename, w + '.txt'), w,
                    os.path.join(out_filename, w + '.pkl'))
예제 #3
0
def build_senses(word, ad_root, out=None):
    """Build sense vectors for one word and save them in ``out``.
    """
    ad_word_data = get_ad_word(word, ad_root)
    weights = load_weights(word, root=ad_root)
    train_data = get_ad_train_data(word, ad_word_data)
    senses = {}
    for sense in ad_word_data['meanings']:
        senses[sense['id']] = {
            'name': sense['name'],
            'meaning': sense['meaning'],
        }
    model = SphericalModel(train_data, weights=weights, senses=senses)
    # Drop training-time attributes before saving: they are not needed
    # for prediction and would only bloat the saved model.
    del model.context_vectors
    del model.train_data
    model.save(word, folder=out)
예제 #4
0
def get_ad_centers(word, ad_descr, ad_root='.'):
    """Return a centre vector per meaning of ``word`` from ``ad_descr``.

    Each meaning's centre is the normalized (unit-length) sum of the
    context vectors of its example contexts; meanings whose contexts
    produce no vectors at all are omitted from the result.
    """
    weights = load_weights(word, root=ad_root)
    centers = {}
    for meaning in ad_descr['meanings']:
        vectors = []
        for ctx in meaning['contexts']:
            # Lemmatize and keep only word-like tokens.
            tokens = [w for w in lemmatize_s(ctx.lower())
                      if word_re.match(w)]
            vector = context_vector(word, tokens, weights=weights)
            if vector is not None:
                vectors.append(vector)
        if vectors:
            centers[meaning['id']] = unitvec(sum(vectors))
    return centers
예제 #5
0
def _get_metrics(word, classifier, labeled_filename):
    """Score ``classifier`` predictions against a labeled contexts file.

    Returns a dict with ARI, V-measure and oracle accuracy; when the
    classifier exposes a non-empty sense ``mapping``, mapped accuracy and
    the max-frequency error are added as well.
    """
    __, labeled = get_labeled_ctx(labeled_filename)
    weights = load_weights(word)
    contexts = [lemmatize_s(u' '.join(parts)) for parts, __ in labeled]
    vectors = [context_vector(word, ctx, weights=weights)
               for ctx in contexts]
    true_labels = [int(answer) for __, answer in labeled]
    pred_labels = classifier.predict(vectors)
    metrics = {
        'ARI': adjusted_rand_score(true_labels, pred_labels),
        'VM': v_measure_score(true_labels, pred_labels),
        'oracle_accuracy': _oracle_accuracy(true_labels, pred_labels),
    }
    mapping = getattr(classifier, 'mapping', None)
    if mapping:
        metrics['accuracy'] = _mapping_accuracy(
            true_labels, pred_labels, mapping)
        metrics['max_freq_error'] = _max_freq_error(
            true_labels, pred_labels, mapping)
    return metrics
예제 #6
0
def train_model(word, train_data, ad_root, method=None, **model_params):
    """Train a sense model for ``word`` with the given ``method``.

    ``method`` is resolved by name, first on the ``supervised`` module and
    then on ``cluster_methods``. Cluster-based methods are trained on raw
    context vectors (read from ``contexts-100k``) and wrapped in a
    ``SupervisedWrapper``; supervised methods are trained on
    ``train_data`` directly. Returns ``None`` when ``train_data`` is
    empty/falsy.

    Recognized ``model_params``: ``no_weights`` (skip loading frequency
    weights) and ``lemmatize`` (forwarded to ``load_weights``); the rest
    are passed through to the model constructor.

    Raises ``ValueError`` if ``method`` names neither a supervised nor a
    clustering method.
    """
    weights = None if model_params.pop('no_weights', None) else load_weights(
        word, root=ad_root, lemmatize=model_params.get('lemmatize'))
    model = None
    if train_data:
        method_cls = getattr(supervised, method,
                             getattr(cluster_methods, method, None))
        if method_cls is None:
            # Fail with a clear message instead of the opaque TypeError
            # that issubclass(None, ...) would otherwise raise.
            raise ValueError('Unknown method: {!r}'.format(method))
        if issubclass(method_cls, cluster_methods.Method):
            context_vectors = get_context_vectors(
                word, os.path.join(ad_root, 'contexts-100k', word + '.txt'),
                weights)
            cluster_model = method_cls(dict(word=word,
                                            ad_root=ad_root,
                                            context_vectors=context_vectors),
                                       n_senses=12)
            cluster_model.cluster()
            model = SupervisedWrapper(cluster_model,
                                      weights=weights,
                                      **model_params)
        else:
            model = method_cls(train_data, weights=weights, **model_params)
    return model