Example #1
# Imports this snippet needs; SenseGram, WSD and pbar are assumed to be
# project-local helpers, and word2vec an extended gensim module.
from csv import QUOTE_NONE
from pandas import read_csv

def run(test_file,
        sense,
        context,
        output,
        wsd_method='sim',
        filter_ctx=2,
        lowercase=False,
        ignore_case=False):
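    """Disambiguate each test instance in test_file using the sense and
    context vector models; write predicted sense ids to output."""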

    print("Loading models...")
    vs = SenseGram.load_word2vec_format(sense, binary=True)
    vc = word2vec.Word2Vec.load_word2vec_format(context, binary=True)
    wsd_model = WSD(vs,
                    vc,
                    method=wsd_method,
                    filter_ctx=filter_ctx,
                    ignore_case=ignore_case)

    print("Loading test set...")
    reader = read_csv(test_file,
                      encoding="utf-8",
                      delimiter="\t",
                      dtype={
                          'predict_related': object,
                          'gold_sense_ids': object,
                          'predict_sense_ids': object
                      })
    rows_count = reader.shape[0]
    print("%d test instances" % rows_count)
    pb = pbar.Pbar(rows_count, 100)

    uncovered_words = []  # target words for which sense model has zero senses

    print(("Start prediction over " + test_file))
    pb.start()
    for i, row in reader.iterrows():
        # Form of prediction: (sense, sense_scores)
        ctx = row.context.lower() if lowercase else row.context
        start, end = [int(x) for x in row.target_position.split(',')]

        prediction = wsd_model.dis_text(ctx, row.target, start, end)
        if prediction:
            sense, sense_scores = prediction
            reader.set_value(i, 'predict_sense_ids', sense.split("#")[1])
            #neighbours = wsd_model.vs.most_similar(sense, topn=n_neighbours)
            #neighbours = ["%s:%.3f" % (n.split("#")[0], float(sim)) for n, sim in neighbours]
            #reader.set_value(i, 'predict_related', ",".join(neighbours))
        else:
            uncovered_words.append(row.target)

        pb.update(i)
    pb.finish()

    reader.to_csv(sep='\t',
                  path_or_buf=output,
                  encoding="utf-8",
                  index=False,
                  quoting=QUOTE_NONE)
    print(("Saved predictions to " + output))
Example #2
# Imports this snippet needs; pbar, initialize, read_clusetrs_file,
# parse_cluster, pool_vectors, the debug flag and the word2vec module
# are assumed to be project-local.
import numpy as np

def run(clusters, model, n, output, method='weighted', has_header=True):
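    """Pool the top-n context vectors of each cluster into a word vector
    and save the resulting word model to output in binary format."""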

    print("Loading original context model...")
    contextvec = word2vec.Word2Vec.load_word2vec_format(model, binary=False)
    print("Initializing new word model...")
    wordvec = initialize(clusters, has_header, contextvec.syn0.shape[1])

    print("Pooling cluster vectors (%s method)..." % method)
    reader = read_clusetrs_file(clusters, has_header)

    pb = pbar.Pbar(wordvec.syn0.shape[0], 100)
    pb.start()
    i = 0
    for chunk in reader:
        if debug:
            print("Column types: %s" % chunk.dtypes)
        for j, row in chunk.iterrows():
            row_word = row.word
            row_cluster = row.cluster

            # process new word
            word_cluster = parse_cluster(row_cluster, contextvec)[:n]

            vectors = np.array(
                [contextvec[context] for context, sim in word_cluster])
            sims = np.array([float(sim) for context, sim in word_cluster])
            word_vector = pool_vectors(vectors, sims, method)

            if row_word not in wordvec.vocab:
                wordvec.add_word(row_word, word_vector)

            pb.update(i)
            i += 1
    pb.finish()

    ##### Validation #####
    if wordvec.syn0.shape[0] != len(wordvec.vocab):
        print("Shrinking matrix size from %i to %i" %
              (wordvec.syn0.shape[0], len(wordvec.vocab)))
        wordvec.syn0 = np.ascontiguousarray(wordvec.syn0[:len(wordvec.vocab)])
    print("Sense vectors saved to: " + output)
    wordvec.save_word2vec_format(fname=output, binary=True)
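
A hypothetical call, assuming a text-format context model and a
tab-separated clusters file (paths are placeholders):

# Hypothetical call; n caps how many cluster words are pooled per word.
run("data/clusters.tsv",
    model="model/context_vectors.txt",
    n=20,
    output="model/word_vectors.bin",
    method='weighted',
    has_header=True)
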
Example #3
# Imports this snippet needs; the remaining undefined names (pbar,
# initialize, read_clusetrs_file, parse_cluster, pool_vectors,
# write_inventory, inventory_header, sen_delimiter, debug, word2vec)
# are assumed to be project-local.
from collections import defaultdict
import numpy as np

def run(clusters,
        model,
        output,
        method='weighted',
        lowercase=False,
        inventory=None,
        has_header=True):
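    """Pool the word vectors of each cluster into sense vectors, optionally
    writing a sense inventory, and save the sense model to output."""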

    small_clusters = 0
    sen_count = defaultdict(int)  # number of senses per word
    cluster_sum = defaultdict(int)  # number of cluster words per word

    print("Loading original word model...")
    wordvec = word2vec.Word2Vec.load_word2vec_format(model, binary=True)
    print("Initializing sense model...")
    senvec = initialize(clusters, has_header, wordvec.syn0.shape[1])

    print("Pooling cluster vectors (%s method)..." % method)
    reader = read_clusetrs_file(clusters, has_header)

    pb = pbar.Pbar(senvec.syn0.shape[0], 100)
    pb.start()

    with write_inventory(inventory) as inv_output:
        inv_output.write(inventory_header)
        i = 0
        for chunk in reader:
            if debug:
                print("Column types: %s" % chunk.dtypes)
            for j, row in chunk.iterrows():
                row_word = row.word
                row_cluster = row.cluster

                if lowercase:
                    row_cluster = row_cluster.lower()

                # enumerate word senses from 0
                sen_word = unicode(row_word) + sen_delimiter + unicode(sen_count[row_word])

                # process new sense
                sen_cluster = parse_cluster(row_cluster, wordvec)
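                # skip clusters that yield fewer than five usable words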
                if len(sen_cluster) >= 5:
                    vectors = np.array(
                        [wordvec[word] for word, sim in sen_cluster])
                    sims = np.array([float(sim) for word, sim in sen_cluster])
                    sen_vector = pool_vectors(vectors, sims, method)

                    if sen_word not in senvec.vocab:
                        senvec.add_word(sen_word, sen_vector)
                        # number of cluster words per sense
                        senvec.probs[sen_word] = len(sen_cluster)
                        # number of senses per word
                        sen_count[row_word] += 1
                        # number of cluster words per word
                        cluster_sum[row_word] += len(sen_cluster)

                    # write new sense to sense inventory
                    if inventory:
                        # join back cluster words (only those that were actually used for sense vector)
                        cluster = ",".join(
                            [word + ":" + sim for word, sim in sen_cluster])
                        word_id, sen_id = sen_word.split(sen_delimiter)
                        inv_output.write(u"%s\t%s\t%s\n" %
                                         (word_id, sen_id, cluster))
                else:
                    small_clusters += 1
                    if debug:
                        print("%s\t%s" % (row_word, row.cid))
                        print(sen_cluster)
                pb.update(i)
                i += 1
        senvec.__normalize_probs__(cluster_sum)
        pb.finish()

    ##### Validation #####
    if senvec.syn0.shape[0] != len(senvec.vocab):
        print("Shrinking matrix size from %i to %i" %
              (senvec.syn0.shape[0], len(senvec.vocab)))
        senvec.syn0 = np.ascontiguousarray(senvec.syn0[:len(senvec.vocab)])
    print("Sense vectors saved to: " + output)
    senvec.save_word2vec_format(fname=output, binary=True)
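
As before, a hypothetical call; paths are placeholders, and passing
inventory additionally writes the tab-separated sense inventory:

# Hypothetical call; paths are placeholders.
run("data/clusters.tsv",
    model="model/word_vectors.bin",
    output="model/sense_vectors.bin",
    method='weighted',
    lowercase=False,
    inventory="model/inventory.tsv",
    has_header=True)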