Example #1
def run(test_file, sense, context, sense_dep, context_dep, output, wsd_method='sim', filter_ctx=2, lowercase=False, ignore_case=False):
    
    print("Loading models...")
    vs = SenseGram.load_word2vec_format(sense, binary=True)
    vc = word2vec.Word2Vec.load_word2vec_format(context, binary=True)
    vs_dep = SenseGram.load_word2vec_format(sense_dep, binary=True)
    vc_dep = word2vec.Word2Vec.load_word2vec_format(context_dep, binary=False)
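    # Note: Word2Vec.load_word2vec_format is the pre-1.0 gensim API;
    # newer gensim versions expose this loader on KeyedVectors instead.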
   
    wsd_model = WSDdep(vs, vc, vs_dep, vc_dep, method=wsd_method, filter_ctx=filter_ctx, ignore_case=ignore_case)

    print("Loading test set...")
    reader = read_csv(test_file,
                      encoding="utf-8",
                      delimiter="\t",
                      dtype={'predict_related': object,
                             'gold_sense_ids': object,
                             'predict_sense_ids': object,
                             'deps': object})
    rows_count = reader.shape[0]
    print(str(rows_count) + " test instances")
    pb = pbar.Pbar(rows_count, 100)
    

    uncovered_words = [] # target words for which sense model has zero senses

    print("Start prediction over " + test_file)
    pb.start()
    reader = reader.fillna('')
    for i, row in reader.iterrows():
        ctx = row.context.lower() if lowercase else row.context
        start, end = [int(x) for x in row.target_position.split(',')]
        
        if row.deps == "ParseError" or row.deps == "":
            deps = []
        else: 
            deps = [dep for dep in row.deps.split() if dep in vc.vocab]
            
        prediction = wsd_model.dis(ctx, row.target, start, end, deps)
        
        if prediction:
            sense, sense_scores = prediction
            reader.at[i, 'predict_sense_ids'] = sense.split("#")[1]
            #neighbours = wsd_model.vs.most_similar(sense, topn=n_neighbours)
            #neighbours = ["%s:%.3f" % (n.split("#")[0], float(sim)) for n, sim in neighbours]
            #reader.at[i, 'predict_related'] = ",".join(neighbours)
        else:
            uncovered_words.append(row.target)
            continue
            
        pb.update(i)
    pb.finish()
    
    reader.to_csv(sep='\t', path_or_buf=output, encoding="utf-8", index=False, quoting=QUOTE_NONE)
    print("Saved predictions to " + output)
Example #2
def run(test_file,
        sense,
        context,
        output,
        wsd_method='sim',
        filter_ctx=2,
        lowercase=False,
        ignore_case=False):

    print("Loading models...")
    vs = SenseGram.load_word2vec_format(sense, binary=True)
    vc = word2vec.Word2Vec.load_word2vec_format(context, binary=True)
    wsd_model = WSD(vs,
                    vc,
                    method=wsd_method,
                    filter_ctx=filter_ctx,
                    ignore_case=ignore_case)

    print("Loading test set...")
    reader = read_csv(test_file,
                      encoding="utf-8",
                      delimiter="\t",
                      dtype={
                          'predict_related': object,
                          'gold_sense_ids': object,
                          'predict_sense_ids': object
                      })
    rows_count = reader.shape[0]
    print(str(rows_count) + " test instances")
    pb = pbar.Pbar(rows_count, 100)

    uncovered_words = []  # target words for which sense model has zero senses

    print(("Start prediction over " + test_file))
    pb.start()
    for i, row in reader.iterrows():
        # Form of prediction: (sense, sense_scores)
        ctx = row.context.lower() if lowercase else row.context
        start, end = [int(x) for x in row.target_position.split(',')]

        prediction = wsd_model.dis_text(ctx, row.target, start, end)
        if prediction:
            sense, sense_scores = prediction
            reader.at[i, 'predict_sense_ids'] = sense.split("#")[1]
            #neighbours = wsd_model.vs.most_similar(sense, topn=n_neighbours)
            #neighbours = ["%s:%.3f" % (n.split("#")[0], float(sim)) for n, sim in neighbours]
            #reader.at[i, 'predict_related'] = ",".join(neighbours)
        else:
            uncovered_words.append(row.target)
            continue

        pb.update(i)
    pb.finish()

    reader.to_csv(sep='\t',
                  path_or_buf=output,
                  encoding="utf-8",
                  index=False,
                  quoting=QUOTE_NONE)
    print(("Saved predictions to " + output))
Example #3
def main():
    parser = argparse.ArgumentParser(description='Fill in a test dataset using Random Sense method.')
    parser.add_argument('test_file', help='A path to a test dataset. Format: "context_id<TAB>target<TAB>target_pos<TAB>target_position<TAB>gold_sense_ids<TAB>predict_sense_ids<TAB>golden_related<TAB>predict_related<TAB>context')
    parser.add_argument("senses", help="A path to sense vectors")
    parser.add_argument("output", help="An output path to the filled dataset. Same format as test_file")
    
    args = parser.parse_args()
    
    print("Loading sense model...")
    vs = SenseGram.load_word2vec_format(args.senses, binary=False)
    run(args.test_file, vs, args.output)
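Note that the `run` called here is a different entry point from Examples #1 and #2: it receives the already-loaded sense model `vs` rather than a file path, consistent with filling the dataset by the Random Sense method named in the parser description.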
Example #4
    def build(
            self,
            wv,  # wv: an instance of dense word vectors
            sense_dim_num=10000,  # unused
            save_pkl=True,  # unused
            norm_type="sum",
            weight_type="score",
            max_cluster_words=20):
        """
        Build sense vectors out of word vectors and save them in binary format.
        """

        # initialize the sense vectors model
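        # (wv.vectors.syn0 is the embedding matrix attribute of pre-1.0 gensim)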
        vector_dim = wv.vectors.syn0.shape[1]
        senses_num = self.pcz.get_num_senses()
        sv = SenseGram(size=vector_dim, sorted_vocab=0)
        sv.create_zero_vectors(senses_num, vector_dim)
        sense_count = 0

        # fill the sense vectors model
        for word in self.pcz.data:
            for sense_id in self.pcz.data[word]:
                # try to build sense vector for a word sense
                try:
                    sense_count += 1
                    if sense_count % 10000 == 0:
                        print(sense_count, "senses processed")

                    sense_vector = np.zeros(
                        wv.vectors.syn0[0].shape,
                        dtype=np.float32)  # or the word vector?

                    non_oov = 0
                    for i, cluster_word in enumerate(
                            self.pcz.data[word][sense_id]["cluster"]):
                        if i >= max_cluster_words:
                            break

                        # define the weight
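                        #   "ones": uniform weight; "score": the cluster word's
                        #   similarity score; "rank": reciprocal rank 1/(i+1);
                        #   any other value falls back to the "score" weighting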
                        if weight_type == "ones":
                            weight = 1.0
                        elif weight_type == "score":
                            weight = float(self.pcz.data[word][sense_id]
                                           ["cluster"][cluster_word])
                        elif weight_type == "rank":
                            weight = 1.0 / (i + 1)
                        else:
                            weight = float(self.pcz.data[word][sense_id]
                                           ["cluster"][cluster_word])

                        if weight == 0:
                            print("Warning: zero weight:",
                                  cluster_word,
                                  end=' ')

                        # define the word
                        if cluster_word in wv.vectors.vocab:
                            cw = cluster_word
                        elif cluster_word.split("#")[0] in wv.vectors.vocab:
                            cw = cluster_word.split("#")[0]
                        else:
                            if self.VERBOSE:
                                print("Warning: word is OOV: '%s'" %
                                      (cluster_word),
                                      file=stderr)

                            compounds = cluster_word.split("#")[0].split("_")
                            for cw in compounds:
                                if cw in wv.vectors.vocab and len(cw) > 3:
                                    if self.VERBOSE:
                                        print(
                                            "Warning: adding a compound '{}' of '{}'"
                                            .format(cw, cluster_word))
                                    sense_vector += (weight / len(compounds)
                                                     ) * wv.vectors[cw]
                                    non_oov += 1

                            continue

                        non_oov += 1
                        sense_vector += weight * wv.vectors[cw]

                    if non_oov == 0:
                        if self.VERBOSE:
                            print("Warning: sense is OOV: %s#%s" %
                                  (word, sense_id),
                                  file=stderr)

                    normalizer = self._normalizer(word, sense_id, norm_type,
                                                  weight_type,
                                                  max_cluster_words)
                    sense_vector = sense_vector / normalizer
                    sense_prob = self.pcz.get_sense_prob(word, sense_id)
                    sv.add_sense(word, sense_id, sense_vector, sense_prob)
                except Exception:
                    print("Cannot process sense:", word, sense_id)
                    print(format_exc())

        # serialize the sense vector model
        sv.save_word2vec_format(self.sense_vectors_bin_fpath,
                                fvocab=None,
                                binary=False)

        print("Sense vectors:", self.sense_vectors_bin_fpath)
        print("Created %d sense vectors" % sense_count)

        return sv
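A usage sketch, assuming a `builder` object wired up elsewhere in the project (`builder` and `wv` are assumptions; only `build`'s own signature comes from the example above):

    # Hypothetical objects: builder.pcz must provide data, get_num_senses() and
    # get_sense_prob(), and wv must wrap dense word vectors in a .vectors attribute.
    sv = builder.build(wv, norm_type="sum", weight_type="score",
                       max_cluster_words=20)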
Example #5
    def _load_sense2vector_precomp(self, sense2vector_fpath):
        return SenseGram.load_word2vec_format(sense2vector_fpath)