Example #1
 def __init__(self):
     self.db = MySQLdb.connect(host="127.0.0.1",
                               user="******",
                               passwd="wmmkscsie",
                               db="recommender_system",
                               charset="utf8")
     self.cursor = self.db.cursor()
     # sql = "SELECT a.relationship_type, a.scenario_type, b.id, b.scenario_e2v_bert FROM movies as a, movies_vector as b Where a.id=b.id and a.id >= 1 and a.id <= 1171 and b.scenario_e2v_bert !=''"
     sql = "SELECT a.relationship_type, a.scenario_type, b.id, b.scenario_e2v_w2v_sg FROM movies as a, movies_vector as b Where a.id=b.id and a.id >= 1 and a.id <= 1171 and b.scenario_e2v_w2v_sg !=''"
     print(sql)
     self.cursor.execute(sql)
     self.movies_information = self.cursor.fetchall()
     # Relationship Model
     #######################
     # self.model = CNN_E2V_BERT()
     # For Produce Vector
     # self.bert_embedding = BertEmbedding(model = 'bert_12_768_12', dataset_name='wiki_cn', max_seq_length = 50)
     # self.relationship_e2v_bert = []
     # self.scenario_e2v_bert = []
     #######################
     self.model = CNN_E2V_W2V_SG()
     # create a word2vec object
     self.t = Word2Vec()
     self.t.train_file_setting("segmentation.txt", "e2v_w2v_sg")
     self.t.load_model()
     self.dimension = self.t.size
     self.relationship_e2v_w2v_sg = []
     self.scenario_e2v_w2v_sg = []
Example #2
def main():
    tagged_words = brown.tagged_words()
    words_corpus = brown.words()

    word2vec = Word2Vec()
    word2vec.train(words_corpus)

    word_vecs = [word2vec.word2vec(word) for word in words_corpus]

    n_clusters = 10 # random number for now
    kmeans = KMeans(n_clusters)
    kmeans.compute(word_vecs)

    # word-cluster HMM
    p_word = {}
    p_cluster = {}

    p_cluster_given_word = None # softmax
    p_word_given_cluster = None # joint probability formula

    p_transition_cluster = None # count
    p_initial_cluster = None # count

    # cluster-tag HMM
    p_cluster_given_tag = None # softmax
    p_transition_tag = None # count from tagged data
    p_initial_tag = None # count from tagged data

    hmm_word_cluster = HMM(p_initial_cluster, p_transition_cluster, p_word_given_cluster)
    hmm_cluster_tag = HMM(p_initial_tag, p_transition_tag, p_cluster_given_tag)

    words = []
    clusters = hmm_word_cluster.viterbi(words)
    tags = hmm_cluster_tag.viterbi(clusters)
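
The count-based placeholders above are left as None; below is a minimal sketch of how the cluster-level counts could be turned into probabilities, assuming cluster_sequences is a list of cluster-id sequences (one per sentence). The helper name and its arguments are illustrative and not part of the original example.

from collections import Counter, defaultdict

def estimate_cluster_probs(cluster_sequences, n_clusters):
    # count-based estimates for p_initial_cluster and p_transition_cluster
    initial = Counter()
    transitions = defaultdict(Counter)
    for seq in cluster_sequences:
        if not seq:
            continue
        initial[seq[0]] += 1
        for prev, curr in zip(seq, seq[1:]):
            transitions[prev][curr] += 1
    total = sum(initial.values()) or 1
    p_initial = {c: initial[c] / total for c in range(n_clusters)}
    p_transition = {c: {d: transitions[c][d] / (sum(transitions[c].values()) or 1)
                        for d in range(n_clusters)}
                    for c in range(n_clusters)}
    return p_initial, p_transition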
Example #3
    def test_cbow(self):
        cbow = Word2Vec(learning_rate=self.learning_rate)
        W1_m, W2_m, loss_m = cbow.cbow(np.asmatrix(self.context_words), np.asmatrix(self.center_word), self.W1, self.W2, 0.)

        with tf.name_scope("cbow"):
            x = tf.placeholder(shape=[self.V, len(self.context_words)], dtype=tf.float32, name="x")
            W1_tf = tf.Variable(self.W1, dtype=tf.float32)
            W2_tf = tf.Variable(self.W2, dtype=tf.float32)
            hh = [tf.matmul(tf.transpose(W1_tf), tf.reshape(x[:, i], [self.V, 1]))
                  for i in range(len(self.context_words))]
            h = tf.reduce_mean(tf.stack(hh), axis=0)
            u = tf.matmul(tf.transpose(W2_tf), h)
            loss_tf = -u[int(np.where(self.center_word == 1)[0])] + tf.log(tf.reduce_sum(tf.exp(u), axis=0))
            grad_W1, grad_W2 = tf.gradients(loss_tf, [W1_tf, W2_tf])

        init = tf.global_variables_initializer()
        with tf.Session() as sess:
            sess.run(init)
            W1_tf, W2_tf, loss_tf, dW1_tf, dW2_tf = sess.run([W1_tf, W2_tf, loss_tf, grad_W1, grad_W2],
                                                             feed_dict={x: self.context_words.T})

        W1_tf -= self.learning_rate * dW1_tf
        W2_tf -= self.learning_rate * dW2_tf

        for i in range(self.V):
            for j in range(self.N):
                self.assertAlmostEqual(W1_m[i, j], W1_tf[i, j], places=5)

        for i in range(self.N):
            for j in range(self.V):
                self.assertAlmostEqual(W2_m[i, j], W2_tf[i, j], places=5)

        self.assertAlmostEqual(loss_m, float(loss_tf), places=5)
Example #4
    def test_skipgram(self):
        skipgram = Word2Vec(learning_rate=self.learning_rate)
        W1_m, W2_m, loss_m = skipgram.skipgram(np.asmatrix(self.context_words), np.asmatrix(self.center_word), self.W1, self.W2, 0.)

        with tf.name_scope("skipgram"):
            x = tf.placeholder(shape=[self.V, 1], dtype=tf.float32, name="x")
            W1_tf = tf.Variable(self.W1, dtype=tf.float32)
            W2_tf = tf.Variable(self.W2, dtype=tf.float32)
            h = tf.matmul(tf.transpose(W1_tf), x)
            u = tf.stack([tf.matmul(tf.transpose(W2_tf), h) for i in range(len(self.context_words))])
            loss_tf = -tf.reduce_sum([u[i][int(np.where(c == 1)[0])]
                                      for i, c in zip(range(len(self.context_words)), self.context_words)], axis=0)\
                      + tf.reduce_sum(tf.log(tf.reduce_sum(tf.exp(u), axis=1)), axis=0)

            grad_W1, grad_W2 = tf.gradients(loss_tf, [W1_tf, W2_tf])

        init = tf.global_variables_initializer()
        with tf.Session() as sess:
            sess.run(init)
            W1_tf, W2_tf, loss_tf, dW1_tf, dW2_tf = sess.run([W1_tf, W2_tf, loss_tf, grad_W1, grad_W2],
                                                             feed_dict={x: self.center_word.reshape(self.V, 1)})

        W1_tf -= self.learning_rate * dW1_tf
        W2_tf -= self.learning_rate * dW2_tf

        for i in range(self.V):
            for j in range(self.N):
                self.assertAlmostEqual(W1_m[i, j], W1_tf[i, j], places=5)

        for i in range(self.N):
            for j in range(self.V):
                self.assertAlmostEqual(W2_m[i, j], W2_tf[i, j], places=5)

        self.assertAlmostEqual(loss_m, float(loss_tf), places=5)
Example #5
def main(input,
         output,
         iter=5,
         size=128,
         worker=4,
         batch_nodes=10000,
         negative=5,
         sample=1e-4,
         output_format="gensim"):

    # load karate graph in csr matrix
    RWG = RandomWalksGeneratorCSR(path=input)
    # init model
    skipgram = Word2Vec(sg=1,
                        iter=iter,
                        min_count=0,
                        size=size,
                        workers=worker,
                        batch_words=batch_nodes,
                        sample=sample,
                        negative=negative)
    # build vocab
    skipgram.build_vocab(RWG)
    # learn embedding
    skipgram.train(RWG)
    if output_format == "gensim":
        skipgram.save(output)
    elif output_format == "txt":
        skipgram.save_word2vec_format(output)
Example #6
    def load_embeddings(self):

        # check if embeddings saved in cache
        if os.path.exists(R.EMB.format(self.dim)):
            # read from cache; return
            return pickle.load(open(R.EMB.format(self.dim), 'rb'))

        # read model from word2vec
        model = Word2Vec(self.dim).get_model()
        embeddings = []
        for w in self._vocab:
            # if word in model
            if w in model:
                emb = model[w]
            # else check if lower-case of w in model
            elif w.lower() in model:
                emb = model[w.lower()]
            # return zero vector
            else:
                emb = np.zeros(self.dim)
            # keep track of embedding
            embeddings.append(emb)
        # np.array
        embeddings = np.stack(embeddings)
        # attach to self
        self.emb = embeddings
        # write to cache
        pickle.dump(self.emb, open(R.EMB.format(self.dim), 'wb'))
        # make sure vocab size == num of embeddings
        assert self.vocab_size() == self.emb.shape[0]

        return self.emb
Example #7
def process(args):
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    else:
        raise Exception(
            "unknown file format: '%s'. valid formats: 'adjlist', 'edgelist'" %
            args.format)

    print("number of nodes: {}".format(len(G.nodes())))  # .format 格式化字符串(取代{})

    num_walks = len(G.nodes()) * args.number_walks  # multiple walks per node
    print("number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length
    print("data size (walk*length): {}".format(data_size))

    print("walking...")
    walk_file = walks.write_walks_to_disk(G,
                                          args.output,
                                          num_paths=args.number_walks,
                                          path_length=args.walk_length,
                                          alpha=0,
                                          rand=random.Random(args.seed))
    model = Word2Vec(walk_file,
                     args.output,
                     emb_dimension=args.representation_size,
                     window_size=args.window_size,
                     min_count=0)
    print("Training...")

    model.skip_gram_train()
Example #8
def trainWord2Vec(doc_list=None,
                  buildvoc=1,
                  passes=10,
                  sg=1,
                  size=100,
                  dm_mean=0,
                  window=5,
                  hs=0,
                  negative=5,
                  min_count=1,
                  workers=1):
    model = Word2Vec(size=size,
                     sg=sg,
                     window=window,
                     hs=hs,
                     negative=negative,
                     min_count=min_count,
                     workers=workers,
                     compute_loss=True)

    if buildvoc == 1:
        print('Building Vocabulary')
        model.build_vocab(doc_list)  # build vocabulary with words + nodeID

    for epoch in range(passes):
        print('Iteration %d ....' % epoch)
        # shuffle(doc_list)  # shuffling gets best results

        model.train(doc_list, total_examples=len(doc_list), epochs=1)
        print(model.running_training_loss)

    print(model.sg, model.window, model.hs, model.min_count)
    print('batch words', model.batch_words)
    return model
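
A minimal usage sketch for the helper above; the toy documents and the gensim wv.most_similar call are assumptions for illustration, not part of the original snippet.

docs = [["node_1", "graph", "embedding"], ["node_2", "random", "walk", "graph"]]
model = trainWord2Vec(doc_list=docs, passes=2, size=50, workers=1)
print(model.wv.most_similar("graph", topn=2))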
Example #9
def test_word2vec():
    data = [
        'Merge multiple sorted inputs into a single sorted output',
        'The API below differs from textbook heap algorithms in two aspects'
    ]
    wv = Word2Vec(vec_len=50)
    wv.train(data, model='cbow')
    print(wv['into'])
Example #10
 def key_words(self, string, top_number=10):
     # use the pretrained '5.9m' model
     model_name = '5.9m'
     model_download = W2VModelDownload(bq_project)
     model_download.download_w2v_model('patent_landscapes', model_name)
     word2vec5_9m = Word2Vec('5.9m')
     w2v_runtime = word2vec5_9m.restore_runtime()
     return w2v_runtime.find_similar(string, top_number)
Example #11
    def __init__(self,
                 sentences,
                 model_file=None,
                 size=200,
                 alpha=0.025,
                 window=5,
                 min_count=5,
                 sample=0,
                 seed=1,
                 workers=16,
                 min_alpha=0.0001,
                 model="cb",
                 hs=1,
                 negative=0,
                 cbow_mean=0,
                 iteration=1,
                 word_learn=1,
                 init_adjust=True,
                 update_mode=0,
                 normalize_each_epoch=False):
        self.sg = 1 if model == "sg" or model == "dbow" else 0
        self.table = None  # for negative sampling --> this needs a lot of RAM! consider setting back to None before saving
        self.alpha = float(alpha)
        self.window = int(window)
        self.seed = seed
        self.sample = sample
        self.workers = workers
        self.min_alpha = min_alpha
        self.hs = hs
        self.negative = negative
        self.cbow_mean = int(cbow_mean)
        self.iteration = iteration
        self.word_learn = int(word_learn)
        self.layer1_size = size
        self.min_count = min_count
        self.sent_no_hash = {}  # mapping sent_id to index of self.sents
        self.sent_id_list = []  # mapping sent_no to sent_id
        self.sane_vec_len = 100000  # for sanity check
        self.sane_max_sim10 = 0.9  # for sanity check
        self.init_adjust = init_adjust  # for adjustment of initialization
        self.update_mode = update_mode  # 0: SGD, 1: AdaGrad, 2: AdaDelta (3: ADAM not implemented)
        self.normalize_each_epoch = normalize_each_epoch

        if sentences:
            if model_file:
                self.w2v = Word2Vec.load(model_file)
                self.vocab = self.w2v.vocab
                self.layer1_size = self.w2v.layer1_size
                self.build_vec(sentences, has_vocab=True)
            else:
                self.word_learn = 1
                self.w2v = Word2Vec(None, self.layer1_size, self.alpha,
                                    self.window, self.min_count, self.sample,
                                    self.seed, self.workers, self.min_alpha,
                                    self.sg, self.hs, self.negative,
                                    self.cbow_mean)
                self.build_vec(sentences, has_vocab=False)
            self.train_iteration(sentences, iteration=iteration)
Example #12
def initialise_model(data):
    input_file = 'test.txt'
    with open(input_file, 'w') as f:
        f.write(get_all_text(data))
    model = Word2Vec(LineSentence(input_file), size=100, window=5, sg=0, min_count=1, workers=8)
    model.save(input_file + '.model')
    model.save_word2vec_format(input_file + '.vec')
Example #13
def train_model(window_size, embedding_dim, batch_size_word2vec):
    file_to_save_trained_data = '../../results/word2vec/ver6/ws-' + str(
        window_size) + '-embed-' + str(embedding_dim) + 'batch_size-' + str(
            batch_size_word2vec) + '.pkl'
    word2vec = Word2Vec(window_size=window_size,
                        epoch_word2vec=epoch_word2vec,
                        embedding_dim=embedding_dim,
                        batch_size_word2vec=batch_size_word2vec,
                        file_to_save_trained_data=file_to_save_trained_data)
    vectors, word2int, int2word = word2vec.train()
Example #14
def main(args: argparse.Namespace) -> None:
    """Main entrypoint for the script."""
    # Make sure we have at least one file.
    if len(args.filenames) == 0:
        logger.error('At least one text file is required!')
        exit(1)

    set_seed(args.seed)
    tokenizer = Tokenizer(max_tokens=args.max_tokens,
                          min_word_frequency=args.min_word_frequency,
                          sample_threshold=args.sample_threshold)

    start_time = time.time()
    logger.info('Building vocabulary from corpus...')

    tokenizer.build(filenames=args.filenames)

    logger.info('Finished building vocabulary (took {:.2f} seconds)'.format(
        time.time() - start_time))

    dataset = make_dataset(args.filenames,
                           tokenizer,
                           window_size=args.window_size,
                           batch_size=args.batch_size,
                           epochs=args.epochs)
    model = Word2Vec(tokenizer,
                     hidden_size=args.hidden_size,
                     batch_size=args.batch_size,
                     n_negative_samples=args.n_negative_samples,
                     lambda_power=args.lambda_power,
                     bias=args.bias)

    # Create output directory
    args.output_dir.mkdir(parents=True, exist_ok=True)

    run_name = args.run_name or args.filenames[0].stem
    logdir = args.output_dir / get_next_run_id(args.output_dir, run_name)

    logger.info('Starting training (for {} epochs).'.format(args.epochs))
    model.train(dataset, logdir, args.initial_lr, args.target_lr,
                args.log_freq, args.save_freq)

    # Save embeddings and vocab
    #
    # The weights of the projection layer are components of the
    # embedding vectors. The i-th row of the weight matrix is the
    # embedding vector for the word whose encoded index is i.
    proj = model.weights[0].numpy()
    np.save(logdir / 'proj_weights', proj)
    # Save the tokenizer state
    tokenizer.save(logdir / 'tokenizer.json')
    # Save a list of the vocabulary words
    with open(logdir / 'vocab.txt', 'w') as file:
        for word in tokenizer.words:
            file.write(f'{word}\n')
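
As the comment above notes, row i of the saved projection matrix is the embedding of the word whose encoded index is i. A minimal look-up sketch, assuming the order of tokenizer.words matches those indices; the run directory path is hypothetical.

import numpy as np
from pathlib import Path

logdir = Path('runs/example-run')  # hypothetical output directory from a previous run
proj = np.load(logdir / 'proj_weights.npy')
with open(logdir / 'vocab.txt') as f:
    vocab = [line.strip() for line in f]
word_to_index = {w: i for i, w in enumerate(vocab)}
vector = proj[word_to_index[vocab[0]]]  # embedding of the first vocabulary word
print(vector.shape)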
Example #15
    def __init__(self, dataset_file=None, cv_folds=10):
        """
        :param embeddings_file: path to the embeddings file.
        :param dataset_file: path to a labeled dataset file.
        :param cv_folds: int, number of folds for cross validation
        """

        self.dataset_file = dataset_file
        self.cv_folds = cv_folds

        # read dataset
        dataset = pd.read_csv(self.dataset_file)
        text = dataset['tweet']
        self.Y = dataset['label']

        # Option 1-- word2vec using embedding -- #
        w2v = Word2Vec()
        self.X = w2v.getVectors(text, )

        # -- Option 2 count vectorization -- #
        #self.X = self.features_extraction(text)

        # -- Option 3 TFIDF -- #
        #self.X = self.tfidfFeatureExtraction(text)

        info('Done loading and vectorizing data.')
        info("--- Sentiment CLASSIFIERS ---")
        info("fitting ... ")

        self.accuracies = {}

        # classifiers to use
        classifiers = [
            #RandomForestClassifier(n_estimators=100),
            #SGDClassifier(),
            LinearSVC(),
            #LinearDiscriminantAnalysis(),
            #LogisticRegression(),
            #GaussianNB(),
            #DecisionTreeClassifier()
        ]

        # RUN classifiers
        for c in classifiers:
            self.classify(c)

        info('results ...')
        for k, v in self.accuracies.items():
            string = '\tAcc. {:.2f}% F1. {:.2f}% P. {:.2f} R. {:.2f} : {}'
            print(
                string.format(v[0] * 100, v[1] * 100, v[2] * 100, v[3] * 100,
                              k))

        info("DONE!")
Example #16
def main():
    contexts = np.fromfile("./data/npcontexts.dat", dtype=int)
    neighbors = np.fromfile("./data/npneighbors.dat", dtype=int)
    skipgram = Word2Vec(contexts,
                        neighbors,
                        35000,
                        10,
                        0.001,
                        64,
                        "sg.ckpt",
                        batch_size=500)
    skipgram.train(2)
Example #17
def ExtractSent2Vec(filename):
    model = Word2Vec(LineSentence(filename),
                     size=512,
                     window=5,
                     sg=0,
                     min_count=5,
                     workers=8)
    model.save(filename + '.model')
    model.save_word2vec_format(filename + '-01.vec')

    model = Sent2Vec(LineSentence(filename), model_file=filename + '.model')
    model.save_sent2vec_format(filename + '-02.vec')
Example #18
def word2vec(rdd):

    sentences = parse_sentences(rdd)
    sentences_without_id = sentences.map(lambda id_sent: id_sent[1])
    model = Word2Vec(size=100, hs=0, negative=8)

    dd2v = DistDoc2VecFast(model, learn_hidden=True, num_partitions=15, num_iterations=20)
    dd2v.build_vocab_from_rdd(sentences_without_id)

    print "*** done training words ****"
    print "*** len(model.vocab): %d ****" % len(model.vocab)
    return dd2v, sentences
Example #19
 def __init__(self, data):
     self.data = data
     self.corpus = None
     self.liu = LiuLexicon()
     self.subj = SubjLexicon()
     self.buildTweetCorpus()
     self.word_vec_model = Word2Vec(self.corpus)
     self.glove_vec_model = Glove(100, self.corpus)
     self.clusters = Cluster(100)
     self.initEncoders()
     self.topicVecs = self.word_vec_model.getVectorsForTopics(
         self.topicenc.classes_)
     self.collectTopUnigrams()
     self.collectTopBigrams()
Example #20
def main(text):

    params = getattr(parameters, text)

    w2v = Word2Vec(params['file'],
                   window_size=params['window_size'],
                   learning_rate=params['learning_rate'],
                   vocab_size=params['vocab_size'],
                   embedding_size=params['embedding_size'],
                   n_negative=params['n_negative'])

    w2v.fit(n_iter=params['n_iter'], num_proc=params['num_proc'])

    print(w2v.process_time)
    print(w2v.process_time[-1] - w2v.process_time[0])
Example #21
def main():
    # with open("/Users/johnkarasev/PycharmProjects/TweetGrouper/word2vec/contexts.json") as fp:
    #     contexts = json.load(fp)
    # with open("/Users/johnkarasev/PycharmProjects/TweetGrouper/word2vec/neighbors.json") as fp:
    #     neighbors = json.load(fp)
    print("Reading dat files")
    npn = np.fromfile("npneighbors.dat", dtype=int)
    print(str(npn.shape[0]))
    npc = np.fromfile("npcontexts.dat", dtype=int)
    print(str(npc.shape[0]))
    print("finished read")
    # train skipgram model
    skipgram = Word2Vec(npn,
                        npc,
                        35000,
                        10,
                        0.001,
                        64,
                        "sg.ckpt",
                        batch_size=500)
    skipgram.train(5)
    # train cbow model
    cbow = Word2Vec(npc, npn, 35000, 10, 0.001, 64, "sg.ckpt", batch_size=500)
    cbow.train(5)
Example #22
def getTextualFeature(text_reading_path):
    # Train and save the Word2Vec model for the text file.
    # Note that you can change the dimension of the resulting feature vector by modifying the value of 'size'.
    model = Word2Vec(LineSentence(text_reading_path),
                     size=500,
                     window=5,
                     sg=0,
                     min_count=5,
                     workers=8)
    model.save(text_reading_path + '.model')

    # Train and save the Sentence2Vec model for the sentence file.
    model = Sent2Vec(LineSentence(text_reading_path),
                     model_file=text_reading_path + '.model')
    model.save_sent2vec_format(text_reading_path + '.vec')

    program = os.path.basename(sys.argv[0])
Example #23
    def __init__(self, batchop, batch_size=1, datapoints=[], w2v=None):

        # current iteration
        self._offset = 0

        # num of examples
        self.n = len(datapoints)

        # default batch size
        self.B = batch_size

        # create Word2Vec model
        self._w2v = Word2Vec() if not w2v else w2v

        # batch process operation
        self._batchop = batchop

        # if data available
        if len(datapoints):
            self.bind(datapoints)
Example #24
def main(_):
    model = Word2Vec()
    norm_w_embed = tf.nn.l2_normalize(model._w_embed_in,
                                      1)  # [vocab_size, embed_size]
    embedings = model._sess.run(norm_w_embed)
    results = bh_tsne(embedings,
                      no_dims=2,
                      perplexity=50,
                      theta=DEFAULT_THETA,
                      randseed=EMPTY_SEED,
                      verbose=VERBOSE)

    with open(os.path.join(word_config.output_dir, "tsne.txt"), "w") as f:
        for result in results:
            fmt = ''
            for i in range(1, len(result)):
                fmt = fmt + '{}\t'
            fmt = fmt + '{}\n'

            f.write(fmt.format(*result))
    if ARGS.mode == "train":
        print("Starting Training branch...")

        print("Loading data ...")
        data = data_processing.get_w2v_data(ARGS)

        print("Initializing dataset and data loader...")
        word2vec_dataset = Word2VecDataset(data, ARGS)
        data_loader = DataLoader(word2vec_dataset,
                                 batch_size=ARGS.batch_size,
                                 shuffle=True,
                                 num_workers=2)

        print("Initializing model ...")
        model = Word2Vec(vocab, ARGS.embed_dim).to(ARGS.device)

        print("Train...")
        train(ARGS, data_loader, model)

    elif ARGS.mode in ["ret_words", "eval"]:
        model_path = os.path.join("models",
                                  f"ww_{ARGS.ww_size}_{ARGS.freq_thresh}")
        model = load_model(ARGS, model_path)
        print(model)

        print(
            f"Load docs: filtered_docs/filtered_docs_{ARGS.freq_thresh}.pkl..."
        )
        docs_by_id = data_processing.load_pickle(
            f"filtered_docs/filtered_docs_{ARGS.freq_thresh}.pkl")
Example #26
File: main_acc.py Project: andy94077/HYML
        None, None
    ]
    training = not args.no_training
    test = args.test
    ensemble = args.ensemble
    function = args.model_function
    lr = args.lr

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)

    max_seq_len = 32
    w2v_model = Word2Vec().load(word2vec_model_path)
    word2idx = w2v_model.get_word2idx()
    embedding = w2v_model.get_embedding()
    vocabulary_size = len(word2idx)
    print(f'\033[32;1mvocabulary_size: {vocabulary_size}\033[0m')

    if function not in globals():
        globals()[function] = getattr(
            importlib.import_module(function[:function.rfind('.')]),
            function.split('.')[-1])
    model = globals()[function](embedding)
    model.compile(Adam(lr), loss='binary_crossentropy', metrics=['acc'])
    model.summary()

    if training:
        trainX, trainY = utils.load_train_data(labeled_path, word2idx,
Example #27
    #             print(row["word1"], row["word2"])
    #             print(tmpdf)

    # compare each dataset against word2vec
    datasets = [{
        "df": wordsim,
        "name": "WordSim"
    }, {
        "df": simlex,
        "name": "SimLex"
    }, {
        "df": men,
        "name": "MEN"
    }]
    modelname = "glove-wiki-gigaword-100"
    w2v = Word2Vec(modelname=modelname)
    for dataset in datasets:
        print(dataset["name"])
        target_dict = {
            "word1": [],
            "word2": [],
            dataset["name"]: [],
            "cos": []
        }
        for _, row in dataset["df"].iterrows():
            cos = w2v.get_cosine_similarity(row["word1"], row["word2"])
            if cos:
                target_dict["word1"].append(row["word1"])
                target_dict["word2"].append(row["word2"])
                target_dict[dataset["name"]].append(row["result"])
                target_dict["cos"].append(cos)
Example #28
def train_word2vec():
    trainpath = 'dl4j.txt'
    w2v = Word2Vec()
    w2v.build(trainpath)
    w2v.fit(trainpath)
    w2v.evaluate()
Example #29
logging.basicConfig(
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
    level=logging.INFO)
logging.info("running %s" % " ".join(sys.argv))

category = 'Diseases_and_disorders'

logging.basicConfig(
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
    level=logging.INFO)
logging.info("running %s" % " ".join(sys.argv))

#input_file = 'test2.txt'
input_file = '../inputFile/' + category + '.corpus.txt'
model = Word2Vec(LineSentence(input_file),
                 size=50,
                 window=7,
                 sg=0,
                 min_count=3,
                 workers=8)
model.save(input_file + '.model')
model.save_word2vec_format(input_file + '.vec')

# f_wv=codecs.open('../inputFile/word-vec.txt','w','utf-8')
# with open('../inputFile/vocab.txt') as textfile1, open('../inputFile/wordVectors.txt') as textfile2:
#         for x, y in izip(textfile1, textfile2):
#             x = x.strip()
#             y = y.strip()
#             f_wv.write(x+'\t'+y+'\n')
# f_wv.close()

sent_file = input_file
model = Sent2Vec(LineSentence(sent_file), model_file=input_file + '.model')
Example #30
def train_word2vec(data_len=90000,
                   vocab_size=1000,
                   embed_size=300,
                   end_iter=2000,
                   verbose=True,
                   verbose_freq=100,
                   save=True,
                   save_freq=100):
    """
    Trains the Word2Vec Model based on:
        - data_len: number of data points to train and test on
        - vocab_size: number of top words the model will choose a solution from
        - embed_size: dimension of the question embedding
        - verbose: (boolean) prints out train and test losses every step
        - save: (boolean) saves model in ./savedir/ 
    """
    data_arr = (get_by_ques_type([], train=True) +
                get_by_ques_type([], train=False))[:data_len]
    p = Pipeline(data_arr, embed_type=embed_type)
    p.create_split()

    train_step = 0
    curr_samples = 0

    train_losses = []
    test_losses = []

    w2v = Word2Vec(vocab_size + 1, embed_size)
    run = True
    while run:
        p.next_batch(train=True, replace=True)
        train_inp, train_out = p.batch_word2vec()

        train_step += 1
        batch_samples = len(train_inp)
        curr_samples += batch_samples

        train_loss = w2v.train_step(np.array(train_inp), np.array(train_out),
                                    sess)

        p.next_batch(train=False, replace=True)
        test_inp, test_out = p.batch_word2vec()
        test_samples = len(test_inp)

        test_loss = w2v.evaluate(np.array(test_inp), np.array(test_out), sess)

        train_losses.append(train_loss)
        test_losses.append(test_loss)

        if train_step % save_freq == 0 and save:
            tf.train.Saver().save(sess,
                                  "saved_models/word2vec_model/word2vec_%d" %
                                  (train_step),
                                  global_step=train_step)
            np.savez(
                "saved_models/word2vec_model/word2vec_train_losses_%d" %
                (train_step), np.array(train_losses))
            np.savez(
                "saved_models/word2vec_model/word2vec_test_losses_%d" %
                (train_step), np.array(test_losses))

        if train_step % verbose_freq == 0 and verbose:
            print(
                "TRAIN STEP: %d | SAMPLES IN TRAIN BATCH: %d | TRAIN SAMPLES SO FAR: %d | TRAIN LOSS: %f | TEST LOSS: %f"
                % (train_step, batch_samples, curr_samples, train_loss,
                   test_loss))

        if train_step == end_iter:
            run = False

    if save:
        tf.train.Saver().save(sess,
                              "saved_models/word2vec_model/word2vec_%d" %
                              (train_step),
                              global_step=train_step)
        np.savez(
            "saved_models/word2vec_model/word2vec_train_losses_%d" %
            (train_step), np.array(train_losses))
        np.savez(
            "saved_models/word2vec_model/word2vec_test_losses_%d" %
            (train_step), np.array(test_losses))