def init_embedder(dataset):
    """Initialize the embedder: load it from file if available, otherwise
    build the model from the dataset and save it."""
    fname = DIR_MODEL + '%s_embedder.pkl' % prefix
    if os.path.exists(fname):
        print >> sys.stderr, 'embedding model %s found and loaded' % fname
        return Word2Vec.load(fname)
    else:
        class x_iterator:
            """Restartable iterator over every x sequence in the dataset."""
            def __init__(self, dataset):
                self.dataset = dataset

            def __iter__(self):
                for set_x, set_y in self.dataset:
                    for x in set_x:
                        yield x

        embedder = Word2Vec()
        embedder.build(x_iterator(dataset), dim_proj)
        embedder.dump(fname)
        return embedder
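# A minimal usage sketch for init_embedder (hypothetical names: the dataset is
# assumed to be an iterable of (set_x, set_y) pairs where each x is a token
# sequence, as implied by x_iterator; DIR_MODEL, prefix and dim_proj are
# module-level globals defined elsewhere):
#
#   dataset = [(train_x, train_y), (valid_x, valid_y)]
#   embedder = init_embedder(dataset)  # a second call loads the cached .pkl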
def __init__(self, sentences, model_file=None, size=200, alpha=0.025, window=5,
             min_count=5, sample=0, seed=1, workers=16, min_alpha=0.0001,
             model="cb", hs=1, negative=0, cbow_mean=0, iteration=1,
             word_learn=1, init_adjust=True, update_mode=0,
             normalize_each_epoch=False):
    self.sg = 1 if model in ("sg", "dbow") else 0
    self.table = None  # for negative sampling --> this needs a lot of RAM! consider setting back to None before saving
    self.alpha = float(alpha)
    self.window = int(window)
    self.seed = seed
    self.sample = sample
    self.workers = workers
    self.min_alpha = min_alpha
    self.hs = hs
    self.negative = negative
    self.cbow_mean = int(cbow_mean)
    self.iteration = iteration
    self.word_learn = int(word_learn)
    self.layer1_size = size
    self.min_count = min_count
    self.sent_no_hash = {}  # mapping sent_id to index of self.sents
    self.sent_id_list = []  # mapping sent_no to sent_id
    self.sane_vec_len = 100000  # for sanity check
    self.sane_max_sim10 = 0.9  # for sanity check
    self.init_adjust = init_adjust  # for adjustment of initialization
    self.update_mode = update_mode  # 0: SGD, 1: AdaGrad, 2: AdaDelta (3: ADAM not implemented)
    self.normalize_each_epoch = normalize_each_epoch
    if sentences:
        if model_file:
            # reuse a pre-trained word model and only learn sentence vectors
            self.w2v = Word2Vec.load(model_file)
            self.vocab = self.w2v.vocab
            self.layer1_size = self.w2v.layer1_size
            self.build_vec(sentences, has_vocab=True)
        else:
            # no word model given: learn word and sentence vectors jointly
            self.word_learn = 1
            self.w2v = Word2Vec(None, self.layer1_size, self.alpha, self.window,
                                self.min_count, self.sample, self.seed,
                                self.workers, self.min_alpha, self.sg, self.hs,
                                self.negative, self.cbow_mean)
            self.build_vec(sentences, has_vocab=False)
        self.train_iteration(sentences, iteration=iteration)
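# A hedged usage sketch for the constructor above (corpus, class name and
# values are illustrative, not from the source). Passing model_file reuses a
# pre-trained Word2Vec vocabulary; omitting it trains from scratch:
#
#   model = Sentence2Vec(sentences, size=300, model="dbow", negative=5,
#                        iteration=20, update_mode=1)  # update_mode 1 = AdaGrad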
@classmethod
def load_cat2vec_format(cls, cat_model=None, sent_model=None, word_model=None):
    """Load category and sentence vectors from their text-format files."""
    model = Category2Vec(None)
    count = 0
    if cat_model:
        logger.info("loading %s object(cat) from %s" % (cls.__name__, cat_model))
        for line in open(cat_model, "r"):
            line = line.rstrip()
            if count == 0:
                # header: category count, dimensionality and training flags
                info = line.split()
                model.cat_len = int(info[0])
                model.layer1_size = int(info[1])
                model.sg = int(info[2])
                model.hs = int(info[3])
                model.negative = int(info[4])
                model.cbow_mean = int(info[5])
                model.cats = empty((model.cat_len, model.layer1_size), dtype=REAL)
                model.cat_no_hash = {}
                model.cat_id_list = []
            else:
                # body: "<cat_id>\t<space-separated vector components>"
                idx = count - 1
                row = line.split("\t")
                cat_id = utils.to_unicode(row[0])
                model.cat_no_hash[cat_id] = idx
                model.cat_id_list.append(cat_id)
                vals = row[1].split()
                for j in xrange(model.layer1_size):
                    model.cats[idx][j] = float(vals[j])
            count += 1
    count = 0
    if sent_model:
        logger.info("loading %s object(sentence) from %s" % (cls.__name__, sent_model))
        for line in open(sent_model, "r"):
            line = line.rstrip()
            if count == 0:
                info = line.split()
                model.sents_len = int(info[0])
                model.sents = empty((model.sents_len, model.layer1_size), dtype=REAL)
                model.sent_no_hash = {}
                model.sent_id_list = []
            else:
                idx = count - 1
                row = line.split("\t")
                sent_id = utils.to_unicode(row[0])
                model.sent_no_hash[sent_id] = idx
                model.sent_id_list.append(sent_id)
                vals = row[1].split()
                for j in xrange(model.layer1_size):
                    model.sents[idx][j] = float(vals[j])
            count += 1
    if word_model:
        logger.info("loading word2vec from %s" % word_model)
        model.w2v = Word2Vec.load(word_model)
        model.vocab = model.w2v.vocab
    return model
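# For reference, the on-disk format parsed above (reconstructed from the loop,
# not from a separate spec): the category file starts with a header line
#   "<cat_len> <layer1_size> <sg> <hs> <negative> <cbow_mean>"
# followed by one "<cat_id>\t<v_1> <v_2> ... <v_layer1_size>" line per
# category; the sentence file starts with its sentence count instead. A
# hypothetical category file with cat_len=2 and layer1_size=3:
#
#   2 3 1 1 0 0
#   cat_politics\t0.12 -0.40 0.77
#   cat_sports\t-0.05 0.31 -0.22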
def __init__(self, sentences, model_file=None, size=200, alpha=0.025, window=5,
             min_count=5, sample=0, seed=1, workers=16, min_alpha=0.0001,
             model="cb", hs=1, negative=0, cbow_mean=0, iteration=1,
             word_learn=1, init_adjust=True, update_mode=0,
             normalize_each_epoch=False):
    self.sg = 1 if model in ("sg", "dbow") else 0
    self.table = None  # for negative sampling --> this needs a lot of RAM! consider setting back to None before saving
    self.alpha = float(alpha)
    self.window = int(window)
    self.seed = seed
    self.sample = sample
    self.workers = workers
    self.min_alpha = min_alpha
    self.hs = hs
    self.negative = negative
    self.cbow_mean = int(cbow_mean)
    self.iteration = iteration
    self.word_learn = int(word_learn)
    self.cat_learn = 1
    self.layer1_size = size
    self.min_count = min_count
    self.sent_no_hash = {}  # mapping sent_id to index of self.sents
    self.sent_id_list = []  # mapping sent_no to sent_id
    self.cat_no_hash = {}  # mapping cat_id to index of self.cats
    self.cat_id_list = []  # mapping cat_no to cat_id
    self.sane_vec_len = 100000  # for sanity check
    self.sane_max_sim10 = 0.9  # for sanity check
    self.init_adjust = init_adjust  # for adjustment of initialization
    self.update_mode = update_mode  # 0: SGD, 1: AdaGrad, 2: AdaDelta, 3: ADAM
    self.normalize_each_epoch = normalize_each_epoch  # normalize vectors after each epoch
    if sentences:
        if model_file:
            self.w2v = Word2Vec.load(model_file)
            self.vocab = self.w2v.vocab
            self.layer1_size = self.w2v.layer1_size
            self.build_vec(sentences, has_vocab=True)
        else:
            self.word_learn = 1
            self.w2v = Word2Vec(None, self.layer1_size, self.alpha, self.window,
                                self.min_count, self.sample, self.seed,
                                self.workers, self.min_alpha, self.sg, self.hs,
                                self.negative, self.cbow_mean)
            self.build_vec(sentences, has_vocab=False)
        self.train_iteration(sentences, iteration=iteration)
def __init__(self, sentences, model_file=None, w2v=None, alpha=0.025, window=5,
             sample=0, seed=1, workers=1, min_alpha=0.0001, sg=1, hs=1,
             negative=0, cbow_mean=0, iteration=1):
    self.sg = int(sg)
    self.table = None  # for negative sampling --> this needs a lot of RAM! consider setting back to None before saving
    self.alpha = float(alpha)
    self.window = int(window)
    self.seed = seed
    self.sample = sample
    self.workers = workers
    self.min_alpha = min_alpha
    self.hs = hs
    self.negative = negative
    self.cbow_mean = int(cbow_mean)
    self.iteration = iteration
    # the word model can be supplied either as a saved file or as a live object
    if model_file is not None:
        self.w2v = Word2Vec.load(model_file)
    elif w2v is not None:
        self.w2v = w2v
    if sentences is not None:
        self.vocab = self.w2v.vocab
        self.layer1_size = self.w2v.layer1_size
        self.reset_sent_vec(sentences)
        for i in range(iteration):
            self.train_sent(sentences)
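# A hedged usage sketch (file name is illustrative): the word model comes from
# disk via model_file or is passed in directly via w2v, after which one vector
# per sentence is trained for `iteration` epochs.
#
#   w2v = Word2Vec.load("laptop.word2vec.model")
#   s2v = Sentence2Vec(sentences, w2v=w2v, iteration=10)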
def word2vec_feat(reviews):
    w2v_model_file = "../../models/laptop.word2vec.model"
    w2v_model = Word2Vec.load(w2v_model_file)
    bags = []
    for review in reviews:
        bag = []
        for sent in review.sentences:
            instance = None
            count = 0.
            for w in sent:
                if w not in w2v_model:
                    continue
                if count == 0:
                    # copy, so the in-place += below cannot mutate the
                    # model's own vectors through the returned view
                    instance = w2v_model[w].copy()
                else:
                    instance += w2v_model[w]
                count += 1.
            if instance is None:
                # no word of the sentence is in the vocabulary; skip it
                # rather than dividing None by zero
                continue
            instance /= count
            bag.append(instance.tolist())
        bags.append(bag)
    save_sparse_feature(corpus_name="laptop", view_name="word2vec", features=bags)
    save_view_info(view_name="word2vec", dim=100, data_format="sparse",
                   view_type="continuous")
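# The inner loop above is a mean-of-word-vectors sentence embedding. For
# reference, an equivalent and more compact formulation (a sketch assuming the
# same old-style gensim lookup API, i.e. `w in model` and `model[w]`):
import numpy as np

def sentence_vector(w2v_model, sent):
    """Average the in-vocabulary word vectors of a sentence; returns None
    when no word of the sentence is in the vocabulary."""
    vecs = [w2v_model[w] for w in sent if w in w2v_model]
    return np.mean(vecs, axis=0) if vecs else None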
@classmethod
def load(cls, fname, mmap=None):
    model = super(Sentence2Vec, cls).load(fname, mmap)
    # the wrapped Word2Vec model is stored alongside under "<fname>_w2v"
    if os.path.isfile(fname + "_w2v"):
        model.w2v = Word2Vec.load(fname + "_w2v", mmap)
        model.vocab = model.w2v.vocab
    return model
def process_bible(bos=False, eos=False):
    """
    Iterates over all sentences in the Bible.

    :param bos: Whether to append a BOS token to the sentences.
    :param eos: Whether to append an EOS token to the sentences.
    :return: Iterator iterating over Bible books in order (as specified in
        bible_books) with sentences randomly shuffled within each Bible book.
    """
    def shuffled_book(book):
        text = process_book(book, bos=bos, eos=eos)
        random.shuffle(text)
        return text

    class Iterator:
        def __iter__(self):
            return itertools.chain(*map(shuffled_book, bible_books))

    return Iterator()


try:
    word2vec = Word2Vec.load('word2vec.pickle')
    print("Found and loaded word embedding.")
except FileNotFoundError:
    wd = 200  # embedding dimensionality
    print("Generating word embeddings from scratch.")
    word2vec = Word2Vec(process_bible(eos=True), size=wd)
    # word2vec.normalize()
    word2vec.save('word2vec.pickle')
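# Note on the Iterator class in process_bible: Word2Vec makes several passes
# over its corpus (a vocabulary scan, then one pass per training epoch), so it
# needs a restartable iterable rather than a one-shot generator. Because
# __iter__ builds a fresh chain each time, every pass independently reshuffles
# the sentences within each book:
#
#   corpus = process_bible(eos=True)
#   first_pass = list(corpus)   # e.g. the vocabulary scan
#   second_pass = list(corpus)  # training pass, reshuffled within each book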
def main():
    optparser = OptionParser()
    optparser.add_option("-p", "--pro", dest="product")
    (options, args) = optparser.parse_args()
    (train_file, test_file) = CORPUS[options.product]
    train_reviews = load_dataset(DATA_PATH + train_file)
    test_reviews = load_dataset(DATA_PATH + test_file)
    n_cates, cate_index = get_categories(train_reviews + test_reviews)

    # bag-of-words (unigram) features
    vocab_size = 1000
    vocab_index = get_vocab(train_reviews, vocab_size)
    train_bags = [extract_unigram(vocab_index, vocab_size, review)
                  for review in train_reviews]
    train_X = [bag2vec(bag) for bag in train_bags]
    train_labels = [extract_labels(cate_index, review)
                    for review in train_reviews]
    test_bags = [extract_unigram(vocab_index, vocab_size, review)
                 for review in test_reviews]
    test_X = [bag2vec(bag) for bag in test_bags]
    test_labels = [extract_labels(cate_index, review)
                   for review in test_reviews]

    # add word2vec feature
    w2v_model_file = "../../models/laptop.word2vec.model"
    w2v_model = Word2Vec.load(w2v_model_file)
    train_X2 = word2vec_feat(train_reviews, w2v_model)
    train_X = merge_features(train_X, train_X2)
    test_X2 = word2vec_feat(test_reviews, w2v_model)
    test_X = merge_features(test_X, test_X2)

    labelwise_acc = []
    labelwise_output = []
    for cate in range(n_cates):
        # train a binary SVM model per category (one-vs-rest)
        train_Y = get_Y(train_labels, cate)
        prob = svm_problem(train_Y, train_X)
        # param = svm_parameter("-s 0 -t 0 -b 1")  # linear kernel
        param = svm_parameter("-s 0 -t 2 -b 1")  # RBF kernel
        m = svm_train(prob, param)
        # test
        test_Y = get_Y(test_labels, cate)
        p_label, p_acc, p_val = svm_predict(test_Y, test_X, m, '-b 1')
        labelwise_acc.append(p_acc)
        labelwise_output.append(p_label)

    # evaluation
    p, r, f = microF1(labelwise_output, test_labels)

    # output
    # out_dir = "results/rbf/"
    out_dir = "results/"
    out_file = out_dir + options.product + ".txt"
    cates = sorted(cate_index.items(), key=lambda x: x[1])
    labelwise_acc = [(cates[i][0], labelwise_acc[i][0]) for i in range(n_cates)]
    labelwise_acc = sorted(labelwise_acc, key=lambda x: x[1])
    with open(out_file, 'w') as out:
        out.write("Precision:\t{}\nRecall:\t{}\nF1:\t{}\n".format(p, r, f))
        print("{}\n{}\n{}".format(p, r, f))
        for cate_i in range(n_cates):
            out.write("{}:\t{}\n".format(labelwise_acc[cate_i][0],
                                         labelwise_acc[cate_i][1]))
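# microF1 is defined elsewhere; for reference, a minimal sketch of
# micro-averaged F1 over the per-label binary predictions collected above.
# It assumes labelwise_output[cate][i] is the 0/1 prediction for instance i
# and test_labels[i] is a container of gold label indices, matching get_Y's
# apparent convention -- an assumption, since neither helper is shown here.
def micro_f1_sketch(labelwise_output, test_labels):
    tp = fp = fn = 0
    for cate, preds in enumerate(labelwise_output):
        for i, pred in enumerate(preds):
            gold = cate in test_labels[i]
            if pred == 1 and gold:
                tp += 1
            elif pred == 1 and not gold:
                fp += 1
            elif gold:
                fn += 1
    p = tp / float(tp + fp) if tp + fp else 0.0
    r = tp / float(tp + fn) if tp + fn else 0.0
    f = 2 * p * r / (p + r) if p + r else 0.0
    return p, r, f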