Example #1
	def init_embedder(dataset):
		'''
		Initialize the embedder: load it from file if available,
		otherwise build the model from the dataset and save it.
		'''
	
		fname = DIR_MODEL + '%s_embedder.pkl'%(prefix)

		if os.path.exists(fname):
			print >> sys.stderr, 'embedding model %s found and loaded'%(fname)
			return Word2Vec.load(fname)
		else:
			class x_iterator:
				def __init__(self, dataset):	
					self.dataset = dataset
				
				def __iter__(self):
					for set_x, set_y in self.dataset:
						for x in set_x:
							yield x

			embedder = Word2Vec()
			embedder.build(x_iterator(dataset), dim_proj)
			embedder.dump(fname)
	
		return embedder
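The helper relies on module-level names from its project (DIR_MODEL, prefix, dim_proj) and a Word2Vec wrapper exposing build/dump/load. A minimal usage sketch, with a hypothetical toy dataset of (set_x, set_y) pairs:

    # Hypothetical driver; DIR_MODEL, prefix and dim_proj are assumed to be
    # module-level globals in the original project.
    dataset = [([['a', 'sentence'], ['another', 'one']], [0, 1])]
    embedder = init_embedder(dataset)  # loads the cached model, or trains and saves one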
Example #2
    def __init__(self,
                 sentences,
                 model_file=None,
                 size=200,
                 alpha=0.025,
                 window=5,
                 min_count=5,
                 sample=0,
                 seed=1,
                 workers=16,
                 min_alpha=0.0001,
                 model="cb",
                 hs=1,
                 negative=0,
                 cbow_mean=0,
                 iteration=1,
                 word_learn=1,
                 init_adjust=True,
                 update_mode=0,
                 normalize_each_epoch=False):
        self.sg = 1 if model in ("sg", "dbow") else 0
        self.table = None  # for negative sampling --> this needs a lot of RAM! consider setting back to None before saving
        self.alpha = float(alpha)
        self.window = int(window)
        self.seed = seed
        self.sample = sample
        self.workers = workers
        self.min_alpha = min_alpha
        self.hs = hs
        self.negative = negative
        self.cbow_mean = int(cbow_mean)
        self.iteration = iteration
        self.word_learn = int(word_learn)
        self.layer1_size = size
        self.min_count = min_count
        self.sent_no_hash = {}  #mapping sent_id to index of self.sents
        self.sent_id_list = []  #mapping sent_no to sent_id
        self.sane_vec_len = 100000  #for sanity check
        self.sane_max_sim10 = 0.9  #for sanity check
        self.init_adjust = init_adjust  #for adjustment of initialization
        self.update_mode = update_mode  #0:SGD, 1: AdaGrad, 2:AdaDelta, (3:ADAM not implemented)
        self.normalize_each_epoch = normalize_each_epoch

        if sentences:
            if model_file:
                self.w2v = Word2Vec.load(model_file)
                self.vocab = self.w2v.vocab
                self.layer1_size = self.w2v.layer1_size
                self.build_vec(sentences, has_vocab=True)
            else:
                self.word_learn = 1
                self.w2v = Word2Vec(None, self.layer1_size, self.alpha,
                                    self.window, self.min_count, self.sample,
                                    self.seed, self.workers, self.min_alpha,
                                    self.sg, self.hs, self.negative,
                                    self.cbow_mean)
                self.build_vec(sentences, has_vocab=False)
            self.train_iteration(sentences, iteration=iteration)
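The update_mode flag selects the optimizer for the vector updates (0: SGD, 1: AdaGrad, 2: AdaDelta). For reference, a minimal sketch of the AdaGrad rule that update_mode=1 presumably corresponds to; the function name and signature are illustrative, not the project's internals:

    import numpy as np

    def adagrad_step(param, grad, accum, lr=0.025, eps=1e-8):
        # Per-dimension learning rates shrink as squared gradients accumulate.
        accum += grad ** 2
        param -= lr * grad / (np.sqrt(accum) + eps)
        return param, accum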
Example #3
 @classmethod
 def load_cat2vec_format(cls, cat_model=None, sent_model=None, word_model=None):
     """
     Load category, sentence and word vectors from their saved formats.
     """
     model = Category2Vec(None)
     count = 0
     if cat_model:
         logger.info("loading %s object(cat) from %s" % (cls.__name__, cat_model))
         with open(cat_model, "r") as fin:
             for line in fin:
                 line = line.rstrip()
                 if count == 0:
                     # header line: cat_len layer1_size sg hs negative cbow_mean
                     info = line.split()
                     model.cat_len = int(info[0])
                     model.layer1_size = int(info[1])
                     model.sg = int(info[2])
                     model.hs = int(info[3])
                     model.negative = int(info[4])
                     model.cbow_mean = int(info[5])
                     model.cats = empty((model.cat_len, model.layer1_size), dtype=REAL)
                     model.cat_no_hash = {}
                     model.cat_id_list = []
                 else:
                     # data line: cat_id<TAB>space-separated vector components
                     idx = count - 1
                     row = line.split("\t")
                     cat_id = utils.to_unicode(row[0])
                     model.cat_no_hash[cat_id] = idx
                     model.cat_id_list.append(cat_id)
                     vals = row[1].split()
                     for j in xrange(model.layer1_size):
                         model.cats[idx][j] = float(vals[j])
                 count += 1
     count = 0
     if sent_model:
         logger.info("loading %s object(sentence) from %s" % (cls.__name__, sent_model))
         with open(sent_model, "r") as fin:
             for line in fin:
                 line = line.rstrip()
                 if count == 0:
                     # header line: sents_len only
                     info = line.split()
                     model.sents_len = int(info[0])
                     model.sents = empty((model.sents_len, model.layer1_size), dtype=REAL)
                     model.sent_no_hash = {}
                     model.sent_id_list = []
                 else:
                     idx = count - 1
                     row = line.split("\t")
                     sent_id = utils.to_unicode(row[0])
                     model.sent_no_hash[sent_id] = idx
                     model.sent_id_list.append(sent_id)
                     vals = row[1].split()
                     for j in xrange(model.layer1_size):
                         model.sents[idx][j] = float(vals[j])
                 count += 1
     if word_model:
         logger.info("loading word2vec from %s" % word_model)
         model.w2v = Word2Vec.load(word_model)
         model.vocab = model.w2v.vocab
     return model
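The parser above expects a header line (cat_len layer1_size sg hs negative cbow_mean) followed by one cat_id<TAB>space-separated-values row per category; the sentence file uses the same row layout with a one-field header. A hedged sketch of a writer producing the category file in that layout (save_cat2vec_format is hypothetical, not taken from the source):

    def save_cat2vec_format(model, cat_model):
        # Hypothetical writer mirroring the format load_cat2vec_format parses.
        with open(cat_model, "w") as out:
            out.write("%d %d %d %d %d %d\n" % (model.cat_len, model.layer1_size,
                                               model.sg, model.hs, model.negative,
                                               model.cbow_mean))
            for idx, cat_id in enumerate(model.cat_id_list):
                vals = " ".join("%f" % v for v in model.cats[idx])
                out.write("%s\t%s\n" % (cat_id, vals))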
Example #4
    def __init__(self, sentences, model_file=None, size=200, alpha=0.025, window=5, min_count=5,
                 sample=0, seed=1, workers=16, min_alpha=0.0001, model="cb", hs=1, negative=0, cbow_mean=0,
                 iteration=1, word_learn=1, init_adjust=True, update_mode=0, normalize_each_epoch=False):
        self.sg = 1 if model in ("sg", "dbow") else 0
        self.table = None # for negative sampling --> this needs a lot of RAM! consider setting back to None before saving
        self.alpha = float(alpha)
        self.window = int(window)
        self.seed = seed
        self.sample = sample
        self.workers = workers
        self.min_alpha = min_alpha
        self.hs = hs
        self.negative = negative
        self.cbow_mean = int(cbow_mean)
        self.iteration = iteration
        self.word_learn = int(word_learn)
        self.cat_learn = 1
        self.layer1_size = size
        self.min_count = min_count
        self.sent_no_hash = {} # mapping sent_id to index of self.sents
        self.sent_id_list = [] # mapping sent_no to sent_id
        self.cat_no_hash = {} # mapping cat_id to index of self.cats
        self.cat_id_list = [] # mapping cat_no to cat_id
        self.sane_vec_len = 100000 # for sanity check
        self.sane_max_sim10 = 0.9 # for sanity check
        self.init_adjust = init_adjust # for adjustment of initialization
        self.update_mode = update_mode # 0:SGD, 1: AdaGrad, 2:AdaDelta, 3:ADAM
        self.normalize_each_epoch = normalize_each_epoch # normalize vectors after each epoch

        if sentences:
            if model_file:
                self.w2v = Word2Vec.load(model_file)
                self.vocab = self.w2v.vocab
                self.layer1_size = self.w2v.layer1_size
                self.build_vec(sentences, has_vocab=True)
            else:
                self.word_learn = 1
                self.w2v = Word2Vec(None, self.layer1_size, self.alpha, self.window,
                                    self.min_count, self.sample, self.seed, self.workers,
                                    self.min_alpha, self.sg, self.hs, self.negative,
                                    self.cbow_mean)
                self.build_vec(sentences, has_vocab=False)
            self.train_iteration(sentences, iteration=iteration)
Example #5
    def __init__(self,
                 sentences,
                 model_file=None,
                 w2v=None,
                 alpha=0.025,
                 window=5,
                 sample=0,
                 seed=1,
                 workers=1,
                 min_alpha=0.0001,
                 sg=1,
                 hs=1,
                 negative=0,
                 cbow_mean=0,
                 iteration=1):
        self.sg = int(sg)
        self.table = None  # for negative sampling --> this needs a lot of RAM! consider setting back to None before saving
        self.alpha = float(alpha)
        self.window = int(window)
        self.seed = seed
        self.sample = sample
        self.workers = workers
        self.min_alpha = min_alpha
        self.hs = hs
        self.negative = negative
        self.cbow_mean = int(cbow_mean)
        self.iteration = iteration

        if model_file is not None:
            self.w2v = Word2Vec.load(model_file)
        elif w2v is not None:
            self.w2v = w2v

        if sentences is not None:
            if not hasattr(self, "w2v"):
                raise ValueError("either model_file or w2v is required for training")
            self.vocab = self.w2v.vocab
            self.layer1_size = self.w2v.layer1_size
            self.reset_sent_vec(sentences)
            for i in range(iteration):
                self.train_sent(sentences)
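A hedged usage sketch for this constructor, assuming the enclosing class is Sentence2Vec (as the load() example below suggests) and that sentences follow gensim's list-of-token-lists convention; the project may instead wrap sentences with ids:

    # Reuse a pretrained word model and learn sentence vectors on top of it.
    w2v = Word2Vec.load("word2vec.model")  # assumed pretrained gensim model
    sentences = [["the", "first", "sentence"], ["and", "a", "second", "one"]]
    s2v = Sentence2Vec(sentences, w2v=w2v, iteration=5)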
Example #6
def word2vec_feat(reviews):
    w2v_model_file = "../../models/laptop.word2vec.model"
    w2v_model = Word2Vec.load(w2v_model_file)
    bags = []
    for review in reviews:
        bag = []
        for sent in review.sentences:
            # Average the vectors of the in-vocabulary words; summing into a
            # fresh array also avoids mutating the model's own vectors in place.
            vecs = [w2v_model[w] for w in sent if w in w2v_model]
            if not vecs:
                continue  # no known words: skip instead of dividing by zero
            instance = sum(vecs) / float(len(vecs))
            bag.append(instance.tolist())
        bags.append(bag)

    save_sparse_feature(corpus_name="laptop", view_name="word2vec", features=bags)
    save_view_info(view_name="word2vec", dim=100, data_format="sparse", view_type="continuous")
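Assuming each looked-up vector is a plain numpy array, the per-sentence averaging can also be written compactly with numpy; a minimal equivalent sketch:

    import numpy as np

    def sentence_vector(sent, w2v_model):
        # Mean of the word vectors of in-vocabulary tokens, or None if none match.
        vecs = [w2v_model[w] for w in sent if w in w2v_model]
        return np.mean(vecs, axis=0) if vecs else None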
Example #7
 @classmethod
 def load(cls, fname, mmap=None):
     model = super(Sentence2Vec, cls).load(fname, mmap)
     if os.path.isfile(fname+"_w2v"):
         model.w2v = Word2Vec.load(fname+"_w2v", mmap)
         model.vocab = model.w2v.vocab
     return model
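This load override looks for the word model in a "_w2v" sidecar file next to the main pickle. The matching save would have to detach the gensim model and store it separately; a hedged sketch of such a counterpart (the method body is assumed, not taken from the source):

    def save(self, fname, *args, **kwargs):
        # Hypothetical counterpart to load(): keep the gensim model in a
        # "_w2v" sidecar so the load() above can restore it.
        w2v = getattr(self, "w2v", None)
        if w2v is not None:
            w2v.save(fname + "_w2v")
            self.w2v = None  # avoid pickling the word model twice
        try:
            super(Sentence2Vec, self).save(fname, *args, **kwargs)
        finally:
            self.w2v = w2v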
Example #8
def process_bible(bos=False, eos=False):
    """
    Iterates over all sentences in the Bible.

    :param bos: Whether to prepend a BOS token to each sentence.
    :param eos: Whether to append an EOS token to each sentence.
    :return: Iterator over Bible books in order (as specified in bible_books),
       with sentences randomly shuffled within each book.
    """
    def shuffled_book(book):
        text = process_book(book, bos=bos, eos=eos)
        random.shuffle(text)
        return text

    class Iterator:
        def __iter__(self):
            # from_iterable keeps this lazy: each book is shuffled on demand
            return itertools.chain.from_iterable(map(shuffled_book, bible_books))

    return Iterator()
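The Iterator class (rather than a plain generator) matters here: Word2Vec makes several passes over the corpus, and a restartable iterable survives that while a generator is exhausted after one pass. A minimal illustration of the difference:

    gen = (s for s in [["a"], ["b"]])
    list(gen), list(gen)  # second pass is empty: ([['a'], ['b']], [])

    class Restartable:
        def __iter__(self):
            return iter([["a"], ["b"]])

    r = Restartable()
    list(r), list(r)  # both passes yield the data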


try:
    word2vec = Word2Vec.load('word2vec.pickle')
    print("Found and loaded word embedding.")
except FileNotFoundError:
    wd = 200
    print("Generating word embeddings from scratch.")
    word2vec = Word2Vec(process_bible(eos=True), size=wd)
    #word2vec.normalize()
    word2vec.save('word2vec.pickle')
Example #9
File: baseline.py  Project: Friedmannn/ABSA
def main():
    optparser = OptionParser()
    optparser.add_option("-p", "--pro", dest="product")
    (options, args) = optparser.parse_args()

    (train_file, test_file) = CORPUS[options.product]
    train_reviews = load_dataset(DATA_PATH + train_file)
    test_reviews = load_dataset(DATA_PATH + test_file)

    n_cates, cate_index = get_categories(train_reviews + test_reviews)
    vocab_size = 1000
    vocab_index = get_vocab(train_reviews, vocab_size)

    train_bags = [extract_unigram(vocab_index, vocab_size, review)
                  for review in train_reviews]
    train_X = [bag2vec(bag) for bag in train_bags]
    train_labels = [extract_labels(cate_index, review)
                    for review in train_reviews]

    test_bags = [extract_unigram(vocab_index, vocab_size, review)
                 for review in test_reviews]
    test_X = [bag2vec(bag) for bag in test_bags]
    test_labels = [extract_labels(cate_index, review)
                   for review in test_reviews]


    # add word2vec feature
    w2v_model_file = "../../models/laptop.word2vec.model"
    w2v_model = Word2Vec.load(w2v_model_file)
    train_X2 = word2vec_feat(train_reviews, w2v_model)
    train_X = merge_features(train_X, train_X2)
    test_X2 = word2vec_feat(test_reviews, w2v_model)
    test_X = merge_features(test_X, test_X2)


    labelwise_acc = []
    labelwise_output = []
    for cate in range(n_cates):
        # train a binary svm model
        train_Y = get_Y(train_labels, cate)
        prob = svm_problem(train_Y, train_X)
        #param = svm_parameter("-s 0 -t 0 -b 1")
        param = svm_parameter("-s 0 -t 2 -b 1")
        m = svm_train(prob, param)

        # test
        test_Y = get_Y(test_labels, cate)
        p_label, p_acc, p_val = svm_predict(test_Y, test_X, m, '-b 1')

        labelwise_acc.append(p_acc)
        labelwise_output.append(p_label)

    # evaluation 
    p, r, f = microF1(labelwise_output, test_labels)

    # output
    out_dir = "results/rbf/"
    out_dir = "results/"
    out_file = out_dir + options.product + ".txt"
    cates = list(cate_index.items())
    cates = sorted(cates, key=lambda x:x[1])
    labelwise_acc = [(cates[i][0], labelwise_acc[i][0]) for i in range(n_cates)]
    labelwise_acc = sorted(labelwise_acc, key=lambda x:x[1])
    with open(out_file, 'w') as out:
        out.write("Precision:\t{}\nRecall:\t{}\nF1:\t{}\n".format(p, r, f))
        print("{}\n{}\n{}".format(p, r, f))
        for cate_i in range(n_cates):
            out.write("{}:\t{}\n".format(labelwise_acc[cate_i][0], labelwise_acc[cate_i][1]))