def __init__(self, sentences=None, size=300, alpha=0.025, window=8, min_count=5,
             sample=0, seed=1, workers=1, min_alpha=0.0001, dm=1, hs=1, negative=0,
             dm_mean=0, train_words=True, train_lbls=True, **kwargs):
    """
    Initialize the model from an iterable of `sentences`. Each sentence is a
    LabeledSentence object that will be used for training.

    The `sentences` iterable can be simply a list of LabeledSentence elements, but
    for larger corpora, consider an iterable that streams the sentences directly
    from disk/network.

    If you don't supply `sentences`, the model is left uninitialized -- use this if
    you plan to initialize it in some other way.

    `dm` defines the training algorithm. By default (`dm=1`), distributed memory is
    used. Otherwise, distributed bag of words (`dbow`) is employed.

    `size` is the dimensionality of the feature vectors.

    `window` is the maximum distance between the current and predicted word within
    a sentence.

    `alpha` is the initial learning rate (will linearly drop to zero as training
    progresses).

    `seed` = seed for the random number generator.

    `min_count` = ignore all words with total frequency lower than this.

    `sample` = threshold for configuring which higher-frequency words are randomly
    downsampled; default is 0 (off), useful value is 1e-5.

    `workers` = use this many worker threads to train the model (=faster training
    with multicore machines).

    `hs` = if 1 (default), hierarchical softmax will be used for model training
    (else set to 0).

    `negative` = if > 0, negative sampling will be used; the int for negative
    specifies how many "noise words" should be drawn (usually between 5-20).

    `dm_mean` = if 0 (default), use the sum of the context word vectors; if 1, use
    the mean. Only applies when dm is used.

    `train_words` = if True (default), train the word vectors.

    `train_lbls` = if True (default), train the label (sentence) vectors.

    """
    # dm=1 maps to sg=0 (CBOW-style distributed memory);
    # dm=0 maps to sg=1 (skip-gram-style distributed bag of words)
    Word2Vec.__init__(self, size=size, alpha=alpha, window=window, min_count=min_count,
                      sample=sample, seed=seed, workers=workers, min_alpha=min_alpha,
                      sg=(1 + dm) % 2, hs=hs, negative=negative, cbow_mean=dm_mean,
                      **kwargs)
    self.train_words = train_words
    self.train_lbls = train_lbls
    if sentences is not None:
        self.build_vocab(sentences)
        self.train(sentences)
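# Example usage (a minimal sketch, assuming this `__init__` belongs to the
# gensim-era `Doc2Vec` class, as the LabeledSentence docstring suggests; the
# corpus below is illustrative):
#
#     from gensim.models.doc2vec import Doc2Vec, LabeledSentence
#
#     corpus = [
#         LabeledSentence(words=['the', 'cat', 'sat'], labels=['SENT_0']),
#         LabeledSentence(words=['the', 'dog', 'ran'], labels=['SENT_1']),
#     ]
#     model = Doc2Vec(corpus, size=100, window=8, min_count=1, workers=2)
#     sent_vec = model['SENT_0']  # label vectors live in the same vocabulary as words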
def __init__(self, size=50, alpha=0.1, min_count=1, seed=1, workers=1, iter=1,
             use_gold=0, train_path=None, test_raw_path=None, test_path=None,
             dev_path=None, quick_test=None, dict_path=None, score_script_path=None,
             pre_train=False, uni_path=None, bi_path=None, hybrid_pred=False,
             no_action_feature=False, no_bigram_feature=False, no_unigram_feature=False,
             no_binary_action_feature=False, no_sb_state_feature=False, **kwargs):
    print '\n\n### Initialization of the segmentation model ###'

    # feature-ablation switches
    self.no_action_feature = no_action_feature
    self.no_bigram_feature = no_bigram_feature
    self.no_unigram_feature = no_unigram_feature
    self.no_binary_action_feature = no_binary_action_feature
    self.no_sb_feature = no_sb_state_feature

    self.pre_train = pre_train
    self.l2_rate = 0.001  # rate for L2 regularization
    if self.l2_rate:
        print 'reg with L2, with param=', self.l2_rate
    self.drop_out = False
    self.finger_int = str(r_randint(0, 1000000))
    self.binary_pred = False
    self.hybrid_pred = hybrid_pred
    self.use_gold = use_gold
    # self.model = None

    # special vocabulary symbols for sentence boundaries, labels and OOV
    self.START = "#S#"
    self.END = "#E#"
    self.label0_as_vocab, self.label1_as_vocab, self.unknown_as_vocab = "$LABEL0", "$LABEL1", "$OOV"
    # prefix for unigram/bigram state; no prefix for *char* unigram/bigrams
    self.su_prefix, self.sb_prefix = '$SU', '$SB'
    self.state_varient = ('0', '1')

    self.train_path = train_path
    self.test_raw_path = test_raw_path
    self.test_path = test_path
    self.dev_path = dev_path
    self.quick_test = quick_test
    self.dict_path = dict_path
    self.score_script = score_script_path
    # self.score_script = '../working_data/score'
    # self.dict_path = '../working_data/pku.dict'

    print '\nloading train, test, dev corpus...'
    self.train_corpus = [l.split() for l in codecs.open(self.train_path, 'rU', 'utf-8')]
    self.test_corpus = [l.split() for l in codecs.open(self.test_raw_path, 'rU', 'utf-8')]
    self.dev_corpus = [l.split() for l in codecs.open(self.dev_path, 'rU', 'utf-8')]
    self.quick_test_corpus = [l.split() for l in codecs.open(self.quick_test, 'rU', 'utf-8')]

    Word2Vec.__init__(self, sentences=None, size=size, alpha=alpha, min_count=min_count,
                      seed=seed, workers=workers, iter=iter, **kwargs)

    # the mask selects which of the 12 feature slots are active;
    # each ablation flag truncates it from the right
    self.mask = [1 for i in range(12)]
    if self.no_action_feature:
        self.mask = self.mask[:-3]
        print 'len mask', len(self.mask)
    elif self.no_sb_feature:
        self.mask = self.mask[:-1]
        print 'len mask', len(self.mask)
    if self.no_unigram_feature:
        self.mask = self.mask[:-5]
        print 'len mask', len(self.mask)
    if self.no_bigram_feature:
        self.mask = self.mask[:-4]
        print 'len mask', len(self.mask)

    self.f_factor = sum(self.mask)
    self.f_factor2 = 2
    if self.no_binary_action_feature:
        self.f_factor2 = 0
        print 'f-factor2=', self.f_factor2

    # prediction vector = concatenated embedding features + binary action features
    self.non_fixed_param = self.f_factor * self.layer1_size
    self.pred_size = self.non_fixed_param + self.f_factor2

    if self.drop_out:
        self.dropout_rate = 0.5
        self.dropout_size = int(self.dropout_rate * self.non_fixed_param)
        print 'using drop_out, rate/size=', self.dropout_rate, self.dropout_size

    self.train_mode = False
    self.dev_test_result = []
    print '\nLearning rate=', self.alpha, '; Feature (layer1) size=', self.layer1_size, \
        '; Predicate vec size=', self.pred_size, 'f-factor=', self.f_factor, \
        'f-factor2=', self.f_factor2

    if self.pre_train:
        print '\nloading pre-trained char and char-bigram embeddings'
        self.uni_emb = Word2Vec.load(uni_path)
        emb_normalization(self.uni_emb)
        print 'unigram embedding loaded'
        self.bi_emb = Word2Vec.load(bi_path)
        emb_normalization(self.bi_emb)
        print 'bigram embedding loaded'
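# Worked example of the feature-size arithmetic above, assuming the defaults
# (no ablation flags, so all 12 mask slots stay active) and size=50, i.e.
# layer1_size == 50:
#
#     mask = [1] * 12            # no features ablated
#     f_factor = sum(mask)       # 12
#     f_factor2 = 2              # binary action feature kept
#     non_fixed_param = 12 * 50  # 600
#     pred_size = 600 + 2        # 602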
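# Construction sketch (hedged: the enclosing class is not named in this excerpt,
# so `SegModel` below is a placeholder, and every path except the two taken from
# the commented-out defaults above is illustrative). Note that the constructor
# reads the train/test/dev/quick-test corpora eagerly, so all four paths must
# point at existing UTF-8 files:
#
#     model = SegModel(size=50, alpha=0.1, workers=4,
#                      train_path='../working_data/train.utf8',
#                      test_raw_path='../working_data/test_raw.utf8',
#                      test_path='../working_data/test.utf8',
#                      dev_path='../working_data/dev.utf8',
#                      quick_test='../working_data/quick_test.utf8',
#                      dict_path='../working_data/pku.dict',
#                      score_script_path='../working_data/score',
#                      pre_train=True,
#                      uni_path='../working_data/uni_emb.model',
#                      bi_path='../working_data/bi_emb.model')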