from arch.Word2Vec import Word2Vec
from pipe.ConvertWordIds import convertWordIds
from pipe.DownSample import DownSample
from pipe.createInputTasks import createW2VInputTasks
from pipe.ContextWindows import contextWindow
from model.model import Model
from tools.word2vec import save
from tools.worddict import buildvocab

# Trains CBOW embeddings on data/text8 and writes them to
# results/vectors.cbhs.bin. Set cbow=1 to use CBOW; Hierarchical Softmax is
# used by default when negative is not set (higher than 0).
if __name__ == "__main__":
    model = Model(
        alpha=0.05,
        vectorsize=100,
        input="data/text8",
        inputrange=None,  # None means process the whole input
        build=[buildvocab],
        pipeline=[
            createW2VInputTasks,
            convertWordIds,
            DownSample,
            contextWindow,
            Word2Vec,
        ],
        mintf=5,
        cores=2,
        threads=3,
        windowsize=5,
        iterations=1,
        downsample=0.001,
        cbow=1,
    )
    model.run()
    save("results/vectors.cbhs.bin", model, binary=True)
def get_args(argv=None):
    """Parse command-line options for a word2vec training run.

    :param argv: optional list of argument strings to parse; ``None`` (the
        default) parses ``sys.argv``, so existing callers are unaffected.
    :return: the parsed ``argparse.Namespace``.
    """
    # Local import: this chunk's top-level import block is outside the
    # visible view, so the dependency is brought into scope here.
    import argparse

    # NOTE(review): the 'def get_args' header and the ArgumentParser
    # construction were truncated before this view; they are reconstructed
    # minimally here -- confirm against the full file.
    parser = argparse.ArgumentParser()
    parser.add_argument('--save-dir', type=str, default=None)
    parser.add_argument('--dataset', type=str, default=None)
    # training params
    parser.add_argument('--min-count', type=int, default=1)
    parser.add_argument('--window-size', type=int, default=5)
    parser.add_argument('--n-negs', type=int, default=5)
    parser.add_argument('--method', type=str, default='normal')
    parser.add_argument('--epoch', type=int, default=5)
    # emb dim
    parser.add_argument('--emb-dim', type=int, default=200)
    args = parser.parse_args(argv)
    return args


# Guarded entry point, consistent with the sibling training scripts.
if __name__ == "__main__":
    args = get_args()
    # reg is a 0/1 flag passed to the model, derived from --method.
    reg = 1 if args.method == 'reg' else 0
    # BUG FIX: the parser defines --dataset, so the namespace attribute is
    # args.dataset; the original read the nonexistent args.data_path and
    # raised AttributeError at runtime.
    # NOTE(review): --min-count is parsed but never used -- mintf stays
    # hard-coded at 5 to preserve existing behavior; confirm intent.
    m = Model(
        alpha=0.025,
        vectorsize=args.emb_dim,
        input=args.dataset,
        inputrange=None,
        build=[buildvocab],
        pipeline=[
            createW2VInputTasks,
            convertWordIds,
            DownSample,
            contextWindow,
            Word2Vec,
        ],
        mintf=5,
        cores=1,
        threads=1,
        windowsize=args.window_size,
        downsample=0.001,
        iterations=args.epoch,
        negative=args.n_negs,
        reg=reg,
        method=args.method,
    )
    m.run()
    # NOTE(review): --save-dir defaults to None; save() will likely fail if
    # the flag is omitted -- confirm whether a default path is intended.
    save(args.save_dir, m, binary=False)
from arch.Word2Vec import Word2Vec
from pipe.ConvertWordIds import convertWordIds
from model.model import Model
from pipe.DownSample import DownSample
from pipe.createInputTasks import createW2VInputTasks
from tools.word2vec import save
from tools.worddict import buildvocab
from pipe.ContextWindows import contextWindow

# Trains skip-gram embeddings on data/text8 and writes them to
# results/vectors.sgns.bin. Word2Vec uses Skipgram by default; setting
# negative > 0 selects negative sampling instead of Hierarchical Softmax.
if __name__ == "__main__":
    model = Model(
        alpha=0.025,
        vectorsize=100,
        input="data/text8",
        inputrange=None,  # None means process the whole input
        build=[buildvocab],
        pipeline=[
            createW2VInputTasks,
            convertWordIds,
            DownSample,
            contextWindow,
            Word2Vec,
        ],
        mintf=5,
        cores=2,
        threads=3,
        windowsize=5,
        downsample=0.001,
        iterations=1,
        negative=5,
    )
    model.run()
    save("results/vectors.sgns.bin", model, binary=True)