Exemplo n.º 1
0
from arch.Word2Vec import Word2Vec
from pipe.ConvertWordIds import convertWordIds
from pipe.DownSample import DownSample
from pipe.createInputTasks import createW2VInputTasks
from pipe.ContextWindows import contextWindow
from model.model import Model
from tools.word2vec import save
from tools.worddict import buildvocab

# CBOW is selected with cbow=1. No `negative` value is passed, so Hierarchical
# Softmax is used (it is the default unless negative is set higher than 0).
if __name__ == "__main__":
    settings = {
        "alpha": 0.05,
        "vectorsize": 100,
        "input": "data/text8",
        "inputrange": None,  # None means the whole input
        "build": [buildvocab],
        "pipeline": [
            createW2VInputTasks,
            convertWordIds,
            DownSample,
            contextWindow,
            Word2Vec,
        ],
        "mintf": 5,
        "cores": 2,
        "threads": 3,
        "windowsize": 5,
        "iterations": 1,
        "downsample": 0.001,
        "cbow": 1,
    }

    # Build the model from the settings, run it, then write the vectors out
    # in binary form.
    m = Model(**settings)
    m.run()
    save("results/vectors.cbhs.bin", m, binary=True)

Exemplo n.º 2
0
    parser.add_argument('--save-dir', type=str, default=None)
    parser.add_argument('--dataset', type=str, default=None)

    # training params
    parser.add_argument('--min-count', type=int, default=1)
    parser.add_argument('--window-size', type=int, default=5)
    parser.add_argument('--n-negs', type=int, default=5)
    parser.add_argument('--method', type=str, default='normal')
    parser.add_argument('--epoch', type=int, default=5)

    # emb dim
    parser.add_argument('--emb-dim', type=int, default=200)

    args = parser.parse_args()

    return args


# Read the command-line options, then build, run and save a model from them.
args = get_args()

# `reg` is 1 only when the 'reg' method was requested; otherwise it stays 0.
reg = 1 if args.method == 'reg' else 0

# NOTE(review): mintf is fixed at 5 here, so the --min-count option parsed
# above does not appear to be used — confirm whether that is intended.
m = Model(
    alpha=0.025,
    vectorsize=args.emb_dim,
    input=args.data_path,
    inputrange=None,
    build=[buildvocab],
    pipeline=[
        createW2VInputTasks,
        convertWordIds,
        DownSample,
        contextWindow,
        Word2Vec,
    ],
    mintf=5,
    cores=1,
    threads=1,
    windowsize=args.window_size,
    downsample=0.001,
    iterations=args.epoch,
    negative=args.n_negs,
    reg=reg,
    method=args.method,
)
m.run()

# Vectors are written in non-binary form to the location given by --save-dir.
save(args.save_dir, m, binary=False)
Exemplo n.º 3
0
from arch.Word2Vec import Word2Vec
from pipe.ConvertWordIds import convertWordIds
from model.model import Model
from pipe.DownSample import DownSample
from pipe.createInputTasks import createW2VInputTasks
from tools.word2vec import save
from tools.worddict import buildvocab
from pipe.ContextWindows import contextWindow

# Word2Vec uses Skipgram by default. Setting negative > 0 (here negative=5)
# switches from Hierarchical Softmax to negative sampling.
if __name__ == "__main__":
    settings = {
        "alpha": 0.025,
        "vectorsize": 100,
        "input": "data/text8",
        "inputrange": None,  # None means the whole input
        "build": [buildvocab],
        "pipeline": [
            createW2VInputTasks,
            convertWordIds,
            DownSample,
            contextWindow,
            Word2Vec,
        ],
        "mintf": 5,
        "cores": 2,
        "threads": 3,
        "windowsize": 5,
        "downsample": 0.001,
        "iterations": 1,
        "negative": 5,
    }

    # Build the model from the settings, run it, then write the vectors out
    # in binary form.
    m = Model(**settings)
    m.run()
    save("results/vectors.sgns.bin", m, binary=True)