loader = klass(data_params["path"]) data_args = data_params["args"] load_args = data_args.get("load", {}) data = loader.load_data(**load_args) # test all vector models for embedder_model in data_args["models"]: # identify prebuilt model if exists if isinstance(embedder_model, dict): # initialize word vector embedder embedder_model, prebuilt_model_params = embedder_model.items().pop() prebuilt_path_model = prebuilt_model_params.get("model", None) model_args = prebuilt_model_params.get("args", {}) embedder = WordVectorEmbedder(embedder_model, model_fullpath=prebuilt_path_model, model_args=model_args) # update embedder parameters if prebuilt_path_model: model_path_dir, model_path_filename, model_path_filext = WordVectorBuilder.filename_components( prebuilt_path_model ) embedder.model_subset = model_path_filename # training data (custom or default) if prebuilt_model_params.get("train", None): prebuilt_path_train = prebuilt_model_params.get("train") else: prebuilt_path_train = WordVectorBuilder.filename_train(prebuilt_path_model) with open(prebuilt_path_train, "rb") as f: data_train = pickle.load(f)
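# ---------------------------------------------------------------------------
# Illustrative sketch (not taken from the source): one possible shape of the
# data_params entry consumed by the loop above.  Only the key names --
# "path", "args", "load", "models", "model", "train" -- come from the code;
# every path and filename below is a hypothetical placeholder.
# ---------------------------------------------------------------------------
example_data_params = {
    "path": "/data/sentiment140.csv",                   # hypothetical corpus location
    "args": {
        "load": {},                                     # kwargs forwarded to loader.load_data()
        "models": [
            "word2vec",                                 # bare model name
            {                                           # dict form: prebuilt model with parameters
                "glove": {
                    "model": "/models/custom_glove.bin",        # hypothetical prebuilt model file
                    "args": {},                                 # extra model_args for WordVectorEmbedder
                    "train": "/models/custom_glove_train.pkl",  # optional pickled training sample
                },
            },
        ],
    },
}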
def main():
    model_defaults = {
        'imdb': {'data_filename': "",
                 'hdf5_name': "imdb_split.hd5"},
        'amazon': {'data_filename': "reviews_Health_and_Personal_Care.json.gz",
                   'hdf5_name': "health_personal_split.hd5"},
        'sentiment140': {'data_filename': "sentiment140.csv",
                         'hdf5_name': "sentiment140_split.hd5"},
        'open_weiboscope': {'data_filename': "",
                            'hdf5_name': "open_weiboscope.hd5"},
    }

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("dataset",
        help="Name of dataset (one of amazon, imdb, sentiment140, open_weiboscope)")
    arg_parser.add_argument("--working_dir", "-w", default=".",
        help="Directory where data and results should be put, default PWD.")
    #arg_parser.add_argument("embedding", choices=('glove','word2vec'), required=False)
    vector_group = arg_parser.add_mutually_exclusive_group()
    vector_group.add_argument("--glove", nargs=1, metavar="LOCATION",
        help="Use GloVe vectors, object at this path")
    vector_group.add_argument("--word2vec", nargs=1, metavar="LOCATION",
        help="Use word2vec vectors, object at this path")
    arg_parser.add_argument("--results_dir", "-r", default=None,
        help="Custom subfolder to store results and weights in (defaults to dataset)")
    arg_parser.add_argument("--data_path", "-d", default=None,
        help="Custom path to original data, partially overrides working_dir")
    model_types = arg_parser.add_mutually_exclusive_group()
    model_types.add_argument("--cnn", default=True, action="store_true",
        help="Use convolutional model (default)")
    model_types.add_argument("--lstm", action="store_true", help="Use LSTM")
    arg_parser.add_argument("--hdf5_path", "-5", default=None,
        help="Custom path to split data in HDF5")
    arg_parser.add_argument("--weights_path", default=None,
        help="Path to weights to initialize model with")
    arg_parser.add_argument("--gpu_id", "-g", default=0, type=int,
        help="GPU device ID (integer)")
    arg_parser.add_argument("--learning_rate", default=0.01, type=float,
        help="Learning rate, default 0.01")
    arg_parser.add_argument("--momentum_coef", default=0.9, type=float,
        help="Momentum coefficient, default 0.9")
    arg_parser.add_argument("--batch_size", default=128, type=int,
        help="Batch size")
    arg_parser.add_argument("--nframes", default=256, type=int,
        help="Frame buffer size for CREPE. 256 or 1024.")
    arg_parser.add_argument("--rng_seed", default=None, type=float,
        help="Random number seed")
    arg_parser.add_argument("--do_evals", default=False, action="store_true")
    arg_parser.add_argument("--log_level", default=logging.INFO, type=int)
    args = arg_parser.parse_args()

    logging.getLogger().setLevel(args.log_level)
    dataset_name = args.dataset

    # resolve paths relative to the working directory
    args.working_dir = os.path.abspath(args.working_dir)
    if not args.results_dir:
        args.results_dir = dataset_name
    args.results_dir = os.path.join(args.working_dir, args.results_dir)
    if not args.data_path:
        args.data_path = os.path.join(args.working_dir,
                                      model_defaults[dataset_name]['data_filename'])
    if not args.hdf5_path:
        args.hdf5_path = os.path.join(args.working_dir,
                                      model_defaults[dataset_name]['hdf5_name'])

    # dataset-specific arguments to do_model
    model_args = {
        'sentiment140': {
            'min_length': 70,
            'max_length': 150,
            'normalizer_fun': normalize_tweet,
            'variant': 'tweet_character',
        },
        'imdb': {
            'normalizer_fun': normalize_imdb,
        },
        'amazon': {
            'normalizer_fun': data_utils.normalize,
        },
        'open_weiboscope': {
            'normalizer_fun': data_utils.normalize,
            'balance_labels': True,
            'max_records': 2e6,
        },
    }

    # use 50 words in embedding-based models of microblogs,
    # otherwise use 99 words for other embedding-based models
    if dataset_name in ('sentiment140', 'open_weiboscope'):
        embedding_nr_words = 50
    else:
        embedding_nr_words = 99

    # LSTM or CNN; --cnn defaults to True, so check --lstm first
    # and fall back to the convolutional model otherwise
    if args.lstm:
        model_args[dataset_name]['model_type'] = 'lstm'
        # for character-based LSTM, re-reverse data
        if not (args.glove or args.word2vec):
            model_args[dataset_name]['transformer_fun'] = reverse_one_hot
        # set default hidden size (overridden for embedding-based models below)
        model_args[dataset_name]['hidden_size'] = 10
    else:
        model_args[dataset_name]['model_type'] = 'cnn'
        # character-based by default (overridden for embedding-based, below)
        model_args[dataset_name]['transformer_fun'] = data_utils.to_one_hot
    model_args[dataset_name]['nframes'] = args.nframes

    # parameters for embedding-based models
    if args.glove or args.word2vec:
        model_args[dataset_name]['sequence_length'] = embedding_nr_words
        model_args[dataset_name]['crepe_variant'] = 'embedding{}'.format(embedding_nr_words)
        model_args[dataset_name]['hidden_size'] = 200
    if args.glove:
        glove_path = os.path.abspath(args.glove[0])
        # download the pretrained vectors if they are not already on disk
        if not os.path.isfile(glove_path):
            model_downloader = ModelDownloader('glove')
            glove_url = model_downloader.data_location['twitter-2b']['url']
            glove_dir = os.path.dirname(glove_path)
            glove_file = os.path.basename(glove_path)
            model_downloader.download_and_save_vectors(glove_url, glove_dir, glove_file)
        glove_embedder = WordVectorEmbedder("glove", glove_path)
        model_args[dataset_name]['transformer_fun'] = \
            lambda x: glove_embedder.embed_words_into_vectors(
                transform_for_vectors(x), embedding_nr_words)
        model_args[dataset_name]['vocab_size'] = 200
    if args.word2vec:
        w2v_embedder = WordVectorEmbedder("word2vec", os.path.abspath(args.word2vec[0]))
        model_args[dataset_name]['transformer_fun'] = \
            lambda x: w2v_embedder.embed_words_into_vectors(
                transform_for_vectors(x), embedding_nr_words)
        model_args[dataset_name]['vocab_size'] = 300

    try:
        logger.debug(model_args[dataset_name]['normalizer_fun'])
    except KeyError:
        logger.debug("No custom normalization fn specified")

    do_model(dataset_name,
             args.working_dir,
             args.results_dir,
             args.data_path,
             args.hdf5_path,
             gpu_id=args.gpu_id,
             learning_rate=args.learning_rate,
             momentum_coef=args.momentum_coef,
             batch_size=args.batch_size,
             rng_seed=args.rng_seed,
             **model_args[dataset_name])

    if args.do_evals:
        do_evaluations(args.results_dir)
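# ---------------------------------------------------------------------------
# Worked example (illustration only): the keyword arguments main() assembles
# for do_model() when run for, e.g., the sentiment140 dataset with --glove and
# the default CNN model.  Every value below is traced from the defaults set in
# main() above; nothing here adds new configuration.
#
#   min_length=70, max_length=150, normalizer_fun=normalize_tweet,
#   variant='tweet_character',            # sentiment140 dataset defaults
#   model_type='cnn',                     # CNN is the default model type
#   transformer_fun=<GloVe lambda>,       # words -> GloVe vectors, 50 words
#   nframes=256,                          # --nframes default
#   sequence_length=50,                   # embedding_nr_words for microblogs
#   crepe_variant='embedding50',
#   hidden_size=200, vocab_size=200,      # GloVe vector dimensionality
#
# plus gpu_id=0, learning_rate=0.01, momentum_coef=0.9, batch_size=128, and
# rng_seed=None passed through from the command-line defaults.
# ---------------------------------------------------------------------------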
loader = klass(data_params['path'])
data_args = data_params['args']
load_args = data_args.get('load', {})
data = loader.load_data(**load_args)

# test all vector models
for embedder_model in data_args['models']:

    # identify prebuilt model if exists
    prebuilt_path_model = None
    if isinstance(embedder_model, dict):
        embedder_model, prebuilt_model_params = embedder_model.items().pop()
        prebuilt_path_model = prebuilt_model_params.get('model')

    # initialize word vector embedder
    embedder = WordVectorEmbedder(embedder_model, prebuilt_path_model)

    # load pre-sampled data from disk
    if prebuilt_path_model:

        # training data (custom or default)
        if prebuilt_model_params.get('train', None):
            prebuilt_path_train = prebuilt_model_params.get('train')
        else:
            prebuilt_path_train = WordVectorBuilder.filename_train(prebuilt_path_model)

        # testing data (custom or default)
        if prebuilt_model_params.get('test', None):
            prebuilt_path_test = prebuilt_model_params.get('test')
        else:
            prebuilt_path_test = WordVectorBuilder.filename_test(prebuilt_path_model)
for data_source, data_params in datasets.iteritems():

    # prepare data loader
    klass = data_params['class']
    loader = klass(data_params['path'])
    data_args = data_params['args']
    data = loader.load_data()

    # initialize lists (will be converted later into numpy arrays)
    values = []
    labels = []

    # initialize vector embedder
    prebuilt_model_path = data_args.get('models', {}) \
                                   .get(embedder_model, {}) \
                                   .get('prebuilt_model_path', None)
    embedder = WordVectorEmbedder(embedder_model, prebuilt_model_path)

    # load pre-sampled data from disk
    if prebuilt_model_path:
        with open(WordVectorBuilder.filename_train(prebuilt_model_path), 'rb') as f:
            data = pickle.load(f)
    else:
        # get equal-sized subsets of each class
        min_samples = data_args.get('min_samples')
        data_sampler = DataSampler(klass, file_path=data_params['path'], num_classes=2)
        data = data_sampler.sample_balanced(min_samples)
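# ---------------------------------------------------------------------------
# Hypothetical sketch of the `datasets` mapping iterated above.  The loader
# class and paths are placeholders; only the key names ('class', 'path',
# 'args', 'min_samples', 'models', 'prebuilt_model_path') are taken from the
# loop itself.
#
# datasets = {
#     'sentiment140': {
#         'class': Sentiment140Loader,          # hypothetical loader class
#         'path': '/data/sentiment140.csv',
#         'args': {
#             'min_samples': 100000,            # optional balanced-subset size
#             'models': {
#                 'word2vec': {'prebuilt_model_path': '/models/w2v.bin'},
#             },
#         },
#     },
# }
# ---------------------------------------------------------------------------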