def main(unused_argv):
    if not FLAGS.data_dir:
        raise ValueError("--data_dir is required.")
    if not FLAGS.output_dir:
        raise ValueError("--output_dir is required.")

    encoder = encoder_manager.EncoderManager()

    # Maybe load unidirectional encoder.
    if FLAGS.uni_checkpoint_path:
        print("Loading unidirectional model...")
        uni_config = configuration.model_config()
        encoder.load_model(uni_config, FLAGS.uni_vocab_file,
                           FLAGS.uni_embeddings_file, FLAGS.uni_checkpoint_path)

    # Maybe load bidirectional encoder.
    if FLAGS.bi_checkpoint_path:
        print("Loading bidirectional model...")
        bi_config = configuration.model_config(bidirectional_encoder=True)
        encoder.load_model(bi_config, FLAGS.bi_vocab_file,
                           FLAGS.bi_embeddings_file, FLAGS.bi_checkpoint_path)

    evaluate(encoder, FLAGS.output_dir, evaltest=True, loc=FLAGS.data_dir)

    encoder.close()
def main(unused_argv):
    if not FLAGS.data_dir:
        raise ValueError("--data_dir is required.")

    encoder = encoder_manager.EncoderManager()

    # Maybe load unidirectional encoder.
    if FLAGS.uni_checkpoint_path:
        print("Loading unidirectional model...")
        uni_config = configuration.model_config()
        encoder.load_model(uni_config, FLAGS.uni_vocab_file,
                           FLAGS.uni_embeddings_file, FLAGS.uni_checkpoint_path)

    # Maybe load bidirectional encoder.
    if FLAGS.bi_checkpoint_path:
        print("Loading bidirectional model...")
        bi_config = configuration.model_config(bidirectional_encoder=True)
        encoder.load_model(bi_config, FLAGS.bi_vocab_file,
                           FLAGS.bi_embeddings_file, FLAGS.bi_checkpoint_path)

    if FLAGS.eval_task in ["MR", "CR", "SUBJ", "MPQA"]:
        eval_classification.eval_nested_kfold(
            encoder, FLAGS.eval_task, FLAGS.data_dir, use_nb=False)
    elif FLAGS.eval_task == "SICK":
        eval_sick.evaluate(encoder, evaltest=True, loc=FLAGS.data_dir)
    elif FLAGS.eval_task == "MSRP":
        eval_msrp.evaluate(encoder, evalcv=True, evaltest=True,
                           use_feats=True, loc=FLAGS.data_dir)
    elif FLAGS.eval_task == "TREC":
        eval_trec.evaluate(encoder, evalcv=True, evaltest=True,
                           loc=FLAGS.data_dir)
    else:
        raise ValueError("Unrecognized eval_task: %s" % FLAGS.eval_task)

    encoder.close()
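Both main() functions above read their options from a module-level FLAGS object that the snippets do not show. A minimal sketch of the flag definitions they assume, using the TF 1.x tf.flags API; the flag names match the fields read above, but the defaults and help strings are placeholders:

import tensorflow as tf

# Hypothetical flag definitions; names mirror the FLAGS fields used above.
tf.flags.DEFINE_string("eval_task", "MR", "Evaluation task to run.")
tf.flags.DEFINE_string("data_dir", None, "Directory containing task data.")
tf.flags.DEFINE_string("output_dir", None, "Directory for evaluation output.")
tf.flags.DEFINE_string("uni_vocab_file", None, "Unidirectional vocab file.")
tf.flags.DEFINE_string("uni_embeddings_file", None, "Unidirectional embeddings (.npy).")
tf.flags.DEFINE_string("uni_checkpoint_path", None, "Unidirectional checkpoint.")
tf.flags.DEFINE_string("bi_vocab_file", None, "Bidirectional vocab file.")
tf.flags.DEFINE_string("bi_embeddings_file", None, "Bidirectional embeddings (.npy).")
tf.flags.DEFINE_string("bi_checkpoint_path", None, "Bidirectional checkpoint.")

FLAGS = tf.flags.FLAGS

if __name__ == "__main__":
    tf.app.run()  # parses flags, then dispatches to main(unused_argv)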
def get_encoder():
    # Download and extract the bidirectional model. (shell script)
    # cd models/
    # wget "http://download.tensorflow.org/models/skip_thoughts_bi_2017_02_16.tar.gz"
    # tar -xvf skip_thoughts_bi_2017_02_16.tar.gz
    # rm skip_thoughts_bi_2017_02_16.tar.gz
    # cd ..
    #
    # Set paths to the model.
    pretrained_path = 'models/skip_thoughts_bi_2017_02_16/'
    VOCAB_FILE = os.path.join(pretrained_path, 'vocab.txt')
    EMBEDDING_MATRIX_FILE = os.path.join(pretrained_path, 'embeddings.npy')
    CHECKPOINT_PATH = os.path.join(pretrained_path, 'model.ckpt-500008')

    # Set up the encoder. Here we are using a single bidirectional model.
    # To combine it with a unidirectional model, call load_model() again with
    # configuration.model_config() and paths to that model's files. The
    # encoder will use the concatenation of all loaded models.
    encoder = encoder_manager.EncoderManager()
    encoder.load_model(configuration.model_config(bidirectional_encoder=True),
                       vocabulary_file=VOCAB_FILE,
                       embedding_matrix_file=EMBEDDING_MATRIX_FILE,
                       checkpoint_path=CHECKPOINT_PATH)
    return encoder
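A short usage sketch for get_encoder(); the sentences are placeholders, and encode() accepts a list of plain-text strings and returns one vector per sentence:

import numpy as np

encoder = get_encoder()

# Placeholder sentences for a quick similarity check.
sentences = ["the quick brown fox jumps over the lazy dog.",
             "a quick brown dog leaps over a sleeping fox."]
encodings = encoder.encode(sentences)

# Cosine similarity between the two sentence vectors.
sim = np.dot(encodings[0], encodings[1]) / (
    np.linalg.norm(encodings[0]) * np.linalg.norm(encodings[1]))
print("similarity: %.3f" % sim)

encoder.close()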
def extract_by_skip_thought(sent_list: List[str]):
    """Encode a list of sentences with a pretrained skip-thought model.

    To be compatible with the toolkit, the input must be a list of sentences.

    :param sent_list: list of plain-text sentences to encode.
    :return: list of skip-thought encodings, one per sentence.
    """
    skip_thought_dir = os.path.join('/home/junpeiz/Project/Twitter/data',
                                    'skipThoughts', 'pretrained',
                                    'skip_thoughts_uni_2017_02_02')

    # Set paths to the model.
    VOCAB_FILE = os.path.join(skip_thought_dir, "vocab.txt")
    EMBEDDING_MATRIX_FILE = os.path.join(skip_thought_dir, "embeddings.npy")
    CHECKPOINT_PATH = os.path.join(skip_thought_dir, "model.ckpt-501424")

    # The following directory should contain files rt-polarity.neg and
    # rt-polarity.pos.
    # MR_DATA_DIR = "/dir/containing/mr/data"

    # Set up the encoder. Here we are using a single unidirectional model.
    # To use a bidirectional model as well, call load_model() again with
    # configuration.model_config(bidirectional_encoder=True) and paths to the
    # bidirectional model's files. The encoder will use the concatenation of
    # all loaded models.
    encoder = encoder_manager.EncoderManager()
    encoder.load_model(configuration.model_config(),
                       vocabulary_file=VOCAB_FILE,
                       embedding_matrix_file=EMBEDDING_MATRIX_FILE,
                       checkpoint_path=CHECKPOINT_PATH)
    encoding_list = encoder.encode(sent_list)
    return encoding_list
def __init__(self, use_char=False):
    super(SkipThought, self).__init__()
    self.use_char2vec = use_char
    cur_path = os.path.abspath(os.path.dirname(__file__))

    # Set paths to the model. Note: the pretrained release ships its
    # embedding matrix as embeddings.npy.
    VOCAB_FILE = os.path.join(
        cur_path, "../../models/skip_thoughts_uni_2017_02_02/vocab.txt")
    EMBEDDING_MATRIX_FILE = os.path.join(
        cur_path, "../../models/skip_thoughts_uni_2017_02_02/embeddings.npy")
    CHECKPOINT_PATH = os.path.join(
        cur_path, "../../models/skip_thoughts_uni_2017_02_02/model.ckpt-501424")

    self.encoder = encoder_manager.EncoderManager()
    self.encoder.load_model(configuration.model_config(),
                            vocabulary_file=VOCAB_FILE,
                            embedding_matrix_file=EMBEDDING_MATRIX_FILE,
                            checkpoint_path=CHECKPOINT_PATH)

    if self.use_char2vec:
        PROJ_MODEL_PATH = os.path.join(
            cur_path,
            "../../models/char_word2vec/skip-thought_linear_projection.m")
        self.char_w2v = CharWord2vec()
        # Pickled models must be opened in binary mode.
        with open(PROJ_MODEL_PATH, 'rb') as f:
            self.proj = pickle.load(f)
def load_model(vocab_file, embedding_matrix_file, checkpoint_path,
               bidirectional_encoder):
    encoder = encoder_manager.EncoderManager()
    encoder.load_model(
        configuration.model_config(bidirectional_encoder=bidirectional_encoder),
        vocabulary_file=vocab_file,
        embedding_matrix_file=embedding_matrix_file,
        checkpoint_path=checkpoint_path)
    return encoder
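A usage sketch for this helper; the directory is a placeholder for wherever the pretrained files were extracted, and the file names mirror the standard unidirectional release:

# Hypothetical model location.
MODEL_DIR = "models/skip_thoughts_uni_2017_02_02"
encoder = load_model(
    vocab_file=MODEL_DIR + "/vocab.txt",
    embedding_matrix_file=MODEL_DIR + "/embeddings.npy",
    checkpoint_path=MODEL_DIR + "/model.ckpt-501424",
    bidirectional_encoder=False)
vectors = encoder.encode(["an example sentence to embed."])
encoder.close()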
def setup_encoder():
    VOCAB_FILE = '/data/ryli/kcli/skip-thoughts/pretrained/skip_thoughts_uni_2017_02_02/vocab.txt'
    EMBEDDING_MATRIX_FILE = '/data/ryli/kcli/skip-thoughts/pretrained/skip_thoughts_uni_2017_02_02/embeddings.npy'
    CHECKPOINT_PATH = '/data/ryli/kcli/skip-thoughts/pretrained/skip_thoughts_uni_2017_02_02/model.ckpt-501424'

    encoder = encoder_manager.EncoderManager()
    encoder.load_model(configuration.model_config(),
                       vocabulary_file=VOCAB_FILE,
                       embedding_matrix_file=EMBEDDING_MATRIX_FILE,
                       checkpoint_path=CHECKPOINT_PATH)
    return encoder
def __init__(self, withSVM=False):
    with open(os.getcwd() + '/sampleData.pkl', 'rb') as f:
        [lib, con, neu] = cPickle.load(f)

    self.bias_dict = {}
    for tree in lib:
        sentence = tree.get_words()
        self.bias_dict[sentence] = 1
    for tree in con:
        sentence = tree.get_words()
        self.bias_dict[sentence] = -1
    for tree in neu:
        sentence = tree.get_words()
        self.bias_dict[sentence] = 0

    self.encoder = encoder_manager.EncoderManager()
    self.data_encodings = []
    self.data = self.bias_dict.keys()
    self.blacklist = []
    # f = open('skipthoughts.pkl', 'rb')

    # Right now, we're using a unidirectional skip model;
    # we can try the bidirectional model later.
    dir_path = os.path.dirname(os.path.realpath(__file__))
    VOCAB_FILE = dir_path + "/../data/vocab.txt"
    EMBEDDING_MATRIX_FILE = dir_path + "/../data/embeddings.npy"
    CHECKPOINT_PATH = dir_path + "/../data/model.ckpt-501424"
    self.encoder.load_model(configuration.model_config(),
                            vocabulary_file=VOCAB_FILE,
                            embedding_matrix_file=EMBEDDING_MATRIX_FILE,
                            checkpoint_path=CHECKPOINT_PATH)

    self.sentiment = SentimentIntensityAnalyzer()
    self.clf = None
    self.withSVM = withSVM
    if withSVM:
        print('using the SVM!')
        with open('./svm.pkl', 'rb') as f:
            self.clf = cPickle.load(f)
def main():
    parser = argparse.ArgumentParser(
        description="encoding sentences example for skip_thoughts.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('vocab_file', help="specify the vocab_file")
    parser.add_argument('embedding_matrix_file',
                        help='specify the embedding_matrix_file')
    parser.add_argument('checkpoint_path', help="specify the checkpoint_path")
    parser.add_argument('mr_data_dir', help="specify the mr_data_dir")
    parser.add_argument('--model_name', default="skip_thoughts")
    parser.add_argument('--bidirect', choices=["True", "False"],
                        default="False")
    args = parser.parse_args()
    args.bidirect = (args.bidirect == "True")

    encoder = encoder_manager.EncoderManager(args.model_name)
    encoder.load_model(
        configuration.model_config(bidirectional_encoder=args.bidirect),
        vocabulary_file=args.vocab_file,
        embedding_matrix_file=args.embedding_matrix_file,
        checkpoint_path=args.checkpoint_path)

    data = []
    with open(os.path.join(args.mr_data_dir, 'rt-polarity.neg'), 'rb') as f:
        data.extend([line.decode('latin-1').strip() for line in f])
    with open(os.path.join(args.mr_data_dir, 'rt-polarity.pos'), 'rb') as f:
        data.extend([line.decode('latin-1').strip() for line in f])

    encodings = encoder.encode(data)

    def get_nn(ind, num=10):
        encoding = encodings[ind]
        # Cosine distance of the query encoding against all encodings.
        scores = sd.cdist([encoding], encodings, 'cosine')[0]
        sorted_ids = np.argsort(scores)
        print("Sentence:")
        print("", data[ind])
        print("\nNearest neighbors:")
        for i in range(1, num + 1):
            print(" %d. %s (%.3f)"
                  % (i, data[sorted_ids[i]], scores[sorted_ids[i]]))

    get_nn(0)
def restore_skipthought(model_dir, model_name, skipthought_embedding,
                        skipthought_vocab):
    """Restore a pretrained skip-thought encoder from disk.

    :rtype: encoder_manager.EncoderManager
    :return: an EncoderManager with the pretrained model loaded.
    """
    check_point_path = os.path.join(model_dir, model_name)
    skip_thought_embedding_matrix = os.path.join(model_dir,
                                                 skipthought_embedding)
    skip_thought_vocab = os.path.join(model_dir, skipthought_vocab)

    encoder = encoder_manager.EncoderManager()
    encoder.load_model(configuration.model_config(),
                       vocabulary_file=skip_thought_vocab,
                       embedding_matrix_file=skip_thought_embedding_matrix,
                       checkpoint_path=check_point_path)
    return encoder
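A hypothetical call to this function; the directory is a placeholder, while the file names mirror the standard unidirectional pretrained release:

encoder = restore_skipthought(
    model_dir="pretrained/skip_thoughts_uni_2017_02_02",
    model_name="model.ckpt-501424",
    skipthought_embedding="embeddings.npy",
    skipthought_vocab="vocab.txt")
sentence_vectors = encoder.encode(["an example sentence."])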
def __init__(self, modelPath, checkpointPath):
    """Initialize the skip-thought model.

    Arguments:
        modelPath {str} -- the path to the model directory
        checkpointPath {str} -- the filename of model.ckpt-xxxx
    """
    self.modelPath = modelPath
    self.checkpointPath = os.path.join(modelPath, "..", checkpointPath)
    self.vocabFile = os.path.join(modelPath, "vocab.txt")
    self.embeddingMatrixFile = os.path.join(modelPath, "embeddings.npy")
    self.encoder = encoder_manager.EncoderManager()
    self.encoder.load_model(configuration.model_config(),
                            vocabulary_file=self.vocabFile,
                            embedding_matrix_file=self.embeddingMatrixFile,
                            checkpoint_path=self.checkpointPath)
def __init__(self, withSVM=False):
    with open('sampleData.pkl', 'rb') as f:
        [lib, con, neu] = cPickle.load(f)

    self.bias_dict = {}
    for tree in lib:
        sentence = tree.get_words()
        self.bias_dict[sentence] = 1
    for tree in con:
        sentence = tree.get_words()
        self.bias_dict[sentence] = -1
    for tree in neu:
        sentence = tree.get_words()
        self.bias_dict[sentence] = 0

    self.encoder = encoder_manager.EncoderManager()
    self.data_encodings = []
    self.data = self.bias_dict.keys()
    self.blacklist = []
    # f = open('skipthoughts.pkl', 'rb')

    # Right now, we're using a unidirectional skip model;
    # we can try the bidirectional model later.
    VOCAB_FILE = "/Users/az/Desktop/projects/modemo/backend/modules/tf/skip_thoughts/pretrained/skip_thoughts_uni_2017_02_02/vocab.txt"
    EMBEDDING_MATRIX_FILE = "/Users/az/Desktop/projects/modemo/backend/modules/tf/skip_thoughts/pretrained/skip_thoughts_uni_2017_02_02/embeddings.npy"
    CHECKPOINT_PATH = "/Users/az/Desktop/projects/modemo/backend/modules/tf/skip_thoughts/pretrained/skip_thoughts_uni_2017_02_02/model.ckpt-501424"
    self.encoder.load_model(configuration.model_config(),
                            vocabulary_file=VOCAB_FILE,
                            embedding_matrix_file=EMBEDDING_MATRIX_FILE,
                            checkpoint_path=CHECKPOINT_PATH)

    self.sentiment = SentimentIntensityAnalyzer()
    self.clf = None
    if withSVM:
        print('using the SVM!')
        with open('./svm.pkl', 'rb') as f:
            self.clf = cPickle.load(f)
# Traditional measurements like Levenshtein distance, dynamic time warping,
# Jaro, etc.
print(_generate_log("Average Embedding", ae_sims, sim_names))
print(_generate_log("InferSent", inf_sims, sim_names))
print(_generate_log("SkipThought", st_sims, sim_names))


if __name__ == '__main__':
    # Load in InferSent.
    infersent = torch.load(MODEL_PATH)  # relies on "models.py" as well
    infersent.set_glove_path(GLOVE_PATH)

    # Load in SkipThought.
    config_gpu = tf.ConfigProto()
    config_gpu.gpu_options.allow_growth = True
    with tf.Graph().as_default(), tf.Session(config=config_gpu) as session:
        skipthought = encoder_manager.EncoderManager()
        skipthought.load_model(
            configuration.model_config(bidirectional_encoder=True),
            vocabulary_file=VOCAB_FILE,
            embedding_matrix_file=EMBEDDING_MATRIX_FILE,
            checkpoint_path=CHECKPOINT_PATH)

        # Load in average embedding.
        avg_emb = AverageEmbedder(word_emb_dim=300)
        avg_emb.set_glove_path(GLOVE_PATH)

        IPython.embed()
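Once the models are loaded, the IPython session can compare encoders side by side. A minimal sketch, assuming the InferSent v1 API (build_vocab/encode) and a placeholder sentence pair:

from scipy.spatial.distance import cosine

# Hypothetical sentence pair for a quick similarity check.
pair = ["a man is playing a guitar.", "someone is playing an instrument."]

st_vecs = skipthought.encode(pair)                # SkipThought vectors
infersent.build_vocab(pair, tokenize=True)        # InferSent needs a vocab first
inf_vecs = infersent.encode(pair, tokenize=True)  # InferSent vectors

print("SkipThought sim: %.3f" % (1 - cosine(st_vecs[0], st_vecs[1])))
print("InferSent sim:   %.3f" % (1 - cosine(inf_vecs[0], inf_vecs[1])))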
import numpy as np
import pandas as pd
from skip_thoughts import configuration
from skip_thoughts import encoder_manager
from sklearn.feature_extraction.text import TfidfVectorizer

VOCAB_FILE = ".\\skip_thoughts_bi_2017_02_16\\vocab.txt"
EMBEDDING_MATRIX_FILE = ".\\skip_thoughts_bi_2017_02_16\\embeddings.npy"
CHECKPOINT_PATH = ".\\skip_thoughts_bi_2017_02_16\\model.ckpt-500008"

encoder = encoder_manager.EncoderManager()
encoder.load_model(configuration.model_config(bidirectional_encoder=True),
                   vocabulary_file=VOCAB_FILE,
                   embedding_matrix_file=EMBEDDING_MATRIX_FILE,
                   checkpoint_path=CHECKPOINT_PATH)


def neural_features(dataset_loc):
    english_dataset = pd.read_csv(dataset_loc)
    headline = english_dataset['headline']
    body = english_dataset['content']
    labels = [int(x) for x in english_dataset['label']]
    labels_done = []
    flag = True
    body_encodings = np.zeros((len(body), 2400))
    j = 0
    for i in range(len(body)):
        flag = True
        try:
            current_body_encoding = encoder.encode(body[i:i+1])
        except Exception:
            flag = False