def load_inferSent(sentences):
    """Load a pretrained InferSent v2 encoder and build its vocabulary.

    Args:
        sentences: iterable of raw sentence strings used to build the
            encoder vocabulary (tokenized internally by InferSent).

    Returns:
        The InferSent model, moved to GPU when available, with word
        vectors set and vocabulary built.
    """
    logger.info('load InferSent')
    V = 2  # model version: 1 pairs with GloVe vectors, 2 with fastText
    MODEL_PATH = 'Infersent/encoder/infersent%s.pkl' % V
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    if torch.cuda.is_available():
        infersent.cuda()

    # Select the word-vector file matching the model version.
    if V == 1:
        W2V_PATH = 'Infersent/Glove/glove.840B.300d.txt'
        logger.warning('Use Glove Embedding')
    elif V == 2:
        # BUG FIX: path previously read 'Infersen/...' (missing the final
        # 't'), inconsistent with the 'Infersent/encoder/...' model path.
        W2V_PATH = 'Infersent/fastText/crawl-300d-2M.vec'
        logger.warning('Use fastText Embedding')
    else:
        raise NotImplementedError
    infersent.set_w2v_path(W2V_PATH)

    # Build vocab from the provided sentences.
    infersent.build_vocab(sentences, tokenize=True)

    return infersent
示例#2
0
def get_loaded_model(force_gpu=False, k_most_frequent_words=1000000):
    """Build the InferSent encoder and load the K most frequent word vectors.

    NOTE(review): relies on a module-level ``model_version`` (1 or 2) and a
    ``GPUNotFoundException`` defined elsewhere in this file — confirm.

    Args:
        force_gpu: raise GPUNotFoundException when CUDA is unavailable.
        k_most_frequent_words: vocabulary size loaded into the encoder.

    Returns:
        The ready-to-encode InferSent model (on GPU when CUDA is available).
    """
    model_path = "infersent/encoder/infersent{}.pkl".format(model_version)
    model = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version,
    })
    model.load_state_dict(torch.load(model_path))

    cuda_available = torch.cuda.is_available()
    if force_gpu and not cuda_available:
        raise GPUNotFoundException()
    if cuda_available:
        model = model.cuda()

    # Version 1 ships with GloVe vectors, version 2 with fastText.
    if model_version == 1:
        w2v_path = 'infersent/dataset/GloVe/glove.840B.300d.txt'
    else:
        w2v_path = 'infersent/dataset/fastText/crawl-300d-2M.vec'
    model.set_w2v_path(w2v_path)

    # Load embeddings of K most frequent words.
    model.build_vocab_k_words(K=k_most_frequent_words)
    return model
示例#3
0
def create_embeddings(infer_path, data_path, em_type):
    """Encode YouTube or WikiHow titles with InferSent v1 and save them.

    Args:
        infer_path: root of the InferSent checkout (contains 'encoder/'
            and 'GloVe/').
        data_path: directory where the embedding .npy file is written.
        em_type: 'yt' for YouTube titles, 'wh' for WikiHow titles.

    Raises:
        ValueError: if ``em_type`` is neither 'yt' nor 'wh'.
    """
    yt_titles = yt.get_yt_titles()
    with open("data/whtitles", "r") as f:
        wh_titles = [line.rstrip('\n') for line in f]

    if em_type == "yt":  # Youtube
        save_f = os.path.join(data_path, "yt_embed")
        titles = yt_titles
    elif em_type == "wh":  # Wikihow
        save_f = os.path.join(data_path, "wh_embed")
        titles = wh_titles
    else:
        # BUG FIX: raising a plain string is a TypeError in Python 3;
        # raise a proper exception type instead.
        raise ValueError("Unknown embedding type: {}".format(em_type))

    nltk.download('punkt')  # tokenizer data needed by build_vocab/encode
    V = 1  # InferSent v1 pairs with GloVe word vectors
    MODEL_PATH = os.path.join(infer_path, 'encoder/infersent%s.pkl' % V)
    params_model = {
        'bsize': 256,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    infersent = infersent.cuda()

    W2V_PATH = os.path.join(infer_path, 'GloVe/glove.840B.300d.txt')
    infersent.set_w2v_path(W2V_PATH)

    # Vocabulary covers both corpora so either title set can be encoded.
    infersent.build_vocab(yt_titles + wh_titles, tokenize=True)
    embed = infersent.encode(titles, tokenize=True)
    np.save(save_f, embed)
示例#4
0
def embed_dataset(dataset_path, infersent_path, force_cpu=False):
    """
    To make this work, first run ./get_infersent.sh

    Encodes every utterance in <dataset_path>/train.csv with InferSent v1
    and returns a dict mapping each intent to the mean of its utterance
    embeddings.
    """
    model = InferSent({'bsize': 64, 'word_emb_dim': 300,
                       'enc_lstm_dim': 2048, 'pool_type': 'max',
                       'dpout_model': 0.0, 'version': 1})
    weights_path = infersent_path / "encoder/infersent1.pkl"
    if force_cpu:
        model.load_state_dict(torch.load(weights_path, map_location='cpu'))
    else:
        model.load_state_dict(torch.load(weights_path))
        model.cuda()

    model.set_w2v_path(infersent_path / 'GloVe/glove.840B.300d.txt')
    model.build_vocab_k_words(K=100000)

    rows = read_csv(dataset_path / 'train.csv')[1:]  # drop the header row
    data = defaultdict(list)
    for row in rows:
        # Only the snips CSV layout is supported.
        if 'snips' not in str(dataset_path):
            raise TypeError(
                "Unknown dataset type. Implement your own first. See the "
                "README")
        utterance, labels, delexicalised, intent = row
        data[intent].append(utterance)

    # One averaged embedding per intent.
    vectors = {}
    for done, (intent, sentences) in enumerate(data.items()):
        print('{}/{} done'.format(done, len(data.items())))
        vectors[intent] = np.mean(model.encode(sentences), axis=0)

    return vectors
示例#5
0
文件: apply.py 项目: smit14/SentEval
def apply_logician(s1, s2 , is_list=False, sick_model = False):
    """Score the entailment relation between two batches of sentences.

    Args:
        s1, s2: raw sentence strings when ``is_list`` is False, or lists
            of lists of words when ``is_list`` is True.
        is_list: whether the inputs are already tokenized word lists.
        sick_model: True  -> SICK classifier,
                             output order [Contradiction, Neutral, Entailment];
                    False -> SNLI classifier,
                             output order [Entailment, Neutral, Contradiction].

    Returns:
        Softmax probabilities over the three classes (CUDA tensor).
    """
    # Load the InferSent encoder (module-level V / MODEL_PATH / PATH_TO_W2V).
    model = InferSent({'bsize': 64, 'word_emb_dim': 300,
                       'enc_lstm_dim': 2048, 'pool_type': 'max',
                       'dpout_model': 0.0, 'version': V})
    model.load_state_dict(torch.load(MODEL_PATH))
    model.set_w2v_path(PATH_TO_W2V)

    params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5}
    params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop',
                                     'batch_size': 128, 'tenacity': 3,
                                     'epoch_size': 2}
    params_senteval['infersent'] = model.cuda()

    if not is_list:
        s1 = convert_str2lst(s1)
        s2 = convert_str2lst(s2)
    samples = s1+s2
    params_senteval['batch_size'] = min(128,len(s1))
    params_senteval = utils.dotdict(params_senteval)
    params_senteval.usepytorch = True

    prepare(params_senteval, samples)

    emb_s1 = batcher(params_senteval, s1)
    emb_s2 = batcher(params_senteval, s2)

    # Feature layout must match what each classifier was trained on.
    if sick_model:
        features = np.c_[ np.abs(emb_s1 - emb_s2),emb_s1 * emb_s2]
        checkpoint = torch.load('./saved_sick.pth')
        print('[Contradiction  Neutral  Entailment]')
    else:
        features = np.c_[emb_s1, emb_s2, emb_s1 * emb_s2,
                         np.abs(emb_s1 - emb_s2)]
        checkpoint = torch.load('./saved_snli_augment_ordered.pth')
        print('[ Entailment  Neutral Contradiction ]')

    # Linear probe over the pair features, three output classes.
    clf = nn.Sequential(nn.Linear(features.shape[1], 3),).cuda()
    clf.load_state_dict(checkpoint)

    logits = clf(torch.FloatTensor(features).cuda())
    return nn.Softmax(1)(logits)
示例#6
0
def init_models(vocal_size: int = VOCAB_SIZE):
    """Load the InferSent encoder and its top-``vocal_size`` word vectors.

    Args:
        vocal_size: number of most-frequent words whose embeddings are
            loaded into the vocabulary (defaults to module VOCAB_SIZE).

    Returns:
        The ready-to-encode InferSent model (on GPU when USE_CUDA).
    """
    model = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': VERSION
    })
    model.load_state_dict(torch.load(MODEL_PATH))
    model = model.cuda() if USE_CUDA else model

    model.set_w2v_path(VECTOR_PATH)
    # BUG FIX: the ``vocal_size`` argument was previously ignored — the call
    # always used the module-level VOCAB_SIZE. Default behavior is unchanged.
    model.build_vocab_k_words(K=vocal_size)
    return model
def infersent_embeddings():
    """Encode the ``final_train`` / ``final_test`` text columns with
    InferSent v1 and return the two embedding arrays.

    NOTE(review): relies on module-level DataFrames ``final_train`` and
    ``final_test`` having a 'text' column — confirm against callers.
    """
    root = '/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master'
    sys.path.append(root)
    # Import works once the InferSent repo is on sys.path.
    from models import InferSent

    model_version = 1
    model = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version,
    })
    model.load_state_dict(
        torch.load(root + "/encoder/infersent%s.pkl" % model_version))

    # Keep it on CPU or put it on GPU.
    use_cuda = False
    if use_cuda:
        model = model.cuda()

    # If infersent1 -> use GloVe embeddings. If infersent2 -> use fastText.
    if model_version == 1:
        w2v_path = root + '/glove.840B.300d-003.txt'
    else:
        w2v_path = root + '/fastText/crawl-300d-2M.vec'
    model.set_w2v_path(w2v_path)

    # Load embeddings of the 100k most frequent words.
    model.build_vocab_k_words(K=100000)

    train_data_list = model.encode(final_train['text'].tolist(),
                                   bsize=128, tokenize=False, verbose=True)
    print('nb sentences encoded : {0}'.format(len(train_data_list)))
    test_data_list = model.encode(final_test['text'].tolist(),
                                  bsize=128, tokenize=False, verbose=True)
    print('nb sentences encoded : {0}'.format(len(test_data_list)))
    return train_data_list, test_data_list
def infersent_glove():
    """Return an InferSent v1 encoder configured with GloVe vectors on GPU."""
    version = 1
    weights_path = '/tmp/GloVe/encoder/infersent%s.pkl' % version
    modelg = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': version,
    })
    modelg.load_state_dict(torch.load(weights_path))

    # Keep it on CPU or put it on GPU.
    use_cuda = True
    if use_cuda:
        modelg = modelg.cuda()

    # v1 -> GloVe embeddings (the alternate path would apply to other versions).
    if version == 1:
        w2v_path = '/tmp/GloVe/glove.840B.300d.txt'
    else:
        w2v_path = '/home/ganesh/Quora_dev/tmp/GloVe/glove.840B.300d.txt'
    modelg.set_w2v_path(w2v_path)

    # Load embeddings of the 100k most frequent words.
    modelg.build_vocab_k_words(K=100000)
    return modelg
示例#9
0
    def init_infersent_model(self):
        """Load the InferSent v1 encoder and store it on ``self.model``."""
        version = 1
        weights_path = "encoder/infersent%s.pkl" % version
        model = InferSent({
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': version,
        })
        model.load_state_dict(torch.load(weights_path))

        # Keep it on CPU or put it on GPU.
        use_cuda = False
        if use_cuda:
            model = model.cuda()

        # v1 pairs with GloVe vectors; v2 would pair with fastText.
        w2v_path = ('GloVe/glove.840B.300d.txt'
                    if version == 1 else 'fastText/crawl-300d-2M.vec')
        model.set_w2v_path(w2v_path)

        # Load embeddings of the 100k most frequent words.
        model.build_vocab_k_words(K=100000)
        self.model = model
示例#10
0
}
# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    # Load InferSent model.
    # NOTE(review): V, MODEL_PATH, PATH_TO_W2V, params_senteval, batcher and
    # prepare must be defined earlier in this module — confirm before running.
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))
    model.set_w2v_path(PATH_TO_W2V)

    # SentEval drives the encoder through params_senteval['infersent'].
    params_senteval['infersent'] = model.cuda()

    se = senteval.engine.SE(params_senteval, batcher, prepare)
    # Downstream transfer tasks plus probing tasks to evaluate embeddings on.
    transfer_tasks = [
        'STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'MR', 'CR', 'MPQA',
        'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 'SICKEntailment',
        'SICKRelatedness', 'STSBenchmark', 'Length', 'WordContent', 'Depth',
        'TopConstituents', 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
        'OddManOut', 'CoordinationInversion'
    ]
    results = se.eval(transfer_tasks)
    print(results)
示例#11
0
    args = parser.parse_args()
    print("download: ", args.download)
    print("Model: ", args.model_version)
    print("Makeing cosine vector : ", args.cosine)

    if args.download == True:
        nltk.download('punkt')
        model_version = args.model_version
        MODEL_PATH = "/home1/InferSent/encoder/infersent%s.pickle" % model_version
        params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                        'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
        model = InferSent(params_model)
        model.load_state_dict(torch.load(MODEL_PATH))
        # Keep it on CPU or put it on GPU
        use_cuda = True
        model = model.cuda() if use_cuda else model
        W2V_PATH = '/home1/InferSent/oov_train_model.vec'
        model.set_w2v_path(W2V_PATH)
        # Load embeddings of K most frequent words
        # model.build_vocab_k_words(K=100000)
        model.build_vocab_k_words(K=2051129)  # Extract embedding word .

        # Load test sentences

        train_test = pd.read_csv('/home1/InferSent/testset.csv', header=None, delimiter=",", encoding='UTF-8')
        source_s = train_test[0][1:]
        target_s = train_test[1][1:]
        embeddings_source = model.encode(source_s, bsize=128, tokenize=False, verbose=True)
        print('nb source_s encoded : {0}'.format(len(embeddings_source)))
        embeddings_target = model.encode(target_s, bsize=128, tokenize=False, verbose=True)
        print('nb target_s encoded : {0}'.format(len(embeddings_target)))
示例#12
0
def main():
    """Microblog IR pipeline: embed tweets and queries with InferSent,
    rank tweets per query by cosine score, and write the result file."""
    ranking = dict()  # final per-query rankings

    print("\n CSI 4107 - Microblog information retrieval system \n")
    print("\n Importing Query Files and Documents... \n")

    # {'34952194402811904': 'Save BBC World Service from Savage Cuts http://www.petitionbuzz.com/petitions/savews', ...}
    tweets_dict = importTweets()
    # {1: ['bbc', 'world', 'servic', 'staff', 'cut'], ...}
    queries_dict = importQuery()

    print("\n Importing Done! \n")
    print("\n Initializing InferSent Model... \n")

    # Module-level params_model / MODEL_PATH / USE_CUDA / W2V_PATH.
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    if USE_CUDA:
        infersent = infersent.cuda()
    infersent.set_w2v_path(W2V_PATH)  # pre-trained GloVe vectors

    print("\n InferSent Initialization Done! \n")
    print("\n Building Vocabulary from Tweets... \n")

    # Split the dicts into aligned id/content lists.
    tweets = list(tweets_dict.values())
    tweet_ids = list(tweets_dict.keys())
    queries = list(queries_dict.values())

    # Vocabulary is built from the document side only.
    infersent.build_vocab(tweets, tokenize=False)

    print("\n Vocabulary Completed! \n")
    print("\n Building Document & Query Vectors... \n")

    doc_embeddings = infersent.encode(tweets, bsize=128, tokenize=False,
                                      verbose=True)
    query_embeddings = infersent.encode(queries, bsize=128, tokenize=False,
                                        verbose=True)

    print("\n Building Document & Query Vectors Done! \n")
    print("\n Retrieval and Ranking... \n")

    dranking = dict()  # per-query scores, cleared between queries

    for query_id in range(len(queries)):
        print(dranking)
        # Score every tweet against the current query; embeddings are in
        # the same chronological order as tweet_ids.
        for position, tweet_id in enumerate(tweet_ids):
            dranking[tweet_id] = cosine(doc_embeddings[position],
                                        query_embeddings[query_id])

        # Keep the 1000 best documents, highest score first.
        top_documents = sorted(dranking.items(), key=lambda item: item[1],
                               reverse=True)[:1000]
        ranking[query_id + 1] = dict(top_documents)

        print("Query " + str(query_id) + " Done.")
        dranking.clear()

    resultFileCreation(ranking)

    print("\n Retrieval and Ranking Done! \n")
示例#13
0
import numpy as np
import torch
from numpy import save
from tqdm import tqdm
import pandas as pd
import json
from models import InferSent
import argparse

# Module-level InferSent v2 encoder, loaded once at import time.
V = 2
MODEL_PATH = 'encoder/infersent%s.pkl' % V
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))
model = model.cuda()  # requires a CUDA-capable GPU
W2V_PATH = "./fastText/crawl-300d-2M.vec"  # v2 pairs with fastText vectors
model.set_w2v_path(W2V_PATH)
model.build_vocab_k_words(K=100000)  # load the 100k most frequent words


def infersent_embed_doc(rpath, wpath):
    df = pd.read_csv(rpath,chunksize=1000)
    text = []
    count=0
    for chunk in df:
        text = text + chunk['comment'].tolist()
    error_idx = []
    with open(wpath,'w+') as fw:
        for i in range(0,len(text)):
            try:
示例#14
0
# Module-level InferSent v2 encoder with fastText vectors, moved to GPU.
nlp = spacy.load("en_core_web_sm")
MODEL_PATH = "/home/psrivastava/Intern_Summer/infersent/encoder/infersent2.pkl"
W2V_PATH = "/home/psrivastava/Intern_Summer/infersent/fastText/crawl-300d-2M.vec"
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': 2
}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
infersent.set_w2v_path(W2V_PATH)
use_cuda = True
infersent = infersent.cuda() if use_cuda else infersent


def get_batch_from_dataframe(currentidx):
    """Fetch the next 640-row slice of the module-level ``dfs`` DataFrame.

    NOTE(review): ``title_arr`` is returned but never assigned in this
    function and is not visible at module level in this chunk — this will
    raise NameError unless it is a global defined elsewhere; confirm.
    NOTE(review): ``DataFrame.ix`` is deprecated and removed in modern
    pandas — ``.loc`` / ``.iloc`` is the supported API.
    """

    to_fetch = currentidx + 640  # slice end for the label-based .ix lookup
    abs_arr = dfs.ix[currentidx:to_fetch, 'clean_text'].tolist()
    catg_arr = dfs.ix[currentidx:to_fetch, 'category'].tolist()
    subj_arr = dfs.ix[currentidx:to_fetch, 'set'].tolist()

    # Advance the cursor past the rows just read.
    currentidx = currentidx + 640
    return abs_arr, catg_arr, subj_arr, title_arr, currentidx


def with_stopwords():
    pds = pd.DataFrame(columns=['embds', 'set', 'catg'])
示例#15
0
# setup logger
logger = logging.getLogger(__name__)
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)


# load model (InferSent v2, weights loaded at import time)
V = 2
MODEL_PATH = 'encoder/infersent%s.pkl' % V
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                        'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
if torch.cuda.is_available():
    infersent.cuda()

# set word vector file matching the model version
if V == 1:
    W2V_PATH = 'Glove/glove.840B.300d.txt'
    logger.info('Use Glove Embedding')
elif V ==2 :
    W2V_PATH = 'fastText/crawl-300d-2M.vec'
    logger.info('Use fastText Embedding')
else:
    raise NotImplementedError
infersent.set_w2v_path(W2V_PATH)

# read data
refs = []
with open(args.golden, 'r') as f:
    parser.add_argument('-c', '--cpu', action='store_true',
                        help='Use CPU instead of GPU.')
    parser.add_argument('-b', '--batch-size', type=int, default=64,
                        help='Batch size (default: 64)')
    parser.add_argument('files', nargs='+',
                        help='List of files to extract sentence embeddings')

    args = parser.parse_args()

    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': args.version}
    model = InferSent(params_model)
    model.load_state_dict(torch.load(args.model_path))

    if not args.cpu:
        model = model.cuda()

    model.set_w2v_path(args.w2v_path)

    # Ensure directory
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)

    # Read files and extract features
    for fpath in args.files:
        print('Reading file {}'.format(fpath))
        sents = []
        with open(fpath) as f:
            for line in f:
                line = line.strip()
                assert line, 'Empty line in {}'.format(fpath)
示例#17
0
def main(arguments):
    """Parse CLI options, load an InferSent checkpoint, and run SentEval.

    Results are written via write_results() to --out_dir and mirrored into
    <out_dir>/results.log.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # Logistics
    parser.add_argument("--cuda", help="CUDA id to use", type=int, default=0)
    parser.add_argument("--seed", help="Random seed", type=int, default=19)
    parser.add_argument("--use_pytorch", help="1 to use PyTorch", type=int,
                        default=1)
    parser.add_argument("--out_dir", help="Dir to write preds to", type=str,
                        default='')
    parser.add_argument("--log_file", help="File to log to", type=str)
    parser.add_argument("--load_data", help="0 to read data from scratch",
                        type=int, default=1)

    # Task options
    parser.add_argument("--tasks", type=str,
                        help="Tasks to evaluate on, as a comma separated list")
    parser.add_argument("--max_seq_len", help="Max sequence length",
                        type=int, default=40)

    # Model options
    parser.add_argument("--model_checkpoint", help="Model checkpoint to use",
                        type=str, default='')
    parser.add_argument("--word_vec_file", help="Word vector file to use",
                        type=str)
    parser.add_argument("--batch_size", help="Batch size to use", type=int,
                        default=64)

    # Classifier options
    parser.add_argument("--cls_batch_size", help="Batch size to use",
                        type=int, default=64)

    args = parser.parse_args(arguments)
    logging.basicConfig(format='%(asctime)s : %(message)s',
                        level=logging.DEBUG)
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    # Mirror all log output into <out_dir>/results.log.
    log_file = os.path.join(args.out_dir, "results.log")
    logging.getLogger().addHandler(logging.FileHandler(log_file))
    logging.info(args)

    # SentEval parameters (PATH_TO_DATA is a module-level constant).
    params_senteval = {
        'task_path': PATH_TO_DATA,
        'usepytorch': args.use_pytorch,
        'kfold': 10,
        'max_seq_len': args.max_seq_len,
        'batch_size': args.batch_size,
        'load_data': args.load_data,
        'seed': args.seed,
        'classifier': {
            'nhid': 0,
            'optim': 'rmsprop',
            'batch_size': 128,
            'tenacity': 3,
            'epoch_size': 2,
        },
    }

    # Load the InferSent encoder (module-level V selects the version).
    model = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V,
    })
    model.load_state_dict(torch.load(args.model_checkpoint))
    model.set_w2v_path(args.word_vec_file)
    params_senteval['infersent'] = model.cuda()

    # Do SentEval stuff.
    se = senteval.engine.SE(params_senteval, batcher, prepare)
    results = se.eval(get_tasks(args.tasks))
    write_results(results, args.out_dir)
    logging.info(results)
示例#18
0
}

# model
# Encoder architectures accepted by this script.
encoder_types = [
    'InferSent', 'BLSTMprojEncoder', 'BGRUlastEncoder',
    'InnerAttentionMILAEncoder', 'InnerAttentionYANGEncoder',
    'InnerAttentionNAACLEncoder', 'ConvNetEncoder', 'LSTMEncoder'
]
assert params.encoder_type in encoder_types, "encoder_type must be in " + \
                                             str(encoder_types)

# Pre-trained InferSent v1 encoder, loaded and moved to GPU.
infersent_net = InferSent(config_nli_model)
print(infersent_net)

infersent_net.load_state_dict(torch.load('./encoder/infersent1.pkl'))
infersent_net.cuda()

# Freeze the encoder: no gradients flow into InferSent during training.
for parameters_infer in infersent_net.parameters():
    parameters_infer.requires_grad = False

ae_model = DisEnc.LinearAutoEncoder(params.dis_emb_dim).cuda()

print(ae_model)


def cos_distance(a, b):
    """Cosine distance: 1 minus the cosine similarity of ``a`` and ``b``."""
    similarity = torch.nn.functional.cosine_similarity(a, b)
    return 1. - similarity


def hamming_distance(a, b):
    #return (a-b).abs().sum()
示例#19
0
}
# NOTE(review): params_model, MODEL_PATH, params and get_optimizer come from
# earlier in the file (outside this chunk) — confirm they are defined here.
nli_net = InferSent(params_model)
nli_net.load_state_dict(torch.load(MODEL_PATH))
print(nli_net)

# loss: uniformly weighted cross-entropy, summed (not averaged) over the batch
weight = torch.FloatTensor(params.n_classes).fill_(1)
loss_fn = nn.CrossEntropyLoss(weight=weight)
loss_fn.size_average = False

# optimizer
optim_fn, optim_params = get_optimizer(params.optimizer)
optimizer = optim_fn(nli_net.parameters(), **optim_params)

# cuda by default
nli_net.cuda()
loss_fn.cuda()
"""
TRAIN
"""
# Training state shared with evaluate() below via globals.
val_acc_best = -1e10
adam_stop = False
stop_training = False
lr = optim_params['lr'] if 'sgd' in params.optimizer else None


def evaluate(epoch, eval_type='valid', final_eval=False):
    nli_net.eval()
    correct = 0.
    global val_acc_best, lr, stop_training, adam_stop
示例#20
0
def main():
    """Train/evaluate a simple classifier on top of a frozen InferSent encoder.

    All settings come from get_args(); predictions and metrics are written
    into args.output_dir.
    """
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    args = get_args()
    print_args(args)
    # One-time environment setup: CUDA, RNG seeds, batch size, output dir.
    device, n_gpu = initialization.init_cuda_from_args(args, logger=logger)
    initialization.init_seed(args, n_gpu=n_gpu, logger=logger)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)
    initialization.save_args(args)
    task = get_task(args.task_name, args.data_dir)
    use_cuda = False if args.no_cuda else True
    verbose = args.verbose

    # model config (shared by the InferSent encoder and the classifier)
    config = {
        'word_emb_dim': args.word_emb_dim,
        'enc_lstm_dim': args.enc_lstm_dim,
        'n_enc_layers': args.n_enc_layers,
        'dpout_model': args.dpout_model,
        'dpout_fc': args.dpout_fc,
        'fc_dim': args.fc_dim,
        'bsize': args.batch_size,
        'n_classes': args.n_classes,
        'pool_type': args.pool_type,
        'nonlinear_fc': args.nonlinear_fc,
        'use_cuda': use_cuda,
        'version': args.model_version,
        'dropout_prob': args.dropout_prob,
    }

    # load model (the InferSent encoder providing sentence features)
    if verbose:
        print('loading model...')
    model = InferSent(config)
    model.load_state_dict(torch.load(args.model_path))
    model = model.cuda() if not args.no_cuda else model
    model.set_w2v_path(args.word_emb_path)
    model.build_vocab_k_words(K=args.k_freq_words, verbose=verbose)

    # load classifier (trained on top of the encoder outputs)
    classifier = SimpleClassifier(config)
    classifier = classifier.cuda() if not args.no_cuda else classifier

    # get train examples
    train_examples = task.get_train_examples()
    # calculate t_total (total optimization steps)
    t_total = initialization.get_opt_train_steps(len(train_examples), args)

    # build optimizer.
    optimizer = optim.SGD(classifier.parameters(), lr=0.001, momentum=0.9)

    # create running parameters
    r_params = RunnerParameters(
        local_rank=args.local_rank,
        n_gpu=n_gpu,
        learning_rate=5e-5,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        t_total=t_total,
        warmup_proportion=args.warmup_proportion,
        num_train_epochs=args.num_train_epochs,
        train_batch_size=args.train_batch_size,
        eval_batch_size=args.eval_batch_size,
        verbose=verbose)

    # create runner class for training and evaluation tasks.
    runner = GlueTaskClassifierRunner(encoder_model=model,
                                      classifier_model=classifier,
                                      optimizer=optimizer,
                                      label_list=task.get_labels(),
                                      device=device,
                                      rparams=r_params)

    if args.do_train:
        runner.run_train_classifier(train_examples)

    if args.do_val:
        val_examples = task.get_dev_examples()
        results = runner.run_val(val_examples,
                                 task_name=task.name,
                                 verbose=verbose)

        df = pd.DataFrame(results["logits"])
        df.to_csv(os.path.join(args.output_dir, "val_preds.csv"),
                  header=False,
                  index=False)
        metrics_str = json.dumps(
            {
                "loss": results["loss"],
                "metrics": results["metrics"]
            }, indent=2)
        print(metrics_str)
        with open(os.path.join(args.output_dir, "val_metrics.json"), "w") as f:
            f.write(metrics_str)

        # HACK for MNLI-mismatched
        if task.name == "mnli":
            mm_val_example = MnliMismatchedProcessor().get_dev_examples(
                task.data_dir)
            mm_results = runner.run_val(mm_val_example,
                                        task_name=task.name,
                                        verbose=verbose)

            # NOTE(review): writes results["logits"] (matched split), not
            # mm_results["logits"] — looks like a copy-paste bug; confirm.
            df = pd.DataFrame(results["logits"])
            df.to_csv(os.path.join(args.output_dir, "mm_val_preds.csv"),
                      header=False,
                      index=False)
            combined_metrics = {}
            for k, v in results["metrics"].items():
                combined_metrics[k] = v
            for k, v in mm_results["metrics"].items():
                combined_metrics["mm-" + k] = v
            combined_metrics_str = json.dumps(
                {
                    "loss": results["loss"],
                    "metrics": combined_metrics,
                },
                indent=2)
            print(combined_metrics_str)
            # NOTE(review): overwrites the val_metrics.json written above —
            # possibly meant "mm_val_metrics.json"; confirm.
            with open(os.path.join(args.output_dir, "val_metrics.json"),
                      "w") as f:
                f.write(combined_metrics_str)
# In[113]:


from random import randint
import numpy as np
import torch
from models import InferSent
# Notebook-level InferSent v1 encoder on CPU with GloVe word vectors.
model_version = 1
MODEL_PATH = "/home/anuja/Desktop/BE project/Models/InferSent/infersent1.pkl"
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
infermodel = InferSent(params_model)
infermodel.load_state_dict(torch.load(MODEL_PATH))
use_cuda = False
infermodel = infermodel.cuda() if use_cuda else infermodel
W2V_PATH = '/home/anuja/Desktop/BE project/glove.6B/glove.840B.300d.txt'
#replace with glove.840B.300d.txt
infermodel.set_w2v_path(W2V_PATH)
infermodel.build_vocab_k_words(K=100000)  # 100k most frequent words


# In[114]:


# Accumulator state for the notebook processing loop that follows.
# NOTE(review): folder_path and preprocessing come from earlier cells not
# visible in this chunk — confirm they are defined before this point.
df = pd.DataFrame(columns=['body','replier', 'thread_no','embeddings'])
folder = glob.glob(folder_path)
th_no = 0
obj = preprocessing.preprocess()
cnt = 0
count_file = 0
示例#22
0
def main():
    """Embed the train/dev/test splits of a GLUE-style task with InferSent.

    Uses module-level settings (output_dir, task_name, dataset_path, config,
    model_path, word_emb_path) and the get_task / get_dataloader /
    run_encoding helpers defined elsewhere in this file.
    """
    init_output_dir(output_dir)

    # prepare dataset
    task = get_task(task_name, dataset_path)
    label_map = {label: idx for idx, label in enumerate(task.get_labels())}

    print("loading raw data ... ")
    train_examples = task.get_train_examples()
    val_examples = task.get_dev_examples()
    test_examples = task.get_test_examples()

    print("converting to data loader ... ")
    train_loader = get_dataloader(train_examples, label_map)
    val_loader = get_dataloader(val_examples, label_map)
    test_loader = get_dataloader(test_examples, label_map)

    # load the InferSent encoder
    print("loading model ... ")
    model = InferSent(config)
    model.load_state_dict(torch.load(model_path))
    if config['use_cuda']:
        model = model.cuda()
    model.set_w2v_path(word_emb_path)
    print("building model vocabs ... ")
    model.build_vocab_k_words(K=100000, verbose=True)

    # encode each split in turn
    print("Run embedding for train set")
    for _ in trange(1, desc="Epoch"):
        run_encoding(loader=train_loader, model=model, mode='train')

    print("Run embedding for dev set")
    for _ in trange(1, desc="Epoch"):
        run_encoding(loader=val_loader, model=model, mode='dev')

    print("Run embedding for test set")
    for _ in trange(1, desc="Epoch"):
        run_encoding(loader=test_loader, model=model, mode='test')

    # HACK FOR MNLI mis-matched: also embed the extra "mismatched" splits
    if task_name == 'mnli':
        print("Run Embedding for MNLI Mis-Matched Datasets")
        print("loading raw data ... ")
        mm_val_example = MnliMismatchedProcessor().get_dev_examples(dataset_path)
        mm_test_examples = MnliMismatchedProcessor().get_test_examples(dataset_path)
        print("converting to data loader ... ")
        mm_val_loader = get_dataloader(mm_val_example, label_map)
        mm_test_loader = get_dataloader(mm_test_examples, label_map)

        print("Run embedding for mm_dev set")
        for _ in trange(1, desc="Epoch"):
            run_encoding(loader=mm_val_loader, model=model, mode='mm_dev')

        print("Run embedding for test set")
        for _ in trange(1, desc="Epoch"):
            run_encoding(loader=mm_test_loader, model=model, mode='mm_test')
# Module-level InferSent v2 encoder (fastText vectors); the vocabulary is
# built before the model is moved to GPU.
V = 2
MODEL_PATH = 'encoder/infersent%s.pkl' % V
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': V
}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)
infersent.build_vocab_k_words(K=100000)  # 100k most frequent word vectors
infersent = infersent.cuda()


class PoetryDataset(Dataset):
    def __init__(self, data_dir, split, create_data, **kwargs):

        super().__init__()
        self.data_dir = data_dir
        self.split = split
        self.max_sequence_length = kwargs.get('max_sequence_length', 50)
        self.min_occ = kwargs.get('min_occ', 3)

        self.raw_data_path = os.path.join(data_dir, 'poems.csv')
        self.data_file = 'poems.{}.json'.format(self.split)
        self.vocab_file = 'poems.vocab.json'
        self.categories = [['love', 'relationships', 'marriage'],