예제 #1
0
def get_loaded_model(force_gpu=False, k_most_frequent_words=1000000):
    """Build an InferSent encoder, load its weights and build its vocabulary.

    The encoder version is taken from the module-level ``model_version``.

    :param force_gpu: when true, refuse to run if CUDA is not available
    :param k_most_frequent_words: value passed as ``K`` to
        ``build_vocab_k_words``
    :return: the loaded model (moved to the GPU whenever CUDA is available)
    :raises GPUNotFoundException: if ``force_gpu`` is set and CUDA is missing
    """
    cuda_available = torch.cuda.is_available()

    # Fail fast: there is no point in reading the checkpoint from disk when
    # the caller insists on a GPU that is not there.
    if force_gpu and not cuda_available:
        raise GPUNotFoundException()

    model_path = "infersent/encoder/infersent{}.pkl".format(model_version)
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version,
    }

    model = InferSent(params_model)
    # On a CPU-only host, remap any CUDA tensors stored in the checkpoint so
    # that deserialisation does not depend on a GPU being present.
    map_location = None if cuda_available else 'cpu'
    model.load_state_dict(torch.load(model_path, map_location=map_location))

    if cuda_available:
        model = model.cuda()

    # If infersent1 -> use GloVe embeddings.
    # If infersent2 -> use InferSent embeddings.
    if model_version == 1:
        w2v_path = 'infersent/dataset/GloVe/glove.840B.300d.txt'
    else:
        w2v_path = 'infersent/dataset/fastText/crawl-300d-2M.vec'
    model.set_w2v_path(w2v_path)

    # Load embeddings of K most frequent words
    model.build_vocab_k_words(K=k_most_frequent_words)
    return model
예제 #2
0
    def prepare(model_path: str,
                word_vecs: str,
                out_path: str,
                sentences: Union[str, List[str]] = None,
                max_vocab: int = 0):
        """
        Adapt the vocabulary of an InferSent model and save the resulting state.

        Exactly one of ``sentences`` and ``max_vocab`` must be given.

        :param model_path: unadapted model state
        :param word_vecs: word vectors
        :param out_path: where to store the state
        :param sentences: training sentences for scanning the vocabulary;
            a list is used as is, any other value is handed to ``read_lines``
        :param max_vocab: maximum vocabulary size (optional)
        :return: None; the adapted state is written to ``out_path``
        """
        assert bool(sentences) != bool(
            max_vocab), 'Either sentences or max_vocab should be given'

        model = InferSent(config=MODEL_CONF)
        # Report the path that is actually read, not the output path.
        log.info(f"Loading state from {model_path}")
        model.load_state_dict(torch.load(model_path))

        log.info(f"Loading word vecs from {word_vecs}")
        model.set_w2v_path(word_vecs)
        if sentences:
            if not isinstance(sentences, list):
                sentences = list(read_lines(sentences))
            log.info("Building vocabulary from sentences")
            model.build_vocab(sentences, tokenize=True)
        if max_vocab:
            log.info(f"Pruning vocabulary to top {max_vocab} types")
            model.build_vocab_k_words(K=max_vocab)
        log.info(f"Saving at {out_path}")

        state = SentenceEncoder._get_state(model)
        torch.save(state, out_path)
예제 #3
0
def load_infersent():
    """Return an InferSent v2 encoder whose vocabulary holds the 100k most
    frequent words of the fastText vectors."""
    version = 2
    weights_file = 'encoder/infersent%s.pkl' % version
    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': version,
    })
    state = torch.load(weights_file)
    encoder.load_state_dict(state)
    encoder.set_w2v_path('fastText/crawl-300d-2M.vec')
    encoder.build_vocab_k_words(K=100000)
    return encoder
예제 #4
0
def init_models(vocal_size: int = VOCAB_SIZE):
    """Create the InferSent encoder, load its weights and build its vocabulary.

    :param vocal_size: number of most frequent words to load into the
        vocabulary (passed as ``K`` to ``build_vocab_k_words``)
    :return: the loaded model, on the GPU when ``USE_CUDA`` is set
    """
    model = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': VERSION
    })
    model.load_state_dict(torch.load(MODEL_PATH))
    model = model.cuda() if USE_CUDA else model

    model.set_w2v_path(VECTOR_PATH)
    # Honour the argument; previously the global VOCAB_SIZE was always used
    # and the parameter was silently ignored.
    model.build_vocab_k_words(K=vocal_size)
    return model
예제 #5
0
def load_model():
    """Return an InferSent v1 encoder with the 100k most frequent GloVe words
    loaded, printing progress around the vocabulary build."""
    version = 1
    weights_file = "encoder/infersent%s.pkl" % version
    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': version,
    })
    state = torch.load(weights_file)
    encoder.load_state_dict(state)

    encoder.set_w2v_path('GloVe/glove.840B.300d.txt')

    print('building vocab')
    encoder.build_vocab_k_words(K=100000)
    print('done building vocab')
    return encoder
예제 #6
0
def embed_dataset(dataset_path, infersent_path, force_cpu=False):
    """
    Return a dict mapping each intent of ``train.csv`` to the mean InferSent
    embedding of its utterances.

    To make this work, first run ./get_infersent.sh
    """
    weights_file = infersent_path / "encoder/infersent1.pkl"
    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 1,
    })
    if not force_cpu:
        encoder.load_state_dict(torch.load(weights_file))
        encoder.cuda()
    else:
        cpu_state = torch.load(weights_file, map_location='cpu')
        encoder.load_state_dict(cpu_state)

    encoder.set_w2v_path(infersent_path / 'GloVe/glove.840B.300d.txt')
    encoder.build_vocab_k_words(K=100000)

    rows = read_csv(dataset_path / 'train.csv')
    is_snips = 'snips' in str(dataset_path)
    grouped = defaultdict(list)

    # The first row is the header and is not data.
    for row in rows[1:]:
        if not is_snips:
            raise TypeError(
                "Unknown dataset type. Implement your own first. See the "
                "README")
        utterance, _labels, _delexicalised, intent = row
        grouped[intent].append(utterance)

    vectors = {}
    total = len(grouped)
    for position, intent in enumerate(grouped):
        print('{}/{} done'.format(position, total))
        sentence_vectors = encoder.encode(grouped[intent])
        vectors[intent] = np.mean(sentence_vectors, axis=0)

    return vectors
def infersent_embeddings():
    """Encode the ``text`` column of the module-level ``final_train`` and
    ``final_test`` frames with InferSent v1 and return both results."""
    sys.path.append(
        '/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master')
    # The import only resolves once the path above has been appended.
    from models import InferSent

    version = 1
    weights_file = "/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master/encoder/infersent%s.pkl" % version
    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': version,
    })
    encoder.load_state_dict(torch.load(weights_file))

    # Keep it on CPU or put it on GPU
    on_gpu = False
    if on_gpu:
        encoder = encoder.cuda()

    # infersent1 goes with GloVe vectors, infersent2 with fastText vectors.
    if version == 1:
        vectors_file = '/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master/glove.840B.300d-003.txt'
    else:
        vectors_file = '/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master/fastText/crawl-300d-2M.vec'
    encoder.set_w2v_path(vectors_file)
    # Load embeddings of K most frequent words
    encoder.build_vocab_k_words(K=100000)

    def encode_texts(texts):
        # Encode one list of texts and report how many vectors came back.
        encoded = encoder.encode(texts,
                                 bsize=128,
                                 tokenize=False,
                                 verbose=True)
        print('nb sentences encoded : {0}'.format(len(encoded)))
        return encoded

    train_vectors = encode_texts(final_train['text'].tolist())
    test_vectors = encode_texts(final_test['text'].tolist())
    return train_vectors, test_vectors
예제 #8
0
    def init_infersent_model(self):
        """Load an InferSent v1 encoder with a 100k-word vocabulary and store
        it on ``self.model``."""
        version = 1
        weights_file = "encoder/infersent%s.pkl" % version
        encoder = InferSent({
            'bsize': 64,
            'word_emb_dim': 300,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': version,
        })
        encoder.load_state_dict(torch.load(weights_file))

        # Keep it on CPU or put it on GPU
        on_gpu = False
        if on_gpu:
            encoder = encoder.cuda()

        # infersent1 goes with GloVe vectors, infersent2 with fastText vectors.
        if version == 1:
            vectors_file = 'GloVe/glove.840B.300d.txt'
        else:
            vectors_file = 'fastText/crawl-300d-2M.vec'
        encoder.set_w2v_path(vectors_file)
        # Load embeddings of K most frequent words
        encoder.build_vocab_k_words(K=100000)
        self.model = encoder
def infersent_glove():
    """Return an InferSent v1 encoder on the GPU with the 100k most frequent
    GloVe words loaded."""
    version = 1
    weights_file = '/tmp/GloVe/encoder/infersent%s.pkl' % version
    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': version,
    })
    encoder.load_state_dict(torch.load(weights_file))

    # Keep it on CPU or put it on GPU
    on_gpu = True
    if on_gpu:
        encoder = encoder.cuda()

    # Both branches point at a GloVe file; only the directory differs.
    if version == 1:
        vectors_file = '/tmp/GloVe/glove.840B.300d.txt'
    else:
        vectors_file = '/home/ganesh/Quora_dev/tmp/GloVe/glove.840B.300d.txt'
    encoder.set_w2v_path(vectors_file)
    # Load embeddings of K most frequent words
    encoder.build_vocab_k_words(K=100000)
    return encoder
                        utils.extract_list_from_string(rows[1]), book_id_map)

# Fill book_id_descriptions from data/description.csv: first column -> second
# column, keeping only rows whose second column is longer than 20 characters.
# NOTE(review): unlike title.csv below, no header row is skipped here; a header
# would only be dropped if its second column is 20 characters or shorter.
with open(os.path.join(this_dir, "data/description.csv"), mode='r') as infile:
    reader = csv.reader(infile)
    for rows in reader:
        if len(rows[1]) > 20:
            book_id_descriptions[rows[0]] = rows[1]

# Build book_id_titles from data/title.csv (first column -> second column);
# the first row is skipped.
with open(os.path.join(this_dir, "data/title.csv"), mode='r') as infile:
    reader = csv.reader(infile)
    next(reader)
    book_id_titles = {rows[0]: rows[1] for rows in reader}

# Create the InferSent version 2 encoder and load its weights.
model_path = "encoder/infersent2.pkl"
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': 2
}
model = InferSent(params_model)
model.load_state_dict(torch.load(model_path))

# Version 2 is paired with the fastText vectors.
w2v_path = 'fastText/crawl-300d-2M.vec'
model.set_w2v_path(w2v_path)

# Load embeddings of K most frequent words
model.build_vocab_k_words(K=100000)
예제 #11
0
# In[3]:

# Keep it on CPU or put it on GPU
# (`model` is created by an earlier cell that is not part of this excerpt.)
use_cuda = False
model = model.cuda() if use_cuda else model

# In[4]:

# If infersent1 -> use GloVe embeddings. If infersent2 -> use InferSent embeddings.
# (`model_version` also comes from an earlier cell.)
W2V_PATH = 'GloVe/glove.840B.300d.txt' if model_version == 1 else 'fastText/crawl-300d-2M.vec'
model.set_w2v_path(W2V_PATH)

# In[5]:

# Load embeddings of K most frequent words
model.build_vocab_k_words(K=100000)

# ## Load sentences

# In[6]:

# Load some sentences
# One sentence per line of samples.txt, surrounding whitespace stripped.
sentences = []
with open('samples.txt') as f:
    for line in f:
        sentences.append(line.strip())
print(len(sentences))

# In[7]:

# Notebook cell output: this bare expression has no effect when run as a script.
sentences[:5]
# Read the original texts, strip the "[[[[Premise]]]]: " and
# ">>>>[[[[Hypothesis]]]]:" markers plus any remaining "[[" / "]]" brackets,
# and split the result into lines.
with open(ORI_PATH) as f:
    ori = f.read()
    ori = ori.replace('[[[[Premise]]]]: ',
                      '').replace('>>>>[[[[Hypothesis]]]]:', '')
    ori = ori.replace('[[', '').replace(']]', '')
    ori = ori.splitlines()

# Build the InferSent encoder; V, MODEL_PATH, W2V_PATH and K are defined
# outside this excerpt.
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': V
}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
infersent.set_w2v_path(W2V_PATH)
infersent.build_vocab_k_words(K)

# Encode both text lists; `adv` is prepared outside this excerpt.
adv_emb = infersent.encode(adv, tokenize=True)
ori_emb = infersent.encode(ori, tokenize=True)

# Pairwise scores, written one per line. zip() stops at the shorter list, so
# unmatched trailing entries are silently ignored.
result = [cos_sim(i, j) for i, j in zip(adv_emb, ori_emb)]
with open('../results/InferSent.txt', 'w') as f:
    f.write('\n'.join([str(i) for i in result]))

result = [distance(i, j) for i, j in zip(adv_emb, ori_emb)]
with open('../results/InferSent_distance.txt', 'w') as f:
    f.write('\n'.join([str(i) for i in result]))
예제 #13
0
def extract_answer_IFST(story_data, question_and_ans_data, story_ids,
                        model_version, Vocab_Size):
    """Predict, for every question of the given stories, the story sentence
    that scores highest against the question under InferSent embeddings.

    For each story id the story is looked up in ``story_data``; each of its
    questions is encoded, compared with every sentence of the story, and the
    best-scoring sentence is written to the ``answer_pred`` column of
    ``question_and_ans_data`` (newlines replaced by spaces).

    :param story_data: frame with ``story_id`` and ``story`` columns; each
        story must expose ``.sents`` (presumably a spaCy Doc - confirm)
    :param question_and_ans_data: frame with ``story_id``, ``question_id``
        and ``question`` columns; modified in place
    :param story_ids: iterable of story ids to process
    :param model_version: InferSent version; selects the weights file and the
        word-vector file, and version 3 builds the vocabulary from the stories
    :param Vocab_Size: ``K`` for ``build_vocab_k_words`` (versions other than 3)
    :return: ``question_and_ans_data`` with ``answer_pred`` filled in
    """
    import pandas as pd
    import torch
    from models import InferSent

    W2V_PATH = 'dataset/GloVe/glove.840B.300d.txt' if model_version == 1 else 'dataset/fastText/crawl-300d-2M.vec'
    MODEL_PATH = 'encoder/infersent%s.pkl' % model_version
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version
    }
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))
    model.set_w2v_path(W2V_PATH)
    if model_version == 3:
        model.build_vocab(build_vocabulary(story_data))
    else:
        model.build_vocab_k_words(K=Vocab_Size)

    for story_id in story_ids:
        story = story_data.loc[story_data.story_id == story_id,
                               'story'].values[0]
        question_ids = question_and_ans_data.loc[
            question_and_ans_data.story_id == story_id, 'question_id']

        # The sentences do not depend on the question, so list them once.
        story_sentences = list(story.sents)
        if not story_sentences:
            # Nothing to choose from; leave answer_pred untouched.
            continue

        for question_id in question_ids:
            is_this_question = (
                question_and_ans_data.question_id == question_id)
            question = question_and_ans_data.loc[is_this_question,
                                                 'question'].values[0]

            # encode() takes a LIST of sentences. Passing a bare string makes
            # it iterate over characters, so [0] would be the embedding of
            # the first character only.
            question_encoded = model.encode([str(question)])[0]

            # NOTE(review): if `cosine` is scipy.spatial.distance.cosine it
            # returns a distance, and the descending sort below would pick
            # the LEAST similar sentence - confirm which `cosine` is in scope.
            ans = [{
                'question_id': question_id,
                'answer_pred': sent,
                'similarity': cosine(question_encoded,
                                     model.encode([str(sent)])[0])
            } for sent in story_sentences]

            ans = pd.DataFrame(ans).reindex(
                ['question_id', 'answer_pred', 'similarity'], axis=1)
            ans.sort_values(by=['similarity'], ascending=False, inplace=True)

            question_and_ans_data.loc[is_this_question, 'answer_pred'] = str(
                ans.iloc[0]['answer_pred']).replace('\n', ' ')

    return question_and_ans_data
예제 #14
0
# Reject encoder types that are not listed in encoder_types.
assert params.encoder_type in encoder_types, "encoder_type must be in " + \
                                             str(encoder_types)


# For Load encoder
# The encoder stays None unless a path is given and the type is 'InferSent'.
encoder = None
if params.encoder_path and params.encoder_type == 'InferSent':
    
    params_model = {'bsize': params.batch_size, 'word_emb_dim': params.word_emb_dim, 'enc_lstm_dim': params.enc_lstm_dim , 'pool_type': params.pool_type, 'dpout_model': params.dpout_model, 'version': params.model_version}
    encoder = InferSent(params_model)
    encoder.load_state_dict(torch.load(params.encoder_path))
    encoder.set_w2v_path(params.vector_rep)
    
    # vocab_samples is a string: all digits means "K most frequent words".
    if params.vocab_samples.isdigit() :
        print("Build vocab from K samples")
        encoder.build_vocab_k_words(K=int(params.vocab_samples))
    else:
        print("Build vocab from full file")
        # NOTE(review): upstream InferSent declares
        # build_vocab(sentences, tokenize=True) with no `K` keyword, so this
        # call would raise TypeError there - confirm this project's InferSent
        # accepts K.
        encoder.build_vocab(K=params.vocab_samples)

    # Smoke test: encode one sentence and print the result.
    print("========TEST encoder=======")
    print(encoder.encode(['the cat eats.']))
    
    # The encoder is moved to `device` only after the smoke test above.
    encoder.to(device)
    
    


# model config
config_nli_model = {
    'n_words'        :  len(word_vec)         ,
def _load_infersent_encoder():
    """Load the pre-trained InferSent v1 model (in encoder/) with the 100k
    most frequent GloVe words as vocabulary."""
    V = 1
    MODEL_PATH = 'encoder/infersent%s.pkl' % V
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': V
    }
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    # Set word vector path for the model:
    W2V_PATH = './GloVe/glove.840B.300d.txt'
    infersent.set_w2v_path(W2V_PATH)
    infersent.build_vocab_k_words(K=100000)
    return infersent


def main():
    """Embed each disaster's train/test tweets with InferSent, run the chosen
    classifier on the embeddings and print per-disaster and summary metrics.

    Command line: ``-d/--dataname`` (t6, t26 or 2C) selects the dataset and
    ``-c/--classifiername`` (GaussianNB, RF, SVM or KNN) the classifier.
    Embeddings are cached as ``<disaster>.train.npy`` / ``<disaster>.test.npy``
    in the working directory and reused on later runs.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-d',
                        '--dataname',
                        default='t6',
                        help='dataset name',
                        choices=['t6', 't26', '2C'])
    parser.add_argument('-c',
                        '--classifiername',
                        default='RF',
                        help='which classifier to use',
                        choices=['GaussianNB', 'RF', 'SVM', 'KNN'])
    args = parser.parse_args()
    data_name = args.dataname  # t6 or t26, 2C, 4C
    clf_name = args.classifiername  # classifier

    dataset = '../data/'
    disasters = []
    train_list = []
    test_list = []
    if data_name == "t6":
        file_path = dataset + 'CrisisLexT6_cleaned/'
        disasters = [
            "sandy", "queensland", "boston", "west_texas", "oklahoma",
            "alberta"
        ]
        test_list = [
            "{}_glove_token.csv.unique.csv".format(disaster)
            for disaster in disasters
        ]
        train_list = [
            "{}_training.csv".format(disaster) for disaster in disasters
        ]
    elif data_name == "t26":
        file_path = dataset + 'CrisisLexT26_cleaned/'
        disasters = [
            "2012_Colorado_wildfires", "2013_Queensland_floods",
            "2013_Boston_bombings", "2013_West_Texas_explosion",
            "2013_Alberta_floods", "2013_Colorado_floods",
            "2013_NY_train_crash"
        ]
        test_list = [
            "{}-tweets_labeled.csv.unique.csv".format(disaster)
            for disaster in disasters
        ]
        train_list = [
            "{}_training.csv".format(disaster) for disaster in disasters
        ]
    elif data_name == "2C":
        file_path = dataset + '2CTweets_cleaned/'
        disasters = [
            "Memphis", "Seattle", "NYC", "Chicago", "SanFrancisco", "Boston",
            "Brisbane", "Dublin", "London", "Sydney"
        ]
        test_list = [
            "{}2C.csv.token.csv.unique.csv".format(disaster)
            for disaster in disasters
        ]
        train_list = [
            "{}2C_training.csv".format(disaster) for disaster in disasters
        ]

    accu_list = []
    roc_list = []
    precision_list = []
    recall_list = []
    f1_list = []
    output_dir = ''
    # Loading the model and its 100k-word vocabulary is expensive, so it is
    # done lazily and at most once instead of once per disaster.
    infersent = None
    for disaster, train, test in zip(disasters, train_list, test_list):
        train_file = os.path.join(file_path, train)
        test_file = os.path.join(file_path, test)
        xtrain, ytrain = load_data(data_name, train_file)
        xtest, ytest = load_data(data_name, test_file)

        train_output = "{}{}.train.npy".format(output_dir, disaster)
        test_output = "{}{}.test.npy".format(output_dir, disaster)
        # Reuse the cache only when BOTH files exist; a missing test file
        # previously made np.load fail.
        if os.path.isfile(train_output) and os.path.isfile(test_output):
            train_embed = np.load(train_output)
            test_embed = np.load(test_output)
        else:
            if infersent is None:
                infersent = _load_infersent_encoder()
            # Encode your sentences (list of n sentences):
            train_embed = infersent.encode(xtrain,
                                           bsize=128,
                                           tokenize=True,
                                           verbose=True)
            np.save(train_output, train_embed)
            test_embed = infersent.encode(xtest,
                                          bsize=128,
                                          tokenize=True,
                                          verbose=True)
            np.save(test_output, test_embed)
            print('file saved')

        print(test)
        accu, roc, precision, recall, f1 = run_classifier(
            train_embed, ytrain, test_embed, ytest, clf_name, 100)
        accu_list.append(accu)
        roc_list.append(roc)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    # Output labels are kept exactly as before (including the "percision"
    # spelling) so that existing log parsers keep working.
    print("{}_InferSent_{}_LOO_accuracy {}".format(data_name, clf_name,
                                                   accu_list))
    print("{}_InferSent_{}_LOO_roc {}".format(data_name, clf_name, roc_list))
    print("{}_InferSent_{}_LOO_percision {}".format(data_name, clf_name,
                                                    precision_list))
    print("{}_InferSent_{}_LOO_recall {}".format(data_name, clf_name,
                                                 recall_list))
    print("{}_InferSent_{}_LOO_f1 {}".format(data_name, clf_name, f1_list))
    # Summary line: mean + std for accuracy, roc, f1, precision and recall.
    print(
        "{0}_InferSent_LOO_{1} {2:.4f} + {3:.4f} {4:.4f} + {5:.4f} {6:.4f} + {7:.4f} {8:.4f} + {9:.4f} {10:.4f} + {11:.4f} "
        .format(data_name, clf_name, np.mean(accu_list), np.std(accu_list),
                np.mean(roc_list), np.std(roc_list), np.mean(f1_list),
                np.std(f1_list), np.mean(precision_list),
                np.std(precision_list), np.mean(recall_list),
                np.std(recall_list)))
예제 #16
0
# Load the InferSent version 1 encoder once, at import time.
V = 1
MODEL_PATH = '../encoder/infersent%s.pkl' % V
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': V
}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
# Version 1 is paired with the GloVe vectors.
W2V_PATH = '../GloVe/glove.840B.300d.txt'
infersent.set_w2v_path(W2V_PATH)

infersent.build_vocab_k_words(
    K=100000)  #100k most common words loaded up in model vocab

# Flask application with CORS enabled. PORT is kept as a string.
PORT = "5000"
app = Flask(__name__)
cors = CORS(app)
app.config['CORS_HEADERS'] = 'Content-Type'


def adapt_array(arr):
    """
    Serialise ``arr`` in NumPy's ``.npy`` format and wrap the bytes as a
    SQLite binary value.

    http://stackoverflow.com/a/31312102/190597 (SoulNibbler)
    """
    buffer = io.BytesIO()
    np.save(buffer, arr)
    # getvalue() returns the whole buffer regardless of the stream position.
    return sqlite3.Binary(buffer.getvalue())
예제 #17
0
    if args.download == True:
        nltk.download('punkt')
        model_version = args.model_version
        MODEL_PATH = "/home1/InferSent/encoder/infersent%s.pickle" % model_version
        params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                        'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
        model = InferSent(params_model)
        model.load_state_dict(torch.load(MODEL_PATH))
        # Keep it on CPU or put it on GPU
        use_cuda = True
        model = model.cuda() if use_cuda else model
        W2V_PATH = '/home1/InferSent/oov_train_model.vec'
        model.set_w2v_path(W2V_PATH)
        # Load embeddings of K most frequent words
        # model.build_vocab_k_words(K=100000)
        model.build_vocab_k_words(K=2051129)  # Extract embedding word .

        # Load test sentences

        train_test = pd.read_csv('/home1/InferSent/testset.csv', header=None, delimiter=",", encoding='UTF-8')
        source_s = train_test[0][1:]
        target_s = train_test[1][1:]
        embeddings_source = model.encode(source_s, bsize=128, tokenize=False, verbose=True)
        print('nb source_s encoded : {0}'.format(len(embeddings_source)))
        embeddings_target = model.encode(target_s, bsize=128, tokenize=False, verbose=True)
        print('nb target_s encoded : {0}'.format(len(embeddings_target)))
        np.save('embeddings_source.npy', embeddings_source)
        np.save('embeddings_target.npy', embeddings_target)

    if args.cosine == True:
        source_np = np.load('embeddings_source.npy')
from random import randint
import numpy as np
import torch
from models import InferSent
# Load the InferSent version 1 encoder from a local checkpoint.
model_version = 1
MODEL_PATH = "/home/anuja/Desktop/BE project/Models/InferSent/infersent1.pkl"
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
infermodel = InferSent(params_model)
infermodel.load_state_dict(torch.load(MODEL_PATH))
# Keep the model on the CPU; set use_cuda to True to move it to the GPU.
use_cuda = False
infermodel = infermodel.cuda() if use_cuda else infermodel
# The file name is glove.840B.300d.txt although it sits in a glove.6B directory.
W2V_PATH = '/home/anuja/Desktop/BE project/glove.6B/glove.840B.300d.txt'
#replace with glove.840B.300d.txt
infermodel.set_w2v_path(W2V_PATH)
# Load embeddings of the 100k most frequent words.
infermodel.build_vocab_k_words(K=100000)


# In[114]:


df = pd.DataFrame(columns=['body','replier', 'thread_no','embeddings'])
folder = glob.glob(folder_path)
th_no = 0
obj = preprocessing.preprocess()
cnt = 0
count_file = 0
thread_list=[]
try:
    for fol in tqdm_notebook(folder):
        files = glob.glob(fol+'/*.txt')
예제 #19
0
def main():
    """Run the InferSent encoder over the train, dev and test splits of the
    configured task (plus the MNLI mismatched splits when the task is mnli).

    All settings (``output_dir``, ``task_name``, ``dataset_path``, ``config``,
    ``model_path``, ``word_emb_path``) are read from module-level names.
    """

    def encode_split(title, loader, mode):
        # One pass over `loader`; trange(1) is kept so the progress display
        # is the same as before.
        print("Run embedding for {} set".format(title))
        for _ in trange(1, desc="Epoch"):
            run_encoding(loader=loader, model=model, mode=mode)

    init_output_dir(output_dir)
    # prepare dataset
    task = get_task(task_name, dataset_path)
    label_list = task.get_labels()
    label_map = {v: i for i, v in enumerate(label_list)}

    print("loading raw data ... ")
    train_examples = task.get_train_examples()
    val_examples = task.get_dev_examples()
    test_examples = task.get_test_examples()

    print("converting to data loader ... ")
    train_loader = get_dataloader(train_examples, label_map)
    val_loader = get_dataloader(val_examples, label_map)
    test_loader = get_dataloader(test_examples, label_map)

    # load model
    print("loading model ... ")
    model = InferSent(config)
    model.load_state_dict(torch.load(model_path))
    model = model.cuda() if config['use_cuda'] else model
    model.set_w2v_path(word_emb_path)
    print("building model vocabs ... ")
    model.build_vocab_k_words(K=100000, verbose=True)

    encode_split("train", train_loader, 'train')
    encode_split("dev", val_loader, 'dev')
    encode_split("test", test_loader, 'test')

    # HACK FOR MNLI mis-matched
    if task_name == 'mnli':
        print("Run Embedding for MNLI Mis-Matched Datasets")
        print("loading raw data ... ")
        mm_val_example = MnliMismatchedProcessor().get_dev_examples(dataset_path)
        mm_test_examples = MnliMismatchedProcessor().get_test_examples(dataset_path)
        print("converting to data loader ... ")
        mm_val_loader = get_dataloader(mm_val_example, label_map)
        mm_test_loader = get_dataloader(mm_test_examples, label_map)

        encode_split("mm_dev", mm_val_loader, 'mm_dev')
        # This pass used to be announced as "test set", which made the log
        # indistinguishable from the matched test pass above.
        encode_split("mm_test", mm_test_loader, 'mm_test')
예제 #20
0
def main():
    """Train and/or evaluate a simple classifier on top of a pre-trained
    InferSent encoder for one GLUE-style task.

    Everything is driven by the parsed command-line arguments. With
    ``do_train`` the classifier is trained; with ``do_val`` predictions and
    metrics for the dev set are written to ``args.output_dir``
    (``val_preds.csv``, ``val_metrics.json``). For the mnli task the
    mismatched dev set is evaluated as well (``mm_val_preds.csv``) and
    ``val_metrics.json`` is rewritten with the combined metrics.
    """
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    args = get_args()
    print_args(args)
    device, n_gpu = initialization.init_cuda_from_args(args, logger=logger)
    initialization.init_seed(args, n_gpu=n_gpu, logger=logger)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)
    initialization.save_args(args)
    task = get_task(args.task_name, args.data_dir)
    use_cuda = not args.no_cuda
    verbose = args.verbose

    # model config
    config = {
        'word_emb_dim': args.word_emb_dim,
        'enc_lstm_dim': args.enc_lstm_dim,
        'n_enc_layers': args.n_enc_layers,
        'dpout_model': args.dpout_model,
        'dpout_fc': args.dpout_fc,
        'fc_dim': args.fc_dim,
        'bsize': args.batch_size,
        'n_classes': args.n_classes,
        'pool_type': args.pool_type,
        'nonlinear_fc': args.nonlinear_fc,
        'use_cuda': use_cuda,
        'version': args.model_version,
        'dropout_prob': args.dropout_prob,
    }

    # load model
    if verbose:
        print('loading model...')
    model = InferSent(config)
    model.load_state_dict(torch.load(args.model_path))
    model = model.cuda() if use_cuda else model
    model.set_w2v_path(args.word_emb_path)
    model.build_vocab_k_words(K=args.k_freq_words, verbose=verbose)

    # load classifier
    classifier = SimpleClassifier(config)
    classifier = classifier.cuda() if use_cuda else classifier

    # get train examples
    train_examples = task.get_train_examples()
    # calculate t_total
    t_total = initialization.get_opt_train_steps(len(train_examples), args)

    # build optimizer. Only the classifier's parameters are optimised.
    optimizer = optim.SGD(classifier.parameters(), lr=0.001, momentum=0.9)

    # create running parameters
    # NOTE(review): learning_rate here (5e-5) differs from the optimizer's
    # lr (0.001) above - confirm which one the runner actually uses.
    r_params = RunnerParameters(
        local_rank=args.local_rank,
        n_gpu=n_gpu,
        learning_rate=5e-5,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        t_total=t_total,
        warmup_proportion=args.warmup_proportion,
        num_train_epochs=args.num_train_epochs,
        train_batch_size=args.train_batch_size,
        eval_batch_size=args.eval_batch_size,
        verbose=verbose)

    # create runner class for training and evaluation tasks.
    runner = GlueTaskClassifierRunner(encoder_model=model,
                                      classifier_model=classifier,
                                      optimizer=optimizer,
                                      label_list=task.get_labels(),
                                      device=device,
                                      rparams=r_params)

    if args.do_train:
        runner.run_train_classifier(train_examples)

    if args.do_val:
        val_examples = task.get_dev_examples()
        results = runner.run_val(val_examples,
                                 task_name=task.name,
                                 verbose=verbose)

        df = pd.DataFrame(results["logits"])
        df.to_csv(os.path.join(args.output_dir, "val_preds.csv"),
                  header=False,
                  index=False)
        metrics_str = json.dumps(
            {
                "loss": results["loss"],
                "metrics": results["metrics"]
            }, indent=2)
        print(metrics_str)
        with open(os.path.join(args.output_dir, "val_metrics.json"), "w") as f:
            f.write(metrics_str)

        # HACK for MNLI-mismatched
        if task.name == "mnli":
            mm_val_example = MnliMismatchedProcessor().get_dev_examples(
                task.data_dir)
            mm_results = runner.run_val(mm_val_example,
                                        task_name=task.name,
                                        verbose=verbose)

            # Write the MISMATCHED logits; this used to dump the matched
            # `results` again, making mm_val_preds.csv a copy of
            # val_preds.csv.
            df = pd.DataFrame(mm_results["logits"])
            df.to_csv(os.path.join(args.output_dir, "mm_val_preds.csv"),
                      header=False,
                      index=False)
            combined_metrics = {}
            for k, v in results["metrics"].items():
                combined_metrics[k] = v
            for k, v in mm_results["metrics"].items():
                combined_metrics["mm-" + k] = v
            # The reported loss stays the matched-set loss, as before.
            combined_metrics_str = json.dumps(
                {
                    "loss": results["loss"],
                    "metrics": combined_metrics,
                },
                indent=2)
            print(combined_metrics_str)
            # Deliberately overwrites the val_metrics.json written above.
            with open(os.path.join(args.output_dir, "val_metrics.json"),
                      "w") as f:
                f.write(combined_metrics_str)
예제 #21
0
파일: MAD.py 프로젝트: tallemeersch/MAD
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version
    }
    model_infersent = InferSent(params_model)
    model_infersent.load_state_dict(torch.load(MODEL_PATH))

    W2V_PATH = '/MAD/InferSent/dataset/crawl-300d-2M.vec'
    model_infersent.set_w2v_path(W2V_PATH)

    # Load embeddings of K most frequent words
    model_infersent.build_vocab_k_words(K=1000000)

    print("InferSent model loaded")

    #input: src, tgt, tgt.translated (to src, being English).

    src = open(args.path_src, "r").read().split("\n")
    src = src[:-1]
    tgt = open(args.path_tgt, "r").read().split("\n")
    tgt = tgt[:-1]

    Txt_target_2_cross = open(args.path_tgt_translated).read().split("\n")
    Txt_target_2_cross = Txt_target_2_cross[:-1]

    assert len(src) == len(tgt)
    assert len(src) == len(Txt_target_2_cross)
예제 #22
0
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': model_version
}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))

# Keep it on CPU or put it on GPU
use_cuda = False
infersent = infersent.cuda() if use_cuda else infersent

infersent.set_w2v_path(W2V_PATH)
# Load embeddings of K most frequent words
infersent.build_vocab_k_words(K=500000)
# or you can build your own vocabulary based on sentences in the data
#infersent.build_vocab(yoursentences, tokenize=True)

# 1- create sentence embedding for all the sentences and questions using InferSent
# 2- calculates the distance between sentence & questions
#    based on Euclidean & Cosine similarity using sentence embeddings
embeddings_dic = {
    'Question': [],
    'Answer': [],
    'Question_Emb': [],
    'Answer_Emb': [],
    'Label': [],
    'Cosine_Dist': [],
    'Euclidean_Dist': [],
    'Predicted_label_Cos': [],