Example #1
# FLAGS, train() and the dataset classes are defined earlier in this file.
if __name__ == '__main__':
    ds = None
    if FLAGS.dataset == 'STS':
        print('Using the STS dataset')
        ds = STS()
    elif FLAGS.dataset == 'STSLarge':
        print('Using the STSLarge dataset')
        ds = STSLarge()
    elif FLAGS.dataset == 'PPDB':
        print('Using the PPDB dataset')
        ds = PPDB()
    elif FLAGS.dataset == 'Quora':
        print('Using the Quora dataset')
        ds = Quora()
    elif FLAGS.dataset == 'Sick':
        print('Using the Sick dataset')
        ds = Sick()
    elif FLAGS.dataset == 'SemEval':
        print('Using the SemEval dataset')
        ds = SemEval()
    elif FLAGS.dataset == 'StackExchange':
        print('Using the StackExchange dataset')
        ds = StackExchange()
    else:
        raise NotImplementedError('Dataset {} has not been '
                                  'implemented yet'.format(FLAGS.dataset))

    if FLAGS.mode == 'train':
        train(ds, ds.metadata_path, ds.w2v)
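
# Sketch (not part of the original file): the if/elif ladder above is a
# one-to-one mapping from flag value to constructor, so a dictionary
# dispatch is an equivalent, more compact formulation, assuming all the
# dataset classes are importable from `datasets`:
DATASETS = {'STS': STS, 'STSLarge': STSLarge, 'PPDB': PPDB, 'Quora': Quora,
            'Sick': Sick, 'SemEval': SemEval, 'StackExchange': StackExchange}
cls = DATASETS.get(FLAGS.dataset)
if cls is None:
    raise NotImplementedError('Dataset {} has not been '
                              'implemented yet'.format(FLAGS.dataset))
print('Using the {} dataset'.format(FLAGS.dataset))
ds = cls()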
Example #2

"""
The idea of this simple code is to load text and feed it to our trained
models.
"""
import collections

import datasets
from datasets import Quora, merge_sentences, seq2id

import tflearn

from models import AttentionBlstmQuora

quora = Quora()
Batch = collections.namedtuple('Batch', ['s1', 's2', 'sim'])


def get_sents_encoded(sentence_1, sentence_2, dt=quora):
    # Tokenize both sentences, then map every token to its vocabulary id.
    data = [
        datasets.tokenize(sentence_1, lang='en'),
        datasets.tokenize(sentence_2, lang='en')
    ]
    vocab_ids = dt.w2i
    lst_sent_ids = seq2id(data, vocab_ids, seq_begin=False, seq_end=False)
    s1_ids, s2_ids = lst_sent_ids
    return s1_ids, s2_ids
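
# Usage sketch (the sentence pair and the sim value are hypothetical
# placeholders, not from the original file):
s1_ids, s2_ids = get_sents_encoded('How old are you?',
                                   'What is your age?')
batch = Batch(s1=[s1_ids], s2=[s2_ids], sim=[1.0])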

Example #3
import random

import numpy as np
import torch

# get_engine is defined elsewhere in the project.
engine = get_engine()

# Seed every RNG in play so runs are reproducible.
seed = 4269666
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Constants
device = torch.device("cuda:2")

# The problem comes from the count vectorizer, which drops certain words.
print("Load Dataset")
dataset = Quora.torch_dataset()
dataclasses = Quora.dataclasses()
dataclasses = {q._id: q for q in dataclasses}  # index by question id


def embedding_collate_decorator(collate_fn):
    # Currently a pure pass-through around the wrapped collate function;
    # kept as a hook for embedding-specific post-processing.
    def wrapper(batch):
        x, y, id_, qrels, seq_lens = collate_fn(batch)
        return x, y, id_, qrels, seq_lens

    return wrapper
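
# A hypothetical variant (not in the original code): the wrapper would be
# the natural place to map token ids to vectors before batching, given some
# `embed` lookup function.
def make_embedding_collate(collate_fn, embed):
    def wrapper(batch):
        x, y, id_, qrels, seq_lens = collate_fn(batch)
        return embed(x), y, id_, qrels, seq_lens

    return wrapper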


collate_fn = embedding_collate_decorator(sequence_collate_fn)

train_len, val_len = int(0.7 * len(dataset)), int(0.15 * len(dataset))
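
# Sketch of how the split sizes might be used (not in the original
# snippet), assuming the remaining ~15% is held out as a test set:
test_len = len(dataset) - train_len - val_len
train_set, val_set, test_set = torch.utils.data.random_split(
    dataset, [train_len, val_len, test_len])
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=64, collate_fn=collate_fn)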
Example #4

import sys
import os
from os import path

# Make the project's "src" directory (one level up) importable.
libpath = path.normpath(
    path.join(path.dirname(path.realpath(__file__)), os.pardir, "src"))
sys.path.append(libpath)

import pickle as pkl
import torch

import data
from datasets import Quora, Robust2004

# Alias the legacy "dataset" module name so pickles that reference it
# resolve against the current "data" module while loading.
sys.modules["dataset"] = data

quora_dc = Quora.dataclasses()
quora_torch = Quora.torch_dataset()
rb_dc = Robust2004.dataclasses()
rb_torch = Robust2004.torch_dataset()

# The alias is only needed while the pickles are being loaded.
del sys.modules["dataset"]

# Persist the loaded datasets back to disk.
with open(Quora.dataclasses_path, "wb") as f:
    pkl.dump(quora_dc, f)

with open(Robust2004.dataclasses_path, "wb") as f:
    pkl.dump(rb_dc, f)

torch.save(quora_torch, Quora.torch_path)
torch.save(rb_torch, Robust2004.torch_path)
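
# Sketch (not in the original script): reload the freshly written artifacts
# as a quick round-trip check.
with open(Quora.dataclasses_path, "rb") as f:
    assert len(pkl.load(f)) == len(quora_dc)
quora_torch_reloaded = torch.load(Quora.torch_path)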