Example #1
def get_imdb_data(embedding_size=50):
    """Renvoie l'ensemble des donnéees nécessaires pour l'apprentissage

    - dictionnaire word vers ID
    - embeddings (Glove)
    - DataSet (FolderText)

    """
    WORDS = re.compile(r"\S+")

    logging.info("Loading embeddings")
    words, embeddings = prepare_dataset('edu.stanford.glove.6b.%d' %
                                        embedding_size).load()
    OOVID = len(words)
    words.append("__OOV__")

    word2id = {word: ix for ix, word in enumerate(words)}
    embeddings = np.vstack((embeddings, np.zeros(embedding_size)))

    def tokenizer(t):
        return [word2id.get(x, OOVID) for x in re.findall(WORDS, t.lower())]

    logging.info("Loading embeddings")

    logging.info("Get the IMDB dataset")
    ds = prepare_dataset("edu.stanford.aclimdb")

    return word2id, embeddings, FolderText(ds.train.classes,
                                           ds.train.path,
                                           tokenizer,
                                           load=False), FolderText(
                                               ds.test.classes,
                                               ds.test.path,
                                               tokenizer,
                                               load=False)
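A minimal usage sketch: the batch size is arbitrary and the FolderText.collate collate function is assumed to exist, following the get_dataloaders example further down this page.

from torch.utils.data import DataLoader

word2id, embeddings, train_data, test_data = get_imdb_data(embedding_size=50)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True,
                          collate_fn=FolderText.collate)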
Example #2
def _msmarco(part: str):
    return Adhoc(
        documents=_msmarco_docs(),
        topics=prepare_dataset(
            f"com.microsoft.msmarco.passage.{part}.queries"),
        assessments=prepare_dataset(
            f"com.microsoft.msmarco.passage.{part}.qrels"),
    )
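A short usage sketch; "dev" as a part name is an assumption based on the MS MARCO subset ids used elsewhere on this page.

dev_data = _msmarco("dev")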
Example #3
    def __init__(self):
        self.index_stem = prepare_dataset(
            "ca.uwaterloo.jimmylin.anserini.robust04")
        self.topics = prepare_dataset("gov.nist.trec.adhoc.robust.2004.topics")
        self.qrels = prepare_dataset("gov.nist.trec.adhoc.robust.2004.qrels")

        # FIXME: parallelize (when experimaestro supports this)
        self.docstore = BuildDocStore(index=self.index_stem).submit()
        self.index = Reindex(index=self.index_stem).submit()
Example #4
def get(subset: str):
    topics = prepare_dataset(
        f"com.microsoft.msmarco.passage.{subset}.queries")
    qrels = prepare_dataset(
        f"com.microsoft.msmarco.passage.{subset}.qrels")
    assessed_topics = TrecAssessedTopics(topics=topics,
                                         assessments=qrels)
    return MsmarcoDataset(docstore=docstore,
                          index=index,
                          index_stem=index_stem,
                          assessed_topics=assessed_topics)
Example #5
def fold(name: str):
    """Return topics and assessments for a given fold

    Folds are trf1 to trf5 (train), vaf1 to vaf5 (validation) and f1 to f5 (test)
    """
    topics = prepare_dataset("gov.nist.trec.adhoc.robust.2004.topics")
    qrels = prepare_dataset("gov.nist.trec.adhoc.robust.2004.qrels")

    train_topics = AdhocTopicFold(topics=topics, ids=sorted(list(FOLDS[name])))
    train_qrels = AdhocAssessmentFold(qrels=qrels,
                                      ids=sorted(list(FOLDS[name])))

    return train_topics, train_qrels
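A short usage sketch; the fold names follow the docstring above (trf* for training, vaf* for validation).

train_topics, train_qrels = fold("trf1")
val_topics, val_qrels = fold("vaf1")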
Example #6
def get_dataloaders_and_vocabs(batch_size):

    ds = prepare_dataset('org.universaldependencies.french.gsd')

    words = VocabularyTagging(True)
    tags = VocabularyTagging(False)
    train_dataset = TaggingDataset(ds.files['train'], words, tags, True)
    val_dataset = TaggingDataset(ds.files['dev'], words, tags, False)
    test_dataset = TaggingDataset(ds.files['test'], words, tags, False)

    kwargs = dict(collate_fn=TaggingDataset.collate,
                  pin_memory=(torch.cuda.is_available()),
                  num_workers=torch.multiprocessing.cpu_count())
    train_loader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              **kwargs)
    val_loader = DataLoader(val_dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            **kwargs)
    test_loader = DataLoader(test_dataset,
                             batch_size=batch_size,
                             shuffle=True,
                             **kwargs)
    return train_loader, val_loader, test_loader, words, tags
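A minimal usage sketch (the batch size of 32 is arbitrary); the vocabularies are returned together with the loaders so the model can size its embedding and output layers.

train_loader, val_loader, test_loader, words, tags = get_dataloaders_and_vocabs(32)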
Example #7
def cli(port, workdir, dataset, debug):
    """Runs an experiment"""
    logging.getLogger().setLevel(logging.DEBUG if debug else logging.INFO)

    bm25 = BM25()

    # Sets the working directory and the name of the xp
    with experiment(workdir, "bm25", port=port) as xp:
        # Index the collection
        xp.setenv("JAVA_HOME", os.environ["JAVA_HOME"])
        ds = prepare_dataset(dataset)

        documents = ds.documents
        index = IndexCollection(
            documents=documents,
            storePositions=True,
            storeDocvectors=True,
            storeContents=True,
            threads=CPU_COUNT,
        ).submit()

        # Search with BM25
        bm25_retriever = AnseriniRetriever(k=1500, index=index,
                                           model=bm25).tag("model", "bm25")

        bm25_eval = Evaluate(dataset=ds, retriever=bm25_retriever).submit()

    print("BM25 results on TREC 1")
    print(bm25_eval.results.read_text())
Example #8
def get_dataloaders(batch_size, word2id):

    # load the IMDB dataset
    ds = prepare_dataset("edu.standford.aclimdb")

    # ds.train.classes and ds.test.classes are dicts mapping class name to folder path
    dev_ds = FolderText(ds.train.classes, word2id, load=False)
    test_ds = FolderText(ds.test.classes, word2id, load=False)

    # partition development set as train and validation set
    train_len = int(len(dev_ds) * 0.9)
    val_len = len(dev_ds) - train_len
    train_ds, val_ds = torch.utils.data.random_split(dev_ds,
                                                     [train_len, val_len])

    kwargs = dict(collate_fn=FolderText.collate,
                  pin_memory=(torch.cuda.is_available()),
                  num_workers=torch.multiprocessing.cpu_count())

    train_loader = DataLoader(train_ds,
                              batch_size=batch_size,
                              shuffle=True,
                              **kwargs)
    val_loader = DataLoader(val_ds,
                            batch_size=batch_size,
                            shuffle=True,
                            **kwargs)
    test_loader = DataLoader(test_ds,
                             batch_size=batch_size,
                             shuffle=True,
                             **kwargs)

    return train_loader, val_loader, test_loader
Example #9
def msmarco_train_triplets(info: Information):
    """Use MS-Marco triplets"""
    info.train_sampler = TripletBasedSampler(
        source=prepare_dataset(
            "com.microsoft.msmarco.passage.train.idtriples"),
        index=info.index(_msmarco_docs()),
    )
Example #10
def get_glove_embeddings():
    word2id, embeddings = prepare_dataset('edu.standford.glove.6b.50').load()

    # Add a null embedding for padding at id 0 so that zero-padding works.
    # To reserve id 0 for the pad token, shift every existing word id up by one
    # (id 0 is already used by a word in the GloVe vocabulary).
    for word in word2id:
        word2id[word] += 1
    word2id['<pad>'] = 0
    embeddings = np.insert(embeddings, 0, values=0, axis=0)

    # add an OOV embedding: use the mean of all embeddings
    OOV_ID = len(word2id)
    word2id['<oov>'] = OOV_ID
    embeddings = np.insert(embeddings, OOV_ID, embeddings.mean(0), axis=0)

    return word2id, embeddings
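A minimal sketch of plugging the returned matrix into a PyTorch embedding layer; freezing the weights and using the pad id as padding_idx are assumptions, not part of the original code.

import torch

word2id, embeddings = get_glove_embeddings()
emb_layer = torch.nn.Embedding.from_pretrained(
    torch.tensor(embeddings, dtype=torch.float),
    freeze=True, padding_idx=word2id['<pad>'])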
Example #11
    def __init__(self, train=True) -> None:
        super().__init__()

        self.train = train

        ds = prepare_dataset("com.lecun.mnist")

        if self.train:
            train_x = ds.files["train/images"].data()
            train_y = ds.files["train/labels"].data()
        else:
            train_x = ds.files["test/images"].data()
            train_y = ds.files["test/labels"].data()

        train_x = np.reshape(
            train_x,
            (train_x.shape[0], train_x.shape[1] * train_x.shape[2])) / 255

        self.train_x = train_x
Example #12
def robust(info: Information, top_k: int):
    """Use the TREC Robust dataset"""
    from xpmir.datasets.robust import fold

    # Return pairs topic/qrels
    documents = prepare_dataset("gov.nist.trec.adhoc.robust.2004").documents

    def get(p: str):
        topics, qrels = fold(p)
        return Adhoc(topics=topics, assessments=qrels, documents=documents)

    info.train_sampler = ModelBasedSampler(
        retriever=AnseriniRetriever(k=top_k,
                                    index=info.index(documents),
                                    model=info.basemodel),
        dataset=get("trf1"),
    )

    info.dev = get("trf1")
    info.test = get("f1")
Example #13
def cli(vocab_size: int):
    # Create the dataset and the model
    ds = prepare_dataset("com.sentiment140")

    # Create the vocabulary
    wpmodel = Path("wp{}.model".format(vocab_size))
    if not wpmodel.is_file():
        logging.info("Did not find the wordpiece model %s", wpmodel)
        TRAINPATH = Path("/tmp/sentiment140-train.txt")
        cleanup(ds.files["train"], TRAINPATH)
        program = f"""import sentencepiece as spm; spm.SentencePieceTrainer.Train('--model_prefix=wp{vocab_size} --vocab_size={vocab_size} --input={TRAINPATH}')"""
        subprocess.run([sys.executable, "-c", program])
        TRAINPATH.unlink()

    # Create the datasets
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.Load(f"wp{vocab_size}.model")

    CLASSMAP = {0: 0, 4: 1}
    test = generatedata("test", tokenizer, vocab_size, ds, CLASSMAP)
    train = generatedata("train", tokenizer, vocab_size, ds, CLASSMAP)
Example #14
    def prepare():
        """Index the MS-Marco collection"""
        # Get the collection and index it
        collection = prepare_dataset(
            "com.microsoft.msmarco.passage.collection")

        docstore = BuildDocStore(collection=collection).submit()
        index = Reindex(collection=collection, stemmer="none").submit()
        index_stem = Reindex(collection=collection, stemmer="porter").submit()

        def get(subset: str):
            topics = prepare_dataset(
                f"com.microsoft.msmarco.passage.{subset}.queries")
            qrels = prepare_dataset(
                f"com.microsoft.msmarco.passage.{subset}.qrels")
            assessed_topics = TrecAssessedTopics(topics=topics,
                                                 assessments=qrels)
            return MsmarcoDataset(docstore=docstore,
                                  index=index,
                                  index_stem=index_stem,
                                  assessed_topics=assessed_topics)

        return get
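A hypothetical usage sketch, assuming prepare is reachable as a plain callable and that "train" and "dev" are valid subset names (as suggested by the MS MARCO dataset ids above).

msmarco = prepare()
train = msmarco("train")
dev = msmarco("dev")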
Example #15
def cli(vocab_size: int):
    # Create the dataset and the model
    ds = prepare_dataset("com.sentiment140.english")

    # Create the vocabulary
    wpmodel = Path("wp{}.model".format(vocab_size))
    if not wpmodel.is_file():
        logging.info("Did not find the wordpiece model %s", wpmodel)
        TRAINPATH = Path("sentiment140-train.txt")
        cleanup(ds.train.path, TRAINPATH)
        logging.info("Création du vocabulaire avec sentencepiece")
        spm.SentencePieceTrainer.train(input=str(TRAINPATH),
                                       model_prefix=f"wp{vocab_size}",
                                       vocab_size=vocab_size)
        TRAINPATH.unlink()

    # Create the datasets
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.Load(f"wp{vocab_size}.model")

    CLASSMAP = {0: 0, 4: 1}
    logging.info("Traitement du train/test (Sentiment 140)")
    generatedata("test", tokenizer, vocab_size, ds.test, CLASSMAP)
    generatedata("train", tokenizer, vocab_size, ds.train, CLASSMAP)
Example #16
File: tp9_etu.py  Project: eusip/AMAL
import re
from pathlib import Path
from torch.utils.data import Dataset
from datamaestro import prepare_dataset

EMBEDDING_SIZE = 50

ds = prepare_dataset("edu.standford.aclimdb")
word2id, embeddings = prepare_dataset('edu.standford.glove.6b.%d' % EMBEDDING_SIZE).load()

class FolderText(Dataset):
    def __init__(self, classes, tokenizer, load=False):
        self.tokenizer = tokenizer
        self.files = []
        self.filelabels = []
        self.labels = list(classes.keys())
        for label, folder in classes.items():
            for file in folder.glob("*.txt"):
                self.files.append(file)
                self.filelabels.append(label)

    def __len__(self):
        return len(self.filelabels)
    
    def __getitem__(self, ix):
        return self.tokenizer(self.files[ix].read_text()), self.filelabels[ix]


WORDS = re.compile(r"\S+")
def tokenizer(t):
    return re.findall(WORDS, t.lower())
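A usage sketch tying the pieces above together; ds.train.classes maps each class name to a folder of .txt files (see the comments in the get_dataloaders example earlier on this page).

train_dataset = FolderText(ds.train.classes, tokenizer)
tokens, label = train_dataset[0]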
Example #17
from datamaestro import prepare_dataset

ds = prepare_dataset("org.universaldependencies.french.gsd")
print(ds)
Example #18
File: vae.py  Project: ykrmm/RLDAL
        eps = torch.randn(mu.size())
        z = mu + eps * sigma
        return z

    def forward(self, x):

        mu, sigma = self.encode(x)
        z = self.reparametrization(mu, sigma)
        y = self.decode(z)

        return y, mu, sigma


if __name__ == '__main__':

    ds = prepare_dataset("com.lecun.mnist")
    train_images, train_labels = ds.train.images.data(), ds.train.labels.data()
    test_images, test_labels = ds.test.images.data(), ds.test.labels.data()

    dataset_train = Mnist_dataset(train_images, train_labels)
    dataset_test = Mnist_dataset(test_images, test_labels)
    batch_size = 65
    train_loader = DataLoader(dataset_train,
                              shuffle=True,
                              batch_size=batch_size)
    test_loader = DataLoader(dataset_test, shuffle=True, batch_size=batch_size)

    writer = SummaryWriter()

    savepath = "save_net/auto_encoder.model"
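A sketch of the usual VAE training objective, added for reference; this is an assumption about how the model would be trained, not code from vae.py.

# Typical VAE loss: reconstruction error plus the closed-form Gaussian KL term
#   KL(q(z|x) || N(0, I)) = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
def vae_loss(x, y, mu, sigma):
    recon = ((y - x) ** 2).sum()
    kl = -0.5 * torch.sum(1 + torch.log(sigma ** 2) - mu ** 2 - sigma ** 2)
    return recon + kl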
Example #19
def _msmarco_docs():
    return prepare_dataset("com.microsoft.msmarco.passage.collection")
Example #20
def glove(info):
    from xpmir.vocab.wordvec_vocab import WordvecUnkVocab

    wordembs = prepare_dataset("edu.stanford.glove.6b.50")
    return WordvecUnkVocab(data=wordembs, random=info.random)
Example #21
    def forward(ctx, y, target):
        ctx.save_for_backward(y, target)
        return torch.sum(torch.pow(y - target, 2))

    @staticmethod
    def backward(ctx, grad_outputs):
        y, target = ctx.saved_tensors
        y_grad = (2 * y - 2 * target) * grad_outputs

        return y_grad, None


## To use the function
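## (sketch, not in the original: with recent PyTorch a custom autograd Function
##  is invoked through its .apply classmethod, e.g. loss = MSE.apply(yhat, y))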

## To download the Boston dataset
ds = prepare_dataset("edu.uci.boston")
fields, data = ds.files.data()

n = data.shape[0]

regling = linear1()
mse = MSE()

learning_rate = 0.01

# Parameters
w = torch.rand(13, requires_grad=True, dtype=torch.double)
b = torch.rand(1, requires_grad=True, dtype=torch.double)

writer = SummaryWriter()
Example #22
                            **kwargs)
    test_loader = DataLoader(test_ds,
                             batch_size=batch_size,
                             shuffle=True,
                             **kwargs)

    return train_loader, val_loader, test_loader


if __name__ == '__main__':

    EMBEDDING_SIZE = 50  # size of GloVe vectors

    # load the IMDB dataset
    print('Loading IMDB dataset..')
    ds = prepare_dataset("edu.standford.aclimdb")
    # load pretrained GloVe word embeddings (400k trained vectors)
    print('Loading GloVe embeddings..')
    word2id, embeddings = get_glove_embeddings()

    print('Vocab from GloVe: size {}, head: {}'.format(len(word2id), [
        (i, w) for w, i in word2id.items()
    ][:40]))
    print('Embeddings matrix size:', type(embeddings), embeddings.shape)

    # ds.train.classes and ds.test.classes are dicts mapping class name to folder path
    train_dataset = FolderText(ds.train.classes, word2id, load=False)
    test_dataset = FolderText(ds.test.classes, word2id, load=False)

    print('IMDB dataset:')
    print('Number of training samples:', len(train_dataset))