Example #1
def train():
    """
    Train the tfidf model, requires downloaded data and saves to models/
    """
    dataset = QuizBowlDataset(guesser_train=True)
    tfidf_guesser = TfidfGuesser()
    tfidf_guesser.train(dataset.training_data())
    tfidf_guesser.save()
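
The listing only shows how TfidfGuesser is called, not its internals. For reference, a minimal sketch of what a TF-IDF guesser of this shape could look like, built on sklearn's TfidfVectorizer (the attribute names, method signatures, and models/ path below are assumptions, not the qanta implementation):

import os
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

class TfidfGuesser:
    def __init__(self):
        self.vectorizer = None
        self.matrix = None
        self.i_to_ans = {}

    def train(self, training_data):
        # training_data is assumed to be (questions, answers), where each
        # question is a list of sentences
        questions, answers = training_data[0], training_data[1]
        docs = [' '.join(q) for q in questions]
        self.i_to_ans = dict(enumerate(answers))
        self.vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
        self.matrix = self.vectorizer.fit_transform(docs)

    def guess(self, question: str, max_guesses: int = 10):
        # TF-IDF rows are L2-normalized, so the dot product is cosine similarity
        vec = self.vectorizer.transform([question])
        scores = (self.matrix * vec.T).toarray().ravel()
        top = scores.argsort()[::-1][:max_guesses]
        return [(self.i_to_ans[i], scores[i]) for i in top]

    def save(self, path: str = 'models/tfidf.pickle'):
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'wb') as f:
            pickle.dump({'vectorizer': self.vectorizer,
                         'matrix': self.matrix,
                         'i_to_ans': self.i_to_ans}, f)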
Example #2
def train():
    """
    While this Naive IR model does not require any training, we call it any way to maintain consistency with the pipeline,
    requires downloaded data and saves to models/
    """
    dataset = QuizBowlDataset(guesser_train=True)
    naive_ir_guesser = NaiveIRGuesser()
    naive_ir_guesser.train(dataset.training_data())
    naive_ir_guesser.save()
Example #3
    def _prepare_question_list(
            self, data_type: str, retriever: AbsRetriever
    ) -> List[List[Union[str, List[str], List[bool]]]]:
        '''
        Args:
            data_type: str, 'train' / 'val'.
            retriever: AbsRetriever
        Returns:
            the list of [question, passage_list, label_list] samples
        '''
        dump_path = "HAR" + data_type + ".dump"
        if os.path.isfile(dump_path):
            logger.info(dump_path + " already exists, loading from file")
            with open(dump_path, 'rb') as f:
                params = pickle.load(f)
                sample_list = params['data_dump']
            return sample_list
        dataset = QuizBowlDataset(guesser_train=True)

        if data_type == 'train':
            data = dataset.training_data_text()
        elif data_type == 'val':
            data = dataset.dev_data_text()
        else:
            raise ValueError("data_type must be 'train' or 'val'")
        sample_list = []
        questions = data[0]
        answers = data[1]

        tot_len = len(questions)
        retrieved_pages = []
        batch_size = 1000
        for idx in tqdm(range(0, tot_len, batch_size)):
            end_idx = min(idx + batch_size, tot_len)
            one_batch = retriever.retrieve(questions[idx:end_idx])
            retrieved_pages.extend(one_batch)

        for q, pages, ans in tqdm(zip(questions, retrieved_pages, answers)):
            passage_list = []
            label_list = []
            for page, score in pages:
                try:
                    page_item = self.wiki_page_dict[page]
                    # Just use summary for now
                    passage_list.append(page_item.summary)
                except Exception:
                    logger.warning("Skipping page " + page)
                    continue
                label_list.append(ans == page)
            sample_list.append([q, passage_list, label_list])

        with open(dump_path, 'wb') as f:
            pickle.dump({'data_dump': sample_list}, f)
        return sample_list
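
The method above caches its expensive retrieval pass by pickling the result to a dump file and short-circuiting on later calls. The same pattern can be factored into a small helper; a minimal sketch (the helper name and dump format here are hypothetical):

import os
import pickle

def cached(dump_path, compute):
    # Load a previously pickled result if present, otherwise compute
    # it once and persist it for the next run.
    if os.path.isfile(dump_path):
        with open(dump_path, 'rb') as f:
            return pickle.load(f)['data_dump']
    result = compute()
    with open(dump_path, 'wb') as f:
        pickle.dump({'data_dump': result}, f)
    return result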
Example #4
    def train(self, path: str) -> None:
        '''
        Train the model.
        Args:
            path: str, pkl model path
        '''

        print("*** preprocessing ***")
        dataset = QuizBowlDataset(guesser_train=True)
        training_data = dataset.training_data()

        questions = training_data[0]
        answers = training_data[1]
        answer_docs = defaultdict(str)
        for q, ans in zip(questions, answers):
            text = ' '.join(q)
            answer_docs[ans] += ' ' + text.lower()

        x_array = []
        y_array = []
        for ans, doc in answer_docs.items():
            tokens = word_tokenize(doc)
            # remove stop words
            x_array.append([t for t in tokens if t not in self.stop_words])
            y_array.append(ans)

        self.i_to_ans = {i: ans for i, ans in enumerate(y_array)}

        print("*** building dictionary ***")
        self.dictionary = corpora.Dictionary(x_array)

        print("*** building bag of words representations ***")
        docs_bag_of_words = [self.dictionary.doc2bow(x) for x in x_array]

        print("*** building bm25 model ***")
        self.bm25_model = bm25.BM25(docs_bag_of_words)

        print("*** running two test questions ***")
        questions = [
            """ The oldest document written in this language is a letter written in 1521 in the town of Câmpulung, while more recent poets writing in this language include Carmen Sylva and Anton Pann. This language uses five cases, though the genitive and dative cases are identical, as are the nominative and accusative. Tripthongs occur frequently in this language, as in "rusaoică," while interjections in this language include "mamă-mamă. " It is more closely related to Dalmatian than to Italian or Spanish, and this language includes the pronouns "noi," "voi," and "eu" ["AY-oo"] and favors labial consonants such as "b" and "m" over velars such as "g" and "k." For 10 points, name this tongue spoken by the members of O-Zone and Nicolae Ceauşescu, an Eastern Romance language spoken in Bucharest. This tongue has direct and the oblique cases, and, unlike its related languages, maintains a long o/short u distinction. The future tense in this language is invoked by compounding the verb meaning "to wish," "a vrea. " This language's verb for "to preserve," "a p?stra," is the source of its sole loanword into English, "pastrami. " Its endangered dialects include the Megleno- and Istro- versions. The most popular regional varieties of this language are the Aro- and Daco- forms. It is identical to a language known for nationalist reasons as (*) "Moldovan" and, due to its geographic distribution, exhibits a high degree of borrowed vocabulary from Slavic tongues. For 10 points, identify this easternmost Romance language, spoken by such figures as Ion Antonescu and Constantin Brancusi. """,
            """  The narrator of this novel is alarmed when a ghost\'s hand reaches into his room and starts bleeding from shattered glass. Isabella and Edgar Linton, heirs to Thrushcross Grange, marry the two protagonists of this novel. Narrated in parts by Nelly Dean and Mr. Lockwood, this novel\'s tragic couple is Catherine Earnshaw and Heathcliff. For 10 points, name this only novel by Emily Bronte. The narrator of this novel is thought by the servant Joseph to have stolen a lantern, and that narrator is consequently attacked by dogs set loose by Joseph. One character is adopted from an orphanage in Liverpool and throws applesauce at another character. Hindley, whose wife Frances dies after giving birth to Hareton, hates that orphan. Nelly Dean tells the story of the house to the narrator of this novel, Mr. Lockwood, who rents a room at Thrushcross Grange. Identify this novel centering on the romance between Heathcliff and Catherine, a work by Emily Brontë. This book\'s final chapter chronicles how one of its protagonists starts to starve himself to death, during which he claims to have seen the edge of hell but is now near heaven. Another chapter of this novel describes how a premature baby named Catherine is buried near the corner of a church with the Lintons. One character in this novel is freed by the housekeeper Zillah after she is imprisoned for many days. The frame story of this novel is set up between Nelly\'s tale to Lockwood about Thrushcross Grange and the titular locale. For 10 points, name this novel about Heathcliff and his relationship to Catherineforrtl """
        ]
        print("answers: ", self.retrieve(questions))

        print("*** saving bm25 retriever ***")
        self._save(path)
        """
Example #5
    def eval(self):
        # evaluate accuracy
        self.k = 50
        print("Evaluating test accuracy...")
        dataset = QuizBowlDataset(guesser_train=True)
        test_data = dataset.test_data()
        questions = [[' '.join(q)] for q in test_data[0]]
        answers = test_data[1]
        guesses = []

        for i in range(len(questions)):
            if i % 100 == 0:
                print(i, answers[i], questions[i])
                print(self.retrieve(questions[i])[0])
            guesses.append((answers[i], self.retrieve(questions[i])[0]))

        with open("guesses_test", 'wb') as f:
            pickle.dump(guesses, f)
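
The eval loop above only dumps (answer, top_guess) pairs; the accuracy itself can be computed offline from the pickle. A small sketch, assuming each stored guess's top-ranked entry is a (page, score) pair as in the retriever examples above:

import pickle

with open("guesses_test", 'rb') as f:
    guesses = pickle.load(f)

# Each entry is (gold_answer, retrieved); assume the retrieved item's
# top-ranked page is its first (page, score) pair, as in Example #3.
correct = sum(1 for ans, retrieved in guesses if retrieved[0][0] == ans)
print("test accuracy: %.3f" % (correct / len(guesses)))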
Example #6
    def train(
        self,
        path: str,
        retriever: AbsRetriever,
        wiki_dump: str,
    ):
        '''
        Download the wiki page info for every answer in the training set.
        Args:
            path: str, pkl model path
            retriever: AbsRetriever, not used here
            wiki_dump: str, path to the wiki dump json file from the time
                those page labels were created
        '''

        if not os.path.isfile(wiki_dump):
            logger.warning("Wiki dump doesn't exit, download a new one")
            urllib.request.urlretrieve(WIKI_DUMP_URL, wiki_dump)
        with open(wiki_dump, 'r') as f:
            old_wiki_dict = json.load(f)

        dataset = QuizBowlDataset(guesser_train=True)
        training_data = dataset.training_data()
        answers = training_data[1]
        logger.info("Start extracing wiki pages")

        # get it from the wikipedia API since it has anchor text information
        # using page search alone would hit disambiguation issues since page names have changed

        for ans in tqdm(answers):
            try:
                wiki_pageid = old_wiki_dict[ans]['id']
                self.wiki_page_dict[ans] = wikipedia.page(pageid=wiki_pageid)
            except Exception:
                logger.warning(
                    "Failed to get wiki page %s using the id from the old wiki dump"
                    % ans)
                try:
                    logger.warning("Falling back to a direct page search for %s" % ans)
                    self.wiki_page_dict[ans] = wikipedia.page(
                        ans, auto_suggest=False)
                except Exception:
                    logger.warning("Failed to get " + ans)

        with open(path, 'wb') as f:
            pickle.dump({'wiki_page_dict': self.wiki_page_dict}, f)
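
The broad except Exception clauses above preserve the snippet's catch-all behavior, but the wikipedia package exposes specific exception types that make the fallback logic clearer. A sketch of the same lookup with explicit exceptions (PageError and DisambiguationError are the package's documented types; the helper name is hypothetical):

import wikipedia
from wikipedia.exceptions import PageError, DisambiguationError

def fetch_page(ans, old_wiki_dict):
    # Prefer the stable page id from the old dump; fall back to a
    # direct title lookup without auto-suggestion.
    try:
        return wikipedia.page(pageid=old_wiki_dict[ans]['id'])
    except (KeyError, PageError, DisambiguationError):
        try:
            return wikipedia.page(ans, auto_suggest=False)
        except (PageError, DisambiguationError):
            return None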
Example #7
    def _prepare_wiki_dict(self, path: str, old_wiki_dict: dict):
        '''
        Load the page info into a dict
        Args:
            path: str, the path to the wiki dict
            old_wiki_dict: dict, the loaded old wiki dump
        '''
        if os.path.isfile(path):
            logger.info("Wiki dict already exits, loading from file")
            self._load_wiki_dict(path)
            return
        logger.warning(
            "Wiki dict not found, preparing the wiki dict, this might take a while"
        )
        dataset = QuizBowlDataset(guesser_train=True)
        training_data = dataset.training_data()
        answers = training_data[1]
        logger.info("Start extracing wiki pages")

        # get it from the wikipedia API since it has anchor text information
        # using page search alone would hit disambiguation issues since page names have changed

        for ans in tqdm(answers):
            try:
                wiki_pageid = old_wiki_dict[ans]['id']
                self.wiki_page_dict[ans] = wikipedia.page(pageid=wiki_pageid)
            except Exception:
                logger.warning(
                    "Failed to get wiki page %s using the id from the old wiki dump"
                    % ans)
                try:
                    logger.warning("Falling back to a direct page search for %s" % ans)
                    self.wiki_page_dict[ans] = wikipedia.page(
                        ans, auto_suggest=False)
                except Exception:
                    logger.warning("Failed to get " + ans)

        with open(path, 'wb') as f:
            pickle.dump({'wiki_page_dict': self.wiki_page_dict}, f)
Example #8
from typing import List, Optional, Tuple
from collections import defaultdict
import pickle
import json
from os import path

import click
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from flask import Flask, jsonify, request

from qanta import util
from qanta.dataset import QuizBowlDataset

if __name__ == '__main__':

    print("hi")
    dataset = QuizBowlDataset(guesser_train=True)
    print(dataset)
Example #9
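This snippet relies on module-level imports that the listing omits. The standard-library and third-party ones below are implied by the code; the qanta-project names are assumptions about the surrounding repository:

# Implied stdlib/third-party imports:
import json
import numpy as np
import torch
from os import path
from torch.utils.data import DataLoader
# Assumed project-local names (not shown in this listing): LSTM, getWikiData,
# TfidfGuesser, QuizBowlDataset, BUZZ_NUM_GUESSES, CHAR_SKIP
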
def train():
    """
    Train the tfidf model, requires downloaded data and saves to models/
    """
    print("At train 1 : Getting Datasets")

    # See if the files already exist
    exists = path.isfile('tfidf.pickle') and path.isfile(
        'train_exs.npy') and path.isfile('dev_exs.npy')
    if not exists:
        datasetTrainGuess = QuizBowlDataset(guesser_train=True).training_data()
        datasetTrainBuzz = QuizBowlDataset(buzzer_train=True).training_data()
        datasetDevBuzz = QuizBowlDataset(buzzer_dev=True).training_data()

    #Download Wikipedia data if not found
    exists = path.isfile('wikidata.json')
    if not exists:
        print('creating wiki data file')
        getWikiData.get_wikipedia_data()

    # datasetTrainGuess[0] -- questions
    #    Each question has its own set of sentences.
    # datasetTrainGuess[1] -- answers

    # Check if pickle exists
    print("At train 2 : Getting Guessers and train them")
    exists = path.isfile('tfidf.pickle')
    if exists:
        tfidf_guesser = TfidfGuesser.load()
    else:
        print('loading wiki')
        with open('wikidata.json') as f:
            data = json.load(f)

        print('Finished loading')
        questions_wiki = []
        answers_wiki = []
        for i, key in enumerate(data):
            if i >= 500:
                break
            answers_wiki.append(data[key]['title'])
            questions_wiki.append(data[key]['text'])

        print("training data ready")
        tfidf_guesser = TfidfGuesser()
        tfidf_guesser.train(datasetTrainGuess, questions_wiki, answers_wiki)
        tfidf_guesser.save()

    # Check if trained examples exist

    print("At train 3 : Train them")
    exists = path.isfile('train_exs.npy') and path.isfile('dev_exs.npy')
    if exists:
        train_exs = np.load('train_exs.npy')
        dev_exs = np.load('dev_exs.npy')
    else:
        # Phrase skip should go in the function below
        print('Generating Guesses for Training Buzzer Data')
        (train_qnums, train_answers, train_char_indices, train_ques_texts,
         train_ques_lens, train_guesses_and_scores) = LSTM.generate_guesses_and_scores(
             tfidf_guesser, datasetTrainBuzz, BUZZ_NUM_GUESSES, char_skip=CHAR_SKIP)

        print('Generating Guesses for Dev Buzzer Data')
        (dev_qnums, dev_answers, dev_char_indices, dev_ques_texts,
         dev_ques_lens, dev_guesses_and_scores) = LSTM.generate_guesses_and_scores(
             tfidf_guesser, datasetDevBuzz, BUZZ_NUM_GUESSES, char_skip=CHAR_SKIP)

        train_exs = LSTM.create_feature_vecs_and_labels(
            train_guesses_and_scores, train_answers, BUZZ_NUM_GUESSES)
        dev_exs = LSTM.create_feature_vecs_and_labels(dev_guesses_and_scores,
                                                      dev_answers,
                                                      BUZZ_NUM_GUESSES)
        np.save('train_exs.npy', train_exs)
        np.save('dev_exs.npy', dev_exs)

    # Organize data
    print("At train 4: Data")
    train_dataset = LSTM.QuestionDataset(train_exs)
    train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset)
    train_loader = DataLoader(train_dataset,
                              batch_size=8,
                              sampler=train_sampler,
                              num_workers=0,
                              collate_fn=LSTM.batchify)

    dev_dataset = LSTM.QuestionDataset(dev_exs)
    dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)
    dev_loader = DataLoader(dev_dataset,
                            batch_size=8,
                            sampler=dev_sampler,
                            num_workers=0,
                            collate_fn=LSTM.batchify)

    print("At train 5: Train LSTM")
    lstm = LSTM.RNNBuzzer()

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    lstm.to(device)
    for epoch in range(25):
        print('start epoch %d' % (epoch + 1))
        train_acc, dev_acc = LSTM.train(50, lstm, train_loader, dev_loader,
                                        device)

    print("At train 6: Saving LSTM model")

    torch.save(lstm.state_dict(), 'lstm_model.pt')

    print("Done with training")
Example #10
def train():
    """
    Train the DAN guesser on the training data, validating on the dev split.
    """
    dataset = QuizBowlDataset(guesser_train=True)
    dan_guesser = DanGuesser()
    dan_guesser.train(dataset.training_data(), dataset.dev_data())
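
DanGuesser's definition is likewise outside this listing; DAN here refers to the deep averaging network used in the qanta literature. A rough sketch of the core model, averaged word embeddings followed by a feedforward classifier over the answer set (all dimensions and layer choices are assumptions):

import torch
import torch.nn as nn

class DanModel(nn.Module):
    # Deep averaging network: embed words, average them, then classify
    # over the set of possible answers.
    def __init__(self, vocab_size, n_answers, emb_dim=300, hidden_dim=300):
        super().__init__()
        self.embeddings = nn.EmbeddingBag(vocab_size, emb_dim, mode='mean')
        self.classifier = nn.Sequential(
            nn.Linear(emb_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, n_answers),
        )

    def forward(self, token_ids, offsets):
        # token_ids: 1-D tensor of word ids for a batch of concatenated
        # questions; offsets: start index of each question in token_ids
        return self.classifier(self.embeddings(token_ids, offsets))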