Example #1
    def train(self, path: str) -> None:
        '''
        Train the TF-IDF model and save it to a pickle file.
        Args:
            path: str, path to the pickle file where the model is saved
        '''
        dataset = QuizBowlDataset(guesser_train=True)
        training_data = dataset.training_data()

        questions = training_data[0]
        answers = training_data[1]
        answer_docs = defaultdict(str)
        for q, ans in zip(questions, answers):
            text = ' '.join(q)
            answer_docs[ans] += ' ' + text

        x_array = []
        y_array = []
        for ans, doc in answer_docs.items():
            x_array.append(doc)
            y_array.append(ans)

        self.i_to_ans = {i: ans for i, ans in enumerate(y_array)}
        self.tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3),
                                                min_df=2,
                                                max_df=.9).fit(x_array)
        self.tfidf_matrix = self.tfidf_vectorizer.transform(x_array)
        self._save(path)
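For context, here is a minimal sketch of how the fitted vectorizer and stored TF-IDF matrix might be queried at guess time. The guess method, its signature, and the ranking scheme below are illustrative assumptions and are not part of the example above.

    def guess(self, questions, max_n_guesses: int = 1):
        '''
        Hypothetical sketch: rank the stored answer documents against each question.
        '''
        # Vectorize the incoming questions with the already-fitted vectorizer
        representations = self.tfidf_vectorizer.transform(questions)
        # Rows of tfidf_matrix are L2-normalized, so the dot product behaves like cosine similarity
        guess_matrix = self.tfidf_matrix.dot(representations.T).T.toarray()
        guess_indices = (-guess_matrix).argsort(axis=1)[:, :max_n_guesses]
        guesses = []
        for i in range(len(questions)):
            guesses.append([(self.i_to_ans[j], guess_matrix[i, j])
                            for j in guess_indices[i]])
        return guesses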
Example #2
def train_entities():
    """
    Train the tfidf model with context
    """
    dataset = QuizBowlDataset(guesser_train=True)
    tfidf_guesser = TfidfContextGuesser()
    tfidf_guesser.train(dataset.training_data())
    tfidf_guesser.save()
Example #3
def train():
    """
    Train the tfidf model, requires downloaded data and saves to models/
    """
    dataset = QuizBowlDataset(guesser_train=True)
    tfidf_guesser = TfidfGuesser()
    tfidf_guesser.train(dataset.training_data())
    tfidf_guesser.save()
Example #4
def train():
    """
    While this Naive IR model does not require any training, we call it anyway to
    maintain consistency with the pipeline; requires downloaded data and saves to models/
    """
    dataset = QuizBowlDataset(guesser_train=True)
    naive_ir_guesser = NaiveIRGuesser()
    naive_ir_guesser.train(dataset.training_data())
    naive_ir_guesser.save()
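Since the internals of NaiveIRGuesser are not shown, here is a minimal sketch of what the matching no-op train()/save() pair on the guesser side might look like; the class body below is an assumption, included only to illustrate the "consistency with the pipeline" point made in the docstring.

class NaiveIRGuesser:
    def train(self, training_data) -> None:
        # Nothing to fit: the method exists only so the pipeline can call
        # every guesser's train() the same way.
        pass

    def save(self) -> None:
        # Nothing to persist in this sketch; kept so the calling code stays uniform.
        pass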
Example #5
    def train(self, path: str) -> None:
        '''
        Train the BM25 model and save it to a pickle file.
        Args:
            path: str, path to the pickle file where the model is saved
        '''

        print("*** preprocessing ***")
        dataset = QuizBowlDataset(guesser_train=True)
        training_data = dataset.training_data()

        questions = training_data[0]
        answers = training_data[1]
        answer_docs = defaultdict(str)
        for q, ans in zip(questions, answers):
            text = ' '.join(q)
            answer_docs[ans] += ' ' + text.lower()

        x_array = []
        y_array = []
        for ans, doc in answer_docs.items():
            tokens = word_tokenize(doc)
            # remove stop words
            x_array.append([t for t in tokens if t not in self.stop_words])
            y_array.append(ans)

        self.i_to_ans = {i: ans for i, ans in enumerate(y_array)}

        print("*** building dictionary ***")
        self.dictionary = corpora.Dictionary(x_array)

        print("*** building bag of words representations ***")
        docs_bag_of_words = [self.dictionary.doc2bow(x) for x in x_array]

        print("*** building bm25 model ***")
        self.bm25_model = bm25.BM25(docs_bag_of_words)

        print("*** running two test questions ***")
        questions = [
            """ The oldest document written in this language is a letter written in 1521 in the town of Câmpulung, while more recent poets writing in this language include Carmen Sylva and Anton Pann. This language uses five cases, though the genitive and dative cases are identical, as are the nominative and accusative. Tripthongs occur frequently in this language, as in "rusaoică," while interjections in this language include "mamă-mamă. " It is more closely related to Dalmatian than to Italian or Spanish, and this language includes the pronouns "noi," "voi," and "eu" ["AY-oo"] and favors labial consonants such as "b" and "m" over velars such as "g" and "k." For 10 points, name this tongue spoken by the members of O-Zone and Nicolae Ceauşescu, an Eastern Romance language spoken in Bucharest. This tongue has direct and the oblique cases, and, unlike its related languages, maintains a long o/short u distinction. The future tense in this language is invoked by compounding the verb meaning "to wish," "a vrea. " This language's verb for "to preserve," "a p?stra," is the source of its sole loanword into English, "pastrami. " Its endangered dialects include the Megleno- and Istro- versions. The most popular regional varieties of this language are the Aro- and Daco- forms. It is identical to a language known for nationalist reasons as (*) "Moldovan" and, due to its geographic distribution, exhibits a high degree of borrowed vocabulary from Slavic tongues. For 10 points, identify this easternmost Romance language, spoken by such figures as Ion Antonescu and Constantin Brancusi. """,
            """  The narrator of this novel is alarmed when a ghost\'s hand reaches into his room and starts bleeding from shattered glass. Isabella and Edgar Linton, heirs to Thrushcross Grange, marry the two protagonists of this novel. Narrated in parts by Nelly Dean and Mr. Lockwood, this novel\'s tragic couple is Catherine Earnshaw and Heathcliff. For 10 points, name this only novel by Emily Bronte. The narrator of this novel is thought by the servant Joseph to have stolen a lantern, and that narrator is consequently attacked by dogs set loose by Joseph. One character is adopted from an orphanage in Liverpool and throws applesauce at another character. Hindley, whose wife Frances dies after giving birth to Hareton, hates that orphan. Nelly Dean tells the story of the house to the narrator of this novel, Mr. Lockwood, who rents a room at Thrushcross Grange. Identify this novel centering on the romance between Heathcliff and Catherine, a work by Emily Brontë. This book\'s final chapter chronicles how one of its protagonists starts to starve himself to death, during which he claims to have seen the edge of hell but is now near heaven. Another chapter of this novel describes how a premature baby named Catherine is buried near the corner of a church with the Lintons. One character in this novel is freed by the housekeeper Zillah after she is imprisoned for many days. The frame story of this novel is set up between Nelly\'s tale to Lockwood about Thrushcross Grange and the titular locale. For 10 points, name this novel about Heathcliff and his relationship to Catherineforrtl """
        ]
        print("answers: ", self.retrieve(questions))

        print("*** saving bm25 retriever ***")
        self._save(path)
        """
Example #6
    def train(
        self,
        path: str,
        retriever: AbsRetriever,
        wiki_dump: str,
    ):
        '''
        Download the Wikipedia page info for every answer in the training set
        Args:
            path: str, pkl model path
            retriever: AbsRetriever, not used here
            wiki_dump: str, path to the wiki dump JSON file from the time the page labels were created
        '''

        if not os.path.isfile(wiki_dump):
            logger.warning("Wiki dump doesn't exit, download a new one")
            urllib.request.urlretrieve(WIKI_DUMP_URL, wiki_dump)
        with open(wiki_dump, 'r') as f:
            old_wiki_dict = json.load(f)

        dataset = QuizBowlDataset(guesser_train=True)
        training_data = dataset.training_data()
        answers = training_data[1]
        logger.info("Start extracing wiki pages")

        # get it from the wikipedia API since it has anchor text information
        # relying on page search alone would cause disambiguation issues since page names may have changed

        for ans in tqdm(answers):
            try:
                wiki_pageid = old_wiki_dict[ans]['id']
                self.wiki_page_dict[ans] = wikipedia.page(pageid=wiki_pageid)
            except Exception:
                logger.warning(
                    "Failed to get wiki page %s using the id from the old wiki dump"
                    % ans)
                try:
                    logger.warning("Falling back to direct page search for %s" % ans)
                    self.wiki_page_dict[ans] = wikipedia.page(
                        ans, auto_suggest=False)
                except Exception:
                    logger.warning("Failed to get " + ans)

        with open(path, 'wb') as f:
            pickle.dump({'wiki_page_dict': self.wiki_page_dict}, f)

    def _prepare_wiki_dict(self, path: str, old_wiki_dict: dict):
        '''
        Load the page info into a dict
        Args:
            path: str, the path to the wiki dict
            old_wiki_dict: dict, the loaded old wiki dump
        '''
        if os.path.isfile(path):
            logger.info("Wiki dict already exits, loading from file")
            self._load_wiki_dict(path)
            return
        logger.warning(
            "Wiki dict not found, preparing the wiki dict, this might take a while"
        )
        dataset = QuizBowlDataset(guesser_train=True)
        training_data = dataset.training_data()
        answers = training_data[1]
        logger.info("Start extracing wiki pages")

        # get it from the wikipedia API since it has anchor text information
        # relying on page search alone would cause disambiguation issues since page names may have changed

        for ans in tqdm(answers):
            try:
                wiki_pageid = old_wiki_dict[ans]['id']
                self.wiki_page_dict[ans] = wikipedia.page(pageid=wiki_pageid)
            except Exception:
                logger.warning(
                    "Failed to get wiki page %s using the id from the old wiki dump"
                    % ans)
                try:
                    logger.warning("Falling back to direct page search for %s" % ans)
                    self.wiki_page_dict[ans] = wikipedia.page(
                        ans, auto_suggest=False)
                except Exception:
                    logger.warning("Failed to get " + ans)

        with open(path, 'wb') as f:
            pickle.dump({'wiki_page_dict': self.wiki_page_dict}, f)
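The _load_wiki_dict helper referenced in _prepare_wiki_dict is not shown above. A minimal sketch, assuming it simply mirrors the pickle layout written at the end of both methods:

    def _load_wiki_dict(self, path: str):
        '''
        Hypothetical sketch: load the pickled wiki page dict back into memory.
        '''
        with open(path, 'rb') as f:
            self.wiki_page_dict = pickle.load(f)['wiki_page_dict']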
Example #8
def train():
    """
    Train the DAN guesser on the training and dev data
    """
    dataset = QuizBowlDataset(guesser_train=True)
    dan_guesser = DanGuesser()
    dan_guesser.train(dataset.training_data(), dataset.dev_data())