示例#1
0
    def computeResult(self):
        """Evaluate the configured algorithm over the evaluation question set.

        For every evaluation question, run either the BOW algorithm
        (self.aType == 1) or the NLP-pipeline scorer, collect the top-10
        answers, and record the 1-based rank of the correct answer (0 when
        it is absent from the top ten) into self.rdict.
        """
        for eval_pair in faq_config.getEvaluationQns():
            # Top-ten list is module-global state; reset it per question.
            TOPTEN_ANS.clear()
            wrapped_q = [base_objects.QAPair(eval_pair.question, "")]

            if self.aType == 1:
                # Bag-of-words scoring path.
                extractor = nltk_objects.NLTKFeatureExtraction(wrapped_q)
                algo = BOWAlgorithm(wrapped_q, extractor, self.fext)
                scores = algo._compute()
            else:
                # NLP-pipeline scoring path using the trained weights.
                nlp_features = [
                    b.TextFeatureExtraction(eval_pair.question, eval_pair)
                ]
                state = model.State(nlp_features, self.fext,
                                    model.final_weights, None)
                scores = state.get_final_scores(model.final_weights)[0]

            self.get_topNResults(scores, 10)
            # Rank of the correct answer in the top ten, or -1 if missing.
            try:
                rank = TOPTEN_ANS.index(eval_pair.answer)
            except ValueError:
                rank = -1
            print("Question is: ", eval_pair.question)
            print("Correct answer at index: ", rank)
            print("--------------------------------------------")
            self.rdict[eval_pair.question] = rank + 1
示例#2
0
def main():
    """Entry point for the Hummingbird FAQ engine.

    Optionally trains the NLP model (and reports per-question training
    scores), runs MRR evaluation for both the BOW and NLP algorithms, and
    then serves an interactive question loop until the user enters 'quit'.
    """
    print("****** Hummingbird FAQ engine powered by NLTK *********")

    faqs = faq_config.getFAQs()

    # Training phase: learn the feature weights and stash them on the model
    # module so the scoring paths below can use them.
    if do_training:
        state = model.train_model(faqs)
        model.final_weights = state.weights

        if report_training:
            # Dump every question's candidate scores, best-first.
            all_scores = state.get_scores(state.weights)
            for ix, q_score_set in enumerate(all_scores):
                dict_scores = sorted(
                    [(ascore, qnum) for qnum, ascore in q_score_set.items()],
                    reverse=True)
                print(state.best_choices[ix])
                for pair in dict_scores:
                    print("%2d: %f" % (pair[1], pair[0]))
                print()

    if do_main:
        faq_bow_feat = nltk_objects.NLTKFeatureExtraction(faqs)
        faq_nlp_feat = model.get_faq_features(faqs)

        # Mean-reciprocal-rank evaluation for both algorithms.
        run_mrr(faq_bow_feat, CONFIG_ALGO_BOW)
        space_out()
        run_mrr(faq_nlp_feat, CONFIG_ALGO_NLP)

        print(
            "You can enter question multiple times. Enter quit or Ctrl+c to quit"
        )
        while True:  # was `while 1` — idiomatic boolean
            space_out()
            user_q = input("Enter your question or 'quit' to Exit : ")

            # Empty input is treated as fatal. (Fix: the original raised and
            # then called exit(1) on the next line — the exit was unreachable.
            # Also replaced `== ""/== None` with a truthiness check.)
            if not user_q:
                raise ValueError("Invalid question given. Exiting")
            elif user_q == "quit":
                print("Thank you for trying out our FAQ Engine..Exiting")
                exit(1)

            user_qa = [base_objects.QAPair(user_q, "")]
            space_out()
            run_userq(user_qa, faq_bow_feat, CONFIG_ALGO_BOW)
            space_out()
            run_userq(user_qa, faq_nlp_feat, CONFIG_ALGO_NLP)
示例#3
0
def run_userq(user_qa, faq_feat, algoType):
    """Score a single user question against the FAQ features and print results.

    user_qa  -- single-element list wrapping the user's QAPair
    faq_feat -- precomputed FAQ features matching algoType
    algoType -- CONFIG_ALGO_BOW selects the BOW path; anything else the NLP path
    """
    #FIXME: It has to be added to the empty list because nltk_object operates on the list
    #Alt: Alternate approach. Only call __tokenize(). But move stops to a class variable.
    question = user_qa[0].question

    if algoType == CONFIG_ALGO_BOW:
        # BOW-specific implementation.
        bow_features = nltk_objects.NLTKFeatureExtraction(user_qa)
        scores = BOWAlgorithm(question, bow_features, faq_feat)._compute()
    else:
        # NLP-pipeline path: score with the trained model weights.
        nlp_features = [b.TextFeatureExtraction(question, user_qa)]
        state = model.State(nlp_features, faq_feat, model.final_weights, None)
        scores = state.get_final_scores(model.final_weights)[0]

    print_results(question, scores, algoType)
#TODO: not sure if i need to remove stopwords before lemmatizing (ok, tokenizing does that)
#      but i'm not sure if tokenizing should do that........

#TODO: some words (like In) may need to be lowercased

#TODO: maybe we should leave stopwords. like "to" should be there for verbs i feel...

#TODO: words like "United States" are being tagged with synsets separately

#TODO: need to add in parts of speech. look at question 50. "build" should not be a noun

# Output directory for the per-question synset files written by this script.
sub_folder = 'data/synsets'

# Module-level setup: load the FAQ corpus and extract its NLTK features once.
# NOTE(review): `no` is presumably an alias for the nltk_objects module
# imported above this chunk — confirm against the file's import block.
faqs = faq_config.getFAQs()
feature_extractor = no.NLTKFeatureExtraction(faqs)

#flatten = lambda l: [item for sublist in l for item in sublist]

def save_synsets(lemmas, filename):
    """Write each lemma and its disambiguated synset name to *filename*.

    One "<lemma> <synset-name>" pair per line; lemmas for which
    lesk.get_lemma_synset returns None are skipped. No trailing newline.
    """
    with open(filename, "w+") as out:
        wrote_any = False
        for lemma in lemmas:
            synset = lesk.get_lemma_synset(lemma, lemmas)
            if synset is None:
                continue
            # Newline goes before every record except the first, so the
            # file ends without a trailing blank line.
            if wrote_any:
                out.write('\n')
            out.write("%s %s" % (lemma, synset.name()))
            wrote_any = True
示例#5
0
import nltk_objects
import faq_config

# Smoke-test script: build NLTK features for the FAQ corpus and print
# every extracted feature collection, one item per line.
faqs = faq_config.getFAQs()
feature_extractor = nltk_objects.NLTKFeatureExtraction(faqs)

# Same order as the original per-attribute loops.
_FEATURE_ATTRS = (
    "tokens",
    "sentence_tokens",
    "bow",
    "lemmas",
    "stems",
    "pos_tags",
    "dependency_graphs",
    "synsets",
)

for _attr in _FEATURE_ATTRS:
    for _item in getattr(feature_extractor, _attr):
        print(_item)
'''
Test cases: