示例#1
0
 def _compute_score(self, raw_tokens, ans_eng):
     """Compute score components for this answer's tokens.

     Args:
         raw_tokens: List of token strings, such as from a sentence.
         ans_eng: The AnswerEngine object that was used to generate
             this Answer object.

     Returns:
         term_count: Number of query terms that matched an answer term
             (exact token match, or LCH relatedness >= ans_eng.lch).
         related: Maximum LCH relatedness value for each query term
             that had at least one related answer term.
         causal_match: True if the 'cause' query term had a match.
         position: For each entry in related, the index into raw_tokens
             where the maximal LCH value occurred.
     """
     term_count = 0
     related = []
     causal_match = False
     position = []
     # Hoisted out of the loop: the regularized page terms do not depend
     # on the query term, so compute them once instead of per iteration.
     page_terms = indexer.regularize(raw_tokens)
     for term, synsets in ans_eng.ir_query_tagged:
         match = False
         term_related = []
         for i, page_term in enumerate(page_terms):
             page_term_related = ans_eng.related_values(synsets, page_term)
             if page_term_related:
                 best = max(page_term_related)  # compute the max once
                 term_related.append((best, i))
                 if term == page_term or best >= ans_eng.lch:
                     match = True
         if match:  # exact match or relatedness above the LCH threshold
             term_count += 1
             if term == 'cause':
                 causal_match = True
         if term_related:
             # max() replaces the original sort-and-take-last: indices are
             # strictly increasing, so ties on the LCH value resolve to the
             # largest index either way, and this is O(n) instead of
             # O(n log n).
             best_value, best_i = max(term_related)
             related.append(best_value)
             position.append(best_i)
     return term_count, related, causal_match, position
 def regularize_text(self):
     """Regularize every sentence's tokens and drop empty sentences."""
     if not self.paragraphs:
         self.tokenize_sentences()
     for para in self.paragraphs:
         # Regularize each sentence, then keep only non-empty results.
         cleaned = (regularize(sent) for sent in para.sentence_tokens)
         para.sentence_tokens = [tokens for tokens in cleaned if tokens]
示例#3
0
 def regularize_text(self):
     """Regularizes all tokens for each sentence in each paragraph."""
     if not self.paragraphs:
         self.tokenize_sentences()
     for paragraph in self.paragraphs:
         # Regularize every sentence, then discard any that became empty.
         regularized = [regularize(s) for s in paragraph.sentence_tokens]
         paragraph.sentence_tokens = [s for s in regularized if s]
    def _compute_score(self, raw_tokens, ans_eng):
        """Compute various score components from the answer text.

        Args:
            raw_tokens: List of token strings, such as from a sentence.
            ans_eng: The AnswerEngine object that was used to generate
                this Answer object.

        Returns:
            term_count: Integer count of query terms that matched an
                answer term, either exactly or with a semantic
                relatedness (LCH) value above the ans_eng threshold.
            related: List holding, for each query term with at least one
                related answer term, the maximum LCH similarity value.
            causal_match: Boolean; True when the (usually hidden) causal
                query term had a match.
            position: List of indexes into raw_tokens marking where the
                maximal LCH value occurred for each query term.
        """
        term_count = 0
        related = []
        causal_match = False
        position = []
        for query_term, synsets in ans_eng.ir_query_tagged:
            matched = False
            candidates = []
            for idx, answer_term in enumerate(indexer.regularize(raw_tokens)):
                lch_values = ans_eng.related_values(synsets, answer_term)
                if not lch_values:
                    continue
                top = max(lch_values)
                candidates.append((top, idx))
                if query_term == answer_term or top >= ans_eng.lch:
                    matched = True
            if matched:  # above LCH value
                term_count += 1
                causal_match = causal_match or query_term == 'cause'
            if candidates:
                # Largest LCH value wins; ties resolve to the later index,
                # matching a stable sort followed by taking the last item.
                best_lch, best_idx = max(candidates)
                related.append(best_lch)
                position.append(best_idx)
        return term_count, related, causal_match, position
示例#5
0
    def _compute_score(self, raw_tokens, ans_eng):
        """Compute the per-term score components for this answer.

        Args:
            raw_tokens: List of token strings, such as from a sentence.
            ans_eng: The AnswerEngine object that was used to generate
                this Answer object.

        Returns:
            term_count: How many query terms matched an answer term,
                exactly or with an LCH relatedness above the ans_eng
                threshold.
            related: Maximum LCH similarity value per query term that
                had any related answer term.
            causal_match: Whether the (usually hidden) causal query term
                had a match above the threshold.
            position: Index into raw_tokens of the maximal LCH value for
                each query term recorded in related.
        """
        term_count = 0
        related = []
        causal_match = False
        position = []
        for term, synsets in ans_eng.ir_query_tagged:
            hit = False
            scored = []
            for pos, page_term in enumerate(indexer.regularize(raw_tokens)):
                values = ans_eng.related_values(synsets, page_term)
                if values:
                    peak = max(values)
                    scored.append((peak, pos))
                    hit = hit or term == page_term or peak >= ans_eng.lch
            if hit:  # above LCH value
                term_count += 1
                if term == 'cause':
                    causal_match = True
            if scored:
                scored.sort()
                top_value, top_pos = scored[-1]  # maximum value
                related.append(top_value)
                position.append(top_pos)
        return term_count, related, causal_match, position
示例#6
0
    def __init__(self, index, query, start=0, num_top=10, lch=2.16):
        """Inits AnswerEngine by querying the IR module to get Page objects.

        Args:
            index: An indexer.Index object, which represents the IR system.
            query: The direct query string from the user.
            start: The number of pages to offset from the beginning of the
                page list returned by the index.
            num_top: The number of pages (from the top of the ranked list
                of pages (sorted by similarity) returned by the index) to
                extract answers from.
                Combined with the start-argument, this allows for paging
                through the results by only looking at a certain number of
                pages at a time.
            lch: The Leacock-Chodorow Similarity measurement. Used to
                determine if two WordNet senses (synsets) are related.
                The default value has been empirically determined to
                provide good results, though it may be fine-tuned. This
                argument should be a float.
        """
        self.query = query
        self.start = start
        self.num_top = num_top
        self.lch = lch
        self.answers = None
        # Candidate Document Selection
        self.ir_query = indexer.regularize(indexer.tokenizer.tokenize(query))
        self.ir_query_tagged = None
        page_sim = index.ranked(self.ir_query)
        self.num_pages = len(page_sim)
        # Reduce number of pages we need to get from disk.
        # BUG FIX: the window must be offset by `start`; the previous
        # [start:num_top] slice returned fewer than num_top pages (or none
        # at all) whenever start > 0, breaking paging.
        page_sim = page_sim[start:start + num_top]
        if not page_sim:
            # Empty result window: zip(*[]) would raise on unpacking.
            self.pages = []
            return
        page_ids, similarity = zip(*page_sim)
        # Retrieve the Page objects from the list of Page.IDs
        self.pages = index.get_page(page_ids)
        # Tell each page the value of its similarity score
        for page, sim in zip(self.pages, similarity):
            page.cosine_sim = sim
    def __init__(self, index, query, start=0, num_top=10, lch=2.16):
        """Inits AnswerEngine by querying the IR module to get Page objects.

        Args:
            index: An indexer.Index object, which represents the IR system.
            query: The direct query string from the user.
            start: The number of pages to offset from the beginning of the
                page list returned by the index.
            num_top: The number of pages (from the top of the ranked list
                of pages (sorted by similarity) returned by the index) to
                extract answers from.
                Combined with the start-argument, this allows for paging
                through the results by only looking at a certain number of
                pages at a time.
            lch: The Leacock-Chodorow Similarity measurement. Used to
                determine if two WordNet senses (synsets) are related.
                The default value has been empirically determined to
                provide good results, though it may be fine-tuned. This
                argument should be a float.
        """
        self.query = query
        self.start = start
        self.num_top = num_top
        self.lch = lch
        self.answers = None
        # Candidate Document Selection
        self.ir_query = indexer.regularize(indexer.tokenizer.tokenize(query))
        self.ir_query_tagged = None
        page_sim = index.ranked(self.ir_query)
        self.num_pages = len(page_sim)
        # Reduce number of pages we need to get from disk.
        # BUG FIX: the window must be offset by `start`; the previous
        # [start:num_top] slice returned fewer than num_top pages (or none
        # at all) whenever start > 0, breaking paging.
        page_sim = page_sim[start:start + num_top]
        if not page_sim:
            # Empty result window: zip(*[]) would raise on unpacking.
            self.pages = []
            return
        page_ids, similarity = zip(*page_sim)
        # Retrieve the Page objects from the list of Page.IDs
        self.pages = index.get_page(page_ids)
        # Tell each page the value of its similarity score
        for page, sim in zip(self.pages, similarity):
            page.cosine_sim = sim
示例#8
0
    def __init__(self, index, query, start=0, num_top=10, lch=2.16):
        """Inits AnswerEngine by querying the IR module to get Page objects.

        Args:
            index: An indexer.Index object representing the IR system.
            query: The direct query string from the user.
            start: Offset from the beginning of the ranked page list.
            num_top: Number of pages from the ranked list to extract
                answers from; combined with start this enables paging.
            lch: Leacock-Chodorow Similarity threshold (float) used to
                decide whether two WordNet synsets are related.
        """
        self.query = query
        self.start = start
        self.num_top = num_top
        self.lch = lch
        self.answers = None

        # Candidate Document Selection
        self.ir_query = indexer.regularize(indexer.tokenizer.tokenize(query))
        self.ir_query_tagged = None
        page_sim = index.ranked(self.ir_query)
        self.num_pages = len(page_sim)

        # Reduce number of pages we need to get from disk.
        # BUG FIX: the window must be offset by `start`; the previous
        # [start:num_top] slice returned fewer than num_top pages (or none
        # at all) whenever start > 0, breaking paging.
        page_sim = page_sim[start:start + num_top]
        if not page_sim:
            # Empty result window: zip(*[]) would raise on unpacking.
            self.pages = []
            return
        page_ids, similarity = zip(*page_sim)

        # Retrieve the Page objects from the list of Page.IDs
        self.pages = index.get_page(page_ids)

        # Tell each page the value of its similarity score
        for page, sim in zip(self.pages, similarity):
            page.cosine_sim = sim