Example #1
    def append_subquery_matches(self, keywords, matches):
        """
        get matches to individual and nearby keywords (non phrase)
        """

        # check for full-name matches to an attribute, e.g. dataset.nevents
        for kwd in keywords:
            add_full_fieldmatch(kwd, matches)

        fields_by_entity = get_schema().list_result_fields()
        str_len = len(keywords)
        max_len = min(len(keywords), MAX_TOKEN_COMBINATION_LEN)
        # enumerate every contiguous keyword chunk of length 1..max_len
        for length in range(1, max_len + 1):
            for start in range(0, str_len - length + 1):
                chunk = keywords[start:start + length]
                # exclude chunks containing quoted phrases "a b c"
                # (these were processed earlier)
                if any(' ' in kw for kw in chunk):
                    continue
                # only the last term in the chunk is allowed to contain an operator
                if any(test_operator_containment(kw) for kw in chunk[:-1]):
                    continue
                if DEBUG:
                    print('len=', length, '; start=', start, '; chunk:', chunk)

                s_chunk = ' '.join(get_keyword_without_operator(kw)
                                   for kw in chunk)
                results = self.fields_idx.search_index(
                    kwds=s_chunk,
                    limit=CHUNK_N_TOKEN_COMBINATION_RESULTS)
                # results are ranked by score, so the head holds the maximum
                max_score = results and results[0]['score']
                for result in results:
                    result['len'] = len(result['keywords_matched'])
                    entity = result['result_type']
                    if not check_validity(result, fields_by_entity):
                        continue
                    result['field'] = fields_by_entity[entity][result['field']]
                    result['tokens_required'] = chunk
                    if USE_IR_SCORE_NORMALIZATION_LOCAL:
                        result['score'] /= max_score
                    matches[entity].append(result)
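The nested loops above slide a window of 1..MAX_TOKEN_COMBINATION_LEN tokens
over the keyword list and query the IR index for every surviving chunk. A
minimal, self-contained sketch of just that enumeration step (the constant
name mirrors the code above; the helper itself is hypothetical):

    MAX_TOKEN_COMBINATION_LEN = 3  # assumed limit

    def enumerate_chunks(keywords):
        """Yield every contiguous keyword run of length up to
        MAX_TOKEN_COMBINATION_LEN, skipping runs that contain a quoted
        phrase (a keyword with a space)."""
        max_len = min(len(keywords), MAX_TOKEN_COMBINATION_LEN)
        for length in range(1, max_len + 1):
            for start in range(len(keywords) - length + 1):
                chunk = keywords[start:start + length]
                if any(' ' in kw for kw in chunk):  # phrases are handled separately
                    continue
                yield chunk

    # e.g. list(enumerate_chunks(['dataset', 'number', 'of events']))
    # -> [['dataset'], ['number'], ['dataset', 'number']]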
Example #2
    def get_phrase_matches(self, keywords):
        """
        get phrase matches from IR index
        """
        fields_by_entity = get_schema().list_result_fields()

        # first filter out the phrases (we won't combine them with anything)
        phrase_kwds = [kw for kw in keywords if ' ' in kw]

        matches = defaultdict(list)
        for kwd in phrase_kwds:
            # remove operators, e.g. "number of events">10 => number of events
            phrase = get_keyword_without_operator(kwd)
            # get ranked list of matches
            results = self.fields_idx.search_index(kwds=phrase,
                                                   limit=CHUNK_N_PHRASE_RESULTS)

            # results are ranked by score, so the head holds the maximum
            max_score = results and results[0]['score']
            for result in results:
                result['len'] = len(result['keywords_matched'])
                entity = result['result_type']
                if not check_validity(result, fields_by_entity):
                    continue

                # TODO: this should be done at the presentation level
                result['field'] = fields_by_entity[entity][result['field']]
                result['tokens_required'] = [kwd]

                # weight phrase matches by W_PHRASE
                result['score'] *= W_PHRASE
                if USE_IR_SCORE_NORMALIZATION_LOCAL:
                    result['score'] /= max_score

                matches[entity].append(result)

        return matches
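Both methods finish with the same local normalization: the top-ranked raw
score is taken as the maximum and, when USE_IR_SCORE_NORMALIZATION_LOCAL is
set, every score is divided by it so the results of different index queries
become comparable. A minimal sketch of that post-processing, assuming the
result list is ranked by descending 'score' (the constant names mirror the
code above; the helper itself is hypothetical):

    USE_IR_SCORE_NORMALIZATION_LOCAL = True  # assumed flag
    W_PHRASE = 0.9                           # assumed phrase weight < 1

    def normalize_scores(results, weight=1.0):
        """Apply an optional weight, then rescale by the top raw score so
        the best raw hit maps to `weight` (1.0 when unweighted)."""
        if not results:
            return results
        max_score = results[0]['score']  # ranked list: head holds the raw maximum
        for result in results:
            result['score'] *= weight
            if USE_IR_SCORE_NORMALIZATION_LOCAL and max_score:
                result['score'] /= max_score
        return results

    # e.g. normalize_scores([{'score': 4.0}, {'score': 2.0}], weight=W_PHRASE)
    # -> [{'score': 0.9}, {'score': 0.45}]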
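Both methods also strip comparison operators from keywords before querying
the index (the get_keyword_without_operator calls above, e.g.
"number of events">10 becomes number of events). A sketch of such a helper,
assuming the operators =, !=, <, >, <= and >= and optional quotes around the
keyword (the regex and the helper name are illustrative, not the project's
actual implementation):

    import re

    # optional quotes around the keyword, then an optional comparison
    # operator with its operand
    _OPERATOR_RE = re.compile(r'^"?([^"<>=!]+)"?\s*(?:[<>!]?=|[<>])\s*\S+$')

    def strip_operator(keyword):
        """Return the bare keyword, e.g. '"number of events">10' -> 'number of events'."""
        match = _OPERATOR_RE.match(keyword)
        return match.group(1).strip() if match else keyword.strip('"')

    # e.g. strip_operator('"number of events">10') -> 'number of events'
    #      strip_operator('dataset')               -> 'dataset'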