Example #1
File: chunk_matcher.py  Project: dmwm/DAS
def __init__(self, fields):
    """
    Upon each instantiation it recreates the Whoosh IR index based
    on the field data given.
    """
    self.fields_idx = SimpleIREntityAttributeMatcher(fields)
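
SimpleIREntityAttributeMatcher itself is not part of these snippets. For orientation, the sketch below shows what building and querying an in-memory Whoosh index over field data can look like; it is a minimal illustration with made-up field names, not the DAS implementation.

# Minimal sketch (not the DAS implementation): index a few result fields
# in an in-memory Whoosh index and run a ranked search over them.
from whoosh.fields import Schema, ID, TEXT
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser

schema = Schema(field=ID(stored=True),        # e.g. "dataset.nevents"
                result_type=ID(stored=True),  # e.g. "dataset"
                keywords=TEXT(stored=True))   # human-readable field title

ix = RamStorage().create_index(schema)
writer = ix.writer()
writer.add_document(field=u"dataset.nevents", result_type=u"dataset",
                    keywords=u"number of events")
writer.add_document(field=u"dataset.name", result_type=u"dataset",
                    keywords=u"dataset name")
writer.commit()

with ix.searcher() as searcher:
    query = QueryParser("keywords", ix.schema).parse(u"number of events")
    for hit in searcher.search(query, limit=10):
        print(hit["result_type"], hit["field"], hit.score)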
Example #2
File: chunk_matcher.py  Project: dmwm/DAS
class MultiKwdAttributeMatcher(object):
    """
    Matches chunks of keywords into fields in service outputs.
    """
    fields_idx = None

    def __init__(self, fields):
        """
        Upon each instantiation it recreates the Whoosh IR index based
        on the field data given.
        """
        self.fields_idx = SimpleIREntityAttributeMatcher(fields)

    def generate_chunks(self, keywords):
        """
        params: a tokenized list of keywords (e.g. ["a b c", 'a', 'b'])
        returns: a dict mapping each result type to a list of fields
                 matching a combination of nearby keywords::

            {
                '[result_type]':
                    [ matched_field, ...]
            }
        """

        if not get_setting('SERVICE_RESULT_FIELDS'):
            return {}

        matches = self.get_phrase_matches(keywords)
        self.append_subquery_matches(keywords, matches)
        # return the matches in sorted order (per result type)
        for entity, m_list in matches.items():
            for match in m_list:
                last_token = match['tokens_required'][-1]
                tokens_used = match['tokens_required']
                match['predicate'] = get_operator_and_param(last_token)
                match['field_name'] = match['field']['name']
                match['tokens_required_non_stopw'] = \
                    filter_stopwords(tokens_used)
                match['tokens_required_set'] = set(tokens_used)

            m_list.sort(key=lambda f: f['score'], reverse=True)

            # as IR-based matching is fairly dumb for now,
            # prune out the useless matches
            purge = []
            for m1 in m_list:
                for m2 in m_list:
                    tokens1 = m1['tokens_required_set']
                    tokens2 = m2['tokens_required_set']
                    if (m2 != m1 and m1['field_name'] == m2['field_name'] and
                            tokens1.issubset(tokens2) and
                            m1['score'] + 0.01 >= m2['score']):
                        # mark a useless match for deletion
                        purge.append(m2)
            matches[entity] = [match for match in m_list if match not in purge]

        normalize_scores(matches)

        # if enabled, prune low scoring chunks
        cutoff = get_setting('RESULT_FIELD_CHUNKER_PRUNE_LOW_TERMS')
        if cutoff:
            for key in matches:
                matches[key] = [match for match in matches[key]
                                if match['score'] > cutoff]

        print_debug(matches)
        return matches

    def get_phrase_matches(self, keywords):
        """
        get phrase matches from IR index
        """
        fields_by_entity = get_schema().list_result_fields()

        # first filter out the phrases (we won't combine them with anything)
        phrase_kwds = [kw for kw in keywords if ' ' in kw]

        matches = defaultdict(list)
        for kwd in phrase_kwds:
            # remove operators, e.g. "number of events">10 => number of events
            phrase = get_keyword_without_operator(kwd)
            # get ranked list of matches
            results = self.fields_idx.search_index(kwds=phrase,
                                                   limit=CHUNK_N_PHRASE_RESULTS)

            max_score = results and results[0]['score']
            for result in results:
                #r['len'] =  1
                result['len'] = len(result['keywords_matched'])
                entity = result['result_type']
                if not check_validity(result, fields_by_entity):
                    continue

                # TODO: this should be done at the presentation level
                result['field'] = fields_by_entity[entity][result['field']]
                result['tokens_required'] = [kwd]

                # penalize terms that have multiple matches
                result['score'] *= W_PHRASE
                if USE_IR_SCORE_NORMALIZATION_LOCAL:
                    result['score'] /= max_score

                matches[entity].append(result)

        return matches

    def append_subquery_matches(self, keywords, matches):
        """
        get matches to individual and nearby keywords (non phrase)
        """

        # check for full-name matches to an attribute, e.g. dataset.nevents
        for kwd in keywords:
            add_full_fieldmatch(kwd, matches)

        fields_by_entity = get_schema().list_result_fields()
        str_len = len(keywords)
        max_len = min(len(keywords), MAX_TOKEN_COMBINATION_LEN)
        for length in xrange(1, max_len + 1):
            for start in xrange(0, str_len - length + 1):
                chunk = keywords[start:start + length]
                # exclude phrases with "a b c" (as these were processed earlier)
                if any(' ' in c for c in chunk):
                    continue
                # only the last term in the chunk is allowed to contain operator
                if any(test_operator_containment(kw) for kw in chunk[:-1]):
                    continue
                if DEBUG:
                    print('chunk:', chunk)
                    print('len=', length, '; start=', start, 'chunk:', chunk)

                s_chunk = ' '.join(get_keyword_without_operator(kw)
                                   for kw in chunk)
                results = self.fields_idx.search_index(
                    kwds=s_chunk,
                    limit=CHUNK_N_TOKEN_COMBINATION_RESULTS)
                max_score = results and results[0]['score']
                for result in results:
                    result['len'] = len(result['keywords_matched'])
                    entity = result['result_type']
                    if not check_validity(result, fields_by_entity):
                        continue
                    result['field'] = fields_by_entity[entity][result['field']]
                    result['tokens_required'] = chunk
                    if USE_IR_SCORE_NORMALIZATION_LOCAL:
                        result['score'] /= max_score
                    matches[entity].append(result)
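
The purge step in generate_chunks keeps only the cheapest match per field: a match is dropped when another match for the same field needs only a subset of its tokens yet scores at least about as well (0.01 tolerance). A self-contained sketch of that rule, with made-up data:

# Standalone sketch of the subset-based pruning in generate_chunks:
# drop m2 when some m1 for the same field needs only a subset of m2's
# tokens and scores at least about as well (0.01 tolerance, as above).
def prune_redundant(m_list):
    purge = []
    for m1 in m_list:
        for m2 in m_list:
            if (m2 is not m1 and m1['field_name'] == m2['field_name'] and
                    m1['tokens'] <= m2['tokens'] and
                    m1['score'] + 0.01 >= m2['score']):
                purge.append(m2)
    return [m for m in m_list if m not in purge]

matches = [
    {'field_name': 'dataset.name', 'tokens': {'dataset'}, 'score': 0.9},
    {'field_name': 'dataset.name', 'tokens': {'dataset', 'name'}, 'score': 0.85},
    {'field_name': 'dataset.nevents', 'tokens': {'nevents'}, 'score': 0.7},
]
print(prune_redundant(matches))  # the two-token dataset.name match is dropped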
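
The nested loops in append_subquery_matches enumerate every contiguous run of up to MAX_TOKEN_COMBINATION_LEN adjacent keywords. A self-contained sketch of that enumeration (names are illustrative, not taken from DAS):

# Self-contained sketch of the contiguous-chunk enumeration used in
# append_subquery_matches; max_len stands in for MAX_TOKEN_COMBINATION_LEN.
def contiguous_chunks(keywords, max_len=3):
    """Yield every run of 1..max_len adjacent keywords."""
    n = len(keywords)
    for length in range(1, min(n, max_len) + 1):
        for start in range(n - length + 1):
            yield keywords[start:start + length]

if __name__ == '__main__':
    for chunk in contiguous_chunks(['dataset', 'name', 'Zmm', 'nevents>10']):
        print(chunk)
    # yields ['dataset'], ['name'], ..., ['dataset', 'name'], ...,
    # ['name', 'Zmm', 'nevents>10'], and so on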