Exemplo n.º 1
0
def match_value_dataset(kwd, dbs_inst=None):
    """Return keyword matches to dataset values in dbsmanager.

    :param kwd: keyword to look up; may contain ``*`` wildcards
    :param dbs_inst: DBS instance to search; when falsy, ``request.dbs_inst``
        is used instead
    :return: ``(None, None)`` when no DBS instance can be determined,
        otherwise ``(score, info)`` where ``score`` is a float, or ``None``
        when the keyword matched no dataset, and ``info`` is a dict with
        ``map_to`` and the wildcard-extended ``adjusted_keyword``
    """
    # if no specific dbs_inst passed, get the current one from request
    if not dbs_inst:
        if not hasattr(request, 'dbs_inst'):
            return None, None
        dbs_inst = request.dbs_inst

    dataset_score = None

    # make sure the kwd is unicode
    if not isinstance(kwd, unicode) and isinstance(kwd, str):
        kwd = unicode(kwd)

    upd_kwd = kwd
    has_wildcard = '*' in kwd
    has_slash = '/' in kwd

    # find_datasets returns a generator, check if it's non empty
    match = find_datasets(kwd, dbs_inst, limit=1)
    if next(match, False):
        if DEBUG:
            print('Dataset matched by keyword %s' % kwd)
        # if kw contains wildcards the score shall be a bit lower
        if has_wildcard and not has_slash:
            dataset_score = 0.8
        elif has_wildcard:
            dataset_score = 0.9
        elif not has_slash:
            # plain word: only scored if it also matches as a substring
            wrapped_kwd = '*%s*' % kwd
            if next(find_datasets(wrapped_kwd, dbs_inst, limit=1), False):
                dataset_score = 0.7
                upd_kwd = wrapped_kwd
        else:
            dataset_score = 1.0

        # prevent number-only-keywords to be matched into datasets;
        # the score may still be None here (substring lookup found nothing),
        # in which case there is nothing to penalise
        if dataset_score is not None and kwd.isnumeric():
            dataset_score -= 0.3

    # add extra wildcard to make sure the query will work...
    if not RE_3SLASHES.match(upd_kwd):
        # decide on the trailing wildcard using the keyword as it was
        # before a leading wildcard is possibly prepended
        needs_trailing = not upd_kwd.endswith('*')
        if not upd_kwd.startswith(('*', '/')):
            upd_kwd = '*' + upd_kwd
        if needs_trailing:
            upd_kwd += '*'

    return dataset_score, {
        'map_to': 'dataset.name',
        'adjusted_keyword': upd_kwd
    }
Exemplo n.º 2
0
def hint_dataset_in_other_insts(query, cur_inst):
    """Suggest datasets that exist in DBS instances other than the current one.

    Returns an empty dict when the query carries no dataset token, ``None``
    when the current instance itself has a match, and otherwise a dict with
    a ``title`` and a list of per-instance ``results``.
    """
    pattern = get_dataset_token(query)
    if not pattern:
        return {}
    found = match_dataset_all_inst(pattern, cur_inst)

    # for now, hints are displayed ONLY when the current instance has no match
    for entry in found:
        if entry['inst'] == cur_inst:
            return None

    suggestions = []
    for entry in found:
        if entry['inst'] == cur_inst:
            continue
        suggestions.append({
            'inst': entry['inst'],
            # NOTE(review): 'match' carries the instance name, not
            # entry['match'] -- confirm this is what consumers expect
            'match': entry['inst'],
            'query': (repl_dataset_val(query, entry['match'])
                      + ' instance=' + entry['inst']),
            'examples': list(find_datasets(entry['match'], entry['inst'])),
        })

    return {
        'title': 'Matching datasets in other DBS instances',
        'results': suggestions,
    }
Exemplo n.º 3
0
def match_value_dataset(kwd, dbs_inst=None):
    """Return keyword matches to dataset values in dbsmanager.

    :param kwd: keyword to look up; may contain ``*`` wildcards
    :param dbs_inst: DBS instance to search; when falsy, ``request.dbs_inst``
        is used instead
    :return: ``(None, None)`` when no DBS instance can be determined,
        otherwise ``(score, info)`` where ``score`` is a float, or ``None``
        when the keyword matched no dataset, and ``info`` is a dict with
        ``map_to`` and the wildcard-extended ``adjusted_keyword``
    """
    # if no specific dbs_inst passed, get the current one from request
    if not dbs_inst:
        if not hasattr(request, 'dbs_inst'):
            return None, None
        dbs_inst = request.dbs_inst

    dataset_score = None

    # make sure the kwd is unicode
    if not isinstance(kwd, unicode) and isinstance(kwd, str):
        kwd = unicode(kwd)

    upd_kwd = kwd
    has_wildcard = '*' in kwd
    has_slash = '/' in kwd

    # find_datasets returns a generator, check if it's non empty
    match = find_datasets(kwd, dbs_inst, limit=1)
    if next(match, False):
        if DEBUG:
            # single parenthesised argument: prints the same text whether
            # print is a statement or a function
            print('Dataset matched by keyword %s' % kwd)
        # if kw contains wildcards the score shall be a bit lower
        if has_wildcard and not has_slash:
            dataset_score = 0.8
        elif has_wildcard:
            dataset_score = 0.9
        elif not has_slash:
            # plain word: only scored if it also matches as a substring
            wrapped_kwd = '*%s*' % kwd
            if next(find_datasets(wrapped_kwd, dbs_inst, limit=1), False):
                dataset_score = 0.7
                upd_kwd = wrapped_kwd
        else:
            dataset_score = 1.0

        # prevent number-only-keywords to be matched into datasets;
        # the score may still be None here (substring lookup found nothing),
        # in which case there is nothing to penalise
        if dataset_score is not None and kwd.isnumeric():
            dataset_score -= 0.3

    # add extra wildcard to make sure the query will work...
    # NOTE(review): the constant is referenced as RE_3SLAHES (sic); the
    # spelling is kept because its definition is outside this block
    if not RE_3SLAHES.match(upd_kwd):
        # decide on the trailing wildcard using the keyword as it was
        # before a leading wildcard is possibly prepended
        needs_trailing = not upd_kwd.endswith('*')
        if not upd_kwd.startswith(('*', '/')):
            upd_kwd = '*' + upd_kwd
        if needs_trailing:
            upd_kwd += '*'

    return dataset_score, {'map_to': 'dataset.name',
                           'adjusted_keyword': upd_kwd}
Exemplo n.º 4
0
def extract_wildcard_patterns(dbs_inst, pattern):
    """
    Given a wildcard query and a list of datasets, we are interested in
    how many slashes are matched by each wildcard (because the slashes have
    to be included in the result).

    It returns counts per each combination of different patterns, e.g.
      *Zmm* uses regexp (.*)Zmm(.*) where one of the results is the following
       match /RelValZmm/CMSSW.../tier that yields such a combination:
        query   match                   transformed into pattern
        *       '/RelVal'     ->        */*
        Zmm     (query)
        *       '/CMSSW.../tier' ->     */*/*

    :param dbs_inst: DBS instance passed through to ``find_datasets``
    :param pattern: dataset query in which ``*`` is the wildcard
    :return: tuple ``(counts, interpretations)``; both are dicts keyed by
        the tuple of per-wildcard replacement patterns, mapping to the
        number of datasets and to the list of matched groups respectively
    """
    # get matching datasets from our cache (through dbs manager instance)
    dataset_matches = find_datasets(pattern, dbs_inst, limit=-1)

    # we will use this regexp to extract different dataset patterns;
    # only '*' is a wildcard, so the literal parts are escaped to keep
    # characters such as '.', '+' or '(' from being read as regex syntax
    literal_parts = [re.escape(part) for part in pattern.split("*")]
    pat_re = re.compile("^" + "(.*)".join(literal_parts) + "$", re.IGNORECASE)

    # replacement for a wildcard, by the number of slashes it matched;
    # any other count (zero or more than three) stays a plain '*'
    by_slash_count = {1: "*/*", 2: "*/*/*", 3: "*/*/*/*"}

    # now match the positions of slash
    counts = {}
    interpretations = {}
    for item in dataset_matches:
        match = pat_re.match(item)

        # just in case the pat_re regexp was more restrictive than db filtering
        if not match:
            continue

        groups = match.groups()
        if DEBUG:
            # formatted into one string so the output is the same whether
            # print is a statement or a function
            print("matched groups %s" % (groups,))

        # a group may contain more than one slash
        replacements = tuple(by_slash_count.get(group.count("/"), "*")
                             for group in groups)
        counts[replacements] = counts.get(replacements, 0) + 1

        # add this into list of possible options
        interpretations.setdefault(replacements, []).append(groups)

    return counts, interpretations
Exemplo n.º 5
0
def hint_dataset_case_insensitive(query, cur_inst):
    """Suggest dataset names that differ from the query's dataset token.

    Returns an empty dict when the query has no dataset token and ``None``
    when a wildcard token already has a match with ``ignorecase=False``;
    otherwise a dict with ``title``, ``descr`` and the list of ``results``.
    """
    token = get_dataset_token(query)
    if not token:
        return {}

    if '*' in token:
        # the mongo query is quite slow, so suggestions are only worth
        # computing if the case sensitive search returns no results
        case_sensitive_hits = find_datasets(token, cur_inst, ignorecase=False)
        if next(case_sensitive_hits, False):
            return None

    # find_datasets is called with its default settings here
    # (presumably case-insensitive -- confirm against its definition)
    suggestions = []
    for name in find_datasets(token, cur_inst):
        # a result identical to the token is not a useful suggestion
        if name != token:
            suggestions.append({'match': name,
                                'query': repl_dataset_val(query, name)})

    return {'title': 'Case-insensitive dataset matches (NEW)',
            'descr': '(dataset selection in DBS3 is now case-sensitive)',
            'results': suggestions}
Exemplo n.º 6
0
def hint_dataset_case_insensitive(query, cur_inst):
    """Build dataset suggestions that differ from the queried dataset token.

    The result is ``{}`` if the query has no dataset token, ``None`` if the
    token contains a wildcard and a lookup with ``ignorecase=False`` already
    finds something, and a hint dict (``title``, ``descr``, ``results``)
    in every other case.
    """
    wanted = get_dataset_token(query)
    if not wanted:
        return {}

    # the mongo query is quite slow: bother with suggestions only when the
    # case sensitive search comes back empty
    if '*' in wanted and next(
            find_datasets(wanted, cur_inst, ignorecase=False), False):
        return None

    def as_suggestion(candidate):
        """Describe one alternative dataset name and the rewritten query."""
        return {
            'match': candidate,
            'query': repl_dataset_val(query, candidate),
        }

    # candidates equal to the token itself are left out
    alternatives = [
        as_suggestion(candidate)
        for candidate in find_datasets(wanted, cur_inst)
        if candidate != wanted
    ]
    hint = {
        'title': 'Case-insensitive dataset matches (NEW)',
        'descr': '(dataset selection in DBS3 is now case-sensitive)',
        'results': alternatives,
    }
    return hint
Exemplo n.º 7
0
def extract_wildcard_patterns(dbs_inst, pattern, ignorecase=False):
    """
    Given a wildcard query and a list of datasets, we are interested in
    how many slashes are matched by each wildcard (because the slashes have
    to be included in the result).

    It returns counts per each combination of different patterns, e.g.
      *Zmm* uses regexp (.*)Zmm(.*) where one of the results is the following
       match /RelValZmm/CMSSW.../tier that yields such a combination:
        query   match                   transformed into pattern
        *       '/RelVal'     ->        */*
        Zmm     (query)
        *       '/CMSSW.../tier' ->     */*/*

    :param dbs_inst: DBS instance passed through to ``find_datasets``
    :param pattern: dataset query in which ``*`` is the wildcard
    :param ignorecase: forwarded to ``find_datasets`` and also used to make
        the extraction regexp case-insensitive
    :return: tuple ``(counts, interpretations)``; both are defaultdicts keyed
        by the tuple of per-wildcard replacement patterns, mapping to the
        number of datasets and to the list of matched groups respectively
    """
    # get matching datasets from our cache (through dbs manager instance)
    dataset_matches = find_datasets(pattern, dbs_inst, limit=-1,
                                    ignorecase=ignorecase)

    # we will use this regexp to extract different dataset patterns;
    # only '*' is a wildcard, so the literal parts are escaped to keep
    # characters such as '.', '+' or '(' from being read as regex syntax
    literal_parts = [re.escape(part) for part in pattern.split('*')]
    pat_re = re.compile('^' + '(.*)'.join(literal_parts) + '$',
                        re.IGNORECASE if ignorecase else 0)

    # replacement for a wildcard, by the number of slashes it matched;
    # any other count (zero or more than three) stays a plain '*'
    by_slash_count = {1: '*/*', 2: '*/*/*', 3: '*/*/*/*'}

    # now match the positions of slash
    counts = defaultdict(int)
    interpretations = defaultdict(list)
    for item in dataset_matches:
        match = pat_re.match(item)

        # just in case the pat_re regexp was more restrictive than db filtering
        if not match:
            continue

        groups = match.groups()
        if DEBUG:
            print("matched groups", groups)

        # a group may contain more than one slash
        replacements = tuple(by_slash_count.get(group.count('/'), '*')
                             for group in groups)
        counts[replacements] += 1
        # add this into list of possible options
        interpretations[replacements].append(groups)

    return counts, interpretations
Exemplo n.º 8
0
def hint_dataset_in_other_insts(query, cur_inst):
    """Point the user at other DBS instances that hold the queried dataset.

    The result is ``{}`` if the query has no dataset token, ``None`` if the
    current instance is among the matches, and otherwise a hint dict with
    ``title`` and ``results`` (one entry per matching instance).
    """
    token = get_dataset_token(query)
    if not token:
        return {}
    candidates = match_dataset_all_inst(token, cur_inst)

    # for now, hints are displayed ONLY on no matches in the current instance
    if any(cand['inst'] == cur_inst for cand in candidates):
        return None

    def describe(cand):
        """Build the hint entry for one match in another instance."""
        return {
            'inst': cand['inst'],
            # NOTE(review): 'match' is filled with the instance name rather
            # than cand['match'] -- confirm this is intended
            'match': cand['inst'],
            'query': (repl_dataset_val(query, cand['match'])
                      + ' instance=' + cand['inst']),
            'examples': list(find_datasets(cand['match'], cand['inst'])),
        }

    other_instances = [describe(cand)
                       for cand in candidates
                       if cand['inst'] != cur_inst]
    hint = {'title': 'Matching datasets in other DBS instances',
            'results': other_instances}
    return hint