Пример #1
0
def datasets(keywords=None, licenses=None):
    """Return the offline and online datasets, optionally filtered.

    With neither ``keywords`` nor ``licenses`` given, every local script and
    every upstream dataset name is returned.  Otherwise a local script is
    kept when it has a non-empty name and either

    * one of its license names equals (case-insensitively) one of
      ``licenses``, or
    * one of ``keywords`` is a case-insensitive substring of the text built
      from the script's title, name and ``'-'``-joined keywords.

    The same filters are forwarded to the upstream lookups; ``licenses`` is
    forwarded in its lower-cased form.

    Returns a dict with two keys: ``'online'`` holds the sorted, de-duplicated
    names reported by the upstream retriever and recipes repositories, and
    ``'offline'`` holds the matching local script objects sorted by
    lower-cased name.
    """

    def by_name(script):
        # Case-insensitive ordering for local script objects.
        return script.name.lower()

    available = SCRIPT_LIST()

    if keywords or licenses:
        if licenses:
            licenses = [wanted.lower() for wanted in licenses]

        matches = set()
        for script in available:
            if not script.name:
                continue

            if licenses:
                license_names = {
                    entry['name'].lower()
                    for entry in script.licenses if entry['name']
                }
                if license_names & set(licenses):
                    # A license hit is enough; no need to test keywords.
                    matches.add(script)
                    continue

            if not keywords:
                continue

            # Searchable text: title, name and the hyphen-joined keywords.
            haystack = script.title + ' ' + script.name
            if script.keywords:
                haystack = haystack + ' ' + '-'.join(script.keywords)
            haystack = haystack.lower()
            if any(word.lower() in haystack for word in keywords):
                matches.add(script)

        # Local scripts that passed the filters
        offline = sorted(matches, key=by_name)
        # Upstream retriever repository, filtered by the same params
        from_retriever = get_dataset_names_upstream(keywords,
                                                    licenses,
                                                    repo=RETRIEVER_REPOSITORY)
        # Upstream recipes repository, filtered by the same params
        from_recipes = get_dataset_names_upstream(keywords, licenses)
    else:
        # No filters: everything present locally ...
        offline = sorted(available, key=by_name)
        # ... everything in the upstream retriever repository ...
        from_retriever = get_dataset_names_upstream(
            repo=RETRIEVER_REPOSITORY)
        # ... and everything in the upstream recipes repository.
        from_recipes = get_dataset_names_upstream()

    # Unique upstream names in sorted order
    online = sorted(set(from_retriever + from_recipes))
    return {'online': online, 'offline': offline}
Пример #2
0
# Collect every local script name, plus all keywords and license names that
# the scripts declare.
for module in module_list:
    script_list.append(module.name)

    # A script may have no ``keywords`` attribute, or an empty one.
    if getattr(module, "keywords", None):
        keywords_list += module.keywords

    # Record only license entries that carry a non-empty 'name'.
    if getattr(module, "licenses", None):
        licenses_list.extend(
            entry['name'] for entry in module.licenses if entry['name'])

# Add the dataset names available upstream (retriever repository first, then
# the default repository), then drop duplicates and sort.
script_list.extend(get_dataset_names_upstream(repo=RETRIEVER_REPOSITORY))
script_list.extend(get_dataset_names_upstream())
script_list = list(set(script_list))
script_list.sort()

# Unique licenses and keywords seen across all local scripts
licenses_options, keywords_options = set(licenses_list), set(keywords_list)

# Top-level command-line parser with the global flags
parser = argparse.ArgumentParser(prog="retriever")
parser.add_argument('-v', '--version', action='version', version=VERSION)
parser.add_argument('-q', '--quiet',
                    action='store_true',
                    help='suppress command-line output')

# ..............................................................