Exemplo n.º 1
0
def index_location(args, config):
    """Recreate the Elasticsearch index for a site and index its content.

    The index named by ``config["index"]`` is deleted if it exists and then
    created again, using ``_settings/settings.json`` under the site root as
    the creation body when that file is present.  Content processors are
    collected from ``_settings/processors.json`` plus one filesystem
    processor for every ``_*/`` directory under the site root that is not
    listed in ``DO_NOT_INDEX``.  For each processor a mapping is put and
    every document it yields is created in the index.

    :param args: not read by this variant; kept for interface compatibility.
    :param config: mapping providing ``'location'`` (site root),
        ``'elasticsearch'`` (passed to ``Elasticsearch``) and ``'index'``
        (index name); the whole mapping is also handed to
        ``IndexHelper.configure``.
    :raises SystemExit: if ``_defaults/mappings.json`` exists but reading it
        raises ``ValueError``.
    """
    path = config['location']
    add_site_libs(path)

    # Configure the IndexHelper singleton up front so that processors which
    # need to talk to elasticsearch can use it.
    index_processor_helper = IndexHelper()
    index_processor_helper.configure(config)

    settings_path = os.path.join(path, '_settings/settings.json')
    default_mapping_path = os.path.join(path, '_defaults/mappings.json')
    processors_path = os.path.join(path, '_settings/processors.json')

    es = Elasticsearch(config["elasticsearch"])
    index_name = config["index"]

    # Always start from a fresh index.
    if es.indices.exists(index_name):
        es.indices.delete(index_name)
    if os.path.exists(settings_path):
        # Read through a context manager so the handle is closed promptly
        # instead of being left to the garbage collector.
        with open(settings_path) as settings_file:
            index_settings = settings_file.read()
        es.indices.create(index=index_name, body=index_settings)
    else:
        es.indices.create(index=index_name)

    processors = []
    processor_settings = read_json_file(processors_path)
    if processor_settings:
        processors.extend(ContentProcessor(name, **details)
                          for name, details in processor_settings.items())

    # Compare normalised paths on both sides: the glob pattern is built from
    # the normalised site root and its results carry a trailing slash, so a
    # raw string comparison misses excluded directories whenever ``path``
    # itself is not already in normal form (e.g. "./site").
    ignored_dirs = set(os.path.normpath(os.path.join(path, d))
                       for d in DO_NOT_INDEX)
    for directory in glob.glob(os.path.normpath(path) + "/_*/"):
        normalized = os.path.normpath(directory)
        if normalized in ignored_dirs:
            continue
        # TODO: don't create processors for directories that
        # have a configured processor
        # The processor name is the directory name minus its leading "_".
        processor_name = os.path.basename(normalized)[1:]
        processors.append(
            ContentProcessor(processor_name,
                             directory=directory,
                             site_root=path,
                             processor="sheer.processors.filesystem"))

    # Load default mapping (or not)
    if os.path.exists(default_mapping_path):
        try:
            default_mapping = read_json_file(default_mapping_path)
        except ValueError:
            sys.exit("default mapping present, but is not valid JSON")
    else:
        default_mapping = {}

    for processor in processors:
        # Single-argument parenthesised print: same output as the print
        # statement on Python 2, and also valid Python 3 syntax.
        print("creating mapping for %s (%s)" % (processor.name,
                                                processor.processor_name))
        es.indices.put_mapping(
            index=index_name,
            doc_type=processor.name,
            body={processor.name: processor.mapping(default_mapping)})

        # Start at -1 so the final summary reports 0 when the processor
        # yields no documents.
        i = -1
        for i, document in enumerate(processor.documents()):
            es.create(index=index_name,
                      doc_type=processor.name,
                      id=document['_id'],
                      body=document)
            # "\r" keeps the progress counter on a single terminal line.
            sys.stdout.write("indexed %s %s \r" % (i + 1, processor.name))
            sys.stdout.flush()

        sys.stdout.write("indexed %s %s \n" % (i + 1, processor.name))
Exemplo n.º 2
0
def index_location(args, config):
    """Drop, recreate and repopulate the site's Elasticsearch index.

    Steps, in order:

    1. Delete the index ``config["index"]`` if it exists, then create it
       (with the contents of ``_settings/settings.json`` as body when that
       file exists under the site root).
    2. Build ``ContentProcessor`` objects from ``_settings/processors.json``
       and from each ``_*/`` directory of the site root not excluded by
       ``DO_NOT_INDEX``.
    3. For every processor, put its mapping and create each document it
       yields, printing a running count.

    :param args: not read by this variant; kept for interface compatibility.
    :param config: mapping with the keys ``'location'``, ``'elasticsearch'``
        and ``'index'``; it is also passed as a whole to
        ``IndexHelper.configure``.
    :raises SystemExit: when ``_defaults/mappings.json`` exists but
        ``read_json_file`` raises ``ValueError`` for it.
    """
    path = config['location']
    add_site_libs(path)

    # The IndexHelper singleton is configured here so processors that need
    # to talk to elasticsearch can rely on it.
    index_processor_helper = IndexHelper()
    index_processor_helper.configure(config)

    settings_path = os.path.join(path, '_settings/settings.json')
    default_mapping_path = os.path.join(path, '_defaults/mappings.json')
    processors_path = os.path.join(path, '_settings/processors.json')

    es = Elasticsearch(config["elasticsearch"])
    index_name = config["index"]

    if es.indices.exists(index_name):
        es.indices.delete(index_name)

    create_kwargs = {'index': index_name}
    if os.path.exists(settings_path):
        # ``with`` guarantees the settings file is closed; the previous
        # ``file(...).read()`` left the handle open until collected.
        with open(settings_path) as settings_file:
            create_kwargs['body'] = settings_file.read()
    es.indices.create(**create_kwargs)

    processors = []
    processor_settings = read_json_file(processors_path)
    if processor_settings:
        for name, details in processor_settings.items():
            processors.append(ContentProcessor(name, **details))

    # Normalise both the excluded paths and the glob results before
    # comparing; glob results end in "/" and are rooted at the normalised
    # site path, so comparing raw strings can fail to exclude a directory.
    site_root = os.path.normpath(path)
    ignored_dirs = set(os.path.normpath(os.path.join(path, d))
                       for d in DO_NOT_INDEX)
    underscored = glob.glob(site_root + "/_*/")
    filesystem_candidates = [
        u for u in underscored
        if os.path.normpath(u) not in ignored_dirs
    ]

    for candidate in filesystem_candidates:
        # TODO: don't create processors for directories that
        # have a configured processor
        # Directory "_posts/" becomes processor name "posts".
        processor_name = os.path.basename(os.path.normpath(candidate))[1:]
        processor_args = dict(directory=candidate,
                              site_root=path,
                              processor="sheer.processors.filesystem")
        processors.append(ContentProcessor(processor_name, **processor_args))

    # Load default mapping (or not)
    if os.path.exists(default_mapping_path):
        try:
            default_mapping = read_json_file(default_mapping_path)
        except ValueError:
            sys.exit("default mapping present, but is not valid JSON")
    else:
        default_mapping = {}

    for processor in processors:
        # Parenthesised single-argument print prints the same text under
        # Python 2 and parses under Python 3 as well.
        print("creating mapping for %s (%s)" % (processor.name,
                                                processor.processor_name))
        es.indices.put_mapping(
            index=index_name,
            doc_type=processor.name,
            body={processor.name: processor.mapping(default_mapping)})

        # -1 makes the summary line below read "indexed 0 ..." when no
        # documents are produced.
        i = -1
        for i, document in enumerate(processor.documents()):
            es.create(index=index_name,
                      doc_type=processor.name,
                      id=document['_id'],
                      body=document)
            # Carriage return rewrites the same line as a progress counter.
            sys.stdout.write("indexed %s %s \r" % (i + 1, processor.name))
            sys.stdout.flush()

        sys.stdout.write("indexed %s %s \n" % (i + 1, processor.name))
Exemplo n.º 3
0
def index_location(args, config):
    """Index a site's content into Elasticsearch, optionally reindexing.

    When ``args.reindex`` is set and no specific processors were requested,
    the existing index is deleted first.  The index is created if it does
    not exist (using ``_settings/settings.json`` under the site root as the
    creation body when present).  Processors are collected from
    ``_settings/processors.json`` and from every ``_*/`` directory of the
    site root that is not excluded by ``DO_NOT_INDEX``; each selected
    processor is then handed to ``index_processor``.

    :param args: object exposing ``processors`` (names of processors to run;
        falsy means run all of them) and ``reindex`` (forwarded to
        ``index_processor`` and used to decide whether to drop the index).
    :param config: mapping with the keys ``'location'``, ``'elasticsearch'``
        and ``'index'``; it is also passed as a whole to
        ``IndexHelper.configure``.
    :raises SystemExit: with a message naming the failed processors when
        ``index_processor`` returns a falsy value for any of them.
    """
    path = config['location']
    add_site_libs(path)

    # Configure the IndexHelper singleton so processors that need to talk
    # to elasticsearch can use it.
    index_processor_helper = IndexHelper()
    index_processor_helper.configure(config)

    settings_path = os.path.join(path, '_settings/settings.json')
    processors_path = os.path.join(path, '_settings/processors.json')

    es = Elasticsearch(config["elasticsearch"])
    index_name = config["index"]

    # If we're given args.reindex and NOT given a list of processors to
    # reindex, we're expected to reindex everything. Delete the existing
    # index.
    if not args.processors and args.reindex and es.indices.exists(index_name):
        # Single-argument parenthesised print: identical output on
        # Python 2, and valid syntax on Python 3.
        print("reindexing %s" % index_name)
        es.indices.delete(index_name)

    # If the index doesn't exist, create it.
    if not es.indices.exists(index_name):
        if os.path.exists(settings_path):
            # Close the settings file deterministically rather than leaking
            # the handle as ``file(...).read()`` did.
            with open(settings_path) as settings_file:
                index_settings = settings_file.read()
            es.indices.create(index=index_name, body=index_settings)
        else:
            es.indices.create(index=index_name)

    processors = []
    processor_settings = read_json_file(processors_path)
    if processor_settings:
        processors.extend(ContentProcessor(name, **details)
                          for name, details in processor_settings.items())

    # Normalise both sides of the exclusion check: glob results end in "/"
    # and are rooted at the normalised site path, so a raw string comparison
    # against ``os.path.join(path, d)`` can miss excluded directories.
    ignored_dirs = set(os.path.normpath(os.path.join(path, d))
                       for d in DO_NOT_INDEX)
    for directory in glob.glob(os.path.normpath(path) + "/_*/"):
        normalized = os.path.normpath(directory)
        if normalized in ignored_dirs:
            continue
        # TODO: don't create processors for directories that
        # have a configured processor
        # The processor name is the directory name minus its leading "_".
        processor_name = os.path.basename(normalized)[1:]
        processors.append(
            ContentProcessor(processor_name,
                             directory=directory,
                             site_root=path,
                             processor="sheer.processors.filesystem"))

    # If any specific content processors were selected, we run them.
    # Otherwise we run all of them.
    selected_processors = processors
    if args.processors:
        selected_processors = [p for p in processors
                               if p.name in args.processors]

    failed_processors = []
    for processor in selected_processors:
        index_success = index_processor(es,
                                        index_name,
                                        processor,
                                        reindex=args.reindex)
        if not index_success:
            failed_processors.append(processor.name)

    # Exit with an error code != 0 if there were any issues with indexing
    if failed_processors:
        sys.exit("Indexing the following processor(s) failed: {}".format(
            ", ".join(failed_processors)))