def index_location(args, config):
    """Rebuild the Elasticsearch index for the site at config['location'].

    Any existing index named config['index'] is deleted and created again,
    using the contents of _settings/settings.json as the creation body
    when that file exists. A mapping is then put for every content
    processor, and each document a processor yields is created in the
    index.

    Processors come from two places: the entries of
    _settings/processors.json, and one "sheer.processors.filesystem"
    processor for every underscore-prefixed directory directly under the
    site root that is not listed in DO_NOT_INDEX.

    Args:
        args: Not read by this function; kept so the signature is
            unchanged for callers.
        config: Mapping providing the 'location', 'elasticsearch' and
            'index' keys. It is also handed to IndexHelper.configure().

    Raises:
        SystemExit: If _defaults/mappings.json exists but
            read_json_file() raises ValueError for it.
    """
    path = config['location']
    add_site_libs(path)

    # This whole routine is probably being too careful.
    # Explicit is better than implicit, though!
    # The IndexHelper singleton can be used in processors that need to
    # talk to elasticsearch.
    index_processor_helper = IndexHelper()
    index_processor_helper.configure(config)

    settings_path = os.path.join(path, '_settings/settings.json')
    default_mapping_path = os.path.join(path, '_defaults/mappings.json')
    processors_path = os.path.join(path, '_settings/processors.json')

    es = Elasticsearch(config["elasticsearch"])
    index_name = config["index"]

    # Always start from an empty index.
    if es.indices.exists(index_name):
        es.indices.delete(index_name)
    if os.path.exists(settings_path):
        # Read through a context manager so the handle is closed right
        # away instead of whenever the garbage collector gets to it.
        with open(settings_path) as settings_file:
            index_settings = settings_file.read()
        es.indices.create(index=index_name, body=index_settings)
    else:
        es.indices.create(index=index_name)

    processors = []
    processor_settings = read_json_file(processors_path)
    if processor_settings:
        processors += [ContentProcessor(name, **details)
                       for name, details in processor_settings.items()]

    underscored = glob.glob(os.path.normpath(path) + "/_*/")
    # Compare normalized paths: the glob results are built from
    # normpath(path), so comparing raw strings misses the ignored
    # directories whenever `path` itself is not already normalized
    # (for example "./site").
    ignore_dirs = set(os.path.normpath(os.path.join(path, d))
                      for d in DO_NOT_INDEX)
    for directory in underscored:
        if os.path.normpath(directory) in ignore_dirs:
            continue
        # TODO: don't create processors for directories that
        # have a configured processor
        # Last path component without the trailing slash and the leading
        # underscore, e.g. "site/_posts/" -> "posts".
        processor_name = directory[:-1].rsplit('/', 1)[-1][1:]
        processors.append(ContentProcessor(
            processor_name,
            directory=directory,
            site_root=path,
            processor="sheer.processors.filesystem"))

    # Load default mapping (or not)
    if os.path.exists(default_mapping_path):
        try:
            default_mapping = read_json_file(default_mapping_path)
        except ValueError:
            sys.exit("default mapping present, but is not valid JSON")
    else:
        default_mapping = {}

    for processor in processors:
        print("creating mapping for %s (%s)" % (processor.name,
                                                 processor.processor_name))
        es.indices.put_mapping(
            index=index_name,
            doc_type=processor.name,
            body={processor.name: processor.mapping(default_mapping)})

        # Stays 0 when the processor yields no documents.
        indexed = 0
        for indexed, document in enumerate(processor.documents(), 1):
            es.create(index=index_name,
                      doc_type=processor.name,
                      id=document['_id'],
                      body=document)
            # "\r" keeps rewriting the same terminal line as the count grows.
            sys.stdout.write("indexed %s %s \r" % (indexed, processor.name))
            sys.stdout.flush()

        sys.stdout.write("indexed %s %s \n" % (indexed, processor.name))
def index_location(args, config):
    """Rebuild the Elasticsearch index for the site at config['location'].

    Any existing index named config['index'] is deleted and created again,
    using the contents of _settings/settings.json as the creation body
    when that file exists. A mapping is then put for every content
    processor, and each document a processor yields is created in the
    index.

    Processors come from two places: the entries of
    _settings/processors.json, and one "sheer.processors.filesystem"
    processor for every underscore-prefixed directory directly under the
    site root that is not listed in DO_NOT_INDEX.

    Args:
        args: Not read by this function; kept so the signature is
            unchanged for callers.
        config: Mapping providing the 'location', 'elasticsearch' and
            'index' keys. It is also handed to IndexHelper.configure().

    Raises:
        SystemExit: If _defaults/mappings.json exists but
            read_json_file() raises ValueError for it.
    """
    path = config['location']
    add_site_libs(path)

    # This whole routine is probably being too careful.
    # Explicit is better than implicit, though!
    # The IndexHelper singleton can be used in processors that need to
    # talk to elasticsearch.
    index_processor_helper = IndexHelper()
    index_processor_helper.configure(config)

    settings_path = os.path.join(path, '_settings/settings.json')
    default_mapping_path = os.path.join(path, '_defaults/mappings.json')
    processors_path = os.path.join(path, '_settings/processors.json')

    es = Elasticsearch(config["elasticsearch"])
    index_name = config["index"]

    # Always start from an empty index.
    if es.indices.exists(index_name):
        es.indices.delete(index_name)
    if os.path.exists(settings_path):
        # Read through a context manager so the handle is closed right
        # away instead of whenever the garbage collector gets to it.
        with open(settings_path) as settings_file:
            index_settings = settings_file.read()
        es.indices.create(index=index_name, body=index_settings)
    else:
        es.indices.create(index=index_name)

    processors = []
    processor_settings = read_json_file(processors_path)
    if processor_settings:
        processors += [ContentProcessor(name, **details)
                       for name, details in processor_settings.items()]

    underscored = glob.glob(os.path.normpath(path) + "/_*/")
    # Compare normalized paths: the glob results are built from
    # normpath(path), so comparing raw strings misses the ignored
    # directories whenever `path` itself is not already normalized
    # (for example "./site").
    ignore_dirs = set(os.path.normpath(os.path.join(path, d))
                      for d in DO_NOT_INDEX)
    for directory in underscored:
        if os.path.normpath(directory) in ignore_dirs:
            continue
        # TODO: don't create processors for directories that
        # have a configured processor
        # Last path component without the trailing slash and the leading
        # underscore, e.g. "site/_posts/" -> "posts".
        processor_name = directory[:-1].rsplit('/', 1)[-1][1:]
        processors.append(ContentProcessor(
            processor_name,
            directory=directory,
            site_root=path,
            processor="sheer.processors.filesystem"))

    # Load default mapping (or not)
    if os.path.exists(default_mapping_path):
        try:
            default_mapping = read_json_file(default_mapping_path)
        except ValueError:
            sys.exit("default mapping present, but is not valid JSON")
    else:
        default_mapping = {}

    for processor in processors:
        print("creating mapping for %s (%s)" % (processor.name,
                                                 processor.processor_name))
        es.indices.put_mapping(
            index=index_name,
            doc_type=processor.name,
            body={processor.name: processor.mapping(default_mapping)})

        # Stays 0 when the processor yields no documents.
        indexed = 0
        for indexed, document in enumerate(processor.documents(), 1):
            es.create(index=index_name,
                      doc_type=processor.name,
                      id=document['_id'],
                      body=document)
            # "\r" keeps rewriting the same terminal line as the count grows.
            sys.stdout.write("indexed %s %s \r" % (indexed, processor.name))
            sys.stdout.flush()

        sys.stdout.write("indexed %s %s \n" % (indexed, processor.name))
def index_location(args, config):
    """Index the site at config['location'] into Elasticsearch.

    The index named config['index'] is deleted first only when
    args.reindex is set and no specific processors were requested. If the
    index does not exist afterwards it is created, using the contents of
    _settings/settings.json as the creation body when that file exists.

    Processors come from two places: the entries of
    _settings/processors.json, and one "sheer.processors.filesystem"
    processor for every underscore-prefixed directory directly under the
    site root that is not listed in DO_NOT_INDEX. When args.processors is
    non-empty only the processors whose name appears in it are run;
    otherwise all of them are. Each one is handed to index_processor().

    Args:
        args: Object exposing `processors` (names of the processors to
            run, may be empty or None) and `reindex` (forwarded to
            index_processor()).
        config: Mapping providing the 'location', 'elasticsearch' and
            'index' keys. It is also handed to IndexHelper.configure().

    Raises:
        SystemExit: If index_processor() returns a false value for any
            selected processor; the message lists the failed names.
    """
    path = config['location']
    add_site_libs(path)

    # This whole routine is probably being too careful.
    # Explicit is better than implicit, though!
    # The IndexHelper singleton can be used in processors that need to
    # talk to elasticsearch.
    index_processor_helper = IndexHelper()
    index_processor_helper.configure(config)

    settings_path = os.path.join(path, '_settings/settings.json')
    processors_path = os.path.join(path, '_settings/processors.json')

    es = Elasticsearch(config["elasticsearch"])
    index_name = config["index"]

    # If we're given args.reindex and NOT given a list of processors to
    # reindex, we're expected to reindex everything. Delete the existing
    # index.
    if (not args.processors and args.reindex
            and es.indices.exists(index_name)):
        print("reindexing %s" % index_name)
        es.indices.delete(index_name)

    # If the index doesn't exist, create it.
    if not es.indices.exists(index_name):
        if os.path.exists(settings_path):
            # Read through a context manager so the handle is closed right
            # away instead of whenever the garbage collector gets to it.
            with open(settings_path) as settings_file:
                index_settings = settings_file.read()
            es.indices.create(index=index_name, body=index_settings)
        else:
            es.indices.create(index=index_name)

    processors = []
    processor_settings = read_json_file(processors_path)
    if processor_settings:
        processors += [ContentProcessor(name, **details)
                       for name, details in processor_settings.items()]

    underscored = glob.glob(os.path.normpath(path) + "/_*/")
    # Compare normalized paths: the glob results are built from
    # normpath(path), so comparing raw strings misses the ignored
    # directories whenever `path` itself is not already normalized
    # (for example "./site").
    ignore_dirs = set(os.path.normpath(os.path.join(path, d))
                      for d in DO_NOT_INDEX)
    for directory in underscored:
        if os.path.normpath(directory) in ignore_dirs:
            continue
        # TODO: don't create processors for directories that
        # have a configured processor
        # Last path component without the trailing slash and the leading
        # underscore, e.g. "site/_posts/" -> "posts".
        processor_name = directory[:-1].rsplit('/', 1)[-1][1:]
        processors.append(ContentProcessor(
            processor_name,
            directory=directory,
            site_root=path,
            processor="sheer.processors.filesystem"))

    # If any specific content processors were selected, we run them.
    # Otherwise we run all of them.
    selected_processors = processors
    if args.processors:
        selected_processors = [p for p in processors
                               if p.name in args.processors]

    failed_processors = []
    for processor in selected_processors:
        index_success = index_processor(es, index_name, processor,
                                        reindex=args.reindex)
        if not index_success:
            failed_processors.append(processor.name)

    # Exit with an error code != 0 if there were any issues with indexing
    if failed_processors:
        sys.exit("Indexing the following processor(s) failed: {}".format(
            ", ".join(failed_processors)))