def test_append_update():
    """Make sure existing keys are merged and nonexistent ones are created
    as lists."""
    d = {'o': ['hai']}
    eq_(append_update(d, [('o', 'hello'), ('o', 'howdy'), ('p', 'pod')]),
        {'o': ['hai', 'hello', 'howdy'],
         'p': ['pod']})
def filters_by_name(plugins):
    """Return a mapping of filter names to all filters with that name,
    regardless of whether they have descriptions.

    :arg plugins: An iterable of plugins from which to get filters

    """
    return append_update(
        {},
        ((f.name, f) for f in
         chain.from_iterable(p.filters for p in plugins)))
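# Both test_append_update and filters_by_name above lean on append_update,
# which isn't defined in this excerpt. The following is a minimal sketch,
# assuming it simply appends each value to a per-key list (the behavior the
# test asserts); the real implementation and its home module may differ.
def append_update(mapping, pairs):
    """Append each value from (key, value) ``pairs`` to a list stored under
    its key in ``mapping``, creating the list for keys not yet present, and
    return ``mapping``.

    Hypothetical sketch inferred from test_append_update; not necessarily
    the actual implementation.
    """
    for key, value in pairs:
        mapping.setdefault(key, []).append(value)
    return mapping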
def index_file(tree, tree_indexers, path, es, index):
    """Index a single file into ES, and build a static HTML representation
    of it.

    For the moment, we execute plugins in series, figuring that we have
    plenty of files to keep our processors busy in most trees that take very
    long. I'm a little afraid of the cost of passing potentially large
    TreesToIndex to worker processes. That goes at 52MB/s on my OS X laptop,
    measuring by the size of the pickled object and including the pickling
    and unpickling time.

    :arg path: Bytestring absolute path to the file to index
    :arg index: The ES index name

    """
    try:
        contents = unicode_contents(path, tree.source_encoding)
    except IOError as exc:
        if exc.errno == ENOENT and islink(path):
            # It's just a bad symlink (or a symlink that was swiped out
            # from under us--whatever)
            return
        else:
            raise

    # Just like index_folders, if the path is not in UTF-8, then
    # elasticsearch will not accept the path, so just move on.
    rel_path = relpath(path, tree.source_folder)

    is_text = isinstance(contents, unicode)
    is_link = islink(path)
    # Index by line if the contents are text and the path is not a symlink.
    index_by_line = is_text and not is_link
    if index_by_line:
        lines = split_content_lines(contents)
        num_lines = len(lines)
        needles_by_line = [{} for _ in xrange(num_lines)]
        annotations_by_line = [[] for _ in xrange(num_lines)]
        refses, regionses = [], []

    needles = {}
    linkses = []

    for tree_indexer in tree_indexers:
        file_to_index = tree_indexer.file_to_index(rel_path, contents)
        if file_to_index.is_interesting():
            # Per-file stuff:
            append_update(needles, file_to_index.needles())
            if not is_link:
                linkses.append(file_to_index.links())

            # Per-line stuff:
            if index_by_line:
                refses.append(file_to_index.refs())
                regionses.append(file_to_index.regions())
                append_update_by_line(needles_by_line,
                                      file_to_index.needles_by_line())
                append_by_line(annotations_by_line,
                               file_to_index.annotations_by_line())

    def docs():
        """Yield documents for bulk indexing.

        Big Warning: docs also clears the contents of all elements of
        needles_by_line because they will no longer be used.

        """
        # Index a doc of type 'file' so we can build folder listings.
        # At the moment, we send to ES in the same worker that does the
        # indexing. We could interpose an external queueing system, but I'm
        # willing to potentially sacrifice a little speed here for the easy
        # management of self-throttling.
        file_info = stat(path)
        folder_name, file_name = split(rel_path)
        # Hard-code the keys that are hard-coded in the browse()
        # controller. Merge with the pluggable ones from needles:
        doc = dict(# Some non-array fields:
                   folder=unicode_for_display(folder_name),
                   name=unicode_for_display(file_name),
                   size=file_info.st_size,
                   is_folder=False,

                   # And these, which all get mashed into arrays:
                   **needles)
        links = dictify_links(chain.from_iterable(linkses))
        if links:
            doc['links'] = links
        yield es.index_op(doc, doc_type=FILE)

        # Index all the lines.
        if index_by_line:
            for total, annotations_for_this_line, tags in izip(
                    needles_by_line,
                    annotations_by_line,
                    es_lines(finished_tags(lines,
                                           chain.from_iterable(refses),
                                           chain.from_iterable(regionses)))):
                # Duplicate the file-wide needles into this line:
                total.update(needles)

                # We bucket tags into refs and regions for ES because later
                # at request time we want to be able to merge them
                # individually with those from skimmers.
                refs_and_regions = bucket(
                    tags,
                    lambda index_obj: "regions" if isinstance(
                        index_obj['payload'], basestring) else "refs")
                if 'refs' in refs_and_regions:
                    total['refs'] = refs_and_regions['refs']
                if 'regions' in refs_and_regions:
                    total['regions'] = refs_and_regions['regions']
                if annotations_for_this_line:
                    total['annotations'] = annotations_for_this_line

                yield es.index_op(total)

                # Because needles_by_line holds a reference, total is not
                # garbage collected. Since we won't use it again, we can
                # clear the contents, saving substantial memory on long
                # files.
                total.clear()

    # Indexing a 277K-line file all in one request makes ES time out (>60s),
    # so we chunk it up. 300 docs is optimal according to the benchmarks in
    # https://bugzilla.mozilla.org/show_bug.cgi?id=1122685. So large docs
    # like images don't make our chunk sizes ridiculous, there's a size
    # ceiling as well: 10000 is based on the 300 and an average of 31 chars
    # per line.
    for chunk in bulk_chunks(docs(), docs_per_chunk=300,
                             bytes_per_chunk=10000):
        es.bulk(chunk, index=index, doc_type=LINE)
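# index_file's per-line loop above calls a ``bucket`` helper that isn't
# defined in this excerpt. Judging only from the call site, it appears to
# group items into lists keyed by the result of a key function and to return
# a mapping that supports ``in`` and item lookup. A minimal sketch under that
# assumption (the real helper may differ):
def bucket(items, key):
    """Group ``items`` into lists keyed by ``key(item)`` and return a dict.

    Hypothetical sketch inferred from the bucketing of tags into 'refs' and
    'regions' in index_file; not necessarily the actual implementation.
    """
    groups = {}
    for item in items:
        groups.setdefault(key(item), []).append(item)
    return groups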