def docs(): """Yield documents for bulk indexing.""" # Index a doc of type 'file' so we can build folder listings. # At the moment, we send to ES in the same worker that does the # indexing. We could interpose an external queueing system, but I'm # willing to potentially sacrifice a little speed here for the easy # management of self-throttling. file_info = stat(path) folder_name, file_name = split(rel_path) # Hard-code the keys that are hard-coded in the browse() # controller. Merge with the pluggable ones from needles: doc = dict( # Some non-array fields: folder=folder_name, name=file_name, size=file_info.st_size, modified=datetime.fromtimestamp(file_info.st_mtime), is_folder=False, # And these, which all get mashed into arrays: **needles) links = [{ 'order': order, 'heading': heading, 'items': [{ 'icon': icon, 'title': title, 'href': href } for icon, title, href in items] } for order, heading, items in chain.from_iterable(linkses)] if links: doc['links'] = links yield es.index_op(doc, doc_type=FILE) # Index all the lines. if index_by_line: for total, annotations_for_this_line, tags in izip( needles_by_line, annotations_by_line, es_lines( finished_tags(lines, chain.from_iterable(refses), chain.from_iterable(regionses)))): # Duplicate the file-wide needles into this line: total.update(needles) # We bucket tags into refs and regions for ES because later at # request time we want to be able to merge them individually # with those from skimmers. refs_and_regions = bucket( tags, lambda index_obj: "regions" if isinstance( index_obj['payload'], basestring) else "refs") if 'refs' in refs_and_regions: total['refs'] = refs_and_regions['refs'] if 'regions' in refs_and_regions: total['regions'] = refs_and_regions['regions'] if annotations_for_this_line: total['annotations'] = annotations_for_this_line yield es.index_op(total)
def docs(): """Yield documents for bulk indexing. Big Warning: docs also clears the contents of all elements of needles_by_line because they will no longer be used. """ # Index a doc of type 'file' so we can build folder listings. # At the moment, we send to ES in the same worker that does the # indexing. We could interpose an external queueing system, but I'm # willing to potentially sacrifice a little speed here for the easy # management of self-throttling. file_info = stat(path) folder_name, file_name = split(rel_path) # Hard-code the keys that are hard-coded in the browse() # controller. Merge with the pluggable ones from needles: doc = dict(# Some non-array fields: folder=folder_name, name=file_name, size=file_info.st_size, modified=datetime.fromtimestamp(file_info.st_mtime), is_folder=False, # And these, which all get mashed into arrays: **needles) links = dictify_links(chain.from_iterable(linkses)) if links: doc['links'] = links yield es.index_op(doc, doc_type=FILE) # Index all the lines. if index_by_line: for total, annotations_for_this_line, tags in izip( needles_by_line, annotations_by_line, es_lines(finished_tags(lines, chain.from_iterable(refses), chain.from_iterable(regionses)))): # Duplicate the file-wide needles into this line: total.update(needles) # We bucket tags into refs and regions for ES because later at # request time we want to be able to merge them individually # with those from skimmers. refs_and_regions = bucket(tags, lambda index_obj: "regions" if isinstance(index_obj['payload'], basestring) else "refs") if 'refs' in refs_and_regions: total['refs'] = refs_and_regions['refs'] if 'regions' in refs_and_regions: total['regions'] = refs_and_regions['regions'] if annotations_for_this_line: total['annotations'] = annotations_for_this_line yield es.index_op(total) # Because needles_by_line holds a reference, total is not # garbage collected. Since we won't use it again, we can clear # the contents, saving substantial memory on long files. total.clear()
def docs(): """Yield documents for bulk indexing.""" # Index a doc of type 'file' so we can build folder listings. # At the moment, we send to ES in the same worker that does the # indexing. We could interpose an external queueing system, but I'm # willing to potentially sacrifice a little speed here for the easy # management of self-throttling. file_info = stat(path) folder_name, file_name = split(rel_path) # Hard-code the keys that are hard-coded in the browse() # controller. Merge with the pluggable ones from needles: doc = dict(# Some non-array fields: folder=folder_name, name=file_name, size=file_info.st_size, modified=datetime.fromtimestamp(file_info.st_mtime), is_folder=False, # And these, which all get mashed into arrays: **needles) links = [{'order': order, 'heading': heading, 'items': [{'icon': icon, 'title': title, 'href': href} for icon, title, href in items]} for order, heading, items in chain.from_iterable(linkses)] if links: doc['links'] = links yield es.index_op(doc, doc_type=FILE) # Index all the lines. If it's an empty file (no lines), don't bother # ES. It hates empty dicts. if is_text and needles_by_line: for total, annotations_for_this_line, tags in izip( needles_by_line, annotations_by_line, es_lines(finished_tags(lines, chain.from_iterable(refses), chain.from_iterable(regionses)))): # Duplicate the file-wide needles into this line: total.update(needles) # We bucket tags into refs and regions for ES because later at # request time we want to be able to merge them individually # with those from skimmers. refs_and_regions = bucket(tags, lambda index_obj: "regions" if isinstance(index_obj['payload'], basestring) else "refs") if 'refs' in refs_and_regions: total['refs'] = refs_and_regions['refs'] if 'regions' in refs_and_regions: total['regions'] = refs_and_regions['regions'] if annotations_for_this_line: total['annotations'] = annotations_for_this_line yield es.index_op(total)