def build_lexicon(
    docs_to_index: List[DocumentToIndex],
    lex_path: str,
) -> Lexicon:
    """Build a lexicon from the given documents, store it, and return it.

    Word counts come from ``get_word_counts(docs_to_index)``. Word ids are
    assigned by enumerating the tokens in sorted order, so the id of a token
    is its rank among all tokens seen.

    Args:
        docs_to_index: Documents whose words are counted.
        lex_path: Destination path the lexicon is stored to.

    Returns:
        The newly constructed ``Lexicon`` (already written to ``lex_path``).
    """
    print('Building lexicon: {}'.format(lex_path))
    word_counts = get_word_counts(docs_to_index)

    # Ids follow sorted-token order so the assignment is deterministic.
    ordered_tokens = sorted(word_counts.keys())
    entries = []
    for word_id, token in enumerate(ordered_tokens):
        entries.append(Lexicon.Word(word_id, token, word_counts[token]))
    lexicon = Lexicon(entries)

    print('Storing lexicon: {}'.format(lex_path))
    lexicon.store(lex_path)
    return lexicon
def main(
    index_dir: str,
    new_doc_dir: Optional[str],
    chunk_size: Optional[int] = None,
    skip_existing_names: bool = False
):
    """Add new documents to the existing index stored under ``index_dir``.

    Steps, in order:
      1. Load ``lexicon.txt`` and ``documents.txt`` from ``index_dir``.
      2. Collect the candidate documents (from ``new_doc_dir`` if given,
         otherwise from stdin) and drop or reject names already indexed.
      3. Merge the new word counts into the lexicon; existing words keep
         their ids, unseen words are appended with the next free ids.
      4. If ``index.bin`` is a single file, turn it into a directory that
         contains the old file under a ``{first}-{last}.bin`` style name.
      5. Index the new documents, then write the updated documents file
         (keeping the previous one as ``documents.txt.old``) and lexicon.

    Args:
        index_dir: Directory holding the existing index files.
        new_doc_dir: Directory of documents to add; when falsy the
            documents are read from stdin instead.
        chunk_size: Forwarded to ``index_new_docs``; must be ``None`` or
            positive. (Exact meaning is defined by ``index_new_docs``.)
        skip_existing_names: When true, documents whose name is already
            indexed are skipped with a message; otherwise the first such
            document aborts the run with an exception.

    Raises:
        AssertionError: If ``chunk_size`` is not positive, or no input
            documents were found.
        Exception: If a document name is already indexed and
            ``skip_existing_names`` is false.
    """
    assert chunk_size is None or chunk_size > 0

    doc_path = os.path.join(index_dir, 'documents.txt')
    lex_path = os.path.join(index_dir, 'lexicon.txt')
    index_path = os.path.join(index_dir, 'index.bin')

    old_lexicon = Lexicon.load(lex_path)
    documents = Documents.load(doc_path)

    new_docs_to_index = (
        list_docs(new_doc_dir) if new_doc_dir else read_docs_from_stdin())
    assert len(new_docs_to_index) > 0

    # Keep only documents whose names are not yet part of the index.
    unseen_docs = []
    for candidate in new_docs_to_index:
        if candidate.name not in documents:
            unseen_docs.append(candidate)
            continue
        if not skip_existing_names:
            raise Exception(
                '{} is already indexed! Aborting.'.format(candidate.name))
        print('Skipping: {} is already indexed!'.format(candidate.name))
    new_docs_to_index = unseen_docs

    if not new_docs_to_index:
        print('No new documents to index.')
        return

    # Update lexicon: bump counts of known words first so their ids are
    # preserved, then append words that have never been seen before.
    added_counts = get_word_counts(new_docs_to_index)
    merged_words = []
    for known in old_lexicon:
        if known.token in added_counts:
            total = known.count + added_counts[known.token]
        else:
            total = known.count
        merged_words.append(Lexicon.Word(known.id, known.token, total))
    for token in added_counts:
        if token not in old_lexicon:
            next_id = len(merged_words)
            merged_words.append(
                Lexicon.Word(next_id, token, added_counts[token]))
    lexicon = Lexicon(merged_words)

    # New document ids continue right after the existing ones.
    base_doc_id = len(documents)
    new_documents = []
    for offset, doc in enumerate(new_docs_to_index):
        new_documents.append(
            Documents.Document(id=offset + base_doc_id, name=doc.name))

    # Convert existing index.bin to a directory if needed
    if os.path.isfile(index_path):
        tmp_index_path = index_path + '.tmp'
        shutil.move(index_path, tmp_index_path)
        os.makedirs(index_path)
        legacy_chunk_name = '{:07d}-{:07d}.bin'.format(0, base_doc_id)
        shutil.move(
            tmp_index_path, os.path.join(index_path, legacy_chunk_name))
    assert os.path.isdir(index_path)

    # Index the new documents
    data_dir = os.path.join(index_dir, 'data')
    index_new_docs(new_docs_to_index, new_documents, lexicon, index_path,
                   data_dir, chunk_size)

    # Write out the new documents file, keeping the previous one as a backup
    shutil.move(doc_path, doc_path + '.old')
    all_documents = list(documents)
    all_documents.extend(new_documents)
    Documents(all_documents).store(doc_path)

    # Update to the new lexicon
    lexicon.store(lex_path)

    print('Done!')