def batch_insert(docs, doc_types, authors, recipients):
    '''
    Inserts the documents in batches

    :param docs: rows for the docs table
    :param doc_types: rows for the doc_types table (doc_id, doc_type, weight)
    :param authors: rows for the authors table (doc_id, author)
    :param recipients: rows for the recipients table (doc_id, recipient)
    :return:
    '''

    db2 = Database("TOB_FULL")
    db2.batch_insert('docs',
                     ['id', 'tid', 'timestamp', 'year', 'date_orig', 'title', 'collection_id',
                      'pages', 'no_docs', 'availability'],
                     docs)
    db2.batch_insert('doc_types', ['doc_id', 'doc_type', 'weight'], doc_types)
    db2.batch_insert('authors', ['doc_id', 'author'], authors)
    db2.batch_insert('recipients', ['doc_id', 'recipient'], recipients)
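
# A minimal usage sketch for batch_insert (not part of the pipeline). It assumes, based on the
# tokens insert in store_vocabulary_slice below, that Database.batch_insert expects one dict per
# row keyed by the column names; all sample values here are illustrative only.
def _batch_insert_example():
    docs = [{
        'id': 1, 'tid': 'sample_tid', 'timestamp': 631152000, 'year': 1990,
        'date_orig': '1990-01-01', 'title': 'Sample memo', 'collection_id': 5,
        'pages': 3, 'no_docs': 1, 'availability': 'public'
    }]
    doc_types = [{'doc_id': 1, 'doc_type': 'memo', 'weight': 1.0}]
    authors = [{'doc_id': 1, 'author': 'Doe, J.'}]
    recipients = [{'doc_id': 1, 'recipient': 'Roe, R.'}]
    batch_insert(docs, doc_types, authors, recipients)
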
def store_vocabulary_slice(data, indices, indptr, vocabulary_slice, ngram, vocabulary_offset,
                           add_new_terms, use_sections=False):
    '''
    Iterates through the vocabulary slice processed so far and stores every token
    a) in the tokens table of tob_full (token, token_reversed, id, ngram, total)
    b) as a compressed sparse matrix on disk

    :param data: raw buffer of the CSR data array of the document-term counts
    :param indices: raw buffer of the CSR indices array
    :param indptr: raw buffer of the CSR indptr array
    :param vocabulary_slice: dict mapping each token in this slice to its global id
    :param ngram: ngram length of the tokens in this slice
    :param vocabulary_offset: global id of the first token in this slice (subtracted to get the local column index)
    :param add_new_terms: if True, size the matrix to the full vocabulary rather than just this slice
    :param use_sections: if True, store section-level vectors and skip the database inserts
    :return:
    '''

    print("finished tokenizing. storing vocabulary slice.")

    # parse to int (may not be necessary)
    data = np.frombuffer(data, dtype=np.int64)
    indices = np.frombuffer(indices, dtype=np.int64)
    indptr = np.frombuffer(indptr, dtype=np.int64)

    # if adding new terms, the temp matrix has to have as many columns as the vocabulary as a whole,
    # not just the current vocabulary slice
    if add_new_terms:
        shape = (len(indptr) - 1, len(load_vocabulary_trie(ngram)))
    else:
        shape = (len(indptr) - 1, len(vocabulary_slice))
    temp_matrix = csr_matrix((data, indices, indptr), shape=shape, dtype=np.int64)

    # get global tfidf weights here
    from IPython import embed
    embed()

    temp_matrix = temp_matrix.tocsc()
    print("temp matrix")
    print("shape", temp_matrix.shape)
    print("indptr, voc slice", len(indptr), len(vocabulary_slice))
    print("nnz", temp_matrix.getnnz())
    print("len, sum of data", len(data), np.sum(data))

    db = Database("TOB_FULL")
    tokens = []
    for token in vocabulary_slice:

        # every 20,000 tokens: run a quality check on the first stored vector and flush the batch to the db
        if len(tokens) >= 20000:
            print("Quality control on first token vector")
            test_vector = get_ngram_vector(tokens[0]['token'])
            print("token: ", tokens[0]['token'], " total db: ", tokens[0]['total'],
                  "total vector ", test_vector.sum(), "Shape: ", test_vector.shape,
                  " nnz: ", test_vector.getnnz(), "indptr: ", test_vector.indptr,
                  " data len ", len(test_vector.data), " indices len ", len(test_vector.indices))
            if not use_sections:
                db.batch_insert('tokens', ['token', 'token_reversed', 'id', 'ngram', 'total'], tokens)
            tokens = []

        token_id = vocabulary_slice[token]

        # extract indptr, data, and indices directly instead of forming a column slice first
        # (the column slice takes about 3 secs per term)
        # subtract the vocabulary offset to get the correct local column index
        indptr_token_start = temp_matrix.indptr[token_id - vocabulary_offset]
        indptr_token_end = temp_matrix.indptr[token_id + 1 - vocabulary_offset]
        indices_token = temp_matrix.indices[indptr_token_start:indptr_token_end]
        data_token = temp_matrix.data[indptr_token_start:indptr_token_end]
        indptr_token = np.array([0, len(indices_token)], dtype=np.int64)

        # if add_new_terms:
        #     shape = (len(load_vocabulary_trie(ngram)), 1)
        # else:
        shape = (temp_matrix.shape[0], 1)
        token_vector = csc_matrix((data_token, indices_token, indptr_token), shape=shape)

        # to compress directory: tar -c tokens | pv --size `du -csh tokens | grep total | cut -f1` | pigz -9 > tokens.tar.gz
        hash_path = hashlib.sha256(token.encode()).hexdigest()
        if use_sections:
            hash_path += '_sections'
        token_path = PATH_TOKENS + '{}/{}/{}/{}/'.format(hash_path[0], hash_path[1], hash_path[2], hash_path[3])
        if not os.path.exists(token_path):
            os.makedirs(token_path)
        store_csr_matrix_to_file(token_vector, token_path + hash_path, compressed=True)

        if not use_sections:
            tokens.append({
                'token': token,
                'token_reversed': token[::-1],
                'id': token_id,
                'ngram': ngram,
                'total': np.sum(data_token)
            })

    if not use_sections:
        db.batch_insert('tokens', ['token', 'token_reversed', 'id', 'ngram', 'total'], tokens)
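
# Self-contained sketch of the column extraction used in store_vocabulary_slice: instead of
# slicing temp_matrix[:, col] (which takes about 3 seconds per term on the full matrix), the loop
# above reads the CSC internals (indptr/indices/data) directly. The helper and the toy matrix
# below are illustrative only and not part of the pipeline.
def _extract_column_example():
    import numpy as np
    from scipy.sparse import csc_matrix

    def extract_column(mat, col):
        # build a single-column CSC matrix straight from mat's raw arrays
        start, end = mat.indptr[col], mat.indptr[col + 1]
        col_data = mat.data[start:end]
        col_indices = mat.indices[start:end]
        col_indptr = np.array([0, len(col_indices)], dtype=np.int64)
        return csc_matrix((col_data, col_indices, col_indptr), shape=(mat.shape[0], 1))

    dense = np.array([[0, 2, 0],
                      [1, 0, 3],
                      [0, 4, 0]], dtype=np.int64)
    mat = csc_matrix(dense)
    # the direct extraction matches an ordinary column slice
    assert (extract_column(mat, 1).toarray() == mat[:, [1]].toarray()).all()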