def batch_insert(docs, doc_types, authors, recipients):
    '''
    Inserts the documents in batches

    :param docs: rows for the docs table
    :param doc_types: rows for the doc_types table (doc_id, doc_type, weight)
    :param authors: rows for the authors table (doc_id, author)
    :param recipients: rows for the recipients table (doc_id, recipient)
    :return:
    '''

    db2 = Database("TOB_FULL")
    db2.batch_insert('docs',
                     ['id', 'tid', 'timestamp', 'year', 'date_orig', 'title', 'collection_id',
                      'pages', 'no_docs', 'availability'],
                     docs)
    db2.batch_insert('doc_types', ['doc_id', 'doc_type', 'weight'], doc_types)
    db2.batch_insert('authors', ['doc_id', 'author'], authors)
    db2.batch_insert('recipients', ['doc_id', 'recipient'], recipients)
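
# A minimal usage sketch for batch_insert (not part of the pipeline). It assumes, based on the
# tokens insert in store_vocabulary_slice below, that Database.batch_insert expects one dict per
# row keyed by the column names; all sample values here are illustrative only.
def _batch_insert_example():
    docs = [{
        'id': 1, 'tid': 'sample_tid', 'timestamp': 631152000, 'year': 1990,
        'date_orig': '1990-01-01', 'title': 'Sample memo', 'collection_id': 5,
        'pages': 3, 'no_docs': 1, 'availability': 'public'
    }]
    doc_types = [{'doc_id': 1, 'doc_type': 'memo', 'weight': 1.0}]
    authors = [{'doc_id': 1, 'author': 'Doe, J.'}]
    recipients = [{'doc_id': 1, 'recipient': 'Roe, R.'}]
    batch_insert(docs, doc_types, authors, recipients)
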
def store_vocabulary_slice(data, indices, indptr, vocabulary_slice, ngram, vocabulary_offset,
                           add_new_terms, use_sections=False):
    '''
    Iterates through the vocabulary slice processed so far and stores every token
    a) in the tokens table of tob_full (token, token_reversed, id, ngram, total)
    b) as a compressed sparse matrix on disk

    :param data: raw buffer of the CSR data array of the document-term counts
    :param indices: raw buffer of the CSR indices array
    :param indptr: raw buffer of the CSR indptr array
    :param vocabulary_slice: dict mapping each token in this slice to its global id
    :param ngram: ngram length of the tokens in this slice
    :param vocabulary_offset: global id of the first token in this slice (subtracted to get the local column index)
    :param add_new_terms: if True, size the matrix to the full vocabulary rather than just this slice
    :param use_sections: if True, store section-level vectors and skip the database inserts
    :return:
    '''

    print("finished tokenizing. storing vocabulary slice.")

    # parse to int (may not be necessary)
    data = np.frombuffer(data, dtype=np.int64)
    indices = np.frombuffer(indices, dtype=np.int64)
    indptr = np.frombuffer(indptr, dtype=np.int64)

    # if adding new terms, the temp matrix has to have as many columns as the vocabulary as a whole,
    # not just the current vocabulary slice
    if add_new_terms:
        shape = (len(indptr) - 1, len(load_vocabulary_trie(ngram)))
    else:
        shape = (len(indptr) - 1, len(vocabulary_slice))
    temp_matrix = csr_matrix((data, indices, indptr), shape=shape, dtype=np.int64)

    # get global tfidf weights here
    from IPython import embed
    embed()

    temp_matrix = temp_matrix.tocsc()
    print("temp matrix")
    print("shape", temp_matrix.shape)
    print("indptr, voc slice", len(indptr), len(vocabulary_slice))
    print("nnz", temp_matrix.getnnz())
    print("len, sum of data", len(data), np.sum(data))

    db = Database("TOB_FULL")
    tokens = []
    for token in vocabulary_slice:

        # every 20,000 tokens: run a quality check on the first stored vector and flush the batch to the db
        if len(tokens) >= 20000:
            print("Quality control on first token vector")
            test_vector = get_ngram_vector(tokens[0]['token'])
            print("token: ", tokens[0]['token'], " total db: ", tokens[0]['total'],
                  "total vector ", test_vector.sum(), "Shape: ", test_vector.shape,
                  " nnz: ", test_vector.getnnz(), "indptr: ", test_vector.indptr,
                  " data len ", len(test_vector.data), " indices len ", len(test_vector.indices))
            if not use_sections:
                db.batch_insert('tokens', ['token', 'token_reversed', 'id', 'ngram', 'total'], tokens)
            tokens = []

        token_id = vocabulary_slice[token]

        # extract indptr, data, and indices directly instead of forming a column slice first
        # (the column slice takes about 3 secs per term)
        # subtract the vocabulary offset to get the correct local column index
        indptr_token_start = temp_matrix.indptr[token_id - vocabulary_offset]
        indptr_token_end = temp_matrix.indptr[token_id + 1 - vocabulary_offset]
        indices_token = temp_matrix.indices[indptr_token_start:indptr_token_end]
        data_token = temp_matrix.data[indptr_token_start:indptr_token_end]
        indptr_token = np.array([0, len(indices_token)], dtype=np.int64)

        # if add_new_terms:
        #     shape = (len(load_vocabulary_trie(ngram)), 1)
        # else:
        shape = (temp_matrix.shape[0], 1)
        token_vector = csc_matrix((data_token, indices_token, indptr_token), shape=shape)

        # to compress directory: tar -c tokens | pv --size `du -csh tokens | grep total | cut -f1` | pigz -9 > tokens.tar.gz
        hash_path = hashlib.sha256(token.encode()).hexdigest()
        if use_sections:
            hash_path += '_sections'
        token_path = PATH_TOKENS + '{}/{}/{}/{}/'.format(hash_path[0], hash_path[1], hash_path[2], hash_path[3])
        if not os.path.exists(token_path):
            os.makedirs(token_path)
        store_csr_matrix_to_file(token_vector, token_path + hash_path, compressed=True)

        if not use_sections:
            tokens.append({
                'token': token,
                'token_reversed': token[::-1],
                'id': token_id,
                'ngram': ngram,
                'total': np.sum(data_token)
            })

    if not use_sections:
        db.batch_insert('tokens', ['token', 'token_reversed', 'id', 'ngram', 'total'], tokens)
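
# Self-contained sketch of the column extraction used in store_vocabulary_slice: instead of
# slicing temp_matrix[:, col] (which takes about 3 seconds per term on the full matrix), the loop
# above reads the CSC internals (indptr/indices/data) directly. The helper and the toy matrix
# below are illustrative only and not part of the pipeline.
def _extract_column_example():
    import numpy as np
    from scipy.sparse import csc_matrix

    def extract_column(mat, col):
        # build a single-column CSC matrix straight from mat's raw arrays
        start, end = mat.indptr[col], mat.indptr[col + 1]
        col_data = mat.data[start:end]
        col_indices = mat.indices[start:end]
        col_indptr = np.array([0, len(col_indices)], dtype=np.int64)
        return csc_matrix((col_data, col_indices, col_indptr), shape=(mat.shape[0], 1))

    dense = np.array([[0, 2, 0],
                      [1, 0, 3],
                      [0, 4, 0]], dtype=np.int64)
    mat = csc_matrix(dense)
    # the direct extraction matches an ordinary column slice
    assert (extract_column(mat, 1).toarray() == mat[:, [1]].toarray()).all()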