Example #1
def get_all_tokens_worker(entry_queue, results_queue, ngram):

    token_counter = Counter()

    # for ngrams of length 2+, the (n-1)-gram vocabulary is needed to validate candidates
    if ngram >= 2:
        vocabulary = load_vocabulary_trie(ngram - 1)

    while True:
        entry = entry_queue.get()
        if entry == FLAG_ALL_DONE:
            results_queue.put(token_counter)
            results_queue.put(FLAG_WORKER_FINISHED_PROCESSING)
            break
        else:
            # backpressure: pause if the results queue backs up or the local counter gets very large
            if results_queue.qsize() > 30 or len(token_counter) > 10000000:
                print("Sleeping. Results qsize: ", results_queue.qsize())
                time.sleep(10)

            # flush the partial counts to the results queue and start a fresh counter
            if len(token_counter) > 1000000 and results_queue.qsize() < 5:
                results_queue.put(token_counter)
                token_counter = Counter()

            # read the OCR text and count validated ngram tokens
            with open(entry, encoding='cp1252', errors='ignore') as ocr_file:
                ocr = ocr_file.read()
            for token in ngram_generator(ocr, ngram):
                if ngram == 1 or check_ngram_validity(token, vocabulary, ngram):
                    token_counter[" ".join(token)] += 1
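
# A minimal sketch of how a worker like the one above is typically driven.
# FLAG_ALL_DONE, FLAG_WORKER_FINISHED_PROCESSING and get_all_tokens_worker come
# from the snippet/project; the wiring below is illustrative, not the project's
# exact driver.
import multiprocessing as mp
from collections import Counter

def run_token_workers(entries, ngram, number_of_processes=4):
    entry_queue, results_queue = mp.Queue(), mp.Queue()
    for entry in entries:
        entry_queue.put(entry)
    for _ in range(number_of_processes):
        entry_queue.put(FLAG_ALL_DONE)                 # one sentinel per worker
        mp.Process(target=get_all_tokens_worker,
                   args=(entry_queue, results_queue, ngram)).start()

    totals, finished = Counter(), 0
    while finished < number_of_processes:
        result = results_queue.get()
        if result == FLAG_WORKER_FINISHED_PROCESSING:
            finished += 1
        else:
            totals.update(result)                      # merge a partial Counter
    return totals
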
Example #2
def distinctive_terms_overall(main_name):

    global_totals = get_vocabulary_totals(1)
    vocabulary_trie = load_vocabulary_trie(1)

    local_vocabulary, local_totals = get_vocabulary_and_totals(main_name)

    global_totals_localized = np.zeros(len(local_vocabulary['id_to_token']))
    for token in local_vocabulary['token_to_id']:
        local_token_id = local_vocabulary['token_to_id'][token]
        global_token_id = vocabulary_trie[token]
        global_totals_localized[local_token_id] = global_totals[
            global_token_id]

    print(len(global_totals), len(local_totals), len(global_totals_localized))

    distinctive_terms = get_distinctive_terms(local_totals,
                                              global_totals_localized,
                                              local_vocabulary)
    print(distinctive_terms)

    db = Database("TOB_NETWORKS")
    con, cur = db.connect()
    cur.execute('SELECT DISTINCT(tid) as tid from {}_docs'.format(main_name))
    tids = [row['tid'] for row in cur.fetchall()]
    totals2 = get_totals(tids, local_vocabulary)
    dist = get_distinctive_terms(totals2, global_totals_localized,
                                 local_vocabulary)
    print("\n", dist)

    totals3 = get_totals(tids, local_vocabulary, tf=True)
    dist = get_distinctive_terms(totals3, global_totals_localized,
                                 local_vocabulary)
    print("\n", dist)
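
# get_distinctive_terms is not shown in these snippets. A common choice for this
# kind of local-vs-global comparison is Dunning's log-likelihood (G2); the helper
# below is a hypothetical sketch over two aligned count vectors, not the
# project's implementation.
import numpy as np

def dunning_g2(local_counts, global_counts):
    a = np.asarray(local_counts, dtype=np.float64)
    b = np.asarray(global_counts, dtype=np.float64)
    n1, n2 = a.sum(), b.sum()
    e1 = n1 * (a + b) / (n1 + n2)          # expected counts under a shared model
    e2 = n2 * (a + b) / (n1 + n2)
    with np.errstate(divide='ignore', invalid='ignore'):
        t1 = np.where(a > 0, a * np.log(a / e1), 0.0)
        t2 = np.where(b > 0, b * np.log(b / e2), 0.0)
    return 2 * (t1 + t2)

# scores = dunning_g2(local_totals, global_totals_localized)
# top_ids = scores.argsort()[-20:][::-1]   # ids of the 20 most distinctive tokens
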
Example #3
    def __get_distinctive_terms(self):

        try:
            distinctive_terms = pickle.load(
                open(
                    PATH_TOKENIZED +
                    'networks/{}_distinctive_terms.pickle'.format(
                        self.main_name), 'rb'))
        except IOError:

            distinctive_terms = {'overall': None, 'nodes': {}, 'edges': {}}

            global_totals = get_vocabulary_totals(1)
            vocabulary_trie = load_vocabulary_trie(1)

            totals_legacy = np.zeros(len(self.vocabulary['ordered']))
            for token_id, token in enumerate(self.vocabulary['ordered']):
                token_id_global = vocabulary_trie[token]
                token_total_global = global_totals[token_id_global]
                totals_legacy[token_id] = token_total_global

            distinctive_terms['overall'] = self.__calculate_distinctive_terms(
                self.totals, totals_legacy)
            print("overall", distinctive_terms['overall'])
            names_set = set(
                [n.lower() for name in self.nodes for n in name.split(',')])
            for term in distinctive_terms['overall']:
                if term[0] not in names_set:
                    print(term)

            for node in self.nodes:
                node_dtm = self.apply_filter(nodes=[node])
                node_counts = np.array(node_dtm.sum(axis=0)).flatten()
                distinctive_terms['nodes'][
                    node] = self.__calculate_distinctive_terms(
                        node_counts, self.totals)
                print("\n", node, distinctive_terms['nodes'][node])

            for edge in self.edges:
                edge_dtm = self.apply_filter(edges=[edge])
                # skip edges without content
                if edge_dtm.sum() == 0: continue
                edge_counts = np.array(edge_dtm.sum(axis=0)).flatten()
                distinctive_terms['edges'][
                    edge] = self.__calculate_distinctive_terms(
                        edge_counts, self.totals)
                print("\n", edge, distinctive_terms['edges'][edge])

            pickle.dump(
                distinctive_terms,
                open(
                    PATH_TOKENIZED +
                    'networks/{}_distinctive_terms.pickle'.format(
                        self.main_name), 'wb'))

        return distinctive_terms
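
# The try/except IOError above is a simple on-disk cache: compute once, pickle,
# and reload on later runs. The same pattern as a generic helper (hypothetical,
# not part of the project):
import pickle

def load_or_compute(path, compute_fn):
    try:
        with open(path, 'rb') as f:
            return pickle.load(f)
    except IOError:
        result = compute_fn()
        with open(path, 'wb') as f:
            pickle.dump(result, f)
        return result
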
Example #4
def add_terms():


    for ngram in range(1, 3):
        # update vocabulary trie
        # this messes up the ids, but I no longer use them because I no longer use the doc-term matrices
        start = time.time()
        vocabulary = load_vocabulary_trie(ngram)
        keys = vocabulary.keys() + ADDED_TOKENS[ngram]
        vocabulary_new = Trie(keys)
        vocabulary_new.save(PATH_TOKENIZED + 'tries/full_vocabulary_{}_grams.trie'.format(ngram))

        full_db_to_tokens(ngram, add_new_terms=set(ADDED_TOKENS[ngram]))
        print("adding new tokens for {}-gram took {}.".format(ngram, time.time() - start))
Example #5
def test(ngram=1):


    voc = load_vocabulary_trie(ngram)

    count = 0
    with open('{}grams.txt'.format(ngram), 'w') as outfile:
        for key in sorted(voc.keys()):
            validity = ""
            if ngram == 1:
                validity = str(check_1gram_validity(key))
            elif ngram == 2:
                # bigrams consisting of two numbers (e.g. page references) are treated as invalid
                if re.match('[0-9]+ [0-9]+[a-z]*', key):
                    validity = "False"
                else:
                    validity = "True"
            outfile.write(validity + "\t" + key + "\n")
            if validity == "True":
                count += 1

    print(len(voc), count)
Example #6
    def __get_vocabulary_and_totals(self):

        try:
            vocabulary_dict = pickle.load(
                open(
                    PATH_TOKENIZED +
                    'networks/{}_vocabulary.pickle'.format(self.main_name),
                    'rb'))
            totals = np.load(
                open(
                    PATH_TOKENIZED +
                    'networks/{}_totals.npy'.format(self.main_name), 'rb'))
        except IOError:

            from tobacco.frequencies_preprocessing.preprocessing_docs import get_ocr_by_tid

            vocabulary_trie = load_vocabulary_trie(1)

            db = Database("TOB_NETWORKS")
            con, cur = db.connect()

            totals = np.zeros(len(vocabulary_trie), dtype=np.int64)
            vocabulary_dict = {
                'token_to_id': {},
                'id_to_token': {},
                'ordered': []
            }

            cur.execute('SELECT DISTINCT(tid) as tid FROM {}_sections'.format(
                self.main_name))
            while True:
                row = cur.fetchone()
                if not row: break
                else:
                    text = get_ocr_by_tid(row['tid'], return_bytearray=False)
                    for token in text.split():
                        if token in vocabulary_trie:
                            totals[vocabulary_trie[token]] += 1

            token_id = 0
            for i in totals.argsort()[-5000:][::-1]:
                token = vocabulary_trie.restore_key(i)
                if token in STOP_WORDS or totals[i] < 10:
                    continue
                else:

                    vocabulary_dict['id_to_token'][token_id] = token
                    vocabulary_dict['token_to_id'][token] = token_id
                    vocabulary_dict['ordered'].append(token)
                    assert vocabulary_dict['ordered'][
                        token_id] == vocabulary_dict['id_to_token'][token_id]
                    token_id += 1

            assert len(vocabulary_dict['ordered']) == len(
                vocabulary_dict['id_to_token']) == len(
                    vocabulary_dict['token_to_id'])

            # store totals of main person
            totals_name = np.zeros(len(vocabulary_dict['id_to_token']))

            for token in vocabulary_dict['token_to_id']:
                token_id_global = vocabulary_trie[token]
                token_id_local = vocabulary_dict['token_to_id'][token]
                totals_name[token_id_local] = totals[token_id_global]
            np.save(
                open(
                    PATH_TOKENIZED +
                    'networks/{}_totals.npy'.format(self.main_name), 'wb'),
                totals_name)

            pickle.dump(
                vocabulary_dict,
                open(
                    PATH_TOKENIZED +
                    'networks/{}_vocabulary.pickle'.format(self.main_name),
                    'wb'))

        return vocabulary_dict, totals
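
# The totals.argsort()[-5000:][::-1] idiom above yields the ids of the 5,000 most
# frequent tokens, highest count first. Tiny illustration:
import numpy as np

totals_demo = np.array([3, 50, 7, 12])
print(totals_demo.argsort()[-2:][::-1])   # [1 3]: the ids with counts 50 and 12
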
Example #7
from tobacco.utilities.ocr import load_vocabulary_trie

GLOBAL_IDF_WEIGHTS = np.load(PATH_TOKENIZED + 'idf_weights.npy')
'''
8/15/17
A recap of basic linear algebra w/r/t sparsity:
sklearn's NMF (and presumably other decompositions) offers l1 and l2 regularization
that can be weighted against each other (e.g. 50% l1, 50% l2).

l1 regularization: reduces the number of non-zero weights
l2 regularization: reduces the squared sum of weights

i.e. to enforce sparsity in the terms, use l1 regularization.
'''
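
# A minimal sketch of the l1/l2 trade-off described above, on a toy non-negative
# matrix (values and parameters are hypothetical; recent sklearn versions take
# alpha_W/alpha_H, older ones a single alpha):
import numpy as np
from scipy.sparse import random as sparse_random
from sklearn.decomposition import NMF

doc_term_demo = sparse_random(100, 500, density=0.05, format='csr', random_state=0)
nmf = NMF(n_components=10, alpha_W=0.1, l1_ratio=0.5, init='nndsvd', random_state=0)
doc_topic = nmf.fit_transform(doc_term_demo)
topic_term = nmf.components_
print((topic_term == 0).mean())   # the zero fraction grows as the l1 share increases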

VOCABULARY = load_vocabulary_trie(1)


def tokenize_sections(output_sections,
                      vocabulary,
                      log_likelihoods,
                      tokenizer_type='count',
                      use_global_idf=True):

    start = time.time()

    sections = [i[7] for i in output_sections]
    # map term to token_id
    vocabulary_dict = {token: idx for idx, token in enumerate(vocabulary)}

    indices = array.array(str("i"))
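
# full_db_to_tokens below (and the truncated tokenizer above) build csr matrices
# from plain array.array buffers. The pattern, on toy data (hypothetical counts;
# like the original code, this assumes a 64-bit C long for the 'l' typecode):
import array
import numpy as np
from scipy.sparse import csr_matrix

docs_demo = [{0: 2, 3: 1}, {1: 4}]                 # doc -> {token_id: count}
data, indices, indptr = array.array('l'), array.array('l'), array.array('l')
indptr.append(0)
for doc in docs_demo:
    for token_id, count in doc.items():
        indices.append(token_id)
        data.append(count)
    indptr.append(len(indices))                    # one row boundary per document
dtm_demo = csr_matrix((np.frombuffer(data, dtype=np.int64),
                       np.frombuffer(indices, dtype=np.int64),
                       np.frombuffer(indptr, dtype=np.int64)),
                      shape=(len(docs_demo), 4))
print(dtm_demo.toarray())                          # [[2 0 0 1] [0 4 0 0]]
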
def full_db_to_tokens(ngram=1, use_sections=False, add_new_terms=None):
    """ Stores the full database as a csc doc-term matrix (one for each ngram level)

    :param ngram: 1-5
    :param use_sections: use 200 word sections if true, else use full documents
    :return:

    Step 1: Take a slice of the vocabulary and tokenize all documents into a csr matrix. Then transform
            that csr matrix into a csc matrix.
    Step 2: Take all the csc slices and stack them next to each other.

    Rationale: Creating the full csr matrix at once and then turning it to a csc matrix uses absurd amounts of memory
    Note to future: yes, you have tried and no, it didn't work.

    """


    # Load vocabulary according to ngram level
    vocabulary = load_vocabulary_trie(ngram)

    # Slice the vocabulary into slices with length n, depending on the ngram level.
#    ngram_to_interval = {1: 130000, 2: 2600000, 3: 3100000, 4:3600000, 5: 2500000}
    ngram_to_interval = {1: 140000, 2: 1000000, 3: 3100000, 4:3600000, 5: 2500000}
    voc_interval = ngram_to_interval[ngram]
    # if just adding new terms, then we don't need to slice the vocabulary
    if add_new_terms: voc_interval = 100000000
    print("{} vocabulary slices to process.".format(len(range(0, len(vocabulary)-1, voc_interval))))

    for voc_idx in range(0, len(vocabulary)-1, voc_interval):
        print("Working on voc_idx {} out of {}".format(voc_idx, len(vocabulary)))

        # 2/1/17: just to make it clear: voc_idx is the vocabulary offset
        vocabulary_offset = voc_idx

        vocabulary_slice = {}
        if add_new_terms:
            vocabulary_slice = {new_term: vocabulary[new_term] for new_term in add_new_terms}
            print(vocabulary_slice)
        else:
            for i in range(voc_idx, voc_idx + voc_interval):
                try: vocabulary_slice[vocabulary.restore_key(i)] = i
                except KeyError: pass


        # Initialize arrays for indices and indptr, add first element to indptr
        data = array.array(str("l"))
        indices = array.array(str("l"))
        indptr = array.array(str("l"))
        indptr.append(0)


        entry_queue = get_entry_queue()
        for i in range(NUMBER_OF_PROCESSES): entry_queue.put(FLAG_ALL_DONE)
        print("entry queue size", entry_queue.qsize())

        results_queue = mp.Queue()

        # Initialize and start processes
        for process_n in range(NUMBER_OF_PROCESSES):
            p = mp.Process(target=tokenize_document_worker,
                           args=(entry_queue, results_queue, ngram, vocabulary_slice,
                                 vocabulary_offset, use_sections))
            p.start()

        processors_finished = 0
        # next id to be added to the results
        current_id = 0
        # storage dict for returned but not yet added results
        pending_results = {}

        while True:
            new_result = results_queue.get()
            if new_result == FLAG_WORKER_FINISHED_PROCESSING:
                processors_finished += 1
                if processors_finished == NUMBER_OF_PROCESSES:
                    assert entry_queue.qsize() == 0
                    store_vocabulary_slice(data, indices, indptr, vocabulary_slice, ngram, vocabulary_offset, add_new_terms, use_sections)
                    break

            else:
                # all results first get added to the pending_results dict
                pending_results[new_result['id']] = {'indices': new_result['indices'],
                                                     'data': new_result['data']}

                while True:
                    # then, if the next id to be added is in the result,
                    # the result gets moved from the dict to the indices array
                    if current_id in pending_results:
                        if current_id % 10000 == 0:
                            print("Current id: {}. qsize: {}. Data length: {}.".format(current_id, results_queue.qsize(), len(data)))
                            print(len(indptr))

                        if use_sections:
                            for section_id in range(len(pending_results[current_id]['indices'])):
                                indices += pending_results[current_id]['indices'][section_id]
                                data += pending_results[current_id]['data'][section_id]
                                indptr.append(len(indices))
                        else:
                            indices += pending_results[current_id]['indices']
                            data += pending_results[current_id]['data']
                            indptr.append(len(indices))
                        pending_results.pop(current_id, None)
                        current_id += 1
                    else:
                        break
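
# Step 2 of the docstring above (stacking the per-slice csc matrices next to each
# other) sketched with scipy on toy matrices; the project's actual storage layout
# may differ:
import numpy as np
from scipy.sparse import csr_matrix, hstack

slice_a = csr_matrix(np.array([[1, 0], [0, 2]])).tocsc()
slice_b = csr_matrix(np.array([[0, 3], [4, 0]])).tocsc()
full_dtm_demo = hstack([slice_a, slice_b], format='csc')
print(full_dtm_demo.shape)   # (2, 4): same documents, columns from both slices
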
def store_vocabulary_slice(data, indices, indptr, vocabulary_slice, ngram, vocabulary_offset, add_new_terms, use_sections=False):
    '''
    Iterates through the vocabulary slice processed so far and stores every token
    a) in the tokens table of tob_full (token, token_reversed, id, ngram, total)
    b) as a compressed sparse matrix on disk

    :param data: token counts (array.array)
    :param indices: document/section indices (array.array)
    :param indptr: row pointers (array.array)
    :param vocabulary_slice: dict mapping token -> global token id for this slice
    :param ngram: ngram level
    :param vocabulary_offset: id of the first token in this slice
    :param add_new_terms: set of newly added terms, if any
    :param use_sections: store section-level instead of document-level vectors
    :return:
    '''

    print("finished tokenizing. storing vocabulary slice.")

    # view the array.array buffers as numpy int64 arrays (assumes a 64-bit C long)
    data = np.frombuffer(data, dtype=np.int64)
    indices = np.frombuffer(indices, dtype=np.int64)
    indptr = np.frombuffer(indptr, dtype=np.int64)

    # if adding new terms, the temp matrix has to have as many columns as the vocabulary as a whole, not just the
    # current vocabulary slice
    if add_new_terms:
        shape = (len(indptr) - 1, len(load_vocabulary_trie(ngram)))
    else:
        shape = (len(indptr) - 1, len(vocabulary_slice))

    temp_matrix = csr_matrix((data, indices, indptr), shape=shape, dtype=np.int64)

    # TODO: get global tfidf weights here

    temp_matrix = temp_matrix.tocsc()

    print("temp matrix")
    print("shape", temp_matrix.shape)
    print("indptr, voc slice", len(indptr), len(vocabulary_slice))
    print("nnz", temp_matrix.getnnz())
    print("len, sum of data", len(data), np.sum(data))


    db = Database("TOB_FULL")

    tokens = []

    for token in vocabulary_slice:

        if len(tokens) >= 20000:
            print("Quality control on first token vector")
            test_vector = get_ngram_vector(tokens[0]['token'])
            print("token: ", tokens[0]['token'], " total db: ", tokens[0]['total'], "total vector ", test_vector.sum(), "Shape: ", test_vector.shape, " nnz: ",
                  test_vector.getnnz(), "indptr: ", test_vector.indptr, " data len ",  len(test_vector.data),
                  " indices len ", len(test_vector.indices))

            if not use_sections:
                db.batch_insert('tokens',
                                ['token', 'token_reversed', 'id', 'ngram', 'total'],
                                tokens)
                tokens = []

        id = vocabulary_slice[token]


        # extract indptr, data, and indices directly instead of forming a column slice first
        # (forming the column slice takes about 3 secs per term)
        # subtract the vocabulary offset to get the correct local column id
        indptr_token_start = temp_matrix.indptr[id - vocabulary_offset]
        indptr_token_end = temp_matrix.indptr[id+1 - vocabulary_offset]

        indices_token = temp_matrix.indices[indptr_token_start:indptr_token_end]
        data_token = temp_matrix.data[indptr_token_start:indptr_token_end]
        indptr_token = np.array([0, len(indices_token)], dtype=np.int64)


        # if add_new_terms:
        #     shape = (len(load_vocabulary_trie(ngram)), 1)
        # else:
        shape = (temp_matrix.shape[0], 1)
        token_vector = csc_matrix((data_token, indices_token, indptr_token), shape=shape)

        # to compress directory: tar -c tokens | pv --size `du -csh tokens | grep total | cut -f1` | pigz -9 > tokens.tar.gz
        hash_path = hashlib.sha256(token.encode()).hexdigest()
        if use_sections:
            hash_path += '_sections'
        token_path = PATH_TOKENS + '{}/{}/{}/{}/'.format(hash_path[0], hash_path[1], hash_path[2], hash_path[3])
        if not os.path.exists(token_path): os.makedirs(token_path)

        store_csr_matrix_to_file(token_vector, token_path + hash_path, compressed=True)

        if not use_sections:
            tokens.append({
                'token': token,
                'token_reversed': token[::-1],
                'id': id,
                'ngram': ngram,
                'total': np.sum(data_token)
            })

    if not use_sections:
        db.batch_insert('tokens',
                        ['token', 'token_reversed', 'id', 'ngram', 'total'],
                        tokens)
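
# How a stored token vector can be located again: the directory is derived from
# the sha256 hash of the token, nested by its first four hex characters.
# PATH_TOKENS comes from the project; load_csr_matrix_from_file is a hypothetical
# counterpart to store_csr_matrix_to_file (the project reads vectors back via
# get_ngram_vector).
import hashlib

def token_vector_path(token, use_sections=False):
    hash_path = hashlib.sha256(token.encode()).hexdigest()
    if use_sections:
        hash_path += '_sections'
    return PATH_TOKENS + '{}/{}/{}/{}/'.format(*hash_path[:4]) + hash_path

# token_vector = load_csr_matrix_from_file(token_vector_path('nicotine'), compressed=True)
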
def get_globals(globals_type='frequencies', load_only_docs=False):
    """
    Returns all the globals necessary to process a frequencies or text_passages global
    use load_only_docs to load only the docs (but not section) filters and totals in frequency mode


    Mode: frequencies, load: docs and sections. 14s
    Mode: frequencies, load: docs                2s

    :param globals_type:
    :return:
    """

    s = time.time()
    if globals_type == 'frequencies':
        globals_dict = {
            'filters': {
                'docs': {
                    'doc_type': get_doc_type_filters(return_type='csc', docs_or_sections='docs'),       #  66 MB
                    'collection': get_collection_filters(return_type='csc', docs_or_sections='docs'),   # 121 MB
                    'availability': get_availability_filters(return_type='csc', docs_or_sections='docs')#  40 MB
                },
            },

            'totals': {
                'totals': {
                    'docs': {
                        'np': get_totals_vector(docs_or_sections='docs', return_type='np_int32'),        # 43 MB
                    },
                },
                'collection':{
                    'docs': get_collection_totals_vectors(docs_or_sections='docs')
                },
                'doc_type': {
                    'docs': get_doc_type_totals_vectors(docs_or_sections='docs')
                }
            },

            'vocabulary_totals': get_vocabulary_totals(1),
            'vocabulary_trie': load_vocabulary_trie(1),  # 1 MB
            'vocabulary_set': load_vocabulary_trie(1, return_type='set'),  # 15 MB
            'collections_and_idx_dict': get_col_name_and_idx_dict(),
            'doc_type_and_idx_dict': get_doc_types_to_idx_dict(),

            'year_parts_id_list': {
                'docs': get_year_doc_id_list('docs'),  # 45 MB
            }
        }

        if not load_only_docs:
            globals_dict['filters']['sections'] = {
                'doc_type': get_doc_type_filters(return_type='csc', docs_or_sections='sections'),        # 1009 MB
                'collection': get_collection_filters(return_type='csc', docs_or_sections='sections'),    #  828 MB
                'availability': get_availability_filters(return_type='csc', docs_or_sections='sections') #  427 MB
            }
            globals_dict['totals']['totals']['sections'] = {}
            globals_dict['totals']['totals']['sections']['np'] = get_totals_vector(docs_or_sections='sections', return_type='csc')  # 341 MB
            globals_dict['year_parts_id_list']['sections'] = get_year_doc_id_list('sections')           # 1 MB ??this seems wrong ??
            globals_dict['totals']['collection']['sections'] = get_collection_totals_vectors(docs_or_sections='sections')
            globals_dict['totals']['doc_type']['sections'] = get_doc_type_totals_vectors(docs_or_sections='sections')

    #     if globals_type == 'frequencies':
#         globals_dict = {
#             'filters':{
#                 'docs':{
#                     'doc_type': get_doc_type_filters(return_type='csc', docs_or_sections='docs'),            # 66 MB
#                     'collection': get_collection_filters(return_type='csc', docs_or_sections='docs'),        # 121 MB
#                     'availability': get_availability_filters(return_type='csc', docs_or_sections='docs')     #  40 MB
#                    },
#                 'sections':{
#                     'doc_type': get_doc_type_filters(return_type='csc', docs_or_sections='sections'),       # 1009 MB
#                     'collection': get_collection_filters(return_type='csc', docs_or_sections='sections'),   # 828 MB
#                     'availability': get_availability_filters(return_type='csc', docs_or_sections='sections')# 427 MB
#                 }
#             },
#
#             'totals':{
#                 'totals':{
#                     'docs':{
#                         'np': csc_to_np_int32(get_totals_vector(docs_or_sections='docs')),                 # 43 MB
#                     },
#                     'sections':{
#                         'np': csc_to_np_int32(get_totals_vector(docs_or_sections='sections')),             # 341 MB
#                     },
#                 },
#             },
#
#             # 8/31/18 I don't think these are used -> commented out for the time being.
# #            'year_doc_matrix':{
# #                'docs': get_year_doc_transformation_matrix(docs_or_sections='docs'),                        # 170 MB
# #                'sections': get_year_doc_transformation_matrix(docs_or_sections='sections')                 # 1360 MB
# #            },
#
#             'vocabulary_totals': get_vocabulary_totals(1),
#             'vocabulary_trie': load_vocabulary_trie(1),                                                     # 1 MB
#             'vocabulary_set': load_vocabulary_trie(1, return_type='set'),                                   # 15 MB
#
#             'year_parts_id_list':{
#                 'docs': get_year_doc_id_list('docs'),                                                       # 45 MB
#                 'sections': get_year_doc_id_list('sections')                                                # 1 MB ?? why so much less than docs??
#             }
#         }
    elif globals_type == 'passages':
        globals_dict = {
            'filters':{
                'sections':{
                    'doc_type': get_doc_type_filters(return_type='csc', docs_or_sections='sections'),       # 958 MB
                    'collection': get_collection_filters(return_type='csc', docs_or_sections='sections'),   # 828 MB
                    'availability': get_availability_filters(return_type='csc', docs_or_sections='sections')# 427 MB
                }
            },

            'doc_types_and_idx_dict': get_doc_types_to_idx_dict(),
            'collections_and_idx_dict': get_col_name_and_idx_dict(),
            'section_to_doc_and_offset_arr': get_section_to_doc_and_offset_arr(),                           # 1024 MB

            'vocabulary_totals': get_vocabulary_totals(1),
            'vocabulary_trie': load_vocabulary_trie(1),                                                     # 1 MB
            'vocabulary_set': load_vocabulary_trie(1, return_type='set'),                                   # 15 MB

            'year_parts_id_list':{
                'docs': get_year_doc_id_list('docs'),                                                       # 45 MB
                'sections': get_year_doc_id_list('sections')                                                # 1 MB ?? why so much less than docs??
            }
        }
    else:
        raise ValueError("globals_type must be 'frequencies' or 'passages', not {}".format(globals_type))

#    print("Loading globals in {} mode took: {}".format(globals_type, time.time() - s))

    return globals_dict
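
# Minimal usage sketch; the keys follow the structure built above.
globals_dict = get_globals(globals_type='frequencies', load_only_docs=True)
doc_type_filters = globals_dict['filters']['docs']['doc_type']
totals_docs_np = globals_dict['totals']['totals']['docs']['np']
vocabulary_trie = globals_dict['vocabulary_trie']
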
import time
from collections import namedtuple

import numpy as np
from tobacco.frequencies_preprocessing.preprocessing_globals_loader import get_globals
from tobacco.text_passages.text_passages_helper_process_year_of_sections import process_year_of_sections_cython
from tobacco.text_passages.text_passages_helper_search import parse_text_passages_tokens
from tobacco.frequencies_preprocessing.preprocessing_filters import get_active_filters_np
from tobacco.utilities.ocr import load_vocabulary_trie

VOCABULARY = load_vocabulary_trie(1, return_type='set')

from tobacco.frequencies.calculate_ngrams_class import NgramResult

# only use end_year, not start_year

Document = namedtuple('Document',
                      ['tid', 'title', 'date', 'year', 'collection'])
Passage = namedtuple('Passage', ['Document', 'text'])


def find_text_passages(tokens,
                       active_filters,
                       years_to_process,
                       passage_length,
                       globals,
                       logging=False,
                       insert_result_to_db=True):
    """ This is the main task to find text passages matching one or more search terms.

    The main processing itself is done year by year in the cython function process_year_of_sections_cython