def parse_text_passages_tokens(tokens):

    search_regexes = []
    intersection_vector = None
    first_token = tokens[0].strip('*')

    for token in tokens:

        # if wildcard, process wildcard
        if token.find('*') > -1:
            token_vector, token_regex = process_wildcard_token(token)

        # else: handle normal token
        else:
            try:
                token_vector = Vector().load_token_vector(token, return_type='np_uint8',
                                                          docs_or_sections='sections')
                token_regex = re.compile(r'\b{}\b'.format(token))
            # will throw a FileNotFoundError when the token does not exist. e.g. 'compound w'
            except FileNotFoundError:
                token_vector, token_regex = process_nonexistant_token(token)

        if token_vector is not None:
            if intersection_vector is None:
                intersection_vector = token_vector
            else:
                intersection_vector *= token_vector
        search_regexes.append(token_regex)

    return first_token, search_regexes, intersection_vector

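# Hedged usage sketch (not part of the original module): shows what parse_text_passages_tokens
# returns for a mixed plain/wildcard query. It assumes that section vectors for 'nicotine' and
# the wildcard 'addict*' exist on disk; the helper name _example_parse_query is hypothetical.
def _example_parse_query():
    first_token, search_regexes, intersection_vector = parse_text_passages_tokens(
        ['nicotine', 'addict*'])
    # first_token: 'nicotine' (any leading/trailing '*' stripped)
    # search_regexes: one compiled regex per token
    # intersection_vector: sections containing every token (element-wise product of the vectors)
    return first_token, search_regexes, intersection_vector
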
def get_doc_type_filters(return_type='csc', docs_or_sections='docs'):

    doc_types = get_dtype_dict()
    doc_type_filters = {}

    for doc_type in doc_types['valid']:
        for weight in [False]:
            if doc_type in [
                    'letter', 'report', 'memo', 'email', 'note', 'publication',
                    'report, scientific', 'advertisement', 'promotional material', 'budget',
                    'specification', 'budget_review', 'telex', 'news_article', 'agenda',
                    'report, market research', 'speech', 'presentation', 'minutes'
            ]:
                doc_type_filters[(doc_type, weight)] = get_filter(
                    doc_type, filter_type='doc_type', weight=weight,
                    return_type=return_type, docs_or_sections=docs_or_sections)
            else:
                doc_type_filters[(doc_type, weight)] = get_filter(
                    doc_type, filter_type='doc_type', weight=weight,
                    return_type='csc', docs_or_sections=docs_or_sections)

    for group in doc_types['groups']:
        for weight in [False]:
            file_name = '{}_{}_{}_{}'.format(group, 'doc_type', docs_or_sections, weight)
            file_path = Path(PATH_TOKENIZED, 'filters', file_name)
            try:
                group_filter = Vector().load_from_disk(file_path, return_type=return_type)
            except FileNotFoundError:
                print("creating group filter for: ", group)
                group_filter = None
                for doc_type in doc_types['groups'][group]:
                    if group_filter is None:
                        group_filter = doc_type_filters[(doc_type, weight)]
                    else:
                        group_filter += doc_type_filters[(doc_type, weight)]
                group_filter.save_to_disk(file_path)
            doc_type_filters[(group, weight)] = group_filter

            # if return_type == 'np':
            #     doc_type_filters[(group, weight)] = csc_bool_to_np_cython(group_filter)

    return doc_type_filters

def get_doc_type_totals_vector(doc_type_name, docs_or_sections='docs', return_type='csc'):
    """ Loads one doc_type totals vector

    >>> totals = get_doc_type_totals_vector('report', 'docs', 'csc')
    >>> totals
    <Document Vector of type csc with 1652572 elements and length 11303161.>

    :param doc_type_name:
    :param docs_or_sections:
    :param return_type:
    :return:
    """

    try:
        return Vector().load_totals_vector(doc_type_name.replace('/', '_'), 'doc_type',
                                           docs_or_sections, return_type)

    # added 6/9/17. This is all very awkward (i.e. loading filters every time).
    # It replaces an older solution that used filter_vector(), which is no longer available.
    # Filters get loaded every time because otherwise, the filters would be held in memory twice.
    except IOError:
        print("Creating doc type totals vector for: ", doc_type_name, docs_or_sections)
        filters = get_filters(return_type='np')
        totals_vector = csc_to_np_int32(get_totals_vector(docs_or_sections))
        active_doc_type_filters_np, _, _ = get_active_filters_np(
            active_filters={'doc_type': {doc_type_name}, 'collection': {}, 'availability': {}},
            FILTERS=filters, docs_or_sections=docs_or_sections, return_type=np.uint8)

        filtered_dt = totals_vector * active_doc_type_filters_np

        # um... yeah... ugly but it creates the required mx1 sparse vector
        vec = csc_matrix(csc_matrix(filtered_dt).T, dtype=np.int64)

        # totals[dt] = filter_vector(get_totals_vector(), {'doc_type': {dt}, 'collection': {}})
        store_csr_matrix_to_file(vec, PATH_TOKENIZED + 'totals/{}_{}'.format(
            doc_type_name.replace('/', '_'), docs_or_sections))

        return get_doc_type_totals_vector(doc_type_name, docs_or_sections, return_type)

def process_all():

    create_sqlite_table()

    # Select all tokens that appear at least 10,000 times and consist only of words that are
    # longer than one character and not purely numeric.
    terms = []
    db = Database('TOB_FULL')
    con, cur = db.connect()
    cur.execute('SELECT token from tokens where total > 10000;')
    for row in cur.fetchall():
        term = row['token']
        valid = True
        for word in term.split():
            if len(word) == 1:
                valid = False
            try:
                int(word)
                valid = False
            except ValueError:
                pass
        if valid:
            terms.append(term)
    print("Number of terms: {}".format(len(terms)))

    for collection in COL_NAME_TO_ID:
        col_id = COL_NAME_TO_ID[collection]
        filtered_collection_vector = FILTERS['doc_type'][
            ('internal communication', False)].copy().convert_to_datatype('np_uint8')
        filtered_collection_vector.filter_with(
            FILTERS['collection'][(col_id, False)].convert_to_datatype('np_uint8'))
        max_5p_filter = Vector().load_page_number_vector('max_5')
        print("pre", filtered_collection_vector)
        filtered_collection_vector.filter_with(max_5p_filter)
        print('post', filtered_collection_vector)

        if collection == 'msa_bat':
            totals = TOTALS_COL[5]
            for id in [6, 7, 8, 9, 10, 11, 15]:
                totals += TOTALS_COL[id]
            print(totals)
        else:
            totals = TOTALS_COL[col_id]
        filtered_totals_year_vector = totals.convert_to_year_array(
            filter_vec=filtered_collection_vector)

        for term in terms:
            find_and_store_policy(term, filtered_collection_vector,
                                  filtered_totals_year_vector, collection)

def get_ngram_vector(token, return_type='csc', return_sum=False, docs_or_sections='docs'):
    """ Loads the ngram vector of the token

    E.g. v = get_ngram_vector('nicotine', return_type='csc', docs_or_sections='docs')

    :param token: search token, string
    :param return_type: 'csc', 'np', 'uint8'
    :param return_sum: Whether or not to return the sum of the vector.
    :param docs_or_sections: 'docs' or 'sections'
    :return:
    """

    # to distribute the millions of stored ngram vectors, they were hashed.
    hash = hashlib.sha256(token.encode()).hexdigest()
    h = hash
    if docs_or_sections == 'sections':
        h += '_sections'
    token_path = Path(PATH_TOKENS, hash[0], hash[1], hash[2], hash[3], h)

    ngram_vector = Vector()
    ngram_vector.load_from_disk(token_path)
    ngram_vector.convert_to_datatype(return_type)

    if return_sum:
        return ngram_vector, ngram_vector.sum
    else:
        return ngram_vector

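# Minimal sketch (added for illustration, not part of the original module) of the directory
# sharding used above: the first four hex digits of the token's SHA-256 hash become four nested
# directories, which keeps any single directory from holding millions of vector files.
def _example_ngram_path(token='nicotine', docs_or_sections='docs'):
    h = hashlib.sha256(token.encode()).hexdigest()
    name = h + '_sections' if docs_or_sections == 'sections' else h
    # e.g. PATH_TOKENS/a/3/f/0/a3f0...<full hash>
    return Path(PATH_TOKENS, h[0], h[1], h[2], h[3], name)
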
def get_collection_totals_vector(collection_id, docs_or_sections, return_type='csc'):
    """ Load the totals vector for one collection

    :param collection_id: id of the collection
    :param docs_or_sections: "docs" or "sections"
    :param return_type: "csc" or "np" (csc sparse matrix or np array)
    :return:
    """

    try:
        return Vector().load_totals_vector(collection_id, 'collection', docs_or_sections, 'csc')
        # csc = load_csc_matrix_from_file(PATH_TOKENIZED + 'totals/{}_{}'.format(
        #     collection_id, docs_or_sections))

    except IOError:
        print("Creating totals vector for collection, type: ", collection_id, docs_or_sections)
        filters = get_filters(return_type='np')
        totals_vector = csc_to_np_int32(get_totals_vector(docs_or_sections))
        _, active_collection_filters_np, _ = get_active_filters_np(
            active_filters={'doc_type': {}, 'collection': {collection_id}, 'availability': {}},
            FILTERS=filters, docs_or_sections=docs_or_sections, return_type=np.uint8)

        filtered_dt = totals_vector * active_collection_filters_np

        # um... yeah... ugly but it creates the required mx1 sparse vector
        csc = csc_matrix(csc_matrix(filtered_dt).T, dtype=np.int64)
        store_csr_matrix_to_file(csc, PATH_TOKENIZED + 'totals/{}_{}'.format(
            collection_id, docs_or_sections))

        if return_type == 'csc':
            return csc_matrix(csc, dtype=np.int32)
        else:
            return csc_to_np_int32(csc)

def get_totals_vector(docs_or_sections='docs', return_type='np_int32'):
    '''
    Only implemented for 1 gram because there's no reason why we would need totals for 2-5 grams

    :return:
    '''

    ngram = 1

    try:
        file_name = 'totals_{}_{}'.format(ngram, docs_or_sections)
        file_path = Path(PATH_TOKENIZED, file_name)
        totals_vector = Vector().load_from_disk(file_path, return_type=return_type)
        return totals_vector

    except IOError:
        totals_vector = create_totals_vector(ngram, docs_or_sections)
        totals_vector = csc_matrix(totals_vector, dtype=np.int32)
        store_csr_matrix_to_file(totals_vector, PATH_TOKENIZED + 'totals_{}_{}.npz'.format(
            ngram, docs_or_sections))
        return get_totals_vector(docs_or_sections, return_type)

class NgramResult():

    def __init__(self, doc_type_filters: list, collection_filters: list,
                 availability_filters: list, term_filters: list,
                 unparsed_search_tokens: list = None, parsed_search_tokens: list = None):

        self.unparsed_search_tokens = unparsed_search_tokens
        self.parsed_search_tokens = parsed_search_tokens
        self.errors = None
        self.docs_or_sections = None
        self.aggregate = None
        self.aggregate_years = None
        self.multiprocessing_queue = None

        # Results
        self.tokens_data = None
        self.collections = None
        self.doc_types = None
        self.doc_type_groups = None

        # Filters (These are inputs, lists of strings)
        self.doc_type_filters = doc_type_filters
        self.collection_filters = collection_filters
        self.availability_filters = availability_filters
        self.term_filters = term_filters
        self.active_filters = self._get_active_filters()

        # Filters (uint8 np arrays of the actual filters)
        self.doc_type_filters_np = None
        self.collection_filters_np = None
        self.availability_filters_np = None
        self.term_filters_np = None
        self.combined_filters_np = None

    def store_result_in_db(self, database):

        hash = generate_hash((self.parsed_search_tokens, self.doc_type_filters,
                              self.collection_filters, self.availability_filters,
                              self.term_filters))

        store_cmd = '''REPLACE INTO results_frequencies (tokens, doc_type_filters,
                            collection_filters, availability_filters, term_filters,
                            query_hash, results, last_accessed, count_accessed)
                       VALUES(%s, %s, %s, %s, %s, %s, %s, DATE(NOW()), 0);'''
        con, cur = database.connect()
        cur.execute(store_cmd, (str(self.parsed_search_tokens), str(self.doc_type_filters),
                                str(self.collection_filters), str(self.availability_filters),
                                str(self.term_filters), hash,
                                json.dumps(self.generate_results_dict())))
        con.commit()
        con.close()

    def generate_results_dict(self):

        return {
            'error': self.errors,
            'data': {
                'tokens': self.tokens_data,  # using 'tokens' for backwards compatibility
                'collections': self.collections,
                'doc_types': self.doc_types,
                'doc_type_groups': self.doc_type_groups
            }
        }

    def _get_active_filters(self):
        return {
            'doc_type': self.doc_type_filters,
            'collection': self.collection_filters,
            'availability': self.availability_filters,
            'term': self.term_filters
        }

    def compute_result(self, globals):
        """ Computes the result for ngram

        >>> unparsed_search_tokens = ['addiction']
        >>> doc_type_filters = []
        >>> collection_filters = []
        >>> availability_filters = []
        >>> term_filters = []
        >>> globals = get_globals()
        >>> ngram = NgramResult(doc_type_filters, collection_filters, availability_filters,
        ...                     term_filters, unparsed_search_tokens=unparsed_search_tokens)
        >>> ngram.compute_result(globals)

        """

        check_param_type(self.unparsed_search_tokens, list, 'unparsed_search_tokens',
                         'NgramResult.__init__()')
        check_param_type(self.collection_filters, list, 'collection_filters',
                         'NgramResult.__init__()')
        check_param_type(self.availability_filters, list, 'availability_filters',
                         'NgramResult.__init__()')
        check_param_type(self.term_filters, list, 'term_filters', 'NgramResult.__init__()')
        check_param_type(self.doc_type_filters, list, 'doc_type_filters',
                         'NgramResult.__init__()')

        self.active_filters = self._get_active_filters()

        if len(self.term_filters) == 0:
            self.docs_or_sections = 'docs'
        else:
            self.docs_or_sections = 'sections'

        # Initialize multiprocessing queue to handle the results for the collections and
        # document types
        mp_results_queue = multiprocessing.Queue()

        # parse the search tokens as a separate process...
        multiprocessing.Process(target=parse_search_tokens,
                                args=(self.unparsed_search_tokens, mp_results_queue)).start()

        # ... in the meantime, load and set the active doc type, collection, availability,
        # term, and combined filters. They are stored as self.doc_type_filters_np,
        # self.combined_filters_np...
        self._compute_set_active_filters_np(globals)

        # create a total count per year array. Add 1 to totals to avoid division by 0 errors.
        totals_vector = globals['totals']['totals'][self.docs_or_sections]['np']
        self.totals_years = totals_vector.convert_to_year_array(
            filter_vec=self.combined_filters_np) + 1

        # get the parsed search tokens. If there were errors, return them.
        self.parsed_search_tokens, self.errors = mp_results_queue.get()
        if len(self.parsed_search_tokens) == 0:
            print({'error': self.errors})
            return {'error': self.errors}

        # get the count data for all tokens.
        # adds tokens_data (list of count/freq data for each token), aggregate (vector sum of
        # all tokens), aggregate_years (aggregate as years vector)
        self._compute_add_tokens_data()

        # Second round of multiprocessing: calculate z-scores while adding collections and
        # doc types
        multiprocessing.Process(target=get_z_scores,
                                args=(self.tokens_data, self.totals_years,
                                      mp_results_queue)).start()

        # add collections data
        self._compute_add_collection_data(globals)
        print("Collections")
        for i in self.collections:
            print(i)

        # add document type and document type group data
        self._compute_add_doc_type_data(globals)

        # release memory of variables that are no longer used
        self.aggregate = None
        self.aggregate_csc = None
        self.totals_years = None
        self.combined_filters_np = None
        self.collection_filters_np = None
        self.doc_type_filters_np = None

        mp_result = mp_results_queue.get()
        z_scores = mp_result[1]
        for token_id in range(len(z_scores)):
            self.tokens_data[token_id]['z_scores'] = z_scores[token_id].tolist()

    def _compute_set_active_filters_np(self, globals):
        """ Applies all filters to both the term and a copy of the totals vector and sets them

        All filters are np uint8 Vectors.

        The following filters are set in this function:
        doc_type_filters_np
        collection_filters_np
        availability_filters_np
        term_filters_np
        combined_filters_np

        6/10/17 Added availability filter. The idea is that every time the availability filter
        is used, the collection and doc type filters get multiplied with it. That is to say:
        the availability filter is not used on its own.
        7/25/17 Added term filter
        12/19/18: Moved to calculate_ngrams from preprocessing filters as this is a calculation
        step, not a pre-processing step

        >>> globals = get_globals(load_only_docs=True)
        >>> dt_filters = ['internal communication']
        >>> col_filters = [2,3]
        >>> avail_filters = ['no restrictions']
        >>> term_filters = []
        >>> search_tokens = ['addiction']
        >>> ngram = NgramResult(dt_filters, col_filters, avail_filters, term_filters,
        ...                     search_tokens)
        >>> ngram.docs_or_sections = 'docs'
        >>> ngram._compute_set_active_filters_np(globals=globals)
        >>> print(ngram.combined_filters_np)
        <Document Vector of type np_uint8 with 4964 elements.>

        :return: None
        """

        filters = globals['filters'][self.docs_or_sections]

        if not 'term' in self.active_filters:
            self.active_filters['term'] = {}

        # all filters used here are unweighted
        # 8/31/18: At some point, I had the idea that a document with 10 document types would
        # give weight 1/10 to each.
        weighted = False

        filter_len = DOC_COUNT
        if self.docs_or_sections == 'sections':
            filter_len = SECTION_COUNT

        # process availability filters
        if len(self.active_filters['availability']) == 0:
            self.availability_filters_np = None
        else:
            self.availability_filters_np = None
            for filter_name in self.active_filters['availability']:
                if self.availability_filters_np is None:
                    self.availability_filters_np = filters['availability'][
                        (filter_name, weighted)].copy()
                else:
                    self.availability_filters_np += filters['availability'][
                        (filter_name, weighted)]
            self.availability_filters_np.convert_to_datatype('np_uint8')

        # process term filters
        if len(self.active_filters['term']) == 0:
            self.term_filters_np = None
        else:
            self.term_filters_np = None
            for filter_name in self.active_filters['term']:
                if self.term_filters_np is None:
                    self.term_filters_np = Vector().load_token_vector(
                        filter_name, return_type='np_uint8',
                        docs_or_sections=self.docs_or_sections)
                else:
                    self.term_filters_np += Vector().load_token_vector(
                        filter_name, return_type='np_uint8',
                        docs_or_sections=self.docs_or_sections)

        # process doc_type filters
        if len(self.active_filters['doc_type']) == 0:
            self.doc_type_filters_np = Vector(np.ones(filter_len, dtype=np.uint8))
        else:
            self.doc_type_filters_np = None
            for filter_name in self.active_filters['doc_type']:
                if self.doc_type_filters_np is None:
                    self.doc_type_filters_np = filters['doc_type'][
                        (filter_name, weighted)].copy()
                else:
                    self.doc_type_filters_np += filters['doc_type'][(filter_name, weighted)]
            self.doc_type_filters_np.convert_to_datatype('np_uint8')

        # process collection filters
        if len(self.active_filters['collection']) == 0:
            self.collection_filters_np = Vector(np.ones(filter_len, dtype=np.uint8))
        else:
            self.collection_filters_np = None
            for filter_name in self.active_filters['collection']:
                if self.collection_filters_np is None:
                    self.collection_filters_np = filters['collection'][
                        (filter_name, weighted)].copy()
                else:
                    self.collection_filters_np += filters['collection'][(filter_name, weighted)]
            self.collection_filters_np.convert_to_datatype('np_uint8')

        # Apply term filter to doc type and collection filters
        if self.term_filters_np is not None:
            self.doc_type_filters_np.filter_with(self.term_filters_np)
            self.collection_filters_np.filter_with(self.term_filters_np)

        # Apply availability filter to doc type and collection filters
        if self.availability_filters_np is not None:
            self.doc_type_filters_np.filter_with(self.availability_filters_np)
            self.collection_filters_np.filter_with(self.availability_filters_np)

        # Create final filter
        if len(self.active_filters['doc_type']) == 0:
            self.combined_filters_np = self.collection_filters_np
        elif len(self.active_filters['collection']) == 0:
            self.combined_filters_np = self.doc_type_filters_np
        else:
            self.combined_filters_np = self.collection_filters_np.filter_with(
                self.doc_type_filters_np, return_copy=True)

    def _compute_add_tokens_data(self):
        """ Load counts, frequencies, and totals for each token.

        12/18/18 Moved from preprocessing_tokens and implemented for use with the NgramResult
        class, i.e. it won't return a df but individual vars.

        :return: None
        """

        self.tokens_data = []
        self.aggregate = None

        for token in self.parsed_search_tokens:

            # Load token and totals
            try:
                loaded_vector = Vector().load_token_vector(
                    token, return_type='np_int32', docs_or_sections=self.docs_or_sections)
            except FileNotFoundError:
                print('Could not load token {}.'.format(token))
                continue

            # initialize aggregate
            if self.aggregate is None:
                self.aggregate = loaded_vector.copy()
            else:
                self.aggregate += loaded_vector

            absolute_counts = loaded_vector.convert_to_year_array(
                filter_vec=self.combined_filters_np)

            self.tokens_data.append({
                'token': token,
                'counts': absolute_counts,
                'frequencies': absolute_counts / self.totals_years,
                'total': absolute_counts.sum
            })

        self.tokens_data = sorted(self.tokens_data, key=lambda k: k['total'], reverse=True)

        self.aggregate.filter_with(self.combined_filters_np)
        self.aggregate_csc = self.aggregate.copy().convert_to_datatype('csc')

    def _compute_add_collection_data(self, globals):

        # Sort filters by number of documents they represent
        filter_sums = []
        for filter_name in globals['filters'][self.docs_or_sections]['collection']:
            if filter_name == ('msa_bat', False):
                continue
            filter = globals['filters'][self.docs_or_sections]['collection'][filter_name]
            if filter.sum > 0:
                filter_sums.append((filter_name, filter.sum))
        filter_sums_sorted = sorted(filter_sums, key=lambda x: x[1], reverse=True)

        # Select 9 collections with the most documents
        cols_filtered = []
        for filter_name, filter_sum in filter_sums_sorted:

            # if a filter's total is lower than the highest included filtered collection
            # -> skip because it has no chance of getting included.
            filter = globals['filters'][self.docs_or_sections]['collection'][filter_name]
            if len(cols_filtered) > 9 and cols_filtered[8]['total'] > filter_sum:
                continue

            cols_filtered = cols_filtered[:9]
            col_filtered = self.aggregate_csc.convert_to_year_array(filter_vec=filter)
            cols_filtered.append({
                'name': filter_name[0],
                'absolute_counts': col_filtered,
                'total': col_filtered.sum
            })
            if len(cols_filtered) >= 9:
                cols_filtered = sorted(cols_filtered, key=lambda x: x['total'], reverse=True)

        cols_filtered = cols_filtered[:9]

        results = []
        for col in cols_filtered:
            name = col['name']
            collection_totals = globals['totals']['collection'][self.docs_or_sections][name]
            collection_totals_filtered = collection_totals.convert_to_year_array(
                filter_vec=self.doc_type_filters_np)
            relative_frequencies = col['absolute_counts'] / collection_totals_filtered
            results.append({
                'token': globals['collections_and_idx_dict'][name]['name_short'],
                'counts': col['absolute_counts'],
                'frequencies': relative_frequencies,
                'total': col['total']
            })

        self.collections = results

    def _compute_add_doc_type_data(self, globals):

        # First, add all of the doc_type_groups
        dts = []
        for dt_group_name in ['internal communication', 'marketing documents',
                              'internal scientific reports', 'news reports',
                              'scientific publications', 'court documents']:
            dt = {'token': dt_group_name}
            dt_group_filter = globals['filters'][self.docs_or_sections]['doc_type'][
                (dt_group_name, False)]
            agg_filtered_with_dt = self.aggregate_csc.convert_to_year_array(
                filter_vec=dt_group_filter)
            dt['absolute_counts'] = agg_filtered_with_dt
            dt['total'] = agg_filtered_with_dt.sum

            dt_group_totals = globals['totals']['doc_type'][self.docs_or_sections][dt_group_name]
            dt_group_totals_filtered = dt_group_totals.convert_to_year_array(
                filter_vec=self.collection_filters_np)
            freqs = dt['absolute_counts'] / dt_group_totals_filtered
            freqs.vector = np.nan_to_num(freqs.vector)
            dt['frequencies'] = freqs
            dts.append(dt)

        # Second, find the 9 most frequent document types to process
        dts_filtered = []
        for i in range(275):
            dt_name = globals['doc_type_and_idx_dict'][i]

            # 1/2019: the following dts are missing: 99 journal, 208 magazine,
            # 230 report - clinical study, 243 paper, 248 non printable/unable
            # 264 conference proceedings. Unclear why but these are small collections so it
            # shouldn't matter.
            try:
                dt_filter = globals['filters'][self.docs_or_sections]['doc_type'][
                    (dt_name, False)]
            except KeyError:
                print(i, dt_name)
                continue
            dt_filter_sum = dt_filter.sum

            if len(dts_filtered) > 9 and dts_filtered[8]['total'] > dt_filter_sum:
                continue

            dts_filtered = dts_filtered[:9]
            agg_filtered_with_dt = self.aggregate_csc.convert_to_year_array(
                filter_vec=dt_filter)
            dts_filtered.append({
                'name': dt_name,
                'absolute_counts': agg_filtered_with_dt,
                'total': agg_filtered_with_dt.sum
            })
            if len(dts_filtered) >= 9:
                dts_filtered = sorted(dts_filtered, key=lambda x: x['total'], reverse=True)

        for dt in dts_filtered:
            dt_totals = globals['totals']['doc_type'][self.docs_or_sections][dt['name']]
            dt_totals_filtered = dt_totals.convert_to_year_array(
                filter_vec=self.collection_filters_np)
            freqs = dt['absolute_counts'] / dt_totals_filtered
            freqs.vector = np.nan_to_num(freqs.vector)
            dt['frequencies'] = freqs
            dts.append(dt)

        self.doc_types = dts

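# Minimal, self-contained sketch (added for illustration) of the multiprocessing pattern used in
# NgramResult.compute_result: one worker runs in the background while the parent keeps computing,
# and the parent later blocks on queue.get() for the worker's result. The worker function
# _example_worker is hypothetical and stands in for parse_search_tokens / get_z_scores.
def _example_worker(tokens, queue):
    queue.put(([t.lower() for t in tokens], ''))

def _example_queue_pattern():
    queue = multiprocessing.Queue()
    multiprocessing.Process(target=_example_worker, args=(['Addiction'], queue)).start()
    # ... the parent would set up filters here while the worker runs ...
    parsed_tokens, errors = queue.get()   # blocks until the worker has put its result
    return parsed_tokens, errors
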
def find_and_store_policy(term='and', filtered_collection_vector=None,
                          filtered_totals_year_vector=None, collection=None):

    db = sqlite3.connect('policies.db')
    cur = db.cursor()

    col_id = COL_NAME_TO_ID[collection]
    if not filtered_collection_vector:
        filtered_collection_vector = FILTERS['doc_type'][
            ('internal communication', False)].copy().convert_to_datatype('np_uint8')
        filtered_collection_vector.filter_with(
            FILTERS['collection'][(col_id, False)].convert_to_datatype('np_uint8'))
        filtered_totals_year_vector = TOTALS_COL[col_id].convert_to_year_array(
            filter_vec=filtered_collection_vector)

    term_v = Vector().load_token_vector(token=term)
    term_year_vector = term_v.convert_to_year_array(filter_vec=filtered_collection_vector)

    dunnings = {}
    for start_first_period in range(50, 90):
        end_first_period = start_first_period + 3
        policy_year = start_first_period + 4
        start_second_period = start_first_period + 5
        end_second_period = start_first_period + 8

        term_count_first = term_year_vector[start_first_period:end_first_period + 1].sum()
        term_count_second = term_year_vector[start_second_period:end_second_period + 1].sum()
        totals_first = filtered_totals_year_vector[
            start_first_period:end_first_period + 1].sum()
        totals_second = filtered_totals_year_vector[
            start_second_period:end_second_period + 1].sum()

        dunning = dunning_log_likelihood(term_count_first, term_count_second,
                                         totals_first, totals_second)
        dunnings[policy_year] = {
            'year': f'19{policy_year}',
            'dunning': dunning,
            'first': term_count_first,
            'first_freq': term_count_first / totals_first * 100,
            'second': term_count_second,
            'second_freq': term_count_second / totals_second * 100
        }
        # print(f'19{start_first_period}-19{end_first_period} vs. 19{start_second_period}-'
        #       f'19{end_second_period}: {dunning}. 1: {term_count_first}. '
        #       f'2: {term_count_second}')

    dunnings_sorted = sorted(dunnings.items(), key=lambda x: x[1]['dunning'])
    policy_adoption = dunnings_sorted[-1][1]
    policy_ending = dunnings_sorted[0][1]

    policy = '{}. {:15s}. Adoption: {}. D: {:7.0f}. C1: {:9d}. C2: {:9d}. F: {:5.3f}. ' \
             'Ending: {}. D:{:7.0f}. C1: {:9d}. C2: {:9d}. F: {:5.3f}.'.format(
                 collection, term,
                 policy_adoption['year'], policy_adoption['dunning'], policy_adoption['first'],
                 policy_adoption['second'],
                 policy_adoption['first_freq'] / policy_adoption['second_freq'],
                 policy_ending['year'], policy_ending['dunning'], policy_ending['first'],
                 policy_ending['second'],
                 policy_ending['first_freq'] / policy_ending['second_freq'])
    print(policy)

    cur.execute('''INSERT INTO policies_5p VALUES("{}", "{}", {}, {}, {}, {}, {}, {},
                                                  {}, {}, {}, {}, {}, {})'''.format(
        collection, term,
        policy_adoption['year'], policy_adoption['dunning'], policy_adoption['first'],
        policy_adoption['second'], policy_adoption['first_freq'], policy_adoption['second_freq'],
        policy_ending['year'], policy_ending['dunning'], policy_ending['first'],
        policy_ending['second'], policy_ending['first_freq'], policy_ending['second_freq']))
    db.commit()

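# The project's dunning_log_likelihood is not defined in this file. The sketch below (added for
# illustration) computes the standard Dunning (1993) log-likelihood ratio G2 for a term count in
# two corpora (here: the four-year window before a candidate policy year vs. the four-year window
# after it) and, as an assumption, signs it so that a term that is relatively more frequent in
# the second window comes out positive, matching how find_and_store_policy treats the maximum as
# "adoption" and the minimum as "ending". The actual implementation may differ.
def _dunning_log_likelihood_sketch(count_first, count_second, total_first, total_second):
    from math import log
    a, b, c, d = count_first, count_second, total_first, total_second
    e1 = c * (a + b) / (c + d)      # expected count in the first window
    e2 = d * (a + b) / (c + d)      # expected count in the second window
    g2 = 2 * ((a * log(a / e1) if a > 0 else 0) + (b * log(b / e2) if b > 0 else 0))
    sign = 1 if (b / d) > (a / c) else -1
    return sign * g2
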
def get_absolute_google_counts(token_name: str) -> Vector:
    """
    This function retrieves the absolute counts for a given token from the Google Ngram Viewer.

    It first loads the relative frequencies from the ngram viewer and the absolute counts
    for the corpus from Google's source data.
    Then, it multiplies the absolute number of terms in the corpus for any given year with
    the relative frequency of the search token.

    >>> google_counts = get_absolute_google_counts('addiction')
    >>> print(f'Google counts for addiction in 1950: {google_counts[50]}')
    Google counts for addiction in 1950: 2482.0

    >>> type(google_counts)
    <class 'tobacco.utilities.vector.Vector'>
    """

    hash = hashlib.sha256(token_name.encode()).hexdigest()
    file_path = Path(PATH_GOOGLE_TOKENS, hash[0], hash[1], hash[2], hash[3], f'{hash}.npy')

    try:
        # this really doesn't need a hash
        absolute_counts = Vector().load_from_disk(file_path, return_type='np_int32')
        # absolute_counts = np.load(token_path+hash_path+'.npy')

    except FileNotFoundError:

        corpus_id = 15
        # construct the url, i.e. place the token and other parameters where they belong
        url = 'https://books.google.com/ngrams/interactive_chart?content={}&year_start={}' \
              '&year_end={}&corpus={}&smoothing=0'.format(
                  token_name.replace(' ', '+'), YEAR_START, YEAR_END, corpus_id)

        try:
            with urllib.request.urlopen(url, timeout=1) as response:
                page = response.read().decode('utf-8')

            if page.find('var data = [];') > -1:
                relative_frequencies = 116 * [0]
            else:
                start = page.find('var data = [')
                end = page.find('}];', start)
                data_dict = json.loads(page[start + 11:end + 2])[0]
                relative_frequencies = data_dict['timeseries']
                relative_frequencies += 8 * [relative_frequencies[-1]]

        except urllib.error.HTTPError:
            relative_frequencies = 116 * [0]

        # if general error, return 0 but don't store
        except:
            temp = 116 * [0]
            return np.array([round(temp[i] * GOOGLE_TOTALS[i]) for i in range(len(temp))],
                            dtype=np.float)

        # Step 3: calculate the absolute number of appearances by multiplying the frequencies
        # with the total number of tokens
        absolute_counts = [round(relative_frequencies[i] * GOOGLE_TOTALS[i])
                           for i in range(len(relative_frequencies))]
        absolute_counts = Vector(np.array(absolute_counts, dtype=np.float64))
        absolute_counts.save_to_disk(file_path)

        # hash = hashlib.sha256(token_name.encode()).hexdigest()
        # file_path = Path(PATH_GOOGLE_TOKENS, hash[0], hash[1], hash[2], hash[3])
        # token_path = PATH_GOOGLE_TOKENS + '{}/{}/{}/{}/'.format(hash_path[0], hash_path[1],
        #                                                         hash_path[2], hash_path[3])
        # if not Path.exists(file_path):
        #     print(file_path)
        #     Path.mkdir(file_path, parents=True)
        # np.save(token_path + hash_path, absolute_counts)

    return absolute_counts

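# Worked example of the reconstruction step above (added for illustration, with made-up numbers):
# a relative frequency of 2.0e-6 in a year whose corpus contains about 1.24 billion tokens
# becomes an absolute count of round(2.0e-6 * 1_240_000_000) = 2480 occurrences.
def _example_absolute_count(relative_frequency=2.0e-6, corpus_tokens=1_240_000_000):
    return round(relative_frequency * corpus_tokens)
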
import numpy as np

from tobacco.utilities.vector import Vector

# Yearly token totals for the Google Books corpus; 116 entries, matching the 116-element
# relative-frequency series used in get_absolute_google_counts to reconstruct absolute counts.
GOOGLE_TOTALS = Vector(
    np.array([
        1285712637.0, 1311315033.0, 1266236889.0, 1405505328.0, 1351302005.0,
        1397090480.0, 1409945274.0, 1417130893.0, 1283265090.0, 1354824248.0,
        1350964981.0, 1431385638.0, 1356693322.0, 1324894757.0, 1211361619.0,
        1175413415.0, 1183132092.0, 1039343103.0, 1136614538.0, 1388696469.0,
        1216676110.0, 1413237707.0, 1151386048.0, 1069007206.0, 1113107246.0,
        1053565430.0, 1216023821.0, 1212716430.0, 1153722574.0, 1244889331.0,
        1183806248.0, 1057602772.0, 915956659.0, 1053600093.0, 1157109310.0,
        1199843463.0, 1232280287.0, 1261812592.0, 1249209591.0, 1179404138.0,
        1084154164.0, 1045379066.0, 890214397.0, 812192380.0, 926378706.0,
        1203221497.0, 1385834769.0, 1486005621.0, 1641024100.0, 1644401950.0,
        1603394676.0, 1621780754.0, 1590464886.0, 1662160145.0, 1751719755.0,
        1817491821.0, 1952474329.0, 1976098333.0, 2064236476.0, 2341981521.0,
        2567977722.0, 2818694749.0, 2955051696.0, 2931038992.0, 3300623502.0,
        3466842517.0, 3658119990.0, 3968752101.0, 3942222509.0, 4086393350.0,
        4058576649.0, 4174172415.0, 4058707895.0, 4045487401.0, 4104379941.0,
        4242326406.0, 4314577619.0, 4365839878.0, 4528331460.0, 4611609946.0,
        4627406112.0, 4839530894.0, 4982167985.0, 5309222580.0, 5475269397.0,
        5793946882.0, 5936558026.0, 6191886939.0, 6549339038.0, 7075013106.0,
        6895715366.0, 7596808027.0, 7492130348.0, 8027353540.0, 8276258599.0,
        8745049453.0, 8979708108.0, 9406708249.0, 9997156197.0, 11190986329.0,
        11349375656.0, 12519922882.0, 13632028136.0, 14705541576.0, 14425183957.0,
        15310495914.0, 16206118071.0, 19482936409.0, 19482936409.0, 19482936409.0,
        19482936409.0, 19482936409.0, 19482936409.0, 19482936409.0, 19482936409.0,
        19482936409.0
    ], dtype=np.float64))

def get_filter(search_term, filter_type, weight=False, return_type='csc',
               docs_or_sections='docs'):
    '''
    Creates a binary filter (True if the document has the specified doc_type or collection,
    False otherwise)

    :param search_term:
    :param filter_type: 'collection' or 'doc_type'
    :param weight:
    :return: Vector

    >>> filter = get_filter('letter', 'doc_type', weight=False, return_type='csc')
    >>> filter
    <Document Vector of type csc with 2490726 elements.>

    '''

    try:
        # can't store terms with a forward slash -> replace with underscore
        if filter_type == 'doc_type':
            search_term = search_term.replace('/', '_')
        file_name = '{}_{}_{}_{}'.format(search_term, filter_type, docs_or_sections, weight)
        file_path = Path(PATH_TOKENIZED, 'filters', file_name)
        filter = Vector().load_from_disk(file_path, return_type=return_type)
        # filter = load_csc_matrix_from_file(PATH_TOKENIZED + 'filters/{}_{}_{}_{}'.format(
        #     search_term, filter_type, docs_or_sections, weight))

    except IOError:

        db = Database("TOB_FULL")
        con, cur = db.connect()

        if docs_or_sections == 'docs':
            filter_len = DOC_COUNT
        elif docs_or_sections == 'sections':
            filter_len = SECTION_COUNT
        else:
            raise ValueError("param docs_or_sections has to be either 'docs' or 'sections' "
                             "but not '{}'".format(docs_or_sections))

        if weight:
            filter = np.zeros((filter_len, 1), dtype=np.float)
        else:
            filter = np.zeros((filter_len, 1), dtype=np.bool)

        if filter_type == 'collection':
            cur.execute("SELECT id, no_tokens from docs where collection_id = '{}' "
                        "ORDER BY id ASC".format(search_term))

        elif filter_type == 'doc_type':
            if weight:
                cur.execute('''SELECT doc_types.doc_id as id, docs.no_tokens as no_tokens,
                                      doc_types.weight as weight
                               FROM doc_types, docs
                               WHERE doc_type = "{}" and doc_types.doc_id = docs.id
                               ORDER BY id ASC'''.format(search_term))
            else:
                cur.execute('SELECT doc_types.doc_id as id, docs.no_tokens as no_tokens '
                            'FROM doc_types, docs '
                            'WHERE doc_type = "{}" and doc_types.doc_id = docs.id '
                            'ORDER BY id ASC'.format(search_term))

        elif filter_type == 'availability':
            # dict maps from search term to the requisite where clause
            # (documents can be both formerly privileged and formerly confidential)
            term_to_mysql_where_clauses_dict = {
                'no restrictions': 'WHERE availability = "public;no restrictions"',
                'formerly confidential':
                    'WHERE availability = "public;formerly confidential" '
                    'OR availability = "public;formerly confidential; formerly privileged" '
                    'OR availability = "public;formerly privileged; formerly confidential"',
                'formerly privileged':
                    'WHERE availability = "public;formerly privileged" '
                    'OR availability = "public;formerly confidential; formerly privileged" '
                    'OR availability = "public;formerly privileged; formerly confidential"'
            }
            cur.execute('SELECT id, no_tokens from docs {} ORDER BY id ASC'.format(
                term_to_mysql_where_clauses_dict[search_term]))

        else:
            raise KeyError("{} is not a valid filter_type. Valid filter types are "
                           "'collection', 'doc_type', and 'availability'".format(filter_type))

        if docs_or_sections == 'sections':
            doc_id_to_section_dict = get_doc_id_to_section_id_dict()

        rows = cur.fetchall()
        for row in rows:
            if docs_or_sections == 'docs':
                if weight:
                    filter[row['id']] = row['weight']
                else:
                    filter[row['id'], 0] = True
            elif docs_or_sections == 'sections':
                first_id, last_id = doc_id_to_section_dict[row['id']]
                for section_id in range(first_id, last_id + 1):
                    filter[section_id] = True

        filter = csc_matrix(filter)

        if filter_type == 'doc_type':
            search_term = search_term.replace('/', '_')
        filter_path = PATH_TOKENIZED + 'filters/{}_{}_{}_{}.npz'.format(
            search_term, filter_type, docs_or_sections, weight)
        store_csr_matrix_to_file(filter, filter_path)

        print("Created filter for {} with {} elements.".format(search_term, filter.getnnz()))
        return get_filter(search_term, filter_type, weight, return_type, docs_or_sections)

    # if return_type == 'np':
    #     filter = np.array(filter.todense()).flatten()

    return filter

def get_active_filters_np(active_filters, FILTERS, docs_or_sections='docs',
                          return_type=np.uint8):
    """ Applies all filters to both the term and a copy of the totals vector and returns them

    6/10/17 Added availability filter. The idea is that every time the availability filter is
    used, the collection and doc type filters get multiplied with it. That is to say: the
    availability filter is not used on its own.
    7/25/17 Added term filter

    >>> FILTERS = {
    ...     'docs':{
    ...         'doc_type': get_doc_type_filters(return_type='csc', docs_or_sections='docs'),
    ...         'collection': get_collection_filters(return_type='csc', docs_or_sections='docs'),
    ...         'availability': get_availability_filters(return_type='csc',
    ...                                                  docs_or_sections='docs')
    ...     },
    ...     'sections':{
    ...         'doc_type': get_doc_type_filters(return_type='csc', docs_or_sections='sections'),
    ...         'collection': get_collection_filters(return_type='csc',
    ...                                              docs_or_sections='sections'),
    ...         'availability': get_availability_filters(return_type='csc',
    ...                                                  docs_or_sections='sections')
    ...     }
    ... }
    >>> active_filters = {
    ...     'doc_type': {'internal communication'},
    ...     'collection': {2,3},
    ...     'availability': {'no restrictions'},
    ...     'term': {}
    ... }
    >>> doc_type_filter, collection_filter, final_filter = get_active_filters_np(
    ...     active_filters=active_filters, FILTERS=FILTERS, docs_or_sections='docs')

    :param active_filters: dict of lists, e.g. {'doc_type': ["internal communication"],
                           'collection': [1,2], 'availability': [], 'term': []}
    :param FILTERS: Filters from globals. ??I think they are csc filters??
    :param return_type: e.g. np.uint8 or np.int32. By default, the same document type as the
                        input, usually np.uint8
    :param docs_or_sections: 'docs' or 'sections'
    :return: doc_type_filter, collection_filter, final_filter
    """

    if not 'term' in active_filters:
        active_filters['term'] = {}

    # all filters used here are unweighted
    # 8/31/18: At some point, I had the idea that a document with 10 document types would give
    # weight 1/10 to each.
    weighted = False

    filter_len = DOC_COUNT
    if docs_or_sections == 'sections':
        filter_len = SECTION_COUNT

    # process availability filters
    if len(active_filters['availability']) == 0:
        availability_filter = None
    else:
        availability_filter = None
        for filter_name in active_filters['availability']:
            if availability_filter is None:
                availability_filter = FILTERS[docs_or_sections]['availability'][
                    (filter_name, weighted)].copy()
            else:
                availability_filter += FILTERS[docs_or_sections]['availability'][
                    (filter_name, weighted)]
        availability_filter.convert_to_datatype('np_uint8')

    # process term filters
    if len(active_filters['term']) == 0:
        term_filter = None
    else:
        term_filter = None
        for filter_name in active_filters['term']:
            if term_filter is None:
                term_filter = Vector().load_token_vector(filter_name, return_type='np_uint8',
                                                         docs_or_sections=docs_or_sections)
            else:
                term_filter += Vector().load_token_vector(filter_name, return_type='np_uint8',
                                                          docs_or_sections=docs_or_sections)

    # process doc_type filters
    if len(active_filters['doc_type']) == 0:
        doc_type_filter = np.ones(filter_len, dtype='bool')
    else:
        doc_type_filter = None
        for filter_name in active_filters['doc_type']:
            if doc_type_filter is None:
                doc_type_filter = FILTERS[docs_or_sections]['doc_type'][
                    (filter_name, weighted)].copy()
            else:
                doc_type_filter += FILTERS[docs_or_sections]['doc_type'][(filter_name, weighted)]
        # if docs_or_sections == 'sections':
        doc_type_filter.convert_to_datatype('np_uint8')

    # process collection filters
    if len(active_filters['collection']) == 0:
        collection_filter = np.ones(filter_len, dtype=np.uint8)
    else:
        collection_filter = None
        for filter_name in active_filters['collection']:
            if collection_filter is None:
                collection_filter = FILTERS[docs_or_sections]['collection'][
                    (filter_name, weighted)].copy()
            else:
                collection_filter += FILTERS[docs_or_sections]['collection'][
                    (filter_name, weighted)]
        collection_filter.convert_to_datatype('np_uint8')

    # Apply term filter to doc type and collection filters
    if term_filter is not None:
        doc_type_filter.filter_with(term_filter)
        collection_filter.filter_with(term_filter)

    # Apply availability filter to doc type and collection filters
    if availability_filter is not None:
        doc_type_filter.filter_with(availability_filter)
        collection_filter.filter_with(availability_filter)

    # Create final filter
    if len(active_filters['doc_type']) == 0:
        final_filter = collection_filter
    elif len(active_filters['collection']) == 0:
        final_filter = doc_type_filter
    else:
        final_filter = collection_filter.filter_with(doc_type_filter, return_copy=True)

    return doc_type_filter, collection_filter, final_filter

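# Minimal numpy sketch (added for illustration) of the combination logic above: filters within
# one category are added (union of documents), while filters across categories are multiplied /
# "filtered with" each other (intersection). Plain uint8 arrays stand in for the project's
# Vector class.
def _example_filter_combination():
    letters = np.array([1, 1, 0, 0, 0], dtype=np.uint8)       # doc_type 'letter'
    memos = np.array([0, 0, 1, 1, 0], dtype=np.uint8)         # doc_type 'memo'
    collection_2 = np.array([1, 0, 1, 0, 1], dtype=np.uint8)  # one collection
    doc_type_filter = letters + memos                          # union within a category
    combined = doc_type_filter * collection_2                  # intersection across categories
    return combined                                            # -> array([1, 0, 1, 0, 0])
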
def create_filter(search_term: Union[str, int], filter_type: str, weight: bool,
                  return_type: str, docs_or_sections: str):
    '''
    Creates a filter vector for collections, doc types, availability

    12/20/18: Separated from get_filter (moved to Vector class)

    :param search_term:
    :param filter_type:
    :param weight:
    :param return_type:
    :param docs_or_sections:
    :return:
    '''

    db = Database("TOB_FULL")
    con, cur = db.connect()

    if docs_or_sections == 'docs':
        filter_len = DOC_COUNT
    elif docs_or_sections == 'sections':
        filter_len = SECTION_COUNT
    else:
        raise ValueError("param docs_or_sections has to be either 'docs' or 'sections' "
                         "but not '{}'".format(docs_or_sections))

    if weight:
        filter = np.zeros((filter_len, 1), dtype=np.float)
    else:
        filter = np.zeros((filter_len, 1), dtype=np.bool)

    if filter_type == 'collection':
        cur.execute("SELECT id, no_tokens from docs where collection_id = '{}' "
                    "ORDER BY id ASC".format(search_term))

    elif filter_type == 'doc_type':
        if weight:
            cur.execute('''SELECT doc_types.doc_id as id, docs.no_tokens as no_tokens,
                                  doc_types.weight as weight
                           FROM doc_types, docs
                           WHERE doc_type = "{}" and doc_types.doc_id = docs.id
                           ORDER BY id ASC'''.format(search_term))
        else:
            cur.execute('SELECT doc_types.doc_id as id, docs.no_tokens as no_tokens '
                        'FROM doc_types, docs '
                        'WHERE doc_type = "{}" and doc_types.doc_id = docs.id '
                        'ORDER BY id ASC'.format(search_term))

    elif filter_type == 'availability':
        # dict maps from search term to the requisite where clause
        # (documents can be both formerly privileged and formerly confidential)
        term_to_mysql_where_clauses_dict = {
            'no restrictions': 'WHERE availability = "public;no restrictions"',
            'formerly confidential':
                'WHERE availability = "public;formerly confidential" '
                'OR availability = "public;formerly confidential; formerly privileged" '
                'OR availability = "public;formerly privileged; formerly confidential"',
            'formerly privileged':
                'WHERE availability = "public;formerly privileged" '
                'OR availability = "public;formerly confidential; formerly privileged" '
                'OR availability = "public;formerly privileged; formerly confidential"'
        }
        cur.execute('SELECT id, no_tokens from docs {} ORDER BY id ASC'.format(
            term_to_mysql_where_clauses_dict[search_term]))

    else:
        raise KeyError("{} is not a valid filter_type. Valid filter types are 'collection', "
                       "'doc_type', and 'availability'".format(filter_type))

    if docs_or_sections == 'sections':
        doc_id_to_section_dict = get_doc_id_to_section_id_dict()

    rows = cur.fetchall()
    for row in rows:
        if docs_or_sections == 'docs':
            if weight:
                filter[row['id']] = row['weight']
            else:
                filter[row['id'], 0] = True
        elif docs_or_sections == 'sections':
            first_id, last_id = doc_id_to_section_dict[row['id']]
            for section_id in range(first_id, last_id + 1):
                filter[section_id] = True

    filter_vec = Vector(csc_matrix(filter))

    if filter_type == 'doc_type':
        search_term = search_term.replace('/', '_')
    file_name = '{}_{}_{}_{}'.format(search_term, filter_type, docs_or_sections, weight)
    file_path = Path(PATH_TOKENIZED, 'filters', file_name)
    filter_vec.save_to_disk(file_path)
    # filter_path = PATH_TOKENIZED + 'filters/{}_{}_{}_{}.npz'.format(search_term, filter_type,
    #                                                                 docs_or_sections, weight)
    # store_csr_matrix_to_file(filter, filter_path)

    print("Created filter for {} with {} elements.".format(search_term,
                                                           filter_vec.vector.getnnz()))

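# Hedged usage sketch (added for illustration): create_filter writes the filter to disk under
# PATH_TOKENIZED/filters via Vector.save_to_disk; a later get_filter call with the same
# parameters can then load it. Assumes a reachable TOB_FULL database and an existing 'letter'
# doc_type; the helper name _example_create_letter_filter is hypothetical.
def _example_create_letter_filter():
    create_filter('letter', filter_type='doc_type', weight=False, return_type='csc',
                  docs_or_sections='docs')
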