Example #1
def parse_text_passages_tokens(tokens):

    search_regexes = []

    intersection_vector = None

    first_token = tokens[0].strip('*')

    for token in tokens:
        # if wildcard, process wildcard
        if token.find('*') > -1:
            token_vector, token_regex = process_wildcard_token(token)

        # else: handle normal token
        else:
            try:

                token_vector = Vector().load_token_vector(
                    token, return_type='np_uint8', docs_or_sections='sections')
                token_regex = re.compile(r'\b{}\b'.format(token))

            # will throw a FileNotFoundError when the token does not exist, e.g. 'compound w'
            except FileNotFoundError:
                token_vector, token_regex = process_nonexistant_token(token)

        if token_vector is not None:
            if intersection_vector is None:
                intersection_vector = token_vector
            else:
                intersection_vector *= token_vector

        search_regexes.append(token_regex)

    return first_token, search_regexes, intersection_vector
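# A minimal, self-contained sketch of the intersection trick used above: each
# token's presence vector is a uint8 array (1 = token occurs in that document
# or section), so element-wise multiplication ANDs the vectors together.
# Illustrative only; the real vectors come from Vector().load_token_vector().
import numpy as np

token_a = np.array([1, 1, 0, 1, 0], dtype=np.uint8)  # sections containing token A
token_b = np.array([1, 0, 0, 1, 1], dtype=np.uint8)  # sections containing token B

intersection = token_a * token_b                      # sections containing both
print(intersection)                                   # [1 0 0 1 0]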
Example #2
    def _compute_add_tokens_data(self):
        """
        Load counts, frequencies, and totals for each token.

        12/18/18 Moved from preprocessing_tokens and implemented for use with the NgramResult class,
        i.e. it won't return a df but individual vars.

        :return: None
        """

        self.tokens_data = []
        self.aggregate = None

        for token in self.parsed_search_tokens:
            # Load token and totals
            try:
                loaded_vector = Vector().load_token_vector(
                    token,
                    return_type='np_int32',
                    docs_or_sections=self.docs_or_sections)
            except FileNotFoundError:
                print('Could not load token {}.'.format(token))
                continue

            # initialize aggregate
            if self.aggregate is None:
                self.aggregate = loaded_vector.copy()
            else:
                self.aggregate += loaded_vector

            absolute_counts = loaded_vector.convert_to_year_array(
                filter_vec=self.combined_filters_np)

            self.tokens_data.append({
                'token': token,
                'counts': absolute_counts,
                'frequencies': absolute_counts / self.totals_years,
                'total': absolute_counts.sum
            })

        self.tokens_data = sorted(self.tokens_data,
                                  key=lambda k: k['total'],
                                  reverse=True)
        self.aggregate.filter_with(self.combined_filters_np)
        self.aggregate_csc = self.aggregate.copy().convert_to_datatype('csc')
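# A small, self-contained sketch of the per-year normalization above: yearly
# token counts divided by yearly totals give relative frequencies (the caller,
# compute_result(), adds 1 to the totals to avoid division by zero).
# Toy numbers only.
import numpy as np

counts = np.array([5, 0, 12], dtype=np.int32)            # token counts per year
totals = np.array([1000, 0, 3000], dtype=np.int32) + 1   # yearly totals, +1 guard

frequencies = counts / totals
print(frequencies)   # approx. [0.005 0.    0.004]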
Example #3
def get_doc_type_filters(return_type='csc', docs_or_sections='docs'):


    doc_types = get_dtype_dict()


    doc_type_filters = {}
    for doc_type in doc_types['valid']:
        for weight in [False]:
            if doc_type in [
                'letter', 'report', 'memo', 'email', 'note', 'publication', 'report, scientific', 'advertisement',
                'promotional material', 'budget', 'specification', 'budget_review', 'telex', 'news_article', 'agenda',
                'report, market research', 'speech', 'presentation', 'minutes'
            ]:
                doc_type_filters[(doc_type, weight)] = get_filter(doc_type, filter_type='doc_type',
                          weight=weight, return_type=return_type, docs_or_sections=docs_or_sections)
            else:
                doc_type_filters[(doc_type, weight)] = get_filter(doc_type, filter_type='doc_type',
                          weight=weight, return_type='csc', docs_or_sections=docs_or_sections)


    for group in doc_types['groups']:
        for weight in [False]:

            file_name = '{}_{}_{}_{}'.format(group, 'doc_type', docs_or_sections, weight)
            file_path = Path(PATH_TOKENIZED, 'filters', file_name)
            try:
                group_filter = Vector().load_from_disk(file_path, return_type=return_type)

            except FileNotFoundError:
                print("creating group filter for: ", group)
                group_filter = None
                for doc_type in doc_types['groups'][group]:
                    if group_filter is None:
                        group_filter = doc_type_filters[(doc_type, weight)]
                    else:
                        group_filter += doc_type_filters[(doc_type, weight)]

                group_filter.save_to_disk(file_path)

            doc_type_filters[(group, weight)] = group_filter
#            if return_type == 'np':
#                doc_type_filters[(group, weight)] = csc_bool_to_np_cython(group_filter)

    return doc_type_filters
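# Self-contained sketch of the group-filter construction in the FileNotFoundError
# branch above: the boolean filters of a group's member doc types are summed, so
# the result is nonzero wherever any member matches. Toy vectors stand in for the
# real filters returned by get_filter().
import numpy as np
from scipy.sparse import csc_matrix

letter = csc_matrix(np.array([[1], [0], [1], [0]], dtype=np.uint8))
memo = csc_matrix(np.array([[0], [1], [1], [0]], dtype=np.uint8))

group = letter + memo            # nonzero where either doc type matches
print(group.toarray().ravel())   # [1 1 2 0]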
def get_doc_type_totals_vector(doc_type_name,
                               docs_or_sections='docs',
                               return_type='csc'):
    """
    Loads one doc_type totals vector

    >>> totals = get_doc_type_totals_vector('report', 'docs', 'csc')
    >>> totals
    <Document Vector of type csc with 1652572 elements and length 11303161.>


    :param doc_type_name:
    :param docs_or_sections:
    :param return_type:
    :return:
    """

    try:
        return Vector().load_totals_vector(doc_type_name.replace('/', '_'),
                                           'doc_type', docs_or_sections,
                                           return_type)
    except IOError:
        # added 6/9/17. This is all very awkward (i.e. loading filters every time).
        # It replaces an older solution that used filter_vector(), which is no longer available.
        # Filters get loaded every time because otherwise, the filters would be held in memory twice.

        print("Creating doc type totals vector for: ", doc_type_name,
              docs_or_sections)
        filters = get_filters(return_type='np')
        totals_vector = csc_to_np_int32(get_totals_vector(docs_or_sections))
        active_doc_type_filters_np, _, _ = get_active_filters_np(
            active_filters={
                'doc_type': {doc_type_name},
                'collection': {},
                'availability': {}
            },
            FILTERS=filters,
            docs_or_sections=docs_or_sections,
            return_type=np.uint8)

        filtered_dt = totals_vector * active_doc_type_filters_np
        # double conversion is ugly, but it creates the required m x 1 sparse vector
        vec = csc_matrix(csc_matrix(filtered_dt).T, dtype=np.int64)

        #            totals[dt] = filter_vector(get_totals_vector(), {'doc_type': {dt}, 'collection': {}})
        store_csr_matrix_to_file(
            vec, PATH_TOKENIZED + 'totals/{}_{}'.format(
                doc_type_name.replace('/', '_'), docs_or_sections))

        return get_doc_type_totals_vector(doc_type_name, docs_or_sections,
                                          return_type)
Example #5
def process_all():

    create_sqlite_table()

    terms = []

    db = Database('TOB_FULL')
    con, cur = db.connect()

    cur.execute('SELECT token from tokens where total > 10000;')
    for row in cur.fetchall():
        term = row['token']
        valid = True

        for word in term.split():
            if len(word) == 1:
                valid = False
            try:
                int(word)
                valid = False
            except ValueError:
                pass
        if valid:
            terms.append(term)

    print("Number of terms: {}".format(len(terms)))


    for collection in COL_NAME_TO_ID:
        col_id = COL_NAME_TO_ID[collection]
        filtered_collection_vector = FILTERS['doc_type'][('internal communication', False)].copy().convert_to_datatype('np_uint8')
        filtered_collection_vector.filter_with(FILTERS['collection'][(col_id, False)].convert_to_datatype('np_uint8'))
        max_5p_filter = Vector().load_page_number_vector('max_5')
        print("pre", filtered_collection_vector)
        filtered_collection_vector.filter_with(max_5p_filter)
        print('post', filtered_collection_vector)

        if collection == 'msa_bat':
            totals = TOTALS_COL[5]
            for id in [6, 7, 8, 9, 10, 11, 15]:
                totals += TOTALS_COL[id]
            print(totals)
        else:
            totals = TOTALS_COL[col_id]
        filtered_totals_year_vector = totals.convert_to_year_array(filter_vec=filtered_collection_vector)


        for term in terms:
            find_and_store_policy(term, filtered_collection_vector, filtered_totals_year_vector, collection)
def get_ngram_vector(token,
                     return_type='csc',
                     return_sum=False,
                     docs_or_sections='docs'):
    """ Loads the ngram vector of the token

    E.g. v = get_ngram_vector('nicotine', return_type='csc', docs_or_sections='docs')

    :param token: search token, string
    :param return_type: 'csc', 'np', 'uint8'
    :param return_sum: Whether or not to return the sum of the vector.
    :param docs_or_sections: 'docs' or 'sections'
    :return:
    """

    # To distribute the millions of stored ngram vectors across directories, the tokens are hashed.
    hash = hashlib.sha256(token.encode()).hexdigest()

    h = hash
    if docs_or_sections == 'sections':
        h += '_sections'
    token_path = Path(PATH_TOKENS, hash[0], hash[1], hash[2], hash[3], h)

    ngram_vector = Vector()
    ngram_vector.load_from_disk(token_path)
    ngram_vector.convert_to_datatype(return_type)

    if return_sum:
        # assumes Vector exposes the element sum as .sum, as in the other snippets here
        return ngram_vector, ngram_vector.sum

    return ngram_vector
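# Self-contained sketch of the sharded-path scheme used above: the first four hex
# digits of the token's SHA-256 hash become nested directories, which keeps any
# single directory from holding millions of vector files. The base directory is a
# placeholder for PATH_TOKENS.
import hashlib
from pathlib import Path


def sharded_token_path(token, base_dir='/tmp/tokens', sections=False):
    h = hashlib.sha256(token.encode()).hexdigest()
    file_name = h + '_sections' if sections else h
    return Path(base_dir, h[0], h[1], h[2], h[3], file_name)


print(sharded_token_path('nicotine'))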
def get_collection_totals_vector(collection_id,
                                 docs_or_sections,
                                 return_type='csc'):
    """ Load the totals vector for one collection

    :param collection_id: id of the collection
    :param docs_or_sections: "docs" or "sections"
    :param return_type: 'csc' (csc sparse matrix) or 'np' (numpy array)
    :return:
    """

    try:
        return Vector().load_totals_vector(collection_id, 'collection',
                                           docs_or_sections, return_type)


#        csc = load_csc_matrix_from_file(PATH_TOKENIZED + 'totals/{}_{}'.format(collection_id, docs_or_sections))
    except IOError:

        print("Creating totals vector for collection, type: ", collection_id,
              docs_or_sections)

        filters = get_filters(return_type='np')
        totals_vector = csc_to_np_int32(get_totals_vector(docs_or_sections))
        _, active_collection_filters_np, _ = get_active_filters_np(
            active_filters={
                'doc_type': {},
                'collection': {collection_id},
                'availability': {}
            },
            FILTERS=filters,
            docs_or_sections=docs_or_sections,
            return_type=np.uint8)

        filtered_dt = totals_vector * active_collection_filters_np
        # double conversion is ugly, but it creates the required m x 1 sparse vector
        csc = csc_matrix(csc_matrix(filtered_dt).T, dtype=np.int64)
        store_csr_matrix_to_file(
            csc, PATH_TOKENIZED +
            'totals/{}_{}'.format(collection_id, docs_or_sections))

    if return_type == 'csc':
        return csc_matrix(csc, dtype=np.int32)
    else:
        return csc_to_np_int32(csc)
def get_totals_vector(docs_or_sections='docs', return_type='np_int32'):
    '''
    Only implemented for 1 gram because there's no reason why we would need totals for 2-5 grams
    :return:
    '''

    ngram = 1

    try:
        file_name = 'totals_{}_{}'.format(ngram, docs_or_sections)
        file_path = Path(PATH_TOKENIZED, file_name)
        totals_vector = Vector().load_from_disk(file_path,
                                                return_type=return_type)
        return totals_vector

    except IOError:

        totals_vector = create_totals_vector(ngram, docs_or_sections)
        totals_vector = csc_matrix(totals_vector, dtype=np.int32)
        store_csr_matrix_to_file(
            totals_vector, PATH_TOKENIZED +
            'totals_{}_{}.npz'.format(ngram, docs_or_sections))
        return get_totals_vector(docs_or_sections, return_type)
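# Self-contained sketch of the load-or-create caching pattern used by
# get_totals_vector() and its siblings: try to read the cached artifact, and on
# failure build it, write it to disk, then call the function again so the normal
# load path runs. np.save/np.load stand in for the project's Vector storage.
import numpy as np


def load_or_create_totals(path='/tmp/totals_demo.npy'):
    try:
        return np.load(path)
    except IOError:   # FileNotFoundError is a subclass of IOError/OSError
        totals = np.arange(5, dtype=np.int32)   # stand-in for create_totals_vector()
        np.save(path, totals)
        return load_or_create_totals(path)


print(load_or_create_totals())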
Example #9
    def _compute_set_active_filters_np(self, globals):
        """ Applies all filters to both the term and a copy of the totals vector and sets them

        All filters are np uint8 Vectors. The following filters are set in this function:
        doc_type_filters_np
        collection_filters_np
        availability_filters_np
        term_filters_np
        combined_filters_np

        6/10/17 Added availability filter.
        The idea is that every time the availability filter is used, the collection and doc type filters get multiplied with it.
        That is to say: the availability filter is not used on its own.

        7/25/17 Added term filter

        12/19/18: Moved to calculate_ngrams from preprocessing filters as this is a calculation
        step, not a pre-processing step

        >>> globals = get_globals(load_only_docs=True)
        >>> dt_filters = ['internal communication']
        >>> col_filters = [2,3]
        >>> avail_filters = ['no restrictions']
        >>> term_filters = []
        >>> search_tokens = ['addiction']
        >>> ngram = NgramResult(dt_filters, col_filters, avail_filters, term_filters, search_tokens)
        >>> ngram.docs_or_sections = 'docs'
        >>> ngram._compute_set_active_filters_np(globals=globals)
        >>> print(ngram.combined_filters_np)
        <Document Vector of type np_uint8 with 4964 elements.>

        :return: None
        """

        filters = globals['filters'][self.docs_or_sections]

        if 'term' not in self.active_filters:
            self.active_filters['term'] = {}

        # all filters used here are unweighted
        # 8/31/18: At some point, I had the idea that a document with 10 document types would give weight 1/10 to each.
        weighted = False
        filter_len = DOC_COUNT
        if self.docs_or_sections == 'sections':
            filter_len = SECTION_COUNT

        # process availability filters
        if len(self.active_filters['availability']) == 0:
            self.availability_filters_np = None
        else:
            self.availability_filters_np = None
            for filter_name in self.active_filters['availability']:
                if self.availability_filters_np is None:
                    self.availability_filters_np = filters['availability'][(
                        filter_name, weighted)].copy()
                else:
                    self.availability_filters_np += filters['availability'][(
                        filter_name, weighted)]
            self.availability_filters_np.convert_to_datatype('np_uint8')

        # process term filters
        if len(self.active_filters['term']) == 0:
            self.term_filters_np = None
        else:
            self.term_filters_np = None
            for filter_name in self.active_filters['term']:
                if self.term_filters_np is None:
                    self.term_filters_np = Vector().load_token_vector(
                        filter_name,
                        return_type='np_uint8',
                        docs_or_sections=self.docs_or_sections)
                else:
                    self.term_filters_np += Vector().load_token_vector(
                        filter_name,
                        return_type='np_uint8',
                        docs_or_sections=self.docs_or_sections)

        # process doc_type filters
        if len(self.active_filters['doc_type']) == 0:
            self.doc_type_filters_np = Vector(
                np.ones(filter_len, dtype=np.uint8))
        else:
            self.doc_type_filters_np = None
            for filter_name in self.active_filters['doc_type']:
                if self.doc_type_filters_np is None:
                    self.doc_type_filters_np = filters['doc_type'][(
                        filter_name, weighted)].copy()
                else:
                    self.doc_type_filters_np += filters['doc_type'][(
                        filter_name, weighted)]
            self.doc_type_filters_np.convert_to_datatype('np_uint8')

        # process collection filters
        if len(self.active_filters['collection']) == 0:
            self.collection_filters_np = Vector(
                np.ones(filter_len, dtype=np.uint8))
        else:
            self.collection_filters_np = None
            for filter_name in self.active_filters['collection']:
                if self.collection_filters_np is None:
                    self.collection_filters_np = filters['collection'][(
                        filter_name, weighted)].copy()
                else:
                    self.collection_filters_np += filters['collection'][(
                        filter_name, weighted)]
            self.collection_filters_np.convert_to_datatype('np_uint8')

        # Apply term filter to doc type and collection filters
        if self.term_filters_np is not None:
            self.doc_type_filters_np.filter_with(self.term_filters_np)
            self.collection_filters_np.filter_with(self.term_filters_np)

        # Apply availability filter to doc type and collection filters
        if self.availability_filters_np is not None:
            self.doc_type_filters_np.filter_with(self.availability_filters_np)
            self.collection_filters_np.filter_with(self.availability_filters_np)

        # Create final filter
        if len(self.active_filters['doc_type']) == 0:
            self.combined_filters_np = self.collection_filters_np
        elif len(self.active_filters['collection']) == 0:
            self.combined_filters_np = self.doc_type_filters_np
        else:
            self.combined_filters_np = self.collection_filters_np.filter_with(
                self.doc_type_filters_np, return_copy=True)
Example #10
class NgramResult():
    def __init__(self,
                 doc_type_filters: list,
                 collection_filters: list,
                 availability_filters: list,
                 term_filters: list,
                 unparsed_search_tokens: list = None,
                 parsed_search_tokens: list = None):

        self.unparsed_search_tokens = unparsed_search_tokens
        self.parsed_search_tokens = parsed_search_tokens
        self.errors = None
        self.docs_or_sections = None
        self.aggregate = None
        self.aggregate_years = None

        self.muliprocessing_queue = None

        # Results
        self.tokens_data = None
        self.collections = None
        self.doc_types = None
        self.doc_type_groups = None

        # Filters (These are inputs, lists of strings)
        self.doc_type_filters = doc_type_filters
        self.collection_filters = collection_filters
        self.availability_filters = availability_filters
        self.term_filters = term_filters
        self.active_filters = self._get_active_filters()

        # Filters (uint8 np arrays of the actual filters)
        self.doc_type_filters_np = None
        self.collection_filters_np = None
        self.availability_filters_np = None
        self.term_filters_np = None
        self.combined_filters_np = None

    def store_result_in_db(self, database):

        hash = generate_hash((self.parsed_search_tokens, self.doc_type_filters,
                              self.collection_filters,
                              self.availability_filters, self.term_filters))
        store_cmd = '''REPLACE INTO results_frequencies (tokens,
                                                        doc_type_filters,
                                                        collection_filters,
                                                        availability_filters,
                                                        term_filters,
                                                        query_hash,
                                                        results,
                                                        last_accessed,
                                                        count_accessed
                                                        )
                                    VALUES(%s, %s, %s, %s, %s, %s, %s, DATE(NOW()), 0);'''
        con, cur = database.connect()
        cur.execute(store_cmd,
                    (str(self.parsed_search_tokens), str(
                        self.doc_type_filters), str(self.collection_filters),
                     str(self.availability_filters), str(self.term_filters),
                     hash, json.dumps(self.generate_results_dict())))
        con.commit()
        con.close()

    def generate_results_dict(self):

        return {
            'error': self.errors,
            'data': {
                'tokens':
                self.tokens_data,  # using 'tokens' for backwards compatibility
                'collections': self.collections,
                'doc_types': self.doc_types,
                'doc_type_groups': self.doc_type_groups
            }
        }

    def _get_active_filters(self):
        return {
            'doc_type': self.doc_type_filters,
            'collection': self.collection_filters,
            'availability': self.availability_filters,
            'term': self.term_filters
        }

    def compute_result(self, globals):
        """
        Computes the result for ngram

        >>> unparsed_search_tokens = ['addiction']
        >>> doc_type_filters = []
        >>> collection_filters = []
        >>> availability_filters = []
        >>> term_filters = []
        >>> globals = get_globals()
        >>> ngram = NgramResult(doc_type_filters, collection_filters, availability_filters,
        ...                     term_filters, unparsed_search_tokens=unparsed_search_tokens)
        >>> ngram.compute_result(globals)


        """

        check_param_type(self.unparsed_search_tokens, list,
                         'unparsed_search_tokens', 'NgramResult.__init__()')
        check_param_type(self.collection_filters, list, 'collection_filters',
                         'NgramResult.__init__()')
        check_param_type(self.availability_filters, list,
                         'availability_filters', 'NgramResult.__init__()')
        check_param_type(self.term_filters, list, 'term_filters',
                         'NgramResult.__init__()')
        check_param_type(self.doc_type_filters, list, 'doc_type_filters',
                         'NgramResult.__init__()')

        self.active_filters = self._get_active_filters()

        if len(self.term_filters) == 0:
            self.docs_or_sections = 'docs'
        else:
            self.docs_or_sections = 'sections'

        # Initialize multiprocessing queue to handle the results for the collections and document types
        mp_results_queue = multiprocessing.Queue()

        # parse the search tokens as a separate process...
        multiprocessing.Process(target=parse_search_tokens,
                                args=(self.unparsed_search_tokens,
                                      mp_results_queue)).start()

        # ... in the meantime, load and set the active doc type, collection, availability,
        # term, and combined filters. They are stored as self.doc_type_filters_np,
        # self.combined_filters_np...
        self._compute_set_active_filters_np(globals)

        # create a total count per year array. Add 1 to totals to avoid division by 0 errors.
        totals_vector = globals['totals']['totals'][
            self.docs_or_sections]['np']
        self.totals_years = totals_vector.convert_to_year_array(
            filter_vec=self.combined_filters_np) + 1

        # get the parsed search tokens. If there were errors, return them.
        self.parsed_search_tokens, self.errors = mp_results_queue.get()
        if len(self.parsed_search_tokens) == 0:
            print({'error': self.errors})
            return {'error': self.errors}

        # get the count data for all tokens.
        # adds tokens_data (list of count/freq data for each token), aggregate (vector sum of all
        #  tokens), aggregate_years (aggregate as years vector)
        self._compute_add_tokens_data()

        # Second round of multiprocessing: calculate z-scores while adding collections and doc types
        multiprocessing.Process(target=get_z_scores,
                                args=(self.tokens_data, self.totals_years,
                                      mp_results_queue)).start()

        # add collections data
        self._compute_add_collection_data(globals)

        print("Collections")
        for i in self.collections:
            print(i)

        # add document type and document type group data
        self._compute_add_doc_type_data(globals)

        # release memory of variables that are no longer used
        self.aggregate = None
        self.aggregate_csc = None
        self.totals_years = None
        self.combined_filters_np = None
        self.collection_filters_np = None
        self.doc_type_filters_np = None

        mp_result = mp_results_queue.get()
        z_scores = mp_result[1]
        for token_id in range(len(z_scores)):
            self.tokens_data[token_id]['z_scores'] = z_scores[token_id].tolist(
            )

    def _compute_set_active_filters_np(self, globals):
        """ Applies all filters to both the term and a copy of the totals vector and sets them

        All filters are np uint8 Vectors. The following filters are set in this function:
        doc_type_filters_np
        collection_filters_np
        availability_filters_np
        term_filters_np
        combined_filters_np

        6/10/17 Added availability filter.
        The idea is that every time the availability filter is used, the collection and doc type filters get multiplied with it.
        That is to say: the availability filter is not used on its own.

        7/25/17 Added term filter

        12/19/18: Moved to calculate_ngrams from preprocessing filters as this is a calculation
        step, not a pre-processing step

        >>> globals = get_globals(load_only_docs=True)
        >>> dt_filters = ['internal communication']
        >>> col_filters = [2,3]
        >>> avail_filters = ['no restrictions']
        >>> term_filters = []
        >>> search_tokens = ['addiction']
        >>> ngram = NgramResult(dt_filters, col_filters, avail_filters, term_filters, search_tokens)
        >>> ngram.docs_or_sections = 'docs'
        >>> ngram._compute_set_active_filters_np(globals=globals)
        >>> print(ngram.combined_filters_np)
        <Document Vector of type np_uint8 with 4964 elements.>

        :return: None
        """

        filters = globals['filters'][self.docs_or_sections]

        if 'term' not in self.active_filters:
            self.active_filters['term'] = {}

        # all filters used here are unweighted
        # 8/31/18: At some point, I had the idea that a document with 10 document types would give weight 1/10 to each.
        weighted = False
        filter_len = DOC_COUNT
        if self.docs_or_sections == 'sections':
            filter_len = SECTION_COUNT

        # process availability filters
        if len(self.active_filters['availability']) == 0:
            self.availability_filters_np = None
        else:
            self.availability_filters_np = None
            for filter_name in self.active_filters['availability']:
                if self.availability_filters_np is None:
                    self.availability_filters_np = filters['availability'][(
                        filter_name, weighted)].copy()
                else:
                    self.availability_filters_np += filters['availability'][(
                        filter_name, weighted)]
            self.availability_filters_np.convert_to_datatype('np_uint8')

        # process term filters
        if len(self.active_filters['term']) == 0:
            self.term_filters_np = None
        else:
            self.term_filters_np = None
            for filter_name in self.active_filters['term']:
                if self.term_filters_np is None:
                    self.term_filters_np = Vector().load_token_vector(
                        filter_name,
                        return_type='np_uint8',
                        docs_or_sections=self.docs_or_sections)
                else:
                    self.term_filters_np += Vector().load_token_vector(
                        filter_name,
                        return_type='np_uint8',
                        docs_or_sections=self.docs_or_sections)

        # process doc_type filters
        if len(self.active_filters['doc_type']) == 0:
            self.doc_type_filters_np = Vector(
                np.ones(filter_len, dtype=np.uint8))
        else:
            self.doc_type_filters_np = None
            for filter_name in self.active_filters['doc_type']:
                if self.doc_type_filters_np is None:
                    self.doc_type_filters_np = filters['doc_type'][(
                        filter_name, weighted)].copy()
                else:
                    self.doc_type_filters_np += filters['doc_type'][(
                        filter_name, weighted)]
            self.doc_type_filters_np.convert_to_datatype('np_uint8')

        # process collection filters
        if len(self.active_filters['collection']) == 0:
            self.collection_filters_np = Vector(
                np.ones(filter_len, dtype=np.uint8))
        else:
            self.collection_filters_np = None
            for filter_name in self.active_filters['collection']:
                if self.collection_filters_np is None:
                    self.collection_filters_np = filters['collection'][(
                        filter_name, weighted)].copy()
                else:
                    self.collection_filters_np += filters['collection'][(
                        filter_name, weighted)]
            self.collection_filters_np.convert_to_datatype('np_uint8')

        # Apply term filter to doc type and collection filters
        if self.term_filters_np is not None:
            self.doc_type_filters_np.filter_with(self.term_filters_np)
            self.collection_filters_np.filter_with(self.term_filters_np)

        # Apply availability filter to doc type and collection filters
        if self.availability_filters_np is not None:
            self.doc_type_filters_np.filter_with(self.availability_filters_np)
            self.collection_filters_np.filter_with(self.availability_filters_np)

        # Create final filter
        if len(self.active_filters['doc_type']) == 0:
            self.combined_filters_np = self.collection_filters_np
        elif len(self.active_filters['collection']) == 0:
            self.combined_filters_np = self.doc_type_filters_np
        else:
            self.combined_filters_np = self.collection_filters_np.filter_with(
                self.doc_type_filters_np, return_copy=True)

    def _compute_add_tokens_data(self):
        """
        Load counts, frequencies, and totals for each token.

        12/18/18 Moved from preprocessing_tokens and implemented for use with the NgramResult class,
        i.e. it won't return a df but individual vars.

        :return: None
        """

        self.tokens_data = []
        self.aggregate = None

        for token in self.parsed_search_tokens:
            # Load token and totals
            try:
                loaded_vector = Vector().load_token_vector(
                    token,
                    return_type='np_int32',
                    docs_or_sections=self.docs_or_sections)
            except FileNotFoundError:
                print('Could not load token {}.'.format(token))
                continue

            # initialize aggregate
            if self.aggregate is None:
                self.aggregate = loaded_vector.copy()
            else:
                self.aggregate += loaded_vector

            absolute_counts = loaded_vector.convert_to_year_array(
                filter_vec=self.combined_filters_np)

            self.tokens_data.append({
                'token': token,
                'counts': absolute_counts,
                'frequencies': absolute_counts / self.totals_years,
                'total': absolute_counts.sum
            })

        self.tokens_data = sorted(self.tokens_data,
                                  key=lambda k: k['total'],
                                  reverse=True)
        self.aggregate.filter_with(self.combined_filters_np)
        self.aggregate_csc = self.aggregate.copy().convert_to_datatype('csc')

    def _compute_add_collection_data(self, globals):

        # Sort filters by number of documents they represent
        filter_sums = []
        for filter_name in globals['filters'][
                self.docs_or_sections]['collection']:
            if filter_name == ('msa_bat', False):
                continue

            filter = globals['filters'][
                self.docs_or_sections]['collection'][filter_name]
            if filter.sum > 0:
                filter_sums.append((filter_name, filter.sum))
        filter_sums_sorted = sorted(filter_sums,
                                    key=lambda x: x[1],
                                    reverse=True)

        # Select 9 collections with the most documents
        cols_filtered = []
        for filter_name, filter_sum in filter_sums_sorted:

            # if a collection's document count is lower than the current 9th-ranked
            # collection's total, skip it because it cannot make the top 9.
            filter = globals['filters'][
                self.docs_or_sections]['collection'][filter_name]
            if len(cols_filtered
                   ) > 9 and cols_filtered[8]['total'] > filter_sum:
                continue
            cols_filtered = cols_filtered[:9]

            col_filtered = self.aggregate_csc.convert_to_year_array(
                filter_vec=filter)

            cols_filtered.append({
                'name': filter_name[0],
                'absolute_counts': col_filtered,
                'total': col_filtered.sum
            })
            if len(cols_filtered) >= 9:
                cols_filtered = sorted(cols_filtered,
                                       key=lambda x: x['total'],
                                       reverse=True)

        cols_filtered = cols_filtered[:9]

        results = []

        for col in cols_filtered:
            name = col['name']
            collection_totals = globals['totals']['collection'][
                self.docs_or_sections][name]
            collection_totals_filtered = collection_totals.convert_to_year_array(
                filter_vec=self.doc_type_filters_np)
            relative_frequencies = col[
                'absolute_counts'] / collection_totals_filtered

            results.append({
                'token':
                globals['collections_and_idx_dict'][name]['name_short'],
                'counts':
                col['absolute_counts'],
                'frequencies':
                relative_frequencies,
                'total':
                col['total']
            })

        self.collections = results

    def _compute_add_doc_type_data(self, globals):

        # First, add all of the doc_type_groups
        dts = []
        for dt_group_name in [
                'internal communication', 'marketing documents',
                'internal scientific reports', 'news reports',
                'scientific publications', 'court documents'
        ]:
            dt = {'token': dt_group_name}
            dt_group_filter = globals['filters'][
                self.docs_or_sections]['doc_type'][(dt_group_name, False)]
            agg_filtered_with_dt = self.aggregate_csc.convert_to_year_array(
                filter_vec=dt_group_filter)
            dt['absolute_counts'] = agg_filtered_with_dt
            dt['total'] = agg_filtered_with_dt.sum
            dt_group_totals = globals['totals']['doc_type'][
                self.docs_or_sections][dt_group_name]
            dt_group_totals_filtered = dt_group_totals.convert_to_year_array(
                filter_vec=self.collection_filters_np)
            freqs = dt['absolute_counts'] / dt_group_totals_filtered
            freqs.vector = np.nan_to_num(freqs.vector)
            dt['frequencies'] = freqs
            dts.append(dt)

        # Second, find the 9 most frequent document types to process
        dts_filtered = []
        for i in range(275):
            dt_name = globals['doc_type_and_idx_dict'][i]

            # 1/2019: the following dts are missing: 99 journal, 208 magazine,
            # 230 report - clinical study, 243 paper, 248 non printable/unable
            # 264 conference proceedings. Unclear why but these are small collections so it
            # shouldn't matter.
            try:
                dt_filter = globals['filters'][
                    self.docs_or_sections]['doc_type'][(dt_name, False)]
            except KeyError:
                print(i, dt_name)
                continue
            dt_filter_sum = dt_filter.sum
            if len(dts_filtered
                   ) > 9 and dts_filtered[8]['total'] > dt_filter_sum:
                continue
            dts_filtered = dts_filtered[:9]

            agg_filtered_with_dt = self.aggregate_csc.convert_to_year_array(
                filter_vec=dt_filter)
            dts_filtered.append({
                'name': dt_name,
                'absolute_counts': agg_filtered_with_dt,
                'total': agg_filtered_with_dt.sum
            })
            if len(dts_filtered) >= 9:
                dts_filtered = sorted(dts_filtered,
                                      key=lambda x: x['total'],
                                      reverse=True)

        for dt in dts_filtered:
            dt_totals = globals['totals']['doc_type'][self.docs_or_sections][
                dt['name']]
            dt_totals_filtered = dt_totals.convert_to_year_array(
                filter_vec=self.collection_filters_np)
            freqs = dt['absolute_counts'] / dt_totals_filtered
            freqs.vector = np.nan_to_num(freqs.vector)
            dt['frequencies'] = freqs
            dts.append(dt)

        self.doc_types = dts
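# A minimal, self-contained sketch of the multiprocessing pattern used in
# compute_result(): a task runs in a child process and posts its result to a
# queue while the parent keeps working, then the parent blocks on queue.get().
# parse_tokens_worker is a stand-in for parse_search_tokens / get_z_scores.
import multiprocessing


def parse_tokens_worker(tokens, queue):
    parsed = [t.strip().lower() for t in tokens]   # stand-in for real parsing
    queue.put((parsed, ''))                        # (parsed_tokens, errors)


if __name__ == '__main__':
    mp_results_queue = multiprocessing.Queue()
    multiprocessing.Process(target=parse_tokens_worker,
                            args=(['Addiction '], mp_results_queue)).start()
    # ... the parent would load and combine filters here ...
    parsed_tokens, errors = mp_results_queue.get()   # blocks until the child finishes
    print(parsed_tokens, repr(errors))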
Example #11
def find_and_store_policy(term='and', filtered_collection_vector=None, filtered_totals_year_vector=None,
                          collection=None):


    db = sqlite3.connect('policies.db')
    cur = db.cursor()

    col_id = COL_NAME_TO_ID[collection]

    if filtered_collection_vector is None:
        filtered_collection_vector = FILTERS['doc_type'][('internal communication', False)].copy().convert_to_datatype('np_uint8')
        filtered_collection_vector.filter_with(FILTERS['collection'][(col_id, False)].convert_to_datatype('np_uint8'))
        filtered_totals_year_vector = TOTALS_COL[col_id].convert_to_year_array(filter_vec=filtered_collection_vector)

    term_v = Vector().load_token_vector(token=term)
    term_year_vector = term_v.convert_to_year_array(filter_vec=filtered_collection_vector)


    dunnings = {}

    for start_first_period in range(50, 90):
        end_first_period = start_first_period + 3
        policy_year = start_first_period + 4
        start_second_period = start_first_period + 5
        end_second_period = start_first_period + 8

        term_count_first = term_year_vector[start_first_period : end_first_period+1].sum()
        term_count_second = term_year_vector[start_second_period : end_second_period+1].sum()

        totals_first = filtered_totals_year_vector[start_first_period : end_first_period+1].sum()
        totals_second = filtered_totals_year_vector[start_second_period : end_second_period+1].sum()

        dunning = dunning_log_likelihood(term_count_first, term_count_second,
                                         totals_first, totals_second)

        dunnings[policy_year] = {
            'year': f'19{policy_year}',
            'dunning': dunning,
            'first': term_count_first,
            'first_freq': term_count_first / totals_first*100,
            'second': term_count_second,
            'second_freq': term_count_second / totals_second*100
        }
#        print(f'19{start_first_period}-19{end_first_period} vs. 19{start_second_period}-'
#        f'19{end_second_period}: {dunning}. 1: {term_count_first}. 2: {term_count_second}')

    dunnings_sorted = sorted(dunnings.items(), key=lambda x:x[1]['dunning'])

    policy_adoption = dunnings_sorted[-1][1]
    policy_ending = dunnings_sorted[0][1]

    policy = '{}. {:15s}. Adoption: {}. D: {:7.0f}. C1: {:9d}. C2: {:9d}. F: {:5.3f}. ' \
             'Ending: {}. D:{:7.0f}. C1: {:9d}. C2: {:9d}. F: {:5.3f}.'.format( collection, term,
        policy_adoption['year'], policy_adoption['dunning'], policy_adoption['first'],
        policy_adoption['second'], policy_adoption['first_freq']/ policy_adoption['second_freq'],
        policy_ending['year'], policy_ending['dunning'], policy_ending['first'],
        policy_ending['second'], policy_ending['first_freq']/ policy_ending['second_freq']
    )
    print(policy)

    cur.execute('''INSERT INTO policies_5p VALUES(?, ?, ?, ?, ?, ?, ?,
                                                  ?, ?, ?, ?, ?, ?, ?)''', (
        collection, term,
        policy_adoption['year'], policy_adoption['dunning'], policy_adoption['first'],
        policy_adoption['second'], policy_adoption['first_freq'], policy_adoption['second_freq'],
        policy_ending['year'], policy_ending['dunning'], policy_ending['first'],
        policy_ending['second'], policy_ending['first_freq'], policy_ending['second_freq']
    ))
    db.commit()
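# dunning_log_likelihood() is referenced above but not shown here. Below is a
# plausible, self-contained sketch based on the standard Dunning (1993)
# log-likelihood (G2) statistic; the project's actual implementation may differ,
# for instance in how it signs the result.
import math


def dunning_log_likelihood(count1, count2, total1, total2):
    """G2 for a term seen count1/total1 times in period 1 vs count2/total2 in period 2."""
    expected1 = total1 * (count1 + count2) / (total1 + total2)
    expected2 = total2 * (count1 + count2) / (total1 + total2)
    g2 = 2 * ((count1 * math.log(count1 / expected1) if count1 else 0) +
              (count2 * math.log(count2 / expected2) if count2 else 0))
    # positive if the term is over-represented in the second period, negative otherwise
    return g2 if count2 / total2 > count1 / total1 else -g2


print(round(dunning_log_likelihood(10, 200, 100000, 100000), 1))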
Example #12
def get_absolute_google_counts(token_name: str) -> np.ndarray:
    """    This function retrieves the absolute counts for a given token from the Google Ngram Viewer.

    It first loads the relative frequencies from the ngram viewer and the absolute counts
    for the corpus from Google's source data.
    Then, it multiplies the absolute number of terms in the corpus for any given year with the
    relative frequency of the search token.

    >>> google_counts = get_absolute_google_counts('addiction')
    >>> print(f'Google counts for addiction in 1950: {google_counts[50]}')
    Google counts for addiction in 1950: 2482.0

    >>> type(google_counts)
    <class 'tobacco.utilities.vector.Vector'>

    """

    hash = hashlib.sha256(token_name.encode()).hexdigest()
    file_path = Path(PATH_GOOGLE_TOKENS, hash[0], hash[1], hash[2], hash[3],
                     f'{hash}.npy')

    try:
        # this really doesn't need a hash
        absolute_counts = Vector().load_from_disk(file_path,
                                                  return_type='np_int32')
#        absolute_counts = np.load(token_path+hash_path+'.npy')

    except FileNotFoundError:

        corpus_id = 15
        # construct the url, i.e. place the token and other parameters where they belong
        url = 'https://books.google.com/ngrams/interactive_chart?content={}&year_start={}&year_end={}' \
              '&corpus={}&smoothing=0'.format(token_name.replace(' ', '+'), YEAR_START, YEAR_END, corpus_id)

        try:
            with urllib.request.urlopen(url, timeout=1) as response:
                page = response.read().decode('utf-8')

                if page.find('var data = [];') > -1:
                    relative_frequencies = 116 * [0]
                else:

                    start = page.find('var data = [')
                    end = page.find('}];', start)
                    data_dict = json.loads(page[start + 11:end + 2])[0]
                    relative_frequencies = data_dict['timeseries']
                    relative_frequencies += 8 * [relative_frequencies[-1]]

        except urllib.error.HTTPError:
            relative_frequencies = 116 * [0]

        # if general error, return 0 but don't store
        except Exception:
            temp = 116 * [0]
            return np.array(
                [round(temp[i] * GOOGLE_TOTALS[i]) for i in range(len(temp))],
                dtype=np.float64)

        # Step 3: calculate the absolute number of appearances by multiplying the frequencies with the total number of tokens
        absolute_counts = [
            round(relative_frequencies[i] * GOOGLE_TOTALS[i])
            for i in range(len(relative_frequencies))
        ]
        absolute_counts = Vector(np.array(absolute_counts, dtype=np.float64))
        absolute_counts.save_to_disk(file_path)


#        hash = hashlib.sha256(token_name.encode()).hexdigest()
#        file_path = Path(PATH_GOOGLE_TOKENS, hash[0], hash[1], hash[2], hash[3])

#        token_path = PATH_GOOGLE_TOKENS + '{}/{}/{}/{}/'.format(hash_path[0], hash_path[1], hash_path[2], hash_path[3])

#       if not Path.exists(file_path):
#           print(file_path)
#           Path.mkdir(file_path, parents=True)
#        np.save(token_path + hash_path, absolute_counts)

    return absolute_counts
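# Self-contained sketch of the scraping step above: the ngram viewer page embeds
# a JavaScript literal "var data = [...];", and the code slices out the JSON list
# and reads its 'timeseries'. A synthetic page string stands in for the real
# HTTP response.
import json

page = 'var data = [{"ngram": "addiction", "timeseries": [1.2e-06, 1.4e-06]}];'
start = page.find('var data = [')
end = page.find('}];', start)
data_dict = json.loads(page[start + 11:end + 2])[0]
print(data_dict['timeseries'])   # [1.2e-06, 1.4e-06]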
Example #13
import numpy as np

from tobacco.utilities.vector import Vector

GOOGLE_TOTALS = Vector(
    np.array([
        1285712637.0, 1311315033.0, 1266236889.0, 1405505328.0, 1351302005.0,
        1397090480.0, 1409945274.0, 1417130893.0, 1283265090.0, 1354824248.0,
        1350964981.0, 1431385638.0, 1356693322.0, 1324894757.0, 1211361619.0,
        1175413415.0, 1183132092.0, 1039343103.0, 1136614538.0, 1388696469.0,
        1216676110.0, 1413237707.0, 1151386048.0, 1069007206.0, 1113107246.0,
        1053565430.0, 1216023821.0, 1212716430.0, 1153722574.0, 1244889331.0,
        1183806248.0, 1057602772.0, 915956659.0, 1053600093.0, 1157109310.0,
        1199843463.0, 1232280287.0, 1261812592.0, 1249209591.0, 1179404138.0,
        1084154164.0, 1045379066.0, 890214397.0, 812192380.0, 926378706.0,
        1203221497.0, 1385834769.0, 1486005621.0, 1641024100.0, 1644401950.0,
        1603394676.0, 1621780754.0, 1590464886.0, 1662160145.0, 1751719755.0,
        1817491821.0, 1952474329.0, 1976098333.0, 2064236476.0, 2341981521.0,
        2567977722.0, 2818694749.0, 2955051696.0, 2931038992.0, 3300623502.0,
        3466842517.0, 3658119990.0, 3968752101.0, 3942222509.0, 4086393350.0,
        4058576649.0, 4174172415.0, 4058707895.0, 4045487401.0, 4104379941.0,
        4242326406.0, 4314577619.0, 4365839878.0, 4528331460.0, 4611609946.0,
        4627406112.0, 4839530894.0, 4982167985.0, 5309222580.0, 5475269397.0,
        5793946882.0, 5936558026.0, 6191886939.0, 6549339038.0, 7075013106.0,
        6895715366.0, 7596808027.0, 7492130348.0, 8027353540.0, 8276258599.0,
        8745049453.0, 8979708108.0, 9406708249.0, 9997156197.0, 11190986329.0,
        11349375656.0, 12519922882.0, 13632028136.0, 14705541576.0,
        14425183957.0, 15310495914.0, 16206118071.0, 19482936409.0,
        19482936409.0, 19482936409.0, 19482936409.0, 19482936409.0,
        19482936409.0, 19482936409.0, 19482936409.0, 19482936409.0
    ],
             dtype=np.float64))

Example #14
def get_filter(search_term, filter_type, weight=False, return_type='csc', docs_or_sections='docs'):
    '''
    Creates a binary filter (True if the document has the specified doc_type or collection, False otherwise).
    :param search_term:
    :param filter_type: 'collection' or 'doc_type'
    :param weight:
    :return: Vector

    >>> filter = get_filter('letter', 'doc_type', weight=False, return_type='csc')
    >>> filter
    <Document Vector of type csc with 2490726 elements.>

    '''


    try:
        # can't store terms with a forward slash -> replace with underscore
        if filter_type == 'doc_type': search_term = search_term.replace('/', '_')
        file_name = '{}_{}_{}_{}'.format(search_term, filter_type, docs_or_sections, weight)
        file_path = Path(PATH_TOKENIZED, 'filters', file_name)
        filter = Vector().load_from_disk(file_path, return_type=return_type)

#        filter =  load_csc_matrix_from_file(PATH_TOKENIZED + 'filters/{}_{}_{}_{}'.format(search_term, filter_type, docs_or_sections, weight))


    except IOError:


        db = Database("TOB_FULL")
        con, cur = db.connect()

        if docs_or_sections == 'docs':
            filter_len = DOC_COUNT
        elif docs_or_sections == 'sections':
            filter_len = SECTION_COUNT
        else:
            raise ValueError("param docs_or_sections has to be either 'docs' or sections' but not ''{}".format(docs_or_sections))

        if weight:
            filter = np.zeros((filter_len, 1), dtype=np.float64)
        else:
            filter = np.zeros((filter_len, 1), dtype=bool)


        if filter_type == 'collection':
            cur.execute("SELECT id, no_tokens from docs where collection_id = '{}' ORDER BY id ASC".format(search_term))
        elif filter_type == 'doc_type':
            if weight:
                cur.execute('''SELECT doc_types.doc_id as id, docs.no_tokens as no_tokens, doc_types.weight as weight
                                  FROM doc_types, docs WHERE doc_type = "{}" and doc_types.doc_id = docs.id ORDER BY id ASC'''.format(search_term))
            else:
                cur.execute('SELECT doc_types.doc_id as id, docs.no_tokens as no_tokens FROM doc_types, docs '
                            '     WHERE doc_type = "{}" and doc_types.doc_id = docs.id ORDER BY id ASC'.format(search_term))
        elif filter_type == 'availability':
            # dict maps from search term to the requisite where clause (documents can be both formerly privileged and formerly confidential)
            term_to_mysql_where_clauses_dict = {
                'no restrictions': 'WHERE availability = "public;no restrictions"',
                'formerly confidential': 'WHERE availability = "public;formerly confidential" OR availability = "public;formerly confidential; formerly privileged" OR availability = "public;formerly privileged; formerly confidential"',
                'formerly privileged': 'WHERE availability = "public;formerly privileged" OR availability = "public;formerly confidential; formerly privileged" OR availability = "public;formerly privileged; formerly confidential"'
            }
            cur.execute('SELECT id, no_tokens from docs {} ORDER BY id ASC'.format(term_to_mysql_where_clauses_dict[search_term]))
        else:
            raise KeyError("{} is not a valid filter_type. Valid filter types are 'collection', 'doc_type', and 'availability'".format(filter_type))

        if docs_or_sections == 'sections':
            doc_id_to_section_dict = get_doc_id_to_section_id_dict()

        rows = cur.fetchall()
        for row in rows:
            if docs_or_sections == 'docs':
                if weight:
                    filter[row['id']] = row['weight']
                else:
                    filter[row['id'], 0] = True

            elif docs_or_sections == 'sections':
                first_id, last_id = doc_id_to_section_dict[row['id']]
                for section_id in range(first_id, last_id+1):
                    filter[section_id] = True

        filter = csc_matrix(filter)
        if filter_type == 'doc_type': search_term = search_term.replace('/', '_')

        filter_path = PATH_TOKENIZED + 'filters/{}_{}_{}_{}.npz'.format(search_term, filter_type, docs_or_sections, weight)
        store_csr_matrix_to_file(filter, filter_path)
        print("Created filter for {} with {} elements.".format(search_term, filter.getnnz()))

        return get_filter(search_term, filter_type, weight=weight, return_type=return_type, docs_or_sections=docs_or_sections)

#    if return_type == 'np':
#        filter = np.array(filter.todense()).flatten()

    return filter
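# Self-contained sketch of the doc-to-section expansion used above: each matching
# document id maps to an inclusive (first_section_id, last_section_id) range, and
# every section in that range is switched on in the section-level filter.
# Toy data; the real mapping comes from get_doc_id_to_section_id_dict().
import numpy as np

doc_id_to_section = {0: (0, 2), 1: (3, 3), 2: (4, 6)}   # doc id -> section id range
matching_doc_ids = [0, 2]
section_filter = np.zeros(7, dtype=bool)

for doc_id in matching_doc_ids:
    first_id, last_id = doc_id_to_section[doc_id]
    section_filter[first_id:last_id + 1] = True

print(section_filter.astype(np.uint8))   # [1 1 1 0 1 1 1]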
Example #15
def get_active_filters_np(active_filters, FILTERS, docs_or_sections='docs', return_type=np.uint8):

    """ Applies all filters to both the term and a copy of the totals vector and returns them

    6/10/17 Added availability filter.
    The idea is that every time the availability filter is used, the collection and doc type filters get multiplied with it.
    That is to say: the availability filter is not used on its own.

    7/25/17 Added term filter

    >>> FILTERS = {
    ...     'docs':{
    ...         'doc_type': get_doc_type_filters(return_type='csc', docs_or_sections='docs'),
    ...         'collection': get_collection_filters(return_type='csc', docs_or_sections='docs'),
    ...         'availability': get_availability_filters(return_type='csc', docs_or_sections='docs')
    ...     },
    ...     'sections':{
    ...         'doc_type': get_doc_type_filters(return_type='csc', docs_or_sections='sections'),
    ...         'collection': get_collection_filters(return_type='csc', docs_or_sections='sections'),
    ...         'availability': get_availability_filters(return_type='csc', docs_or_sections='sections')
    ...     }
    ... }
    >>> active_filters = {
    ...     'doc_type': {'internal communication'},
    ...     'collection': {2,3},
    ...     'availability': {'no restrictions'},
    ...     'term': {}
    ... }
    >>> doc_type_filter, collection_filter, final_filter = get_active_filters_np(
    ...         active_filters=active_filters, FILTERS=FILTERS, docs_or_sections='docs')


    :param active_filters: dict of lists, e.g. {'doc_type': ["internal communication"], 'collection': [1,2],
                                                'availability': [], 'term': []}
    :param FILTERS: dict of filter Vectors from globals (csc filters)
    :param return_type: e.g. np.uint8 or np.int32. By default, the same data type as the input, usually np.uint8
    :param docs_or_sections: 'docs' or 'sections'
    :return: doc_type_filter, collection_filter, final_filter
    """

    if 'term' not in active_filters:
        active_filters['term'] = {}

    # all filters used here are unweighted
    # 8/31/18: At some point, I had the idea that a document with 10 document types would give weight 1/10 to each.
    weighted = False
    filter_len = DOC_COUNT
    if docs_or_sections == 'sections':
        filter_len = SECTION_COUNT

    # process availability filters
    if len(active_filters['availability']) == 0:
        availability_filter = None
    else:
        availability_filter = None
        for filter_name in active_filters['availability']:
            if availability_filter is None:
                availability_filter = FILTERS[docs_or_sections]['availability'][(filter_name, weighted)].copy()
            else:
                availability_filter += FILTERS[docs_or_sections]['availability'][(filter_name, weighted)]
        availability_filter.convert_to_datatype('np_uint8')

    # process term filters
    if len(active_filters['term']) == 0:
        term_filter = None
    else:
        term_filter = None
        for filter_name in active_filters['term']:
            if term_filter is None:
                term_filter = Vector().load_token_vector(filter_name, return_type='np_uint8',
                                                         docs_or_sections=docs_or_sections)
            else:
                term_filter += Vector().load_token_vector(filter_name, return_type='np_uint8',
                                                          docs_or_sections=docs_or_sections)

    # process doc_type filters
    if len(active_filters['doc_type']) == 0:
        doc_type_filter = np.ones(filter_len, dtype='bool')
    else:
        doc_type_filter = None
        for filter_name in active_filters['doc_type']:
            if doc_type_filter is None:

                doc_type_filter = FILTERS[docs_or_sections]['doc_type'][(filter_name, weighted)].copy()
            else:
                doc_type_filter += FILTERS[docs_or_sections]['doc_type'][(filter_name, weighted)]

        # if docs_or_sections == 'sections':
        doc_type_filter.convert_to_datatype('np_uint8')

    # process collection filters
    if len(active_filters['collection']) == 0:
        collection_filter = np.ones(filter_len, dtype=np.uint8)
    else:
        collection_filter = None
        for filter_name in active_filters['collection']:
            if collection_filter is None:
                collection_filter = FILTERS[docs_or_sections]['collection'][(filter_name, weighted)].copy()
            else:
                collection_filter += FILTERS[docs_or_sections]['collection'][(filter_name, weighted)]
        collection_filter.convert_to_datatype('np_uint8')

    # Apply term filter to doc type and collection filters
    if term_filter is not None:
        doc_type_filter.filter_with(term_filter)
        collection_filter.filter_with(term_filter)

    # Apply availability filter to doc type and collection filters
    if availability_filter is not None:
        doc_type_filter.filter_with(availability_filter)
        collection_filter.filter_with(availability_filter)

    # Create final filter
    if len(active_filters['doc_type']) == 0:
        final_filter = collection_filter
    elif len(active_filters['collection']) == 0:
        final_filter = doc_type_filter
    else:
        final_filter = collection_filter.filter_with(doc_type_filter, return_copy=True)

    return doc_type_filter, collection_filter, final_filter
Example #16
def create_filter(search_term: Union[str, int], filter_type: str, weight: bool,
                  return_type: str, docs_or_sections: str):

    '''
    Creates a filter vector for collections, doc types, and availability.

    12/20/18: Separated from get_filter (moved to Vector class)

    :param search_term:
    :param filter_type:
    :param weight:
    :param return_type:
    :param docs_or_sections:
    :return:
    '''

    db = Database("TOB_FULL")
    con, cur = db.connect()

    if docs_or_sections == 'docs':
        filter_len = DOC_COUNT
    elif docs_or_sections == 'sections':
        filter_len = SECTION_COUNT
    else:
        raise ValueError("param docs_or_sections has to be either 'docs' or sections' but not ''{}".format(docs_or_sections))

    if weight:
        filter = np.zeros((filter_len, 1), dtype=np.float64)
    else:
        filter = np.zeros((filter_len, 1), dtype=bool)

    if filter_type == 'collection':
        cur.execute("SELECT id, no_tokens from docs where collection_id = '{}' ORDER BY id ASC".format(search_term))
    elif filter_type == 'doc_type':
        if weight:
            cur.execute('''SELECT doc_types.doc_id as id, docs.no_tokens as no_tokens, doc_types.weight as weight
                                      FROM doc_types, docs WHERE doc_type = "{}" and doc_types.doc_id = docs.id ORDER BY id ASC'''.format(search_term))
        else:
            cur.execute('SELECT doc_types.doc_id as id, docs.no_tokens as no_tokens FROM doc_types, docs '
                        '     WHERE doc_type = "{}" and doc_types.doc_id = docs.id ORDER BY id ASC'.format(search_term))
    elif filter_type == 'availability':
        # dict maps from search term to the requisite where clause (documents can be both formerly privileged and formerly confidential)
        term_to_mysql_where_clauses_dict = {
            'no restrictions': 'WHERE availability = "public;no restrictions"',
            'formerly confidential': 'WHERE availability = "public;formerly confidential" OR availability = "public;formerly confidential; formerly privileged" OR availability = "public;formerly privileged; formerly confidential"',
            'formerly privileged': 'WHERE availability = "public;formerly privileged" OR availability = "public;formerly confidential; formerly privileged" OR availability = "public;formerly privileged; formerly confidential"'
        }
        cur.execute('SELECT id, no_tokens from docs {} ORDER BY id ASC'.format(term_to_mysql_where_clauses_dict[search_term]))
    else:
        raise KeyError("{} is not a valid filter_type. Valid filter types are 'collection', 'doc_type', and 'availability'".format(filter_type))

    if docs_or_sections == 'sections':
        doc_id_to_section_dict = get_doc_id_to_section_id_dict()

    rows = cur.fetchall()
    for row in rows:
        if docs_or_sections == 'docs':
            if weight:
                filter[row['id']] = row['weight']
            else:
                filter[row['id'], 0] = True

        elif docs_or_sections == 'sections':
            first_id, last_id = doc_id_to_section_dict[row['id']]
            for section_id in range(first_id, last_id + 1):
                filter[section_id] = True


    filter_vec = Vector(csc_matrix(filter))
    if filter_type == 'doc_type':
        search_term = search_term.replace('/', '_')
    file_name = '{}_{}_{}_{}'.format(search_term, filter_type, docs_or_sections, weight)
    file_path = Path(PATH_TOKENIZED, 'filters', file_name)
    filter_vec.save_to_disk(file_path)



#    filter_path = PATH_TOKENIZED + 'filters/{}_{}_{}_{}.npz'.format(search_term, filter_type, docs_or_sections, weight)
#    store_csr_matrix_to_file(filter, filter_path)
    print("Created filter for {} with {} elements.".format(search_term, filter_vec.vector.getnnz()))