Example #1
def parse_text_passages_tokens(tokens):

    search_regexes = []

    intersection_vector = None

    first_token = tokens[0].strip('*')

    for token in tokens:
        # if wildcard, process wildcard
        if token.find('*') > -1:
            token_vector, token_regex = process_wildcard_token(token)

        # else: handle normal token
        else:
            try:

                token_vector = Vector().load_token_vector(
                    token, return_type='np_uint8', docs_or_sections='sections')
                token_regex = re.compile(r'\b{}\b'.format(token))

            # raises a FileNotFoundError when the token does not exist, e.g. 'compound w'
            except FileNotFoundError:
                token_vector, token_regex = process_nonexistant_token(token)

        if token_vector is not None:
            if intersection_vector is None:
                intersection_vector = token_vector
            else:
                intersection_vector *= token_vector

        search_regexes.append(token_regex)

    return first_token, search_regexes, intersection_vector
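# The loop above intersects token vectors by elementwise multiplication: a
# section stays in the result only if every token occurs in it. A minimal
# standalone sketch of that idea with plain numpy uint8 presence vectors
# (toy data, not the project's Vector class):
def _intersection_sketch():
    import numpy as np

    token_vectors = [
        np.array([1, 0, 1, 1], dtype=np.uint8),  # sections containing token A
        np.array([1, 1, 0, 1], dtype=np.uint8),  # sections containing token B
    ]
    intersection = None
    for vec in token_vectors:
        intersection = vec.copy() if intersection is None else intersection * vec
    return intersection  # array([1, 0, 0, 1]) -> only sections 0 and 3 match both tokens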
def get_ngram_vector(token,
                     return_type='csc',
                     return_sum=False,
                     docs_or_sections='docs'):
    """ Loads the ngram vector of the token

    E.g. v = get_ngram_vector('nicotine', return_type='csc', docs_or_sections='docs')

    :param token: search token, string
    :param return_type: 'csc', 'np', 'uint8'
    :param return_sum: Whether or not to return the sum of the vector.
    :param docs_or_sections: 'docs' or 'sections'
    :return:
    """

    # to distribute the millions of stored ngram vectors across the filesystem, they are stored under hashed paths
    hash = hashlib.sha256(token.encode()).hexdigest()

    h = hash
    if docs_or_sections == 'sections':
        h += '_sections'
    token_path = Path(PATH_TOKENS, hash[0], hash[1], hash[2], hash[3], h)

    ngram_vector = Vector()
    ngram_vector.load_from_disk(token_path)
    ngram_vector.convert_to_datatype(return_type)

    if return_sum:
        # sums the underlying csc / numpy data (assumes Vector exposes its raw
        # array as .vector, as elsewhere in this module)
        token_sum = ngram_vector.vector.sum()
        return ngram_vector, token_sum
    else:
        return ngram_vector
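# To spread the millions of stored vectors over the filesystem, the first four
# hex characters of the token's SHA-256 digest become nested directories. A
# standalone sketch of the same layout; 'tokens/' is a placeholder for PATH_TOKENS:
def _token_path_sketch(token, docs_or_sections='docs'):
    import hashlib
    from pathlib import Path

    digest = hashlib.sha256(token.encode()).hexdigest()
    name = digest + ('_sections' if docs_or_sections == 'sections' else '')
    return Path('tokens', digest[0], digest[1], digest[2], digest[3], name)

# e.g. _token_path_sketch('nicotine') -> tokens/<h1>/<h2>/<h3>/<h4>/<full 64-char digest>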
def get_doc_type_totals_vector(doc_type_name,
                               docs_or_sections='docs',
                               return_type='csc'):
    """
    Loads one doc_type totals vector

    >>> totals = get_doc_type_totals_vector('report', 'docs', 'csc')
    >>> totals
    <Document Vector of type csc with 1652572 elements and length 11303161.>


    :param doc_type_name:
    :param docs_or_sections:
    :param return_type:
    :return:
    """

    try:
        return Vector().load_totals_vector(doc_type_name.replace('/', '_'),
                                           'doc_type', docs_or_sections,
                                           return_type)
    except IOError:
        # Added 6/9/17. This is awkward (filters get loaded every time), but it
        # replaces an older solution that used filter_vector(), which is no longer
        # available. Loading the filters on demand avoids holding them in memory twice.

        print("Creating doc type totals vector for: ", doc_type_name,
              docs_or_sections)
        filters = get_filters(return_type='np')
        totals_vector = csc_to_np_int32(get_totals_vector(docs_or_sections))
        active_doc_type_filters_np, _, _ = get_active_filters_np(
            active_filters={
                'doc_type': {doc_type_name},
                'collection': {},
                'availability': {}
            },
            FILTERS=filters,
            docs_or_sections=docs_or_sections,
            return_type=np.uint8)

        filtered_dt = totals_vector * active_doc_type_filters_np
        # ugly, but it creates the required m x 1 sparse vector
        vec = csc_matrix(csc_matrix(filtered_dt).T, dtype=np.int64)

        store_csr_matrix_to_file(
            vec, PATH_TOKENIZED + 'totals/{}_{}'.format(
                doc_type_name.replace('/', '_'), docs_or_sections))

        return get_doc_type_totals_vector(doc_type_name, docs_or_sections,
                                          return_type)
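# The nested csc_matrix(...) calls above turn a flat numpy result into the
# m x 1 sparse column vector the totals code expects: csc_matrix(1-D array)
# yields a 1 x m row, and transposing it gives the column shape. Sketch with toy data:
def _column_vector_sketch():
    import numpy as np
    from scipy.sparse import csc_matrix

    filtered = np.array([0, 3, 0, 7], dtype=np.int64)  # stand-in for totals_vector * filter
    row = csc_matrix(filtered)                          # shape (1, 4)
    col = csc_matrix(row.T, dtype=np.int64)             # shape (4, 1)
    return col.shape                                    # (4, 1)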
Example #4
def process_all():

    create_sqlite_table()

    terms = []

    db = Database('TOB_FULL')
    con, cur = db.connect()

    cur.execute('SELECT token from tokens where total > 10000;')
    for row in cur.fetchall():
        term = row['token']
        valid = True

        for word in term.split():
            if len(word) == 1:
                valid = False
            try:
                int(word)
                valid = False
            except ValueError:
                pass
        if valid:
            terms.append(term)

    print("Number of terms: {}".format(len(terms)))


    for collection in COL_NAME_TO_ID:
        col_id = COL_NAME_TO_ID[collection]
        filtered_collection_vector = FILTERS['doc_type'][('internal communication', False)].copy().convert_to_datatype('np_uint8')
        filtered_collection_vector.filter_with(FILTERS['collection'][(col_id, False)].convert_to_datatype('np_uint8'))
        max_5p_filter = Vector().load_page_number_vector('max_5')
        print("pre", filtered_collection_vector)
        filtered_collection_vector.filter_with(max_5p_filter)
        print('post', filtered_collection_vector)

        if collection == 'msa_bat':
            # msa_bat combines several collections; copy first so the shared
            # TOTALS_COL entry is not modified in place
            totals = TOTALS_COL[5].copy()
            for other_id in [6, 7, 8, 9, 10, 11, 15]:
                totals += TOTALS_COL[other_id]
            print(totals)
        else:
            totals = TOTALS_COL[col_id]
        filtered_totals_year_vector = totals.convert_to_year_array(filter_vec=filtered_collection_vector)


        for term in terms:
            find_and_store_policy(term, filtered_collection_vector, filtered_totals_year_vector, collection)
Example #5
    def _compute_add_tokens_data(self):
        """
        Load counts, frequencies, and totals for each token.

        12/18/18 Moved from preprocessing_tokens and implemented for use with the NgramResult class,
        i.e. it won't return a df but individual vars.

        :return: None
        """

        self.tokens_data = []
        self.aggregate = None

        for token in self.parsed_search_tokens:
            # Load token and totals
            try:
                loaded_vector = Vector().load_token_vector(
                    token,
                    return_type='np_int32',
                    docs_or_sections=self.docs_or_sections)
            except FileNotFoundError:
                print('Could not load token {}.'.format(token))
                continue

            # initialize aggregate
            if self.aggregate is None:
                self.aggregate = loaded_vector.copy()
            else:
                self.aggregate += loaded_vector

            absolute_counts = loaded_vector.convert_to_year_array(
                filter_vec=self.combined_filters_np)

            self.tokens_data.append({
                'token': token,
                'counts': absolute_counts,
                'frequencies': absolute_counts / self.totals_years,
                'total': absolute_counts.sum()
            })

        self.tokens_data = sorted(self.tokens_data,
                                  key=lambda k: k['total'],
                                  reverse=True)
        self.aggregate.filter_with(self.combined_filters_np)
        self.aggregate_csc = self.aggregate.copy().convert_to_datatype('csc')
def get_collection_totals_vector(collection_id,
                                 docs_or_sections,
                                 return_type='csc'):
    """ Load the totals vector for one collection

    :param collection_id: id of the collection
    :param docs_or_sections: "docs" or "sections"
    :param return_type: 'csc' (csc sparse matrix) or 'np' (numpy array)
    :return:
    """

    try:
        return Vector().load_totals_vector(collection_id, 'collection',
                                           docs_or_sections, return_type)

    except IOError:

        print("Creating totals vector for collection, type: ", collection_id,
              docs_or_sections)

        filters = get_filters(return_type='np')
        totals_vector = csc_to_np_int32(get_totals_vector(docs_or_sections))
        _, active_collection_filters_np, _ = get_active_filters_np(
            active_filters={
                'doc_type': {},
                'collection': {collection_id},
                'availability': {}
            },
            FILTERS=filters,
            docs_or_sections=docs_or_sections,
            return_type=np.uint8)

        filtered_dt = totals_vector * active_collection_filters_np
        # ugly, but it creates the required m x 1 sparse vector
        csc = csc_matrix(csc_matrix(filtered_dt).T, dtype=np.int64)
        store_csr_matrix_to_file(
            csc, PATH_TOKENIZED +
            'totals/{}_{}'.format(collection_id, docs_or_sections))

    if return_type == 'csc':
        return csc_matrix(csc, dtype=np.int32)
    else:
        return csc_to_np_int32(csc)
Example #7
def get_doc_type_filters(return_type='csc', docs_or_sections='docs'):

    doc_types = get_dtype_dict()

    doc_type_filters = {}
    for doc_type in doc_types['valid']:
        for weight in [False]:
            if doc_type in [
                'letter', 'report', 'memo', 'email', 'note', 'publication', 'report, scientific', 'advertisement',
                'promotional material', 'budget', 'specification', 'budget_review', 'telex', 'news_article', 'agenda',
                'report, market research', 'speech', 'presentation', 'minutes'
            ]:
                doc_type_filters[(doc_type, weight)] = get_filter(doc_type, filter_type='doc_type',
                          weight=weight, return_type=return_type, docs_or_sections=docs_or_sections)
            else:
                doc_type_filters[(doc_type, weight)] = get_filter(doc_type, filter_type='doc_type',
                          weight=weight, return_type='csc', docs_or_sections=docs_or_sections)


    for group in doc_types['groups']:
        for weight in [False]:

            file_name = '{}_{}_{}_{}'.format(group, 'doc_type', docs_or_sections, weight)
            file_path = Path(PATH_TOKENIZED, 'filters', file_name)
            try:
                group_filter = Vector().load_from_disk(file_path, return_type=return_type)

            except FileNotFoundError:
                print("creating group filter for: ", group)
                group_filter = None
                for doc_type in doc_types['groups'][group]:
                    if group_filter is None:
                        group_filter = doc_type_filters[(doc_type, weight)]
                    else:
                        group_filter += doc_type_filters[(doc_type, weight)]

                group_filter.save_to_disk(file_path)

            doc_type_filters[(group, weight)] = group_filter

    return doc_type_filters
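# Group filters above are simply the union of their member doc-type filters,
# built by adding the individual vectors. A toy numpy version of the same idea
# (real filters are the project's Vector objects):
def _group_filter_sketch():
    import numpy as np

    letter = np.array([1, 0, 0, 1], dtype=np.uint8)  # docs tagged 'letter'
    memo = np.array([0, 1, 0, 1], dtype=np.uint8)    # docs tagged 'memo'
    group = letter + memo                            # union of the member doc types
    return np.clip(group, 0, 1)                      # array([1, 1, 0, 1], dtype=uint8)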
def get_totals_vector(docs_or_sections='docs', return_type='np_int32'):
    '''
    Only implemented for 1-grams because there is no need for totals of 2- to 5-grams.
    :return:
    '''

    ngram = 1

    try:
        file_name = 'totals_{}_{}'.format(ngram, docs_or_sections)
        file_path = Path(PATH_TOKENIZED, file_name)
        totals_vector = Vector().load_from_disk(file_path,
                                                return_type=return_type)
        return totals_vector

    except IOError:

        totals_vector = create_totals_vector(ngram, docs_or_sections)
        totals_vector = csc_matrix(totals_vector, dtype=np.int32)
        store_csr_matrix_to_file(
            totals_vector, PATH_TOKENIZED +
            'totals_{}_{}.npz'.format(ngram, docs_or_sections))
        return get_totals_vector(docs_or_sections, return_type)
Example #9
    def _compute_set_active_filters_np(self, globals):
        """ Applies all filters to both the term and a copy of the totals vector and sets them

        All filters are np uint8 Vectors. The following filters are set in this function:
        doc_type_filters_np
        collection_filters_np
        availability_filters_np
        term_filters_np
        combined_filters_np

        6/10/17 Added availability filter.
        The idea is that every time the availability filter is used, the collection and doc type filters get multiplied with it.
        That is to say: the availability filter is not used on its own.

        7/25/17 Added term filter

        12/19/18: Moved to calculate_ngrams from preprocessing filters as this is a calculation
        step, not a pre-processing step

        >>> globals = get_globals(load_only_docs=True)
        >>> dt_filters = ['internal communication']
        >>> col_filters = [2,3]
        >>> avail_filters = ['no restrictions']
        >>> term_filters = []
        >>> search_tokens = ['addiction']
        >>> ngram = NgramResult(dt_filters, col_filters, avail_filters, term_filters, search_tokens)
        >>> ngram.docs_or_sections = 'docs'
        >>> ngram._compute_set_active_filters_np(globals=globals)
        >>> print(ngram.combined_filters_np)
        <Document Vector of type np_uint8 with 4964 elements.>

        :return: None
        """

        filters = globals['filters'][self.docs_or_sections]

        if 'term' not in self.active_filters:
            self.active_filters['term'] = {}

        # all filters used here are unweighted
        # 8/31/18: At some point, I had the idea that a document with 10 document types would give weight 1/10 to each.
        weighted = False
        filter_len = DOC_COUNT
        if self.docs_or_sections == 'sections':
            filter_len = SECTION_COUNT

        # process availability filters
        if len(self.active_filters['availability']) == 0:
            self.availability_filters_np = None
        else:
            self.availability_filters_np = None
            for filter_name in self.active_filters['availability']:
                if self.availability_filters_np is None:
                    self.availability_filters_np = filters['availability'][(
                        filter_name, weighted)].copy()
                else:
                    self.availability_filters_np += filters['availability'][(
                        filter_name, weighted)]
            self.availability_filters_np.convert_to_datatype('np_uint8')

        # process term filters
        if len(self.active_filters['term']) == 0:
            self.term_filters_np = None
        else:
            self.term_filters_np = None
            for filter_name in self.active_filters['term']:
                if self.term_filters_np is None:
                    self.term_filters_np = Vector().load_token_vector(
                        filter_name,
                        return_type='np_uint8',
                        docs_or_sections=self.docs_or_sections)
                else:
                    self.term_filters_np += Vector().load_token_vector(
                        filter_name,
                        return_type='np_uint8',
                        docs_or_sections=self.docs_or_sections)

        # process doc_type filters
        if len(self.active_filters['doc_type']) == 0:
            self.doc_type_filters_np = Vector(
                np.ones(filter_len, dtype=np.uint8))
        else:
            self.doc_type_filters_np = None
            for filter_name in self.active_filters['doc_type']:
                if self.doc_type_filters_np is None:
                    self.doc_type_filters_np = filters['doc_type'][(
                        filter_name, weighted)].copy()
                else:
                    self.doc_type_filters_np += filters['doc_type'][(
                        filter_name, weighted)]
            self.doc_type_filters_np.convert_to_datatype('np_uint8')

        # process collection filters
        if len(self.active_filters['collection']) == 0:
            self.collection_filters_np = Vector(
                np.ones(filter_len, dtype=np.uint8))
        else:
            self.collection_filters_np = None
            for filter_name in self.active_filters['collection']:
                if self.collection_filters_np is None:
                    self.collection_filters_np = filters['collection'][(
                        filter_name, weighted)].copy()
                else:
                    self.collection_filters_np += filters['collection'][(
                        filter_name, weighted)]
            self.collection_filters_np.convert_to_datatype('np_uint8')

        # Apply term filter to doc type and collection filters
        if self.term_filters_np is not None:
            self.doc_type_filters_np.filter_with(self.term_filters_np)
            self.collection_filters_np.filter_with(self.term_filters_np)

        # Apply availability filter to doc type and collection filters
        if self.availability_filters_np is not None:
            self.doc_type_filters_np.filter_with(self.availability_filters_np)
            self.collection_filters_np.filter_with(self.availability_filters_np)

        # Create final filter
        if len(self.active_filters['doc_type']) == 0:
            self.combined_filters_np = self.collection_filters_np
        elif len(self.active_filters['collection']) == 0:
            self.combined_filters_np = self.doc_type_filters_np
        else:
            self.combined_filters_np = self.collection_filters_np.filter_with(
                self.doc_type_filters_np, return_copy=True)
Example #10
def find_and_store_policy(term='and', filtered_collection_vector=None, filtered_totals_year_vector=None,
                          collection=None):


    db = sqlite3.connect('policies.db')
    cur = db.cursor()

    col_id = COL_NAME_TO_ID[collection]

    if filtered_collection_vector is None:
        filtered_collection_vector = FILTERS['doc_type'][('internal communication', False)].copy().convert_to_datatype('np_uint8')
        filtered_collection_vector.filter_with(FILTERS['collection'][(col_id, False)].convert_to_datatype('np_uint8'))
        filtered_totals_year_vector = TOTALS_COL[col_id].convert_to_year_array(filter_vec=filtered_collection_vector)

    term_v = Vector().load_token_vector(token=term)
    term_year_vector = term_v.convert_to_year_array(filter_vec=filtered_collection_vector)


    dunnings = {}

    for start_first_period in range(50, 90):
        end_first_period = start_first_period + 3
        policy_year = start_first_period + 4
        start_second_period = start_first_period + 5
        end_second_period = start_first_period + 8

        term_count_first = term_year_vector[start_first_period : end_first_period+1].sum()
        term_count_second = term_year_vector[start_second_period : end_second_period+1].sum()

        totals_first = filtered_totals_year_vector[start_first_period : end_first_period+1].sum()
        totals_second = filtered_totals_year_vector[start_second_period : end_second_period+1].sum()

        dunning = dunning_log_likelihood(term_count_first, term_count_second,
                                         totals_first, totals_second)

        dunnings[policy_year] = {
            'year': f'19{policy_year}',
            'dunning': dunning,
            'first': term_count_first,
            'first_freq': term_count_first / totals_first*100,
            'second': term_count_second,
            'second_freq': term_count_second / totals_second*100
        }

    dunnings_sorted = sorted(dunnings.items(), key=lambda x:x[1]['dunning'])

    policy_adoption = dunnings_sorted[-1][1]
    policy_ending = dunnings_sorted[0][1]

    policy = '{}. {:15s}. Adoption: {}. D: {:7.0f}. C1: {:9d}. C2: {:9d}. F: {:5.3f}. ' \
             'Ending: {}. D:{:7.0f}. C1: {:9d}. C2: {:9d}. F: {:5.3f}.'.format( collection, term,
        policy_adoption['year'], policy_adoption['dunning'], policy_adoption['first'],
        policy_adoption['second'], policy_adoption['first_freq']/ policy_adoption['second_freq'],
        policy_ending['year'], policy_ending['dunning'], policy_ending['first'],
        policy_ending['second'], policy_ending['first_freq']/ policy_ending['second_freq']
    )
    print(policy)

    cur.execute('INSERT INTO policies_5p VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', (
        collection, term,
        policy_adoption['year'], policy_adoption['dunning'], policy_adoption['first'],
        policy_adoption['second'], policy_adoption['first_freq'], policy_adoption['second_freq'],
        policy_ending['year'], policy_ending['dunning'], policy_ending['first'],
        policy_ending['second'], policy_ending['first_freq'], policy_ending['second_freq']
    ))
    db.commit()
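# dunning_log_likelihood() is not shown in this excerpt. A common signed
# variant of Dunning's log-likelihood ratio (the two-cell form widely used in
# corpus linguistics), matching the four-argument call above, might look like
# the sketch below; the project's own implementation may differ in its sign
# convention or smoothing.
def _dunning_log_likelihood_sketch(count1, count2, total1, total2):
    import math

    if count1 == 0 and count2 == 0:
        return 0.0
    # expected counts if both periods shared a single underlying rate
    expected1 = total1 * (count1 + count2) / (total1 + total2)
    expected2 = total2 * (count1 + count2) / (total1 + total2)
    g2 = 0.0
    if count1 > 0:
        g2 += 2 * count1 * math.log(count1 / expected1)
    if count2 > 0:
        g2 += 2 * count2 * math.log(count2 / expected2)
    # sign the statistic so that terms more frequent in the second period score
    # positive (the caller treats the maximum as policy adoption, the minimum as ending)
    if count2 / total2 < count1 / total1:
        g2 = -g2
    return g2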
Example #11
def get_absolute_google_counts(token_name: str) -> Vector:
    """ Retrieves the absolute counts for a given token from the Google Ngram Viewer.

    It first loads the relative frequencies from the ngram viewer and the absolute counts
    for the corpus from Google's source data.
    Then, it multiplies the absolute number of terms in the corpus for any given year with the
    relative frequency of the search token.

    >>> google_counts = get_absolute_google_counts('addiction')
    >>> print(f'Google counts for addiction in 1950: {google_counts[50]}')
    Google counts for addiction in 1950: 2482.0

    >>> type(google_counts)
    <class 'tobacco.utilities.vector.Vector'>

    """

    hash = hashlib.sha256(token_name.encode()).hexdigest()
    file_path = Path(PATH_GOOGLE_TOKENS, hash[0], hash[1], hash[2], hash[3],
                     f'{hash}.npy')

    try:
        # this really doesn't need a hash
        absolute_counts = Vector().load_from_disk(file_path,
                                                  return_type='np_int32')

    except FileNotFoundError:

        corpus_id = 15
        # construct the url, i.e. place the token and other parameters where they belong
        url = 'https://books.google.com/ngrams/interactive_chart?content={}&year_start={}&year_end={}' \
              '&corpus={}&smoothing=0'.format(token_name.replace(' ', '+'), YEAR_START, YEAR_END, corpus_id)

        try:
            with urllib.request.urlopen(url, timeout=1) as response:
                page = response.read().decode('utf-8')

                if page.find('var data = [];') > -1:
                    relative_frequencies = 116 * [0]
                else:

                    start = page.find('var data = [')
                    end = page.find('}];', start)
                    data_dict = json.loads(page[start + 11:end + 2])[0]
                    relative_frequencies = data_dict['timeseries']
                    relative_frequencies += 8 * [relative_frequencies[-1]]

        except urllib.error.HTTPError:
            relative_frequencies = 116 * [0]

        # on any other error, return all zeros but don't store the result
        except Exception:
            return Vector(np.zeros(116, dtype=np.float64))

        # calculate the absolute number of appearances by multiplying the
        # relative frequencies with the total number of tokens per year
        absolute_counts = [
            round(relative_frequencies[i] * GOOGLE_TOTALS[i])
            for i in range(len(relative_frequencies))
        ]
        absolute_counts = Vector(np.array(absolute_counts, dtype=np.float64))
        absolute_counts.save_to_disk(file_path)


    return absolute_counts
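# The parsing above slices the embedded 'var data = [...]' JSON out of the
# ngram viewer page. A standalone sketch of that extraction on a toy page
# (the offsets mirror the ones used above: +11 skips 'var data = ', +2 keeps
# the closing '}]'):
def _parse_ngram_page_sketch():
    import json

    page = 'var data = [{"ngram": "addiction", "timeseries": [1.2e-07, 1.3e-07]}];'
    start = page.find('var data = [')
    end = page.find('}];', start)
    data_dict = json.loads(page[start + 11:end + 2])[0]
    return data_dict['timeseries']  # [1.2e-07, 1.3e-07]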
Example #12
import numpy as np

from tobacco.utilities.vector import Vector

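# Total token counts per year in the Google Books corpus (116 entries), used by
# get_absolute_google_counts() above to turn relative ngram frequencies into
# absolute counts. The final total is repeated for the most recent years,
# mirroring the padding of the relative-frequency series.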
GOOGLE_TOTALS = Vector(
    np.array([
        1285712637.0, 1311315033.0, 1266236889.0, 1405505328.0, 1351302005.0,
        1397090480.0, 1409945274.0, 1417130893.0, 1283265090.0, 1354824248.0,
        1350964981.0, 1431385638.0, 1356693322.0, 1324894757.0, 1211361619.0,
        1175413415.0, 1183132092.0, 1039343103.0, 1136614538.0, 1388696469.0,
        1216676110.0, 1413237707.0, 1151386048.0, 1069007206.0, 1113107246.0,
        1053565430.0, 1216023821.0, 1212716430.0, 1153722574.0, 1244889331.0,
        1183806248.0, 1057602772.0, 915956659.0, 1053600093.0, 1157109310.0,
        1199843463.0, 1232280287.0, 1261812592.0, 1249209591.0, 1179404138.0,
        1084154164.0, 1045379066.0, 890214397.0, 812192380.0, 926378706.0,
        1203221497.0, 1385834769.0, 1486005621.0, 1641024100.0, 1644401950.0,
        1603394676.0, 1621780754.0, 1590464886.0, 1662160145.0, 1751719755.0,
        1817491821.0, 1952474329.0, 1976098333.0, 2064236476.0, 2341981521.0,
        2567977722.0, 2818694749.0, 2955051696.0, 2931038992.0, 3300623502.0,
        3466842517.0, 3658119990.0, 3968752101.0, 3942222509.0, 4086393350.0,
        4058576649.0, 4174172415.0, 4058707895.0, 4045487401.0, 4104379941.0,
        4242326406.0, 4314577619.0, 4365839878.0, 4528331460.0, 4611609946.0,
        4627406112.0, 4839530894.0, 4982167985.0, 5309222580.0, 5475269397.0,
        5793946882.0, 5936558026.0, 6191886939.0, 6549339038.0, 7075013106.0,
        6895715366.0, 7596808027.0, 7492130348.0, 8027353540.0, 8276258599.0,
        8745049453.0, 8979708108.0, 9406708249.0, 9997156197.0, 11190986329.0,
        11349375656.0, 12519922882.0, 13632028136.0, 14705541576.0,
        14425183957.0, 15310495914.0, 16206118071.0, 19482936409.0,
        19482936409.0, 19482936409.0, 19482936409.0, 19482936409.0,
        19482936409.0, 19482936409.0, 19482936409.0, 19482936409.0
    ],
             dtype=np.float64))

Example #13
def get_filter(search_term, filter_type, weight=False, return_type='csc', docs_or_sections='docs'):
    '''
    Creates a binary filter (True if the document has the specified doc_type or collection,
    False otherwise).
    :param search_term:
    :param filter_type: 'collection' or 'doc_type'
    :param weight:
    :return: Vector

    >>> filter = get_filter('letter', 'doc_type', weight=False, return_type='csc')
    >>> filter
    <Document Vector of type csc with 2490726 elements.>

    '''


    try:
        # can't store terms with a forward slash -> replace with underscore
        if filter_type == 'doc_type': search_term = search_term.replace('/', '_')
        file_name = '{}_{}_{}_{}'.format(search_term, filter_type, docs_or_sections, weight)
        file_path = Path(PATH_TOKENIZED, 'filters', file_name)
        filter = Vector().load_from_disk(file_path, return_type=return_type)

    except IOError:


        db = Database("TOB_FULL")
        con, cur = db.connect()

        if docs_or_sections == 'docs':
            filter_len = DOC_COUNT
        elif docs_or_sections == 'sections':
            filter_len = SECTION_COUNT
        else:
            raise ValueError("param docs_or_sections has to be either 'docs' or sections' but not ''{}".format(docs_or_sections))

        if weight:
            filter = np.zeros((filter_len, 1), dtype=np.float64)
        else:
            filter = np.zeros((filter_len, 1), dtype=bool)


        if filter_type == 'collection':
            cur.execute("SELECT id, no_tokens from docs where collection_id = '{}' ORDER BY id ASC".format(search_term))
        elif filter_type == 'doc_type':
            if weight:
                cur.execute('''SELECT doc_types.doc_id as id, docs.no_tokens as no_tokens, doc_types.weight as weight
                                  FROM doc_types, docs WHERE doc_type = "{}" and doc_types.doc_id = docs.id ORDER BY id ASC'''.format(search_term))
            else:
                cur.execute('SELECT doc_types.doc_id as id, docs.no_tokens as no_tokens FROM doc_types, docs '
                            '     WHERE doc_type = "{}" and doc_types.doc_id = docs.id ORDER BY id ASC'.format(search_term))
        elif filter_type == 'availability':
            # dict maps from search term to the requisite where clause (documents can be both formerly privileged and formerly confidential)
            term_to_mysql_where_clauses_dict = {
                'no restrictions': 'WHERE availability = "public;no restrictions"',
                'formerly confidential': 'WHERE availability = "public;formerly confidential" OR availability = "public;formerly confidential; formerly privileged" OR availability = "public;formerly privileged; formerly confidential"',
                'formerly privileged': 'WHERE availability = "public;formerly privileged" OR availability = "public;formerly confidential; formerly privileged" OR availability = "public;formerly privileged; formerly confidential"'
            }
            cur.execute('SELECT id, no_tokens from docs {} ORDER BY id ASC'.format(term_to_mysql_where_clauses_dict[search_term]))
        else:
            raise KeyError("{} is not a valid filter_type. Valid filter types are 'collection', 'doc_type', and 'availability'".format(filter_type))

        if docs_or_sections == 'sections':
            doc_id_to_section_dict = get_doc_id_to_section_id_dict()

        rows = cur.fetchall()
        for row in rows:
            if docs_or_sections == 'docs':
                if weight:
                    filter[row['id']] = row['weight']
                else:
                    filter[row['id'], 0] = True

            elif docs_or_sections == 'sections':
                first_id, last_id = doc_id_to_section_dict[row['id']]
                for section_id in range(first_id, last_id+1):
                    filter[section_id] = True

        filter = csc_matrix(filter)
        if filter_type == 'doc_type': search_term = search_term.replace('/', '_')

        filter_path = PATH_TOKENIZED + 'filters/{}_{}_{}_{}.npz'.format(search_term, filter_type, docs_or_sections, weight)
        store_csr_matrix_to_file(filter, filter_path)
        print("Created filter for {} with {} elements.".format(search_term, filter.getnnz()))

        return get_filter(search_term, filter_type, weight, return_type, docs_or_sections)


    return filter
Example #14
def get_active_filters_np(active_filters, FILTERS, docs_or_sections='docs',
                          return_type=np.uint8):

    """ Applies all filters to both the term and a copy of the totals vector and returns them

    6/10/17 Added availability filter.
    The idea is that every time the availability filter is used, the collection and doc type filters get multiplied with it.
    That is to say: the availability filter is not used on its own.

    7/25/17 Added term filter

    >>> FILTERS = {
    ...     'docs':{
    ...         'doc_type': get_doc_type_filters(return_type='csc', docs_or_sections='docs'),
    ...         'collection': get_collection_filters(return_type='csc', docs_or_sections='docs'),
    ...         'availability': get_availability_filters(return_type='csc', docs_or_sections='docs')
    ...     },
    ...     'sections':{
    ...         'doc_type': get_doc_type_filters(return_type='csc', docs_or_sections='sections'),
    ...         'collection': get_collection_filters(return_type='csc', docs_or_sections='sections'),
    ...         'availability': get_availability_filters(return_type='csc', docs_or_sections='sections')
    ...     }
    ... }
    >>> active_filters = {
    ...     'doc_type': {'internal communication'},
    ...     'collection': {2,3},
    ...     'availability': {'no restrictions'},
    ...     'term': {}
    ... }
    >>> doc_type_filter, collection_filter, final_filter = get_active_filters_np(
    ...         active_filters=active_filters, FILTERS=FILTERS, docs_or_sections='docs')


    :param active_filters: dict of lists, e.g. {'doc_type': ["internal communication"], 'collection': [1,2],
                                                'availability': [], 'term': []}
    :param FILTERS: filter dict from the globals (csc filters, as built in the doctest above)
    :param return_type: e.g. np.uint8 or np.int32. By default the same data type as the input, usually np.uint8
    :param docs_or_sections: 'docs' or 'sections'
    :return: doc_type_filter, collection_filter, final_filter
    """

    if 'term' not in active_filters:
        active_filters['term'] = {}

    # all filters used here are unweighted
    # 8/31/18: At some point, I had the idea that a document with 10 document types would give weight 1/10 to each.
    weighted = False
    filter_len = DOC_COUNT
    if docs_or_sections == 'sections':
        filter_len = SECTION_COUNT

    # process availability filters
    if len(active_filters['availability']) == 0:
        availability_filter = None
    else:
        availability_filter = None
        for filter_name in active_filters['availability']:
            if availability_filter is None:
                availability_filter = FILTERS[docs_or_sections]['availability'][(filter_name, weighted)].copy()
            else:
                availability_filter += FILTERS[docs_or_sections]['availability'][(filter_name, weighted)]
        availability_filter.convert_to_datatype('np_uint8')

    # process term filters
    if len(active_filters['term']) == 0:
        term_filter = None
    else:
        term_filter = None
        for filter_name in active_filters['term']:
            if term_filter is None:
                term_filter = Vector().load_token_vector(filter_name, return_type='np_uint8',
                                                         docs_or_sections=docs_or_sections)
            else:
                term_filter += Vector().load_token_vector(filter_name, return_type='np_uint8',
                                                          docs_or_sections=docs_or_sections)

    # process doc_type filters
    if len(active_filters['doc_type']) == 0:
        doc_type_filter = np.ones(filter_len, dtype=np.uint8)
    else:
        doc_type_filter = None
        for filter_name in active_filters['doc_type']:
            if doc_type_filter is None:

                doc_type_filter = FILTERS[docs_or_sections]['doc_type'][(filter_name, weighted)].copy()
            else:
                doc_type_filter += FILTERS[docs_or_sections]['doc_type'][(filter_name, weighted)]

        doc_type_filter.convert_to_datatype('np_uint8')

    # process collection filters
    if len(active_filters['collection']) == 0:
        collection_filter = np.ones(filter_len, dtype=np.uint8)
    else:
        collection_filter = None
        for filter_name in active_filters['collection']:
            if collection_filter is None:
                collection_filter = FILTERS[docs_or_sections]['collection'][(filter_name, weighted)].copy()
            else:
                collection_filter += FILTERS[docs_or_sections]['collection'][(filter_name, weighted)]
        collection_filter.convert_to_datatype('np_uint8')

    # Apply term filter to doc type and collection filters
    if term_filter is not None:
        doc_type_filter.filter_with(term_filter)
        collection_filter.filter_with(term_filter)

    # Apply availability filter to doc type and collection filters
    if availability_filter is not None:
        doc_type_filter.filter_with(availability_filter)
        collection_filter.filter_with(availability_filter)

    # Create final filter
    if len(active_filters['doc_type']) == 0:
        final_filter = collection_filter
    elif len(active_filters['collection']) == 0:
        final_filter = doc_type_filter
    else:
        final_filter = collection_filter.filter_with(doc_type_filter, return_copy=True)

    return doc_type_filter, collection_filter, final_filter
Example #15
def create_filter(search_term: Union[str, int], filter_type: str, weight: bool,
                  return_type: str, docs_or_sections: str):

    '''
    Creates a filter vector for collections, doc types, and availability.

    12/20/18: Separated from get_filter (moved to Vector class)

    :param search_term:
    :param filter_type:
    :param weight:
    :param return_type:
    :param docs_or_sections:
    :return:
    '''

    db = Database("TOB_FULL")
    con, cur = db.connect()

    if docs_or_sections == 'docs':
        filter_len = DOC_COUNT
    elif docs_or_sections == 'sections':
        filter_len = SECTION_COUNT
    else:
        raise ValueError("param docs_or_sections has to be either 'docs' or sections' but not ''{}".format(docs_or_sections))

    if weight:
        filter = np.zeros((filter_len, 1), dtype=np.float64)
    else:
        filter = np.zeros((filter_len, 1), dtype=bool)

    if filter_type == 'collection':
        cur.execute("SELECT id, no_tokens from docs where collection_id = '{}' ORDER BY id ASC".format(search_term))
    elif filter_type == 'doc_type':
        if weight:
            cur.execute('''SELECT doc_types.doc_id as id, docs.no_tokens as no_tokens, doc_types.weight as weight
                                      FROM doc_types, docs WHERE doc_type = "{}" and doc_types.doc_id = docs.id ORDER BY id ASC'''.format(search_term))
        else:
            cur.execute('SELECT doc_types.doc_id as id, docs.no_tokens as no_tokens FROM doc_types, docs '
                        '     WHERE doc_type = "{}" and doc_types.doc_id = docs.id ORDER BY id ASC'.format(search_term))
    elif filter_type == 'availability':
        # dict maps from search term to the requisite where clause (documents can be both formerly privileged and formerly confidential)
        term_to_mysql_where_clauses_dict = {
            'no restrictions': 'WHERE availability = "public;no restrictions"',
            'formerly confidential': 'WHERE availability = "public;formerly confidential" OR availability = "public;formerly confidential; formerly privileged" OR availability = "public;formerly privileged; formerly confidential"',
            'formerly privileged': 'WHERE availability = "public;formerly privileged" OR availability = "public;formerly confidential; formerly privileged" OR availability = "public;formerly privileged; formerly confidential"'
        }
        cur.execute('SELECT id, no_tokens from docs {} ORDER BY id ASC'.format(term_to_mysql_where_clauses_dict[search_term]))
    else:
        raise KeyError("{} is not a valid filter_type. Valid filter types are 'collection', 'doc_type', and 'availability'".format(filter_type))

    if docs_or_sections == 'sections':
        doc_id_to_section_dict = get_doc_id_to_section_id_dict()

    rows = cur.fetchall()
    for row in rows:
        if docs_or_sections == 'docs':
            if weight:
                filter[row['id']] = row['weight']
            else:
                filter[row['id'], 0] = True

        elif docs_or_sections == 'sections':
            first_id, last_id = doc_id_to_section_dict[row['id']]
            for section_id in range(first_id, last_id + 1):
                filter[section_id] = True


    filter_vec = Vector(csc_matrix(filter))
    if filter_type == 'doc_type':
        search_term = search_term.replace('/', '_')
    file_name = '{}_{}_{}_{}'.format(search_term, filter_type, docs_or_sections, weight)
    file_path = Path(PATH_TOKENIZED, 'filters', file_name)
    filter_vec.save_to_disk(file_path)



    print("Created filter for {} with {} elements.".format(search_term, filter_vec.vector.getnnz()))