Example #1
def get_doc_type_filters(return_type='csc', docs_or_sections='docs'):
    """Return a dict mapping (doc_type, weight) and (group, weight) tuples to their filter vectors."""

    doc_types = get_dtype_dict()

    doc_type_filters = {}
    for doc_type in doc_types['valid']:
        for weight in [False]:
            # Frequently used doc types are loaded in the requested return_type;
            # all remaining doc types are kept in the default 'csc' format.
            if doc_type in [
                'letter', 'report', 'memo', 'email', 'note', 'publication', 'report, scientific', 'advertisement',
                'promotional material', 'budget', 'specification', 'budget_review', 'telex', 'news_article', 'agenda',
                'report, market research', 'speech', 'presentation', 'minutes'
            ]:
                doc_type_filters[(doc_type, weight)] = get_filter(
                    doc_type, filter_type='doc_type', weight=weight,
                    return_type=return_type, docs_or_sections=docs_or_sections)
            else:
                doc_type_filters[(doc_type, weight)] = get_filter(
                    doc_type, filter_type='doc_type', weight=weight,
                    return_type='csc', docs_or_sections=docs_or_sections)


    # Filters for doc type groups are the sums of their member doc type filters and are cached on disk.
    for group in doc_types['groups']:
        for weight in [False]:

            file_name = '{}_{}_{}_{}'.format(group, 'doc_type', docs_or_sections, weight)
            file_path = Path(PATH_TOKENIZED, 'filters', file_name)
            try:
                group_filter = Vector().load_from_disk(file_path, return_type=return_type)

            except FileNotFoundError:
                print("creating group filter for: ", group)
                group_filter = None
                for doc_type in doc_types['groups'][group]:
                    if group_filter is None:
                        group_filter = doc_type_filters[(doc_type, weight)]
                    else:
                        group_filter += doc_type_filters[(doc_type, weight)]

                group_filter.save_to_disk(file_path)

            doc_type_filters[(group, weight)] = group_filter
#            if return_type == 'np':
#                doc_type_filters[(group, weight)] = csc_bool_to_np_cython(group_filter)

    return doc_type_filters
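
A minimal usage sketch (assuming the surrounding tobacco package provides get_dtype_dict, get_filter, Vector, and PATH_TOKENIZED as used above; 'letter' is just an illustrative doc type key):

# Usage sketch -- hypothetical call, not part of the original module.
doc_type_filters = get_doc_type_filters(return_type='csc', docs_or_sections='docs')
letter_filter = doc_type_filters[('letter', False)]   # unweighted filter vector for 'letter' documents
print('Loaded {} doc type and group filters.'.format(len(doc_type_filters)))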
Example #2
def get_absolute_google_counts(token_name: str) -> Vector:
    """    This function retrieves the absolute counts for a given token from the Google Ngram Viewer.

    It first loads the relative frequencies from the ngram viewer and the absolute counts
    for the corpus from Google's source data.
    Then, it multiplies the absolute number of terms in the corpus for any given year with the
    relative frequency of the search token.

    >>> google_counts = get_absolute_google_counts('addiction')
    >>> print(f'Google counts for addiction in 1950: {google_counts[50]}')
    Google counts for addiction in 1950: 2482.0

    >>> type(google_counts)
    <class 'tobacco.utilities.vector.Vector'>

    """

    token_hash = hashlib.sha256(token_name.encode()).hexdigest()
    file_path = Path(PATH_GOOGLE_TOKENS, token_hash[0], token_hash[1], token_hash[2], token_hash[3],
                     f'{token_hash}.npy')

    try:
        # this really doesn't need a hash
        absolute_counts = Vector().load_from_disk(file_path,
                                                  return_type='np_int32')
#        absolute_counts = np.load(token_path+hash_path+'.npy')

    except FileNotFoundError:

        corpus_id = 15
        # construct the url, i.e. place the token and other parameters where they belong
        url = 'https://books.google.com/ngrams/interactive_chart?content={}&year_start={}&year_end={}' \
              '&corpus={}&smoothing=0'.format(token_name.replace(' ', '+'), YEAR_START, YEAR_END, corpus_id)

        try:
            with urllib.request.urlopen(url, timeout=1) as response:
                page = response.read().decode('utf-8')

                if page.find('var data = [];') > -1:
                    # no ngram data available for this token -> zero for every year
                    relative_frequencies = 116 * [0]
                else:
                    # extract the embedded json timeseries of relative frequencies
                    start = page.find('var data = [')
                    end = page.find('}];', start)
                    data_dict = json.loads(page[start + 11:end + 2])[0]
                    relative_frequencies = data_dict['timeseries']
                    # pad the final years with the last available value
                    relative_frequencies += 8 * [relative_frequencies[-1]]

        except urllib.error.HTTPError:
            relative_frequencies = 116 * [0]

        # on any other error, return zero counts but don't store them on disk
        except Exception:
            temp = 116 * [0]
            return Vector(np.array(
                [round(temp[i] * GOOGLE_TOTALS[i]) for i in range(len(temp))],
                dtype=np.float64))

        # calculate the absolute number of appearances by multiplying the relative
        # frequencies with the total number of tokens per year
        absolute_counts = [
            round(relative_frequencies[i] * GOOGLE_TOTALS[i])
            for i in range(len(relative_frequencies))
        ]
        absolute_counts = Vector(np.array(absolute_counts, dtype=np.float64))
        absolute_counts.save_to_disk(file_path)


#        hash = hashlib.sha256(token_name.encode()).hexdigest()
#        file_path = Path(PATH_GOOGLE_TOKENS, hash[0], hash[1], hash[2], hash[3])

#        token_path = PATH_GOOGLE_TOKENS + '{}/{}/{}/{}/'.format(hash_path[0], hash_path[1], hash_path[2], hash_path[3])

#       if not Path.exists(file_path):
#           print(file_path)
#           Path.mkdir(file_path, parents=True)
#        np.save(token_path + hash_path, absolute_counts)

    return absolute_counts
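
To illustrate the frequency-to-count conversion described in the docstring, here is a minimal, self-contained sketch; the relative frequencies and per-year totals are made-up numbers, not real Google Ngram data or the real GOOGLE_TOTALS values:

import numpy as np

# Illustrative stand-ins for relative_frequencies and GOOGLE_TOTALS (not real data).
relative_frequencies = [0.0, 1.2e-7, 2.5e-7]   # fraction of all corpus tokens per year
google_totals = [1.0e9, 1.1e9, 1.3e9]          # total tokens in the corpus per year

# Same arithmetic as in the function above: absolute count = relative frequency * yearly total.
absolute_counts = np.array(
    [round(relative_frequencies[i] * google_totals[i]) for i in range(len(relative_frequencies))],
    dtype=np.float64)
print(absolute_counts)   # -> [  0. 132. 325.]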
Example #3
def create_filter(search_term: Union[str, int], filter_type: str, weight: bool,
                  return_type: str, docs_or_sections: str):

    '''
    Creates a filter vector for collections, doc types, or availability.

    12/20/18: Separated from get_filter (moved to Vector class)

    :param search_term: collection id, doc type name, or availability string to filter on
    :param filter_type: 'collection', 'doc_type', or 'availability'
    :param weight: if True, store the doc type weights instead of boolean flags
    :param return_type: requested vector format for the resulting filter
    :param docs_or_sections: 'docs' or 'sections'
    :return: None; the filter is saved to disk under PATH_TOKENIZED/filters
    '''

    db = Database("TOB_FULL")
    con, cur = db.connect()

    if docs_or_sections == 'docs':
        filter_len = DOC_COUNT
    elif docs_or_sections == 'sections':
        filter_len = SECTION_COUNT
    else:
        raise ValueError("param docs_or_sections has to be either 'docs' or sections' but not ''{}".format(docs_or_sections))

    if weight:
        filter = np.zeros((filter_len, 1), dtype=np.float64)
    else:
        filter = np.zeros((filter_len, 1), dtype=bool)

    if filter_type == 'collection':
        cur.execute("SELECT id, no_tokens from docs where collection_id = '{}' ORDER BY id ASC".format(search_term))
    elif filter_type == 'doc_type':
        if weight:
            cur.execute('''SELECT doc_types.doc_id as id, docs.no_tokens as no_tokens, doc_types.weight as weight
                                      FROM doc_types, docs WHERE doc_type = "{}" and doc_types.doc_id = docs.id ORDER BY id ASC'''.format(search_term))
        else:
            cur.execute('SELECT doc_types.doc_id as id, docs.no_tokens as no_tokens FROM doc_types, docs '
                        '     WHERE doc_type = "{}" and doc_types.doc_id = docs.id ORDER BY id ASC'.format(search_term))
    elif filter_type == 'availability':
        # This dict maps each search term to the requisite WHERE clause
        # (documents can be both formerly privileged and formerly confidential).
        term_to_mysql_where_clauses_dict = {
            'no restrictions': 'WHERE availability = "public;no restrictions"',
            'formerly confidential': 'WHERE availability = "public;formerly confidential" OR availability = "public;formerly confidential; formerly privileged" OR availability = "public;formerly privileged; formerly confidential"',
            'formerly privileged': 'WHERE availability = "public;formerly privileged" OR availability = "public;formerly confidential; formerly privileged" OR availability = "public;formerly privileged; formerly confidential"'
        }
        cur.execute('SELECT id, no_tokens from docs {} ORDER BY id ASC'.format(term_to_mysql_where_clauses_dict[search_term]))
    else:
        raise KeyError("{} is not a valid filter_type. Valid filter types are 'collection', 'doc_type', and 'availability'".format(filter_type))

    if docs_or_sections == 'sections':
        doc_id_to_section_dict = get_doc_id_to_section_id_dict()

    rows = cur.fetchall()
    for row in rows:
        if docs_or_sections == 'docs':
            if weight:
                filter[row['id']] = row['weight']
            else:
                filter[row['id'], 0] = True

        elif docs_or_sections == 'sections':
            # mark every section that belongs to this document
            first_id, last_id = doc_id_to_section_dict[row['id']]
            for section_id in range(first_id, last_id + 1):
                filter[section_id] = True


    filter_vec = Vector(csc_matrix(filter))
    if filter_type == 'doc_type':
        search_term = search_term.replace('/', '_')
    file_name = '{}_{}_{}_{}'.format(search_term, filter_type, docs_or_sections, weight)
    file_path = Path(PATH_TOKENIZED, 'filters', file_name)
    filter_vec.save_to_disk(file_path)



#    filter_path = PATH_TOKENIZED + 'filters/{}_{}_{}_{}.npz'.format(search_term, filter_type, docs_or_sections, weight)
#    store_csr_matrix_to_file(filter, filter_path)
    print("Created filter for {} with {} elements.".format(search_term, filter_vec.vector.getnnz()))