Example #1
def document_iterator(doc_ids):
    """ Select metadata (authors, tid, date, title, collection) for selected text passages

    6/9/17: Now using bytearrays instead of mysql
    :param doc_ids: iterable of document ids to select metadata for
    :return: yields one metadata row (dict) per document
    """

    db = Database("TOB_FULL")
    con, cur = db.connect()

    cur.execute(
        '''SELECT docs.id, docs.tid, docs.timestamp, docs.year, docs.title, docs.collection_id,
                          GROUP_CONCAT(authors.author SEPARATOR ', ') as author
                      FROM docs, authors
                      WHERE docs.id = authors.doc_id AND docs.id in ({}) GROUP BY docs.id;'''
        .format(",".join([str(i) for i in doc_ids])))

    while True:
        row = cur.fetchone()
        if row:
            yield row
        else:
            break
    con.close()
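A minimal usage sketch for the iterator above (the ids are hypothetical; it assumes the Database wrapper returns dict-style rows, as the rest of this code does):

# Hypothetical usage: print tid, title, and concatenated authors for a few docs.
for row in document_iterator([1, 2, 3]):
    print(row['tid'], row['title'], row['author'])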
Example #2
    def __get_dtms(self):
        '''
        returns dict of dtms
        {
            'count': count_dtm,
            'tf': tf_dtm,
            'tfidf': tfidf_dtm
        }


        :return:
        '''

        try:
            dtm_count = load_csc_matrix_from_file(
                PATH_TOKENIZED +
                'networks/{}_dtm_count'.format(self.main_name))
            dtm_tf = load_csc_matrix_from_file(
                PATH_TOKENIZED + 'networks/{}_dtm_tf'.format(self.main_name))
            dtm_tfidf = load_csc_matrix_from_file(
                PATH_TOKENIZED +
                'networks/{}_dtm_tfidf'.format(self.main_name))

        except IOError:
            print("DTM for {} not found. Creating now...".format(
                self.main_name))
            db = Database('TOB_NETWORKS')
            con, cur = db.connect()

            # vocabulary, _ = get_vocabulary_and_totals(self.main_name)
            # vocabulary = vocabulary['ordered']

            count_vectorizer = CountVectorizer(
                vocabulary=self.vocabulary['ordered'])

            cur.execute(
                'SELECT text FROM {}_sections ORDER BY section_id ASC;'.format(
                    self.main_name))
            docs = [row['text'] for row in cur.fetchall()]
            dtm = count_vectorizer.fit_transform(docs)
            print(dtm.shape, dtm.sum())

            dtm_count = dtm.tocsc()
            store_csr_matrix_to_file(
                dtm_count, PATH_TOKENIZED +
                'networks/{}_dtm_count.npz'.format(self.main_name))

            tf_transformer = TfidfTransformer(use_idf=False)
            dtm_tf = tf_transformer.fit_transform(dtm_count.copy())
            store_csr_matrix_to_file(
                dtm_tf.tocsc(), PATH_TOKENIZED +
                'networks/{}_dtm_tf.npz'.format(self.main_name))
            tfidf_transformer = TfidfTransformer(use_idf=True)
            dtm_tfidf = tfidf_transformer.fit_transform(dtm_count.copy())
            store_csr_matrix_to_file(
                dtm_tfidf.tocsc(), PATH_TOKENIZED +
                'networks/{}_dtm_tfidf.npz'.format(self.main_name))

        dtm = {'count': dtm_count, 'tf': dtm_tf, 'tfidf': dtm_tfidf}
        return dtm
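A standalone sketch of the count -> tf -> tfidf relationship used above (not project code; assumes scikit-learn is available): all three matrices share the CountVectorizer's vocabulary order, only the weighting differs.

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def build_dtms_sketch(texts, vocabulary):
    # Build the raw count matrix, then derive the tf and tfidf variants from it.
    count = CountVectorizer(vocabulary=vocabulary).fit_transform(texts).tocsc()
    tf = TfidfTransformer(use_idf=False).fit_transform(count.copy())
    tfidf = TfidfTransformer(use_idf=True).fit_transform(count.copy())
    return {'count': count, 'tf': tf, 'tfidf': tfidf}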
def full_initialization_process():
    """ Run through the full database initialization process.

    This means:
    - Creates indexes on the TOBACCO_RAW tables (idl_doc, idl_doc_field)
    - Adds a unix timestamp to each document
    - Creates a utf8 text file for each document
    - Initializes the docs, doc_types, authors, and recipients tables
    - Fills those tables

    1/19/2017
    Runs through the whole initialization after importing the raw sql
    Includes creating indexes
    :return:
    """

    db1 = Database('TOBACCO_RAW')
    con1, cur1 = db1.connect()
    cur1.execute('create index idl_doc_field_id_idx on idl_doc_field(id);')
    cur1.execute('create index idl_doc_id_idx on idl_doc(id);')
    add_timestamp_to_idl_doc()

    create_utf_text_files()

    initialize_tables()
    fill_tables()
def get_year_doc_id_list(docs_or_sections: str) -> list:
    """ Returns a list, wherein every value marks the first doc_id belonging to that year.
    e.g. year_doc_id_list[1910] -> first id belonging to year 1910
    year_doc_id_list[2015] -> highest doc_id + 1

    >>> year_doc_id_list = get_year_doc_id_list('docs')
    >>> ids_1901 = year_doc_id_list[0]
    >>> print(f'First 1901 doc id: {ids_1901[0]}. Last 1901 doc id: {ids_1901[1]}.')
    First 1901 doc id: 0. Last 1901 doc id: 183.
    >>> ids_1902 = year_doc_id_list[1]
    >>> print(f'First 1902 doc id: {ids_1902[0]}. Last 1902 doc id: {ids_1902[1]}.')
    First 1902 doc id: 184. Last 1902 doc id: 238.

    :param docs_or_sections: 'docs' or 'sections'
    :return: list
    """

    try:
        year_doc_id_list = pickle.load(
            open(
                PATH_TOKENIZED +
                'year_{}_id_list.pickle'.format(docs_or_sections), 'rb'))

    except IOError:
        from tobacco.frequencies_preprocessing.preprocessing_sections import \
            get_doc_id_to_section_id_dict
        doc_id_to_section_id_dict = get_doc_id_to_section_id_dict()
        from tobacco.utilities.databases import Database

        print("Creating new year_{}_id_list".format(docs_or_sections))

        db = Database("TOB_FULL")
        con, cur = db.connect()

        year_doc_id_list = []

        for year in range(1901, 2017):
            cur.execute(
                "SELECT MIN(id), MAX(id) FROM docs WHERE year = {}".format(
                    year))
            row = cur.fetchall()[0]
            min_doc_id = row['MIN(id)']
            max_doc_id = row['MAX(id)']

            if docs_or_sections == 'docs':
                year_doc_id_list.append((min_doc_id, max_doc_id))
                print(year, min_doc_id)
            elif docs_or_sections == 'sections':
                min_section_id = doc_id_to_section_id_dict[min_doc_id][0]
                max_section_id = doc_id_to_section_id_dict[max_doc_id][1]
                year_doc_id_list.append((min_section_id, max_section_id))
                print(year, min_section_id, max_section_id)

        pickle.dump(
            year_doc_id_list,
            open(
                PATH_TOKENIZED +
                'year_{}_id_list.pickle'.format(docs_or_sections), 'wb'))

    return year_doc_id_list
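A usage sketch of the list above (indexing is 0-based from 1901):

# Hypothetical usage: look up the doc id range for the year 1950.
year_doc_id_list = get_year_doc_id_list('docs')
first_id, last_id = year_doc_id_list[1950 - 1901]
print('1950 spans doc ids {} to {}.'.format(first_id, last_id))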
Example #5
def initialize_db(main_name):

    db = Database('TOB_NETWORKS')
    con, cur = db.connect()

    cur.execute('''CREATE TABLE IF NOT EXISTS {}_docs(
                    id              int NOT NULL AUTO_INCREMENT,
                    node            varchar(255)  NOT NULL,
                    main_author_is  varchar(10)   NOT NULL,
                    tid             varchar(10)   NOT NULL,
                    weight          float                 ,
                    year            INT           NOT NULL,

                    PRIMARY KEY(id));'''.format(main_name))

    cur.execute('''CREATE TABLE IF NOT EXISTS {}_sections(
                    section_id      int           NOT NULL,
                    node            varchar(255)  NOT NULL,
                    main_author_is  varchar(10)   NOT NULL,
                    tid             varchar(10)   NOT NULL,
                    tid_section     INT           NOT NULL,
                    weight          float         NOT NULL,
                    year            int           NOT NULL,
                    text            TEXT          NOT NULL,

                    PRIMARY KEY(section_id));'''.format(main_name))
    con.commit()
def distinctive_terms_overall(main_name):

    global_totals = get_vocabulary_totals(1)
    vocabulary_trie = load_vocabulary_trie(1)

    local_vocabulary, local_totals = get_vocabulary_and_totals(main_name)

    global_totals_localized = np.zeros(len(local_vocabulary['id_to_token']))
    for token in local_vocabulary['token_to_id']:
        local_token_id = local_vocabulary['token_to_id'][token]
        global_token_id = vocabulary_trie[token]
        global_totals_localized[local_token_id] = global_totals[
            global_token_id]

    print(len(global_totals), len(local_totals), len(global_totals_localized))

    distinctive_terms = get_distinctive_terms(local_totals,
                                              global_totals_localized,
                                              local_vocabulary)
    print(distinctive_terms)

    db = Database("TOB_NETWORKS")
    con, cur = db.connect()
    cur.execute('SELECT DISTINCT(tid) as tid from {}_docs'.format(main_name))
    tids = [row['tid'] for row in cur.fetchall()]
    totals2 = get_totals(tids, local_vocabulary)
    dist = get_distinctive_terms(totals2, global_totals_localized,
                                 local_vocabulary)
    print("\n", dist)

    totals3 = get_totals(tids, local_vocabulary, tf=True)
    dist = get_distinctive_terms(totals3, global_totals_localized,
                                 local_vocabulary)
    print("\n", dist)
Example #7
def markov(seed, no_token_to_generate=100, tokens_to_select=1):
    """Generate text with a simple Markov chain over the n-grams in the tokens table."""

    db = Database('TOB_FULL')
    con, cur = db.connect()

    seed_length = len(seed.split())

    output = seed

    for i in range(no_token_to_generate):
        cur.execute('''SELECT token, total 
                       from tokens 
                       where token like "{} %" and ngram={}
                       order by total desc;'''.format(
            seed, seed_length + tokens_to_select))

        continuations = cur.fetchall()
        possible_selections = []
        for continuation in continuations:
            token = " ".join(continuation['token'].split()[seed_length:]
                             [-tokens_to_select:])
            for _ in range(continuation['total']):
                possible_selections.append(token)

        selection = random.choice(possible_selections)
        output += " " + selection
        seed = " ".join(output.split()[-2:])

        print(output)
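A hypothetical call of the generator above (the seed string is illustrative):

# Hypothetical usage: generate ~50 tokens starting from a two-word seed.
markov('tobacco industry', no_token_to_generate=50, tokens_to_select=1)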
Example #8
def generate_node_db():

    initialize_green_db()

    # Step 1: Identify top 50 nodes
    node_counter = Counter()

    db = Database("TOB_FULL")
    con, cur = db.connect()
    con2, cur2 = db.connect()
    cur.execute('DELETE FROM green;')
    con.commit()

    author_and_recipient_commands = [
        '''SELECT recipients.recipient as node, docs.tid, "author" as main_person_is
                          FROM authors, recipients, docs
                          WHERE authors.author="Green, SJ" and authors.doc_id=recipients.doc_id AND docs.id=authors.doc_id;''',
        '''SELECT authors.author as node, authors.doc_id, docs.tid, "recipient" as main_person_is
                          FROM authors, recipients, docs
                          WHERE recipients.recipient="Green, SJ" and authors.doc_id=recipients.doc_id AND docs.id=authors.doc_id;''',
    ]

    for command in author_and_recipient_commands:
        cur.execute(command)
        while True:
            row = cur.fetchone()
            if not row: break
            else:
                for person in english_name_regex.findall(row['node']):
                    node_counter[person] += 1


#                    cur2.execute('INSERT INTO green(node, main_author_is, tid) VALUES("{}", "{}", "{}")'.format(
#                        person, row['main_person_is'], row['tid']
#                    ))

#    con2.commit()
    print(node_counter.most_common(50))
    top_50_nodes = [i[0] for i in node_counter.most_common(50)]

    # Step 2: insert
    for command in author_and_recipient_commands:
        cur.execute(command)
        while True:
            row = cur.fetchone()
            if not row: break
            else:
                for person in english_name_regex.findall(row['node']):
                    if person in top_50_nodes:
                        cur2.execute(
                            'INSERT INTO green(node, main_author_is, tid) VALUES("{}", "{}", "{}")'
                            .format(person, row['main_person_is'], row['tid']))

    con2.commit()
def get_doc_id_to_section_id_dict():
    """Returns a dict that contains maps from doc_id (0-indexed) to section ids.
    Every doc_id maps to a tuple of (first_section_id_of_doc, last_section_id_of_doc)

    :return:
    """

    try:
        doc_id_to_section_id_dict = pickle.load(
            open(PATH_TOKENIZED + 'doc_id_to_section_id_dict.pickle', 'rb'))
    except IOError:

        print(
            "doc_id_to_section_id_dict not found. Creating a new one with section length = {}."
            .format(SECTION_LENGTH))

        db = Database("TOB_FULL")
        con1, cur1 = db.connect()
        cur1.execute("SELECT id, tid, no_tokens FROM docs ORDER BY id ASC;")

        doc_id_to_section_id_dict = {}

        first_section_id_of_doc = 0
        while True:
            row = cur1.fetchone()
            if not row: break

            last_section_id_of_doc = first_section_id_of_doc + row[
                'no_tokens'] // SECTION_LENGTH

            # prevent off by 1 error
            if row['no_tokens'] % SECTION_LENGTH == 0:
                last_section_id_of_doc -= 1

            doc_id_to_section_id_dict[row['id']] = (first_section_id_of_doc,
                                                    last_section_id_of_doc)

            if row['id'] < 20 or row['id'] % 1000000 == 0:
                print(row['id'], first_section_id_of_doc,
                      last_section_id_of_doc)

            first_section_id_of_doc = last_section_id_of_doc + 1

        print(
            "Final section id: {}. Number of sections: {}. These numbers have to be equal."
            .format(first_section_id_of_doc, SECTION_COUNT))
        assert first_section_id_of_doc == SECTION_COUNT

        pickle.dump(
            doc_id_to_section_id_dict,
            open(PATH_TOKENIZED + 'doc_id_to_section_id_dict.pickle', 'wb'))

    return doc_id_to_section_id_dict
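A small standalone check of the offset arithmetic above (with a section length of 200 tokens): a 401-token document spans three sections, while an exact multiple like 400 tokens spans only two because of the off-by-one correction.

# Standalone check of the per-document section arithmetic (section length 200).
def sections_for(no_tokens, section_length=200):
    last_offset = no_tokens // section_length
    if no_tokens % section_length == 0:   # exact multiple -> no extra section
        last_offset -= 1
    return last_offset + 1                # number of sections for this document

assert sections_for(401) == 3
assert sections_for(400) == 2
assert sections_for(199) == 1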
Example #10
    def load_page_number_vector(self, name, return_type='np_uint8'):
        """
        Loads a vector with either a prescribed minimum or maximum number of pages

        >>> x = Vector().load_page_number_vector('max_5')
        >>> x
        <Document Vector of type np_uint8 with 9404609 elements and length 11303161.>

        :param name: filter name of the form min_N or max_N, e.g. 'max_5'
        :param return_type: datatype of the returned vector (default 'np_uint8')
        :return:
        """

        if not re.match(r'(min|max)_[1-9]\d*', name):
            raise ValueError(
                "Page number vectors need to start with min/max followed by an"
                " underscore and a number, e.g. min_10 (minimum 10 pages) or max_5 "
                "(max 5 pages).")

        file_path = Path(PATH_TOKENIZED, 'filters', f'{name}.npy')
        print(file_path)
        if not file_path.exists():

            from tobacco.utilities.databases import Database
            import operator
            if name.startswith('min'):
                op = operator.ge
            else:
                op = operator.le
            num = int(name[4:])

            db = Database('TOB_FULL')
            con, cur = db.connect()
            print('con')
            cur.execute('SELECT id, pages from docs order by id asc')

            arr = np.zeros(DOC_COUNT, dtype=np.uint8)
            rowid = 0
            while True:
                row = cur.fetchone()
                if not row:
                    break
                if not row['pages']: row['pages'] = 0
                assert rowid == row['id']
                if op(row['pages'],
                      num):  # if pagenumber >= or <= specified number
                    arr[rowid] = 1
                rowid += 1

            v = Vector(arr)
            v.save_to_disk(file_path)

        return self.load_from_disk(file_path, return_type)
Example #11
    def __get_nodes_init(self):

        try:
            nodes = pickle.load(
                open(
                    PATH_TOKENIZED +
                    'networks/{}_nodes.pickle'.format(self.main_name), 'rb'))

        except IOError:
            print("Top 50 nodes not available. Creating now...")
            # Step 1: Identify top 50 nodes
            node_counter = Counter()

            db = Database("TOB_FULL")
            con, cur = db.connect()

            author_and_recipient_commands = [
                '''SELECT recipients.recipient as node, docs.tid, docs.year, "author" as main_person_is
                                  FROM authors, recipients, docs
                                  WHERE authors.author="{}" and authors.doc_id=recipients.doc_id AND docs.id=authors.doc_id
                                  AND docs.year >= {} AND docs.year <= {};'''.
                format(NETWORK_CONFIGS[self.main_name]['name'],
                       NETWORK_CONFIGS[self.main_name]['start_year'],
                       NETWORK_CONFIGS[self.main_name]['end_year']),
                '''SELECT authors.author as node, authors.doc_id, docs.tid, docs.year, "recipient" as main_person_is
                                  FROM authors, recipients, docs
                                  WHERE recipients.recipient="{}" and authors.doc_id=recipients.doc_id AND docs.id=authors.doc_id
                                  AND docs.year >= {} AND docs.year <= {};'''.
                format(NETWORK_CONFIGS[self.main_name]['name'],
                       NETWORK_CONFIGS[self.main_name]['start_year'],
                       NETWORK_CONFIGS[self.main_name]['end_year'])
            ]

            for command in author_and_recipient_commands:
                cur.execute(command)
                while True:
                    row = cur.fetchone()
                    if not row: break
                    else:
                        for person in english_name_regex.findall(row['node']):
                            node_counter[person] += 1

            nodes = sorted([i[0] for i in node_counter.most_common(50)])
            print(nodes)
            pickle.dump(
                nodes,
                open(
                    PATH_TOKENIZED +
                    'networks/{}_nodes.pickle'.format(self.main_name), 'wb'))

        return nodes
Example #12
def get_tid_to_filelength_dict():
    """Returns a dict from tid to the byte length of the document's OCR text file."""

    try:
        tid_to_filelength_dict = pickle.load(gzip.open(PATH_TOKENIZED +
                    'tid_to_filelength_dict.pickle', 'rb'))

    except IOError:
        print("Preprocessed tid_to_filelength_dict not available. Creating a new one.")

        tid_to_filelength_dict = {}

        db = Database('TOB_FULL')
        con, cur = db.connect()

        cur.execute('SELECT tid from docs')
        count = 0
        while True:
            count += 1
            if count % 10000 == 0:
                print(count)
            row = cur.fetchone()
            if not row: break
            tid = row['tid']

            filepath = '{}{}/{}/{}/{}/{}'.format(PATH_OCR_FILES, tid[0], tid[1], tid[2], tid[3], tid + ".txt")

            # Read the file into a zero-filled buffer and find the first null
            # byte; if the buffer was too small, grow it and try again.
            array_len = 10000
            end = None
            while True:
                b = bytearray(array_len)
                with io.open(filepath, 'rb') as f:
                    f.readinto(b)
                text = b.decode('cp1252', errors='ignore')
                end = text.find('\x00')
                if end > -1:
                    break
                else:
                    array_len *= 10

            tid_to_filelength_dict[tid] = end

        pickle.dump(tid_to_filelength_dict, gzip.open(PATH_TOKENIZED + 'tid_to_filelength_dict.pickle', 'wb'))
        print("Longest file is {} bytes long.".format(max(tid_to_filelength_dict.values())))


    # if the number of tids in the dict != DOC_COUNT, something is wrong
    assert len(tid_to_filelength_dict) == DOC_COUNT, "Length of tid_to_filelength_dict ({}) does not equal DOC_COUNT ({})".format(len(tid_to_filelength_dict), DOC_COUNT)
    return tid_to_filelength_dict
def create_dtype_strings():
    '''
    Prints all document types that appear at least 100 times, ordered by number of appearances.

    '''

    db = Database("TOB_FULL")
    con, cur = db.connect()
    cur.execute("select doc_type, count(*) as hits from doc_types group by doc_type order by count(*) desc;")
    rows = cur.fetchall()

    for row in rows:
        if row['hits'] >= 100:
            print(u'({:30s}, 0, {:8d}, []),'.format('"{}"'.format(row['doc_type']), row['hits']))
Example #14
def initialize_green_db():

    db = Database('TOB_FULL')
    con, cur = db.connect()

    cur.execute('''CREATE TABLE IF NOT EXISTS green(
                    id              int NOT NULL AUTO_INCREMENT,
                    node            varchar(255)  NOT NULL,
                    main_author_is  varchar(10)   NOT NULL,
                    tid             varchar(10)   NOT NULL,
                    weight          float                 ,

                    PRIMARY KEY(id));''')
    con.commit()
Example #15
def process_all():

    create_sqlite_table()

    terms = []

    db = Database('TOB_FULL')
    con, cur = db.connect()

    cur.execute('SELECT token from tokens where total > 10000;')
    for row in cur.fetchall():
        term = row['token']
        valid = True

        for word in term.split():
            if len(word) == 1:
                valid = False
            try:
                int(word)
                valid = False
            except ValueError:
                pass
        if valid:
            terms.append(term)

    print("Number of terms: {}".format(len(terms)))


    for collection in COL_NAME_TO_ID:
        col_id = COL_NAME_TO_ID[collection]
        filtered_collection_vector = FILTERS['doc_type'][('internal communication', False)].copy().convert_to_datatype('np_uint8')
        filtered_collection_vector.filter_with(FILTERS['collection'][(col_id, False)].convert_to_datatype('np_uint8'))
        max_5p_filter = Vector().load_page_number_vector('max_5')
        print("pre", filtered_collection_vector)
        filtered_collection_vector.filter_with(max_5p_filter)
        print('post', filtered_collection_vector)

        if collection == 'msa_bat':
            totals = TOTALS_COL[5]
            for id in [6, 7, 8, 9, 10, 11, 15]:
                totals += TOTALS_COL[id]
            print(totals)
        else:
            totals = TOTALS_COL[col_id]
        filtered_totals_year_vector = totals.convert_to_year_array(filter_vec=filtered_collection_vector)


        for term in terms:
            find_and_store_policy(term, filtered_collection_vector, filtered_totals_year_vector, collection)
Example #16
def get_col_name_and_idx_dict():
    """ Creates a dict of collection name to id as well as id to collection name

    For every id, code ('pm'), name:
    dict[id] = (code, name)
    dict[code] = id
    dict[name] = id

    :return:
    """

    try:
        col_name_and_idx_dict = pickle.load(
            gzip.open(PATH_TOKENIZED + 'col_name_and_idx_dict.pickle', 'rb'))

    except IOError:

        print("col_name_and_idx_dict not available. Creating now...")

        db = Database("TOB_RAW")
        con, cur = db.connect()

        cur.execute("SELECT id, code, name FROM idl_collection;")
        rows = cur.fetchall()

        col_name_and_idx_dict = {}

        for row in rows:
            name_short = (row['name'].replace('Collection', '')
                          .replace('Records', '')
                          .replace('(DATTA)', '')
                          .replace('&', 'and').strip())
            col_name_and_idx_dict[row['id']] = {
                'code': row['code'],
                'name': row['name'],
                'name_short': name_short
            }
            col_name_and_idx_dict[row['code']] = row['id']
            col_name_and_idx_dict[row['name']] = row['id']

        print(col_name_and_idx_dict)

        pickle.dump(
            col_name_and_idx_dict,
            gzip.open(PATH_TOKENIZED + 'col_name_and_idx_dict.pickle', 'wb'))

    return col_name_and_idx_dict
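A usage sketch of the two-way mapping above ('pm' is the example code from the docstring; actual ids depend on the idl_collection table):

# Hypothetical usage: resolve a collection code to its id and back to its short name.
col_dict = get_col_name_and_idx_dict()
pm_id = col_dict['pm']
print(pm_id, col_dict[pm_id]['name_short'])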
def add_section_offset():
    '''
    Computes the cumulative section offsets from each document's token count
    and checks the final total against SECTION_COUNT.

    :return:
    '''

    db = Database("TOB_FULL")
    con1, cur1 = db.connect()
    cur1.execute("SELECT id, tid, no_tokens FROM docs ORDER BY id ASC;")

    count = 0
    first_section_id_of_doc = 0
    no_t = None
    doc_id = None
    while True:
        row = cur1.fetchone()
        if not row: break
        count += 1



        if count % 100000 == 0:
            print(count, first_section_id_of_doc)

        if count < 100:

            doc_text = get_ocr_by_tid(row['tid'], return_bytearray=False).lower()
            doc_text = expand_contractions(doc_text)
            document_split = re.findall(WORD_SPLIT_REGEX, doc_text)
            text_sections = [document_split[i:i+200] for i in range(0, len(document_split), 200)]

            print(count, first_section_id_of_doc, row['no_tokens'], row['no_tokens']//200+1, len(text_sections))


        first_section_id_of_doc = first_section_id_of_doc + row['no_tokens'] // 200 + 1

        # prevent off by 1 error
        if row['no_tokens'] % 200 == 0:
            first_section_id_of_doc -= 1

        no_t = row['no_tokens']
        doc_id = row['id']

    print("final", doc_id, first_section_id_of_doc, no_t)
    print(first_section_id_of_doc - SECTION_COUNT)
Example #18
    def __get_tid_weight(self, tid):

        db = Database('TOB_FULL')
        _, cur = db.connect()
        cur.execute('''select docs.tid, authors.author, recipients.recipient
                          FROM docs, authors, recipients
                          WHERE docs.tid="{}" AND docs.id=authors.doc_id AND authors.doc_id =recipients.doc_id;'''
                    .format(tid))
        authors = set()
        recipients = set()
        for row in cur.fetchall():
            for author in english_name_regex.findall(row['author']):
                authors.add(author)
            for recipient in english_name_regex.findall(row['recipient']):
                recipients.add(recipient)
        weight = 1 / (len(authors) * len(recipients))
        # print(weight, authors, recipients)
        return weight
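The weight is the reciprocal of (number of authors x number of recipients), so a document from one author to four recipients contributes 0.25 per author-recipient pair; a quick standalone check (the recipient names are illustrative):

# Standalone check of the weighting rule used above.
authors = {'Green, SJ'}
recipients = {'A', 'B', 'C', 'D'}
assert 1 / (len(authors) * len(recipients)) == 0.25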
def get_doc_types_to_idx_dict():
    '''
    Creates a dict of doc_type to id as well as id to doc_type
    :return:
    '''


    try:
        doc_types_and_idx_dict = pickle.load(gzip.open(PATH_TOKENIZED + 'doc_types_and_idx_dict.pickle', 'rb'))

    except IOError:

        print("No doc_types_and_idx_dict available. Creating now...")

        db = Database("TOB_FULL")
        con, cur = db.connect()

        cur.execute("select doc_type, count(*) as hits from doc_types group by doc_type order by count(*) desc;")

        rows = cur.fetchall()

        doc_types_and_idx_dict = {}

        ignored_doc_types = get_dtype_dict()['ignored']


        idx = 0
        for row in rows:

            doc_type = row['doc_type']
            total = row['hits']

            if doc_type in ignored_doc_types or total < 100:
                continue
            else:
                doc_types_and_idx_dict[doc_type] = idx
                idx += 1

        doc_types_and_idx_dict.update({d[1]:d[0] for d in doc_types_and_idx_dict.items()})

        pickle.dump(doc_types_and_idx_dict, gzip.open(PATH_TOKENIZED + 'doc_types_and_idx_dict.pickle', 'wb'))


    return doc_types_and_idx_dict
def fill_in_unknown_authors():

    '''
    Add an empty string as author for all documents that have no authors
    Otherwise, the join in text_passages search does not work (joining docs and authors)

    :return:
    '''

    db = Database("TOB_FULL")
    con, cur = db.connect()

    for i in range(DOC_COUNT):
        cur.execute('select doc_id FROM authors WHERE doc_id = {}'.format(i))
        if len(cur.fetchall()) == 0:
            print(i, "inserting empty string")
            cur.execute('INSERT INTO authors VALUES ({}, "");'.format(i))

    con.commit()
Example #21
def get_nodes_init(main_name):
    '''
    Get initial set of 50 nodes to process

    :param main_name:
    :return:
    '''

    # Step 1: Identify top 50 nodes
    node_counter = Counter()

    db = Database("TOB_FULL")
    con, cur = db.connect()

    author_and_recipient_commands = [
        '''SELECT recipients.recipient as node, docs.tid, docs.year, "author" as main_person_is
                          FROM authors, recipients, docs
                          WHERE authors.author="{}" and authors.doc_id=recipients.doc_id AND docs.id=authors.doc_id
                          AND docs.year >= {} AND docs.year <= {};'''.format(
            NETWORK_CONFIGS[main_name]['name'],
            NETWORK_CONFIGS[main_name]['start_year'],
            NETWORK_CONFIGS[main_name]['end_year']),
        '''SELECT authors.author as node, authors.doc_id, docs.tid, docs.year, "recipient" as main_person_is
                          FROM authors, recipients, docs
                          WHERE recipients.recipient="{}" and authors.doc_id=recipients.doc_id AND docs.id=authors.doc_id
                          AND docs.year >= {} AND docs.year <= {};'''.format(
            NETWORK_CONFIGS[main_name]['name'],
            NETWORK_CONFIGS[main_name]['start_year'],
            NETWORK_CONFIGS[main_name]['end_year'])
    ]

    for command in author_and_recipient_commands:
        cur.execute(command)
        while True:
            row = cur.fetchone()
            if not row: break
            else:
                for person in english_name_regex.findall(row['node']):
                    node_counter[person] += 1

    print(node_counter.most_common(50))
    top_50_nodes = sorted([i[0] for i in node_counter.most_common(50)])
    return top_50_nodes
def batch_insert(docs, doc_types, authors, recipients):

    '''
    Inserts the documents in batches

    :param docs:
    :param doc_types:
    :param authors:
    :param recipients:
    :return:
    '''


    db2 = Database("TOB_FULL")
    db2.batch_insert('docs',
                    ['id', 'tid', 'timestamp', 'year', 'date_orig', 'title', 'collection_id', 'pages', 'no_docs', 'availability'],
                    docs)
    db2.batch_insert('doc_types', ['doc_id', 'doc_type', 'weight'], doc_types)
    db2.batch_insert('authors', ['doc_id', 'author'], authors)
    db2.batch_insert('recipients', ['doc_id', 'recipient'], recipients)
def distinctive_term_nodes(main_name):

    local_vocabulary, totals_main = get_vocabulary_and_totals(main_name)
    db = Database("TOB_NETWORKS")
    con, cur = db.connect()
    cur.execute('SELECT DISTINCT(node) as node FROM {}_docs'.format(main_name))
    nodes = [row['node'] for row in cur.fetchall()]

    print(nodes)
    for node in nodes:

        cur.execute('SELECT tid from {}_docs WHERE node = "{}"'.format(
            main_name, node))
        tids = [row['tid'] for row in cur.fetchall()]
        totals_node = get_totals(tids, local_vocabulary)

        distinctive_terms = get_distinctive_terms(totals_node, totals_main,
                                                  local_vocabulary)

        print("\n", node)
        print(distinctive_terms)
def add_timestamp_to_idl_doc():
    '''
    1/19/2017
    Before filling the tables, add a timestamp column to idl_doc, so we can order by timestamp when filling
    tob_full db.

    :return:
    '''

    db = Database("TOBACCO_RAW")
    con1, cur1 = db.connect()
    con2, cur2 = db.connect()

    #cur1.execute('ALTER TABLE idl_doc ADD timestamp BIGINT;')

    cur1.execute('SELECT id, value from idl_doc_field where itag = 4 order by id asc;')
    count = 0
    while True:
        count += 1
        if count % 10000 == 0:
            print(count)

        row = cur1.fetchone()
        if not row:
            break
        id = row['id']
        date = row['value']
        timestamp = calculate_unix_timestamp(date)

        if not isinstance(timestamp, int):
            print("Conversion error with: {} {} {}".format(id, date, timestamp))

        if timestamp is None:
            cur2.execute('UPDATE idl_doc SET timestamp = NULL WHERE id={};'.format(id))
            print("No timestamp for", id, date, timestamp)
        else:
            cur2.execute('UPDATE idl_doc SET timestamp = {} WHERE id={};'.format(timestamp, id))
    con2.commit()

    cur1.execute('CREATE INDEX idl_doc_timestamp_idx on idl_doc(timestamp);')
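For context, a minimal stand-in for the date-to-timestamp step (this is NOT the project's calculate_unix_timestamp, which may accept more formats; it only sketches the idea for ISO-style dates):

# Hypothetical stand-in for calculate_unix_timestamp: 'YYYY-MM-DD' -> unix time,
# returning None when the value cannot be parsed.
from datetime import datetime, timezone

def unix_timestamp_sketch(date_str):
    try:
        dt = datetime.strptime(date_str, '%Y-%m-%d')
        return int(dt.replace(tzinfo=timezone.utc).timestamp())
    except (TypeError, ValueError):
        return None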
def delete_token(token):
    # Remove cached passage results for the token from both results databases.
    for db_name in ('RESULTS_RDS', 'RESULTS_LOCAL'):
        db = Database(db_name)
        con, cur = db.connect()

        cur.execute(
            'delete from results_passages_yearly where tokens = "{}";'.format(
                token))
        cur.execute(
            'delete from results_passages_yearly_default where tokens = "{}";'
            .format(token))
        cur.execute(
            'delete from results_passages where tokens = "{}";'.format(token))
        con.commit()
def add_no_tokens():

    db = Database('TOB_FULL')
    con1, cur1 = db.connect()
    con2, cur2 = db.connect()

    cur1.execute("SELECT tid FROM docs;")

    while True:
        row = cur1.fetchone()
        if not row: break

        tid = row['tid']
        doc_text = get_ocr_by_tid(tid, return_bytearray=False).lower()
        doc_text = expand_contractions(doc_text)
        no_tokens = len(re.findall(WORD_SPLIT_REGEX, doc_text))

        print(tid, no_tokens)
        cur2.execute('UPDATE docs SET no_tokens = {} WHERE tid = "{}";'.format(no_tokens, tid))


    con2.commit()
Example #27
def generate_node_db(main_name):

    initialize_db(main_name)

    # Step 1: Identify top 50 nodes
    node_counter = Counter()

    db = Database("TOB_FULL")
    con, cur = db.connect()

    db_net = Database("TOB_NETWORKS")
    con_net, cur_net = db_net.connect()
    con.commit()

    author_and_recipient_commands = [
        '''SELECT recipients.recipient as node, docs.tid, docs.year, "author" as main_person_is
                          FROM authors, recipients, docs
                          WHERE authors.author="{}" and authors.doc_id=recipients.doc_id AND docs.id=authors.doc_id
                          AND docs.year >= {} AND docs.year <= {};'''.format(
            NETWORK_CONFIGS[main_name]['name'],
            NETWORK_CONFIGS[main_name]['start_year'],
            NETWORK_CONFIGS[main_name]['end_year']),
        '''SELECT authors.author as node, authors.doc_id, docs.tid, docs.year, "recipient" as main_person_is
                          FROM authors, recipients, docs
                          WHERE recipients.recipient="{}" and authors.doc_id=recipients.doc_id AND docs.id=authors.doc_id
                          AND docs.year >= {} AND docs.year <= {};'''.format(
            NETWORK_CONFIGS[main_name]['name'],
            NETWORK_CONFIGS[main_name]['start_year'],
            NETWORK_CONFIGS[main_name]['end_year'])
    ]

    for command in author_and_recipient_commands:
        cur.execute(command)
        while True:
            row = cur.fetchone()
            if not row: break
            else:
                for person in english_name_regex.findall(row['node']):
                    node_counter[person] += 1

    print(node_counter.most_common(50))
    top_50_nodes = [i[0] for i in node_counter.most_common(50)]

    # Step 2: insert
    for command in author_and_recipient_commands:
        cur.execute(command)
        while True:
            row = cur.fetchone()
            if not row: break
            else:
                for person in english_name_regex.findall(row['node']):
                    if person in top_50_nodes:
                        cur_net.execute(
                            'INSERT INTO {}_docs(node, main_author_is, tid, year) VALUES("{}", "{}", "{}", {})'
                            .format(main_name, person, row['main_person_is'],
                                    row['tid'], row['year']))

    con_net.commit()
Example #28
def generate_section_table(main_name):

    db_net = Database("TOB_NETWORKS")
    con, cur = db_net.connect()
    con2, cur2 = db_net.connect()

    section_id = 0

    cur.execute(
        'SELECT node, main_author_is, tid, weight, year FROM {}_docs ORDER BY id DESC;'
        .format(main_name))
    for row in cur.fetchall():
        document = get_ocr_by_tid(row['tid'], return_bytearray=False)
        document_split = re.findall(WORD_SPLIT_REGEX, document)
        text_sections = [
            document_split[i:i + SECTION_LENGTH]
            for i in range(0, len(document_split), SECTION_LENGTH)
        ]
        text_sections = [
            " ".join(text_section) for text_section in text_sections
        ]

        cur2.execute(
            'SELECT COUNT(*) as count FROM {}_docs WHERE tid = "{}"'.format(
                main_name, row['tid']))
        weight = 1 / cur2.fetchall()[0]['count']

        for tid_section, section in enumerate(text_sections):
            cur2.execute(
                '''INSERT INTO {}_sections (section_id, node, main_author_is, tid, tid_section, weight, year, text)
                                VALUES ({}, "{}", "{}", "{}", {}, {}, {}, "{}")'''
                .format(main_name, section_id, row['node'],
                        row['main_author_is'], row['tid'], tid_section, weight,
                        row['year'], section))

            section_id += 1

    con2.commit()
def get_entry_queue(n=None):

    doc_ids = []
    db = Database("TOB_FULL")
    con, cur = db.connect()
    if n:
        print("\n\nWARNING: Only using {} documents! Use this for testing purposes only!\n\n".format(n))
        cur.execute("SELECT id, tid from docs order by id asc limit {};".format(n))
    else:
        cur.execute("SELECT id, tid from docs order by id asc;")

    while True:
        doc_id = cur.fetchone()
        if not doc_id:
            break
        doc_ids.append(doc_id)

    entry_queue = mp.Queue()

    for id in doc_ids:
        entry_queue.put(id)

    return entry_queue
def wildcard_search(
    token,
    ngram=None,
    number_of_tokens=10,
):
    '''
    Expands a wildcard search token (leading or trailing "*") into up to
    number_of_tokens matching tokens, ordered by total frequency.

    :return: list of matching tokens
    '''
    wildcard_tokens = []

    print("wildcard", token[:-1])

    _, cur = Database("TOB_FULL").connect()

    if ngram:
        ngram_search = ' AND ngram={} '.format(ngram)
    else:
        ngram_search = ''

    if token[-1] == '*':
        cur.execute(
            'SELECT token FROM tokens where token LIKE "{}%" {} order by total DESC LIMIT {}'
            .format(token[:-1], ngram_search, number_of_tokens + 1))
    else:
        cur.execute(
            'SELECT token FROM tokens WHERE token_reversed LIKE "{}%" {} ORDER BY total DESC LIMIT {};'
            .format(token[1:][::-1], ngram_search, number_of_tokens + 1))

    for row in cur.fetchall():

        if len(wildcard_tokens) == number_of_tokens: break
        cur_token = row['token']
        if cur_token == token:
            continue
        else:
            wildcard_tokens.append(row['token'])

    return wildcard_tokens
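The suffix case above relies on a token_reversed column: reversing both the stored token and the search pattern turns a leading wildcard into an indexable prefix match. A small plain-Python illustration:

# Illustration of the token_reversed trick: '*cigarette' becomes a prefix
# match against the reversed token.
token = 'e-cigarette'
pattern = '*cigarette'
assert token[::-1].startswith(pattern[1:][::-1])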