def document_iterator(doc_ids):
    """
    Select metadata (authors, tid, date, title, collection) for selected text passages

    6/9/17: Now using bytearrays instead of mysql

    :param doc_ids:
    :return:
    """

    db = Database("TOB_FULL")
    con, cur = db.connect()

    cur.execute('''SELECT docs.id, docs.tid, docs.timestamp, docs.year, docs.title, docs.collection_id,
                          GROUP_CONCAT(authors.author SEPARATOR ', ') as author
                   FROM docs, authors
                   WHERE docs.id = authors.doc_id AND docs.id in ({})
                   GROUP BY docs.id;'''.format(",".join([str(i) for i in doc_ids])))

    while True:
        row = cur.fetchone()
        if row:
            yield row
        else:
            break

    con.close()

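# Usage sketch (not part of the original module): document_iterator is a generator,
# so the metadata rows can be consumed lazily; the row keys follow the column names
# and aliases in the query above. The doc ids below are placeholders.
def _example_print_metadata(doc_ids=(0, 1, 2)):
    for row in document_iterator(doc_ids):
        print(row['tid'], row['year'], row['title'], row['author'])
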
def __get_dtms(self):
    '''
    Returns dict of dtms: { 'count': count_dtm, 'tf': tf_dtm, 'tfidf': tfidf_dtm }

    :return:
    '''

    try:
        dtm_count = load_csc_matrix_from_file(
            PATH_TOKENIZED + 'networks/{}_dtm_count'.format(self.main_name))
        dtm_tf = load_csc_matrix_from_file(
            PATH_TOKENIZED + 'networks/{}_dtm_tf'.format(self.main_name))
        dtm_tfidf = load_csc_matrix_from_file(
            PATH_TOKENIZED + 'networks/{}_dtm_tfidf'.format(self.main_name))

    except IOError:
        print("DTM for {} not found. Creating now...".format(self.main_name))

        db = Database('TOB_NETWORKS')
        con, cur = db.connect()

        # vocabulary, _ = get_vocabulary_and_totals(self.main_name)
        # vocabulary = vocabulary['ordered']
        count_vectorizer = CountVectorizer(vocabulary=self.vocabulary['ordered'])

        cur.execute('SELECT text FROM {}_sections ORDER BY section_id ASC;'.format(self.main_name))
        docs = [row['text'] for row in cur.fetchall()]

        dtm = count_vectorizer.fit_transform(docs)
        print(dtm.shape, dtm.sum())
        dtm_count = dtm.tocsc()
        store_csr_matrix_to_file(
            dtm_count, PATH_TOKENIZED + 'networks/{}_dtm_count.npz'.format(self.main_name))

        tf_transformer = TfidfTransformer(use_idf=False)
        dtm_tf = tf_transformer.fit_transform(dtm_count.copy())
        store_csr_matrix_to_file(
            dtm_tf.tocsc(), PATH_TOKENIZED + 'networks/{}_dtm_tf.npz'.format(self.main_name))

        tfidf_transformer = TfidfTransformer(use_idf=True)
        dtm_tfidf = tfidf_transformer.fit_transform(dtm_count.copy())
        store_csr_matrix_to_file(
            dtm_tfidf.tocsc(), PATH_TOKENIZED + 'networks/{}_dtm_tfidf.npz'.format(self.main_name))

    dtm = {'count': dtm_count, 'tf': dtm_tf, 'tfidf': dtm_tfidf}
    return dtm

def full_initialization_process():
    """
    Run through the full database initialization process. This means:
    - Creates indexes on the TOBACCO_RAW DB (idl_doc, idl_doc_field)
    - Adds a unix timestamp to each document
    - Creates a utf8 text file for each document
    - Initializes docs, doc_types, authors, recipients tables
    - Fills those tables

    1/19/2017: Runs through the whole initialization after importing the raw sql.
    Includes creating indexes.

    :return:
    """

    db1 = Database('TOBACCO_RAW')
    con1, cur1 = db1.connect()
    cur1.execute('create index idl_doc_field_id_idx on idl_doc_field(id);')
    cur1.execute('create index idl_doc_id_idx on idl_doc(id);')

    add_timestamp_to_idl_doc()
    create_utf_text_files()
    initialize_tables()
    fill_tables()

def get_year_doc_id_list(docs_or_sections: str) -> list:
    """
    Returns a list with one (first_id, last_id) tuple per year, indexed by offset from 1901,
    i.e. year_doc_id_list[year - 1901] -> (first id of that year, last id of that year).

    >>> year_doc_id_list = get_year_doc_id_list('docs')
    >>> ids_1901 = year_doc_id_list[0]
    >>> print(f'First 1901 doc id: {ids_1901[0]}. Last 1901 doc id: {ids_1901[1]}.')
    First 1901 doc id: 0. Last 1901 doc id: 183.
    >>> ids_1902 = year_doc_id_list[1]
    >>> print(f'First 1902 doc id: {ids_1902[0]}. Last 1902 doc id: {ids_1902[1]}.')
    First 1902 doc id: 184. Last 1902 doc id: 238.

    :param docs_or_sections: 'docs' or 'sections'
    :return: list
    """

    try:
        year_doc_id_list = pickle.load(
            open(PATH_TOKENIZED + 'year_{}_id_list.pickle'.format(docs_or_sections), 'rb'))

    except IOError:

        from tobacco.frequencies_preprocessing.preprocessing_sections import \
            get_doc_id_to_section_id_dict
        doc_id_to_section_id_dict = get_doc_id_to_section_id_dict()
        from tobacco.utilities.databases import Database

        print("Creating new year_{}_id_list".format(docs_or_sections))

        db = Database("TOB_FULL")
        con, cur = db.connect()

        year_doc_id_list = []

        for year in range(1901, 2017):
            cur.execute("SELECT MIN(id), MAX(id) FROM docs WHERE year = {}".format(year))
            row = cur.fetchall()[0]
            min_doc_id = row['MIN(id)']
            max_doc_id = row['MAX(id)']

            if docs_or_sections == 'docs':
                year_doc_id_list.append((min_doc_id, max_doc_id))
                print(year, min_doc_id)

            elif docs_or_sections == 'sections':
                min_section_id = doc_id_to_section_id_dict[min_doc_id][0]
                max_section_id = doc_id_to_section_id_dict[max_doc_id][1]
                year_doc_id_list.append((min_section_id, max_section_id))
                print(year, min_section_id, max_section_id)

        pickle.dump(
            year_doc_id_list,
            open(PATH_TOKENIZED + 'year_{}_id_list.pickle'.format(docs_or_sections), 'wb'))

    return year_doc_id_list

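# Usage sketch (not part of the original module): the list is indexed by offset from
# 1901, so the (first_id, last_id) pair of a given year can be looked up like this.
def _example_ids_for_year(year, docs_or_sections='docs'):
    year_id_list = get_year_doc_id_list(docs_or_sections)
    first_id, last_id = year_id_list[year - 1901]
    return first_id, last_id
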
def initialize_db(main_name):

    db = Database('TOB_NETWORKS')
    con, cur = db.connect()

    cur.execute('''CREATE TABLE IF NOT EXISTS {}_docs(
                      id             int NOT NULL AUTO_INCREMENT,
                      node           varchar(255) NOT NULL,
                      main_author_is varchar(10) NOT NULL,
                      tid            varchar(10) NOT NULL,
                      weight         float,
                      year           INT NOT NULL,
                      PRIMARY KEY(id));'''.format(main_name))

    cur.execute('''CREATE TABLE IF NOT EXISTS {}_sections(
                      section_id     int NOT NULL,
                      node           varchar(255) NOT NULL,
                      main_author_is varchar(10) NOT NULL,
                      tid            varchar(10) NOT NULL,
                      tid_section    INT NOT NULL,
                      weight         float NOT NULL,
                      year           int NOT NULL,
                      text           TEXT NOT NULL,
                      PRIMARY KEY(section_id));'''.format(main_name))

    con.commit()

def distinctive_terms_overall(main_name):

    global_totals = get_vocabulary_totals(1)
    vocabulary_trie = load_vocabulary_trie(1)
    local_vocabulary, local_totals = get_vocabulary_and_totals(main_name)

    global_totals_localized = np.zeros(len(local_vocabulary['id_to_token']))
    for token in local_vocabulary['token_to_id']:
        local_token_id = local_vocabulary['token_to_id'][token]
        global_token_id = vocabulary_trie[token]
        global_totals_localized[local_token_id] = global_totals[global_token_id]

    print(len(global_totals), len(local_totals), len(global_totals_localized))

    distinctive_terms = get_distinctive_terms(local_totals, global_totals_localized, local_vocabulary)
    print(distinctive_terms)

    db = Database("TOB_NETWORKS")
    con, cur = db.connect()
    cur.execute('SELECT DISTINCT(tid) as tid from {}_docs'.format(main_name))
    tids = [row['tid'] for row in cur.fetchall()]

    totals2 = get_totals(tids, local_vocabulary)
    dist = get_distinctive_terms(totals2, global_totals_localized, local_vocabulary)
    print("\n", dist)

    totals3 = get_totals(tids, local_vocabulary, tf=True)
    dist = get_distinctive_terms(totals3, global_totals_localized, local_vocabulary)
    print("\n", dist)

def markov(seed, no_token_to_generate=100, tokens_to_select=1):
    '''
    Generate text with a simple Markov chain: repeatedly look up the continuations of the
    current seed in the tokens table and sample one, weighted by its total count.
    '''

    db = Database('TOB_FULL')
    con, cur = db.connect()

    seed_length = len(seed.split())
    output = seed

    for i in range(no_token_to_generate):
        cur.execute('''SELECT token, total from tokens
                       where token like "{} %" and ngram={}
                       order by total desc;'''.format(seed, seed_length + tokens_to_select))
        continuations = cur.fetchall()

        # weight every continuation by its total count
        possible_selections = []
        for continuation in continuations:
            token = " ".join(continuation['token'].split()[seed_length:][-tokens_to_select:])
            for _ in range(continuation['total']):
                possible_selections.append(token)

        selection = random.choice(possible_selections)
        output += " " + selection
        seed = " ".join(output.split()[-2:])

    print(output)

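# Usage sketch (not part of the original module): the seed should be a phrase that
# actually appears in the tokens table; markov() prints the generated text rather
# than returning it. The seed below is a made-up example.
def _example_markov():
    markov('nicotine is', no_token_to_generate=50)
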
def generate_node_db():

    initialize_green_db()

    # Step 1: Identify top 50 nodes
    node_counter = Counter()

    db = Database("TOB_FULL")
    con, cur = db.connect()
    con2, cur2 = db.connect()

    cur.execute('DELETE FROM green;')
    con.commit()

    author_and_recipient_commands = [
        '''SELECT recipients.recipient as node, docs.tid, "author" as main_person_is
           FROM authors, recipients, docs
           WHERE authors.author="Green, SJ" and authors.doc_id=recipients.doc_id AND docs.id=authors.doc_id;''',
        '''SELECT authors.author as node, authors.doc_id, docs.tid, "recipient" as main_person_is
           FROM authors, recipients, docs
           WHERE recipients.recipient="Green, SJ" and authors.doc_id=recipients.doc_id AND docs.id=authors.doc_id;''',
    ]

    for command in author_and_recipient_commands:
        cur.execute(command)
        while True:
            row = cur.fetchone()
            if not row:
                break
            else:
                for person in english_name_regex.findall(row['node']):
                    node_counter[person] += 1
                    # cur2.execute('INSERT INTO green(node, main_author_is, tid) VALUES("{}", "{}", "{}")'.format(
                    #     person, row['main_person_is'], row['tid']
                    # ))
                    # con2.commit()

    print(node_counter.most_common(50))
    top_50_nodes = [i[0] for i in node_counter.most_common(50)]

    # Step 2: insert
    for command in author_and_recipient_commands:
        cur.execute(command)
        while True:
            row = cur.fetchone()
            if not row:
                break
            else:
                for person in english_name_regex.findall(row['node']):
                    if person in top_50_nodes:
                        cur2.execute(
                            'INSERT INTO green(node, main_author_is, tid) VALUES("{}", "{}", "{}")'.format(
                                person, row['main_person_is'], row['tid']))
                        con2.commit()

def get_doc_id_to_section_id_dict():
    """Returns a dict that maps from doc_id (0-indexed) to section ids.

    Every doc_id maps to a tuple of (first_section_id_of_doc, last_section_id_of_doc)

    :return:
    """

    try:
        doc_id_to_section_id_dict = pickle.load(
            open(PATH_TOKENIZED + 'doc_id_to_section_id_dict.pickle', 'rb'))

    except IOError:
        print("doc_id_to_section_id_dict not found. Creating a new one with section length = {}.".format(
            SECTION_LENGTH))

        db = Database("TOB_FULL")
        con1, cur1 = db.connect()
        cur1.execute("SELECT id, tid, no_tokens FROM docs ORDER BY id ASC;")

        doc_id_to_section_id_dict = {}
        first_section_id_of_doc = 0

        while True:
            row = cur1.fetchone()
            if not row:
                break

            last_section_id_of_doc = first_section_id_of_doc + row['no_tokens'] // 200
            # prevent off by 1 error
            if row['no_tokens'] % 200 == 0:
                last_section_id_of_doc -= 1

            doc_id_to_section_id_dict[row['id']] = (first_section_id_of_doc, last_section_id_of_doc)

            if row['id'] < 20 or row['id'] % 1000000 == 0:
                print(row['id'], first_section_id_of_doc, last_section_id_of_doc)

            first_section_id_of_doc = last_section_id_of_doc + 1

        print("Final section id: {}. Number of sections: {}. These numbers have to be equal.".format(
            first_section_id_of_doc, SECTION_COUNT))
        assert first_section_id_of_doc == SECTION_COUNT

        pickle.dump(
            doc_id_to_section_id_dict,
            open(PATH_TOKENIZED + 'doc_id_to_section_id_dict.pickle', 'wb'))

    return doc_id_to_section_id_dict

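# Worked example (illustrative, not part of the original module): recomputes the
# (first, last) section id span used above for a single document. A document with
# 450 tokens spans three 200-token sections, one with exactly 400 tokens spans two
# (the modulo check removes the off-by-one).
def _example_section_span(first_section_id, no_tokens, section_length=200):
    last_section_id = first_section_id + no_tokens // section_length
    if no_tokens % section_length == 0:
        last_section_id -= 1
    return first_section_id, last_section_id

# _example_section_span(0, 450) == (0, 2); _example_section_span(0, 400) == (0, 1)
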
def load_page_number_vector(self, name, return_type='np_uint8'):
    """
    Loads a vector with either a prescribed minimum or maximum number of pages

    >>> x = Vector().load_page_number_vector('max_5')
    >>> x
    <Document Vector of type np_uint8 with 9404609 elements and length 11303161.>

    :param name:
    :return:
    """

    if not re.match(r'(min|max)_[1-9]\d*', name):
        raise ValueError("Page number vectors need to start with min/max followed by an"
                         " underscore and a number, e.g. min_10 (minimum 10 pages) or max_5 "
                         "(max 5 pages).")

    file_path = Path(PATH_TOKENIZED, 'filters', f'{name}.npy')
    print(file_path)

    if not file_path.exists():
        from tobacco.utilities.databases import Database
        import operator

        if name.startswith('min'):
            op = operator.ge
        else:
            op = operator.le
        num = int(name[4:])

        db = Database('TOB_FULL')
        con, cur = db.connect()
        print('con')
        cur.execute('SELECT id, pages from docs order by id asc')

        arr = np.zeros(DOC_COUNT, dtype=np.uint8)
        rowid = 0
        while True:
            row = cur.fetchone()
            if not row:
                break
            if not row['pages']:
                row['pages'] = 0
            assert rowid == row['id']
            if op(row['pages'], num):    # if page number >= or <= the specified number
                arr[rowid] = 1
            rowid += 1

        v = Vector(arr)
        v.save_to_disk(file_path)

    return self.load_from_disk(file_path, return_type)

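# Usage sketch (not part of the original module): loads two page-number filters; the
# first call per name builds and caches the .npy file, later calls only read it back.
def _example_page_filters():
    at_most_five_pages = Vector().load_page_number_vector('max_5')
    at_least_ten_pages = Vector().load_page_number_vector('min_10')
    print(at_most_five_pages, at_least_ten_pages)
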
def __get_nodes_init(self):

    try:
        nodes = pickle.load(
            open(PATH_TOKENIZED + 'networks/{}_nodes.pickle'.format(self.main_name), 'rb'))

    except IOError:
        print("Top 50 nodes not available. Creating now...")

        # Step 1: Identify top 50 nodes
        node_counter = Counter()

        db = Database("TOB_FULL")
        con, cur = db.connect()

        author_and_recipient_commands = [
            '''SELECT recipients.recipient as node, docs.tid, docs.year, "author" as main_person_is
               FROM authors, recipients, docs
               WHERE authors.author="{}" and authors.doc_id=recipients.doc_id AND docs.id=authors.doc_id
                     AND docs.year >= {} AND docs.year <= {};'''.format(
                NETWORK_CONFIGS[self.main_name]['name'],
                NETWORK_CONFIGS[self.main_name]['start_year'],
                NETWORK_CONFIGS[self.main_name]['end_year']),
            '''SELECT authors.author as node, authors.doc_id, docs.tid, docs.year, "recipient" as main_person_is
               FROM authors, recipients, docs
               WHERE recipients.recipient="{}" and authors.doc_id=recipients.doc_id AND docs.id=authors.doc_id
                     AND docs.year >= {} AND docs.year <= {};'''.format(
                NETWORK_CONFIGS[self.main_name]['name'],
                NETWORK_CONFIGS[self.main_name]['start_year'],
                NETWORK_CONFIGS[self.main_name]['end_year'])
        ]

        for command in author_and_recipient_commands:
            cur.execute(command)
            while True:
                row = cur.fetchone()
                if not row:
                    break
                else:
                    for person in english_name_regex.findall(row['node']):
                        node_counter[person] += 1

        nodes = sorted([i[0] for i in node_counter.most_common(50)])
        print(nodes)
        pickle.dump(
            nodes,
            open(PATH_TOKENIZED + 'networks/{}_nodes.pickle'.format(self.main_name), 'wb'))

    return nodes

def get_tid_to_filelength_dict():

    try:
        tid_to_filelength_dict = pickle.load(
            gzip.open(PATH_TOKENIZED + 'tid_to_filelength_dict.pickle', 'rb'))

    except IOError:
        print("Preprocessed tid_to_filelength_dict not available. Creating a new one.")

        tid_to_filelength_dict = {}
        # alternative representation, keyed by doc id instead of tid:
        # tid_to_filelength_arr = np.zeros(DOC_COUNT, dtype=np.int64)

        db = Database('TOB_FULL')
        con, cur = db.connect()
        cur.execute('SELECT tid from docs')

        count = 0
        while True:
            count += 1
            if count % 10000 == 0:
                print(count)

            row = cur.fetchone()
            if not row:
                break
            tid = row['tid']
            filepath = '{}{}/{}/{}/{}/{}'.format(PATH_OCR_FILES, tid[0], tid[1], tid[2], tid[3], tid + ".txt")

            # Read the file into a zero-initialized bytearray and look for the first null
            # byte; if none is found, the buffer was too small, so retry with 10x the size.
            array_len = 10000
            end = None
            while True:
                b = bytearray(array_len)
                with io.open(filepath, 'rb') as f:
                    f.readinto(b)
                decoded = b.decode('cp1252', errors='ignore')
                end = decoded.find('\x00')
                if end > -1:
                    break
                else:
                    array_len *= 10

            tid_to_filelength_dict[tid] = end

        pickle.dump(tid_to_filelength_dict,
                    gzip.open(PATH_TOKENIZED + 'tid_to_filelength_dict.pickle', 'wb'))

        print("Longest file is {} bytes long.".format(max(tid_to_filelength_dict.values())))

        # if the number of tids in the dict != DOC_COUNT, something is wrong
        assert len(tid_to_filelength_dict) == DOC_COUNT, \
            "Length of tid_to_filelength_dict ({}) does not equal DOC_COUNT ({})".format(
                len(tid_to_filelength_dict), DOC_COUNT)

    return tid_to_filelength_dict

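# Illustrative helper (hypothetical name, not part of the original module): mirrors the
# path scheme used above, where the first four characters of a tid select nested directories.
def _example_tid_to_ocr_path(tid):
    return '{}{}/{}/{}/{}/{}'.format(PATH_OCR_FILES, tid[0], tid[1], tid[2], tid[3], tid + ".txt")
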
def create_dtype_strings():
    '''
    Prints out all document types that appear at least 100 times by number of appearances
    '''

    db = Database("TOB_FULL")
    con, cur = db.connect()
    cur.execute("select doc_type, count(*) as hits from doc_types group by doc_type order by count(*) desc;")
    rows = cur.fetchall()

    for row in rows:
        if row['hits'] >= 100:
            print(u'({:30s}, 0, {:8d}, []),'.format('"{}"'.format(row['doc_type']), row['hits']))

def initialize_green_db():

    db = Database('TOB_FULL')
    con, cur = db.connect()

    cur.execute('''CREATE TABLE IF NOT EXISTS green(
                      id             int NOT NULL AUTO_INCREMENT,
                      node           varchar(255) NOT NULL,
                      main_author_is varchar(10) NOT NULL,
                      tid            varchar(10) NOT NULL,
                      weight         float,
                      PRIMARY KEY(id));''')
    con.commit()

def process_all():

    create_sqlite_table()

    # Select all tokens with more than 10000 appearances, skipping terms that
    # contain single-character words or numbers.
    terms = []
    db = Database('TOB_FULL')
    con, cur = db.connect()
    cur.execute('SELECT token from tokens where total > 10000;')
    for row in cur.fetchall():
        term = row['token']
        valid = True
        for word in term.split():
            if len(word) == 1:
                valid = False
            try:
                int(word)
                valid = False
            except ValueError:
                pass
        if valid:
            terms.append(term)
    print("Number of terms: {}".format(len(terms)))

    for collection in COL_NAME_TO_ID:
        col_id = COL_NAME_TO_ID[collection]

        filtered_collection_vector = FILTERS['doc_type'][('internal communication', False)].copy().convert_to_datatype('np_uint8')
        filtered_collection_vector.filter_with(FILTERS['collection'][(col_id, False)].convert_to_datatype('np_uint8'))

        max_5p_filter = Vector().load_page_number_vector('max_5')
        print("pre", filtered_collection_vector)
        filtered_collection_vector.filter_with(max_5p_filter)
        print('post', filtered_collection_vector)

        if collection == 'msa_bat':
            totals = TOTALS_COL[5]
            for id in [6, 7, 8, 9, 10, 11, 15]:
                totals += TOTALS_COL[id]
            print(totals)
        else:
            totals = TOTALS_COL[col_id]
        filtered_totals_year_vector = totals.convert_to_year_array(filter_vec=filtered_collection_vector)

        for term in terms:
            find_and_store_policy(term, filtered_collection_vector, filtered_totals_year_vector, collection)

def get_col_name_and_idx_dict():
    """
    Creates a dict of collection name to id as well as id to collection name.
    For every id: dict[id] = {'code': ..., 'name': ..., 'name_short': ...}
    For every code (e.g. 'pm') and name: dict[code] = id, dict[name] = id

    :return:
    """

    try:
        col_name_and_idx_dict = pickle.load(
            gzip.open(PATH_TOKENIZED + 'col_name_and_idx_dict.pickle', 'rb'))

    except IOError:
        print("col_name_and_idx_dict not available. Creating now...")

        db = Database("TOB_RAW")
        con, cur = db.connect()
        cur.execute("SELECT id, code, name FROM idl_collection;")
        rows = cur.fetchall()

        col_name_and_idx_dict = {}
        for row in rows:
            col_name_and_idx_dict[row['id']] = {
                'code': row['code'],
                'name': row['name'],
                'name_short': row['name'].replace('Collection', '').replace('Records', '')
                                         .replace('(DATTA)', '').replace('&', 'and').strip()
            }
            col_name_and_idx_dict[row['code']] = row['id']
            col_name_and_idx_dict[row['name']] = row['id']

        print(col_name_and_idx_dict)
        pickle.dump(
            col_name_and_idx_dict,
            gzip.open(PATH_TOKENIZED + 'col_name_and_idx_dict.pickle', 'wb'))

    return col_name_and_idx_dict

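# Usage sketch (not part of the original module): the dict maps in both directions,
# so a collection code can be resolved to its id and then to its display name.
# 'pm' is the example code mentioned in the docstring above.
def _example_collection_lookup(code='pm'):
    col_dict = get_col_name_and_idx_dict()
    col_id = col_dict[code]
    return col_dict[col_id]['name']
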
def add_section_offset():
    '''
    Sanity check for section offsets: walks through all documents, accumulates the first
    section id of each doc from its no_tokens count (for the first 100 docs, the count is
    also recomputed from the OCR text), and compares the final section id to SECTION_COUNT.

    :return:
    '''

    db = Database("TOB_FULL")
    con1, cur1 = db.connect()
    cur1.execute("SELECT id, tid, no_tokens FROM docs ORDER BY id ASC;")

    count = 0
    first_section_id_of_doc = 0
    no_t = None
    doc_id = None

    while True:
        row = cur1.fetchone()
        if not row:
            break

        count += 1
        if count % 100000 == 0:
            print(count, first_section_id_of_doc)

        if count < 100:
            doc_text = get_ocr_by_tid(row['tid'], return_bytearray=False).lower()
            doc_text = expand_contractions(doc_text)
            document_split = re.findall(WORD_SPLIT_REGEX, doc_text)
            text_sections = [document_split[i:i+200] for i in range(0, len(document_split), 200)]
            print(count, first_section_id_of_doc, row['no_tokens'], row['no_tokens']//200+1, len(text_sections))

        first_section_id_of_doc = first_section_id_of_doc + row['no_tokens'] // 200 + 1
        # prevent off by 1 error
        if row['no_tokens'] % 200 == 0:
            first_section_id_of_doc -= 1

        no_t = row['no_tokens']
        doc_id = row['id']

    print("final", doc_id, first_section_id_of_doc, no_t)
    print(first_section_id_of_doc - SECTION_COUNT)

def __get_tid_weight(self, tid):
    '''
    Weight of a document: 1 / (number of authors * number of recipients).
    '''

    db = Database('TOB_FULL')
    _, cur = db.connect()
    cur.execute('''select docs.tid, authors.author, recipients.recipient
                   FROM docs, authors, recipients
                   WHERE docs.tid="{}" AND docs.id=authors.doc_id
                         AND authors.doc_id=recipients.doc_id;'''.format(tid))

    authors = set()
    recipients = set()
    for row in cur.fetchall():
        for author in english_name_regex.findall(row['author']):
            authors.add(author)
        for recipient in english_name_regex.findall(row['recipient']):
            recipients.add(recipient)

    weight = 1 / (len(authors) * len(recipients))
    # print(weight, authors, recipients)

    return weight

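# Worked example (illustrative, not part of the original class): the weight is
# 1 / (number of authors * number of recipients), so a document with 2 authors and
# 3 recipients yields a weight of 1/6. The names below are made up.
def _example_tid_weight():
    authors = {'Green, SJ', 'Doe, J'}
    recipients = {'Smith, A', 'Lee, B', 'Wu, C'}
    weight = 1 / (len(authors) * len(recipients))
    assert abs(weight - 1 / 6) < 1e-12
    return weight
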
def get_doc_types_to_idx_dict():
    '''
    Creates a dict of doc_type to id as well as id to doc_type

    :return:
    '''

    try:
        doc_types_and_idx_dict = pickle.load(
            gzip.open(PATH_TOKENIZED + 'doc_types_and_idx_dict.pickle', 'rb'))

    except IOError:
        print("No doc_types_and_idx_dict available. Creating now...")

        db = Database("TOB_FULL")
        con, cur = db.connect()
        cur.execute("select doc_type, count(*) as hits from doc_types group by doc_type order by count(*) desc;")
        rows = cur.fetchall()

        doc_types_and_idx_dict = {}
        ignored_doc_types = get_dtype_dict()['ignored']

        idx = 0
        for row in rows:
            doc_type = row['doc_type']
            total = row['hits']
            if doc_type in ignored_doc_types or total < 100:
                continue
            else:
                doc_types_and_idx_dict[doc_type] = idx
                idx += 1

        # add the reverse mapping (idx -> doc_type)
        doc_types_and_idx_dict.update({d[1]: d[0] for d in doc_types_and_idx_dict.items()})

        pickle.dump(
            doc_types_and_idx_dict,
            gzip.open(PATH_TOKENIZED + 'doc_types_and_idx_dict.pickle', 'wb'))

    return doc_types_and_idx_dict

def fill_in_unknown_authors():
    '''
    Add an empty string as author for all documents that have no authors.
    Otherwise, the join in text_passages search does not work (joining docs and authors).

    :return:
    '''

    db = Database("TOB_FULL")
    con, cur = db.connect()

    for i in range(DOC_COUNT):
        cur.execute('select doc_id FROM authors WHERE doc_id = {}'.format(i))
        if len(cur.fetchall()) == 0:
            print(i, "inserting empty string")
            cur.execute('INSERT INTO authors VALUES ({}, "");'.format(i))
            con.commit()

def get_nodes_init(main_name):
    '''
    Get initial set of 50 nodes to process

    :param main_name:
    :return:
    '''

    # Step 1: Identify top 50 nodes
    node_counter = Counter()

    db = Database("TOB_FULL")
    con, cur = db.connect()

    author_and_recipient_commands = [
        '''SELECT recipients.recipient as node, docs.tid, docs.year, "author" as main_person_is
           FROM authors, recipients, docs
           WHERE authors.author="{}" and authors.doc_id=recipients.doc_id AND docs.id=authors.doc_id
                 AND docs.year >= {} AND docs.year <= {};'''.format(
            NETWORK_CONFIGS[main_name]['name'],
            NETWORK_CONFIGS[main_name]['start_year'],
            NETWORK_CONFIGS[main_name]['end_year']),
        '''SELECT authors.author as node, authors.doc_id, docs.tid, docs.year, "recipient" as main_person_is
           FROM authors, recipients, docs
           WHERE recipients.recipient="{}" and authors.doc_id=recipients.doc_id AND docs.id=authors.doc_id
                 AND docs.year >= {} AND docs.year <= {};'''.format(
            NETWORK_CONFIGS[main_name]['name'],
            NETWORK_CONFIGS[main_name]['start_year'],
            NETWORK_CONFIGS[main_name]['end_year'])
    ]

    for command in author_and_recipient_commands:
        cur.execute(command)
        while True:
            row = cur.fetchone()
            if not row:
                break
            else:
                for person in english_name_regex.findall(row['node']):
                    node_counter[person] += 1

    print(node_counter.most_common(50))
    top_50_nodes = sorted([i[0] for i in node_counter.most_common(50)])
    return top_50_nodes

def batch_insert(docs, doc_types, authors, recipients):
    '''
    Inserts the documents in batches

    :param docs:
    :param doc_types:
    :param authors:
    :param recipients:
    :return:
    '''

    db2 = Database("TOB_FULL")
    db2.batch_insert('docs',
                     ['id', 'tid', 'timestamp', 'year', 'date_orig', 'title', 'collection_id',
                      'pages', 'no_docs', 'availability'],
                     docs)
    db2.batch_insert('doc_types', ['doc_id', 'doc_type', 'weight'], doc_types)
    db2.batch_insert('authors', ['doc_id', 'author'], authors)
    db2.batch_insert('recipients', ['doc_id', 'recipient'], recipients)

def distinctive_term_nodes(main_name):

    local_vocabulary, totals_main = get_vocabulary_and_totals(main_name)

    db = Database("TOB_NETWORKS")
    con, cur = db.connect()

    cur.execute('SELECT DISTINCT(node) as node FROM {}_docs'.format(main_name))
    nodes = [row['node'] for row in cur.fetchall()]
    print(nodes)

    for node in nodes:
        cur.execute('SELECT tid from {}_docs WHERE node = "{}"'.format(main_name, node))
        tids = [row['tid'] for row in cur.fetchall()]
        totals_node = get_totals(tids, local_vocabulary)
        distinctive_terms = get_distinctive_terms(totals_node, totals_main, local_vocabulary)
        print("\n", node)
        print(distinctive_terms)

def add_timestamp_to_idl_doc():
    '''
    1/19/2017
    Before filling the tables, add a timestamp column to idl_doc, so we can order by
    timestamp when filling the tob_full db.

    :return:
    '''

    db = Database("TOBACCO_RAW")
    con1, cur1 = db.connect()
    con2, cur2 = db.connect()

    #cur1.execute('ALTER TABLE idl_doc ADD timestamp BIGINT;')

    cur1.execute('SELECT id, value from idl_doc_field where itag = 4 order by id asc;')

    count = 0
    while True:
        count += 1
        if count % 10000 == 0:
            print(count)

        row = cur1.fetchone()
        if not row:
            break

        id = row['id']
        date = row['value']
        timestamp = calculate_unix_timestamp(date)
        if not type(timestamp) == int:
            print("Conversion error with: {} {} {}".format(id, date, timestamp))

        if timestamp is None:
            cur2.execute('UPDATE idl_doc SET timestamp = NULL WHERE id={};'.format(id))
            print("No timestamp for", id, date, timestamp)
        else:
            cur2.execute('UPDATE idl_doc SET timestamp = {} WHERE id={};'.format(timestamp, id))
        con2.commit()

    cur1.execute('CREATE INDEX idl_doc_timestamp_idx on idl_doc(timestamp);')

def delete_token(token):

    db = Database('RESULTS_RDS')
    con, cur = db.connect()
    cur.execute('delete from results_passages_yearly where tokens = "{}";'.format(token))
    cur.execute('delete from results_passages_yearly_default where tokens = "{}";'.format(token))
    cur.execute('delete from results_passages where tokens = "{}";'.format(token))
    con.commit()

    db = Database('RESULTS_LOCAL')
    con, cur = db.connect()
    cur.execute('delete from results_passages_yearly where tokens = "{}";'.format(token))
    cur.execute('delete from results_passages_yearly_default where tokens = "{}";'.format(token))
    cur.execute('delete from results_passages where tokens = "{}";'.format(token))
    con.commit()

def add_no_tokens():

    db = Database('TOB_FULL')
    con1, cur1 = db.connect()
    con2, cur2 = db.connect()

    cur1.execute("SELECT tid FROM docs;")

    while True:
        row = cur1.fetchone()
        if not row:
            break
        tid = row['tid']

        doc_text = get_ocr_by_tid(tid, return_bytearray=False).lower()
        doc_text = expand_contractions(doc_text)
        no_tokens = len(re.findall(WORD_SPLIT_REGEX, doc_text))
        print(tid, no_tokens)

        cur2.execute('UPDATE docs SET no_tokens = {} WHERE tid = "{}";'.format(no_tokens, tid))
        con2.commit()

def generate_node_db(main_name):

    initialize_db(main_name)

    # Step 1: Identify top 50 nodes
    node_counter = Counter()

    db = Database("TOB_FULL")
    con, cur = db.connect()
    db_net = Database("TOB_NETWORKS")
    con_net, cur_net = db_net.connect()
    con.commit()

    author_and_recipient_commands = [
        '''SELECT recipients.recipient as node, docs.tid, docs.year, "author" as main_person_is
           FROM authors, recipients, docs
           WHERE authors.author="{}" and authors.doc_id=recipients.doc_id AND docs.id=authors.doc_id
                 AND docs.year >= {} AND docs.year <= {};'''.format(
            NETWORK_CONFIGS[main_name]['name'],
            NETWORK_CONFIGS[main_name]['start_year'],
            NETWORK_CONFIGS[main_name]['end_year']),
        '''SELECT authors.author as node, authors.doc_id, docs.tid, docs.year, "recipient" as main_person_is
           FROM authors, recipients, docs
           WHERE recipients.recipient="{}" and authors.doc_id=recipients.doc_id AND docs.id=authors.doc_id
                 AND docs.year >= {} AND docs.year <= {};'''.format(
            NETWORK_CONFIGS[main_name]['name'],
            NETWORK_CONFIGS[main_name]['start_year'],
            NETWORK_CONFIGS[main_name]['end_year'])
    ]

    for command in author_and_recipient_commands:
        cur.execute(command)
        while True:
            row = cur.fetchone()
            if not row:
                break
            else:
                for person in english_name_regex.findall(row['node']):
                    node_counter[person] += 1

    print(node_counter.most_common(50))
    top_50_nodes = [i[0] for i in node_counter.most_common(50)]

    # Step 2: insert
    for command in author_and_recipient_commands:
        cur.execute(command)
        while True:
            row = cur.fetchone()
            if not row:
                break
            else:
                for person in english_name_regex.findall(row['node']):
                    if person in top_50_nodes:
                        cur_net.execute(
                            'INSERT INTO {}_docs(node, main_author_is, tid, year) VALUES("{}", "{}", "{}", {})'.format(
                                main_name, person, row['main_person_is'], row['tid'], row['year']))
                        con_net.commit()

def generate_section_table(main_name):

    db_net = Database("TOB_NETWORKS")
    con, cur = db_net.connect()
    con2, cur2 = db_net.connect()

    section_id = 0

    cur.execute('SELECT node, main_author_is, tid, weight, year FROM {}_docs ORDER BY id DESC;'.format(main_name))
    for row in cur.fetchall():

        document = get_ocr_by_tid(row['tid'], return_bytearray=False)
        document_split = re.findall(WORD_SPLIT_REGEX, document)
        text_sections = [document_split[i:i + SECTION_LENGTH]
                         for i in range(0, len(document_split), SECTION_LENGTH)]
        text_sections = [" ".join(text_section) for text_section in text_sections]

        cur2.execute('SELECT COUNT(*) as count FROM {}_docs WHERE tid = "{}"'.format(main_name, row['tid']))
        weight = 1 / cur2.fetchall()[0]['count']

        for tid_section, section in enumerate(text_sections):
            cur2.execute('''INSERT INTO {}_sections (section_id, node, main_author_is, tid, tid_section,
                                                     weight, year, text)
                            VALUES ({}, "{}", "{}", "{}", {}, {}, {}, "{}")'''.format(
                main_name, section_id, row['node'], row['main_author_is'], row['tid'],
                tid_section, weight, row['year'], section))
            section_id += 1
        con2.commit()

def get_entry_queue(n=None):

    doc_ids = []
    db = Database("TOB_FULL")
    con, cur = db.connect()

    if n:
        print("\n\nWARNING: Only using {} documents! Use this for testing purposes only!\n\n".format(n))
        cur.execute("SELECT id, tid from docs order by id asc limit {};".format(n))
    else:
        cur.execute("SELECT id, tid from docs order by id asc;")

    while True:
        doc_id = cur.fetchone()
        if not doc_id:
            break
        doc_ids.append(doc_id)

    entry_queue = mp.Queue()
    for id in doc_ids:
        entry_queue.put(id)

    return entry_queue

def wildcard_search(token, ngram=None, number_of_tokens=10):
    '''
    Parses wildcard tokens when parsing search tokens

    :return:
    '''

    wildcard_tokens = []
    print("wildcard", token[:-1])

    _, cur = Database("TOB_FULL").connect()

    if ngram:
        ngram_search = ' AND ngram={} '.format(ngram)
    else:
        ngram_search = ''

    # trailing wildcard: prefix search on token;
    # leading wildcard: prefix search on the reversed token column.
    if token[-1] == '*':
        cur.execute('SELECT token FROM tokens where token LIKE "{}%" {} order by total DESC LIMIT {}'.format(
            token[:-1], ngram_search, number_of_tokens + 1))
    else:
        cur.execute('SELECT token FROM tokens WHERE token_reversed LIKE "{}%" {} ORDER BY total DESC LIMIT {};'.format(
            token[1:][::-1], ngram_search, number_of_tokens + 1))

    for row in cur.fetchall():
        if len(wildcard_tokens) == number_of_tokens:
            break
        cur_token = row['token']
        if cur_token == token:
            continue
        else:
            wildcard_tokens.append(row['token'])

    return wildcard_tokens

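# Usage sketch (not part of the original module): a trailing "*" expands a prefix,
# a leading "*" expands a suffix via the token_reversed column. The tokens below
# are made-up examples.
def _example_wildcard_search():
    print(wildcard_search('addict*', ngram=1, number_of_tokens=5))
    print(wildcard_search('*ation', ngram=1, number_of_tokens=5))
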