def update_an(args):
    """Load target records from the Analyst's Notebook CSV into targets_an.

    Reads the CSV file named by ``anfile`` (a name defined outside this
    function), skips its header line and inserts one row per record:
    the target name comes from column 8, the target id from column 0 and
    the first sol from column 16.  The inserts are committed together
    once the whole file has been read.  The cursor and the connection are
    closed even when reading or inserting fails, in which case nothing is
    committed by this function.

    @param args : mapping with the keys 'db' (database name) and 'user'
                  (database user) used to open the connection
    """
    print("Inserting target information from the Analyst's Notebook.")
    # Two spaces after the colon reproduce the output of the statement
    # form ``print "File: ", anfile``; the single-argument call prints the
    # same text under Python 2 and Python 3.
    print("File:  %s" % (anfile,))

    # Connect to the DB
    connection = psycopg2.connect("dbname=%s user=%s" %
                                  (args['db'], args['user']))
    try:
        cursor = connection.cursor()
        try:
            # Read in the CSV and add fields we want
            with open(anfile, 'r') as csvfile:
                csvfile.readline()  # Consume the header
                for row in csv.reader(csvfile):
                    dbutils.insert_into_table(
                        cursor=cursor,
                        table='targets_an',
                        columns=['target_name',
                                 'target_id',
                                 'target_first_sol'],
                        values=[row[8], int(row[0]), int(row[16])])
            connection.commit()
        finally:
            cursor.close()
    finally:
        connection.close()
def insert(self, cursor):
    """Write this annotation as one row of the text_annotations table.

    The row holds the annotated text's id, the start and end positions,
    the annotation label (stored as annotation_type) and the annotator's
    user name.

    @param cursor : database cursor handed to dbutils.insert_into_table
    """
    column_names = [
        'text_id',
        'start_pos',
        'end_pos',
        'annotation_type',
        'annotator',
    ]
    row_values = [
        self.text_id,
        self.start,
        self.end,
        self.label,
        self.username,
    ]
    dbutils.insert_into_table(cursor=cursor,
                              table='text_annotations',
                              columns=column_names,
                              values=row_values)
def insert_all(self, table_name, records):
    '''
    Insert every record produced by an iterable into a single table.

    Each record is written with insert_into_table, using the record's
    keys as column names and its values as the row values.  All inserts
    share one cursor obtained from self.db.

    @param table_name : name of the table to insert into
    @param records : iterable of records exposing keys() and values()
    @return the number of records that were inserted
    '''
    with self.db.cursor() as cursor:
        inserted = 0
        # enumerate from 1 so that, after the loop, the counter equals the
        # number of records seen (it stays 0 for an empty iterable).
        for inserted, record in enumerate(records, 1):
            insert_into_table(cursor=cursor,
                              table=table_name,
                              columns=record.keys(),
                              values=record.values())
        return inserted
def add_forums(cursor, forums, source_id):
    """Insert each forum into the forums table and map external ids.

    A forum's parent is only linked when the parent has already been
    inserted earlier in this call; otherwise the row is written without
    a parent_forum_id.  Parents therefore have to come before their
    children in ``forums`` -- good enough for this example.

    @param cursor : database cursor handed to dbutils.insert_into_table
    @param forums : iterable of mappings with the keys 'name',
                    'forum_id' and 'parent_id'
    @param source_id : value stored in each row's source_id column
    @return dict mapping each external forum id to the value returned by
            dbutils.insert_into_table (presumably the new row's internal
            id -- confirm against dbutils)
    """
    # key=external_id, val=internal_id
    internal_by_external = {}

    for entry in forums:
        forum_name = entry['name']
        ext_id = entry['forum_id']
        parent_ext_id = entry['parent_id']

        # (column, value) pairs for the row being written
        fields = [
            ('source_id', source_id),
            ('name', forum_name),
            ('external_id', ext_id),
        ]
        if parent_ext_id in internal_by_external:
            fields.append(
                ('parent_forum_id', internal_by_external[parent_ext_id]))

        internal_by_external[ext_id] = dbutils.insert_into_table(
            cursor,
            'forums',
            [column for column, _ in fields],
            [value for _, value in fields])

    return internal_by_external
def add_posts(cursor, posts, source_id, forum_ids, user_ids):
    """Insert posts, their texts and a 'first_word' annotation per text.

    For every post the raw text is stored in ``texts``, a 'first_word'
    annotation spanning from position 0 to the first space is stored in
    ``text_annotations``, and the post itself is stored in ``posts``.
    A parent post is only linked when it has already been inserted
    earlier in this call; otherwise the row is written without a
    parent_post_id, so parents have to come before their replies.

    @param cursor : database cursor handed to dbutils.insert_into_table
    @param posts : iterable of mappings with the keys 'post_id',
                   'user_id', 'text', 'timestamp', 'forum_id' and
                   'parent_post_id'
    @param source_id : value stored in each post's source_id column
    @param forum_ids : dict mapping external forum ids to internal ids
    @param user_ids : dict mapping external user ids to internal ids
    @return dict mapping each external post id to the value returned by
            dbutils.insert_into_table for the post row (presumably the
            new row's internal id -- confirm against dbutils)
    @raise KeyError : if a post refers to a user or forum that is missing
                      from user_ids / forum_ids; the text and annotation
                      rows for that post have already been inserted by
                      then
    """
    # key=external_id, val=internal_id
    post_ids = {}

    for post in posts:
        external_post_id = post['post_id']
        external_user_id = post['user_id']
        text = post['text']
        timestamp = post['timestamp']
        external_forum_id = post['forum_id']
        external_parent_post_id = post['parent_post_id']

        # first deal with texts, so that we can add the text_id field to
        # each post
        text_id = dbutils.insert_into_table(
            cursor,
            'texts',
            ['raw_text'],
            [text])

        # add a "first word" annotation.  str.find returns -1 when the
        # text contains no space; the first word is then the whole text,
        # so the annotation ends at len(text) instead of -1.
        first_word_end = text.find(' ')
        if first_word_end == -1:
            first_word_end = len(text)
        dbutils.insert_into_table(
            cursor,
            'text_annotations',
            ['text_id', 'start_pos', 'end_pos', 'annotation_type'],
            [text_id, 0, first_word_end, 'first_word'])

        cols = ['user_id', 'text_id', 'source_id', 'created_at', 'forum_id']
        vals = [user_ids[external_user_id],
                text_id,
                source_id,
                timestamp,
                forum_ids[external_forum_id]]

        if external_parent_post_id in post_ids:
            cols.append('parent_post_id')
            vals.append(post_ids[external_parent_post_id])

        post_ids[external_post_id] = dbutils.insert_into_table(
            cursor,
            'posts',
            cols,
            vals)

    return post_ids
def add_sources(cursor, sources):
    """Insert the first source's name into the sources table.

    Only ``sources[0]`` is used; any further entries are ignored.

    @param cursor : database cursor handed to dbutils.insert_into_table
    @param sources : sequence whose first element is a mapping with the
                     key 'name'
    @return whatever dbutils.insert_into_table returns for the new row
    """
    first_source = sources[0]
    column_names = ['name']
    row_values = [first_source['name']]
    return dbutils.insert_into_table(
        cursor, 'sources', column_names, row_values)
def update_mmgis(args):
    """Load target locations from the MMGIS target CSV into targets_mmgis.

    Reads the CSV file named by ``mmgisfile`` (a name defined outside this
    function), skips its header line and inserts one row per record.  The
    target name (column 2) is put into canonical form -- words capitalized
    and joined by underscores -- and stored together with the latitude
    (column 7) and longitude (column 8).  The inserts are committed
    together once the whole file has been read.  The cursor and the
    connection are closed even when reading or inserting fails, in which
    case nothing is committed by this function.

    @param args : mapping with the keys 'db' (database name) and 'user'
                  (database user) used to open the connection
    """
    print("Inserting target information from the MMGIS target DB.")
    # Two spaces after the colon reproduce the output of the statement
    # form ``print "File: ", mmgisfile``; the single-argument call prints
    # the same text under Python 2 and Python 3.
    print("File:  %s" % (mmgisfile,))

    # Connect to the DB
    connection = psycopg2.connect("dbname=%s user=%s" %
                                  (args['db'], args['user']))
    try:
        cursor = connection.cursor()
        try:
            # Read in the CSV and add fields we want
            with open(mmgisfile, 'r') as csvfile:
                csvfile.readline()  # Consume the header
                for row in csv.reader(csvfile):
                    # Convert target name into canonical form:
                    # underscores -> spaces, capitalize each word,
                    # then spaces -> underscores.
                    words = row[2].replace('_', ' ')
                    target_name = string.capwords(words).replace(' ', '_')

                    dbutils.insert_into_table(
                        cursor=cursor,
                        table='targets_mmgis',
                        columns=['target_name',
                                 'target_latitude',
                                 'target_longitude'],
                        values=[target_name,
                                float(row[7]),
                                float(row[8])])
            connection.commit()
        finally:
            cursor.close()
    finally:
        connection.close()
def add_users(cursor, users, source_id):
    """Insert users plus their per-source identity and timestamps.

    For every user one row goes into ``users`` (the name), one into
    ``user_source`` (linking the source, the new user id and the external
    id as username) and two into ``user_info`` (the registration and the
    last-seen timestamps, each stored as a label/value pair).

    @param cursor : database cursor handed to dbutils.insert_into_table
    @param users : iterable of mappings with the keys 'user_id', 'name',
                   'registered_timestamp' and 'last_seen_timestamp'
    @param source_id : value stored in the source_id column of the
                       user_source and user_info rows
    @return dict mapping each external user id to the value returned by
            dbutils.insert_into_table for the users row (presumably the
            new row's internal id -- confirm against dbutils)
    """
    # key=external_id, val=internal_id
    internal_by_external = {}

    for entry in users:
        ext_id = entry['user_id']
        display_name = entry['name']
        # (label, value) pairs stored in user_info, in insertion order
        info_items = (
            ('registered_timestamp', entry['registered_timestamp']),
            ('last_seen_timestamp', entry['last_seen_timestamp']),
        )

        internal_id = dbutils.insert_into_table(
            cursor, 'users', ['name'], [display_name])
        internal_by_external[ext_id] = internal_id

        dbutils.insert_into_table(
            cursor,
            'user_source',
            ['source_id', 'user_id', 'username'],
            [source_id, internal_id, ext_id])

        for info_label, info_value in info_items:
            dbutils.insert_into_table(
                cursor,
                'user_info',
                ['source_id', 'user_id', 'label', 'value'],
                [source_id, internal_id, info_label, info_value])

    return internal_by_external
def insert(self, cursor):
    """Insert this annotation into the table that matches its type.

    'anchor' annotations are written to the ``anchors`` table.  When the
    label has an entry in ``lookups``, the canonical name is first looked
    up in the table named there and added if it is absent.  'event'
    annotations labelled 'Contains' add one ``contains`` row per
    (target, constituent) pair, together with an excerpt of the sentence
    surrounding the constituent.  'DoesNotContain' and 'StratRel' events
    and 'relation' and 'attribute' annotations are not handled yet and
    are ignored.

    Values are handed to ``cursor.execute`` as query parameters instead
    of being pasted into the SQL text, so names or ids containing quote
    characters cannot break (or alter) the statements.
    NOTE(review): this relies on the driver accepting ``%s`` placeholders
    (as psycopg2 does) -- confirm for the cursor used here.

    @param cursor : open database cursor used for all queries and inserts
    @raise RuntimeError : if self.type is none of 'anchor', 'event',
                          'relation' or 'attribute'
    """
    # Insert into the appropriate table depending on the annotation type

    # Basic annotations (type T in .ann files) are inserted into
    # the anchors table, which tracks all text mentions (and spans).
    if self.type == 'anchor':
        canonical = ''
        # Targets and components:
        # Look up canonical entry, or add if needed.
        if self.label in lookups:
            # First remove any hyphenation (due to poor parsing)
            s = self.name.replace('- ', '')
            if self.label == 'Target':
                # Canonical name in mixed case with underscores
                # between words
                s = s.replace('_', ' ')
                s = string.capwords(s)
                s = s.replace(' ', '_')
                canonical = s
            elif self.label == 'Element' or self.label == 'Mineral':
                # Capitalize
                s = string.capwords(s)
                canonical = s

            (tabname, colname) = lookups[self.label]
            # Table and column names come from the lookups table and are
            # formatted into the statement; the value is a parameter
            # (%%s leaves a literal %s placeholder for the driver).
            cursor.execute("SELECT %s FROM %s WHERE %s=%%s;"
                           % (colname, tabname, colname),
                           (canonical,))
            name = cursor.fetchone()
            if name is None:
                # Add the label for components
                if tabname == 'components':
                    dbutils.insert_into_table(
                        cursor=cursor,
                        table=tabname,
                        columns=[colname, 'component_label'],
                        values=[canonical, self.label])
                else:
                    # Just add the name
                    dbutils.insert_into_table(
                        cursor=cursor,
                        table=tabname,
                        columns=[colname],
                        values=[canonical])

        # Add this anchor.  Text other than targets and components
        # will not have a canonical entry.  That's okay.
        dbutils.insert_into_table(
            cursor=cursor,
            table='anchors',
            columns=[
                'anchor_id', 'label', 'canonical', 'text', 'span_start',
                'span_end'
            ],
            values=[
                self.doc_id + '_' + self.annotation_id, self.label,
                canonical, self.name, self.start, self.end
            ])

    # Events
    elif self.type == 'event':
        if self.label == 'Contains':
            # Loop over all targets
            for t in self.targets:
                # Loop over all constituents
                for v in self.cont:
                    # Extract the excerpt
                    cursor.execute("SELECT content "
                                   "FROM documents "
                                   "WHERE doc_id=%s;",
                                   (self.doc_id,))
                    content = cursor.fetchone()
                    if content is None:
                        print('Warning: document %s not found, skipping.'
                              % self.doc_id)
                        # Leaves only the constituent loop; the next
                        # target is still attempted.
                        break

                    # Compute the likely start and end of the sentence
                    # surrounding the component
                    content = content[0]
                    cursor.execute("SELECT span_start, span_end "
                                   "FROM anchors "
                                   "WHERE anchor_id=%s;",
                                   (self.doc_id + '_' + v,))
                    (anchor_start, anchor_end) = cursor.fetchone()

                    # Start: first capital letter after last period
                    # before last capital letter!
                    sent_start = 0
                    # Last preceding capital
                    capitals = list(
                        re.finditer('[A-Z]', content[:anchor_start]))
                    if capitals:
                        sent_start = capitals[-1].start()
                    # Last preceding period (0 when there is none)
                    sent_start = max(content[:sent_start].rfind('.'), 0)
                    # Next capital
                    next_capital = re.search('[A-Z]',
                                             content[sent_start:])
                    if next_capital:
                        sent_start = sent_start + next_capital.start()

                    # End: next period followed by {space,newline},
                    # or end of document.  find() returns -1 when there
                    # is no match, which makes the sum equal anchor_end
                    # and triggers the next fallback.
                    sent_end = (anchor_end +
                                content[anchor_end:].find('. ') + 1)
                    if sent_end <= anchor_end:
                        sent_end = (anchor_end +
                                    content[anchor_end:].find('.\n') + 1)
                    if sent_end <= anchor_end:
                        sent_end = len(content)
                    excerpt = content[sent_start:sent_end]

                    # Get the canonical forms of the target and component
                    cursor.execute("SELECT canonical "
                                   "FROM anchors "
                                   "WHERE anchor_id=%s;",
                                   (self.doc_id + '_' + t,))
                    canonical_t = cursor.fetchone()[0]
                    cursor.execute("SELECT canonical "
                                   "FROM anchors "
                                   "WHERE anchor_id=%s;",
                                   (self.doc_id + '_' + v,))
                    canonical_v = cursor.fetchone()[0]

                    # Insert into table
                    dbutils.insert_into_table(
                        cursor=cursor,
                        table='contains',
                        columns=[
                            'event_id', 'doc_id', 'anchor_id',
                            'target_name', 'component_name', 'magnitude',
                            'confidence', 'annotator', 'excerpt'
                        ],
                        values=[
                            self.doc_id + '_' + self.annotation_id,
                            self.doc_id, self.doc_id + '_' + self.anchor,
                            canonical_t, canonical_v, 'unknown',
                            'neutral', self.username, excerpt
                        ])
        elif (self.label == 'DoesNotContain'
              or self.label == 'StratRel'):
            # Not yet handled
            pass
    elif self.type == 'relation':
        # Not yet handled
        pass
    elif self.type == 'attribute':
        # Not yet handled
        pass
    else:
        # The check is on the type, but the message reports the label;
        # the message text is kept unchanged.
        raise RuntimeError('Unknown label %s' % self.label)