Exemplo n.º 1
0
def update_an(args):
    """Load target rows from the Analyst's Notebook CSV into ``targets_an``.

    Reads the CSV file named by the module-level ``anfile``, skips its
    header line, and inserts one row per record, taking the target name
    from column 8, the target id from column 0 and the first sol from
    column 16.  All inserts are committed together once the whole file
    has been read; nothing is committed if any row fails.

    Args:
        args: Mapping providing ``'db'`` (database name) and ``'user'``
            (database user), used to build the connection string.

    Raises:
        ValueError: If column 0 or 16 of a row is not an integer.
        IndexError: If a row has fewer than 17 columns.
    """
    # A parenthesized single-argument print behaves the same as a
    # Python 2 print statement and as the Python 3 print function.
    print("Inserting target information from the Analyst's Notebook.")
    print("File:  %s" % (anfile,))

    # Connect to the DB
    connection = psycopg2.connect("dbname=%s user=%s" % (args['db'], args['user']))
    try:
        cursor = connection.cursor()
        try:
            # Read in the CSV and add fields we want
            with open(anfile, 'r') as csvfile:
                csvfile.readline()  # Consume the header
                anreader = csv.reader(csvfile)
                for row in anreader:
                    dbutils.insert_into_table(cursor=cursor,
                                              table='targets_an',
                                              columns=['target_name',
                                                       'target_id',
                                                       'target_first_sol'],
                                              values=[row[8],
                                                      int(row[0]),
                                                      int(row[16])])

                connection.commit()
        finally:
            # Release the cursor even when reading or inserting failed.
            cursor.close()
    finally:
        # Always close the connection so a failure cannot leak it.
        connection.close()
Exemplo n.º 2
0
 def insert(self, cursor):
     """Write this annotation as one row of the ``text_annotations`` table.

     The row is built from the instance's ``text_id``, ``start``, ``end``,
     ``label`` and ``username`` attributes.

     @param cursor : database cursor handed to ``dbutils.insert_into_table``
     """
     # Column names and their values are kept in matching order.
     field_names = [
         'text_id', 'start_pos', 'end_pos', 'annotation_type', 'annotator'
     ]
     field_values = [
         self.text_id, self.start, self.end, self.label, self.username
     ]
     dbutils.insert_into_table(cursor=cursor,
                               table='text_annotations',
                               columns=field_names,
                               values=field_values)
Exemplo n.º 3
0
 def insert_all(self, table_name, records):
     '''
     Insert every record yielded by an iterator into a single table.
     Each record supplies its column names through keys() and the
     matching values through values().
     @param table_name : name of the table that receives the rows
     @param records : iterator of documents
     @return : number of records that were inserted
     '''
     with self.db.cursor() as cursor:
         inserted = 0
         # enumerate(..., 1) keeps a running count of the records seen.
         for inserted, document in enumerate(records, 1):
             insert_into_table(cursor=cursor,
                               table=table_name,
                               columns=document.keys(),
                               values=document.values())
         return inserted
Exemplo n.º 4
0
def add_forums(cursor, forums, source_id):
    """Insert each forum into the ``forums`` table and map its ids.

    Forums are processed in the order given.  A forum's
    ``parent_forum_id`` column is written only when its parent has
    already been inserted earlier in the same call; otherwise the
    column is simply left out of the insert.

    Args:
        cursor: Database cursor handed to ``dbutils.insert_into_table``.
        forums: Iterable of mappings with the keys ``'name'``,
            ``'forum_id'`` and ``'parent_id'``.
        source_id: Value stored in the ``source_id`` column of every row.

    Returns:
        Dict mapping each external forum id to whatever
        ``dbutils.insert_into_table`` returned for that forum.
    """
    # key=external_id, val=internal_id
    internal_by_external = {}
    for entry in forums:
        forum_name = entry['name']
        ext_id = entry['forum_id']
        ext_parent_id = entry['parent_id']

        # Collect (column, value) pairs, then split them into two lists.
        pairs = [('source_id', source_id),
                 ('name', forum_name),
                 ('external_id', ext_id)]
        if ext_parent_id in internal_by_external:
            pairs.append(('parent_forum_id',
                          internal_by_external[ext_parent_id]))

        columns = [column for column, _ in pairs]
        values = [value for _, value in pairs]
        internal_by_external[ext_id] = dbutils.insert_into_table(
            cursor, 'forums', columns, values)

    return internal_by_external
Exemplo n.º 5
0
def add_posts(cursor, posts, source_id, forum_ids, user_ids):
    """Insert posts, their texts and a "first word" annotation per text.

    For every post, in the order given, this inserts the raw text into
    ``texts``, a ``first_word`` row into ``text_annotations`` and the
    post itself into ``posts``.  The ``parent_post_id`` column is
    written only when the parent post was already inserted earlier in
    the same call; otherwise it is left out of the insert.

    Args:
        cursor: Database cursor handed to ``dbutils.insert_into_table``.
        posts: Iterable of mappings with the keys ``'post_id'``,
            ``'user_id'``, ``'text'``, ``'timestamp'``, ``'forum_id'``
            and ``'parent_post_id'``.
        source_id: Value stored in the ``source_id`` column of each post.
        forum_ids: Mapping from external forum id to internal forum id.
        user_ids: Mapping from external user id to internal user id.

    Returns:
        Dict mapping each external post id to whatever
        ``dbutils.insert_into_table`` returned for that post.

    Raises:
        KeyError: If a post refers to a user or forum id that is missing
            from ``user_ids`` / ``forum_ids``.
    """
    # key=external_id, val=internal_id
    post_ids = {}
    for post in posts:
        external_post_id = post['post_id']
        external_user_id = post['user_id']
        text = post['text']
        timestamp = post['timestamp']
        external_forum_id = post['forum_id']
        external_parent_post_id = post['parent_post_id']

        # first deal with texts, so that we can add the text_id field to each
        # post
        text_id = dbutils.insert_into_table(
                cursor,
                'texts',
                ['raw_text'],
                [text])

        # add a "first word" annotation.  str.find returns -1 when the
        # text has no space; in that case the first word is the whole text.
        first_space = text.find(' ')
        first_word_end = first_space if first_space != -1 else len(text)
        dbutils.insert_into_table(
                cursor,
                'text_annotations',
                ['text_id', 'start_pos', 'end_pos', 'annotation_type'],
                [text_id, 0, first_word_end, 'first_word'])

        cols = ['user_id', 'text_id', 'source_id', 'created_at', 'forum_id']
        vals = [user_ids[external_user_id], text_id, source_id, timestamp,
                forum_ids[external_forum_id]]

        if external_parent_post_id in post_ids:
            cols.append('parent_post_id')
            vals.append(post_ids[external_parent_post_id])

        post_id = dbutils.insert_into_table(
                cursor,
                'posts',
                cols,
                vals)
        post_ids[external_post_id] = post_id

    return post_ids
Exemplo n.º 6
0
def add_sources(cursor, sources):
    """Insert the name of the first source into the ``sources`` table.

    Only ``sources[0]`` is used; any further entries are ignored.

    Args:
        cursor: Database cursor handed to ``dbutils.insert_into_table``.
        sources: Sequence of mappings; the first one must have a
            ``'name'`` key.

    Returns:
        Whatever ``dbutils.insert_into_table`` returns for the new row.

    Raises:
        IndexError: If ``sources`` is empty.
    """
    first_name = sources[0]['name']
    columns = ['name']
    values = [first_name]
    return dbutils.insert_into_table(cursor, 'sources', columns, values)
Exemplo n.º 7
0
def update_mmgis(args):
    """Load target rows from the MMGIS target CSV into ``targets_mmgis``.

    Reads the CSV file named by the module-level ``mmgisfile``, skips
    its header line, and inserts one row per record.  The target name
    (column 2) is converted to its canonical form, and the latitude and
    longitude are taken from columns 7 and 8.  All inserts are committed
    together once the whole file has been read; nothing is committed if
    any row fails.

    Args:
        args: Mapping providing ``'db'`` (database name) and ``'user'``
            (database user), used to build the connection string.

    Raises:
        ValueError: If column 7 or 8 of a row is not a number.
        IndexError: If a row has fewer than 9 columns.
    """
    # A parenthesized single-argument print behaves the same as a
    # Python 2 print statement and as the Python 3 print function.
    print("Inserting target information from the MMGIS target DB.")
    print("File:  %s" % (mmgisfile,))

    # Connect to the DB
    connection = psycopg2.connect("dbname=%s user=%s" % (args['db'], args['user']))
    try:
        cursor = connection.cursor()
        try:
            # Read in the CSV and add fields we want
            with open(mmgisfile, 'r') as csvfile:
                csvfile.readline()  # Consume the header
                mmgisreader = csv.reader(csvfile)
                for row in mmgisreader:

                    # Convert target name into canonical form: mixed case
                    # with underscores between words.
                    spaced = row[2].replace('_', ' ')
                    target_name = string.capwords(spaced).replace(' ', '_')

                    dbutils.insert_into_table(cursor=cursor,
                                              table='targets_mmgis',
                                              columns=['target_name',
                                                       'target_latitude',
                                                       'target_longitude'],
                                              values=[target_name,
                                                      float(row[7]),
                                                      float(row[8])])

                connection.commit()
        finally:
            # Release the cursor even when reading or inserting failed.
            cursor.close()
    finally:
        # Always close the connection so a failure cannot leak it.
        connection.close()
Exemplo n.º 8
0
def add_users(cursor, users, source_id):
    """Insert users plus their per-source records and map their ids.

    For every user this inserts, in order: one ``users`` row, one
    ``user_source`` row tying the user to ``source_id`` under its
    external id, and two ``user_info`` rows holding the registration
    and last-seen timestamps.

    Args:
        cursor: Database cursor handed to ``dbutils.insert_into_table``.
        users: Iterable of mappings with the keys ``'user_id'``,
            ``'name'``, ``'registered_timestamp'`` and
            ``'last_seen_timestamp'``.
        source_id: Value stored in the ``source_id`` column of the
            ``user_source`` and ``user_info`` rows.

    Returns:
        Dict mapping each external user id to whatever
        ``dbutils.insert_into_table`` returned for the ``users`` row.
    """
    # key=external_id, val=internal_id
    internal_by_external = {}
    info_columns = ['source_id', 'user_id', 'label', 'value']

    for record in users:
        # Read every field up front, before any row is written.
        ext_id = record['user_id']
        display_name = record['name']
        registered_at = record['registered_timestamp']
        last_seen_at = record['last_seen_timestamp']

        internal_id = dbutils.insert_into_table(
            cursor, 'users', ['name'], [display_name])
        internal_by_external[ext_id] = internal_id

        dbutils.insert_into_table(
            cursor,
            'user_source',
            ['source_id', 'user_id', 'username'],
            [source_id, internal_id, ext_id])

        # One user_info row per labelled timestamp.
        for info_label, info_value in (
                ('registered_timestamp', registered_at),
                ('last_seen_timestamp', last_seen_at)):
            dbutils.insert_into_table(
                cursor,
                'user_info',
                list(info_columns),
                [source_id, internal_id, info_label, info_value])

    return internal_by_external
Exemplo n.º 9
0
    def insert(self, cursor):
        """Insert this annotation into the table that matches its type.

        * ``anchor``: if the label has an entry in the module-level
          ``lookups`` mapping, the canonical name is computed and added
          to the lookup table when it is not there yet; the anchor
          itself is then inserted into ``anchors``.
        * ``event`` with label ``Contains``: for every (target,
          constituent) pair, an excerpt approximating the sentence
          around the constituent is cut from the document text and a
          row is inserted into ``contains``.
        * ``event`` with another label, ``relation`` and ``attribute``:
          not handled yet; nothing is inserted.

        All values are passed to the database as bound query parameters
        rather than being formatted into the SQL text, so names that
        contain quotes cannot break or alter a query.

        NOTE(review): this assumes ``cursor`` accepts ``%s``-style
        placeholders (as psycopg2 cursors do) -- confirm for other
        drivers.

        Args:
            cursor: Database cursor used for the lookups and inserts.

        Raises:
            RuntimeError: If ``self.type`` is not one of the types above.
            TypeError: If an anchor referenced by a ``Contains`` event
                is not found in ``anchors`` (``fetchone()`` gives None).
        """
        # Insert into the appropriate table depending on the annotation type

        # Basic annotations (type T in .ann files) are inserted into
        # the anchors table, which tracks all text mentions (and spans).
        if self.type == 'anchor':
            canonical = ''

            # Targets and components:
            # Look up canonical entry, or add if needed.
            if self.label in lookups:
                # First remove any hyphenation (due to poor parsing)
                s = self.name.replace('- ', '')
                if self.label == 'Target':
                    # Canonical name in mixed case with underscores between words
                    s = s.replace('_', ' ')
                    s = string.capwords(s)
                    s = s.replace(' ', '_')
                elif self.label == 'Element' or self.label == 'Mineral':
                    # Capitalize
                    s = string.capwords(s)
                canonical = s

                (tabname, colname) = lookups[self.label]
                # Table and column names cannot be bound as parameters,
                # so they are formatted in from the lookups mapping; the
                # value being searched for is bound by the driver.
                cursor.execute(
                    "SELECT %s FROM %s WHERE %s=%%s;" %
                    (colname, tabname, colname),
                    (canonical,))
                name = cursor.fetchone()
                if name is None:
                    # Add the label for components
                    if tabname == 'components':
                        dbutils.insert_into_table(
                            cursor=cursor,
                            table=tabname,
                            columns=[colname, 'component_label'],
                            values=[canonical, self.label])
                    else:  # Just add the name
                        dbutils.insert_into_table(cursor=cursor,
                                                  table=tabname,
                                                  columns=[colname],
                                                  values=[canonical])

            # Add this anchor.  Text other than targets and components
            # will not have a canonical entry.  That's okay.
            dbutils.insert_into_table(
                cursor=cursor,
                table='anchors',
                columns=[
                    'anchor_id', 'label', 'canonical', 'text', 'span_start',
                    'span_end'
                ],
                values=[
                    self.doc_id + '_' + self.annotation_id, self.label,
                    canonical, self.name, self.start, self.end
                ])

        # Events relate previously inserted anchors to each other.
        elif self.type == 'event':
            if self.label == 'Contains':
                # Loop over all targets
                for t in self.targets:
                    # Loop over all constituents
                    for v in self.cont:
                        # Extract the excerpt
                        cursor.execute(
                            "SELECT content FROM documents WHERE doc_id=%s;",
                            (self.doc_id,))
                        content = cursor.fetchone()
                        if content is None:
                            # Parenthesized single-argument print works as
                            # a Python 2 statement and a Python 3 call.
                            print('Warning: document %s not found, skipping.' %
                                  self.doc_id)
                            # Leaves only the constituent loop; the next
                            # target is still attempted.
                            break

                        # Compute the likely start and end of the sentence
                        # surrounding the component
                        content = content[0]
                        cursor.execute(
                            "SELECT span_start, span_end, text "
                            "FROM anchors WHERE anchor_id=%s;",
                            (self.doc_id + '_' + v,))
                        (anchor_start, anchor_end, _) = cursor.fetchone()

                        # Start: first capital letter after last period before last capital letter!
                        sent_start = 0
                        # Last preceding capital
                        capitals = list(
                            re.finditer('[A-Z]', content[:anchor_start]))
                        if capitals:
                            sent_start = capitals[-1].start()
                        # Last preceding period (0 when there is none)
                        sent_start = max(content[:sent_start].rfind('.'), 0)
                        # Next capital
                        m = re.search('[A-Z]', content[sent_start:])
                        if m:
                            sent_start = sent_start + m.start()
                        # End: next period followed by {space,newline}, or end of document.
                        # find() gives -1 on a miss, which leaves sent_end
                        # equal to anchor_end and triggers the next fallback.
                        sent_end = anchor_end + content[anchor_end:].find(
                            '. ') + 1
                        if sent_end <= anchor_end:
                            sent_end = anchor_end + content[anchor_end:].find(
                                '.\n') + 1
                        if sent_end <= anchor_end:
                            sent_end = len(content)
                        excerpt = content[sent_start:sent_end]

                        # Get the canonical forms of the target and component
                        cursor.execute(
                            "SELECT canonical FROM anchors WHERE anchor_id=%s;",
                            (self.doc_id + '_' + t,))
                        canonical_t = cursor.fetchone()[0]
                        cursor.execute(
                            "SELECT canonical FROM anchors WHERE anchor_id=%s;",
                            (self.doc_id + '_' + v,))
                        canonical_v = cursor.fetchone()[0]

                        # Insert into table
                        dbutils.insert_into_table(
                            cursor=cursor,
                            table='contains',
                            columns=[
                                'event_id', 'doc_id', 'anchor_id',
                                'target_name', 'component_name', 'magnitude',
                                'confidence', 'annotator', 'excerpt'
                            ],
                            values=[
                                self.doc_id + '_' + self.annotation_id,
                                self.doc_id, self.doc_id + '_' + self.anchor,
                                canonical_t, canonical_v, 'unknown', 'neutral',
                                self.username, excerpt
                            ])
            elif (self.label == 'DoesNotContain' or self.label == 'StratRel'):
                # Not yet handled
                pass

        elif self.type == 'relation':
            # Not yet handled
            pass
        elif self.type == 'attribute':
            # Not yet handled
            pass
        else:
            # NOTE(review): reached for an unrecognized self.type, although
            # the message reports self.label.
            raise RuntimeError('Unknown label %s' % self.label)