Exemplo n.º 1
0
    def importCorpusToDb(self,  directory,  dbName):
        # global vars

        directory = os.path.expanduser(directory)

        print >> sys.stderr,  '==============================================================='
        print >> sys.stderr,  'Reading from ' + directory
        print >> sys.stderr,  'Writing to DB ' + dbName
        print >> sys.stderr,  '==============================================================='



        tTable = string.maketrans("",  "")

        l = wn.WordNetLemmatizer()
        timebankToWordnet = {
            'ADJECTIVE': 'a',
            'NOUN': 'n' ,
            'OTHER': 'a',
            'PREPOSITION': 'a',  # would like to use 's' here, but it's not in nltk.corpus.reader.wordnet POS_LIST; just defined as ADJ_SAT constant
            'VERB': 'v'
            }


        # reset database
        tableCreationSql = open('db_header_'+db.engine+'.sql').read()

        if db.engine == 'mysql':
            db.cursor.execute('DROP DATABASE IF EXISTS %s' % dbName)
            db.cursor.execute('CREATE DATABASE %s' % dbName)
        
        if db.engine == 'sqlite':
            db_path = os.path.join(db.prefix,  dbName)
            if os.path.exists(db_path):
                os.unlink(db_path)
        
        db.changeDb(dbName)


        for creationSql in tableCreationSql.split(';'):
            if len(creationSql.strip()) > 0:
                db.cursor.execute(creationSql)

        if db.engine == 'sqlite':
            db.conn.commit()


        # read directory
        fileList = os.listdir(directory)
        fileList.sort()
        self.doc_id = -1

        for fileName in fileList:
            
            self.tags = {}
            self.tagText = {}
            self.inTag = False
            self.wordOffset = 0
            self.sentenceOffset = 0
            self.posInSentence = 0
            self.bodyText = ''
            self.parsedText = ''

            
            if not os.path.isfile(directory+fileName):
                continue;
            
            if fileName[0] == '.':
                #skip hidden files
                continue;
            
            self.doc_id += 1
            db.cursor.execute('INSERT INTO documents(docname, id) VALUES ("' + fileName + '", '+str(self.doc_id)+')')

            print fileName,  'as',  self.doc_id

            # load file
            timeMlFile = open(directory+fileName)
            self.bodyText = timeMlFile.read()
            
            # get encoding
            charset = chardet.detect(self.bodyText)
            print 'Encoding is', charset['encoding'], 'with confidence', charset['confidence']
            if charset['confidence'] < 0.7:
                print 'Confidence too low - using default', self.defaultEncoding
                self.encoding = self.defaultEncoding
            else:
                self.encoding = charset['encoding']
            
            # break into sentences
            self.bodyText = re.sub(r'<[^>]*?>', '', self.bodyText) # strip tags
            timeMlFile.close()
 
            # collapse whitespaces (this is also done before processing by sax parser)
            self.bodyText = self.cleanText(self.bodyText)
            
            # unescape entities
            self.bodyText = unescape(self.bodyText)
            
            sentences = self.sentenceDetector.tokenize(self.bodyText)
            sentences[0] = sentences[0].lstrip()
            for i,  sentence in enumerate(sentences):
                print i, sentence
                db.cursor.execute('INSERT INTO sentences(doc_id, sentenceID, text) VALUES(?, ?, ?)',  (self.doc_id,  i,  sentence.decode(self.encoding)))

            # get minidom data - element attribute cataloguing

            try:
                timemldoc  = minidom.parse(directory+fileName)
            except Exception, e:
                try:
                    text_to_parse = open(directory+fileName).read().replace('\xc3\x27', 'o') # fix o-circumflex
                    timemldoc  = minidom.parseString(text_to_parse)
                except:
                    print 'Failed to parse: ', e
                    raise e

            eventNodes = timemldoc.getElementsByTagName('EVENT')
            makeInstanceNodes = timemldoc.getElementsByTagName('MAKEINSTANCE')
            timexNodes = timemldoc.getElementsByTagName('TIMEX3')
            signalNodes = timemldoc.getElementsByTagName('SIGNAL')
            tlinkNodes = timemldoc.getElementsByTagName('TLINK')
            slinkNodes = timemldoc.getElementsByTagName('SLINK')
            alinkNodes = timemldoc.getElementsByTagName('ALINK')

            eventAttribs = ['eid',  'class']
            makeInstanceAttribs = ['eiid',  'eventID',  'signalID',  'pos',  'tense',  'aspect',  'cardinality',  'polarity',  'modality', 'vform', 'mood', 'pred']
            timexAttribs = ['tid', 'type',  'functionInDocument',  'beginPoint',  'endPoint',  'quant',  'freq',  'temporalFunction',  'value',  'mod',  'anchorTimeID']
            signalAttribs = ['sid']
            tlinkAttribs = ['lid',  'origin',  'signalID',  'relType']
            slinkAttribs = ['lid',  'origin',  'signalID',  'relType',  'eventInstanceID',  'subordinatedEventInstance']
            alinkAttribs = ['lid',  'origin',  'signalID',  'relType',  'eventInstanceID',  'relatedToEventInstance']

            if len(makeInstanceNodes) == 0 and len(eventNodes) > 0:
                # assume that makeinstance info is listed on events; copy events to instances
                print 'EVENTs are present, but there are no MAKEINSTANCE elements; entering EVENT-only mode'
                self.event_only_mode = True
                # duplicate event data into makeinstance data
                makeInstanceNodes = xml.dom.minicompat.NodeList(eventNodes)

            self.insertNodes(eventNodes,  eventAttribs,  'events')
            self.insertNodes(makeInstanceNodes,  makeInstanceAttribs,  'instances')
            self.insertNodes(timexNodes,  timexAttribs,  'timex3s')
            self.insertNodes(signalNodes,  signalAttribs,  'signals')
            self.insertNodes(tlinkNodes,  tlinkAttribs,  'tlinks',  True)
            self.insertNodes(slinkNodes,  slinkAttribs,  'slinks')
            self.insertNodes(alinkNodes,  alinkAttribs,  'alinks')

            if db.engine == 'sqlite' and self.commitEveryDoc:
                db.conn.commit()


            # get position data
            
            file = open(directory+fileName)
            xmlData = file.read()
            file.close()

            parser = xml.parsers.expat.ParserCreate()

            parser.StartElementHandler = self.startElement
            parser.EndElementHandler = self.endElement
            parser.CharacterDataHandler = self.charData


            xmlData = self.cleanText(xmlData) # collapse whitespace
            try:
                parser.Parse(xmlData) # run sax parser - includes offset calculation
            except Exception, e:
                print 'Failed to parse: ', e
                raise e
Exemplo n.º 2
0
    moduleDir = config.get("cavat", "moduledir")
except:
    moduleDir = "modules"


# db connection
dbName = None
try:
    dbName = config.get("cavat", "dbname")
except Exception:
    # No usable dbname setting: leave dbName as None so changeDb is skipped
    # below.  Exception (not a bare except) lets Ctrl-C / SystemExit through.
    pass


db.connect(config)
if dbName:
    db.changeDb(dbName)

# use default history file if nothing else is specified
try:
    historyFile = config.get("cavat", "historyfile")
except Exception:
    historyFile = ".cavat_history"


# readline code for persistent command history
# expanduser("~") gives the same directory as $HOME when HOME is set, but does
# not raise KeyError when it is missing.
histfile = os.path.join(os.path.expanduser("~"), historyFile)

try:
    readline.read_history_file(histfile)
except IOError:
    # history file missing or unreadable: start with an empty history
    pass
Exemplo n.º 3
0
    moduleDir = config.get('cavat',  'moduledir')
except:
    moduleDir = 'modules'


# db connection
dbName = None
try:
    dbName = config.get('cavat',  'dbname')
except Exception:
    # No usable dbname setting: leave dbName as None so changeDb is skipped
    # below.  Exception (not a bare except) lets Ctrl-C / SystemExit through.
    pass


db.connect(config)
if dbName:
    db.changeDb(dbName)

# use default history file if nothing else is specified
try:
    historyFile = config.get('cavat',  'historyfile')
except Exception:
    historyFile = '.cavat_history'


# readline code for persistent command history
# expanduser("~") gives the same directory as $HOME when HOME is set, but does
# not raise KeyError when it is missing.
histfile = os.path.join(os.path.expanduser("~"), historyFile)

try:
    readline.read_history_file(histfile)
except IOError:
    # history file missing or unreadable: start with an empty history
    pass
Exemplo n.º 4
0
    def importCorpusToDb(self, directory, dbName):
        # global vars

        directory = os.path.expanduser(directory)

        print >> sys.stderr, '==============================================================='
        print >> sys.stderr, 'Reading from ' + directory
        print >> sys.stderr, 'Writing to DB ' + dbName
        print >> sys.stderr, '==============================================================='

        tTable = string.maketrans("", "")

        l = wn.WordNetLemmatizer()
        timebankToWordnet = {
            'ADJECTIVE': 'a',
            'NOUN': 'n',
            'OTHER': 'a',
            'PREPOSITION':
            'a',  # would like to use 's' here, but it's not in nltk.corpus.reader.wordnet POS_LIST; just defined as ADJ_SAT constant
            'VERB': 'v'
        }

        # reset database
        tableCreationSql = open('db_header_' + db.engine + '.sql').read()

        if db.engine == 'mysql':
            db.cursor.execute('DROP DATABASE IF EXISTS %s' % dbName)
            db.cursor.execute('CREATE DATABASE %s' % dbName)

        if db.engine == 'sqlite':
            db_path = os.path.join(db.prefix, dbName)
            if os.path.exists(db_path):
                os.unlink(db_path)

        db.changeDb(dbName)

        for creationSql in tableCreationSql.split(';'):
            if len(creationSql.strip()) > 0:
                db.cursor.execute(creationSql)

        if db.engine == 'sqlite':
            db.conn.commit()

        # read directory
        fileList = os.listdir(directory)
        fileList.sort()
        self.doc_id = -1

        for fileName in fileList:

            self.tags = {}
            self.tagText = {}
            self.inTag = False
            self.wordOffset = 0
            self.sentenceOffset = 0
            self.posInSentence = 0
            self.bodyText = ''
            self.parsedText = ''

            if not os.path.isfile(directory + fileName):
                continue

            if fileName[0] == '.':
                #skip hidden files
                continue

            self.doc_id += 1
            db.cursor.execute('INSERT INTO documents(docname, id) VALUES ("' +
                              fileName + '", ' + str(self.doc_id) + ')')

            print fileName, 'as', self.doc_id

            # load file
            timeMlFile = open(directory + fileName)
            self.bodyText = timeMlFile.read()

            # get encoding
            charset = chardet.detect(self.bodyText)
            print 'Encoding is', charset[
                'encoding'], 'with confidence', charset['confidence']
            if charset['confidence'] < 0.7:
                print 'Confidence too low - using default', self.defaultEncoding
                self.encoding = self.defaultEncoding
            else:
                self.encoding = charset['encoding']

            # break into sentences
            self.bodyText = re.sub(r'<[^>]*?>', '',
                                   self.bodyText)  # strip tags
            timeMlFile.close()

            # collapse whitespaces (this is also done before processing by sax parser)
            self.bodyText = self.cleanText(self.bodyText)

            # unescape entities
            self.bodyText = unescape(self.bodyText)

            sentences = self.sentenceDetector.tokenize(self.bodyText)
            sentences[0] = sentences[0].lstrip()
            for i, sentence in enumerate(sentences):
                print i, sentence
                db.cursor.execute(
                    'INSERT INTO sentences(doc_id, sentenceID, text) VALUES(?, ?, ?)',
                    (self.doc_id, i, sentence.decode(self.encoding)))

            # get minidom data - element attribute cataloguing

            try:
                timemldoc = minidom.parse(directory + fileName)
            except Exception, e:
                try:
                    text_to_parse = open(directory + fileName).read().replace(
                        '\xc3\x27', 'o')  # fix o-circumflex
                    timemldoc = minidom.parseString(text_to_parse)
                except:
                    print 'Failed to parse: ', e
                    raise e

            eventNodes = timemldoc.getElementsByTagName('EVENT')
            makeInstanceNodes = timemldoc.getElementsByTagName('MAKEINSTANCE')
            timexNodes = timemldoc.getElementsByTagName('TIMEX3')
            signalNodes = timemldoc.getElementsByTagName('SIGNAL')
            tlinkNodes = timemldoc.getElementsByTagName('TLINK')
            slinkNodes = timemldoc.getElementsByTagName('SLINK')
            alinkNodes = timemldoc.getElementsByTagName('ALINK')

            eventAttribs = ['eid', 'class']
            makeInstanceAttribs = [
                'eiid', 'eventID', 'signalID', 'pos', 'tense', 'aspect',
                'cardinality', 'polarity', 'modality', 'vform', 'mood', 'pred'
            ]
            timexAttribs = [
                'tid', 'type', 'functionInDocument', 'beginPoint', 'endPoint',
                'quant', 'freq', 'temporalFunction', 'value', 'mod',
                'anchorTimeID'
            ]
            signalAttribs = ['sid']
            tlinkAttribs = ['lid', 'origin', 'signalID', 'relType']
            slinkAttribs = [
                'lid', 'origin', 'signalID', 'relType', 'eventInstanceID',
                'subordinatedEventInstance'
            ]
            alinkAttribs = [
                'lid', 'origin', 'signalID', 'relType', 'eventInstanceID',
                'relatedToEventInstance'
            ]

            if len(makeInstanceNodes) == 0 and len(eventNodes) > 0:
                # assume that makeinstance info is listed on events; copy events to instances
                print 'EVENTs are present, but there are no MAKEINSTANCE elements; entering EVENT-only mode'
                self.event_only_mode = True
                # duplicate event data into makeinstance data
                makeInstanceNodes = xml.dom.minicompat.NodeList(eventNodes)

            self.insertNodes(eventNodes, eventAttribs, 'events')
            self.insertNodes(makeInstanceNodes, makeInstanceAttribs,
                             'instances')
            self.insertNodes(timexNodes, timexAttribs, 'timex3s')
            self.insertNodes(signalNodes, signalAttribs, 'signals')
            self.insertNodes(tlinkNodes, tlinkAttribs, 'tlinks', True)
            self.insertNodes(slinkNodes, slinkAttribs, 'slinks')
            self.insertNodes(alinkNodes, alinkAttribs, 'alinks')

            if db.engine == 'sqlite' and self.commitEveryDoc:
                db.conn.commit()

            # get position data

            file = open(directory + fileName)
            xmlData = file.read()
            file.close()

            parser = xml.parsers.expat.ParserCreate()

            parser.StartElementHandler = self.startElement
            parser.EndElementHandler = self.endElement
            parser.CharacterDataHandler = self.charData

            xmlData = self.cleanText(xmlData)  # collapse whitespace
            try:
                parser.Parse(
                    xmlData)  # run sax parser - includes offset calculation
            except Exception, e:
                print 'Failed to parse: ', e
                raise e