def importCorpusToDb(self, directory, dbName): # global vars directory = os.path.expanduser(directory) print >> sys.stderr, '===============================================================' print >> sys.stderr, 'Reading from ' + directory print >> sys.stderr, 'Writing to DB ' + dbName print >> sys.stderr, '===============================================================' tTable = string.maketrans("", "") l = wn.WordNetLemmatizer() timebankToWordnet = { 'ADJECTIVE': 'a', 'NOUN': 'n' , 'OTHER': 'a', 'PREPOSITION': 'a', # would like to use 's' here, but it's not in nltk.corpus.reader.wordnet POS_LIST; just defined as ADJ_SAT constant 'VERB': 'v' } # reset database tableCreationSql = open('db_header_'+db.engine+'.sql').read() if db.engine == 'mysql': db.cursor.execute('DROP DATABASE IF EXISTS %s' % dbName) db.cursor.execute('CREATE DATABASE %s' % dbName) if db.engine == 'sqlite': db_path = os.path.join(db.prefix, dbName) if os.path.exists(db_path): os.unlink(db_path) db.changeDb(dbName) for creationSql in tableCreationSql.split(';'): if len(creationSql.strip()) > 0: db.cursor.execute(creationSql) if db.engine == 'sqlite': db.conn.commit() # read directory fileList = os.listdir(directory) fileList.sort() self.doc_id = -1 for fileName in fileList: self.tags = {} self.tagText = {} self.inTag = False self.wordOffset = 0 self.sentenceOffset = 0 self.posInSentence = 0 self.bodyText = '' self.parsedText = '' if not os.path.isfile(directory+fileName): continue; if fileName[0] == '.': #skip hidden files continue; self.doc_id += 1 db.cursor.execute('INSERT INTO documents(docname, id) VALUES ("' + fileName + '", '+str(self.doc_id)+')') print fileName, 'as', self.doc_id # load file timeMlFile = open(directory+fileName) self.bodyText = timeMlFile.read() # get encoding charset = chardet.detect(self.bodyText) print 'Encoding is', charset['encoding'], 'with confidence', charset['confidence'] if charset['confidence'] < 0.7: print 'Confidence too low - using default', self.defaultEncoding self.encoding = self.defaultEncoding else: self.encoding = charset['encoding'] # break into sentences self.bodyText = re.sub(r'<[^>]*?>', '', self.bodyText) # strip tags timeMlFile.close() # collapse whitespaces (this is also done before processing by sax parser) self.bodyText = self.cleanText(self.bodyText) # unescape entities self.bodyText = unescape(self.bodyText) sentences = self.sentenceDetector.tokenize(self.bodyText) sentences[0] = sentences[0].lstrip() for i, sentence in enumerate(sentences): print i, sentence db.cursor.execute('INSERT INTO sentences(doc_id, sentenceID, text) VALUES(?, ?, ?)', (self.doc_id, i, sentence.decode(self.encoding))) # get minidom data - element attribute cataloguing try: timemldoc = minidom.parse(directory+fileName) except Exception, e: try: text_to_parse = open(directory+fileName).read().replace('\xc3\x27', 'o') # fix o-circumflex timemldoc = minidom.parseString(text_to_parse) except: print 'Failed to parse: ', e raise e eventNodes = timemldoc.getElementsByTagName('EVENT') makeInstanceNodes = timemldoc.getElementsByTagName('MAKEINSTANCE') timexNodes = timemldoc.getElementsByTagName('TIMEX3') signalNodes = timemldoc.getElementsByTagName('SIGNAL') tlinkNodes = timemldoc.getElementsByTagName('TLINK') slinkNodes = timemldoc.getElementsByTagName('SLINK') alinkNodes = timemldoc.getElementsByTagName('ALINK') eventAttribs = ['eid', 'class'] makeInstanceAttribs = ['eiid', 'eventID', 'signalID', 'pos', 'tense', 'aspect', 'cardinality', 'polarity', 'modality', 'vform', 'mood', 'pred'] timexAttribs = ['tid', 'type', 'functionInDocument', 'beginPoint', 'endPoint', 'quant', 'freq', 'temporalFunction', 'value', 'mod', 'anchorTimeID'] signalAttribs = ['sid'] tlinkAttribs = ['lid', 'origin', 'signalID', 'relType'] slinkAttribs = ['lid', 'origin', 'signalID', 'relType', 'eventInstanceID', 'subordinatedEventInstance'] alinkAttribs = ['lid', 'origin', 'signalID', 'relType', 'eventInstanceID', 'relatedToEventInstance'] if len(makeInstanceNodes) == 0 and len(eventNodes) > 0: # assume that makeinstance info is listed on events; copy events to instances print 'EVENTs are present, but there are no MAKEINSTANCE elements; entering EVENT-only mode' self.event_only_mode = True # duplicate event data into makeinstance data makeInstanceNodes = xml.dom.minicompat.NodeList(eventNodes) self.insertNodes(eventNodes, eventAttribs, 'events') self.insertNodes(makeInstanceNodes, makeInstanceAttribs, 'instances') self.insertNodes(timexNodes, timexAttribs, 'timex3s') self.insertNodes(signalNodes, signalAttribs, 'signals') self.insertNodes(tlinkNodes, tlinkAttribs, 'tlinks', True) self.insertNodes(slinkNodes, slinkAttribs, 'slinks') self.insertNodes(alinkNodes, alinkAttribs, 'alinks') if db.engine == 'sqlite' and self.commitEveryDoc: db.conn.commit() # get position data file = open(directory+fileName) xmlData = file.read() file.close() parser = xml.parsers.expat.ParserCreate() parser.StartElementHandler = self.startElement parser.EndElementHandler = self.endElement parser.CharacterDataHandler = self.charData xmlData = self.cleanText(xmlData) # collapse whitespace try: parser.Parse(xmlData) # run sax parser - includes offset calculation except Exception, e: print 'Failed to parse: ', e raise e
moduleDir = config.get("cavat", "moduledir") except: moduleDir = "modules" # db connection dbName = None try: dbName = config.get("cavat", "dbname") except: pass db.connect(config) if dbName: db.changeDb(dbName) # use default history file if nothing else is specified try: historyFile = config.get("cavat", "historyfile") except Exception, e: historyFile = ".cavat_history" # readline code for persistent command history histfile = os.path.join(os.environ["HOME"], historyFile) try: readline.read_history_file(histfile) except IOError: pass
moduleDir = config.get('cavat', 'moduledir') except: moduleDir = 'modules' # db connection dbName = None try: dbName = config.get('cavat', 'dbname') except: pass db.connect(config) if dbName: db.changeDb(dbName) # use default history file if nothing else is specified try: historyFile = config.get('cavat', 'historyfile') except Exception, e: historyFile = '.cavat_history' # readline code for persistent command history histfile = os.path.join(os.environ["HOME"], historyFile) try: readline.read_history_file(histfile) except IOError: pass
def importCorpusToDb(self, directory, dbName): # global vars directory = os.path.expanduser(directory) print >> sys.stderr, '===============================================================' print >> sys.stderr, 'Reading from ' + directory print >> sys.stderr, 'Writing to DB ' + dbName print >> sys.stderr, '===============================================================' tTable = string.maketrans("", "") l = wn.WordNetLemmatizer() timebankToWordnet = { 'ADJECTIVE': 'a', 'NOUN': 'n', 'OTHER': 'a', 'PREPOSITION': 'a', # would like to use 's' here, but it's not in nltk.corpus.reader.wordnet POS_LIST; just defined as ADJ_SAT constant 'VERB': 'v' } # reset database tableCreationSql = open('db_header_' + db.engine + '.sql').read() if db.engine == 'mysql': db.cursor.execute('DROP DATABASE IF EXISTS %s' % dbName) db.cursor.execute('CREATE DATABASE %s' % dbName) if db.engine == 'sqlite': db_path = os.path.join(db.prefix, dbName) if os.path.exists(db_path): os.unlink(db_path) db.changeDb(dbName) for creationSql in tableCreationSql.split(';'): if len(creationSql.strip()) > 0: db.cursor.execute(creationSql) if db.engine == 'sqlite': db.conn.commit() # read directory fileList = os.listdir(directory) fileList.sort() self.doc_id = -1 for fileName in fileList: self.tags = {} self.tagText = {} self.inTag = False self.wordOffset = 0 self.sentenceOffset = 0 self.posInSentence = 0 self.bodyText = '' self.parsedText = '' if not os.path.isfile(directory + fileName): continue if fileName[0] == '.': #skip hidden files continue self.doc_id += 1 db.cursor.execute('INSERT INTO documents(docname, id) VALUES ("' + fileName + '", ' + str(self.doc_id) + ')') print fileName, 'as', self.doc_id # load file timeMlFile = open(directory + fileName) self.bodyText = timeMlFile.read() # get encoding charset = chardet.detect(self.bodyText) print 'Encoding is', charset[ 'encoding'], 'with confidence', charset['confidence'] if charset['confidence'] < 0.7: print 'Confidence too low - using default', self.defaultEncoding self.encoding = self.defaultEncoding else: self.encoding = charset['encoding'] # break into sentences self.bodyText = re.sub(r'<[^>]*?>', '', self.bodyText) # strip tags timeMlFile.close() # collapse whitespaces (this is also done before processing by sax parser) self.bodyText = self.cleanText(self.bodyText) # unescape entities self.bodyText = unescape(self.bodyText) sentences = self.sentenceDetector.tokenize(self.bodyText) sentences[0] = sentences[0].lstrip() for i, sentence in enumerate(sentences): print i, sentence db.cursor.execute( 'INSERT INTO sentences(doc_id, sentenceID, text) VALUES(?, ?, ?)', (self.doc_id, i, sentence.decode(self.encoding))) # get minidom data - element attribute cataloguing try: timemldoc = minidom.parse(directory + fileName) except Exception, e: try: text_to_parse = open(directory + fileName).read().replace( '\xc3\x27', 'o') # fix o-circumflex timemldoc = minidom.parseString(text_to_parse) except: print 'Failed to parse: ', e raise e eventNodes = timemldoc.getElementsByTagName('EVENT') makeInstanceNodes = timemldoc.getElementsByTagName('MAKEINSTANCE') timexNodes = timemldoc.getElementsByTagName('TIMEX3') signalNodes = timemldoc.getElementsByTagName('SIGNAL') tlinkNodes = timemldoc.getElementsByTagName('TLINK') slinkNodes = timemldoc.getElementsByTagName('SLINK') alinkNodes = timemldoc.getElementsByTagName('ALINK') eventAttribs = ['eid', 'class'] makeInstanceAttribs = [ 'eiid', 'eventID', 'signalID', 'pos', 'tense', 'aspect', 'cardinality', 'polarity', 'modality', 'vform', 'mood', 'pred' ] timexAttribs = [ 'tid', 'type', 'functionInDocument', 'beginPoint', 'endPoint', 'quant', 'freq', 'temporalFunction', 'value', 'mod', 'anchorTimeID' ] signalAttribs = ['sid'] tlinkAttribs = ['lid', 'origin', 'signalID', 'relType'] slinkAttribs = [ 'lid', 'origin', 'signalID', 'relType', 'eventInstanceID', 'subordinatedEventInstance' ] alinkAttribs = [ 'lid', 'origin', 'signalID', 'relType', 'eventInstanceID', 'relatedToEventInstance' ] if len(makeInstanceNodes) == 0 and len(eventNodes) > 0: # assume that makeinstance info is listed on events; copy events to instances print 'EVENTs are present, but there are no MAKEINSTANCE elements; entering EVENT-only mode' self.event_only_mode = True # duplicate event data into makeinstance data makeInstanceNodes = xml.dom.minicompat.NodeList(eventNodes) self.insertNodes(eventNodes, eventAttribs, 'events') self.insertNodes(makeInstanceNodes, makeInstanceAttribs, 'instances') self.insertNodes(timexNodes, timexAttribs, 'timex3s') self.insertNodes(signalNodes, signalAttribs, 'signals') self.insertNodes(tlinkNodes, tlinkAttribs, 'tlinks', True) self.insertNodes(slinkNodes, slinkAttribs, 'slinks') self.insertNodes(alinkNodes, alinkAttribs, 'alinks') if db.engine == 'sqlite' and self.commitEveryDoc: db.conn.commit() # get position data file = open(directory + fileName) xmlData = file.read() file.close() parser = xml.parsers.expat.ParserCreate() parser.StartElementHandler = self.startElement parser.EndElementHandler = self.endElement parser.CharacterDataHandler = self.charData xmlData = self.cleanText(xmlData) # collapse whitespace try: parser.Parse( xmlData) # run sax parser - includes offset calculation except Exception, e: print 'Failed to parse: ', e raise e