def create_table(self):
   """Create the frequency, sentence and link tables and their indices.

   Reads self.freq_table, self.sentence_table and self.link_table for the
   table names and the first self.fields entries of self.fieldnames for
   the extra TEXT columns of the frequency table. Every statement uses
   IF NOT EXISTS, so existing tables and indices are left in place.
   The statements run on cursor self.c and are committed on self.conn.
   """
   field_columns = [self.fieldnames[i] for i in range(self.fields)]
   # create freq table: one TEXT column per field
   columns = [u'wid INTEGER PRIMARY KEY', u'freq INTEGER']
   columns.extend(name + u' TEXT' for name in field_columns)
   if field_columns:
     # an empty "UNIQUE ()" clause is a syntax error, so the constraint is
     # only added when there is at least one field column
     columns.append(u'UNIQUE (' + u', '.join(field_columns) + u')')
   sql = u'CREATE TABLE IF NOT EXISTS %s (%s)' % (self.freq_table, u', '.join(columns))
   self.c.execute(sql)
   # create sentence table
   sql = (u'CREATE TABLE IF NOT EXISTS %s ('
          u'sid INTEGER PRIMARY KEY, sentence TEXT, len INTEGER)'
          % self.sentence_table)
   self.c.execute(sql)
   # create link table (connects word ids to sentence ids)
   sql = (u'CREATE TABLE IF NOT EXISTS %s ('
          u'wid INTEGER, sid INTEGER, '
          u'FOREIGN KEY(wid) REFERENCES %s(wid), '
          u'FOREIGN KEY(sid) REFERENCES %s(sid))'
          % (self.link_table, self.freq_table, self.sentence_table))
   self.c.execute(sql)
   # create indices for faster lookup; driving this from one list keeps
   # each statement paired with its own execute call
   indices = (
     ('freq_index', self.freq_table, 'freq DESC'),
     ('len_index', self.sentence_table, 'len ASC'),
     ('link_wid_index', self.link_table, 'wid ASC'),
   )
   for index_name, table, ordering in indices:
     sql = 'CREATE INDEX IF NOT EXISTS %s ON %s (%s)' % (index_name, table, ordering)
     self.c.execute(sql)
   self.conn.commit()
   logger.out('created database tables')
 def __enter__(self):
   """Connect to the SQLite file self.filename and set up the cursors.

   Opens the connection, creates three cursors on it, calls
   self.prepare_queries() and returns the instance, so that both
   ``with obj:`` and ``with obj as db:`` work.
   """
   logger.out('connecting to database')
   self.conn = sqlite3.connect(self.filename)
   self.c = self.conn.cursor() # Cursor for word frequency queries
   self.c2 = self.conn.cursor() # Cursor for sentence queries
   self.c3 = self.conn.cursor() # Cursor for option selections
   self.prepare_queries()
   # the with-statement binds this return value to the "as" target
   return self
 def __exit__(self, typ, value, traceback):
   """Close the three cursors, commit, and close the connection.

   The exception arguments are ignored and nothing is returned, so an
   exception raised inside the with-block still propagates to the caller.
   """
   # close the cursors in the order c, c2, c3
   for cursor_attr in ('c', 'c2', 'c3'):
     getattr(self, cursor_attr).close()
   # flush pending changes before releasing the connection
   self.conn.commit()
   self.conn.close()
   logger.out('disconnected from database')
def analyze(filename, formatter, parser, encoding, db):
  """Run one text file, line by line, through the formatter and the parser.

  filename  -- path of the file to read
  formatter -- object providing new_file() and trim(line)
  parser    -- object providing parse(text, db)
  encoding  -- codec name used to decode the file
  db        -- handed unchanged to parser.parse

  A file that cannot be opened or decoded is reported through logger.err
  and abandoned, so one bad file does not abort a multi-file run.
  """
  logger.out('reading %s' % filename)
  formatter.new_file()
  try:
    fp = codecs.open(filename, 'r', encoding)
  except IOError as e:
    logger.err('error opening %s: %s' % (filename, e))
    return
  with fp:
    try:
      # process the file line by line
      for line in fp:
        parser.parse(formatter.trim(line), db)
    except UnicodeDecodeError as e:
      # raised while iterating when the bytes do not match `encoding`
      # (e.g. a binary file met during a recursive walk); lines parsed
      # before the failure have already been handed to the parser
      logger.err('error decoding %s: %s' % (filename, e))
def main():
  """Parse the command line and show the frequency GUI for a database.

  Options:
    -h, --help         print the module docstring and exit with status 0
    -n, --number N     positive integer handed to gui.FreqGUI
                       (default: config.list_number)
    -t, --tablename T  table name, letters/digits/underscore only and not
                       starting with a digit (default: config.tablename)

  Invalid options or option values are logged and exit with status 2.
  sqlite3 errors raised while the GUI is running are logged.
  """
  # parse command line options
  try:
    opts, args = getopt.getopt(sys.argv[1:], 'hn:t:', ['help','number=','tablename='])
  except getopt.error as opterr:
    logger.err(opterr)
    logger.err('for help use --help')
    sys.exit(2)
  # process config and options
  list_number = config.list_number
  tablename = config.tablename
  for o, a in opts:
    if o in ('-h', '--help'):
      logger.out(__doc__)
      sys.exit(0)
    elif o in ('-n', '--number'):
      # store the parsed value in list_number: that is the variable which
      # is validated below and passed on to the GUI
      try:
        list_number = int(a)
      except ValueError:
        logger.err('invalid argument for top number: %s' % a)
        sys.exit(2)
      if list_number <= 0:
        logger.err('invalid top number: %s' % list_number)
        sys.exit(2)
    elif o in ('-t', '--tablename'):
      tablename = a
      # the name is later interpolated into SQL text, so only plain
      # identifiers are accepted
      if not re.match(r'^[_a-zA-Z][_a-zA-Z0-9]*$', tablename):
        logger.err('invalid table name: %s' % tablename)
        sys.exit(2)
  # open gui with database
  try:
    db = database.Database(tablename)
    with db:
      ui = gui.FreqGUI(db, list_number)
      ui.show()
  except sqlite3.Error as e:
    logger.err('database error: %s' % e)
def main():
  """Parse the command line and analyze the given text files into the database.

  Options:
    -h, --help          print the module docstring and exit with status 0
    -f, --format F      one of 'plain', 'aozora', 'html' (default: config.formatter)
    -e, --encoding E    codec name known to the codecs module (default: config.encoding)
    -d, --droptable     drop the tables before (re)creating them
    -s, --sentences     flag handed to mecab.PyMeCab
    -t, --tablename T   plain identifier (default: config.tablename)
    -r, --recursive     treat the arguments as directories and walk them

  Invalid options or option values are logged and exit with status 2.
  sqlite3 errors raised during the run are logged.
  """
  # the result is not used below; the call itself is kept as-is
  basedir = config.get_basedir()
  # parse command line options
  # NOTE(review): '-o' takes a value and is accepted by getopt, but no
  # branch below handles it
  short_options = 'hf:e:o:rdt:s'
  long_options = ['help','format=','encoding=', 'droptable', 'recursive', 'tablename=', 'sentences']
  try:
    opts, args = getopt.getopt(sys.argv[1:], short_options, long_options)
  except getopt.error as opterr:
    logger.err(opterr)
    logger.err('for help use --help')
    sys.exit(2)
  # start from the configured defaults, then apply the options
  format_name = config.formatter
  encoding = config.encoding
  tablename = config.tablename
  drop = recursive = sentences = False
  for flag, value in opts:
    if flag in ('-h', '--help'):
      logger.out(__doc__)
      sys.exit(0)
    elif flag in ('-f', '--format'):
      format_name = value
      if format_name not in ('plain', 'aozora', 'html'):
        logger.err('format not supported: %s' % format_name)
        sys.exit(2)
    elif flag in ('-e', '--encoding'):
      encoding = value
      try:
        codecs.lookup(encoding)
      except LookupError:
        logger.err('encoding not found: %s' % encoding)
        sys.exit(2)
    elif flag in ('-d', '--droptable'):
      drop = True
    elif flag in ('-s', '--sentences'):
      sentences = True
    elif flag in ('-t', '--tablename'):
      tablename = value
      if not re.match(r'^[_a-zA-Z][_a-zA-Z0-9]*$', tablename):
        logger.err('invalid table name: %s' % tablename)
        sys.exit(2)
    elif flag in ('-r', '--recursive'):
      recursive = True
  # create formatter and parser; anything other than 'aozora'/'html'
  # falls back to the base Format
  if format_name == 'aozora':
    format_class = formats.AozoraFormat
  elif format_name == 'html':
    format_class = formats.HTMLFormat
  else:
    format_class = formats.Format
  formatter = format_class()
  parser = mecab.PyMeCab(sentences)
  # access database
  try:
    db = database.Database(tablename)
    with db:
      if drop:
        db.drop_table()
      db.create_table()
      # process files
      logger.out('analyzing text files')
      if not recursive:
        for path in args:
          analyze(path, formatter, parser, encoding, db)
      else:
        for top in args:
          for dirpath, subdirs, names in os.walk(top):
            logger.out('going through directory %s' % dirpath)
            for name in names:
              analyze(os.path.join(dirpath, name), formatter, parser, encoding, db)
      logger.out('done analyzing')
  except sqlite3.Error as e:
    logger.err('database error: %s' % e)
 def clear_table(self):
   """Delete all rows from the frequency, sentence and link tables, then commit."""
   # same order as the table attributes are listed: freq, sentence, link
   for table_attr in ('freq_table', 'sentence_table', 'link_table'):
     statement = u'DELETE FROM %s' % getattr(self, table_attr)
     self.c.execute(statement)
   self.conn.commit()
   logger.out('cleared database tables')
 def drop_table(self):
   """Drop the frequency, sentence and link tables if they exist, then commit."""
   # same order as the table attributes are listed: freq, sentence, link
   for table_attr in ('freq_table', 'sentence_table', 'link_table'):
     statement = u'DROP TABLE IF EXISTS %s' % getattr(self, table_attr)
     self.c.execute(statement)
   self.conn.commit()
   logger.out('dropped database tables')