Example #1
 def _run(self):
     utils.log("[%s] initializing" % self)
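     # open the source file just long enough to read the EPF table-format header;
     # the rows themselves are read back from the DB table below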
     f, numLines, filename = self._open_file(countLines=False)
     
     table_format = epf.parse_table_format(f, filename)
     self.table_format = table_format
     f.close()
     
     # the table is already populated, so the row count used for progress reporting comes from the DB
     numLines = self.execute('SELECT COUNT(*) FROM "%s"' % self.table).fetchone()[0]
     utils.log("[%s] parsing ~%d entities from '%s'" % (self, numLines, self.table))
     
     rows  = self.execute('SELECT * FROM "%s"' % self.table)
     #self._globals['rows'] = rows; self._output.put(StopIteration); return
     count = 0
     
     for row in rows:
         row = self._format_result(row)
         self._parseRow(row)
         count += 1
         
         # log progress roughly every 1% of the rows
         if numLines > 100 and (count % (numLines / 100)) == 0:
             utils.log("[%s] done parsing %s" % \
                 (self, utils.getStatusStr(count, numLines)))
             time.sleep(0.1)
     
     self._output.put(StopIteration)
     
     utils.log("[%s] finished parsing %d entities (filtered %d)" % (self, count, self.numFiltered))
Example #2
 def _run(self):
     utils.log("[%s] initializing" % self)
     f, numLines, filename = self._open_file(countLines=False)
     
     table_format = epf.parse_table_format(f, filename)
     self.table_format = table_format
     
     stale = False
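     # parsed rows are buffered and periodically flushed to the DB (see _try_flush_buffer below)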
     self._buffer = []
     self._buffer_threshold = 1024
     
     # determine whether the db table already exists and appears to be up-to-date,
     # so that we don't recompute it unnecessarily.
     try:
         row0 = self.execute('SELECT * FROM %s LIMIT 1' % (self.table, ), error_okay=True).fetchone()
         
         if row0 is None:
             stale = True
         elif len(row0) != len(dict(table_format.cols)):
             stale = True
     except Exception:
         # the probe failed (e.g. the table doesn't exist yet); roll back and rebuild the table
         self.conn.rollback()
         #utils.printException()
         stale = True
     
     #f.close(); self._output.put(StopIteration); return
     
     if not stale:
         # table is usable as-is
         utils.log("[%s] %s.%s doesn't need to be recomputed" % (self, self.dbpath, self.table))
     else:
         utils.log("[%s] opening '%s'" % (self, self._filename))
         
         # rough estimate of data rows: total line count minus the file's header lines
         numLines = max(0, utils.getNumLines(f) - 8)
         table_format = epf.parse_table_format(f, filename)
         self.table_format = table_format
         
         utils.log("[%s] parsing ~%d rows from '%s'" % (self, numLines, self._filename))
         
         # initialize table
         cols  = []
         
         # currently disabling primary keys for most tables
         found_primary = False #(len(table_format.primary_keys) != 1)
         
         # pre-size the column-definition list; each slot is filled in by index below
         for col in table_format.cols:
             cols.append('')
         
         for col in table_format.cols:
             primary = ""
             if not found_primary and col == self.primary and not self._sqlite:
             #if not found_primary and col in table_format.primary_keys:
                 # TODO: handle the common case of multiple primary keys, which sqlite3 does not support
                 # TODO: defining the primary key here as opposed to after insertion is much slower!
                 primary = " PRIMARY KEY"
                 found_primary = True
             
             col2  = table_format.cols[col]
             col_type = col2['type']
             
             if not self._sqlite:
                 # perform mapping between some MySQL types that Apple uses and 
                 # their postgres equivalents
                 if col_type == 'DATETIME':
                     col_type = 'VARCHAR(100)'
                 elif col_type == 'LONGTEXT':
                     col_type = 'VARCHAR(4000)'
             
             text  = "%s %s%s" % (col, col_type, primary)
             index = col2['index']
             cols[index] = text
         
         args = string.joinfields(cols, ', ')
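         # drop any existing copy of the table and recreate it with the schema built above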
         self.execute("DROP TABLE %s" % (self.table, ), error_okay=True)
         self.execute("CREATE TABLE %s (%s)" % (self.table, args), verbose=True)
         
         # sqlite's DB-API paramstyle uses '?' placeholders; the postgres driver uses '%s'
         if self._sqlite:
             placeholder = '?'
         else:
             placeholder = '%s'
         
         # build the parameterized INSERT statement (one placeholder per column) used for parsed rows
         values_str  = '(%s)' % string.joinfields((placeholder for col in table_format.cols), ', ')
         self._cmd   = 'INSERT INTO %s VALUES %s' % (self.table, values_str)
         
         count = 0
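         # stream rows straight from the EPF file, parsing (and buffering) each one for insertion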
         for row in epf.parse_rows(f, table_format):
             self._parseRowOld(row, table_format)
             count += 1
             
             # log progress roughly every 1% of the estimated rows
             if numLines > 100 and (count % (numLines / 100)) == 0:
                 num_rows = self.execute('SELECT COUNT(*) FROM %s' % (self.table, )).fetchone()[0]
                 
                 utils.log("[%s] done parsing %s -- %d rows" % \
                     (self, utils.getStatusStr(count, numLines), num_rows))
         
         # flush any rows still sitting in the insert buffer
         self._try_flush_buffer(force=True)
         
         if self.index:
             self.execute("CREATE INDEX %s on %s (%s)" % (self.index, self.table, self.index), verbose=True)
         
         utils.log("[%s] finished parsing %d rows" % (self, count))
     
     f.close()
     self._output.put(StopIteration)
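
For reference, here is a minimal standalone sketch of the epf helpers both workers rely on. It assumes only the parse_table_format(f, filename) and parse_rows(f, table_format) calls and the table_format.cols layout seen above; the import path and the file name are hypothetical.

# Minimal sketch (not part of the project above): drive the epf helpers directly.
# Assumes the parse_table_format(f, filename) / parse_rows(f, table_format)
# signatures used in the examples; the 'epf' import path and file name are hypothetical.
import epf

filename = 'artist.txt'   # hypothetical EPF export file
f = open(filename)

# column layout parsed from the file header: maps column name -> info
# such as 'type' and 'index', as used in Example #2
table_format = epf.parse_table_format(f, filename)
print(table_format.cols)

# iterate the data rows one at a time
count = 0
for row in epf.parse_rows(f, table_format):
    count += 1

print('parsed %d rows' % count)
f.close()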