def convert_to_sstables(input_files, column_family, output_dir_name, keyspace,
                        timestamp, buffer_size, data_type):
    """Convert tab-separated (rowkey, colkey, value) lines into SSTables.

    Every line read from ``input_files`` (via ``fileinput``) must contain
    exactly three tab-separated fields.  Each value is run through the
    coercer registered for ``data_type`` in the module-level ``COERCERS``
    table and written as a column through an ``SSTableSimpleUnsortedWriter``.

    Args:
        input_files: file name(s) handed straight to ``fileinput.input``.
        column_family: column family name passed to the writer.
        output_dir_name: directory for the generated files; created with a
            single ``mkdir`` when it does not exist yet.
        keyspace: keyspace name passed to the writer.
        timestamp: timestamp attached to every column written.
        buffer_size: buffer size argument passed to the writer.
        data_type: key into ``COERCERS`` selecting the value coercer.

    Raises:
        ValueError: if ``data_type`` is not a key of ``COERCERS``, or if a
            line does not split into exactly three fields.
    """
    import fileinput

    # Java-side imports stay local to the function, as in the original.
    from java.io import File
    from org.apache.cassandra.io.sstable import SSTableSimpleUnsortedWriter
    from org.apache.cassandra.db.marshal import AsciiType

    try:
        coercer = COERCERS[data_type]
    except KeyError:
        raise ValueError("invalid data type")

    output_dir = File(output_dir_name)
    if not output_dir.exists():
        output_dir.mkdir()

    writer = SSTableSimpleUnsortedWriter(output_dir,
                                         keyspace,
                                         column_family,
                                         AsciiType.instance,
                                         None,
                                         buffer_size)

    try:
        previous_rowkey = None
        for line in fileinput.input(input_files):
            rowkey, colkey, value = line.rstrip("\n").split("\t")

            if rowkey != previous_rowkey:
                writer.newRow(bytes(rowkey))
                # Remember the key so that consecutive lines for the same
                # row do not start a new row on every line.
                previous_rowkey = rowkey

            coerced = coercer(value)
            writer.addColumn(bytes(colkey), coerced, timestamp)

            if fileinput.lineno() % 1000 == 0:
                # Single parenthesised argument: prints identically whether
                # ``print`` is a statement (Python 2 / Jython) or a function.
                print("%d items processed (%s)" % (fileinput.lineno(),
                                                   fileinput.filename()))
    finally:
        # Always close the writer, even when the input was malformed.
        writer.close()
def convert_to_sstables(input_files, column_family, output_dir_name, keyspace,
                        timestamp, buffer_size, data_type, verbose=False):
    """Convert tab-separated column tuples into Snappy-compressed SSTables.

    Every line read from ``input_files`` (via ``fileinput``) must contain
    either three tab-separated fields ``rowkey, colkey, value`` or four
    fields ``rowkey, colkey, value, ttl``.  Each value is run through the
    coercer registered for ``data_type`` in the module-level ``COERCERS``
    table.  Lines without a TTL are written with ``addColumn``; lines with
    one are written with ``addExpiringColumn``.

    Args:
        input_files: file name(s) handed straight to ``fileinput.input``.
        column_family: column family name passed to the writer.
        output_dir_name: directory for the generated files; created with a
            single ``mkdir`` when it does not exist yet.
        keyspace: keyspace name passed to the writer.
        timestamp: timestamp attached to every column written.
        buffer_size: buffer size argument passed to the writer.
        data_type: key into ``COERCERS`` selecting the value coercer.
        verbose: when true, print a progress line every 10000 input lines.

    Raises:
        ValueError: if ``data_type`` is not a key of ``COERCERS`` or a TTL
            field is not a valid integer.
        Exception: if a line has neither three nor four fields.
    """
    import fileinput
    # Imported locally so the failure handler below can always log, even if
    # the surrounding module does not import ``logging`` itself.
    import logging

    # Java-side imports stay local to the function, as in the original.
    from java.io import File
    from org.apache.cassandra.io.sstable import SSTableSimpleUnsortedWriter
    from org.apache.cassandra.db.marshal import AsciiType
    from org.apache.cassandra.service import StorageService
    from org.apache.cassandra.io.compress import CompressionParameters

    partitioner = StorageService.getPartitioner()

    try:
        coercer = COERCERS[data_type]
    except KeyError:
        raise ValueError("invalid data type")

    output_dir = File(output_dir_name)
    if not output_dir.exists():
        output_dir.mkdir()

    compression_options = CompressionParameters.create({
        'sstable_compression': 'org.apache.cassandra.io.compress.SnappyCompressor',
        'chunk_length_kb': '64'
    })

    writer = SSTableSimpleUnsortedWriter(output_dir,
                                         partitioner,
                                         keyspace,
                                         column_family,
                                         AsciiType.instance,
                                         None,
                                         buffer_size,
                                         compression_options)

    try:
        previous_rowkey = None
        for line in fileinput.input(input_files):
            ttl = None
            t_columns = line.rstrip("\n").split("\t")
            if len(t_columns) == 3:
                rowkey, colkey, value = t_columns
            elif len(t_columns) == 4:
                rowkey, colkey, value, ttl = t_columns
                ttl = int(ttl)
            else:
                raise Exception("unknown data format for %r" % (t_columns,))

            if rowkey != previous_rowkey:
                writer.newRow(bytes(rowkey))
                # Remember the key so that consecutive lines for the same
                # row do not start a new row on every line.
                previous_rowkey = rowkey

            coerced = coercer(value)
            if ttl is None:
                writer.addColumn(bytes(colkey), coerced, timestamp)
            else:
                # see
                # https://svn.apache.org/repos/asf/cassandra/trunk/src/java/org/apache/cassandra/io/sstable/AbstractSSTableSimpleWriter.java
                # addExpiringColumn:expirationTimestampMS for explanation.
                # NOTE(review): the arithmetic presumably assumes `timestamp`
                # is in microseconds and `ttl` in seconds, yielding
                # milliseconds -- confirm against the callers.
                expirationTimestampMS = (timestamp / 1000) + (ttl * 1000)
                writer.addExpiringColumn(bytes(colkey), coerced, timestamp,
                                         ttl, expirationTimestampMS)

            if verbose and fileinput.lineno() % 10000 == 0:
                # Single parenthesised argument: prints identically whether
                # ``print`` is a statement (Python 2 / Jython) or a function.
                print("%d items processed (%s)" % (fileinput.lineno(),
                                                   fileinput.filename()))
    except:
        # it's common that whatever causes us to fail also causes the finally
        # clause below to fail, which masks the original exception.  Log it
        # first, then re-raise unchanged.  The bare except is deliberate: it
        # logs and re-raises everything rather than swallowing anything.
        logging.exception("Failed")
        raise
    finally:
        writer.close()
def convert_to_sstables(input_files, column_family, output_dir_name, keyspace,
                        timestamp, buffer_size, data_type, verbose=False):
    """Convert tab-separated column tuples into Snappy-compressed SSTables.

    Every line read from ``input_files`` (via ``fileinput``) must contain
    either three tab-separated fields ``rowkey, colkey, value`` or four
    fields ``rowkey, colkey, value, ttl``.  Each value is run through the
    coercer registered for ``data_type`` in the module-level ``COERCERS``
    table.  Lines without a TTL are written with ``addColumn``; lines with
    one are written with ``addExpiringColumn``.

    Args:
        input_files: file name(s) handed straight to ``fileinput.input``.
        column_family: column family name passed to the writer.
        output_dir_name: directory for the generated files; created with a
            single ``mkdir`` when it does not exist yet.
        keyspace: keyspace name passed to the writer.
        timestamp: timestamp attached to every column written.
        buffer_size: buffer size argument passed to the writer.
        data_type: key into ``COERCERS`` selecting the value coercer.
        verbose: when true, print a progress line every 10000 input lines.

    Raises:
        ValueError: if ``data_type`` is not a key of ``COERCERS`` or a TTL
            field is not a valid integer.
        Exception: if a line has neither three nor four fields.
    """
    import fileinput
    # Imported locally so the failure handler below can always log, even if
    # the surrounding module does not import ``logging`` itself.
    import logging

    # Java-side imports stay local to the function, as in the original.
    from java.io import File
    from org.apache.cassandra.io.sstable import SSTableSimpleUnsortedWriter
    from org.apache.cassandra.db.marshal import AsciiType
    from org.apache.cassandra.service import StorageService
    from org.apache.cassandra.io.compress import CompressionParameters

    partitioner = StorageService.getPartitioner()

    try:
        coercer = COERCERS[data_type]
    except KeyError:
        raise ValueError("invalid data type")

    output_dir = File(output_dir_name)
    if not output_dir.exists():
        output_dir.mkdir()

    compression_options = CompressionParameters.create({
        'sstable_compression': 'org.apache.cassandra.io.compress.SnappyCompressor',
        'chunk_length_kb': '64'
    })

    writer = SSTableSimpleUnsortedWriter(output_dir,
                                         partitioner,
                                         keyspace,
                                         column_family,
                                         AsciiType.instance,
                                         None,
                                         buffer_size,
                                         compression_options)

    try:
        previous_rowkey = None
        for line in fileinput.input(input_files):
            ttl = None
            t_columns = line.rstrip("\n").split("\t")
            if len(t_columns) == 3:
                rowkey, colkey, value = t_columns
            elif len(t_columns) == 4:
                rowkey, colkey, value, ttl = t_columns
                ttl = int(ttl)
            else:
                raise Exception("unknown data format for %r" % (t_columns, ))

            if rowkey != previous_rowkey:
                writer.newRow(bytes(rowkey))
                # Remember the key so that consecutive lines for the same
                # row do not start a new row on every line.
                previous_rowkey = rowkey

            coerced = coercer(value)
            if ttl is None:
                writer.addColumn(bytes(colkey), coerced, timestamp)
            else:
                # see
                # https://svn.apache.org/repos/asf/cassandra/trunk/src/java/org/apache/cassandra/io/sstable/AbstractSSTableSimpleWriter.java
                # addExpiringColumn:expirationTimestampMS for explanation.
                # NOTE(review): the arithmetic presumably assumes `timestamp`
                # is in microseconds and `ttl` in seconds, yielding
                # milliseconds -- confirm against the callers.
                expirationTimestampMS = (timestamp / 1000) + (ttl * 1000)
                writer.addExpiringColumn(bytes(colkey), coerced, timestamp,
                                         ttl, expirationTimestampMS)

            if verbose and fileinput.lineno() % 10000 == 0:
                # Single parenthesised argument: prints identically whether
                # ``print`` is a statement (Python 2 / Jython) or a function.
                print("%d items processed (%s)" % (fileinput.lineno(),
                                                   fileinput.filename()))
    except:
        # it's common that whatever causes us to fail also causes the finally
        # clause below to fail, which masks the original exception.  Log it
        # first, then re-raise unchanged.  The bare except is deliberate: it
        # logs and re-raises everything rather than swallowing anything.
        logging.exception("Failed")
        raise
    finally:
        writer.close()