def test_schema_hints(self):
    """Verify the generated schema column types, first without schema hints,
    then with explicit INT hints for 'chunkSize' and 'length'.
    """
    # The full expected column-type sequence; identical for both runs below.
    expectedTypes = ['VARCHAR(40)', 'VARCHAR(40)', 'TEXT', 'VARCHAR(255)', 'TEXT',
                     'VARCHAR(255)', 'TEXT', 'TEXT', 'DATETIME', 'TEXT', 'DATETIME',
                     'TEXT', 'TEXT', 'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)',
                     'VARCHAR(255)', 'VARCHAR(255)', 'INT', 'INT', 'VARCHAR(255)',
                     'TEXT', 'TEXT', 'TEXT', 'INT', 'TEXT', 'TEXT', 'VARCHAR(255)',
                     'TEXT', 'TINYINT', 'TEXT', 'VARCHAR(255)', 'INT', 'INT',
                     'VARCHAR(255)', 'TEXT', 'VARCHAR(255)', 'VARCHAR(255)',
                     'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)',
                     'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)', 'TEXT', 'TEXT',
                     'VARCHAR(255)', 'TEXT', 'TINYINT', 'TEXT', 'INT', 'INT', 'INT',
                     'INT', 'VARCHAR(255)', 'VARCHAR(255)', 'VARCHAR(255)', 'INT',
                     'TEXT', 'VARCHAR(40)', 'VARCHAR(40)', 'VARCHAR(40)', 'VARCHAR(40)']

    # --- Run 1: no schema hints at all ---
    self.fileConverter = JSONToRelation(self.stringSource,
                                        OutputFile("testOutput.csv", OutputDisposition.OutputFormat.CSV),
                                        mainTableName='EdxTrackEvent',
                                        schemaHints=OrderedDict())
    edxJsonToRelParser = EdXTrackLogJSONParser(self.fileConverter,
                                               "EdxTrackEvent",
                                               useDisplayNameCache=True)
    self.fileConverter.jsonParserInstance = edxJsonToRelParser
    self.fileConverter.convert()
    schema = self.fileConverter.getSchema()
    self.assertEqual(expectedTypes, [ColumnSpec.getType(colSpec) for colSpec in schema])

    # --- Run 2: hint two columns as INT; resulting schema must be unchanged ---
    self.stringSource = InURI(os.path.join(os.path.dirname(__file__), "data/twoJSONRecords.json"))
    self.fileConverter = JSONToRelation(self.stringSource,
                                        OutputFile("testOutput.csv", OutputDisposition.OutputFormat.CSV),
                                        mainTableName='EdxTrackEvent',
                                        schemaHints=OrderedDict({'chunkSize': ColDataType.INT,
                                                                 'length': ColDataType.INT}))
    edxJsonToRelParser = EdXTrackLogJSONParser(self.fileConverter,
                                               "EdxTrackEvent",
                                               useDisplayNameCache=True)
    self.fileConverter.jsonParserInstance = edxJsonToRelParser
    self.fileConverter.convert()
    schema = self.fileConverter.getSchema()
    self.assertEqual(expectedTypes, [ColumnSpec.getType(colSpec) for colSpec in schema])
def convert(inFilePath, destDir, targetFormat, dropTables=False):
    """Convert one JSON tracking-log file to the requested relational format.

    @param inFilePath: path to the input .json (or .gz) tracking-log file
    @param destDir: directory in which the output file is created
    @param targetFormat: 'csv', 'sql_dump', or anything else for combined
        SQL-inserts-plus-CSV output
    @param dropTables: passed to the parser as replaceTables; when True,
        generated SQL drops/recreates the target tables
    """
    # Output file is the name of the input file with the
    # .json extension replaced by .sql:
    outFullPath = buildOutputFileName(inFilePath, destDir)

    # Log file will go to <destDir>/../TransformLogs, the file
    # being named j2s_<inputFileName>.log:
    logDir = os.path.join(destDir, '..', 'TransformLogs')
    if not os.access(logDir, os.W_OK):
        try:
            os.makedirs(logDir)
        except OSError:
            # Log dir already exists:
            pass
    logFile = os.path.join(logDir, 'j2s_%s.log' % os.path.basename(inFilePath))

    # Map the textual target format onto the OutputDisposition constant:
    if targetFormat == 'csv':
        outputFormat = OutputDisposition.OutputFormat.CSV
    elif targetFormat == 'sql_dump':
        outputFormat = OutputDisposition.OutputFormat.SQL_INSERT_STATEMENTS
    else:
        outputFormat = OutputDisposition.OutputFormat.SQL_INSERTS_AND_CSV

    # Overwrite any existing sql file:
    outSQLFile = OutputFile(outFullPath, outputFormat, options='wb')

    # Create a JSONToRelation instance, taking input from the given file
    # and pumping output to the given output path:
    jsonConverter = JSONToRelation(InURI(inFilePath),
                                   outSQLFile,
                                   mainTableName='EdxTrackEvent',
                                   logFile=logFile,
                                   progressEvery=10000)
    try:
        jsonConverter.setParser(EdXTrackLogJSONParser(jsonConverter,
                                                      'EdxTrackEvent',
                                                      replaceTables=dropTables,
                                                      dbName='Edx',
                                                      progressEvery=10000))
    except Exception as e:
        # Was: backtick repr (`e`) -- Python-2-only syntax, removed in
        # Python 3. repr(e) produces the identical string.
        with open(logFile, 'w') as fd:
            fd.write("In json2sql: could not create EdXTrackLogJSONParser: %s" % repr(e))
        # Try to delete the .sql file that was created when the OutputFile
        # instance was made in the JSONToRelation instantiation above.
        # Best-effort only: failure to clean up must not mask the real error.
        try:
            outSQLFile.remove()
        except Exception:
            pass
        sys.exit(1)

    jsonConverter.convert()
def test_edX_tracking_import(self):
    """Run a full conversion of the sample edX tracking log to CSV,
    with a column-header row prepended.
    """
    source = InURI(os.path.join(os.path.dirname(__file__), "data/edxTrackLogSample.json"))
    self.fileConverter = JSONToRelation(source,
                                        OutputFile("testEdXImport.csv", OutputDisposition.OutputFormat.CSV),
                                        mainTableName='EdxTrackEvent')
    parser = EdXTrackLogJSONParser(self.fileConverter,
                                   "EdxTrackEvent",
                                   useDisplayNameCache=True)
    self.fileConverter.jsonParserInstance = parser
    self.fileConverter.convert(prependColHeader=True)
def test_embedded_json_strings_comma_escaping(self):
    """Convert a tiny edX log whose records contain embedded JSON strings,
    exercising comma escaping in the CSV output.
    """
    source = InURI(os.path.join(os.path.dirname(__file__), "data/tinyEdXTrackLog.json"))
    self.fileConverter = JSONToRelation(source,
                                        OutputFile("testTinyEdXImport.csv", OutputDisposition.OutputFormat.CSV),
                                        mainTableName='EdxTrackEvent')
    parser = EdXTrackLogJSONParser(self.fileConverter,
                                   "EdxTrackEvent",
                                   useDisplayNameCache=True)
    self.fileConverter.jsonParserInstance = parser
    self.fileConverter.convert(prependColHeader=True)
def test_arrays(self):
    """Convert a log file containing JSON arrays; the parser runs with
    its unittesting flag set.
    """
    source = InURI(os.path.join(os.path.dirname(__file__), "data/jsonArray.json"))
    self.fileConverter = JSONToRelation(source,
                                        OutputFile("testArrays.csv", OutputDisposition.OutputFormat.CSV),
                                        mainTableName='EdxTrackEvent')
    parser = EdXTrackLogJSONParser(self.fileConverter,
                                   "EdxTrackEvent",
                                   useDisplayNameCache=True)
    parser.unittesting = True
    self.fileConverter.jsonParserInstance = parser
    self.fileConverter.convert(prependColHeader=True)
def test_edX_stress_import(self):
    """Stress test: convert a large gzipped tracking log, reporting
    progress every 10 records.
    """
    source = InURI(os.path.join(os.path.dirname(__file__), "data/tracking.log-20130609.gz"))
    print("Stress test: importing lots...")
    self.fileConverter = JSONToRelation(source,
                                        OutputFile("testEdXStressImport.csv", OutputDisposition.OutputFormat.CSV),
                                        mainTableName='EdxTrackEvent',
                                        progressEvery=10)
    parser = EdXTrackLogJSONParser(self.fileConverter,
                                   "EdxTrackEvent",
                                   useDisplayNameCache=True)
    self.fileConverter.jsonParserInstance = parser
    self.fileConverter.convert(prependColHeader=True)
    print("Stress test done")
def test_simple_json_to_file(self):
    """Convert the fixture string source to CSV and compare the output
    file against the stored truth file (or regenerate the truth when
    UPDATE_TRUTH is set).
    """
    self.fileConverter = JSONToRelation(self.stringSource,
                                        OutputFile("testOutput.csv", OutputDisposition.OutputFormat.CSV))
    parser = EdXTrackLogJSONParser(self.fileConverter,
                                   "EdxTrackEvent",
                                   useDisplayNameCache=True)
    self.fileConverter.jsonParserInstance = parser
    self.fileConverter.convert()
    with open(os.path.join(self.currDir, 'data/simpleJsonToFileTruth.txt'), 'r') as fd:
        expected = fd.read()
    # NOTE(review): the attribute is spelled self.currDir above but
    # self.curDir below, and the truth file written on UPDATE_TRUTH
    # (data/rescueJSONTruth1.txt) differs from the one read here
    # (data/simpleJsonToFileTruth.txt) -- both look suspicious; confirm
    # against the test fixture setup, which is outside this chunk.
    if UPDATE_TRUTH:
        self.updateTruthFromFile(os.path.join(self.curDir, 'testOutput.csv'),
                                 os.path.join(self.curDir, 'data/rescueJSONTruth1.txt'))
    else:
        self.assertFileContentEquals(expected, "testOutput.csv")