def generateDataMilestone3(dataSize):
    """Generate one random 4-column table and persist it as three identical CSVs.

    The same data is written for the control, btree, and clustered-btree
    variants of table 4.  col2 is salted with two artificially frequent
    values (~5% and ~2% of rows) so index-selectivity tests have targets.

    Returns (frequentVal1, frequentVal2, table) where the two values are the
    injected frequent col2 values and table is the generated DataFrame.

    NOTE(review): `line_terminator=` was renamed `lineterminator=` in
    pandas 2.0 — confirm the project's pinned pandas before upgrading.
    """
    path_ctrl = TEST_BASE_DIR + '/' + 'data4_ctrl.csv'
    path_btree = TEST_BASE_DIR + '/' + 'data4_btree.csv'
    path_clustered = TEST_BASE_DIR + '/' + 'data4_clustered_btree.csv'
    hdr_ctrl = data_gen_utils.generateHeaderLine('db1', 'tbl4_ctrl', 4)
    hdr_btree = data_gen_utils.generateHeaderLine('db1', 'tbl4', 4)
    hdr_clustered = data_gen_utils.generateHeaderLine('db1', 'tbl4_clustered_btree', 4)
    # Base table drawn from [0, dataSize/5): deliberately duplicate-heavy
    # for large tables.
    table = pd.DataFrame(
        np.random.randint(0, dataSize / 5, size=(dataSize, 4)),
        columns=['col1', 'col2', 'col3', 'col4'])
    table['col1'] = np.random.randint(0, 1000, size=(dataSize))
    table['col4'] = np.random.randint(0, 10000, size=(dataSize))
    # ~5% of rows will take the first frequent value...
    mask_frequent1 = np.random.uniform(0.0, 1.0, dataSize) < 0.05
    # ...and ~2% the second one.
    mask_frequent2 = np.random.uniform(0.0, 1.0, dataSize) < 0.02
    table['col2'] = np.random.randint(0, 10000, size=(dataSize))
    frequentVal1 = np.random.randint(0, int(dataSize / 5))
    frequentVal2 = np.random.randint(0, int(dataSize / 5))
    table.loc[mask_frequent1, 'col2'] = frequentVal1
    table.loc[mask_frequent2, 'col2'] = frequentVal2
    # col4 correlates with col1 by construction.
    table['col4'] = table['col4'] + table['col1']
    # Same data, three destinations — only the header line differs.
    for path, hdr in ((path_ctrl, hdr_ctrl),
                      (path_btree, hdr_btree),
                      (path_clustered, hdr_clustered)):
        table.to_csv(path, sep=',', index=False, header=hdr,
                     line_terminator='\n')
    return frequentVal1, frequentVal2, table
def generateDataMilestone4(dataSizeFact, dataSizeDim1, dataSizeDim2, zipfianParam, numDistinctElements):
    """Generate a small star schema: one fact table plus two dimension tables.

    The fact table and dimension 1 share a Zipfian-skewed join key in col1;
    the fact table's col4 and dimension 1's col2 both join against dimension
    2's dense key column.  All three tables are written to CSV files.

    Returns (factTable, dimTable1, dimTable2) as DataFrames.
    """
    fact_path = TEST_BASE_DIR + '/' + 'data5_fact.csv'
    dim1_path = TEST_BASE_DIR + '/' + 'data5_dimension1.csv'
    dim2_path = TEST_BASE_DIR + '/' + 'data5_dimension2.csv'
    fact_header = data_gen_utils.generateHeaderLine('db1', 'tbl5_fact', 4)
    dim1_header = data_gen_utils.generateHeaderLine('db1', 'tbl5_dim1', 3)
    dim2_header = data_gen_utils.generateHeaderLine('db1', 'tbl5_dim2', 2)
    factTable = pd.DataFrame(
        np.random.randint(0, dataSizeFact / 5, size=(dataSizeFact, 4)),
        columns=['col1', 'col2', 'col3', 'col4'])
    # See Zipf's distribution (Wikipedia) for a description of this
    # skewed key distribution.
    zipfDist = ZipfianDistribution(zipfianParam, numDistinctElements)
    factTable['col1'] = zipfDist.createRandomNumpyArray(dataSizeFact)
    factTable['col3'] = np.full((dataSizeFact), 1)
    factTable['col4'] = np.random.randint(1, dataSizeDim2, size=(dataSizeFact))
    dimTable1 = pd.DataFrame(
        np.random.randint(0, dataSizeDim1 / 5, size=(dataSizeDim1, 3)),
        columns=['col1', 'col2', 'col3'])
    # col1: joinable with the fact table (same Zipfian distribution).
    dimTable1['col1'] = zipfDist.createRandomNumpyArray(dataSizeDim1)
    # col2: joinable with dimension table 2.
    dimTable1['col2'] = np.random.randint(1, dataSizeDim2, size=(dataSizeDim1))
    dimTable2 = pd.DataFrame(
        np.random.randint(0, dataSizeDim2 / 5, size=(dataSizeDim2, 2)),
        columns=['col1', 'col2'])
    # Dense primary key 1..dataSizeDim2.
    dimTable2['col1'] = np.arange(1, dataSizeDim2 + 1, 1)
    factTable.to_csv(fact_path, sep=',', index=False, header=fact_header,
                     line_terminator='\n')
    dimTable1.to_csv(dim1_path, sep=',', index=False, header=dim1_header,
                     line_terminator='\n')
    dimTable2.to_csv(dim2_path, sep=',', index=False, header=dim2_header,
                     line_terminator='\n')
    return factTable, dimTable1, dimTable2
def generateDataFile2(dataSizeTableTwo):
    """Generate the 4-column table 2 dataset and write it to CSV.

    col1/col2 span negative and positive values with col2 correlated to
    col1; col3 is deliberately duplicate-heavy; col4 sits just below the
    signed 32-bit boundary to exercise large-value handling.

    Returns the generated DataFrame.
    """
    out_path = TEST_BASE_DIR + '/' + 'data2_generated.csv'
    header = data_gen_utils.generateHeaderLine('db1', 'tbl2', 4)
    half = dataSizeTableTwo / 2
    table = pd.DataFrame(
        np.random.randint(-half, half, size=(dataSizeTableTwo, 4)),
        columns=['col1', 'col2', 'col3', 'col4'])
    table['col2'] = table['col2'] + table['col1']
    # Only 100 distinct values: many, many duplicates by design.
    table['col3'] = np.random.randint(0, 100, size=(dataSizeTableTwo))
    # Values in [2^31 - 10000, 2^31): near the int32 maximum.
    table['col4'] = np.random.randint(2**31 - 10000, 2**31,
                                      size=(dataSizeTableTwo))
    table.to_csv(out_path, sep=',', index=False, header=header,
                 line_terminator='\n')
    return table
def generateDataFileMidwayCheckin():
    """Generate the deterministic 2-column table for the midway check-in.

    col1 is 0..999 in order; col2 is 10..1009 shuffled with a fixed seed so
    the grading server produces exactly the same data.

    Returns the generated DataFrame.
    """
    out_path = TEST_BASE_DIR + '/data1_generated.csv'
    header = data_gen_utils.generateHeaderLine('db1', 'tbl1', 2)
    col1 = list(range(0, 1000))
    col2 = list(range(10, 1010))
    # For these 3 tests the seed is exactly the same on the server —
    # the shuffle below must stay reproducible.
    np.random.seed(47)
    np.random.shuffle(col2)
    table = pd.DataFrame(list(zip(col1, col2)), columns=['col1', 'col2'])
    table.to_csv(out_path, sep=',', index=False, header=header,
                 line_terminator='\n')
    return table
def generateDataMilestone2(dataSize):
    """Generate the 4-column batch-query table for milestone 2 and save it.

    The base table is duplicate-heavy for large sizes; col1 and col4 are
    redrawn from small ranges and col4 is then correlated with col1.

    Returns the generated DataFrame.
    """
    out_path = TEST_BASE_DIR + '/data3_batch.csv'
    header = data_gen_utils.generateHeaderLine('db1', 'tbl3_batch', 4)
    # Drawn from [0, dataSize/5): many duplicates for large tables, by design.
    table = pd.DataFrame(
        np.random.randint(0, dataSize / 5, size=(dataSize, 4)),
        columns=['col1', 'col2', 'col3', 'col4'])
    table['col1'] = np.random.randint(0, 1000, size=(dataSize))
    table['col4'] = np.random.randint(0, 10000, size=(dataSize))
    # col4 correlates with col1 by construction.
    table['col4'] = table['col4'] + table['col1']
    table.to_csv(out_path, sep=',', index=False, header=header,
                 line_terminator='\n')
    return table