Example #1
0
def generateDataMilestone3(dataSize):
    """Generate the milestone-3 test table and write it to three CSV files.

    The identical table is written to a control file, a b-tree file, and a
    clustered-b-tree file (only the header line differs).  col2 is skewed so
    that roughly 5% of rows share ``frequentVal1`` and roughly 2% share
    ``frequentVal2``; col4 is correlated with col1 (col4 = uniform + col1).

    Args:
        dataSize: number of rows to generate (positive int).

    Returns:
        (frequentVal1, frequentVal2, outputTable) where outputTable is the
        generated pandas DataFrame.
    """
    outputFile_ctrl = TEST_BASE_DIR + '/' + 'data4_ctrl.csv'
    outputFile_btree = TEST_BASE_DIR + '/' + 'data4_btree.csv'
    outputFile_clustered_btree = TEST_BASE_DIR + '/' + 'data4_clustered_btree.csv'
    header_line_ctrl = data_gen_utils.generateHeaderLine('db1', 'tbl4_ctrl', 4)
    header_line_btree = data_gen_utils.generateHeaderLine('db1', 'tbl4', 4)
    header_line_clustered_btree = data_gen_utils.generateHeaderLine('db1', 'tbl4_clustered_btree', 4)
    # Floor division: passing a float upper bound to randint is deprecated
    # and raises a TypeError in recent NumPy releases.
    outputTable = pd.DataFrame(np.random.randint(0, dataSize // 5, size=(dataSize, 4)),
                               columns=['col1', 'col2', 'col3', 'col4'])
    # This is going to have many, many duplicates for large tables!!!!
    outputTable['col1'] = np.random.randint(0, 1000, size=dataSize)
    outputTable['col4'] = np.random.randint(0, 10000, size=dataSize)
    ### make ~5% of values a single value!
    maskStart = np.random.uniform(0.0, 1.0, dataSize)
    mask1 = maskStart < 0.05
    ### make ~2% of values a different value
    maskStart = np.random.uniform(0.0, 1.0, dataSize)
    mask2 = maskStart < 0.02
    outputTable['col2'] = np.random.randint(0, 10000, size=dataSize)
    frequentVal1 = np.random.randint(0, dataSize // 5)
    frequentVal2 = np.random.randint(0, dataSize // 5)
    outputTable.loc[mask1, 'col2'] = frequentVal1
    outputTable.loc[mask2, 'col2'] = frequentVal2
    outputTable['col4'] = outputTable['col4'] + outputTable['col1']
    outputTable.to_csv(outputFile_ctrl, sep=',', index=False, header=header_line_ctrl, line_terminator='\n')
    outputTable.to_csv(outputFile_btree, sep=',', index=False, header=header_line_btree, line_terminator='\n')
    outputTable.to_csv(outputFile_clustered_btree, sep=',', index=False, header=header_line_clustered_btree, line_terminator='\n')
    return frequentVal1, frequentVal2, outputTable
Example #2
0
def generateDataMilestone4(dataSizeFact, dataSizeDim1, dataSizeDim2, zipfianParam, numDistinctElements):
    """Generate a fact table plus two dimension tables and write them as CSVs.

    The fact table's col1 and dimension table 1's col1 are drawn from the same
    Zipfian distribution (joinable); dim1.col2 and fact.col4 reference
    dim2.col1, which holds the contiguous keys 1..dataSizeDim2.

    Args:
        dataSizeFact: row count of the fact table.
        dataSizeDim1: row count of dimension table 1.
        dataSizeDim2: row count of dimension table 2.
        zipfianParam: skew parameter for the Zipfian distribution.
        numDistinctElements: number of distinct Zipfian values.

    Returns:
        (outputFactTable, outputDimTable1, outputDimTable2) as DataFrames.
    """
    outputFile1 = TEST_BASE_DIR + '/' + 'data5_fact.csv'
    outputFile2 = TEST_BASE_DIR + '/' + 'data5_dimension1.csv'
    outputFile3 = TEST_BASE_DIR + '/' + 'data5_dimension2.csv'

    header_line_fact = data_gen_utils.generateHeaderLine('db1', 'tbl5_fact', 4)
    header_line_dim1 = data_gen_utils.generateHeaderLine('db1', 'tbl5_dim1', 3)
    header_line_dim2 = data_gen_utils.generateHeaderLine('db1', 'tbl5_dim2', 2)
    # Floor division throughout: float upper bounds to randint are deprecated
    # and rejected by recent NumPy releases.
    outputFactTable = pd.DataFrame(np.random.randint(0, dataSizeFact // 5, size=(dataSizeFact, 4)),
                                   columns=['col1', 'col2', 'col3', 'col4'])
    zipfDist = ZipfianDistribution(zipfianParam, numDistinctElements)
    # See Zipf's distribution (wikipedia) for a description of this distribution.
    outputFactTable['col1'] = zipfDist.createRandomNumpyArray(dataSizeFact)
    outputFactTable['col3'] = np.full(dataSizeFact, 1)
    outputFactTable['col4'] = np.random.randint(1, dataSizeDim2, size=dataSizeFact)

    outputDimTable1 = pd.DataFrame(np.random.randint(0, dataSizeDim1 // 5, size=(dataSizeDim1, 3)),
                                   columns=['col1', 'col2', 'col3'])
    # joinable on col1 with fact table
    outputDimTable1['col1'] = zipfDist.createRandomNumpyArray(dataSizeDim1)
    # joinable on col2 with dimension table 2
    outputDimTable1['col2'] = np.random.randint(1, dataSizeDim2, size=dataSizeDim1)

    outputDimTable2 = pd.DataFrame(np.random.randint(0, dataSizeDim2 // 5, size=(dataSizeDim2, 2)),
                                   columns=['col1', 'col2'])
    # Primary-key-like column: contiguous ids 1..dataSizeDim2.
    outputDimTable2['col1'] = np.arange(1, dataSizeDim2 + 1, 1)

    outputFactTable.to_csv(outputFile1, sep=',', index=False, header=header_line_fact, line_terminator='\n')
    outputDimTable1.to_csv(outputFile2, sep=',', index=False, header=header_line_dim1, line_terminator='\n')
    outputDimTable2.to_csv(outputFile3, sep=',', index=False, header=header_line_dim2, line_terminator='\n')
    return outputFactTable, outputDimTable1, outputDimTable2
Example #3
0
def generateDataFile2(dataSizeTableTwo):
	"""Generate the tbl2 test table and write it to data2_generated.csv.

	col2 is correlated with col1, col3 is low-cardinality (0..99, many
	duplicates), and col4 holds values near the top of the signed 32-bit
	range.

	Args:
		dataSizeTableTwo: number of rows to generate (positive int).

	Returns:
		The generated pandas DataFrame.
	"""
	outputFile = TEST_BASE_DIR + '/' + 'data2_generated.csv'
	header_line = data_gen_utils.generateHeaderLine('db1', 'tbl2', 4)
	# NumPy truncates float bounds toward zero, so for positive sizes
	# -(n // 2) and n // 2 reproduce the old float bounds exactly while
	# avoiding the deprecated float arguments to randint.
	half = dataSizeTableTwo // 2
	outputTable = pd.DataFrame(np.random.randint(-half, half, size=(dataSizeTableTwo, 4)),
	                           columns=['col1', 'col2', 'col3', 'col4'])
	outputTable['col2'] = outputTable['col2'] + outputTable['col1']
	# This is going to have many, many duplicates!!!!
	outputTable['col3'] = np.random.randint(0, 100, size=dataSizeTableTwo)
	# Pin int64: the exclusive high of 2**31 overflows the default dtype on
	# platforms where np.int_ is 32-bit (e.g. Windows).
	outputTable['col4'] = np.random.randint(2**31 - 10000, 2**31, size=dataSizeTableTwo, dtype=np.int64)
	outputTable.to_csv(outputFile, sep=',', index=False, header=header_line, line_terminator='\n')
	return outputTable
Example #4
0
def generateDataFileMidwayCheckin():
	"""Generate the two-column midway-checkin table and write it as a CSV.

	col1 is the sequence 0..999; col2 is 10..1009 shuffled with a fixed
	seed so the output is reproducible.

	Returns:
		The generated pandas DataFrame.
	"""
	csv_path = TEST_BASE_DIR + '/data1_generated.csv'
	header = data_gen_utils.generateHeaderLine('db1', 'tbl1', 2)
	keys = list(range(1000))
	values = list(range(10, 1010))
	#### For these 3 tests, the seed is exactly the same on the server.
	np.random.seed(47)
	np.random.shuffle(values)
	frame = pd.DataFrame({'col1': keys, 'col2': values})
	frame.to_csv(csv_path, sep=',', index=False, header=header, line_terminator='\n')
	return frame
Example #5
0
def generateDataMilestone2(dataSize):
    """Generate the milestone-2 batch test table and write it as a CSV.

    col1 is low-cardinality (0..999, many duplicates for large tables) and
    col4 is correlated with col1 (col4 = uniform + col1).

    Args:
        dataSize: number of rows to generate (positive int).

    Returns:
        The generated pandas DataFrame.
    """
    outputFile = TEST_BASE_DIR + '/data3_batch.csv'
    header_line = data_gen_utils.generateHeaderLine('db1', 'tbl3_batch', 4)
    # Floor division: a float upper bound to randint is deprecated and
    # raises in recent NumPy versions.
    outputTable = pd.DataFrame(np.random.randint(0,
                                                 dataSize // 5,
                                                 size=(dataSize, 4)),
                               columns=['col1', 'col2', 'col3', 'col4'])
    # This is going to have many, many duplicates for large tables!!!!
    outputTable['col1'] = np.random.randint(0, 1000, size=dataSize)
    outputTable['col4'] = np.random.randint(0, 10000, size=dataSize)
    outputTable['col4'] = outputTable['col4'] + outputTable['col1']
    outputTable.to_csv(outputFile,
                       sep=',',
                       index=False,
                       header=header_line,
                       line_terminator='\n')
    return outputTable