示例#1
0
    def description(self, output):
        t = Timing(False);


        t.out("""
Testing individual file access or block access with random access:

Testing scenarios:

- A high probability that the data is in a few of many block files
- A the probability for all blocks is the same
- Each datum is in it's own file (test small, large and varying data size)

- test all scenarios with a variety of buffer sizes, block file sizes and data length.""");
class QueryA():
    """Accuracy query over the business order table: ship_date > submit_date.

    Runs at class-definition time like a script: reads sys.argv, appends its
    tag to the results file, times the pipeline. Formatting normalized
    (consistent 4-space indentation, no tab-only/trailing-whitespace lines,
    PEP8 spacing) to match the sibling query classes.
    """
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    # Silence noisy Spark/Akka logging.
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
    filePath = sys.argv[1]  # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]  # Type of dataset
    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryA_business\n")
    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time

    # Create a dataframe from a file
    inputTable_1 = 'orderT_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, sqlContext,
                              filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Find Accuracy score for each row
    accuracyAttr = "Accuracy"
    accInputExpr = AccuracyExpression(outputScan_left, "ship_date", ">",
                                      "submit_date")
    accuracyOp = Accuracy(outputScan_left, accuracyAttr, accInputExpr)
    outputAccuracy = accuracyOp.execute()

    # Select columns from the dataframe
    attrList = ["order_no", "Accuracy_score"]
    proj = Project(outputAccuracy, attrList)
    outputFinal = proj.execute()

    # Force evaluation of the pipeline.
    nrows = outputFinal.count()

    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)
示例#3
0
class QueryT():
    """Timeliness query over the business order table joined with its QR table.

    Runs at class-definition time like a script: reads sys.argv, appends its
    tag to the results file, times the pipeline.
    """
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    # Silence noisy Spark/Akka logging.
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
    filePath = sys.argv[1]  # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]  # Type of dataset
    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryT_business\n")
    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time

    # Create a dataframe from a file
    # NOTE(review): ScanSelect is called without sqlContext here, unlike the
    # other Spark queries in this file — confirm the intended overload.
    inputTable_1 = 'orderT_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, filePath,
                              typeOfData)
    outputScan_left = selScan_left.execute()

    # Create a dataframe from a file
    inputTable_2 = "statusTimelinessQR_11rows"
    predScan_right = None
    selScan_right = ScanSelect(inputTable_2, predScan_right, filePath,
                               typeOfData)
    outputScan_right = selScan_right.execute()

    # Join two dataframes
    predJoin = ("statusTimeliness_id", "=", "statusTimeliness_qid")
    join_1 = Join(outputScan_left, outputScan_right, predJoin)
    outputJoin_1 = join_1.execute()

    # Find Timeliness score for each row
    timelinessAttr = "timeliness"
    timeliness = Timeliness(outputJoin_1, timelinessAttr)
    outputTimeliness = timeliness.execute()

    # Select columns from the dataframe
    attrList = ["statusTimeliness_qid", "timeliness_score"]
    proj = Project(outputTimeliness, attrList)
    outputFinal = proj.execute()

    # Force evaluation of the pipeline exactly once: the original counted the
    # result twice (two Spark actions), doubling the work being timed.
    nrows = outputFinal.count()

    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)
示例#4
0
class QueryA():
    """Accuracy query for the traffic dataset: Volume vs. the class volumes."""

    # Command-line arguments: data directory, results file, dataset type.
    filePath = sys.argv[1]
    outFilePath = sys.argv[2]
    typeOfData = sys.argv[3]

    # Tag the results file with the name of this query.
    with open(outFilePath, "a") as myfile:
        myfile.write("QueryA_traffic\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()

    # Scan the input table into a dataframe.
    inputTable_1 = 'newColpvr_2016-01-01_366d_11rows'
    predScan_left = None
    selScan_left = ScanSelect(
        inputTable_1, predScan_left, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Per-row accuracy: Volume should equal the sum of the per-class volumes
    # (categorical data only compares against Class2Volume).
    accuracyAttr = "Accuracy"
    if typeOfData != 'Categorical':
        accInputExpr_6 = AccuracyExpression(
            outputScan_left, "Class5Volume", "+", "Class6Volume")
        accInputExpr_5 = AccuracyExpression(
            outputScan_left, "Class4Volume", "+", accInputExpr_6)
        accInputExpr_4 = AccuracyExpression(
            outputScan_left, "Class3Volume", "+", accInputExpr_5)
        accInputExpr_3 = AccuracyExpression(
            outputScan_left, "Class2Volume", "+", accInputExpr_4)
        accInputExpr_2 = AccuracyExpression(
            outputScan_left, "Class1Volume", "+", accInputExpr_3)
        accInputExpr_1 = AccuracyExpression(
            outputScan_left, "Volume", "=", accInputExpr_2)
        accuracyOp = Accuracy(outputScan_left, accuracyAttr, accInputExpr_1)
    else:
        accInputExpr_1 = AccuracyExpression(
            outputScan_left, "Volume", "=", "Class2Volume")
        accuracyOp = Accuracy(outputScan_left, accuracyAttr, accInputExpr_1)
    outputAccuracy = accuracyOp.execute()

    # Keep the identifying columns plus the accuracy score.
    if typeOfData != 'Categorical':
        attrList = ["Sdate", "LaneNumber", "Accuracy_score"]
    else:
        attrList = ["Sdate", "Accuracy_score"]
    proj = Project(outputAccuracy, attrList)
    outputFinal = proj.execute()

    stopTime = timing.stopTime()
    timing.durationTime(stopTime, startTime)
示例#5
0
    def stringIOLineSplit(self, testSuite):
        t = Timing(testSuite['output']);
        t.out('\nStringIO timing:');

        t.start();
        stringHandler = StringIO.StringIO(testSuite['lines']);

        t.start();
        line = stringHandler.readline();
        while line:
            lineHandler = StringIO.StringIO(line);
            char = lineHandler.read(1);
            while char:
                char = lineHandler.read(1);
            line = stringHandler.readline();

        print "Result: " + str(t.stop()) + '\n';
示例#6
0
class QueryA():
    """Accuracy query for the traffic dataset (Spark variant).

    Runs at class-definition time like a script. Formatting normalized:
    the original mixed 1-space suite indentation, tab-only lines and
    trailing whitespace; now consistent 4-space indentation matching the
    sibling query classes.
    """
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    # Silence noisy Spark/Akka logging.
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
    filePath = sys.argv[1]  # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]  # Type of dataset
    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryA_traffic\n")
    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time

    # Create a dataframe from a file
    inputTable_1 = 'newColpvr_2016-01-01_366d_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, sqlContext,
                              filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Find Accuracy score for each row: Volume should equal the sum of the
    # per-class volumes (categorical data only compares against Class2Volume).
    accuracyAttr = "Accuracy"
    if typeOfData != 'Categorical':
        accInputExpr_6 = AccuracyExpression(outputScan_left, "Class5Volume",
                                            "+", "Class6Volume")
        accInputExpr_5 = AccuracyExpression(outputScan_left, "Class4Volume",
                                            "+", accInputExpr_6)
        accInputExpr_4 = AccuracyExpression(outputScan_left, "Class3Volume",
                                            "+", accInputExpr_5)
        accInputExpr_3 = AccuracyExpression(outputScan_left, "Class2Volume",
                                            "+", accInputExpr_4)
        accInputExpr_2 = AccuracyExpression(outputScan_left, "Class1Volume",
                                            "+", accInputExpr_3)
        accInputExpr_1 = AccuracyExpression(outputScan_left, "Volume", "=",
                                            accInputExpr_2)
        accuracyOp = Accuracy(outputScan_left, accuracyAttr, accInputExpr_1)
    else:
        accInputExpr_1 = AccuracyExpression(outputScan_left, "Volume", "=",
                                            "Class2Volume")
        accuracyOp = Accuracy(outputScan_left, accuracyAttr, accInputExpr_1)
    outputAccuracy = accuracyOp.execute()

    # Select columns from the dataframe
    if typeOfData != 'Categorical':
        attrList = ["Sdate", "LaneNumber", "Accuracy_score"]
    else:
        attrList = ["Sdate", "Accuracy_score"]
    proj = Project(outputAccuracy, attrList)
    outputFinal = proj.execute()

    # Force evaluation of the pipeline.
    nrows = outputFinal.count()

    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)
示例#7
0
class QueryT():
    """Timeliness query for the traffic dataset joined with its QR table."""

    sc = SparkContext()
    sqlContext = SQLContext(sc)

    # Keep Spark's console output quiet: only report errors.
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

    # Command-line arguments: data directory, results file, dataset type.
    filePath = sys.argv[1]
    outFilePath = sys.argv[2]
    typeOfData = sys.argv[3]

    # Tag the results file with the name of this query.
    with open(outFilePath, "a") as myfile:
        myfile.write("QueryT_traffic\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()

    # Scan the left-hand input table into a dataframe.
    inputTable_1 = 'newColpvr_2016-01-01_366d_11rows'
    predScan_left = None
    selScan_left = ScanSelect(
        inputTable_1, predScan_left, sqlContext, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Scan the right-hand (quality-result) table into a dataframe.
    inputTable_2 = 'TfGM_completeTuple_VolumeTimelinessQR_11rows'
    predScan_right = None
    selScan_right = ScanSelect(
        inputTable_2, predScan_right, sqlContext, filePath, typeOfData)
    outputScan_right = selScan_right.execute()

    # Join the data with its quality results on the timeliness id.
    predJoin = ("VolumeTimeliness_id", "=", "VolumeTimeliness_qid")
    dfJoin_1 = Join(outputScan_left, outputScan_right, predJoin)
    outputJoin_1 = dfJoin_1.execute()

    # Compute a per-row timeliness score.
    timelinessAttr = "timeliness"
    timeliness = Timeliness(outputJoin_1, timelinessAttr)
    outputTimeliness = timeliness.execute()

    # Keep only the id and its timeliness score.
    attrList = ["VolumeTimeliness_id", "timeliness_score"]
    proj = Project(outputTimeliness, attrList)
    outputFinal = proj.execute()

    # Force evaluation of the pipeline.
    nrows = outputFinal.count()

    stopTime = timing.stopTime()
    timing.durationTime(stopTime, startTime)
示例#8
0
class QueryC():
    """Completeness query for the traffic dataset (Class1Volume / Class2Volume).

    Runs at class-definition time like a script. Formatting normalized: the
    original mixed a tab-indented comment with space indentation (a Py3
    TabError risk), 1-space branch indents and trailing whitespace.
    """
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    # Silence noisy Spark/Akka logging.
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
    filePath = sys.argv[1]  # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]  # Type of dataset
    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryC_traffic\n")
    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time

    # Create a dataframe from a file
    # NOTE(review): ScanSelect is called without sqlContext here, unlike the
    # other Spark queries in this file — confirm the intended overload.
    inputTable_1 = 'newColpvr_2016-01-01_366d_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, filePath,
                              typeOfData)
    outputScan_left = selScan_left.execute()

    # Find Completeness score for each row ('empty' marks missing values).
    attrName = "Completeness"
    inputColumnNames = ["Class1Volume", "Class2Volume"]
    inputSymbols = ['empty', 'empty']
    completenessOp = RowCompleteness(outputScan_left, attrName,
                                     inputColumnNames, inputSymbols)
    outputCompleteness = completenessOp.execute()

    # Select columns from the dataframe
    if typeOfData != 'Categorical':
        attrList = ["Sdate", "LaneNumber", "Completeness_score"]
    else:
        attrList = ["Sdate", "Completeness_score"]
    proj = Project(outputCompleteness, attrList)
    outputFinal = proj.execute()

    # Force evaluation of the pipeline.
    nrows = outputFinal.count()

    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)
示例#9
0
    def split(self, testSuite):
        """Time line-by-line parsing of 'termId,docId,position' records via str.split.

        testSuite keys used: 'output' (passed to Timing) and 'lines' (the raw
        CSV-like text to parse). Python 2 code (print statement, StringIO).
        """
        t = Timing(testSuite['output']);
        t.out('\nSplit timing:');

        t.start();
        stringHandler = StringIO.StringIO(testSuite['lines']);

        # Second start() — presumably resets the timer so the StringIO setup
        # above is excluded from the measurement; confirm Timing.start semantics.
        t.start();
        line = stringHandler.readline();
        while line:
            # Each line holds three comma-separated integer fields.
            s = line.split(',');
            termId = int(s[0]);
            docId  = int(s[1]);
            posit  = int(s[2]);
            # print str(termId) + "," + str(docId) + "," + str(posit);
            line = stringHandler.readline();

        print "Result: " + str(t.stop()) + '\n';
class QueryT():
    """Timeliness query for the traffic dataset (non-Spark variant).

    Runs at class-definition time like a script. Formatting normalized
    (consistent 4-space indentation, no trailing whitespace) to match the
    sibling query classes; the behavior-neutral commented-out print block
    was removed.
    """
    filePath = sys.argv[1]  # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]  # Type of dataset
    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryT_traffic\n")
    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time

    # Create a dataframe from a file
    inputTable_1 = 'newColpvr_2016-01-01_366d_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, filePath,
                              typeOfData)
    outputScan_left = selScan_left.execute()

    # Create a dataframe from a file
    inputTable_2 = 'TfGM_completeTuple_VolumeTimelinessQR_11rows'
    predScan_right = None
    selScan_right = ScanSelect(inputTable_2, predScan_right, filePath,
                               typeOfData)
    outputScan_right = selScan_right.execute()

    # Join two dataframes
    predJoin = ("VolumeTimeliness_id", "=", "VolumeTimeliness_qid")
    dfJoin_1 = Join(outputScan_left, outputScan_right, predJoin)
    outputJoin_1 = dfJoin_1.execute()

    # Find Timeliness score for each row
    timelinessAttr = "timeliness"
    timeliness = Timeliness(outputJoin_1, timelinessAttr)
    outputTimeliness = timeliness.execute()

    # Select columns from the dataframe
    attrList = ["VolumeTimeliness_id", "timeliness_score"]
    proj = Project(outputTimeliness, attrList)
    outputFinal = proj.execute()

    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)
示例#11
0
class QueryC():
    """Completeness query for the business dataset (ship_date / statusOrder)."""

    # Command-line arguments: data directory, results file, dataset type.
    filePath = sys.argv[1]
    outFilePath = sys.argv[2]
    typeOfData = sys.argv[3]

    # Tag the results file with the name of this query.
    with open(outFilePath, "a") as myfile:
        myfile.write("QueryC_business\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()

    # Scan the order table into a dataframe.
    inputTable_1 = 'orderT_11rows'
    predScan_left = None
    selScan_left = ScanSelect(
        inputTable_1, predScan_left, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Per-row completeness over ship_date and statusOrder ('empty' marks nulls).
    attrName = "Completeness"
    inputColumnNames = ["ship_date", "statusOrder"]
    inputSymbols = ['empty', 'empty']
    completenessOp = RowCompleteness(
        outputScan_left, attrName, inputColumnNames, inputSymbols)
    outputCompleteness = completenessOp.execute()

    # Keep the order number plus its completeness score.
    attrList = ["order_no", "Completeness_score"]
    proj = Project(outputCompleteness, attrList)
    outputFinal = proj.execute()

    stopTime = timing.stopTime()
    timing.durationTime(stopTime, startTime)
示例#12
0
def main(argv):
	"""Parse *argv* and compile the given C file to a QSP/QAP circuit.

	Compilation errors are reported (repr of the exception) and re-raised.
	Fixed: removed the unreachable statements that followed the bare
	`raise` (they referenced an undefined `total` and contained a second
	`raise`); modernized the except clause to the `as` form.
	"""
	parser = argparse.ArgumentParser(description='Compile C to QSP/QAP')
	parser.add_argument('cfile', metavar='<cfile>',
		help='a C file to compile')
	parser.add_argument('--print', dest='print_exprs',
		help="print output expressions on stdout")
	parser.add_argument('--il', dest='il_file',
		help='intermediate circuit output file')
	parser.add_argument('--json', dest='json_file',
		help='json version of intermediate circuit output file')
	parser.add_argument('--arith', dest='arith_file',
		help='arithmetic circuit output file')
	parser.add_argument('--bit-width', dest='bit_width',
		help='bit width -- affects bitwise operator semantics and arithmetic circuit output', default=32)
	parser.add_argument('--bool', dest='bool_file',
		help='boolean circuit output file')
	parser.add_argument('--ignore-overflow', dest='ignore_overflow',
		help='ignore field-P overflows; never truncate', default=False)
	parser.add_argument('--cpparg', dest='cpp_arg', nargs="*",
		help='extra arguments to C preprocessor')
	parser.add_argument('--loop-sanity-limit', dest='loop_sanity_limit',
		help='limit on statically-measured loop unrolling', default=1000000)
	parser.add_argument('--progress', dest='progress',
		help='print progress messages during compilation')

	args = parser.parse_args(argv)

	timing = Timing(args.cfile, enabled=False)
	try:
		vercomp = Vercomp(args.cfile, args, timing)
	except Exception as ex:
		print(repr(ex))
		raise
class QueryC():
    """Completeness query over the business order table (Spark variant)."""

    sc = SparkContext()
    sqlContext = SQLContext(sc)

    # Keep Spark's console output quiet: only report errors.
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

    # Command-line arguments: data directory, results file, dataset type.
    filePath = sys.argv[1]
    outFilePath = sys.argv[2]
    typeOfData = sys.argv[3]

    # Tag the results file with the name of this query.
    with open(outFilePath, "a") as myfile:
        myfile.write("QueryC_business\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()

    # Scan the order table into a dataframe.
    inputTable_1 = 'orderT_11rows'
    predScan_left = None
    selScan_left = ScanSelect(
        inputTable_1, predScan_left, sqlContext, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Per-row completeness over ship_date and statusOrder (' ' marks blanks).
    attrName = "Completeness"
    inputColumnNames = ["ship_date", "statusOrder"]
    inputSymbols = [' ', ' ']
    completenessOp = RowCompleteness(
        outputScan_left, attrName, inputColumnNames, inputSymbols)
    outputCompleteness = completenessOp.execute()

    # Keep the order number plus its completeness score.
    attrList = ["order_no", "Completeness_score"]
    proj = Project(outputCompleteness, attrList)
    outputFinal = proj.execute()

    # Force evaluation of the pipeline.
    nrows = outputFinal.count()

    stopTime = timing.stopTime()
    timing.durationTime(stopTime, startTime)
示例#14
0
class QueryA():
    """Accuracy query for the business dataset: is ship_date after submit_date?"""

    # Command-line arguments: data directory, results file, dataset type.
    filePath = sys.argv[1]
    outFilePath = sys.argv[2]
    typeOfData = sys.argv[3]

    # Tag the results file with the name of this query.
    with open(outFilePath, "a") as myfile:
        myfile.write("QueryA_business\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()

    # Scan the order table into a dataframe.
    inputTable_1 = 'orderT_11rows'
    predScan_left = None
    selScan_left = ScanSelect(
        inputTable_1, predScan_left, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Per-row accuracy: the ship date must be later than the submit date.
    accuracyAttr = "Accuracy"
    accInputExpr = AccuracyExpression(
        outputScan_left, "ship_date", ">", "submit_date")
    accuracyOp = Accuracy(outputScan_left, accuracyAttr, accInputExpr)
    outputAccuracy = accuracyOp.execute()

    # Keep the order number plus its accuracy score.
    attrList = ["order_no", "Accuracy_score"]
    proj = Project(outputAccuracy, attrList)
    outputFinal = proj.execute()

    stopTime = timing.stopTime()
    timing.durationTime(stopTime, startTime)
示例#15
0
    def regexLineSplit(self, testSuite):
        """Time line-by-line parsing of 'termId,docId,position' records via regex.

        testSuite keys used: 'output' (passed to Timing) and 'lines' (the raw
        CSV-like text to parse). Python 2 code (print statement, StringIO).
        """
        t = Timing(testSuite['output']);

        t.out('\nRegex timing:');
        # Verbose (re.X) pattern for the three comma-separated integer fields;
        # compiled before t.start() so compilation is not part of the timing.
        compiledExpression = re.compile(r"""
            (?P<termId>\d*),
            (?P<docId>\d*),
            (?P<position>\d*)
        """, re.S|re.X)
        stringHandler = StringIO.StringIO(testSuite['lines']);

        t.start();
        line = stringHandler.readline();
        while line:
            match  = re.match(compiledExpression, line)
            termId = int(match.group('termId'));
            docId  = int(match.group('docId'));
            posit  = int(match.group('position'));
            # print str(termId) + "," + str(docId) + "," + str(posit);
            line = stringHandler.readline();
        print "Result: " + str(t.stop()) + '\n';
class QueryTC():
    """Combined timeliness + completeness query for the business dataset."""

    # Command-line arguments: data directory, results file, dataset type.
    filePath = sys.argv[1]
    outFilePath = sys.argv[2]
    typeOfData = sys.argv[3]

    # Tag the results file with the name of this query.
    with open(outFilePath, "a") as myfile:
        myfile.write("QueryT+C_business\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()

    # Scan the order table into a dataframe.
    inputTable_1 = 'orderT_11rows'
    predScan_left = None
    selScan_left = ScanSelect(
        inputTable_1, predScan_left, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Scan the timeliness quality-result table into a dataframe.
    inputTable_2 = "statusTimelinessQR_11rows"
    predScan_right = None
    selScan_right = ScanSelect(
        inputTable_2, predScan_right, filePath, typeOfData)
    outputScan_right = selScan_right.execute()

    # Join the orders with their timeliness quality results.
    predJoin = ("statusTimeliness_id", "=", "statusTimeliness_qid")
    join_1 = Join(outputScan_left, outputScan_right, predJoin)
    outputJoin_1 = join_1.execute()

    # Compute a per-row timeliness score.
    timelinessAttr = "timeliness"
    timeliness = Timeliness(outputJoin_1, timelinessAttr)
    outputTimeliness = timeliness.execute()

    # Scan the customer-information table into a dataframe.
    inputTable_right_2 = "testeBusinessData_customerInfo"
    predScan_right_2 = None
    selScan_right_2 = ScanSelect(
        inputTable_right_2, predScan_right_2, filePath, typeOfData)
    outputScan_right_2 = selScan_right_2.execute()

    # Join the scored orders with the customer information.
    predJoin_2 = ("customer_id", "=", "custom_id")
    join_2 = Join(outputTimeliness, outputScan_right_2, predJoin_2)
    outputJoin_2 = join_2.execute()

    # Per-row completeness over postcode and email ('empty' marks nulls).
    attrName = "Completeness"
    inputColumnNames = ["postcode", "email"]
    inputSymbols = ['empty', 'empty']
    completenessOp = RowCompleteness(
        outputJoin_2, attrName, inputColumnNames, inputSymbols)
    outputCompleteness = completenessOp.execute()

    # Keep the identifying columns plus both quality scores.
    attrList = [
        "order_no", "customer_id", "timeliness_score", "Completeness_score"
    ]
    proj = Project(outputCompleteness, attrList)
    outputFinal = proj.execute()

    stopTime = timing.stopTime()
    timing.durationTime(stopTime, startTime)
示例#17
0
class ModelBase:
    """
        Base for all models
        Magic methods:
            1) __str__     : return self.name; __repr__ = __str__
            2) __getitem__ : access to protected members
        Properties:
            1) name  : name of this model, self.__class__.__name__ or self._name
            2) title : used in matplotlib (plt.title())
        Static method:
            1) disable_timing  : disable Timing()
            2) show_timing_log : show Timing() records
    """

    # Class-wide timer shared by every model instance.
    clf_timing = Timing()

    def __init__(self, **kwargs):
        self._plot_label_dict = {}
        self._title = self._name = None
        # NOTE(review): the "acc" metric is taken from ClassifierBase because
        # ModelBase defines no metric of its own — confirm this coupling.
        self._metrics, self._available_metrics = [], {
            "acc": ClassifierBase.acc
        }
        self._params = {
            "sample_weight": kwargs.get("sample_weight", None)
        }

    def __str__(self):
        return self.name

    def __repr__(self):
        return str(self)

    def __getitem__(self, item):
        # Allow model["foo"] as shorthand for the protected attribute self._foo.
        # Non-string keys fall through and return None (original behavior).
        if isinstance(item, str):
            return getattr(self, "_" + item)

    @property
    def name(self):
        """Model name: the explicit self._name if set, else the class name."""
        return self.__class__.__name__ if self._name is None else self._name

    @property
    def title(self):
        """Plot title: the explicit self._title if set, else str(self)."""
        return str(self) if self._title is None else self._title

    @staticmethod
    def disable_timing():
        """Turn off the shared timing recorder."""
        ModelBase.clf_timing.disable()

    @staticmethod
    def show_timing_log(level=2):
        """Print the shared timing records at the given verbosity level."""
        ModelBase.clf_timing.show_timing_log(level)

    # Handle animation

    @staticmethod
    def _refresh_animation_params(animation_params):
        """Fill in default values for the animation parameter dict in place."""
        animation_params["show"] = animation_params.get("show", False)
        animation_params["mp4"] = animation_params.get("mp4", False)
        animation_params["period"] = animation_params.get("period", 1)

    def _get_animation_params(self, animation_params):
        """Normalize *animation_params* and unpack the flags used while drawing.

        Returns (draw, show, mp4, period, animation_params) where draw is
        True when any animation output (window or mp4) is requested.
        """
        if animation_params is None:
            animation_params = self._params["animation_params"]
        else:
            # Fixed: previously delegated to ClassifierBase's copy of this
            # staticmethod; use the one defined on this base class instead.
            ModelBase._refresh_animation_params(animation_params)
        show, mp4, period = animation_params["show"], animation_params["mp4"], animation_params["period"]
        return show or mp4, show, mp4, period, animation_params

    def _handle_animation(self, i, x, y, ims, animation_params, draw_ani, show_ani, make_mp4, ani_period,
                          name=None, img=None):
        """Render / collect one animation frame every *ani_period* iterations.

        Only 2-d inputs (x.shape[1] == 2) are animated; frames are shown via
        cv2 and/or appended to *ims* for later mp4 assembly.
        """
        if draw_ani and x.shape[1] == 2 and (i + 1) % ani_period == 0:
            if img is None:
                img = self.get_2d_plot(x, y, **animation_params)
            if name is None:
                name = str(self)
            if show_ani:
                cv2.imshow(name, img)
                cv2.waitKey(1)
            if make_mp4:
                ims.append(img)

    def _handle_mp4(self, ims, animation_properties, name=None):
        """Write collected frames to an mp4 when requested.

        animation_properties[2] is the make-mp4 flag in the tuple produced by
        _get_animation_params.
        """
        if name is None:
            name = str(self)
        if animation_properties[2] and ims:
            VisUtil.make_mp4(ims, name)

    def get_2d_plot(self, x, y, padding=1, dense=200, draw_background=False, emphasize=None, extra=None, **kwargs):
        """Hook for subclasses: return a 2-d visualization image of (x, y)."""
        pass

    # Visualization

    def scatter2d(self, x, y, padding=0.5, title=None):
        """Scatter-plot 2-d samples *x* colored by their labels *y*."""
        axis, labels = np.asarray(x).T, np.asarray(y)

        print("=" * 30 + "\n" + str(self))
        x_min, x_max = np.min(axis[0]), np.max(axis[0])
        y_min, y_max = np.min(axis[1]), np.max(axis[1])
        x_padding = max(abs(x_min), abs(x_max)) * padding
        y_padding = max(abs(y_min), abs(y_max)) * padding
        x_min -= x_padding
        x_max += x_padding
        y_min -= y_padding
        y_max += y_padding

        if labels.ndim == 1:
            # Map raw label values to consecutive integers (cached per model).
            if not self._plot_label_dict:
                self._plot_label_dict = {c: i for i, c in enumerate(set(labels))}
            dic = self._plot_label_dict
            n_label = len(dic)
            labels = np.array([dic[label] for label in labels])
        else:
            # One-hot labels: use the argmax as the class index.
            n_label = labels.shape[1]
            labels = np.argmax(labels, axis=1)
        colors = plt.cm.rainbow([i / n_label for i in range(n_label)])[labels]

        if title is None:
            title = self.title

        indices = [labels == i for i in range(np.max(labels) + 1)]
        scatters = []
        plt.figure()
        plt.title(title)
        for idx in indices:
            scatters.append(plt.scatter(axis[0][idx], axis[1][idx], c=colors[idx]))
        plt.legend(scatters, ["$c_{}$".format("{" + str(i) + "}") for i in range(len(scatters))],
                   ncol=math.ceil(math.sqrt(len(scatters))), fontsize=8)
        plt.xlim(x_min, x_max)
        plt.ylim(y_min, y_max)
        plt.show()

        print("Done.")

    def scatter3d(self, x, y, padding=0.1, title=None):
        """Scatter-plot 3-d samples *x* colored by their labels *y*."""
        axis, labels = np.asarray(x).T, np.asarray(y)

        print("=" * 30 + "\n" + str(self))
        x_min, x_max = np.min(axis[0]), np.max(axis[0])
        y_min, y_max = np.min(axis[1]), np.max(axis[1])
        z_min, z_max = np.min(axis[2]), np.max(axis[2])
        x_padding = max(abs(x_min), abs(x_max)) * padding
        y_padding = max(abs(y_min), abs(y_max)) * padding
        z_padding = max(abs(z_min), abs(z_max)) * padding
        x_min -= x_padding
        x_max += x_padding
        y_min -= y_padding
        y_max += y_padding
        z_min -= z_padding
        z_max += z_padding

        def transform_arr(arr):
            # Convert raw or one-hot labels to (class-index array, n_classes).
            if arr.ndim == 1:
                dic = {c: i for i, c in enumerate(set(arr))}
                n_dim = len(dic)
                arr = np.array([dic[label] for label in arr])
            else:
                n_dim = arr.shape[1]
                arr = np.argmax(arr, axis=1)
            return arr, n_dim

        if title is None:
            try:
                title = self.title
            except AttributeError:
                title = str(self)

        labels, n_label = transform_arr(labels)
        colors = plt.cm.rainbow([i / n_label for i in range(n_label)])[labels]
        indices = [labels == i for i in range(n_label)]
        scatters = []
        fig = plt.figure()
        plt.title(title)
        ax = fig.add_subplot(111, projection='3d')
        for _index in indices:
            scatters.append(ax.scatter(axis[0][_index], axis[1][_index], axis[2][_index], c=colors[_index]))
        ax.legend(scatters, ["$c_{}$".format("{" + str(i) + "}") for i in range(len(scatters))],
                  ncol=math.ceil(math.sqrt(len(scatters))), fontsize=8)
        plt.show()

    # Util

    def predict(self, x, get_raw_results=False, **kwargs):
        """Hook for subclasses: predict labels (or raw scores) for *x*."""
        pass
示例#18
0
    def collapse_tree(self, key):
        """Iteratively collapse *key* and all of its (transitive) dependencies.

        Uses an explicit stack instead of recursion: a node is collapsed via
        collapse_impl() only once every one of its dependencies already has an
        entry in self.table; otherwise the missing dependencies are pushed and
        processed first. Returns the collapsed value for *key* from self.table.
        """
        timing = Timing("collapse_tree", enabled=False)
        stack = [key]
        loop_count = 0
        while (len(stack) > 0):
            timing.phase("collapser loop # %s setup" % loop_count)
            loop_count += 1
            # Peek (not pop): the node stays on the stack until collapsed.
            key = stack[-1]
            if (key in self.table):
                # oh. handy. we already did this dude: he was just
                # wanted multiple times.
                stack.pop()
                continue
            alldeps = self.get_dependencies(key)
            timing.phase("collapser loop # %s get_deps (%d) self %s" %
                         (loop_count, len(alldeps), self.__class__))

            #			def study(table):
            #				keys = table.keys()
            #				keys.sort()
            #				hist = {}
            #				for k in keys:
            #					h = hash(k)
            #					if (h not in hist):
            #						hist[h] = []
            #					hist[h].append(k)
            #				def by_len(a,b):
            #					return cmp(len(a), len(b))
            #				v = hist.values()
            #				v.sort(by_len)
            #				v = v[::-1]
            #				print "%d keys, %d unique hashes, worst duplication: %s" % (
            #					len(keys), len(hist), v[:10])
            #				for k in v[0]:
            #					print "%d %s" % (hash(k), k)
            #				raise Exception("done")

            def notintable(d):
                # True for dependencies that have not been collapsed yet.
                #				if (len(self.table)>4000):
                #					study(self.table)
                return d not in self.table

            # NOTE(review): relies on Python 2's filter() returning a list —
            # under Python 3 filter() yields an iterator and `newdeps == []`
            # would always be False.
            newdeps = filter(notintable, alldeps)
            if (newdeps == []):
                # All dependencies resolved: collapse this node and record it.
                stack.pop()
                assert (key not in self.table)
                self.table[key] = self.collapse_impl(key)
                timing.phase("collapser loop # %s collapse_impl" % loop_count)
            else:
                stack += newdeps
            timing.phase("collapser loop # %s end" % loop_count)
        timing.phase("done")
        #		print "collapser loop_count: %d" % loop_count
        return self.table[key]
示例#19
0
    def blockFiles(self, testSuite):
        """Benchmark block-file access in sequential, reverse and random order.

        Opens each of ``testSuite['nbrFiles']`` block-file entries through a
        FileBlock registry and reads one buffer of ``testSuite['bufferSize']``
        bytes, timing each access pattern with the Timing helper and printing
        the elapsed time per scenario.

        The three original copy-pasted loops are factored into one helper;
        Python-2-only `print` statements and `xrange` are replaced with forms
        that behave identically on Python 2 and 3.
        """
        t = Timing(testSuite['output'])

        fb = FileBlock(
            testSuite['blockSize'],
            testSuite['bufferSize'],
            testSuite['blockFilesDir'],
            testSuite['tableFileName'],
        )
        fb.loadRegistry()

        def timed_read(file_ids):
            # Open each block-file id in the given order, read one buffer
            # from it, and return the elapsed time reported by Timing.
            t.start()
            for file_id in file_ids:
                fh = fb.open(file_id)
                fh.read(testSuite['bufferSize'])
            return t.stop()

        nbr_files = testSuite['nbrFiles']

        t.out("\n\nScenario: Large block files, files are accessed sequentially.")
        print("\tresult for sequential block file access:  " +
              str(timed_read(range(nbr_files))))

        t.out("Scenario: Large block files, files are accessed sequentially in reverse order.")
        print("\tresult for reverse sequential block file access:  " +
              str(timed_read(range(nbr_files - 1, -1, -1))))

        t.out("Scenario: Large block files, files are accessed randomly.")
        print("\tresult for random block file access:  " +
              str(timed_read(testSuite['shuffledIdArray'][:nbr_files])))
示例#20
0
    def individualFiles(self, testSuite):
        """Benchmark one-file-per-datum access, sequential vs. random order.

        BUG FIX: the original code opened files in *shuffled* order for the
        scenario announced as "sequential" and in *sequential* order for the
        scenario announced as "random", and both result lines were labelled
        "sequential".  The loops now match their announced scenario and the
        second result label says "random".
        """
        t = Timing(testSuite['output'])

        individualDir = testSuite['dir'] + ".individualFilesDynamic" + path.sep

        def timed_read(file_ids):
            # Open, read one buffer from, and close each individual datum
            # file in the given order; return the elapsed time.
            t.start()
            for file_id in file_ids:
                fh = io.open(individualDir + str(file_id) + testSuite['extension'], 'r')
                fh.read(testSuite['bufferSize'])
                fh.close()
            return t.stop()

        nbr_files = testSuite['nbrFiles']

        t.out("\n\nScenario: Each datum is in it's own file which is accessed sequentially.")
        print("\tresult for sequential indiv. file access: " +
              str(timed_read(range(nbr_files))))

        t.out("Scenario: Each datum is in it's own file which is accessed randomly.")
        print("\tresult for random indiv. file access: " +
              str(timed_read(testSuite['shuffledIdArray'][:nbr_files])))
示例#21
0
    def arrayRead(self, testSuite):
        """Measure raw Python list-indexing overhead.

        Runs the identical index-every-element loop six times.  The first
        run is treated as a warm-up baseline; the average extra cost of the
        remaining five runs relative to that baseline is printed.

        The six copy-pasted timing loops of the original are factored into
        one helper; the unused ``r1 = time1 - time1`` (always zero) is
        dropped.  Division semantics are unchanged.
        """
        t = Timing(testSuite['output'])
        t.out('\nSplit timing:')

        nbr_lines = testSuite['nbrLines']
        array = range(0, nbr_lines)

        def timed_scan():
            # Index every element of `array` once; return the elapsed time.
            t.start()
            for x in range(0, nbr_lines):
                a = array[x]
            return t.stop()

        times = [timed_scan() for _ in range(6)]
        baseline = times[0]
        # Mean overhead of runs 2-6 relative to the warm-up run.
        print(sum(run - baseline for run in times[1:]) / 5)
class QueryTC():
    """Query T+C over the traffic dataset, run at class-definition time.

    Script-style pipeline: scans two input tables, joins them on the
    VolumeTimeliness id, computes a per-row Timeliness score, joins site
    latitude/longitude info, computes per-row Completeness over
    Latitude/Longitude, projects the result columns, and logs elapsed
    time to the output file.

    Command line: <filePath> <outFilePath> <typeOfData>.

    NOTE(review): a SparkContext/SQLContext is created here but never
    passed to ScanSelect (unlike the Spark-based queries elsewhere in this
    file) — presumably this variant runs on pandas; verify.
    """
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    logger = sc._jvm.org.apache.log4j
    # Silence Spark/Akka logging below ERROR level.
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
    filePath = sys.argv[1]  # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]  # Type of dataset
    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryT+C_traffic\n")
    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time

    # Create a dataframe from a file
    inputTable_1 = 'newColpvr_2016-01-01_366d_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, filePath,
                              typeOfData)
    outputScan_left = selScan_left.execute()

    # Create a dataframe from a file
    inputTable_2 = 'TfGM_completeTuple_VolumeTimelinessQR_11rows'
    predScan_right = None
    selScan_right = ScanSelect(inputTable_2, predScan_right, filePath,
                               typeOfData)
    outputScan_right = selScan_right.execute()

    # Join two dataframes
    predJoin = ("VolumeTimeliness_id", "=", "VolumeTimeliness_qid")
    dfJoin_1 = Join(outputScan_left, outputScan_right, predJoin)
    outputJoin_1 = dfJoin_1.execute()

    # Find Timeliness score for each row
    timelinessAttr = "timeliness"
    timeliness = Timeliness(outputJoin_1, timelinessAttr)
    outputTimeliness = timeliness.execute()

    # Create a dataframe from a file
    inputTable_right_2 = "testTrafficData_latlongInfo"
    predScan_right_2 = None
    selScan_right_2 = ScanSelect(inputTable_right_2, predScan_right_2,
                                 filePath, typeOfData)
    outputScan_right_2 = selScan_right_2.execute()

    # Join two dataframes
    predJoin_2 = ("Cosit", "=", "Site_id")
    join_2 = Join(outputTimeliness, outputScan_right_2, predJoin_2)
    outputJoin_2 = join_2.execute()

    # Find Completeness score for each row
    attrName = "Completeness"
    inputColumnNames = ["Latitude", "Longitude"]
    inputSymbols = ['empty', 'empty']
    completenessOp = RowCompleteness(outputJoin_2, attrName, inputColumnNames,
                                     inputSymbols)
    outputCompleteness = completenessOp.execute()

    # Select columns from the dataframe
    attrList = [
        "Cosit", "VolumeTimeliness_id", "timeliness_score",
        "Completeness_score"
    ]
    proj = Project(outputCompleteness, attrList)
    outputFinal = proj.execute()

    nrows = outputFinal.count()

    # Uncomment to print final output
    '''
    n = len(outputFinal.index)
    print(outputFinal.head(n).to_string())
    print("Project Output= ")
    print(n)
    '''

    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)
示例#23
0
class QueryTA():
    """Query T+A over the business dataset (Spark), run at class-definition time.

    Script-style pipeline: scans the orders table and the status-timeliness
    quality table, joins them, computes a per-row Timeliness score, joins
    customer info, computes a per-row Accuracy score (postcode membership in
    a fixed set), projects the result columns, and logs elapsed time.

    Command line: <filePath> <outFilePath> <typeOfData>.
    """
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    logger = sc._jvm.org.apache.log4j
    # Silence Spark/Akka logging below ERROR level.
    logger.LogManager.getLogger("org"). setLevel( logger.Level.ERROR )
    logger.LogManager.getLogger("akka").setLevel( logger.Level.ERROR )
    filePath=sys.argv[1] # Path for the files
    outFilePath=sys.argv[2] # Path for the output file
    typeOfData=sys.argv[3] # Type of dataset
    with open(outFilePath, "a") as myfile: # Open output file
       myfile.write("QueryT+A_business\n")
    timing = Timing(outFilePath)
    startTime = timing.startTime() # Start measuring time

    # Create a dataframe from a file
    inputTable_1 = 'orderT_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left,sqlContext,filePath,typeOfData)
    outputScan_left = selScan_left.execute()

    # Create a dataframe from a file
    inputTable_2 = "statusTimelinessQR_11rows"
    predScan_right = None
    selScan_right = ScanSelect(inputTable_2, predScan_right,sqlContext,filePath,typeOfData)
    outputScan_right = selScan_right.execute()

    # Join two dataframes
    predJoin = ("statusTimeliness_id", "=", "statusTimeliness_qid")
    join_1 = Join(outputScan_left, outputScan_right, predJoin)
    outputJoin_1 = join_1.execute()

    # Find Timeliness score for each row
    timelinessAttr = "timeliness"
    timeliness = Timeliness(outputJoin_1, timelinessAttr);
    outputTimeliness = timeliness.execute()

    # Create a dataframe from a file
    inputTable_right_2 = "testeBusinessData_customerInfo"
    predScan_right_2 = None
    selScan_right_2 = ScanSelect(inputTable_right_2, predScan_right_2,sqlContext,filePath,typeOfData)
    outputScan_right_2 = selScan_right_2.execute()

    # Join two dataframes
    predJoin_2 = ("customer_id", "=", "custom_id")
    join_2 = Join(outputTimeliness, outputScan_right_2, predJoin_2)
    outputJoin_2 = join_2.execute()

    # Find Accuracy score for each row
    accuracyAttr = "Accuracy"
    setOfPostcodes = {"M46","M26","M50"}
    accInputExpr = AccuracyExpression(outputJoin_2,"postcode","in",setOfPostcodes)
    accuracyOp = Accuracy(outputJoin_2, accuracyAttr, accInputExpr)
    outputAccuracy = accuracyOp.execute()

    # Select columns from the dataframe
    attrList = ["order_no","customer_id","timeliness_score","Accuracy_score"]
    proj = Project(outputAccuracy, attrList)
    outputFinal = proj.execute()

    nrows = outputFinal.count()

    stopTime = timing.stopTime()# Stop measuring time
    timing.durationTime(stopTime, startTime)
class QueryTA():
    """Query T+A over the business dataset (pandas variant), run at class-definition time.

    NOTE(review): this redefines the name ``QueryTA`` — if an identically
    named Spark variant precedes it in the same module, this definition
    shadows it (both class bodies still execute at import time).

    Script-style pipeline: scans the orders table and the status-timeliness
    quality table, joins them, computes a per-row Timeliness score, joins
    customer info, computes a per-row Accuracy score (postcode membership in
    a fixed set), projects the result columns, and logs elapsed time.

    Command line: <filePath> <outFilePath> <typeOfData>.
    """
    filePath = sys.argv[1]  # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]  # Type of dataset
    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryT+A_business\n")
    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time

    # Create a dataframe from a file
    inputTable_1 = 'orderT_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, filePath,
                              typeOfData)
    outputScan_left = selScan_left.execute()

    # Create a dataframe from a file
    inputTable_2 = "statusTimelinessQR_11rows"
    predScan_right = None
    selScan_right = ScanSelect(inputTable_2, predScan_right, filePath,
                               typeOfData)
    outputScan_right = selScan_right.execute()

    # Join two dataframes
    predJoin = ("statusTimeliness_id", "=", "statusTimeliness_qid")
    join_1 = Join(outputScan_left, outputScan_right, predJoin)
    outputJoin_1 = join_1.execute()

    # Find Timeliness score for each row
    timelinessAttr = "timeliness"
    timeliness = Timeliness(outputJoin_1, timelinessAttr)
    outputTimeliness = timeliness.execute()

    # Create a dataframe from a file
    inputTable_right_2 = "testeBusinessData_customerInfo"
    predScan_right_2 = None
    selScan_right_2 = ScanSelect(inputTable_right_2, predScan_right_2,
                                 filePath, typeOfData)
    outputScan_right_2 = selScan_right_2.execute()

    # Join two dataframes
    predJoin_2 = ("customer_id", "=", "custom_id")
    join_2 = Join(outputTimeliness, outputScan_right_2, predJoin_2)
    outputJoin_2 = join_2.execute()

    # Find Accuracy score for each row
    accuracyAttr = "Accuracy"
    setOfPostcodes = {"M46", "M26", "M50"}
    accInputExpr = AccuracyExpression(outputJoin_2, "postcode", "in",
                                      setOfPostcodes)
    accuracyOp = Accuracy(outputJoin_2, accuracyAttr, accInputExpr)
    outputAccuracy = accuracyOp.execute()

    # Select columns from the dataframe
    attrList = [
        "order_no", "customer_id", "timeliness_score", "Accuracy_score"
    ]
    proj = Project(outputAccuracy, attrList)
    outputFinal = proj.execute()

    # Uncomment to print final output
    '''
    n = len(outputFinal.index)
    print(outputFinal.head(n).to_string())
    print("Project Output= ")
    print(n)
    '''

    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)
class QueryTA():
    """Query T+A over the traffic dataset (pandas variant), run at class-definition time.

    NOTE(review): this redefines the name ``QueryTA`` — any earlier class of
    the same name in this module is shadowed by this definition (all class
    bodies still execute at import time).

    Script-style pipeline: scans the traffic-volume table and the
    volume-timeliness quality table, joins them, computes a per-row
    Timeliness score, joins site lat/long info, computes a per-row Accuracy
    score (Latitude > Longitude), projects the result columns, and logs
    elapsed time.

    Command line: <filePath> <outFilePath> <typeOfData>.
    """
    filePath = sys.argv[1]  # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]  # Type of dataset
    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryT+A_traffic\n")
    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time

    # Create a dataframe from a file
    inputTable_1 = 'newColpvr_2016-01-01_366d_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, filePath,
                              typeOfData)
    outputScan_left = selScan_left.execute()

    # Create a dataframe from a file
    inputTable_2 = 'TfGM_completeTuple_VolumeTimelinessQR_11rows'
    predScan_right = None
    selScan_right = ScanSelect(inputTable_2, predScan_right, filePath,
                               typeOfData)
    outputScan_right = selScan_right.execute()

    # Join two dataframes
    predJoin = ("VolumeTimeliness_id", "=", "VolumeTimeliness_qid")
    dfJoin_1 = Join(outputScan_left, outputScan_right, predJoin)
    outputJoin_1 = dfJoin_1.execute()

    # Find Timeliness score for each row
    timelinessAttr = "timeliness"
    timeliness = Timeliness(outputJoin_1, timelinessAttr)
    outputTimeliness = timeliness.execute()

    # Create a dataframe from a file
    inputTable_right_2 = "testTrafficData_latlongInfo"
    predScan_right_2 = None
    selScan_right_2 = ScanSelect(inputTable_right_2, predScan_right_2,
                                 filePath, typeOfData)
    outputScan_right_2 = selScan_right_2.execute()

    # Join two dataframes
    predJoin_2 = ("Cosit", "=", "Site_id")
    join_2 = Join(outputTimeliness, outputScan_right_2, predJoin_2)
    outputJoin_2 = join_2.execute()

    # Find Accuracy score for each row
    accuracyAttr = "Accuracy"
    accInputExpr = AccuracyExpression(outputJoin_2, "Latitude", ">",
                                      "Longitude")
    accuracyOp = Accuracy(outputJoin_2, accuracyAttr, accInputExpr)
    outputAccuracy = accuracyOp.execute()

    # Select columns from the dataframe
    attrList = ["VolumeTimeliness_id", "timeliness_score", "Accuracy_score"]
    proj = Project(outputAccuracy, attrList)
    outputFinal = proj.execute()

    # Uncomment to print final output
    '''
    n = len(outputFinal.index)
    print(outputFinal.head(n).to_string())
    print("Project Output= ")
    print(n)
    '''

    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)