def description(self, output):
    """Print a summary of the scenarios exercised by this benchmark suite."""
    # NOTE(review): `output` is unused here; Timing is constructed with
    # False instead -- confirm whether that is intentional.
    timer = Timing(False)
    timer.out(""" Testing individual file access or block access with random access: Testing scenarios: - A high probability that the data is in a few of many block files - A the probability for all blocks is the same - Each datum is in it's own file (test small, large and varying data size) - test all scenarios with a variety of buffer sizes, block file sizes and data length.""")
class QueryA():
    """Accuracy query over the business order table (Spark variant).

    Scans orderT, scores each row on ship_date > submit_date, projects the
    order number plus the accuracy score, and logs the elapsed time.
    """
    sc = SparkContext()
    sqlContext = SQLContext(sc)

    # Silence Spark/Akka INFO chatter so only errors reach the console.
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

    filePath = sys.argv[1]     # path for the input files
    outFilePath = sys.argv[2]  # path for the output (timing) file
    typeOfData = sys.argv[3]   # type of dataset

    # Tag this run in the timing log.
    with open(outFilePath, "a") as myfile:
        myfile.write("QueryA_business\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # start measuring time

    # Scan: build a dataframe from the order table.
    inputTable_1 = 'orderT_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, sqlContext, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Accuracy: score each row on ship_date > submit_date.
    accuracyAttr = "Accuracy"
    accInputExpr = AccuracyExpression(outputScan_left, "ship_date", ">", "submit_date")
    accuracyOp = Accuracy(outputScan_left, accuracyAttr, accInputExpr)
    outputAccuracy = accuracyOp.execute()

    # Project: keep only the key and the score.
    attrList = ["order_no", "Accuracy_score"]
    proj = Project(outputAccuracy, attrList)
    outputFinal = proj.execute()

    nrows = outputFinal.count()  # count rows (also forces evaluation)

    stopTime = timing.stopTime()  # stop measuring time
    timing.durationTime(stopTime, startTime)
class QueryT():
    """Timeliness query over the business order table (Spark variant).

    Scans orderT and its timeliness quality-results table, joins them,
    scores timeliness per row, projects key + score, and logs elapsed time.
    """
    sc = SparkContext()
    sqlContext = SQLContext(sc)

    # Silence Spark/Akka INFO chatter so only errors reach the console.
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

    filePath = sys.argv[1]     # path for the input files
    outFilePath = sys.argv[2]  # path for the output (timing) file
    typeOfData = sys.argv[3]   # type of dataset

    # Tag this run in the timing log.
    with open(outFilePath, "a") as myfile:
        myfile.write("QueryT_business\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # start measuring time

    # Scan: build a dataframe from the order table.
    inputTable_1 = 'orderT_11rows'
    predScan_left = None
    # BUG FIX: sqlContext was missing from both ScanSelect calls, unlike
    # every other Spark query in this module which passes it.
    selScan_left = ScanSelect(inputTable_1, predScan_left, sqlContext, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Scan: build a dataframe from the timeliness quality-results table.
    inputTable_2 = "statusTimelinessQR_11rows"
    predScan_right = None
    selScan_right = ScanSelect(inputTable_2, predScan_right, sqlContext, filePath, typeOfData)
    outputScan_right = selScan_right.execute()

    # Join the two dataframes on the timeliness id.
    predJoin = ("statusTimeliness_id", "=", "statusTimeliness_qid")
    join_1 = Join(outputScan_left, outputScan_right, predJoin)
    outputJoin_1 = join_1.execute()

    # Find the Timeliness score for each row.
    timelinessAttr = "timeliness"
    timeliness = Timeliness(outputJoin_1, timelinessAttr)
    outputTimeliness = timeliness.execute()

    # Project: keep only the key and the score.
    attrList = ["statusTimeliness_qid", "timeliness_score"]
    proj = Project(outputTimeliness, attrList)
    outputFinal = proj.execute()

    # BUG FIX: the original counted outputFinal twice (before and after the
    # commented-out debug block), doubling the evaluation work; count once.
    nrows = outputFinal.count()

    # Uncomment to print final output
    '''
    n = len(outputFinal.index)
    print(outputFinal.head(n).to_string())
    print("Project Output= ")
    print(n)
    '''

    stopTime = timing.stopTime()  # stop measuring time
    timing.durationTime(stopTime, startTime)
class QueryA():
    """Accuracy query over the traffic volume table (non-Spark variant:
    ScanSelect here takes no SQL context).

    Checks Volume against the sum of the per-class volumes (or against
    Class2Volume for categorical data), projects the score, and logs time.
    """
    filePath = sys.argv[1]     # path for the input files
    outFilePath = sys.argv[2]  # path for the output (timing) file
    typeOfData = sys.argv[3]   # type of dataset

    # Tag this run in the timing log.
    with open(outFilePath, "a") as myfile:
        myfile.write("QueryA_traffic\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # start measuring time

    # Scan: build a dataframe from the traffic volume table.
    inputTable_1 = 'newColpvr_2016-01-01_366d_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Accuracy: Volume should equal Class1..Class6 volumes summed
    # (nested expressions build the sum right-to-left).
    accuracyAttr = "Accuracy"
    if (typeOfData != 'Categorical'):
        accInputExpr_6 = AccuracyExpression(outputScan_left, "Class5Volume", "+", "Class6Volume")
        accInputExpr_5 = AccuracyExpression(outputScan_left, "Class4Volume", "+", accInputExpr_6)
        accInputExpr_4 = AccuracyExpression(outputScan_left, "Class3Volume", "+", accInputExpr_5)
        accInputExpr_3 = AccuracyExpression(outputScan_left, "Class2Volume", "+", accInputExpr_4)
        accInputExpr_2 = AccuracyExpression(outputScan_left, "Class1Volume", "+", accInputExpr_3)
        accInputExpr_1 = AccuracyExpression(outputScan_left, "Volume", "=", accInputExpr_2)
        accuracyOp = Accuracy(outputScan_left, accuracyAttr, accInputExpr_1)
    else:
        # Categorical datasets compare Volume against a single column.
        accInputExpr_1 = AccuracyExpression(outputScan_left, "Volume", "=", "Class2Volume")
        accuracyOp = Accuracy(outputScan_left, accuracyAttr, accInputExpr_1)
    outputAccuracy = accuracyOp.execute()

    # Project: keep the identifying columns plus the score.
    if (typeOfData != 'Categorical'):
        attrList = ["Sdate", "LaneNumber", "Accuracy_score"]
    else:
        attrList = ["Sdate", "Accuracy_score"]
    proj = Project(outputAccuracy, attrList)
    outputFinal = proj.execute()

    # Uncomment to print final output
    '''
    n = len(outputFinal.index)
    print(outputFinal.head(n).to_string())
    print("Project Output= ")
    print(n)
    '''

    stopTime = timing.stopTime()  # stop measuring time
    timing.durationTime(stopTime, startTime)
def stringIOLineSplit(self, testSuite): t = Timing(testSuite['output']); t.out('\nStringIO timing:'); t.start(); stringHandler = StringIO.StringIO(testSuite['lines']); t.start(); line = stringHandler.readline(); while line: lineHandler = StringIO.StringIO(line); char = lineHandler.read(1); while char: char = lineHandler.read(1); line = stringHandler.readline(); print "Result: " + str(t.stop()) + '\n';
class QueryA():
    """Accuracy query over the traffic volume table (Spark variant).

    Checks Volume against the sum of the per-class volumes (or against
    Class2Volume for categorical data), projects the score, and logs time.
    """
    sc = SparkContext()
    sqlContext = SQLContext(sc)

    # Silence Spark/Akka INFO chatter so only errors reach the console.
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

    filePath = sys.argv[1]     # path for the input files
    outFilePath = sys.argv[2]  # path for the output (timing) file
    typeOfData = sys.argv[3]   # type of dataset

    # Tag this run in the timing log.
    with open(outFilePath, "a") as myfile:
        myfile.write("QueryA_traffic\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # start measuring time

    # Scan: build a dataframe from the traffic volume table.
    inputTable_1 = 'newColpvr_2016-01-01_366d_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, sqlContext, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Accuracy: Volume should equal Class1..Class6 volumes summed
    # (nested expressions build the sum right-to-left).
    accuracyAttr = "Accuracy"
    if (typeOfData != 'Categorical'):
        accInputExpr_6 = AccuracyExpression(outputScan_left, "Class5Volume", "+", "Class6Volume")
        accInputExpr_5 = AccuracyExpression(outputScan_left, "Class4Volume", "+", accInputExpr_6)
        accInputExpr_4 = AccuracyExpression(outputScan_left, "Class3Volume", "+", accInputExpr_5)
        accInputExpr_3 = AccuracyExpression(outputScan_left, "Class2Volume", "+", accInputExpr_4)
        accInputExpr_2 = AccuracyExpression(outputScan_left, "Class1Volume", "+", accInputExpr_3)
        accInputExpr_1 = AccuracyExpression(outputScan_left, "Volume", "=", accInputExpr_2)
        accuracyOp = Accuracy(outputScan_left, accuracyAttr, accInputExpr_1)
    else:
        # Categorical datasets compare Volume against a single column.
        accInputExpr_1 = AccuracyExpression(outputScan_left, "Volume", "=", "Class2Volume")
        accuracyOp = Accuracy(outputScan_left, accuracyAttr, accInputExpr_1)
    outputAccuracy = accuracyOp.execute()

    # Project: keep the identifying columns plus the score.
    if (typeOfData != 'Categorical'):
        attrList = ["Sdate", "LaneNumber", "Accuracy_score"]
    else:
        attrList = ["Sdate", "Accuracy_score"]
    proj = Project(outputAccuracy, attrList)
    outputFinal = proj.execute()

    nrows = outputFinal.count()  # count rows (also forces evaluation)

    stopTime = timing.stopTime()  # stop measuring time
    timing.durationTime(stopTime, startTime)
class QueryT():
    """Timeliness query over the traffic volume table (Spark variant).

    Scans the volume table and its timeliness quality-results table, joins
    them, scores timeliness per row, projects key + score, and logs time.
    """
    sc = SparkContext()
    sqlContext = SQLContext(sc)

    # Silence Spark/Akka INFO chatter so only errors reach the console.
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

    filePath = sys.argv[1]     # path for the input files
    outFilePath = sys.argv[2]  # path for the output (timing) file
    typeOfData = sys.argv[3]   # type of dataset

    # Tag this run in the timing log.
    with open(outFilePath, "a") as myfile:
        myfile.write("QueryT_traffic\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # start measuring time

    # Scan: build a dataframe from the traffic volume table.
    inputTable_1 = 'newColpvr_2016-01-01_366d_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, sqlContext, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Scan: build a dataframe from the timeliness quality-results table.
    inputTable_2 = 'TfGM_completeTuple_VolumeTimelinessQR_11rows'
    predScan_right = None
    selScan_right = ScanSelect(inputTable_2, predScan_right, sqlContext, filePath, typeOfData)
    outputScan_right = selScan_right.execute()

    # Join the two dataframes on the timeliness id.
    predJoin = ("VolumeTimeliness_id", "=", "VolumeTimeliness_qid")
    dfJoin_1 = Join(outputScan_left, outputScan_right, predJoin)
    outputJoin_1 = dfJoin_1.execute()

    # Find the Timeliness score for each row.
    timelinessAttr = "timeliness"
    timeliness = Timeliness(outputJoin_1, timelinessAttr)
    outputTimeliness = timeliness.execute()

    # Project: keep only the key and the score.
    attrList = ["VolumeTimeliness_id", "timeliness_score"]
    proj = Project(outputTimeliness, attrList)
    outputFinal = proj.execute()

    nrows = outputFinal.count()  # count rows (also forces evaluation)

    stopTime = timing.stopTime()  # stop measuring time
    timing.durationTime(stopTime, startTime)
class QueryC():
    """Completeness query over the traffic volume table (Spark variant).

    Scans the volume table, scores each row's completeness over the
    Class1/Class2 volume columns, projects the score, and logs time.
    """
    sc = SparkContext()
    sqlContext = SQLContext(sc)

    # Silence Spark/Akka INFO chatter so only errors reach the console.
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

    filePath = sys.argv[1]     # path for the input files
    outFilePath = sys.argv[2]  # path for the output (timing) file
    typeOfData = sys.argv[3]   # type of dataset

    # Tag this run in the timing log.
    with open(outFilePath, "a") as myfile:
        myfile.write("QueryC_traffic\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # start measuring time

    # Scan: build a dataframe from the traffic volume table.
    inputTable_1 = 'newColpvr_2016-01-01_366d_11rows'
    predScan_left = None
    # BUG FIX: sqlContext was missing from this ScanSelect call, unlike
    # every other Spark query in this module which passes it.
    selScan_left = ScanSelect(inputTable_1, predScan_left, sqlContext, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Completeness: a row is incomplete where these columns hold 'empty'.
    attrName = "Completeness"
    inputColumnNames = ["Class1Volume", "Class2Volume"]
    inputSymbols = ['empty', 'empty']
    completenessOp = RowCompleteness(outputScan_left, attrName, inputColumnNames, inputSymbols)
    outputCompleteness = completenessOp.execute()

    # Project: keep the identifying columns plus the score.
    if (typeOfData != 'Categorical'):
        attrList = ["Sdate", "LaneNumber", "Completeness_score"]
    else:
        attrList = ["Sdate", "Completeness_score"]
    proj = Project(outputCompleteness, attrList)
    outputFinal = proj.execute()

    nrows = outputFinal.count()  # count rows (also forces evaluation)

    # Uncomment to print final output
    '''
    n = len(outputFinal.index)
    print(outputFinal.head(n).to_string())
    print("Project Output= ")
    print(n)
    '''

    stopTime = timing.stopTime()  # stop measuring time
    timing.durationTime(stopTime, startTime)
def split(self, testSuite): t = Timing(testSuite['output']); t.out('\nSplit timing:'); t.start(); stringHandler = StringIO.StringIO(testSuite['lines']); t.start(); line = stringHandler.readline(); while line: s = line.split(','); termId = int(s[0]); docId = int(s[1]); posit = int(s[2]); # print str(termId) + "," + str(docId) + "," + str(posit); line = stringHandler.readline(); print "Result: " + str(t.stop()) + '\n';
class QueryT():
    """Timeliness query over the traffic volume table (non-Spark variant:
    ScanSelect here takes no SQL context).

    Scans the volume table and its timeliness quality-results table, joins
    them, scores timeliness per row, projects key + score, and logs time.
    """
    filePath = sys.argv[1]     # path for the input files
    outFilePath = sys.argv[2]  # path for the output (timing) file
    typeOfData = sys.argv[3]   # type of dataset

    # Tag this run in the timing log.
    with open(outFilePath, "a") as myfile:
        myfile.write("QueryT_traffic\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # start measuring time

    # Scan: build a dataframe from the traffic volume table.
    inputTable_1 = 'newColpvr_2016-01-01_366d_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Scan: build a dataframe from the timeliness quality-results table.
    inputTable_2 = 'TfGM_completeTuple_VolumeTimelinessQR_11rows'
    predScan_right = None
    selScan_right = ScanSelect(inputTable_2, predScan_right, filePath, typeOfData)
    outputScan_right = selScan_right.execute()

    # Join the two dataframes on the timeliness id.
    predJoin = ("VolumeTimeliness_id", "=", "VolumeTimeliness_qid")
    dfJoin_1 = Join(outputScan_left, outputScan_right, predJoin)
    outputJoin_1 = dfJoin_1.execute()

    # Find the Timeliness score for each row.
    timelinessAttr = "timeliness"
    timeliness = Timeliness(outputJoin_1, timelinessAttr)
    outputTimeliness = timeliness.execute()

    # Project: keep only the key and the score.
    attrList = ["VolumeTimeliness_id", "timeliness_score"]
    proj = Project(outputTimeliness, attrList)
    outputFinal = proj.execute()

    # Uncomment to print final output
    '''
    n = len(outputFinal.index)
    print(outputFinal.head(n).to_string())
    print("Project Output= ")
    print(n)
    '''

    stopTime = timing.stopTime()  # stop measuring time
    timing.durationTime(stopTime, startTime)
class QueryC():
    """Completeness query over the business order table (non-Spark variant:
    ScanSelect here takes no SQL context).

    Scans orderT, scores each row's completeness over ship_date and
    statusOrder, projects the score, and logs elapsed time.
    """
    filePath = sys.argv[1]     # path for the input files
    outFilePath = sys.argv[2]  # path for the output (timing) file
    typeOfData = sys.argv[3]   # type of dataset

    # Tag this run in the timing log.
    with open(outFilePath, "a") as myfile:
        myfile.write("QueryC_business\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # start measuring time

    # Scan: build a dataframe from the order table.
    inputTable_1 = 'orderT_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Completeness: a row is incomplete where these columns hold 'empty'.
    attrName = "Completeness"
    inputColumnNames = ["ship_date", "statusOrder"]
    inputSymbols = ['empty', 'empty']
    completenessOp = RowCompleteness(outputScan_left, attrName, inputColumnNames, inputSymbols)
    outputCompleteness = completenessOp.execute()

    # Project: keep only the key and the score.
    attrList = ["order_no", "Completeness_score"]
    proj = Project(outputCompleteness, attrList)
    outputFinal = proj.execute()

    # Uncomment to print final output
    '''
    n = len(outputFinal.index)
    print(outputFinal.head(n).to_string())
    print("Project Output= ")
    print(n)
    '''

    stopTime = timing.stopTime()  # stop measuring time
    timing.durationTime(stopTime, startTime)
def main(argv):
    """Entry point: parse compiler options and run Vercomp on the C input.

    Builds the argument parser for the C-to-QSP/QAP compiler, constructs a
    (disabled) Timing tracker, and invokes Vercomp.
    """
    parser = argparse.ArgumentParser(description='Compile C to QSP/QAP')
    parser.add_argument('cfile', metavar='<cfile>', help='a C file to compile')
    parser.add_argument('--print', dest='print_exprs', help="print output expressions on stdout")
    parser.add_argument('--il', dest='il_file', help='intermediate circuit output file')
    parser.add_argument('--json', dest='json_file', help='json version of intermediate circuit output file')
    parser.add_argument('--arith', dest='arith_file', help='arithmetic circuit output file')
    parser.add_argument('--bit-width', dest='bit_width', help='bit width -- affects bitwise operator semantics and arithmetic circuit output', default=32)
    parser.add_argument('--bool', dest='bool_file', help='boolean circuit output file')
    parser.add_argument('--ignore-overflow', dest='ignore_overflow', help='ignore field-P overflows; never truncate', default=False)
    parser.add_argument('--cpparg', dest='cpp_arg', nargs="*", help='extra arguments to C preprocessor')
    parser.add_argument('--loop-sanity-limit', dest='loop_sanity_limit', help='limit on statically-measured loop unrolling', default=1000000)
    parser.add_argument('--progress', dest='progress', help='print progress messages during compilation')
    args = parser.parse_args(argv)
    # Timing is constructed disabled; enable here to profile compilation.
    timing = Timing(args.cfile, enabled=False)
    try:
        vercomp = Vercomp(args.cfile, args, timing)
    except Exception,ex:
        # Print the failure before re-raising so the traceback is preceded
        # by a one-line summary.
        print repr(ex)
        raise
    # NOTE(review): everything below looks like leftover debug code.
    # `total` is not defined in this scope (presumably a module global set
    # elsewhere -- verify), and the trailing bare `raise` outside any
    # except block will itself error at runtime.
    print "DFG total count: %s flatbytes: %s" % (total.count, total.flatbytes)
    flats = total.flats
    flats.sort()
    flats = flats[::-1]
    print("Biggest flat: len %s %s" % (flats[0][0], str(flats[0][1])[0:160]))
    raise
class QueryC():
    """Completeness query over the business order table (Spark variant).

    Scans orderT, scores each row's completeness over ship_date and
    statusOrder, projects the score, and logs elapsed time.
    """
    sc = SparkContext()
    sqlContext = SQLContext(sc)

    # Silence Spark/Akka INFO chatter so only errors reach the console.
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

    filePath = sys.argv[1]     # path for the input files
    outFilePath = sys.argv[2]  # path for the output (timing) file
    typeOfData = sys.argv[3]   # type of dataset

    # Tag this run in the timing log.
    with open(outFilePath, "a") as myfile:
        myfile.write("QueryC_business\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # start measuring time

    # Scan: build a dataframe from the order table.
    inputTable_1 = 'orderT_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, sqlContext, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Completeness: a row is incomplete where these columns hold ' '.
    # NOTE(review): the non-Spark QueryC uses 'empty' as the missing-value
    # symbol; this variant uses ' ' -- confirm the difference is intentional.
    attrName = "Completeness"
    inputColumnNames = ["ship_date", "statusOrder"]
    inputSymbols = [' ', ' ']
    completenessOp = RowCompleteness(outputScan_left, attrName, inputColumnNames, inputSymbols)
    outputCompleteness = completenessOp.execute()

    # Project: keep only the key and the score.
    attrList = ["order_no", "Completeness_score"]
    proj = Project(outputCompleteness, attrList)
    outputFinal = proj.execute()

    nrows = outputFinal.count()  # count rows (also forces evaluation)

    stopTime = timing.stopTime()  # stop measuring time
    timing.durationTime(stopTime, startTime)
class QueryA():
    """Accuracy query over the business order table (non-Spark variant:
    ScanSelect here takes no SQL context).

    Scans orderT, scores each row on ship_date > submit_date, projects the
    order number plus the accuracy score, and logs elapsed time.
    """
    filePath = sys.argv[1]     # path for the input files
    outFilePath = sys.argv[2]  # path for the output (timing) file
    typeOfData = sys.argv[3]   # type of dataset

    # Tag this run in the timing log.
    with open(outFilePath, "a") as myfile:
        myfile.write("QueryA_business\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # start measuring time

    # Scan: build a dataframe from the order table.
    inputTable_1 = 'orderT_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Accuracy: score each row on ship_date > submit_date.
    accuracyAttr = "Accuracy"
    accInputExpr = AccuracyExpression(outputScan_left, "ship_date", ">", "submit_date")
    accuracyOp = Accuracy(outputScan_left, accuracyAttr, accInputExpr)
    outputAccuracy = accuracyOp.execute()

    # Project: keep only the key and the score.
    attrList = ["order_no", "Accuracy_score"]
    proj = Project(outputAccuracy, attrList)
    outputFinal = proj.execute()

    # Uncomment to print final output
    '''
    n = len(outputFinal.index)
    print(outputFinal.head(n).to_string())
    print("Project Output= ")
    print(n)
    '''

    stopTime = timing.stopTime()  # stop measuring time
    timing.durationTime(stopTime, startTime)
def regexLineSplit(self, testSuite): t = Timing(testSuite['output']); t.out('\nRegex timing:'); compiledExpression = re.compile(r""" (?P<termId>\d*), (?P<docId>\d*), (?P<position>\d*) """, re.S|re.X) stringHandler = StringIO.StringIO(testSuite['lines']); t.start(); line = stringHandler.readline(); while line: match = re.match(compiledExpression, line) termId = int(match.group('termId')); docId = int(match.group('docId')); posit = int(match.group('position')); # print str(termId) + "," + str(docId) + "," + str(posit); line = stringHandler.readline(); print "Result: " + str(t.stop()) + '\n';
class QueryTC():
    """Combined timeliness + completeness query over the business data
    (non-Spark variant: ScanSelect here takes no SQL context).

    Pipeline: scan orderT and its timeliness quality results, join and
    score timeliness; join with customer info and score completeness over
    postcode/email; project keys plus both scores; log elapsed time.
    """
    filePath = sys.argv[1]     # path for the input files
    outFilePath = sys.argv[2]  # path for the output (timing) file
    typeOfData = sys.argv[3]   # type of dataset

    # Tag this run in the timing log.
    with open(outFilePath, "a") as myfile:
        myfile.write("QueryT+C_business\n")

    timing = Timing(outFilePath)
    startTime = timing.startTime()  # start measuring time

    # Scan: build a dataframe from the order table.
    inputTable_1 = 'orderT_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, filePath, typeOfData)
    outputScan_left = selScan_left.execute()

    # Scan: build a dataframe from the timeliness quality-results table.
    inputTable_2 = "statusTimelinessQR_11rows"
    predScan_right = None
    selScan_right = ScanSelect(inputTable_2, predScan_right, filePath, typeOfData)
    outputScan_right = selScan_right.execute()

    # Join the two dataframes on the timeliness id.
    predJoin = ("statusTimeliness_id", "=", "statusTimeliness_qid")
    join_1 = Join(outputScan_left, outputScan_right, predJoin)
    outputJoin_1 = join_1.execute()

    # Find the Timeliness score for each row.
    timelinessAttr = "timeliness"
    timeliness = Timeliness(outputJoin_1, timelinessAttr)
    outputTimeliness = timeliness.execute()

    # Scan: build a dataframe from the customer-info table.
    inputTable_right_2 = "testeBusinessData_customerInfo"
    predScan_right_2 = None
    selScan_right_2 = ScanSelect(inputTable_right_2, predScan_right_2, filePath, typeOfData)
    outputScan_right_2 = selScan_right_2.execute()

    # Join the timeliness result with customer info.
    predJoin_2 = ("customer_id", "=", "custom_id")
    join_2 = Join(outputTimeliness, outputScan_right_2, predJoin_2)
    outputJoin_2 = join_2.execute()

    # Completeness: a row is incomplete where these columns hold 'empty'.
    attrName = "Completeness"
    inputColumnNames = ["postcode", "email"]
    inputSymbols = ['empty', 'empty']
    completenessOp = RowCompleteness(outputJoin_2, attrName, inputColumnNames, inputSymbols)
    outputCompleteness = completenessOp.execute()

    # Project: keep the keys and both quality scores.
    attrList = [
        "order_no",
        "customer_id",
        "timeliness_score",
        "Completeness_score"
    ]
    proj = Project(outputCompleteness, attrList)
    outputFinal = proj.execute()

    # Uncomment to print final output
    '''
    n = len(outputFinal.index)
    print(outputFinal.head(n).to_string())
    print("Project Output= ")
    print(n)
    '''

    stopTime = timing.stopTime()  # stop measuring time
    timing.durationTime(stopTime, startTime)
class ModelBase: """ Base for all models Magic methods: 1) __str__ : return self.name; __repr__ = __str__ 2) __getitem__ : access to protected members Properties: 1) name : name of this model, self.__class__.__name__ or self._name 2) title : used in matplotlib (plt.title()) Static method: 1) disable_timing : disable Timing() 2) show_timing_log : show Timing() records """ clf_timing = Timing() def __init__(self, **kwargs): self._plot_label_dict = {} self._title = self._name = None self._metrics, self._available_metrics = [], { "acc": ClassifierBase.acc } self._params = { "sample_weight": kwargs.get("sample_weight", None) } def __str__(self): return self.name def __repr__(self): return str(self) def __getitem__(self, item): if isinstance(item, str): return getattr(self, "_" + item) @property def name(self): return self.__class__.__name__ if self._name is None else self._name @property def title(self): return str(self) if self._title is None else self._title @staticmethod def disable_timing(): ModelBase.clf_timing.disable() @staticmethod def show_timing_log(level=2): ModelBase.clf_timing.show_timing_log(level) # Handle animation @staticmethod def _refresh_animation_params(animation_params): animation_params["show"] = animation_params.get("show", False) animation_params["mp4"] = animation_params.get("mp4", False) animation_params["period"] = animation_params.get("period", 1) def _get_animation_params(self, animation_params): if animation_params is None: animation_params = self._params["animation_params"] else: ClassifierBase._refresh_animation_params(animation_params) show, mp4, period = animation_params["show"], animation_params["mp4"], animation_params["period"] return show or mp4, show, mp4, period, animation_params def _handle_animation(self, i, x, y, ims, animation_params, draw_ani, show_ani, make_mp4, ani_period, name=None, img=None): if draw_ani and x.shape[1] == 2 and (i + 1) % ani_period == 0: if img is None: img = self.get_2d_plot(x, y, **animation_params) if 
name is None: name = str(self) if show_ani: cv2.imshow(name, img) cv2.waitKey(1) if make_mp4: ims.append(img) def _handle_mp4(self, ims, animation_properties, name=None): if name is None: name = str(self) if animation_properties[2] and ims: VisUtil.make_mp4(ims, name) def get_2d_plot(self, x, y, padding=1, dense=200, draw_background=False, emphasize=None, extra=None, **kwargs): pass # Visualization def scatter2d(self, x, y, padding=0.5, title=None): axis, labels = np.asarray(x).T, np.asarray(y) print("=" * 30 + "\n" + str(self)) x_min, x_max = np.min(axis[0]), np.max(axis[0]) y_min, y_max = np.min(axis[1]), np.max(axis[1]) x_padding = max(abs(x_min), abs(x_max)) * padding y_padding = max(abs(y_min), abs(y_max)) * padding x_min -= x_padding x_max += x_padding y_min -= y_padding y_max += y_padding if labels.ndim == 1: if not self._plot_label_dict: self._plot_label_dict = {c: i for i, c in enumerate(set(labels))} dic = self._plot_label_dict n_label = len(dic) labels = np.array([dic[label] for label in labels]) else: n_label = labels.shape[1] labels = np.argmax(labels, axis=1) colors = plt.cm.rainbow([i / n_label for i in range(n_label)])[labels] if title is None: title = self.title indices = [labels == i for i in range(np.max(labels) + 1)] scatters = [] plt.figure() plt.title(title) for idx in indices: scatters.append(plt.scatter(axis[0][idx], axis[1][idx], c=colors[idx])) plt.legend(scatters, ["$c_{}$".format("{" + str(i) + "}") for i in range(len(scatters))], ncol=math.ceil(math.sqrt(len(scatters))), fontsize=8) plt.xlim(x_min, x_max) plt.ylim(y_min, y_max) plt.show() print("Done.") def scatter3d(self, x, y, padding=0.1, title=None): axis, labels = np.asarray(x).T, np.asarray(y) print("=" * 30 + "\n" + str(self)) x_min, x_max = np.min(axis[0]), np.max(axis[0]) y_min, y_max = np.min(axis[1]), np.max(axis[1]) z_min, z_max = np.min(axis[2]), np.max(axis[2]) x_padding = max(abs(x_min), abs(x_max)) * padding y_padding = max(abs(y_min), abs(y_max)) * padding z_padding = 
max(abs(z_min), abs(z_max)) * padding x_min -= x_padding x_max += x_padding y_min -= y_padding y_max += y_padding z_min -= z_padding z_max += z_padding def transform_arr(arr): if arr.ndim == 1: dic = {c: i for i, c in enumerate(set(arr))} n_dim = len(dic) arr = np.array([dic[label] for label in arr]) else: n_dim = arr.shape[1] arr = np.argmax(arr, axis=1) return arr, n_dim if title is None: try: title = self.title except AttributeError: title = str(self) labels, n_label = transform_arr(labels) colors = plt.cm.rainbow([i / n_label for i in range(n_label)])[labels] indices = [labels == i for i in range(n_label)] scatters = [] fig = plt.figure() plt.title(title) ax = fig.add_subplot(111, projection='3d') for _index in indices: scatters.append(ax.scatter(axis[0][_index], axis[1][_index], axis[2][_index], c=colors[_index])) ax.legend(scatters, ["$c_{}$".format("{" + str(i) + "}") for i in range(len(scatters))], ncol=math.ceil(math.sqrt(len(scatters))), fontsize=8) plt.show() # Util def predict(self, x, get_raw_results=False, **kwargs): pass
def collapse_tree(self, key):
    """Collapse `key` and everything it depends on, memoizing results in
    self.table, and return the collapsed value for `key`.

    Uses an explicit worklist stack instead of recursion so deep dependency
    chains cannot overflow the interpreter stack. A node is only collapsed
    (via self.collapse_impl) once all of its dependencies are in the table.
    """
    timing = Timing("collapse_tree", enabled=False)
    stack = [key]
    loop_count = 0
    while (len(stack) > 0):
        timing.phase("collapser loop # %s setup" % loop_count)
        loop_count += 1
        # Peek (don't pop): the node stays on the stack until its deps are done.
        key = stack[-1]
        if (key in self.table):
            # oh. handy. we already did this dude: he was just
            # wanted multiple times.
            stack.pop()
            continue
        alldeps = self.get_dependencies(key)
        timing.phase("collapser loop # %s get_deps (%d) self %s" % (loop_count, len(alldeps), self.__class__))
        # def study(table):
        #     keys = table.keys()
        #     keys.sort()
        #     hist = {}
        #     for k in keys:
        #         h = hash(k)
        #         if (h not in hist):
        #             hist[h] = []
        #         hist[h].append(k)
        #     def by_len(a,b):
        #         return cmp(len(a), len(b))
        #     v = hist.values()
        #     v.sort(by_len)
        #     v = v[::-1]
        #     print "%d keys, %d unique hashes, worst duplication: %s" % (
        #         len(keys), len(hist), v[:10])
        #     for k in v[0]:
        #         print "%d %s" % (hash(k), k)
        #     raise Exception("done")
        def notintable(d):
            # if (len(self.table)>4000):
            #     study(self.table)
            return d not in self.table
        # NOTE: Python 2 `filter` returns a list, which the `== []` test
        # below relies on.
        newdeps = filter(notintable, alldeps)
        if (newdeps == []):
            # All dependencies resolved: collapse this node and remove it.
            stack.pop()
            assert (key not in self.table)
            self.table[key] = self.collapse_impl(key)
            timing.phase("collapser loop # %s collapse_impl" % loop_count)
        else:
            # Defer this node; process its unresolved dependencies first.
            stack += newdeps
        timing.phase("collapser loop # %s end" % loop_count)
    timing.phase("done")
    # print "collapser loop_count: %d" % loop_count
    return self.table[key]
def blockFiles(self, testSuite): t = Timing(testSuite['output']); fb = FileBlock( testSuite['blockSize'], testSuite['bufferSize'], testSuite['blockFilesDir'], testSuite['tableFileName'], ); fb.loadRegistry(); t.out("\n\nScenario: Large block files, files are accessed sequentially."); t.start(); for n in xrange(0,testSuite['nbrFiles']): fh = fb.open(n); buffer = fh.read(testSuite['bufferSize']); print "\tresult for sequential block file access: " + str(t.stop()); t.out("Scenario: Large block files, files are accessed sequentially in reverse order."); t.start(); for n in xrange(testSuite['nbrFiles']-1,-1,-1): fh = fb.open(n); buffer = fh.read(testSuite['bufferSize']); print "\tresult for reverse sequential block file access: " + str(t.stop()); t.out("Scenario: Large block files, files are accessed randomly."); t.start(); for n in xrange(0,testSuite['nbrFiles']): fh = fb.open(testSuite['shuffledIdArray'][n]); buffer = fh.read(testSuite['bufferSize']); print "\tresult for random block file access: " + str(t.stop());
def individualFiles(self, testSuite): t = Timing(testSuite['output']); individualDir = testSuite['dir'] + ".individualFilesDynamic" + path.sep; t.out("\n\nScenario: Each datum is in it's own file which is accessed sequentially."); t.start(); for n in xrange(0,testSuite['nbrFiles']): fh = io.open(individualDir + str(testSuite['shuffledIdArray'][n]) + testSuite['extension'], 'r'); buffer = fh.read(testSuite['bufferSize']); fh.close(); print "\tresult for sequential indiv. file access: " + str(t.stop()); t.out("Scenario: Each datum is in it's own file which is accessed randomly."); t.start(); for n in xrange(0,testSuite['nbrFiles']): fh = io.open(individualDir + str(n) + testSuite['extension'], 'r'); buffer = fh.read(testSuite['bufferSize']); fh.close(); print "\tresult for sequential indiv. file access: " + str(t.stop());
def arrayRead(self, testSuite): t = Timing(testSuite['output']); t.out('\nSplit timing:'); array = range(0,testSuite['nbrLines']); t.start(); for x in xrange(0,testSuite['nbrLines']): a = array[x]; time1 = t.stop(); t.start(); for x in xrange(0,testSuite['nbrLines']): a = array[x]; time2 = t.stop(); t.start(); for x in xrange(0,testSuite['nbrLines']): a = array[x]; time3 = t.stop(); t.start(); for x in xrange(0,testSuite['nbrLines']): a = array[x]; time4 = t.stop(); t.start(); for x in xrange(0,testSuite['nbrLines']): a = array[x]; time5 = t.stop(); t.start(); for x in xrange(0,testSuite['nbrLines']): a = array[x]; time6 = t.stop(); r1 = time1 - time1; r2 = time2 - time1; r3 = time3 - time1; r4 = time4 - time1; r5 = time5 - time1; r6 = time6 - time1; print (r2 + r3 + r4 + r5 + r6) / 5;
class QueryTC():
    """Timed Timeliness + Completeness quality query over traffic data.

    NOTE(review): the body is a script that executes once at
    class-definition time, not reusable methods; every assignment below
    becomes a class attribute. Expects sys.argv = [prog, filePath,
    outFilePath, typeOfData].
    """
    # NOTE(review): other Query* classes in this file also call
    # SparkContext(); running more than one in the same process would
    # normally fail -- presumably each class lives in its own driver run.
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    # Silence Spark/Akka logging so only timing output is written.
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
    filePath = sys.argv[1]  # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]  # Type of dataset
    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryT+C_traffic\n")
    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time
    # Create a dataframe from a file
    inputTable_1 = 'newColpvr_2016-01-01_366d_11rows'
    predScan_left = None
    # NOTE(review): sqlContext is NOT passed to ScanSelect here, unlike
    # QueryA above -- confirm which ScanSelect signature is current.
    selScan_left = ScanSelect(inputTable_1, predScan_left, filePath, typeOfData)
    outputScan_left = selScan_left.execute()
    # Create a dataframe from a file
    inputTable_2 = 'TfGM_completeTuple_VolumeTimelinessQR_11rows'
    predScan_right = None
    selScan_right = ScanSelect(inputTable_2, predScan_right, filePath, typeOfData)
    outputScan_right = selScan_right.execute()
    # Join two dataframes on the timeliness quality-result id
    predJoin = ("VolumeTimeliness_id", "=", "VolumeTimeliness_qid")
    dfJoin_1 = Join(outputScan_left, outputScan_right, predJoin)
    outputJoin_1 = dfJoin_1.execute()
    # Find Timeliness score for each row
    timelinessAttr = "timeliness"
    timeliness = Timeliness(outputJoin_1, timelinessAttr)
    outputTimeliness = timeliness.execute()
    # Create a dataframe from a file (lat/long lookup per site)
    inputTable_right_2 = "testTrafficData_latlongInfo"
    predScan_right_2 = None
    selScan_right_2 = ScanSelect(inputTable_right_2, predScan_right_2, filePath, typeOfData)
    outputScan_right_2 = selScan_right_2.execute()
    # Join two dataframes on the site identifier
    predJoin_2 = ("Cosit", "=", "Site_id")
    join_2 = Join(outputTimeliness, outputScan_right_2, predJoin_2)
    outputJoin_2 = join_2.execute()
    # Find Completeness score for each row: Latitude/Longitude count as
    # missing when they hold the 'empty' symbol
    attrName = "Completeness"
    inputColumnNames = ["Latitude", "Longitude"]
    inputSymbols = ['empty', 'empty']
    completenessOp = RowCompleteness(outputJoin_2, attrName, inputColumnNames, inputSymbols)
    outputCompleteness = completenessOp.execute()
    # Select columns from the dataframe
    attrList = [
        "Cosit",
        "VolumeTimeliness_id",
        "timeliness_score",
        "Completeness_score"
    ]
    proj = Project(outputCompleteness, attrList)
    outputFinal = proj.execute()
    # Presumably count() forces evaluation of a lazy pipeline so the
    # timing below measures the actual query work -- confirm.
    nrows = outputFinal.count()
    # Uncomment to print final output
    ''' n = len(outputFinal.index) print(outputFinal.head(n).to_string()) print("Project Output= ") print(n) '''
    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)
class QueryTA():
    """Timed Timeliness + Accuracy quality query over business data
    (order status joined with customer info).

    NOTE(review): a class named QueryTA is defined three times in this
    file; in a single module the later definitions shadow this one,
    although each body still executes at definition time. Consider
    renaming. Expects sys.argv = [prog, filePath, outFilePath, typeOfData].
    """
    # NOTE(review): see QueryTC -- multiple SparkContext() constructions
    # in one process would normally fail.
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    # Silence Spark/Akka logging so only timing output is written.
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
    filePath = sys.argv[1]  # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]  # Type of dataset
    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryT+A_business\n")
    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time
    # Create a dataframe from a file
    inputTable_1 = 'orderT_11rows'
    predScan_left = None
    # NOTE(review): this variant passes sqlContext to ScanSelect, while
    # the later QueryTA variants do not -- confirm which signature is
    # current.
    selScan_left = ScanSelect(inputTable_1, predScan_left, sqlContext, filePath, typeOfData)
    outputScan_left = selScan_left.execute()
    # Create a dataframe from a file
    inputTable_2 = "statusTimelinessQR_11rows"
    predScan_right = None
    selScan_right = ScanSelect(inputTable_2, predScan_right, sqlContext, filePath, typeOfData)
    outputScan_right = selScan_right.execute()
    # Join two dataframes on the timeliness quality-result id
    predJoin = ("statusTimeliness_id", "=", "statusTimeliness_qid")
    join_1 = Join(outputScan_left, outputScan_right, predJoin)
    outputJoin_1 = join_1.execute()
    # Find Timeliness score for each row
    timelinessAttr = "timeliness"
    timeliness = Timeliness(outputJoin_1, timelinessAttr);
    outputTimeliness = timeliness.execute()
    # Create a dataframe from a file (customer lookup)
    inputTable_right_2 = "testeBusinessData_customerInfo"
    predScan_right_2 = None
    selScan_right_2 = ScanSelect(inputTable_right_2, predScan_right_2, sqlContext, filePath, typeOfData)
    outputScan_right_2 = selScan_right_2.execute()
    # Join two dataframes on the customer identifier
    predJoin_2 = ("customer_id", "=", "custom_id")
    join_2 = Join(outputTimeliness, outputScan_right_2, predJoin_2)
    outputJoin_2 = join_2.execute()
    # Find Accuracy score for each row: a postcode is accurate when it is
    # in the allowed set below
    accuracyAttr = "Accuracy"
    setOfPostcodes = {"M46", "M26", "M50"}
    accInputExpr = AccuracyExpression(outputJoin_2, "postcode", "in", setOfPostcodes)
    accuracyOp = Accuracy(outputJoin_2, accuracyAttr, accInputExpr)
    outputAccuracy = accuracyOp.execute()
    # Select columns from the dataframe
    attrList = ["order_no", "customer_id", "timeliness_score", "Accuracy_score"]
    proj = Project(outputAccuracy, attrList)
    outputFinal = proj.execute()
    # Presumably count() forces evaluation of a lazy pipeline so the
    # timing below measures the actual query work -- confirm.
    nrows = outputFinal.count()
    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)
class QueryTA():
    """Timed Timeliness + Accuracy quality query over business data
    (order status joined with customer info) -- second variant.

    NOTE(review): a class named QueryTA is defined three times in this
    file; this one shadows the first and is shadowed by the last,
    although each body still executes at definition time. Consider
    renaming. Expects sys.argv = [prog, filePath, outFilePath, typeOfData].
    NOTE(review): unlike the other queries, no SparkContext/SQLContext is
    created here and outputFinal.count() is never called before the timer
    stops -- verify the measured duration still covers the query work.
    """
    filePath = sys.argv[1]  # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]  # Type of dataset
    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryT+A_business\n")
    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time
    # Create a dataframe from a file
    inputTable_1 = 'orderT_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, filePath, typeOfData)
    outputScan_left = selScan_left.execute()
    # Create a dataframe from a file
    inputTable_2 = "statusTimelinessQR_11rows"
    predScan_right = None
    selScan_right = ScanSelect(inputTable_2, predScan_right, filePath, typeOfData)
    outputScan_right = selScan_right.execute()
    # Join two dataframes on the timeliness quality-result id
    predJoin = ("statusTimeliness_id", "=", "statusTimeliness_qid")
    join_1 = Join(outputScan_left, outputScan_right, predJoin)
    outputJoin_1 = join_1.execute()
    # Find Timeliness score for each row
    timelinessAttr = "timeliness"
    timeliness = Timeliness(outputJoin_1, timelinessAttr)
    outputTimeliness = timeliness.execute()
    # Create a dataframe from a file (customer lookup)
    inputTable_right_2 = "testeBusinessData_customerInfo"
    predScan_right_2 = None
    selScan_right_2 = ScanSelect(inputTable_right_2, predScan_right_2, filePath, typeOfData)
    outputScan_right_2 = selScan_right_2.execute()
    # Join two dataframes on the customer identifier
    predJoin_2 = ("customer_id", "=", "custom_id")
    join_2 = Join(outputTimeliness, outputScan_right_2, predJoin_2)
    outputJoin_2 = join_2.execute()
    # Find Accuracy score for each row: a postcode is accurate when it is
    # in the allowed set below
    accuracyAttr = "Accuracy"
    setOfPostcodes = {"M46", "M26", "M50"}
    accInputExpr = AccuracyExpression(outputJoin_2, "postcode", "in", setOfPostcodes)
    accuracyOp = Accuracy(outputJoin_2, accuracyAttr, accInputExpr)
    outputAccuracy = accuracyOp.execute()
    # Select columns from the dataframe
    attrList = [
        "order_no",
        "customer_id",
        "timeliness_score",
        "Accuracy_score"
    ]
    proj = Project(outputAccuracy, attrList)
    outputFinal = proj.execute()
    # Uncomment to print final output
    ''' n = len(outputFinal.index) print(outputFinal.head(n).to_string()) print("Project Output= ") print(n) '''
    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)
class QueryTA():
    """Timed Timeliness + Accuracy quality query over traffic data
    (volume records joined with site lat/long info) -- third variant.

    NOTE(review): a class named QueryTA is defined three times in this
    file; this definition is the one that finally binds the name, though
    the earlier bodies also execute at definition time. Consider
    renaming. Expects sys.argv = [prog, filePath, outFilePath, typeOfData].
    NOTE(review): unlike QueryTC, no SparkContext/SQLContext is created
    here and outputFinal.count() is never called before the timer stops
    -- verify the measured duration still covers the query work.
    """
    filePath = sys.argv[1]  # Path for the files
    outFilePath = sys.argv[2]  # Path for the output file
    typeOfData = sys.argv[3]  # Type of dataset
    with open(outFilePath, "a") as myfile:  # Open output file
        myfile.write("QueryT+A_traffic\n")
    timing = Timing(outFilePath)
    startTime = timing.startTime()  # Start measuring time
    # Create a dataframe from a file
    inputTable_1 = 'newColpvr_2016-01-01_366d_11rows'
    predScan_left = None
    selScan_left = ScanSelect(inputTable_1, predScan_left, filePath, typeOfData)
    outputScan_left = selScan_left.execute()
    # Create a dataframe from a file
    inputTable_2 = 'TfGM_completeTuple_VolumeTimelinessQR_11rows'
    predScan_right = None
    selScan_right = ScanSelect(inputTable_2, predScan_right, filePath, typeOfData)
    outputScan_right = selScan_right.execute()
    # Join two dataframes on the timeliness quality-result id
    predJoin = ("VolumeTimeliness_id", "=", "VolumeTimeliness_qid")
    dfJoin_1 = Join(outputScan_left, outputScan_right, predJoin)
    outputJoin_1 = dfJoin_1.execute()
    # Find Timeliness score for each row
    timelinessAttr = "timeliness"
    timeliness = Timeliness(outputJoin_1, timelinessAttr)
    outputTimeliness = timeliness.execute()
    # Create a dataframe from a file (lat/long lookup per site)
    inputTable_right_2 = "testTrafficData_latlongInfo"
    predScan_right_2 = None
    selScan_right_2 = ScanSelect(inputTable_right_2, predScan_right_2, filePath, typeOfData)
    outputScan_right_2 = selScan_right_2.execute()
    # Join two dataframes on the site identifier
    predJoin_2 = ("Cosit", "=", "Site_id")
    join_2 = Join(outputTimeliness, outputScan_right_2, predJoin_2)
    outputJoin_2 = join_2.execute()
    # Find Accuracy score for each row.
    # NOTE(review): the predicate Latitude > Longitude looks like a
    # placeholder accuracy rule -- confirm it is intentional.
    accuracyAttr = "Accuracy"
    accInputExpr = AccuracyExpression(outputJoin_2, "Latitude", ">", "Longitude")
    accuracyOp = Accuracy(outputJoin_2, accuracyAttr, accInputExpr)
    outputAccuracy = accuracyOp.execute()
    # Select columns from the dataframe
    attrList = ["VolumeTimeliness_id", "timeliness_score", "Accuracy_score"]
    proj = Project(outputAccuracy, attrList)
    outputFinal = proj.execute()
    # Uncomment to print final output
    ''' n = len(outputFinal.index) print(outputFinal.head(n).to_string()) print("Project Output= ") print(n) '''
    stopTime = timing.stopTime()  # Stop measuring time
    timing.durationTime(stopTime, startTime)