def main(argv):
    """Train a classifier and report where it scores above the analyzer.

    A ``RandomForestRegressor`` model is trained through ``Classifier``, then
    every dataset entry accepted by ``classifier.filter`` is scored by both
    the classifier and the ``Analyzer``. Entries whose scores differ by at
    least 1.0 are collected, and those where the classifier prediction is the
    higher one are printed in ascending order of difference, together with
    the raw dataset line.

    Args:
        argv: Command-line arguments; not used by this script.
    """
    # Constants for the analyzer and the classifier
    dataset = 'commit_comments-dump.2015-01-29.json'
    group = 'id'
    model_file = 'model.pickle'

    # Create the analyzer
    analyzer = Analyzer(group)

    # Create the classifier
    algorithm_class = RandomForestRegressor
    algorithm_parameters = {
        'n_estimators': 100,
        'n_jobs': 2,
        'min_samples_split': 10
    }
    classifier = Classifier(group, model_file)
    classifier.create_model(train=True, class_name=algorithm_class,
                            parameters=algorithm_parameters)

    # Compare analyzer output with classifier output and identify differences
    # NOTE: negative differences are collected but not reported below.
    unrecognized_negative = {}
    unrecognized_positive = {}
    predictions = classifier.predict()
    prediction_index = 0  # Index of the next prediction to consume

    # The context manager closes the dataset even if processing fails midway.
    with open(dataset, 'rb') as dataset_file:
        entries = Utilities.read_json(dataset_file, 'id', group)
        # `line` is the 1-based dataset line; it is used further down as a
        # linecache line number, which presumably relies on one entry per
        # line in the dataset -- TODO confirm against Utilities.read_json.
        for line, data in enumerate(entries, 1):
            if line % 1000 == 0:
                print(line)  # Progress indicator

            if not classifier.filter(data):
                continue

            # Only entries that pass the filter have a prediction, so the
            # prediction index advances independently of the line number.
            index = prediction_index
            prediction_index += 1

            score = analyzer.analyze(data['message'])[0]
            if score == 0:
                continue

            diff = predictions[index] - score
            if abs(diff) < 1.0:
                continue

            target = unrecognized_negative if diff < 0 else unrecognized_positive
            target[line] = diff

    result = sorted(unrecognized_positive.items(), key=lambda x: x[1])
    for line_number, diff in result:
        # Drop the trailing newline of the raw dataset line.
        raw_line = linecache.getline(dataset, line_number)[:-1]
        print("{}: {}: {}".format(line_number, diff, raw_line))
def main(argv):
    """Cross-validate the algorithms described in ``algorithms.json``.

    Every enabled algorithm in the manifest is expanded into all combinations
    of its parameter values. Each combination is cross-validated and the mean
    and standard deviation of the result are merged into
    ``experiment_results.json``, which is rewritten after every combination
    so that partial results survive an interrupted run.

    Args:
        argv: Command-line arguments. ``argv[0]`` (optional) is the number of
            folds, defaulting to 5. ``argv[1]`` (optional) is a
            case-insensitive substring; only algorithms whose name, class
            name or module contains it are run.
    """
    folds = int(argv[0]) if len(argv) > 0 else 5
    name_filter = argv[1].lower() if len(argv) > 1 else ""

    # Fields to check whether the filter, if given, appears in.
    filter_fields = ['name', 'class_name', 'module']

    # Read the manifest containing algorithm descriptions.
    with open('algorithms.json', 'r') as manifest:
        algorithms = json.load(manifest)

    # Load previous results. Only a missing/unreadable file (IOError) or
    # invalid JSON (ValueError) means "start fresh"; anything else, such as
    # a keyboard interrupt, must propagate.
    try:
        with open('experiment_results.json', 'r') as results_file:
            results = json.load(results_file)
    except (IOError, ValueError):
        results = {}

    for algorithm in algorithms:
        # Skip running the algorithm if it is disabled or the filter name does
        # not appear in any of the fields.
        if algorithm.get('disabled'):
            continue
        if name_filter and not any(name_filter in algorithm[field].lower()
                                   for field in filter_fields):
            continue

        # Convert manifest entries to classifier class and parameters
        class_name = Utilities.get_class(algorithm['module'],
                                         algorithm['class_name'])
        dense = algorithm.get('dense', False)

        # Create all possible combinations of parameters.
        parameter_space = algorithm['parameters']
        parameter_names = list(parameter_space.keys())
        parameter_combinations = itertools.product(*parameter_space.values())

        single_parameters = [
            param for param, values in parameter_space.iteritems()
            if len(values) == 1
        ]
        # Guard against empty value lists, which yield no combinations anyway.
        string_parameters = [
            param for param, values in parameter_space.iteritems()
            if values and isinstance(values[0], (str, unicode))
        ]

        for combination in parameter_combinations:
            classifier = Classifier('id')

            # Turn the selected parameter combination back into a dictionary
            parameters = dict(zip(parameter_names, combination))

            # Create the model according to the parameters
            classifier.create_model(train=False, class_name=class_name,
                                    parameters=parameters, dense=dense)

            Utilities.print_algorithm(algorithm['name'], parameters)
            parameter_string = Utilities.get_parameter_string(
                parameters, single_parameters + string_parameters)

            # Run cross-validation and print results
            result = classifier.output_cross_validate(folds)
            print('')

            # String-valued parameters become part of the result group name.
            name = algorithm['name']
            for param in string_parameters:
                name += ", %s=%s" % (param, parameters[param])

            # Write the result measurements into the results dictionary.
            if name not in results:
                results[name] = OrderedDict()

            results[name].update({
                parameter_string: {
                    'average': result.mean(),
                    'standard_deviation': result.std()
                }
            })

            # Write intermediate results (back) into a pretty-printed JSON file
            with open('experiment_results.json', 'w') as results_file:
                json.dump(results, results_file, indent=4,
                          separators=(',', ': '))
def main(argv):
    """Run the cross-validation experiment for the algorithm manifest.

    Reads algorithm descriptions from ``algorithms.json``, runs
    cross-validation for every combination of each enabled algorithm's
    parameter values, and stores the mean and standard deviation of each run
    in ``experiment_results.json``. The results file is rewritten after every
    combination so that partial results survive an interrupted run.

    Args:
        argv: Command-line arguments. ``argv[0]`` (optional) is the number of
            folds, defaulting to 5. ``argv[1]`` (optional) is a
            case-insensitive substring; only algorithms whose name, class
            name or module contains it are run.
    """
    folds = int(argv[0]) if len(argv) > 0 else 5
    name_filter = argv[1].lower() if len(argv) > 1 else ""

    # Fields to check whether the filter, if given, appears in.
    filter_fields = ['name', 'class_name', 'module']

    # Read the manifest containing algorithm descriptions.
    with open('algorithms.json', 'r') as manifest:
        algorithms = json.load(manifest)

    # Load previous results. A missing/unreadable file (IOError) or invalid
    # JSON (ValueError) resets the results; other exceptions are not
    # swallowed.
    try:
        with open('experiment_results.json', 'r') as results_file:
            results = json.load(results_file)
    except (IOError, ValueError):
        results = {}

    for algorithm in algorithms:
        # Skip running the algorithm if it is disabled or the filter name does
        # not appear in any of the fields.
        if algorithm.get('disabled'):
            continue
        if name_filter and not any(name_filter in algorithm[field].lower()
                                   for field in filter_fields):
            continue

        # Convert manifest entries to classifier class and parameters
        class_name = Utilities.get_class(algorithm['module'],
                                         algorithm['class_name'])
        dense = algorithm.get('dense', False)

        # Create all possible combinations of parameters.
        parameter_space = algorithm['parameters']
        parameter_names = list(parameter_space.keys())
        parameter_combinations = itertools.product(*parameter_space.values())

        single_parameters = [
            param for param, values in parameter_space.iteritems()
            if len(values) == 1
        ]
        # An empty value list produces no combinations, so skip it here
        # instead of failing on values[0].
        string_parameters = [
            param for param, values in parameter_space.iteritems()
            if values and isinstance(values[0], (str, unicode))
        ]

        for combination in parameter_combinations:
            classifier = Classifier('id')

            # Turn the selected parameter combination back into a dictionary
            parameters = dict(zip(parameter_names, combination))

            # Create the model according to the parameters
            classifier.create_model(train=False, class_name=class_name,
                                    parameters=parameters, dense=dense)

            Utilities.print_algorithm(algorithm['name'], parameters)
            parameter_string = Utilities.get_parameter_string(
                parameters, single_parameters + string_parameters)

            # Run cross-validation and print results
            result = classifier.output_cross_validate(folds)
            print('')

            # String-valued parameters are appended to the result group name.
            name = algorithm['name']
            for param in string_parameters:
                name += ", %s=%s" % (param, parameters[param])

            # Write the result measurements into the results dictionary.
            if name not in results:
                results[name] = OrderedDict()

            results[name].update({
                parameter_string: {
                    'average': result.mean(),
                    'standard_deviation': result.std()
                }
            })

            # Write intermediate results (back) into a pretty-printed JSON file
            with open('experiment_results.json', 'w') as results_file:
                json.dump(results, results_file, indent=4,
                          separators=(',', ': '))