def main(argv):
    """Analyze every record read from standard input and emit the result.

    argv: command-line arguments; the first one, when present, names the
        grouping key handed to both ``Analyzer`` and ``Utilities.read_json``.
        Without arguments the key defaults to ``"id"``.
    """
    if len(argv) > 0:
        group = argv[0]
    else:
        group = "id"

    analyzer = Analyzer(group)
    records = Utilities.read_json(sys.stdin, group=group)
    for data in records:
        label, disp, message = analyzer.analyze(data["message"])
        # Records without a "group" entry are reported under an empty group.
        if "group" in data:
            record_group = data["group"]
        else:
            record_group = ""
        analyzer.output(record_group, message, label, disp)
def main(argv):
    """Read records from stdin, analyze each message and output the verdict.

    argv: command-line arguments. ``argv[0]`` (optional) is the grouping key
        passed to ``Analyzer`` and ``Utilities.read_json``; it falls back to
        ``"id"`` when no argument is given.
    """
    group_key = "id"
    if len(argv) > 0:
        group_key = argv[0]

    analyzer = Analyzer(group_key)
    for entry in Utilities.read_json(sys.stdin, group=group_key):
        analysis = analyzer.analyze(entry["message"])
        (label, disp, message) = analysis
        # Fall back to an empty group when the record carries none.
        entry_group = ""
        if "group" in entry:
            entry_group = entry["group"]
        analyzer.output(entry_group, message, label, disp)
def predict(self, file):
    """Run the regressor on the records read from ``file``.

    Resets ``self.test_group`` to an empty list, reads records through
    ``Utilities.read_json(file, 'id', self.group)``, keeps those accepted
    by ``self.filter`` and transforms each with ``self.split``. The
    resulting sequence is stored in ``self.test_data`` and handed to
    ``self.regressor.predict``, whose return value is returned.

    file: open file-like object passed straight to ``Utilities.read_json``.
    """
    self.test_group = []
    records = Utilities.read_json(file, 'id', self.group)
    # A generator expression keeps the pipeline lazy, exactly like the
    # former itertools.imap/ifilter chain, but also runs on Python 3 where
    # those two functions no longer exist.
    self.test_data = (
        self.split(record) for record in records if self.filter(record)
    )
    if self.display:
        # Materialize the data so it is still available after prediction
        # (presumably for displaying it later; verify against callers).
        self.test_data = list(self.test_data)
    return self.regressor.predict(self.test_data)
def main(argv):
    """Report dataset lines the classifier scores well above the analyzer.

    Trains a ``Classifier`` backed by ``RandomForestRegressor``, then walks
    the dataset and compares each prediction with the score returned by
    ``Analyzer.analyze``. Records whose scores differ by at least 1.0 are
    collected; those where the prediction is higher are printed, sorted by
    the size of the difference, together with the raw dataset line.

    argv: command-line arguments (currently unused).
    """
    # Constants for the analyzer and the classifier
    dataset = 'commit_comments-dump.2015-01-29.json'
    group = 'id'
    model_file = 'model.pickle'

    # Create the analyzer
    analyzer = Analyzer(group)

    # Create the classifier
    algorithm_class = RandomForestRegressor
    algorithm_parameters = {
        'n_estimators': 100,
        'n_jobs': 2,
        'min_samples_split': 10
    }
    classifier = Classifier(group, model_file)
    classifier.create_model(train=True, class_name=algorithm_class,
                            parameters=algorithm_parameters)

    # Compare analyzer output with classifier output and identify differences
    unrecognized_negative = {}  # collected but not reported below
    unrecognized_positive = {}
    predictions = classifier.predict()
    prediction_index = 0  # index of the next prediction to consume

    # The context manager closes the dataset even if analysis raises; the
    # file used to be opened without ever being closed.
    with open(dataset, 'rb') as dataset_file:
        records = Utilities.read_json(dataset_file, 'id', group)
        # line_number is used as a 1-based dataset line number below, which
        # assumes read_json yields one record per line -- TODO confirm.
        for line_number, data in enumerate(records, 1):
            if line_number % 1000 == 0:
                print(line_number)  # progress indicator

            if not classifier.filter(data):
                continue

            # Every record that passes the filter consumes one prediction,
            # even when it is skipped further down.
            current = prediction_index
            prediction_index += 1

            score = analyzer.analyze(data['message'])[0]
            if score == 0:
                continue

            diff = predictions[current] - score
            if abs(diff) < 1.0:
                continue

            target = unrecognized_negative if diff < 0 else unrecognized_positive
            target[line_number] = diff

    result = sorted(unrecognized_positive.items(), key=lambda x: x[1])
    for line_number, diff in result:
        # [:-1] drops the trailing newline of the line from linecache.
        print("{}: {}: {}".format(line_number, diff,
                                  linecache.getline(dataset, line_number)[:-1]))
def main(argv):
    """Analyze the JSON event files under ``argv[0]`` and aggregate per repo.

    For every sub-directory ``<d>`` of the given path:

    1. ``tmp/<d>.csv`` is created with the header ``repo,score,num``.
    2. Each ``*.json`` file inside the directory is read with
       ``Utilities.read_json``; every record with a message is analyzed and
       passed to ``analyzer.output`` together with the CSV name
       (presumably ``output`` appends a row to that file -- verify).
    3. The CSV is loaded with pandas, summed per ``repo`` and written to
       ``tmp2/<d>.csv``.

    argv: command-line arguments; ``argv[0]`` is the root directory.

    Raises IndexError when ``argv`` is empty or when a record's ``time``
    field does not contain the ``...T...Z`` shape the regexes expect.
    """
    path = argv[0]
    group = "event_id"
    analyzer = Analyzer("event_id")

    for entry in os.listdir(path):
        dir_name = path + '/' + entry
        # Skip non-directories *before* creating any output, so stray files
        # such as .DS_Store no longer leave an empty CSV behind in tmp/.
        if entry == '.DS_Store' or not os.path.isdir(dir_name):
            continue

        name = 'tmp/' + entry + '.csv'       # e.g. tmp/2017-01.csv
        new_name = 'tmp2/' + entry + '.csv'  # e.g. tmp2/2017-01.csv

        with open(name, "w") as csvfile:
            csv.writer(csvfile).writerow(["repo", "score", "num"])

        for filename in os.listdir(dir_name):
            if not filename.endswith('.json'):
                continue
            fullname = dir_name + '/' + filename
            # Same text as the old print statement, valid on Python 2 and 3.
            print("processing... %s" % fullname)

            # Close each input file once it has been consumed.
            with open(fullname, 'r') as f:
                for data in Utilities.read_json(f, group=group):
                    # data['message'] -- text, data['group'] -- id
                    if data['message'] is None:
                        continue
                    (label, disp, message) = analyzer.analyze(data["message"])
                    # Use a separate name: rebinding `group` here used to
                    # leak a record's id into read_json() for the next file.
                    record_group = data["group"] if "group" in data else ""
                    raw_time = data['time']
                    # Turn "<date>T<time>Z" into "<date> <time>".
                    timestamp = (re.findall(r"(.+?)T", raw_time)[0] + ' ' +
                                 re.findall(r"T(.+?)Z", raw_time)[0])
                    repo = data['repo']
                    analyzer.output(name, repo, record_group, message,
                                    label, disp, timestamp)

        # Aggregate the per-record rows into one summed row per repository.
        dataframe = pd.read_csv(name)
        dataframe = dataframe.groupby('repo').sum()
        dataframe.to_csv(new_name, index=True, sep=',')
def get_train_data(self):
    """Collect the labeled training messages and their scores.

    Reads ``<dataset_name>.labeled.json`` record by record. The n-th
    labeled record is matched against the n-th line of
    ``<dataset_name>.json``; the ids of both must agree. Records whose
    label maps to no score (``Utilities.label_to_score`` returns None) are
    skipped. Each accepted id is added to ``self.train_ids``.

    Returns a tuple ``(train_data, train_labels)`` of two parallel lists:
    message bodies with ``\\r\\n`` normalized to ``\\n``, and their scores.

    Raises ValueError when the ids of the two files disagree on a line.
    """
    labeled_path = self.dataset_name + ".labeled.json"
    dataset_path = self.dataset_name + '.json'

    messages = []
    scores = []
    with open(labeled_path, 'r') as labeled_file:
        labeled_records = Utilities.read_json(labeled_file, ['id','label'], self.group)
        # Line numbers are 1-based because linecache.getline expects that.
        for line_number, labeled in enumerate(labeled_records, 1):
            score = Utilities.label_to_score(labeled["label"])
            if score is None:
                # Unknown label: nothing to train on.
                continue

            raw_line = linecache.getline(dataset_path, line_number)
            original = json.loads(raw_line)
            if original['id'] != labeled['id']:
                raise ValueError('ID in label dataset does not match with dataset on line {}: {} vs {}'.format(line_number, labeled['id'], original['id']))

            body = original['body'].replace('\r\n', '\n')
            self.train_ids.add(labeled['id'])
            messages.append(body)
            scores.append(score)

    return (messages, scores)