def main(argv):
    """Read JSON records from stdin and print analyzer output per record."""
    # Grouping field defaults to "id" unless given on the command line.
    group = argv[0] if argv else "id"
    analyzer = Analyzer(group)
    for record in Utilities.read_json(sys.stdin, group=group):
        label, disp, message = analyzer.analyze(record["message"])
        # Prefer the record's own group field; fall back to an empty string.
        group = record.get("group", "")
        analyzer.output(group, message, label, disp)
Example #2
0
def main(argv):
    """Analyze JSON messages from stdin, emitting one output row per record."""
    group = "id" if not argv else argv[0]
    analyzer = Analyzer(group)
    for item in Utilities.read_json(sys.stdin, group=group):
        (label, disp, message) = analyzer.analyze(item["message"])
        # Use the item's group when present, otherwise an empty string.
        group = item.get("group", "")
        analyzer.output(group, message, label, disp)
Example #3
0
    def predict(self, file):
        """Return the regressor's predictions for the records read from *file*."""
        self.test_group = []

        # Stream the input: drop records rejected by the filter, then reduce
        # each surviving record to its feature form via split().
        self.test_data = (self.split(record)
                          for record in Utilities.read_json(file, 'id', self.group)
                          if self.filter(record))
        if self.display:
            # Display mode needs a materialized, reusable sequence.
            self.test_data = list(self.test_data)

        return self.regressor.predict(self.test_data)
Example #4
0
def main(argv):
    """Compare analyzer scores against classifier predictions on a dataset.

    Trains a random-forest classifier, runs both it and the rule-based
    analyzer over every record of the dump, and prints the lines (sorted by
    difference) where the classifier's prediction exceeds the analyzer's
    score by at least 1.0.
    """
    # Constants for the analyzer and the classifier
    dataset = 'commit_comments-dump.2015-01-29.json'
    group = 'id'
    model_file = 'model.pickle'

    # Create the analyzer
    analyzer = Analyzer(group)

    # Create the classifier
    algorithm_class = RandomForestRegressor
    algorithm_parameters = {
        'n_estimators': 100,
        'n_jobs': 2,
        'min_samples_split': 10
    }
    classifier = Classifier(group, model_file)
    classifier.create_model(train=True,
                            class_name=algorithm_class,
                            parameters=algorithm_parameters)

    # Compare analyzer output with classifier output and identify differences
    unrecognized_negative = {}
    unrecognized_positive = {}
    predictions = classifier.predict()
    line = 0  # Dataset line number (1-based)
    i = 0  # Prediction ID (+1)
    # Context manager ensures the dataset file is closed even on error
    # (the original leaked the handle and shadowed the `file` builtin).
    with open(dataset, 'rb') as dataset_file:
        for data in Utilities.read_json(dataset_file, 'id', group):
            line = line + 1
            if line % 1000 == 0:
                # Progress indicator for long dumps.
                print(line)
            if not classifier.filter(data):
                continue
            i = i + 1

            message = data['message']
            score = analyzer.analyze(message)[0]
            if score == 0:
                # Neutral analyzer result: nothing to compare against.
                continue

            diff = predictions[i - 1] - score
            if abs(diff) < 1.0:
                # Prediction and analyzer agree closely enough.
                continue

            # Bucket by sign of the disagreement.
            target = unrecognized_negative if diff < 0 else unrecognized_positive
            target[line] = diff

    # NOTE(review): only the positive disagreements are reported; the
    # negative bucket is collected but unused — confirm this is intended.
    result = sorted(unrecognized_positive.items(), key=lambda x: x[1])
    for item in result:
        print("{}: {}: {}".format(item[0], item[1],
                                  linecache.getline(dataset, item[0])[:-1]))
def main(argv):
    # group = argv[0] if len(argv) > 0 else "event_id"
    path = argv[0]
    year = path[-4:]
    # name = year + 'repo.csv' #2017repo.csv

    group = "event_id"
    analyzer = Analyzer("event_id")



    # for data in Utilities.read_json(sys.stdin, group=group):
    for dir in os.listdir(path):
        name = 'tmp/'+dir+'.csv' #2017-01.csv
        new_name = 'tmp2/'+dir+'.csv' #2017-01.csv
        newnew_name='Result/'+dir+'.csv'

        with open(name,"w") as csvfile: 
            writer = csv.writer(csvfile)
            writer.writerow(["repo","score","num"])
        dir_name = path + '/' + dir

        if dir == '.DS_Store': continue;
        if os.path.isdir(dir_name)== False: continue;

        for filename in os.listdir(dir_name): 
            if filename[-5:]!='.json': continue;
            fullname = dir_name+'/'+filename
            print "processing...", fullname
            f = open(fullname,'r')

            for data in Utilities.read_json(f, group=group):

                # data['message'] -- text, data['group'] -- id
                # print data
                if data['message'] == None: continue;

                (label, disp, message) = analyzer.analyze(data["message"])
                group = data["group"] if "group" in data else ""
            
                raw_time = data['time']
                time = re.findall(r"(.+?)T",raw_time)[0]+' '+re.findall(r"T(.+?)Z",raw_time)[0]
            
                repo = data['repo']

                analyzer.output(name, repo, group, message, label, disp, time) 
                # analyzer.output(group, message, label, disp)
        dataframe = pd.read_csv(name)
        dataframe = dataframe.groupby('repo').sum()
        dataframe.to_csv(new_name,index=True,sep=',')
Example #6
0
    def get_train_data(self):
        """Collect training messages and scores from the labeled dataset.

        Returns a ``(train_data, train_labels)`` tuple of parallel lists and
        records each used ID in ``self.train_ids``.  Raises ``ValueError``
        when the labeled file and the raw dataset disagree on an ID.
        """
        train_data = []
        train_labels = []
        labeled_path = self.dataset_name + ".labeled.json"
        with open(labeled_path, 'r') as f:
            # The labeled file and the raw dataset are line-aligned, so the
            # 1-based record number doubles as the raw file's line number.
            for number, data in enumerate(Utilities.read_json(f, ['id', 'label'], self.group), 1):
                score = Utilities.label_to_score(data["label"])
                if score is None:
                    # Unknown label: not usable for training.
                    continue

                # Fetch the matching raw record and cross-check its ID.
                raw_line = linecache.getline(self.dataset_name + '.json', number)
                json_object = json.loads(raw_line)
                if json_object['id'] != data['id']:
                    raise(ValueError('ID in label dataset does not match with dataset on line {}: {} vs {}'.format(number, data['id'], json_object['id'])))

                message = json_object['body'].replace('\r\n', '\n')
                self.train_ids.add(data['id'])
                train_data.append(message)
                train_labels.append(score)

        return (train_data, train_labels)