Exemplo n.º 1
0
    def merge(self, other_flow):
        """Merge the tracked patterns of ``other_flow`` into this flow.

        A pattern list made of a single empty dict (``[{}]``) is treated as
        "no patterns".  Two patterns are considered equal when the string
        forms of their ``sort_dict`` results are equal; only incoming patterns
        that are not already tracked are appended.

        Parameters
        ----------
        other_flow:
            Object exposing ``get_tracked_patterns()``.

        Returns
        -------
        bool
            ``True`` if ``self.tracked_patterns`` was modified, ``False``
            otherwise.
        """
        incoming_patterns = other_flow.get_tracked_patterns()

        if len(incoming_patterns) == 1 and incoming_patterns[0] == {}:
            # Nothing to merge, so there is no need to copy anything either.
            return False

        # Copy so that this flow never shares pattern dicts with other_flow.
        incoming_patterns = deepcopy(incoming_patterns)

        if len(self.tracked_patterns) == 1 and self.tracked_patterns[0] == {}:
            # We track nothing yet: adopt the incoming patterns wholesale.
            self.tracked_patterns = incoming_patterns
            return True

        # Canonical key of every pattern we already track, computed once
        # instead of once per incoming pattern.
        known_keys = {str(sort_dict(ours)) for ours in self.tracked_patterns}

        changed = False
        for pattern in incoming_patterns:
            key = str(sort_dict(pattern))
            if key in known_keys:
                continue
            # Remember the key so duplicates inside the incoming list are
            # also appended only once.
            known_keys.add(key)
            self.tracked_patterns.append(pattern)
            changed = True

        return changed
Exemplo n.º 2
0
 def calculate_best_precision(self, train, test, k):
     """Return, per neighbourhood size, the fraction of test samples hit.

     For every sample in ``test.samples`` and every ``i`` in ``range(1, k)``
     the per-class harmonic distances are ranked with ``sort_dict(..., 1)``
     and reversed; the sample counts as a hit for ``i`` when its class is the
     first entry of that reversed ranking.

     Returns a dict mapping ``i`` to ``hits / len(test.samples)``.  Values of
     ``i`` that never produced a hit are absent from the result.
     """
     class_sets = self.init_class_sets(train)
     hits = {}
     for sample in test.samples:
         features = sample.parameters
         true_label = sample.sampleClass

         # Local mean vectors of the nearest neighbours, one entry per class.
         local_means = {}
         for label in class_sets:
             nearest = self.find_nearest_of_class(
                 class_sets[label], features, k)
             local_means[label] = self.find_all_means(nearest)

         # Classes without any local mean are left out of the ranking.
         divisors = {}
         for label in local_means:
             if len(local_means[label]) != 0:
                 divisors[label] = self.get_harmonic_divisor_list(
                     features, local_means[label])

         for i in range(1, k):
             ranked = sort_dict(
                 self.get_harmonic_distance_precision(divisors, i), 1)
             ranked.reverse()
             if true_label == ranked[0][0]:
                 hits[i] = hits.get(i, 0.0) + 1.0

     return {i: total / len(test.samples) for i, total in hits.items()}
Exemplo n.º 3
0
 def predict(self, parametros, vizinhos, limiar, tipo):
     """Predict a class for ``parametros`` by accumulating neighbour votes.

     The first ``vizinhos`` entries of ``self.get_lista_distancia`` are used
     as neighbours.  A neighbour whose ``self.freqAparicao`` value (for the
     actual neighbour count) exceeds ``limiar`` contributes its
     ``self.hubness`` votes; any other neighbour contributes according to
     ``tipo`` ("crisp", "global", "local1" or "local2").  Any other ``tipo``
     contributes nothing for such neighbours.

     Returns the key of the first entry of ``sort_dict(votos, 1)``.
     """
     nearest = self.get_lista_distancia(parametros)[:vizinhos]
     k = len(nearest)
     votos = {classe: 0 for classe in self.classes}

     for neighbour in nearest:
         idx = neighbour[0]
         hub_votes = self.hubness[idx][k]
         neighbour_class = self.amostras[idx]["classe"]

         if self.freqAparicao[idx][k] > limiar:
             for classe in votos:
                 votos[classe] += hub_votes[classe]
             continue

         # Neighbour at or below the threshold: fall back to the scheme
         # selected by `tipo`.
         for classe in votos:
             if tipo == "crisp":
                 smoothing_total = 1.0 + len(self.classes) * self.laplace
                 if classe == neighbour_class:
                     votos[classe] += (self.laplace + 1.0) / smoothing_total
                 else:
                     votos[classe] += self.laplace / smoothing_total
             elif tipo == "global":
                 priors = self.classToClassPriorsAllK[k]
                 votos[classe] += priors[classe][neighbour_class]
             elif tipo == "local1":
                 votos[classe] += self.amostras[idx]["local1"][k][classe]
             elif tipo == "local2":
                 votos[classe] += self.amostras[idx]["local2"][k][classe]

     return sort_dict(votos, 1)[0][0]
Exemplo n.º 4
0
 def get_weighted_votes(self, neighbors):
     """Sum the weights of ``neighbors`` per class and rank the totals.

     Each neighbour is indexed as ``(weight, class)``: element ``0`` is added
     to the running total of the class found in element ``1``.

     Returns whatever ``util.sort_dict(totals, 1)`` produces for the
     class -> total mapping.
     """
     labels = np.unique(np.array([entry[1] for entry in neighbors]))
     totals = dict.fromkeys(labels, 0)
     for entry in neighbors:
         totals[entry[1]] += entry[0]
     return util.sort_dict(totals, 1)
Exemplo n.º 5
0
def generate_date_to_top_commenters(project_name, sws):
    """
    Generate a mapping from date to the top commenters of the window ending that date.

    Large change sets are not excluded because the comments made to the issues
    related to the large change sets still exist.

    Parameters
    ----------
    project_name (str):
        Name of the project.

    sws (int):
        Sliding window size, in other words the number of days to include.

    Returns
    --------
    dict:
        Mapping from date to top commenters and their numbers of comments in the
        sliding window ending that date.
    """
    issue_to_commenters = generate_issue_to_commenters(project_name)
    data_manager = DataManager(get_dataset_path(project_name), sws)

    def commenters_of(change_sets):
        # Yield every commenter entry listed for the issues of the change sets.
        for change_set in change_sets:
            for issue_id in change_set.issues:
                for commenter in issue_to_commenters.get(issue_id, []):
                    yield commenter

    # The first window only adds change sets; nothing leaves it yet.
    entering = data_manager.get_initial_window()
    leaving = ()
    comment_counts = defaultdict(lambda: 0)

    date_to_top_commenters = {}
    while True:
        for commenter in commenters_of(entering):
            comment_counts[commenter] += 1

        for commenter in commenters_of(leaving):
            comment_counts[commenter] -= 1
            # Drop commenters with nothing left in the window.
            if comment_counts[commenter] <= 0:
                del comment_counts[commenter]

        date = data_manager.get_last_included_date()
        date_to_top_commenters[date] = sort_dict(
            comment_counts, by_value=True, reverse=True
        )

        try:
            entering, leaving = data_manager.forward_one_day()
        except SlidingNotPossible:
            break

    return date_to_top_commenters
Exemplo n.º 6
0
 def get_r_closest(self, k, landmarks, r):
     """Select ``r`` landmarks based on their distance to sample ``k``.

     The Euclidean norm between ``k.parameters`` and each landmark's
     ``parameters`` is computed, the index -> distance mapping is ranked with
     ``util.sort_dict(..., 1)``, the ranking is reversed, and the landmarks
     behind its first ``r`` entries are returned in that order.
     """
     origin = np.array(k.parameters)
     distances = {}
     for idx, landmark in enumerate(landmarks):
         offset = np.subtract(origin, np.array(landmark.parameters))
         distances[idx] = np.linalg.norm(offset)

     ranked = util.sort_dict(distances, 1)
     ranked.reverse()
     return [landmarks[entry[0]] for entry in ranked[:r]]
Exemplo n.º 7
0
def read_data(file_name, has_reading=False):
    """Read the answers file and return them through ``util.sort_dict``.

    Every line must hold exactly four single-space separated fields:
    ``year cloze read_a read_b``.  The characters of ``cloze`` become the
    answer list of that year; when ``has_reading`` is true the characters of
    ``read_a`` and ``read_b`` are appended as well.
    """
    answers_by_year = {}
    with open(file_name) as handle:
        for raw_line in handle:
            year, cloze, read_a, read_b = raw_line.strip().split(' ')
            if has_reading:
                answers = list(cloze + read_a + read_b)
            else:
                answers = list(cloze)
            answers_by_year[year] = answers
    return util.sort_dict(answers_by_year)
Exemplo n.º 8
0
def generate_date_to_top_committers(project_name, sws):
    """
    Generate a mapping from date to the top committers of the window ending that date.

    TODO:
    Large change sets can be excluded.

    Parameters
    ----------
    project_name (str):
        Name of the project.

    sws (int):
        Sliding window size.

    Returns
    --------
    dict:
        Mapping from date to top committers and their numbers of commits in the
        sliding window ending that date.
    """
    data_manager = DataManager(get_dataset_path(project_name), sws)

    # The first window only adds change sets; nothing leaves it yet.
    entering = data_manager.get_initial_window()
    leaving = ()
    commit_counts = defaultdict(lambda: 0)

    date_to_top_committers = {}
    while True:
        for change_set in entering:
            commit_counts[change_set.author] += 1

        for change_set in leaving:
            committer = change_set.author
            commit_counts[committer] -= 1
            # Drop committers with nothing left in the window.
            if commit_counts[committer] <= 0:
                del commit_counts[committer]

        date = data_manager.get_last_included_date()
        ranked = sort_dict(commit_counts, by_value=True, reverse=True)
        date_to_top_committers[date] = ranked

        try:
            entering, leaving = data_manager.forward_one_day()
        except SlidingNotPossible:
            break

    return date_to_top_committers
Exemplo n.º 9
0
 def find_local_k(self, sample, k_min):
     """Pick a neighbourhood size for ``sample``.

     For each ``k`` in ``range(k_min, len(self.scores))`` the share of the
     first ``k`` entries of ``self.vizinhanca[sample]`` whose element ``1``
     equals the sample's own class is averaged with ``self.scores[k]``.

     Returns the key of the first entry of ``util.sort_dict(..., 1)`` applied
     to that k -> blended score mapping.
     """
     neighbours = self.vizinhanca[sample]
     blended = {}
     for k in range(k_min, len(self.scores)):
         same_class = sum(
             (1.0 for entry in neighbours[:k]
              if entry[1] == self.samples[sample][0].sampleClass),
             0.0)
         local_accuracy = float(same_class) / float(k)
         blended[k] = (local_accuracy + self.scores[k]) / 2
     return util.sort_dict(blended, 1)[0][0]
Exemplo n.º 10
0
 def most_freq(self):
     """Record, per problem index, the highest frequency and who reached it.

     After the call ``self.trend_freq[i]`` holds the largest ``fq[i]`` seen,
     ``self.trend_answer_head[i]`` the first answer (in iteration order) that
     reached it and ``self.trend_answer_tail[i]`` the last one.  Slots that no
     positive frequency ever touched keep ``0`` / ``'Z'``.
     """
     self.trend_freq = [0] * self.probs
     self.trend_answer_head = ['Z'] * self.probs
     self.trend_answer_tail = ['Z'] * self.probs
     # NOTE(review): `count` is not defined inside this method; it has to be
     # resolved from an enclosing/module scope - confirm this is intended.
     for answer, fq in util.sort_dict(count):
         for pos in range(self.probs):
             current = fq[pos]
             if current > self.trend_freq[pos]:
                 # New maximum: this answer is both first and last to reach it.
                 self.trend_freq[pos] = current
                 self.trend_answer_head[pos] = answer
                 self.trend_answer_tail[pos] = answer
             elif current == self.trend_freq[pos]:
                 # Tie with the current maximum: only the tail moves.
                 self.trend_answer_tail[pos] = answer
Exemplo n.º 11
0
 def predict(self, sample, k):
     """Predict the class of ``sample`` from per-class harmonic mean distances.

     For every class in ``self.class_sets`` the local mean vectors of its
     nearest neighbours are computed, then the harmonic mean distance from
     ``sample`` to those vectors.  The class -> distance mapping is ranked
     with ``sort_dict(..., 1)``, reversed, and the key of the first entry is
     returned.
     """
     local_means = {}
     for label in self.class_sets:
         neighbours = self.find_nearest_of_class(
             self.class_sets[label], sample, k)
         local_means[label] = self.find_all_means(neighbours)

     distances = {
         label: self.get_harmonic_mean_distance(sample, means)
         for label, means in local_means.items()
     }

     ranked = sort_dict(distances, 1)
     ranked.reverse()
     return ranked[0][0]
Exemplo n.º 12
0
def _generate_date_to_change_sets(dataset_path):
    """
    Generate a dictionary for the pairs of date and change sets committed that date.

    The result is memoized in ``cache`` under ``dataset_path``, so the dataset
    file is parsed at most once per path.

    Parameters
    ----------
    dataset_path (str):
        Path of the JSON dataset; it must contain a "change_sets" list.

    Returns
    -------
    dict:
        A sorted (by date) dictionary for date and change sets pairs. Days in
        the covered range without any change set map to an empty list.
    """
    if dataset_path in cache:
        return cache[dataset_path]

    with open(dataset_path, encoding="utf8") as f:
        change_set_jsons = json.load(f)["change_sets"]

    date_to_change_sets = defaultdict(list)
    for change_set_json in change_set_jsons:
        code_changes = [
            CodeChange(
                code_change["file_path"],
                code_change["change_type"],
                code_change.get("old_file_path", None),
            )
            for code_change in change_set_json["code_changes"]
        ]

        change_set = ChangeSet(
            change_set_json["commit_hash"],
            change_set_json["author"],
            max_of_day(str_to_date(change_set_json["date"])),
            change_set_json["issues"],
            code_changes,
            change_set_json["num_current_files"],
        )

        date_to_change_sets[max_of_day(change_set.date)].append(change_set)

    # Fill the blanks with empty lists. The bounds come from min/max rather
    # than from insertion order, so the gaps are filled even when the dataset
    # is not stored in ascending date order. An empty dataset has no range.
    if date_to_change_sets:
        date = min(date_to_change_sets)
        last_date = max(date_to_change_sets)
        while date < last_date:
            date_to_change_sets.setdefault(date, [])
            date += timedelta(days=1)

    change_sets = sort_dict(date_to_change_sets)
    cache[dataset_path] = change_sets
    return change_sets
Exemplo n.º 13
0
async def main():
    """Run the command selected on the command line.

    ``topbrawlers`` fetches ``--count`` profiles and draws a bar graph, saved
    to ``--out`` when that option is given and shown otherwise.
    ``brawlerstats`` fetches per-brawler stats of the requested ``--type``
    ("mode" or "mean") and shows a line graph.

    The API client is closed on every exit path, including errors and the
    early exit taken for an invalid ``--type``.
    """
    args = docopt(doc)
    bs = api.API(token)
    try:
        limit = int(args.get('--count'))

        if args.get('topbrawlers') is True:
            bar = ChargingBar(f'Fetching {limit} profiles...',
                              max=limit,
                              suffix='%(percent).1f%% - %(eta)ds')
            top_brawlers = util.sort_dict(await
                                          bs.top_brawlers(limit=limit,
                                                          cb=lambda: bar.next()))
            datavis.bar_graph(top_brawlers,
                              ylabel='OK',
                              title='Top brawlers in the leaderboard')
            if args.get('--out') is not None:
                plt.savefig(args.get('--out'))
            else:
                plt.show()

        elif args.get('brawlerstats') is True:
            num_brawlers = 24  # TODO: change this when Bibi is added ;)
            _type = args.get('--type')
            if _type not in ['mode', 'mean']:
                print('--type must be either "mode" or "mean"')
                # NOTE(review): exits with status 0 although the argument
                # was rejected - confirm whether a non-zero status is wanted.
                exit(0)
            bar = ChargingBar(f'Getting stats for {num_brawlers} brawlers.',
                              max=num_brawlers,
                              suffix='%(percent).1f%% - %(eta)ds')
            dataset = util.sort_dict(await bs.brawler_stats(limit=limit,
                                                            cb=lambda: bar.next(),
                                                            _type=_type))
            datavis.line_graph(dataset)
            plt.show()
    finally:
        # Release the client even when a request fails or we exit early.
        await bs.client.close()
Exemplo n.º 14
0
 def find_best_k(self):
     """Choose ``k`` from a 10-fold evaluation of the training set.

     The per-fold results of ``calculate_best_precision`` are combined with
     ``self.addDictionaries`` and ranked with ``sort_dict(..., 1)``.  The
     score of the first ranked entry is the reference; among all entries
     sharing that score, the key of the one ranked last is returned.

     Returns ``-1`` only if no entry matches the reference score.
     """
     folds = self.trainset.n_folds(10)
     merged = {}
     for held_out in range(10):
         test, train = split_train_test(folds, held_out)
         fold_scores = self.calculate_best_precision(train, test, self.k_max)
         merged = self.addDictionaries(merged, fold_scores)

     ranked = sort_dict(merged, 1)
     reference_score = ranked[0][1]
     # Walk the ranking backwards so ties resolve to the last-ranked key.
     for entry in reversed(ranked):
         if entry[1] == reference_score:
             return entry[0]
     return -1
Exemplo n.º 15
0
def outline(datafile):
    """Overall distribution of the 45 multiple-choice questions.

    Reads ``datafile`` with ``read_data(..., has_reading=True)`` and prints
    two reports: one line per year with the ``freq`` result of three answer
    slices, then one line per answer with the first 20 entries of its
    ``calc_freq`` frequencies.

    NOTE(review): this block uses Python 2 print statements.
    """
    data = read_data(datafile, has_reading=True)

    # Section banner for the per-year report.
    print util.cutting_line('历年答案 ABCD 比例分布')

    def fmt(lst):
        # Render a list without its surrounding square brackets.
        return str(lst).strip('[]')

    # Columns: year, answers[:40], answers[:20], answers[20:40].
    print 'year | total | Cloze | Reading A'
    for year, answer in data:
        print '%s | %s | %s | %s' % (year, fmt(freq(answer[:40])), fmt(freq(answer[:20])), fmt(freq(answer[20:40])))

    # Section banner for the per-answer report.
    print util.cutting_line('完型各选项出现次数统计')
    count = calc_freq(data)
    for answer, fq in util.sort_dict(count):
        # Only the first 20 positions are shown for each answer.
        print answer, fq[:20]
Exemplo n.º 16
0
    def find_global_accuracies(self):
        """Rank candidate ``k`` values by cross-validated k-NN accuracy.

        For every ``k`` in ``range(self.k_min, self.k_max)`` a brute-force
        ``KNeighborsClassifier`` is fitted and scored on each fold of
        ``self.samples.n_folds(10)``; the summed score is divided by 10.

        Returns ``1`` when ``k_min`` and ``k_max`` coincide, otherwise the key
        of the first entry of ``util.sort_dict(..., 1)`` applied to the
        k -> accuracy mapping.
        """
        folds = self.samples.n_folds(10)
        if (self.k_max - self.k_min) == 0:
            return 1

        def fold_score(n_neighbors, fold_index):
            # Fit on the training part of the split, score on the held-out part.
            test, train = util.split_train_test(folds, fold_index)
            model = KNeighborsClassifier(n_neighbors=n_neighbors,
                                         algorithm='brute')
            model.fit(train.get_X(), train.get_y())
            return float(model.score(test.get_X(), test.get_y()))

        accuracy_by_k = {}
        for n_neighbors in range(self.k_min, self.k_max):
            total = sum(fold_score(n_neighbors, fold_index)
                        for fold_index in range(0, len(folds)))
            accuracy_by_k[n_neighbors] = total / 10

        return util.sort_dict(accuracy_by_k, 1)[0][0]
Exemplo n.º 17
0
    def _sort_and_filter(self, d):
        """
        Return the entries of `d` that reach the score threshold, in sorted order.

        Keys are visited in the order produced by
        ``sort_dict(d, by_value=True, reverse=True)``; an entry is kept when its
        value is at least ``self._score_threshold``.

        Parameters
        ----------
        d (dict):
            Any dictionary.

        Returns
        -------
        dict:
            Sorted and filtered copy of the given dictionary.
        """
        kept = {}
        for key in sort_dict(d, by_value=True, reverse=True):
            if d[key] >= self._score_threshold:
                kept[key] = d[key]
        return kept
Exemplo n.º 18
0
 def inicialize_variables(self, limiarMax, min_k, max_k):
     """Grid-search threshold, ``k`` and vote type; store the winning setup.

     For every threshold in ``range(0, limiarMax)`` and every ``k`` in
     ``range(min_k, max_k)`` each stored sample is predicted with the four
     vote types and the number of correct predictions per type is counted.
     The first entry of ``sort_dict(counts, 1)`` is compared with the best
     seen so far; on a tie the later combination wins.

     Sets ``self.bThetha``, ``self.bK`` and ``self.bType``.  When no
     combination is evaluated they end up as ``-1``, ``-1`` and ``""``.
     """
     best_acc = -1.0
     best_type = ""
     best_k = -1
     best_thetha = -1

     for thethaLimiar in range(0, limiarMax):
         for k in range(min_k, max_k):
             acuracias = {
                 "crisp": 0.0,
                 "local1": 0.0,
                 "local2": 0.0,
                 "global": 0.0,
             }
             for amostra in self.amostras:
                 registro = self.amostras[amostra]
                 classe = registro["classe"]
                 parametros = registro["parametros"]
                 # Prediction order is crisp, global, local1, local2.
                 for tipo in ("crisp", "global", "local1", "local2"):
                     predita = self.predict(parametros, k, thethaLimiar, tipo)
                     if classe == predita:
                         acuracias[tipo] += 1.0

             ranked = sort_dict(acuracias, 1)
             if ranked[0][1] >= best_acc:
                 best_acc = ranked[0][1]
                 best_k = k
                 best_type = ranked[0][0]
                 best_thetha = thethaLimiar

     self.bThetha = best_thetha
     self.bK = best_k
     self.bType = best_type