Example #1
    def finalize(self):
        # Merge any clusters that share a mention, so every mention ends up in exactly one cluster.
        merged_clusters = []
        for c1 in self.clusters.values():
            existing = None
            for m in c1:
                for c2 in merged_clusters:
                    if m in c2:
                        existing = c2
                        break
                if existing is not None:
                    break
            if existing is not None:
                print("Merging clusters (shouldn't happen very often.)")
                existing.update(c1)
            else:
                merged_clusters.append(set(c1))
        merged_clusters = [list(c) for c in merged_clusters]
        # Sanity check: no mention may appear in more than one merged cluster.
        all_mentions = flatten_list_of_lists(merged_clusters)
        assert len(all_mentions) == len(set(all_mentions))

        return {
            "doc_key": self.doc_key,
            "sentences": self.sentences,
            "speakers": self.speakers,
            "constituents": self.span_dict_to_list(self.constituents),
            "ner": self.span_dict_to_list(self.ner),
            "clusters": merged_clusters
        }
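
Every example on this page calls a flatten_list_of_lists helper imported from the respective project's utilities. The helper itself is not shown anywhere here; as a rough sketch (an assumption, not any project's actual implementation), it concatenates one level of nesting:

def flatten_list_of_lists(list_of_lists):
    """Concatenate one level of nesting: [[a, b], [c]] -> [a, b, c]."""
    return [item for sublist in list_of_lists for item in sublist]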
Example #2
def _parse_jsonlines(self, file_path):
    # Read one JSON document per line; also track the corpus-wide maxima returned alongside the examples.
    examples = []
    max_mention_num = -1
    max_cluster_size = -1
    max_num_clusters = -1
    with open(file_path, 'r') as f:
        for line in f:
            d = json.loads(line.strip())
            doc_key = d["doc_key"]
            input_words = flatten_list_of_lists(d["sentences"])
            clusters = d["clusters"]
            max_mention_num = max(max_mention_num, len(flatten_list_of_lists(clusters)))
            max_cluster_size = max(max_cluster_size, max(len(cluster) for cluster in clusters) if clusters else 0)
            max_num_clusters = max(max_num_clusters, len(clusters) if clusters else 0)
            speakers = flatten_list_of_lists(d["speakers"])
            examples.append((doc_key, input_words, clusters, speakers))
    return examples, max_mention_num, max_cluster_size, max_num_clusters
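
For reference, a made-up document in the shape _parse_jsonlines expects (the keys match what the parser reads; the values and the file name are hypothetical), with one such JSON object per line of the input file:

import json

sample_doc = {
    "doc_key": "doc_0",                               # hypothetical document id
    "sentences": [["John", "lives", "here", "."]],    # tokenized sentences
    "speakers": [["spk1", "spk1", "spk1", "spk1"]],   # one speaker label per token
    "clusters": [[[0, 0]]],                           # clusters of [start, end] token spans
}
with open("toy.jsonlines", "w") as f:
    f.write(json.dumps(sample_doc) + "\n")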
Example #3
def flatten_holroyd_data(not_flat_list):
    """
    :param not_flat_list: the data structure used for data in this class: a list of 4 lists
        (one per sequence) of 2D ndarrays
    :return: a flat list of 1D ndarrays (the first row of each input array)
    """
    flat_list = utils.flatten_list_of_lists(not_flat_list)
    return [element[0] for element in flat_list]
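
A small illustration of the shape transformation, on toy data rather than the class's real sequences:

import numpy as np

# Toy input: two sequences, each a list of 2D arrays.
toy = [
    [np.array([[1, 0], [0, 1]]), np.array([[0, 1], [1, 0]])],
    [np.array([[1, 1], [0, 0]])],
]
# flatten_holroyd_data(toy) would return [array([1, 0]), array([0, 1]), array([1, 1])],
# i.e. the first row of every 2D array, in order.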
Example #4
def sort_specs_by_order_label(allspecs=None,
                              all_order_labels=None,
                              rvcor_spectra=True):
    if allspecs is None:
        _, allspecs = load_master_table_and_spectra(
            rvcor_spectra=rvcor_spectra)
    if all_order_labels is None:
        _, all_order_labels = load_order_labels()
    ## Verify that everything is the same size
    assert len(allspecs) == len(all_order_labels), (len(allspecs),
                                                    len(all_order_labels))
    for allspec, order_labels in zip(allspecs, all_order_labels):
        assert len(allspec) == len(order_labels), (len(allspec),
                                                   len(order_labels))
    ## Make a dictionary mapping each unique order label to the spectra that carry it
    unique_labels = np.unique(utils.flatten_list_of_lists(all_order_labels))
    sorted_specs = dict(zip(unique_labels, [[] for _ in unique_labels]))
    for allspec, order_labels in zip(allspecs, all_order_labels):
        for spec, order_label in zip(allspec, order_labels):
            sorted_specs[order_label] += [spec]
    return sorted_specs
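
A hypothetical call with explicit arguments (placeholder strings standing in for spectrum objects, so neither load_master_table_and_spectra nor load_order_labels is needed):

allspecs = [["spec_a1", "spec_a2", "spec_a3"],
            ["spec_b1", "spec_b2", "spec_b3"]]
all_order_labels = [["blue", "green", "red"],
                    ["blue", "green", "red"]]
sorted_specs = sort_specs_by_order_label(allspecs=allspecs,
                                         all_order_labels=all_order_labels)
# -> {"blue": ["spec_a1", "spec_b1"], "green": ["spec_a2", "spec_b2"], "red": ["spec_a3", "spec_b3"]}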
Example #5
def extract_boundries_mention_indices(self, clusters):
    # Flatten all clusters into one mention list, pad it, and unzip into parallel index sequences.
    return zip(*self.pad_mentions(flatten_list_of_lists(clusters)))
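
pad_mentions is not shown here; the zip(*...) step itself just transposes a list of pairs into parallel tuples, shown below on made-up (start, end) spans:

padded_mentions = [(0, 0), (3, 5), (7, 7), (0, 0)]   # made-up (start, end) pairs after padding
starts, ends = zip(*padded_mentions)
# starts == (0, 3, 7, 0), ends == (0, 5, 7, 0)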
Example #6
def main():
    verbose = False
    error_type = "mse"  # mse or cross_entropy
    num_networks = 10
    num_accurate_networks = 0
    training_steps_per_network = 10000

    spearman_matrix = np.zeros([24, 24])
    euclidian_matrix = np.zeros([24, 24])
    spearman_matrix_s = np.zeros([24, 24])
    euclidian_matrix_s = np.zeros([24, 24])
    for i in range(num_networks):
        print("\nTraining starts for network {}".format(i+1))
        running_avg_loss = 0.
        holroyd_net = Holroyd2018Network()
        print("Iteration\tLoss")
        for j in range(training_steps_per_network):
            idx = np.random.randint(4)
            loss, _, _ = holroyd_net.full_sequence(holroyd_net.input_sequences_one_hot[idx],
                                                   holroyd_net.output_sequences_one_hot[idx],
                                                   error_type)
            # Smooth the reported loss with an exponential moving average,
            # using faster-moving averages during the first iterations.
            if j == 0:
                running_avg_loss = loss
            elif j < 10:
                running_avg_loss = running_avg_loss * 0.5 + 0.5 * loss
            elif j < 100:
                running_avg_loss = running_avg_loss * 0.8 + 0.2 * loss
            else:
                running_avg_loss = running_avg_loss * 0.99 + 0.01 * loss

            # Log at step 1, at powers of 4 below step 1000, and at every 1000th step.
            if j == 0 or (j+1) % 1000 == 0 or (j < 1000 and math.log(j+1, 4) % 1 == 0):
                print("{0:5d}:\t\t{1:8.4f}".format(j+1, running_avg_loss))

        # Freeze the network
        holroyd_net.learning_rate = 0.
        # holroyd_net.show_results(error_type, verbose=False)
        spearman_matrix += holroyd_net.generate_rdm(error_type, distance_type="spearman_rho")
        euclidian_matrix += holroyd_net.generate_rdm(error_type, distance_type="euclidian")
        if holroyd_net.test_accuracy(error_type):
            print(colored("Accuracy test for network {}: PASSED".format(i+1), 'green'))
            spearman_matrix_s += holroyd_net.generate_rdm(error_type, distance_type="spearman_rho")
            euclidian_matrix_s += holroyd_net.generate_rdm(error_type, distance_type="euclidian")
            num_accurate_networks += 1
        else:
            print(colored("Accuracy test for network {}: FAILED".format(i+1), 'red'))

    if num_accurate_networks == 0:
        print("Not a single network was accurate... Exiting")
        return
    else:
        print("Number of accurate networks out of 100: {0}".format(num_accurate_networks))

    # Average the Spearman and Euclidian RDMs: one pair over all networks, one over only the accurate ones
    spearman_matrix = spearman_matrix / num_networks
    euclidian_matrix = euclidian_matrix / num_networks
    spearman_matrix_s = spearman_matrix_s / num_accurate_networks
    euclidian_matrix_s = euclidian_matrix_s / num_accurate_networks

    net = Holroyd2018Network() # This is just to have access to the input sequences strings. Not elegant but... oh well
    x_labels = utils.flatten_list_of_lists(net.input_sequences_strings)
    y_labels = x_labels  # labels for y-axis

    sns.heatmap(euclidian_matrix, cbar=True, square=True, xticklabels=x_labels, yticklabels=y_labels)
    plt.title("Euclidian RDM, all nets")
    plt.show()

    sns.heatmap(euclidian_matrix_s, cbar=True, square=True, xticklabels=x_labels, yticklabels=y_labels)
    plt.title("Euclidian RDM, only accurate nets")
    plt.show()

    sns.heatmap(spearman_matrix, cbar=True, square=True, xticklabels=x_labels, yticklabels=y_labels, vmin=0., vmax=1.5)
    plt.title("Spearman RDM, all nets")
    plt.show()

    sns.heatmap(spearman_matrix_s, cbar=True, square=True, xticklabels=x_labels, yticklabels=y_labels,
                vmin=0., vmax=1.5)
    plt.title("Spearman RDM, only accurate nets")
    plt.show()

    # export data to csv
    np.savetxt("euclidian_dist_matrix.csv", euclidian_matrix, delimiter=",")
    np.savetxt("spearmanrho_dist_matrix.csv", spearman_matrix, delimiter=",")
    np.savetxt("euclidian_dist_matrix_s.csv", euclidian_matrix_s, delimiter=",")
    np.savetxt("spearmanrho_dist_matrix_s.csv", spearman_matrix_s, delimiter=",")