def process() -> tuple:
    X, Y = dataset_helper.get_dataset_cached(graph_cache_file)
    X = graph_helper.get_graphs_only(X)
    graph_helper.convert_graphs_to_adjs_tuples(X)
    # Replace every node label with the same dummy label (0)
    X = [(x, [0] * len(y)) for x, y in X]
    estimator, params = task_helper.get_graph_estimator_and_params(X, Y)
    return ClassificationData(X, Y, estimator, params)
def transform(self, X, y=None, **fit_params):
    if self.from_nx_to_tuple:
        # networkx graphs -> (adjacency, node_labels) tuples, in place
        X = graph_helper.get_graphs_only(X)
        graph_helper.convert_graphs_to_adjs_tuples(X)
    else:
        # (adjacency, node_labels) tuples -> networkx graphs
        graph_helper.convert_adjs_tuples_to_graphs(X)
    return X
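The two branches above convert between networkx graphs and (adjacency, node_labels) tuples. Below is a minimal round-trip sketch using the same graph_helper calls; it assumes the project's graph_helper module is importable and that convert_graphs_to_adjs_tuples mutates the list in place, as the call without assignment suggests.

# Sketch only: relies on the project's graph_helper module shown above.
import networkx as nx

g = nx.Graph()
g.add_edges_from([('a', 'b'), ('b', 'c')])

X = graph_helper.get_graphs_only([g])
graph_helper.convert_graphs_to_adjs_tuples(X)   # now [(adjacency, node_labels)]
graph_helper.convert_adjs_tuples_to_graphs(X)   # back to networkx graphs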
Example #3
def transform(self, X, y=None, **fit_params):
    assert len(X)
    X = graph_helper.get_graphs_only(X)
    if self.copy:
        X = _copy_graphs(X)
    # Strip the labels collected in fit() from every graph
    X = [_remove_label(x, self.labels_to_be_removed_) for x in X]
    return X
Example #4
def get_concept_map_for_dataset(dataset_name: str, graphs_only: bool = False):
    candidates = get_all_cached_graph_datasets(dataset_name, TYPE_CONCEPT_MAP)
    assert len(candidates)
    cmap_file = sorted(candidates)[-1]
    X, Y = get_dataset_cached(cmap_file)
    if graphs_only:
        X = graph_helper.get_graphs_only(X)
    return X, Y
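A short usage sketch for the helper above. The dataset name 'ling-spam' is borrowed from a later snippet, and accessing the function through dataset_helper matches how it is called elsewhere in these examples.

# Sketch only: assumes a concept-map cache for 'ling-spam' exists on disk.
X, Y = dataset_helper.get_concept_map_for_dataset('ling-spam', graphs_only=True)
assert len(X) == len(Y)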
def process_dataset(cache_file, label_lookup_file, args):
    dataset = filename_utils.get_dataset_from_filename(cache_file)

    cache_filename = filename_utils.get_filename_only(cache_file,
                                                      with_extension=False)

    threshold, topn = filename_utils.get_topn_threshold_from_lookupfilename(
        label_lookup_file)

    result_file = cache_file.replace(
        dataset,
        'relabeled_threshold_{}_topn_{}_{}'.format(threshold, topn, dataset))
    if not args.force and os.path.exists(result_file):
        return

    with open(label_lookup_file, 'rb') as f:
        label_lookup = pickle.load(f)

    X, Y = dataset_helper.get_dataset_cached(cache_file)
    X = graph_helper.get_graphs_only(X)

    # Collect node labels that occur at most args.max_occurrence times
    node_labels = list(chain.from_iterable([x.nodes() for x in X]))
    unique_labels = set(node_labels)
    counter = collections.Counter(node_labels)

    node_labels_to_be_renamed = {
        label for label, occurrences in counter.items()
        if occurrences <= args.max_occurrence
    }

    # Restrict the label lookup to the infrequent labels
    lookup_ = {
        label: new_label
        for label, new_label in label_lookup.items()
        if label in node_labels_to_be_renamed
    }

    # Also collect every label that maps onto one of those new labels
    new_labels = set(lookup_.values())
    lookup__ = collections.defaultdict(list)

    for label, new_label in label_lookup.items():
        if new_label in new_labels:
            lookup__[label].append(new_label)

    lookup_ = dict(lookup_, **lookup__)

    LOGGER.info(
        '{:80} topn={:4} threshold={:4}\n\t\t#relabeled labels: {}\n\t\t#unique labels: {}\n\t\t#nodes: {}'
        .format(cache_filename, topn, threshold, len(lookup_),
                len(unique_labels), len(node_labels)))

    relabel_trans = transformers.RelabelGraphsTransformer(lookup_)

    X = relabel_trans.transform(X)

    with open(result_file, 'wb') as f:
        pickle.dump((X, Y), f)
def process_relabeled():
    X, Y = dataset_helper.get_dataset_cached(graph_cache_file)
    X = graph_helper.get_graphs_only(X)
    estimator, params = task_helper.get_graph_estimator_and_params(X, Y)
    params['graph_preprocessing'] = [transformers.RelabelGraphsTransformer()]
    params['graph_preprocessing__dataset'] = [dataset]
    params['graph_preprocessing__threshold'] = [0.99]
    params['graph_preprocessing__topn'] = [10]
    return ClassificationData(X, Y, estimator, params)
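The double-underscore keys above follow scikit-learn's parameter-grid convention, which suggests the returned estimator and params are meant to drive a grid search. Below is a minimal sketch under that assumption; GridSearchCV is standard scikit-learn, everything else comes from the snippet.

# Sketch only: ClassificationData is unpacked positionally, matching the
# (X, Y, estimator, params) order used when it is constructed above.
from sklearn.model_selection import GridSearchCV

X, Y, estimator, params = process_relabeled()
grid = GridSearchCV(estimator, param_grid=params, cv=3, n_jobs=-1)
grid.fit(X, Y)
print(grid.best_params_)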
Example #7
def get_graphs(dataset='ling-spam'):
    # Module-level cache: load and convert the graphs only once
    global X_graphs
    if X_graphs is not None:
        return X_graphs

    X, _ = dataset_helper.get_concept_map_for_dataset(dataset)
    X = graph_helper.get_graphs_only(X)
    X_graphs = X
    return X
def transform(self, X, y=None, **fit_params):
    assert len(X)

    X = graph_helper.get_graphs_only(X)

    if self.copy:
        X = _copy_graphs(X)

    # Keep only the node labels seen during fit()
    X = [_keep_only_train_labels(x, self.train_labels_) for x in X]
    return X
def _retrieve_node_weights_and_convert_graphs(X,
                                              node_weight_function=None,
                                              same_label=False,
                                              use_directed=True,
                                              use_nx=True):
    X = graph_helper.get_graphs_only(X)
    if not use_directed:
        if use_nx:
            X = [nx.Graph(x) for x in X]
            assert not np.any([x.is_directed() for x in X])
        else:
            raise NotImplementedError(
                '!use_directed and !use_nx not implemented')
    node_weight_factors = get_node_weight_factors(X,
                                                  metric=node_weight_function)
    X = graph_helper.convert_graphs_to_adjs_tuples(X, copy=True)
    if same_label:
        # Discard label information: give every node the same dummy label
        X = [(adj, ['dummy'] * len(labels)) for adj, labels in X]

    return X, node_weight_factors
Example #10
def process_dataset(dataset_name, args):
    LOGGER.info('{:15} - Start'.format(dataset_name))
    LOGGER.info('{:15} - Retrieving trained embedding'.format(dataset_name))

    pre_trained_embedding = embeddings.get_embedding_model(
        args.pre_trained_embedding,
        binary=False,
        first_line_header=True,
        with_gensim=True)

    try:
        trained_embedding = dataset_helper.get_w2v_embedding_for_dataset(
            dataset_name)
    except FileNotFoundError as e:
        LOGGER.exception(e)
        return

    cmap_cache_files = dataset_helper.get_all_cached_graph_datasets(
        dataset_name=dataset_name, graph_type=constants.TYPE_CONCEPT_MAP)

    coo_cache_files = [
        x for x in dataset_helper.get_all_cached_graph_datasets(
            dataset_name=dataset_name, graph_type=constants.TYPE_COOCCURRENCE)
        if 'all' in x
    ]

    if not len(cmap_cache_files) or not len(coo_cache_files):
        return

    used_graphs = [cmap_cache_files[0], coo_cache_files[0]]

    LOGGER.info('{:15} - Retrieving dataset'.format(dataset_name))
    all_labels = set()
    for graph_cache_file in used_graphs:
        X, _ = dataset_helper.get_dataset_cached(graph_cache_file)
        X = graph_helper.get_graphs_only(X)
        all_labels |= graph_helper.get_all_node_labels_uniq(
            X, as_sorted_list=False)

    LOGGER.info('{:15} - Resolving embeddings'.format(dataset_name))
    (embeddings_pre_trained, not_found_pre_trained_coreferenced,
     not_found_trained, not_found_pre_trained, lookup,
     similar_els) = embeddings.get_embeddings_for_labels_with_lookup(
         all_labels, trained_embedding, pre_trained_embedding)

    LOGGER.info('{:15} - Missing'.format(dataset_name))

    for label, s in [('trained', not_found_trained),
                     ('pre_trained', not_found_pre_trained),
                     ('after_coreference', not_found_pre_trained_coreferenced)
                     ]:
        LOGGER.info('\t{:20} {:>6}'.format(label, len(s)))

    embedding_file = '{}/{}.w2v.txt'.format(args.embeddings_result_folder,
                                            dataset_name)
    embeddings.save_embedding_dict(embeddings_pre_trained, embedding_file)
    embeddings_pre_trained = embeddings.load_word2vec_format(
        fname=embedding_file, binary=False)

    LOGGER.info('{:15} - Co-reference resolution'.format(dataset_name))
    max_topn = max(args.topn)

    similar_labels = coreference.get_most_similar_labels(
        all_labels, embeddings_pre_trained, max_topn)

    for topn in args.topn:
        for threshold in args.merge_threshold:
            LOGGER.info(
                '{:15} - Co-reference resolution: topn: {}, threshold: {}'.
                format(dataset_name, topn, threshold))
            clique_lookup = coreference.create_label_cliques_by_similarity(
                similar_labels, threshold=threshold, topn=topn)

            new_lookup = embeddings.merge_lookups(clique_lookup, lookup)

            with open(
                    '{}/{}.threshold-{}.topn-{}.label-lookup.npy'.format(
                        args.embeddings_result_folder, dataset_name, threshold,
                        topn), 'wb') as f:
                pickle.dump(new_lookup, f)
    LOGGER.info('{:15} - Finished'.format(dataset_name))
Example #11
def process() -> tuple:
    X, Y = dataset_helper.get_dataset_cached(graph_cache_file)
    X = graph_helper.get_graphs_only(X)
    estimator, params = task_helper.get_graph_estimator_and_params(
        X, Y, with_node_weights=True)
    return ClassificationData(X, Y, estimator, params)
Example #12
def process() -> tuple:
    # X, Y = graph_helper.get_graphs_with_mutag_enzyme_format('tests/data/{}'.format(dataset))
    X, Y = graph_helper.get_mutag_enzyme_graphs(dataset)
    X = graph_helper.get_graphs_only(X)
    estimator, params = task_helper.get_graph_estimator_and_params(X, Y)
    return ClassificationData(X, Y, estimator, params)
def fit(self, X, *s):
    # Remember all node labels present in the training graphs
    self.train_labels_ = set(
        chain.from_iterable(
            [_get_labels(x) for x in graph_helper.get_graphs_only(X)]))
    return self
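This fit() pairs with the earlier transform() that applies _keep_only_train_labels. Below is a minimal sketch of the intended fit-on-train, transform-on-test flow; the class name KeepTrainLabelsTransformer is hypothetical, only fit()/transform() and the copy parameter come from the snippets.

# Hypothetical driver: the class name is an assumption.
trans = KeepTrainLabelsTransformer(copy=True)
trans.fit(X_train)                    # collects train_labels_ from the training graphs
X_train = trans.transform(X_train)
X_test = trans.transform(X_test)      # labels unseen in training are filtered out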
Example #14
def process_graph_cache_file(graph_cache_file, args):
    graph_cache_filename = graph_cache_file.split('/')[-1].rsplit('.')[0]
    dataset = filename_utils.get_dataset_from_filename(graph_cache_file)

    if '.phi.' in graph_cache_filename or not filter_utils.file_should_be_processed(
            graph_cache_filename, args.include_filter, args.exclude_filter,
            args.limit_dataset):
        return

    LOGGER.info('{:15} starting ({})'.format(dataset, graph_cache_filename))

    fast_wl_trans = FastWLGraphKernelTransformer(
        h=args.wl_h, use_early_stopping=False, truncate_to_highest_label=False)

    try:
        phi_graph_cache_file = graph_cache_file.replace('.npy', '.phi.npy')
        X_graphs, Y = dataset_helper.get_dataset_cached(graph_cache_file)
        X_graphs = graph_helper.get_graphs_only(X_graphs)

        # Kernel: WL
        if args.use_wl:
            used_phi_graph_cache_file = phi_graph_cache_file
            splitted_phi_graph_cache_file = phi_graph_cache_file.replace(
                '.phi', '.splitted.phi')
            phi_same_label_graph_cache_file = phi_graph_cache_file.replace(
                dataset, '{}_same-label'.format(dataset)).replace(
                    '.phi', '.splitted.phi')

            # Stop here if all files have already been created
            if not args.force and np.all([
                    os.path.exists(x)
                    for x in [splitted_phi_graph_cache_file,
                              used_phi_graph_cache_file,
                              phi_same_label_graph_cache_file]
            ]):
                return

            X_, Y_ = np.array(np.copy(X_graphs)), np.array(np.copy(Y))
            if args.wl_sort_classes:
                X_, Y_ = sort(X_, Y_, by=Y_)

            num_vertices = len(graph_helper.get_all_node_labels(X_))
            fast_wl_trans.set_params(phi_dim=num_vertices)

            X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
                np.copy(X_),
                np.copy(Y_),
                stratify=Y_,
                test_size=args.wl_test_size)

            X_train, Y_train = sort(X_train, Y_train, by=Y_train)
            X_test, Y_test = sort(X_test, Y_test, by=Y_test)

            # Splitted version
            if args.force or not os.path.exists(splitted_phi_graph_cache_file):
                t = sklearn.base.clone(fast_wl_trans)
                phi_train = t.fit_transform(np.copy(X_train))
                phi_test = t.transform(np.copy(X_test))

                with open(splitted_phi_graph_cache_file, 'wb') as f:
                    pickle.dump((phi_train, phi_test, X_train, X_test, Y_train,
                                 Y_test), f)

            # Splitted, same label
            if args.force or not os.path.exists(
                    phi_same_label_graph_cache_file):
                t = sklearn.base.clone(fast_wl_trans).set_params(
                    same_label=True)
                phi_train = t.fit_transform(X_train)
                phi_test = t.transform(X_test)

                with open(phi_same_label_graph_cache_file, 'wb') as f:
                    pickle.dump((phi_train, phi_test, X_train, X_test, Y_train,
                                 Y_test), f)

            # Whole dataset
            if args.force or not os.path.exists(used_phi_graph_cache_file):
                t = sklearn.base.clone(fast_wl_trans)
                with open(used_phi_graph_cache_file, 'wb') as f:
                    pickle.dump((t.fit_transform(X_), Y_), f)

        # Kernel: spgk
        if args.use_spgk:
            for depth in args.spgk_depth:
                spgk_graph_cache_file = graph_cache_file.replace(
                    '.npy', '.spgk-{}.gram.npy'.format(depth))

                if args.force or not os.path.exists(spgk_graph_cache_file):
                    K = spgk.transform(X_graphs, depth=depth)

                    with open(spgk_graph_cache_file, 'wb') as f:
                        pickle.dump((K, Y), f)
    except Exception as e:
        LOGGER.exception(e)

    LOGGER.info('{:15} finished ({})'.format(dataset, graph_cache_filename))
Example #15
def transform(self, X, y=None, **fit_params):
    X = graph_helper.get_graphs_only(X)
    # Serialize each graph to a text representation (optionally including edges)
    X = [graph_helper.graph_to_text(g, self.use_edges) for g in X]
    return X
Example #16
def fit(self, X, *s):
    # Count how often each node label occurs across all training graphs
    labels = [_get_labels(x) for x in graph_helper.get_graphs_only(X)]
    occurrences = collections.Counter(chain.from_iterable(labels))
    # Mark labels at or below the occurrence threshold for removal in transform()
    self.labels_to_be_removed_ = set(
        k for k, v in occurrences.items() if v <= self.max_occurrences)
    return self
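This fit() pairs with the transform() in Example #3 above, which strips labels_to_be_removed_ from every graph. Below is a minimal sketch of how such a transformer might be used; the class name RemoveInfrequentLabelsTransformer is hypothetical, while fit()/transform(), copy and max_occurrences come from the snippets.

# Hypothetical driver: the class name is an assumption.
trans = RemoveInfrequentLabelsTransformer(max_occurrences=2, copy=True)
X_graphs = graph_helper.get_graphs_only(X_graphs)
trans.fit(X_graphs)                    # builds labels_to_be_removed_
X_cleaned = trans.transform(X_graphs)  # rare node labels removed from copies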