def process() -> tuple:
    # Note: graph_cache_file, dataset_helper, graph_helper, task_helper and
    # ClassificationData are defined in the surrounding scope of the original
    # script (not shown here).
    X, Y = dataset_helper.get_dataset_cached(graph_cache_file)
    X = graph_helper.get_graphs_only(X)
    graph_helper.convert_graphs_to_adjs_tuples(X)
    # Discard node labels by replacing each label list with zeros of the same length
    X = [(x, [0] * len(y)) for x, y in X]
    estimator, params = task_helper.get_graph_estimator_and_params(X, Y)
    return ClassificationData(X, Y, estimator, params)
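
The ClassificationData values returned by these process() helpers bundle data, an estimator and a parameter grid. Below is a minimal sketch of feeding such a bundle into scikit-learn's GridSearchCV, assuming ClassificationData is a simple attribute container (X, Y, estimator, params); scalar grid values are wrapped in lists first:

import sklearn.model_selection

def run_grid_search(data, cv=3):
    # Scalar values (e.g. classifier__kernel='precomputed') must be wrapped
    # in lists to form a valid param_grid.
    param_grid = {k: v if isinstance(v, list) else [v]
                  for k, v in data.params.items()}
    grid = sklearn.model_selection.GridSearchCV(
        estimator=data.estimator, param_grid=param_grid,
        cv=cv, scoring='f1_macro')
    grid.fit(data.X, data.Y)
    return grid.best_estimator_, grid.best_score_
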
def process_dataset(cache_file, label_lookup_file, args):
    dataset = filename_utils.get_dataset_from_filename(cache_file)

    cache_filename = filename_utils.get_filename_only(cache_file,
                                                      with_extension=False)

    threshold, topn = filename_utils.get_topn_threshold_from_lookupfilename(
        label_lookup_file)

    result_file = cache_file.replace(
        dataset,
        'relabeled_threshold_{}_topn_{}_{}'.format(threshold, topn, dataset))
    if not args.force and os.path.exists(result_file):
        return

    with open(label_lookup_file, 'rb') as f:
        label_lookup = pickle.load(f)

    X, Y = dataset_helper.get_dataset_cached(cache_file)
    X = graph_helper.get_graphs_only(X)

    # Collect the node labels to be renamed (those occurring at most args.max_occurrence times)
    node_labels = list(chain.from_iterable([x.nodes() for x in X]))
    unique_labels = set(node_labels)
    counter = collections.Counter(node_labels)

    node_labels_to_be_renamed = set([
        label for label, occurrences in counter.items()
        if occurrences <= args.max_occurrence
    ])

    lookup_ = {
        label: new_label
        for label, new_label in label_lookup.items()
        if label in node_labels_to_be_renamed
    }

    new_labels = set(lookup_.values())
    # Also record, for every label whose target is one of the new labels
    # actually used, the mapping label -> [new_label]
    lookup__ = collections.defaultdict(list)

    for label, new_label in label_lookup.items():
        if new_label in new_labels:
            lookup__[label].append(new_label)

    lookup_ = dict(lookup_, **lookup__)

    LOGGER.info(
        '{:80} topn={:4} threshold={:4}\n\t\t#relabeled labels: {}\n\t\t#unique labels: {}\n\t\t#nodes: {}'
        .format(cache_filename, topn, threshold, len(lookup_),
                len(unique_labels), len(node_labels)))

    relabel_trans = transformers.RelabelGraphsTransformer(lookup_)

    X = relabel_trans.transform(X)

    with open(result_file, 'wb') as f:
        pickle.dump((X, Y), f)
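
A hypothetical driver for the relabeling step above, assuming the label-lookup files follow the '<dataset>.threshold-<t>.topn-<n>.label-lookup.npy' naming used later in this collection and that args provides force and max_occurrence:

import glob

def relabel_all(graph_cache_files, lookup_folder, args):
    for cache_file in graph_cache_files:
        dataset = filename_utils.get_dataset_from_filename(cache_file)
        pattern = '{}/{}.threshold-*.topn-*.label-lookup.npy'.format(
            lookup_folder, dataset)
        for label_lookup_file in sorted(glob.glob(pattern)):
            process_dataset(cache_file, label_lookup_file, args)
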
def process() -> tuple:
    # K is a precomputed gram (kernel) matrix; the SVC consumes it directly
    # via kernel='precomputed'.
    K, Y = dataset_helper.get_dataset_cached(gram_cache_file, check_validity=False)
    estimator = sklearn.pipeline.Pipeline([('classifier', None)])
    params = dict(
        classifier=[sklearn.svm.SVC()],
        classifier__kernel='precomputed',
        classifier__class_weight='balanced'
    )
    return ClassificationData(K, Y, estimator, params)
def process_relabeled():
    X, Y = dataset_helper.get_dataset_cached(graph_cache_file)
    X = graph_helper.get_graphs_only(X)
    estimator, params = task_helper.get_graph_estimator_and_params(X, Y)
    params['graph_preprocessing'] = [transformers.RelabelGraphsTransformer()]
    params['graph_preprocessing__dataset'] = [dataset]
    params['graph_preprocessing__threshold'] = [0.99]
    params['graph_preprocessing__topn'] = [10]
    return ClassificationData(X, Y, estimator, params)
Example #5
def process_dataset(dataset_name, args, embedding_models):
    print('\tdataset: {:20} - Processing'.format(dataset_name))
    results = {}
    used_models = (embedding_models + [
        ('trained', dataset_helper.get_w2v_embedding_for_dataset(dataset_name))
    ]) if args.check_own_embeddings else embedding_models
    all_graph_cache_files = [
        x for x in dataset_helper.get_all_cached_graph_datasets()
        if x.endswith('{}.npy'.format(dataset_name))
    ]
    graph_cache_files = []
    found_all_cache = False
    found_gml_cache = False
    for cache_file in all_graph_cache_files:
        # ...
        if len(graph_cache_files) == 2:
            break
        if ((not found_gml_cache and 'gml' in cache_file)
                or (not found_all_cache and 'all' in cache_file)):
            found_all_cache = found_all_cache or 'all' in cache_file
            found_gml_cache = found_gml_cache or 'gml' in cache_file
            graph_cache_files.append(cache_file)

    if len(graph_cache_files) != 2:
        print('\tdataset: {:20} - Found: gml: {}, all: {}'.format(
            dataset_name, found_gml_cache, found_all_cache))
    for model_name, model in used_models:
        results[model_name] = {}
        print('\tdataset: {:20} - Model: {}'.format(dataset_name, model_name))
        for graph_cache_file in graph_cache_files:
            print('\tdataset: {:20} - Graph: {}'.format(
                dataset_name, graph_cache_file))
            X, Y = dataset_helper.get_dataset_cached(graph_cache_file)
            labels = graph_helper.get_all_node_labels_uniq(X)
            print('\tdataset: {:20} - #unique labels: {}'.format(
                dataset_name, len(labels)))
            counter = {'found': 0, 'not_found': 0}
            not_found_labels = []
            for label in labels:
                if label in model:
                    counter['found'] += 1
                else:
                    if len(not_found_labels) < 100:
                        not_found_labels.append(label)
                    counter['not_found'] += 1
            print(
                '\tdataset: {:20} - {}, Found: {}%, Missing Labels Sample: {}'.
                format(dataset_name, counter,
                       int(100 * counter['found'] / len(labels)),
                       not_found_labels[:10]))
            results[model_name][graph_cache_file] = {
                'num_labels': len(labels),
                'counts': counter,
                'not_found_sample': not_found_labels
            }
    print('\tdataset: {:20} - Finished'.format(dataset_name))
    return results
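
A minimal usage sketch for the coverage check above. The embedding path and dataset names are hypothetical; embedding models are assumed to be (name, model) pairs whose model supports `label in model` membership tests (e.g. gensim KeyedVectors), and args is the parsed namespace process_dataset expects (including check_own_embeddings):

def check_coverage(args):
    pre_trained = embeddings.get_embedding_model(
        'data/embeddings/glove.w2v.txt',  # hypothetical path
        binary=False, first_line_header=True, with_gensim=True)
    return {
        name: process_dataset(name, args, [('pre_trained', pre_trained)])
        for name in ['ng20', 'reuters-21578']  # hypothetical dataset names
    }
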
    def test_convert_graph_datasets(self):
        for graph_dataset, dataset_name in self.iterate_graph_cache_datasets():
            X, Y = dataset_helper.get_dataset_cached(graph_dataset)
            self.assertTrue(len(X))
            self.assertTrue(len(Y))

            graph_helper.convert_graphs_to_adjs_tuples(X)

            for x in X:
                self.assertTrue(isinstance(x, tuple))
                self.assertTrue(isinstance(x[0], scipy.sparse.spmatrix))
                self.assertTrue(isinstance(x[1], list))
                break
def get_combined_text_graph_dataset(graph_cache_file, use_ana=False) -> typing.Tuple[typing.List[typing.Tuple], typing.List]:
    dataset_name = filename_utils.get_dataset_from_filename(graph_cache_file)

    X_text, Y_text = dataset_helper.get_dataset(dataset_name + ('-ana' if use_ana else ''))
    X_graph, Y_graph = dataset_helper.get_dataset_cached(graph_cache_file)

    # Same length and the graph entries carry no string ID: pair graphs and texts positionally
    if len(X_graph) == len(X_text) and (not isinstance(X_graph[0], tuple) or not isinstance(X_graph[0][1], str)):
        return list(zip(X_graph, X_text, [None] * len(X_graph))), Y_graph

    # Map each class to its texts; the per-graph numeric ID later indexes into this list
    class_2_id = collections.defaultdict(list)
    for x, y in zip(X_text, Y_text):
        class_2_id[y].append(x)

    X_combined, Y_combined = [], Y_graph
    for (x_graph, y_id), y_graph in zip(X_graph, Y_graph):
        y_id = int(y_id)
        X_combined.append((x_graph, class_2_id[y_graph][y_id], y_id))

    return X_combined, Y_combined
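
A short usage sketch for get_combined_text_graph_dataset; the cache filename is hypothetical:

combined, labels = get_combined_text_graph_dataset(
    'data/cache/dataset_graph_cooccurrence_all_ng20.npy',  # hypothetical file
    use_ana=False)
for graph, text, doc_id in combined[:3]:
    print(type(graph).__name__, doc_id)
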
Example #8
def process_dataset(dataset_name, args):
    LOGGER.info('{:15} - Start'.format(dataset_name))
    LOGGER.info('{:15} - Retrieving trained embedding'.format(dataset_name))

    pre_trained_embedding = embeddings.get_embedding_model(
        args.pre_trained_embedding,
        binary=False,
        first_line_header=True,
        with_gensim=True)

    try:
        trained_embedding = dataset_helper.get_w2v_embedding_for_dataset(
            dataset_name)
    except FileNotFoundError as e:
        LOGGER.exception(e)
        return

    cmap_cache_files = dataset_helper.get_all_cached_graph_datasets(
        dataset_name=dataset_name, graph_type=constants.TYPE_CONCEPT_MAP)

    coo_cache_files = [
        x for x in dataset_helper.get_all_cached_graph_datasets(
            dataset_name=dataset_name, graph_type=constants.TYPE_COOCCURRENCE)
        if 'all' in x
    ]

    if not cmap_cache_files or not coo_cache_files:
        return

    used_graphs = [cmap_cache_files[0], coo_cache_files[0]]

    LOGGER.info('{:15} - Retrieving dataset'.format(dataset_name))
    all_labels = set()
    for graph_cache_file in used_graphs:
        X, _ = dataset_helper.get_dataset_cached(graph_cache_file)
        X = graph_helper.get_graphs_only(X)
        all_labels |= graph_helper.get_all_node_labels_uniq(
            X, as_sorted_list=False)

    LOGGER.info('{:15} - Resolving embeddings'.format(dataset_name))
    embeddings_pre_trained, not_found_pre_trained_coreferenced, not_found_trained, not_found_pre_trained, lookup, similar_els = embeddings.get_embeddings_for_labels_with_lookup(
        all_labels, trained_embedding, pre_trained_embedding)

    LOGGER.info('{:15} - Missing'.format(dataset_name))

    for label, s in [('trained', not_found_trained),
                     ('pre_trained', not_found_pre_trained),
                     ('after_coreference', not_found_pre_trained_coreferenced)
                     ]:
        LOGGER.info('\t{:20} {:>6}'.format(label, len(s)))

    embedding_file = '{}/{}.w2v.txt'.format(args.embeddings_result_folder,
                                            dataset_name)
    embeddings.save_embedding_dict(embeddings_pre_trained, embedding_file)
    embeddings_pre_trained = embeddings.load_word2vec_format(
        fname=embedding_file, binary=False)

    LOGGER.info('{:15} - Co-reference resolution'.format(dataset_name))
    max_topn = max(args.topn)

    similar_labels = coreference.get_most_similar_labels(
        all_labels, embeddings_pre_trained, max_topn)

    for topn in args.topn:
        for threshold in args.merge_threshold:
            LOGGER.info(
                '{:15} - Co-reference resolution: topn: {}, threshold: {}'.
                format(dataset_name, topn, threshold))
            clique_lookup = coreference.create_label_cliques_by_similarity(
                similar_labels, threshold=threshold, topn=topn)

            new_lookup = embeddings.merge_lookups(clique_lookup, lookup)

            lookup_file = '{}/{}.threshold-{}.topn-{}.label-lookup.npy'.format(
                args.embeddings_result_folder, dataset_name, threshold, topn)
            with open(lookup_file, 'wb') as f:
                pickle.dump(new_lookup, f)
    LOGGER.info('{:15} - Finished'.format(dataset_name))
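
A hypothetical argument parser matching the attributes this function reads (pre_trained_embedding, embeddings_result_folder, topn, merge_threshold); the defaults are illustrative only, not the original script's:

import argparse

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--pre_trained_embedding', type=str,
                        default='data/embeddings/glove.w2v.txt')
    parser.add_argument('--embeddings_result_folder', type=str,
                        default='data/embeddings')
    parser.add_argument('--topn', type=int, nargs='+', default=[10, 20])
    parser.add_argument('--merge_threshold', type=float, nargs='+',
                        default=[0.8, 0.9, 0.99])
    return parser.parse_args()
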
def process():
    X, Y = dataset_helper.get_dataset_cached(graph_cache_file)
    estimator, params = text_pipeline.get_params(reduced=False)
    estimator.steps.insert(0, ('graph_to_text', GraphToTextTransformer()))
    params = dict(params, graph_to_text__use_edges=[True, False])
    return ClassificationData(X, Y, estimator, params)
def process() -> tuple:
    X, Y = dataset_helper.get_dataset_cached(graph_cache_file)
    X = graph_helper.get_graphs_only(X)
    estimator, params = task_helper.get_graph_estimator_and_params(X, Y, with_node_weights=True)
    return ClassificationData(X, Y, estimator, params)
Example #11
def process_graph_cache_file(graph_cache_file, args):
    graph_cache_filename = graph_cache_file.split('/')[-1].rsplit('.')[0]
    dataset = filename_utils.get_dataset_from_filename(graph_cache_file)

    if '.phi.' in graph_cache_filename or not filter_utils.file_should_be_processed(
            graph_cache_filename, args.include_filter, args.exclude_filter,
            args.limit_dataset):
        return

    LOGGER.info('{:15} starting ({})'.format(dataset, graph_cache_filename))

    fast_wl_trans = FastWLGraphKernelTransformer(
        h=args.wl_h, use_early_stopping=False, truncate_to_highest_label=False)

    try:
        phi_graph_cache_file = graph_cache_file.replace('.npy', '.phi.npy')
        X_graphs, Y = dataset_helper.get_dataset_cached(graph_cache_file)
        X_graphs = graph_helper.get_graphs_only(X_graphs)

        # Kernel: WL
        if args.use_wl:
            used_phi_graph_cache_file = phi_graph_cache_file
            splitted_phi_graph_cache_file = phi_graph_cache_file.replace(
                '.phi', '.splitted.phi')
            phi_same_label_graph_cache_file = phi_graph_cache_file.replace(
                dataset, '{}_same-label'.format(dataset)).replace(
                    '.phi', '.splitted.phi')

            # Stop here if all files have already been created
            if not args.force and np.all([
                    os.path.exists(x) for x in [
                        splitted_phi_graph_cache_file,
                        used_phi_graph_cache_file,
                        phi_same_label_graph_cache_file,
                    ]
            ]):
                return

            X_, Y_ = np.array(np.copy(X_graphs)), np.array(np.copy(Y))
            if args.wl_sort_classes:
                X_, Y_ = sort(X_, Y_, by=Y_)

            num_vertices = len(graph_helper.get_all_node_labels(X_))
            fast_wl_trans.set_params(phi_dim=num_vertices)

            X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
                np.copy(X_),
                np.copy(Y_),
                stratify=Y_,
                test_size=args.wl_test_size)

            X_train, Y_train = sort(X_train, Y_train, by=Y_train)
            X_test, Y_test = sort(X_test, Y_test, by=Y_test)

            # Splitted version
            if args.force or not os.path.exists(splitted_phi_graph_cache_file):
                t = sklearn.base.clone(fast_wl_trans).set_params(
                    same_label=True)
                phi_train = t.fit_transform(np.copy(X_train))
                phi_test = t.transform(np.copy(X_test))

                with open(splitted_phi_graph_cache_file, 'wb') as f:
                    pickle.dump((phi_train, phi_test, X_train, X_test, Y_train,
                                 Y_test), f)

            # Splitted, same label
            if args.force or not os.path.exists(
                    phi_same_label_graph_cache_file):
                t = sklearn.base.clone(fast_wl_trans)
                phi_train = t.fit_transform(X_train)
                phi_test = t.transform(X_test)

                with open(phi_same_label_graph_cache_file, 'wb') as f:
                    pickle.dump((phi_train, phi_test, X_train, X_test, Y_train,
                                 Y_test), f)

            # Whole dataset
            if args.force or not os.path.exists(used_phi_graph_cache_file):
                t = sklearn.base.clone(fast_wl_trans)
                with open(used_phi_graph_cache_file, 'wb') as f:
                    pickle.dump((t.fit_transform(X_), Y_), f)

        # Kernel: spgk
        if args.use_spgk:
            for depth in args.spgk_depth:
                spgk_graph_cache_file = graph_cache_file.replace(
                    '.npy', '.spgk-{}.gram.npy'.format(depth))

                if args.force or not os.path.exists(spgk_graph_cache_file):
                    K = spgk.transform(X_graphs, depth=depth)

                    with open(spgk_graph_cache_file, 'wb') as f:
                        pickle.dump((K, Y), f)
    except Exception as e:
        LOGGER.exception(e)

    LOGGER.info('{:15} finished ({})'.format(dataset, graph_cache_filename))
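
A hypothetical argument parser covering the attributes process_graph_cache_file reads (wl_h, wl_test_size, wl_sort_classes, use_wl, use_spgk, spgk_depth, include_filter, exclude_filter, limit_dataset, force); defaults are illustrative only:

import argparse

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--wl_h', type=int, default=4)
    parser.add_argument('--wl_test_size', type=float, default=0.2)
    parser.add_argument('--wl_sort_classes', action='store_true')
    parser.add_argument('--use_wl', action='store_true')
    parser.add_argument('--use_spgk', action='store_true')
    parser.add_argument('--spgk_depth', type=int, nargs='+', default=[1, 2])
    parser.add_argument('--include_filter', type=str, default=None)
    parser.add_argument('--exclude_filter', type=str, default=None)
    parser.add_argument('--limit_dataset', type=str, default=None)
    parser.add_argument('--force', action='store_true')
    return parser.parse_args()

# Example invocation (sketch):
# for graph_cache_file in dataset_helper.get_all_cached_graph_datasets():
#     process_graph_cache_file(graph_cache_file, get_args())
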