Пример #1
0
def benchmark(posf, negf, minsup, topk):
    """
    Run gSpan on a positive/negative graph dataset without printing the result.

    Both file names are resolved relative to ``../statement/data/``. If one of
    the two files is missing, a message is printed and the interpreter exits.
    Otherwise both files are loaded into one graph database and gSpan is run
    on a ``FrequentPositiveGraphs`` task; the mined patterns are neither
    printed nor returned.

    :param posf: file name of the positive class graphs
    :param negf: file name of the negative class graphs
    :param minsup: minimum support forwarded to the mining task
    :param topk: top-k value forwarded to the mining task
    """
    data_dir = "../statement/data/"
    pos_path = data_dir + posf
    neg_path = data_dir + negf

    # Stop at the first missing input; the positive file is checked first.
    for path in (pos_path, neg_path):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    database = GraphDatabase()
    # read_graphs() adds the graphs of a file to the database and returns their ids.
    positive_ids = database.read_graphs(pos_path)
    negative_ids = database.read_graphs(neg_path)

    mining_task = FrequentPositiveGraphs(
        minsup, database, [positive_ids, negative_ids], topk)
    gSpan(mining_task).run()
Пример #2
0
def task1(database_file_name_pos, database_file_name_neg, k, minsup):
    """
    Mine patterns from a positive and a negative graph file and print them.

    If either file is missing, a message is printed and the interpreter
    exits. Otherwise gSpan is run on a ``FrequentPositiveGraphs`` task and
    one line ``<pattern> <confidence> <total support>`` is printed per entry
    of ``task.patterns``.

    :param database_file_name_pos: path to the positive class file
    :param database_file_name_neg: path to the negative class file
    :param k: top-k value forwarded to the mining task
    :param minsup: minimum support forwarded to the mining task
    """
    for file_name in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(file_name):
            print('{} does not exist.'.format(file_name))
            sys.exit()

    database = GraphDatabase()
    # read_graphs() adds the graphs of a file to the database and returns their ids.
    positive_ids = database.read_graphs(database_file_name_pos)
    negative_ids = database.read_graphs(database_file_name_neg)

    mining_task = FrequentPositiveGraphs(
        database, [positive_ids, negative_ids], minsup, k)
    gSpan(mining_task).run()

    # Entries are indexed as 0 = confidence, 1 = total support, 2 = pattern.
    for entry in mining_task.patterns:
        confidence, total_support, dfs_code = entry[0], entry[1], entry[2]
        print('{} {} {}'.format(dfs_code, confidence, total_support))
def top_k():
    """
    Command-line entry point: mine patterns and print the k best score levels.

    Expected arguments: ``<positive file> <negative file> <k> <minsup>``.
    If either file is missing, a message is printed and the interpreter
    exits. The mined patterns are sorted by (confidence, support) in
    descending order and printed as ``<code> <confidence> <support>``.
    Patterns sharing the same (confidence, support) pair count as one level;
    when level number k + 1 is reached a line containing a single space is
    printed and the output stops.
    """
    argv = sys.argv
    pos_path = argv[1]  # first argument: positive class file
    neg_path = argv[2]  # second argument: negative class file
    k = int(argv[3])  # third argument: number of score levels to print
    minsup = int(argv[4])  # fourth argument: minimum support

    for path in (pos_path, neg_path):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    database = GraphDatabase()
    # read_graphs() adds the graphs of a file to the database and returns their ids.
    positive_ids = database.read_graphs(pos_path)
    negative_ids = database.read_graphs(neg_path)

    mining_task = FrequentPositiveGraphs(
        minsup, k, database, [positive_ids, negative_ids])
    gSpan(mining_task).run()

    ranked = list(mining_task.patterns)
    ranked.sort(key=attrgetter('confidence', 'support'), reverse=True)

    remaining = k
    prev_conf = -1
    prev_supp = -1
    for patt in ranked:
        conf = patt.confidence
        supp = patt.support
        if not (conf == prev_conf and supp == prev_supp):
            # A new (confidence, support) level starts here.
            prev_conf, prev_supp = conf, supp
            remaining -= 1
            if remaining == -1:
                print(" ")
                break
        print('{} {} {}'.format(patt.code, conf, supp))
Пример #4
0
def example1():
    """
    Run a ``ConfidencePositiveGraphs`` mining task and print its patterns.

    A hard-coded switch currently disables the command-line branch, so the
    function always uses ``data/molecules-small.pos`` and
    ``data/molecules-small.neg`` with k = 5 and minsup = 5. If either file is
    missing, a message is printed and the interpreter exits.

    Output: the list of id subsets, then for every key of ``task.patterns``
    and every (pattern, extra) pair stored under it, a line
    ``<pattern> <key[0]> <key[1]>`` followed by a line with the extra value.
    """
    use_command_line = False  # hard-coded switch; flip to read sys.argv instead
    if use_command_line:
        argv = sys.argv
        pos_path = argv[1]  # first argument: positive class file
        neg_path = argv[2]  # second argument: negative class file
        k = int(argv[3])
        minsup = int(argv[4])  # fourth argument: minimum support
    else:
        pos_path = 'data/molecules-small.pos'
        neg_path = 'data/molecules-small.neg'
        k = 5
        minsup = 5

    for path in (pos_path, neg_path):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    database = GraphDatabase()
    # read_graphs() adds the graphs of a file to the database and returns their ids.
    positive_ids = database.read_graphs(pos_path)
    negative_ids = database.read_graphs(neg_path)
    id_subsets = [positive_ids, negative_ids]
    print(id_subsets)

    mining_task = ConfidencePositiveGraphs(k, minsup, database, id_subsets)
    gSpan(mining_task).run()

    # Keys are indexed as 0 = confidence, 1 = support.
    for key in mining_task.patterns.keys():
        confidence, support = key[0], key[1]
        for pattern, extra in mining_task.patterns[key]:
            print('{} {} {}'.format(pattern, confidence, support))
            print(extra)
def task1():
    """
    Command-line entry point: mine the k most confident subgraphs and write
    them to ``./results/task1.txt``.

    Expected arguments:
    ``<positive file> <negative file> <k> <minimum frequency>``.
    If either file is missing, a message is printed and the interpreter
    exits. After mining, the patterns are written one per line as
    ``<pattern>_<confidence>_<frequency>``, from the highest confidence value
    to the lowest, without a trailing newline. A pattern is kept when its
    confidence is above ``task.minConfidence``, or equal to it with a
    frequency of at least
    ``task.orderedListOfFrequencyValuesForMinConfidence[0]``.

    :raises ValueError: if k or the minimum frequency is not an integer
    """
    args = sys.argv
    database_file_name_pos = args[1]  # First parameter: path to positive class file
    database_file_name_neg = args[2]  # Second parameter: path to negative class file
    # Third parameter: k. This used to read args[4], which silently made k
    # equal to the minimum frequency.
    k = int(args[3])
    min_frequency = int(args[4])  # Fourth parameter: minimum frequency

    for file_name in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(file_name):
            print('{} does not exist.'.format(file_name))
            sys.exit()

    graph_database = GraphDatabase()  # Graph database object
    # Reading positive graphs, adding them to database and getting ids
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    # Reading negative graphs, adding them to database and getting ids
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    task = K_MostConfidentAndFrequentPositiveSubGraphs(
        min_frequency, graph_database, [pos_ids, neg_ids], k, False)

    gSpan(task).run()  # Running gSpan

    lines = []
    for confidence_level in reversed(task.orderedListOfConfidenceValues):
        for pattern, _gid_subsets, confidence, frequency, _, _, _ in task.patterns:
            if confidence != confidence_level:
                continue
            # The frequency threshold is only consulted for patterns sitting
            # exactly on the minimum confidence.
            keep = confidence > task.minConfidence or (
                confidence == task.minConfidence
                and frequency >= task.orderedListOfFrequencyValuesForMinConfidence[0]
            )
            if keep:
                lines.append('{}_{}_{}'.format(pattern, confidence, frequency))

    with open('./results/task1.txt', 'w') as dataset:
        dataset.write('\n'.join(lines))
Пример #6
0
def example1():
    """
    Mine frequent subgraphs and print those in the k best score levels.

    NOTE(review): ``database_file_name_pos``, ``database_file_name_neg``,
    ``minsup`` and ``k`` are not parameters or locals; they are expected to
    exist in an enclosing (presumably module) scope - confirm.

    If either file is missing, a message is printed and the interpreter
    exits. For every mined pattern the confidence is
    ``pos_support / (pos_support + neg_support)`` and the support is the sum
    of both. Distinct (confidence, support) pairs are ranked in descending
    order; patterns whose pair is among the first ``k`` are printed as
    ``<pattern> <confidence> <support>``, highest confidence first.
    """
    for file_name in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(file_name):
            print('{} does not exist.'.format(file_name))
            sys.exit()

    graph_database = GraphDatabase()  # Graph database object
    # read_graphs() adds the graphs of a file to the database and returns their ids.
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    # The ids for the positive and negative labelled graphs in the database
    subsets = [pos_ids, neg_ids]
    task = FrequentPositiveGraphs(minsup, graph_database, subsets)  # Creating task

    gSpan(task).run()  # Running gSpan

    results = []
    for pattern, gid_subsets in task.patterns:
        pos_support = len(gid_subsets[0])
        neg_support = len(gid_subsets[1])
        total_support = pos_support + neg_support
        # Guard against an empty cover so the ratio can never divide by zero.
        confidence = pos_support / total_support if total_support else 0
        results.append((pattern, confidence, total_support))

    # Ranking happens once, after all patterns are collected. It used to sit
    # inside the loop above, which re-printed the growing result on every
    # iteration. Sorting on the full (confidence, support) pair also makes
    # the rank of equal-confidence pairs deterministic.
    levels = sorted({(conf, supp) for _, conf, supp in results}, reverse=True)
    rank_of = {level: rank for rank, level in enumerate(levels)}

    ranked = [item for item in results if rank_of[(item[1], item[2])] < k]
    ranked.sort(key=lambda item: item[1], reverse=True)

    for pattern, confidence, total_support in ranked:
        print('{} {} {}'.format(pattern, confidence, total_support))
def train_and_evaluate(minsup, database, subsets, top_k, args=None):
    """
    Mine patterns, train a decision tree on them and print the fold results.

    Output, in order: one ``<dfs code> <confidence> <frequency>`` line per
    mined pattern, the predicted test labels as a list, the training accuracy
    (only when ``args.benchmark`` is truthy), the test accuracy, and a blank
    line marking the end of the fold.

    :param minsup: minimum support forwarded to the mining task
    :param database: graph database forwarded to the mining task
    :param subsets: graph id subsets forwarded to the mining task
    :param top_k: top-k value forwarded to the mining task
    :param args: optional object; its ``benchmark`` attribute enables the
        training-accuracy line
    """
    mining_task = FrequentPositiveGraphs(minsup, database, subsets, top_k)
    gSpan(mining_task).run()

    def labelled(pos_rows, neg_rows):
        # +1 for every positive row followed by -1 for every negative row.
        return numpy.concatenate((
            numpy.full(len(pos_rows), 1, dtype=int),
            numpy.full(len(neg_rows), -1, dtype=int),
        ))

    # Feature matrices are indexed as 0 = positive train, 1 = positive test,
    # 2 = negative train, 3 = negative test.
    matrices = mining_task.get_feature_matrices()
    pos_train, pos_test = matrices[0], matrices[1]
    neg_train, neg_test = matrices[2], matrices[3]

    train_fm = numpy.concatenate((pos_train, neg_train))
    train_labels = labelled(pos_train, neg_train)
    test_fm = numpy.concatenate((pos_test, neg_test))
    test_labels = labelled(pos_test, neg_test)

    model = DecisionTreeClassifier(random_state=1)
    model.fit(train_fm, train_labels)

    test_predictions = model.predict(test_fm)
    test_accuracy = metrics.accuracy_score(test_labels, test_predictions)

    for stats, dfs_code, _ in mining_task.patterns:
        confidence, frequency = stats
        print("{} {} {}".format(dfs_code, confidence, frequency))

    print(test_predictions.tolist())

    if args and args.benchmark:
        train_predictions = model.predict(train_fm)
        train_accuracy = metrics.accuracy_score(train_labels, train_predictions)
        print("train accuracy: {}".format(train_accuracy))

    print("accuracy: {}".format(test_accuracy))
    print()  # Blank line to indicate end of fold.
Пример #8
0
def train_and_evaluate(minsup, database, subsets, top_K, ret=False):
    """
    Mine patterns, train a decision tree on them and report its accuracy.

    :param minsup: minimum support forwarded to the mining task
    :param database: graph database forwarded to the mining task
    :param subsets: graph id subsets forwarded to the mining task
    :param top_K: top-k value forwarded to the mining task
    :param ret: when true, nothing is printed and the accuracies are returned
    :return: ``(test accuracy, train accuracy)`` when ``ret`` is true,
        otherwise ``None`` after printing the patterns, the predicted test
        labels, the test accuracy and a blank line
    """
    mining_task = FrequentPositiveGraphs(minsup, database, subsets, top_K)
    gSpan(mining_task).run()

    def labelled(pos_rows, neg_rows):
        # +1 for every positive row followed by -1 for every negative row.
        return numpy.concatenate((
            numpy.full(len(pos_rows), 1, dtype=int),
            numpy.full(len(neg_rows), -1, dtype=int),
        ))

    # Feature matrices are indexed as 0 = positive train, 1 = positive test,
    # 2 = negative train, 3 = negative test.
    matrices = mining_task.get_feature_matrices()
    pos_train, pos_test = matrices[0], matrices[1]
    neg_train, neg_test = matrices[2], matrices[3]

    train_fm = numpy.concatenate((pos_train, neg_train))
    train_labels = labelled(pos_train, neg_train)
    test_fm = numpy.concatenate((pos_test, neg_test))
    test_labels = labelled(pos_test, neg_test)

    model = tree.DecisionTreeClassifier(random_state=1)
    model.fit(train_fm, train_labels)

    test_predictions = model.predict(test_fm)
    test_accuracy = metrics.accuracy_score(test_labels, test_predictions)

    if ret:
        train_predictions = model.predict(train_fm)
        train_accuracy = metrics.accuracy_score(train_labels, train_predictions)
        return test_accuracy, train_accuracy

    # Entries are indexed as 0 = confidence, 1 = total support, 2 = pattern.
    for entry in mining_task.patterns:
        confidence, total_support = entry[0], entry[1]
        print('{} {} {}'.format(entry[2], confidence, total_support))
    print(test_predictions.tolist())
    print('accuracy: {}'.format(test_accuracy))
    print()  # Blank line to indicate end of fold.
Пример #9
0
def subgraph_is_isomorphic(graph, subgraph):
    """
    determines whether main graph contains a subgraph which is isomorphic to input subgraph

    Both graphs are serialised to gSpan format in a temporary input file,
    gSpan is run on it with its standard output captured in a second
    temporary file, and every mined subgraph is compared with ``subgraph``
    (edges by numerical ``weight``, nodes by categorical ``name``).

    :param graph: main graph
    :param subgraph: a subgraph to be searched in main graph
    :return: boolean
    """
    from contextlib import redirect_stdout

    graph_gspan = networkx_to_gspan(graph, 0)
    subgraph_gspan = networkx_to_gspan(subgraph, 1)

    # create temporary files during gspan processing
    input_fd, input_filename = tempfile.mkstemp()
    output_fd, output_filename = tempfile.mkstemp()

    try:
        with os.fdopen(input_fd, 'w', encoding='utf-8') as input_handler:
            input_handler.write(graph_gspan + subgraph_gspan)

        # The capture file must be closed (and therefore flushed) before it is
        # parsed, and sys.stdout must be restored even if mining raises.
        with os.fdopen(output_fd, 'w', encoding='utf-8') as output_handler, \
                redirect_stdout(output_handler):
            subgraph_miner = gSpan(input_filename, 2, where=True)
            subgraph_miner.run()

        mined_subgraphs = parse_mined_gspan_file(output_filename)
    finally:
        # remove temporary files
        os.remove(input_filename)
        os.remove(output_filename)

    em = iso.numerical_edge_match('weight', 0)
    nm = iso.categorical_node_match('name', None)
    return any(
        iso.GraphMatcher(mined_subgraph, subgraph, node_match=nm, edge_match=em).is_isomorphic()
        for mined_subgraph in mined_subgraphs
    )
Пример #10
0
def train_and_evaluate(minsup, k, database, subsets):
    """
    Mine patterns, train a decision tree on them and print the fold results.

    The patterns come from ``get_output(task, k)`` and the feature matrices
    from ``get_feature_matrices(task, patterns)``. Output, in order: one
    ``<pattern> <confidence> <total>`` line per pattern, the predicted test
    labels as a list, the test accuracy, and a blank line marking the end of
    the fold. ``total`` is the size of id subsets 0 and 2 of the pattern and
    the confidence is the share of subset 0 in it (0 when the total is 0).

    :param minsup: minimum support forwarded to the mining task
    :param k: top-k value forwarded to the mining task and to ``get_output``
    :param database: graph database forwarded to the mining task
    :param subsets: graph id subsets forwarded to the mining task
    """
    mining_task = FrequentPositiveGraphs(minsup, k, database, subsets)
    gSpan(mining_task).run()
    patterns = get_output(mining_task, k)

    def labelled(pos_rows, neg_rows):
        # +1 for every positive row followed by -1 for every negative row.
        return np.concatenate((
            np.full(len(pos_rows), 1, dtype=int),
            np.full(len(neg_rows), -1, dtype=int),
        ))

    # Feature matrices are indexed as 0 = positive train, 1 = positive test,
    # 2 = negative train, 3 = negative test.
    matrices = get_feature_matrices(mining_task, patterns)
    pos_train, pos_test = matrices[0], matrices[1]
    neg_train, neg_test = matrices[2], matrices[3]

    train_fm = np.concatenate((pos_train, neg_train))
    train_labels = labelled(pos_train, neg_train)
    test_fm = np.concatenate((pos_test, neg_test))
    test_labels = labelled(pos_test, neg_test)

    model = DecisionTreeClassifier(random_state=1)
    model.fit(train_fm, train_labels)

    predictions = model.predict(test_fm)
    accuracy = metrics.accuracy_score(test_labels, predictions)

    for pattern, gid_subsets in patterns:
        pos_count = len(gid_subsets[0])
        neg_count = len(gid_subsets[2])
        total = pos_count + neg_count
        confidence = 0 if total == 0 else pos_count / total
        print('{} {} {}'.format(pattern, confidence, total))

    print(predictions.tolist())
    print('accuracy: {}'.format(accuracy))
    print()  # Blank line to indicate end of fold.
def train_and_evaluate(minFrequency, database, subsets, k, dataset):
    """
    Mine patterns, train a decision tree on them and write the fold results.

    Everything is printed to ``dataset``: the selected patterns, one
    ``<pattern>_<confidence>_<frequency>`` per line from the highest
    confidence value to the lowest, then the predicted test labels, the test
    accuracy, and an empty line marking the end of the fold. A pattern is
    selected when its confidence is above ``task.minConfidence``, or equal to
    it with a frequency of at least
    ``task.orderedListOfFrequencyValuesForMinConfidence[0]``.

    :param minFrequency: minimum frequency forwarded to the mining task
    :param database: graph database forwarded to the mining task
    :param subsets: graph id subsets forwarded to the mining task
    :param k: top-k value forwarded to the mining task
    :param dataset: writable file object receiving the output
    """
    mining_task = K_MostConfidentAndFrequentPositiveSubGraphs(
        minFrequency, database, subsets, k, False)
    gSpan(mining_task).run()

    def labelled(pos_rows, neg_rows):
        # +1 for every positive row followed by -1 for every negative row.
        return numpy.concatenate((
            numpy.full(len(pos_rows), 1, dtype=int),
            numpy.full(len(neg_rows), -1, dtype=int),
        ))

    # Feature matrices are indexed as 0 = positive train, 1 = positive test,
    # 2 = negative train, 3 = negative test.
    matrices = mining_task.get_feature_matrices()
    pos_train, pos_test = matrices[0], matrices[1]
    neg_train, neg_test = matrices[2], matrices[3]

    train_fm = numpy.concatenate((pos_train, neg_train))
    train_labels = labelled(pos_train, neg_train)
    test_fm = numpy.concatenate((pos_test, neg_test))
    test_labels = labelled(pos_test, neg_test)

    model = tree.DecisionTreeClassifier(random_state=1)
    model.fit(train_fm, train_labels)

    predictions = model.predict(test_fm)
    accuracy = metrics.accuracy_score(test_labels, predictions)

    selected = []
    for level in reversed(mining_task.orderedListOfConfidenceValues):
        for pattern, _gid_subsets, confidence, frequency, _, _, _ in mining_task.patterns:
            if confidence != level:
                continue
            if confidence > mining_task.minConfidence:
                keep = True
            elif confidence == mining_task.minConfidence:
                keep = frequency >= mining_task.orderedListOfFrequencyValuesForMinConfidence[0]
            else:
                keep = False
            if keep:
                selected.append('{}_{}_{}'.format(pattern, confidence, frequency))

    print('\n'.join(selected), file=dataset)
    print(predictions, file=dataset)
    print('accuracy: {}'.format(accuracy), file=dataset)
    print("", file=dataset)  # Blank line to indicate end of fold.
Пример #12
0
def train_and_evaluate(k, minsup, database, subsets):
    """
    Mine patterns, train a decision tree on them and print the fold results.

    Output, in order: for every key of ``task.patterns`` and every entry
    stored under it a line ``<pattern> <key[0]> <key[1]>``, then the
    predicted test labels, the test accuracy, and a blank line marking the
    end of the fold.

    :param k: top-k value forwarded to the mining task
    :param minsup: minimum support forwarded to the mining task
    :param database: graph database forwarded to the mining task
    :param subsets: graph id subsets forwarded to the mining task
    """
    mining_task = ConfidencePositiveGraphs2(k, minsup, database, subsets)
    gSpan(mining_task).run()

    def labelled(pos_rows, neg_rows):
        # +1 for every positive row followed by -1 for every negative row.
        return numpy.concatenate((
            numpy.full(len(pos_rows), 1, dtype=int),
            numpy.full(len(neg_rows), -1, dtype=int),
        ))

    # Feature matrices are indexed as 0 = positive train, 1 = positive test,
    # 2 = negative train, 3 = negative test.
    matrices = mining_task.get_feature_matrices()
    pos_train, pos_test = matrices[0], matrices[1]
    neg_train, neg_test = matrices[2], matrices[3]

    train_fm = numpy.concatenate((pos_train, neg_train))
    train_labels = labelled(pos_train, neg_train)
    test_fm = numpy.concatenate((pos_test, neg_test))
    test_labels = labelled(pos_test, neg_test)

    model = tree.DecisionTreeClassifier(random_state=1)
    model.fit(train_fm, train_labels)

    predictions = model.predict(test_fm)
    accuracy = metrics.accuracy_score(test_labels, predictions)

    # Keys are indexed as 0 = confidence, 1 = support.
    for key in mining_task.patterns.keys():
        confidence, support = key[0], key[1]
        for pattern, _extra in mining_task.patterns[key]:
            print('{} {} {}'.format(pattern, confidence, support))

    print(predictions)
    print('accuracy: {}'.format(accuracy))
    print()  # Blank line to indicate end of fold.
Пример #13
0
def example1():
    """
    Command-line entry point: mine patterns with gSpan and print them.

    Expected arguments:
    ``<positive file> <negative file> <top k> <minimum frequency>``.
    If either file is missing, a message is printed and the interpreter
    exits. After mining, one line ``<pattern> <confidence> <total support>``
    is printed per entry of ``task.patterns``.
    """
    argv = sys.argv
    pos_path = argv[1]  # first argument: positive class file
    neg_path = argv[2]  # second argument: negative class file
    top_k_value = int(argv[3])  # third argument: top-k value
    min_frequency = int(argv[4])  # fourth argument: minimum frequency

    for path in (pos_path, neg_path):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    database = GraphDatabase()
    # read_graphs() adds the graphs of a file to the database and returns their ids.
    positive_ids = database.read_graphs(pos_path)
    negative_ids = database.read_graphs(neg_path)

    mining_task = FrequentPositiveGraphs(
        min_frequency, database, [positive_ids, negative_ids], top_k_value)
    gSpan(mining_task).run()

    # Entries are indexed as 0 = confidence, 1 = total support, 2 = pattern.
    for entry in mining_task.patterns:
        confidence, total_support, dfs_code = entry[0], entry[1], entry[2]
        print('{} {} {}'.format(dfs_code, confidence, total_support))
Пример #14
0
def find_subgraphs():
    """
    Command-line entry point: mine patterns with gSpan and print them.

    Positional arguments, parsed with argparse: ``positive_file``,
    ``negative_file``, ``top_k`` (int) and ``min_supp`` (int). If either
    file is missing, a message is printed and the interpreter exits. After
    mining, one line ``<dfs code> <confidence> <frequency>`` is printed per
    entry of ``task.patterns``.
    """
    import argparse

    cli = argparse.ArgumentParser("Find subgraphs")
    for arg_name, arg_type in (
        ("positive_file", str),
        ("negative_file", str),
        ("top_k", int),
        ("min_supp", int),
    ):
        cli.add_argument(arg_name, type=arg_type)
    options = cli.parse_args()

    for path in (options.positive_file, options.negative_file):
        if not os.path.exists(path):
            print("{} does not exist.".format(path))
            sys.exit()

    database = GraphDatabase()
    # read_graphs() adds the graphs of a file to the database and returns their ids.
    positive_ids = database.read_graphs(options.positive_file)
    negative_ids = database.read_graphs(options.negative_file)

    mining_task = FrequentPositiveGraphs(
        options.min_supp, database, [positive_ids, negative_ids], options.top_k)
    gSpan(mining_task).run()

    for stats, dfs_code in mining_task.patterns:
        confidence, frequency = stats
        print("{} {} {}".format(dfs_code, confidence, frequency))
def example1():
    """
    Command-line entry point: mine frequent subgraphs and print their
    positive support.

    Expected arguments: ``<positive file> <negative file> <minsup>``.
    If either file is missing, a message is printed and the interpreter
    exits. After mining, one line ``<pattern> <positive support>`` is printed
    per entry of ``task.patterns``, where the positive support is the size of
    the first id subset of the pattern.
    """
    argv = sys.argv
    pos_path = argv[1]  # first argument: positive class file
    neg_path = argv[2]  # second argument: negative class file
    minsup = int(argv[3])  # third argument: minimum support

    for path in (pos_path, neg_path):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    database = GraphDatabase()
    # read_graphs() adds the graphs of a file to the database and returns their ids.
    positive_ids = database.read_graphs(pos_path)
    negative_ids = database.read_graphs(neg_path)

    mining_task = FrequentPositiveGraphs(
        minsup, database, [positive_ids, negative_ids])
    gSpan(mining_task).run()

    for pattern, gid_subsets in mining_task.patterns:
        print('{} {}'.format(pattern, len(gid_subsets[0])))
Пример #16
0
def tae(minsup, database, subsets, top_K, cl):
    """
    Mine patterns once and score several classifiers on the resulting features.

    :param minsup: minimum support forwarded to the mining task
    :param database: graph database forwarded to the mining task
    :param subsets: graph id subsets forwarded to the mining task
    :param top_K: top-k value forwarded to the mining task
    :param cl: iterable of classifier objects exposing ``fit`` and ``predict``
    :return: ``(test accuracies, train accuracies)``, two lists holding one
        value per classifier, in the order of ``cl``
    """
    mining_task = FrequentPositiveGraphs(minsup, database, subsets, top_K)
    gSpan(mining_task).run()

    def labelled(pos_rows, neg_rows):
        # +1 for every positive row followed by -1 for every negative row.
        return numpy.concatenate((
            numpy.full(len(pos_rows), 1, dtype=int),
            numpy.full(len(neg_rows), -1, dtype=int),
        ))

    # Feature matrices are indexed as 0 = positive train, 1 = positive test,
    # 2 = negative train, 3 = negative test.
    matrices = mining_task.get_feature_matrices()
    pos_train, pos_test = matrices[0], matrices[1]
    neg_train, neg_test = matrices[2], matrices[3]

    train_fm = numpy.concatenate((pos_train, neg_train))
    train_labels = labelled(pos_train, neg_train)
    test_fm = numpy.concatenate((pos_test, neg_test))
    test_labels = labelled(pos_test, neg_test)

    test_scores = []
    train_scores = []
    for model in cl:
        model.fit(train_fm, train_labels)

        test_predictions = model.predict(test_fm)
        test_scores.append(metrics.accuracy_score(test_labels, test_predictions))

        train_predictions = model.predict(train_fm)
        train_scores.append(metrics.accuracy_score(train_labels, train_predictions))

    return test_scores, train_scores
def train_and_evaluate(minsup, database, subsets):
    """
    Mine patterns, train a Gaussian naive Bayes model and print the fold results.

    Output, in order: one ``<pattern> <positive support>`` line per mined
    pattern (the support is the size of the pattern's first id subset), the
    predicted test labels, the test accuracy, and a blank line marking the
    end of the fold.

    :param minsup: minimum support forwarded to the mining task
    :param database: graph database forwarded to the mining task
    :param subsets: graph id subsets forwarded to the mining task
    """
    mining_task = FrequentPositiveGraphs(minsup, database, subsets)
    gSpan(mining_task).run()

    def labelled(pos_rows, neg_rows):
        # +1 for every positive row followed by -1 for every negative row.
        return numpy.concatenate((
            numpy.full(len(pos_rows), 1, dtype=int),
            numpy.full(len(neg_rows), -1, dtype=int),
        ))

    # Feature matrices are indexed as 0 = positive train, 1 = positive test,
    # 2 = negative train, 3 = negative test.
    matrices = mining_task.get_feature_matrices()
    pos_train, pos_test = matrices[0], matrices[1]
    neg_train, neg_test = matrices[2], matrices[3]

    train_fm = numpy.concatenate((pos_train, neg_train))
    train_labels = labelled(pos_train, neg_train)
    test_fm = numpy.concatenate((pos_test, neg_test))
    test_labels = labelled(pos_test, neg_test)

    model = naive_bayes.GaussianNB()
    model.fit(train_fm, train_labels)

    predictions = model.predict(test_fm)
    accuracy = metrics.accuracy_score(test_labels, predictions)

    for pattern, gid_subsets in mining_task.patterns:
        print('{} {}'.format(pattern, len(gid_subsets[0])))

    print(predictions)
    print('accuracy: {}'.format(accuracy))
    print()  # Blank line to indicate end of fold.
Пример #18
0
def train_and_evaluate(minsup, database, subsets, k):
    """
    Mine patterns, train a k-nearest-neighbours model and return its accuracy.

    Nothing is printed. After mining, ``task.sortPatters()`` is called before
    the feature matrices are requested.

    :param minsup: minimum support forwarded to the mining task
    :param database: graph database forwarded to the mining task
    :param subsets: graph id subsets forwarded to the mining task
    :param k: top-k value forwarded to the mining task
    :return: accuracy of the model on the test rows
    """
    mining_task = FrequentPositiveGraphs(minsup, database, subsets, k)
    gSpan(mining_task).run()
    mining_task.sortPatters()

    def labelled(pos_rows, neg_rows):
        # +1 for every positive row followed by -1 for every negative row.
        return numpy.concatenate((
            numpy.full(len(pos_rows), 1, dtype=int),
            numpy.full(len(neg_rows), -1, dtype=int),
        ))

    # Feature matrices are indexed as 0 = positive train, 1 = positive test,
    # 2 = negative train, 3 = negative test.
    matrices = mining_task.get_feature_matrices()
    pos_train, pos_test = matrices[0], matrices[1]
    neg_train, neg_test = matrices[2], matrices[3]

    train_fm = numpy.concatenate((pos_train, neg_train))
    train_labels = labelled(pos_train, neg_train)
    test_fm = numpy.concatenate((pos_test, neg_test))
    test_labels = labelled(pos_test, neg_test)

    model = KNeighborsClassifier()
    model.fit(train_fm, train_labels)

    predictions = model.predict(test_fm)
    return metrics.accuracy_score(test_labels, predictions)
Пример #19
0
def run_gspan(graph_file,
              output_file,
              min_support=10,
              min_num_vertices=1,
              where=True,
              **kwargs):
    """Run gSpan on a graph file and save everything it prints.

    The miner is the ``gspan_mining`` package
    (https://github.com/betterenvi/gSpan).  Its standard output is captured
    in memory while it runs and written to ``output_file`` afterwards.

    Args:
      graph_file: graph database formatted as follows
        ```
        t # 0
        v 0 Oct4-Sox2/match=medium/imp=high
        v 1 Oct4-Sox2/match=high/imp=high
        v 2 Oct4-Sox2/match=high/imp=high
        v 3 Oct4-Sox2-deg/match=medium/imp=high
        e 0 1 10-50
        e 1 0 10-50
        t # 1
        v 0 Oct4-Sox2/match=medium/imp=high
        v 1 Nanog/match=medium/imp=high
        ...
        t # -1
        ```
      output_file: path of the text file that receives the captured output
      min_support: minimal required support in order to display the output
        count
      min_num_vertices: minimal number of vertices in the graph
      where: passed unchanged to gSpan's ``where`` option
      **kwargs: any further keyword arguments, passed unchanged to gSpan
    """
    import io
    from contextlib import redirect_stdout

    from gspan_mining import gSpan

    captured = io.StringIO()
    with redirect_stdout(captured):
        miner = gSpan(graph_file,
                      min_support=min_support,
                      min_num_vertices=min_num_vertices,
                      where=where,
                      **kwargs)
        miner.run()

    # The file is only opened once mining has finished.
    report = captured.getvalue()
    with open(output_file, 'w') as sink:
        sink.write(report)
Пример #20
0
def tae(minsup, database, subsets, k):
    """Train a sequential-covering rule list and return its accuracies.

    Up to ``k`` rounds are run.  Each round mines the graphs that no earlier
    rule covers with a top-1 ``FrequentPositiveGraphs`` task, keeps the
    pattern that is smallest under
    ``(pattern[2], pattern[0], pattern[1], pattern[3])`` and removes every
    graph listed in that pattern's id subsets (``pattern[3]``).  Covered
    graphs are predicted with the label of the *selected* pattern
    (``pattern[4]``, truthy meaning positive); graphs left uncovered get the
    majority class of the uncovered training graphs (positive on ties).

    Args:
        minsup: Minimum support forwarded to ``FrequentPositiveGraphs``.
        database: Graph database forwarded to ``FrequentPositiveGraphs``.
        subsets: Four id collections laid out as
            ``[train_pos, test_pos, train_neg, test_neg]``; each is a list or
            an object exposing ``tolist()``.
        k: Maximum number of rules to learn.

    Returns:
        ``(test_accuracy, train_accuracy)`` as floats.

    Raises:
        ZeroDivisionError: If the test subsets or the training subsets are
            empty.
    """
    # Ground truth, captured before any rule removes examples.
    train_pos, test_pos = set(subsets[0]), set(subsets[1])
    train_neg, test_neg = set(subsets[2]), set(subsets[3])

    remaining = [
        subset if isinstance(subset, list) else subset.tolist()
        for subset in subsets
    ]

    test_predictions = []   # (graph id, predicted-positive flag)
    train_predictions = []
    for _ in range(k):
        task = FrequentPositiveGraphs(minsup, database, remaining, 1)
        gSpan(task).run()
        candidates = list(task.patterns)
        if not candidates:
            # Later rounds would mine exactly the same subsets again.
            break

        best = min(candidates, key=lambda p: (p[2], p[0], p[1], p[3]))
        # Take the label from the selected pattern itself, not from whichever
        # pattern happened to be iterated last.
        covered, label = best[3], best[4]

        test_predictions.extend(
            (gid, label) for gid in covered[1] + covered[3])
        train_predictions.extend(
            (gid, label) for gid in covered[0] + covered[2])

        remaining = [
            [gid for gid in pool if gid not in hit]
            for hit, pool in zip(map(set, covered), remaining)
        ]

    default_label = len(remaining[0]) >= len(remaining[2])
    test_predictions.extend(
        (gid, default_label) for gid in remaining[1] + remaining[3])
    train_predictions.extend(
        (gid, default_label) for gid in remaining[0] + remaining[2])

    def accuracy(predictions, positives, negatives):
        # Fraction of predictions that agree with the true class.
        correct = sum(
            (gid in positives and bool(label))
            + (gid in negatives and not label)
            for gid, label in predictions)
        return correct / len(predictions)

    return (accuracy(test_predictions, test_pos, test_neg),
            accuracy(train_predictions, train_pos, train_neg))
Пример #21
0
def Sequential_Covering(k, minsup, database, subsets):
    """Learn covering rules over ``k`` mining rounds and print the outcome.

    Each round runs gSpan on a ``ConfidencePositiveGraphs3`` task built from
    the graphs not yet covered.  ``task.patterns`` is read as a mapping from
    a key to a list of patterns; for every key the pattern with the smallest
    first element is kept.  Its third element becomes the predicted label of
    the test graphs listed in its id subsets (second element), and those id
    subsets are handed to ``RemoveX1FromX2`` to shrink the remaining graphs.
    Test graphs still uncovered at the end get ``'pos'`` when at least as
    many positive as negative training graphs remain, ``'neg'`` otherwise.

    Prints one ``"<pattern[0]> <key[0]> <key[1]>"`` line per kept pattern,
    the predictions (1 for ``'pos'``, -1 otherwise) ordered by graph id,
    ``accuracy: <value>`` and a blank line.

    Args:
        k: Number of mining rounds.
        minsup: Minimum support forwarded to the task.
        database: Graph database forwarded to the task.
        subsets: Four id collections used as
            ``[train_pos, test_pos, train_neg, test_neg]``; each is a list or
            an object exposing ``tolist()``.
    """
    # True test labels, copied before anything is removed.
    true_test_pos, true_test_neg = copy.deepcopy([subsets[1], subsets[3]])

    remaining = [
        subset if type(subset) == type([]) else subset.tolist()
        for subset in subsets
    ]

    rule_book = {}   # pattern[0] -> (label, key under which it was mined)
    predicted = {}   # test graph id -> 'pos' / 'neg'
    for _ in range(k):
        task = ConfidencePositiveGraphs3(1, minsup, database, remaining)
        gSpan(task).run()
        mined = task.patterns
        for score in mined.keys():
            candidates = mined[score]
            if len(candidates) == 1:
                chosen = candidates[0]
            else:
                # Several patterns share this key: keep the one whose first
                # element is smallest (first occurrence on ties).
                codes = [candidate[0] for candidate in candidates]
                chosen = candidates[codes.index(min(codes))]

            rule_book[chosen[0]] = (chosen[2], score)
            covered = chosen[1]
            for gid in covered[1] + covered[3]:
                predicted[gid] = chosen[2]
            remaining = RemoveX1FromX2(covered, remaining)

    leftover_test = remaining[1] + remaining[3]
    fallback = 'pos' if len(remaining[0]) >= len(remaining[2]) else 'neg'
    for gid in leftover_test:
        predicted[gid] = fallback

    labels_by_id = [predicted[gid] for gid in sorted(predicted.keys())]

    for code, (_, score) in rule_book.items():
        print('{} {} {}'.format(code, score[0], score[1]))

    print([1 if label == 'pos' else -1 for label in labels_by_id])

    hits = 0
    for gid, label in predicted.items():
        if gid in true_test_pos and label == 'pos':
            hits += 1
        if gid in true_test_neg and label == 'neg':
            hits += 1
    accuracy = hits / len(predicted)
    print('accuracy: {}'.format(accuracy))
    print()  # Blank line to indicate end of fold.
def train_and_evaluate(minsup, database, subsets, top_k, args=None):
    """Fit a sequential-covering rule list with gSpan and print its results.

    For each of ``top_k`` rounds a top-1 ``FrequentPositiveGraphs`` task is
    mined on the graphs not yet covered.  When patterns are found they are
    sorted by ``(dfs_code, *(confidence, frequency), gid_subsets)`` and the
    first one becomes a rule: the graphs in its id subsets receive its label
    and are passed to ``remove`` to shrink the remaining graphs.  Whatever
    stays uncovered receives 1 when at least as many positive as negative
    training graphs remain, -1 otherwise.

    Prints one ``"<dfs_code> <confidence> <frequency>"`` line per rule, the
    test predictions ordered by graph id, optionally ``train accuracy: ...``,
    then ``accuracy: ...`` and a blank line.

    Args:
        minsup: Minimum support forwarded to the task.
        database: Graph database forwarded to the task.
        subsets: Four id collections used as
            ``[train_pos, test_pos, train_neg, test_neg]``; each is a list or
            an object exposing ``tolist()``.
        top_k: Number of mining rounds.
        args: Optional object; when truthy and ``args.benchmark`` is truthy
            the training accuracy is printed as well.
    """
    def labelled(pos_ids, neg_ids):
        # Sorted (graph id, true label) pairs for one train/test split.
        pairs = [(gid, 1) for gid in pos_ids]
        pairs += [(gid, -1) for gid in neg_ids]
        pairs.sort()
        return pairs

    def record(bucket, gids, label):
        # Keep each prediction list ordered by graph id as it grows.
        for gid in gids:
            insort(bucket, (gid, label))

    def rank(candidate):
        return (candidate[1], *candidate[0], candidate[2])

    def match_ratio(truth, guesses):
        return sum(t == p for t, p in zip(truth, guesses)) / len(guesses)

    y_test = labelled(subsets[1], subsets[3])
    y_train = labelled(subsets[0], subsets[2])

    remaining = []
    for subset in subsets:
        if type(subset) == list:
            remaining.append(subset.copy())
        else:
            remaining.append(subset.tolist())

    rules = []
    y_test_predicted = []
    y_train_predicted = []

    for _ in range(top_k):
        task = FrequentPositiveGraphs(minsup, database, remaining, 1)
        gSpan(task).run()

        if not task.patterns:
            continue

        task.patterns.sort(key=rank)
        winner = task.patterns[0]
        (_, _), _, covered, label = winner

        rules.append(winner)
        record(y_test_predicted, covered[1] + covered[3], label)
        record(y_train_predicted, covered[0] + covered[2], label)

        remaining = remove(covered, remaining)

    if len(remaining[0]) >= len(remaining[2]):
        default_label = 1
    else:
        default_label = -1

    record(y_test_predicted, remaining[1] + remaining[3], default_label)
    record(y_train_predicted, remaining[0] + remaining[2], default_label)

    for (confidence, frequency), dfs_code, _, _ in rules:
        print(f"{dfs_code} {confidence} {frequency}")

    print([label for _, label in y_test_predicted])

    accuracy = match_ratio(y_test, y_test_predicted)
    if args and args.benchmark:
        train_accuracy = match_ratio(y_train, y_train_predicted)
        print(f"train accuracy: {train_accuracy}")

    print(f"accuracy: {accuracy}")

    print()
def train_and_evaluate_task4(minFrequency, database, subsets, k, dataset):
    """Learn up to ``k`` covering rules with gSpan and write the evaluation.

    Each round mines the graphs not yet covered with a
    ``K_MostConfidentAndFrequentPositiveSubGraphs`` task, keeps the smallest
    pattern returned by ``sortList`` as a rule and drops every graph listed
    in that rule's id subsets.  Mining stops early when no pattern is found
    or when no training graph is left.

    Rule format (as produced by the task):
    ``(dfs_code, gid_subsets, confidence, frequency, p_test, n_test,
    isPositivePattern)``.

    Test graphs are classified by the first rule whose id subsets contain
    them; graphs matched by no rule get the default class, which is positive
    when at least as many positive as negative training graphs remain
    uncovered.  The default is computed from the tracked remaining ids, so it
    is also defined when ``k`` is 0 or the very first round finds nothing.

    Written to ``dataset``: one ``<dfs_code>_<confidence>_<frequency>`` line
    per rule (a single empty line when there are no rules), the list of
    predictions (positives' test graphs first, then negatives'), the line
    ``accuracy: <value>`` and a blank line.

    Args:
        minFrequency: Minimum frequency forwarded to the task.
        database: Graph database forwarded to the task.
        subsets: Four id collections used as
            ``[train_pos, test_pos, train_neg, test_neg]``.
        k: Maximum number of rules to learn.
        dataset: Writable text stream that receives the report.

    Raises:
        ZeroDivisionError: If both test subsets are empty.
    """
    def uncovered(ids, covered):
        # Ids (as plain ints) that the chosen pattern does not cover.
        covered = set(covered)
        return [int(gid) for gid in ids if int(gid) not in covered]

    rules = []
    # Graphs still uncovered, in the same layout as ``subsets``.
    remaining = [subsets[0], subsets[1], subsets[2], subsets[3]]
    task_subsets = subsets
    for _ in range(k):
        task = K_MostConfidentAndFrequentPositiveSubGraphs(
            minFrequency, database, task_subsets, 5, True, True)
        gSpan(task).run()  # Running gSpan

        if len(task.patterns) == 0:
            break
        pattern = min(sortList(task.patterns))
        rules.append(pattern)

        gid_subsets = pattern[1]
        remaining = [uncovered(remaining[i], gid_subsets[i]) for i in range(4)]
        task_subsets = remaining
        if not remaining[0] and not remaining[2]:
            break

    # Majority class of the uncovered training graphs, positive on ties.
    is_default_positive = len(remaining[0]) >= len(remaining[2])

    def predict(gid, subset_index):
        # The first rule covering the graph decides; otherwise the default.
        for rule in rules:
            if gid in rule[1][subset_index]:
                return bool(rule[6])
        return is_default_positive

    test_positive = subsets[1]
    test_negative = subsets[3]
    pos_outcomes = [predict(gid, 1) for gid in test_positive]
    neg_outcomes = [predict(gid, 3) for gid in test_negative]

    predicted = [1 if outcome else -1
                 for outcome in pos_outcomes + neg_outcomes]
    correct = (sum(pos_outcomes)
               + sum(not outcome for outcome in neg_outcomes))
    accuracy = correct / (len(test_positive) + len(test_negative))

    # Frequent patterns along with their confidence and frequency.
    result = '\n'.join(
        '{}_{}_{}'.format(dfs_code, confidence, frequency)
        for dfs_code, _, confidence, frequency, _, _, _ in rules)

    print(result, file=dataset)
    # Classification results.
    print(predicted, file=dataset)
    print('accuracy: {}'.format(accuracy), file=dataset)
    print("", file=dataset)  # Blank line to indicate end of fold.
Пример #24
0
def train_and_evaluate(minsup, database, subsets, k):
    """Fit a sequential-covering rule list with gSpan and print its results.

    Up to ``k`` rounds are run.  Each round mines the graphs that no earlier
    rule covers with a top-1 ``FrequentPositiveGraphs`` task, keeps the
    pattern that is smallest under
    ``(pattern[2], pattern[0], pattern[1], pattern[3])`` and removes every
    graph listed in that pattern's id subsets (``pattern[3]``).  Covered test
    graphs are predicted with the label of the *selected* pattern
    (``pattern[4]``, truthy meaning positive); uncovered ones get the
    majority class of the uncovered training graphs (positive on ties).

    Prints one ``"<pattern[2]> <pattern[0]> <pattern[1]>"`` line per rule,
    the predictions (1 / -1) ordered by test graph id, ``accuracy: <value>``
    and a blank line.

    Args:
        minsup: Minimum support forwarded to ``FrequentPositiveGraphs``.
        database: Graph database forwarded to ``FrequentPositiveGraphs``.
        subsets: Four id collections laid out as
            ``[train_pos, test_pos, train_neg, test_neg]``; each is a list or
            an object exposing ``tolist()``.
        k: Maximum number of rules to learn.

    Raises:
        ZeroDivisionError: If both test subsets are empty.
    """
    # True test labels, captured before any rule removes examples.
    test_pos = set(subsets[1])
    test_neg = set(subsets[3])

    remaining = [
        subset if isinstance(subset, list) else subset.tolist()
        for subset in subsets
    ]

    rules = []
    predictions = []  # (test graph id, predicted-positive flag)
    for _ in range(k):
        task = FrequentPositiveGraphs(minsup, database, remaining, 1)
        gSpan(task).run()
        candidates = list(task.patterns)
        if not candidates:
            # Later rounds would mine exactly the same subsets again.
            break

        best = min(candidates, key=lambda p: (p[2], p[0], p[1], p[3]))
        # Take the label from the selected pattern itself, not from whichever
        # pattern happened to be iterated last.
        covered, label = best[3], best[4]
        rules.append(best)

        predictions.extend((gid, label) for gid in covered[1] + covered[3])
        remaining = [
            [gid for gid in pool if gid not in hit]
            for hit, pool in zip(map(set, covered), remaining)
        ]

    default_label = len(remaining[0]) >= len(remaining[2])
    predictions.extend(
        (gid, default_label) for gid in remaining[1] + remaining[3])
    predictions.sort()  # report predictions in graph-id order

    for rule in rules:
        print('{} {} {}'.format(rule[2], rule[0], rule[1]))

    print([1 if label else -1 for _, label in predictions])

    correct = sum(
        (gid in test_pos and bool(label)) + (gid in test_neg and not label)
        for gid, label in predictions)
    accuracy = correct / len(predictions)
    print('accuracy: {}'.format(accuracy))
    print()
Пример #25
0
def train_and_evaluate(minsup, database, subsets, top_K):
    """Fit a sequential-covering rule list with gSpan and print its results.

    Up to ``top_K`` rounds are run.  Each round mines the graphs that no
    earlier rule covers with a top-1 ``FrequentPositiveGraphs`` task and
    keeps the pattern that is smallest under
    ``(pattern[2], pattern[0], pattern[1], pattern[3])``.  The pattern's id
    subsets (``pattern[3]``) are handed to ``remove`` to shrink the remaining
    graphs.  Covered test graphs are predicted with the label of the
    *selected* pattern (``pattern[4]``, truthy meaning positive); uncovered
    ones get the majority class of the uncovered training graphs (positive
    on ties).

    Prints one ``"<pattern[2]> <pattern[0]> <pattern[1]>"`` line per rule,
    the predictions (1 / -1) ordered by test graph id, ``accuracy: <value>``
    and a blank line.

    Args:
        minsup: Minimum support forwarded to ``FrequentPositiveGraphs``.
        database: Graph database forwarded to ``FrequentPositiveGraphs``.
        subsets: Four id collections laid out as
            ``[train_pos, test_pos, train_neg, test_neg]``; each is a list or
            an object exposing ``tolist()``.
        top_K: Maximum number of rules to learn.

    Raises:
        ZeroDivisionError: If both test subsets are empty.
    """
    # True test labels, captured before any rule removes examples.
    true_pos = set(subsets[1])
    true_neg = set(subsets[3])

    uncovered = [
        subset if isinstance(subset, list) else subset.tolist()
        for subset in subsets
    ]

    chosen_rules = []
    test_is_pos = []  # (test graph id, predicted-positive flag)
    for _ in range(top_K):
        task = FrequentPositiveGraphs(minsup, database, uncovered, 1)
        gSpan(task).run()
        mined = list(task.patterns)
        if not mined:
            # Later rounds would mine exactly the same subsets again.
            break

        best = min(mined, key=lambda p: (p[2], p[0], p[1], p[3]))
        # Take the label from the selected pattern itself, not from whichever
        # pattern happened to be iterated last.
        gid_subsets, is_positive_rule = best[3], best[4]
        chosen_rules.append(best)

        for gid in gid_subsets[1] + gid_subsets[3]:
            test_is_pos.append((gid, is_positive_rule))

        uncovered = remove(gid_subsets, uncovered)

    default_is_pos = len(uncovered[0]) >= len(uncovered[2])
    for gid in uncovered[1] + uncovered[3]:
        test_is_pos.append((gid, default_is_pos))
    test_is_pos.sort()  # report predictions in graph-id order

    for rule in chosen_rules:
        print('{} {} {}'.format(rule[2], rule[0], rule[1]))

    print([1 if flag else -1 for _, flag in test_is_pos])

    counter = 0
    for gid, flag in test_is_pos:
        if gid in true_pos and flag:
            counter += 1
        if gid in true_neg and not flag:
            counter += 1
    accuracy = counter / len(test_is_pos)
    print('accuracy: {}'.format(accuracy))
    print()
Пример #26
0
def tae(minsup, database, subsets, top_K):
    """Train a sequential-covering rule list and return its accuracies.

    Up to ``top_K`` rounds are run.  Each round mines the graphs that no
    earlier rule covers with a top-1 ``FrequentPositiveGraphs`` task and
    keeps the pattern that is smallest under
    ``(pattern[2], pattern[0], pattern[1], pattern[3])``.  The pattern's id
    subsets (``pattern[3]``) are handed to ``remove`` to shrink the remaining
    graphs.  Covered graphs are predicted with the label of the *selected*
    pattern (``pattern[4]``, truthy meaning positive); graphs left uncovered
    get the majority class of the uncovered training graphs (positive on
    ties).

    Args:
        minsup: Minimum support forwarded to ``FrequentPositiveGraphs``.
        database: Graph database forwarded to ``FrequentPositiveGraphs``.
        subsets: Four id collections laid out as
            ``[train_pos, test_pos, train_neg, test_neg]``; each is a list or
            an object exposing ``tolist()``.
        top_K: Maximum number of rules to learn.

    Returns:
        ``(test_accuracy, train_accuracy)`` as floats.

    Raises:
        ZeroDivisionError: If the test subsets or the training subsets are
            empty.
    """
    # Ground truth, captured before any rule removes examples.
    truth = [set(subsets[0]), set(subsets[1]),
             set(subsets[2]), set(subsets[3])]

    uncovered = [
        subset if isinstance(subset, list) else subset.tolist()
        for subset in subsets
    ]

    test_is_pos = []   # (graph id, predicted-positive flag)
    train_is_pos = []
    for _ in range(top_K):
        task = FrequentPositiveGraphs(minsup, database, uncovered, 1)
        gSpan(task).run()
        mined = list(task.patterns)
        if not mined:
            # Later rounds would mine exactly the same subsets again.
            break

        best = min(mined, key=lambda p: (p[2], p[0], p[1], p[3]))
        # Take the label from the selected pattern itself, not from whichever
        # pattern happened to be iterated last.
        gid_subsets, is_positive_rule = best[3], best[4]

        for gid in gid_subsets[1] + gid_subsets[3]:
            test_is_pos.append((gid, is_positive_rule))
        for gid in gid_subsets[0] + gid_subsets[2]:
            train_is_pos.append((gid, is_positive_rule))

        uncovered = remove(gid_subsets, uncovered)

    default_is_pos = len(uncovered[0]) >= len(uncovered[2])
    for gid in uncovered[1] + uncovered[3]:
        test_is_pos.append((gid, default_is_pos))
    for gid in uncovered[0] + uncovered[2]:
        train_is_pos.append((gid, default_is_pos))

    def share_correct(predictions, positives, negatives):
        # Fraction of predictions that agree with the true class.
        counter = 0
        for gid, flag in predictions:
            if gid in positives and flag:
                counter += 1
            if gid in negatives and not flag:
                counter += 1
        return counter / len(predictions)

    testaccuracy = share_correct(test_is_pos, truth[1], truth[3])
    trainaccuracy = share_correct(train_is_pos, truth[0], truth[2])

    return testaccuracy, trainaccuracy