예제 #1
0
def enumerate_frequent_seq(elements, support_threshold):
    '''Recursively traverse the sequence lattice, generating frequent
    n+1-length sequences from the n-length sequences held in ``elements``.

    Each entry of ``elements`` is paired with every entry that follows it in
    key order and the pair is combined with ``temporal_join``. A joined
    candidate is kept when the number of distinct ``sid`` values among its
    events is at least ``support_threshold``. The candidates kept for one
    left-hand entry form the next level, which is expanded by a recursive
    call; the recursion stops when a level yields no candidates.

    Parameters:
        elements: mapping from a sequence key to its element. Each element
            must expose ``events``, and each event must expose ``sid``.
        support_threshold: minimum number of distinct ``sid`` values a
            candidate needs in order to be kept.

    Returns:
        A ``_KeyDefaultDict(Element)`` holding every frequent candidate
        produced at this level and at all deeper levels. Only join results
        are added; the entries of ``elements`` are not copied into it.
    '''

    frequent_elements = _KeyDefaultDict(Element)

    # Snapshot the keys once: the list does not change while we iterate, so
    # there is no need to rebuild it for every left-hand entry.
    seqs = list(elements.keys())

    for index_i, seq_i in enumerate(seqs):

        # Frequent candidates that were generated with seq_i on the left.
        frequent_elements_inner = _KeyDefaultDict(Element)

        # Only pair seq_i with later keys so each unordered pair is joined once.
        for seq_j in seqs[index_i + 1:]:

            R = temporal_join(elements[seq_i], elements[seq_j])

            for seq, element in R.items():
                # Support counts distinct sequence ids, not raw events.
                support = len({event.sid for event in element.events})
                if support >= support_threshold:
                    frequent_elements_inner[seq] |= element

        for seq, element in frequent_elements_inner.items():
            frequent_elements[seq] |= element

        # Descend one level: the candidates just found become the input.
        deeper = enumerate_frequent_seq(frequent_elements_inner,
                                        support_threshold)
        for seq, element in deeper.items():
            frequent_elements[seq] |= element

    return frequent_elements
예제 #2
0
def mine(sequences, support_threshold):
    '''Mine frequent sequences with SPADE (Zaki 2001).

    The work is done in three distinct steps:
    1. Identify frequent single elements.
    2. Identify frequent two-element sequences.
    3. Identify all remaining sequences of three elements or more.

    Parameters:
        sequences: iterable of ``(sid, eid, itemset)`` triples; every item of
            ``itemset`` is converted to a key with ``tuple(item)``.
        support_threshold: minimum number of distinct ``sid`` values a
            sequence needs in order to be reported.

    Returns:
        The mapping produced by ``enumerate_frequent_seq``, extended with the
        frequent two-element sequences and the frequent single elements.
    '''

    # Step 0: turn the input rows into one Element per individual item.
    elements = _KeyDefaultDict(Element)
    for sid, eid, itemset in sequences:
        for item in itemset:
            key = tuple(item)
            elements[key] |= Element(key, Event(sid=sid, eid=eid))

    # Step 1: keep only the single elements that are frequent.
    elements = subset_to_support(elements, support_threshold)

    # Step 2a: find the frequent two-element sequences (horizontal database).
    frequent_pairs = count_frequent_two_seq(elements, support_threshold)

    # Step 2b: build ID lists for those two-element sequences.
    elements_len_eq_2 = _KeyDefaultDict(Element)
    for pair in frequent_pairs:
        left_key = tuple(pair[0])
        right_key = tuple(pair[1])
        joined = temporal_join(elements[left_key], elements[right_key])

        for seq, element in joined.items():
            distinct_sids = {event.sid for event in element.events}
            if len(distinct_sids) >= support_threshold:
                elements_len_eq_2[seq] |= element

    # Step 3: enumerate every longer frequent sequence.
    freq = enumerate_frequent_seq(elements_len_eq_2, support_threshold)

    # Fold the shorter sequences into the result so all lengths are reported.
    for shorter in (elements_len_eq_2, elements):
        for seq, element in shorter.items():
            freq[seq] |= element

    return freq
예제 #3
0
def temporal_join(element_i, element_j):
    '''Given two elements, return a dictionary of new elements indexed by
    their corresponding item sequences.

    Every pair of events (one from each element) that shares a ``sid`` is
    examined:

    * different ``eid`` values: the element with the earlier event keeps its
      whole ``seq`` and ``tuple(last entry)`` of the other element is
      appended; the new event carries the later ``eid``
      (Corollary 1, Zaki 2001).
    * equal ``eid`` values and different last entries: the two last entries
      are merged into one sorted, de-duplicated string that replaces the last
      entry of each element's ``seq``.
    * equal ``eid`` values and equal last entries: nothing is produced.

    Parameters:
        element_i, element_j: objects exposing ``seq`` (a tuple) and
            ``events`` (an iterable of objects with ``sid`` and ``eid``).

    Returns:
        A ``_KeyDefaultDict(Element)`` mapping each generated sequence to an
        element accumulating the events generated for it.
    '''

    join_results = _KeyDefaultDict(Element)

    for event_i in element_i.events:
        for event_j in element_j.events:

            # Only events from the same input sequence can be combined.
            if event_i.sid != event_j.sid:
                continue

            sid = event_i.sid

            # these two atoms occur in the same sequence
            # if they occur at different times (different eids), then
            # their combination atom has the later eid by Corollary 1 (Zaki 2001)
            # NOTE(review): tuple() of a string yields its characters, so a
            # multi-character last entry is appended as several
            # single-character entries - confirm this is intended.
            if event_i.eid > event_j.eid:
                superseq = element_j.seq + tuple(element_i.seq[-1])
                superseq_event = Event(sid=sid, eid=event_i.eid)
                join_results[superseq] |= Element(superseq, superseq_event)

            elif event_i.eid < event_j.eid:
                superseq = element_i.seq + tuple(element_j.seq[-1])
                superseq_event = Event(sid=sid, eid=event_j.eid)
                join_results[superseq] |= Element(superseq, superseq_event)

            elif element_i.seq[-1] != element_j.seq[-1]:

                superseq_event = Event(sid=sid, eid=event_j.eid)

                # for coincident atoms, join the last element of one atom to
                # the other; sorting keeps the merged itemset canonical.
                # The merged itemset is the same for both results, so it is
                # built once.
                merged_itemset = ''.join(
                    sorted(set(element_i.seq[-1] + element_j.seq[-1])))

                superseq_i = element_i.seq[:-1] + (merged_itemset,)
                join_results[superseq_i] |= Element(
                    superseq_i, superseq_event)

                superseq_j = element_j.seq[:-1] + (merged_itemset,)

                # if both resulting atoms are identical, only add it once
                if superseq_j != superseq_i:
                    join_results[superseq_j] |= Element(
                        superseq_j, superseq_event)

    return join_results
예제 #4
0
def subset_to_support(elements, support_threshold):
    '''Filter ``elements`` down to the entries that meet the support threshold.

    The support of an entry is the number of distinct ``sid`` values found
    among its ``events``. Entries whose support is at least
    ``support_threshold`` are carried over under the same key, referencing
    the same element object, into a fresh ``_KeyDefaultDict(Element)``.
    ``elements`` itself is not modified.
    '''

    kept = _KeyDefaultDict(Element)

    for name, candidate in elements.items():
        # Count each sequence id once, however many events it contributes.
        distinct_sids = {event.sid for event in candidate.events}
        if len(distinct_sids) >= support_threshold:
            kept[name] = candidate

    return kept