Exemplo n.º 1
0
    def build_smart_tree(self):
        """Build and return a hard-coded SmartTree fixture.

        A new tree is created, its root is set, and a fixed list of item
        paths is inserted from that root with ``add_path``. Every path is
        passed together with a numeric third argument (its support); paths
        without a specific value use a default of 1.

        :return: the populated ``SmartTree`` instance
        """
        unit = 1  # support used for paths that carry no specific value

        # (path, support) pairs; they are inserted in exactly this order.
        weighted_paths = (
            ([1, 2], unit),
            ([1, 3, 1], 8),
            ([1, 3, 1, 0], 8),
            ([1, 3, 1, 6], 1),
            ([1, 3, 1, 4], 6),
            ([1, 3, 1, 4, 9], 3),
            ([1, 3, 1, 4, 2], 3),
            ([1, 3, 6], unit),
            ([1, 3, 2], unit),
            ([2, 3], unit),
            ([2, 1], unit),
            ([3, 4, 2, 1, 5, 1], unit),
            ([3, 4, 2, 6], unit),
            ([3, 4, 1, 6], unit),
            ([3, 4, 1, 5, 5], unit),
            ([3, 4, 1, 5, 4], unit),
            ([3, 5], unit),
            ([9], unit),
            ([4, 2, 1, 5, 1, 3], 1),
            ([4, 2, 1, 5, 1], 8),
            ([4, 2, 1, 5, 1, 2], 7),
            ([4, 2, 1, 5, 1, 6], 8),
            ([4, 2, 1, 5, 1, 6, 9], 4),
            ([4, 2, 1, 5, 1, 6, 9, 3], 1),
            ([4, 2, 1, 5, 1, 6, 9, 0], 3),
            ([4, 2, 1, 5, 1, 6, 4], 4),
        )

        tree = SmartTree()
        root = tree.set_root()
        for path, support in weighted_paths:
            tree.add_path(root, path, support)
        return tree
def add_nodes_to_graph(seqs, last_k):
    """Build a sequence tree, a count table and a state graph from ``seqs``.

    For each sequence, the tuple of its trailing ``last_k`` items is treated
    as a state. The first time a state is seen it is inserted into the tree
    as a path from the root and added as a node to the directed graph; every
    occurrence increments its entry in the count table.

    The tuple of the trailing ``last_k + 1`` items is counted as well, since
    one extra element is needed to compute transition counts from one state
    to the next. These longer tuples are recorded in the count table only;
    they are not added to the tree or to the graph here.

    :param seqs: iterable of sliceable sequences of hashable items
    :param last_k: number of trailing items that make up a state
    :return: tuple ``(tree, counts, graph)``
    """
    tree = SmartTree()
    root = tree.set_root()

    counts = {}
    graph = nx.DiGraph()

    for seq in seqs:
        state = tuple(seq[-last_k:])
        if state not in counts:
            # First sighting: register the state in the count table,
            # the sequence tree and the graph.
            counts[state] = 0
            tree.add_path(root, list(state))
            graph.add_node(state)
        counts[state] += 1

        extended = tuple(seq[-(last_k + 1):])
        if extended == state:
            # The sequence is too short to have an extra element; counting
            # it again would double-count the state.
            continue
        counts[extended] = counts.get(extended, 0) + 1

    return (tree, counts, graph)
Exemplo n.º 3
0
    def fit(self, train_data=None):
        """
        Fit the model: mine frequent sequences and index them in a prefix tree.

        :param train_data: (optional) DataFrame with the training sequences, which must be assigned to column "sequence".
            If None, run FSM using SPMF over the sequence database stored in `self.db_path`.
            Otherwise, run FSM using `pymining.seqmining` (slower).
        :raises ValueError: if `train_data` is None and `self.spmf_path` or `self.db_path` is not set,
            or if a mined frequent sequence is empty.
        :raises NameError: if SPMF is used and `self.minsup` is outside [0, 1]
            (exception type kept for backward compatibility).
        """

        if train_data is None:
            if self.spmf_path is None or self.db_path is None:
                # The message names the attribute that is actually checked
                # (`spmf_path`), so the user knows what to set.
                raise ValueError(
                    "You should set db_path and spmf_path before calling fit() without arguments."
                )

            self.logger.info('Using SPFM (Java) for Frequent Sequence Mining')
            if not 0 <= self.minsup <= 1:
                raise NameError("SPMF only accepts 0<=minsup<=1")
            # SPMF receives the minimum support as a percentage string.
            percentage_min_sup = self.minsup * 100

            # call spmf
            command = ' '.join([
                self.spmf_algorithm, self.db_path, self.output_path,
                str(percentage_min_sup) + '%'
            ])
            callSPMF(self.spmf_path, command)

            # parse back output from text file
            self._parse_spfm_output()
        else:
            # use pymining
            self.logger.info(
                'Using pymining.seqmining (python) for Frequent Sequence Mining'
            )
            sequences = train_data['sequence'].values
            if 0 <= self.minsup <= 1:
                # A value in [0, 1] is a fraction of the number of sequences.
                msup = int(self.minsup * len(sequences))
            else:
                # Anything else is passed through unchanged.
                msup = self.minsup
            self.logger.info(
                'Mining frequent sequences (minsup={})'.format(msup))
            self.freq_seqs = seqmining.freq_seq_enum(sequences, msup)

        self.logger.info('{} frequent sequences found'.format(
            len(self.freq_seqs)))
        self.logger.info('Building the prefix tree')
        self.tree = SmartTree()
        self.root_node = self.tree.set_root()
        for pattern, support in self.freq_seqs:
            pattern_length = len(pattern)
            if pattern_length == 0:
                raise ValueError('Frequent sequence of length 0')
            if pattern_length == 1:
                # add node to root
                self.tree.create_node(pattern[0],
                                      parent=self.root_node,
                                      data={"support": support})
            else:
                # add entire path starting from root
                self.tree.add_path(self.root_node, pattern, support)
        self.logger.info('Training completed')
    def fit(self, seqs):
        """Mine frequent sequences and build the frequent-sequence tree.

        When both ``self.spmf_path`` and ``self.db_path`` are set, SPMF is
        called with the PrefixSpan algorithm on ``self.db_path`` and its
        output is parsed back. Otherwise, a non-empty ``seqs`` is mined with
        ``seqmining.freq_seq_enum``.

        :param seqs: list of sequences (each a list of items); only used
            when SPMF is not configured.
        :raises ValueError: if neither the SPMF paths nor a non-empty
            ``seqs`` is available, so there is nothing to mine.
        :raises NameError: if SPMF is used and ``self.minsup`` is outside
            [0, 1], or if a mined frequent sequence is empty (exception type
            kept for backward compatibility).
        """

        if self.spmf_path and self.db_path:
            self.logger.info("Using SPMF")
            # parse minsup: SPMF receives it as a percentage string
            if not 0 <= self.minsup <= 1:
                raise NameError("SPMF only accepts 0<=minsup<=1")
            percentage_min_sup = self.minsup * 100

            # call spmf
            algorithm = "PrefixSpan"
            command = ' '.join([
                algorithm, self.db_path, self.outputPath,
                str(percentage_min_sup) + '%'
            ])
            callSPMF(self.spmf_path, command)

            # parse back output from text file
            self._parse_SPMF_output()
        elif seqs:
            # A minsup in [0, 1] is a fraction of the number of sequences;
            # anything else is passed through unchanged.
            msup = self.minsup * len(
                seqs) if 0 <= self.minsup <= 1 else self.minsup

            self.logger.debug('Mining frequent sequences')
            self.freq_seqs = seqmining.freq_seq_enum(seqs, msup)
        else:
            self.logger.error(
                "No sequence dabase path nor sequence list provided.")
            # Stop here: continuing would read self.freq_seqs, which is
            # either unset or left over from an earlier fit.
            raise ValueError(
                "No sequence database path nor sequence list provided.")

        self.logger.info('{} frequent sequences found'.format(
            len(self.freq_seqs)))
        self.logger.debug('Building frequent sequence tree')
        self.tree = SmartTree()
        self.rootNode = self.tree.set_root()
        for pattern, support in self.freq_seqs:
            pattern_length = len(pattern)
            if pattern_length == 1:
                # add node to root
                self.tree.create_node(pattern[0],
                                      parent=self.rootNode,
                                      data={"support": support})
            elif pattern_length > 1:
                # add entire path starting from root
                self.tree.add_path(self.rootNode, pattern, support)
            else:
                raise NameError('Frequent sequence of length 0')
        self.logger.debug('Tree completed')