Example No. 1
    def fit(self, X, y=None):

        import pandas as pd
        from prefixspan import PrefixSpan

        l = list()
        p = PrefixSpan(X)
        for i in range(2, self.__frequent + 1):
            l.extend(p.frequent(i))
        df = pd.DataFrame(columns=["secuencia", "support", "tam"])
        for i, j in enumerate(l):
            df.loc[i] = [j[1], j[0] / len(X), len(j[1])]
        df = df.sort_values("tam", ascending=True)
        df.drop("tam", axis=1, inplace=True)
        df = df[df["support"] >= self.__minSupport]
        df = df.reset_index(drop=True)

        for i in df.iterrows():
            node = self.root
            for pos, j in enumerate(i[1]["secuencia"]):
                if node.existChildren(j):
                    node = node.getChildren(j)
                    if pos == len(i[1]["secuencia"]) - 1:
                        node.setSupport(i[1]["support"])
                else:
                    child = nodo(se=j, su=i[1]["support"])
                    node.addChild(j, child)
                    node = child

        return self
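The trie node class `nodo` used by `fit` is not shown in this example; a minimal sketch of an implementation matching the interface assumed above (`existChildren`, `getChildren`, `addChild`, `setSupport`):

# Hypothetical sketch of the `nodo` trie node; the real class is defined elsewhere.
class nodo:
    def __init__(self, se=None, su=0.0):
        self.sequence = se    # item stored at this node
        self.support = su     # support of the prefix ending here
        self.children = {}    # item -> child node

    def existChildren(self, item):
        return item in self.children

    def getChildren(self, item):
        return self.children[item]

    def addChild(self, item, child):
        self.children[item] = child

    def setSupport(self, support):
        self.support = support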
Example No. 2
def sequence_mining(min_support, token):
    current_dir = os.path.dirname(os.path.abspath(__file__))
    # data_path = "{}\\dataset\\upload_sequence_processed{}.txt".format(
    # data_path = "{}\\dataset\\upload_sequence_processed-{}.txt".format(
    #     current_dir, token)
    data_path = os.path.join(current_dir, 'dataset',
                             'upload_sequence_processed-{}.txt'.format(token)
                             )
    db = []
    with open(data_path, 'r') as f:
        file = reader(f, delimiter=' ', quotechar='\r')
        i = 0
        for row in file:
            # if i % 2 == 0:
            # if i % 2 == 0 or i % 2 == 1:
                # print(row)
            db.append([int(item) for item in row])
            i += 1
    row_count = len(db)
    if min_support * row_count < 2:
        if row_count != 0:
            min_support = 2 / row_count
    # print(db)
    # print(db)
    # print(row_count)
    ps = PrefixSpan(db)
    all_sequence = ps.frequent(row_count*min_support)
    # all_sequence = ps.frequent(1)
    all_sequence_num = len(all_sequence)
    # print("="*99)
    # print(all_sequence_num)
    return all_sequence_num, all_sequence
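A hedged smoke test for the function above; the token "demo", the file location and its contents are assumptions made for illustration (each line is a space-separated sequence of integer item IDs):

import os

# Create a tiny demo file in the location the function expects.
demo_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'dataset')
os.makedirs(demo_dir, exist_ok=True)
with open(os.path.join(demo_dir, 'upload_sequence_processed-demo.txt'), 'w') as f:
    f.write("1 2 3 2\n1 3 2\n2 3 1\n")

count, patterns = sequence_mining(min_support=0.5, token="demo")
print(count)         # number of frequent sub-sequences found
print(patterns[:5])  # (frequency, pattern) tuples from PrefixSpan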
Example No. 3
    def score_session(self, items: List[int], items_to_score: List[int],
                      relevant_sessions_indices: Set[int]):
        scores = self.items_to_scores.get(str(items))
        if scores is None:
            self.competence = []

            self.last = len(items)
            self.bcs_sigma = (self.last - 1) / (2 * np.sqrt(2 * np.log(2)))
            if self.bcs_sigma == 0:
                self.bcs_sigma = 0.1
            self.bcs_a = 1 / (self.bcs_sigma * np.sqrt(2 * np.pi))
            total_bcs_weight = sum(
                [self.bcs_weight(i + 1) for i, x in enumerate(items)])

            relevant_sessions = [self.db[i] for i in relevant_sessions_indices]

            for session in relevant_sessions:
                lcs = get_longest_common_subsequence(session, items)
                lcs_indices = get_indices(items, lcs)

                bcs = sum([self.bcs_weight(x)
                           for x in lcs_indices]) / total_bcs_weight

                fes_last = len(session)
                self.lcs_last = get_indices(session, lcs)[-1]
                self.fes_sigma = (fes_last -
                                  self.lcs_last) / (2 * np.sqrt(2 * np.log(2)))
                if self.fes_sigma == 0:
                    self.fes_sigma = 0.1
                self.fes_a = 1 / (self.fes_sigma * np.sqrt(2 * np.pi))
                cni = session[self.lcs_last:]
                unique_cni = set(cni)
                fes = sum(
                    [self.fes_weight(cni.index(x) + 1)
                     for x in unique_cni]) / len(items)

                self.competence.append(0 if bcs == 0 or fes == 0 else (bcs *
                                                                       fes) /
                                       (1 / 2 * (bcs + fes)))

            # mine patterns
            self.total_weight = sum(self.competence)

            ps = PrefixSpan(relevant_sessions)

            patterns = ps.frequent(self.delta,
                                   key=self.pattern_key,
                                   bound=self.pattern_key)

            scores = self.score_items(patterns)

            self.items_to_scores.update({str(items): scores})
        predictions = np.zeros(len(items_to_score))
        mask = np.isin(items_to_score, list(scores.keys()))
        scored_items = items_to_score[mask]
        values = [scores[x] for x in scored_items]
        predictions[mask] = values
        return pd.Series(data=predictions, index=items_to_score)
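The weighting helpers `bcs_weight` and `fes_weight` are not part of this snippet; a plausible sketch, assuming Gaussian-shaped positional weights built from the precomputed `bcs_a`/`bcs_sigma` and `fes_a`/`fes_sigma` (an assumption, not the original definition):

import numpy as np

# Hypothetical method sketches for the surrounding class.
def bcs_weight(self, position):
    # Gaussian weight over positions in the current session, centered on the last item.
    return self.bcs_a * np.exp(-((position - self.last) ** 2)
                               / (2 * self.bcs_sigma ** 2))

def fes_weight(self, position):
    # Gaussian weight that decays with distance after the matched subsequence.
    return self.fes_a * np.exp(-(position ** 2) / (2 * self.fes_sigma ** 2))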
Example No. 4
def base_sequence(dataset, num):
    data = []

    for i in range(0, num):
        item = dataset[i].split("\t")[0].lower()
        data.append(item)
    ps_base = PS(data)
    base_sequence = get_longest(ps_base.topk(1000))
    return base_sequence
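The `get_longest` helper used here (and again in a later example) is not shown; a minimal sketch, assuming it simply keeps the longest pattern among the `topk` results:

# Hypothetical sketch: PrefixSpan.topk returns (frequency, pattern) tuples,
# and this keeps the pattern of maximal length.
def get_longest(topk_result):
    if not topk_result:
        return []
    return max(topk_result, key=lambda freq_patt: len(freq_patt[1]))[1]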
def mine_string_patterns(doc):
    id, lines = doc
    docs = []

    for i, line in enumerate(lines):
        lr = []
        line = re.sub(r'\d+', '', line)
        toks = line.strip().split(' ')
        for t in toks:
            if t:
                lr.append(t)
        docs.append(lr)

    wordmap = {}  # type: Dict[str, int] #problematic!
    idx = 0
    for doc in docs:
        for tok in doc:
            if tok not in wordmap:
                wordmap[tok] = idx
                idx += 1
    doc_vecs = []
    for doc in docs:
        doc_vec = []
        for tok in doc:
            doc_vec.append(wordmap[tok])
        doc_vecs.append(doc_vec)
    db = doc_vecs
    ps = PrefixSpan(db)
    invwordmap = invert(wordmap)
    func = ps.frequent
    # lambda function for sorting
    key = None
    # upper bound
    bound = None
    # filter lambda function
    filter = None
    threshold = 2
    closed = True
    generator = False
    ps.minlen = 2
    ps.maxlen = 10
    results = []
    for freq, patt in func(threshold,
                           closed=closed,
                           generator=generator,
                           key=key,
                           bound=bound,
                           filter=filter):
        pattern = ' '.join((invwordmap[i] for i in patt))
        results.append([pattern, freq])

    return id, results
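The `invert` helper assumed above is not shown; a minimal sketch that flips the token-to-id `wordmap` into an id-to-token map for decoding patterns:

# Hypothetical sketch of the invert helper.
def invert(mapping):
    return {v: k for k, v in mapping.items()}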
Example No. 6
def nlp_FreqSubsequenceMining(classfication,
                              MINSUP=3,
                              CLOSED=False,
                              GENERATOR=True):

    Sequences.clear()

    print("Analyzing %s..." % classfication)

    nlp = spacy.load("en_core_web_sm")

    # Read from raw data file, then convert it to NodeID-sequences.
    file = open("./DATA-Train/DATA-%s" % classfication)
    while 1:

        # read a conf-id, a description from file
        line = file.readline()
        if not line:
            break
        try:
            conf_id, conf_desc_text_raw = line.split('\t')
        except ValueError:
            print(line)
            continue  # skip malformed lines rather than reusing the previous description
        doc = nlp(conf_desc_text_raw.strip())

        doc = MergeWords(doc)

        Sequences.append([
            MapDict[(
                #root.dep_,
                token.pos_,
                MyWordTags(token.text))] for token in doc
            if (token.pos_, MyWordTags(token.text)) != ('PUNCT', 'OTHER')
        ])

    size = len(Sequences)
    mean_len = np.mean([len(s) for s in Sequences])
    print("Config & Desc: %d\nMean length: %.1f" % (size, mean_len))

    # Mining FreqSeqs from those NodeID-sequences
    # FreqSeqs = ((sup, [seq]), (sup, [seq]), ...)
    FreqSeqs = PrefixSpan(Sequences)
    tmp = FreqSeqs.frequent(int(MINSUP), closed=CLOSED, generator=GENERATOR)
    res = {}
    for FreqSeq in tmp:
        res[tuple(FreqSeq[1])] = FreqSeq[0]

    print("Frequent Sub-sequences: %d\n" % len(res))

    # FreqSeqs with support number,
    return res
Example No. 7
def generate_rules(changes_sets, threshold):
    ps = PrefixSpan(changes_sets)
    print("Start rule generation")
    # freq_seqs = ps.frequent(minsup=int(len(new_changes) * 0.1), closed=True)
    freq_seqs = ps.frequent(minsup=threshold, closed=True)

    # freq_seqs = PrefixSpan_frequent(
    #     ps, minsup=int(len(new_changes) * 0.1), closed=True)
    freq_seqs = [
        x for x in freq_seqs
        if any([y.startswith("+")
                for y in x[1]]) and any([y.startswith("-") for y in x[1]])
    ]

    freq_seqs = sorted(freq_seqs, reverse=True)
    return freq_seqs
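A hedged usage sketch of `generate_rules` with toy change sets; the token values are assumptions chosen so that added tokens start with "+" and removed tokens with "-":

# Hypothetical toy input: each change set is a sequence of diff tokens.
toy_changes_sets = [
    ["-foo()", "+bar()", "+baz()"],
    ["-foo()", "+bar()"],
    ["-qux()", "+bar()"],
]
rules = generate_rules(toy_changes_sets, threshold=2)
# Each rule is a (support, token_sequence) tuple containing at least one "+"
# and one "-" token, sorted by support in descending order.
print(rules)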
Example No. 8
def sequence_mining(min_support):
    current_dir = os.path.dirname(os.path.abspath(__file__))
    data_path = "{}\\dataset\\upload_sequence_processed.txt".format(
        # data_path = "{}\\dataset\\upload_processed.txt".format(
        current_dir)
    db = []
    with open(data_path, 'r') as f:
        file = reader(f, delimiter=' ', quotechar='\r')
        for row in file:
            db.append(row)
    # db = array(read_csv(data_path, sep=' ', header=None))
    row_count = len(db)
    ps = PrefixSpan(db)
    all_sequence = ps.frequent(row_count * min_support)
    all_sequence_len = len(all_sequence)
    return all_sequence_len, all_sequence
def mine_frequent_span(log):
    input = []

    different_events = set()

    for trace in log:
        trace_events = []
        for event in trace:
            event_attribs = event.get_attributes()
            event_name = str(event_attribs["concept:name"])
            if "lifecycle:transition" in event_attribs:
                event_name += "-" + str(event_attribs["lifecycle:transition"])
            trace_events.append(event_name)
            different_events.add(event_name)
        input.append(trace_events)

    # Encode input
    encoding = {}

    decoding = {}
    for i, event in enumerate(different_events):
        encoding[event] = i
        decoding[i] = event

    # Encode traces
    minimum_size = 5

    encoded = [[encoding[event] for event in sublist] for sublist in input]
    ps = PrefixSpan(encoded)

    outputs = ps.topk(10000)

    decoded_output = list(
        reversed(
            sorted([(sublist[0], [decoding[output] for output in sublist[1]])
                    for sublist in outputs],
                   key=lambda x: x[0])))

    #print(decoded_output)
    to_file = "\n".join(map(str, decoded_output))

    with open("frequent_subs.txt", "w") as f:
        f.write(to_file)
def find_clusters_names(labels, features):
    
    groups = [[] for i in range(0, max(labels)+1)]
    for i in range(0, max(labels)+1):
        groups[i] =  features[features['labels'] == i].index
        groups[i] = groups[i].tolist()
    
    for group in groups:
        for i in range(0, len(group)):
            group[i] = group[i].split("::")
            group[i] = group[i] + group[i][len(group[i])-1].split(" ")
            
    res= []
    for group in groups :
        prefix = PrefixSpan(group)
        prefix.maxlen = 4
        prefix.minlen = 4
        res.append(prefix.topk(5, filter = lambda patt, matches : diversity_score(patt) >= len(patt)))
                
    return [create_str(res[i][0][1]) for i in range(0, len(res))]
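The `diversity_score` and `create_str` helpers are defined elsewhere; minimal sketches consistent with how they are used above (the filter keeps only patterns without repeated items):

# Hypothetical sketches of the helpers used above.
def diversity_score(patt):
    # number of distinct items; diversity_score(patt) >= len(patt)
    # therefore holds only when no item is repeated
    return len(set(patt))

def create_str(patt):
    # turn a pattern (list of tokens) back into a readable name
    return " ".join(patt)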
Example No. 11
    def find_patterns(self):
        print(self.sampling_type)
        db = self.data
        ps = PrefixSpan(db)
        n_items = len(db)
        result = None
        opts = {
            "closed": self.closed,
            # Somehow does not work
            #"generator": self.generator
        }
        from pprint import pprint
        pprint(opts)
        if self.sampling_type:
            result = ps.topk(self.k, **opts)
        else:
            print("Support value:", self.min_support)
            print("Size:", n_items)
            print("Support:", n_items * self.min_support / 100)
            result = ps.frequent((self.min_support * n_items / 100.0), **opts)

        self.table.model().clear()
        model = QStandardItemModel(self.table)
        model.clear()
        for col, label in enumerate(["Support", "Pattern"]):
            item = QStandardItem(label)
            model.setHorizontalHeaderItem(col, item)
        sequences = []
        for support, pattern in result:
            if len(pattern) < self.min_len:
                continue
            support /= n_items
            sequences.append((support, pattern))
            sitem = self.NumericItem(support)
            pitem = QStandardItem(str(pattern))
            model.appendRow([sitem, pitem])
        self.Outputs.object.send(sequences)
        self.table.setModel(model)
Example No. 12
    def compute_prefix_span(self):
        r'''
        Accepts a list of lists representing sequences and a
        minimum support, returns the output of the
        PrefixSpan algorithm.

        Parameters
        ----------
        database: (list of lists)
            The "database" (list) of sequences.
        min_support: (int)
            The minimum relative support for PrefixSpan.

        Returns
        -------
        prefix_span: (list of tuples)
            Output of PrefixSpan.frequent. List of tuples of the
            form (frequency, sequence), where sequence is a list
            representing the sequence from the database.
        '''
        ps = PrefixSpan(self.database)
        prefix_span = ps.frequent(self.min_support)
        return prefix_span
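Since the surrounding class is not shown, a minimal standalone sketch of the same call; the toy database and minimum support are assumptions for illustration:

from prefixspan import PrefixSpan

# Toy stand-ins for self.database and self.min_support.
database = [[1, 2, 3], [1, 3], [2, 3, 1]]
min_support = 2

ps = PrefixSpan(database)
prefix_span = ps.frequent(min_support)
# Entries have the form (frequency, sequence), e.g. (3, [3]) here because
# item 3 occurs in all three sequences.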
Example No. 13
def one_stacking_period():
    dict = {}
    with open(FILE) as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(spamreader, None)
        curr_usr = "******"
        temp = []
        curr_time = 0
        aversion = "c"
        for row in spamreader:
            if not curr_usr == row[2]:
                curr_usr = row[2]
                dict[curr_usr] = temp
                temp = []
            if row[2] == "":
                continue

            curr_time += int(row[3])
            if row[6] == "0":
                if aversion == "c":
                    temp.append(aversion)
                aversion = "a"
                temp.append(aversion)
                aversion = "c"
                curr_time = 0

            if curr_time > PERIOD * 1000:
                temp.append(aversion)
                curr_time = curr_time - (PERIOD * 1000)
                aversion = "c"

    for i in list(dict.values()):
        print(" -1 ".join(i) + " -2")
    # print(dict.values())
    ps = PrefixSpan(list(dict.values()))
    print("one stacking period \n\n")
    ps.minlen = 3
    ps.maxlen = 8
    for i in ps.topk(20):
        print(i)
    print("\n")
    for i in ps.topk(20, closed=True):
        print(i)
    print("\n")
    for i in ps.topk(20, generator=True):
        print(i)
    print("\n")

    # for i in ps.frequent(2):
    #     print(i)
    print("\n\n\n")
def spatronesintax(libro, posv=True):
    # start
    df = libro
    #df=pd.read_excel('../Visualization/Relatos_Benvenutto.xlsx')
    tes = df['Texto'].tolist()
    for i in range(len(tes)):
        tes[i] = tes[i] + '.\n'
    tes = ''.join(tes)
    #o=re.sub('…|[.]{3}','.',tes)
    o = re.sub('[“]+|[”]+|["]+', '', tes)
    listaprueba = sent_tokenize(o)
    listapos = []
    for i in listaprueba:
        i = i.strip()
        doc = nlp(i)
        listapos.append(doc)
    oye = separo(listapos, posv)
    listanum, pola = labeltonum(oye)
    #dfl=pd.DataFrame(listalen)
    #dfl['ok']=listanum
    ps = PrefixSpan(oye)
    ps = PrefixSpan(listanum)
    lista = ps.frequent(int(len(oye) * 0.5))
    lista2 = []
    for i in lista:
        if len(i[1]) > 5:
            lista2.append(i)
    df2 = correr(lista2, listanum)
    listatrans = []
    for i in df2['indis']:
        listaux2 = []
        for j in i:
            listaux2.append(pola[j])
        listatrans.append(listaux2)
    df2['transformer'] = listatrans
    df2.to_excel('pospattern.xlsx', index=False)
Example No. 15
def raw_data():
    dict = {}
    with open(FILE) as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(spamreader, None)
        curr_usr = "******"
        temp = []
        for row in spamreader:

            if not curr_usr == row[2]:
                curr_usr = row[2]
                dict[curr_usr] = temp
                temp = []
            if row[2] == "":
                continue
            if row[6] == "0":
                temp.append("a")
            else:
                temp.append("c")

    for i in list(dict.values()):
        print(" -1 ".join(i) + " -2")
    # print(dict.values())
    ps = PrefixSpan(list(dict.values()))
    print("raw data \n\n")
    ps.minlen = 3
    ps.maxlen = 8
    for i in ps.topk(20):
        print(i)
    print("\n")
    for i in ps.topk(20, closed=True):
        print(i)
    print("\n")
    for i in ps.topk(20, generator=True):
        print(i)
    print("\n")
    # for i in ps.frequent(2):
    #     print(i)
    print("\n\n\n")
Example No. 16
NUM_EXAMPLES = 200
MIN_FREQ = 25
MIN_LEN = 5
MIN_DIST = 3

data_generator = ExamplesGenerator(seq_len=SEQ_LEN,
                                   vocab_size=VOCAB_SIZE,
                                   seed=111,
                                   multiple_patterns=multiple_patterns)

data_sequences = [next(data_generator()) for _ in range(NUM_EXAMPLES)]
positive_sequences = [s[0] for s in data_sequences if s[1] == 1]
negative_sequences = [s[0] for s in data_sequences if s[1] == 0]

positive_seq = PrefixSpan(positive_sequences).frequent(MIN_FREQ)
long_seq = [s for s in positive_seq if len(s[1]) >= MIN_LEN]
seq_by_freq = sorted(long_seq, key=lambda x: x[0], reverse=True)


def distance_from_seqs(s, s_list: list):
    """return distance (in terms of number of different tokens) between the sequence s
    and the list of sequence s_list"""
    if not s_list:
        s_list = [[]]
    dist_per_seq = [len(set(s) - set(s2)) for s2 in s_list]
    return min(dist_per_seq)


most_freq_seq = []
for s in seq_by_freq:
    # keep only frequent patterns that differ enough from the ones already selected
    if distance_from_seqs(s[1], most_freq_seq) >= MIN_DIST:
        most_freq_seq.append(s[1])
Example No. 17
#! python3
# -*- coding:utf-8 -*-

__author__ = "yoyo"

from prefixspan import PrefixSpan as PS
import os.path as path

data_dir = "./dataset/vocabulary/"
filename = "GRE_pure.txt"

if __name__ == "__main__":
    filepath = path.join(data_dir, filename)
    f = open(filepath)
    vocabulary = f.read()
    vocabulary = vocabulary.split("\n")
    f.close()
    ps = PS(vocabulary)
    for sequence in ps.frequent(3):
        if len(sequence[1]) >= 4:
            print(sequence)
Example No. 18
import pickle
from prefixspan import PrefixSpan

with open("../data/objects/paths", "rb") as f:
    paths = pickle.load(f)

ps = PrefixSpan(paths)
freqs = ps.frequent(2)

with open("../data/objects/freqs", "wb") as f:
    pickle.dump(freqs, f)
class PrefixSpanManager:
    """
    Classe d'outil a l'utilisation de prefixspan

    Parameters:
        * sax_engine: SaxEngine
            Instance de preprocessing SAX
        * export: Boolean
            Si oui ou non les donnees sont deja exportees au bon format

    Variables:
        * se_instance: SaxEngine
            L'instance de class SAX
        * data: Array[]
            Les donnees au format SAX
    """
    def __init__(self, sax_engine, export = True):
        self.se_instance = sax_engine
        self.data = sax_engine.sax_data
        self.process_data = []
        self.ps = None
        self.ploter = Plot(self)
        if export:
            self.export_format()

    def run(self):
        """
        Creer l'instance PrefixSpan avec les donnees pretraites
        """
        self.ps = PrefixSpan(self.process_data)

    def export_format(self):
        """
        Modifie le format pour correspondre au besoin de l'instance de PrefixSpan
        """
        tmp = []
        for elmt in self.data:
            tmp.append(elmt.ravel())
        self.process_data = tmp

    def topk(self, n, c = True):
        """
        Affiche les motifs les plus frequents(plus grand support) et par defaut les fermes

        Parameters:
            * n: int
                Nombre de motifs a afficher
        Returns:
            Liste de motifs frequent
        """
        return self.ps.topk(n, closed = c)

    def frequent(self, n):
        """
        Retourne les frequent de support n

        Parameters:
            * n: int
                Support minimal
        Returns:
            Liste des motifs de support minimal n
        """
        return self.ps.frequent(n)

    def plot(self, l):
        self.ploter.plot_prefixspan(l)
def main():

    dblp_data = pd.read_csv (r'DBLP_Dataset.csv',encoding="ISO-8859-1")
    author_title = dblp_data
    dataset = author_title.to_numpy()
    list1 = dataset[:,2].tolist()

    #convert authors to lower case
    list2 = []
    for i in list1:
        sublist = i.lower().split()
        list2.append(sublist)
    
    te = TransactionEncoder()
    te_ary = te.fit(list2).transform(list2)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent = fpgrowth(df, min_support=0.001, use_colnames=True)
    frequent = frequent[frequent['itemsets'].str.len()>1]

    freqauth_list = []
    for i in frequent['itemsets']:
        freqauth_list.append([x for x in i])

    freqauth_dict = {}
    for i in freqauth_list:
        title_idx_sublist = []
        for idx, j in enumerate(list2):
            if set(i).issubset(j):
                title_idx_sublist.append(idx)
        freqauth_dict.update({tuple(i):title_idx_sublist})

    freqauth_title_dict = {}
    kstem = ks.PyKrovetzStemmer()
    for key, value in freqauth_dict.items():
        title_df = author_title.iloc[value]['title']
        title_sublist = list(title_df)
        title_sublists = []
        temp_list = []
        for x in title_sublist:
            tempx     = re.sub(r'[.]','', x)
            temp_list = re.sub(r'[^\x00-\x7F]+','', tempx).lower().split()
            temp_list2 = []
            if isinstance(temp_list, list):
                temp_list2.append([kstem.stem(z) for z in temp_list if not z in stopwordlist])
                title_sublists.extend(temp_list2)
            else:
                if not temp_list in stopwordlist:
                    title_sublists.extend([kstem.stem(temp_list)])
        freqauth_title_dict.update({key:title_sublists})

    # Closed / Top k titles of frequent authors
    freqauth_title_dict_closed = {}
    for k, v in freqauth_title_dict.items():
        ps = PrefixSpan(v)
        closed_Seq_pattern = ps.topk(5, closed=True)
        freqauth_title_dict_closed.update({k:closed_Seq_pattern})

    # To get frequent author's context indicators
    frequentlist = freqauth_list
    cleanedList  = list2

    new_author_list = []
    for i in range(0,len(frequentlist)):
        temp_author_list = []
        authorlist = list(frequentlist[i])
        found = 0
        for k in range(0,len(cleanedList)):
            for j in range(0, len(authorlist)):
                if (authorlist[j] in(cleanedList[k])):
                    found = 1
                else:
                    found = 0
                    break
                    
            if found == 1:
                for jj in range(0,len(authorlist)):
                    if (authorlist[jj] in(cleanedList[k])):
                        cleanedList[k].remove(authorlist[jj])
                temp_author_list.append(cleanedList[k])

        new_author_list.append(temp_author_list)

    context_indicator_list = []
    for i in range(0,len(new_author_list)):
        te = TransactionEncoder()
        te_ary = te.fit(new_author_list[i]).transform(new_author_list[i])
        df = pd.DataFrame(te_ary, columns=te.columns_)
        frequent_author_list = fpgrowth(df, min_support=0.5, use_colnames=True)

        supp = frequent_author_list.support.unique()  # all unique support count
        # Dictionary storing itemset with same support count key
        freq_dic = {}
        for i in range(len(supp)):
            inset = list(frequent_author_list.loc[frequent_author_list.support == supp[i]]['itemsets'])
            freq_dic[supp[i]] = inset
        # Dictionary storing itemset with  support count <= key
        freq_dic2 = {}
        for i in range(len(supp)):
            inset2 = list(frequent_author_list.loc[frequent_author_list.support <= supp[i]]['itemsets'])
            freq_dic2[supp[i]] = inset2

        # Find Closed frequent itemset
        close_freq = []
        for index, row in frequent_author_list.iterrows():
            isclose = True
            cli = row['itemsets']
            cls = row['support']
            checkset = freq_dic[cls]
            for i in checkset:
                if (cli != i):
                    if (frozenset.issubset(cli, i)):
                        isclose = False
                        break

            if (isclose):
                close_freq.append([x for x in  (row['itemsets'])])
        context_indicator_list.append(close_freq)
    
    freqauth_context_ind_dict = {}
    for authpair, titlelist in freqauth_title_dict_closed.items():
        cleantitlelist = []
        for i in titlelist:
            if isinstance(i, tuple):
                if isinstance(i[1], list):
                    listtostring = ' '.join(i[1])
                    cleantitlelist.append(listtostring)
        freqauth_context_ind_dict.update({authpair:cleantitlelist})

    # Merge the titles and the context-indicator authors for each frequent author pattern
    for idx, key in enumerate(freqauth_context_ind_dict):
        newval = []
        if len(context_indicator_list[idx])> 0:
            for i in context_indicator_list[idx]:
                if len(i) > 0:                
                    tempstr = '&'.join(i)
                    newval = freqauth_context_ind_dict[key]
                    newval.append(tempstr)
                    freqauth_context_ind_dict.update({key:newval})

# Context Indicator Weighting
    CI_list = list(freqauth_context_ind_dict.values())
    freqauth_context_in_weights = {}
    for key, value in freqauth_context_ind_dict.items():
        freq_auth_CI_list = value
        length_of_CI = len(value)
        temp_dict = {}
        for i in freq_auth_CI_list:
            count_tmp = 0
            for j in CI_list:
                if (i in (j)):
                    count_tmp += 1
            weight = round(1 - ((count_tmp - 1) /  count_tmp), 2)
            if (weight > 0.1):
                temp_dict.update({i:weight})
        sorted_weights_dict = sorted(temp_dict.items(), key=lambda x: x[1], reverse=True)
        freqauth_context_in_weights.update({key:sorted_weights_dict})

    freq_auth_transactions = {}
    list_of_freq_auth = list(freqauth_context_in_weights.keys())
    for i in range(0, len(freqauth_title_dict)):
        temp_dict = {}
        title_list = freqauth_title_dict.get(list_of_freq_auth[i])
        CI_list = freqauth_context_in_weights[list_of_freq_auth[i]]
        CI_list_auth = []
        for n, c in enumerate(CI_list):
            CI_list_auth.append(c[0])
        for j in range(0, len(title_list)):
            cos_sim = cos_similarity(CI_list_auth,title_list[j])
            cos_sim = round(cos_sim, 3)
            t_title = ' '.join(freqauth_title_dict[list_of_freq_auth[i]][j])
            temp_dict.update({t_title:cos_sim})

        sorted_title_dict = sorted(temp_dict.items(), key=lambda x: x[1], reverse=True)
        t_len = len(list(temp_dict.values()))
        max_len = t_len
        if (t_len > 4):
            max_len = 4
        sorted_title_dict1 = dict(list(sorted_title_dict)[0:max_len])
        freq_auth_transactions.update({list_of_freq_auth[i]:sorted_title_dict1})

    # To find the strongest SSP - Match against similarity of the context units

    freq_auth_SSPs = {}
    list_of_freq_auth = list(freqauth_context_ind_dict.keys())
    list_of_freq_auth_CI =  list(freqauth_context_ind_dict.values())
    len_list_of_freq_auth_CI = len(list_of_freq_auth_CI)

    context_indicator_similarity = np.zeros([len_list_of_freq_auth_CI, len_list_of_freq_auth_CI],dtype = float)
    for i in range (0,len_list_of_freq_auth_CI):
        for j in range (0,len_list_of_freq_auth_CI):
            cos_sim = cos_similarity(list_of_freq_auth_CI[i],list_of_freq_auth_CI[j])
            cos_sim = round(cos_sim, 3)
            if (i != j):
                context_indicator_similarity[i][j] = cos_sim
                context_indicator_similarity[j][i] = cos_sim

    context_indicator_similarity_idx = np.zeros([len_list_of_freq_auth_CI, 3], dtype=int)
    for i in range(0,len(context_indicator_similarity)):
        context_indicator_similarity_idx[i] = np.argsort(context_indicator_similarity[i])[-3:]

    SSP_Author_List = []
    for i in range(0,len(list_of_freq_auth)):
        temp_author_list_ssp = []
        for j in range(0,len(context_indicator_similarity_idx[i])):
           temp_author_list_ssp.append(list_of_freq_auth[context_indicator_similarity_idx[i][j]])
        SSP_Author_List.append(temp_author_list_ssp)

    SSP_Title_List = []

    CI_list_title = list(freqauth_title_dict_closed.values())
    CI_list1 = []
    for i in (CI_list_title):
        temp_list3 = []
        for j in i:
            CI_str = ' '.join(j[1])
            temp_list3.append(CI_str)
        CI_list1.append(list(set(temp_list3)))

    for i in range(0,len(CI_list1)):
        temp_title_list_ssp = []
        for j in range(0,len(context_indicator_similarity_idx[i])):
            ssp_str = CI_list1[context_indicator_similarity_idx[i][j]]
            temp_title_list_ssp.extend(ssp_str)
        SSP_Title_List.append(list(set(temp_title_list_ssp)))

    # Write the output to a CSV file
    # a) list_of_freq_auth
    # b) list_of_freq_auth_CI / freqauth_context_in_weights
    # c) freq_auth_transactions
    # d) SSP_Author_List
    # e) SSP_Title_List
    #for i in range(0, frequent_author_list):
    #print(len(SSP_Title_List))
    #print(SSP_Title_List)
    titles_list_with_weight = list(freq_auth_transactions.values())
    # Joining SSP authors
    SSP_authors_formatted = []
    for i in range(0,len(SSP_Author_List)):
        temp_list = []
        for j in range(0, len(SSP_Author_List[i])):
            authors = '&'.join(list(SSP_Author_List[i][j]))
            temp_list.append(authors)
        SSP_authors_formatted.append(temp_list)

    with open("./output.txt", 'w', encoding="utf-8") as f:
        f.write('Pattern' + '||' + 'Context Indicator' + '||' + 'Transaction 1' + '||' +
                'Transaction 2' + '||'  + 'Transaction 3' + '||'  + 'Transaction 4' + '||' + 'SSP - Co-Author' +
                '||' + 'SSP - Title' + '\n')
        for i in range(0, len(list_of_freq_auth)):
            authors = ' '.join(list(list_of_freq_auth[i]))
            f.write(authors + '||')
            Context_indicators = '; '.join(list_of_freq_auth_CI[i])
            f.write(Context_indicators + '||')
            for j in (titles_list_with_weight[i].keys()):
                f.write(j + '||')
            ssp_authors = '; '.join(SSP_authors_formatted[i])
            f.write(ssp_authors + '||')
            ssp_titles = '; '.join(SSP_Title_List[i])
            f.write(ssp_titles )
            f.write('\n')
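The `cos_similarity` helper used throughout `main` is defined elsewhere; a minimal sketch, assuming cosine similarity between the term-count vectors of two token lists:

from collections import Counter
import numpy as np

# Hypothetical sketch of cos_similarity over two lists of tokens.
def cos_similarity(tokens_a, tokens_b):
    counts_a, counts_b = Counter(tokens_a), Counter(tokens_b)
    vocab = sorted(set(counts_a) | set(counts_b))
    va = np.array([counts_a[t] for t in vocab], dtype=float)
    vb = np.array([counts_b[t] for t in vocab], dtype=float)
    denom = np.linalg.norm(va) * np.linalg.norm(vb)
    return 0.0 if denom == 0 else float(np.dot(va, vb) / denom)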
Example No. 22
def apply(grouped_stream, all_labels, parameters=None):
    """
    Applies the prefix span algorithm

    Parameters
    -------------
    grouped_stream
        Grouped stream
    all_labels
        Indexed labels
    parameters
        All the parameters of the algorithm

    Returns
    --------------
    frequents
        List containing frequent itemsets as label indexes
    frequents_label
        List containing frequent itemsets as labels
    frequents_encodings
        List containing frequent itemsets as word encodings
    frequents_occurrences
        List containing all the sequences of events associated to the corresponding itemset
    """
    if parameters is None:
        parameters = {}

    final_label_idx = parameters[FINAL_LABEL_IDX] if FINAL_LABEL_IDX in parameters else DEFAULT_FINAL_LABEL_IDX

    m = parameters[M] if M in parameters else DEFAULT_M

    data = [[y[final_label_idx] for y in x] for x in grouped_stream]
    ps = PrefixSpan(data)

    frequents = [x[1] for x in ps.frequent(m)]
    frequents_label = [" ".join([all_labels[y] for y in x]) for x in frequents]

    F = tempfile.NamedTemporaryFile(suffix='.txt')
    F.close()
    F2 = open(F.name, "w")
    for label in frequents_label:
        F2.write(label+"\n")
    F2.close()

    model = fasttext.train_unsupervised(F.name)
    frequents_encodings = []
    for i in range(len(frequents)):
        phrase = [x for x in frequents_label[i].split() if x in model.words]
        v = None
        for w in phrase:
            if v is None:
                v = model.get_word_vector(w)
            else:
                v = v + model.get_word_vector(w)
        frequents_encodings.append(v)

    frequents_occurrences = []
    for f in frequents:
        frequents_occurrences.append([])
        for g in grouped_stream:
            d = [x[final_label_idx] for x in g]
            for i in range(len(d) - len(f) + 1):  # include occurrences ending at the last event
                if d[i] == f[0] and d[i+len(f)-1] == f[len(f)-1]:
                    if d[i:i+len(f)] == f:
                        frequents_occurrences[-1].append(g[i:i+len(f)])

    return frequents, frequents_label, frequents_encodings, frequents_occurrences
Example No. 23
OUTPUT_JSON_NAME = "data/rules/" + owner + "_" + repo + "_" + lang + ".json"

with open(INPUT_JSON_NAME, mode='r', encoding='utf-8') as f:
    changes_sets = load(f)

changes = [x["changes_set"] for x in changes_sets]

new_changes = []
for tokens in changes:
    new_tokens = [
        x for x in tokens if not x.endswith("\n") and not x.endswith(" ")
    ]
    if new_tokens != [] and new_tokens not in new_changes:
        new_changes.append(new_tokens)

print("Start rule generation")
ps = PrefixSpan(new_changes)
freq_seqs = ps.frequent(minsup=int(len(new_changes) * 0.1), closed=True)
# freq_seqs = PrefixSpan_frequent(
#     ps, minsup=int(len(new_changes) * 0.1), closed=True)
freq_seqs = [
    x for x in freq_seqs
    if any([y.startswith("+")
            for y in x[1]]) and any([y.startswith("-") for y in x[1]])
]

freq_seqs = sorted(freq_seqs, reverse=True)

with open(OUTPUT_JSON_NAME, mode='w', encoding='utf-8') as f:
    dump(freq_seqs, f, indent=1)
Example No. 24
def get_common(sequence, data):
    data = data.split("\t")[0].lower()
    data = [sequence, data]
    ps = PS(data)
    common = get_longest(ps.topk(1000))
    return common
def recommend(trainingset=l_good,
              s_group=s_good,
              student=s_good[0],
              path_length=9,
              rl=resourcelist):

    # Here we increase the influence of this student's own learning log by appending it repeatedly.
    for i in range(30):
        trainingset.append(trainingset[s_group.index(student)])
    ps = PrefixSpan(trainingset)

    pattern = ps.topk(1000, filter=lambda patt, matches: len(patt) > 1
                      )  # pattern length should be bigger than 1
    pattern_time = {}  # stores every pattern with its number of appearances

    for i, element in enumerate(pattern):
        l_s = []  # store pattern in this element
        s = ""
        for i in range(len(element[1])):
            if i == 0:
                s = str(element[1][i])
            else:
                l_s.append(s + "," + str(element[1][i]))
                s = str(element[1][i])
        for j in l_s:
            if j in pattern_time.keys():
                pattern_time[j] += element[0]
            else:
                pattern_time[j] = element[0]

    # sort patterns by appearance count into a list
    pattern_time = sorted(pattern_time.items(),
                          key=lambda pattern_time: pattern_time[1],
                          reverse=True)
    print("pattern with time:", pattern_time)
    # remove duplicate transitions
    #print(len(pattern_time))
    """
    Here is deduplication.
    we can't delete the item of list in for cycle. It will have 'index out of range problem'. 
    So we store the repeat index and delete after
    """
    delete_indice = []
    for k1 in range(len(pattern_time)):
        starter = pattern_time[k1][0].split(",")[0]
        ender = pattern_time[k1][0].split(",")[1]
        if starter == ender:
            delete_indice.append(pattern_time[k1])  # removal below is by item, not by index
        if pattern_time[k1] == pattern_time[-1]:
            break

        for k2 in range(k1 + 1, len(pattern_time)):
            #print(pattern_time[k2])
            temps_start = pattern_time[k2][0].split(",")[0]
            temps_end = pattern_time[k2][0].split(",")[1]
            if starter == temps_start:
                delete_indice.append(pattern_time[k2])
            if ender == pattern_time[k2][0].split(",")[1]:
                delete_indice.append(pattern_time[k2])

    for i in set(delete_indice):
        if i in pattern_time:
            pattern_time.remove(i)
    """
    Here we organise the path from pattern list.
    We should firstly find the head then finish the path.
    """
    element = []

    pattern_result = [x[0] for x in pattern_time
                      ]  # drop the appearance counts, keep only the patterns
    #print("unique pattern:",pattern_result)
    store = []
    for i in range(len(pattern_result)):
        for j in range(len(pattern_result)):
            if i == j:
                continue
            if pattern_result[i].split(",")[0] in pattern_result[j]:
                store.append(pattern_result[i])
    path = list(set(pattern_result).difference(set(store)))[0]
    print("begin_node of path:", path)
    compt = 0
    c_b = 0
    l_change = 2
    while compt < path_length - 2:  # the first node already contributes two elements, so only path_length-2 more are needed
        c_b += 1
        for i in pattern_result:
            if i.split(",")[0] == path.split(",")[-1]:
                path += "," + i.split(",")[-1]
                compt += 1

        if l_change == len(path):
            c_b += 1
        else:
            l_change = len(path)
        if c_b > 100000:
            break
    print("path:", path)
    return path
Example No. 26
    discrete_time = []
    with open(filename, "r", encoding="utf-8") as weights_file:
        print(f"Reading file {filename}")
        for i, weights_triple in enumerate(weights_file):
            current_weights = weights_triple.replace(",", ".").split("\t")
            weights.append(int(current_weights[1]))
            discrete_time_base = int(current_weights[0].strip())
            discrete_time.append(discrete_time_base)
            curr_frequency = int(current_weights[3].strip())
            frequency.append(curr_frequency)
            for k in range(0, curr_frequency):
                weights.append(int(current_weights[1]))
                discrete_time_base += 1
                discrete_time.append(discrete_time_base)
            if limit is not None and (i == limit or discrete_time_base >= limit):
                print("Limit reached")
                break
    return discrete_time, weights


if __name__ == '__main__':
    basedir = "C:/Users/havar/Home/cache_simulation_results/"

    _t, _w = _read_db(basedir + "scaled_w_01.csv")
    data = list(chunks(_w, 1000))
    ps = PrefixSpan(data)
    ps.minlen = 5
    ps.maxlen = 100

    print(ps.frequent(5, closed=True))
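The `chunks` helper used above is not shown; a minimal sketch, assuming it splits the flat weight list into consecutive fixed-size windows so that PrefixSpan sees them as sequences:

# Hypothetical sketch of the chunks helper.
def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]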
Example No. 27
from prefixspan import PrefixSpan

db = [
    [0, 1, 2, 3, 4],
    [1, 1, 1, 3, 4],
    [2, 1, 2, 2, 0],
    [1, 1, 1, 2, 2],
]

ps = PrefixSpan(db)

print(ps.frequent(2))
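For comparison, the same toy database can be mined with the library's top-k API; a small follow-up sketch:

# The k most frequent patterns, again returned as (frequency, pattern) tuples.
print(ps.topk(5))
print(ps.topk(5, closed=True))  # restrict to closed patterns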
Example No. 28
def aversion_direction_one_stacking_period():
    dict = {}
    with open(FILE) as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(spamreader, None)
        curr_usr = "******"
        avg = []
        curr_time = 0
        aversion = [0.0, 0.0, "1"]
        temp = []
        for row in spamreader:
            if not curr_usr == row[2]:
                mean = np.average(avg, axis=0)
                t = []
                for i in temp:
                    res = "c"
                    if i[2] == "0":
                        diff = [np.abs(a - b) for a, b in zip(i[0:2], mean)]
                        # if np.abs((diff[0] / mean[0]) - (diff[1] / mean[1])) < treshold:
                        if (np.abs(((diff[0] + mean[0]) / mean[0]) - ((diff[1] + mean[1]) / mean[1])) < treshold) or \
                                (((diff[0] + mean[0]) / mean[0]) > treshold2 and (
                                            ((diff[1] + mean[1]) / mean[1]) > treshold2)):
                            res = "f"
                        elif diff[0] - diff[1] > 0:
                            if i[0] < mean[0]:
                                res = "l"
                            if i[0] > mean[0]:
                                res = "r"
                        else:
                            if i[1] < mean[1]:
                                res = "u"
                            if i[1] > mean[1]:
                                res = "d"
                    t.append(res)
                dict[curr_usr] = t
                curr_usr = row[2]
                temp = []
                avg = []
            if row[2] == "":
                continue

            if row[6] == "1":
                avg.append([
                    float(row[4].replace(",", ".")),
                    float(row[5].replace(",", "."))
                ])
            curr_time += int(row[3])
            if row[6] == "0":
                if aversion[2] == "1":
                    temp.append(aversion)
                aversion = [
                    float(row[4].replace(",", ".")),
                    float(row[5].replace(",", ".")), row[6]
                ]
                temp.append(aversion)
                aversion = [0.0, 0.0, "1"]
                curr_time = 0

            if curr_time > PERIOD * 1000:
                temp.append(aversion)
                curr_time = curr_time - (PERIOD * 1000)
                aversion = [0.0, 0.0, "1"]

    for i in list(dict.values()):
        print(" -1 ".join(i) + " -2")
    # print(dict.values())
    ps = PrefixSpan(list(dict.values()))
    print("aversion direction one stacking period \n\n")
    ps.minlen = 3
    ps.maxlen = 8
    for i in ps.topk(20):
        print(i)
    print("\n")
    for i in ps.topk(20, closed=True):
        print(i)
    print("\n")
    for i in ps.topk(20, generator=True):
        print(i)
    print("\n")

    # for i in ps.frequent(2):
    #     print(i)
    print("\n\n\n")
Example No. 29
        while line:
            db.append(eval(line))
            line = file.readline()

    return db


def generateFilename(tno, cno):
    "根据队编号与聚类编号产生文件名"
    return "Team" + str(tno) + "Cluster" + str(cno) + ".txt"


path = "Cluster/"
list_p = []
# tno stands for team number
# cno stands for cluster number
for tno in range(1, 3):
    for cno in range(0, 5):
        filepath = path + generateFilename(tno, cno)
        db = loadFile(filepath)
        ps = PrefixSpan(db)
        for x in range(0, 10):
            list_p.append(Pattern(ps.topk(10)[x][0], ps.topk(10)[x][1]))
        # Print the 10 most frequent patterns in the current cluster
        # print(ps.topk(10))

# Sort patterns by score
list_p = sorted(list_p, key=lambda x: x.score, reverse=True)
print("#######################################")
for x in list_p:
    print("score:", x.score, "freq:", x.freq)
Example No. 30
def preprocess_dataset(path_):

    # Data extraction ------------------------------------------------------------------------------

    path = path_
    date_column = 'Date-heure UTC (événement)'
    action_column = 'Pages'
    identity_col = 'Visiteurs uniques ID'

    dataset = pd.read_csv(path, sep=';', parse_dates=[date_column])

    dataset[date_column] = pd.to_datetime(dataset[date_column],
                                          errors='coerce')
    dataset = dataset.dropna(subset=[date_column])

    dataset = dataset[dataset[action_column] != '-']
    dataset.index = dataset[date_column]
    dataset.drop(columns=date_column, inplace=True)
    dataset.sort_index(ascending=True, inplace=True)

    #'particulier::compte::compte-conseil univers de besoin'
    valeurs_interdites = [
        'particulier::acces-CR::acces-CR-store locator trouver ma CR 50',
        'particulier::particulier-accueil particuliers et BP'
    ]

    dataset_after = dataset[~dataset[action_column].isin(valeurs_interdites)]
    person_list = dataset_after[identity_col].unique()

    print("Le nombre de personnes répertoriées est : " + str(len(person_list)))

    #liste_pages_dataset = dataset[action_column].unique().tolist()
    # comparison of the number of visitors before/after: 77406 - 24682 = 52724 people
    # 52724 people only visit the pages listed in valeurs_interdites ....

    List_actions = []

    #start = time.time()

    # the authorized_inactivity_time parameter defines the maximal inactivity time before we consider
    # that the client opened two distinct sessions
    authorized_inactivity_time = datetime.timedelta(minutes=30)

    for i in range(0, len(person_list)):

        personne = person_list[i]
        subdata = dataset_after[dataset_after[identity_col] == personne]
        start = 0

        for j in range(0, len(subdata.index) - 1):

            duree = subdata.index[j + 1] - subdata.index[j]

            if duree > authorized_inactivity_time:

                actions = subdata[action_column].iloc[start:j + 1].tolist()
                start = j + 1
                List_actions.append(actions)

        actions = subdata[action_column].iloc[start:len(subdata.index)].tolist(
        )
        List_actions.append(actions)

    # First PrefixSpan pass to find the relevant paths ---------------------------------------

    first_search = PrefixSpan(List_actions)
    first_search.minlen = 2  # the library exposes minlen/maxlen; min_len/max_len would be silently ignored
    first_search.maxlen = 7
    results_search1 = first_search.frequent(
        15, filter=lambda patt, matches: diversity_score(patt) >= len(patt))

    results_search1.sort(key=lambda x: -x[0])

    # Second pass to get the list of length-2 transitions and their counts ---------------------

    second_search = PrefixSpan(List_actions)
    second_search.minlen = 2
    second_search.maxlen = 2
    filter_list = compute_transitions_list(results_search1)
    results_search2 = second_search.frequent(
        5, filter=lambda patt, matches: patt in filter_list)

    results_search2.sort(key=lambda x: -x[0])

    # Sankey diagram -------------------------------------------------------------------------------------

    liste_resfinal = results_search2

    labels = []
    sources = []
    targets = []
    values = []
    links = []

    #A link only appears in the graph if it constitutes more than rate% of the incoming/outgoing traffic of the
    #two nodes involved in the link
    rate = 0.11

    for match in liste_resfinal:

        if len(match[1]) == second_search.minlen:
            pattern = match[1]
        else:
            pattern = match[1][len(match[1]) - 2:len(match[1])]

        for label in pattern:
            renamed = rename(label)
            if renamed not in labels:
                labels.append(renamed)
            if match[1].index(label) < len(match[1]) - 1:
                targetted = rename(match[1][match[1].index(label) + 1])
                res_exit = 0
                res_entry = 0
                res_incoming = 0
                res_ongoing = 0
                if labels.index(renamed) in targets:

                    for i in range(0, len(targets)):
                        x = targets[i]
                        if x == labels.index(renamed):
                            res_exit += values[i]

                if labels.index(renamed) in sources:

                    for i in range(0, len(sources)):
                        if sources[i] == labels.index(renamed):
                            res_incoming += values[i]

                if targetted in labels:

                    if labels.index(targetted) in sources:

                        for i in range(0, len(sources)):
                            x = sources[i]
                            if x == labels.index(targetted):
                                res_entry += values[i]

                    if labels.index(targetted) in targets:

                        for i in range(0, len(targets)):
                            if targets[i] == labels.index(targetted):
                                res_ongoing += values[i]

                if (renamed, targetted) not in links:

                    if match[0] > rate * res_exit and match[
                            0] > rate * res_entry and match[
                                0] > rate * res_ongoing and match[
                                    0] > rate * res_incoming:

                        if ((targetted, renamed) in links):

                            if values[links.index(
                                (targetted, renamed))] <= match[0]:

                                links.append((renamed, targetted))
                                sources.append(labels.index(renamed))
                                if targetted in labels:
                                    targets.append(labels.index(targetted))
                                else:
                                    labels.append(targetted)
                                    targets.append(labels.index(targetted))
                                sources.pop(links.index((targetted, renamed)))
                                targets.pop(links.index((targetted, renamed)))
                                values.pop(links.index((targetted, renamed)))
                                links.pop(links.index((targetted, renamed)))
                                values.append(match[0])

                        else:

                            links.append((renamed, targetted))
                            sources.append(labels.index(renamed))
                            if targetted in labels:
                                targets.append(labels.index(targetted))
                            else:
                                labels.append(targetted)
                                targets.append(labels.index(targetted))
                            values.append(match[0])

                else:

                    values[links.index((renamed, targetted))] += match[0]

    global_matrix = generate_global_matrix(List_actions, labels)

    return [labels, links, values, global_matrix, liste_resfinal, List_actions]
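Several helpers (`rename`, `diversity_score`, `generate_global_matrix`, `compute_transitions_list`) are defined elsewhere; a minimal sketch of `compute_transitions_list`, assuming it collects the consecutive page pairs occurring in the first-pass patterns so they can serve as a whitelist for the second pass:

# Hypothetical sketch of compute_transitions_list.
def compute_transitions_list(patterns):
    transitions = []
    for _freq, patt in patterns:
        for a, b in zip(patt, patt[1:]):
            if [a, b] not in transitions:
                transitions.append([a, b])
    return transitions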


#pk.dump(labels, open('Listedespages.pkl', 'wb'))
#pk.dump(links, open('Listofedges.pkl', 'wb'))
#pk.dump(values, open('Listofvalues.pkl', 'wb'))

#pk.dump((subgraphs[23],list_subsources[23], list_subtargets[23], list_subvalues[23]), open('subgraphtotestonclustering.pkl','wb'))