Example #1
def one_stacking_period():
    sequences = {}  # one symbol sequence per user: "a" = aversion, "c" = contact
    with open(FILE) as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(spamreader, None)  # skip the header row
        curr_usr = None
        temp = []
        curr_time = 0
        aversion = "c"
        for row in spamreader:
            if curr_usr != row[2]:
                if curr_usr is not None:
                    sequences[curr_usr] = temp  # flush the finished user's sequence
                curr_usr = row[2]
                temp = []
            if row[2] == "":
                continue

            curr_time += int(row[3])
            if row[6] == "0":
                if aversion == "c":
                    temp.append(aversion)
                aversion = "a"
                temp.append(aversion)
                aversion = "c"
                curr_time = 0

            if curr_time > PERIOD * 1000:
                temp.append(aversion)
                curr_time -= PERIOD * 1000
                aversion = "c"
        if curr_usr is not None:
            sequences[curr_usr] = temp  # keep the last user's sequence too

    for i in sequences.values():
        print(" -1 ".join(i) + " -2")
    ps = PrefixSpan(list(sequences.values()))
    print("one stacking period \n\n")
    ps.minlen = 3
    ps.maxlen = 8
    for i in ps.topk(20):
        print(i)
    print("\n")
    for i in ps.topk(20, closed=True):
        print(i)
    print("\n")
    for i in ps.topk(20, generator=True):
        print(i)
    print("\n")

    # for i in ps.frequent(2):
    #     print(i)
    print("\n\n\n")
Example #2
def base_sequence(dataset, num):
    # Collect the first `num` records: first tab-separated field, lower-cased.
    data = []
    for i in range(num):
        item = dataset[i].split("\t")[0].lower()
        data.append(item)
    ps_base = PS(data)  # PS: alias for prefixspan.PrefixSpan
    return get_longest(ps_base.topk(1000))
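`get_longest` is not shown in this snippet. A plausible sketch of the helper (an assumption reconstructed from how its result is used, not the original code) would pick the longest pattern among the mined `(support, pattern)` pairs:

def get_longest(patterns):
    # patterns: (support, pattern) pairs as returned by topk();
    # hypothetical helper, not part of the original example
    if not patterns:
        return []
    return max(patterns, key=lambda sp: len(sp[1]))[1]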
Example #3
def raw_data():
    sequences = {}  # one symbol sequence per user: "a" = aversion, "c" = contact
    with open(FILE) as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(spamreader, None)  # skip the header row
        curr_usr = None
        temp = []
        for row in spamreader:
            if curr_usr != row[2]:
                if curr_usr is not None:
                    sequences[curr_usr] = temp  # flush the finished user's sequence
                curr_usr = row[2]
                temp = []
            if row[2] == "":
                continue
            if row[6] == "0":
                temp.append("a")
            else:
                temp.append("c")
        if curr_usr is not None:
            sequences[curr_usr] = temp  # keep the last user's sequence too

    for i in sequences.values():
        print(" -1 ".join(i) + " -2")
    ps = PrefixSpan(list(sequences.values()))
    print("raw data \n\n")
    ps.minlen = 3
    ps.maxlen = 8
    for i in ps.topk(20):
        print(i)
    print("\n")
    for i in ps.topk(20, closed=True):
        print(i)
    print("\n")
    for i in ps.topk(20, generator=True):
        print(i)
    print("\n")
    # for i in ps.frequent(2):
    #     print(i)
    print("\n\n\n")
Example #4
def mine_frequent_span(log):
    traces = []
    different_events = set()

    for trace in log:
        trace_events = []
        for event in trace:
            event_attribs = event.get_attributes()
            event_name = str(event_attribs["concept:name"])
            if "lifecycle:transition" in event_attribs:
                event_name += "-" + str(event_attribs["lifecycle:transition"])
            trace_events.append(event_name)
            different_events.add(event_name)
        traces.append(trace_events)

    # Encode event names as integers; PrefixSpan then mines over the compact
    # integer alphabet and the patterns are decoded back afterwards.
    encoding = {}
    decoding = {}
    for i, event in enumerate(different_events):
        encoding[event] = i
        decoding[i] = event

    encoded = [[encoding[event] for event in sublist] for sublist in traces]
    ps = PrefixSpan(encoded)

    outputs = ps.topk(10000)

    # Decode the mined patterns and sort by support, highest first.
    decoded_output = sorted(
        [(support, [decoding[item] for item in pattern])
         for support, pattern in outputs],
        key=lambda x: x[0],
        reverse=True)

    to_file = "\n".join(map(str, decoded_output))

    with open("frequent_subs.txt", "w") as f:
        f.write(to_file)
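The same encode-mine-decode round trip as a hedged toy run, without the XES event-log dependency (the literal trace lists stand in for `log` above):

from prefixspan import PrefixSpan

traces = [["register", "check-start", "check-complete"],
          ["register", "check-start", "reject"],
          ["register", "check-start", "check-complete"]]
alphabet = sorted({e for t in traces for e in t})
encoding = {e: i for i, e in enumerate(alphabet)}
decoding = {i: e for e, i in encoding.items()}
encoded = [[encoding[e] for e in t] for t in traces]
for support, pattern in PrefixSpan(encoded).topk(3):
    print(support, [decoding[i] for i in pattern])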
Example #5
def find_clusters_names(labels, features):
    # Group the feature-row names by cluster label.
    groups = [[] for _ in range(max(labels) + 1)]
    for i in range(max(labels) + 1):
        groups[i] = features[features['labels'] == i].index.tolist()

    # Split each name on "::" and append the tokens of its last component.
    for group in groups:
        for i in range(len(group)):
            group[i] = group[i].split("::")
            group[i] = group[i] + group[i][-1].split(" ")

    res = []
    for group in groups:
        prefix = PrefixSpan(group)
        prefix.maxlen = 4
        prefix.minlen = 4
        res.append(prefix.topk(5, filter=lambda patt, matches: diversity_score(patt) >= len(patt)))

    return [create_str(res[i][0][1]) for i in range(len(res))]
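`diversity_score` is not defined in this snippet. Since the filter keeps only patterns whose score is at least their length, one plausible reading (an assumption, not the original helper) is that it counts distinct items, so the filter admits only patterns without repeated items:

def diversity_score(patt):
    # hypothetical reconstruction: number of distinct items in the pattern
    return len(set(patt))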
Example #6
    def find_patterns(self):
        print(self.sampling_type)
        db = self.data
        ps = PrefixSpan(db)
        n_items = len(db)
        result = None
        opts = {
            "closed": self.closed,
            # Somehow does not work
            #"generator": self.generator
        }
        from pprint import pprint
        pprint(opts)
        if self.sampling_type:
            result = ps.topk(self.k, **opts)
        else:
            print("Support value:", self.min_support)
            print("Size:", n_items)
            print("Support:", n_items * self.min_support / 100)
            result = ps.frequent((self.min_support * n_items / 100.0), **opts)

        self.table.model().clear()
        model = QStandardItemModel(self.table)
        model.clear()
        for col, label in enumerate(["Support", "Pattern"]):
            item = QStandardItem(label)
            model.setHorizontalHeaderItem(col, item)
        sequences = []
        for support, pattern in result:
            if len(pattern) < self.min_len:
                continue
            support /= n_items
            sequences.append((support, pattern))
            sitem = self.NumericItem(support)
            pitem = QStandardItem(str(pattern))
            model.appendRow([sitem, pitem])
        self.Outputs.object.send(sequences)
        self.table.setModel(model)
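Note that `frequent()` takes an absolute occurrence count (as in `ps.frequent(2)` elsewhere in these examples), which is why the widget converts its percentage setting into `n_items * min_support / 100`. A standalone illustration of that conversion:

from prefixspan import PrefixSpan

db = [[1, 2, 3], [1, 2], [2, 3], [1, 3]]
ps = PrefixSpan(db)
min_support_pct = 50  # percent of the database
print(ps.frequent(len(db) * min_support_pct / 100.0))  # support >= 2 sequences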
Example #7
class PrefixSpanManager:
    """
    Helper class around prefixspan

    Parameters:
        * sax_engine: SaxEngine
            SAX preprocessing instance
        * export: Boolean
            Whether or not the data is already exported to the right format

    Variables:
        * se_instance: SaxEngine
            The SAX engine instance
        * data: Array[]
            The data in SAX format
    """
    def __init__(self, sax_engine, export=True):
        self.se_instance = sax_engine
        self.data = sax_engine.sax_data
        self.process_data = []
        self.ps = None
        self.ploter = Plot(self)
        if export:
            self.export_format()

    def run(self):
        """
        Create the PrefixSpan instance from the preprocessed data
        """
        self.ps = PrefixSpan(self.process_data)

    def export_format(self):
        """
        Reshape the data to the format the PrefixSpan instance expects
        """
        tmp = []
        for elmt in self.data:
            tmp.append(elmt.ravel())
        self.process_data = tmp

    def topk(self, n, c=True):
        """
        Return the most frequent patterns (highest support), closed ones by default

        Parameters:
            * n: int
                Number of patterns to return
        Returns:
            List of frequent patterns
        """
        return self.ps.topk(n, closed=c)

    def frequent(self, n):
        """
        Return the patterns with support at least n

        Parameters:
            * n: int
                Minimal support
        Returns:
            List of patterns with minimal support n
        """
        return self.ps.frequent(n)

    def plot(self, l):
        self.ploter.plot_prefixspan(l)
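A usage sketch of what the manager does step by step, with stand-in data (SaxEngine and Plot belong to the surrounding project and are assumed here; the flattened arrays are converted to plain lists for safety):

import numpy as np
from prefixspan import PrefixSpan

# stand-ins for SaxEngine.sax_data: 2-D arrays that export_format() flattens
sax_data = [np.array([[0, 1], [2, 3]]), np.array([[0, 1], [2, 2]])]
process_data = [arr.ravel().tolist() for arr in sax_data]  # as in export_format()
ps = PrefixSpan(process_data)                              # as in run()
print(ps.topk(3, closed=True))                             # as in topk()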
Example #8
def main():

    dblp_data = pd.read_csv(r'DBLP_Dataset.csv', encoding="ISO-8859-1")
    author_title = dblp_data
    dataset = author_title.to_numpy()
    list1 = dataset[:, 2].tolist()

    #convert authors to lower case
    list2 = []
    for i in list1:
        sublist = i.lower().split()
        list2.append(sublist)
    
    te = TransactionEncoder()
    te_ary = te.fit(list2).transform(list2)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent = fpgrowth(df, min_support=0.001, use_colnames=True)
    frequent = frequent[frequent['itemsets'].str.len()>1]

    freqauth_list = []
    for i in frequent['itemsets']:
        freqauth_list.append([x for x in i])

    freqauth_dict = {}
    for i in freqauth_list:
        title_idx_sublist = []
        for idx, j in enumerate(list2):
            if set(i).issubset(j):
                title_idx_sublist.append(idx)
        freqauth_dict.update({tuple(i):title_idx_sublist})

    freqauth_title_dict = {}
    kstem = ks.PyKrovetzStemmer()
    for key, value in freqauth_dict.items():
        title_df = author_title.iloc[value]['title']
        title_sublist = list(title_df)
        title_sublists = []
        for x in title_sublist:
            tempx = re.sub(r'[.]', '', x)
            # strip non-ASCII characters, lower-case, and tokenize the title
            temp_list = re.sub(r'[^\x00-\x7F]+', '', tempx).lower().split()
            # stem each token and drop stopwords; one token list per title
            # (split() always returns a list, so no scalar case is needed)
            title_sublists.append([kstem.stem(z) for z in temp_list if z not in stopwordlist])
        freqauth_title_dict.update({key:title_sublists})

    # Closed / Top k titles of frequent authors
    freqauth_title_dict_closed = {}
    for k, v in freqauth_title_dict.items():
        ps = PrefixSpan(v)
        closed_Seq_pattern = ps.topk(5, closed=True)
        freqauth_title_dict_closed.update({k:closed_Seq_pattern})

    # To get frequent author's context indicators
    frequentlist = freqauth_list
    cleanedList  = list2

    new_author_list = []
    for i in range(len(frequentlist)):
        temp_author_list = []
        authorlist = list(frequentlist[i])
        found = 0
        for k in range(len(cleanedList)):
            for j in range(len(authorlist)):
                if authorlist[j] in cleanedList[k]:
                    found = 1
                else:
                    found = 0
                    break

            if found == 1:
                for jj in range(len(authorlist)):
                    if authorlist[jj] in cleanedList[k]:
                        cleanedList[k].remove(authorlist[jj])
                temp_author_list.append(cleanedList[k])

        new_author_list.append(temp_author_list)

    context_indicator_list = []
    for i in range(0,len(new_author_list)):
        te = TransactionEncoder()
        te_ary = te.fit(new_author_list[i]).transform(new_author_list[i])
        df = pd.DataFrame(te_ary, columns=te.columns_)
        frequent_author_list = fpgrowth(df, min_support=0.5, use_colnames=True)

        supp = frequent_author_list.support.unique()  # all unique support counts
        # Dictionary keyed by support count, holding itemsets with exactly that support
        # (inner loops use their own index so the outer `i` is not clobbered)
        freq_dic = {}
        for m in range(len(supp)):
            inset = list(frequent_author_list.loc[frequent_author_list.support == supp[m]]['itemsets'])
            freq_dic[supp[m]] = inset
        # Dictionary keyed by support count, holding itemsets with support <= that count
        freq_dic2 = {}
        for m in range(len(supp)):
            inset2 = list(frequent_author_list.loc[frequent_author_list.support <= supp[m]]['itemsets'])
            freq_dic2[supp[m]] = inset2

        # Find closed frequent itemsets: an itemset is closed if no proper
        # superset has exactly the same support.
        close_freq = []
        for index, row in frequent_author_list.iterrows():
            isclose = True
            cli = row['itemsets']
            cls = row['support']
            for cand in freq_dic[cls]:
                if cli != cand and frozenset.issubset(cli, cand):
                    isclose = False
                    break

            if isclose:
                close_freq.append(list(row['itemsets']))
        context_indicator_list.append(close_freq)
    
    freqauth_context_ind_dict = {}
    for authpair, titlelist in freqauth_title_dict_closed.items():
        cleantitlelist = []
        for i in titlelist:
            if isinstance(i, tuple):
                if isinstance(i[1], list):
                    listtostring = ' '.join(i[1])
                    cleantitlelist.append(listtostring)
        freqauth_context_ind_dict.update({authpair:cleantitlelist})

    # Merge titles and context-indicator authors for frequent-pattern authors
    for idx, key in enumerate(freqauth_context_ind_dict):
        for ci in context_indicator_list[idx]:
            if len(ci) > 0:
                freqauth_context_ind_dict[key].append('&'.join(ci))

    # Context Indicator Weighting
    CI_list = list(freqauth_context_ind_dict.values())
    freqauth_context_in_weights = {}
    for key, value in freqauth_context_ind_dict.items():
        freq_auth_CI_list = value
        length_of_CI = len(value)
        temp_dict = {}
        for i in freq_auth_CI_list:
            count_tmp = 0
            for j in CI_list:
                if (i in (j)):
                    count_tmp += 1
            weight = round(1 - ((count_tmp - 1) /  count_tmp), 2)
            if (weight > 0.1):
                temp_dict.update({i:weight})
        sorted_weights_dict = sorted(temp_dict.items(), key=lambda x: x[1], reverse=True)
        freqauth_context_in_weights.update({key:sorted_weights_dict})

    freq_auth_transactions = {}
    list_of_freq_auth = list(freqauth_context_in_weights.keys())
    for i in range(0, len(freqauth_title_dict)):
        temp_dict = {}
        title_list = freqauth_title_dict.get(list_of_freq_auth[i])
        CI_list = freqauth_context_in_weights[list_of_freq_auth[i]]
        CI_list_auth = []
        for c in CI_list:
            CI_list_auth.append(c[0])
        for j in range(0, len(title_list)):
            cos_sim = cos_similarity(CI_list_auth,title_list[j])
            cos_sim = round(cos_sim, 3)
            t_title = ' '.join(freqauth_title_dict[list_of_freq_auth[i]][j])
            temp_dict.update({t_title:cos_sim})

        sorted_title_dict = sorted(temp_dict.items(), key=lambda x: x[1], reverse=True)
        # keep at most the four most similar titles
        max_len = min(len(temp_dict), 4)
        sorted_title_dict1 = dict(sorted_title_dict[:max_len])
        freq_auth_transactions.update({list_of_freq_auth[i]:sorted_title_dict1})

    # To find the strongest SSP - Match against similarity of the context units

    freq_auth_SSPs = {}
    list_of_freq_auth = list(freqauth_context_ind_dict.keys())
    list_of_freq_auth_CI =  list(freqauth_context_ind_dict.values())
    len_list_of_freq_auth_CI = len(list_of_freq_auth_CI)

    context_indicator_similarity = np.zeros(
        [len_list_of_freq_auth_CI, len_list_of_freq_auth_CI], dtype=float)
    for i in range(len_list_of_freq_auth_CI):
        for j in range(len_list_of_freq_auth_CI):
            cos_sim = round(cos_similarity(list_of_freq_auth_CI[i], list_of_freq_auth_CI[j]), 3)
            if i != j:
                context_indicator_similarity[i][j] = cos_sim
                context_indicator_similarity[j][i] = cos_sim

    context_indicator_similarity_idx = np.zeros([len_list_of_freq_auth_CI, 3], dtype=int)
    for i in range(0,len(context_indicator_similarity)):
        context_indicator_similarity_idx[i] = np.argsort(context_indicator_similarity[i])[-3:]

    SSP_Author_List = []
    for i in range(0,len(list_of_freq_auth)):
        temp_author_list_ssp = []
        for j in range(0,len(context_indicator_similarity_idx[i])):
           temp_author_list_ssp.append(list_of_freq_auth[context_indicator_similarity_idx[i][j]])
        SSP_Author_List.append(temp_author_list_ssp)

    SSP_Title_List = []

    CI_list_title = list(freqauth_title_dict_closed.values())
    CI_list1 = []
    for i in (CI_list_title):
        temp_list3 = []
        for j in i:
            CI_str = ' '.join(j[1])
            temp_list3.append(CI_str)
        CI_list1.append(list(set(temp_list3)))

    for i in range(0,len(CI_list1)):
        temp_title_list_ssp = []
        for j in range(0,len(context_indicator_similarity_idx[i])):
            ssp_str = CI_list1[context_indicator_similarity_idx[i][j]]
            temp_title_list_ssp.extend(ssp_str)
        SSP_Title_List.append(list(set(temp_title_list_ssp)))

    # Write the output to a CSV file
    # a) list_of_freq_auth
    # b) list_of_freq_auth_CI / freqauth_context_in_weights
    # c) freq_auth_transactions
    # d) SSP_Author_List
    # e) SSP_Title_List
    #for i in range(0, frequent_author_list):
    #print(len(SSP_Title_List))
    #print(SSP_Title_List)
    titles_list_with_weight = list(freq_auth_transactions.values())
    # Joining SSP authors
    SSP_authors_formatted = []
    for i in range(0,len(SSP_Author_List)):
        temp_list = []
        for j in range(0, len(SSP_Author_List[i])):
            authors = '&'.join(list(SSP_Author_List[i][j]))
            temp_list.append(authors)
        SSP_authors_formatted.append(temp_list)

    with open("./output.txt", 'w', encoding="utf-8") as f:
        f.write('Pattern' + '||' + 'Context Indicator' + '||' + 'Transaction 1' + '||' +
                'Transaction 2' + '||'  + 'Transaction 3' + '||'  + 'Transaction 4' + '||' + 'SSP - Co-Author' +
                '||' + 'SSP - Title' + '\n')
        for i in range(0, len(list_of_freq_auth)):
            authors = ' '.join(list(list_of_freq_auth[i]))
            f.write(authors + '||')
            Context_indicators = '; '.join(list_of_freq_auth_CI[i])
            f.write(Context_indicators + '||')
            for j in (titles_list_with_weight[i].keys()):
                f.write(j + '||')
            ssp_authors = '; '.join(SSP_authors_formatted[i])
            f.write(ssp_authors + '||')
            ssp_titles = '; '.join(SSP_Title_List[i])
            f.write(ssp_titles)
            f.write('\n')
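`cos_similarity` is used above but not shown. A plausible sketch (an assumption, not the original helper): cosine similarity between two token lists, computed over their term-count vectors.

from collections import Counter
import math

def cos_similarity(tokens_a, tokens_b):
    # hypothetical reconstruction: cosine of the term-frequency vectors
    ca, cb = Counter(tokens_a), Counter(tokens_b)
    dot = sum(ca[t] * cb[t] for t in ca.keys() & cb.keys())
    norm = math.sqrt(sum(v * v for v in ca.values())) * \
           math.sqrt(sum(v * v for v in cb.values()))
    return dot / norm if norm else 0.0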
Example #9
def aversion_direction_one_stacking_period():
    sequences = {}  # one direction sequence per user
    with open(FILE) as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(spamreader, None)  # skip the header row
        curr_usr = None
        avg = []
        curr_time = 0
        aversion = [0.0, 0.0, "1"]
        temp = []

        def classify(events, mean):
            # Map each aversion sample to a direction symbol relative to the
            # user's mean gaze position: "f", "l"/"r", "u"/"d", or "c" (contact).
            t = []
            for i in events:
                res = "c"
                if i[2] == "0":
                    diff = [np.abs(a - b) for a, b in zip(i[0:2], mean)]
                    if (np.abs(((diff[0] + mean[0]) / mean[0]) - ((diff[1] + mean[1]) / mean[1])) < treshold) or \
                            (((diff[0] + mean[0]) / mean[0]) > treshold2 and
                             ((diff[1] + mean[1]) / mean[1]) > treshold2):
                        res = "f"
                    elif diff[0] - diff[1] > 0:
                        if i[0] < mean[0]:
                            res = "l"
                        if i[0] > mean[0]:
                            res = "r"
                    else:
                        if i[1] < mean[1]:
                            res = "u"
                        if i[1] > mean[1]:
                            res = "d"
                t.append(res)
            return t

        for row in spamreader:
            if curr_usr != row[2]:
                if curr_usr is not None and avg:
                    sequences[curr_usr] = classify(temp, np.average(avg, axis=0))
                curr_usr = row[2]
                temp = []
                avg = []
            if row[2] == "":
                continue

            if row[6] == "1":
                avg.append([
                    float(row[4].replace(",", ".")),
                    float(row[5].replace(",", "."))
                ])
            curr_time += int(row[3])
            if row[6] == "0":
                if aversion[2] == "1":
                    temp.append(aversion)
                aversion = [
                    float(row[4].replace(",", ".")),
                    float(row[5].replace(",", ".")), row[6]
                ]
                temp.append(aversion)
                aversion = [0.0, 0.0, "1"]
                curr_time = 0

            if curr_time > PERIOD * 1000:
                temp.append(aversion)
                curr_time -= PERIOD * 1000
                aversion = [0.0, 0.0, "1"]

        if curr_usr is not None and avg:
            sequences[curr_usr] = classify(temp, np.average(avg, axis=0))  # last user

    for i in sequences.values():
        print(" -1 ".join(i) + " -2")
    ps = PrefixSpan(list(sequences.values()))
    print("aversion direction one stacking period \n\n")
    ps.minlen = 3
    ps.maxlen = 8
    for i in ps.topk(20):
        print(i)
    print("\n")
    for i in ps.topk(20, closed=True):
        print(i)
    print("\n")
    for i in ps.topk(20, generator=True):
        print(i)
    print("\n")

    # for i in ps.frequent(2):
    #     print(i)
    print("\n\n\n")
Example #10
def get_common(sequence, data):
    # Mine the longest pattern common to the known sequence and one new
    # record (first tab-separated field, lower-cased).
    data = data.split("\t")[0].lower()
    ps = PS([sequence, data])
    return get_longest(ps.topk(1000))
Example #11
def recommend(trainingset=l_good,
              s_group=s_good,
              student=s_good[0],
              path_length=9,
              rl=resourcelist):

    # Weight this student's own learning log more heavily by repeating it.
    for i in range(30):
        trainingset.append(trainingset[s_group.index(student)])
    ps = PrefixSpan(trainingset)

    pattern = ps.topk(1000, filter=lambda patt, matches: len(patt) > 1
                      )  # pattern length should be bigger than 1
    pattern_time = {}  # maps each consecutive "from,to" pair to its accumulated support

    for element in pattern:
        l_s = []  # consecutive pairs in this pattern
        s = ""
        for i in range(len(element[1])):
            if i == 0:
                s = str(element[1][i])
            else:
                l_s.append(s + "," + str(element[1][i]))
                s = str(element[1][i])
        for j in l_s:
            if j in pattern_time.keys():
                pattern_time[j] += element[0]
            else:
                pattern_time[j] = element[0]

    # order the pairs by accumulated support, highest first
    pattern_time = sorted(pattern_time.items(),
                          key=lambda pattern_time: pattern_time[1],
                          reverse=True)
    print("pattern with time:", pattern_time)
    # delete repeated parts
    #print(len(pattern_time))
    """
    Deduplication: we can't delete items from a list while iterating over it
    (that causes index-out-of-range errors), so we collect the entries to
    drop and delete them afterwards.
    """
    delete_items = []
    for k1 in range(len(pattern_time)):
        starter = pattern_time[k1][0].split(",")[0]
        ender = pattern_time[k1][0].split(",")[1]
        if starter == ender:
            # self-loop pair: record the entry itself, not its index
            delete_items.append(pattern_time[k1])
        if pattern_time[k1] == pattern_time[-1]:
            break

        for k2 in range(k1 + 1, len(pattern_time)):
            #print(pattern_time[k2])
            temps_start = pattern_time[k2][0].split(",")[0]
            temps_end = pattern_time[k2][0].split(",")[1]
            if starter == temps_start:
                delete_items.append(pattern_time[k2])
            if ender == temps_end:
                delete_items.append(pattern_time[k2])

    for i in set(delete_items):
        if i in pattern_time:
            pattern_time.remove(i)
    """
    Here we organise the path from pattern list.
    We should firstly find the head then finish the path.
    """
    element = []

    pattern_result = [x[0] for x in pattern_time
                      ]  # delete pattern show times, keep pattern
    #print("unique pattern:",pattern_result)
    store = []
    for i in range(len(pattern_result)):
        for j in range(len(pattern_result)):
            if i == j:
                continue
            if pattern_result[i].split(",")[0] in pattern_result[j]:
                store.append(pattern_result[i])
    path = list(set(pattern_result).difference(set(store)))[0]
    print("begin_node of path:", path)
    compt = 0
    c_b = 0
    l_change = 2
    while compt < path_length - 2:  # the first pair already holds two nodes, so add path_length - 2 more
        c_b += 1
        for i in pattern_result:
            if i.split(",")[0] == path.split(",")[-1]:
                path += "," + i.split(",")[-1]
                compt += 1

        if l_change == len(path):
            c_b += 1
        else:
            l_change = len(path)
        if c_b > 100000:
            break
    print("path:", path)
    return path
Example #12
def loadFile(filepath):
    # (head reconstructed; the original snippet starts mid-function)
    # Load a sequence database: each line of the file holds one eval-able sequence.
    db = []
    with open(filepath) as file:
        line = file.readline()
        while line:
            db.append(eval(line))
            line = file.readline()

    return db


def generateFilename(tno, cno):
    "Build the filename from the team number and the cluster number"
    return "Team" + str(tno) + "Cluster" + str(cno) + ".txt"


path = "Cluster/"
list_p = []
# tno 代表 team number
# cno 代表 cluster number
for tno in range(1, 3):
    for cno in range(0, 5):
        filepath = path + generateFilename(tno, cno)
        db = loadFile(filepath)
        ps = PrefixSpan(db)
        for x in range(0, 10):
            list_p.append(Pattern(ps.topk(10)[x][0], ps.topk(10)[x][1]))
        # 输出当前Cluster中出现频率最高的10个Pattern
        # print(ps.topk(10))

# sort the patterns by score, highest first
list_p = sorted(list_p, key=lambda x: x.score, reverse=True)
print("#######################################")
for x in list_p:
    print("score:", x.score, "freq:", x.freq)