Пример #1
0
 def train(self, train_vects):
     """
     Fit the model by building and inverting the covariance matrix.

     The training data returned by get_train_data(train_vects) is kept
     in self.D and self.t.  self.n and self.m are set to the number of
     entries under train_vects["vects"] and train_vects["featIDs"].
     The n x n matrix holds self.kernel.value(D[i, :], D[j, :]) for
     every pair of rows, with 1 / beta added on the diagonal; its
     inverse is stored in self.Cinv.  Nothing is returned.
     """
     if self.verbose:
         print("Loading the training data to memory...")
     self.D, self.t = get_train_data(train_vects)
     self.n = len(train_vects["vects"])
     self.m = len(train_vects["featIDs"])
     if self.verbose:
         print("Creating the covariance matrix...")
         bar = ProgressBar(TerminalController(),
                           "Train instances = %d" % self.n)
     size = self.n
     gram = zeros((size, size))
     for row in range(size):
         if self.verbose:
             bar.update(float(row + 1) / size,
                        "Processing instance no. %d" % (row + 1))
         x_row = self.D[row, :]
         # The matrix is symmetric, so only the upper triangle is
         # evaluated and each value is mirrored into the lower one.
         for col in range(row, size):
             entry = self.kernel.value(x_row, self.D[col, :])
             if col == row:
                 entry += 1.0 / float(self.beta)
             gram[row, col] = gram[col, row] = entry
     if self.verbose:
         print("Computing the inverse of the matrix...")
     self.Cinv = inv(gram)
Пример #2
0
 def train(self, train_vects):
     """
     Load the training vectors and precompute the inverse covariance.

     Sets self.D and self.t from get_train_data(train_vects), self.n
     and self.m from the lengths of train_vects["vects"] and
     train_vects["featIDs"], and self.Cinv to the inverse of the
     n x n matrix of pairwise self.kernel.value results, whose
     diagonal entries are increased by 1 / beta.  Progress is printed
     only when self.verbose is true.  Nothing is returned.
     """
     if self.verbose:
         print("Loading the training data to memory...")
     loaded = get_train_data(train_vects)
     (self.D, self.t) = loaded
     self.n = len(train_vects["vects"])
     self.m = len(train_vects["featIDs"])
     if self.verbose:
         print("Creating the covariance matrix...")
         terminal = TerminalController()
         title = "Train instances = %d" % self.n
         tracker = ProgressBar(terminal, title)
     covariance = zeros((self.n, self.n))
     for a in range(self.n):
         if self.verbose:
             done = float(a + 1) / self.n
             tracker.update(done, "Processing instance no. %d" % (a + 1))
         first = self.D[a, :]
         # Fill the upper triangle and mirror it: the matrix is symmetric.
         for b in range(a, self.n):
             second = self.D[b, :]
             k_ab = self.kernel.value(first, second)
             if a == b:
                 k_ab += 1.0 / float(self.beta)
             covariance[a, b] = k_ab
             covariance[b, a] = k_ab
     if self.verbose:
         print("Computing the inverse of the matrix...")
     self.Cinv = inv(covariance)
Пример #3
0
def write_distribution(M, result_fname):
    """
    Compute the row similarity distribution.
    To compute column similarity distribution, transpose
    the matrix first.

    Every row id returned by M.get_row_id_list() is put on a work
    queue, then NO_OF_PROCESSORS processes running do_work are started
    and joined.  Each worker receives the queue, a lock, M, the number
    of rows, the open result file and a progress bar.

    M -- matrix object providing get_row_id_list() and shape().
    result_fname -- path of the result file; it is opened in "w" mode.
    """
    work_queue = Queue()
    lock = Lock()
    # The with-block closes the result file even when queueing,
    # starting or joining a worker raises.
    with open(result_fname, "w") as distFile:
        row_ids = M.get_row_id_list()
        no_rows, no_cols = M.shape()
        for row_id in row_ids:
            work_queue.put(row_id)
        term = TerminalController()
        progress = ProgressBar(term, "Total rows = %d, columns = %d"
                               % (no_rows, no_cols))
        # compute similarity.
        procs = [
            Process(target=do_work,
                    args=(work_queue, lock, M, no_rows, distFile, progress))
            for _ in range(NO_OF_PROCESSORS)
        ]
        for p in procs:
            p.start()
        for p in procs:
            p.join()
Пример #4
0
def write_distribution(M, result_fname, DIelements):
    """
    Compute the row similarity distribution.
    To compute column similarity distribution, transpose
    the matrix first.

    Only the elements of DIelements (domain independent row elements)
    for which M.row_exists() is true are put on the work queue.
    NO_OF_PROCESSORS processes running do_work are then started and
    joined.  Each worker receives the queue, a lock, M, the number of
    queued rows, the open result file and a progress bar.

    M -- matrix object providing row_exists() and shape().
    result_fname -- path of the result file; it is opened in "w" mode.
    DIelements -- iterable of candidate row ids.
    """
    work_queue = Queue()
    lock = Lock()
    # The with-block closes the result file even when queueing,
    # starting or joining a worker raises.
    with open(result_fname, "w") as distFile:
        row_ids = [rowid for rowid in DIelements if M.row_exists(rowid)]
        (no_rows, no_cols) = M.shape()
        for row_id in row_ids:
            work_queue.put(row_id)
        term = TerminalController()
        # The title reports the full matrix shape, not the number of
        # queued rows.
        progress = ProgressBar(term, "Total rows = %d, columns = %d"
                               % (no_rows, no_cols))
        # compute similarity.
        procs = [
            Process(target=do_work,
                    args=(work_queue, lock, M, len(row_ids), distFile,
                          progress))
            for _ in range(NO_OF_PROCESSORS)
        ]
        for p in procs:
            p.start()
        for p in procs:
            p.join()
Пример #5
0
 def cluster(self, m, theta):
     """
     Sequentially cluster the rows (patterns) of the matrix m.

     Patterns are ordered with self.patsort applied to
     (pattern id, total) pairs, where the total is the sum of the
     values in the pattern's row, and are then visited in that order.
     Each pattern is compared with self.sim against every existing
     cluster.  It is merged into the most similar cluster when that
     similarity exceeds theta; otherwise it starts a new
     SEQ_CLUST_DATA cluster.

     m -- matrix object; m.L2_normalize_rows() is called on it, so
          the caller's matrix is modified.
     theta -- similarity threshold a pattern must exceed to be merged.

     Returns the list of clusters.
     """
     # first sort patterns according to the total frequency
     # of all word-pairs in which they appear.  The totals are taken
     # before the rows are normalized below.
     pats = []  # (pat_id, total_frequency_in_wpairs)
     for pat in m.get_row_id_list():
         row = m.get_row(pat)
         pats.append((pat, sum(row[k] for k in row)))
     N = len(pats)
     pats.sort(self.patsort)
     # initialize clusters.
     clusts = []
     m.L2_normalize_rows()
     term = TerminalController()
     progress = ProgressBar(term, "Clustering total rows = %d" % N)
     for (count, (pat, total)) in enumerate(pats, 1):
         # The row does not change while scanning the clusters, so it
         # is fetched once per pattern.
         vect = m.get_row(pat)
         maxsim = 0
         maxclust = None
         for c in clusts:
             s = self.sim(c, vect)
             if s > maxsim:
                 maxsim = s
                 maxclust = c
         # maxclust stays None when no cluster scored above 0; without
         # this check a negative theta would call merge on None.
         if maxclust is not None and maxsim > theta:
             progress.update(float(count) / N,
                             "MERGED %d: row = %s freq = %d clusts = %d"
                             % (count, pat, total, len(clusts)))
             maxclust.merge(pat, vect)
         else:
             progress.update(float(count) / N,
                             "   NEW %d: %s freq = %d clusts = %d"
                             % (count, pat, total, len(clusts)))
             clusts.append(SEQ_CLUST_DATA(pat, vect))
     return clusts
Пример #6
0
 def cluster(self, m, theta):
     """
     Sequentially cluster the rows (patterns) of the matrix m.

     Patterns are ordered with self.patsort applied to
     (pattern id, total) pairs, where the total is the sum of the
     values in the pattern's row, and are then visited in that order.
     Each pattern is compared with self.sim against every existing
     cluster.  It is merged into the most similar cluster when that
     similarity exceeds theta; otherwise it starts a new
     SEQ_CLUST_DATA cluster.

     m -- matrix object; m.L2_normalize_rows() is called on it, so
          the caller's matrix is modified.
     theta -- similarity threshold a pattern must exceed to be merged.

     Returns the list of clusters.
     """
     # first sort patterns according to the total frequency
     # of all word-pairs in which they appear.  The totals are taken
     # before the rows are normalized below.
     pats = []  # (pat_id, total_frequency_in_wpairs)
     for pat in m.get_row_id_list():
         row = m.get_row(pat)
         pats.append((pat, sum(row[k] for k in row)))
     N = len(pats)
     pats.sort(self.patsort)
     # initialize clusters.
     clusts = []
     m.L2_normalize_rows()
     term = TerminalController()
     progress = ProgressBar(term, "Clustering total rows = %d" % N)
     for (count, (pat, total)) in enumerate(pats, 1):
         # The row does not change while scanning the clusters, so it
         # is fetched once per pattern.
         vect = m.get_row(pat)
         maxsim = 0
         maxclust = None
         for c in clusts:
             s = self.sim(c, vect)
             if s > maxsim:
                 maxsim = s
                 maxclust = c
         # maxclust stays None when no cluster scored above 0; without
         # this check a negative theta would call merge on None.
         if maxclust is not None and maxsim > theta:
             progress.update(float(count) / N,
                             "MERGED %d: row = %s freq = %d clusts = %d"
                             % (count, pat, total, len(clusts)))
             maxclust.merge(pat, vect)
         else:
             progress.update(float(count) / N,
                             "   NEW %d: %s freq = %d clusts = %d"
                             % (count, pat, total, len(clusts)))
             clusts.append(SEQ_CLUST_DATA(pat, vect))
     return clusts
Пример #7
0
 def coclustering(self, M, theta, phi):
     """
     Implements sequential co-clustering.
     (alternation variant)

     Rows and columns are each ordered with self.patsort applied to
     (id, sum) pairs and are then processed alternately: one column,
     one row, and so on until both lists are exhausted.  The current
     column (row) is compared with self.cosine against the candidate
     clusters returned by self.get_clusters.  When the best similarity
     exceeds theta (phi) it is added to that cluster and merged into
     it with M.merge; otherwise it starts a new cluster keyed by its
     own id.

     M -- matrix object; M.merge() is called on it, so the caller's
          matrix is modified.
     theta -- similarity threshold for merging columns.
     phi -- similarity threshold for merging rows.

     Returns (rowclusts, colclusts), two dicts mapping a cluster id to
     the list of member ids.

     Progress is printed line by line when self.VERBOSE is true and
     shown on a ProgressBar otherwise.
     """
     # Initialization. sorting row counts.
     rows = [(rowid, M.get_row_sum(rowid))
             for rowid in M.get_row_id_list()]
     rows.sort(self.patsort)
     no_rows = len(rows)
     # sorting column counts.
     cols = [(colid, M.get_column_sum(colid))
             for colid in M.get_column_id_list()]
     cols.sort(self.patsort)
     no_cols = len(cols)
     columnIndex = {}
     rowIndex = {}
     colclusts = {}
     rowclusts = {}
     total = no_rows + no_cols
     progress = None
     if not self.VERBOSE:
         term = TerminalController()
         progress = ProgressBar(term,
                                "Clustering rows = %d, columns = %d" %
                                (no_rows, no_cols))

     def report(kind, ident, action, sim, count):
         # One shared format keeps the COL/ROW and MRG/NEW messages
         # consistent; verbose lines are tab separated, progress bar
         # messages are space separated.
         stats = (kind, ident, action, sim,
                  len(rowclusts), len(colclusts), count, total)
         if progress is None:
             print("%s\t%d\t%s\tSIM=%f\tTotal=(%d,%d) [%d/%d]" % stats)
         else:
             progress.update(float(count) / total,
                             "%s %d %s SIM=%f Total=(%d,%d) [%d/%d]" % stats)

     count = 0
     # start alternative clustering.  Walking both sorted lists by
     # index avoids repeatedly deleting the head of a list.
     for step in range(max(no_rows, no_cols)):
         if step < no_cols:
             # column clustering.
             count += 1
             current_column = cols[step][0]
             # M is not modified until the merge below, so the column
             # is fetched once.
             column = M.get_column(current_column)
             theta_max = 0
             max_col_clust = None
             for c in self.get_clusters(rowIndex, column):
                 s = self.cosine(column, M.get_column(c))
                 if s > theta_max:
                     theta_max = s
                     max_col_clust = c
             # max_col_clust stays None when no candidate scored above
             # 0; the check keeps a negative theta from merging into a
             # cluster that does not exist.
             if max_col_clust is not None and theta_max > theta:
                 colclusts[max_col_clust].append(current_column)
                 self.update_index(rowIndex, column, max_col_clust)
                 M.merge("COLUMNS", max_col_clust, current_column)
                 report("COL", current_column, "MRG", theta_max, count)
             else:
                 colclusts[current_column] = [current_column]
                 self.update_index(rowIndex, column, current_column)
                 report("COL", current_column, "NEW", theta_max, count)
         if step < no_rows:
             # row clustering.
             count += 1
             current_row = rows[step][0]
             # Fetched after the column step, whose merge may have
             # changed M.
             row = M.get_row(current_row)
             phi_max = 0
             max_row_clust = None
             for c in self.get_clusters(columnIndex, row):
                 s = self.cosine(row, M.get_row(c))
                 if s > phi_max:
                     phi_max = s
                     max_row_clust = c
             if max_row_clust is not None and phi_max > phi:
                 rowclusts[max_row_clust].append(current_row)
                 self.update_index(columnIndex, row, max_row_clust)
                 M.merge("ROWS", max_row_clust, current_row)
                 report("ROW", current_row, "MRG", phi_max, count)
             else:
                 rowclusts[current_row] = [current_row]
                 self.update_index(columnIndex, row, current_row)
                 report("ROW", current_row, "NEW", phi_max, count)
     # Final steps.
     return (rowclusts, colclusts)