Exemplo n.º 1
0
def fit(data, labels, label_size, alpha=1.0):
  '''
  Train standard naive bayes model.
 
  Args:
    data(Expr): documents to be trained.
    labels(Expr): the correct labels of the training data.
    label_size(int): the number of different labels.
    alpha(float): alpha parameter of naive bayes model.
  '''
  # calc document freq
  df = expr.reduce(data,
                   axis=0,
                   dtype_fn=lambda input: input.dtype,
                   local_reduce_fn=lambda ex, data, axis: (data > 0).sum(axis),
                   accumulate_fn=np.add)
  
  idf = expr.log(data.shape[0] * 1.0 / (df + 1)) + 1
   
  # Normalized Frequency for a feature in a document is calculated by dividing the feature frequency 
  # by the root mean square of features frequencies in that document
  square_sum = expr.reduce(data,
                           axis=1,
                           dtype_fn=lambda input: input.dtype,
                           local_reduce_fn=lambda ex, data, axis: np.square(data).sum(axis),
                           accumulate_fn=np.add)
  
  rms = expr.sqrt(square_sum * 1.0 / data.shape[1])
  
  # calculate weight normalized Tf-Idf
  data = data / rms.reshape((data.shape[0], 1)) * idf.reshape((1, data.shape[1]))
  
  # add up all the feature vectors with the same labels
  #weights_per_label_and_feature = expr.ndarray((label_size, data.shape[1]), dtype=np.float64)
  #for i in range(label_size):
  #  i_mask = (labels == i)
  #  weights_per_label_and_feature = expr.assign(weights_per_label_and_feature, np.s_[i, :], expr.sum(data[i_mask, :], axis=0))
  weights_per_label_and_feature = expr.shuffle(expr.retile(data, tile_hint=util.calc_tile_hint(data, axis=0)),
                                               _sum_instance_by_label_mapper,
                                               target=expr.ndarray((label_size, data.shape[1]), dtype=np.float64, reduce_fn=np.add),
                                               kw={'labels': labels, 'label_size': label_size},
                                               cost_hint={hash(labels):{'00':0, '01':np.prod(labels.shape)}})

  # sum up all the weights for each label from the previous step
  weights_per_label = expr.sum(weights_per_label_and_feature, axis=1)
  
  # generate naive bayes per_label_and_feature weights
  weights_per_label_and_feature = expr.log((weights_per_label_and_feature + alpha) / 
                                           (weights_per_label.reshape((weights_per_label.shape[0], 1)) + 
                                            alpha * weights_per_label_and_feature.shape[1]))

  return {'scores_per_label_and_feature': weights_per_label_and_feature.optimized().force(),
          'scores_per_label': weights_per_label.optimized().force(),
          }
Exemplo n.º 2
0
    def test_pca(self):
        FLAGS.opt_parakeet_gen = 0
        data = np.random.randn(*DIM)
        A = expr.from_numpy(data, tile_hint=util.calc_tile_hint(DIM, axis=0))

        m = PCA(N_COMPONENTS)
        m2 = SK_PCA(N_COMPONENTS)

        m.fit(A)
        m2.fit(data)
        print m2.components_ - m.components_
        assert np.allclose(absolute(m.components_), absolute(m2.components_))
Exemplo n.º 3
0
  def test_pca(self):
    FLAGS.opt_parakeet_gen = 0
    data = np.random.randn(*DIM)
    A = expr.from_numpy(data, tile_hint=util.calc_tile_hint(DIM, axis=0))
    
    m = PCA(N_COMPONENTS)
    m2 = SK_PCA(N_COMPONENTS)

    m.fit(A)
    m2.fit(data)
    print m2.components_ - m.components_
    assert np.allclose(absolute(m.components_), absolute(m2.components_))
Exemplo n.º 4
0
def learn_topics(terms_docs_matrix, k_topics, alpha=0.1, eta=0.1, max_iter=10, max_iter_per_doc=1):
  '''
  Using Collapsed Variational Bayes method (Mahout implementation) to train LDA topic model.

  Args:
    terms_docs_matrix(Expr or DistArray): the count of each term in each document.
    k_topics: the number of topics we need to find.
    alpha(float): parameter of LDA model.
    eta(float): parameter of LDA model.
    max_iter(int):the max iterations to train LDA topic model.
    max_iter_per_doc: the max iterations to train each document.
  '''
  num_terms = terms_docs_matrix.shape[0]
  num_docs = terms_docs_matrix.shape[1]

  topic_term_counts = expr.rand(k_topics, num_terms)
  for i in range(max_iter):
    topic_term_counts = expr.shuffle(expr.retile(terms_docs_matrix, tile_hint=util.calc_tile_hint(terms_docs_matrix, axis=1)), 
                                     _lda_mapper, 
                                     target=expr.ndarray((k_topics, num_terms), dtype=np.float64, reduce_fn=np.add), 
                                     kw={'k_topics': k_topics, 'alpha': alpha, 'eta':eta, 'max_iter_per_doc': max_iter_per_doc, 
                                         'topic_term_counts': topic_term_counts}).optimized()
  # calculate the doc-topic inference
  doc_topics = expr.shuffle(expr.retile(terms_docs_matrix, tile_hint=util.calc_tile_hint(terms_docs_matrix, axis=1)), 
                            _lda_doc_topic_mapper, 
                            kw={'k_topics': k_topics, 'alpha': alpha, 'eta':eta, 'max_iter_per_doc': max_iter_per_doc, 
                                'topic_term_counts': topic_term_counts}, 
                            shape_hint=(num_docs, k_topics)).optimized()
  
  # normalize the topic-term distribution  
  norm_val = expr.reduce(topic_term_counts, axis=1, 
                         dtype_fn=lambda input: input.dtype, 
                         local_reduce_fn=lambda ex, data, axis:np.abs(data).sum(axis), 
                         accumulate_fn=np.add)
  topic_term_counts = topic_term_counts / norm_val.reshape((k_topics, 1))
  topic_term_counts = topic_term_counts.optimized()
  return doc_topics, topic_term_counts
Exemplo n.º 5
0
def als(A, la=0.065, alpha=40, implicit_feedback=False, num_features=20, num_iter=10):
  '''
  compute the factorization A = U M' using the alternating least-squares (ALS) method.
  
  where `A` is the "ratings" matrix which maps from a user and item to a rating score, 
        `U` and `M` are the factor matrices, which represent user and item preferences.
  Args:
    A(Expr or DistArray): the rating matrix which maps from a user and item to a rating score.
    la(float): the parameter of the als.
    alpha(int): confidence parameter used on implicit feedback.
    implicit_feedback(bool): whether using implicit_feedback method for als.
    num_features(int): dimension of the feature space.
    num_iter(int): max iteration to run.
  '''
  num_users = A.shape[0]
  num_items = A.shape[1]
 
  AT = expr.transpose(A)

  avg_rating = expr.sum(A, axis=0) * 1.0 / expr.count_nonzero(A, axis=0)

  M = expr.rand(num_items, num_features)
  M = expr.assign(M, np.s_[:, 0], avg_rating.reshape((avg_rating.shape[0], 1)))
  
  for i in range(num_iter):
    # Recomputing U
    U = expr.shuffle(expr.retile(A, tile_hint=util.calc_tile_hint(A, axis=0)), 
                     _solve_U_or_M_mapper, 
                     kw={'U_or_M': M, 'la': la, 'alpha': alpha, 'implicit_feedback': implicit_feedback}, 
                     shape_hint=(num_users, num_features)).optimized()
    # Recomputing M
    M = expr.shuffle(expr.retile(AT, tile_hint=util.calc_tile_hint(AT, axis=0)), 
                     _solve_U_or_M_mapper, 
                     kw={'U_or_M': U, 'la': la, 'alpha': alpha, 'implicit_feedback': implicit_feedback}, 
                     shape_hint=(num_items, num_features)).optimized()
  return U, M
Exemplo n.º 6
0
  def __init__(self, rating_table, k=10):
    '''Based on the user-item ratings, recommend items to user.
    Parameters
    ----------
    rating_table : Spartan array of shape (N_USERS, N_ITEMS). 
        Array which represents the ratings of user(M, N)
        M is number of user, N is number of items. Mi,j means
        the rating of user i on item j.

    k : integer. The number of most similar items for each item needs to be precomputed.
        It must be less or equal than the number of items.
    '''
    assert rating_table.shape[1] >= k,\
           "The number of items must be grater or equal than k!"
    self.rating_table = expr.retile(rating_table, tile_hint=util.calc_tile_hint(rating_table, axis=1))
    self.k = k
Exemplo n.º 7
0
    def __init__(self, rating_table, k=10):
        '''Based on the user-item ratings, recommend items to user.
    Parameters
    ----------
    rating_table : Spartan array of shape (N_USERS, N_ITEMS). 
        Array which represents the ratings of user(M, N)
        M is number of user, N is number of items. Mi,j means
        the rating of user i on item j.

    k : integer. The number of most similar items for each item needs to be precomputed.
        It must be less or equal than the number of items.
    '''
        assert rating_table.shape[1] >= k,\
               "The number of items must be grater or equal than k!"
        self.rating_table = expr.retile(rating_table,
                                        tile_hint=util.calc_tile_hint(
                                            rating_table, axis=1))
        self.k = k
Exemplo n.º 8
0
def fit(data, labels, T=50, la=1.0):
  '''
  Train an SVM model using the disdca (2013) algorithm.
 
  Args:
    data(Expr): points to be trained.
    labels(Expr): the correct labels of the training data.
    T(int): max training iterations.
    la(float): lambda parameter of this SVM model.
  '''
  w = expr.zeros((data.shape[1], 1), dtype=np.float64)
  alpha = expr.zeros((data.shape[0], 1), dtype=np.float64)
  for i in range(T):
    alpha = expr.shuffle(expr.retile(data, tile_hint=util.calc_tile_hint(data, axis=0)),
                         _svm_mapper, 
                         kw={'labels': labels, 'alpha': alpha, 'w': w, 'lambda_n': la * data.shape[0]},
                         shape_hint=alpha.shape, 
                         cost_hint={ hash(labels) : {'00': 0, '01': np.prod(labels.shape)}, hash(alpha) : {'00': 0, '01': np.prod(alpha.shape)} })
    w = expr.sum(data * alpha * 1.0 / la / data.shape[0], axis=0).reshape((data.shape[1], 1))
    w = w.optimized()
  return w
Exemplo n.º 9
0
def fit(data, labels, label_size, alpha=1.0):
    '''
  Train standard naive bayes model.
 
  Args:
    data(Expr): documents to be trained.
    labels(Expr): the correct labels of the training data.
    label_size(int): the number of different labels.
    alpha(float): alpha parameter of naive bayes model.
  '''
    # calc document freq
    df = expr.reduce(data,
                     axis=0,
                     dtype_fn=lambda input: input.dtype,
                     local_reduce_fn=lambda ex, data, axis:
                     (data > 0).sum(axis),
                     accumulate_fn=np.add)

    idf = expr.log(data.shape[0] * 1.0 / (df + 1)) + 1

    # Normalized Frequency for a feature in a document is calculated by dividing the feature frequency
    # by the root mean square of features frequencies in that document
    square_sum = expr.reduce(
        data,
        axis=1,
        dtype_fn=lambda input: input.dtype,
        local_reduce_fn=lambda ex, data, axis: np.square(data).sum(axis),
        accumulate_fn=np.add)

    rms = expr.sqrt(square_sum * 1.0 / data.shape[1])

    # calculate weight normalized Tf-Idf
    data = data / rms.reshape((data.shape[0], 1)) * idf.reshape(
        (1, data.shape[1]))

    # add up all the feature vectors with the same labels
    #weights_per_label_and_feature = expr.ndarray((label_size, data.shape[1]), dtype=np.float64)
    #for i in range(label_size):
    #  i_mask = (labels == i)
    #  weights_per_label_and_feature = expr.assign(weights_per_label_and_feature, np.s_[i, :], expr.sum(data[i_mask, :], axis=0))
    weights_per_label_and_feature = expr.shuffle(
        expr.retile(data, tile_hint=util.calc_tile_hint(data, axis=0)),
        _sum_instance_by_label_mapper,
        target=expr.ndarray((label_size, data.shape[1]),
                            dtype=np.float64,
                            reduce_fn=np.add),
        kw={
            'labels': labels,
            'label_size': label_size
        },
        cost_hint={hash(labels): {
                       '00': 0,
                       '01': np.prod(labels.shape)
                   }})

    # sum up all the weights for each label from the previous step
    weights_per_label = expr.sum(weights_per_label_and_feature, axis=1)

    # generate naive bayes per_label_and_feature weights
    weights_per_label_and_feature = expr.log(
        (weights_per_label_and_feature + alpha) /
        (weights_per_label.reshape((weights_per_label.shape[0], 1)) +
         alpha * weights_per_label_and_feature.shape[1]))

    return {
        'scores_per_label_and_feature':
        weights_per_label_and_feature.optimized().force(),
        'scores_per_label':
        weights_per_label.optimized().force(),
    }