Example #1
import snap  # SNAP.py (Stanford Network Analysis Platform) bindings

# copy_graph, delete_node_type and tempPrint are assumed to be project-level
# helpers imported elsewhere (graph copying, attribute-based node removal and
# in-place progress printing).
def projection(graph, attr='type', on_attr='user', using_attr='business'):
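    """Project a bipartite graph onto its `on_attr` nodes: two `on_attr` nodes
    become adjacent when they are 2 hops apart (i.e. share an intermediate
    `using_attr` node); all `using_attr` nodes are then removed from the copy."""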
    proj_graph = copy_graph(graph)

    print('> Computing projection on %s using %s' % (on_attr, using_attr))
    nbnodes = proj_graph.GetNodes()

    for node in proj_graph.Nodes():
        node_id = node.GetId()
        node_type = proj_graph.GetStrAttrDatN(node_id, attr)
        if node_type == on_attr:
            nodeVec = snap.TIntV()
            snap.GetNodesAtHop(graph, node_id, 2, nodeVec, False)
            tempPrint('%d / %d' % (node_id, nbnodes))

            for next_neighbor_id in nodeVec:
                if (proj_graph.GetStrAttrDatN(next_neighbor_id, attr) == on_attr
                        and not proj_graph.IsEdge(node_id, next_neighbor_id)):
                    proj_graph.AddEdge(node_id, next_neighbor_id)

    delete_node_type(proj_graph, attr=attr, value=using_attr)
    return proj_graph
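
# Hypothetical usage sketch (not part of the original source; the node ids and
# the TNEANet graph type are illustrative assumptions): build a tiny
# user/business bipartite network and project it onto the user nodes.
if __name__ == '__main__':
    g = snap.TNEANet.New()
    for nid, ntype in [(0, 'user'), (1, 'business'), (2, 'user')]:
        g.AddNode(nid)
        g.AddStrAttrDatN(nid, ntype, 'type')
    g.AddEdge(0, 1)
    g.AddEdge(2, 1)
    user_graph = projection(g, attr='type', on_attr='user', using_attr='business')
    print('%d nodes, %d edges in the projection'
          % (user_graph.GetNodes(), user_graph.GetEdges()))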
Example #2
import json
import math
from math import log
from collections import Counter

import nltk
import numpy as np

# data, disp and tokenizer are assumed to be project-level modules (pickle
# I/O, in-place progress printing and Chris Potts' tokenizer wrapper); a
# dot() helper over sparse Counter vectors is sketched after sgd() below.
root = data.getParent(__file__)

alltoken = data.loadFile(root + '/computed/alltoken.pkl')
reviews_feature = data.loadFile(root + '/computed/reviews_feature.pkl')

n = len(reviews_feature)

print "Total reviews:", n

# TF-IDF
print "> Computing TF"
TF = dict()
i = 0
for review in reviews_feature:
  i += 1
  disp.tempPrint(str(i))
  TF[review] = Counter()
  for token in reviews_feature[review]:
    TF[review][token] = float(reviews_feature[review][token]) / float(max(reviews_feature[review].values()))

print "> Computing IDF"
IDF = dict()
i = 0
for token in alltoken:
  i += 1
  disp.tempPrint(str(i))
  IDF[token] = log(float(n) / float(len(alltoken[token])))

print "> Computing TFIDF"
TFIDF = dict()
i = 0
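# The loop that combines TF and IDF appears to be missing at this point; the
# following is a presumed completion (assumption: plain tf * idf per token).
for review in TF:
  i += 1
  disp.tempPrint(str(i))
  TFIDF[review] = Counter()
  for token in TF[review]:
    TFIDF[review][token] = TF[review][token] * IDF[token]
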
def sgd(TFIDF, target, alpha=0.04, epsilon=0.01, alapcoeff=.01, nIter=1000):
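  """Linear regression by stochastic gradient descent on sparse TF-IDF
  vectors: predict dot(TFIDF[review], weights) + bias, nudge each weight by
  alpha * residual * feature value, decay alpha as alphainit / sqrt(iteration),
  and stop after nIter iterations or once the change in mean squared error
  falls below epsilon. Returns (final error, weights, bias)."""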
  weights = Counter()

  # parameters 
  iIter = 0
  bias = 0
  alphainit = alpha

  # variables
  RMSE = 2 * epsilon
  first = True

  #delta_weights = Counter()
  #delta_bias = 0

  while iIter < nIter:
    print "Iter", iIter

    # Update weights 
    i = 0
    for review in TFIDF:
      i += 1
      disp.tempPrint(str(i))

      coeff = target[review]
      coeff -= dot(TFIDF[review], weights)
      coeff -= bias

      #old_delta_weights = copy(weights)
      #old_delta_bias = coeff
 
      for token in TFIDF[review] :
        weights[token] += alpha * coeff * TFIDF[review][token]
        #delta_weights[token] = coeff * TFIDF[review][token]
        
      # Constant term
      bias += alpha * coeff
      #delta_bias = coeff

      #if alapcoeff:
        #alpha = alpha * (1 + alapcoeff * (delta_bias * old_delta_bias + dot(old_delta_weights, delta_weights))) 
    # Compute the approximation error

    print "Computing error"
    error = 0.
    
    i = 0
    for review in TFIDF:
      i += 1
      disp.tempPrint(str(i))
      errori = target[review]
      errori -= dot(TFIDF[review], weights)
      errori -= bias

      error += errori ** 2.0

    iIter += 1
    
    RMSE_old = RMSE
    RMSE = error / len(TFIDF)  # note: this is the mean squared error (no square root despite the name)
    if first:
      first = False
    else:
      alpha = alphainit / math.sqrt(iIter)

    print '[',iIter,']'," - Error =", RMSE, "- Alpha =", alpha
    
    if abs(RMSE_old - RMSE) < epsilon:
      break

  return (RMSE, weights, bias)
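
# Hypothetical helper and usage sketch (assumptions, not from the original
# source): sgd relies on a dot(a, b) over sparse Counter vectors and on a
# target dict mapping each review id to its rating.
def dot(a, b):
  # Sparse dot product of two Counters; iterate over the smaller one.
  if len(b) < len(a):
    a, b = b, a
  return sum(value * b[key] for key, value in a.items())

# Example call (target would typically be a review -> star rating dict):
# RMSE, weights, bias = sgd(TFIDF, target, alpha=0.04, epsilon=0.01, nIter=1000)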
def generate_slda_data(filename="yelp_academic_dataset_review_training.json", n_reviews=None, category=None):
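    """Build sLDA-formatted train/test files from a random sample of Yelp
    reviews: tokenize each review, drop stopwords and very rare words, map the
    remaining words to integer ids, write "<nwords> id:count ..." lines and the
    matching 0-4 star labels to /tmp, and pickle the intermediate structures."""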

    filepath = root + "/dataset/" + filename

    """ A couple of useful initializations """
    # Chris Potts tokenizer.
    tok = tokenizer.Tokenizer(preserve_case=False)
    # min and max ngram sizes
    MIN_NGRAM = 1
    MAX_NGRAM = 1
    # set of unique words
    word_set = set()
    # mapping from word to int representation of a word
    word2idx = dict()
    ratings_list = []
    reviews_list = []
    reviews = []
    data_list = []
    words_distr = dict()
    words_counts = Counter()

    """ PHASE 1 : Load file and get set of all words """
    stopwords = nltk.corpus.stopwords.words("english")
    print " PHASE 1 : Get all words "
    loaded_file = open(filepath)
    lines_file = loaded_file.readlines()
    if n_reviews is None:
        n_reviews = len(lines_file)
    loaded_file.close()
    i_review = 1

    # we randomly select n_reviews from the dataset
    permutation = np.random.permutation(len(lines_file))
    sample_reviews = permutation[0:n_reviews]

    for idx_review in sample_reviews:
        line_json = lines_file[idx_review]
        review_dict = json.loads(line_json)
        tokens_list = tok.ngrams(review_dict["text"], MIN_NGRAM, MAX_NGRAM, string=True)
        rating = review_dict["stars"]
        for token in tokens_list:
            if token not in stopwords:
                """
              if token not in words_distr:
                words_distr[token] = Counter({5:0, 4:0, 3:0, 2:0, 1:0}); 
              words_distr[token][rating] += 1;
              """
                words_counts[token] += 1

        reviews_list.append(Counter(tokens_list))
        ratings_list.append(review_dict["stars"] - 1)
        word_set |= set(tokens_list)
        disp.tempPrint(str(i_review))
        i_review += 1

    """ PHASE 2 : Word to int conversion """
    filter_threshold = 0.00001 * (max(words_counts.values()) * 1.0)
    print " PHASE 2 : Word to int conversion "
    i_word = 1
    for word in word_set:
        if words_counts[word] >= filter_threshold:
            word2idx[word] = i_word
            disp.tempPrint(str(i_word))
            i_word += 1
    print "    Filtered. Before : %d words. After : %d" % (len(word_set), len(word2idx))

    """ PHASE 3 : Converting data to the right format """
    print " PHASE 3 : Converting data to the right format "
    i_review = 1
    for review in reviews_list:
        nwords = 0
        data_line = ""
        for word in review:
            if word in word2idx:
                data_line += " " + str(word2idx[word]) + ":" + str(review[word])
                nwords += 1
        data_line += "\n"
        if nwords != 0:
            data_line = str(nwords) + " " + data_line
            data_list.append(data_line)
            disp.tempPrint(str(i_review))
            i_review += 1

    """ PHASE 4 : Save into right files """
    print " PHASE 4 : Save into right files "
    n_reviews = len(data_list)
    idx_test = n_reviews * 8 / 10  # 80/20 train/test split (integer division)

    if category:
        category = "_" + category
    else:
        category = ""

    data_train = open("/tmp/slda_data_train" + category + ".txt", "w")
    label_train = open("/tmp/slda_label_train" + category + ".txt", "w")

    data_test = open("/tmp/slda_data_test" + category + ".txt", "w")
    label_test = open("/tmp/slda_label_test" + category + ".txt", "w")

    for i_review in range(idx_test):
        data_train.write(data_list[i_review])
        label_train.write(str(ratings_list[i_review]) + "\n")

    for i_review in range(idx_test, n_reviews):
        data_test.write(data_list[i_review])
        label_test.write(str(ratings_list[i_review]) + "\n")

    data_train.close()
    data_test.close()
    label_train.close()
    label_test.close()

    """ PHASE 5 : Save useful datastructures """
    print " PHASE 5 : Save useful datastructures "
    data.save(reviews_list, "/tmp/slda_reviews" + category + ".pkl.gz")
    data.save(ratings_list, "/tmp/slda_ratings" + category + ".pkl.gz")
    data.save(word2idx, "/tmp/slda_word2idx" + category + ".pkl.gz")
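
# Hypothetical usage sketch (assumption, not from the original source): the
# sample size and category name below are illustrative only.
if __name__ == "__main__":
    generate_slda_data(n_reviews=5000, category="restaurants")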