Example #1
def findthemes(nothemes,wordlist,questionresponses,inc_articles,outputfile):
    #print questionresponses
    synonym_wordlists = wordlist.splitlines()
    exclude_wordlist = []

    stop_path = "englishstop.txt"
    stop_words = surveythemer.loadStopWords(stop_path)

    surveyQuestionResponse = []
    surveyQuestionResponseNID = []
    
    for response in questionresponses:
      newresp = remove_html_tags(response["text"])
      surveyQuestionResponse.append(newresp)
      surveyQuestionResponseNID.append(response["id"])
        
    listOfAllWords, listOfSurveyQuestionWords, listOfAllSurveyQuestionTitles, stemdictionary = surveythemer.getItemWords(surveyQuestionResponse,stop_words)
    wordMatrix, listOfWordsIncluded, wordCount, fc, ic = surveythemer.createWordMatrix(listOfAllWords,listOfSurveyQuestionWords)
    pc = nothemes
    # size of the input matrix (recomputed from wordMatrix, overwriting the
    # counts returned by createWordMatrix)
    ic = shape(wordMatrix)[0]
    fc = shape(wordMatrix)[1]
    # Random initialization
    w = array([[random.random() for j in range(pc)] for i in range(ic)])
    h = array([[random.random() for j in range(fc)] for i in range(pc)])
    weights,themes = nmf.nmf(wordMatrix,w,h,0.001, 10, 500)
    themexml = surveythemer.display_themes(weights,themes,listOfWordsIncluded,surveyQuestionResponse, stemdictionary, wordCount, inc_articles, surveyQuestionResponseNID)
    with open(outputfile, 'w') as f:
        f.write(themexml)
    return
Example #2
def consensus(V, rank, nloop, method):
    """ 
    Calculate consensus matrix for columns of V
    Matrix V has the size of n rows and m columns
    n = shape[0]
    m = shape[1]
    """
    shape = V.shape
    m = shape[1]
    connac = np.zeros((m, m))

    for l in range(nloop):
        if method == "pnmf":
            (W, H) = pnmf.projnmf(V, rank, 0.001, 50, 100)
        elif method == "nmf":
            (W, H) = nmf.nmf(V, rank, 0.001, 50, 100)
        elif method == "spnmf":
            (W, H) = spnmf.spnmf(V, rank, 0.001, 50, 100)
        elif method == "snmf":
            (W, H) = snmf.snmf(V, rank, 0.001, 50, 100)
        conn = connectivity(H)
        connac = connac + conn

    consensus = connac / float(nloop)
    return consensus
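The connectivity helper called above is not included in this snippet. A minimal sketch, assuming the usual convention that each column of V belongs to the cluster whose row of H holds its largest coefficient (the helper's behavior is inferred, not taken from the source):

import numpy as np

def connectivity(H):
    # cluster label per column of V = argmax over the rank dimension
    labels = np.argmax(H, axis=0)
    # m x m indicator: 1 where two columns share a cluster, else 0
    return (labels[:, None] == labels[None, :]).astype(float)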
Example #3
def multi_evaluation(method, filename, view_names, weights, norm = 'l2', seed="random", post = "direct", data_size=-1):
    """
    Eval of baselines (k-means, nmf, svd) on the combined view of all views for multi-view clustering. 
    
    :param: method (type: string). Supported methods include "k-means", "nmf", "svd"
    :param: filename (type: string). The path of the input multi-view data (.MAT format).
    :param: view_names (type: list<string>). View names of the input multi-view data.
    :param: weights (type: list<int>). The weight of each view to build the combined view.
    :param: norm (type: string). Normalization strategy on the input data. Supported norm methods include:
        'l2': each item vector is normalized by its Euclidean length (i.e. l2 distance).
        'l1': each item vector is normalized by its sum of all elements (i.e. l1 distance). 
        'l0': the whole matrix is normalized by the sum of all elements (i.e. l1 normalization on the whole matrix).
    :param seed (type: string). The initialization method in CoNMF. Values can be 'k-means' and 'random':
        'k-means': initialize W and H matrix using k-means results. The details are seen in paper [1] Section 4.5
        'random': randomly initialize W and H matrix. 
    :param post (type: string). Post processing on W matrix (m*k) to generate clustering result. Values can be 'direct' and 'k-means':
        'direct': for each item vector (m*1), use the element with largest value as its cluster assignment.
        'k-means': perform k-means on W matrix to get cluster assignment. 
    :param data_size (type: int). Select the first data_size items to run clustering algorithm. 
        When the value is -1, the clustering algorithm is run on all items. 
        This parameter is for a quick check of clustering algorithms in case the input data is too large.      
    """
    method = method.lower()
    if method not in ["k-means","kmeans","nmf","svd"]:
        print "Error! Wrong input method name."
        return None
    # weights can only be integers
    if len(view_names)!= len(weights):
        print "Error! Length of view_names is not equal to the length of weights!"
        return None

    datas, names, groundtruth, cluster_k = dl.loadMATdata(filename, view_names,data_size)
    data = dl.combineData(datas, weights, norm)
    NMIs,As,F1s = [],[],[]
    print "Running multi- %s(k=%d,norm=%s,seed=%s,post=%s) for %s, size = %s, #runs: %d" %(method,cluster_k,norm,seed,post,names,str(data.shape), n_runs)
    print "view_names = %s, weights = %s" %(str(names),str(weights))
    i_run = 1
    t0 = time()
    while i_run <= n_runs:
        t1 = time()
        if method == "kmeans" or method=="k-means":
            labels = kmeans(data, cluster_k, norm)[0]
        if method == "nmf":
            labels = nmf(data, cluster_k, norm, seed, post, groundtruth)[0]
        if method == "svd":
            labels = svd(data, cluster_k, norm, post)
        NMI,A,F1,P,RI,ARI = evaluation_scores(groundtruth,labels)
        print "\t %d-th run(time=%ds),<Acc, F1, NMI>\t%f\t%f\t%f" %(i_run,time()-t1,A,F1,NMI)
        NMIs.append(NMI)
        As.append(A)
        F1s.append(F1)
        i_run += 1
            
    print "Results of %d runs (mean,std_var):\n\t Acc: %f, %f\n\t F1 : %f, %f\n\t NMI: %f, %f"  %(n_runs,
        np.mean(As),np.std(As),np.mean(F1s),np.std(F1s),np.mean(NMIs),np.std(NMIs))
    print "Running time: %fs" %(time() - t0)
Example #4
def plot_run(plotter):
    a = nmf.tf_idf(nmf.read_term_document())
    terms = nmf.read_terms()
    abort_error = -500
    max_iter = 500
    num_terms = 3
    errors = []
    for c in range(2, 7):
        term_indices, iterations, best_w, e = nmf.nmf(a, abort_error, max_iter, c, num_terms)
        errors.append(e)

    plotter(errors)
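plot_run receives the plotting routine as a callable. A minimal sketch of such a callable using matplotlib (the name plot_errors is hypothetical):

import matplotlib.pyplot as plt

def plot_errors(errors):
    # one final reconstruction error per cluster count c = 2..6
    plt.plot(range(2, 7), errors, marker="o")
    plt.xlabel("number of clusters c")
    plt.ylabel("reconstruction error")
    plt.show()

plot_run(plot_errors)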
Example #5
def train():
    """Train with the train set

    Train with the train set, and return the user_user_similarities array.

    Returns:
        user_user_similarities (Type: numpy.ndarray): Similarities between users; user_user_similarities[i, j] is the similarity between user i and user j, and the similarity of a user to itself is 1.
        user_ids (Type: numpy.ndarray, vector): user ids, in ascending order, associated with the rows and columns of the array.
    """

    # get user-news array
    user_news_array_of_train = numpy.load("user_news_array_of_train.npy")
    user_ids_of_train = numpy.load("user_ids_of_train.npy")
    user_news_array = user_news_array_of_train
    user_ids = user_ids_of_train
    user_num = len(user_ids)
    del user_news_array_of_train, user_ids_of_train

    # NMF
    V = numpy.float16(user_news_array.T)
    K = 15  # TODO refine this parameter
    W, H = nmf.nmf(V, K)
    del V, W
    # estimatedV = numpy.dot(W, H)

    # calculate similarity between users
    print("Calculate similarity between users started.")
    time_start = time.time()
    user_user_similarities = numpy.zeros((user_num, user_num), numpy.float16)
    H_norm = numpy.power(H, 2)
    H_norm = H_norm.sum(0)
    H_norm = numpy.sqrt(H_norm)  # norm of each column vector in H
    H_norm = numpy.tile(H_norm, (K, 1))
    eps = numpy.finfo(float).eps
    H_normalized = H / (H_norm + eps)
    H_normalized_transpose = H_normalized.transpose()
    computed_count = 0
    compute_step = 1000  # to avoid MemoryError, only compute a part each time
    while computed_count < user_num:
        compute_upper_limit = min((computed_count + compute_step, user_num))
        user_user_similarities[computed_count:compute_upper_limit, :] = numpy.dot(H_normalized_transpose[computed_count:compute_upper_limit, :], H_normalized)
        computed_count += compute_step
    del H, H_norm, H_normalized, H_normalized_transpose, computed_count, compute_step
    time_end = time.time()
    print("Calculate similarity between users ended. %f s cost." % (time_end - time_start))

    print("[NMF-User-based Collaborative Filtering] Train finished!")

    return user_user_similarities, user_ids
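The blockwise dot product above computes plain cosine similarity between columns of H, chunked only to bound memory. A tiny equivalent for a single pair of users i and j (illustrative, not from the source):

import numpy

def cosine_similarity(u, v):
    eps = numpy.finfo(float).eps
    return numpy.dot(u, v) / (numpy.linalg.norm(u) * numpy.linalg.norm(v) + eps)

# matches user_user_similarities[i, j] up to float16 rounding:
# cosine_similarity(H[:, i], H[:, j])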
Example #6
def findthemes(nothemes,wordlist,question_responses,stop_words):
    synonym_wordlists = wordlist.splitlines()
    exclude_wordlist = []
    surveyQuestionResponse = []
    favourites = []
    questionIDs = []
    
    
    for response in question_responses:
      newresp = remove_html_tags(response[0])
      
      for synliststring in synonym_wordlists:
        synlist = synliststring.split(",")
        mainsyn = synlist[0]
        for synword in synlist[1:]:
            newresp = newresp.replace(synword, mainsyn)
            #print newresp
      surveyQuestionResponse.append(newresp)
      favourites.append(response[1])
      questionIDs.append(response[2])
      
    listOfAllWords, listOfSurveyQuestionWords, listOfAllSurveyQuestionTitles, stemdictionary = surveythemer.getItemWords(surveyQuestionResponse,stop_words)
    wordMatrix, listOfWordsIncluded, wordCount, fc, ic = surveythemer.createWordMatrix(listOfAllWords,listOfSurveyQuestionWords)

    if (wordMatrix.shape[0] < nothemes+1):
        return '{"themesData": "Not enough data"}'
        #sys.exit("Not enough data")
    w,h = surveythemer.nndsvd(wordMatrix,nothemes,1)
    if not h.any():
        return '{"themesData": "Empty"}'
    # sys.exit("Error message")
    nmfresult = ""
    themes = ""
    
    weights,themes = nmf.nmf(wordMatrix,w,h,0.001, 10, 500)
	
    json,data = surveythemer.display_themes(weights,themes,listOfWordsIncluded,surveyQuestionResponse, stemdictionary, wordCount, favourites, questionIDs)
    # print theme_html
    returnData = '{"themesData":'+ data +', "jsonData":'+ json + '}'    
    return returnData
Example #7
File: 123.py Project: lipiny/ao
end_time = time.time()
print('reading test data process finish time: %f' % (end_time - start_time))

training_set_normalize = preprocessing.normalize(training_set, axis=0)
test_set_normalize = preprocessing.normalize(test_set, axis=0)

#===============================
#nmf and knn
#===============================

print('-×-×-×-NMF process begin-×-×-×-')
start_time = time.time()

### NMF
training_W, training_H = nmf(training_set_normalize, n_components, max_iter)

end_time = time.time()
print('NMF process end, time: %f' % (end_time - start_time))

test_H = nmf_keepW(test_set_normalize, training_W)

training_H_trans = training_H.transpose()
test_H_trans = test_H.transpose()

knn(num_neighbors, training_H_trans, training_categorie, test_H_trans,
    test_categorie)

#===============================
#new method
#===============================
Example #8
def getMainFeature(feature, data):
    """
        对df_train(test).csv进行特征提取
    """
    Log('...get main feature...')
    df = copy.deepcopy(data)
    # 诊断结果病种提取
    case = df['出院诊断病种名称'].replace(np.nan, u' ').values
    df['case_length'] = [len(value) for value in case]
    df['心脏病'] = [[0, 10]['心脏病' in value] for value in case]
    df['肺心病'] = [[0, 4]['肺心病' in value] for value in case]
    df['高血压'] = [[0, 17]['高血压' in value] for value in case]
    df['冠心病'] = [[0, 10]['冠心病' in value] for value in case]
    df['挂号'] = [[0, 7]['挂号' in value] for value in case]
    df['门特挂号'] = [[0, 13]['门特挂号' in value] for value in case]
    df['糖尿病'] = [[0, 45]['糖尿病' in value] for value in case]
    df['尿毒症'] = [[0, 5]['尿毒症' in value] for value in case]
    df['偏瘫'] = [[0, 14]['偏瘫' in value] for value in case]
    df['精神病'] = [[0, 2]['精神病' in value] for value in case]

    # membership flags derived from subsidy-amount columns (weights 1/6/4/... follow the source)
    # note: the source used `|` here, which is always true because NaN != 0 evaluates true;
    # `&` (non-null AND non-zero) matches the apparent intent
    df['是否残疾军人'] = 0
    df.loc[(~df['残疾军人医疗补助基金支付金额'].isnull()) & (df['残疾军人医疗补助基金支付金额'] != 0),
           '是否残疾军人'] = 1
    df['是否城乡救助'] = 0
    df.loc[(~df['城乡救助补助金额'].isnull()) & (df['城乡救助补助金额'] != 0), '是否城乡救助'] = 6
    df['是否公务员'] = 0
    df.loc[(~df['公务员医疗补助基金支付金额'].isnull()) & (df['公务员医疗补助基金支付金额'] != 0),
           '是否公务员'] = 4
    df['是否自负'] = 0
    df.loc[(~df['起付标准以上自负比例金额'].isnull()) & (df['起付标准以上自负比例金额'] != 0),
           '是否自负'] = 1
    df['是否民政救助'] = 0
    df.loc[(~df['民政救助补助金额'].isnull()) & (df['民政救助补助金额'] != 0), '是否民政救助'] = 4
    df['是否城乡优抚'] = 0
    df.loc[(~df['城乡优抚补助金额'].isnull()) & (df['城乡优抚补助金额'] != 0), '是否城乡优抚'] = 3
    # count delimiter occurrences in the diagnosis text
    # (full-width '，' and '；' restored here on the assumption that the scrape
    # collapsed them to ASCII, which made type1/type2 and type3/type4 identical)
    df['case_has_type1'] = [len(value.split(',')) - 1 for value in case]
    df['case_has_type2'] = [len(value.split('，')) - 1 for value in case]
    df['case_has_type3'] = [len(value.split(';')) - 1 for value in case]
    df['case_has_type4'] = [len(value.split('；')) - 1 for value in case]
    df['case_has_type5'] = [len(value.split('(')) - 1 for value in case]
    df['case_has_type6'] = [len(value.split(' ')) - 1 for value in case]
    # summary statistics
    case_col1 = [
        '心脏病', '肺心病', '高血压', '冠心病', '挂号', '门特挂号', '糖尿病', '尿毒症', '偏瘫', '精神病'
    ]
    case_col_sp = ['是否残疾军人', '是否城乡救助', '是否公务员', '是否自负', '是否民政救助', '是否城乡优抚']
    df['病_sum'] = df[case_col1].sum(axis=1)
    df['special_sum'] = df[case_col_sp].sum(axis=1)
    case_col2 = ['case_length','case_has_type1','case_has_type2','case_has_type3',\
    'case_has_type4','case_has_type5','case_has_type6','病_sum', 'special_sum']

    money = []
    no_money = []
    for value in df.columns.values:
        if '金额' in value:
            money.append(value)
        else:
            no_money.append(value)
    # NMF: factor the user x expense-item matrices into user factors and item factors;
    # use the user factors as new features (20 dims)
    money_sum_df, money_mean_df = df.groupby(['uid'])[money].apply(
        np.sum).fillna(0), df.groupby(['uid'])[money].apply(np.mean).fillna(0)
    matdfs = {'MoneySum': money_sum_df, 'MoneyMean': money_mean_df}
    matvs = {k: item.copy().values for k, item in matdfs.items()}
    factor_num = 20
    for k in matdfs:
        matdf, matv = matdfs[k], matvs[k]
        shape = matv.shape
        np.random.seed(20)
        initW, initH = np.random.rand(shape[0], factor_num), np.random.rand(
            factor_num, shape[1])
        outW, outH = nmf(matv, initW, initH, 0.0001, 5555555, 250)
        feature_factor = pd.DataFrame(
            outW,
            index=pd.Index(data=matdf.index.values, copy=True, name='uid'),
            columns=[k + '_factor_' + str(i)
                     for i in range(factor_num)]).reset_index()
        print(feature_factor.max(axis=0))
        feature = pd.merge(feature, feature_factor, on='uid',
                           how='left').fillna(0)
    # number of records per user
    Log('get record number feature...')
    feature_rec = df.groupby(['uid'])['顺序号'].count()
    feature_rec = feature_rec.rename('record_num')
    feature_rec = feature_rec.reset_index()
    feature = pd.merge(feature, feature_rec, on='uid', how='left')
    # number of distinct hospitals visited per user
    Log('get category feature...')
    feature_cat = df.groupby(['uid'])['医院编码'].nunique()
    feature_cat = feature_cat.rename('hospital_num')
    feature_cat = feature_cat.reset_index()
    feature = pd.merge(feature, feature_cat, on='uid', how='left')
    # statistics for continuous variables [mean, sum, median, min, max, std]
    Log('get continuous feature...')
    col_con = money + case_col2
    # col_con.append('医疗救助医院申请')
    # ,'医疗救助医院申请'
    feature_con = df.groupby(['uid'])[col_con].agg(
        [np.mean, np.sum, np.median, np.min, np.max, np.std])
    feature_con = feature_con.reset_index()
    feature = pd.merge(feature, feature_con, on='uid', how='left')

    # statistics for the categorical flags [mean, sum, std]
    Log('get dummy feature...')
    feature_dum = df.groupby(['uid'])[case_col1 + case_col_sp].agg(
        [np.mean, np.sum, np.std])
    feature_dum = feature_dum.reset_index()
    feature = pd.merge(feature, feature_dum, on='uid', how='left')
    # NMF: factor the user x hospital visit-count matrix;
    # use the user factors as new features (15 dims)
    factor_num = 15
    PH_Mat = df.groupby(['uid', '医院编码']).size().unstack().fillna(0)
    PH_MatV = PH_Mat.copy().values
    shape = PH_MatV.shape
    np.random.seed(20)
    initW, initH = np.random.rand(shape[0], factor_num), np.random.rand(
        factor_num, shape[1])
    outW, outH = nmf(PH_MatV, initW, initH, 0.0001, 5555555, 200)
    feature_factor = pd.DataFrame(
        outW,
        index=pd.Index(data=PH_Mat.index.values, copy=True, name='uid'),
        columns=['Hostpital_factor_' + str(i)
                 for i in range(factor_num)]).reset_index()
    feature = pd.merge(feature, feature_factor, on='uid', how='left').fillna(0)
    return feature
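The five-argument call nmf(matv, initW, initH, 0.0001, 5555555, 250) used above looks like the projected-gradient interface nmf(V, Winit, Hinit, tol, timelimit, maxiter), where the oversized fifth argument effectively disables the time limit. A toy call under that assumption:

import numpy as np

V_toy = np.abs(np.random.rand(6, 4))   # small nonnegative matrix
W0 = np.random.rand(6, 2)
H0 = np.random.rand(2, 4)
W_out, H_out = nmf(V_toy, W0, H0, 0.0001, 60, 100)  # tol, time limit (s), max iters
print(np.abs(V_toy - W_out.dot(H_out)).mean())      # mean reconstruction error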
Example #9
def cluster():
    return nmf.nmf()
Example #10
# The snippet begins mid-assignment: the opening of this first matrix literal
# was cut off in the source; it is unused and overwritten by V below.
_truncated = [
        [5, 3, 0, 1],
        [4, 0, 0, 1],
        [1, 1, 0, 5],
        [1, 0, 0, 4],
        [0, 1, 5, 4],
    ]
V = [
        [1, 1, 0, 1, 1],
        [1, 0, 0, 1, 0],
        [1, 1, 0, 1, 0],
        [1, 0, 0, 1, 0],
        [0, 1, 1, 1, 0],
    ]

V = numpy.array(V)
print("V = ")
print(V)

time_start = time.time()
K = 2
W, H = nmf.nmf(V, K)
time_end = time.time()
estimatedV = numpy.dot(W, H)
print("W = ")
print(W)
print("H = ")
print(H)
print("estimatedV = ")
print(estimatedV)
print(time_end - time_start)
Example #11
# Example data - impute to replace missing values
# (excerpt: `rank`, `show_features` and `nmf` are assumed defined earlier in the source script)
import numpy as np
import matplotlib.pyplot as plt
import Orange  # Orange 2.x (Python 2) API

data = Orange.data.Table("yeast-class-RPR")
imputer = Orange.feature.imputation.MinimalConstructor()
imputer = imputer(data)
data = imputer(data)

# Convert to numpy and separate positive and negative values
# Note that NMF works on strictly *nonnegative data*
X, y, _ = data.to_numpy()
Xp = X * (X > 0)
Xn = X * (X < 0) * (-1)
X = np.hstack([Xp, Xn])

# Unsupervised learning of training examples
W, Hs = nmf([X], rank=rank)
Xa = W.dot(Hs[0])
print Xa.shape

plt.figure(figsize=(5.0, 5.0))
plt.imshow(X)
plt.title("Original")

plt.figure(figsize=(5.0, 5.0))
plt.title("Approximation")
plt.imshow(Xa)

# Show clusters
if show_features:
    plt.figure()
    plt.title("Clusters")
Example #13
show_features = 0  # excerpt begins mid-script; flag carried over from the source

# Example data - impute to replace missing values
# (excerpt: `rank`, `nmf` and `nmf_fix` are assumed defined earlier in the source script)
import numpy as np
import Orange  # Orange 2.x (Python 2) API
from sklearn.metrics import roc_auc_score

data = Orange.data.Table("yeast-class-RPR")
imputer = Orange.feature.imputation.MinimalConstructor()
imputer = imputer(data)
data = imputer(data)

# Convert to numpy and separate positive and negative values
# Note that NMF works on strictly *nonnegative data*
X, y, _ = data.to_numpy()
Xp = X * (X > 0)
Xn = X * (X < 0) * (-1)
X = np.hstack([Xp, Xn])

# Reshape and select target class
y = y.reshape((len(X), 1))
y = (y == 0).astype(int)

# Learn the model parameters given the attributes and the class values
tr = range(0, len(X), 2)
te = range(1, len(X), 2)

W, Hs = nmf([X[tr], y[tr]], rank=rank)
Wte = nmf_fix([X[te]], [Hs[0]], rank=rank)
prediction = Wte.dot(Hs[1])

# Measure performance
auc = roc_auc_score(y_score=prediction, y_true=y[te])
print "AUC: ", auc
Example #15
# excerpt begins inside a helper that writes matrix A to the open file fp
# as column-major float32 binary (the enclosing def line was cut off):
    binvals = array.array('f',A.flatten('F').tolist())
    binvals.tofile(fp)
    fp.close()
    return

path = '../data/'
X = npy_from_file(path + 'X.bin')
W = npy_from_file(path + 'W.bin')
H = npy_from_file(path + 'H.bin')


#print X
#print W
#print H


nmf.nmf(X,W,H,100,1)


#print W
#print H
#print np.dot(W,H)
Example #16
    Ay_position_a_guess_llh = xyz2llh(Ay_position_a_guess[0, 0],
                                      Ay_position_a_guess[0, 1],
                                      Ay_position_a_guess[0, 2]).xyz()
    F_zhd_a = Mobj_b_base_Error.saastamoinen_model(
        np.array([Ay_position_a_guess_llh[0]]),
        np.array([Ay_position_a_guess_llh[1]]),
        np.array([Ay_position_a_guess_llh[2]]), np.array([np.pi / 2]), 0)

    F_zhd_b = Mobj_b_base_Error.saastamoinen_model(
        np.array([Ay_position_b_llh[0]]), np.array([Ay_position_b_llh[1]]),
        np.array([Ay_position_b_llh[2]]), np.array([np.pi / 2]), 0)
    '''
    Ay_nmf_a/_b : tropospheric mapping-function values at stations a and b;
    used to correct Ay_geographical_distance_a and Ay_geographical_distance_b
    '''
    Ay_nmf_b = nmf.nmf(I_doy, Ay_position_b_llh[0], Ay_position_b_llh[1],
                       Ay_position_b_llh[2], Ay_elevation)
    Ay_nmf_a = nmf.nmf(I_doy, Ay_position_a_guess_llh[0],
                       Ay_position_a_guess_llh[1], Ay_position_a_guess_llh[2],
                       Ay_elevation)

    Ay_geographical_distance_b += F_zhd_b * Ay_nmf_b
    Ay_geographical_distance_a += F_zhd_a * Ay_nmf_a

    Ay_L1_chose_a_minus_ra = Ay_L1_chose_a - Ay_geographical_distance_a
    Ay_C1_chose_a_minus_ra = Ay_C1_chose_a - Ay_geographical_distance_a

    Ay_L1_chose_b_minus_rb = Ay_L1_chose_b - Ay_geographical_distance_b
    Ay_C1_chose_b_minus_rb = Ay_C1_chose_b - Ay_geographical_distance_b

    Ay_L1_chose_SD_minus_rSD = Ay_L1_chose_a_minus_ra - Ay_L1_chose_b_minus_rb
    Ay_C1_chose_SD_minus_rSD = Ay_C1_chose_a_minus_ra - Ay_C1_chose_b_minus_rb
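Note that nmf here is a tropospheric mapping function of (day of year, latitude, longitude, height, elevation), not matrix factorization: the zenith hydrostatic delay from the Saastamoinen model is projected onto each slant path before correcting the geometric distances,

\[ \rho \leftarrow \rho + \mathrm{ZHD} \cdot m(e) \]

where m(e) is the mapping-function value at elevation e.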
Example #17
def recommend_skill_skill_matrix(user_skills, unique_skills):

    n = len(unique_skills.keys())
    ss_matrix = np.zeros(n*n).reshape(n, n)

    skill_id_map = {}
    skill_name_map = {}
    skill_id = 0
    for sk in unique_skills:
        skill_id_map[sk] = skill_id
        skill_name_map[skill_id] = sk
        skill_id += 1

    # co-occurrence counts: +1 for every pair of skills listed by the same user
    for sk in user_skills:
        for a in range(0, len(sk)):
            for b in range(a+1, len(sk)):
                s1 = skill_id_map[sk[a]]
                s2 = skill_id_map[sk[b]]
                ss_matrix[s1][s2] += 1
                ss_matrix[s2][s1] += 1

    print 'Skill-skill matrix shape', ss_matrix.shape
    model = nmf('kl', 10)

    W, H = model.fit(ss_matrix, 200, False)

    ss_matrix_approx = W.dot(H)
    print "Reconstructed matrix ", ss_matrix_approx

    def top5(val):
        # indices of the five largest entries (greedy scan, as in the source)
        top = []
        for i in range(5):
            max_v = 0
            max_j = 0
            for j in range(len(val)):
                if val[j] > max_v and (j not in top):
                    max_v = val[j]
                    max_j = j
            top.append(max_j)
        return top

    for sk in skill_id_map:
        sid = skill_id_map[sk]
        print sk.encode("utf-8"), " : "
        top5_h = top5(ss_matrix_approx[sid, :])    # row of the reconstruction
        top5_v = top5(ss_matrix_approx[:, sid].T)  # column of the reconstruction
        top5_o = top5(ss_matrix[sid, :])           # row of the raw counts
        # the source computed top5_v but never printed it; included here
        for t1, t2, t3 in zip(top5_o, top5_h, top5_v):
            print "    ", skill_name_map[t1].encode("utf-8"), "   \t", \
                skill_name_map[t2].encode("utf-8"), "   \t", skill_name_map[t3].encode("utf-8")
Example #18
from numpy import *
import numpy as np
import nmf

m = 5
n = 5
W = mat(random.random((m, n)))
V = np.array([W, W, W])  # shape (3, 5, 5); V[:, :, 0] is a 3 x 5 slice
# a rank-2 factorization of a 3 x 5 matrix needs W (3, 2) and H (2, 5);
# the source initialized both as (5, 2), which does not conform
w2 = mat(random.random((3, 2)))
h2 = mat(random.random((2, 5)))

(wo, ho) = nmf.nmf(V[:, :, 0], w2, h2, 0.001, 10, 10)
print((wo, ho))
Example #19
# excerpt begins inside a helper that writes matrix A to the open file fp
# as column-major float32 binary (the enclosing def line was cut off):
    binvals = array.array('f',A.flatten('F').tolist())
    binvals.tofile(fp)
    fp.close()
    return

path = '/home/ericb/projects/nmf/trunk/'
X = npy_from_file(path + 'X.bin')
W = npy_from_file(path + 'W.bin')
H = npy_from_file(path + 'H.bin')


#print X
#print W
#print H


nmf.nmf(X,W,H,200,1)


#print W
#print H
#print np.dot(W,H)