Пример #1
0
def define_segments(QLINK_URLS, UNKNOWN_URLS, QUOTA):
    """Fit the module-level encoder, regressor and clustering for segmentation.

    Nothing is returned. Results are published through module globals:
    ``OH`` (fitted one-hot encoder), ``lr`` (fitted ``LinearRegression``),
    ``km`` (fitted ``KMeans``), ``part`` (per-cluster share of the rows
    labelled 1), ``quota`` (set to ``QUOTA``) and ``i`` (reset to 0).

    Args:
        QLINK_URLS: URLs whose rows are labelled 1.
        UNKNOWN_URLS: URLs whose rows are labelled 0.
        QUOTA: Value stored unchanged in the global ``quota``.

    NOTE(review): ``get_prop`` is defined outside this block; it is assumed
    to return a list of hashable properties for a URL -- confirm.
    """
    global OH, lr, quota, i, km, part
    quota = QUOTA

    # One property list per labelled URL, plus the flat pool of all of them.
    obj_q = [get_prop(url) for url in QLINK_URLS]
    list_q = [item for props in obj_q for item in props]

    # Pair every distinct property with its occurrence count and order the
    # pairs by count, most frequent first.
    params, count = np.unique(list_q, return_counts=True)
    params = np.array(params, dtype='object')
    count = np.array(count, dtype='object')
    feat_q_num = np.hstack((params.reshape(-1, 1), count.reshape(-1, 1)))
    feat_q_num = feat_q_num[np.argsort(feat_q_num[:, 1])][::-1]

    obj_g = [get_prop(url) for url in UNKNOWN_URLS]

    # The 100 most frequent properties form the encoder vocabulary; each one
    # is wrapped as its own single-item row for fitting.
    imp_features = feat_q_num[:100][:, 0].reshape(-1, 1)

    OH = OnehotTransactions()
    OH.fit(imp_features)

    q_matrix = OH.transform(obj_q)
    g_matrix = OH.transform(obj_g)

    X = np.vstack((q_matrix, g_matrix))
    # Label counts follow the real number of rows per class; a fixed 500/500
    # split only lines up with X when both inputs hold exactly 500 URLs.
    y = np.array([1] * len(obj_q) + [0] * len(obj_g))

    lr = LinearRegression()
    lr.fit(X, y)
    i = 0

    # Cluster all rows, then record which fraction of the label-1 rows falls
    # into each cluster. The loop index is a local name on purpose: reusing
    # the global ``i`` here would leave it at n - 1 instead of 0.
    n = 15
    km = KMeans(n_clusters=n)
    labels = km.fit_predict(X)
    part = np.zeros(n)
    for cluster in range(n):
        part[cluster] = y[labels == cluster].sum()
    part /= part.sum()
Пример #2
0
def define_segments(QLINK_URLS, UNKNOWN_URLS, QUOTA):
    """Train the shared encoder, regressor and clusterer from two URL sets.

    The function returns nothing; it (re)binds these module globals:

    * ``quota`` -- set to ``QUOTA``.
    * ``OH`` -- one-hot encoder fitted on the 100 most frequent properties
      found across ``QLINK_URLS``.
    * ``lr`` -- ``LinearRegression`` fitted on the encoded rows, with label 1
      for ``QLINK_URLS`` rows and label 0 for ``UNKNOWN_URLS`` rows.
    * ``km`` -- ``KMeans`` with 15 clusters fitted on the same rows.
    * ``part`` -- length-15 array; entry ``k`` is the share of label-1 rows
      assigned to cluster ``k``.
    * ``i`` -- reset to 0.

    NOTE(review): ``get_prop`` lives outside this block; presumably it
    returns a list of hashable properties per URL -- verify.
    """
    global OH, lr, quota, i, km, part
    quota = QUOTA

    # Per-URL property lists for the labelled set and their concatenation.
    obj_q = [get_prop(url) for url in QLINK_URLS]
    list_q = []
    for props in obj_q:
        list_q += props

    # Build (property, count) rows and sort them by descending count.
    params, count = np.unique(list_q, return_counts=True)
    params = np.array(params, dtype='object')
    count = np.array(count, dtype='object')
    feat_q_num = np.hstack((params.reshape(-1, 1), count.reshape(-1, 1)))
    ascending = np.argsort(feat_q_num[:, 1])
    feat_q_num = feat_q_num[ascending][::-1]

    obj_g = [get_prop(url) for url in UNKNOWN_URLS]

    # Vocabulary: the first 100 properties of the ranking, one per row.
    imp_features = (feat_q_num[:100][:, 0]).reshape(-1, 1)

    OH = OnehotTransactions()
    OH.fit(imp_features)

    q_matrix = OH.transform(obj_q)
    g_matrix = OH.transform(obj_g)

    X = np.vstack((q_matrix, g_matrix))
    # Size the labels from the inputs instead of assuming 500 URLs per class,
    # so y always has exactly one entry per row of X.
    positives = [1] * len(obj_q)
    negatives = [0] * len(obj_g)
    y = np.array(positives + negatives)

    lr = LinearRegression()
    lr.fit(X, y)
    i = 0

    # Split the rows into n clusters and store each cluster's share of the
    # label-1 rows. A local loop variable keeps the global ``i`` at 0; looping
    # over ``i`` itself would overwrite it with n - 1.
    n = 15
    km = KMeans(n_clusters=n)
    assignments = km.fit_predict(X)
    part = np.zeros(n)
    for k in range(n):
        part[k] = y[assignments == k].sum()
    part /= part.sum()
def test_transform():
    """Check that fitting and transforming ``dataset`` reproduces ``expect``.

    ``dataset`` and ``expect`` are fixtures defined outside this block.
    """
    encoder = OnehotTransactions()
    encoder.fit(dataset)
    encoded = encoder.transform(dataset)
    np.testing.assert_array_equal(expect, encoded)
Пример #4
0
def test_transform():
    """Fit a fresh encoder on ``dataset`` and compare its output to ``expect``.

    Both ``dataset`` and ``expect`` come from outside this block.
    """
    onehot = OnehotTransactions()
    onehot.fit(dataset)
    actual = onehot.transform(dataset)
    # Expected array goes first, matching the argument order used originally.
    np.testing.assert_array_equal(expect, actual)