Пример #1
0
def _label_encoder2(table, input_cols, suffix='_index'):
    """Label-encode several columns, writing each result to a suffixed column.

    One ``LabelEncoder`` is fitted per entry of ``input_cols``; the encoded
    values are stored on a copy of ``table`` under ``<column><suffix>``.

    Returns a dict with:
      * ``out_table`` - copy of ``table`` with the encoded columns added.
      * ``model`` - model dict holding the fitted encoders (same order as
        ``input_cols``), the input column list and a markdown report under
        ``_repr_brtc_``.
    """
    out_table = table.copy()
    encoders = []
    encoded_names = []
    class_counts = []
    for column in input_cols:
        encoder = LabelEncoder().fit(table[column])
        target_name = column + suffix
        encoders.append(encoder)
        encoded_names.append(target_name)
        class_counts.append(len(encoder.classes_))
        # Always encode from the untouched input table, not from the copy.
        out_table[target_name] = encoder.transform(table[column])

    out_model = _model_dict('label_encoders')
    out_model['label_encoders'] = encoders
    out_model['input_cols'] = input_cols

    # Per-column overview shown in the report.
    summary_table = pd.DataFrame()
    summary_table['Input columns'] = input_cols
    summary_table['No. distinct classes'] = class_counts
    summary_table['New column names'] = encoded_names

    params = {"Input columns": input_cols, "Suffix": suffix}
    report_template = """
    | ## Label Encoder Model
    | ### Parameters
    | {params}
    | ### Summary
    | {summary_table}
    """
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin(
            report_template.format(
                params=dict2MD(params),
                summary_table=pandasDF2MD(summary_table))))
    out_model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'model': out_model}
Пример #2
0
def _ada_boost_classification_train(table,
                                    feature_cols,
                                    label_col,
                                    max_depth=1,
                                    n_estimators=50,
                                    learning_rate=1.0,
                                    algorithm='SAMME.R',
                                    random_state=None):
    """Train an AdaBoost classifier on decision-tree base learners.

    Args:
        table: input data; ``table[feature_cols]`` are the features and
            ``table[label_col]`` is the target.
        feature_cols: names of the feature columns.
        label_col: name of the label column.
        max_depth: ``max_depth`` of the ``DecisionTreeClassifier`` base
            estimator.
        n_estimators, learning_rate, algorithm, random_state: forwarded to
            ``AdaBoostClassifier``.

    Returns:
        ``{'model': model}`` where ``model`` holds the fitted classifier, the
        estimator parameters, the training parameters, a feature importance
        table and a markdown report under ``_repr_brtc_``.
    """
    x_train = table[feature_cols]
    y_train = table[label_col]

    base_estimator = DecisionTreeClassifier(max_depth=max_depth)

    # Only the base estimator is passed positionally: its keyword name differs
    # between scikit-learn releases, while the remaining arguments are
    # keyword-only in recent releases.
    classifier = AdaBoostClassifier(base_estimator,
                                    n_estimators=n_estimators,
                                    learning_rate=learning_rate,
                                    algorithm=algorithm,
                                    random_state=random_state)
    classifier.fit(x_train, y_train)

    feature_importance = classifier.feature_importances_
    train_params = {
        'feature_cols': feature_cols,
        'label_col': label_col,
        'feature_importance': feature_importance,
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'algorithm': algorithm,
        'random_state': random_state
    }

    model = _model_dict('ada_boost_classification_model')
    estimator_params = classifier.get_params()
    model['parameters'] = estimator_params
    model['classifier'] = classifier
    model['params'] = train_params

    fig_feature_importance = _plot_feature_importance(feature_cols, classifier)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## AdaBoost Classification Train Result
    |
    | ### Feature Importance
    | {fig_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_feature_importance=fig_feature_importance,
               list_parameters=dict2MD(estimator_params))))

    model['_repr_brtc_'] = rb.get()
    model['feature_importance_table'] = pd.DataFrame(
        [[name, importance]
         for name, importance in zip(feature_cols, feature_importance)],
        columns=['feature_name', 'importance'])
    return {'model': model}
Пример #3
0
def _svm_classification_train(table, feature_cols, label_col, c=1.0,
                              kernel='rbf', degree=3, gamma='auto',
                              coef0=0.0, shrinking=True, probability=True,
                              tol=1e-3, max_iter=-1, random_state=None):
    """Fit an ``svm.SVC`` classifier on the given feature and label columns.

    ``c`` is validated to be greater than 0 and the label column is rejected
    when ``type_of_target`` classifies it as ``'continuous'``.

    Returns ``{'model': model}`` where ``model`` carries the fitted estimator
    (``svc_model``), the feature column names (``features``) and a markdown
    report (``_repr_brtc_``).
    """
    validate(greater_than(c, 0.0, 'c'))

    data = table.copy()
    x_train = data[feature_cols]
    y_train = data[label_col]

    target_kind = sklearn_utils.multiclass.type_of_target(y_train)
    if target_kind == 'continuous':
        raise_runtime_error('''Label Column should not be continuous.''')

    estimator = svm.SVC(
        C=c,
        kernel=kernel,
        degree=degree,
        gamma=gamma,
        coef0=coef0,
        shrinking=shrinking,
        probability=probability,
        tol=tol,
        max_iter=max_iter,
        random_state=random_state,
    )
    fitted_estimator = estimator.fit(x_train, y_train)

    # The report lists the estimator parameters plus the columns used.
    report_params = estimator.get_params()
    report_params['feature_cols'] = feature_cols
    report_params['label_col'] = label_col

    report_md = strip_margin("""
    | ## SVM Classification Result
    | ### Parameters
    | {table_parameter} 
    """.format(table_parameter=dict2MD(report_params)))
    rb = BrtcReprBuilder()
    rb.addMD(report_md)

    result_model = _model_dict('svc_model')
    result_model['svc_model'] = fitted_estimator
    result_model['features'] = feature_cols
    result_model['_repr_brtc_'] = rb.get()

    return {'model': result_model}
Пример #4
0
def _bow(table,
         input_col,
         add_words=None,
         no_below=1,
         no_above=0.8,
         keep_n=10000):
    """Build a bag-of-words dictionary from a column of tokenized documents.

    Args:
        table: input data; ``table[input_col]`` is converted to a list and
            passed to ``Dictionary`` as the documents.
        input_col: name of the column holding the documents.
        add_words: optional extra document (list of tokens) added to the
            dictionary before filtering.
        no_below, no_above, keep_n: forwarded to
            ``Dictionary.filter_extremes``.

    Returns:
        ``{'model': model, 'out_table': out_table}`` where ``out_table`` has
        the columns ``token`` and ``document_frequency`` (indexed by token
        when non-empty) and ``model`` stores the table, the dictionary,
        ``add_words`` and a markdown report.
    """
    word_list = table[input_col].tolist()
    dictionary = Dictionary(word_list)
    if add_words is not None:
        dictionary.add_documents([add_words])
    dictionary.filter_extremes(no_below=no_below,
                               no_above=no_above,
                               keep_n=keep_n,
                               keep_tokens=None)

    params = {
        'Input Column': input_col,
        'Minimum Number of Occurrence': no_below,
        'Maximum Fraction of Occurrence': no_above,
        'Keep N most Frequent': keep_n
    }

    empty_description = ''
    if not dictionary.dfs:
        out_table = pd.DataFrame([], columns=['token', 'document_frequency'])
        empty_description = 'Out table is empty since parameter \"Minimum Number of Occurrence\" is greater than the maximum of document frequency.'
    else:
        # Look each token's document frequency up through its id rather than
        # relying on token2id and dfs sharing the same ordering.
        tokens = list(dictionary.token2id)
        frequencies = [
            dictionary.dfs[dictionary.token2id[token]] for token in tokens
        ]
        # The token index is kept because the original table was indexed
        # by token as well.
        out_table = pd.DataFrame(
            {
                'token': tokens,
                'document_frequency': frequencies
            },
            index=tokens,
            columns=['token', 'document_frequency'])

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
        |# Bag of Words Result
        |### Parameters
        |
        | {display_params}
        |
        | {description}
        |
        """.format(display_params=dict2MD(params),
                   description=empty_description)))

    model = _model_dict('bow')
    model['dict_table'] = out_table
    model['dictionary'] = dictionary
    model['add_words'] = add_words
    model['_repr_brtc_'] = rb.get()

    return {'model': model, 'out_table': out_table}
Пример #5
0
def _ada_boost_regression_train(table,
                                feature_cols,
                                label_col,
                                max_depth=3,
                                n_estimators=50,
                                learning_rate=1.0,
                                loss='linear',
                                random_state=None):
    """Train an AdaBoost regressor on decision-tree base learners.

    Args:
        table: input data; features come from
            ``check_col_type(table, feature_cols)`` and the target is
            ``table[label_col]``.
        feature_cols: names of the feature columns.
        label_col: name of the label column.
        max_depth: ``max_depth`` of the ``DecisionTreeRegressor`` base
            estimator.
        n_estimators, learning_rate, loss, random_state: forwarded to
            ``AdaBoostRegressor``.

    Returns:
        ``{'model': model}`` where ``model`` holds the fitted regressor, the
        estimator parameters, the training parameters, a feature importance
        table and a markdown report under ``_repr_brtc_``.
    """
    feature_names, x_train = check_col_type(table, feature_cols)
    y_train = table[label_col]

    base_estimator = DecisionTreeRegressor(max_depth=max_depth)

    # Only the base estimator is passed positionally: its keyword name differs
    # between scikit-learn releases, while the remaining arguments are
    # keyword-only in recent releases.
    regressor = AdaBoostRegressor(base_estimator,
                                  n_estimators=n_estimators,
                                  learning_rate=learning_rate,
                                  loss=loss,
                                  random_state=random_state)
    regressor.fit(x_train, y_train)

    feature_importance = regressor.feature_importances_
    train_params = {
        'feature_cols': feature_cols,
        'label_col': label_col,
        'feature_importance': feature_importance,
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'loss': loss,
        'random_state': random_state
    }

    model = _model_dict('ada_boost_regression_model')
    estimator_params = regressor.get_params()
    model['parameters'] = estimator_params
    model['regressor'] = regressor
    model['params'] = train_params

    fig_feature_importance = _plot_feature_importance(feature_names, regressor)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## AdaBoost Regression Train Result
    |
    | ### Feature Importance
    | {fig_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_feature_importance=fig_feature_importance,
               list_parameters=dict2MD(estimator_params))))

    model['_repr_brtc_'] = rb.get()
    model['feature_importance_table'] = pd.DataFrame(
        [[name, importance]
         for name, importance in zip(feature_names, feature_importance)],
        columns=['feature_name', 'importance'])
    return {'model': model}
Пример #6
0
def _hierarchical_clustering_post(table,
                                  model,
                                  num_clusters,
                                  cluster_col='prediction'):
    """Cut a fitted hierarchical clustering into at most ``num_clusters`` groups.

    Args:
        table: original input data; used (copied) when the model was trained
            in ``'original'`` mode.
        model: trained model dict; the keys read here are ``model``
            (linkage array), ``input_mode``, ``linkage_matrix``,
            ``parameters`` and, in ``'matrix'`` mode, ``dist_matrix``.
        num_clusters: passed to ``fcluster`` as ``t`` with
            ``criterion='maxclust'``.
        cluster_col: name of the column that receives the cluster ids.

    Returns:
        ``{'out_table': prediction_table, 'model': out_model}`` where
        ``out_model`` contains the per-cluster information table and a
        markdown report.

    Raises:
        ValueError: if ``model['input_mode']`` is neither ``'original'`` nor
            ``'matrix'``.
    """
    Z = model['model']
    mode = model['input_mode']
    linkage_table = model['linkage_matrix']

    predict = fcluster(Z, t=num_clusters, criterion='maxclust')
    if mode == 'original':
        prediction_table = table.copy()
    elif mode == 'matrix':
        # Work on a copy so the distance matrix stored in the trained model
        # is not modified by adding the prediction column.
        prediction_table = model['dist_matrix'].copy()
    else:
        raise ValueError(
            "Unsupported input_mode {!r}; expected 'original' or 'matrix'.".
            format(mode))
    prediction_table[cluster_col] = predict

    # L: linkage node id leading each flat cluster, M: matching cluster id.
    L, M = leaders(Z, predict)
    which_cluster = []
    for leader in L:
        # NOTE(review): a leader that appears in neither child column of Z is
        # skipped, which makes the length differ from M and fails below -
        # confirm whether that case can occur (e.g. a single cluster).
        if leader in Z[:, 0]:
            select_indices = np.where(Z[:, 0] == leader)[0][0]
            which_cluster.append(
                linkage_table['joined_column1'][select_indices])
        elif leader in Z[:, 1]:
            select_indices = np.where(Z[:, 1] == leader)[0][0]
            which_cluster.append(
                linkage_table['joined_column2'][select_indices])

    clusters_info_table = pd.DataFrame([])
    clusters_info_table[cluster_col] = M
    clusters_info_table['name_of_clusters'] = which_cluster
    clusters_info_table = clusters_info_table.sort_values(cluster_col)
    # bincount is ordered by cluster id, matching the sorted table; ids that
    # never occur (count 0) are dropped.
    cluster_count = np.bincount(prediction_table[cluster_col])
    cluster_count = cluster_count[cluster_count != 0]
    clusters_info_table['num_of_entities'] = list(cluster_count)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""### Hierarchical Clustering Post Process Result"""))
    rb.addMD(
        strip_margin("""
    |### Parameters
    |
    |{display_params}
    |
    |## Clusters Information
    |
    |{clusters_info_table}
    |
    """.format(display_params=dict2MD(model['parameters']),
               clusters_info_table=pandasDF2MD(clusters_info_table))))

    out_model = _model_dict('hierarchical_clustering_post')
    out_model['clusters_info'] = clusters_info_table
    out_model['_repr_brtc_'] = rb.get()

    return {'out_table': prediction_table, 'model': out_model}
Пример #7
0
def _outlier_detection_lof(table,
                           input_cols,
                           n_neighbors=20,
                           result_type='add_prediction',
                           new_column_name='is_outlier'):
    """Flag rows as inliers or outliers with a Local Outlier Factor model.

    A ``LocalOutlierFactor`` (novelty mode, contamination 0.1) is fitted on
    ``table[input_cols]`` and then used to predict the same rows; each row is
    labelled ``'in'`` or ``'out'`` in ``new_column_name``.

    ``result_type`` controls the returned table:
      * ``'add_prediction'`` - all rows, with the label column.
      * ``'remove_outliers'`` - only ``'in'`` rows, label column dropped.
      * ``'both'`` - only ``'in'`` rows, label column kept.
    Any other value is reported through ``raise_runtime_error``.

    Returns ``{'out_table': out_table, 'model': model}``.
    """
    out_table = table.copy()
    features = out_table[input_cols]

    lof_model = LocalOutlierFactor(n_neighbors,
                                   algorithm='auto',
                                   leaf_size=30,
                                   metric='minkowski',
                                   p=2,
                                   novelty=True,
                                   contamination=0.1)
    lof_model.fit(features)

    # predict() yields 1 for inliers; everything else is treated as outlier.
    predicted_flags = lof_model.predict(features)
    out_table[new_column_name] = [
        'in' if flag == 1 else 'out' for flag in predicted_flags
    ]

    if result_type in ('remove_outliers', 'both'):
        out_table = out_table[out_table[new_column_name] == 'in']
        if result_type == 'remove_outliers':
            out_table = out_table.drop(new_column_name, axis=1)
    elif result_type != 'add_prediction':
        raise_runtime_error("Please check 'result_type'.")

    params = {
        'Input Columns': input_cols,
        'Result Type': result_type,
        'Number of Neighbors': n_neighbors,
    }

    report_md = strip_margin("""
    | ## Outlier Detection (Local Outlier Factor) Result
    | ### Parameters
    |
    | {display_params}
    |
    """.format(display_params=dict2MD(params)))
    rb = BrtcReprBuilder()
    rb.addMD(report_md)

    model = _model_dict('outlier_detection_lof')
    model['params'] = params
    model['lof_model'] = lof_model
    model['input_cols'] = input_cols
    model['result_type'] = result_type
    model['num_neighbors'] = n_neighbors
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'model': model}
Пример #8
0
def _discretize_quantile(table,
                         input_col,
                         num_of_buckets=2,
                         out_col_name='bucket_number'):
    """Discretize a column into quantile-based buckets with ``pd.qcut``.

    Args:
        table: input data.
        input_col: name of the column to discretize.
        num_of_buckets: number of quantiles requested from ``pd.qcut``;
            duplicate bin edges are dropped, so fewer buckets may result.
        out_col_name: name of the column receiving the bucket number.

    Returns:
        ``{'out_table': out_table, 'model': model}`` where ``out_table`` is a
        copy of ``table`` with the bucket column added and ``model`` holds a
        summary table (bucket number, interval, row count), the parameters
        and a markdown report.
    """
    out_table = table.copy()
    out_table[out_col_name], buckets = pd.qcut(table[input_col],
                                               num_of_buckets,
                                               labels=False,
                                               retbins=True,
                                               precision=10,
                                               duplicates='drop')

    params = {
        'input_col': input_col,
        'num_of_buckets': num_of_buckets,
        'out_col_name': out_col_name
    }

    cnt = Counter(out_table[out_col_name].values)

    # `buckets` holds the bin edges returned by qcut; consecutive edges form
    # one interval. Only the first interval is closed on the left.
    index_list = []
    bucket_list = []
    cnt_list = []
    for i, (lower, upper) in enumerate(zip(buckets[:-1], buckets[1:])):
        left = '[' if i == 0 else '('
        index_list.append(i)
        cnt_list.append(cnt[i])
        bucket_list.append("{left}{lower}, {upper}]".format(left=left,
                                                            lower=lower,
                                                            upper=upper))

    # Summary table; DataFrame.from_items no longer exists in pandas >= 1.0,
    # so the columns are passed as a dict with an explicit column order.
    result = pd.DataFrame(
        {
            'bucket number': index_list,
            'buckets': bucket_list,
            'count': cnt_list
        },
        columns=['bucket number', 'buckets', 'count'])

    # Build the report and the model.
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Quantile-based Discretization Result
    | ### Result
    | {result}
    |
    | ### Parameters
    | {params} 
    """.format(result=pandasDF2MD(result), params=dict2MD(params))))

    model = _model_dict('discretize_quantile')
    model['result'] = result
    model['params'] = params
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'model': model}
Пример #9
0
def _scale(table, input_cols, scaler, suffix=None):
    """Scale the given columns and add the results as new suffixed columns.

    Args:
        table: input data.
        input_cols: names of the columns to scale.
        scaler: one of ``'RobustScaler'``, ``'StandardScaler'`` or
            ``'MaxAbsScaler'``; any other value selects ``MinMaxScaler``.
        suffix: suffix for the new column names; when ``None`` a default
            matching the scaler is used (``'_robust'``, ``'_standard'``,
            ``'_max_abs'`` or ``'_min_max'``).

    Returns:
        ``{'out_table': out_table, 'model': out_model}`` where ``out_table``
        is a copy of ``table`` with the scaled columns added and
        ``out_model`` stores the fitted scaler, its name, the input columns,
        the suffix and a markdown report.
    """
    if scaler == 'RobustScaler':
        if suffix is None:
            suffix = '_robust'
        scale = RobustScaler()
    elif scaler == 'StandardScaler':
        if suffix is None:
            suffix = '_standard'
        scale = StandardScaler()
    elif scaler == 'MaxAbsScaler':
        if suffix is None:
            suffix = '_max_abs'
        scale = MaxAbsScaler()
    else:  # minmax
        if suffix is None:
            suffix = '_min_max'
        scale = MinMaxScaler()

    scaled_cols = [col + suffix for col in input_cols]

    out_table = table.copy()
    scaled_table = scale.fit_transform(out_table[input_cols])
    # Give the scaled values the table's own index: assigning a DataFrame
    # aligns on index labels, so a fresh 0..n-1 index would misplace values
    # (or yield NaN) whenever `table` does not have the default index.
    out_table[scaled_cols] = pd.DataFrame(data=scaled_table,
                                          index=out_table.index)

    out_model = _model_dict('scaler')
    out_model['input_cols'] = input_cols
    out_model['used_scaler'] = scaler
    out_model['scaler'] = scale
    out_model['suffix'] = suffix
    rb = BrtcReprBuilder()
    params = {
        "Input columns": input_cols,
        "Normalization method": scaler,
        "Suffix": suffix
    }
    summary_table = pd.DataFrame()
    summary_table['Input columns'] = input_cols
    summary_table['Normalization method'] = [scaler] * len(input_cols)
    summary_table['New column names'] = scaled_cols
    # The heading previously read "Label Encoder Model", which does not
    # describe this report.
    rb.addMD(
        strip_margin("""
    | ## Scaler Model
    | ### Parameters
    | {params}
    |
    | ### Summary table
    | {summary_table}
    """.format(params=dict2MD(params),
               summary_table=pandasDF2MD(summary_table))))
    out_model['_repr_brtc_'] = rb.get()
    return {'out_table': out_table, 'model': out_model}
Пример #10
0
def _one_sample_ttest_repr(statistics, result_dict, params):
    """Build the markdown report of a one-sample t-test.

    Args:
        statistics: value shown as "Statistics" in the report header.
        result_dict: per-column results; ``result_dict[col]['t_value']`` and,
            for every alternative, ``result_dict[col][alt]`` with the keys
            ``alternative_hypothesis``, ``p_value`` and
            ``confidence_interval`` are read.
        params: parameter dict; the keys ``input_cols``, ``alternatives``,
            ``hypothesized_mean`` and ``conf_level`` are read and the whole
            dict is rendered in the "Parameters" section.

    Returns:
        The populated ``BrtcReprBuilder`` instance.
    """
    input_cols = params['input_cols']
    alternatives = params['alternatives']
    hypothesized_mean = params['hypothesized_mean']
    conf_level = params['conf_level']

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## One Sample T Test Result
    | - Statistics = {s}
    | - Hypothesized mean = {h} 
    | - Confidence level = {cl}
    """.format(s=statistics, h=hypothesized_mean, cl=conf_level)))

    # Same header for every column, so compute it once.
    ci_header = '%g%% confidence Interval' % (conf_level * 100)
    column_order = ['alternative hypothesis', 'p-value', ci_header]

    for input_col in input_cols:
        H1_list = []
        p_list = []
        CI_list = []
        for alter in alternatives:
            test_info = result_dict[input_col][alter]
            H1_list.append(test_info['alternative_hypothesis'])
            p_list.append(test_info['p_value'])
            CI_list.append(test_info['confidence_interval'])

        # DataFrame.from_items no longer exists in pandas >= 1.0; a dict plus
        # an explicit column order builds the same table.
        result_table = pd.DataFrame(
            {
                'alternative hypothesis': H1_list,
                'p-value': p_list,
                ci_header: CI_list
            },
            columns=column_order)

        rb.addMD(
            strip_margin("""
        | ### Data = {input_col}
        | - t-value = {t_value} 
        |
        | {result_table}
        """.format(input_col=input_col,
                   t_value=result_dict[input_col]['t_value'],
                   result_table=pandasDF2MD(result_table))))

    rb.addMD(
        strip_margin("""
        | ### Parameters
        | {params}
        """.format(params=dict2MD(params))))

    return rb
Пример #11
0
def _label_encoder(table, input_col, new_column_name='encoded_column'):
    """Label-encode a single column into ``new_column_name``.

    A ``LabelEncoder`` is fitted on ``table[input_col]`` and the encoded
    values are written to a copy of ``table``.

    Returns a dict with ``out_table`` (the copy including the encoded column)
    and ``model`` (fitted encoder, input column name, the encoder classes and
    a markdown report under ``_repr_brtc_``).
    """
    source_values = table[input_col]
    encoder = LabelEncoder().fit(source_values)

    out_table = table.copy()
    out_table[new_column_name] = encoder.transform(source_values)

    params = {
        'Input Column': input_col,
        "No. distinct classes": len(encoder.classes_),
        "New column name": new_column_name
    }
    report_md = strip_margin("""
    | ## Label Encoder Model
    | ### Parameters
    | {params}
    |
    """.format(params=dict2MD(params)))
    rb = BrtcReprBuilder()
    rb.addMD(report_md)

    out_model = _model_dict('label_encoder')
    out_model['label_encoder'] = encoder
    out_model['input_col'] = input_col
    out_model['classes'] = encoder.classes_
    out_model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'model': out_model}
Пример #12
0
def _lda(table,
         input_col,
         num_voca=1000,
         num_topic=3,
         num_topic_word=3,
         max_iter=20,
         learning_method='online',
         learning_offset=10.,
         random_state=None):
    """Fit a Latent Dirichlet Allocation topic model on a text column.

    Args:
        table: input data; ``table[input_col]`` is the corpus handed to
            ``CountVectorizer``.
        input_col: name of the document column.
        num_voca: ``max_features`` of the vectorizer.
        num_topic: number of LDA components.
        num_topic_word: number of top-weighted terms listed per topic.
        max_iter, random_state: forwarded to ``LatentDirichletAllocation``.
        learning_method: ``'online'`` or ``'batch'``; any other value is
            reported through ``raise_runtime_error``.
        learning_offset: forwarded only when ``learning_method`` is
            ``'online'``.

    Returns:
        ``{'model': model}`` where ``model`` holds the parameters, the topic
        table, the per-document top topic table and a markdown report.
    """
    corpus = table[input_col]
    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=2,
                                    max_features=num_voca,
                                    stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    # get_feature_names() is gone from recent scikit-learn releases; prefer
    # its replacement and fall back for older installations.
    if hasattr(tf_vectorizer, 'get_feature_names_out'):
        tf_feature_names = tf_vectorizer.get_feature_names_out()
    else:
        tf_feature_names = tf_vectorizer.get_feature_names()

    if learning_method not in ('online', 'batch'):
        raise_runtime_error("Please check 'learning_method'.")

    lda_kwargs = {
        'n_components': num_topic,
        'max_iter': max_iter,
        'learning_method': learning_method,
        'random_state': random_state
    }
    if learning_method == 'online':
        lda_kwargs['learning_offset'] = learning_offset
    lda_model = LatentDirichletAllocation(**lda_kwargs).fit(term_count)

    topic_model = pd.DataFrame([])
    topic_idx_list = []
    voca_weights_list = []
    for topic_idx, weights in enumerate(lda_model.components_):
        topic_idx_list.append("Topic {}".format(topic_idx))
        # Rank the terms of this topic by absolute weight, largest first.
        pairs = [(abs(value), tf_feature_names[term_idx])
                 for term_idx, value in enumerate(weights)]
        pairs.sort(key=lambda x: x[0], reverse=True)
        voca_weights_list.append([
            "{}: {}".format(term, weight)
            for weight, term in pairs[:num_topic_word]
        ])
    topic_model['topic idx'] = topic_idx_list
    topic_model['topic vocabularies'] = voca_weights_list

    doc_topic = lda_model.transform(term_count)

    # For every document, report the topic with the highest score.
    doc_classification = pd.DataFrame()
    doc_classification['documents'] = list(corpus)
    doc_classification['top topic'] = [
        "Topic {}".format(topic_scores.argmax()) for topic_scores in doc_topic
    ]

    params = {
        'Input Column': input_col,
        'Number of Vocabularies': num_voca,
        'Number of Topics': num_topic,
        'Number of Terminologies': num_topic_word,
        'Iterations': max_iter,
        'Learning Method': learning_method,
    }

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Latent Dirichlet Allocation Result"""))
    rb.addMD(
        strip_margin("""
    |
    |### Parameters
    |
    | {display_params}
    |
    |### Topic Model
    |
    |{topic_model}
    |
    |### Documents Classification
    |
    |{doc_classification}
    |
    """.format(display_params=dict2MD(params),
               topic_model=pandasDF2MD(topic_model, num_rows=num_topic + 1),
               doc_classification=pandasDF2MD(doc_classification,
                                              num_rows=len(corpus) + 1))))

    model = _model_dict('lda')
    model['parameter'] = params
    model['topic_model'] = topic_model
    model['documents_classification'] = doc_classification
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Пример #13
0
def _collaborative_filtering_train(table, user_col , item_col, rating_col, N=10, filter=True, k=5, based='item', mode='train', method='cosine', weighted=True, centered=True, targets=None, normalize=True, workers=1, filter_minus=False, maintain_already_scored=True):
    """Train a neighborhood-based collaborative filtering model.

    Users and items are label-encoded, a sparse rating matrix is built
    (items x users when ``based == 'item'``, users x items otherwise) and a
    similarity matrix is computed with the chosen ``method`` (``'cosine'``,
    ``'pearson'``, ``'jaccard'`` or ``'adjusted'``).

    Args:
        table: input ratings data.
        user_col, item_col, rating_col: names of the user, item and rating
            columns.
        N, filter, k, weighted, filter_minus, maintain_already_scored:
            forwarded to the recommendation helpers in ``'Topn'`` mode; ``k``
            and ``weighted`` are also stored in the model.
        based: ``'item'`` or ``'user'``; ``normalize`` is forced to ``False``
            when ``based == 'item'``.
        mode: ``'Topn'`` returns recommendations directly; any other value
            returns the trained model.
        method: similarity method; ``'adjusted'`` subtracts a per-row average
            and then falls back to cosine similarity.
        centered: subtract the mean rating of each matrix row before
            computing similarities.
        targets: users to recommend for in ``'Topn'`` mode (all known users
            when ``None``).
        normalize: compute per-user averages (``user_avg``).
        workers: 1 for in-process recommendation, otherwise the number of
            workers used through ``apply_by_multiprocessing_list_to_list``.

    Returns:
        ``{'out_table': Topn_result}`` in ``'Topn'`` mode, else
        ``{'model': model}``.
    """
    if based == 'item':
        normalize = False
    table_user_col = table[user_col]
    table_item_col = table[item_col]
    rating_col = table[rating_col]
    user_encoder = preprocessing.LabelEncoder()
    item_encoder = preprocessing.LabelEncoder()
    user_encoder.fit(table_user_col)
    item_encoder.fit(table_item_col)
    user_correspond = user_encoder.transform(table_user_col)
    item_correspond = item_encoder.transform(table_item_col)
    # check_cen stores rating + 1 - presumably so that ratings equal to 0
    # are still kept as explicit entries of the sparse matrix (TODO confirm).
    if based == 'item':
        item_users = csr_matrix((rating_col, (item_correspond, user_correspond)))
        check_cen = csr_matrix((rating_col + 1, (item_correspond, user_correspond)))
    else:
        item_users = csr_matrix((rating_col, (user_correspond, item_correspond)))
        check_cen = csr_matrix((rating_col + 1, (user_correspond, item_correspond)))
    centered_ratings = item_users.copy()
    
    num_item, num_user = item_users.shape
    if centered:
        update_item = []
        update_user = []
        update_rating = []
        for item in range(num_item):
            index = 0
            sum = 0
            for user, rating in _nonzeros(check_cen, item):
                index += 1
                sum += rating
            # "- 1" undoes the +1 shift applied when building check_cen.
            avg = sum / index - 1
            for user, rating in _nonzeros(check_cen, item):
                update_item.append(item)
                update_user.append(user)
                update_rating.append(avg)
                
        centered_ratings -= csr_matrix((update_rating, (update_item, update_user)))
    if (method == 'adjusted' or normalize) and based == 'item':
        check_cen = check_cen.transpose().tocsr()
    if based == 'user':
        tmp = num_user
        num_user = num_item
        num_item = tmp
    user_avg = []
    if normalize:
        for user in range(num_user):
            index = 0
            sum = 0
            for user, rating in _nonzeros(check_cen, user):
                index += 1
                sum += rating
            # NOTE(review): unlike the centered branch, the +1 shift of
            # check_cen is not removed here - confirm this is intended.
            avg = sum / index
            user_avg.append(avg)
    if method == 'adjusted':
        update_item = []
        update_user = []
        update_rating = []
        for user in range(num_user):
            sum = 0
            for item, rating in _nonzeros(check_cen, user):
                sum += rating
            avg = sum / num_item
            for item in range(num_item):
                update_item.append(item)
                update_user.append(user)
                update_rating.append(avg)
        if based == 'item':
            centered_ratings -= csr_matrix((update_rating, (update_item, update_user)))
        else:
            centered_ratings -= csr_matrix((update_rating, (update_user, update_item)))
        method = 'cosine'     
    # Undo the earlier swap so num_user / num_item match item_users.shape again.
    if based == 'user':
        tmp = num_user
        num_user = num_item
        num_item = tmp
        
    if method == 'cosine':
        similar_coeff = cosine_similarity(centered_ratings)
    elif method == 'pearson':
        result = []
        for i in centered_ratings.toarray():
            result.append(i - np.average(i))
        similar_coeff = cosine_similarity(result)
    elif method == 'jaccard':
        similar_coeff = 1 - pairwise_distances(centered_ratings.toarray(), metric="hamming")
    if based == 'user':
        item_users = item_users.transpose().tocsr()

    if mode == 'Topn':
        if targets is None:
            targets = user_encoder.classes_
        # `np.int` (a plain alias of the builtin `int`) no longer exists in
        # current NumPy; `int` is already listed, so the check is unchanged.
        if table_user_col.dtype in (np.floating, float, int, np.int64):
            targets = [float(i) for i in targets]
        targets_en = user_encoder.transform(targets)      
        Topn_result = []
        if workers == 1:
            for user in targets_en:
                recommendations_corre = _recommend(user, item_users, similar_coeff, N, k, method, weighted, centered, based, normalize, user_avg, filter, filter_minus, maintain_already_scored)
                recommendations = []
                for (item, rating) in recommendations_corre:
                    recommendations += [item_encoder.inverse_transform([item])[0], rating]
                Topn_result += [recommendations]
        else:
            # NOTE(review): `filter` is not forwarded on this path, unlike
            # the single-worker call above - confirm against _recommend_multi.
            Topn_result_tmp = apply_by_multiprocessing_list_to_list(targets_en, _recommend_multi, item_users=item_users, similar_coeff=similar_coeff, N=N, k=k, method=method, weighted=weighted, centered=centered, based=based, normalize=normalize, user_avg=user_avg, item_encoder=item_encoder, workers=workers, filter_minus=filter_minus, maintain_already_scored=maintain_already_scored)
            Topn_result = []
            for i in range(workers):
                Topn_result += Topn_result_tmp[i]
        Topn_result = pd.DataFrame(Topn_result)
        Topn_result = pd.concat([pd.DataFrame(targets), Topn_result], axis=1, ignore_index=True)
        # Columns after the user name alternate between item and rating.
        column_names = ['user_name']
        for i in range(int((Topn_result.shape[1] - 1) / 2)):
            column_names += ['item_top%d' % (i + 1), 'rating_top%d' % (i + 1)]
        Topn_result.columns = column_names
        return {'out_table' : Topn_result}

    parameters = dict()
    parameters['Number of Neighbors'] = k
    parameters['Based'] = based
    if method == 'cosine':
        parameters['Similarity method'] = 'Cosine'
    elif method == 'jaccard':
        parameters['Similarity method'] = 'Jaccard'
    elif method == 'pearson':
        parameters['Similarity method'] = 'Pearson'
    else:
        parameters['Similarity method'] = 'Adjusted Cosine'
    parameters['Use Centered Mean'] = centered
    parameters['Use Weighted Rating'] = weighted
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Collaborative Filtering Result
    |
    | ### Parameters
    | {parameters} 
    |
    """.format(parameters=dict2MD(parameters))))
            
    model = _model_dict('collaborative filtering')
    model['weighted'] = weighted
    model['k'] = k
    model['similar_coeff'] = similar_coeff
    model['item_encoder'] = item_encoder
    model['user_encoder'] = user_encoder
    model['item_users'] = item_users
    model['user_col'] = user_col
    model['item_col'] = item_col
    model['based'] = based
    model['_repr_brtc_'] = rb.get()
    model['normalize'] = normalize
    model['user_avg'] = user_avg
    return{'model' : model}
Пример #14
0
def _outlier_detection_tukey_carling(table,
                                     input_cols,
                                     outlier_method='tukey',
                                     multiplier=None,
                                     number_of_removal=1,
                                     result_type='add_prediction',
                                     new_column_prefix='is_outlier_'):
    """Flag outliers per column with the Tukey or Carling rule.

    Args:
        table: input data.
        input_cols: columns to test; each gets a result column named
            ``<new_column_prefix><column>``.
        outlier_method: ``'tukey'`` (uses ``_tukey`` with Q1, Q3 and IQR) or
            ``'carling'`` (uses ``_carling`` with median and IQR); any other
            value is reported through ``raise_runtime_error``.
        multiplier: rule multiplier; defaults to 1.5 for Tukey and 2.3 for
            Carling when ``None``.
        number_of_removal: a row is kept only while it has fewer than this
            many ``'out'`` flags (used by ``'remove_outliers'`` and
            ``'both'``).
        result_type: ``'add_prediction'`` keeps all rows and the flag
            columns, ``'remove_outliers'`` filters rows and drops the flag
            columns, ``'both'`` filters rows and keeps the flag columns.
        new_column_prefix: prefix of the flag column names.

    Returns:
        ``{'out_table': out_table, 'model': model}`` where ``model`` stores
        the parameters, the computed statistics and a markdown report.
    """
    out_table = table.copy()
    # numeric_only is explicit: recent pandas no longer skips non-numeric
    # columns by default and would fail on tables that contain them.
    median = out_table.median(numeric_only=True)
    q1s = out_table.quantile(0.25, numeric_only=True)
    q3s = out_table.quantile(0.75, numeric_only=True)
    iqrs = q3s - q1s
    output_col_names = []

    if outlier_method == 'tukey':
        if multiplier is None:
            multiplier = 1.5
        for col in input_cols:
            output_col_name = '{prefix}{col}'.format(prefix=new_column_prefix,
                                                     col=col)
            output_col_names.append(output_col_name)
            out_table[output_col_name] = out_table[col].apply(
                lambda value: _tukey(value, q1s[col], q3s[col], iqrs[col],
                                     multiplier))
    elif outlier_method == 'carling':
        if multiplier is None:
            multiplier = 2.3
        for col in input_cols:
            output_col_name = '{prefix}{col}'.format(prefix=new_column_prefix,
                                                     col=col)
            output_col_names.append(output_col_name)
            out_table[output_col_name] = out_table[col].apply(
                lambda value: _carling(value, median[col], iqrs[col],
                                       multiplier))
    else:
        raise_runtime_error("Please check 'outlier_method'.")

    # result_type is one of 'add_prediction', 'remove_outliers', 'both'
    if result_type in ('remove_outliers', 'both'):
        keep_row = out_table[output_col_names].apply(
            lambda row: np.sum(row == 'out') < number_of_removal, axis=1)
        out_table = out_table[keep_row.values]
        if result_type == 'remove_outliers':
            out_table = out_table.drop(output_col_names, axis=1)
    elif result_type != 'add_prediction':
        raise_runtime_error("Please check 'result_type'.")

    params = {
        'Input Columns': input_cols,
        'Outlier Method': outlier_method,
        'Multiplier': multiplier,
        'Number of Outliers in a Row': number_of_removal,
        'Result Type': result_type,
        'New Column Prefix': new_column_prefix
    }

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Outlier Detection (Tukey/Carling) Result
    | ### Parameters
    |
    | {display_params}
    |
    """.format(display_params=dict2MD(params))))

    model = _model_dict('outlier_detection_tukey_carling')
    model['params'] = params
    model['input_cols'] = input_cols
    model['outlier_method'] = outlier_method
    model['multiplier'] = multiplier
    model['number_of_removal'] = number_of_removal
    model['result_type'] = result_type
    model['median'] = median
    model['q1'] = q1s
    model['q3'] = q3s
    model['iqr'] = iqrs
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'model': model}
Пример #15
0
def _naive_bayes_train(table, feature_cols, label_col, alpha=1.0, fit_prior=True, class_prior=None):
    """Train a multinomial naive Bayes classifier and build its report.

    Parameters
    ----------
    table : pandas.DataFrame
        Training data containing ``feature_cols`` and ``label_col``.
    feature_cols : list of str
        Columns used as features.
    label_col : str
        Column holding the class labels; it is label-encoded before fitting.
    alpha : float
        Smoothing parameter forwarded to ``MultinomialNB``.
    fit_prior : bool
        Forwarded to ``MultinomialNB``.
    class_prior : list of str, optional
        Entries of the form ``"<label>:<probability>"``.  Each probability is
        stored at the encoded position of its label.

    Returns
    -------
    dict
        ``{'model': model}`` where ``model`` holds the feature columns, the
        label column, the fitted label encoder, the fitted classifier and the
        rendered report (``_repr_brtc_``).
    """
    features = table[feature_cols]
    label = table[label_col]
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(label)
    label_correspond = label_encoder.transform(label)

    if class_prior is not None:
        tmp_class_prior = [0] * len(class_prior)
        for elems in class_prior:
            # Split on the LAST colon so labels that contain ':' still parse.
            tmp = elems.rsplit(":", 1)
            tmp_class_prior[label_encoder.transform([tmp[0]])[0]] = float(tmp[1])
        class_prior = tmp_class_prior

    # Keyword arguments: newer scikit-learn releases reject positional
    # constructor arguments for estimators.
    nb_model = MultinomialNB(alpha=alpha, fit_prior=fit_prior, class_prior=class_prior)
    nb_model.fit(features, label_correspond)
    class_log_prior = nb_model.class_log_prior_
    feature_log_prob_ = nb_model.feature_log_prob_

    # One row per class: [label, log prior, per-feature log probabilities].
    tmp_result = np.hstack((list(map(list, zip(*[label_encoder.classes_] + [class_log_prior]))), (feature_log_prob_)))
    column_names = ['labels', 'pi'] + ['theta_' + feature_col for feature_col in feature_cols]
    result_table = pd.DataFrame.from_records(tmp_result, columns=column_names)
    prediction_correspond = nb_model.predict(features)

    get_param = dict()
    get_param['Lambda'] = alpha
    get_param['Fit Class Prior Probability'] = fit_prior
    get_param['Feature Columns'] = feature_cols
    get_param['Label Column'] = label_col

    # Confusion matrix and accuracy are computed on the training data itself.
    cnf_matrix = confusion_matrix(label_correspond, prediction_correspond)

    plt.figure()
    _plot_confusion_matrix(cnf_matrix, classes=label_encoder.classes_,
                      title='Confusion Matrix')
    fig_confusion_matrix = plt2MD(plt)
    accuracy = nb_model.score(features, label_correspond) * 100

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Naive Bayes Classification Result
    |
    | ### Model:Multinomial
    | {result_table}
    | ### Parameters
    | {table_parameter} 
    | ### Predicted vs Actual
    | {image1}
    | #### Accuracy = {accuracy}%
    |
    """.format(image1=fig_confusion_matrix, accuracy=accuracy, result_table=pandasDF2MD(result_table), table_parameter=dict2MD(get_param))))

    model = _model_dict('naive_bayes_model')
    model['features'] = feature_cols
    model['label_col'] = label_col
    model['label_encoder'] = label_encoder
    model['nb_model'] = nb_model
    model['_repr_brtc_'] = rb.get()

    return {'model' : model}
Пример #16
0
def _hierarchical_clustering(table,
                             input_cols,
                             input_mode='original',
                             key_col=None,
                             link='complete',
                             met='euclidean',
                             num_rows=20,
                             figure_height=6.4,
                             orient='right'):
    """Run agglomerative clustering and build a dendrogram report.

    Parameters
    ----------
    table : pandas.DataFrame
        Input data (copied, never modified in place).
    input_cols : list of str
        Feature columns (``'original'`` mode) or the columns of a square
        distance matrix (``'matrix'`` mode).
    input_mode : {'original', 'matrix'}
        ``'original'`` computes pairwise distances with ``met``; ``'matrix'``
        treats the selected rows/columns as a precomputed distance matrix.
    key_col : str, optional
        Column supplying leaf names.  Without it, leaves are named
        ``pt_<i>`` (original mode) or after the input columns (matrix mode).
    link, met : str
        Linkage method and metric forwarded to ``linkage``.
    num_rows : int
        Number of linkage-matrix rows shown in the report.
    figure_height : float
        Height of the dendrogram figure.
    orient : str
        Dendrogram orientation; axis labels are set for 'top' and 'right'.

    Returns
    -------
    dict
        ``{'model': model}`` with the linkage array, the annotated table, the
        readable linkage matrix, the parameters and the report.
    """
    out_table = table.copy()
    features = out_table[input_cols]

    if input_mode == 'original':
        len_features = len(features)
        if key_col is not None:
            data_names = list(out_table[key_col])
        else:
            data_names = ['pt_' + str(i) for i in range(len_features)]
        out_table['name'] = data_names
        Z = linkage(ssd.pdist(features, metric=met), method=link, metric=met)
    elif input_mode == 'matrix':
        len_features = len(input_cols)
        # Positions of the input columns inside the full table; the same
        # positions select the matrix rows and the key-column entries.
        col_index = [out_table.columns.get_loc(column) for column in input_cols]
        if key_col is not None:
            data_names = [out_table[key_col][loc] for loc in col_index]
        else:
            data_names = [out_table.columns[loc] for loc in col_index]
        # Explicit copy so adding the 'name' column below does not write
        # into a slice of `features`.
        dist_matrix = features.iloc[col_index].copy()

        Z = linkage(ssd.squareform(dist_matrix), method=link, metric=met)
        dist_matrix['name'] = data_names
    else:
        raise_runtime_error("Please check 'input_mode'.")

    n_steps = len(Z)
    # Steps and cluster names are numbered from the LAST merge (step 1 / CL_1)
    # down to the first one, so row i of Z is named CL_<n_steps - i>.
    cluster_names = ['CL_%g' % (i + 1) for i in reversed(range(n_steps))]
    linkage_matrix = pd.DataFrame([])
    linkage_matrix['linkage step'] = [
        '%g' % (x + 1) for x in reversed(range(n_steps))
    ]
    linkage_matrix['name of clusters'] = cluster_names

    def _member_names(member_ids):
        """Translate linkage member ids into leaf or cluster names."""
        names = []
        for member_id in member_ids:
            # Z stores ids as floats; cast before using them as positions.
            member_id = int(member_id)
            if member_id < len_features:
                names.append(data_names[member_id])
            else:
                names.append(cluster_names[member_id - len_features])
        return names

    linkage_matrix['joined column1'] = _member_names(Z[:, 0])
    linkage_matrix['joined column2'] = _member_names(Z[:, 1])

    linkage_matrix['distance'] = list(Z[:, 2])
    linkage_matrix['number of original'] = [
        int(entities) for entities in Z[:, 3]
    ]
    # Show the final merge first.
    linkage_matrix = linkage_matrix.reindex(index=linkage_matrix.index[::-1])

    # calculate full dendrogram
    plt.figure(figsize=(8.4, figure_height))
    dendrogram(Z,
               truncate_mode='none',
               get_leaves=True,
               orientation=orient,
               labels=data_names,
               leaf_rotation=45,
               leaf_font_size=10.,
               show_contracted=False)
    plt.title('Hierarchical Clustering Dendrogram')
    if orient == 'top':
        plt.xlabel('Samples')
        plt.ylabel('Distance')
    elif orient == 'right':
        plt.xlabel('Distance')
        plt.ylabel('Samples')
    plt.tight_layout()
    plt2 = plt2MD(plt)
    plt.clf()

    params = {
        'Input Columns': input_cols,
        'Input Mode': input_mode,
        'Linkage Method': link,
        'Metric': met,
        'Number of Rows in Linkage Matrix': num_rows
    }

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Hierarchical Clustering Result"""))
    rb.addMD(
        strip_margin("""
    |### Dendrogram
    |
    |{image}
    |
    |### Parameters
    |
    |{display_params}
    |
    |### Linkage Matrix
    |
    |{out_table1}
    |
    """.format(image=plt2,
               display_params=dict2MD(params),
               out_table1=pandasDF2MD(linkage_matrix.head(num_rows),
                                      num_rows=num_rows + 1))))

    model = _model_dict('hierarchical_clustering')
    model['model'] = Z
    model['input_mode'] = input_mode
    model['table'] = out_table
    if input_mode == 'matrix':
        model['dist_matrix'] = dist_matrix
    model['parameters'] = params
    model['linkage_matrix'] = linkage_matrix
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Пример #17
0
def _one_hot_encoder2(table,
                      input_cols,
                      prefix='list',
                      prefix_list=None,
                      suffix='index',
                      n_values='auto',
                      categorical_features='all',
                      sparse=True,
                      handle_unknown='error',
                      drop_last=False):
    """One-hot encode columns and return the widened table plus encoders.

    Parameters
    ----------
    table : pandas.DataFrame
        Input data (copied, never modified in place).
    input_cols : list of str
        Columns to encode; each is label-encoded, then one-hot encoded.
    prefix : str
        ``'list'`` takes new-column prefixes from ``prefix_list``; any other
        value uses the input column name as the prefix.
    prefix_list : list of str, optional
        One prefix per input column (required when ``prefix == 'list'``).
    suffix : str
        ``'index'`` numbers the new columns ``0..k-1``; any other value uses
        the class value with non-word characters replaced by ``_``.
    n_values, categorical_features, handle_unknown
        Forwarded to ``OneHotEncoder``.
    sparse : bool
        Accepted for interface compatibility; always overridden to ``False``.
    drop_last : bool
        When true, the last encoded column of every input column is not
        added to the output table.

    Returns
    -------
    dict
        ``{'out_table': DataFrame, 'model': dict}``.
    """
    out_table = table.copy()
    sparse = False  # the encoded matrix is loaded into a DataFrame below
    enc_list = []
    le_list = []
    new_col_names_list_with_true_drop_last = []
    new_col_names_list = []
    if prefix == 'list':
        len_prefix_list = 0 if prefix_list is None else len(prefix_list)
        if len(input_cols) != len_prefix_list:
            # TODO: make the error message code
            raise_runtime_error(
                'The number of Input Columns and the number of Prefixes should be equal.'
            )
    # Raw string: "\W" in a plain literal is an invalid escape sequence.
    non_word = re.compile(r"\W")
    number_distinct_classes = []
    for col_position, col_name in enumerate(input_cols):
        enc = OneHotEncoder(n_values=n_values,
                            categorical_features=categorical_features,
                            sparse=sparse,
                            handle_unknown=handle_unknown)
        le = LabelEncoder()
        distinct_classes = np.unique(out_table[col_name].values)
        number_distinct_classes.append(len(distinct_classes))

        base_name = prefix_list[col_position] if prefix == 'list' else col_name
        if suffix == 'index':
            new_col_names = [
                base_name + '_' + str(i) for i in range(len(distinct_classes))
            ]
        else:
            new_col_names = [
                base_name + '_' + non_word.sub("_", str(value))
                for value in distinct_classes
            ]

        encoded = enc.fit_transform(
            le.fit_transform(out_table[col_name]).reshape(-1, 1))
        # Reuse the table's index: column assignment aligns on index labels,
        # so a default RangeIndex here would misalign (or NaN-fill) the new
        # columns whenever the input table has a non-default index.
        transformed_table = pd.DataFrame(encoded,
                                         columns=new_col_names,
                                         index=out_table.index)
        new_col_names_list.append(new_col_names)
        if drop_last:
            new_col_names = new_col_names[:-1]
            new_col_names_list_with_true_drop_last.append(new_col_names)
        for new_col_name in new_col_names:
            out_table[new_col_name] = transformed_table[new_col_name]

        enc_list.append(enc)
        le_list.append(le)

    rb = BrtcReprBuilder()
    params = {
        'Input Columns': input_cols,
        "Prefix Type": prefix,
        "Suffix Type": suffix,
        "Drop Last": drop_last,
        "Number of values per feature": n_values,
        "Categorical features": categorical_features,
        "Error handling": handle_unknown
    }
    summary_table = pd.DataFrame()
    summary_table['Input Columns'] = input_cols
    summary_table['No. distinct classes'] = number_distinct_classes
    if drop_last:
        summary_table[
            'New encoded columns'] = new_col_names_list_with_true_drop_last
    else:
        summary_table['New encoded columns'] = new_col_names_list
    rb.addMD(
        strip_margin("""
    | ## One Hot Encoder Model
    | ### Parameters
    | {params}
    |
    | ### Summary
    | {summary_table}
    """.format(params=dict2MD(params),
               summary_table=pandasDF2MD(summary_table))))
    out_model = _model_dict('one_hot_encoder')
    out_model['one_hot_encoder_list'] = enc_list
    out_model['label_encoder_list'] = le_list
    out_model['input_cols'] = input_cols
    out_model['prefix'] = prefix
    out_model['prefix_list'] = prefix_list
    out_model['suffix'] = suffix
    out_model['drop_last'] = drop_last
    out_model['_repr_brtc_'] = rb.get()
    if drop_last:
        out_model[
            'new_col_names_list_with_true_drop_last'] = new_col_names_list_with_true_drop_last
    out_model['new_col_names_list'] = new_col_names_list
    return {'out_table': out_table, 'model': out_model}
def _decision_tree_classification_train(
        table,
        feature_cols,
        label_col,
        criterion='gini',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_impurity_split=None,
        class_weight=None,
        presort=False,
        sample_weight=None,
        check_input=True,
        X_idx_sorted=None):
    """Fit a decision-tree classifier and build its report.

    Parameters
    ----------
    table : pandas.DataFrame
        Training data containing ``feature_cols`` and ``label_col``.
    feature_cols : list of str
        Feature columns.
    label_col : str
        Label column; a continuous target is rejected with error '0718'.
    criterion ... presort
        Forwarded unchanged to ``DecisionTreeClassifier``.
    sample_weight, check_input, X_idx_sorted
        Forwarded unchanged to ``DecisionTreeClassifier.fit``.

    Returns
    -------
    dict
        ``{'model': model}`` with the fitted classifier, selected fitted
        attributes, the estimator parameters and the rendered report.
    """
    y_train = table[label_col]

    if (sklearn_utils.multiclass.type_of_target(y_train) == 'continuous'):
        raise_error('0718', 'label_col')

    # Keyword arguments keep the call independent of parameter order.
    classifier = DecisionTreeClassifier(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        class_weight=class_weight,
        presort=presort)
    classifier.fit(table[feature_cols],
                   y_train,
                   sample_weight=sample_weight,
                   check_input=check_input,
                   X_idx_sorted=X_idx_sorted)

    try:
        from io import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(classifier,
                        out_file=dot_data,
                        feature_names=feature_cols,
                        # Class names must follow the classifier's own
                        # (sorted) class order, not order of appearance in
                        # the data, or the nodes are mislabelled.
                        class_names=[str(c) for c in classifier.classes_],
                        filled=True,
                        rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        from brightics.common.repr import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:
        # Best effort: the tree picture is optional, the report still renders.
        fig_tree = "Graphviz is needed to draw a Decision Tree graph. Please download it from http://graphviz.org/download/ and install it to your computer."

    # json
    model = _model_dict('decision_tree_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['classes'] = classifier.classes_
    feature_importance = classifier.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = classifier.max_features_
    model['n_classes'] = classifier.n_classes_
    model['n_features'] = classifier.n_features_
    model['n_outputs'] = classifier.n_outputs_
    model['tree'] = classifier.tree_
    get_param = classifier.get_params()
    model['parameters'] = get_param
    model['classifier'] = classifier

    # report: horizontal bars, least important feature at the bottom
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]
    sorted_importance = feature_importance[indices]

    plt.title('Feature Importances')
    plt.barh(range(len(indices)),
             sorted_importance,
             color='b',
             align='center')
    for i, v in enumerate(sorted_importance):
        plt.text(v,
                 i,
                 " {:.2f}".format(v),
                 color='b',
                 va='center',
                 fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.xlim(0, 1.1)
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)

    # Add tree plot
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Decision Tree Classification Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Пример #19
0
def _kmeans_train_predict(table, input_cols, n_clusters=3, prediction_col='prediction', init='k-means++', n_init=10,
             max_iter=300, tol=1e-4, precompute_distances='auto', seed=None,
             n_jobs=1, algorithm='auto', n_samples=None):
    """Fit k-means on the input columns and label every row with its cluster.

    The estimator is configured from the keyword parameters (``seed`` is used
    as its ``random_state``).  ``n_samples`` defaults to the number of input
    rows and is forwarded to the samples plot.

    Returns a dict with ``out_table`` (a copy of ``table`` plus
    ``prediction_col`` holding the fitted labels) and ``model`` (the fitted
    estimator, the input columns and the rendered report).
    """
    feature_names, inputarr = check_col_type(table, input_cols)
    n_samples = len(inputarr) if n_samples is None else n_samples

    estimator_options = dict(
        n_clusters=n_clusters,
        init=init,
        n_init=n_init,
        max_iter=max_iter,
        tol=tol,
        precompute_distances=precompute_distances,
        verbose=0,
        random_state=seed,
        copy_x=True,
        n_jobs=n_jobs,
        algorithm=algorithm,
    )
    k_means = SKKMeans(**estimator_options)
    k_means.fit(inputarr)

    # Reported parameters (the order here is the order shown in the report).
    params = {
        'input_cols': feature_names,
        'n_clusters': n_clusters,
        'init': init,
        'n_init': n_init,
        'max_iter': max_iter,
        'tol': tol,
        'precompute_distances': precompute_distances,
        'seed': seed,
        'n_jobs': n_jobs,
        'algorithm': algorithm,
        'n_samples': n_samples,
    }

    centers = k_means.cluster_centers_
    center_count = len(centers)
    # One colour per fitted cluster, evenly spaced over the colormap.
    palette = cm.nipy_spectral(np.arange(center_count).astype(float) / center_count)
    assigned = k_means.labels_

    # Project to at most two principal components for the scatter plot.
    projector = PCA(n_components=min(2, len(feature_names))).fit(inputarr)
    projected = projector.transform(inputarr)

    fig_centers = _kmeans_centers_plot(feature_names, centers, palette)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, centers, seed, palette)
    fig_pca = _kmeans_pca_plot(assigned, centers, projector, projected, palette)

    report_md = strip_margin("""
    | ## Kmeans Result
    | - Number of iterations run: {n_iter_}.
    | - Sum of square error: {sse_}.
    | - Coordinates of cluster centers
    | {fig_cluster_centers} 
    | - Samples
    | {fig_pca}
    | {fig_samples}
    |
    | ### Parameters
    | {params}
    """.format(n_iter_=k_means.n_iter_, sse_=k_means.inertia_, fig_cluster_centers=fig_centers, fig_pca=fig_pca, fig_samples=fig_samples, params=dict2MD(params)))
    rb = BrtcReprBuilder()
    rb.addMD(report_md)

    model = _model_dict('kmeans')
    model['model'] = k_means
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    labelled = table.copy()
    labelled[prediction_col] = assigned
    return {'out_table': labelled, 'model': model}
Пример #20
0
def _collaborative_filtering_train(table,
                                   user_col,
                                   item_col,
                                   rating_col,
                                   N=10,
                                   k=5,
                                   based='item',
                                   mode='train',
                                   method='cosine',
                                   weighted=True,
                                   centered=True,
                                   targets=None,
                                   normalize=True):
    """Build an item- or user-based collaborative-filtering model.

    Parameters
    ----------
    table : pandas.DataFrame
        One rating per row.
    user_col, item_col, rating_col : str
        Columns holding the user, the item and the rating.
    N, k : int
        Forwarded to ``_recommend`` in ``'Topn'`` mode; ``k`` is also stored
        in the model and reported as the number of neighbours.
    based : str
        ``'item'`` builds item-item similarities (and disables
        ``normalize``); any other value builds user-user similarities.
    mode : str
        ``'Topn'`` returns a recommendation table instead of a model.
    method : str
        ``'cosine'``, ``'jaccard'``, ``'pearson'`` or ``'adjusted'``.
        ``'adjusted'`` subtracts a per-user average and then uses cosine.
    weighted, centered, normalize : bool
        Scoring options, see the code below.
    targets : sequence, optional
        Users to recommend for in ``'Topn'`` mode (default: all users).

    Returns
    -------
    dict
        ``{'out_table': DataFrame}`` in ``'Topn'`` mode, otherwise
        ``{'model': dict}``.
    """

    def _nonzero_mean(matrix, row):
        """Mean of the entries `_nonzeros` yields for one row."""
        count = 0
        total = 0
        for _, rating in _nonzeros(matrix, row):
            count += 1
            total += rating
        return total / count

    if based == 'item':
        normalize = False
    table_user_col = table[user_col]
    table_item_col = table[item_col]
    # Positional values: indexing the Series with 0..n-1 is a label lookup
    # and breaks when the table does not carry a default RangeIndex.
    ratings = table[rating_col].values
    user_encoder = preprocessing.LabelEncoder()
    item_encoder = preprocessing.LabelEncoder()
    user_encoder.fit(table_user_col)
    item_encoder.fit(table_item_col)
    user_correspond = user_encoder.transform(table_user_col)
    item_correspond = item_encoder.transform(table_item_col)
    item_users = np.zeros(
        (len(item_encoder.classes_), len(user_encoder.classes_)))
    # Ratings are stored shifted by +1 while centring and shifted back below.
    for item_idx, user_idx, rating in zip(item_correspond, user_correspond,
                                          ratings):
        item_users[item_idx][user_idx] = rating + 1
    centered_ratings = item_users.copy()
    num_item, num_user = item_users.shape
    if centered and based == 'item':
        check_cen = csr_matrix(centered_ratings)
        for item in range(num_item):
            avg = _nonzero_mean(check_cen, item)
            for user, _ in _nonzeros(check_cen, item):
                centered_ratings[item][user] -= avg
    if centered and based == 'user':
        check_cen = csr_matrix(np.transpose(centered_ratings))
        for user in range(num_user):
            avg = _nonzero_mean(check_cen, user)
            for item, _ in _nonzeros(check_cen, user):
                centered_ratings[item][user] -= avg
    # NOTE(review): one subtraction per input row, so a (user, item) pair that
    # appears several times is decremented several times - confirm inputs are
    # de-duplicated upstream.
    for item_idx, user_idx in zip(item_correspond, user_correspond):
        item_users[item_idx][user_idx] -= 1
    if method == 'adjusted' or normalize:
        check_cen = csr_matrix(np.transpose(item_users))
    user_avg = []
    if normalize:
        for user in range(num_user):
            user_avg.append(_nonzero_mean(check_cen, user))
    # Remember the requested method for the report: 'adjusted' is rewritten
    # to 'cosine' below for the similarity computation.
    requested_method = method
    if method == 'adjusted':
        for user in range(num_user):
            total = 0
            for _, rating in _nonzeros(check_cen, user):
                total += rating
            # Average over ALL items, not only the rated ones.
            avg = total / num_item
            for item in range(num_item):
                centered_ratings[item][user] -= avg
        method = 'cosine'

    if based == 'item':
        similar_coeff = np.zeros((num_item, num_item))
        for item in range(num_item):
            similar_coeff[item][item] = -1
            for diff_item in range(item + 1, num_item):
                similar_coeff[item][diff_item] = _similar_coeff(
                    centered_ratings, item, diff_item, method)
                similar_coeff[diff_item][item] = similar_coeff[item][diff_item]

    else:
        similar_coeff = np.zeros((num_user, num_user))
        for user in range(num_user):
            similar_coeff[user][user] = -1
            for diff_user in range(user + 1, num_user):
                similar_coeff[user][diff_user] = _similar_coeff(
                    np.transpose(centered_ratings), user, diff_user, method)
                similar_coeff[diff_user][user] = similar_coeff[user][diff_user]

    if mode == 'Topn':
        if targets is None:
            targets = user_encoder.classes_
        targets_en = user_encoder.transform(targets)
        Topn_result = []
        for user in targets_en:
            recommendations_corre = _recommend(user, item_users, similar_coeff,
                                               N, k, method, weighted,
                                               centered, based, normalize,
                                               user_avg)
            recommendations = []
            for (item, rating) in recommendations_corre:
                recommendations += [
                    item_encoder.inverse_transform([item])[0], rating
                ]
            Topn_result += [recommendations]
        Topn_result = pd.DataFrame(Topn_result)
        Topn_result = pd.concat([pd.DataFrame(targets), Topn_result],
                                axis=1,
                                ignore_index=True)
        column_names = ['user_name']
        for i in range(int((Topn_result.shape[1] - 1) / 2)):
            column_names += ['item_top%d' % (i + 1), 'rating_top%d' % (i + 1)]
        Topn_result.columns = column_names
        return {'out_table': Topn_result}

    parameters = dict()
    parameters['Number of Neighbors'] = k
    parameters['Based'] = based
    if requested_method == 'cosine':
        parameters['Similarity method'] = 'Cosine'
    elif requested_method == 'jaccard':
        parameters['Similarity method'] = 'Jaccard'
    elif requested_method == 'pearson':
        parameters['Similarity method'] = 'Pearson'
    else:
        parameters['Similarity method'] = 'Adjusted Cosine'
    parameters['Use Centered Mean'] = centered
    parameters['Use Weighted Rating'] = weighted
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Collaborative Filtering Result
    |
    | ### Parameters
    | {parameters} 
    |
    """.format(parameters=dict2MD(parameters))))

    model = _model_dict('collaborative filtering')
    model['weighted'] = weighted
    model['k'] = k
    model['similar_coeff'] = similar_coeff
    model['item_encoder'] = item_encoder
    model['user_encoder'] = user_encoder
    model['item_users'] = item_users
    model['user_col'] = user_col
    model['item_col'] = item_col
    model['based'] = based
    model['_repr_brtc_'] = rb.get()
    model['normalize'] = normalize
    model['user_avg'] = user_avg
    return {'model': model}
Пример #21
0
def _dtm(table, input_col, topic_name='topic', num_topic=5, num_topic_word=10, max_iter=20, time_slice=None,
         coherence='u_mass', vis_time=0, seed=None):
    """Fit a dynamic topic model with the bundled DTM binary.

    Parameters
    ----------
    table : pandas.DataFrame
        Input data; ``input_col`` holds one tokenised document per row.
    input_col : str
        Column with the tokenised documents.
    topic_name : str
        Name of the output column receiving the dominant topic (1-based);
        it must not already exist in ``table``.
    num_topic, num_topic_word, max_iter : int
        Number of topics, words listed per topic, and maximum iterations.
    time_slice : list of int, optional
        Documents per period; must sum to the number of documents.  Defaults
        to a single period containing every document.
    coherence : str
        ``'u_mass'`` or anything else for ``'c_v'``.
    vis_time : int
        Period index visualised in the report.
    seed : int, optional
        Forwarded as ``rng_seed`` when given.

    Returns
    -------
    dict
        ``out_table`` (input plus topic columns), ``topic_table`` (top words
        per topic and period) and ``model``.
    """
    # Fail fast: detect the output-column collision before the costly fit.
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors(
            [{'0100': "Existing table contains Topic Column Name. Please choose again."}])

    running_os = platform.system()
    is_os_64bit = platform.machine().endswith('64')
    if running_os == 'Linux':
        dtm_filename = 'dtm-linux64' if is_os_64bit else 'dtm-linux32'
    elif running_os == 'Windows':
        dtm_filename = 'dtm-win64.exe' if is_os_64bit else 'dtm-win32.exe'
    else:  # Mac
        dtm_filename = 'dtm-darwin64'
    dtm_path = os.path.join(str(pathlib.Path(__file__).parent.absolute()), 'dtm', dtm_filename)
    if running_os != 'Windows' and not os.access(dtm_path, os.X_OK):
        # Set the executable bits directly rather than through a shell
        # command string, which breaks on paths containing spaces or shell
        # metacharacters.  Still best effort: if it fails, launching the
        # binary below reports the actual problem.
        try:
            os.chmod(dtm_path, os.stat(dtm_path).st_mode | 0o111)
        except OSError:
            pass
    tokenized_doc = np.array(table[input_col])
    num_doc = len(tokenized_doc)
    if time_slice is None:
        time_slice = [num_doc]
    elif sum(time_slice) != num_doc:
        raise_runtime_error("The sum of time slice list does not match the number of documents.")
    if vis_time < 0 or vis_time >= len(time_slice):
        raise_runtime_error("Invalid time parameter: {}".format(vis_time))
    dictionary = corpora.Dictionary(tokenized_doc)
    corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
    dtm_params = {"corpus": corpus,
                  "id2word": dictionary,
                  "time_slices": time_slice,
                  "num_topics": num_topic,
                  "lda_sequence_max_iter": max_iter,
                  "model": 'dtm'}
    if seed is not None:
        dtm_params["rng_seed"] = seed
    dtm_model = DtmModel(dtm_path, **dtm_params)

    num_period = len(time_slice)
    # topic_time[period][topic] -> list of "word: weight" strings
    topic_time = [[dtm_model.show_topic(topicid=topic_id, time=period, topn=num_topic_word)
                   for topic_id in range(num_topic)]
                  for period in range(num_period)]
    topic_time = [[["{}: {}".format(tup[1], tup[0]) for tup in topic] for topic in period_topics]
                  for period_topics in topic_time]
    timeline = ["{} ({} docs)".format(ind, t) for ind, t in enumerate(time_slice)]
    columns = ["topic_{}".format(i + 1) for i in range(num_topic)]
    topic_table = pd.DataFrame(topic_time, columns=columns)
    topic_table['time'] = timeline
    topic_table = topic_table[['time'] + columns]

    prop_arr = dtm_model.gamma_
    out_table = table.copy(deep=True)
    out_table[topic_name] = [item.argmax() + 1 for item in prop_arr]
    out_table['topic_distribution'] = prop_arr.tolist()

    coherence_topic_arr = [dtm_model.dtm_coherence(period) for period in range(num_period)]
    if coherence == 'u_mass':
        coh_arr = [CoherenceModel(topics=item, dictionary=dictionary, corpus=corpus, coherence='u_mass').get_coherence()
                   for item in coherence_topic_arr]
    else:
        coh_arr = [CoherenceModel(topics=item, dictionary=dictionary, corpus=corpus, texts=tokenized_doc,
                                  coherence='c_v').get_coherence() for item in coherence_topic_arr]

    doc_topic, topic_term, doc_lengths, term_frequency, vocab = dtm_model.dtm_vis(corpus, vis_time)
    prepared_data = plv.prepare(topic_term, doc_topic, doc_lengths, vocab, term_frequency, sort_topics=False)
    html_result = plv.prepared_data_to_html(prepared_data)

    params = {'Input column': input_col,
              'Topic column name': topic_name,
              'Number of topics': num_topic,
              'Number of words for each topic': num_topic_word,
              'Maximum number of iterations': max_iter,
              'Time slice': time_slice,
              'Coherence measure': coherence,
              'Time to visualize': vis_time}
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Dynamic Topic Modeling Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(strip_margin("""
    | ### Coherence for each period
    | {coh_arr}
    |
    | ### Parameters
    | {params}
    """.format(coh_arr=coh_arr, params=dict2MD(params))))

    model = _model_dict('dtm_model')
    model['params'] = params
    model['dtm_model'] = dtm_model
    model['coherences'] = coh_arr
    model['corpus'] = corpus
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
Пример #22
0
def _lda4(table,
          input_col,
          topic_name='topic',
          num_voca=1000,
          num_topic=5,
          num_topic_word=10,
          max_iter=20,
          learning_method='online',
          learning_offset=10.,
          random_state=None):
    """Fit Latent Dirichlet Allocation on a text column.

    Parameters
    ----------
    table : pandas.DataFrame
        Input data.
    input_col : str
        Column with the documents, either raw strings or token sequences
        (``numpy.ndarray``, ``list`` or ``tuple``), which are joined with
        spaces before vectorising.
    topic_name : str
        Output column for the dominant topic (1-based).  The distribution is
        written to ``topic_name + '_distribution'``.  Neither column may
        already exist in ``table``.
    num_voca : int
        Maximum vocabulary size of the count vectoriser.
    num_topic, num_topic_word, max_iter : int
        Number of topics, words listed per topic, and maximum iterations.
    learning_method : {'online', 'batch'}
        LDA update method; ``learning_offset`` is only passed for 'online'.
    random_state : int, optional
        Seed forwarded to the estimator.

    Returns
    -------
    dict
        ``out_table``, ``topic_table`` and ``model``.
    """
    # Validate cheap preconditions before the expensive vectorise/fit steps.
    topic_dist_name = topic_name + '_distribution'
    if topic_name in table.columns or topic_dist_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100':
            "Existing table contains Topic Column Name. Please choose again."
        }])
    if learning_method not in ('online', 'batch'):
        raise_runtime_error("Please check 'learning_method'.")

    # generate model
    corpus = np.array(table[input_col])
    vectorizer_options = dict(stop_words='english',
                              max_df=0.95,
                              min_df=2,
                              max_features=num_voca)
    if isinstance(corpus[0], (np.ndarray, list, tuple)):
        # Pre-tokenised documents: join the tokens back into one string.
        vectorizer_options['preprocessor'] = ' '.join
    tf_vectorizer = CountVectorizer(**vectorizer_options)
    term_count = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()

    lda_options = dict(n_components=num_topic,
                       max_iter=max_iter,
                       learning_method=learning_method,
                       random_state=random_state)
    if learning_method == 'online':
        lda_options['learning_offset'] = learning_offset
    lda_model = LatentDirichletAllocation(**lda_options).fit(term_count)
    log_likelihood = lda_model.score(term_count)
    perplexity = lda_model.perplexity(term_count)

    # create topic table
    vocab_weights_list = []
    vocab_list = []
    weights_list = []
    # Rows are L1-normalised so each topic's weights sum to one.
    topic_term_prob = normalize(lda_model.components_, norm='l1')
    for vector in topic_term_prob:
        ranked = sorted(
            ((abs(value), tf_feature_names[term_idx])
             for term_idx, value in enumerate(vector)),
            key=lambda pair: pair[0],
            reverse=True)
        top_pairs = ranked[:num_topic_word]
        vocab_weights_list.append(
            ["{}: {}".format(term, weight) for weight, term in top_pairs])
        vocab_list.append([term for _, term in top_pairs])
        weights_list.append([weight for weight, _ in top_pairs])
    topic_table = pd.DataFrame({
        'vocabularies_weights': vocab_weights_list,
        'vocabularies': vocab_list,
        'weights': weights_list
    })
    topic_table['index'] = [idx + 1 for idx in topic_table.index]
    topic_table = topic_table[[
        'index', 'vocabularies_weights', 'vocabularies', 'weights'
    ]]

    # create output table
    doc_topic = lda_model.transform(term_count)
    out_table = table.copy(deep=True)
    out_table[topic_name] = [row.argmax() + 1 for row in doc_topic]
    out_table[topic_dist_name] = doc_topic.tolist()

    # pyLDAvis
    prepared_data = ldavis.prepare(lda_model, term_count, tf_vectorizer)
    html_result = pyLDAvis.prepared_data_to_html(prepared_data)

    # generate report
    params = {
        'Input column': input_col,
        'Topic column name': topic_name,
        'Number of topics': num_topic,
        'Number of words for each topic': num_topic_word,
        'Maximum number of iterations': max_iter,
        'Learning method': learning_method,
        'Learning offset': learning_offset,
        'Seed': random_state
    }
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Latent Dirichlet Allocation Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(
        strip_margin("""
    |
    | ### Log Likelihood
    | {log_likelihood}
    |
    | ### Perplexity
    | {perplexity}
    |
    | ### Parameters
    | {params}
    """.format(log_likelihood=log_likelihood,
               perplexity=perplexity,
               params=dict2MD(params))))

    # create model
    model = _model_dict('lda_model')
    model['params'] = params
    model['lda_model'] = lda_model
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
Пример #23
0
def _pca(table,
         input_cols,
         new_column_name='projected_',
         n_components=None,
         copy=True,
         whiten=False,
         svd_solver='auto',
         tol=0.0,
         iterated_power='auto',
         seed=None,
         hue=None,
         alpha=0,
         key_col=None):
    """Fit a PCA on ``input_cols`` and append the projected columns.

    The estimator is always fitted with ``n_components=None``;
    ``n_components`` only limits how many projected columns are appended to
    the output table and how many component rows are listed in the report.

    Args:
        table: pandas DataFrame holding the data.
        input_cols: list of column names to decompose.
        new_column_name: prefix of the generated column names; the
            zero-based component index is appended to it.
        n_components: number of projected columns to keep. ``None`` keeps
            one per input column.
        copy, whiten, svd_solver, tol, iterated_power: forwarded to ``PCA``.
        seed: forwarded to ``PCA`` as ``random_state``.
        hue: forwarded to the plotting calls.
        alpha, key_col: forwarded to ``_biplot``.

    Returns:
        dict with ``out_table`` (the input rows, re-indexed from 0, followed
        by the projected columns) and ``model`` (the fitted estimator, its
        fitted attributes and the rendered report).
    """
    if n_components is None:
        n_components = len(input_cols)

    # Keyword arguments keep the call valid on releases where everything
    # after ``n_components`` is keyword-only.
    pca = PCA(n_components=None,
              copy=copy,
              whiten=whiten,
              svd_solver=svd_solver,
              tol=tol,
              iterated_power=iterated_power,
              random_state=seed)
    pca_model = pca.fit(table[input_cols])

    column_names = [new_column_name + str(i) for i in range(n_components)]

    pca_result = pca_model.transform(table[input_cols])
    # Pass the labels as a flat list: a nested list would make pandas build
    # a MultiIndex.
    out_df = pd.DataFrame(data=pca_result[:, :n_components],
                          columns=column_names)

    out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1)
    out_df.columns = table.columns.values.tolist() + column_names

    res_components = pca_model.components_
    res_components_df = pd.DataFrame(data=res_components[:n_components],
                                     columns=input_cols)
    res_explained_variance = pca_model.explained_variance_
    res_explained_variance_ratio = pca_model.explained_variance_ratio_
    res_singular_values = pca_model.singular_values_
    res_mean = pca_model.mean_
    res_n_components = pca_model.n_components_
    res_noise_variance = pca_model.noise_variance_

    res_get_param = pca_model.get_params()
    res_get_covariance = pca_model.get_covariance()
    res_get_precision = pca_model.get_precision()

    # visualization
    plt.figure()
    if n_components == 1:
        # Only one projected column exists, so it is plotted against itself.
        sns.scatterplot(x=column_names[0],
                        y=column_names[0],
                        hue=hue,
                        data=out_df)
        plt_two = plt2MD(plt)
        plt.clf()
    else:
        plt_two = _biplot(
            0,
            1,
            pc_columns=column_names,
            columns=input_cols,
            singular_values=res_singular_values,
            components=res_components,
            explained_variance_ratio=res_explained_variance_ratio,
            alpha=alpha,
            hue=hue,
            data=out_df,
            ax=plt.gca(),
            key_col=key_col)

    plt.figure()
    fig_scree = _screeplot(res_explained_variance,
                           res_explained_variance_ratio, n_components)

    table_explained_variance = pd.DataFrame(res_explained_variance,
                                            columns=['explained_variance'])
    table_explained_variance[
        'explained_variance_ratio'] = res_explained_variance_ratio
    table_explained_variance[
        'cum_explained_variance_ratio'] = res_explained_variance_ratio.cumsum(
        )

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## PCA Result
    | ### Plot
    | {image1}
    |
    | ### Explained Variance
    | {fig_scree}
    | {table_explained_variance}    
    |
    | ### Components
    | {table2}
    |
    | ### Parameters
    | {parameter1}
    """.format(image1=plt_two,
               fig_scree=fig_scree,
               table_explained_variance=pandasDF2MD(table_explained_variance),
               parameter1=dict2MD(res_get_param),
               table2=pandasDF2MD(res_components_df))))

    model = _model_dict('pca')
    model['components'] = res_components
    model['explained_variance'] = res_explained_variance
    model['explained_variance_ratio'] = res_explained_variance_ratio
    model['singular_values'] = res_singular_values
    model['mean'] = res_mean
    model['n_components'] = res_n_components
    model['noise_variance'] = res_noise_variance
    model['parameters'] = res_get_param
    model['covariance'] = res_get_covariance
    model['precision'] = res_get_precision
    model['_repr_brtc_'] = rb.get()
    model['pca_model'] = pca_model
    model['input_cols'] = input_cols

    return {'out_table': out_df, 'model': model}
Пример #24
0
def _gsdmm(table,
           input_col,
           topic_name='topic',
           K=10,
           alpha=0.1,
           beta=0.1,
           max_iter=50,
           num_topic_words=3):
    """Cluster documents with ``gsdmm_rwalk.MovieGroupProcess``.

    Args:
        table: pandas DataFrame holding the documents.
        input_col: column whose cells are iterables of hashable tokens;
            duplicate tokens within a document are dropped before training.
        topic_name: name of the output column that receives the topic id.
        K, alpha, beta: forwarded to ``MovieGroupProcess``.
        max_iter: forwarded to ``MovieGroupProcess`` as ``n_iters``.
        num_topic_words: forwarded to ``_gen_table``.

    Returns:
        dict with ``out_table`` (a deep copy of ``table`` plus the topic
        column; ids are renumbered from 1 over the non-empty clusters),
        ``topic_table`` (one row per non-empty cluster) and ``model``.

    Raises:
        BrighticsFunctionException: if ``topic_name`` is already a column of
            ``table``.
    """
    # Fail fast: report a column-name clash before the model is trained.
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100':
            "Existing table contains the topic column name. Please choose another name."
        }])

    docs = np.array(table[input_col])
    docs_set = [set(doc) for doc in docs]
    docs_preprocessed = [list(doc_set) for doc_set in docs_set]
    vocab_set = list(set.union(*docs_set))
    vocab_size = len(vocab_set)

    # initialize and train a GSDMM model
    mgp = gsdmm_rwalk.MovieGroupProcess(K=K,
                                        alpha=alpha,
                                        beta=beta,
                                        n_iters=max_iter)
    topics = mgp.fit(docs_preprocessed, vocab_size)

    # generate topic table; clusters whose word count mapping is empty are
    # skipped
    topic_word_count = mgp.cluster_word_distribution
    topic_words_raw = [[ind, _count_to_ratio_raw(word_count)]
                       for ind, word_count in enumerate(topic_word_count)
                       if word_count]
    topic_words = [[item[0]] + _gen_table(item[1], num_topic_words)
                   for item in topic_words_raw]

    # reset topic ids so the remaining clusters are numbered 1..n
    nonempty_topic_indices = [item[0] for item in topic_words]
    reset_topic_ind = {
        old_ind: (new_ind + 1)
        for new_ind, old_ind in enumerate(nonempty_topic_indices)
    }
    topics = [reset_topic_ind[old_ind] for old_ind in topics]
    topic_words = [[reset_topic_ind[old_item[0]]] + old_item[1:]
                   for old_item in topic_words]

    # generate output dataframes
    out_table = pd.DataFrame.copy(table, deep=True)
    out_table[topic_name] = topics
    columns = ['index', 'vocabularies_weights', 'vocabularies', 'weights']
    topic_table = pd.DataFrame(topic_words, columns=columns)
    topic_table['weights'] = topic_table['weights'].apply(pd.to_numeric)

    # pyLDAvis (skipped when only a single topic remains)
    if len(topic_words) == 1:
        html_result = None
    else:
        topic_words_dicts = [item[1] for item in topic_words_raw]
        topic_term_dists = [[
            topic_words_dict.get(word, 0) for word in vocab_set
        ] for topic_words_dict in topic_words_dicts]
        num_docs = len(topics)
        num_topics = len(topic_words_raw)
        # Each document is assigned to exactly one topic, so its
        # distribution is a one-hot row.
        doc_topic_dists = np.zeros((num_docs, num_topics))
        for doc_id, topic_id in enumerate(topics):
            doc_topic_dists[doc_id][topic_id - 1] = 1.0
        doc_lengths = [len(doc) for doc in docs_preprocessed]
        # Sum the per-cluster word counts into one count per word.
        vocab_count = functools.reduce(
            lambda dict_1, dict_2: {
                word: dict_1.get(word, 0) + dict_2.get(word, 0)
                for word in set(dict_1).union(dict_2)
            }, topic_word_count)
        term_frequency = [vocab_count.get(word) for word in vocab_set]

        prepared_data = pyLDAvis.prepare(topic_term_dists, doc_topic_dists,
                                         doc_lengths, vocab_set,
                                         term_frequency)
        html_result = pyLDAvis.prepared_data_to_html(prepared_data)

    # generate report
    params = {
        'Input column': input_col,
        'Topic column name': topic_name,
        'K': K,
        'Alpha': alpha,
        'Beta': beta,
        'Maximum number of iterations': max_iter,
        'Number of words for each topic': num_topic_words
    }
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## GSDMM Result
    | ### Summary
    |
    """))
    if html_result is not None:
        rb.addHTML(html_result)
        rb.addMD(strip_margin("""
        |
        """))
    rb.addMD(
        strip_margin("""
    | ### Final Number of Topics
    | {num_topics}
    |
    | ### Parameters
    | {params}
    """.format(num_topics=len(topic_words_raw), params=dict2MD(params))))

    # create model
    # NOTE(review): the model type is 'lda_model' although this is a GSDMM
    # model; left unchanged because consumers may key on it.
    model = _model_dict('lda_model')
    model['params'] = params
    model['gsdmm_model'] = mgp
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
Пример #25
0
def _penalized_linear_regression_train(table,
                                       feature_cols,
                                       label_col,
                                       regression_type='ridge',
                                       alpha=1.0,
                                       l1_ratio=0.5,
                                       fit_intercept=True,
                                       max_iter=1000,
                                       tol=0.0001,
                                       random_state=None):
    """Train a Ridge, Lasso or ElasticNet regression and build its report.

    The model is fitted exactly once; the coefficient table, the
    predictions and the stored estimator all come from that single fit.

    Args:
        table: pandas DataFrame holding the training data.
        feature_cols: feature column names, resolved through
            ``check_col_type``.
        label_col: name of the target column.
        regression_type: ``'ridge'``, ``'lasso'`` or ``'elastic_net'``.
        alpha: penalty weight forwarded to the estimator.
        l1_ratio: forwarded to ``ElasticNet`` only.
        fit_intercept: forwarded to the estimator; when truthy an
            ``intercept`` row is added to the coefficient table.
        max_iter: forwarded to ``Lasso`` and ``ElasticNet``.
        tol, random_state: forwarded to the estimator.

    Returns:
        dict with a single ``model`` entry holding the fitted estimator, the
        coefficient table, a copy of ``table`` extended with ``predict`` and
        ``residual`` columns, and the rendered report.
    """
    out_table = table.copy()
    feature_names, features = check_col_type(out_table, feature_cols)
    label = out_table[label_col]
    if regression_type == 'ridge':
        # NOTE(review): Ridge is built with max_iter=None, so the
        # ``max_iter`` argument is reported below but not applied here.
        regression_model = Ridge(alpha=alpha,
                                 fit_intercept=fit_intercept,
                                 max_iter=None,
                                 tol=tol,
                                 solver='auto',
                                 random_state=random_state)
    elif regression_type == 'lasso':
        regression_model = Lasso(alpha=alpha,
                                 fit_intercept=fit_intercept,
                                 max_iter=max_iter,
                                 tol=tol,
                                 random_state=random_state,
                                 selection='random')
    elif regression_type == 'elastic_net':
        regression_model = ElasticNet(alpha=alpha,
                                      l1_ratio=l1_ratio,
                                      fit_intercept=fit_intercept,
                                      max_iter=max_iter,
                                      tol=tol,
                                      random_state=random_state,
                                      selection='random')
    else:
        raise_runtime_error("Please check 'regression_type'.")

    # Fit once. Refitting for each reported attribute is wasteful and, with
    # selection='random' and no seed, can make the reported numbers disagree
    # with each other and with the stored model.
    regression_model.fit(features, label)

    out_table1 = pd.DataFrame([])
    out_table1['x_variable_name'] = list(feature_names)
    out_table1['coefficient'] = regression_model.coef_
    if fit_intercept:
        intercept = pd.DataFrame(
            [['intercept', regression_model.intercept_]],
            columns=['x_variable_name', 'coefficient'])
        # DataFrame.append was removed in pandas 2.0; concat is equivalent.
        out_table1 = pd.concat([out_table1, intercept], ignore_index=True)

    predict = regression_model.predict(features)
    residual = label - predict

    out_table['predict'] = predict
    out_table['residual'] = residual

    if regression_type == 'elastic_net':
        params = {
            'Feature Columns': feature_names,
            'Label Column': label_col,
            'Regression Type': regression_type,
            'Regularization (Penalty Weight)': alpha,
            'L1 Ratio': l1_ratio,
            'Fit Intercept': fit_intercept,
            'Maximum Number of Iterations': max_iter,
            'Tolerance': tol
        }
    else:
        # NOTE(review): the 'Maxium' key is misspelled; kept as-is because
        # it is stored in the model and may be read by consumers.
        params = {
            'Feature Columns': feature_names,
            'Label Column': label_col,
            'Regression Type': regression_type,
            'Regularization (Penalty Weight)': alpha,
            'Fit Intercept': fit_intercept,
            'Maxium Number of Iterations': max_iter,
            'Tolerance': tol
        }

    score = {
        'MSE': mean_squared_error(label, predict),
        'R2': r2_score(label, predict)
    }

    # predicted vs actual, with the y = x reference line
    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)
    p1x = np.min(predict)
    p2x = np.max(predict)
    plt.plot([p1x, p2x], [p1x, p2x], 'r--')
    fig_actual_predict = plt2MD(plt)
    plt.clf()

    # residuals vs predicted
    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)
    plt.clf()

    # Q-Q plot of the residuals
    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)
    plt.clf()

    # residual distribution
    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)
    plt.clf()

    # checking the magnitude of coefficients
    plt.figure()
    coef = Series(regression_model.coef_, feature_names).sort_values()
    coef.plot(kind='bar', title='Model Coefficients')
    plt.tight_layout()
    fig_model_coefficients = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | # Penalized Linear Regression Result
    | ### Selected Parameters: 
    | {params}
    |
    | ## Results
    | ### Model Parameters
    | {out_table1}
    |
    | ### Regression Score
    | {score}
    |
    """.format(params=dict2MD(params),
               out_table1=pandasDF2MD(out_table1),
               score=dict2MD(score))))
    rb.addMD(
        strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    |
    | ### Magnitude of Coefficients
    | {image5}
    |
    """.format(image1=fig_actual_predict,
               image2=fig_residual_1,
               image3=fig_residual_2,
               image4=fig_residual_3,
               image5=fig_model_coefficients)))

    model = _model_dict('penalized_linear_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['regression_type'] = regression_type
    model['regression_model'] = regression_model
    model['parameters'] = params
    model['model_parameters'] = out_table1
    model['prediction_residual'] = out_table
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Пример #26
0
def _tfidf(table,
           input_col,
           max_df=None,
           min_df=1,
           num_voca=1000,
           idf_weighting_scheme='inverseDocumentFrequency',
           norm='l2',
           smooth_idf=True,
           sublinear_tf=False,
           output_type=False):
    """Compute term counts and TF-IDF scores for a text column.

    Args:
        table: pandas DataFrame holding the documents.
        input_col: name of the text column.
        max_df: forwarded to ``CountVectorizer``; ``None`` is replaced by the
            number of documents.
        min_df: forwarded to ``CountVectorizer``.
        num_voca: forwarded to ``CountVectorizer`` as ``max_features``.
        idf_weighting_scheme: ``'inverseDocumentFrequency'`` reports the
            transformer's scores and idf weights; ``'unary'`` reports the raw
            frequency as the score and an idf weight of 1.0.
        norm, smooth_idf, sublinear_tf: forwarded to ``TfidfTransformer``.
        output_type: ``False`` emits one row per (document, vocabulary term)
            pair, zero counts included; ``True`` emits only the entries
            stored in the sparse count matrix.

    Returns:
        dict with a single ``model`` entry holding both sparse matrices, the
        idf table, the tf-idf table, the parameters and the rendered report.
    """
    corpus = np.array(table[input_col])
    if max_df is None:
        max_df = len(corpus)
    tf_vectorizer = CountVectorizer(stop_words='english',
                                    max_df=max_df,
                                    min_df=min_df,
                                    max_features=num_voca)
    tf_vectorizer.fit(corpus)
    csr_matrix_tf = tf_vectorizer.transform(corpus)
    tfidf_vectorizer = TfidfTransformer(norm=norm,
                                        use_idf=True,
                                        smooth_idf=smooth_idf,
                                        sublinear_tf=sublinear_tf)
    csr_matrix_tfidf = tfidf_vectorizer.fit_transform(csr_matrix_tf)

    # Vocabulary terms ordered by their column index in the matrices.
    voca_dict = sorted(tf_vectorizer.vocabulary_.items(), key=itemgetter(1))
    vocabulary = [term for term, _ in voca_dict]
    len_voca = len(vocabulary)

    # tf-idf table

    tfidf_table = pd.DataFrame()
    document_list = []
    docID_list = []
    # ``==`` (not ``is``) is kept on purpose so 0 / 1 stay accepted.
    if output_type == False:  # noqa: E712
        vocabulary_list = []
        for doc in range(len(corpus)):
            docID_list += ['doc_{}'.format(doc)] * len_voca
            document_list += [str(corpus[doc])] * len_voca
            vocabulary_list += vocabulary
        tfidf_table['document_id'] = docID_list
        tfidf_table[input_col] = document_list
        tfidf_table['vocabulary'] = vocabulary_list
        # Row-major flattening matches the document-by-document order above.
        tfidf_table['frequency'] = csr_matrix_tf.toarray().ravel()
        if idf_weighting_scheme == 'inverseDocumentFrequency':
            tfidf_table['tfidf score'] = csr_matrix_tfidf.toarray().ravel()
        elif idf_weighting_scheme == 'unary':
            tfidf_table['tfidf score'] = [
                float(count) for count in tfidf_table['frequency']
            ]

    elif output_type == True:  # noqa: E712
        # The storage order of entries within a row is not guaranteed to be
        # the same in both matrices. Sorting the column indices of copies
        # makes the k-th stored value of each matrix refer to the same
        # (document, term) pair, without touching the matrices kept in the
        # model.
        tf_sorted = csr_matrix_tf.copy()
        tf_sorted.sort_indices()
        tfidf_sorted = csr_matrix_tfidf.copy()
        tfidf_sorted.sort_indices()

        terms_per_doc = np.diff(tf_sorted.indptr).tolist()
        for doc, n_terms in enumerate(terms_per_doc):
            docID_list += ['doc_{}'.format(doc)] * n_terms
            document_list += [str(corpus[doc])] * n_terms
        tfidf_table['document_id'] = docID_list
        tfidf_table[input_col] = document_list
        tfidf_table['vocabulary'] = [
            vocabulary[i] for i in tf_sorted.indices
        ]
        tfidf_table['frequency'] = tf_sorted.data
        if idf_weighting_scheme == 'inverseDocumentFrequency':
            tfidf_table['tfidf score'] = tfidf_sorted.data
        elif idf_weighting_scheme == 'unary':
            tfidf_table['tfidf score'] = [
                float(count) for count in tfidf_table['frequency']
            ]

    else:
        raise_runtime_error("Please check 'output_type'.")

    # idf table

    idf_table = pd.DataFrame()
    idf_table['vocabulary'] = vocabulary
    if idf_weighting_scheme == 'inverseDocumentFrequency':
        idf_table['idf weight'] = tfidf_vectorizer.idf_.tolist()
    elif idf_weighting_scheme == 'unary':
        idf_table['idf weight'] = float(1)

    params = {
        'Input Column': input_col,
        'Max DF': max_df,
        'Min DF': min_df,
        'Number of Vocabularies': num_voca,
        'IDF Weighting Scheme': idf_weighting_scheme,
        'Norm': norm,
        'Smooth IDF': smooth_idf,
        'Sublinear TF': sublinear_tf,
        'Remove Zero Counts': output_type
    }

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# TF-IDF Result"""))
    rb.addMD(
        strip_margin("""
    |
    |### Parameters
    |
    |{display_params}
    |
    |### IDF Table
    |
    |{idf_table}
    |
    |### TFIDF Table
    |
    |{tfidf_table}
    |
    """.format(display_params=dict2MD(params),
               idf_table=pandasDF2MD(idf_table, num_rows=200),
               tfidf_table=pandasDF2MD(tfidf_table, num_rows=200))))

    model = _model_dict('tfidf')
    model['csr_matrix_tf'] = csr_matrix_tf
    model['csr_matrix_tfidf'] = csr_matrix_tfidf
    model['parameter'] = params
    model['idf_table'] = idf_table
    model['tfidf_table'] = tfidf_table
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Пример #27
0
def _decision_tree_regression_train(
        table,
        feature_cols,
        label_col,
        criterion='mse',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_impurity_split=None,
        presort=False,
        sample_weight=None,
        check_input=True,
        X_idx_sorted=None):
    """Train a ``DecisionTreeRegressor`` and build its report.

    Args:
        table: pandas DataFrame holding the training data.
        feature_cols: list of feature column names.
        label_col: name of the target column.
        criterion, splitter, max_depth, min_samples_split, min_samples_leaf,
        min_weight_fraction_leaf, max_features, random_state, max_leaf_nodes,
        min_impurity_decrease, min_impurity_split, presort: forwarded to
            ``DecisionTreeRegressor``.
        sample_weight, check_input, X_idx_sorted: forwarded to ``fit``.

    Returns:
        dict with a single ``model`` entry holding the fitted regressor, its
        attributes and the rendered report.
    """
    param_validation_check = [
        greater_than_or_equal_to(min_samples_split, 2, 'min_samples_split'),
        greater_than_or_equal_to(min_samples_leaf, 1, 'min_samples_leaf'),
        greater_than_or_equal_to(min_weight_fraction_leaf, 0.0,
                                 'min_weight_fraction_leaf')
    ]
    if max_depth is not None:
        param_validation_check.append(
            greater_than_or_equal_to(max_depth, 1, 'max_depth'))

    validate(*param_validation_check)

    # Keyword arguments so the mapping does not depend on parameter order.
    regressor = DecisionTreeRegressor(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        presort=presort)
    regressor.fit(table[feature_cols],
                  table[label_col],
                  sample_weight=sample_weight,
                  check_input=check_input,
                  X_idx_sorted=X_idx_sorted)

    # Drawing the tree is best-effort: any failure in the optional graphviz
    # tool chain is replaced by an installation hint in the report.
    try:
        # io.StringIO replaces sklearn.externals.six.StringIO, which is no
        # longer shipped with scikit-learn.
        from io import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(regressor,
                        out_file=dot_data,
                        feature_names=feature_cols,
                        filled=True,
                        rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

        from brightics.common.repr import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:
        fig_tree = "Graphviz is needed to draw a Decision Tree graph. Please download it from http://graphviz.org/download/ and install it to your computer."

    # json
    model = _model_dict('decision_tree_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    feature_importance = regressor.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = regressor.max_features_
    model['n_features'] = regressor.n_features_
    model['n_outputs'] = regressor.n_outputs_
    model['tree'] = regressor.tree_
    get_param = regressor.get_params()
    model['parameters'] = get_param
    model['regressor'] = regressor

    # report

    # Ascending order, so the most important feature is the top bar.
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]

    plt.title('Feature Importances')
    plt.barh(range(len(indices)),
             feature_importance[indices],
             color='b',
             align='center')
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v,
                 i,
                 " {:.2f}".format(v),
                 color='b',
                 va='center',
                 fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Decision Tree Regression Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Пример #28
0
def _svm_classification_train(table,
                              feature_cols,
                              label_col,
                              c=1.0,
                              kernel='rbf',
                              degree=3,
                              gamma='auto',
                              coef0=0.0,
                              shrinking=True,
                              probability=True,
                              tol=1e-3,
                              max_iter=-1,
                              random_state=None,
                              class_weight=None):
    """Train an ``svm.SVC`` classifier and build its report.

    Args:
        table: pandas DataFrame holding the training data.
        feature_cols: feature column names, resolved through
            ``check_col_type``.
        label_col: name of the label column.
        c: forwarded to ``SVC`` as ``C``.
        kernel, degree, gamma, coef0, shrinking, probability, tol, max_iter,
        random_state: forwarded to ``SVC``.
        class_weight: optional sequence with one weight per distinct label,
            given in the sorted order of the labels.

    Returns:
        dict with a single ``model`` entry holding the fitted classifier, the
        feature columns and the rendered report.

    Raises:
        ValueError: if ``class_weight`` does not have one entry per label.
    """
    working_table = table.copy()

    feature_names, features = check_col_type(table, feature_cols)
    labels = working_table[label_col]

    target_kind = sklearn_utils.multiclass.type_of_target(labels)
    if target_kind == 'continuous':
        raise_runtime_error('''Label Column should not be continuous.''')

    distinct_labels = sorted(set(labels))
    if class_weight is not None:
        if len(class_weight) != len(distinct_labels):
            raise ValueError(
                "Number of class weights should match number of labels.")
        # Pair the i-th weight with the i-th label in sorted order.
        class_weight = {
            label: class_weight[position]
            for position, label in enumerate(distinct_labels)
        }

    svc_options = dict(C=c,
                       kernel=kernel,
                       degree=degree,
                       gamma=gamma,
                       coef0=coef0,
                       shrinking=shrinking,
                       probability=probability,
                       tol=tol,
                       max_iter=max_iter,
                       random_state=random_state,
                       class_weight=class_weight)
    estimator = svm.SVC(**svc_options)
    fitted_estimator = estimator.fit(features, labels)

    # The report lists the estimator parameters plus the column selection.
    report_params = estimator.get_params()
    report_params['feature_cols'] = feature_names
    report_params['label_col'] = label_col

    report_md = strip_margin("""
    | ## SVM Classification Result
    | ### Parameters
    | {table_parameter} 
    """.format(table_parameter=dict2MD(report_params)))
    rb = BrtcReprBuilder()
    rb.addMD(report_md)

    result_model = _model_dict('svc_model')
    result_model['svc_model'] = fitted_estimator
    result_model['features'] = feature_cols
    result_model['_repr_brtc_'] = rb.get()

    return {'model': result_model}
Пример #29
0
def _gaussian_mixture_train(table, input_cols, number_of_components=1, covariance_type='full', tolerance=0.001,
                            regularize_covariance=1e-06, max_iteration=100, initial_params='kmeans', seed=None):
    """Fit a ``GaussianMixture`` and summarise its components.

    Args:
        table: pandas DataFrame holding the training data.
        input_cols: list of column names to fit on.
        number_of_components: forwarded as ``n_components``.
        covariance_type: forwarded as ``covariance_type``.
        tolerance: forwarded as ``tol``.
        regularize_covariance: forwarded as ``reg_covar``.
        max_iteration: forwarded as ``max_iter``.
        initial_params: forwarded as ``init_params``.
        seed: forwarded as ``random_state``.

    Returns:
        dict with a single ``model`` entry holding the fitted mixture, the
        input arguments, a per-component summary table (number, weight, mean
        and covariance) and the rendered report.
    """
    gmm = GaussianMixture(n_components=number_of_components,
                          covariance_type=covariance_type,
                          tol=tolerance,
                          reg_covar=regularize_covariance,
                          max_iter=max_iteration,
                          init_params=initial_params,
                          random_state=seed)
    X_train = table[input_cols]
    gmm.fit(X_train)

    component_range = range(number_of_components)
    comp_num_arr = list(component_range)
    mean_arr = [gmm.means_[i].tolist() for i in component_range]

    if covariance_type == 'tied':
        # With 'tied' all components share one (n_features, n_features)
        # matrix, so covariances_ has no component axis. Indexing it per
        # component would report single rows of it and fail when there are
        # more components than features. Each row gets its own copy.
        covar_arr = [gmm.covariances_.tolist() for _ in component_range]
    else:
        covar_arr = [gmm.covariances_[i].tolist() for i in component_range]

    out_table = pd.DataFrame()
    out_table['component_number'] = comp_num_arr
    out_table['weight'] = gmm.weights_
    out_table['mean_coordinate'] = mean_arr
    out_table['covariance_matrix'] = covar_arr

    rb = BrtcReprBuilder()
    params = {
        'Input Columns': input_cols,
        'Number of Components': number_of_components,
        'Covariance Type': covariance_type,
        'Tolerance': tolerance,
        'Regularization of Covariance': regularize_covariance,
        'Number of Iteration': max_iteration,
        'Method to Initialize': initial_params
    }

    rb.addMD(
        strip_margin("""
    |## Gaussian Mixture Train Result 
    |
    |### Parameters
    |
    | {params}
    |
    |### Summary
    |
    |{result_table}
    |
    """.format(params=dict2MD(params), result_table=pandasDF2MD(out_table))))

    model = _model_dict('gaussian_mixture_train')
    model['input_cols'] = input_cols
    model['number_of_components'] = number_of_components
    model['covariance_type'] = covariance_type
    model['tolerance'] = tolerance
    model['regularize_covariance'] = regularize_covariance
    model['max_iteration'] = max_iteration
    model['initial_params'] = initial_params
    model['seed'] = seed
    model['summary'] = out_table
    model['gmm'] = gmm
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
Пример #30
0
def _xgb_classification_train(table,
                              feature_cols,
                              label_col,
                              max_depth=3,
                              learning_rate=0.1,
                              n_estimators=100,
                              silent=True,
                              objective='binary:logistic',
                              booster='gbtree',
                              n_jobs=1,
                              nthread=None,
                              gamma=0,
                              min_child_weight=1,
                              max_delta_step=0,
                              subsample=1,
                              colsample_bytree=1,
                              colsample_bylevel=1,
                              reg_alpha=0,
                              reg_lambda=1,
                              scale_pos_weight=1,
                              base_score=0.5,
                              random_state=0,
                              seed=None,
                              missing=None,
                              sample_weight=None,
                              eval_set=None,
                              eval_metric=None,
                              early_stopping_rounds=None,
                              verbose=True,
                              xgb_model=None,
                              sample_weight_eval_set=None):
    """Train an ``XGBClassifier`` and build its report.

    Args:
        table: pandas DataFrame holding the training data.
        feature_cols: list of feature column names.
        label_col: name of the label column.
        max_depth, learning_rate, n_estimators: validated, then forwarded to
            ``XGBClassifier``.
        silent, objective, booster, n_jobs, nthread, gamma, min_child_weight,
        max_delta_step, subsample, colsample_bytree, colsample_bylevel,
        reg_alpha, reg_lambda, scale_pos_weight, base_score, random_state,
        seed, missing: forwarded to ``XGBClassifier``.
        sample_weight, eval_set, eval_metric, early_stopping_rounds, verbose,
        xgb_model, sample_weight_eval_set: forwarded to ``fit``.

    Returns:
        dict with a single ``model`` entry holding the fitted classifier, its
        parameters, the feature importances and the rendered report.
    """
    validate(greater_than_or_equal_to(max_depth, 1, 'max_depth'),
             greater_than_or_equal_to(learning_rate, 0.0, 'learning_rate'),
             greater_than_or_equal_to(n_estimators, 1, 'n_estimators'))

    # Every argument is passed by keyword: with this many parameters a
    # positional call is silently mis-assigned as soon as the library
    # inserts or reorders one.
    classifier = XGBClassifier(max_depth=max_depth,
                               learning_rate=learning_rate,
                               n_estimators=n_estimators,
                               silent=silent,
                               objective=objective,
                               booster=booster,
                               n_jobs=n_jobs,
                               nthread=nthread,
                               gamma=gamma,
                               min_child_weight=min_child_weight,
                               max_delta_step=max_delta_step,
                               subsample=subsample,
                               colsample_bytree=colsample_bytree,
                               colsample_bylevel=colsample_bylevel,
                               reg_alpha=reg_alpha,
                               reg_lambda=reg_lambda,
                               scale_pos_weight=scale_pos_weight,
                               base_score=base_score,
                               random_state=random_state,
                               seed=seed,
                               missing=missing)
    classifier.fit(table[feature_cols],
                   table[label_col],
                   sample_weight=sample_weight,
                   eval_set=eval_set,
                   eval_metric=eval_metric,
                   early_stopping_rounds=early_stopping_rounds,
                   verbose=verbose,
                   xgb_model=xgb_model,
                   sample_weight_eval_set=sample_weight_eval_set)

    # json
    get_param = classifier.get_params()
    feature_importance = classifier.feature_importances_
    plot_importance(classifier)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()

    model = _model_dict('xgb_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['parameters'] = get_param
    model['feature_importance'] = feature_importance
    model['classifier'] = classifier

    # report
    params = dict2MD(get_param)
    # One row, one column per feature.
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_cols).T

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## XGB Classification Train Result
    |
    | ### Plot Importance
    | {fig_importance}
    |
    | ### Feature Importance
    | {table_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    return {'model': model}