Example #1
def main():
    X, y, train_X, train_y, test_X, test_y = ml_utils.load_data(my_dir, 0)
    estimator = RandomForestClassifier(n_estimators=40, max_depth=5)
    estimator.fit(X, y)

    keys_model = model.Model(estimator, my_dir)
    keys_model.save(os.path.join(my_dir, "keys_model.pkl"))
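The example fits on the full dataset and then persists the wrapped estimator. The model.Model wrapper itself is not shown; assuming it pickles cleanly, a hypothetical way to load the saved file back and predict might look like this (my_dir and test_X are the names from the example above, and the .estimator attribute is a guess):

import os
import pickle

# Hypothetical reload of the wrapper written by Example #1.
with open(os.path.join(my_dir, "keys_model.pkl"), "rb") as f:
    keys_model = pickle.load(f)

# The prediction interface depends on the unshown model.Model class;
# calling the underlying estimator directly is one plausible option.
predictions = keys_model.estimator.predict(test_X)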
Example #2
def main(data, train, test=None):

    classes_to_score = [0, 1, 2, 3]
    train_text, train_labels, score_text, score_ids = load_data(
        data, train, test, classes_to_score, scoring=True)
    train_text, train_labels = fix_data(train_text, train_labels)
    score(train_text, train_labels, score_text, score_ids)
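fix_data is not shown in any of these examples. Judging only from how it is called on (texts, labels) pairs, a plausible minimal sketch, purely for illustration, would drop blank texts while keeping the two sequences aligned:

def fix_data(texts, labels):
    # Hypothetical helper: remove samples whose text is missing or blank,
    # keeping texts and labels aligned. The real fix_data may do more cleaning.
    cleaned = [(t, l) for t, l in zip(texts, labels) if t and t.strip()]
    if not cleaned:
        return [], []
    fixed_texts, fixed_labels = zip(*cleaned)
    return list(fixed_texts), list(fixed_labels)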
Example #3
def main(data, train, test=None):

    classes_to_score = [0, 1, 2, 3, 4]
    train_text, train_labels, score_text, score_ids, timestamps = load_data(
        data, train, test, id_key='__index__', text_key='text', scoring=True)
    train_text, train_labels = fix_data(train_text, train_labels)
    score(train_text, train_labels, score_text, score_ids, timestamps,
          classes_to_score)
Example #4
def main(data, train_data, test_data):

    train_text, train_labels, test_text, test_labels = load_data(
        data,
        train_data,
        test_data,
        text_key='article',
        id_key=['source', 'source_index'])
    analyze(train_text, train_labels, test_text, test_labels, threshold=.6)
Example #5
def main(data, train_data, test_data):

    train_text, train_labels, test_text, test_labels = load_data(
        data,
        train_data,
        test_data,
        text_key='article',
        id_key=['source', 'source_index'])
    train(train_text, train_labels, test_text, test_labels)
Example #6
def main(data, train_data, test_data):

    classes_to_analyze = [0, 1, 2, 3, 4]

    train_text, train_labels, test_text, test_labels = load_data(
        data, train_data, test_data, id_key='__index__', text_key='text')
    train_text, train_labels = fix_data(train_text, train_labels)
    test_text, test_labels = fix_data(test_text, test_labels)
    analyze(train_text, train_labels, test_text, test_labels,
            classes_to_analyze)
Example #7
def main(data, train_data, test_data):

    CLASSES_TO_TRAIN = [0, 1, 2, 3, 4]

    # load the data
    train_text, train_labels, test_text, test_labels = load_data(
        data, train_data, test_data, id_key='__index__', text_key='text')
    # fix the data
    train_text, train_labels = fix_data(train_text, train_labels)
    test_text, test_labels = fix_data(test_text, test_labels)
    # train on the data
    train(train_text, train_labels, test_text, test_labels, CLASSES_TO_TRAIN)
Example #8
def main():
    if len(sys.argv) < 2:
        print("Usage: {} <task directory>".format(sys.argv[0]))
        return

    task_dir = sys.argv[1]
    X, y, train_X, train_y, test_X, test_y = ml_utils.load_data(task_dir, TEST_FRACTION)
    curr_results_dir = os.path.join(task_dir, "results")

    # # Boosted Decision Tree
    # num_estimators_param = Parameter("Max Number of Estimators", "n_estimators", range(10, 111, 10))
    # base_estimators = [DecisionTreeClassifier(max_depth=d) for d in range(2, 6)]
    # estimator_depth_param = Parameter("Max Tree Depth", "base_estimator", base_estimators, range(1, 6))
    # boosted_algo = Algorithm("Boosted Decision Tree", [num_estimators_param, estimator_depth_param], AdaBoostClassifier())
    # analyze_algorithm(boosted_algo, curr_results_dir, train_X, train_y, X, y)
    # print()

    # Random Forest
    num_estimators_param = Parameter("Number of Estimators", "n_estimators", range(1, 121, 10))
    max_depth = Parameter("Max Tree Depth", "max_depth", range(1, 7))
    rf_algo = Algorithm("Random Forest", [num_estimators_param, max_depth], RandomForestClassifier())
    analyze_algorithm(rf_algo, curr_results_dir, train_X, train_y, X, y)
    print()

    # Naive Bayes
    bayes_algo = Algorithm("Naive Bayes", [], GaussianNB())
    analyze_algorithm(bayes_algo, curr_results_dir, train_X, train_y, X, y)
    print()

    # Support Vector Machine (Linear)
    c_param = Parameter("C Value", "C", [1e-2, 1e-1, 1e0, 1e1, 1e2], log_scale=True)
    linear_svm_algo = Algorithm("Linear Support Vector Machine", [c_param], svm.SVC(kernel="linear"))
    analyze_algorithm(linear_svm_algo, curr_results_dir, train_X, train_y, X, y)
    print()

    # Linear Discriminant Analysis
    lda_algo = Algorithm("Linear Discriminant Analysis", [], LinearDiscriminantAnalysis())
    analyze_algorithm(lda_algo, curr_results_dir, train_X, train_y, X, y)
    print()

    # Neural Network
    hidden_layer_param = Parameter("Number of Hidden Layers", "hidden_layer_sizes", vary_num_hidden_layers(30, 5), range(1, 6))
    alpha_param = Parameter("Regularization Strength (alpha)", "alpha", [1e0, 1e-2, 1e-4, 1e-6, 1e-8], log_scale=True)
    nn_algo = Algorithm("Neural Network", [hidden_layer_param, alpha_param], MLPClassifier(max_iter=5000))
    analyze_algorithm(nn_algo, curr_results_dir, train_X, train_y, X, y)
    print()
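Parameter, Algorithm and analyze_algorithm come from the surrounding project and are not shown. A minimal sketch of what such a hyperparameter-sweep wrapper could look like, assuming analyze_algorithm varies one parameter at a time and reports mean cross-validation accuracy (the real one presumably also writes plots or result files into curr_results_dir):

from sklearn.base import clone
from sklearn.model_selection import cross_val_score

class Parameter:
    # Hypothetical container: display name, estimator keyword, values to sweep,
    # optional display values, and a log-scale flag for plotting.
    def __init__(self, name, key, values, display_values=None, log_scale=False):
        self.name, self.key, self.values = name, key, list(values)
        self.display_values = display_values
        self.log_scale = log_scale

class Algorithm:
    # Hypothetical container pairing a display name, the parameters to sweep,
    # and a base scikit-learn estimator.
    def __init__(self, name, parameters, estimator):
        self.name, self.parameters, self.estimator = name, parameters, estimator

def analyze_algorithm(algo, results_dir, train_X, train_y, X, y):
    # Sketch only: sweep each parameter independently and print mean CV accuracy.
    print(algo.name)
    if not algo.parameters:
        scores = cross_val_score(algo.estimator, train_X, train_y, cv=5)
        print("  baseline: {:.3f}".format(scores.mean()))
    for param in algo.parameters:
        for value in param.values:
            est = clone(algo.estimator).set_params(**{param.key: value})
            scores = cross_val_score(est, train_X, train_y, cv=5)
            print("  {}={}: {:.3f}".format(param.key, value, scores.mean()))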
Example #9
def train(model_class, domain=None):
    # target_year and const_selected_feature are assumed to be module-level globals
    cache_path = './cache'
    cache_file = os.path.join(cache_path,
                              "{}_boosted_tree.pkl".format(target_year))
    if os.path.exists(cache_file):
        with open(cache_file, 'rb') as f:
            df = pickle.load(f)
    else:
        df = load_data(target_year)
        with open(cache_file, 'wb') as f:
            pickle.dump(df, f)
    if domain is not None:
        print("Find important feature in {}".format(domain))
        domain2category = pickle.load(open("domain2category.pkl", 'rb'))
        df = df.loc[df['main_category'].isin(domain2category[domain])]
    selected_feature = list(const_selected_feature)  # copy so the appends below don't mutate the global
    X = df[selected_feature].values
    bde = joblib.load('category_encoder.pkl')
    # bde = ce.BinaryEncoder(cols=categorical_feature, return_df=False)
    venue_embedding = bde.transform(df[['venue']]).values
    # pca = PCA(n_components=32)
    # venue_embedding = pca.fit_transform(venue_embedding)
    X = np.hstack((X, venue_embedding))
    venue_embedding_size = venue_embedding.shape[1]
    print(venue_embedding_size)
    for i in range(venue_embedding_size):
        selected_feature.append('venue_' + str(i))

    nlp_pipeline = joblib.load('nlp_pipeline.pkl')
    embeddings = nlp_pipeline.transform(df['summary'].values)
    embeddings_size = embeddings.shape[1]
    print(embeddings_size)
    for i in range(embeddings_size):
        selected_feature.append('title_' + str(i))
    X = np.hstack((X, embeddings))
    y = df['citationCount'].values
    y = np.clip(y, 0, 23)
    kf = KFold(n_splits=5)
    mse_scores = []
    print(model_class)
    importance_stats = {}
    for train_index, test_index in tqdm(kf.split(y)):
        model = model_class(verbose=0)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse_score = mse(y_pred, y_test)
        mse_scores.append(mse_score)
        feature_importance = model.get_feature_importance(prettified=True)
        for feature_id, score in feature_importance:
            name = selected_feature[int(feature_id)]
            importance_stats[name] = importance_stats.get(name, 0) + score
    # report the 30 most important features, largest aggregate score first
    for key, value in sorted(importance_stats.items(),
                             key=lambda kv: kv[1], reverse=True)[:30]:
        print(key, " score: ", value)
    print("MSE: {} ({})".format(np.mean(accuracies), np.std(accuracies)))
    print("Sample : ")
    print(y_pred[:10])
    print(y_test[:10])
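The model_class used above exposes fit/predict, a verbose constructor argument and get_feature_importance(prettified=True), which matches CatBoost's regressor API; the tuple unpacking of the importances additionally assumes a CatBoost release where prettified=True returned (feature, score) pairs rather than a DataFrame. A hypothetical invocation, with the module-level globals filled in for illustration only:

from catboost import CatBoostRegressor

# Illustrative stand-ins for the globals train() relies on; the real values live elsewhere.
target_year = 2018
const_selected_feature = ['year', 'n_authors', 'n_references']

train(CatBoostRegressor)                    # all domains
train(CatBoostRegressor, domain='Biology')  # only papers mapped to this domain via domain2category.pkl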