def multi_label_crf(
    labels,
    df_train,
    df_valid,
    df_test,
    binary,
    connectivity='full',
    vectorizer_method='count'
):
    """
    Train a multi-label structured model (``MultiLabelClf`` fitted with
    ``OneSlackSSVM``) on bag-of-words features and evaluate it.

    The vectorizer is fitted on the ``'terms'`` column of ``df_train``; the
    ``'label'`` column of each frame is binarised against ``labels`` with
    ``prep.binarise_labels``. The fitted estimator is always scored on the
    validation frame and additionally on the test frame when one is given.

    @param labels: Sequence of label names; its length fixes the number of
        output nodes and it is passed to ``prep.binarise_labels``.
    @param df_train: Training frame with ``'terms'`` and ``'label'`` columns.
    @param df_valid: Validation frame with the same columns.
    @param df_test: Optional test frame. Anything that is not a
        ``pd.DataFrame`` (e.g. ``None``) skips the test evaluation.
    @param binary: Forwarded to the vectorizer's ``binary`` argument.
    @param connectivity: ``'full'`` connects every pair of labels (``ad3``
        inference), ``'tree'`` uses the edges returned by ``chow_liu_tree``
        on the training labels (``max-product`` inference); any other value
        builds a model without edges (``unary`` inference).
    @param vectorizer_method: ``'tf-idf'`` or ``'count'``.
    @return: The validation ``Statistics`` container, or the tuple
        ``(validation_stats, test_stats)`` when ``df_test`` is a DataFrame.
    @raise TypeError: If ``vectorizer_method`` is not a supported value.
    """
    stats_container_valid = Statistics()
    stats_container_test = Statistics()

    # -------------------- VECTORIZER -------------------- #
    if vectorizer_method == 'tf-idf':
        # NOTE(review): 'go:' differs from the 'go' stop word used by the
        # count vectorizer below; with scikit-learn's default token pattern a
        # token presumably never contains ':', so this entry likely never
        # matches -- confirm the intent before changing it.
        vectorizer_node = TfidfVectorizer(
            stop_words=['go:', '', ' '],
            binary=binary,
            lowercase=True,
            sublinear_tf=False,
            max_df=1.0,
            # An integer min_df must be >= 1 for recent scikit-learn
            # parameter validation; 1 keeps every term seen during fit,
            # which is what the former min_df=0 selected as well.
            min_df=1
        )
    elif vectorizer_method == 'count':
        vectorizer_node = CountVectorizer(
            stop_words=['go', '', ' '],
            binary=binary,
            lowercase=True
        )
    else:
        # TypeError is kept for backward compatibility with existing callers;
        # the message now names the offending value, not only its type.
        raise TypeError(
            "Unsupported vectorizer_method {!r} (type {}); "
            "expected 'tf-idf' or 'count'.".format(
                vectorizer_method, type(vectorizer_method)
            )
        )
    vectorizer_node.fit(df_train['terms'].values)

    # -------------------- FEATURES / LABELS -------------------- #
    # No feature selection is requested (select_method=None, percentile=100).
    x_node_train, y_train, feature_names, selector_node = prep.select_features(
        df=df_train,
        vectorizer=vectorizer_node,
        feature_col='terms',
        label_col='label',
        select_method=None,
        continuous_col=[],
        alpha=None,
        percentile=100
    )
    x_node_valid, y_valid = prep.transform_features(
        df=df_valid,
        vectorizer=vectorizer_node,
        selector=selector_node,
        feature_col='terms',
        label_col='label',
        continuous_cols=[]
    )
    y_train = np.asarray([prep.binarise_labels(y, labels) for y in y_train], dtype=int)
    y_valid = np.asarray([prep.binarise_labels(y, labels) for y in y_valid], dtype=int)

    # -------------------- MODEL STRUCTURE -------------------- #
    n_labels = len(labels)
    if connectivity == 'full':
        # Every unordered pair of labels becomes an edge.
        edges = np.vstack(list(itertools.combinations(range(n_labels), 2)))
        inference_method = 'ad3'
    elif connectivity == 'tree':
        # Previously unreachable: the first branch also matched 'tree'.
        edges = chow_liu_tree(y_train)
        inference_method = 'max-product'
    else:
        # Any other value: no pairwise edges between labels.
        edges = None
        inference_method = 'unary'
    model = MultiLabelClf(
        n_labels=n_labels, edges=edges, inference_method=inference_method
    )

    # The feature matrices expose .toarray() (presumably scipy sparse);
    # they are converted to dense arrays before fitting.
    x_train = x_node_train.toarray()
    x_valid = x_node_valid.toarray()

    # -------------------- MAKE THE ESTIMATOR -------------------- #
    # NOTE(review): max_iter=2 is very low for an SSVM and looks like a
    # debugging value -- confirm before relying on the reported statistics.
    estimator = OneSlackSSVM(model, max_iter=2, tol=0.001, n_jobs=1)

    # -------------------- LEARN/STATS -------------------- #
    estimator.fit(x_train, y_train)
    stats_container_valid.merge(
        evaluate_crf_model(x_valid, y_valid, estimator, labels)
    )

    # -------------------- RETURN -------------------- #
    if not isinstance(df_test, pd.DataFrame):
        return stats_container_valid

    x_node_test, y_test = prep.transform_features(
        df=df_test,
        vectorizer=vectorizer_node,
        selector=selector_node,
        feature_col='terms',
        label_col='label',
        continuous_cols=[]
    )
    y_test = np.asarray([prep.binarise_labels(y, labels) for y in y_test], dtype=int)
    x_test = x_node_test.toarray()
    stats_container_test.merge(
        evaluate_crf_model(x_test, y_test, estimator, labels)
    )
    return stats_container_valid, stats_container_test
df_train, df_hprd = prep.prep_data_frames(selection) vectorizer = CountVectorizer(stop_words=['go', '', ' '], binary=False, lowercase=True) vectorizer.fit(df_train['terms'].values) print("Transforming features...") x_train, y_train, feature_names, selector = prep.select_features( df = df_train, vectorizer=vectorizer, feature_col='terms', label_col='label', continuous_col=['sim'], alpha=None, percentile=100 ) y_train = np.asarray([prep.binarise_labels(y, labels) for y in y_train]) x_hprd, y_hprd = prep.transform_features( df = df_hprd, vectorizer=vectorizer, selector=selector, feature_col='terms', label_col='label', continuous_cols=['sim'] ) y_hprd = np.asarray([prep.binarise_labels(y, labels) for y in y_hprd]) x_test, y_test = prep.transform_features( df = df_test, vectorizer=vectorizer, selector=selector,