# Append the remaining example-context feature blocks and the
# similar-context feature blocks to `mat`.
#
# All blocks are stacked in one hstack call: stacking them pair by pair
# re-copies the whole accumulated matrix on every call and needs intermediate
# format conversions, while a single call yields the same columns in the same
# order. Each block is still cast to float64 individually, as before.
feature_blocks = [
    # feat from example contexts
    mat_arg_ctx_union_arg,
    mat_arg_ctx_diff_arg,
    mat_arg_ctx_inters_arg,
    mat_arg_ctx_l_minus_r_arg,
    mat_arg_ctx_r_minus_l_arg,
    # feat from similar context examples
    mat_arg_l_ctx_sim_paths,
    mat_arg_r_ctx_sim_paths,
    mat_arg_ctx_sim_union_arg,
    mat_arg_ctx_sim_diff_arg,
    mat_arg_ctx_sim_inters_arg,
    mat_arg_ctx_sim_l_minus_r_arg,
    mat_arg_ctx_sim_r_minus_l_arg,
]
mat = sparse.hstack(
    [mat] + [block.astype(np.float64) for block in feature_blocks],
    format="csr",
)
mat = mat[:, 1:]  # remove the first dummy vector

# Train and evaluate the classifier on the stacked feature matrix.
# NOTE(review): other call sites in this file pass the triples as
# `d_triples=`; confirm `d_args` is the keyword run_classification_test expects.
model = tc.run_classification_test(
    mat,
    true_labels,
    binarize=True,
    percentage_train=0.8,
    print_train_test_set_stat=True,
    test_thresholds=False,
    random_seed=623519,
    d_args=d_triples,
)

# Pair every feature name with its learned coefficient and rank the pairs by
# coefficient, largest first.
# NOTE(review): `names` comes from a single vocabulary while `mat` concatenates
# many feature blocks and has column 0 dropped; zip() silently stops at the
# shorter input, so confirm the names line up with the coefficient columns.
names = d_paths_ctx._id2w
l = zip(names, model.coef_[0])
ls = sorted(l, key=lambda pair: pair[1], reverse=True)
# Optional similarity feature blocks (currently disabled):
# mat = sparse.hstack((mat, mb_sim_intersect_w1_w2.astype(np.float64)))
# mat = sparse.hstack((mat, mb_sim_minus_w1_w2.astype(np.float64)))
# mat = sparse.hstack((mat, mb_sim_minus_w2_w1.astype(np.float64)))

# pairs <set operation> single noun

# Fixed seed handed to the classification run below.
rand_seed = 623519
# rand_seed = 234123  # alternative seed

logging.info("training classifier and predicting labels")

# Convert to CSR and drop column 0 before training
# (presumably a dummy placeholder column -- confirm against where mat is built).
mat = mat.tocsr()[:, 1:]

model = tc.run_classification_test(
    mat,
    y_true,
    binarize=True,
    percentage_train=0.8,
    print_train_test_set_stat=True,
    test_thresholds=False,
    random_seed=rand_seed,
    d_triples=d_triples,
)

# Disabled inspection helpers: rank features by learned coefficient ...
# names = d_ctx_word._id2w
# l = zip(names, model.coef_[0])
# ls = sorted(l, reverse=True, key=lambda x: x[1])

# ... and list the non-zero features of a single example row.
# interesting_id = 10
# names = d_ctx_word._id2w
# x = zip(names, np.squeeze(np.array(mat[interesting_id, :].todense())))
# x = sorted([(x, i) for (i, x) in enumerate(x) if x[1] > 0],
#            key=lambda x: x[0][1], reverse=True)

logging.info("inspect some of the features")
# Optional similarity feature blocks (currently disabled):
# mat = sparse.hstack((mat, mb_sim_w2.astype(np.float64)))
# mat = sparse.hstack((mat, mb_sim_union_w1_w2.astype(np.float64)))
# mat = sparse.hstack((mat, mb_sim_diff_w1_w2.astype(np.float64)))
# mat = sparse.hstack((mat, mb_sim_intersect_w1_w2.astype(np.float64)))
# mat = sparse.hstack((mat, mb_sim_minus_w1_w2.astype(np.float64)))
# mat = sparse.hstack((mat, mb_sim_minus_w2_w1.astype(np.float64)))

# pairs <set operation> single noun

# Fixed seed handed to the classification run below.
rand_seed = 623519
# rand_seed = 234123  # alternative seed

logging.info("training classifier and predicting labels")

# Convert to CSR and drop column 0 before training
# (presumably a dummy placeholder column -- confirm against where mat is built).
mat = mat.tocsr()[:, 1:]

model = tc.run_classification_test(
    mat,
    y_true,
    binarize=True,
    percentage_train=0.8,
    print_train_test_set_stat=True,
    test_thresholds=False,
    random_seed=rand_seed,
    d_triples=d_triples,
)

# Disabled inspection helpers: rank features by learned coefficient ...
# names = d_ctx_word._id2w
# l = zip(names, model.coef_[0])
# ls = sorted(l, reverse=True, key=lambda x: x[1])

# ... and list the non-zero features of a single example row.
# interesting_id = 10
# names = d_ctx_word._id2w
# x = zip(names, np.squeeze(np.array(mat[interesting_id, :].todense())))
# x = sorted([(x, i) for (i, x) in enumerate(x) if x[1] > 0],
#            key=lambda x: x[0][1], reverse=True)

logging.info("inspect some of the features")

# replicate hstacking here to attain names
# names = d_ctx_word._id2w
# The name list concatenates the pair vocabulary followed by the word
# vocabulary twice.
# NOTE(review): column 0 of `mat` was dropped above; confirm this name list
# lines up with the remaining columns.
names = d_ctx_pair._id2w + d_ctx_word._id2w + d_ctx_word._id2w
# Append three groups of feature blocks to `mat`: similar-argument features,
# example-context features and similar-context features.
#
# All blocks are stacked in one hstack call: stacking them pair by pair
# re-copies the whole accumulated matrix on every call and needs intermediate
# format conversions, while a single call yields the same columns in the same
# order. Each block is still cast to float64 individually, as before.
feature_blocks = [
    mat_sim_arg_l,
    mat_sim_arg_r,
    mat_sim_union_arg,
    mat_sim_diff_arg,
    mat_sim_inters_arg,
    mat_sim_l_minus_r_arg,
    mat_sim_r_minus_l_arg,
    # feat from example contexts
    mat_arg_l_ctx_paths,
    mat_arg_r_ctx_paths,
    mat_arg_ctx_union_arg,
    mat_arg_ctx_diff_arg,
    mat_arg_ctx_inters_arg,
    mat_arg_ctx_l_minus_r_arg,
    mat_arg_ctx_r_minus_l_arg,
    # feat from similar context examples
    mat_arg_l_ctx_sim_paths,
    mat_arg_r_ctx_sim_paths,
    mat_arg_ctx_sim_union_arg,
    mat_arg_ctx_sim_diff_arg,
    mat_arg_ctx_sim_inters_arg,
    mat_arg_ctx_sim_l_minus_r_arg,
    mat_arg_ctx_sim_r_minus_l_arg,
]
mat = sparse.hstack(
    [mat] + [block.astype(np.float64) for block in feature_blocks],
    format="csr",
)
mat = mat[:, 1:]  # remove the first dummy vector

# Train and evaluate the classifier on the stacked feature matrix.
# NOTE(review): other call sites in this file pass the triples as
# `d_triples=`; confirm `d_args` is the keyword run_classification_test expects.
model = tc.run_classification_test(
    mat,
    true_labels,
    binarize=True,
    percentage_train=0.8,
    print_train_test_set_stat=True,
    test_thresholds=False,
    random_seed=623519,
    d_args=d_triples,
)

# Pair every feature name with its learned coefficient and rank the pairs by
# coefficient, largest first.
# NOTE(review): `names` comes from a single vocabulary while `mat` concatenates
# many feature blocks and has column 0 dropped; zip() silently stops at the
# shorter input, so confirm the names line up with the coefficient columns.
names = d_paths_ctx._id2w
l = zip(names, model.coef_[0])
ls = sorted(l, key=lambda pair: pair[1], reverse=True)