# Note: the prefix d_ indicates a dictionary, m_ a matrix, mb_ a boolean matrix
num_triples = len(d_triples)

if ctx:
    # Context features of the (left, right) word pair.
    logging.info('loading context features for word pairs')
    d_ctx_pair = td.Dict()
    m_ctx_pair = tm.arg_l_arg_r_asjo_matrix(
        d_triples._rtuple2ids,
        fn_ctx_pair,
        num_triples,
        col_indices=d_ctx_pair,
        mmfile_presuffix='_pairs',
        reload=refresh,
    )

    # Context features of the single words; both matrices share d_ctx_word.
    # The transform keeps the 20 entries with the largest float(entry[1]).
    logging.info('loading context features for words')
    d_ctx_word = td.Dict()
    m_ctx_w1 = tm.arg_asjo_matrix(
        d_triples._m2ids,
        d_ctx_word,
        fn_ctx_word,
        num_triples,
        transform_w2sig=lambda w2sig: sorted(
            w2sig, key=lambda entry: float(entry[1]), reverse=True
        )[:20],
        mmfile_presuffix='_w1',
        reload=refresh,
    )
    m_ctx_w2 = tm.arg_asjo_matrix(
        d_triples._r2ids,
        d_ctx_word,
        fn_ctx_word,
        num_triples,
        transform_w2sig=lambda w2sig: sorted(
            w2sig, key=lambda entry: float(entry[1]), reverse=True
        )[:20],
        mmfile_presuffix='_w2',
        reload=refresh,
    )

    # adjust ( context ) matrix dimensions, if they vary
    # A COO matrix is converted to DOK before it is resized in place.
    if m_ctx_w1.shape[1] < m_ctx_w2.shape[1]:
        if sparse.isspmatrix_coo(m_ctx_w1):
            m_ctx_w1 = m_ctx_w1.todok()
        m_ctx_w1.resize(m_ctx_w2.shape)
    if m_ctx_w2.shape[1] < m_ctx_w1.shape[1]:
        if sparse.isspmatrix_coo(m_ctx_w2):
            m_ctx_w2 = m_ctx_w2.todok()
        m_ctx_w2.resize(m_ctx_w1.shape)
def _top20_by_score(w2sig):
    """Return the 20 entries of `w2sig` with the largest ``float(entry[1])``."""
    return sorted(w2sig, key=lambda entry: float(entry[1]), reverse=True)[:20]


def _align_columns_as_coo(mat_l, mat_r):
    """Resize the narrower sparse matrix to the wider one's shape; return both as COO."""
    # A COO matrix is converted to DOK before the in-place resize.
    if mat_l.shape[1] < mat_r.shape[1]:
        if sparse.isspmatrix_coo(mat_l):
            mat_l = mat_l.todok()
        mat_l.resize(mat_r.shape)
    if mat_r.shape[1] < mat_l.shape[1]:
        if sparse.isspmatrix_coo(mat_r):
            mat_r = mat_r.todok()
        mat_r.resize(mat_l.shape)
    if not sparse.isspmatrix_coo(mat_l):
        mat_l = mat_l.tocoo()
    if not sparse.isspmatrix_coo(mat_r):
        mat_r = mat_r.tocoo()
    return mat_l, mat_r


def _boolean_set_matrices(mat_l, mat_r):
    """Derive boolean combination matrices from two equally shaped sparse matrices.

    Returns a 7-tuple in this order: left, right, union, intersection,
    difference, left-minus-right, right-minus-left.
    """
    mat_l = mat_l.astype(bool)
    mat_r = mat_r.astype(bool)
    mat_union = mat_l + mat_r
    mat_diff = mat_l != mat_r
    # NOTE(review): the subtractions below are applied to boolean sparse
    # matrices and are meant as set operations (see the labels used by the
    # caller) -- confirm this arithmetic against the installed SciPy version.
    mat_inters = mat_union - mat_diff
    mat_l_minus_r = mat_union - mat_r
    mat_r_minus_l = mat_union - mat_l
    return (mat_l, mat_r, mat_union, mat_inters, mat_diff,
            mat_l_minus_r, mat_r_minus_l)


def load_matrices(d_triples):
    """Build the list of feature matrices for the triples in `d_triples`.

    The matrices are created through the `tm` helper functions; the data
    sources (`svo_counts`, `svo_dt`, `svo_flipped_counts`, `svo_flipped_dt`,
    `svo_lda_w2t`, `svo_flipped_lda_w2t`) are names expected to be defined
    elsewhere in this module.

    Args:
        d_triples: triple collection; must support ``len()`` and expose the
            `_m2ids`, `_r2ids` and `_rtuple2ids` attributes that are handed
            to the `tm` functions.

    Returns:
        A list of ``(label, matrix, column_dict)`` tuples in a fixed order.
        `column_dict` is the `td.Dict` passed to the `tm` function that built
        the matrix, or ``None`` for the topic matrices.
    """
    matrices = []

    # just the left argument as feature
    logging.info("creating w1 as feature matrix")
    d_w1 = td.Dict()
    w1_mat = tm.w1Asfeature(d_triples, d_w1)
    matrices.append(("W1 as Feature", w1_mat, d_w1))

    # just the right argument as feature
    logging.info("creating w2 as feature matrix")
    d_w2 = td.Dict()
    w2_mat = tm.w2Asfeature(d_triples, d_w2)
    matrices.append(("W2 as Feature", w2_mat, d_w2))

    # relation pair features
    logging.info("loading paths between argument pairs")
    d_paths = td.Dict()
    mat_paths = tm.arg_l_arg_r_asjo_matrix(
        d_triples._rtuple2ids,
        svo_flipped_counts,
        len(d_triples),
        col_indices=d_paths,
        mmfile_presuffix=".paths",
        reload=False,
    )
    matrices.append(("paths between ArgL and ArgR", mat_paths, d_paths))

    logging.info("loading similar argument pairs")
    d_sim_pairs = td.Dict()
    mat_sim_pairs = tm.arg_l_arg_r_asjo_matrix(
        d_triples._rtuple2ids,
        svo_flipped_dt,
        len(d_triples),
        col_indices=d_sim_pairs,
        transform_w2sig=_top20_by_score,
        mmfile_presuffix=".simpairs",
        reload=False,
    )
    matrices.append(("similar ArgL - ArgR pairs", mat_sim_pairs, d_sim_pairs))

    # context features; both argument matrices share the column dict d_ctx
    logging.info("loading argument context matrices")
    d_ctx = td.Dict()
    mat_arg_l_ctx = tm.arg_asjo_matrix(
        d_triples._m2ids,
        d_ctx,
        svo_counts,
        len(d_triples),
        transform_w2sig=_top20_by_score,
        mmfile_presuffix=".ctx_w1",
        reload=False,
    )
    mat_arg_r_ctx = tm.arg_asjo_matrix(
        d_triples._r2ids,
        d_ctx,
        svo_counts,
        len(d_triples),
        transform_w2sig=_top20_by_score,
        mmfile_presuffix=".ctx_w2",
        reload=False,
    )

    # create some extra matrices
    logging.info("creating argument context intersection and set minus matrices.")
    # adjust dimensions, in case they are different
    mat_arg_l_ctx, mat_arg_r_ctx = _align_columns_as_coo(mat_arg_l_ctx, mat_arg_r_ctx)
    ctx_labels = (
        "Contexts of ArgL",
        "Contexts of ArgR",
        "Contexts of ArgL or ArgR",
        "Contexts of ArgL and ArgR",
        "Contexts difference of ArgL and ArgR",
        "Contexts of ArgL but not ArgR",
        "Contexts of ArgR but not ArgL",
    )
    ctx_matrices = _boolean_set_matrices(mat_arg_l_ctx, mat_arg_r_ctx)
    for label, mat in zip(ctx_labels, ctx_matrices):
        # The context matrices are stored as float64.
        matrices.append((label, mat.astype(np.float64), d_ctx))

    # topic features (no column dictionary is attached to these)
    logging.info("loading lda feature matrices.")
    mat_topic = tm.arg_l_arg_r_to_topic_matrix(
        d_triples._rtuple2ids,
        svo_flipped_lda_w2t,
        len(d_triples),
        mmfile_presuffix=".bless.topic_pairs",
        reload=False,
    )
    matrices.append(("Topic of ArgL - ArgR pair", mat_topic, None))
    mat_arg_l_topic = tm.arg_to_topic_matrix(
        d_triples._m2ids,
        svo_lda_w2t,
        len(d_triples),
        mmfile_presuffix=".bless.topic_w1",
        reload=False,
    )
    matrices.append(("Topic of ArgL", mat_arg_l_topic, None))
    mat_arg_r_topic = tm.arg_to_topic_matrix(
        d_triples._r2ids,
        svo_lda_w2t,
        len(d_triples),
        mmfile_presuffix=".bless.topic_w2",
        reload=False,
    )
    matrices.append(("Topic of ArgR", mat_arg_r_topic, None))

    # distributionally similar args for each arg
    logging.info("loading similar arguments.")
    d_arg = td.Dict()
    mat_sim_arg_l = tm.arg_asjo_matrix(
        d_triples._m2ids,
        d_arg,
        svo_dt,
        len(d_triples),
        transform_w2sig=_top20_by_score,
        mmfile_presuffix=".sim_w1",
        reload=False,
    )
    mat_sim_arg_r = tm.arg_asjo_matrix(
        d_triples._r2ids,
        d_arg,
        svo_dt,
        len(d_triples),
        transform_w2sig=_top20_by_score,
        mmfile_presuffix=".sim_w2",
        reload=False,
    )

    # create some extra matrices
    logging.info("creating similar arguments intersection and set minus matrices.")
    # adjust dimensions, in case they are different
    mat_sim_arg_l, mat_sim_arg_r = _align_columns_as_coo(mat_sim_arg_l, mat_sim_arg_r)
    sim_labels = (
        "Similar Args to ArgL",
        "Similar Args to ArgR",
        "Similar Args to ArgL or ArgR",
        "Similar Args to ArgL and ArgR",
        "Difference of similar Args to ArgL and ArgR",
        "Similar Args to ArgL but not to ArgR",
        "Similar Args to ArgR but not to ArgL",
    )
    sim_matrices = _boolean_set_matrices(mat_sim_arg_l, mat_sim_arg_r)
    for label, mat in zip(sim_labels, sim_matrices):
        # NOTE(review): unlike the context matrices, these are appended with
        # their boolean dtype (no float64 cast); kept as in the original --
        # confirm whether that difference is intended.
        matrices.append((label, mat, d_arg))

    return matrices
# Context features of the (left, right) word pair.
logging.info('loading context features for word pairs')
d_ctx_pair = td.Dict()
m_ctx_pair = tm.arg_l_arg_r_asjo_matrix(
    d_triples._rtuple2ids,
    fn_ctx_pair,
    num_triples,
    col_indices=d_ctx_pair,
    mmfile_presuffix='_pairs',
    reload=refresh,
)

# Context features of the single words; both matrices share d_ctx_word.
# The transform keeps the 20 entries with the largest float(entry[1]).
logging.info('loading context features for words')
d_ctx_word = td.Dict()
m_ctx_w1 = tm.arg_asjo_matrix(
    d_triples._m2ids,
    d_ctx_word,
    fn_ctx_word,
    num_triples,
    transform_w2sig=lambda w2sig: sorted(
        w2sig, key=lambda entry: float(entry[1]), reverse=True
    )[:20],
    mmfile_presuffix='_w1',
    reload=refresh,
)
m_ctx_w2 = tm.arg_asjo_matrix(
    d_triples._r2ids,
    d_ctx_word,
    fn_ctx_word,
    num_triples,
    transform_w2sig=lambda w2sig: sorted(
        w2sig, key=lambda entry: float(entry[1]), reverse=True
    )[:20],
    mmfile_presuffix='_w2',
    reload=refresh,
)
# adjust ( context ) matrix dimensions, if they vary
def _top20_by_score(w2sig):
    """Return the 20 entries of `w2sig` with the largest ``float(entry[1])``."""
    return sorted(w2sig, key=lambda entry: float(entry[1]), reverse=True)[:20]


def _align_columns_as_coo(mat_l, mat_r):
    """Resize the narrower sparse matrix to the wider one's shape; return both as COO."""
    # A COO matrix is converted to DOK before the in-place resize.
    if mat_l.shape[1] < mat_r.shape[1]:
        if sparse.isspmatrix_coo(mat_l):
            mat_l = mat_l.todok()
        mat_l.resize(mat_r.shape)
    if mat_r.shape[1] < mat_l.shape[1]:
        if sparse.isspmatrix_coo(mat_r):
            mat_r = mat_r.todok()
        mat_r.resize(mat_l.shape)
    if not sparse.isspmatrix_coo(mat_l):
        mat_l = mat_l.tocoo()
    if not sparse.isspmatrix_coo(mat_r):
        mat_r = mat_r.tocoo()
    return mat_l, mat_r


def _boolean_set_matrices(mat_l, mat_r):
    """Derive boolean combination matrices from two equally shaped sparse matrices.

    Returns a 7-tuple in this order: left, right, union, intersection,
    difference, left-minus-right, right-minus-left.
    """
    mat_l = mat_l.astype(bool)
    mat_r = mat_r.astype(bool)
    mat_union = mat_l + mat_r
    mat_diff = mat_l != mat_r
    # NOTE(review): the subtractions below are applied to boolean sparse
    # matrices and are meant as set operations (see the labels used by the
    # caller) -- confirm this arithmetic against the installed SciPy version.
    mat_inters = mat_union - mat_diff
    mat_l_minus_r = mat_union - mat_r
    mat_r_minus_l = mat_union - mat_l
    return (mat_l, mat_r, mat_union, mat_inters, mat_diff,
            mat_l_minus_r, mat_r_minus_l)


def load_matrices(d_triples):
    """Build the list of feature matrices for the triples in `d_triples`.

    The matrices are created through the `tm` helper functions; the data
    sources (`svo_counts`, `svo_dt`, `svo_flipped_counts`, `svo_flipped_dt`,
    `svo_lda_w2t`, `svo_flipped_lda_w2t`) are names expected to be defined
    elsewhere in this module.

    Args:
        d_triples: triple collection; must support ``len()`` and expose the
            `_m2ids`, `_r2ids` and `_rtuple2ids` attributes that are handed
            to the `tm` functions.

    Returns:
        A list of ``(label, matrix, column_dict)`` tuples in a fixed order.
        `column_dict` is the `td.Dict` passed to the `tm` function that built
        the matrix, or ``None`` for the topic matrices.
    """
    matrices = []

    # just the left argument as feature
    logging.info('creating w1 as feature matrix')
    d_w1 = td.Dict()
    w1_mat = tm.w1Asfeature(d_triples, d_w1)
    matrices.append(('W1 as Feature', w1_mat, d_w1))

    # just the right argument as feature
    logging.info('creating w2 as feature matrix')
    d_w2 = td.Dict()
    w2_mat = tm.w2Asfeature(d_triples, d_w2)
    matrices.append(('W2 as Feature', w2_mat, d_w2))

    # relation pair features
    logging.info('loading paths between argument pairs')
    d_paths = td.Dict()
    mat_paths = tm.arg_l_arg_r_asjo_matrix(
        d_triples._rtuple2ids,
        svo_flipped_counts,
        len(d_triples),
        col_indices=d_paths,
        mmfile_presuffix='.paths',
        reload=False,
    )
    matrices.append(('paths between ArgL and ArgR', mat_paths, d_paths))

    logging.info('loading similar argument pairs')
    d_sim_pairs = td.Dict()
    mat_sim_pairs = tm.arg_l_arg_r_asjo_matrix(
        d_triples._rtuple2ids,
        svo_flipped_dt,
        len(d_triples),
        col_indices=d_sim_pairs,
        transform_w2sig=_top20_by_score,
        mmfile_presuffix='.simpairs',
        reload=False,
    )
    matrices.append(('similar ArgL - ArgR pairs', mat_sim_pairs, d_sim_pairs))

    # context features; both argument matrices share the column dict d_ctx
    logging.info('loading argument context matrices')
    d_ctx = td.Dict()
    mat_arg_l_ctx = tm.arg_asjo_matrix(
        d_triples._m2ids,
        d_ctx,
        svo_counts,
        len(d_triples),
        transform_w2sig=_top20_by_score,
        mmfile_presuffix='.ctx_w1',
        reload=False,
    )
    mat_arg_r_ctx = tm.arg_asjo_matrix(
        d_triples._r2ids,
        d_ctx,
        svo_counts,
        len(d_triples),
        transform_w2sig=_top20_by_score,
        mmfile_presuffix='.ctx_w2',
        reload=False,
    )

    # create some extra matrices
    logging.info('creating argument context intersection and set minus matrices.')
    # adjust dimensions, in case they are different
    mat_arg_l_ctx, mat_arg_r_ctx = _align_columns_as_coo(mat_arg_l_ctx, mat_arg_r_ctx)
    ctx_labels = (
        'Contexts of ArgL',
        'Contexts of ArgR',
        'Contexts of ArgL or ArgR',
        'Contexts of ArgL and ArgR',
        'Contexts difference of ArgL and ArgR',
        'Contexts of ArgL but not ArgR',
        'Contexts of ArgR but not ArgL',
    )
    ctx_matrices = _boolean_set_matrices(mat_arg_l_ctx, mat_arg_r_ctx)
    for label, mat in zip(ctx_labels, ctx_matrices):
        # The context matrices are stored as float64.
        matrices.append((label, mat.astype(np.float64), d_ctx))

    # topic features (no column dictionary is attached to these)
    logging.info('loading lda feature matrices.')
    mat_topic = tm.arg_l_arg_r_to_topic_matrix(
        d_triples._rtuple2ids,
        svo_flipped_lda_w2t,
        len(d_triples),
        mmfile_presuffix='.bless.topic_pairs',
        reload=False,
    )
    matrices.append(('Topic of ArgL - ArgR pair', mat_topic, None))
    mat_arg_l_topic = tm.arg_to_topic_matrix(
        d_triples._m2ids,
        svo_lda_w2t,
        len(d_triples),
        mmfile_presuffix='.bless.topic_w1',
        reload=False,
    )
    matrices.append(('Topic of ArgL', mat_arg_l_topic, None))
    mat_arg_r_topic = tm.arg_to_topic_matrix(
        d_triples._r2ids,
        svo_lda_w2t,
        len(d_triples),
        mmfile_presuffix='.bless.topic_w2',
        reload=False,
    )
    matrices.append(('Topic of ArgR', mat_arg_r_topic, None))

    # distributionally similar args for each arg
    logging.info('loading similar arguments.')
    d_arg = td.Dict()
    mat_sim_arg_l = tm.arg_asjo_matrix(
        d_triples._m2ids,
        d_arg,
        svo_dt,
        len(d_triples),
        transform_w2sig=_top20_by_score,
        mmfile_presuffix='.sim_w1',
        reload=False,
    )
    mat_sim_arg_r = tm.arg_asjo_matrix(
        d_triples._r2ids,
        d_arg,
        svo_dt,
        len(d_triples),
        transform_w2sig=_top20_by_score,
        mmfile_presuffix='.sim_w2',
        reload=False,
    )

    # create some extra matrices
    logging.info('creating similar arguments intersection and set minus matrices.')
    # adjust dimensions, in case they are different
    mat_sim_arg_l, mat_sim_arg_r = _align_columns_as_coo(mat_sim_arg_l, mat_sim_arg_r)
    sim_labels = (
        'Similar Args to ArgL',
        'Similar Args to ArgR',
        'Similar Args to ArgL or ArgR',
        'Similar Args to ArgL and ArgR',
        'Difference of similar Args to ArgL and ArgR',
        'Similar Args to ArgL but not to ArgR',
        'Similar Args to ArgR but not to ArgL',
    )
    sim_matrices = _boolean_set_matrices(mat_sim_arg_l, mat_sim_arg_r)
    for label, mat in zip(sim_labels, sim_matrices):
        # NOTE(review): unlike the context matrices, these are appended with
        # their boolean dtype (no float64 cast); kept as in the original --
        # confirm whether that difference is intended.
        matrices.append((label, mat, d_arg))

    return matrices