def getDocStats(input_dir, body_file, summ_file):
    """ report per-document graph statistics: average total and oracle-selected node/edge counts """
    corpus = buildCorpus(os.path.join(input_dir, body_file),
                         os.path.join(input_dir, summ_file),
                         w_exp=True)
    num_docs = 0
    total_nodes = 0
    total_edges = 0
    selected_nodes = 0
    selected_edges = 0

    for inst in corpus:
        num_docs += 1
        my_nodes, oracle_nodes, _ = inst.nodes
        my_edges, oracle_edges = inst.edges
        total_nodes += len(my_nodes)
        total_edges += len(my_edges)
        selected_nodes += len(oracle_nodes)
        selected_edges += len(oracle_edges)

    # use float division so the averages are not truncated to integers
    print 'avg nodes: %.1f' % (float(total_nodes) / num_docs)
    print 'avg edges: %.1f' % (float(total_edges) / num_docs)
    print 'selected nodes: %.1f' % (float(selected_nodes) / num_docs)
    print 'selected edges: %.1f' % (float(selected_edges) / num_docs)
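# Usage sketch for getDocStats (assumption: the directory path below is an
# illustrative placeholder; the file names mirror the dev split used in the
# __main__ block of the feature extractor):
#
#   getDocStats('/path/to/dev/',
#               'aligned-amr-release-1.0-dev-proxy-body.txt',
#               'aligned-amr-release-1.0-dev-proxy-summary.txt')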
def test(body_file, summ_file, param_file, oracle_len, w_exp):
    """ run summarizer, perform structured prediction """
    logger.debug('start testing...')
    logger.debug('[settings]: len_%s_exp_%d' % (oracle_len, w_exp))
    corpus = buildCorpus(body_file, summ_file, w_exp)

    # load parameters from file
    decoder = Decoder()
    decoder.weights.load(param_file)

    # perform structured prediction
    estimator = ParamEstimator()
    estimator.predict(decoder, corpus, oracle_len)
    return
def train(body_file, summ_file, param_file, loss_func, num_passes, oracle_len, w_exp):
    """ run summarizer, learn structured prediction parameters """
    logger.debug('start training...')
    logger.debug('[settings]: %s_%d_passes_len_%s_exp_%d'
                 % (loss_func, num_passes, oracle_len, w_exp))
    corpus = buildCorpus(body_file, summ_file, w_exp)

    # learn parameters
    decoder = Decoder()
    estimator = ParamEstimator()
    final_weights = estimator.learnParamsAdaGrad(decoder, corpus, param_file,
                                                 loss_func, num_passes, oracle_len)

    # output parameters to file
    with codecs.open(param_file, 'w', 'utf-8') as outfile:
        outfile.write('#num_passes#: %d\n' % num_passes)
        outfile.write('%s\n' % final_weights.toString())
    return
def summ(body_file, summ_file, param_file, oracle_len, w_exp, jamr=False):
    """ run summarizer, decode and write out summaries """
    logger.debug('start summarizing...')
    logger.debug('[settings]: len_%s_exp_%d' % (oracle_len, w_exp))
    corpus = buildCorpus(body_file, summ_file, w_exp)

    # load parameters from file
    decoder = Decoder()
    decoder.weights.load(param_file)

    # perform structured prediction and write summaries
    estimator = ParamEstimator()
    output_folder = param_file.replace('params', 'summ')
    if jamr:
        output_folder = param_file.replace('params', 'jamr_summ')
    estimator.summarize(decoder, corpus, oracle_len, output_folder)
    return
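# Usage sketch for the three entry points above (assumption: the file paths,
# the 'perceptron' loss name, and the hyperparameter values are illustrative
# placeholders, not values taken from this repository):
#
#   param_file = 'params_perceptron_2_passes'
#   train('train-body.txt', 'train-summary.txt', param_file,
#         loss_func='perceptron', num_passes=2, oracle_len='gold', w_exp=1)
#   test('dev-body.txt', 'dev-summary.txt', param_file,
#        oracle_len='gold', w_exp=1)
#   summ('dev-body.txt', 'dev-summary.txt', param_file,
#        oracle_len='gold', w_exp=1, jamr=False)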
    feat_vec = FeatureVector()

    # named entity or not
    feat_vec[('n', 'nam-ent')] = 1.0 if '_' in node.concept else 0.0
    feat_vec[('n', 'date-ent')] = 1.0 if (node.concept).startswith('date-entity') else 0.0
    return feat_vec


if __name__ == '__main__':

    input_dir = '/Users/user/Data/SemanticSumm/Proxy/gold/split/dev/'
    body_file = 'aligned-amr-release-1.0-dev-proxy-body.txt'
    summ_file = 'aligned-amr-release-1.0-dev-proxy-summary.txt'

    corpus = buildCorpus(os.path.join(input_dir, body_file),
                         os.path.join(input_dir, summ_file))
    feat_extr = FeatureExtractor()
    feat_vec = FeatureVector()

    for inst in corpus:
        curr_filename = inst.filename
        my_nodes, s_nodes = inst.nodes
        my_edges, s_edges = inst.edges

        # logger.debug('extracting features for file: %s' % curr_filename)
        # for k_edge, v_edge in my_edges.iteritems():
        #     for tag in [0,1]:
        #         feat_vec += feat_extr.getEdgeFeats(k_edge, v_edge, tag, curr_filename, my_nodes, my_edges)

        logger.debug('extracting features for file: %s' % curr_filename)
        for k_node, v_node in my_nodes.iteritems():