def _simple_cr(self, scu, topic_id):
    """
    Process a single docset and write the result to the corresponding output file
    :param scu: list of summary content units, each carrying a score feature
    :param topic_id: topic id of this docset
    """
    summary = self._get_summarizations(scu)
    write(summary,
          topic_id,
          output_folder_name=self.output_folder_name,
          over_write=True)

def save_data(args, out_data):
    if args.test_rate is not None:
        # Write to train and test files
        save_to_train_test(out_data, args)
    if args.kfold is not None:
        # Write to k-fold train and test files
        save_to_kfold(out_data, args)
    if args.out_file is not None:
        # Write data to file
        writer.write(out_data, args.out_file, args.out_format, args.user_col,
                     args.skill_col, args.correct_col, args.exercise_col)
        print('Wrote', args.out_file)

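# Usage sketch (hypothetical values, not the project's CLI): save_data only
# reads attributes off the args object, so for a quick illustration it can be
# driven with a bare argparse.Namespace holding the column names and split
# options referenced above.
from argparse import Namespace

example_args = Namespace(in_file='data.csv', out_file='data_out',
                         out_format='csv', user_col='user_id',
                         skill_col='skill_id', correct_col='correct',
                         exercise_col='exercise_id', test_rate=0.2,
                         validation_rate=None, kfold=None, shuffle=True)
# save_data(example_args, out_data)  # out_data: the preprocessed DataFrame
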
def save_to_kfold(out_data, args):
    out_file = args.out_file or args.in_file
    kfold = KFold(n_splits=args.kfold, shuffle=args.shuffle)
    users = out_data[args.user_col].unique()
    for i, (train_i, test_i) in enumerate(kfold.split(users)):
        # Train-test split
        for split_name, split_index in [('train', train_i), ('test', test_i)]:
            filename = f'{out_file}.{split_name}.{i}'
            split_data = out_data[out_data[args.user_col].isin(
                users[split_index])].copy()
            writer.write(split_data, filename, args.out_format, args.user_col,
                         args.skill_col, args.correct_col, args.exercise_col)
            print('Wrote', filename)

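# Usage sketch of the user-level k-fold split performed by save_to_kfold,
# with toy data and hypothetical column names; writer.write is replaced by a
# print so the snippet stands alone.
import pandas as pd
from sklearn.model_selection import KFold

toy = pd.DataFrame({'user_id': [1, 1, 2, 2, 3, 3],
                    'skill_id': [10, 11, 10, 12, 11, 12],
                    'correct': [1, 0, 1, 1, 0, 1]})
toy_users = toy['user_id'].unique()
for i, (train_i, test_i) in enumerate(
        KFold(n_splits=3, shuffle=True, random_state=0).split(toy_users)):
    # Split over unique user ids, then select every row of those users, so a
    # user's whole interaction sequence stays inside one fold.
    train_df = toy[toy['user_id'].isin(toy_users[train_i])]
    test_df = toy[toy['user_id'].isin(toy_users[test_i])]
    print(f'fold {i}: train users {sorted(toy_users[train_i])}, '
          f'test users {sorted(toy_users[test_i])}')
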
def save_to_train_test(out_data, args):
    out_file = args.out_file or args.in_file
    if args.test_rate > .5:
        print("Warning: train-test split rate is above .5, which means the "
              "test set will be larger than the train set.")
    n_test = int(len(out_data) * args.test_rate)
    test_out_file = out_file + '.test'
    writer.write(out_data.iloc[:n_test], test_out_file, args.out_format,
                 args.user_col, args.skill_col, args.correct_col,
                 args.exercise_col)
    print('Wrote', test_out_file)
    n_valid = 0
    if args.validation_rate is not None and 0 < args.validation_rate < 1:
        n_valid = int(len(out_data) * args.validation_rate)
        validation_out_file = out_file + '.valid'
        writer.write(out_data.iloc[n_test:n_test + n_valid],
                     validation_out_file, args.out_format, args.user_col,
                     args.skill_col, args.correct_col, args.exercise_col)
        print('Wrote', validation_out_file)
    train_out_file = out_file + '.train'
    writer.write(out_data.iloc[n_test + n_valid:], train_out_file,
                 args.out_format, args.user_col, args.skill_col,
                 args.correct_col, args.exercise_col)
    print('Wrote', train_out_file)

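# Worked example of the slicing in save_to_train_test above (hypothetical
# numbers): with 1000 rows, test_rate=.2 and validation_rate=.1, the data is
# written as
#   rows [0, 200)    -> <out_file>.test   (n_test  = int(1000 * .2) = 200)
#   rows [200, 300)  -> <out_file>.valid  (n_valid = int(1000 * .1) = 100)
#   rows [300, 1000) -> <out_file>.train  (the remaining 700 rows)
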
def test_to_asc(self):
    filepath = tmpdir + '/test.asc'
    writer.write(data1, filepath, format='asc')
    expected = """
    1
    1
    0
    3
    1,2,1
    1,0,0
    2
    3,4
    1,0
    """.split()
    with open(filepath) as f:
        actual = f.read().splitlines()
    self.assertEqual(expected, actual)
    os.remove(filepath)

def _compression_ilp(self, scu, topic_id):
    """
    Content realization by combining ILP and sentence compression
    :param scu: list[Sentence]
    :param topic_id: str
    """
    # Get pruned sentences
    all_candidates = []
    group_lengths = []
    for item in scu:
        candidates = self._generate_compressed_candidate(item)
        all_candidates += candidates
        group_lengths.append(len(candidates))

    bigram_dict, bigram_set, bigram_freq = self._get_bigrams(all_candidates)
    n_bigram = len(bigram_set)
    n_sent = len(all_candidates)
    bigram_list = list(bigram_set)  # the order of bigram variables

    # Calculate weights in objective function
    # Use number of occurrences of bigram as its weight
    weight = []
    for i in range(n_bigram):
        weight.append(bigram_freq[bigram_list[i]])
    # Give a very small weight to every sentence to reduce randomness
    weight_s = np.zeros(n_sent)
    delta = 0.005
    for i in range(n_sent):
        weight_s[i] = delta * (n_sent - i)
    weight = np.concatenate((np.array(weight), weight_s), axis=0)  # add s_j after c_i

    # Calculate coefs
    # Variable: c_i, s_j
    # First, calculate coefs for constraints 1 and 2.
    n_constraint = 0
    coefs_1 = []  # constraint 1, i*j rows
    coefs_2 = []  # constraint 2, i rows
    for i in range(n_bigram):
        coefs_sum_s = np.zeros(n_sent)
        for j in range(n_sent):
            coefs_c = np.zeros(n_bigram)
            coefs_s = np.zeros(n_sent)
            if bigram_list[i] in bigram_dict[j]:
                # c_i in s_j, s_j - c_i <= 0
                coefs_c[i] = -1
                coefs_s[j] = 1
                coefs_sum_s[j] = -1
                coefs_1.append(np.concatenate((coefs_c, coefs_s), axis=0))
                n_constraint += 1
        coefs_sum_c = np.zeros(n_bigram)
        coefs_sum_c[i] = 1
        coefs_2.append(np.concatenate((coefs_sum_c, coefs_sum_s), axis=0))
        n_constraint += 1

    # Constraint 3, length constraint, 1 row
    coefs_3_s = np.zeros(n_sent)
    for i in range(n_sent):
        coefs_3_s[i] = all_candidates[i].length()

    # Constraint for each group of compressed candidates
    coefs_4 = []
    cur_pos = 0  # current position
    for n_group in group_lengths:
        coefs_4_s = np.zeros(n_sent)
        coefs_4_c = np.zeros(n_bigram)
        coefs_4_s[cur_pos:cur_pos + n_group] = 1
        coefs_4.append(np.concatenate((coefs_4_c, coefs_4_s), axis=0))
        cur_pos = cur_pos + n_group

    # Use pulp to solve ILP problem
    ilp_model = pulp.LpProblem('content realization', pulp.LpMaximize)
    # Define variables
    sentences = pulp.LpVariable.dict("sentence", (i for i in range(n_sent)),
                                     lowBound=0, upBound=1, cat=pulp.LpInteger)
    concepts = pulp.LpVariable.dict("concept", (i for i in range(n_bigram)),
                                    lowBound=0, upBound=1, cat=pulp.LpInteger)
    # Add objective function
    ilp_model += pulp.lpSum([
        weight[int(key)] * concepts[key] for key in concepts
    ]), "Objective function"
    # Add length constraint
    ilp_model += pulp.lpSum(
        [coefs_3_s[int(key)] * sentences[key] for key in sentences]) <= self.max_length
    # Add constraints 1
    for coefs in coefs_1:
        ilp_model += pulp.lpSum(
            [coefs[key] * concepts[key] for key in concepts] +
            [coefs[key2 + n_bigram] * sentences[key2] for key2 in sentences]) <= 0
    # Add constraints 2
    for coefs in coefs_2:
        ilp_model += pulp.lpSum(
            [coefs[key] * concepts[key] for key in concepts] +
            [coefs[key2 + n_bigram] * sentences[key2] for key2 in sentences]) <= 0
    # Add constraints 4
    for coefs in coefs_4:
        ilp_model += pulp.lpSum(
            [coefs[key] * concepts[key] for key in concepts] +
            [coefs[key2 + n_bigram] * sentences[key2] for key2 in sentences]) <= 1

    # ilp_model.writeLP('ilp_model')  # write ilp model to file
    ilp_model.solve(pulp.PULP_CBC_CMD())
    indices = np.array([sentences[key].value() for key in sentences])
    indices[np.isnan(indices.astype(float))] = 0  # convert None to 0
    summary = [
        sent.content() for sent in np.array(all_candidates)[indices > 0.1]
    ]

    # Write result
    write(summary,
          topic_id,
          output_folder_name=self.output_folder_name,
          over_write=True)

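# Toy, self-contained sketch of the concept-coverage ILP that _compression_ilp
# and _improved_ilp build with pulp (the formulation from "A Scalable Global
# Model for Summarization"): maximize the weight of covered bigrams
# ("concepts") under a length budget, with one constraint forcing a selected
# sentence to activate its concepts and one allowing a concept only if some
# sentence containing it is selected. Sentences, weights, and the budget below
# are made-up illustrative values.
import pulp

toy_sents = {0: {'a_b', 'b_c'}, 1: {'b_c', 'c_d'}, 2: {'c_d'}}  # sentence -> bigrams
toy_lens = {0: 6, 1: 5, 2: 3}                                   # sentence lengths
toy_weight = {'a_b': 2, 'b_c': 3, 'c_d': 1}                     # concept weights
budget = 9

toy_model = pulp.LpProblem('toy_content_realization', pulp.LpMaximize)
s = pulp.LpVariable.dicts('s', toy_sents, cat=pulp.LpBinary)
c = pulp.LpVariable.dicts('c', toy_weight, cat=pulp.LpBinary)

toy_model += pulp.lpSum(toy_weight[b] * c[b] for b in toy_weight)          # objective
toy_model += pulp.lpSum(toy_lens[j] * s[j] for j in toy_sents) <= budget   # length
for b in toy_weight:
    containing = [j for j in toy_sents if b in toy_sents[j]]
    for j in containing:
        toy_model += s[j] <= c[b]                    # constraint 1: s_j - c_i <= 0
    toy_model += c[b] <= pulp.lpSum(s[j] for j in containing)   # constraint 2

toy_model.solve(pulp.PULP_CBC_CMD(msg=0))
print([j for j in toy_sents if s[j].value() > 0.5])  # selected sentence indices
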
def _improved_ilp(self, scu, topic_id):
    """
    An improved ILP algorithm for sentence realization.
    For details, please check: A Scalable Global Model for Summarization
    :param scu: list[Sentence]
    :param topic_id: topic id for this docset
    """
    # scu = self.prune_pipeline(scu, self.prune_pipe)
    bigram_dict, bigram_set, bigram_freq = self._get_bigrams(scu)
    n_bigram = len(bigram_set)
    n_sent = len(scu)

    # Calculate weights in objective function
    # Count every bigram's occurrence
    bigram_freq = {}
    delta = 0.001
    for index in bigram_dict:
        # For bigrams in each sentence[index]
        for bigram in bigram_dict[index]:
            # Bigrams in important sentences have higher weights
            if bigram not in bigram_freq:
                bigram_freq[bigram] = 1 + delta * (n_sent - index)
            else:
                bigram_freq[bigram] = bigram_freq[bigram] + 1 + delta * (n_sent - index)

    bigram_list = list(bigram_set)  # the order of bigram variables
    # Use frequency of bigram as its weight
    weight = []
    for i in range(n_bigram):
        weight.append(bigram_freq[bigram_list[i]])
    weight_s = np.zeros(n_sent)
    for i in range(n_sent):
        # Give a very small weight to every sentence
        weight_s[i] = delta * (n_sent - i)
    weight = np.concatenate((np.array(weight), weight_s), axis=0)  # add s_j after c_i

    # Calculate coefs
    # Variable: c_i, s_j
    n_constraint = 0
    coefs_1 = []  # constraint 1, i*j rows
    coefs_2 = []  # constraint 2, i rows
    for i in range(n_bigram):
        coefs_sum_s = np.zeros(n_sent)
        for j in range(n_sent):
            coefs_c = np.zeros(n_bigram)
            coefs_s = np.zeros(n_sent)
            if bigram_list[i] in bigram_dict[j]:
                # c_i in s_j, s_j - c_i <= 0
                coefs_c[i] = -1
                coefs_s[j] = 1
                coefs_sum_s[j] = -1
                coefs_1.append(np.concatenate((coefs_c, coefs_s), axis=0))
                n_constraint += 1
        coefs_sum_c = np.zeros(n_bigram)
        coefs_sum_c[i] = 1
        coefs_2.append(np.concatenate((coefs_sum_c, coefs_sum_s), axis=0))
        n_constraint += 1

    coefs_3_s = np.zeros(n_sent)  # constraint 3, length constraint, 1 row
    for i in range(n_sent):
        coefs_3_s[i] = scu[i].length()

    # Use pulp to solve ILP problem
    ilp_model = pulp.LpProblem('content realization', pulp.LpMaximize)
    # Define variables
    sentences = pulp.LpVariable.dict("sentence", (i for i in range(n_sent)),
                                     lowBound=0, upBound=1, cat=pulp.LpInteger)
    concepts = pulp.LpVariable.dict("concept", (i for i in range(n_bigram)),
                                    lowBound=0, upBound=1, cat=pulp.LpInteger)
    # Add objective function
    ilp_model += pulp.lpSum([
        weight[int(key)] * concepts[key] for key in concepts
    ]), "Objective function"
    # Add length constraint
    ilp_model += pulp.lpSum(
        [coefs_3_s[int(key)] * sentences[key] for key in sentences]) <= self.max_length
    # Add constraints 1
    for coefs in coefs_1:
        ilp_model += pulp.lpSum(
            [coefs[key] * concepts[key] for key in concepts] +
            [coefs[key2 + n_bigram] * sentences[key2] for key2 in sentences]) <= 0
    # Add constraints 2
    for coefs in coefs_2:
        ilp_model += pulp.lpSum(
            [coefs[key] * concepts[key] for key in concepts] +
            [coefs[key2 + n_bigram] * sentences[key2] for key2 in sentences]) <= 0

    # ilp_model.writeLP('ilp_model')  # write ilp model to file
    ilp_model.solve(pulp.PULP_CBC_CMD())
    indices = np.array([sentences[key].value() for key in sentences])
    indices[indices == None] = 0
    summary = [sent.content() for sent in np.array(scu)[indices > 0.1]]

    # Write result
    write(summary,
          topic_id,
          output_folder_name=self.output_folder_name,
          over_write=True)

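# Worked example of the position-weighted bigram counts in _improved_ilp above
# (hypothetical numbers): with delta = 0.001 and n_sent = 100, a bigram that
# occurs in sentence 0 and sentence 10 gets weight
#   (1 + 0.001 * (100 - 0)) + (1 + 0.001 * (100 - 10)) = 1.100 + 1.090 = 2.190,
# so bigrams from earlier (higher-ranked) sentences are nudged up slightly.
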
def _linear_prog(self, scu, topic_id):
    """
    Integer linear programming solver.
    For details, please check: Extractive Multi-Document Summarization with
    Integer Linear Programming and Support Vector Regression
    :param scu: a list of sentences
    :param topic_id: topic id for this docset
    """
    bigram_dict, bigram_set, bigram_freq = self._get_bigrams(scu)
    n_sentence = len(scu)  # number of sentences
    n_bigram = len(bigram_set)  # number of different bigrams
    bigram_list = list(bigram_set)

    # Get lengths for all sentences
    sent_lens = [sent.length() for sent in scu]
    min_sent_len = 8  # number of words in the shortest sentence

    # Calculate coefficients in target function
    # All coefs are divided into two parts: importance (imp) and diversity (div)
    target_imp_coef = np.zeros(n_sentence)
    target_div_coef = np.zeros(n_bigram)
    for i in range(n_sentence):
        target_imp_coef[i] = self.lambda1 * scu[i].score() * scu[i].length() / (
            self.max_length / min_sent_len)
    for i in range(n_bigram):
        target_div_coef[i] = self.lambda2 / n_sentence
    # target_coef = np.concatenate((target_imp_coef, target_div_coef), axis=0)

    # Calculate coefs in constraint 1
    # c1_coef = np.concatenate((np.array(sent_lens), np.zeros(n_bigram)), axis=0)

    # Calculate coefs in constraint 2, n_sentence constraints in total
    c2_coef = []
    for i in range(n_sentence):
        n_gram_i = len(bigram_dict[i])  # number of bigrams in sentence i
        c2_sent_coef = np.zeros(n_sentence)
        c2_sent_coef[i] = n_gram_i  # the coef of sentence i is n_gram_i
        c2_bi_coef = np.zeros(n_bigram)
        # If bigram appears in sentence i, set bigram's coef to -1
        for bigram in bigram_dict[i]:
            bigram_index = bigram_list.index(bigram)
            c2_bi_coef[bigram_index] = -1
        c2_coef.append(np.concatenate((c2_sent_coef, c2_bi_coef), axis=0))
    c2_coef = np.array(c2_coef)

    # Calculate coefs in constraint 3, n_bigram constraints in total
    c3_coef = []
    for j in range(n_bigram):
        cur_bigram = bigram_list[j]
        c3_sent_coef = np.zeros(n_sentence)
        c3_bi_coef = np.zeros(n_bigram)
        for i in range(n_sentence):
            # Current bigram appears in sentence i, set sentence's coef to -1
            if cur_bigram in bigram_dict[i]:
                c3_sent_coef[i] = -1
        c3_bi_coef[j] = 1  # coef of current bigram is 1
        c3_coef.append(np.concatenate((c3_sent_coef, c3_bi_coef), axis=0))
    c3_coef = np.array(c3_coef)

    ilp_model = pulp.LpProblem('content realization', pulp.LpMaximize)
    # Define variables
    sentences = pulp.LpVariable.dict("sentence", (i for i in range(n_sentence)),
                                     lowBound=0, upBound=1, cat=pulp.LpInteger)
    concepts = pulp.LpVariable.dict("bigram", (i for i in range(n_bigram)),
                                    lowBound=0, upBound=1, cat=pulp.LpInteger)
    # Add objective function
    ilp_model += pulp.lpSum(
        [target_imp_coef[int(key)] * sentences[key] for key in sentences] +
        [target_div_coef[int(key)] * concepts[key] for key in concepts])
    # Add length constraint
    ilp_model += pulp.lpSum(
        [sent_lens[int(key)] * sentences[key] for key in sentences]) <= self.max_length
    # Add constraints 1
    for coefs in c2_coef:
        ilp_model += pulp.lpSum(
            [coefs[key] * sentences[key] for key in sentences] +
            [coefs[key2 + n_sentence] * concepts[key2] for key2 in concepts]) <= 0
    # Add constraints 2
    for coefs in c3_coef:
        ilp_model += pulp.lpSum(
            [coefs[key] * sentences[key] for key in sentences] +
            [coefs[key2 + n_sentence] * concepts[key2] for key2 in concepts]) <= 0

    # ilp_model.writeLP('ilp_model')  # write ilp model to file
    ilp_model.solve()
    indices = np.array([sentences[key].value() for key in sentences])
    summary = [sent.content() for sent in np.array(scu)[indices > 0.1]]

    # Write result
    write(summary,
          topic_id,
          output_folder_name=self.output_folder_name,
          over_write=True)

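# Worked example of the objective coefficients in _linear_prog above
# (hypothetical values): with lambda1 = 0.6, lambda2 = 0.4, max_length = 100,
# min_sent_len = 8 and a sentence with score() = 0.8 and length() = 20, its
# importance coefficient is
#   0.6 * 0.8 * 20 / (100 / 8) = 0.768,
# while each distinct bigram adds lambda2 / n_sentence to the diversity term,
# e.g. 0.4 / 50 = 0.008 in a 50-sentence docset.
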