Example #1
    def _simple_cr(self, scu, topic_id):
        """
        Process a single docset and write the result to the corresponding output file
        :param scu: list of summary content units, each carrying a score feature
        :param topic_id: topic id of this docset
        """
        summary = self._get_summarizations(scu)
        write(summary,
              topic_id,
              output_folder_name=self.output_folder_name,
              over_write=True)
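The write helper called here is not part of these excerpts. A minimal sketch of what it is assumed to do, inferred from the call sites (the file naming and layout are assumptions):

import os

def write(summary, topic_id, output_folder_name='outputs', over_write=True):
    # Write one summary (a list of sentence strings) to <output_folder_name>/<topic_id>
    os.makedirs(output_folder_name, exist_ok=True)
    path = os.path.join(output_folder_name, str(topic_id))
    if not over_write and os.path.exists(path):
        return
    with open(path, 'w') as f:
        f.write('\n'.join(summary))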
Example #2
def save_data(args, out_data):
    if args.test_rate is not None:
        # Write to train and test files
        save_to_train_test(out_data, args)
    if args.kfold is not None:
        # Write train and test files for each fold
        save_to_kfold(out_data, args)
    if args.out_file is not None:
        # Write data to file
        writer.write(out_data, args.out_file, args.out_format, args.user_col,
                     args.skill_col, args.correct_col, args.exercise_col)
        print('Wrote', args.out_file)
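For reference, the args namespace must expose every attribute used above. A minimal argparse sketch covering them (flag names are assumptions inferred from the attribute names):

import argparse

def build_parser():
    parser = argparse.ArgumentParser(description='Split and export interaction data.')
    parser.add_argument('--in-file')
    parser.add_argument('--out-file')
    parser.add_argument('--out-format', default='csv')
    parser.add_argument('--test-rate', type=float)
    parser.add_argument('--validation-rate', type=float)
    parser.add_argument('--kfold', type=int)
    parser.add_argument('--shuffle', action='store_true')
    parser.add_argument('--user-col', default='user_id')
    parser.add_argument('--skill-col', default='skill_id')
    parser.add_argument('--correct-col', default='correct')
    parser.add_argument('--exercise-col', default='exercise_id')
    return parser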
Example #3
def save_to_kfold(out_data, args):
    out_file = args.out_file or args.in_file
    kfold = KFold(n_splits=args.kfold, shuffle=args.shuffle)

    users = out_data[args.user_col].unique()

    for i, (train_i, test_i) in enumerate(kfold.split(users)):
        # Split at the user level: each user's rows go entirely to train or to test
        for split_name, split_index in [('train', train_i), ('test', test_i)]:
            filename = f'{out_file}.{split_name}.{i}'
            split_data = out_data[out_data[args.user_col].isin(
                users[split_index])].copy()
            writer.write(split_data, filename, args.out_format, args.user_col,
                         args.skill_col, args.correct_col, args.exercise_col)
            print('Wrote', filename)
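For context, a self-contained sketch of the user-level split the loop above performs (toy frame and column names, illustrative only):

import pandas as pd
from sklearn.model_selection import KFold

df = pd.DataFrame({'user_id': [1, 1, 2, 2, 3, 3, 4, 4],
                   'correct': [1, 0, 1, 1, 0, 1, 0, 0]})
users = df['user_id'].unique()  # array([1, 2, 3, 4])
for i, (train_i, test_i) in enumerate(KFold(n_splits=2).split(users)):
    # Each user's rows land entirely in train or entirely in test
    train = df[df['user_id'].isin(users[train_i])]
    test = df[df['user_id'].isin(users[test_i])]
    print(f'fold {i}: train users={list(users[train_i])}, test users={list(users[test_i])}')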
Example #4
def save_to_train_test(out_data, args):
    out_file = args.out_file or args.in_file
    if args.test_rate > .5:
        print("Warning: train-test split rate is above .5, "
              "which means the test set will be larger than the train set.")
    n_test = int(len(out_data) * args.test_rate)
    test_out_file = out_file + '.test'
    writer.write(out_data.iloc[:n_test], test_out_file, args.out_format,
                 args.user_col, args.skill_col, args.correct_col,
                 args.exercise_col)
    print('Wrote', test_out_file)

    n_valid = 0
    if args.validation_rate is not None and 0 < args.validation_rate < 1:
        n_valid = int(len(out_data) * args.validation_rate)
        validation_out_file = out_file + '.valid'
        writer.write(out_data.iloc[n_test:n_test + n_valid],
                     validation_out_file, args.out_format, args.user_col,
                     args.skill_col, args.correct_col, args.exercise_col)
        print('Wrote', validation_out_file)

    train_out_file = out_file + '.train'
    writer.write(out_data.iloc[n_test + n_valid:], train_out_file,
                 args.out_format, args.user_col, args.skill_col,
                 args.correct_col, args.exercise_col)
    print('Wrote', train_out_file)
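The iloc slicing above carves the frame into contiguous blocks: test rows first, then validation, then train. A worked example of the index arithmetic with toy numbers:

n_rows = 100
test_rate, validation_rate = 0.2, 0.1
n_test = int(n_rows * test_rate)         # 20 -> test  takes rows [0, 20)
n_valid = int(n_rows * validation_rate)  # 10 -> valid takes rows [20, 30)
# train takes the remainder                    -> rows [30, 100)
# Note: iloc slices are positional, so any shuffling must happen before the split.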
Example #5
    def test_to_asc(self):
        filepath = tmpdir + '/test.asc'
        writer.write(data1, filepath, format='asc')

        expected = """
        1
        1
        0
        3
        1,2,1
        1,0,0
        2
        3,4
        1,0
        """.split()

        with open(filepath) as f:
            actual = f.read().splitlines()

        self.assertEqual(expected, actual)

        os.remove(filepath)
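Judging from the expected fixture, the 'asc' layout appears to be three lines per user: interaction count, comma-separated skill ids, comma-separated correctness flags. A hypothetical writer for that layout, inferred from the test rather than from the real writer.write:

def write_asc(records, filepath):
    # records: one (skill_ids, corrects) pair per user (assumed layout)
    with open(filepath, 'w') as f:
        for skills, corrects in records:
            f.write(f'{len(skills)}\n')
            f.write(','.join(map(str, skills)) + '\n')
            f.write(','.join(map(str, corrects)) + '\n')

# Reproduces the fixture above:
write_asc([([1], [0]), ([1, 2, 1], [1, 0, 0]), ([3, 4], [1, 0])], 'test.asc')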
Example #6
    def _compression_ilp(self, scu, topic_id):
        """
        Content realization by combining ILP and sentence compression
        :param scu: list[sentence]
        :param topic_id: str
        """
        # Get pruned sentences
        all_candidates = []
        group_lengths = []
        for item in scu:
            candidates = self._generate_compressed_candidate(item)
            all_candidates += candidates
            group_lengths.append(len(candidates))
        bigram_dict, bigram_set, bigram_freq = self._get_bigrams(
            all_candidates)
        n_bigram = len(bigram_set)
        n_sent = len(all_candidates)
        bigram_list = list(bigram_set)  # the order of bigram variables

        # Calculate weights in objective function
        # Use number of occurrences of bigram as its weight
        weight = []
        for i in range(n_bigram):
            weight.append(bigram_freq[bigram_list[i]])
        # Give a very small weight to every sentence to reduce randomness
        weight_s = np.zeros(n_sent)
        delta = 0.005
        for i in range(n_sent):
            weight_s[i] = delta * (n_sent - i)
        weight = np.concatenate((np.array(weight), weight_s),
                                axis=0)  # add s_j after c_i

        # Calculate coefs
        # Variable: c_i, s_j

        # First, calculate coefs for constraints 1 and 2.
        n_constraint = 0
        coefs_1 = []  # constraint 1, i*j rows
        coefs_2 = []  # constraint 2, i rows
        for i in range(n_bigram):
            coefs_sum_s = np.zeros(n_sent)
            for j in range(n_sent):
                coefs_c = np.zeros(n_bigram)
                coefs_s = np.zeros(n_sent)
                if bigram_list[i] in bigram_dict[j]:
                    # c_i in s_j, s_j - c_i <= 0
                    coefs_c[i] = -1
                    coefs_s[j] = 1
                    coefs_sum_s[j] = -1
                    coefs_1.append(np.concatenate((coefs_c, coefs_s), axis=0))
                    n_constraint += 1
            coefs_sum_c = np.zeros(n_bigram)
            coefs_sum_c[i] = 1
            coefs_2.append(np.concatenate((coefs_sum_c, coefs_sum_s), axis=0))
            n_constraint += 1

        # Constraint 3, length constraint, 1 row
        coefs_3_s = np.zeros(n_sent)
        for i in range(n_sent):
            coefs_3_s[i] = all_candidates[i].length()

        # Constraint for each group of compressed candidates
        coefs_4 = []
        cur_pos = 0  # current position
        for n_group in group_lengths:
            coefs_4_s = np.zeros(n_sent)
            coefs_4_c = np.zeros(n_bigram)
            coefs_4_s[cur_pos:cur_pos + n_group] = 1
            coefs_4.append(np.concatenate((coefs_4_c, coefs_4_s), axis=0))
            cur_pos = cur_pos + n_group

        # Use pulp to solve ILP problem
        ilp_model = pulp.LpProblem('content_realization', pulp.LpMaximize)
        # Define variables
        sentences = pulp.LpVariable.dict("sentence",
                                         (i for i in range(n_sent)),
                                         lowBound=0,
                                         upBound=1,
                                         cat=pulp.LpInteger)
        concepts = pulp.LpVariable.dict("concept",
                                        (i for i in range(n_bigram)),
                                        lowBound=0,
                                        upBound=1,
                                        cat=pulp.LpInteger)
        # Add objective function: bigram weights c_i plus the small per-sentence weights s_j
        ilp_model += pulp.lpSum(
            [weight[int(key)] * concepts[key] for key in concepts] +
            [weight[int(key) + n_bigram] * sentences[key] for key in sentences]
        ), "Objective function"
        # Add length constraint
        ilp_model += pulp.lpSum(
            [coefs_3_s[int(key)] * sentences[key]
             for key in sentences]) <= self.max_length
        # Add constraints 1
        for coefs in coefs_1:
            ilp_model += pulp.lpSum(
                [coefs[key] * concepts[key] for key in concepts] + [
                    coefs[key2 + n_bigram] * sentences[key2]
                    for key2 in sentences
                ]) <= 0
        # Add constraints 2
        for coefs in coefs_2:
            ilp_model += pulp.lpSum(
                [coefs[key] * concepts[key] for key in concepts] + [
                    coefs[key2 + n_bigram] * sentences[key2]
                    for key2 in sentences
                ]) <= 0

        # Add constraints 4
        for coefs in coefs_4:
            ilp_model += pulp.lpSum(
                [coefs[key] * concepts[key] for key in concepts] + [
                    coefs[key2 + n_bigram] * sentences[key2]
                    for key2 in sentences
                ]) <= 1

        # ilp_model.writeLP('ilp_model')  # write ilp model to file
        ilp_model.solve(pulp.PULP_CBC_CMD())
        indices = np.array([sentences[key].value() for key in sentences])
        indices[np.isnan(indices.astype(float))] = 0  # convert None to 0
        summary = [
            sent.content() for sent in np.array(all_candidates)[indices > 0.1]
        ]
        # Write result
        write(summary,
              topic_id,
              output_folder_name=self.output_folder_name,
              over_write=True)
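To make the formulation above easier to follow, here is a minimal, self-contained version of the same bigram-coverage ILP on toy data (names and numbers are illustrative):

import pulp

lengths = [5, 7, 4]                               # words per candidate sentence
bigrams_in = [{'a b', 'b c'}, {'b c', 'c d'}, {'a b'}]
weights = {'a b': 2, 'b c': 2, 'c d': 1}          # bigram frequencies
max_length = 10

model = pulp.LpProblem('toy_coverage', pulp.LpMaximize)
s = [pulp.LpVariable(f's{j}', cat='Binary') for j in range(len(lengths))]
c = {b: pulp.LpVariable(f'c{i}', cat='Binary') for i, b in enumerate(weights)}

model += pulp.lpSum(weights[b] * c[b] for b in c)                   # objective
model += pulp.lpSum(lengths[j] * s[j] for j in range(len(s))) <= max_length
for b in c:
    covering = [s[j] for j in range(len(s)) if b in bigrams_in[j]]
    for sj in covering:
        model += sj - c[b] <= 0                   # constraint 1: s_j on => c_i on
    model += c[b] - pulp.lpSum(covering) <= 0     # constraint 2: c_i needs a covering s_j
model.solve(pulp.PULP_CBC_CMD(msg=False))
print([int(v.value()) for v in s])                # which sentences were picked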
Example #7
    def _improved_ilp(self, scu, topic_id):
        """
        An improved ILP algorithm for sentence realization. For details, please check:
        A Scalable Global Model for Summarization
        :param scu: list[Sentence]
        :param topic_id: topic id for this docset
        """
        # scu = self.prune_pipeline(scu, self.prune_pipe)
        bigram_dict, bigram_set, _ = self._get_bigrams(scu)  # frequencies are recomputed below with position weighting
        n_bigram = len(bigram_set)
        n_sent = len(scu)

        # Calculate weights in objective function
        # Count every bigram's occurrence
        bigram_freq = {}
        delta = 0.001
        for index in bigram_dict:
            # For bigrams in each sentence[index]
            for bigram in bigram_dict[index]:
                # Bigrams in important sentences have higher weights
                if bigram not in bigram_freq:
                    bigram_freq[bigram] = 1 + delta * (n_sent - index)
                else:
                    bigram_freq[bigram] = bigram_freq[bigram] + 1 + delta * (
                        n_sent - index)
        bigram_list = list(bigram_set)  # the order of bigram variables
        # Use frequency of bigram as its weight
        weight = []
        for i in range(n_bigram):
            weight.append(bigram_freq[bigram_list[i]])
        weight_s = np.zeros(n_sent)
        for i in range(n_sent):
            # Give a very small weight to every sentence
            weight_s[i] = delta * (n_sent - i)
        weight = np.concatenate((np.array(weight), weight_s),
                                axis=0)  # add s_j after c_i

        # Calculate coefs
        # Variable: c_i, s_j
        n_constraint = 0
        coefs_1 = []  # constraint 1, i*j rows
        coefs_2 = []  # constraint 2, i rows
        for i in range(n_bigram):
            coefs_sum_s = np.zeros(n_sent)
            for j in range(n_sent):
                coefs_c = np.zeros(n_bigram)
                coefs_s = np.zeros(n_sent)
                if bigram_list[i] in bigram_dict[j]:
                    # c_i in s_j, s_j - c_i <= 0
                    coefs_c[i] = -1
                    coefs_s[j] = 1
                    coefs_sum_s[j] = -1
                    coefs_1.append(np.concatenate((coefs_c, coefs_s), axis=0))
                    n_constraint += 1
            coefs_sum_c = np.zeros(n_bigram)
            coefs_sum_c[i] = 1
            coefs_2.append(np.concatenate((coefs_sum_c, coefs_sum_s), axis=0))
            n_constraint += 1

        coefs_3_s = np.zeros(n_sent)  # constraint 3, length constraint, 1 row
        for i in range(n_sent):
            coefs_3_s[i] = scu[i].length()

        # Use pulp to solve ILP problem
        ilp_model = pulp.LpProblem('content_realization', pulp.LpMaximize)
        # Define variables
        sentences = pulp.LpVariable.dict("sentence",
                                         (i for i in range(n_sent)),
                                         lowBound=0,
                                         upBound=1,
                                         cat=pulp.LpInteger)
        concepts = pulp.LpVariable.dict("concept",
                                        (i for i in range(n_bigram)),
                                        lowBound=0,
                                        upBound=1,
                                        cat=pulp.LpInteger)
        # Add objective function: bigram weights c_i plus the small per-sentence weights s_j
        ilp_model += pulp.lpSum(
            [weight[int(key)] * concepts[key] for key in concepts] +
            [weight[int(key) + n_bigram] * sentences[key] for key in sentences]
        ), "Objective function"
        # Add length constraint
        ilp_model += pulp.lpSum(
            [coefs_3_s[int(key)] * sentences[key]
             for key in sentences]) <= self.max_length
        # Add constraints 1
        for coefs in coefs_1:
            ilp_model += pulp.lpSum(
                [coefs[key] * concepts[key] for key in concepts] + [
                    coefs[key2 + n_bigram] * sentences[key2]
                    for key2 in sentences
                ]) <= 0
        # Add constraints 2
        for coefs in coefs_2:
            ilp_model += pulp.lpSum(
                [coefs[key] * concepts[key] for key in concepts] + [
                    coefs[key2 + n_bigram] * sentences[key2]
                    for key2 in sentences
                ]) <= 0

        # ilp_model.writeLP('ilp_model')  # write ilp model to file
        ilp_model.solve(pulp.PULP_CBC_CMD())
        indices = np.array([sentences[key].value() for key in sentences])
        indices[np.isnan(indices.astype(float))] = 0  # convert None to 0
        summary = [sent.content() for sent in np.array(scu)[indices > 0.1]]
        # Write result
        write(summary,
              topic_id,
              output_folder_name=self.output_folder_name,
              over_write=True)
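_get_bigrams is not shown in these excerpts. From its call sites it returns a per-sentence bigram mapping, the set of all bigrams, and a frequency table. A hypothetical standalone sketch consistent with that usage (the tokenization and the once-per-sentence counting are assumptions):

from collections import Counter

def get_bigrams(scu):
    # Returns ({sent_index: set of bigrams}, set of all bigrams, {bigram: count})
    bigram_dict = {}
    bigram_freq = Counter()
    for index, sent in enumerate(scu):
        tokens = sent.content().lower().split()
        bigrams = {' '.join(pair) for pair in zip(tokens, tokens[1:])}
        bigram_dict[index] = bigrams
        bigram_freq.update(bigrams)  # counts each bigram once per sentence
    return bigram_dict, set(bigram_freq), dict(bigram_freq)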
Example #8
    def _linear_prog(self, scu, topic_id):
        """
        Integer linear programming solver. For details, please check:
        Extractive Multi-Document Summarization with Integer Linear Programming and Support Vector Regression
        :param scu: list[Sentence]
        :param topic_id: topic id for this docset
        """
        bigram_dict, bigram_set, bigram_freq = self._get_bigrams(scu)
        n_sentence = len(scu)  # number of sentences
        n_bigram = len(bigram_set)  # number of different bigrams
        bigram_list = list(bigram_set)

        # Get lengths for all sentences (used in the length constraint below)
        sent_lens = [sent.length() for sent in scu]
        min_sent_len = 8  # assumed word count of the shortest acceptable sentence

        # Calculate coefficients in target function
        # All coefs are divided to two parts: importance(imp) and diversity(div)
        target_imp_coef = np.zeros(n_sentence)
        target_div_coef = np.zeros(n_bigram)
        for i in range(n_sentence):
            target_imp_coef[i] = (self.lambda1 * scu[i].score() * scu[i].length()
                                  / (self.max_length / min_sent_len))
        for i in range(n_bigram):
            target_div_coef[i] = self.lambda2 / n_sentence
        # target_coef = np.concatenate((target_imp_coef, target_div_coef), axis=0)

        # Calculate coefs in constraint 1
        # c1_coef = np.concatenate((np.array(sent_lens), np.zeros(n_bigram)), axis=0)

        # Calculate coefs in constraint 2, n_sentence constraints in total
        c2_coef = []
        for i in range(n_sentence):
            n_gram_i = len(bigram_dict[i])  # number of bigrams in sentence i
            c2_sent_coef = np.zeros(n_sentence)
            c2_sent_coef[i] = n_gram_i  # the coef of sentence i is n_gram_i
            c2_bi_coef = np.zeros(n_bigram)
            # If bigram appears in sentence i, set bigram's coef to -1
            for bigram in bigram_dict[i]:
                bigram_index = bigram_list.index(bigram)
                c2_bi_coef[bigram_index] = -1
            c2_coef.append(np.concatenate((c2_sent_coef, c2_bi_coef), axis=0))
        c2_coef = np.array(c2_coef)

        # Calculate coefs in constraint 3, n_bigram constraints in total
        c3_coef = []
        for j in range(n_bigram):
            cur_bigram = bigram_list[j]
            c3_sent_coef = np.zeros(n_sentence)
            c3_bi_coef = np.zeros(n_bigram)
            for i in range(n_sentence):
                # Current bigram appears in sentence i, set sentence's coef to -1
                if cur_bigram in bigram_dict[i]:
                    c3_sent_coef[i] = -1
            c3_bi_coef[j] = 1  # coef of current bigram is 1
            c3_coef.append(np.concatenate((c3_sent_coef, c3_bi_coef), axis=0))
        c3_coef = np.array(c3_coef)

        ilp_model = pulp.LpProblem('content_realization', pulp.LpMaximize)
        # Define variables
        sentences = pulp.LpVariable.dict("sentence",
                                         (i for i in range(n_sentence)),
                                         lowBound=0,
                                         upBound=1,
                                         cat=pulp.LpInteger)
        concepts = pulp.LpVariable.dict("bigram", (i for i in range(n_bigram)),
                                        lowBound=0,
                                        upBound=1,
                                        cat=pulp.LpInteger)
        # Add objective function
        ilp_model += pulp.lpSum(
            [target_imp_coef[int(key)] * sentences[key] for key in sentences] +
            [target_div_coef[int(key)] * concepts[key] for key in concepts])
        # Add length constraint
        ilp_model += pulp.lpSum(
            [sent_lens[int(key)] * sentences[key]
             for key in sentences]) <= self.max_length
        # Add constraints 2: a selected sentence forces all of its bigrams to be selected
        for coefs in c2_coef:
            ilp_model += pulp.lpSum(
                [coefs[key] * sentences[key] for key in sentences] + [
                    coefs[key2 + n_sentence] * concepts[key2]
                    for key2 in concepts
                ]) <= 0
        # Add constraints 3: a selected bigram requires some sentence containing it
        for coefs in c3_coef:
            ilp_model += pulp.lpSum(
                [coefs[key] * sentences[key] for key in sentences] + [
                    coefs[key2 + n_sentence] * concepts[key2]
                    for key2 in concepts
                ]) <= 0

        # ilp_model.writeLP('ilp_model')  # write ilp model to file
        ilp_model.solve(pulp.PULP_CBC_CMD())
        indices = np.array([sentences[key].value() for key in sentences])
        indices[np.isnan(indices.astype(float))] = 0  # convert None to 0
        summary = [sent.content() for sent in np.array(scu)[indices > 0.1]]

        # Write result
        write(summary,
              topic_id,
              output_folder_name=self.output_folder_name,
              over_write=True)
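One defensive addition worth considering after any of these solve calls, sketched here rather than taken from the original code: check the solver status before trusting the variable values.

status = pulp.LpStatus[ilp_model.status]  # e.g. 'Optimal', 'Infeasible', 'Not Solved'
if status != 'Optimal':
    print(f'Warning: ILP solver finished with status {status}')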