Exemplo n.º 1
0
class Main:
    """Command-line driver for the sentence-similarity experiment.

    Holds the collaborating helpers (preprocessor, similarity calculator,
    plotter, evaluator) together with the run configuration.  Call
    ``arg_parse()`` first to load the dataset, then ``main()`` (or one of
    the plotting/statistics helpers).
    """

    def __init__(self):
        self.m_preprocessor = Preprocessor()
        self.m_similarity = Similarity()
        self.m_plt = Plot()
        self.m_evaluator = Evaluator()
        # Defaults; each one can be overridden from the command line.
        self.m_file = "sts-train"
        self.m_metric = "path"
        self.m_ic = "brown"
        self.m_metric_w2v = "cosine"
        self.m_metric_t = "path"
        self.m_thr = 20
        self.m_mode = "ontology"
        # Populated by arg_parse(); declared here so the attribute always
        # exists on a constructed instance.
        self.m_data_frame = None

    def arg_parse(self):
        """Parse the command line, apply overrides and load the dataset.

        Options that are not supplied keep the defaults set in
        ``__init__``.  Passing ``-m``, ``-mw`` or ``-mt`` also selects the
        mode ("ontology", "word2vec" or "textual" respectively); when
        several are given, the one checked last wins, i.e. ``-mt`` over
        ``-mw`` over ``-m``.

        The result of ``Preprocessor.initialize`` for the selected CSV is
        stored in ``self.m_data_frame`` (presumably a pandas DataFrame,
        given how the other methods use it -- confirm against Preprocessor).
        """
        # Local import keeps this block self-contained.
        import os

        parser = argparse.ArgumentParser(description="Calculus of similarity",
                                         add_help=True)

        parser.add_argument(
            "-f",
            dest="file",
            action="store",
            type=str,
            choices=["sts-dev", "sts-test", "sts-train"],
            help="Select the csv file to calculate similarity.",
        )

        parser.add_argument(
            "-m",
            dest="metric",
            action="store",
            type=str,
            choices=["path", "lch", "wup", "res", "jcn", "lin"],
            help="Select the metric to calculate similarity.",
        )

        parser.add_argument(
            "-i",
            dest="ic",
            action="store",
            type=str,
            choices=["brown", "treebank"],
            help="Select the Information Content corpus.",
        )

        parser.add_argument(
            "-mw",
            dest="metric_w2v",
            action="store",
            type=str,
            choices=["cosine", "euclidean", "dot"],
            help="Select the metric to calculate similarity word2vec.",
        )

        parser.add_argument(
            "-mt",
            dest="metric_t",
            action="store",
            type=str,
            choices=["path", "jcn", "euclidean"],
            help="Select the metric to calculate textual similarity.",
        )

        parser.add_argument(
            "-t",
            dest="thr",
            action="store",
            type=int,
            choices=[20, 30, 40],
            help="Select the threshold to calculate textual similarity.",
        )

        args = parser.parse_args()
        # argparse leaves an option at None when it was not given.
        if args.file is not None:
            self.m_file = args.file
        if args.metric is not None:
            self.m_mode = "ontology"
            self.m_metric = args.metric
        if args.ic is not None:
            self.m_ic = args.ic
        if args.metric_w2v is not None:
            self.m_mode = "word2vec"
            self.m_metric_w2v = args.metric_w2v
        if args.metric_t is not None:
            self.m_mode = "textual"
            self.m_metric_t = args.metric_t
        if args.thr is not None:
            self.m_thr = args.thr

        # os.path.join yields the same "..\stsbenchmark\<file>.csv" string
        # on Windows while also producing a valid path on other platforms.
        csv_path = os.path.join("..", "stsbenchmark", self.m_file + ".csv")
        self.m_data_frame = self.m_preprocessor.initialize(csv_path)

    def _active_metric(self):
        """Return the metric name that the current mode will use."""
        metric_by_mode = {
            "ontology": self.m_metric,
            "word2vec": self.m_metric_w2v,
            "textual": self.m_metric_t,
        }
        return metric_by_mode.get(self.m_mode, self.m_metric)

    def main(self):
        """Compute similarities for the loaded dataset and evaluate them.

        Prints per-set statistics of the original similarity column, runs
        the similarity calculation selected by ``self.m_mode``, prints the
        original and computed statistics side by side (``_o`` / ``_s``
        column suffixes) and hands both data frames to the evaluator.

        Requires ``arg_parse()`` to have loaded ``self.m_data_frame``.
        """
        # Snapshot taken before preprocessing/similarity replace the frame.
        df_original = self.m_data_frame.copy()
        stats_original = self.data_frame_stats(self.m_data_frame)
        print("Printing original stats")
        print(stats_original)

        df_filtered = self.m_preprocessor.tokenize_and_filter_dataset(
            self.m_data_frame)
        # Report the metric belonging to the active mode rather than always
        # the textual one.
        print("Calcul similarity by " + self.m_mode + " similarity, using " +
              self._active_metric())

        if self.m_mode == "ontology":
            self.m_data_frame = self.m_similarity.calculate_ontology_similarity(
                self.m_metric, self.m_ic, df_filtered)
        elif self.m_mode == "word2vec":
            self.m_data_frame = self.m_similarity.calculate_word2vec_similarity(
                self.m_metric_w2v, df_filtered)
        elif self.m_mode == "textual":
            idf = self.m_preprocessor.idf_from_dataset(self.m_data_frame)
            self.m_data_frame = self.m_similarity.calculate_textual_similarity(
                self.m_thr, self.m_metric_t, df_filtered, idf)
        df_syntetic = self.m_data_frame.copy()
        stats_syntetic = self.data_frame_stats(self.m_data_frame)
        stats_total = stats_original.join(stats_syntetic,
                                          lsuffix="_o",
                                          rsuffix="_s")
        print("Printing all stats")
        print(stats_total)
        # Scatter plot of the combined statistics is currently disabled:
        #self.m_plt.plot_scatter(stats_total)
        self.m_evaluator.evaluate(df_original, df_syntetic)

    def plot_base_data(self):
        """Feed every row's set and similarity to the plotter, then draw.

        The bar sub-plot is requested with the dataset file name.
        """
        for row in self.m_data_frame.itertuples():
            self.m_plt.prepare_data_to_plot(row.set, row.similarity)
        self.m_plt.sub_plot_bar(self.m_file)

    def plot_pos_tagged_data(self):
        """Tag the dataset and plot the tagged sentences per set.

        Uses the frame returned by ``Preprocessor.tag_dataset`` (presumably
        POS-tagged sentences, judging by the method name -- confirm).
        """
        df = self.m_preprocessor.tag_dataset(self.m_data_frame)
        for row in df.itertuples():
            self.m_plt.prepare_tags_to_plot(row.set, row.sentence_a,
                                            row.sentence_b)
        self.m_plt.sub_plot_bar(self.m_file)

    def calculate_sentences_length(self):
        """Print the mean sentence length for each set.

        Lengths are ``len()`` of the ``sentence_a`` / ``sentence_b`` values
        in the frame returned by ``Preprocessor.tag_dataset``; the unit
        (tokens vs. characters) depends on what that helper stores there.
        """
        df = self.m_preprocessor.tag_dataset(self.m_data_frame)
        lengths = {}
        for row in df.itertuples():
            # Both sentences of a pair contribute to their set's sample.
            lengths.setdefault(row.set, []).extend(
                [len(row.sentence_a), len(row.sentence_b)])
        lengths = {key: mean(values) for key, values in lengths.items()}
        print(lengths)

    def data_frame_stats(self, df):
        """Return mean, std and median of ``similarity`` grouped by ``set``.

        Args:
            df: a pandas DataFrame with ``set`` and ``similarity`` columns.

        Returns:
            A DataFrame indexed by ``set`` with columns ``mean``, ``std``
            and ``median``.
        """
        return df.groupby(["set"])["similarity"].agg(["mean", "std", "median"])