示例#1
0
    def test_read_annotated_dat_one_line(self):
        """Read a single line of the annotated iPod file and check the parsed tables.

        The expected section list holds one row (doc 0, section 0, text
        "very pleased", title flag True) and the expected feature mapping is an
        empty DataFrame.
        """
        pm = ParseAndModel()

        expected_sections = pd.DataFrame(
            [[0, 0, "very pleased", True]],
            columns=["doc_id", "section_id", "section_text", "title"])
        expected_feature_mapping = pd.DataFrame([])

        pm.parsed_text = pm.read_annotated_data(filename='data/parse_and_model/iPod.final', nlines=1)

        self.assertTrue(expected_sections.equals(pm.parsed_text["section_list"]))
        self.assertTrue(expected_feature_mapping.equals(pm.parsed_text["feature_mapping"]))
示例#2
0
    def test_bem_two_section(self):
        """Build explicit models from two hand-written sections and check each output.

        The sections are "large clear screen" and "large broken bad" with the single
        feature "screen"; lemmatization is disabled and log base 2 is used.
        Feature-model values are compared after rounding to 3 decimal places.
        """
        pm = ParseAndModel()

        section_list = pd.DataFrame(
            [[0, 0, "large clear screen", True],
             [0, 1, "large broken bad", True]],
            columns=["doc_id", "section_id", "section_text", "title"])

        pm.feature_list = ["screen"]
        pm.formatted_feature_list = pm.format_feature_list()

        pm.parsed_text = dict(section_list=section_list)
        pm.model_results = pm.build_explicit_models(lemmatize_words=False, log_base=2)
        results = pm.model_results

        expected_model_background = [1 / 3, 1 / 6, 1 / 6, 1 / 6, 1 / 6]
        expected_model_feature = [[0.218, 0.282, 0.282, 0.109, 0.109]]
        expected_section_word_counts_matrix = [[1, 1, 1, 0, 0],
                                               [1, 0, 0, 1, 1]]
        expected_model_background_matrix = np.array([1 / 3, 1 / 6, 1 / 6, 1 / 6, 1 / 6])
        expected_model_feature_matrix = np.array([[0.218], [0.282], [0.282], [0.109], [0.109]])
        expected_vocab_lookup = {0: 'large', 1: 'clear', 2: 'screen', 3: 'broken', 4: 'bad'}

        # Comparing the values directly (instead of asserting on a precomputed bool)
        # lets unittest print a diff when a check fails.
        self.assertEqual(expected_model_background, results["model_background"])
        rounded_model_feature = [[round(val, 3) for val in feature_model]
                                 for feature_model in results["model_feature"]]
        self.assertEqual(expected_model_feature, rounded_model_feature)
        # Per-section word counts are only checked through the count matrix.
        self.assertTrue(
            np.array_equiv(expected_section_word_counts_matrix,
                           csr_matrix.toarray(results["section_word_counts_matrix"])))
        self.assertTrue(
            np.array_equiv(expected_model_background_matrix,
                           csr_matrix.toarray(results["model_background_matrix"])))
        self.assertTrue(
            np.array_equiv(expected_model_feature_matrix,
                           np.round(results["model_feature_matrix"], 3)))
        self.assertEqual(expected_vocab_lookup, results["vocabulary_lookup"])
示例#3
0
    def test_read_annotated_dat_one_feature_implicit(self):
        """Read one annotated line (start_line=6) that carries an implicit feature.

        The expected feature mapping has a single "size" row with
        is_explicit False, attached to section 0 of document 0.
        """
        pm = ParseAndModel()

        expected_sections = pd.DataFrame(
            [[0, 0, "it is handy to carry around because of the  and easy to store", False]],
            columns=["doc_id", "section_id", "section_text", "title"])
        expected_feature_mapping = pd.DataFrame(
            [[0, "size", False, 0]],
            columns=["doc_id", "feature", "is_explicit", "section_id"])

        pm.parsed_text = pm.read_annotated_data(filename='data/parse_and_model/iPod.final', nlines=1,
                                                start_line=6)

        self.assertTrue(expected_sections.equals(pm.parsed_text["section_list"]))
        self.assertTrue(expected_feature_mapping.equals(pm.parsed_text["feature_mapping"]))
示例#4
0
    def test_bem_one_section(self):
        """Build explicit models from one three-word section and check each output.

        With the single section "large clear screen" and feature "screen", every
        expected probability is 1/3 and the vocabulary has three entries.
        """
        pm = ParseAndModel()

        section_list = pd.DataFrame(
            [[0, 0, "large clear screen", True]],
            columns=["doc_id", "section_id", "section_text", "title"])

        pm.feature_list = ["screen"]
        pm.formatted_feature_list = pm.format_feature_list()

        pm.parsed_text = dict(section_list=section_list)
        pm.model_results = pm.build_explicit_models(log_base=2)
        results = pm.model_results

        expected_model_background = [1 / 3, 1 / 3, 1 / 3]
        expected_model_feature = [[1 / 3, 1 / 3, 1 / 3]]
        expected_section_word_counts_matrix = [[1, 1, 1]]
        expected_model_background_matrix = np.array([1 / 3, 1 / 3, 1 / 3])
        expected_model_feature_matrix = np.array([[1 / 3], [1 / 3], [1 / 3]])
        expected_vocab_lookup = {0: 'large', 1: 'clear', 2: 'screen'}

        # Comparing the values directly (instead of asserting on a precomputed bool)
        # lets unittest print a diff when a check fails.
        self.assertEqual(expected_model_background, results["model_background"])
        self.assertEqual(expected_model_feature, results["model_feature"])
        # Per-section word counts are only checked through the count matrix.
        self.assertTrue(
            np.array_equiv(expected_section_word_counts_matrix,
                           csr_matrix.toarray(results["section_word_counts_matrix"])))
        self.assertTrue(
            np.array_equiv(expected_model_background_matrix,
                           csr_matrix.toarray(results["model_background_matrix"])))
        self.assertTrue(
            np.array_equiv(expected_model_feature_matrix, results["model_feature_matrix"]))
        self.assertEqual(expected_vocab_lookup, results["vocabulary_lookup"])
示例#5
0
    def test_read_annotated_dat_one_feature_explicit(self):
        """Read one annotated line (start_line=13) that carries an explicit feature.

        The expected feature mapping has a single "battery" row with
        is_explicit True, and the expected feature counts are {"battery": 1}.
        """
        pm = ParseAndModel()

        expected_sections = pd.DataFrame(
            [[0, 0, "the battery life is outstanding (again, compared to the mini)", False]],
            columns=["doc_id", "section_id", "section_text", "title"])
        expected_feature_mapping = pd.DataFrame(
            [[0, "battery", True, 0]],
            columns=["doc_id", "feature", "is_explicit", "section_id"])
        expected_feature_counts = {"battery": 1}

        pm.parsed_text = pm.read_annotated_data(filename='data/parse_and_model/iPod.final', nlines=1,
                                                start_line=13)

        self.assertTrue(expected_sections.equals(pm.parsed_text["section_list"]))
        self.assertTrue(expected_feature_mapping.equals(pm.parsed_text["feature_mapping"]))
        # Normalise to a plain dict so the comparison ignores the mapping's concrete type.
        self.assertEqual(expected_feature_counts, dict(pm.parsed_text["feature_list"]))
示例#6
0
 def load_data(self,
               feature_list: list = None,
               filename: str = None,
               input_type: Enum = ParseAndModel.InputType.annotated,
               nlines: int = None, ) -> None:
     """
     Loads user data by building a ParseAndModel instance and storing it on ``self.pm``.

     :param feature_list: a list of strings and lists of strings. Individual strings will be given separate ids, lists
     of strings will be treated as synonyms and given the same feature id.
     ex. ["sound", "battery", ["screen", "display"]]
     :param filename: Filename for the data set
     :param input_type: An enum of type InputType, specifying the type of input data so the correct read function can be chosen
         options are "annotated" - which expects data in Santu's original format and "onedocperline" - which expects
         all data to be in a single file with one document per line
     :param nlines: Maximum number of lines from the file to read or None to read all lines
     :return: None
     :raises SystemExit: with exit code 1, after calling ``self.usage()``, when ``feature_list`` or ``filename``
         is missing or empty
     """
     # The class name must be a %-argument: logging treats the first positional
     # argument as the format string, so passing the name first loses the message.
     logging.info("%s - load_data...", type(self).__name__)
     if not feature_list or not filename:
         self.usage()
         # Same exit status as the interactive ``exit(1)`` helper, without relying on ``site``.
         raise SystemExit(1)
     self.pm = ParseAndModel(feature_list=feature_list,  # list of features
                             filename=filename,  # file with input data
                             nlines=nlines,  # number of lines to read
                             input_type=input_type)  # input type as enum
示例#7
0
    def test_format_feature_list_synonym(self):
        """Check that synonyms share a feature_id but get their own feature_term_id.

        "screen" and "display" are given as a nested list, so both are expected to
        map to feature_id 2 with feature_term_id 2 and 3 respectively.
        """
        pm = ParseAndModel()

        expected = pd.DataFrame([["sound", 0, 0],
                                 ["battery", 1, 1],
                                 ["screen", 2, 2],
                                 ["display", 2, 3]],
                                columns=["feature", "feature_id", "feature_term_id"])

        pm.feature_list = ["sound", "battery", ["screen", "display"]]
        pm.formatted_feature_list = pm.format_feature_list()

        # Both frames go into the failure message rather than being printed on every run.
        self.assertTrue(
            expected.equals(pm.formatted_feature_list),
            msg="expected:\n{}\nactual:\n{}".format(expected, pm.formatted_feature_list))
示例#8
0
    def test_constructor_twoline(self):
        """Check the matrices EmVectorByFeature exposes for the two-line test file.

        A ParseAndModel built from twoLineTest.txt (feature "screen", log base 2)
        is handed to EmVectorByFeature; the review counts, background model and
        topic model (rounded to 3 decimals) are compared with hand-computed values.
        """
        pm_inst = ParseAndModel(
            feature_list=["screen"],
            filename='../tests/data/parse_and_model/twoLineTest.txt',
            log_base=2)
        em = EmVectorByFeature(explicit_model=pm_inst)

        expected_section_word_counts_matrix = [[1, 1, 1, 0, 0],
                                               [1, 0, 0, 1, 1]]
        expected_model_background_matrix = np.array(
            [1 / 3, 1 / 6, 1 / 6, 1 / 6, 1 / 6])
        expected_model_feature_matrix = np.array([[0.218], [0.282], [0.282],
                                                  [0.109], [0.109]])

        self.assertTrue(
            np.array_equiv(expected_section_word_counts_matrix,
                           csr_matrix.toarray(em.reviews_matrix)),
            msg="section counts do not match")
        self.assertTrue(
            np.array_equiv(expected_model_background_matrix,
                           csr_matrix.toarray(em.background_probability)),
            msg="background model does not match")
        self.assertTrue(
            np.array_equiv(expected_model_feature_matrix,
                           np.round(em.topic_model, 3)),
            msg="topic models do not match")
示例#9
0
    def test_read_plain_dat_one_line(self):
        """Parse one line of the one-document-per-line file (nlines=1).

        The expected section list has three rows, all with doc_id 0 and
        section ids 0-2.
        """
        pm = ParseAndModel(filename='../tests/data/parse_and_model/oneLinePerDoc.txt',
                           input_type=ParseAndModel.InputType.docperline,
                           nlines=1)

        expected_sections = pd.DataFrame(
            [[0, 0, "I am very pleased with the 4 GB iPod Nano that I purchased."],
             [0, 1, "It was very easy to download music onto it and it's very easy to move around in it."],
             [0, 2, "Recommend this item to anybody."]],
            columns=["doc_id", "section_id", "section_text"])

        self.assertTrue(expected_sections.equals(pm.parsed_text["section_list"]))
示例#10
0
    def test_read_annotated_dat_two_feature_explicit(self):
        """Read one annotated line (start_line=622) that carries two explicit features.

        The expected feature mapping lists "screen" then "size", both explicit and
        attached to section 0; each is expected to be counted once.
        """
        pm = ParseAndModel()

        expected_sections = pd.DataFrame(
            [[0, 0, "my son loves the nano, it is small and has a good size screen", False]],
            columns=["doc_id", "section_id", "section_text", "title"])
        expected_feature_mapping = pd.DataFrame(
            [[0, "screen", True, 0],
             [0, "size", True, 0]],
            columns=["doc_id", "feature", "is_explicit", "section_id"])
        expected_feature_counts = {"screen": 1, "size": 1}

        pm.parsed_text = pm.read_annotated_data(filename='data/parse_and_model/iPod.final', nlines=1,
                                                start_line=622)

        self.assertTrue(expected_sections.equals(pm.parsed_text["section_list"]))
        self.assertTrue(expected_feature_mapping.equals(pm.parsed_text["feature_mapping"]))
        # Normalise to a plain dict so the comparison ignores the mapping's concrete type.
        self.assertEqual(expected_feature_counts, dict(pm.parsed_text["feature_list"]))
示例#11
0
    def test_read_plain_dat_two_line(self):
        """Parse two lines of the one-document-per-line file (nlines=2).

        The expected section list has seven rows: sections 0-2 for doc 0 and
        sections 3-6 for doc 1, so section ids keep counting across documents.
        """
        pm = ParseAndModel(filename='../tests/data/parse_and_model/oneLinePerDoc.txt',
                           input_type=ParseAndModel.InputType.docperline,
                           nlines=2)

        expected_sections = pd.DataFrame(
            [[0, 0, "I am very pleased with the 4 GB iPod Nano that I purchased."],
             [0, 1, "It was very easy to download music onto it and it's very easy to move around in it."],
             [0, 2, "Recommend this item to anybody."],
             [1, 3, "I like the compact ipod and the features it offered."],
             [1, 4, "It is handy to carry around because of the  and easy to store."],
             [1, 5, "The light weight also makes it easy to move with."],
             [1, 6, "It works well and I have had no problems with it."]],
            columns=["doc_id", "section_id", "section_text"])

        self.assertTrue(expected_sections.equals(pm.parsed_text["section_list"]))
示例#12
0
    def test_read_annotated_dat_complicated_reviews(self):
        """Read ten annotated lines (start_line=660) spanning two reviews.

        The expected output has ten sections (ids 0-2 in doc 0, ids 3-9 in doc 1),
        three feature-mapping rows mixing implicit and explicit annotations, and
        feature counts of {"battery": 1, "sound": 2}.
        """
        pm = ParseAndModel()

        expected_sections = pd.DataFrame(
            [[0, 0, "it could be better", True],
             [0, 1,
              "this item is very nice and plays songs in a very good stereo sound but my problem with this item is the  it does not last not even near the 14 hours they claimed to last",
              False],
             [0, 2, "i hope the new nano is at leat close to the hours claimed", False],
             [1, 3, "pink apple 4gb nano ipod review", True],
             [1, 4, "it was a gift for christmas to my daughter", False],
             [1, 5,
              "she absolutely loves it! the sound quality is excellent!! the different colors make a nice option as well",
              False],
             [1, 6, "i originally picked a silver one, because that is all the store had", False],
             [1, 7, "i then checked amazon", False],
             [1, 8, "com that not only had it in pink(the color my daughter wanted), [", False],
             [1, 9, "]i would recommend this product to everyone", False]],
            columns=["doc_id", "section_id", "section_text", "title"])
        expected_feature_mapping = pd.DataFrame(
            [[0, "battery", False, 1],
             [0, "sound", True, 1],
             [1, "sound", True, 5]],
            columns=["doc_id", "feature", "is_explicit", "section_id"])
        expected_feature_counts = {"battery": 1, "sound": 2}

        pm.parsed_text = pm.read_annotated_data(filename='data/parse_and_model/iPod.final', nlines=10,
                                                start_line=660)

        self.assertTrue(expected_sections.equals(pm.parsed_text["section_list"]))
        self.assertTrue(expected_feature_mapping.equals(pm.parsed_text["feature_mapping"]))
        # Normalise to a plain dict so the comparison ignores the mapping's concrete type.
        self.assertEqual(expected_feature_counts, dict(pm.parsed_text["feature_list"]))
示例#13
0
    def test_read_plain_dat_three_line(self):
        """Parse the one-document-per-line file without passing an nlines limit.

        The expected section list has fourteen rows across three documents
        (sections 0-2, 3-6 and 7-13).
        """
        pm = ParseAndModel(filename='../tests/data/parse_and_model/oneLinePerDoc.txt',
                           input_type=ParseAndModel.InputType.docperline)

        expected_sections = pd.DataFrame(
            [[0, 0, "I am very pleased with the 4 GB iPod Nano that I purchased."],
             [0, 1, "It was very easy to download music onto it and it's very easy to move around in it."],
             [0, 2, "Recommend this item to anybody."],
             [1, 3, "I like the compact ipod and the features it offered."],
             [1, 4, "It is handy to carry around because of the  and easy to store."],
             [1, 5, "The light weight also makes it easy to move with."],
             [1, 6, "It works well and I have had no problems with it."],
             [2, 7, "This is my second iPod."],
             [2, 8, 'My first was a "mini" which the nano makes look like a "jumbo".'],
             [2, 9, "It's very lightweight, sound quality is typical of these devices."],
             [2, 10, "The battery life is outstanding (again, compared to the mini)."],
             [2, 11, "I've only had it for a month, but the battery so far is lasting over 8 hours."],
             [2, 12,
              "I haven't completely run it until it is dead yet, so I don't know how long it will really last."],
             [2, 13, "Awesome!"]],
            columns=["doc_id", "section_id", "section_text"])

        self.assertTrue(expected_sections.equals(pm.parsed_text["section_list"]))
示例#14
0
    def test_constructor_one_section(self):
        """Check the model results the constructor produces for one line of twoLineTest.txt.

        The constructor is given feature "screen", lemmatize_words=False and
        nlines=1; every expected probability is 1/3 over the three-word vocabulary
        "large", "clear", "screen".
        """
        pm = ParseAndModel(feature_list=["screen"], filename='data/parse_and_model/twoLineTest.txt',
                           lemmatize_words=False, nlines=1)
        results = pm.model_results

        expected_model_background = [1 / 3, 1 / 3, 1 / 3]
        expected_model_feature = [[1 / 3, 1 / 3, 1 / 3]]
        expected_section_word_counts_matrix = [[1, 1, 1]]
        expected_model_background_matrix = np.array([1 / 3, 1 / 3, 1 / 3])
        expected_model_feature_matrix = np.array([[1 / 3], [1 / 3], [1 / 3]])
        expected_vocab_lookup = {0: 'large', 1: 'clear', 2: 'screen'}

        # Comparing the values directly (instead of asserting on a precomputed bool)
        # lets unittest print a diff when a check fails.
        self.assertEqual(expected_model_background, results["model_background"])
        self.assertEqual(expected_model_feature, results["model_feature"])
        # Per-section word counts are only checked through the count matrix.
        self.assertTrue(
            np.array_equiv(expected_section_word_counts_matrix,
                           csr_matrix.toarray(results["section_word_counts_matrix"])))
        self.assertTrue(
            np.array_equiv(expected_model_background_matrix,
                           csr_matrix.toarray(results["model_background_matrix"])))
        self.assertTrue(
            np.array_equiv(expected_model_feature_matrix, results["model_feature_matrix"]))
        self.assertEqual(expected_vocab_lookup, results["vocabulary_lookup"])
示例#15
0
        :return:
        """
        logging.info(type(self).__name__, '- compute cost...')

        delta = np.square(np.subtract(self.pi_matrix, self.previous_pi_matrix))

        return delta.sum()


if __name__ == '__main__':
    # Manual smoke run: build the explicit models from the first 100 lines of
    # the iPod data, run EM for at most 10 iterations and report the wall time.
    print("CWD:", os.getcwd())
    start_time = time.time()

    print("Calling ParseAndModel...")
    demo_features = ["sound", "battery", ["screen", "display"]]
    pm = ParseAndModel(
        feature_list=demo_features,
        filename='../tests/data/parse_and_model/iPod.final',
        nlines=100,
    )
    print(pm.model_results.keys())

    print("Calling EMVectorByFeature")
    em = EmVectorByFeature(explicit_model=pm, max_iter=10)
    em.em()

    end_time = time.time()
    elapsed_seconds = round(end_time - start_time, 4)
    print("Elapsed: {} seconds".format(elapsed_seconds))


"""
    * Notation:
            v = number of words in vocabulary
    def test_against_original_1_50_iteration(self):
        """Compare GFLM word and section scores after 50 EM iterations with pickled reference values.

        The reference data is unpickled from ``original_code_data/``. EM is seeded
        with the pickled pi initialisation so the run is reproducible, and every
        score is compared with its reference after rounding to 8 decimal places.
        """
        # NOTE: pickle.load executes arbitrary code from the file; these are
        # presumably trusted fixtures checked in alongside the tests.

        # load pi initalization
        with open("original_code_data/test_original_1_pi_init.data", 'rb') as infile:
            pi_init = pickle.load(infile)

        # load gflm sentence calculation results
        with open("original_code_data/test_original_1_gflm_sentence_probs.data",
                  'rb') as infile:
            gflm_sentence_probs = pickle.load(infile)

        # load gflm sentence tagging results
        # NOTE(review): loaded but never compared below.
        with open("original_code_data/test_original_1_gflm_sentence_results.data",
                  'rb') as infile:
            gflm_sentence_results = pickle.load(infile)

        # load gflm word calculation result
        with open("original_code_data/test_original_1_gflm_word_probs.data",
                  'rb') as infile:
            gflm_word_probs = pickle.load(infile)

        # load gflm word tagging results
        # NOTE(review): loaded but never compared below.
        with open("original_code_data/test_original_1_gflm_word_results.data",
                  'rb') as infile:
            gflm_word_results = pickle.load(infile)

        pm_inst = ParseAndModel(feature_list=["sound", "battery"],
                                filename='data/parse_and_model/iPod.final',
                                remove_stopwords=False,
                                lemmatize_words=False,
                                log_base=None,
                                start_line=4,
                                nlines=11,
                                include_title_lines=False)

        # use fixed init
        # Copy the reference initialisation into a (sections x features) array,
        # translating each feature id to the feature name used as the reference key.
        pi_init_em = np.empty([
            pm_inst.model_results["section_word_counts_matrix"].shape[0],
            len(pm_inst.model_results["model_feature"])
        ])
        for review_id in range(0, len(pi_init)):
            review_sections = pm_inst.parsed_text["section_list"]
            review_sections = review_sections[review_sections.doc_id ==
                                              review_id]
            # Position of the section within its review, used to index the reference data.
            section_og_id = 0
            for section_index, section_row in review_sections.iterrows():
                for feature_id in range(
                        0, len(pi_init[review_id][section_og_id])):
                    feature_name_row = pm_inst.formatted_feature_list[
                        pm_inst.formatted_feature_list.feature_id ==
                        feature_id]
                    feature_name_row = feature_name_row.reset_index(drop=True)
                    pi_init_em[section_index,
                               feature_id] = pi_init[review_id][section_og_id][
                                   feature_name_row["feature"][0]]

                section_og_id += 1

        em = EmVectorByFeature(explicit_model=pm_inst,
                               max_iter=50,
                               lambda_background=0.7,
                               pi_init=pi_init_em)
        em.initialize_parameters()
        em.em_loop()

        # Calculate GFLM
        gflm = GFLM(em_results=em, section_threshold=0.35, word_threshold=0.35)
        gflm.calc_gflm_section()
        gflm.calc_gflm_word()

        # Check gflm word results
        for review_id in range(0, len(gflm_word_probs)):
            review_sections = pm_inst.parsed_text["section_list"]
            review_sections = review_sections[review_sections.doc_id ==
                                              review_id]
            section_og_id = 0
            for section_index, section_row in review_sections.iterrows():
                for feature_id in range(
                        0,
                        max(gflm.gflm_word_all.implicit_feature_id) + 1):
                    feature_name_row = pm_inst.formatted_feature_list[
                        pm_inst.formatted_feature_list.feature_id ==
                        feature_id]
                    feature_name_row = feature_name_row.reset_index(drop=True)
                    actual_param_data = gflm.gflm_word_all[
                        (gflm.gflm_word_all.section_id == section_index) &
                        (gflm.gflm_word_all.implicit_feature_id == feature_id)]
                    actual_param_data = actual_param_data.reset_index(
                        drop=True)
                    actual_param = actual_param_data.loc[0].gflm_word

                    # loop through words and grab relevant feature value
                    word_probs = list()
                    for word, probability in gflm_word_probs[review_id][
                            section_og_id].items():
                        word_probs.append(
                            probability[feature_name_row["feature"][0]])
                    # The reference score for a section is its highest per-word probability.
                    original_param = max(word_probs)

                    print(
                        "checking GFLM word probs - section:" +
                        str(section_index), ", feature=" + str(feature_id))
                    self.assertEqual(
                        round(actual_param, 8),
                        round(original_param, 8),
                        msg="feature=" + str(feature_name_row["feature"][0]) +
                        ", section= " + str(section_index) + ", a=" +
                        str(actual_param) + ", e=" + str(original_param))

                section_og_id += 1

        # Check gflm section results
        # NOTE(review): this loop is bounded by gflm_word_probs and
        # gflm_word_all rather than their section counterparts — confirm
        # that is intended.
        for review_id in range(0, len(gflm_word_probs)):
            review_sections = pm_inst.parsed_text["section_list"]
            review_sections = review_sections[review_sections.doc_id ==
                                              review_id]
            section_og_id = 0
            for section_index, section_row in review_sections.iterrows():
                for feature_id in range(
                        0,
                        max(gflm.gflm_word_all.implicit_feature_id) + 1):
                    feature_name_row = pm_inst.formatted_feature_list[
                        pm_inst.formatted_feature_list.feature_id ==
                        feature_id]
                    feature_name_row = feature_name_row.reset_index(drop=True)
                    actual_param_data = gflm.gflm_section_all[
                        (gflm.gflm_section_all.section_id == section_index)
                        & (gflm.gflm_section_all.implicit_feature_id ==
                           feature_id)]
                    actual_param_data = actual_param_data.reset_index(
                        drop=True)
                    actual_param = actual_param_data.loc[0].gflm_section

                    original_param = gflm_sentence_probs[review_id][
                        section_og_id][feature_name_row["feature"][0]]

                    print(
                        "checking GFLM section probs - section:" +
                        str(section_index), ", feature=" + str(feature_id))
                    self.assertEqual(
                        round(actual_param, 8),
                        round(original_param, 8),
                        msg="feature=" + str(feature_name_row["feature"][0]) +
                        ", section= " + str(section_index) + ", a=" +
                        str(actual_param) + ", e=" + str(original_param))

                section_og_id += 1
示例#17
0
    def test_against_original_1_single_iteration(self):

        # load pi initalization
        infile = open("original_code_data/test_original_1_pi_init.data", 'rb')
        pi_init = pickle.load(infile)
        infile.close()

        # load pi params (end of iteration 1)
        infile = open("original_code_data/test_original_1_pi_params_it1.data",
                      'rb')
        pi_params = pickle.load(infile)
        infile.close()

        # load hidden params (end of iteration 1)
        infile = open(
            "original_code_data/test_original_1_hidden_params_it1.data", 'rb')
        hidden_params = pickle.load(infile)
        infile.close()

        # load hidden background params (end of iteration 1)
        infile = open(
            "original_code_data/test_original_1_hidden_background_params_it1.data",
            'rb')
        hidden_back_params = pickle.load(infile)
        infile.close()

        # load pi deltas
        infile = open("original_code_data/test_original_1_pi_delta_it1.data",
                      'rb')
        pi_delta = pickle.load(infile)
        infile.close()

        pm_inst = ParseAndModel(feature_list=["sound", "battery"],
                                filename='data/parse_and_model/iPod.final',
                                remove_stopwords=False,
                                lemmatize_words=False,
                                log_base=None,
                                start_line=4,
                                nlines=11,
                                include_title_lines=False)

        # use fixed init
        # check section word counts
        pi_init_em = np.empty([
            pm_inst.model_results["section_word_counts_matrix"].shape[0],
            len(pm_inst.model_results["model_feature"])
        ])
        for review_id in range(0, len(pi_init)):
            review_sections = pm_inst.parsed_text["section_list"]
            review_sections = review_sections[review_sections.doc_id ==
                                              review_id]
            section_og_id = 0
            for section_index, section_row in review_sections.iterrows():
                for feature_id in range(
                        0, len(pi_init[review_id][section_og_id])):
                    feature_name_row = pm_inst.formatted_feature_list[
                        pm_inst.formatted_feature_list.feature_id ==
                        feature_id]
                    feature_name_row = feature_name_row.reset_index(drop=True)
                    pi_init_em[section_index,
                               feature_id] = pi_init[review_id][section_og_id][
                                   feature_name_row["feature"][0]]

                section_og_id += 1

        em = EmVectorByFeature(explicit_model=pm_inst,
                               max_iter=1,
                               lambda_background=0.7,
                               pi_init=pi_init_em)

        # TODO: this should probably be moved into a constructor somewhere
        em.initialize_parameters()

        em.em_loop()

        # Check hidden parameters
        dense_hidden_params = list()
        for feature_id in range(0, len(em.hidden_parameters)):
            dense_hidden_params.append(
                em.hidden_parameters[feature_id].toarray())
        inverse_vocab_lookup = {
            strip_punctuation(v): k
            for k, v in pm_inst.model_results["vocabulary_lookup"].items()
        }

        for review_id in range(0, len(hidden_params)):
            review_sections = pm_inst.parsed_text["section_list"]
            review_sections = review_sections[review_sections.doc_id ==
                                              review_id]
            section_og_id = 0
            for section_index, section_row in review_sections.iterrows():
                for word in hidden_params[review_id][section_og_id].keys():
                    word_id = inverse_vocab_lookup[strip_punctuation(word)]
                    for feature_id in range(
                            0,
                            len(hidden_params[review_id][section_og_id]
                                [word])):
                        feature_name_row = pm_inst.formatted_feature_list[
                            pm_inst.formatted_feature_list.feature_id ==
                            feature_id]
                        feature_name_row = feature_name_row.reset_index(
                            drop=True)
                        actual_param = dense_hidden_params[feature_id][
                            section_row["section_id"], word_id]
                        original_param = hidden_params[review_id][
                            section_og_id][word][feature_name_row["feature"]
                                                 [0]]

                        print("checking word:" + word)
                        self.assertEqual(round(actual_param, 8),
                                         round(original_param, 8),
                                         msg="hidden feature - feature_id: " +
                                         str(feature_name_row["feature"][0]) +
                                         ", word=" + word + ", a=" +
                                         str(actual_param) + ", e=" +
                                         str(original_param))

                section_og_id += 1

        # Check hidden background parameters
        dense_hidden_back_params = em.hidden_parameters_background.toarray()
        for review_id in range(0, len(hidden_back_params)):
            review_sections = pm_inst.parsed_text["section_list"]
            review_sections = review_sections[review_sections.doc_id ==
                                              review_id]
            section_og_id = 0
            for section_index, section_row in review_sections.iterrows():
                for word in hidden_params[review_id][section_og_id].keys():
                    word_id = inverse_vocab_lookup[strip_punctuation(word)]
                    actual_param = dense_hidden_back_params[
                        section_row["section_id"], word_id]
                    original_param = hidden_back_params[review_id][
                        section_og_id][word]

                    print("checking word:" + word)
                    self.assertEqual(round(actual_param, 8),
                                     round(original_param, 8),
                                     msg="hidden background: " + ", word=" +
                                     word + ", a=" + str(actual_param) +
                                     ", e=" + str(original_param))

                section_og_id += 1

        # Check pi parameters
        for review_id in range(0, len(pi_params)):
            review_sections = pm_inst.parsed_text["section_list"]
            review_sections = review_sections[review_sections.doc_id ==
                                              review_id]
            section_og_id = 0
            for section_index, section_row in review_sections.iterrows():
                for feature_id in range(
                        0, len(pi_params[review_id][section_og_id])):
                    feature_name_row = pm_inst.formatted_feature_list[
                        pm_inst.formatted_feature_list.feature_id ==
                        feature_id]
                    feature_name_row = feature_name_row.reset_index(drop=True)
                    actual_param = em.pi_matrix[section_index, feature_id]
                    original_param = pi_params[review_id][section_og_id][
                        feature_name_row["feature"][0]]

                    print("checking section:" + str(section_index))
                    self.assertEqual(round(actual_param, 8),
                                     round(original_param, 8),
                                     msg="pi params: " + ", feature=" +
                                     str(feature_name_row["feature"][0]) +
                                     ", section= " + str(section_index) +
                                     ", a=" + str(actual_param) + ", e=" +
                                     str(original_param))

                section_og_id += 1

        # Check pi deltas
        self.assertEqual(round(pi_delta, 8),
                         round(em.pi_delta, 8),
                         msg="pi delta: " + ", a=" + str(em.pi_delta) +
                         ", e=" + str(pi_delta))
示例#18
0
    def test_against_original_1(self):
        """Compare ParseAndModel results with pickled reference fixtures.

        Three pickled fixtures (topic model, section word counts, background
        model) are loaded from ``original_code_data/``; the paths are relative,
        so they resolve against the current working directory. A
        ``ParseAndModel`` is then built for the features ``sound`` and
        ``battery`` from ``data/parse_and_model/iPod.final`` (``start_line=4``,
        ``nlines=11``, no title lines, no stopword removal, no lemmatization)
        and its results are checked against the fixtures:

        * section word counts and background probabilities must be equal;
        * feature (topic) probabilities must be equal after rounding to
          8 decimal places.
        """

        def load_fixture(path):
            # The context manager closes the handle even if unpickling raises.
            # NOTE(review): pickle.load can execute arbitrary code; this
            # assumes the fixture files are trusted, repository-local data.
            with open(path, 'rb') as infile:
                return pickle.load(infile)

        # load topic model
        topic_model = load_fixture("original_code_data/test_original_1_topic_model.data")

        # load section word counts
        section_word_counts = load_fixture("original_code_data/test_original_1_section_word_counts.data")

        # load background model
        background_model = load_fixture("original_code_data/test_original_1_background_model.data")

        pm = ParseAndModel(feature_list=["sound", "battery"], filename='data/parse_and_model/iPod.final',
                           remove_stopwords=False, lemmatize_words=False, log_base=None, start_line=4, nlines=11,
                           include_title_lines=False)

        # check section word counts
        pm_section_word_counts = pm.model_results["section_word_counts_matrix"].toarray()
        # Map each (punctuation-stripped) vocabulary word back to its id.
        inverse_vocab_lookup = {strip_punctuation(v): k for k, v in pm.model_results["vocabulary_lookup"].items()}
        for review_id in range(0, len(section_word_counts)):
            print("SWC - Checking review: " + str(review_id))
            review_sections = pm.parsed_text["section_list"]
            review_sections = review_sections[review_sections.doc_id == review_id]
            # Position of the section inside this review in the reference
            # data; advanced once per section row of the review.
            section_og_id = 0
            for section_index, section_row in review_sections.iterrows():
                print("SWC - Checking section:" + str(section_row["section_id"]))
                for word in section_word_counts[review_id][section_og_id].keys():
                    word = strip_punctuation(word)
                    # Words consisting only of punctuation strip to '' and
                    # are not compared.
                    if word == '':
                        continue
                    vocab_word_id = inverse_vocab_lookup[word]
                    actual_count = pm_section_word_counts[section_row["section_id"], vocab_word_id]
                    # NOTE(review): the reference counts are indexed with the
                    # stripped word; this assumes stripping left the key
                    # usable for the lookup - confirm against the fixture.
                    original_count = section_word_counts[review_id][section_og_id][word]

                    self.assertEqual(actual_count, original_count, msg="SWC - section_id: " + str(
                        section_row["section_id"]) + ", " + word + " a=" + str(actual_count) + ", e=" + str(
                        original_count))

                section_og_id += 1

        # check background model
        for word in background_model.keys():
            print("Background - Checking word:" + word)
            word = strip_punctuation(word)
            vocab_word_id = inverse_vocab_lookup[word]

            actual_prob = pm.model_results["model_background"][vocab_word_id]
            # NOTE(review): same stripped-key assumption as for the section
            # word counts above - confirm against the fixture.
            original_prob = background_model[word]

            self.assertEqual(actual_prob, original_prob,
                             msg="Background prob:" + word + " a=" + str(
                                 actual_prob) + ", e=" + str(original_prob))

        # check topic model
        for f_index, feature_row in pm.formatted_feature_list.iterrows():
            for word_index, word in pm.model_results["vocabulary_lookup"].items():
                print("Topic - Checking word:" + word)

                word = strip_punctuation(word)
                feature_index = feature_row["feature_id"]
                actual_prob = pm.model_results["model_feature"][feature_index][word_index]
                original_prob = topic_model[feature_row["feature"]][word]

                # Rounded comparison: probabilities only need to agree to
                # 8 decimal places.
                self.assertEqual(round(actual_prob, 8), round(original_prob, 8),
                                 msg="topic - feature_id: " + str(
                                     feature_row["feature_id"]) + ", word=" + word + ", a=" + str(
                                     actual_prob) + ", e=" + str(original_prob))