def test_read_annotated_dat_one_line(self):
    """Read a single annotated line and verify the parsed section list.

    The first line of the iPod data set is a title line, so exactly one
    section row (flagged as a title) and an empty feature mapping are
    expected.
    """
    pm = ParseAndModel()
    df_section_list = pd.DataFrame(
        [[0, 0, "very pleased", True]],
        columns=["doc_id", "section_id", "section_text", "title"])
    df_feature_mapping = pd.DataFrame([])

    pm.parsed_text = pm.read_annotated_data(
        filename='data/parse_and_model/iPod.final', nlines=1)

    self.assertTrue(df_section_list.equals(pm.parsed_text["section_list"]))
    self.assertTrue(df_feature_mapping.equals(pm.parsed_text["feature_mapping"]))
def test_bem_two_section(self):
    """Build explicit models over two sections and verify all outputs.

    Uses lemmatize_words=False so tokens are counted verbatim and log
    base 2 for the feature model; feature-model probabilities are
    compared after rounding to 3 decimal places.
    """
    pm = ParseAndModel()
    section_list = pd.DataFrame(
        [[0, 0, "large clear screen", True],
         [0, 1, "large broken bad", True]],
        columns=["doc_id", "section_id", "section_text", "title"])
    pm.feature_list = ["screen"]
    pm.formatted_feature_list = pm.format_feature_list()
    pm.parsed_text = dict(section_list=section_list)

    pm.model_results = pm.build_explicit_models(lemmatize_words=False, log_base=2)

    expected_model_background = [1 / 3, 1 / 6, 1 / 6, 1 / 6, 1 / 6]
    expected_model_feature = [[0.218, 0.282, 0.282, 0.109, 0.109]]
    expected_section_word_counts_matrix = [[1, 1, 1, 0, 0],
                                           [1, 0, 0, 1, 1]]
    expected_model_background_matrix = np.array([1 / 3, 1 / 6, 1 / 6, 1 / 6, 1 / 6])
    expected_model_feature_matrix = np.array([[0.218], [0.282], [0.282], [0.109], [0.109]])
    expected_vocab_lookup = {0: 'large', 1: 'clear', 2: 'screen', 3: 'broken', 4: 'bad'}

    self.assertEqual(expected_model_background, pm.model_results["model_background"])
    self.assertEqual(
        expected_model_feature,
        [[round(val, 3) for val in feature_model]
         for feature_model in pm.model_results["model_feature"]])
    self.assertTrue(np.array_equiv(
        expected_section_word_counts_matrix,
        pm.model_results["section_word_counts_matrix"].toarray()))
    self.assertTrue(np.array_equiv(
        expected_model_background_matrix,
        pm.model_results["model_background_matrix"].toarray()))
    self.assertTrue(np.array_equiv(
        expected_model_feature_matrix,
        np.round(pm.model_results["model_feature_matrix"], 3)))
    self.assertEqual(expected_vocab_lookup, pm.model_results["vocabulary_lookup"])
def test_read_annotated_dat_one_feature_implicit(self):
    """Read one line carrying a single implicit ("size") feature annotation.

    Implicit features appear in the feature mapping with
    is_explicit=False and must not mark the section as a title.
    """
    pm = ParseAndModel()
    df_section_list = pd.DataFrame(
        [[0, 0, "it is handy to carry around because of the and easy to store", False]],
        columns=["doc_id", "section_id", "section_text", "title"])
    df_feature_mapping = pd.DataFrame(
        [[0, "size", False, 0]],
        columns=["doc_id", "feature", "is_explicit", "section_id"])

    pm.parsed_text = pm.read_annotated_data(
        filename='data/parse_and_model/iPod.final', nlines=1, start_line=6)

    self.assertTrue(df_section_list.equals(pm.parsed_text["section_list"]))
    self.assertTrue(df_feature_mapping.equals(pm.parsed_text["feature_mapping"]))
def test_bem_one_section(self):
    """Build explicit models over a single three-word section.

    With one section of three distinct words, background and feature
    models are each uniform at 1/3 per word.
    """
    pm = ParseAndModel()
    section_list = pd.DataFrame(
        [[0, 0, "large clear screen", True]],
        columns=["doc_id", "section_id", "section_text", "title"])
    pm.feature_list = ["screen"]
    pm.formatted_feature_list = pm.format_feature_list()
    pm.parsed_text = dict(section_list=section_list)

    pm.model_results = pm.build_explicit_models(log_base=2)

    expected_model_background = [1 / 3, 1 / 3, 1 / 3]
    expected_model_feature = [[1 / 3, 1 / 3, 1 / 3]]
    expected_section_word_counts_matrix = [[1, 1, 1]]
    expected_model_background_matrix = np.array([1 / 3, 1 / 3, 1 / 3])
    expected_model_feature_matrix = np.array([[1 / 3], [1 / 3], [1 / 3]])
    expected_vocab_lookup = {0: 'large', 1: 'clear', 2: 'screen'}

    self.assertEqual(expected_model_background, pm.model_results["model_background"])
    self.assertEqual(expected_model_feature, pm.model_results["model_feature"])
    self.assertTrue(np.array_equiv(
        expected_section_word_counts_matrix,
        pm.model_results["section_word_counts_matrix"].toarray()))
    self.assertTrue(np.array_equiv(
        expected_model_background_matrix,
        pm.model_results["model_background_matrix"].toarray()))
    self.assertTrue(np.array_equiv(
        expected_model_feature_matrix,
        pm.model_results["model_feature_matrix"]))
    self.assertEqual(expected_vocab_lookup, pm.model_results["vocabulary_lookup"])
def test_read_annotated_dat_one_feature_explicit(self):
    """Read one line carrying a single explicit ("battery") feature.

    Explicit features must appear in the feature mapping with
    is_explicit=True and be tallied in the feature-occurrence counts.
    """
    pm = ParseAndModel()
    df_section_list = pd.DataFrame(
        [[0, 0, "the battery life is outstanding (again, compared to the mini)", False]],
        columns=["doc_id", "section_id", "section_text", "title"])
    df_feature_mapping = pd.DataFrame(
        [[0, "battery", True, 0]],
        columns=["doc_id", "feature", "is_explicit", "section_id"])
    df_feature_list = defaultdict(int)
    df_feature_list["battery"] = 1

    pm.parsed_text = pm.read_annotated_data(
        filename='data/parse_and_model/iPod.final', nlines=1, start_line=13)

    self.assertTrue(df_section_list.equals(pm.parsed_text["section_list"]))
    self.assertTrue(df_feature_mapping.equals(pm.parsed_text["feature_mapping"]))
    self.assertEqual(dict(df_feature_list), dict(pm.parsed_text["feature_list"]))
def load_data(self, feature_list: list = None, filename: str = None,
              input_type: Enum = ParseAndModel.InputType.annotated,
              nlines: int = None):
    """
    Loads user data.

    :param feature_list: a list of strings and lists of strings. Individual
        strings will be given separate ids, lists of strings will be treated
        as synonyms and given the same feature id.
        ex. ["sound", "battery", ["screen", "display"]]
    :param filename: Filename for the data set
    :param input_type: An enum of type InputType, specifying the type of
        input data so the correct read function can be chosen. Options are
        "annotated" - which expects data in Santu's original format and
        "onedocperline" - which expects all data to be in a single file with
        one document per line
    :param nlines: Maximum number of lines from the file to read or None to
        read all lines
    :return: None
    """
    # Lazy %-style arg: the original passed the class name as a positional
    # logging arg with no placeholder, which makes logging raise a
    # formatting error at emit time.
    logging.info("%s - load_data...", type(self).__name__)

    # Both arguments are mandatory; show usage and abort if either is missing.
    if not feature_list or not filename:
        self.usage()
        exit(1)

    self.pm = ParseAndModel(feature_list=feature_list,  # list of features
                            filename=filename,  # file with input data
                            nlines=nlines,  # number of lines to read
                            input_type=input_type)  # input type as enum
def test_format_feature_list_synonym(self):
    """Synonyms share one feature_id but receive distinct feature_term_ids.

    "screen" and "display" are passed as a synonym group and must both map
    to feature_id 2.
    """
    expected = pd.DataFrame(
        [["sound", 0, 0],
         ["battery", 1, 1],
         ["screen", 2, 2],
         ["display", 2, 3]],
        columns=["feature", "feature_id", "feature_term_id"])

    pm = ParseAndModel()
    pm.feature_list = ["sound", "battery", ["screen", "display"]]
    pm.formatted_feature_list = pm.format_feature_list()

    self.assertTrue(expected.equals(pm.formatted_feature_list))
def test_constructor_twoline(self):
    """Constructor output feeds EmVectorByFeature with the expected matrices.

    The EM wrapper must expose the section word counts, background
    probabilities and (rounded) topic model produced by ParseAndModel.
    """
    pm_inst = ParseAndModel(
        feature_list=["screen"],
        filename='../tests/data/parse_and_model/twoLineTest.txt',
        log_base=2)
    em = EmVectorByFeature(explicit_model=pm_inst)

    expected_section_word_counts_matrix = [[1, 1, 1, 0, 0],
                                           [1, 0, 0, 1, 1]]
    expected_model_background_matrix = np.array([1 / 3, 1 / 6, 1 / 6, 1 / 6, 1 / 6])
    expected_model_feature_matrix = np.array([[0.218], [0.282], [0.282], [0.109], [0.109]])

    self.assertTrue(
        np.array_equiv(expected_section_word_counts_matrix,
                       em.reviews_matrix.toarray()),
        msg="section counts do not match")
    self.assertTrue(
        np.array_equiv(expected_model_background_matrix,
                       em.background_probability.toarray()),
        msg="background model does not match")
    self.assertTrue(
        np.array_equiv(expected_model_feature_matrix,
                       np.round(em.topic_model, 3)),
        msg="topic models do not match")
def test_read_plain_dat_one_line(self):
    """Read one doc-per-line document and verify sentence segmentation.

    A single input line must be split into three sections sharing doc_id 0.
    """
    pm = ParseAndModel(filename='../tests/data/parse_and_model/oneLinePerDoc.txt',
                       input_type=ParseAndModel.InputType.docperline,
                       nlines=1)
    df_section_list = pd.DataFrame(
        [[0, 0, "I am very pleased with the 4 GB iPod Nano that I purchased."],
         [0, 1, "It was very easy to download music onto it and it's very easy to move around in it."],
         [0, 2, "Recommend this item to anybody."]],
        columns=["doc_id", "section_id", "section_text"])

    self.assertTrue(df_section_list.equals(pm.parsed_text["section_list"]))
def test_read_annotated_dat_two_feature_explicit(self):
    """Read one line annotated with two explicit features ("screen", "size").

    Both features must map to the same section and each be counted once.
    """
    pm = ParseAndModel()
    df_section_list = pd.DataFrame(
        [[0, 0, "my son loves the nano, it is small and has a good size screen", False]],
        columns=["doc_id", "section_id", "section_text", "title"])
    df_feature_mapping = pd.DataFrame(
        [[0, "screen", True, 0],
         [0, "size", True, 0]],
        columns=["doc_id", "feature", "is_explicit", "section_id"])
    df_feature_list = defaultdict(int)
    df_feature_list["screen"] = 1
    df_feature_list["size"] = 1

    pm.parsed_text = pm.read_annotated_data(
        filename='data/parse_and_model/iPod.final', nlines=1, start_line=622)

    self.assertTrue(df_section_list.equals(pm.parsed_text["section_list"]))
    self.assertTrue(df_feature_mapping.equals(pm.parsed_text["feature_mapping"]))
    self.assertEqual(dict(df_feature_list), dict(pm.parsed_text["feature_list"]))
def test_read_plain_dat_two_line(self):
    """Read two doc-per-line documents; section ids continue across docs.

    The second document's sections must start at section_id 3, carrying
    doc_id 1.
    """
    pm = ParseAndModel(filename='../tests/data/parse_and_model/oneLinePerDoc.txt',
                       input_type=ParseAndModel.InputType.docperline,
                       nlines=2)
    df_section_list = pd.DataFrame(
        [[0, 0, "I am very pleased with the 4 GB iPod Nano that I purchased."],
         [0, 1, "It was very easy to download music onto it and it's very easy to move around in it."],
         [0, 2, "Recommend this item to anybody."],
         [1, 3, "I like the compact ipod and the features it offered."],
         [1, 4, "It is handy to carry around because of the and easy to store."],
         [1, 5, "The light weight also makes it easy to move with."],
         [1, 6, "It works well and I have had no problems with it."]],
        columns=["doc_id", "section_id", "section_text"])

    self.assertTrue(df_section_list.equals(pm.parsed_text["section_list"]))
def test_read_annotated_dat_complicated_reviews(self):
    """Read ten annotated lines spanning two reviews with mixed annotations.

    Covers title lines, implicit and explicit features, and a feature
    ("sound") that occurs in both reviews and must be counted twice.
    """
    pm = ParseAndModel()
    df_section_list = pd.DataFrame(
        [[0, 0, "it could be better", True],
         [0, 1, "this item is very nice and plays songs in a very good stereo sound but my problem with this item is the it does not last not even near the 14 hours they claimed to last", False],
         [0, 2, "i hope the new nano is at leat close to the hours claimed", False],
         [1, 3, "pink apple 4gb nano ipod review", True],
         [1, 4, "it was a gift for christmas to my daughter", False],
         [1, 5, "she absolutely loves it! the sound quality is excellent!! the different colors make a nice option as well", False],
         [1, 6, "i originally picked a silver one, because that is all the store had", False],
         [1, 7, "i then checked amazon", False],
         [1, 8, "com that not only had it in pink(the color my daughter wanted), [", False],
         [1, 9, "]i would recommend this product to everyone", False]],
        columns=["doc_id", "section_id", "section_text", "title"])
    df_feature_mapping = pd.DataFrame(
        [[0, "battery", False, 1],
         [0, "sound", True, 1],
         [1, "sound", True, 5]],
        columns=["doc_id", "feature", "is_explicit", "section_id"])
    df_feature_list = defaultdict(int)
    df_feature_list["battery"] = 1
    df_feature_list["sound"] = 2

    pm.parsed_text = pm.read_annotated_data(
        filename='data/parse_and_model/iPod.final', nlines=10, start_line=660)

    self.assertTrue(df_section_list.equals(pm.parsed_text["section_list"]))
    self.assertTrue(df_feature_mapping.equals(pm.parsed_text["feature_mapping"]))
    self.assertEqual(dict(df_feature_list), dict(pm.parsed_text["feature_list"]))
def test_read_plain_dat_three_line(self):
    """Read the whole doc-per-line file (no nlines cap): three documents.

    Verifies sentence segmentation and continuous section numbering
    across all three documents.
    """
    pm = ParseAndModel(filename='../tests/data/parse_and_model/oneLinePerDoc.txt',
                       input_type=ParseAndModel.InputType.docperline)
    df_section_list = pd.DataFrame(
        [[0, 0, "I am very pleased with the 4 GB iPod Nano that I purchased."],
         [0, 1, "It was very easy to download music onto it and it's very easy to move around in it."],
         [0, 2, "Recommend this item to anybody."],
         [1, 3, "I like the compact ipod and the features it offered."],
         [1, 4, "It is handy to carry around because of the and easy to store."],
         [1, 5, "The light weight also makes it easy to move with."],
         [1, 6, "It works well and I have had no problems with it."],
         [2, 7, "This is my second iPod."],
         [2, 8, 'My first was a "mini" which the nano makes look like a "jumbo".'],
         [2, 9, "It's very lightweight, sound quality is typical of these devices."],
         [2, 10, "The battery life is outstanding (again, compared to the mini)."],
         [2, 11, "I've only had it for a month, but the battery so far is lasting over 8 hours."],
         [2, 12, "I haven't completely run it until it is dead yet, so I don't know how long it will really last."],
         [2, 13, "Awesome!"]],
        columns=["doc_id", "section_id", "section_text"])

    self.assertTrue(df_section_list.equals(pm.parsed_text["section_list"]))
def test_constructor_one_section(self):
    """End-to-end constructor run on the first line of twoLineTest.txt.

    With a single three-word section and lemmatization disabled, the
    background and feature models are uniform at 1/3 per word.
    """
    pm = ParseAndModel(feature_list=["screen"],
                       filename='data/parse_and_model/twoLineTest.txt',
                       lemmatize_words=False,
                       nlines=1)

    expected_model_background = [1 / 3, 1 / 3, 1 / 3]
    expected_model_feature = [[1 / 3, 1 / 3, 1 / 3]]
    expected_section_word_counts_matrix = [[1, 1, 1]]
    expected_model_background_matrix = np.array([1 / 3, 1 / 3, 1 / 3])
    expected_model_feature_matrix = np.array([[1 / 3], [1 / 3], [1 / 3]])
    expected_vocab_lookup = {0: 'large', 1: 'clear', 2: 'screen'}

    self.assertEqual(expected_model_background, pm.model_results["model_background"])
    self.assertEqual(expected_model_feature, pm.model_results["model_feature"])
    self.assertTrue(np.array_equiv(
        expected_section_word_counts_matrix,
        pm.model_results["section_word_counts_matrix"].toarray()))
    self.assertTrue(np.array_equiv(
        expected_model_background_matrix,
        pm.model_results["model_background_matrix"].toarray()))
    self.assertTrue(np.array_equiv(
        expected_model_feature_matrix,
        pm.model_results["model_feature_matrix"]))
    self.assertEqual(expected_vocab_lookup, pm.model_results["vocabulary_lookup"])
:return: """ logging.info(type(self).__name__, '- compute cost...') delta = np.square(np.subtract(self.pi_matrix, self.previous_pi_matrix)) return delta.sum() if __name__ == '__main__': print("CWD:", os.getcwd()) start_time = time.time() print("Calling ParseAndModel...") pm = ParseAndModel(feature_list=["sound", "battery", ["screen", "display"]], filename='../tests/data/parse_and_model/iPod.final', nlines=100) print(pm.model_results.keys()) print("Calling EMVectorByFeature") em = EmVectorByFeature(explicit_model=pm, max_iter=10) em.em() end_time = time.time() print("Elapsed: {} seconds".format(round(end_time - start_time, 4))) """ * Notation: v = number of words in vocabulary
def test_against_original_1_50_iteration(self):
    """Run 50 EM iterations plus GFLM and compare against the original code.

    Reference values were pickled from the original (non-vectorized)
    implementation; GFLM word and section scores must agree to 8 decimal
    places.
    """

    def _load_pickle(path):
        # Context manager guarantees the handle is closed even on error
        # (the original opened/closed by hand, leaking on exception).
        with open(path, 'rb') as infile:
            return pickle.load(infile)

    # Fixed pi initialization and reference GFLM probabilities.
    # NOTE(review): the original also loaded *_gflm_sentence_results.data and
    # *_gflm_word_results.data but never used them; those loads are dropped.
    pi_init = _load_pickle("original_code_data/test_original_1_pi_init.data")
    gflm_sentence_probs = _load_pickle(
        "original_code_data/test_original_1_gflm_sentence_probs.data")
    gflm_word_probs = _load_pickle(
        "original_code_data/test_original_1_gflm_word_probs.data")

    pm_inst = ParseAndModel(feature_list=["sound", "battery"],
                            filename='data/parse_and_model/iPod.final',
                            remove_stopwords=False,
                            lemmatize_words=False,
                            log_base=None,
                            start_line=4,
                            nlines=11,
                            include_title_lines=False)

    # Re-index the nested pi init (review -> section -> feature name) into a
    # dense (section, feature_id) matrix for the vectorized EM.
    pi_init_em = np.empty([
        pm_inst.model_results["section_word_counts_matrix"].shape[0],
        len(pm_inst.model_results["model_feature"])
    ])
    for review_id in range(0, len(pi_init)):
        review_sections = pm_inst.parsed_text["section_list"]
        review_sections = review_sections[review_sections.doc_id == review_id]
        section_og_id = 0
        for section_index, section_row in review_sections.iterrows():
            for feature_id in range(0, len(pi_init[review_id][section_og_id])):
                feature_name_row = pm_inst.formatted_feature_list[
                    pm_inst.formatted_feature_list.feature_id == feature_id]
                feature_name_row = feature_name_row.reset_index(drop=True)
                pi_init_em[section_index, feature_id] = \
                    pi_init[review_id][section_og_id][feature_name_row["feature"][0]]
            section_og_id += 1

    em = EmVectorByFeature(explicit_model=pm_inst,
                           max_iter=50,
                           lambda_background=0.7,
                           pi_init=pi_init_em)
    em.initialize_parameters()
    em.em_loop()

    # Calculate GFLM
    gflm = GFLM(em_results=em, section_threshold=0.35, word_threshold=0.35)
    gflm.calc_gflm_section()
    gflm.calc_gflm_word()

    # Check GFLM word results
    for review_id in range(0, len(gflm_word_probs)):
        review_sections = pm_inst.parsed_text["section_list"]
        review_sections = review_sections[review_sections.doc_id == review_id]
        section_og_id = 0
        for section_index, section_row in review_sections.iterrows():
            for feature_id in range(
                    0, max(gflm.gflm_word_all.implicit_feature_id) + 1):
                feature_name_row = pm_inst.formatted_feature_list[
                    pm_inst.formatted_feature_list.feature_id == feature_id]
                feature_name_row = feature_name_row.reset_index(drop=True)

                actual_param_data = gflm.gflm_word_all[
                    (gflm.gflm_word_all.section_id == section_index)
                    & (gflm.gflm_word_all.implicit_feature_id == feature_id)]
                actual_param_data = actual_param_data.reset_index(drop=True)
                actual_param = actual_param_data.loc[0].gflm_word

                # The original stores per-word probabilities; the section's
                # GFLM word score is their maximum over the section.
                word_probs = list()
                for word, probability in gflm_word_probs[review_id][
                        section_og_id].items():
                    word_probs.append(probability[feature_name_row["feature"][0]])
                original_param = max(word_probs)

                print("checking GFLM word probs - section:" + str(section_index),
                      ", feature=" + str(feature_id))
                self.assertEqual(
                    round(actual_param, 8), round(original_param, 8),
                    msg="feature=" + str(feature_name_row["feature"][0]) +
                        ", section= " + str(section_index) +
                        ", a=" + str(actual_param) +
                        ", e=" + str(original_param))
            section_og_id += 1

    # Check GFLM section results
    for review_id in range(0, len(gflm_word_probs)):
        review_sections = pm_inst.parsed_text["section_list"]
        review_sections = review_sections[review_sections.doc_id == review_id]
        section_og_id = 0
        for section_index, section_row in review_sections.iterrows():
            for feature_id in range(
                    0, max(gflm.gflm_word_all.implicit_feature_id) + 1):
                feature_name_row = pm_inst.formatted_feature_list[
                    pm_inst.formatted_feature_list.feature_id == feature_id]
                feature_name_row = feature_name_row.reset_index(drop=True)

                actual_param_data = gflm.gflm_section_all[
                    (gflm.gflm_section_all.section_id == section_index)
                    & (gflm.gflm_section_all.implicit_feature_id == feature_id)]
                actual_param_data = actual_param_data.reset_index(drop=True)
                actual_param = actual_param_data.loc[0].gflm_section

                original_param = gflm_sentence_probs[review_id][
                    section_og_id][feature_name_row["feature"][0]]

                print("checking GFLM section probs - section:" + str(section_index),
                      ", feature=" + str(feature_id))
                self.assertEqual(
                    round(actual_param, 8), round(original_param, 8),
                    msg="feature=" + str(feature_name_row["feature"][0]) +
                        ", section= " + str(section_index) +
                        ", a=" + str(actual_param) +
                        ", e=" + str(original_param))
            section_og_id += 1
def test_against_original_1_single_iteration(self):
    """Run a single EM iteration and compare every parameter to the original.

    Checks hidden (per-feature) parameters, hidden background parameters,
    the pi matrix and the pi delta against values pickled from the original
    implementation after its first iteration.
    """

    def _load_pickle(path):
        # Context manager guarantees the handle is closed even on error
        # (the original opened/closed by hand, leaking on exception).
        with open(path, 'rb') as infile:
            return pickle.load(infile)

    # Fixed pi initialization and end-of-iteration-1 reference values.
    pi_init = _load_pickle("original_code_data/test_original_1_pi_init.data")
    pi_params = _load_pickle("original_code_data/test_original_1_pi_params_it1.data")
    hidden_params = _load_pickle(
        "original_code_data/test_original_1_hidden_params_it1.data")
    hidden_back_params = _load_pickle(
        "original_code_data/test_original_1_hidden_background_params_it1.data")
    pi_delta = _load_pickle("original_code_data/test_original_1_pi_delta_it1.data")

    pm_inst = ParseAndModel(feature_list=["sound", "battery"],
                            filename='data/parse_and_model/iPod.final',
                            remove_stopwords=False,
                            lemmatize_words=False,
                            log_base=None,
                            start_line=4,
                            nlines=11,
                            include_title_lines=False)

    # Re-index the nested pi init (review -> section -> feature name) into a
    # dense (section, feature_id) matrix for the vectorized EM.
    pi_init_em = np.empty([
        pm_inst.model_results["section_word_counts_matrix"].shape[0],
        len(pm_inst.model_results["model_feature"])
    ])
    for review_id in range(0, len(pi_init)):
        review_sections = pm_inst.parsed_text["section_list"]
        review_sections = review_sections[review_sections.doc_id == review_id]
        section_og_id = 0
        for section_index, section_row in review_sections.iterrows():
            for feature_id in range(0, len(pi_init[review_id][section_og_id])):
                feature_name_row = pm_inst.formatted_feature_list[
                    pm_inst.formatted_feature_list.feature_id == feature_id]
                feature_name_row = feature_name_row.reset_index(drop=True)
                pi_init_em[section_index, feature_id] = \
                    pi_init[review_id][section_og_id][feature_name_row["feature"][0]]
            section_og_id += 1

    # TODO: this should probably be moved into a constructor somewhere
    em = EmVectorByFeature(explicit_model=pm_inst,
                           max_iter=1,
                           lambda_background=0.7,
                           pi_init=pi_init_em)
    em.initialize_parameters()
    em.em_loop()

    # Check hidden parameters
    dense_hidden_params = list()
    for feature_id in range(0, len(em.hidden_parameters)):
        dense_hidden_params.append(em.hidden_parameters[feature_id].toarray())

    inverse_vocab_lookup = {
        strip_punctuation(v): k
        for k, v in pm_inst.model_results["vocabulary_lookup"].items()
    }
    for review_id in range(0, len(hidden_params)):
        review_sections = pm_inst.parsed_text["section_list"]
        review_sections = review_sections[review_sections.doc_id == review_id]
        section_og_id = 0
        for section_index, section_row in review_sections.iterrows():
            for word in hidden_params[review_id][section_og_id].keys():
                word_id = inverse_vocab_lookup[strip_punctuation(word)]
                for feature_id in range(
                        0, len(hidden_params[review_id][section_og_id][word])):
                    feature_name_row = pm_inst.formatted_feature_list[
                        pm_inst.formatted_feature_list.feature_id == feature_id]
                    feature_name_row = feature_name_row.reset_index(drop=True)

                    actual_param = dense_hidden_params[feature_id][
                        section_row["section_id"], word_id]
                    original_param = hidden_params[review_id][section_og_id][
                        word][feature_name_row["feature"][0]]

                    print("checking word:" + word)
                    self.assertEqual(round(actual_param, 8),
                                     round(original_param, 8),
                                     msg="hidden feature - feature_id: " +
                                         str(feature_name_row["feature"][0]) +
                                         ", word=" + word +
                                         ", a=" + str(actual_param) +
                                         ", e=" + str(original_param))
            section_og_id += 1

    # Check hidden background parameters
    dense_hidden_back_params = em.hidden_parameters_background.toarray()
    for review_id in range(0, len(hidden_back_params)):
        review_sections = pm_inst.parsed_text["section_list"]
        review_sections = review_sections[review_sections.doc_id == review_id]
        section_og_id = 0
        for section_index, section_row in review_sections.iterrows():
            # Iterates hidden_params' keys (as the original did); both dicts
            # are expected to share the same word keys per section.
            for word in hidden_params[review_id][section_og_id].keys():
                word_id = inverse_vocab_lookup[strip_punctuation(word)]

                actual_param = dense_hidden_back_params[
                    section_row["section_id"], word_id]
                original_param = hidden_back_params[review_id][section_og_id][word]

                print("checking word:" + word)
                self.assertEqual(round(actual_param, 8),
                                 round(original_param, 8),
                                 msg="hidden background: " +
                                     ", word=" + word +
                                     ", a=" + str(actual_param) +
                                     ", e=" + str(original_param))
            section_og_id += 1

    # Check pi parameters
    for review_id in range(0, len(pi_params)):
        review_sections = pm_inst.parsed_text["section_list"]
        review_sections = review_sections[review_sections.doc_id == review_id]
        section_og_id = 0
        for section_index, section_row in review_sections.iterrows():
            for feature_id in range(0, len(pi_params[review_id][section_og_id])):
                feature_name_row = pm_inst.formatted_feature_list[
                    pm_inst.formatted_feature_list.feature_id == feature_id]
                feature_name_row = feature_name_row.reset_index(drop=True)

                actual_param = em.pi_matrix[section_index, feature_id]
                original_param = pi_params[review_id][section_og_id][
                    feature_name_row["feature"][0]]

                print("checking section:" + str(section_index))
                self.assertEqual(round(actual_param, 8),
                                 round(original_param, 8),
                                 msg="pi params: " +
                                     ", feature=" + str(feature_name_row["feature"][0]) +
                                     ", section= " + str(section_index) +
                                     ", a=" + str(actual_param) +
                                     ", e=" + str(original_param))
            section_og_id += 1

    # Check pi deltas
    self.assertEqual(round(pi_delta, 8), round(em.pi_delta, 8),
                     msg="pi delta: " + ", a=" + str(em.pi_delta) +
                         ", e=" + str(pi_delta))
def test_against_original_1(self):
    """Compare word counts, background model and topic model to the original.

    Reference values were pickled from the original implementation over the
    same 11-line slice of the iPod data set.
    """

    def _load_pickle(path):
        # Context manager guarantees the handle is closed even on error
        # (the original opened/closed by hand, leaking on exception).
        with open(path, 'rb') as infile:
            return pickle.load(infile)

    topic_model = _load_pickle(
        "original_code_data/test_original_1_topic_model.data")
    section_word_counts = _load_pickle(
        "original_code_data/test_original_1_section_word_counts.data")
    background_model = _load_pickle(
        "original_code_data/test_original_1_background_model.data")

    pm = ParseAndModel(feature_list=["sound", "battery"],
                       filename='data/parse_and_model/iPod.final',
                       remove_stopwords=False,
                       lemmatize_words=False,
                       log_base=None,
                       start_line=4,
                       nlines=11,
                       include_title_lines=False)

    # check section word counts
    pm_section_word_counts = pm.model_results["section_word_counts_matrix"].toarray()
    inverse_vocab_lookup = {
        strip_punctuation(v): k
        for k, v in pm.model_results["vocabulary_lookup"].items()
    }
    for review_id in range(0, len(section_word_counts)):
        print("SWC - Checking review: " + str(review_id))
        review_sections = pm.parsed_text["section_list"]
        review_sections = review_sections[review_sections.doc_id == review_id]
        section_og_id = 0
        for section_index, section_row in review_sections.iterrows():
            print("SWC - Checking section:" + str(section_row["section_id"]))
            for raw_word in section_word_counts[review_id][section_og_id].keys():
                word = strip_punctuation(raw_word)
                if word == '':
                    continue
                vocab_word_id = inverse_vocab_lookup[word]
                actual_count = pm_section_word_counts[
                    section_row["section_id"], vocab_word_id]
                # Index the original counts with the raw key: the original
                # code looked up the stripped word, which raises KeyError
                # whenever stripping changed the token.
                original_count = section_word_counts[review_id][section_og_id][raw_word]
                self.assertEqual(actual_count, original_count,
                                 msg="SWC - section_id: " +
                                     str(section_row["section_id"]) + ", " + word +
                                     " a=" + str(actual_count) +
                                     ", e=" + str(original_count))
            section_og_id += 1

    # check background model
    for raw_word in background_model.keys():
        print("Background - Checking word:" + raw_word)
        word = strip_punctuation(raw_word)
        vocab_word_id = inverse_vocab_lookup[word]
        actual_prob = pm.model_results["model_background"][vocab_word_id]
        # Raw key again: the original model's dict is keyed on unstripped words.
        original_prob = background_model[raw_word]
        self.assertEqual(actual_prob, original_prob,
                         msg="Background prob:" + word +
                             " a=" + str(actual_prob) +
                             ", e=" + str(original_prob))

    # check topic model
    for f_index, feature_row in pm.formatted_feature_list.iterrows():
        for word_index, word in pm.model_results["vocabulary_lookup"].items():
            print("Topic - Checking word:" + word)
            # Vocabulary entries are stripped to match the original
            # topic model's keys.
            word = strip_punctuation(word)
            feature_index = feature_row["feature_id"]
            actual_prob = pm.model_results["model_feature"][feature_index][word_index]
            original_prob = topic_model[feature_row["feature"]][word]
            self.assertEqual(round(actual_prob, 8), round(original_prob, 8),
                             msg="topic - feature_id: " +
                                 str(feature_row["feature_id"]) +
                                 ", word=" + word +
                                 ", a=" + str(actual_prob) +
                                 ", e=" + str(original_prob))