def remove_stopped_terms(term_list):
    """Return the lower-cased, de-duplicated terms that survive filtering.

    The input is first passed through ``remove_terms_with_number`` (defined
    elsewhere). A term is then dropped when any of the following holds:

    * its lower-cased form appears in ``stopped_terms.txt``;
    * it is shorter than 5 characters;
    * its lower-cased form does not appear in ``used_terms.txt``.

    Both text files are opened by relative path through ``ftools.read_txt``
    on every call.

    Args:
        term_list: Iterable of term strings.

    Returns:
        A list of unique lower-cased terms. The list is produced from a
        ``set``, so its order is unspecified.
    """
    term_list = remove_terms_with_number(term_list)

    # Build the lookup sets once so each membership test is O(1) instead of a
    # linear scan per term.
    # NOTE(review): assumes ftools.read_txt returns a list of strings (one
    # entry per line) -- confirm against ftools.
    stopped_terms = set(ftools.read_txt("stopped_terms.txt"))
    used_terms = set(ftools.read_txt("used_terms.txt"))

    # Single filtering pass; this replaces the collect-then-list.remove()
    # approach, which was quadratic in the number of rejected terms.
    kept_terms = []
    for term in term_list:
        lowered = term.lower()
        if len(term) < 5 or lowered in stopped_terms or lowered not in used_terms:
            continue
        kept_terms.append(lowered)

    return list(set(kept_terms))
def read_20_newsgroup_weight(root_path):
    """Compute term weights for every file under ``mini_newsgroups/``.

    Paths are built by plain string concatenation, so ``root_path`` must
    already end with a path separator.

    Args:
        root_path: Directory prefix holding ``mini_newsgroups/`` and
            ``dictionary.txt``.

    Returns:
        Whatever ``data_preprocess.get_txt_weight`` returns for the per-file
        term lists and the dictionary read from ``dictionary.txt``.
    """
    corpus_root = root_path + "mini_newsgroups/"
    group_names = os.listdir(corpus_root)
    dictionary = ftools.read_txt(root_path + "dictionary.txt")

    # Gather every document path first; files are parsed only after all
    # sub-directories have been listed.
    document_paths = []
    for group_name in group_names:
        group_dir = corpus_root + group_name + "/"
        document_paths.extend(
            group_dir + file_name for file_name in os.listdir(group_dir)
        )

    documents_terms = [
        read_20_newsgroup_file_term(document_path)
        for document_path in document_paths
    ]
    return data_preprocess.get_txt_weight(documents_terms, dictionary)
def read_20_newsgroup_file_term(text_path):
    """Extract the filtered, de-duplicated terms from one newsgroup file.

    Each line is tokenised with ``data_preprocess.get_txt_terms``. The first
    tokenised line whose first token is ``"Lines"`` and whose second token
    parses as an integer gives the number of trailing lines to keep. Terms
    from those lines are merged, de-duplicated and passed through
    ``data_preprocess.remove_stopped_terms``.

    Args:
        text_path: Path of the file to read via ``ftools.read_txt``.

    Returns:
        The result of ``data_preprocess.remove_stopped_terms`` applied to the
        unique terms of the selected lines.
    """
    file_txt_lines = ftools.read_txt(text_path)
    file_txt_lines_terms = [
        data_preprocess.get_txt_terms(line) for line in file_txt_lines
    ]

    # None means "no usable Lines header was found".
    lines_num = None
    for line_term in file_txt_lines_terms:
        if len(line_term) > 1 and line_term[0] == "Lines":
            try:
                lines_num = int(line_term[1])
            except ValueError:
                # A line that merely starts with the word "Lines" is not the
                # header; keep scanning instead of crashing.
                continue
            break

    if lines_num is None:
        # NOTE(review): preserved from the original, whose -1 sentinel made
        # the slice [1:] (everything except the first line). Confirm this
        # fallback is intended.
        needed_lines_terms = file_txt_lines_terms[1:]
    elif lines_num <= 0:
        # seq[-0:] is seq[0:], i.e. the whole file including headers, so a
        # declared count of zero must be handled explicitly.
        needed_lines_terms = []
    else:
        needed_lines_terms = file_txt_lines_terms[-lines_num:]

    unique_terms = set()
    for nlt in needed_lines_terms:
        unique_terms.update(nlt)

    return data_preprocess.remove_stopped_terms(list(unique_terms))
def read_NYSK_weight(root_path):
    """Compute term weights for the NYSK data found under ``root_path``.

    Args:
        root_path: Prefix passed to ``read_NYSK_file_text_term_list`` and
            concatenated with ``"dictionary.txt"`` to locate the dictionary.

    Returns:
        The value of ``data_preprocess.get_txt_weight`` for the per-text term
        lists and the dictionary.
    """
    documents_terms = read_NYSK_file_text_term_list(root_path)
    vocabulary = ftools.read_txt(root_path + "dictionary.txt")
    return data_preprocess.get_txt_weight(documents_terms, vocabulary)
def read_NYSK_feature_vector(root_path):
    """Build feature vectors for the NYSK data found under ``root_path``.

    Args:
        root_path: Prefix passed to ``read_NYSK_file_text_term_list`` and
            concatenated with ``"dictionary.txt"`` to locate the dictionary.

    Returns:
        The value of ``data_preprocess.get_txt_feature_vector`` for the
        per-text term lists and the dictionary.
    """
    documents_terms = read_NYSK_file_text_term_list(root_path)
    vocabulary = ftools.read_txt(root_path + "dictionary.txt")
    return data_preprocess.get_txt_feature_vector(documents_terms, vocabulary)