def WORDNET_pre_proc(suspicious_corpus):
    """Pre-process each suspicious document for the WordNet comparison stage.

    Each document is lower-cased, stripped of punctuation, and cleaned.

    Args:
        suspicious_corpus: iterable of raw document texts.

    Returns:
        list: the pre-processed documents, in input order.
    """
    pre_processed_files = []
    for text in suspicious_corpus:
        suspicious = Pre_Processing.lower_case(text)
        # BUG FIX: the original discarded the return values of
        # remove_punctuation() and clean_text(), so only lower-casing was
        # applied. Every sibling pre-proc function assigns these back,
        # which shows the helpers return transformed copies.
        suspicious = Pre_Processing.remove_punctuation(suspicious)
        suspicious = Pre_Processing.clean_text(suspicious)
        pre_processed_files.append(suspicious)
    print("WordNet Pre-Processing Complete")
    return pre_processed_files
def TFIDF_pre_proc(suspicious_corpus):
    """Run the full TF-IDF pre-processing pipeline over a corpus.

    Each document is lower-cased, stripped of punctuation, cleaned,
    tokenized, stop-word filtered, and lemmatized, in that order.

    Args:
        suspicious_corpus: iterable of raw document texts.

    Returns:
        list: the fully pre-processed documents, in input order.
    """
    # Ordered chain of transformations applied to every document.
    pipeline = (
        Pre_Processing.lower_case,
        Pre_Processing.remove_punctuation,
        Pre_Processing.clean_text,
        Pre_Processing.tokenization,
        Pre_Processing.remove_stopwords,
        Pre_Processing.lemmatize_words,
    )
    processed = []
    for document in suspicious_corpus:
        for step in pipeline:
            document = step(document)
        processed.append(document)
    print("TFIDF Pre-Processing Complete")
    return processed
def LCS_pre_proc(original_corpora, suspicious_corpus):
    """Pre-process the original text and the suspicious corpus for LCS.

    Both sides get the same basic cleaning: lower-case, punctuation
    removal, then text cleaning.

    Args:
        original_corpora: the raw original document text.
        suspicious_corpus: iterable of raw suspicious document texts.

    Returns:
        list: ``[cleaned_original, [cleaned_suspicious, ...]]``.
    """
    def _basic_clean(document):
        # Shared cleaning steps applied to every document.
        document = Pre_Processing.lower_case(document)
        document = Pre_Processing.remove_punctuation(document)
        return Pre_Processing.clean_text(document)

    # Process the original first, then each suspicious document,
    # matching the original call order.
    cleaned_original = _basic_clean(original_corpora)
    cleaned_suspicious = [_basic_clean(text) for text in suspicious_corpus]
    print("LCS Pre-Processing Complete")
    return [cleaned_original, cleaned_suspicious]
filenames = os.listdir( "C:/Users/Chris/Documents/UoB_MSc_Computer_Science/MSc_Dissertation/cjh748/scikit-machine-learning/train_test_corpus" ) files = [] array_data = [] array_label = [] for file in filenames: with codecs.open( "C:/Users/Chris/Documents/UoB_MSc_Computer_Science/" "MSc_Dissertation/cjh748/scikit-machine-learning/train_test_corpus/" + file, "r", encoding='utf-8', errors='ignore') as file_data: open_file = file_data.read() open_file = Pre_Processing.lower_case(open_file) open_file = Pre_Processing.remove_punctuation(open_file) open_file = Pre_Processing.clean_text(open_file) files.append(open_file) for file in files: if 'inheritance' in file: array_data.append(file) array_label.append('Inheritance (object-oriented programming)') elif 'pagerank' in file: array_data.append(file) array_label.append('PageRank') elif 'vector space model' in file: array_data.append(file) array_label.append('Vector Space Model') elif 'bayes' in file:
def NGRAM_pre_proc(original_corpus, suspicious_corpus):
    """Pre-process both corpora for n-gram overlap comparison.

    Every document in both corpora receives the full pipeline:
    lower-case, punctuation removal, cleaning, tokenization, stop-word
    removal, and lemmatization.

    Args:
        original_corpus: iterable of raw original document texts.
        suspicious_corpus: iterable of raw suspicious document texts.

    Returns:
        list: ``[[processed_originals], [processed_suspicious]]``.
    """
    def _full_pipeline(document):
        # Apply every transformation in the order the pipeline expects.
        for transform in (
            Pre_Processing.lower_case,
            Pre_Processing.remove_punctuation,
            Pre_Processing.clean_text,
            Pre_Processing.tokenization,
            Pre_Processing.remove_stopwords,
            Pre_Processing.lemmatize_words,
        ):
            document = transform(document)
        return document

    # Original corpus is processed before the suspicious one, matching
    # the original call order.
    processed_originals = [_full_pipeline(text) for text in original_corpus]
    processed_suspicious = [_full_pipeline(text) for text in suspicious_corpus]
    print("NGram Overlap Pre-Processing Complete")
    return [processed_originals, processed_suspicious]