def __init__(self, N, file_name):
    """Set up an empty n-gram counter over the text in *file_name*."""
    self.N = N
    self.dic_ngram = {}    # ngram string -> occurrence count
    self.total_count = 0   # running number of counted n-grams
    self.cf = TextCleaner(file_name)
def test_words():
    """words() should split a sentence into individual cleaned tokens."""
    cleaner = TextCleaner()
    sample = "a necro philiac \"tim burton's corpse bride\""
    expected = ['a', 'necro', 'philiac', 'tim', "burton's", 'corpse', 'bride']
    assert cleaner.words(sample) == expected
def test_clean_df_multilingual():
    """Cleaning should honour the per-row language column across en/fr/es."""
    texts = [
        "I did a 10k run this morning at 6h34 follow me @superRunnerdu95 didn't I?",
        "Nous cherchâmes des informations sur https://www.google.com/ le 03/11/2046 l'aventures",
        "#Barcelona Fútbol es la vida [email protected] ℌ ①",
    ]
    input_df = pd.DataFrame({"input_text": texts, "language": ["en", "fr", "es"]})
    filters = {
        "is_stop",
        "is_measure",
        "is_datetime",
        "like_url",
        "like_email",
        "is_username",
        "is_hashtag",
    }
    cleaner = TextCleaner(
        tokenizer=MultilingualTokenizer(stopwords_folder_path=stopwords_folder_path),
        token_filters=filters,
        lemmatization=True,
        lowercase=False,
        unicode_normalization=UnicodeNormalization.NFKD,
    )
    result = cleaner.clean_df(df=input_df, text_column="input_text", language_column="language")
    output_column = list(cleaner.output_column_descriptions.keys())[0]
    assert result[output_column].values.tolist() == [
        "run morning follow not ?",
        "chercher information aventurer",
        "Fútbol vida H 1",
    ]
def main(filename):
    """Print the top-10 uni-, bi- and trigram frequencies of a text file.

    Args:
        filename: path of the text file to analyse.
    """
    # BUG FIX: the original ignored *filename* and always read
    # "corpse_bride.txt"; honour the argument instead.
    tc = TextCleaner(filename)
    list_of_sentences = tc.read_file()
    RANK = 10
    unigram = NgramFrequencies(RANK)
    bigram = NgramFrequencies(RANK)
    trigram = NgramFrequencies(RANK)
    for sentence in list_of_sentences:
        words = sentence.split()
        for i in range(len(words)):
            unigram.add_item(words[i])
            if i < len(words) - 1:
                bigram.add_item(words[i] + "_" + words[i + 1])
            if i < len(words) - 2:
                trigram.add_item(words[i] + "_" + words[i + 1] + "_" + words[i + 2])
    print("Top 10 unigrams:")
    print(unigram.top_n_freqs())
    print("Top 10 bigrams:")
    print(bigram.top_n_freqs())
    print("Top 10 trigrams:")
    print(trigram.top_n_freqs())
def test_clean_text():
    """clean_text() lowercases, marks commas and splits on periods."""
    cleaner = TextCleaner("corpse_bride.txt")
    cleaner.text = "Hi, Mr.Lee -went to the park. Let's go"
    cleaner.clean_text()
    expected = ["hi COMMA mr lee went to the park", " let's go"]
    assert cleaner.sentence == expected
def test_clean_df_english():
    """English cleaning drops punctuation, stopwords, numbers, symbols, emoji."""
    raw = "Hi, I have two apples costing 3$ 😂 \n and unicode has #snowpersons ☃"
    input_df = pd.DataFrame({"input_text": [raw]})
    filters = {"is_punct", "is_stop", "like_num", "is_symbol", "is_currency", "is_emoji"}
    cleaner = TextCleaner(tokenizer=MultilingualTokenizer(), token_filters=filters, lemmatization=True)
    result = cleaner.clean_df(df=input_df, text_column="input_text", language="en")
    output_column = list(cleaner.output_column_descriptions.keys())[0]
    assert result[output_column][0] == "apple cost unicode #snowpersons"
def test_open_file():
    """The constructor should load everything after the first two lines."""
    text_cleaner = TextCleaner("corpse_bride.txt")
    # BUG FIX: use a context manager so the handle is closed even when an
    # assertion fails (the original leaked the file object).
    with open("corpse_bride.txt") as text_f:
        # Skip the two header lines the cleaner is expected to drop.
        text_f.readline()
        text_f.readline()
        text = text_f.read()
    assert text == text_cleaner.text
    # NOTE(review): presumably verifies a missing file is tolerated by the
    # constructor (no assertion follows) — confirm intent.
    text_cleaner = TextCleaner("corpse.txt")
def test_pre_process():
    """pre_process() splits a line into sentences of normalised words."""
    cleaner = TextCleaner()
    line = "\"Tim Burton's Corpse Bride\". Marks the Dr. Liu latest, venture."
    cleaner.pre_process(line)
    expected = [
        ["tim", "burton's", "corpse", "bride"],
        ["marks", "the", "drdot", "liu", "latest", "COMMA", "venture"],
        [],
    ]
    assert cleaner.word_list == expected
def test_clean_file():
    """clean_file() tokenizes, marks commas and strips title dots."""
    text = TextCleaner()
    # BUG FIX: close the fixture file deterministically with a context
    # manager (the original leaked the handle).
    with open("test_file.txt") as f:
        text.clean_file(f)
    expected = [
        ['a', 'bunch', 'of', 'cute', 'and', 'spooky', 'animals', 'are',
         'dropping', 'by.'],
        ['pick', 'trick', 'or', 'treat.'],
        ['trick', 'COMMA', '', 'treat.'],
        ['by', 'mr', 'zeng', 'COMMA', 'mrs', 'liao', 'and', 'dr', 'zhang'],
    ]
    assert text.text == expected
def main(filename):
    """Report the ten most frequent uni-, bi- and trigrams of *filename*."""
    word_list = TextCleaner(filename).do_the_cleaning()
    reports = (
        ('Top 10 unigrams:', UNI_COUNT),
        ('Top 10 bigrams:', BI_COUNT),
        ('Top 10 trigrams:', TRI_COUNT),
    )
    for header, size in reports:
        ngram = NgramFrequencies(word_list, size)
        ngram.add_item()
        print(header)
        print_output_ngram(ngram.top_n_freqs(10))
def main():
    """Read the corpus file and print 1-, 2- and 3-gram statistics."""
    with open("corpse_bride.txt", "r") as source:
        raw = source.read()
    word_lists = TextCleaner().clean(raw)
    for size, label in ((1, "unigrams"), (2, "bigrams"), (3, "trigrams")):
        print_ngram(size, label, word_lists)
def main():
    """Report the ten most frequent uni-, bi- and trigrams of the file
    named on the command line (sys.argv[1])."""
    clean_text = TextCleaner()
    try:
        # BUG FIX: context manager closes the handle (original leaked it).
        with open(sys.argv[1]) as f:
            clean_text.clean_file(f)
    except FileNotFoundError:
        print("Can't find", sys.argv[1])
        return
    text = clean_text.text
    _report_unigrams(text)
    _report_bigrams(text)
    _report_trigrams(text)


def _report_unigrams(text):
    """Print the ten most frequent single words."""
    unigram = NgramFrequencies()
    print("Top 10 unigram:")
    for line in text:
        for char in line:
            unigram.add_item(char)
    print_output(unigram.frequency(10))


def _report_bigrams(text):
    """Print the ten most frequent word pairs.

    A word containing '.' ends a sentence and never joins the next word.
    """
    bigram = NgramFrequencies()
    print("Top 10 bigram:")
    for line in text:
        for i in range(len(line) - 1):
            if "." in line[i]:
                continue
            bigram.add_item(line[i] + "_" + line[i + 1])
    print_output(bigram.frequency(10))


def _report_trigrams(text):
    """Print the ten most frequent word triples.

    A '.' in either of the first two words breaks the triple.
    """
    trigram = NgramFrequencies()
    print("Top 10 trigram:")
    for line in text:
        for j in range(len(line) - 2):
            if "." in line[j] or "." in line[j + 1]:
                continue
            trigram.add_item(line[j] + "_" + line[j + 1] + "_" + line[j + 2])
    print_output(trigram.frequency(10))
class NgramFrequencies:
    """Count n-grams produced by a TextCleaner and report their frequencies."""

    def __init__(self, N, file_name):
        """Prepare an empty counter for *N*-grams over *file_name*."""
        self.dic_ngram = {}    # ngram string -> occurrence count
        self.total_count = 0   # number of add_item() calls so far
        self.N = N
        self.cf = TextCleaner(file_name)

    def make_ngram(self):
        """Count every N-gram of every cleaned sentence.

        Consecutive words are joined with '_' to form the dictionary key.
        """
        for sentence in self.cf.open_file():
            for i in range(self.N - 1, len(sentence)):
                # Words i-(N-1) .. i form one n-gram; the slice replaces
                # the original's inner reverse-index loop.
                ngram = '_'.join(map(str, sentence[i - self.N + 1:i + 1]))
                self.add_item(ngram)

    def add_item(self, ngram):
        """Increment the count for *ngram* and the running total."""
        self.total_count += 1
        # dict.get avoids the original's separate `in d.keys()` lookup.
        self.dic_ngram[ngram] = self.dic_ngram.get(ngram, 0) + 1

    def top_n_counts(self, n):
        """Return the *n* (ngram, count) pairs with the largest counts."""
        return sorted(self.dic_ngram.items(),
                      key=lambda kv: kv[1], reverse=True)[:n]

    def frequency(self):
        """Return {ngram: count / total_count}, rounded to three decimals."""
        return {key: round(count / self.total_count, 3)
                for key, count in self.dic_ngram.items()}

    def top_n_freqs(self, n):
        """Return the *n* (ngram, frequency) pairs with the largest values."""
        return sorted(self.frequency().items(),
                      key=lambda kv: kv[1], reverse=True)[:n]
def get_email_dict_array(self, clean=False):
    """Parse the loader's file into a list of email dicts.

    Each dict carries 'direction', 'date', 'subject' and 'body'; subject
    and body text are tokenized through TextCleaner and re-joined.

    Raises:
        ValueError: if the file extension is neither csv nor xlsx.
    """
    extension = self.file_name.split('.')[-1].strip()
    if extension == 'csv':
        # BUG FIX: the original discarded the return value here, leaving
        # *data* undefined for csv input.
        # NOTE(review): assumes _read_in_csv returns rows shaped like
        # _read_in_xlxs output — confirm.
        data = self._read_in_csv(self.file_name)
    elif extension == 'xlsx':
        data = self._read_in_xlxs(self.file_name)
    else:
        # BUG FIX: the original only printed a message and then crashed on
        # the undefined *data*; fail loudly instead.
        raise ValueError("Unsupported data format: %s" % extension)
    email_dict_array = []
    for row in data:
        subject = ' '.join(TextCleaner(row[3]).tokenize_str())
        body = ' '.join(TextCleaner(row[4]).tokenize_str())
        email_dict_array.append({
            'direction': row[1],
            'date': row[2],
            'subject': subject,
            'body': body,
        })
    return email_dict_array
def __init__(self, filename):
    """Load emails from *filename* and keep their cleaned body texts."""
    # Topic-model state, filled in later by training.
    self.lda = None
    self.feature_names = None
    self.email_data = []
    # Tunables copied from module-level defaults.
    self.num_topics = NUM_TOPICS
    self.num_words_per_topic = NUM_WORDS_PER_TOPIC
    self.num_features = NUM_FEATURES
    # Keep only non-empty bodies, cleaned and re-joined into one string.
    for record in EmailLoader(filename).get_email_dict_array():
        body = record['body']
        if body:
            self.email_data.append(" ".join(TextCleaner(body).tokenize_str()))
def main():
    '''Read corpse_bride.txt and print the top-N n-gram frequencies.'''
    file_name = "corpse_bride.txt"
    global N  # print_output() reads N at module level
    N = 10
    # BUG FIX: narrow the bare `except Exception` to OSError (what open()
    # actually raises) so unrelated errors are not swallowed.
    try:
        f = open(file_name, encoding="utf8")
    except OSError:
        print("Can't open corpse_bride.txt")
        return
    # BUG FIX: the original never closed the handle; keep all processing
    # inside the with-block in case the cleaner reads the file lazily.
    with f:
        clean = TextCleaner(f)
        ngram = NgramFrequencies(clean.format())
        ngram.add_item()
        ngram.top_n_counts(N)
        ngram.frequency()
        print_output(ngram.top_n_freqs(N))
def main():
    """Collect n-gram frequencies, print the top 10 of each type, then
    let the user query one n-gram size interactively."""
    cleaner = TextCleaner("corpse_bride.txt")
    clean_text(cleaner)
    grams = {}
    for size, label in ((1, "unigrams"), (2, "bigrams"), (3, "trigrams")):
        print("Top 10 %s: " % label)
        grams[size] = NgramFrequencies(cleaner.sentence, size)
        top_n_count(grams[size])
    print("Check frequency for N-gram word:")
    choice = input("Enter unigram/bigram/trigram: ")
    size_by_name = {"unigram": 1, "bigram": 2, "trigram": 3}
    if choice in size_by_name:
        check_freq(grams[size_by_name[choice]])
def main(file_name):
    """Given the file name, print n-grams frequencies

    String -> None
    """
    text = TextCleaner()
    ngrams = NgramFrequencies()
    text.read_file(file_name)
    for i in range(0, len(text.lines)):
        text.pre_process(text.lines[i])
    for word_per_list in text.word_list:
        ngrams.fill_in_dic(word_per_list)
    ngrams_list = [
        ngrams.unigrams_dic, ngrams.bigrams_dic, ngrams.trigrams_dic
    ]
    ngrams_name_list = ["unigrams", "bigrams", "trigrams"]
    for i in range(3):
        grams_top = ngrams.top_n_grams(ngrams_list[i], 10)
        # BUG FIX: the original always printed the "unigrams" label
        # (ngrams_name_list[0]) for all three reports.
        print_output(grams_top, ngrams_name_list[i])
def test_constructor():
    '''The constructor should remember the file name it was given.'''
    assert TextCleaner('test').file_name == 'test'
def test_change_comma():
    '''Every comma should be replaced by the literal COMMA marker.'''
    cleaner = TextCleaner("")
    assert cleaner.change_comma("ab, cd,") == "ab COMMA cd COMMA"
def test_split_file():
    '''Title dots are dropped; the text splits into sentences of words.'''
    result = TextCleaner("").split_file("mr. dr. ms. ab cd. ab ab.")
    assert result == [['mr', 'dr', 'ms', 'ab', 'cd'], ['ab', 'ab']]
def test_text_cleaner():
    """A freshly constructed cleaner starts with no sentences."""
    assert TextCleaner("corpse_bride.txt").sentence == []
def test_sentence():
    """sentence() splits a paragraph on periods, keeping leading spaces."""
    result = TextCleaner().sentence("we are align students. we are fall students")
    assert result == ["we are align students", " we are fall students"]
def test_constructor():
    """A new cleaner has no pending line and an empty text buffer."""
    cleaner = TextCleaner()
    assert cleaner.new_line is None
    assert cleaner.text == []
def test_constructor():
    """A new cleaner starts with empty line and word-list buffers."""
    cleaner = TextCleaner()
    assert cleaner.lines == []
    assert cleaner.word_list == []
def cleaned_comments(subreddit):
    """Return the cleaned text of every comment in *subreddit*."""
    cleaner = TextCleaner()
    results = []
    for line in comments(subreddit):
        results.append(cleaner.clean_text(line).text)
    return results
def test_delete_punctuation():
    '''Punctuation characters are stripped from the string.'''
    # The method name keeps the original (misspelled) public API.
    assert TextCleaner("").delet_punctuation("ab)($><") == "ab"
from text_cleaner import TextCleaner
import sys

# NOTE(review): reading sys.argv[2] at import time assumes the file path is
# the second CLI argument and will raise IndexError under a plain pytest
# invocation — confirm how this suite is run.
filename = sys.argv[2]
tc = TextCleaner(filename)


def test_deal_special_dot():
    """Dots after title abbreviations are removed."""
    assert tc.deal_special_dot("abcdmr.") == "abcdmr"
    assert tc.deal_special_dot("abcddr.abcd") == "abcddrabcd"


def test_deal_commna():
    """Commas become the literal ' COMMA' token."""
    assert tc.deal_comma("abc,de") == "abc COMMAde"


def test_deal_apostro():
    """Apostrophes become the literal 'APOS' token."""
    assert tc.deal_apostro("burton's") == "burtonAPOSs"


def test_split_sentence():
    """Sentences split on '.', keeping the trailing empty piece."""
    expected = ["I am a girl", " You are a boy", ""]
    assert tc.split_sentence("I am a girl. You are a boy.") == expected


def test_deal_punc():
    """Whitespace tokenisation keeps hyphenated words intact."""
    expected = ["A", "necro-", "philiac", "entertainment-for"]
    assert tc.deal_punc("A necro- philiac entertainment-for") == expected
def test_open_file():
    '''open_file() returns one cleaned word list per sentence.'''
    result = TextCleaner('test_file.txt').open_file()
    assert result == [["ab", "dr", "d"], ["gh", "COMMA", "ab's"]]
def test_init():
    '''The constructor should store the file object it is handed.'''
    # NOTE(review): the handle is deliberately left open because the
    # cleaner (and this function's return value) keep referencing it;
    # confirm a caller closes it eventually.
    handle = open("corpse_bride.txt", encoding="utf8")
    cleaner = TextCleaner(handle)
    assert cleaner.f == handle
    return cleaner