def normalize(self, text):
    """Make some word improvements before feeding to the sentence tokenizer."""
    rr = RepeatReplacer(self.lexicon)
    normalized_text = []
    final = None
    try:
        for word in text.split():
            normal = rr.replace(word.lower())
            # Preserve the original capitalization of the word.
            if word[0].isupper():
                normal = normal[0].upper() + normal[1:]
            normalized_text.append(normal)
        final = " ".join(normalized_text)
    except Exception:
        # On any failure, fall back to the unmodified text.
        final = text
    return final
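# The replacer module is not shown in this excerpt. Below is a minimal sketch
# of a RepeatReplacer compatible with the call above, following the common
# NLTK Cookbook pattern; the lexicon parameter and its use in place of the
# WordNet lookup are assumptions inferred from how normalize() constructs the
# object, not the original implementation.
import re
from nltk.corpus import wordnet

class RepeatReplacer(object):
    """Collapse repeated characters until the word is a recognized one."""

    def __init__(self, lexicon=None):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
        self.lexicon = lexicon

    def replace(self, word):
        # Stop as soon as the word is recognized, either by the supplied
        # lexicon (assumed behavior) or by a WordNet lookup.
        if self.lexicon is not None:
            known = word in self.lexicon
        else:
            known = bool(wordnet.synsets(word))
        if known:
            return word
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.replace(repl_word)
        return repl_word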
def clean(sentences):
    """Remove unwanted characters, tags and patterns from sentences."""
    # Compile all the regex patterns once.
    punctuation = re.compile(r'[\[\]\'!"#$%&\\()*+,-./:;<=>?@^_`{}~|0-9]')
    multispace = re.compile(r'\s+')
    # Convert each sentence to lower case.
    sentences = [tweet.lower() for tweet in sentences]
    # Expand the common apostrophe patterns like 's, 'll etc.
    replacer = RegexpReplacer()
    sentences = [replacer.replace(tweet) for tweet in sentences]
    # Remove all the punctuation marks and digits.
    sentences = [punctuation.sub(' ', tweet) for tweet in sentences]
    # Remove leading, trailing and multiple spaces.
    sentences = [multispace.sub(' ', tweet).strip() for tweet in sentences]
    # Squash repeated characters (e.g. "looove" -> "love").
    replacer = RepeatReplacer()
    sentences = [replacer.replace(tweet) for tweet in sentences]
    return sentences
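# clean() also depends on RegexpReplacer from the replacer module. A minimal
# sketch in the Cookbook style, assuming the usual contraction patterns (the
# exact pattern list in the original module is unknown):
import re

# Assumed default patterns; the original module's list may differ.
replacement_patterns = [
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'i\'m', 'i am'),
    (r'(\w+)\'ll', r'\1 will'),
    (r'(\w+)n\'t', r'\1 not'),
    (r'(\w+)\'ve', r'\1 have'),
    (r'(\w+)\'s', r'\1 is'),
    (r'(\w+)\'re', r'\1 are'),
    (r'(\w+)\'d', r'\1 would'),
]

class RegexpReplacer(object):
    """Apply a list of (pattern, replacement) pairs to a text."""

    def __init__(self, patterns=replacement_patterns):
        self.patterns = [(re.compile(regex), repl) for regex, repl in patterns]

    def replace(self, text):
        for pattern, repl in self.patterns:
            text = pattern.sub(repl, text)
        return text

# With these sketches, clean(["I'm sooo excited!!!"]) would yield
# ['i am so excited']. Note that applying RepeatReplacer to a whole sentence
# means the WordNet guard never fires (a full sentence has no synsets), so
# legitimate double letters (e.g. in "happy") get squashed too; applying the
# replacer per word avoids this.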
import os
import os.path
import re

from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import wordnet

from replacer import RepeatReplacer

rep = RepeatReplacer()
file = os.path.expanduser('output')
tokenizer = WordPunctTokenizer()
tweets = []
specific_tweets = []
sports_word = []

if not os.path.exists(file):
    print("No such file found")
else:
    for line in open(file):
        a = tokenizer.tokenize(line)
        tweets.append(a)

sports_synset = wordnet.synset('sport.n.01')
cricket_synset = wordnet.synset('cricket.n.02')
i = 0
for line in tweets:
    out = False
    for word in line:
        # Squash repeated characters before the WordNet lookup.
        word = rep.replace(word)
        if not wordnet.synsets(word):
            continue
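# The script is cut off after the WordNet guard. A self-contained sketch of
# how the classification might proceed from that point, flagging a tweet as
# sports-related when any word's first synset is close to sport.n.01. The
# wup_similarity measure, the 0.3 threshold, the use of the first synset and
# the break-on-match behavior are all assumptions, not the original logic.
for line in tweets:
    for word in line:
        word = rep.replace(word)
        synsets = wordnet.synsets(word)
        if not synsets:
            continue
        similarity = sports_synset.wup_similarity(synsets[0])
        if similarity is not None and similarity > 0.3:
            # Keep the tweet and the word that triggered the match.
            specific_tweets.append(line)
            sports_word.append(word)
            break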
from replacer import RegexReplacer, RepeatReplacer

givenpatterns = [
    (r'won\'t', 'will not'),
    (r'\'s', ' is'),
    (r'\'d', ' would'),
    (r'mad man', 'crazy arse mother f*****g anthropoid'),
]
replacer = RegexReplacer(givenpatterns)  # replacer.patterns = givenpatterns
txt = replacer.replace("He's gone")
print(txt)

replacer = RepeatReplacer()
txt = replacer.replace("Anthhhhropoiiid")
print(txt)

# Tutorial 15: https://www.youtube.com/watch?v=r37OYsdH6Z8&list=PLcTXcpndN-Sl9eYrKM6jtcOTgC52EJnqH&index=15
# Thanks to the WordNet check, words like 'Book' or 'cattle'
# are returned unchanged.
txt = replacer.replace("Book")
print(txt)
txt = replacer.replace("cattle")
print(txt)
txt = replacer.replace("botttleeee")
print(txt)
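# Assuming the Cookbook-style replacers sketched earlier, the prints above
# would emit: "He is gone", "Anthropoid", "Book", "cattle" and "bottle". The
# (r'\'s', ' is') pattern expands the contraction, and the WordNet check stops
# the repeat-squashing as soon as a recognized word emerges; the actual output
# depends on the replacer module's real implementation.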
        if Ave_cache < Ave[eq_s_count_2]:
            # Remember the entry with the highest average seen so far.
            eq_s_a = eq_s_X[eq_s_count_2]
            eq_s_b = eq_s_Y[eq_s_count_2]
            eq_s_c = eq_s_Z[eq_s_count_2]
            Ave_cache = Ave[eq_s_count_2]
        eq_s_count_2 = eq_s_count_2 + 1
    return [eq_s_a, eq_s_b, eq_s_c]

###----------------------------------------------------------------------------
### Global Variables
male_names = all_lower(names.words('male.txt'))
female_names = all_lower(names.words('female.txt'))
rep_1 = RepeatReplacer()    # Object of RepeatReplacer (Main Circuit, PART 1)
rep_2 = RegexpReplacer()    # Object of RegexpReplacer (Main Circuit, PART 1)
count = 0                   # Number of results found so far, temporary
tag = 0                     # Flag 1: set to 1 when the aim is found in the database
con_count = 0
cit_count = 0
mon_count = 0
sit_count = 0
weigh_method_1 = 0          # Weight 1: method 1
weigh_method_2 = 0          # Weight 2: method 2
weigh_method_3 = 0          # Weight 3: method 3
weigh_list = [None] * 3     # List 1: cache list
editdis = 1000              # Param 1: result of edit_distance (Main Circuit, PART 2)
line_num = 0                # Param 2: line number of the most likely answer (Main Circuit, PART 2)
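# all_lower() is called above but not defined in this fragment; a one-line
# helper matching its apparent contract (name and behavior assumed from
# usage), together with the imports the globals section needs:
from nltk.corpus import names
from replacer import RepeatReplacer, RegexpReplacer

def all_lower(words):
    """Lower-case every entry in a list of names."""
    return [word.lower() for word in words]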