def normalize(self, text):
    """Make some word improvements before feeding to the sentence tokenizer."""
    rr = RepeatReplacer(self.lexicon)
    normalized_text = []
    try:
        for word in text.split():
            normal = rr.replace(word.lower())
            # Restore the original capitalization
            if word[0].isupper():
                normal = normal[0].upper() + normal[1:]
            normalized_text.append(normal)
        final = " ".join(normalized_text)
    except Exception:
        # If anything goes wrong, fall back to the unmodified text
        final = text

    return final
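The RepeatReplacer imported here is not shown. A minimal sketch of a lexicon-based variant that would satisfy the call above (the recursive collapse is the common NLTK-cookbook pattern; treating lexicon as a set of known words is an assumption):

import re

class RepeatReplacer:
    """Collapse repeated characters ("looove" -> "love") until the
    result is a known word or no further collapse is possible."""
    def __init__(self, lexicon=None):
        self.lexicon = lexicon or set()  # assumed: a set of known words
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'

    def replace(self, word):
        if word in self.lexicon:
            return word  # already a known word; leave it untouched
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.replace(repl_word)  # keep collapsing
        return repl_word

For example, RepeatReplacer({'love'}).replace('looove') returns 'love', while a word already in the lexicon passes through unchanged.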
Example #2
def clean(sentences):
    """Remove unwanted characters, tags and patterns from sentences"""
    import re

    # Compile all the regex patterns once up front
    punctuation = re.compile(r'[\[\]\'!"#$%&\\()*+,-./:;<=>?@^_`{}~|0-9]')
    multispace = re.compile(r'\s+')

    # Convert every sentence to lower case
    sentences = [sentence.lower() for sentence in sentences]

    # Expand the common apostrophe patterns like 's, 'll etc.
    replacer = RegexpReplacer()
    sentences = [replacer.replace(tweet) for tweet in sentences]

    # Remove all the punctuation marks and digits
    sentences = [punctuation.sub(' ', tweet) for tweet in sentences]
    # Remove leading, trailing and multiple spaces
    sentences = [multispace.sub(' ', tweet).strip() for tweet in sentences]

    # Collapse repeated characters
    replacer = RepeatReplacer()
    sentences = [replacer.replace(tweet) for tweet in sentences]

    return sentences
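RegexpReplacer comes from a local replacer module that is not shown. A minimal sketch of the usual contraction-expanding version (the pattern list here is illustrative, not the module's actual one):

import re

# Illustrative patterns only; the real list lives in replacer.py
replacement_patterns = [
    (r"won't", 'will not'),
    (r"can't", 'cannot'),
    (r"(\w+)'ll", r'\g<1> will'),
    (r"(\w+)n't", r'\g<1> not'),
    (r"(\w+)'s", r'\g<1> is'),
]

class RegexpReplacer:
    """Apply each (pattern, replacement) pair to the text in order."""
    def __init__(self, patterns=replacement_patterns):
        self.patterns = [(re.compile(regex), repl) for regex, repl in patterns]

    def replace(self, text):
        for pattern, repl in self.patterns:
            text = pattern.sub(repl, text)
        return text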
Example #3
import os
import os.path
import re
from nltk.tokenize import WordPunctTokenizer
from replacer import RepeatReplacer
from nltk.corpus import wordnet

rep = RepeatReplacer()

path = os.path.expanduser('output')
tokenizer = WordPunctTokenizer()
tweets = []
specific_tweets = []
sports_word = []

if not os.path.exists(path):
    print("No such file found")
else:
    for line in open(path):
        tokens = tokenizer.tokenize(line)
        tweets.append(tokens)

    sports_synset = wordnet.synset('sport.n.01')
    cricket_synset = wordnet.synset('cricket.n.02')
    i = 0

    for line in tweets:
        out = False
        for word in line:
            word = rep.replace(word)  # collapse repeated characters first
            if not wordnet.synsets(word):
                continue
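The example breaks off here. For context, the two synsets it looks up can be compared directly with NLTK's built-in similarity measures; a small, self-contained illustration (not part of the original code):

from nltk.corpus import wordnet

sports = wordnet.synset('sport.n.01')
cricket = wordnet.synset('cricket.n.02')  # the game, not the insect

# Wu-Palmer similarity: 1.0 means identical, values near 0 mean unrelated
print(cricket.wup_similarity(sports))
print(cricket.path_similarity(sports))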
Example #4
from replacer import RegexReplacer

givenpatterns = [(r'won\'t', 'will not'), (r'\'s', ' is'), (r'\'d', ' would'),
                 (r'mad man', 'crazy arse mother f*****g anthropoid')]

replacer = RegexReplacer(givenpatterns)
# replacer.patterns = givenpatterns

txt = replacer.replace("He's gone")

print(txt)

from replacer import RepeatReplacer

replacer = RepeatReplacer()

txt = replacer.replace("Anthhhhropoiiid")
print(txt)

# Tutorial 15: https://www.youtube.com/watch?v=r37OYsdH6Z8&list=PLcTXcpndN-Sl9eYrKM6jtcOTgC52EJnqH&index=15

# Thanks to the WordNet checking function, real words such as
# 'Book' or 'cattle' are returned unchanged
txt = replacer.replace("Book")
print(txt)

txt = replacer.replace("cattle")
print(txt)

txt = replacer.replace("botttleeee")
print(txt)
Example #5
        if Ave_cache < Ave[eq_s_count_2]:
            eq_s_a = eq_s_X[eq_s_count_2]
            eq_s_b = eq_s_Y[eq_s_count_2]
            eq_s_c = eq_s_Z[eq_s_count_2]
            Ave_cache = Ave[eq_s_count_2]
        eq_s_count_2 = eq_s_count_2 + 1

    return [eq_s_a, eq_s_b, eq_s_c]


###----------------------------------------------------------------------------

### Global Variables
male_names = all_lower(names.words('male.txt'))
female_names = all_lower(names.words('female.txt'))
rep_1 = RepeatReplacer()  # Object of RepeatReplacer (Main Circuit, PART 1)
rep_2 = RegexpReplacer()  # Object of RegexpReplacer (Main Circuit, PART 1)
count = 0  # Number of found results (temporary)
tag = 0  # Flag 1: set to 1 when the aim is found in the database
con_count = 0
cit_count = 0
mon_count = 0
sit_count = 0

weigh_method_1 = 0  # Weight for method 1
weigh_method_2 = 0  # Weight for method 2
weigh_method_3 = 0  # Weight for method 3
weigh_list = [None] * 3  # List 1: cache list for the three weights

editdis = 1000  # Param1: the result of edit_distance (Main Circuit, PART 2)
line_num = 0  # Param2: line number of the most possible answer (Main Circuit, PART 2)
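all_lower is used above but not defined in this fragment; a minimal sketch of what it presumably does (lower-casing a corpus word list so later comparisons are case-insensitive):

def all_lower(words):
    """Lower-case every entry in a word list."""
    return [w.lower() for w in words]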