Example No. 1
 def __init__(self, word2vec_provider: Word2VecProvider,
              emoji_provider: EmojiProvider):
     self._emoji_provider = emoji_provider
     self._repeat_replacer = RepeatReplacer()
     self._polarity_replacer = PolarityReplacer()
     self._replacement_patterns = NEGATION_REPLACEMENT_PATTERNS
     self._replacement_patterns.extend([
         # remove urls
         (r'((www\.[^\s]+)|(https?://[^\s]+))', ''),
         # remove usernames
         (r'@[^\s]+', ''),
         # remove # from hashtags
         (r'#([^\s]+)', r'\1'),
         # leave only letters
         (r'[^a-zA-Z]+', ' '),
         # remove months
         (r'(\b\d{1,2}\D{0,3})?\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|'
          r'aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|(nov|dec)(?:ember)?)\D?(\d{1,2}(st|nd|rd|th)?)?(([,.\-\/])'
          r'\D?)?((19[7-9]\d|20\d{2})|\d{2})*', '')
     ])
     self._regexp_replacer = RegexpReplacer(self._replacement_patterns)
     self._stem_replacer = StemReplacer()
     self._word2vec_provider = word2vec_provider
     self._stopwords = stopwords.words('english')
     # also add NEG_-prefixed variants so negated stopwords get dropped too
     self._stopwords.extend(['NEG_' + word for word in self._stopwords])
     self._stopwords.extend(["'nt", "st", "nd", "rd", "th", "rt"])
     self._stopwords.extend(self._emoji_provider.emoji)
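A minimal sketch (not from the original project) of what the Twitter-cleanup patterns registered above do, applied with plain re.sub; the month pattern is omitted for brevity and the sample tweet is invented:

import re

# Sketch only: mirrors the url / username / hashtag / letters-only patterns above.
patterns = [
    (r'((www\.[^\s]+)|(https?://[^\s]+))', ''),  # remove urls
    (r'@[^\s]+', ''),                            # remove usernames
    (r'#([^\s]+)', r'\1'),                       # keep hashtag text, drop '#'
    (r'[^a-zA-Z]+', ' '),                        # leave only letters
]

tweet = "@alice loved the #sunset today!! 10/10 https://t.co/abc"
for regex, repl in patterns:
    tweet = re.sub(regex, repl, tweet)
print(tweet.strip())  # -> loved the sunset today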
Example No. 2
def aquire_text(path):
    filename = os.path.join(path, 'picscontent.txt')
    replacer = RegexpReplacer()
    content = ''
    with open(filename, 'r', encoding="ISO-8859-1") as f:
        for line in f.readlines():
            if len(line) < 2 or '.jpg' in line or '.png' in line:
                continue
            else:
                content = content + ' ' + line.strip()
    words = word_count(content)
    singlewords = []
    for key in words.keys():
        singlewords.append(key)
    content_update = replacer.replace(content)
    content_sent = sent_tokenize(content_update)
    words_content = []
    for item in content_sent:
        words_content.append(nltk.pos_tag(word_tokenize(item)))


    # to generate word card
    word_list(words, path)
    # to classify word by tag class
    word_classify(words_content, words, path)

    # to get the vowel of words
    find_vowel(singlewords, path)

    # to get the distance of words
    word_distance(singlewords, path)
Example No. 3
 def clean(self, text):
     result = []
     replacer = RegexpReplacer()
     text1 = replacer.replace_simple(text)
     text2 = replacer.replace(text1)
     sent_text = sent_tokenize(text2)
     for item in sent_text:
         if len(item) > 0:
             result.append(item)
     return (result)
Example No. 4
    def unset_apostrophe(self):
        """
        This function needs to import RegexpReplacer.
        """
        replacer = RegexpReplacer()

        self.unset_apostrophe_list = []

        for element in self.words:
            temp_elem = replacer.replace(element)
            self.unset_apostrophe_list.append(temp_elem.replace('\'', ''))

        return self.unset_apostrophe_list
Example No. 5
def tokenizer(raw):
    # print(stopwords.words('english'))
    stop_words = stopwords.words('english')
    symbols = ["'", '"', '`', '.', ',', '-', '!', '?', ':', ';',
               '(', ')', '--', '\'s', '\'', '\'re', '{', '}', 'ー']

    replacer = RegexpReplacer()
    replaced_raw = replacer.replace(raw).lower()
    tokens = [word for word in word_tokenize(replaced_raw)
              if word not in stop_words + symbols]
    
    text = nltk.Text(word_tokenize(replaced_raw))
    return tokens, text
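A hypothetical call to the tokenizer() above (it assumes the custom replacers module and the NLTK stopwords/punkt data are available):

raw = "I can't believe it's raining again!"
tokens, text = tokenizer(raw)
print(tokens)        # lowercased, contraction-expanded tokens minus stopwords and symbols
print(text.vocab())  # the nltk.Text wrapper exposes helpers such as vocab() and concordance()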
Example No. 6
def create_bag(df):
    '''Create a "bag of words" from the review text.

    Input:
    - Pandas dataframe containing review text. 
    Output:
    - List of sentences.
    '''
    replacer = RegexpReplacer()
    sentences = []
    for review in df['text']:
        tmp = replacer.replace(review)
        tmp1 = tmp.strip()
        sentences.append(sent_tokenize(tmp1))
    sentences = [ inner for sublist in sentences for inner in sublist ]
    return sentences
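An illustrative call to create_bag; the two-row DataFrame below is invented (as the docstring says, only a 'text' column is needed), and the replacers module must be importable:

import pandas as pd

df = pd.DataFrame({'text': ["Great food. I'd come back!",
                            "Too noisy, won't return."]})
print(create_bag(df))
# roughly: ['Great food.', 'I would come back!', 'Too noisy, will not return.']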
Example No. 7
	def obtain_analysis_objective(self, filename):

		replacer = RegexpReplacer()

		pathname = filename

		pos = pathname.rindex('/')

		self.objectivename = pathname[int(pos)+1:len(pathname)]

		f = open(pathname, 'r+')

		for line in f.readlines():

			line = replacer.replace(line)

			self.objectiveoutstring += line
Example No. 8
def tokenize(text):
    regex = re.compile(r'^[a-zA-Z]')
    replacer = RegexpReplacer()
    lemmatizer = WordNetLemmatizer()
    temp = replacer.replace(text)
    sent_temp = sent_tokenize(temp)
    word_temp = [word_tokenize(doc) for doc in sent_temp]
    wordlist = [item for sub in word_temp for item in sub]
    x = re.compile('[%s]' % re.escape(string.punctuation))
    y = re.compile('^[a-zA-Z]')
    newwordlist = []
    for word in wordlist:
        if not re.search(x, word) and word.lower() not in stops and re.search(y, word):
            t = lemmatizer.lemmatize(word.lower())
            newwordlist.append(t)
    return newwordlist
Example No. 9
	def initialization(self):

		replacer = RegexpReplacer()

		basicwordfile = '/Users/yanchunyang/Documents/highschools/scripts/highfreq.txt'

		g = open(basicwordfile, 'r+')

		self.basicwords = g.readline().strip().split(' ')
Example No. 10
 def __init__(self, text, settings):
     """
     @param text: List of text
     @param settings: dictionary of booleans
     @type settings: C{dictionary}
     param['replaceContractions'] is True or False
     """
     self.settings = settings
     self.text = text
     # lowercase all
     #self.text = [w.lower() for w in text]
     #replace contractions
     try:
         if settings['replaceContractions'] == True:
             replacer = RegexpReplacer()
             self.text = [replacer.replace(w) for w in self.text]
     except:
         print('failed to replace contractions')
         pass
Example No. 11
def readfile(path, filename):
    replacer = RegexpReplacer()
    with codecs.open(os.path.join(path, filename),
                     "r",
                     encoding='utf-8',
                     errors='ignore') as f:
        orgtext = f.read()
    orgtext1 = replacer.replace(orgtext)
    temp = []
    parse_pattern = []
    sent_text = sent_tokenize(orgtext1)
    grammar = "NP:{<DT>?<JJ>*<NN><IN>?<NN>*}"
    find = nltk.RegexpParser(grammar)
    for sent in sent_text:
        sent_word = nltk.pos_tag(word_tokenize(sent))
        if find.parse(sent_word):
            parse_pattern.append(sent)
        temp.append(nltk.pos_tag(word_tokenize(sent)))
    print(len(parse_pattern))
    print(parse_pattern[0:5])
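A small self-contained illustration of the chunk grammar used above, run on a hand-tagged sentence so no tagger is needed; note that RegexpParser.parse() returns a Tree even when no NP chunk is found, which is why the `if find.parse(sent_word)` test in the snippet effectively appends every sentence:

import nltk

grammar = "NP:{<DT>?<JJ>*<NN><IN>?<NN>*}"
chunker = nltk.RegexpParser(grammar)
tagged = [('the', 'DT'), ('quick', 'JJ'), ('review', 'NN'),
          ('of', 'IN'), ('music', 'NN'), ('was', 'VBD'), ('short', 'JJ')]
print(chunker.parse(tagged))  # the NP subtree spans "the quick review of music"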
Example No. 12
def obtain_text():
    text = []
    replacer = RegexpReplacer()
    st = LancasterStemmer()
    stops = stopwords.words('english')
    path = '/Users/yanchunyang/Documents/highschools/subtext/'
    for filename in os.listdir(path):
        if '3' in filename:
            with open(os.path.join(path, filename), 'r') as f:
                text.append(replacer.replace(f.read()))

    word_content = []

    for sub in text:
        temp = word_tokenize(sub)
        word_content.append(
            [st.stem(word) for word in temp if word not in stops])

    dictionary = corpora.Dictionary(word_content)

    corpus = [dictionary.doc2bow(text) for text in word_content]

    get_tdidf_lda(corpus, dictionary)
Example No. 13
 def __init__(self, text):
     """
     False
     @param text: List of text
     @param settings: dictionary of booleans
     @type settings: C{dictionary}
     param['replaceContractions'] is True or False
     """
     self.text = text
     self.lemmatize = False
     self.porter_stem = False
     self.remove_numerals = False
     self.remove_punctuation = False
     self.remove_stops = False
     # lowercase all
     #self.text = [w.lower() for w in text]
     #replace contractions
     try:
         if self.replace_contractions is True:
             replacer = RegexpReplacer()
             self.text = [replacer.replace(w) for w in self.text]
     except Exception as e:
         print(('failed to replace contractions %s' % e))
Example No. 14
 def __init__(self, text):
     """
     False
     @param text: List of text
     @param settings: dictionary of booleans
     @type settings: C{dictionary}
     param['replaceContractions'] is True or False
     """
     self.text = text
     self.lemmatize = False
     self.porter_stem = False
     self.remove_numerals = False
     self.remove_punctuation = False
     self.remove_stops = False
     # lowercase all
     #self.text = [w.lower() for w in text]
     #replace contractions
     try:
         if self.replace_contractions is True:
             replacer = RegexpReplacer()
             self.text = [replacer.replace(w) for w in self.text]
     except Exception as e:
         print(('failed to replace contractions %s' % e))
Example No. 15
class translate(object):
    def __init__(self, basic=basicwords, stop=stopwords_one):
        self.basic = basic
        self.stop = stop
        self.d = enchant.Dict("en_US")
        self.lemmatizer = WordNetLemmatizer()
        self.replacer = RegexpReplacer()

    def translate_to_chinese(self, string):
        translate_result = {}
        org_string = self.replacer.replace(string)
        sent_word = word_tokenize(org_string)
        for word in sent_word:
            word = self.lemmatizer.lemmatize(word)
            if (word.lower() not in self.stop) and (
                    word not in self.basic) and self.d.check(word):
                translate_result[word] = []
                syns = wn.synsets(word)
                for item in syns:
                    name = item.name()
                    result = wn.synset(name).lemma_names('cmn')
                    for subitem in result:
                        translate_result[word].append(subitem)
        return translate_result
Example No. 16
from replacers import RegexpReplacer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import wordnet
from nltk.stem import LancasterStemmer

filename = []
outputstring = ""
secondoutput = ""
wordlist = {}


f = open("picturebooks.txt", 'r')

flag = 0

replacer = RegexpReplacer()
tokenizer = RegexpTokenizer("[\w']+")

for line in f.readlines():
	line = line.strip()
	line = replacer.replace(line)
	if not line:
		continue

	if 'pdf' in line:
		filename.append(line)
		
	elif 'www' in line:
		continue
	elif 'the end' in line.lower() or 'about the author' in line.lower() or 'more books' in line.lower():
		continue
Example No. 17
import pandas
import re

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans

from replacers import RegexpReplacer

url = 'data-pre-processing.csv'
dataframe = pandas.read_csv(url)

tokenizer = RegexpTokenizer("[\w']+")
replacer = RegexpReplacer()
english_stops = set(stopwords.words('english'))
stemmer = PorterStemmer()
vectorizer = CountVectorizer()

corpus = []

print("Removing contraction - replacer.replace")
print("Removing special chars - re.sub")
print("Getting tokens from comment and iterating through it - tokenizer.tokenizer")
print("Removing stopwords - word not in english_stops")
print("Steeming words - steemer.stem")
for videoId,author,date,content,classification in dataframe.values:
    comment = []
    content = content.lower()
    # Replace contractions such as I'm to I am
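The loop body is cut off here. Below is a sketch, not the original code, of how it might continue following the steps announced by the print statements above; it only reuses the replacer, tokenizer, english_stops, stemmer, comment and corpus objects already defined:

    # sketch only -- the original loop body is truncated above
    content = replacer.replace(content)          # expand contractions ("I'm" -> "I am")
    content = re.sub(r'[^a-z ]', ' ', content)   # drop special characters
    for token in tokenizer.tokenize(content):    # RegexpTokenizer("[\w']+")
        if token not in english_stops:           # skip stopwords
            comment.append(stemmer.stem(token))  # stem the remaining words
    corpus.append(' '.join(comment))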
Example No. 18
#!c:\Python27\python.exe
#!/usr/bin/env python
import os
import cgitb
cgitb.enable()
import cgi, cgitb
import re
from nltk.corpus import stopwords
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.tokenize import word_tokenize
from replacers import RegexpReplacer
replacer = RegexpReplacer()
import csv
from nltk.corpus import wordnet
from nltk.tokenize.punkt import PunktSentenceTokenizer
from replacers import SpellingReplacer
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.corpus import stopwords
english_stops = set(stopwords.words('english'))
print "Content-Type: text/html"
print
print """
    <html>
    <head>
    <title>Spam Fighter</title>
    <link href='/style.css' rel='stylesheet' type='text/css' />
    </head>
Example No. 19
import nltk

from replacers import RegexpReplacer
from replacers import RepeatReplacer
from replacers import AntonymReplacer
from replacers import SpellingReplacer

# from pickle import dump
#
# output = open('t2.pkl', 'wb')
# dump(t2, output, -1)
# output.close()

test = "DO NOT GO THERE !!!\n\n1. I knew it was questionbale when i brought in oil i purchased for them to change out. He said they don't do this, because they like to purchase it. In other words, he needed to mark up the price for the same oil.\n\n2. He told me that our Shocks were blown out and said that we can't drive too far. Normally, when your shocks are blown out, your ride will be like a bouncing ball. I closely monitored my drive and i did not have a bumpy ride that indicated blown out shocks. I took it to two separate mechanics and they tested the car and said if the shocks were bad, the car would bounce up and down. \n\nBasically, the owner lied about the shocks to get me to pay to fix them. \n\n3. One of my light bulbs is going out. I looked up the model # to replace them and i went to autozone to purchase the ones for my car. The owner said that these are the wrong headlights and I needed a more expensive set. Now, mind you- the model's I had were based on Lexus' recommendation. \n\nHe then said that it would cost over $300 dollars to change out the bulbs. The bulbs he recommend was about $80 bucks, which means over 200 of labor. \n\nHe will over exaggerate everything to get you to pay more. \n\n\nBtw, I sent my wife in to see if he would try to run up maintenance. \n\nI would not recommend this place at all. He is not goood."
test = test.lower()

regex_replacer = RegexpReplacer()
repeat_replacer = RepeatReplacer()
spell_replacer = SpellingReplacer()
antonym_replacer = AntonymReplacer()

test = regex_replacer.replace(test)

# test = repeat_replacer.replace(test)
# tokens = antonym_replacer.replace_negations(sentence)
# tokens = repeat_replacer.replace(word)

# print(test)

sentences = nltk.sent_tokenize(test)
# # print(sentences)
stopwords = nltk.corpus.stopwords.words('english')
Example No. 20
import nltk
from replacers import RegexpReplacer
replacer= RegexpReplacer()
replacer.replace("Don't hesitate to ask questions")
print(replacer.replace("She must've gone to the market but she didn't go"))
Example No. 21
#convert sentences to all lowercase
lowercase_sentences = [None] * len(sent)

for i in range(0, len(sent)):
    lowercase_sentences[i] = sent[i].lower()

#convert tokenized words to all lowercase
lowercase_words = [None] * len(tokens)

for i in range(0, len(tokens)):
    lowercase_words[i] = tokens[i].lower()

#replacing words with regular expressions, e.g., 'won't' with 'will not'
#start with s, the untokenized text
replacer = RegexpReplacer()
replacedText = replacer.replace(s)
print(replacedText[:1000])

a = "I'm art won't bar can't he isn't you won't and they've but would've and she's while you're good and i'd here I'd"
a = replacer.replace(a)
print(a)

#edit words with repeating characters and then tokenize a test sentence
#will probably use on forum posts
forumPost = 'I just looooooove it. It is ooooooh so fun aaah oooookaaay whateverrrrr'
repReplacer = RepeatReplacer()
forumPostTokenized = word_tokenize(forumPost)

for i in range(0, len(forumPostTokenized)):
    forumPostTokenized[i] = repReplacer.replace(forumPostTokenized[i])
Example No. 22
def pass_replacer(sent):
    replace_sentence = []
    replacer_object = RegexpReplacer()
    for sentence in sent:
        replace_sentence.append(replacer_object.replace(sentence))
    return replace_sentence
Example No. 23
from replacers import RegexpReplacer
replacer = RegexpReplacer()
replacer.replace("@anirudh24seven hi")
Example No. 24
import nltk
from nltk.tokenize import word_tokenize
from replacers import RegexpReplacer
replacer = RegexpReplacer()
word_tokenize("Don't hesitate to ask questions")
print(word_tokenize(replacer.replace("Don't hesitate to ask questions")))
Example No. 25
    (' S ', ''),  ##the stragglers: 'S' and 'A' tags
    (' A ', ''),
    ("[ ]{2,10}", ' '),  ##clearing out extra whitespaces (2-10 in a row)
    #("[A-Z]{1}", "\n\1")
    ("(?P<FIRST>[A-Z]{1})", "\n\g<FIRST>")
]

##importing a file and turning it into a useable string
with open("SBARQ.txt", "r") as myfile:
    data = myfile.read().replace('\n', '')


##this takes a string; you have to turn your file into a string first
class RegexpReplacer(object):
    def __init__(self, patterns=replacement_patterns):
        self.patterns = [(re.compile(regex), repl)
                         for (regex, repl) in patterns]

    def replace(self, text):
        s = text
        for (pattern, repl) in self.patterns:
            s = re.sub(pattern, repl, s)
        return s


##now the actual function
from replacers import RegexpReplacer
replacer = RegexpReplacer()
with open("Output.txt", "w") as text_file:
    text_file.write(replacer.replace(data))
Example No. 26
 def __init__(self, basic=basicwords, stop=stopwords_one):
     self.basic = basic
     self.stop = stop
     self.d = enchant.Dict("en_US")
     self.lemmatizer = WordNetLemmatizer()
     self.replacer = RegexpReplacer()
Example No. 27
import re
import sys

import pandas as pd

from replacers import AntonymReplacer, RegexpReplacer, RepeatReplacer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

#reading the csv file and extracting the column of tweets into a list
csv_file=sys.argv[1]

df=pd.read_csv(csv_file)
saved_column=df['text']
list1=list(saved_column)
#print (list1)

replacer=AntonymReplacer()
rep1=RepeatReplacer()
rep2=RegexpReplacer()

for i in range(0,len(list1)):
    list1[i]=re.sub(r'[^\x00-\x7F]',r' ',list1[i]) #Replacing non-ascii characters with a space
    list1[i]=rep2.replace(list1[i])                 #texts like can't are converted into can not
    list1[i]=list1[i].split()                       #Splitting each sentence into words
    #list1[i]=[w for w in list1[i] if (len(w)>2)]    #String length of a word is more than 2
    list1[i]=replacer.replace_negations(list1[i])   #Replaces the negative words with antonyms

emo={}
f=open('emotions.txt','r')
for line in f:
    line=line.split(',')
    emo[line[0]]=line[1].rstrip()
#print(emo)
abb={}
Example No. 28
import nltk
from replacers import RegexpReplacer
replacer = RegexpReplacer()
replacer.replace("Don't hesitate to ask questions")
print(replacer.replace("She must've gone to the market but she didn't go"))
Example No. 29
class TweetFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, word2vec_provider: Word2VecProvider,
                 emoji_provider: EmojiProvider):
        self._emoji_provider = emoji_provider
        self._repeat_replacer = RepeatReplacer()
        self._polarity_replacer = PolarityReplacer()
        self._replacement_patterns = NEGATION_REPLACEMENT_PATTERNS
        self._replacement_patterns.extend([
            # remove urls
            (r'((www\.[^\s]+)|(https?://[^\s]+))', ''),
            # remove usernames
            (r'@[^\s]+', ''),
            # remove # from hashtags
            (r'#([^\s]+)', r'\1'),
            # leave only letters
            (r'[^a-zA-Z]+', ' '),
            # remove months
            (r'(\b\d{1,2}\D{0,3})?\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|'
             r'aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|(nov|dec)(?:ember)?)\D?(\d{1,2}(st|nd|rd|th)?)?(([,.\-\/])'
             r'\D?)?((19[7-9]\d|20\d{2})|\d{2})*', '')
        ])
        self._regexp_replacer = RegexpReplacer(self._replacement_patterns)
        self._stem_replacer = StemReplacer()
        self._word2vec_provider = word2vec_provider
        self._stopwords = stopwords.words('english')
        # also add NEG_-prefixed variants so negated stopwords get dropped too
        self._stopwords.extend(['NEG_' + word for word in self._stopwords])
        self._stopwords.extend(["'nt", "st", "nd", "rd", "th", "rt"])
        self._stopwords.extend(self._emoji_provider.emoji)

    @classmethod
    def _count_with_func(cls, tweet, func):
        count = 0
        for word in tweet.split(' '):
            if func(word):
                count += 1
        return count

    @classmethod
    def _count_occurrences(cls, tweet, letter):
        count = 0
        for l in tweet:
            if l == letter:
                count += 1
        return count

    @classmethod
    def _count_uppercase_words(cls, tweet):
        return cls._count_with_func(tweet, lambda word: word == word.upper())

    @classmethod
    def count_exclamation(cls, tweet):
        return cls._count_occurrences(tweet, '!')

    @classmethod
    def count_question_marks(cls, tweet):
        return cls._count_occurrences(tweet, '?')

    def count_positive_emoji(self, tweet):
        return self._count_with_func(
            tweet,
            lambda word: self._emoji_provider.is_positive_emoji(word.strip()))

    def count_negative_emoji(self, tweet):
        return self._count_with_func(
            tweet,
            lambda word: self._emoji_provider.is_negative_emoji(word.strip()))

    def clean_tweet(self, tweet):
        tweet = tweet.lower()
        # transform html encoded symbols
        tweet = BeautifulSoup(tweet, 'lxml').get_text()
        tweet = self._regexp_replacer.replace(tweet)
        tweet = word_tokenize(tweet)
        # eg loooove -> love
        tweet = self._repeat_replacer.replace(tweet)
        tweet = self._stem_replacer.replace(tweet)
        # replace negations
        tweet = self._polarity_replacer.mark_negations(tweet)
        return " ".join(
            [word for word in tweet if word not in self._stopwords]).strip()

    def get_avg_word_similarity(self, tweet, main_word):
        current_similarities = set()
        for word in tweet.split(' '):
            sim = self._word2vec_provider.get_similarity(
                main_word, word.lower())
            if sim is not None:
                current_similarities.add(sim)

        if len(current_similarities) == 0:
            return

        if len(current_similarities) == 1:
            return current_similarities.pop()

        # return np.mean(zscore(list(current_similarities)))

        # if len(current_similarities) == 1:
        #    return current_similarities[0 ]
        current_similarities = list(current_similarities)

        max_sim = np.max(current_similarities)
        min_sim = np.min(current_similarities)
        # normalize to <0;1> and return the mean of the normalized similarities
        return np.mean([(sim - min_sim) / (max_sim - min_sim)
                        for sim in current_similarities])

    def get_word2vec_vector(self, tweet):
        current_word2vec = []
        for word in tweet.split(' '):
            vec = self._word2vec_provider.get_vector(word.lower())
            if vec is not None:
                current_word2vec.append(vec)

        if len(current_word2vec) == 0:
            return np.zeros(200)

        return np.array(current_word2vec).mean(axis=0)

    def fit(self, x, y=None):
        return self

    def transform(self, texts):
        features = np.recarray(shape=(len(texts), ),
                               dtype=[('pos_emoji_count', float),
                                      ('neg_emoji_count', float),
                                      ('uppercase_word_count', float),
                                      ('exclamation_count', float),
                                      ('question_mark_count', float),
                                      ('clean_text', object),
                                      ('word2vec', np.ndarray)])

        for i, text in enumerate(texts):
            features['pos_emoji_count'][i] = self.count_positive_emoji(text)
            features['neg_emoji_count'][i] = self.count_negative_emoji(text)
            features['uppercase_word_count'][i] = self._count_uppercase_words(
                text)
            features['exclamation_count'][i] = self.count_exclamation(text)
            features['question_mark_count'][i] = self.count_question_marks(
                text)
            features['clean_text'][i] = self.clean_tweet(text)
            features['word2vec'][i] = self.get_word2vec_vector(text)

        return features
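A hypothetical usage sketch of the transformer above; the provider objects and the tweets list are assumed to be built elsewhere in the project, and only the fit/transform call pattern is shown:

extractor = TweetFeatureExtractor(word2vec_provider, emoji_provider)
features = extractor.fit(tweets).transform(tweets)
print(features['clean_text'][0])      # normalized, negation-marked text
print(features['word2vec'][0].shape)  # (200,) mean word2vec vector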
Example No. 30
import nltk
from nltk.tokenize import word_tokenize
from replacers import RegexpReplacer
replacer=RegexpReplacer()
word_tokenize("Don't hesitate to ask questions")
print(word_tokenize(replacer.replace("Don't hesitate to ask questions")))
Example No. 31
# Regex Replacer
from replacers import RegexpReplacer
replacer = RegexpReplacer()
print replacer.replace("can't is a contraction")
print replacer.replace("I should've done that thing I didn't do")


# Before Tokenizing
from nltk.tokenize import word_tokenize
replacer = RegexpReplacer()
print word_tokenize("can't is a contraction")
print word_tokenize(replacer.replace("can't is a contraction"))


# Replace repeating characters
from replacers import RepeatReplacer
replacer = RepeatReplacer()
print replacer.replace('looooove')
print replacer.replace('oooooh')
print replacer.replace('goose')


# Replacing words using a synonym word map
from replacers import WordReplacer
replacer = WordReplacer({'bday': 'birthday'})
print replacer.replace('bday')
print replacer.replace('happy')


# Word replacing using synonym file
from replacers import CsvWordReplacer
Example No. 32
basicwordfile = 'highfreq.txt'

g = open(basicwordfile, 'r+')
basicwords = g.readline().strip().split(' ')
g.close()

stopwords_one = set(stopwords.words('english') + list(punctuation))
lemmatizer = WordNetLemmatizer()

#f = open(os.path.join(path, "exp.txt"), 'w+')

for filename in os.listdir(path):
    if '.txt' in filename:
        sys.stdout.write(filename)
        sys.stdout.write('\n')
        replacer = RegexpReplacer()
        with codecs.open(os.path.join(path, filename),
                         "r",
                         encoding='utf-8',
                         errors='ignore') as f:
            orgtext = f.read()
        orgtext1 = replacer.replace(orgtext)
        sent_text = sent_tokenize(orgtext1)
        for sent in sent_text:
            sent_word = word_tokenize(sent)
            for word in sent_word:
                word = lemmatizer.lemmatize(word)
                if (word.lower() not in stopwords_one) and (
                        word not in basicwords) and d.check(word):
                    sys.stdout.write(word)
                    sys.stdout.write('\t')
Example No. 33
#!c:\Python27\python.exe
#!/usr/bin/env python
import os
import cgitb; cgitb.enable()
import cgi, cgitb 
import re
from nltk.corpus import stopwords
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.tokenize import word_tokenize
from replacers import RegexpReplacer
replacer = RegexpReplacer()
import csv
from nltk.corpus import wordnet
from nltk.tokenize.punkt import PunktSentenceTokenizer
from replacers import SpellingReplacer
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.corpus import stopwords
english_stops = set(stopwords.words('english'))
print "Content-Type: text/html"
print
print """
    <html>
    <head>
    <title>Spam Fighter</title>
    <link href='/style.css' rel='stylesheet' type='text/css' />
    </head>
    <body>
Example No. 34
import csv
import os

import io
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus.reader import CHILDESCorpusReader
from nltk.corpus import wordnet as wn

from replacers import RegexpReplacer

corpus_root = nltk.data.find('corpora/childes/data-xml/Eng-UK-MOR/')

replacer = RegexpReplacer()
wordnet_lemmatizer = WordNetLemmatizer()

path = 'Corpus/FolderByAge/'

classeAberta = [
    'NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ',
    'JJR', 'JJS', 'RB', 'RBR', 'RBS'
]

# TODO remove the onomatopoeia?? POS = on


def canonicalTag(palavra):

    j = nltk.tag.pos_tag([palavra])
    # pal_pos = (j[0][0], j[0][1])
Example No. 35
from replacers import RegexpReplacer, RepeatReplacer, WordReplacer, CsvWordReplacer

# contraction
replacer = RegexpReplacer()
print(replacer.replace("can't is a contraction"))
print(replacer.replace("I should't done that thing I didn't do"))

# repeat letters
replacer = RepeatReplacer()
print(replacer.replace('looooove'))
print(replacer.replace('oooooh'))
print(replacer.replace('goose'))

# synonyms
replacer = WordReplacer({'bday': 'birthday'})
print(replacer.replace('bday'))
print(replacer.replace('happy'))

replacer = CsvWordReplacer('syn.csv')
print(replacer.replace('bday'))
print(replacer.replace('NLP'))
print(replacer.replace('happy'))
Example No. 36
def regex_replacer_document(document):
	from replacers import RegexpReplacer
	replacer = RegexpReplacer()
	return replacer.replace(document)
Example No. 37
print(stemmerporter.stem('happiness'))
print(lemmatizer.lemmatize('happiness'))

print(stemmerporter.stem('believes') == 'believ')
print(lemmatizer.lemmatize('believes') == 'belief')

print(stemmerporter.stem('buses') == 'buse')
print(lemmatizer.lemmatize('buses') == 'bus')
print(stemmerporter.stem('bus') == 'bu')

print('============================================')
print('Replacing Words Matching Regular Expressions')
print('============================================')

replacer = RegexpReplacer()
print(replacer.replace("can't is a contraction") == 'cannot is a contraction')
print(
    replacer.replace("I should've done that thing I didn't do") ==
    'I should have done that thing I did not do')

print(
    word_tokenize("can't is a contraction") ==
    ['ca', "n't", 'is', 'a', 'contraction'])
print(
    word_tokenize(replacer.replace("can't is a contraction")) ==
    ['can', 'not', 'is', 'a', 'contraction'])

print('=============================')
print('Removing Repeating Characters')
print('=============================')
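The snippet breaks off right after the "Removing Repeating Characters" banner; it presumably continues along the lines of the RepeatReplacer demos in Examples No. 21, 31 and 35 on this page, roughly:

from replacers import RepeatReplacer

replacer = RepeatReplacer()
print(replacer.replace('looooove') == 'love')
print(replacer.replace('oooooh') == 'ooh')
print(replacer.replace('goose') == 'goose')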
Example No. 38
import csv

from posTag import POSTagger
from replacers import RegexpReplacer
from scores import Score

replacer = RegexpReplacer()
tagger = POSTagger()
some_score = Score()

original_words = []
splited_words = []
splited_words_pos = []
canonical_words_pos = []

with open('input/palavras_por_lista_artigos.csv') as csvfile:
    reader = csv.DictReader(csvfile, delimiter=';')
    for row in reader:
        original_words.append((row['list'], row['word'].strip()))


splited_words = replacer.replace_all_list(original_words)
canonical_words_pos = tagger.canonicalTag(splited_words)

classeAberta = ['NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']

list = []
words = []
index = canonical_words_pos[0][0]
# print(index)
Example No. 39
tofind_sentiment_sentence = " ".join(
    [word.lower() for word in tofind_sentiment_sentence.split(" ")])
sentiment_wordslist = tofind_sentiment_sentence.split(" ")

# if a conjunction is present, decide whether to keep the clause before or after it
for conjuctive_word_index, conjuctive_word in enumerate(sentiment_wordslist):
    if conjuctive_word in firstclause_emotion:
        remove_sentence = 'after'
    if conjuctive_word in secondclause_emotion:
        remove_sentence = 'before'

try:
    if (remove_sentence == 'after'):
        tofind_sentiment_sentence = " ".join(
            sentiment_wordslist[conjuctive_word_index:])
except:
    pass

# print(tofind_sentiment_sentence)
ob = RegexpReplacer()
replaced_after = ob.replace(tofind_sentiment_sentence)
#print(replaced_after)
dickeyword_count = emotion_count(
    replaced_after)  #keyword_analysis counter returned
dicpharse_count = pharse_sentiment(
    replaced_after)  #pharse_analysis counter returned
print(dickeyword_count.most_common(3))
print(dicpharse_count.most_common(3))

print("Overall emotion is anger")