Example No. 1
def lima(word, words):
    # Lemmatize `word` using the part-of-speech tag it receives in the context of `words`.
    lemmatiser = wnl()
    words_tag = dict(pos_tag(words))
    return lemmatiser.lemmatize(word, get_wordnet_pos(words_tag.get(word)))
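The snippet above assumes `wnl` is `nltk.stem.WordNetLemmatizer`, `pos_tag` is `nltk.pos_tag`, and `get_wordnet_pos` maps a Penn Treebank tag to a WordNet POS constant. A minimal sketch of those assumed pieces and one call, for illustration only:

from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer as wnl

def get_wordnet_pos(treebank_tag):
    # Assumed helper: map the first letter of a Treebank tag to a WordNet POS,
    # falling back to noun when the tag is missing or unknown.
    if not treebank_tag:
        return wordnet.NOUN
    return {'J': wordnet.ADJ, 'V': wordnet.VERB,
            'N': wordnet.NOUN, 'R': wordnet.ADV}.get(treebank_tag[0], wordnet.NOUN)

words = word_tokenize("the striped bats were hanging on their feet")
print(lima("hanging", words))  # -> "hang" when the tagger marks it as a verb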
Example No. 2
def clean_text(text):
    # Strip punctuation characters.
    data = ''.join(char for char in text if char not in string.punctuation)
    # Drop @mentions, URLs, a leading "rt", and any remaining non-alphanumerics.
    words = regex.sub(r"(@[A-Za-z0-9]+)|([^A-Za-z0-9 \t])|(\w+:\/\/\S+)|^rt|http.+?", " ", data)
    words = words.lower()
    # Lemmatize every token as a verb.
    lemmatiser = wnl()
    final_words = [lemmatiser.lemmatize(word, pos="v") for word in words.split()]
    return ' '.join(final_words)
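A hedged usage sketch, assuming `regex` is the stdlib `re` module imported under that name (the third-party `regex` package would behave the same here) and `wnl` is NLTK's WordNetLemmatizer:

import string
import re as regex  # assumption: the snippet's `regex` is the stdlib re module
from nltk.stem import WordNetLemmatizer as wnl

cleaned = clean_text("RT @user I was running to the store!!")
print(cleaned)  # verbs are reduced to their base form, e.g. "was" -> "be", "running" -> "run"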
Example No. 3
def clean_text_data(data):
    # For each text: drop stop words, keep only alphabetic characters, and lemmatize.
    regex = re.compile("[^a-zA-Z]")
    lemmatized_without_stop = [
        " ".join([
            wnl().lemmatize(regex.sub("", word.lower()))
            for word in text.split(" ") if word not in stop
        ]) for text in data
    ]
    return lemmatized_without_stop
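This function relies on names defined elsewhere in its module; the sketch below fills them in under stated assumptions (`stop` as NLTK's English stop-word list, `wnl` as WordNetLemmatizer) purely for illustration:

import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer as wnl

stop = set(stopwords.words('english'))  # assumed definition of `stop`

print(clean_text_data(["The cats were chasing mice!"]))
# -> ['the cat chasing mouse'] ("were" is dropped as a stop word; nouns get singular forms)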
Example No. 4
    def __init__(self):
        self.error_list = []
        self.error_dic = dict()
        self.rule_dic = {'i': ['am', 'could', 'should', 'have', 'did', 'had', 'will', 'was', 'can', 'shall', 'may', 'might', 'must', 'would'],
                         'he': ['is', 'could', 'should', 'did',  'has', 'will', 'had', 'was', 'can', 'shall', 'may', 'might', 'must', 'would'],
                         'you': ['are', 'had', 'could', 'should', 'did',  'have', 'will', 'were', 'can', 'shall', 'may', 'might', 'must', 'would']
                         }
        self.tool = language_check.LanguageTool('en-US')  # grammar checker
        self.lemmatizer = wnl()                           # WordNet lemmatizer
        self.nlp = spacy.load('en')                       # spaCy English pipeline

        logging.basicConfig(filename="log_file.log", format='%(asctime)s %(message)s', filemode='w', level=logging.DEBUG)
Example No. 5
def PreprocessCSV(csvfile, outputfile):
    """
    output a csv file and return a word list.
    """
    print("Start preprocessing %s ..." % csvfile)
    voc = []
    dataframe = pandas.read_csv(csvfile, usecols=["Insult", "Comment"])
    labels = dataframe.iloc[:, 0].tolist()
    sents = dataframe.iloc[:, 1].tolist()
    newsents = []
    for sent in sents:
        # process sentences of samples
        # in case of blank, add a useless flag at the end
        sent = sent.strip("\"").lower()
        sent = sent.replace("\t", " ")
        sent = sent.replace("\n", " ")
        sent = sent.replace("\xa0", " ")
        sent = sent.replace("\xc2", " ")
        sent = sent.replace("\xc8", " ")
        sent = sent.replace("\xec", " ")
        sent = sent.replace("\x80", " ")
        sent = sent.replace("\xa6", " ")
        sent = re.sub(r"[$%^&*\[\]]", "", sent)
        tks = wt(sent)
        newtks = []

        # build first-part features: keep alphabetic tokens, lemmatized
        lemmatiser = wnl()
        for tk in tks:
            if tk.isalpha():
                tk = lemmatiser.lemmatize(tk)
                newtks.append(tk)
                voc.append(tk)
        newsent = " ".join(newtks)
        newsent = newsent + " " + "auselessflag"
        newsents.append(newsent)

    # write the outputfile
    col_order = ["Insult", "Comment"]
    dataframe2 = pandas.DataFrame({"Insult": labels, "Comment": newsents})
    dataframe2.to_csv(outputfile, index=False, columns=col_order)
    fdist = FreqDist(voc)
    wordlist = list(fdist.keys())
    print(
        "file \"%s\" is preprocessed, and there are %d keys in the returned wordlist."
        % (csvfile, len(wordlist)))
    return wordlist
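A hedged usage sketch with the imports the function appears to rely on; the file names below are hypothetical, and the input CSV is assumed to contain the "Insult" and "Comment" columns that `usecols` expects:

import re
import pandas
from nltk import FreqDist
from nltk.stem import WordNetLemmatizer as wnl
from nltk.tokenize import word_tokenize as wt

# Hypothetical file names; the input must have "Insult" and "Comment" columns.
vocabulary = PreprocessCSV("train.csv", "train_clean.csv")
print(len(vocabulary))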
Example No. 6
def preProcessData(labeled_tweets):
	count = 0
	for tweet, sentiment in labeled_tweets:

		# Remove hashtag symbols
		tweet = re.sub('#', '', tweet)

		# Remove usernames like @Rahul
		tweet = re.sub(r'@[\w\d_]*', '', tweet)

		# Remove URLs
		tweet = re.sub(r'https?://[\w\d\./]*', '', tweet)

		# Remove punctuation
		tweet = re.sub(r'[%\.\'\"\?:,;!-]', ' ', tweet)

		# Remove HTML tags
		tweet = re.sub('<.*?>', '', tweet)

		# Collapse repeated characters (e.g. "loooove" -> "love")
		tweet = re.sub(r'([a-z])\1+', r'\1', tweet)

		# Remove a leading run of digits or special characters
		tweet = re.sub(r'^[^a-zA-Z]+', ' ', tweet)

		# Split camelCase into space-separated words
		tweet = re.sub(r"([a-z])([A-Z])", r"\g<1> \g<2>", tweet)

		# Collapse additional whitespace
		tweet = re.sub(r'[\s]+', ' ', tweet)

		# Remove stop words (filter into a new list rather than
		# removing items from the list while iterating over it)
		stop_words = nltk.corpus.stopwords.words('english')
		tweet = [word for word in tweet.split() if word not in stop_words]

		# Lemmatize words
		tweet = [wnl().lemmatize(word) for word in tweet]

		if count == 6:
			print(tweet)
		count = count + 1
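As written, the loop only prints the seventh cleaned tweet and never returns anything. A minimal sketch of a variant that instead collects and returns the cleaned (tokens, sentiment) pairs, assuming the same dependencies (`re`, `nltk`, and WordNetLemmatizer as `wnl`):

import re
import nltk
from nltk.stem import WordNetLemmatizer as wnl

def preprocess_tweets(labeled_tweets):
    # Return a list of (token_list, sentiment) pairs instead of printing one tweet.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    lemmatiser = wnl()
    cleaned = []
    for tweet, sentiment in labeled_tweets:
        tweet = re.sub(r'https?://\S+|@\w+|#|<.*?>', '', tweet)  # URLs, mentions, '#', HTML tags
        tweet = re.sub(r'[^a-zA-Z\s]', ' ', tweet).lower()       # keep letters only
        tokens = [lemmatiser.lemmatize(w) for w in tweet.split() if w not in stop_words]
        cleaned.append((tokens, sentiment))
    return cleaned

sample = [("I loooove this! http://t.co/x #win", "positive")]
print(preprocess_tweets(sample))  # -> [(['loooove', 'win'], 'positive')]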
Example No. 7
            else:
                uncontracted.append(x)
        elif x.lower() in contractions.keys():
            uncontracted.append(contractions[x.lower()])
        elif x in contractions.keys():
            uncontracted.append(contractions[x])
        else:
            uncontracted.append(x)

    return (" ".join(uncontracted))


# In[17]:

from nltk.stem import WordNetLemmatizer as wnl
lemmatizer = wnl()

# In[18]:


def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
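`nltk_tag_to_wordnet_tag` assumes `wordnet` has been imported from `nltk.corpus`. A hedged sketch of how it is typically combined with `pos_tag` and the `lemmatizer` instance created above:

import nltk
from nltk.corpus import wordnet

def lemmatize_sentence(sentence):
    # POS-tag the tokens, map each Treebank tag to a WordNet POS, then lemmatize.
    tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    out = []
    for token, tag in tagged:
        wn_tag = nltk_tag_to_wordnet_tag(tag)
        out.append(lemmatizer.lemmatize(token, wn_tag) if wn_tag else token)
    return " ".join(out)

print(lemmatize_sentence("The striped bats were hanging on their feet"))
# -> roughly "The striped bat be hang on their foot", depending on the tagger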
Example No. 8
               ranscendent cinematic experience. Thank you to everybody at 
               Fox and New Regency … my entire team. I have to thank 
               everyone from the very onset of my career … To my parents; 
               none of this would be possible without you. And to my 
               friends, I love you dearly; you know who you are. And lastly,
               I just want to say this: Making The Revenant was about
               man's relationship to the natural world. A world that we
               collectively felt in 2015 as the hottest year in recorded
               history. Our production needed to move to the southern
               tip of this planet just to be able to find snow. Climate
               change is real, it is happening right now. It is the most
               urgent threat facing our entire species, and we need to work
               collectively together and stop procrastinating. We need to
               support leaders around the world who do not speak for the 
               big polluters, but who speak for all of humanity, for the
               indigenous people of the world, for the billions and 
               billions of underprivileged people out there who would be
               most affected by this. For our children’s children, and 
               for those people out there whose voices have been drowned
               out by the politics of greed. I thank you all for this 
               amazing award tonight. Let us not take this planet for 
               granted. I do not take tonight for granted. Thank you so very much."""

s = nk.sent_tokenize(paragraph)
lm = wnl()

# Lemmatization: replace each word with its base form so the output is more meaningful
for i in range(len(s)):
    words = nk.word_tokenize(s[i])
    nwords = [lm.lemmatize(word) for word in words]
    s[i] = ' '.join(nwords)
Example No. 9
def wordLemmatizer(tweet_words):
    return [wnl().lemmatize(word) for word in tweet_words]
Example No. 10
    def __init__(self):

        self.tokenizer = data.load('tokenizers/punkt/english.pickle')
        self.wnl       = wnl()
        self.delims = {".", ',', "!", ":", ";"}
        self.stopwords = {"the"}
Example No. 11
def lemmatizer_list(row):
	lemmatizer = wnl()
	tokenized_words = row['tokenized_text']
	lemmatized_tokens = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in tokenized_words]
	return lemmatized_tokens
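Here `get_wordnet_pos` appears to take the word itself rather than a tag, and `row` looks like a dict-style record (for example a pandas row) with a 'tokenized_text' field. A sketch under those assumptions, for illustration only:

from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer as wnl

def get_wordnet_pos(word):
    # Assumed helper: tag the single word and map its tag to a WordNet POS.
    tag = pos_tag([word])[0][1][0].upper()
    return {"J": wordnet.ADJ, "V": wordnet.VERB,
            "N": wordnet.NOUN, "R": wordnet.ADV}.get(tag, wordnet.NOUN)

row = {"tokenized_text": ["the", "geese", "were", "flying"]}
print(lemmatizer_list(row))  # output depends on how each isolated word is tagged, e.g. "geese" -> "goose"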
Example No. 12
sys.path.append("..")
from Parsers.Stemming.lovins    import stem              as lovins_stem
from nltk.stem.porter           import PorterStemmer     as porter
from nltk.stem.lancaster        import LancasterStemmer  as lancs
from nltk.stem.snowball         import EnglishStemmer    as snowball
from nltk.stem                  import WordNetLemmatizer as wnl

#Source : http://stackoverflow.com/questions/446052/python-best-way-to-check-for-python-version-in-a-program-that-uses-new-language
reqVersion = (3,0)
curVersion = sys.version_info

_lancs = lancs()
_porter = porter()
_snowball = snowball()
_wnl = wnl()
        
def removeEscapeChars(textString):
        if curVersion < reqVersion:
                return textString.encode('string_escape').decode('string_escape')   #Python 2
        else:
                return textString.encode('unicode_escape').decode('unicode_escape') #Python 3

def getASCIIChars(textString):
        return unicode(textString, 'ascii', 'ignore')

#Source: http://www.packtpub.com/article/parsing-specific-data-python-text-processing
def detectEncoding(textString):
        try:
                return chardet.detect(textString)
        except UnicodeDecodeError:
Example No. 13
# generate the list of words
f = open('word_freq_final.txt', 'r')
wordList = []
count = 0
for line in f:
	wordList.append(line.split(',')[0])
wordList.sort()
"""g = open('wordlist_final.txt', 'w')
for i in range(len(wordList)):
	g.write(str(i) + "-" + wordList[i] + "\n")"""

cwd = os.getcwd()
valid_ext = ".jpg"

d = enchant.Dict("en")
wnl = wnl()

# files of data set images
# the data set must be contained in a folder (titled "imgs") within the working directory
paths = ["imgs/1_early-renaissance"
		, "imgs/2_high-renaissance"
		, "imgs/3_mannerism-late-renaissance"
		, "imgs/4_northern-renaissance"
		, "imgs/5_baroque"
		, "imgs/6_rococo"
		, "imgs/7_romanticism"
		, "imgs/8_impressionism"
		, "imgs/9_post-impressionism"
		, "imgs/10_realism"
		, "imgs/11_art-nouveau-modern"
		, "imgs/12_cubism"