def normalize(text):
    """Normalize *text* for downstream matching.

    Pipeline: collapse extra whitespace, replace punctuation with spaces,
    lower-case, remove English stop words, then unescape HTML entities.

    Relies on the external ``Normalizr`` library and the ``xstr`` helper
    (presumably coerces None to "" — confirm against its definition).

    Returns the normalized text as a plain ``str``.
    """
    normalizr = Normalizr(language='en')
    normalizations = [
        'remove_extra_whitespaces',
        ('replace_punctuation', {'replacement': ' '}),
        'lower_case',
        # BUG FIX: the original passed the *string* 'False', which is truthy,
        # so stop-word removal effectively ran with ignore_case enabled.
        # Pass the boolean False to get the behaviour the author spelled out.
        ('remove_stop_words', {'ignore_case': False}),
    ]
    h = HTMLParser()
    text = normalizr.normalize(xstr(text), normalizations)
    # NOTE(review): HTMLParser.unescape() is deprecated and removed in
    # Python 3.9+; migrate to html.unescape() when imports can be touched.
    return str(h.unescape(text))
def normalisation(tweet):
    """Clean a raw tweet and return it as one spell-corrected string.

    Steps: lower-case; strip @mentions, HTML tags and #hashtags; squeeze
    runs of a repeated character down to two; collapse spaces; run the
    Normalizr pipeline (URLs, punctuation, emojis, hyphens, symbols,
    accents, stop words, extra whitespace); finally spell-correct each
    surviving token via the external ``correction`` helper.
    """
    lowered = tweet.lower()
    # Remove @mentions, inline HTML tags, then #hashtags, in that order.
    without_mentions = re.sub(r'(?:@[\w_]+)', '', lowered)
    without_html = re.sub(r'<[^>]+>', '', without_mentions)
    without_hashtags = re.sub(r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", '', without_html)
    # "loooove" -> "loove": cap any repeated character at two occurrences.
    squeezed = re.sub(r'(.)\1+', r'\1\1', without_hashtags)
    collapsed = re.sub(' +', ' ', squeezed)

    pipeline = [
        ('replace_urls', {'replacement': ' '}),
        ('replace_punctuation', {'replacement': ' '}),
        ('replace_emojis', {'replacement': ' '}),
        ('replace_hyphens', {'replacement': ' '}),
        ('replace_symbols', {'replacement': ' '}),
        'remove_accent_marks',
        'remove_stop_words',
        'remove_extra_whitespaces',
    ]
    normalized = Normalizr(language='en').normalize(collapsed, pipeline)
    # Spell-correct token by token, then stitch back into a single string.
    return " ".join(correction(word) for word in normalized.split())
bannedWords = ["", "rt", "amp"] for x in range(0, len(content)): stringList.append([]) stringList[x] = content[x].split(" ") #Used to store the index of the tweet that contains a word in the corpus dxInCorpus = -1 for x in range(0, len(content)): if (x % 100 == 0): print("tweet " + str(x) + " of " + str(len(content))) tweetWords = stringList[x] numWords = len(tweetWords) for i in range(0, numWords): word = normalizr.normalize(stringList[x][i].lower()) stringList[x][i] = word #numWordsInCorpus = 0; for i in range(0, numWords): word = stringList[x][i] #if (word in crpNodeList): # numWordsInCorpus = numWordsInCorpus + 1; #if (numWordsInCorpus > 1): for i in range(0, numWords): firstWord = stringList[x][i] for j in range(i + 1, numWords): secondWord = stringList[x][j] w = 1 #if (firstWord in crpNodeList or secondWord in crpNodeList): #if (firstWord in crpNodeList and secondWord in crpNodeList): if graph.has_edge(firstWord, secondWord):
}), ('replace_emojis', { 'replacement': ' ' }), ('replace_hyphens', { 'replacement': ' ' }), ('replace_symbols', { 'replacement': ' ' }), 'remove_accent_marks', 'remove_stop_words', 'remove_extra_whitespaces', ] arq_2.write(normalizr.normalize(texto, normalizations)) arq_2.close() arq.close() #calculando a quantidade total de palavras válidas porém repetidas da base. TOTAL : 4650 ''' arq_2 = open("FINAL_Entretenimento.txt", 'w') st = "" for z in arq_2: st += z z = z.split() print (z) print (len(z))