import html

from normalizr import Normalizr


def xstr(s):
    # Assumed helper from the surrounding project: map None to ''.
    return '' if s is None else str(s)


def normalize(text):
    normalizr = Normalizr(language='en')
    normalizations = [
        'remove_extra_whitespaces',
        ('replace_punctuation', {'replacement': ' '}),
        'lower_case',
        ('remove_stop_words', {'ignore_case': False}),  # boolean flag, not the string 'False'
    ]
    text = normalizr.normalize(xstr(text), normalizations)
    # Unescape HTML entities (e.g. '&amp;') left in the normalized text.
    return html.unescape(text)
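# A minimal usage sketch; the sample string is illustrative and assumes the
# 'normalizr' package is installed.
print(normalize('Some &amp; HTML,   with UPPER case and stop words!'))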
import re

from normalizr import Normalizr

# 'correction' (used below) is a spell-correction helper defined elsewhere in
# the original project, e.g. a Norvig-style spell checker.


def normalisation(tweet):
    # Strip @mentions, inline HTML tags and #hashtags from the lowercased tweet.
    mention_removed = re.sub(r'(?:@[\w_]+)', '', tweet.lower())
    html_removed = re.sub(r'<[^>]+>', '', mention_removed)
    hashtag_removed = re.sub(r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", '', html_removed)
    # Collapse character runs to at most two ('loooove' -> 'loove') and
    # squeeze repeated spaces.
    removed_repeated_chars = re.sub(r'(.)\1+', r'\1\1', hashtag_removed)
    normalised_text1 = re.sub(' +', ' ', removed_repeated_chars)

    normalizr = Normalizr(language='en')

    normalizations = [
        ('replace_urls', {'replacement': ' '}),
        ('replace_punctuation', {'replacement': ' '}),
        ('replace_emojis', {'replacement': ' '}),
        ('replace_hyphens', {'replacement': ' '}),
        ('replace_symbols', {'replacement': ' '}),
        'remove_accent_marks',
        'remove_stop_words',
        'remove_extra_whitespaces',
    ]

    normalised_text2 = normalizr.normalize(normalised_text1, normalizations)
    array_words = normalised_text2.split()

    # Spell-correct each remaining token and rebuild the tweet.
    normalised_text3 = [correction(word) for word in array_words]
    normalised_tweet = " ".join(normalised_text3)

    return normalised_tweet
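# A minimal usage sketch. 'correction' is stubbed with a hypothetical identity
# function here so the call runs without the project's real spell corrector.
def correction(word):
    return word

print(normalisation('@user Loooove this tweet!!! #blessed <b>wow</b>'))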
Example No. 3
import networkx as nx
from normalizr import Normalizr

normalizr = Normalizr(language='en')
graph = nx.Graph()  # word co-occurrence graph built below

bannedWords = ["", "rt", "amp"]

# 'content' is assumed to hold one raw tweet per entry, loaded earlier in the
# original script.
stringList = []
for x in range(0, len(content)):
    stringList.append(content[x].split(" "))

#Used to store the index of the tweet that contains a word in the corpus
idxInCorpus = -1

for x in range(0, len(content)):
    if x % 100 == 0:
        print("tweet " + str(x) + " of " + str(len(content)))
    tweetWords = stringList[x]
    numWords = len(tweetWords)
    # Normalize every word of the tweet in place.
    for i in range(0, numWords):
        stringList[x][i] = normalizr.normalize(stringList[x][i].lower())
    # Add a co-occurrence edge for every unordered pair of words in the tweet.
    #numWordsInCorpus = 0
    #if (word in crpNodeList):
    #    numWordsInCorpus = numWordsInCorpus + 1
    #if (numWordsInCorpus > 1):
    for i in range(0, numWords):
        firstWord = stringList[x][i]
        for j in range(i + 1, numWords):
            secondWord = stringList[x][j]
            w = 1
            #if (firstWord in crpNodeList or secondWord in crpNodeList):
            #if (firstWord in crpNodeList and secondWord in crpNodeList):
            if graph.has_edge(firstWord, secondWord):
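                # (The listing is truncated at this point; a typical completion
                # is assumed here: increase the weight of an existing edge,
                # otherwise create it with weight w.)
                graph[firstWord][secondWord]['weight'] += w
            else:
                graph.add_edge(firstWord, secondWord, weight=w)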
Example No. 4
# The opening lines of this example are truncated in the original listing:
# 'normalizr', 'texto', 'arq' and 'arq_2' are set up in the missing portion.
# The head of the normalizations list below is reconstructed by analogy with
# the example above.
normalizations = [
    ('replace_punctuation', {'replacement': ' '}),
    ('replace_emojis', {'replacement': ' '}),
    ('replace_hyphens', {'replacement': ' '}),
    ('replace_symbols', {'replacement': ' '}),
    'remove_accent_marks',
    'remove_stop_words',
    'remove_extra_whitespaces',
]

# Write the normalized text to the output file.
arq_2.write(normalizr.normalize(texto, normalizations))

arq_2.close()
arq.close()

# Counting the total number of valid but repeated words in the dataset. TOTAL: 4650
'''
arq_2 = open("FINAL_Entretenimento.txt", 'r')  # 'r': read the file back; 'w' would truncate it

st = ""
for z in arq_2:
    st += z

words = st.split()
print(words)
print(len(words))
'''
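# A minimal sketch of the word count the commented-out block above aims at,
# using collections.Counter; the file name comes from the example, and the
# 4650 figure is the original author's result, not reproduced here.
from collections import Counter

with open("FINAL_Entretenimento.txt") as f:
    counts = Counter(f.read().split())

print(sum(counts.values()))  # total word tokens (repeats included)
print(len(counts))           # distinct words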