import re
from sklearn.feature_extraction.text import strip_accents_ascii
from nltk.tokenize import TreebankWordTokenizer


def normalize(text):
    # Accept both bytes and str input.
    if hasattr(text, "decode"):
        text = text.decode("utf-8")
    # Strip URLs (the character class must be [a-zA-Z]; the original
    # [a-zA-z] also matched the punctuation between 'Z' and 'a').
    text = re.sub(r'[a-zA-Z]+://[^\s]*', '', text)
    # Strip dotted-quad IP addresses.
    text = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', '', text)
    # Replace accented characters with ASCII equivalents.
    text = strip_accents_ascii(text)
    # Tokenize and lowercase; keep the text as str so the tokenizer
    # works under Python 3 (the original re-encoded to bytes first).
    return ' '.join(token.lower() for token in TreebankWordTokenizer().tokenize(text))
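A quick check of the normalizer above; the sample bytes are made up, and the expected output assumes NLTK's standard Treebank tokenization:

sample = b"Visit http://example.com from 192.168.0.1 for the r\xc3\xa9sum\xc3\xa9"
print(normalize(sample))  # -> 'visit from for the resume'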
Example #2
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


def clean_text(text):
    # Accept both bytes and str input.
    if hasattr(text, "decode"):
        text = text.decode("utf-8")
    # Tokenize and lowercase.
    tokens = [w.lower() for w in word_tokenize(text)]
    # Strip punctuation characters from each token.
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # Keep alphabetic tokens only, then drop English stop words.
    stop_words = set(stopwords.words('english'))
    return [w for w in stripped if w.isalpha() and w not in stop_words]
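A minimal usage sketch (the sentence is made up; the output assumes NLTK's standard English stop word list):

print(clean_text("The quick brown fox jumps over the lazy dog."))
# -> ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']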
Example #3
import re


def normalize(text):
    # Accept both bytes and str input.
    if hasattr(text, "decode"):
        text = text.decode("utf-8")
    # Strip URLs (character class fixed to [a-zA-Z]) and dotted-quad IPs.
    text = re.sub(r'[a-zA-Z]+://[^\s]*', '', text)
    text = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', '', text)
    return text
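Unlike Example #1, this variant only removes URLs and IP addresses, leaving case and accents intact; a minimal check with a made-up string:

print(normalize("Ping 10.0.0.1 via https://example.com now"))
# -> 'Ping  via  now' (double spaces remain where matches were removed)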
Example #4


import pickle

from nltk.tokenize import PunktSentenceTokenizer
from gensim.models.phrases import Phrases

# This section runs if you supply the --generate flag, indicating
# you want to recreate the training data/labels.
if REGENERATE:

	print("Generating data from scratch.")

	# OUTFILE is defined earlier in the script; the first pickled
	# item holds the raw texts.
	with open(OUTFILE, 'rb') as f:
		texts = pickle.load(f)[0]

	# Split each text into a flat list of sentences; document borders
	# are removed from the training data at this point.
	tokenizer = PunktSentenceTokenizer()
	texts = [t.decode("utf8") if hasattr(t, "decode") else t for t in texts]
	sentences = [sent for text in texts for sent in tokenizer.tokenize(text)]
	# Strip surrounding punctuation, flatten newlines, and split on spaces.
	sentences = [s.strip(' \n,.;:').replace('\n', ' ').split(' ') for s in sentences]

	# Create and train bigram/trigram converters (gensim 3.x API, where
	# export_phrases() takes the sentence stream as an argument).
	unigram = Phrases(sentences, threshold=float("inf"))
	unigrams = unigram.export_phrases(sentences)

	grams = [unigram]

	sentences_copy = sentences

	threshold = 9.0

	# The loop body was truncated in the original; the stopping rule
	# below (halt once a pass finds no new phrases) is an assumption.
	while True:
		bigram = Phrases(sentences_copy, threshold=threshold)
		bigrams = bigram.export_phrases(sentences_copy)
		if not list(bigrams):
			break
		grams.append(bigram)
		# Merge detected phrases so the next pass can form longer n-grams.
		sentences_copy = [bigram[s] for s in sentences_copy]
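	# A sketch (not from the original) of how the accumulated models
	# could be chained to phrase-merge a new tokenized sentence;
	# grams[0] was trained with an infinite threshold and merges nothing.
	def to_ngrams(sentence):
		for gram in grams[1:]:
			sentence = gram[sentence]
		return sentence

	print(to_ngrams("new york city is very big".split(' ')))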
Example #5
from textblob import TextBlob


def detect_sentiment(text):
    # Polarity in [-1.0, 1.0]; accept bytes or str input.
    if hasattr(text, "decode"):
        text = text.decode("utf-8")
    return TextBlob(text).sentiment.polarity
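A quick check of the sentiment helper (illustrative strings; only the sign of the score is asserted):

print(detect_sentiment("I love this library"))  # positive, > 0
print(detect_sentiment(b"This is terrible"))    # negative, < 0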