import re

from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import strip_accents_ascii


def normalize(text):
    # Accept raw bytes as well as str.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    # Remove URLs (scheme://...) and dotted-quad IP addresses.
    text = re.sub(r'[a-zA-Z]+://[^\s]*', '', text)
    text = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', '', text)
    # Fold accented characters to their closest ASCII equivalents.
    text = strip_accents_ascii(text)
    # Tokenize, lowercase, and rejoin into a single string.
    tokens = TreebankWordTokenizer().tokenize(text)
    return ' '.join(token.lower() for token in tokens)
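# Usage sketch for normalize(); the sample input below is hypothetical,
# not from the original source (requires nltk and scikit-learn).
sample = 'Visit https://example.com from 10.0.0.1 for crème brûlée!'.encode('utf-8')
print(normalize(sample))
# expected output, roughly: 'visit from for creme brulee !'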
import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


def clean_text(text):
    # Accept raw bytes as well as str.
    if hasattr(text, "decode"):
        text = text.decode("utf-8")
    # Tokenize and lowercase.
    tokens = [w.lower() for w in word_tokenize(text)]
    # Strip punctuation from each token.
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # Keep purely alphabetic tokens that are not English stop words.
    stop_words = set(stopwords.words('english'))
    return [w for w in stripped if w.isalpha() and w not in stop_words]
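# Usage sketch for clean_text(); hypothetical sample sentence, not from
# the original source. Requires the NLTK 'punkt' and 'stopwords' data.
print(clean_text("The quick, brown fox jumps over the lazy dog."))
# expected output, roughly: ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']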
import re


def normalize(text):
    # Accept raw bytes as well as str.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    # Remove URLs (scheme://...) and dotted-quad IP addresses.
    text = re.sub(r'[a-zA-Z]+://[^\s]*', '', text)
    text = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', '', text)
    return text
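# Usage sketch for this lighter-weight normalize(); hypothetical input,
# not from the original source.
print(normalize(b'See http://example.org or ping 127.0.0.1'))
# expected output, roughly: 'See  or ping '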
import pickle

from gensim.models.phrases import Phrases
from nltk.tokenize import PunktSentenceTokenizer

# This section runs when you supply the --generate flag, indicating that
# you want to recreate the training data/labels.
if REGENERATE:
    print("Generating data from scratch.")
    texts = pickle.load(open(OUTFILE, 'rb'))[0]

    # Split the list of texts into a list of sentences. At this point
    # (in the training data) document borders are removed.
    sentences = [sentence
                 for text in texts
                 for sentence in PunktSentenceTokenizer().tokenize(text.decode("utf8"))]
    sentences = [s.strip(' \n,.;:').replace('\n', ' ').split(' ') for s in sentences]

    # Create and train bigram/trigram converters. The infinite threshold
    # means the unigram model never merges tokens into phrases.
    # (Note: export_phrases(sentences) is the gensim < 4.0 API.)
    unigram = Phrases(sentences, threshold=float("inf"))
    unigrams = unigram.export_phrases(sentences)
    grams = [unigram]

    sentences_copy = sentences
    threshold = 9.0
    while True:
        bigram = Phrases(sentences_copy, threshold=threshold)
        bigrams = bigram.export_phrases(sentences_copy)
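# The excerpt above ends mid-loop. One plausible continuation, assuming the
# loop keeps promoting n-grams (bigrams, then trigrams, ...) until a pass
# finds no new phrases; this is a hypothetical reconstruction, not from the
# original source:
#
#     while True:
#         bigram = Phrases(sentences_copy, threshold=threshold)
#         bigrams = bigram.export_phrases(sentences_copy)
#         if not set(bigrams) - set(unigrams):  # no new phrases found
#             break
#         grams.append(bigram)
#         sentences_copy = [bigram[s] for s in sentences_copy]
#         unigrams = bigrams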
from textblob import TextBlob


def detect_sentiment(text):
    # Accept raw bytes as well as str; polarity ranges from -1.0 to 1.0.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    return TextBlob(text).sentiment.polarity
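# Usage sketch for detect_sentiment(); hypothetical inputs, not from the
# original source.
print(detect_sentiment('What a wonderful day!'))  # positive, > 0
print(detect_sentiment('This is terrible.'))      # negative, < 0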