예제 #1
0
 def _clean_line(self, text):
     text = self._re_sub(r"http\S+", "<url>", text)
     text = self._re_sub(r"@[A-Za-z0-9]+", "<user>", text)
     text = self._re_sub(r"#[A-Za-z0-9]+", "", text)
     text = text.lower()
     text = text.strip()
     return text
def preprocess_text(text):
    """
    Preprocess a tweet-like string: removes links, mentions, hashtags,
    expands common contractions, replaces punctuation with spaces, and
    drops short tokens and the HTML-escape artifact 'amp'.

    Parameters
    ----------
    text: str
        a string to be preprocessed

    Returns
    -------
    text: str
        a preprocessed string
    """
    text = text.lower()                                    # lowercase first so regexes below only need lowercase forms
    text = re.sub(r"http\S+", "", text)                    # drop links
    text = re.sub(r"\@\S+", "", text)                      # drop @mentions
    text = re.sub(r"#\S+", "", text)                       # drop hashtags
    text = re.sub(r"won\'t", "will not", text)             # BUGFIX: "won't" expands to "will not", not "would not"
    text = re.sub(r"n\'t", " not", text)                   # generic n't contraction ("don't" -> "do not")
    text = REPLACE_BY_SPACE.sub(' ', text)                 # module-level regex: punctuation -> space
    words = [word.strip() for word in text.split()]        # tokenize and strip stray whitespace
    words = [word for word in words if len(word) > 2]      # keep only tokens longer than 2 characters
    words = [word for word in words if word != 'amp']      # drop the twitter '&amp;' artifact
    return ' '.join(words)
 def preprocess(self, text):
   """
   Clean a raw string and return a padded integer sequence (shape (1, 100)).

   Steps: lowercase, strip digits and extra whitespace, remove punctuation,
   drop English stopwords, lemmatize as verbs, then tokenize and pad.

   NOTE(review): the Tokenizer is fitted on this single text, so the integer
   ids are local to each call — confirm that is intended by the caller.
   """
   punctuation_edit = string.punctuation + "0123456789"
   text = text.lower()
   # remove numbers
   text = re.sub(r'\d+', '', text)
   # collapse extra whitespace
   text = " ".join(text.split())
   # remove punctuation
   text = text.translate(str.maketrans('', '', string.punctuation))
   # Remove stopwords: use a set for O(1) lookups and a comprehension so that
   # EVERY occurrence is dropped.  BUGFIX: the old `list.remove()` loop only
   # removed the first occurrence of each stopword.
   stop_words = set(stopwords.words('english'))
   word_tokens = [word for word in word_tokenize(text) if word not in stop_words]
   # lemmatize treating each token as a verb
   lemmatizer = WordNetLemmatizer()
   lemmas = [lemmatizer.lemmatize(word, pos='v') for word in word_tokens]
   text = ' '.join(lemmas)
   text = [text]
   tok = Tokenizer(num_words=20000, filters=punctuation_edit)
   tok.fit_on_texts(list(text))
   seq = tok.texts_to_sequences(text)
   pad = sequence.pad_sequences(seq, maxlen=100)
   return pad
예제 #4
0
 def _clean_line(self, text):
     text = re.sub(r'http\S+', '', text)
     text = re.sub(r'@[A-Za-z0-9]+', '', text)
     text = re.sub(r'#[A-Za-z0-9]+', '', text)
     text = text.replace('RT', '')
     text = text.lower()
     text = text.strip()
     return text
예제 #5
0
 def _clean_line(self, text):
     text = re.sub(r"http\S+", "", text)
     text = re.sub(r"@[A-Za-z0-9]+", "", text)
     text = re.sub(r"#[A-Za-z0-9]+", "", text)
     text = text.replace("RT","")
     text = text.lower()
     text = text.strip()
     return text
예제 #6
0
def clean_text(text):
    '''
    Strip HTML markup, lowercase, normalize symbols, and drop stopwords.

    text: a string

    returns: a modified shorter string
    '''
    stripped = BeautifulSoup(text, "html.parser").text  # decode any HTML markup
    stripped = stripped.lower()
    stripped = REPLACE_BY_SPACE_RE.sub(' ', stripped)   # module-level regex: symbols -> space
    stripped = BAD_SYMBOLS_RE.sub('', stripped)         # module-level regex: delete bad symbols
    kept = [word for word in stripped.split() if word not in STOPWORDS]
    return ' '.join(kept)
예제 #7
0
def lowercase_text(text):
    """Return *text* converted to all-lowercase."""
    return text.lower()
예제 #8
0
# Capture one utterance from the microphone and convert it to lowercase text
# using the speech_recognition library (Google Web Speech API backend).
r = sr.Recognizer()  
try: 
# Use the default microphone as the audio source.
    with sr.Microphone() as source: 
        # Sample ~1 second of ambient noise so the recognizer can calibrate
        # its energy threshold before listening.
        r.adjust_for_ambient_noise(source, duration=1) 
        print("Getting an idea of your background noise")
        time.sleep(1.1)
        # Block until a phrase is captured from the microphone.
        print("Speak now")
        audio = r.listen(source)        
        # Send the audio to Google's online speech recognition service.
        text = r.recognize_google(audio) 
        # MyText = r.recognize_sphinx(audio)  # offline alternative (disabled)
        text = text.lower() 
        print(text)  
except sr.RequestError as e: 
        # API unreachable or quota exceeded — fall back to an empty transcript.
        text=""
        print("Error") 
except sr.UnknownValueError: 
        # Speech was unintelligible — fall back to an empty transcript.
        text=""
        print("unknown error")

# Wrap the transcript in a list so it can be fed to a batch-style predictor.
test_requests=[
    text
]

# NOTE(review): the binarizer is created but never fitted here — presumably the
# commented-out lines below ran elsewhere in the original notebook; confirm before use.
tag_encoder = MultiLabelBinarizer()
# tags_encoded = tag_encoder.fit_transform(sentiments_encoded)
# num_tags = len(tags_encoded[0])