# Shared imports for the snippets below. Each function originally lived in its
# own module; module-level names that some snippets rely on (e.g. stemmer,
# lemma, lemmatizer, ps, stop_words, stopwords_english, nl, clf1, vctr,
# train_data2, y_train1, and helpers such as get_hashtags_pattern, is_year,
# extract_features, get_wordnet_pos, analyze_sentiment) are assumed to be
# defined by their surrounding code.
import glob
import re
import string
import time
import unicodedata
from typing import List
from urllib import request

import emot
import en_core_web_sm
import nltk
import pandas as pd
from bs4 import BeautifulSoup
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import (RegexpTokenizer, TweetTokenizer, WordPunctTokenizer,
                           sent_tokenize, word_tokenize)


def remove_users(self):
    """Remove retweet ("RT @user") and @user mentions from the text."""
    # note: the dash goes last in the character class so it is a literal,
    # not an accidental range
    self.text = re.sub(r'(RT\s@[A-Za-z]+[A-Za-z0-9_-]+)', '', self.text)  # remove retweet
    self.text = re.sub(r'(@[A-Za-z]+[A-Za-z0-9_-]+)', '', self.text)      # remove tweeted-at
    return self
def text_cleaning(text):
    """Lowercase text and strip URLs, digits, emojis, punctuation and stopwords."""
    stop = stopwords.words('english') + [
        "would", "could", "also", "one", "ha", "can't", "it's", "i've",
        "u", "it", "us", "we", "t", "s"
    ]  # extended stopword list
    if pd.isnull(text):
        return ""
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text)    # remove URLs
    text = text.lower()                                  # to lowercase
    text = ''.join(i for i in text if not i.isdigit())   # remove digits
    text = re.sub(r'(.)\1+', r'\1\1', text)              # squeeze repeated-char runs to two
    unis_emojis_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+",
        flags=re.UNICODE)
    text = unis_emojis_pattern.sub(' ', text)            # remove emojis
    text = re.sub(r'[^\w\s]', ' ', text)                 # remove punctuation
    text = ' '.join(x for x in text.split() if x not in stop)  # remove stopwords
    return text
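# Example (illustrative, hand-traced; assumes the NLTK stopword corpus is
# downloaded):
#   >>> text_cleaning("I looove it!!! 😊 100%")
#   'loove'
# Repeated characters collapse to two, digits/emoji/punctuation become spaces,
# and the stopwords "i" and "it" are dropped.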
def clean_tweet(tweet):
    link_removed = re.sub(r'https?://[A-Za-z0-9./]+', '', tweet)  # remove links
    number_removed = re.sub(r'[^a-zA-Z]', ' ', link_removed)      # keep letters only
    lower_case_tweet = number_removed.lower()
    tok = WordPunctTokenizer()
    words = tok.tokenize(lower_case_tweet)
    return ' '.join(words).strip()
def clean_text(text):
    # remove apostrophes
    text = re.sub(r"'", "", text)
    # remove everything except letters
    text = re.sub(r"[^a-zA-Z]", " ", text)
    # collapse runs of whitespace
    text = ' '.join(text.split())
    # convert text to lowercase
    return text.lower()
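# Example (illustrative, hand-traced):
#   >>> clean_text("Don't... panic! 42")
#   'dont panic'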
def remove_hashtags(self):
    # keep the word, delete only the leading hash sign
    self.text = re.sub(pattern=get_hashtags_pattern(), repl=r'\1', string=self.text)
    # alternative: segment the hashtag into words first
    # self.text = re.sub(pattern=get_hashtags_pattern(), repl=segment(r'\1'), string=self.text)
    # text = re.sub(r'#([^\s]+)', r'\1', text)
    # alternative: delete the hashtag completely
    # self.text = re.sub(pattern=get_hashtags_pattern(), repl='', string=self.text)
    return self
def remove_special_character(text: str) -> str:
    """
    Replace some special characters in text with spaces.

    :param text: input string
    :return: cleaned string
    """
    # https://www.compart.com/en/unicode/
    # remove some non-Chinese special symbols
    # https://unicode-table.com/en/blocks/basic-latin/
    #     !"#$%'()*+,;<=>[\]^`{|}~
    # https://unicode-table.com/en/blocks/box-drawing/
    #     2500-257F
    # https://unicode-table.com/en/blocks/cjk-symbols-and-punctuation/
    #     3000-303F
    # https://unicode-table.com/en/blocks/cjk-compatibility-forms/
    #     FE30-FE4F
    # https://unicode-table.com/en/blocks/halfwidth-and-fullwidth-forms/
    #     FF00-FFEF
    return re.sub(
        r'['
        r'!"#$%\'()*+,;<=>[\\\]^`{|}~'
        r'\u2500-\u257F'  # the range separator was an em-dash, which broke the range
        r'\u3000-\u303F'
        r'\uFE30-\uFE4F'
        r'\uFF00-\uFFEF'
        r']+',
        ' ', text)
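# Example (illustrative, hand-traced; ',' and '!' are in the Basic Latin set
# above, so each run of special characters becomes one space):
#   >>> remove_special_character("hello,world!")
#   'hello world '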
def hashtags(self, w):
    # e.g. w = "#MOSHEBiran_TheKing-OFThe!World19"
    lst_parsed = []
    if w[0] == "#":
        w = w[1:]
        if w.isalpha() and (w.islower() or w.isupper()):
            # single-case hashtag: keep it as one lowercased word
            lst_parsed.append(w.lower())
            return lst_parsed
        if w.isalpha() and not any(ch in w for ch in "_-~"):
            # pure camelCase: split on the case transitions
            # (note: the original test `("_" or "-" or "~") not in w` only
            # checked for "_")
            my_string = re.sub(r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))', r' \1', w)
            lst_parsed.extend(word.lower() for word in my_string.split())
            return lst_parsed
        else:
            # mixed content: split on case transitions, then walk each piece to
            # separate letter runs, digit runs and symbol runs
            my_string = re.sub(r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))', r' \1', w)
            temp = [word.lower() for word in my_string.split()]
            for word in temp:
                if not word.isalpha():
                    start_of_next = 0
                    i = 0
                    while len(word) > i:
                        if word[i].islower():
                            while len(word) > i and word[i].islower():
                                i += 1
                        elif word[i].isnumeric():
                            while len(word) > i and word[i].isnumeric():
                                i += 1
                        else:
                            # skip non-alphanumeric characters entirely
                            while len(word) > i and not word[i].isalnum():
                                i += 1
                            start_of_next = i
                            continue
                        lst_parsed.append(word[start_of_next:i].lower())
                        start_of_next = i
                        continue
                else:
                    lst_parsed.append(word.lower())
    return lst_parsed
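# Example (illustrative, hand-traced; `tp` is assumed to be an instance of the
# class that defines hashtags()):
#   >>> tp.hashtags("#NewYork2020")
#   ['new', 'york', '2020']
# The case transition splits "NewYork", and the digit run "2020" is emitted as
# its own token.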
def build_training_set(train_file):
    t0 = time.time()
    nlp = en_core_web_sm.load()
    uniques = [[], []]  # parallel lists: unique corpus words and their frequencies
    with open(train_file, "r", encoding='utf8') as file:
        sents = [line for line in file if line != '\n']
    del sents[0]  # remove unwanted extraneous carriage return at top of input file

    # for each sentence in the training file
    for s, sent in enumerate(sents[:2000]):
        # rebuild the sentence and pass it to the NLP parser for POS tagging
        parsed = nlp(re.sub(r"(\|\S*){2}(\s|$)", " ", sent))
        # initialise the sentence container
        sents[s] = re.sub(r'\n+', '', sent).split(" ")
        quote = False  # some sentences fail to close open quotes; this forces closure
        for t, (token, tagged) in enumerate(zip(sents[s], parsed)):
            word = token.split("|")
            if quote:
                quote = (tagged.tag_ != "''")  # close the open quote, as tagged by spaCy
            else:
                quote = (tagged.tag_ == '``')  # open a new quote, as tagged by spaCy
            # strip I-/B- prefixes from the label
            sents[s][t] = (word[0], tagged.tag_, re.sub(r"^.-", "", word[2]), quote)
            # increment the frequency of this word
            try:
                pos = uniques[0].index(word[0])  # find position in the word array
            except ValueError:
                uniques[0].append(word[0])  # if not found, append the new word
                uniques[1].append(1)        # and initialise its frequency to 1
            else:
                uniques[1][pos] += 1        # else increment its frequency
        if s % 2000 == 0:  # track the time (this is a long process)
            print(time.time() - t0, 'secs to proc', s, 'sentences')

    # ---------------------- build features and labels ----------------------
    X_train = []
    y_train = []
    for sent in sents[:2000]:
        if len(sent) > 1:
            # get the word frequencies once per sentence
            frequency = [uniques[1][uniques[0].index(token[0])] for token in sent]
            # extract the features for each word in this sentence
            X_train.append([extract_features(sent, position, frequency)
                            for position in range(len(sent))])
            # append the training labels for each word in this sentence
            y_train.append([token[2] for token in sent])
    return X_train, y_train
def convert_emoticons(old_text):
    new_text = re.sub(r'https?:\/\/.*[\r\n]*', '', old_text)  # remove URLs first
    smiley = emot.emoticons(new_text)
    if len(smiley) > 1 and smiley['flag']:
        for value, mean in zip(smiley['value'], smiley['mean']):
            # replace each emoticon with its description on the running text
            # (the original replaced on the pre-URL-stripped text, discarding
            # the URL removal)
            new_text = new_text.replace(value, " " + mean + " ")
    return new_text
def preprocess_tweet(text):
    new_tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', text)  # remove URL
    new_tweet = re.sub(r'<[^>]+>', '', new_tweet)  # remove html (line breaks etc.)
    new_tweet = re.sub(r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$", '',
                       new_tweet)  # remove email
    new_tweet = re.sub(r'#', '', new_tweet)  # remove the hash sign from hashtags
    new_tweet = re.sub("[^a-zA-Z]", " ", new_tweet)  # remove remaining special characters
    words = new_tweet.lower().split()  # lowercase, split into words
    words = [word for word in words if word not in stopwords_english]  # remove stop words
    words = [stemmer.stem(word) for word in words]  # stemming
    words = [lemma.lemmatize(word) for word in words]  # lemmatization
    return " ".join(words)  # join the word list back into one tweet
def convert_emphesize(text, return_count=False):
    # find fully capitalised words (emphasis/shouting), e.g. "GREAT"
    emphs = set(re.findall(r'\b[A-Z]{2,}\b', text))
    if return_count:
        return len(emphs)
    for emph_ in emphs:
        # append an ' emphh' marker token after each emphasised word
        text = re.sub(r'\b' + emph_ + r'\b', emph_ + ' emphh', text)
    return text
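# Example (illustrative, hand-traced):
#   >>> convert_emphesize("This is GREAT and FUN")
#   'This is GREAT emphh and FUN emphh'
#   >>> convert_emphesize("This is GREAT and FUN", return_count=True)
#   2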
def convert_emojis(old_text):
    smiley = emot.emoji(old_text)
    new_text = old_text
    if smiley['flag']:
        for value, mean in zip(smiley['value'], smiley['mean']):
            # use plain str.replace: emoji sequences may contain regex
            # metacharacters, so re.sub on the raw emoji is unsafe
            new_text = new_text.replace(value, mean)
    return new_text
def text_tokenizer(self, text: str):
    text = re.sub(r'\S*@\S*\s?', '', text)  # remove emails
    text = re.sub(r'^https?://.*[\r\n]*', '', text, flags=re.MULTILINE)  # remove websites
    words = word_tokenize(text, 'english')
    words = [word for word in words if len(word) >= self.min_length]
    # tokens = [self.stemmer.stem(x) for x in words]  # stemming alternative
    tokens = [self.lemmatizer.lemmatize(x) for x in words]
    p = re.compile('[a-zA-Z]+')
    filtered_tokens = [
        token for token in tokens
        if p.match(token) and len(token) >= self.min_length
    ]
    return filtered_tokens
def cleaning(sentences):
    words = []
    for s in sentences:
        clean = re.sub(r'[^a-zA-Z0-9]', " ", s)
        w = nltk.word_tokenize(clean)
        # lemmatize each lowercased token
        words.append([lemmatizer.lemmatize(i.lower()) for i in w])
    return words
def clean(string_rep):
    string_rep = re.sub('[^a-zA-Z .]', ' ', string_rep)
    string_rep = string_rep.lower()
    new_vocab = word_tokenize(string_rep)
    stop_words = set(stopwords.words('english'))  # build the set once, not per token
    new_vocab = [w for w in new_vocab if w not in stop_words]
    return ' '.join(new_vocab)
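# Example (illustrative, hand-traced; assumes the NLTK stopword corpus):
#   >>> clean("The quick, brown fox!")
#   'quick brown fox'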
def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub('[^a-zA-Z]', ' ', text)  # keep letters only
    text = word_tokenize(text)
    text = [x for x in text if x not in stop_words]  # remove stop words
    pos_tags = pos_tag(text)
    # lemmatize with the POS tag so -ing/-ed/-s forms are reduced correctly
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1]))
            for t in pos_tags]
    # text = [WordNetLemmatizer().lemmatize(t) for t in text]  # simpler lemmatizer option
    text = [t for t in text if len(t) > 1]  # drop one-letter and empty tokens
    return text
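# Example (illustrative, hand-traced; assumes stop_words is the NLTK English
# list and get_wordnet_pos() maps Treebank tags to WordNet POS constants):
#   >>> clean_text("The cats are running")
#   ['cat', 'run']
# "cats"/NNS lemmatizes as a noun and "running"/VBG as a verb.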
def prepare_arguments(query, stopwords=set()):
    tokens = query.split()
    prepared_tokens = []
    stemmer = PorterStemmer()
    for token in tokens:
        token = re.sub(r'\W+', '', token)  # strip non-word characters
        token = stemmer.stem(token)
        if token and token not in stopwords:
            prepared_tokens.append(token)
    return prepared_tokens
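# Example (illustrative, hand-traced; NLTK's Porter stemmer lowercases input):
#   >>> prepare_arguments("The runners, running!", stopwords={"the"})
#   ['runner', 'run']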
def preprocess_corpus(corpus: List[str]) -> List[str]:
    """
    Preprocess an NLP corpus.

    Parameters
    ----------
    corpus : List[str]
        list of tweet texts

    Returns
    -------
    final_corpus : List[str]
        list of tweet texts, but processed
    """
    # Download the English stop words from the NLTK repository.
    nltk.download('stopwords', quiet=True)

    # Remove links from the tweets (from "https://" up to the next space).
    corpus_no_url = [re.sub(r'(https://)\S*(\s|$)', '', line) for line in corpus]

    # Lowercase before stopword removal, so capitalised stopwords are caught too.
    corpus_lowercase = [line.lower() for line in corpus_no_url]

    # Remove English stopwords (and stray newlines).
    stop_words = set(stopwords.words('english'))
    corpus_no_stops = [
        " ".join(re.sub('\n', "", w) for w in line.split(" ") if w not in stop_words)
        for line in corpus_lowercase
    ]

    # Remove @mentions.
    corpus_no_at = [re.sub(r'(@)\S*(\s|$)', '', line) for line in corpus_no_stops]

    # Remove the # sign.
    corpus_no_hash = [re.sub('#', '', line) for line in corpus_no_at]

    # Remove numbers.
    final_corpus = [re.sub('[0-9]+', '', line) for line in corpus_no_hash]

    return final_corpus
def replace_owner_names(text):
    # normalise (transliterated Greek) owner first names, with spelling variants
    regex = r"dimitr(?:iou|io|i)s*|\bef{1,2}i|nan(?:c|s)+y|giannis*|marin*(?:a|o)+s*|mano(?:li)*s*|mike|mic*halis*|" \
            r"niko(?:la)*s*|artemis*|alex(?:andro|i)+s*|tasos*|al(?:i|y)+ki|g(?:i|e)+org(?:i|o)+s|george|efthimia|" \
            r"mohammed|eva|antonia|nikk*i(?:ta)*s*|mustafa|anton(?:io|i)+s*|vill*y|k(?:y|i)+riakos*|vett*a|giorgaros*|" \
            r"anthi|(?:ch|x)+ristina|yannis*|dion(?:i|y|u|h|e)+s(?:i|y|u|h|e)+s*|thaleia|(?:ch|x)+ristos*|" \
            r"andria(?:nna)*|panagi(?:oti)*s|theopoula|popp*(?:i|y)+|vass*il(?:eio|i)+s*|asimina|fross*o|tina|andreas*|" \
            r"geronim(?:u|o)s*|gerr*y|babis*|areti|nick|savvas*|stergos*|sergios*"
    replace = 'owner_name'
    return re.sub(regex, replace, text)
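# Example (illustrative, hand-traced; the input is expected to be lowercased):
#   >>> replace_owner_names("please call giannis tomorrow")
#   'please call owner_name tomorrow'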
def add_doc_to_index(self, doc_text: str, doc_id: int, stemmer):
    doc_text = doc_text.lower()
    tokens = doc_text.split()
    cnt = 0  # token position within the document
    for token in tokens:
        token = re.sub(r'\W+', '', token)
        if not token:
            continue
        token = stemmer.stem(token)
        if token not in self.stopwords:
            # posting: (document id, token position)
            self.index[token].append((doc_id, cnt))
        cnt += 1
def parseEuroparl():
    txt = ""
    test_txt = ""
    cc = 0
    print("Reading Files ...")
    files = glob.glob('/home/antok/Downloads/txt/en/*.txt')
    X = len(files)
    for i, filename in enumerate(files):
        print("Progress %d of %d" % (i, X), end='\r')
        if cc < 500:
            with open(filename, "r", encoding='utf-8') as f:
                for line in f:
                    if line not in ['\n', '\r\n']:
                        line = re.sub(r'<.*?>', "", line)  # strip markup tags (non-greedy)
                        txt += line
            if len(txt) != 0 and txt[-1] != ".":
                txt += "."  # some files fail to end with a full stop; force one
            cc += 1
        elif 500 <= cc < 650:
            with open(filename, "r", encoding='utf-8') as f:
                for line in f:
                    if line not in ['\n', '\r\n']:
                        line = re.sub(r'<.*?>', "", line)
                        test_txt += line
            if test_txt[-1] != ".":
                test_txt += "."
            cc += 1
        else:
            break
    print("tokenization ...")
    sentences = sent_tokenize(txt)
    test_sentences = sent_tokenize(test_txt)
    return txt, test_txt, sentences, test_sentences
def Parse_HTML(html):
    # Get the metadata if it is available.
    meta_data = []
    meta_keywords = html.find("meta", attrs={"name": "keywords"})
    meta_description = html.find("meta", attrs={"name": "description"})
    if meta_keywords:
        meta_data.append(("meta", meta_keywords.get("content")))
    elif meta_description:
        meta_data.append(("meta", meta_description.get("content")))

    # Remove script tags, which don't provide useful information.
    for script_tag in html("script"):
        script_tag.decompose()

    # Extract the headings and title from the html.
    headings = []
    for heading_tag in html.find_all(["title", "h1", "h2", "h3", "h4", "h5", "h6"]):
        heading = heading_tag.extract()
        tag_name = "title" if heading.name == "title" else "h"
        headings.append((tag_name, heading.get_text().replace("\n", " ")))

    # Get the rest of the content and clean up the formatting.
    initial_text = html.get_text()
    edited_text = re.sub("\n +", "\n", initial_text)
    edited_text = re.sub("\n{3,}", "\n", edited_text)
    text = ("contents", edited_text)

    # Compile all the extracted information into one array.
    parsed_text = []
    for pair in headings:
        parsed_text.append(pair)
    for pair in meta_data:
        parsed_text.append(pair)
    parsed_text.append(text)
    return parsed_text
def clean_tweets(tweet):
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags: only the hash sign, keep the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize the tweet
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and      # remove stopwords
                word not in string.punctuation):   # remove punctuation
            tweets_clean.append(stemmer.stem(word))  # stem the word
    return tweets_clean
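# Example (illustrative, hand-traced; assumes stopwords_english is the NLTK
# English list and stemmer is a PorterStemmer()):
#   >>> clean_tweets("#Amazing day with @friend https://t.co/x")
#   ['amaz', 'day']
# The URL and the @handle are stripped, "with" is a stopword, and "amazing"
# stems to "amaz".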
def remove_numbers(self, preserve_years=False):
    def keep(token):
        if not token.isnumeric():
            return True
        return preserve_years and is_year(token)  # optionally keep standalone years

    # filter numeric tokens without mutating the list while iterating over it
    self.text = ' '.join(t for t in self.text.split(' ') if keep(t))
    if not preserve_years:
        # also strip digits embedded inside the remaining tokens; skipped when
        # preserving years so the kept year tokens are not deleted again
        self.text = re.sub('[0-9]+', '', self.text)
    return self
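# Example (illustrative, hand-traced; assumes is_year() recognises "1999" but
# not "3", and `tp` is an instance with a text attribute):
#   tp.text = "born in 1999 with 3 cats"
#   tp.remove_numbers(preserve_years=True)  # tp.text -> 'born in 1999 with cats'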
def pre_processing(rel3):
    # Remove punctuation
    rel3 = re.sub(r'[^a-zA-Z0-9_\.]', ' ', rel3)
    # Tokenize first: the lemmatizer must see words, not single characters
    tokens = nltk.word_tokenize(rel3)
    # Lemmatize each token as verb, noun and adjective in turn
    lem = WordNetLemmatizer()
    tokens = [lem.lemmatize(t, pos='v') for t in tokens]
    tokens = [lem.lemmatize(t, pos='n') for t in tokens]
    tokens = [lem.lemmatize(t, pos='a') for t in tokens]
    return tokens
def clean_news(news):
    filtered_sentence = []
    # strip @mentions, URLs and non-alphanumeric characters
    x = ' '.join(
        re.sub(r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)', ' ', news).split())
    stop_words = set(stopwords.words('english'))
    # tokenize into words and drop stopwords
    word_tokens = word_tokenize(x)
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    return analyze_sentiment(" ".join(filtered_sentence))
def preprocessing(df):
    corpus = []
    pstem = PorterStemmer()
    stop_words = set(stopwords.words('english'))  # build the set once, outside the loop
    for i in range(df['text'].shape[0]):
        # Remove unwanted characters
        text = re.sub("[^a-zA-Z]", ' ', df['text'][i])
        # Transform words to lowercase and split
        text = text.lower().split()
        # Remove stopwords, then stem
        text = [pstem.stem(word) for word in text if word not in stop_words]
        # Append the cleaned tweet to the corpus
        corpus.append(' '.join(text))
    print("Corpus created successfully")
    return corpus
def stemming(string, top):
    string_rep = re.sub('[^a-zA-Z]', ' ', string)
    string_rep = string_rep.lower()
    new_vocab = word_tokenize(string_rep)
    stop_words = set(stopwords.words('english'))  # build the set once, not per token
    vocab_stem = [ps.stem(w) for w in new_vocab if w not in stop_words]
    dictry_stem = set(vocab_stem)
    data = ' '.join(vocab_stem)
    arr = []
    t = 0
    for i in dictry_stem:
        c = len(re.findall(i, data))  # substring count of each stem in the joined text
        t += c
        arr.append([i, c])
    # keep only stems occurring more than once, most frequent first
    aux = [pair for pair in arr if pair[1] > 1]
    aux.sort(key=lambda pair: pair[1], reverse=True)
    x = [wrd for wrd, co in aux]
    y = [co for wrd, co in aux]
    print("total number of words:", t, "\t set of words:", len(dictry_stem))
    # print("Top", top, "words occurring more than twice:", aux[:top])
    print("%age of total length these words account for:", (sum(y) / t) * 100)
    # plt.bar(x, y, color='red', alpha=0.8)
    # plt.xlabel('Word'); plt.ylabel('Frequency'); plt.title('Words occurring more than twice')
    return (x, string_rep)
def prepare_document(text):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text.lower())
    # remove stop words from tokens (build the set once)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [i for i in tokens if i not in stop_words]
    # remove numbers
    number_tokens = [re.sub(r'\d', ' ', i) for i in filtered_tokens]
    number_tokens = ' '.join(number_tokens).split()
    # stem tokens (one stemmer instance, not one per token)
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(i) for i in number_tokens]
    # remove one-character and empty tokens
    return [i for i in stemmed_tokens if len(i) > 1]
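# Example (illustrative, hand-traced; assumes the NLTK stopword corpus):
#   >>> prepare_document("Running 42 times faster")
#   ['run', 'time', 'faster']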
def get_url(urls):
    response = request.urlopen(urls)
    rw = str(response.read().decode('utf-8'))
    soup = BeautifulSoup(rw, "html.parser")
    # Remove script and style elements
    for script in soup(["script", "style"]):
        script.decompose()
    # Collect description/keywords meta content
    rel3 = ''
    for tag in soup.find_all('meta'):
        if 'name' in tag.attrs.keys() and \
                tag.attrs['name'].strip().lower() in ['description', 'keywords']:
            rel3 += tag.attrs['content']
            rel3 += '\n'
    # Get the plain text
    text_soup = soup.get_text()
    # Strip leading and trailing spaces from each line
    lines = (line.strip() for line in text_soup.splitlines())
    # Break headlines into phrases on double-space runs
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # Drop blank lines
    rel3 += '\n'.join(chunk for chunk in chunks if chunk)
    rel3 = re.sub(r'[^a-zA-Z0-9_\.]', ' ', rel3)
    # Tokenize first: the lemmatizer must see words, not single characters
    tokens = nltk.word_tokenize(rel3)
    # Lemmatize each token as verb, noun and adjective in turn
    lem = WordNetLemmatizer()
    tokens = [lem.lemmatize(t, pos='v') for t in tokens]
    tokens = [lem.lemmatize(t, pos='n') for t in tokens]
    tokens = [lem.lemmatize(t, pos='a') for t in tokens]
    return tokens
def remove2(t):
    t = str(t)
    t = re.sub(r'\s+', ',', t)  # collapse whitespace runs into commas
    return t.split(',')
def remove(line):
    line = re.sub(';', ' ', line)
    # str.split takes a literal separator, not a regex; a whitespace split
    # was intended here
    return line.split()
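# Examples (illustrative, hand-traced):
#   >>> remove("a;b;;c")
#   ['a', 'b', 'c']
#   >>> remove2("a  b\tc")
#   ['a', 'b', 'c']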
test_data_labels = y_train1
# clf1 = GradientBoostingClassifier(n_estimators=1000)
# clf2 = AdaBoostClassifier(n_estimators=1000)
# clf3 = RandomForestClassifier()

# Read the crawled corpus, normalise it to ASCII, then sentence-tokenize it.
with open('/home/mohan/Downloads/Theano-Tutorials-master/crawledcontents95.txt',
          encoding='utf-8') as f:
    raw = f.read()
ascii_text = unicodedata.normalize('NFKD', raw).encode('ascii', 'ignore').decode('ascii')
sent = nl.sent_tokenize(ascii_text)
test_sents = []
for i in range(len(sent)):
    test_sents.append(' '.join(
        nl.word_tokenize(re.sub(r'''[!@#$(),.;'":/\n?!\W]''', ' ', sent[i]))))

# ---------------- hard labels ----------------
clf1.fit(train_data2, y_train1)
X_test1 = vctr.transform(test_sents).toarray()
grad_label = clf1.predict(X_test1).astype(int)
print("grad_label")
print(grad_label)
for i in range(len(grad_label)):
    print(test_sents[i], '=>', grad_label[i])

# ---------------- class probabilities ----------------
grad_label1 = clf1.predict_proba(X_test1)  # clf1 is already fitted above
print("grad_label with probabilistic labels")
def reformatWord(word):
    alphaNumericString = re.sub(r'\W+', '', word)  # keep only alphanumeric characters
    stemmer = nltk.stem.snowball.EnglishStemmer()
    return stemmer.stem(alphaNumericString)
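# Example (illustrative, hand-traced):
#   >>> reformatWord("running!!")
#   'run'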