def import_data(a_file, a_row, b_file, b_row):
    # Read class-A rows, clean them, and label them 1.
    a_content = []
    with open(a_file, 'r') as a_handle:
        csv_reader_a = csv.reader(a_handle)
        for row in csv_reader_a:
            row_new = remove_stopwords(row[a_row])
            row_new = strip_numeric(row_new)
            row_new = strip_non_alphanum(row_new)
            row_new = strip_short(row_new, minsize=3)
            a_content.append(row_new)
    a_label = np.ones(len(a_content)).tolist()

    # Read class-B rows, clean them, and label them 0.
    b_content = []
    with open(b_file, 'r') as b_handle:
        csv_reader_b = csv.reader(b_handle)
        for row in csv_reader_b:
            row_new = remove_stopwords(row[b_row])
            row_new = strip_numeric(row_new)
            row_new = strip_non_alphanum(row_new)
            row_new = strip_short(row_new, minsize=3)
            b_content.append(row_new)
    b_label = np.zeros(len(b_content)).tolist()

    return a_content, a_label, b_content, b_label
def import_data(file):
    human = []
    machine = []
    with open(file, 'r') as content:
        csv_reader = csv.reader(content)
        for row in csv_reader:
            # Column 2 holds the human text, column 3 the machine text.
            row_new1 = remove_stopwords(row[2])
            row_new1 = strip_numeric(row_new1)
            #row_new1 = strip_non_alphanum(row_new1)
            row_new1 = strip_short(row_new1, minsize=3)
            human.append(row_new1)

            row_new2 = remove_stopwords(row[3])
            row_new2 = strip_numeric(row_new2)
            #row_new2 = strip_non_alphanum(row_new2)
            row_new2 = strip_short(row_new2, minsize=3)
            machine.append(row_new2)
    length = len(human)
    human_label = np.ones(length).tolist()
    machine_label = np.zeros(length).tolist()
    return human, human_label, machine, machine_label
def ALLCAPS(text):
    '''Calculates the number of ALL CAPS words at the start of the message,
    after removing http addresses, numbers and multiple whitespaces.

    input: text: a string
    returns: the number of ALL CAPS words at the start of the message
    '''
    text = preprocess.strip_numeric(text)  # get rid of numbers
    p = re.compile(r'(http.*\s)|(http.*$)')
    text = p.sub('', text)  # get rid of http addresses
    p = re.compile(r'[^\x00-\x7F]+')
    text = p.sub('', text)  # get rid of non-ASCII characters
    text = preprocess.strip_multiple_whitespaces(text)
    words = text.split()
    ALLCAPScount = 0
    for w in words:
        if not w.isupper():
            break
        ALLCAPScount += 1
    # Do not count the article "A" if it ends the ALL CAPS run.
    if ALLCAPScount:
        if words[ALLCAPScount - 1] == 'A':
            ALLCAPScount -= 1
    return ALLCAPScount
def getLemmatizedText(name, content, language):
    language = language[:2].lower()
    outText = ""
    if language:
        if language == "is":
            outText = getLemmatizedTextIS(name, content)
            print("IS")
        else:
            outText = lemmatizerMultilanguage.getLemmatizedText(language, name + " " + content)
            print(language.upper())
    else:
        outText = (name + " " + content).lower()
        print("ERROR: No language for Lemmatizing text")
    cleaned = re.sub(' +', ' ', outText)
    cleaned = cleaned.replace('\n', '')
    cleaned = cleaned.replace('\r', '')
    cleaned = remove_stopwords(cleaned)
    cleaned = strip_tags(cleaned)
    cleaned = strip_punctuation(cleaned)
    cleaned = strip_numeric(cleaned)
    cleaned = strip_short(cleaned, 1)
    cleaned = strip_multiple_whitespaces(cleaned)
    cleaned = cleaned.lower()
    print("Lemmatized CLEAN: " + cleaned)
    return cleaned
def preprocessing(text):
    '''Preprocesses a text using standard gensim techniques:
    removes stopwords, strips short words (1-2 characters), strips numbers,
    strips http addresses, strips Unicode from emoji etc., lowercases everything,
    strips extra spaces, punctuation and non-alphanumeric symbols. Also performs stemming.

    input: text: a string
    returns: the preprocessed string.
    '''
    text = text.lower()
    text = preprocess.remove_stopwords(text)  # remove stop words
    text = preprocess.strip_short(text)       # get rid of short words
    text = preprocess.strip_numeric(text)     # get rid of numbers
    p = re.compile(r'(http.*\s)|(http.*$)')
    text = p.sub('', text)                    # get rid of http addresses
    p = re.compile(r'[^\x00-\x7F]+')
    text = p.sub('', text)                    # get rid of non-ASCII characters (emoji etc.)
    text = preprocess.strip_multiple_whitespaces(text)
    text = preprocess.strip_punctuation(text)
    text = preprocess.strip_non_alphanum(text)
    text = preprocess.remove_stopwords(text)
    text = preprocess.strip_short(text)

    # stemming
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    text = ' '.join(stemmed_words)
    return text
def clean_text(text):
    """Cleans the text passed as the only argument in several steps.

    ARGUMENTS: text: content/title, string
    RETURNS: cleaned text, string"""
    if isfloat(text):
        try:
            if math.isnan(text):
                return ''
        except TypeError:
            print('text: {}'.format(text))
            return ''
    # Replace newlines by space. We want only one doc vector.
    text = text.replace('\n', ' ').lower()
    # Expand contractions: you're to you are and so on.
    # text = contractions.fix(text)
    # Remove stop words
    text = preprocessing.remove_stopwords(text)
    # Remove html tags and numbers: can numbers possibly be useful?
    text = preprocessing.strip_tags(preprocessing.strip_numeric(text))
    # Remove punctuation -- all special characters
    text = preprocessing.strip_multiple_whitespaces(
        preprocessing.strip_punctuation(text))
    # text = re.sub(r'[^\w\s]', '', text.lower())
    # STEMMING (Porter) automatically lower-cases as well
    # To stem or not to stem, that is the question
    # text = preprocessing.stem_text(text)
    return text
def merge_elements(self, json_zone: dict):
    """Merge the zone elements into a dictionary in order to avoid overrides.

    Parameter:
        json_zone: json containing the zones info
    Out:
        merge_dictio: dictionary of merged zone information
    """
    merge_dictio = {}
    for k in json_zone.keys():
        merge_dictio[k] = {}
        for el in json_zone[k].keys():
            # Normalized key: drop the extension, digits and spaces.
            merge_dictio[k][strip_non_alphanum(
                strip_numeric(el.split('.')[0])).replace(' ', '')] = []
    for k in json_zone.keys():
        for el in json_zone[k].keys():
            for merge_key in merge_dictio[k].keys():
                if merge_key in el:
                    merge_dictio[k][merge_key] += json_zone[k][el]
                    # Drop duplicates while preserving order.
                    merge_dictio[k][merge_key] = list(
                        dict.fromkeys(merge_dictio[k][merge_key]))
    return merge_dictio
def noPuncNoNumb(corpora):
    # Strip punctuation and digits from every string, keeping the nested
    # corpus -> group -> string structure.
    List_No_punct_numb = [[[strip_numeric(strip_punctuation(stringa))
                            for stringa in group]
                           for group in corpus]
                          for corpus in corpora]
    # print("\nList_No_punct_numb:")
    # print(List_No_punct_numb)
    return List_No_punct_numb
def readFromDir(osList):
    """This reads the scraped raw data."""
    textList = []
    for i in range(len(osList)):
        filesList = []
        textArray = []
        for (dirpath, dirnames, filenames) in os.walk(osList[i]):
            filesList.extend(filenames)
        os.chdir(osList[i])
        for name in filesList:
            with open(name, 'r', encoding='utf-8') as file:
                textArray.append(file.read().lower())
        # Join all documents from this directory and clean them.
        text_arr = ','.join(textArray)
        text_arr = strip_punctuation(text_arr)
        text_arr = strip_numeric(text_arr)
        text_arr = strip_non_alphanum(text_arr)
        textList.append(text_arr)
        os.chdir('..')
    return textList
def sentence_tokenize_and_word_tokenize_and_remove_stop_words(
        text, tokenizer, stop_word1, stop_word2):
    try:
        if isinstance(text, str):
            sentences = tokenizer.tokenize(text.lower())
        else:
            sentences = tokenizer.tokenize(str(text).lower())
    except UnicodeDecodeError:
        return ''
    if len(sentences) == 0:
        return ''
    text_total = ''
    for sentence in sentences:
        words = sentence.split()
        if len(words) == 0:
            continue
        # First stop-word pass, then strip punctuation, non-alphanumeric
        # characters, digits, tags and extra whitespace.
        text = ' '.join(filter(lambda x: x not in stop_word1, words))
        try:
            text = preprocessing.strip_punctuation(text)
            text = preprocessing.strip_non_alphanum(text)
            text = preprocessing.strip_numeric(text)
            text = preprocessing.strip_tags(text)
            text = preprocessing.strip_multiple_whitespaces(text)
            words = text.split()
            if len(words) == 0:
                continue
            # Second stop-word pass, then append the sentence with a '#' separator.
            text = ' '.join(filter(lambda x: x not in stop_word2, words))
            text_total = text_total + text + '#'
        except UnicodeDecodeError:
            pass
    return text_total
def file_read_csv(path, txt_column=[]):
    print("Pickle File I/O Example - text Read")
    with open(path, "r") as myfile:
        text = myfile.read()
    text = strip_non_alphanum(text)
    text = strip_numeric(text)
    return [text]
def clean_text(self, text_tag, processes=["urls", "punctuation", "numeric", "lower"]):
    text = self.texts[text_tag]
    # print(text)
    if "urls" in processes:
        text = [re.sub(r"(?:\@|https?\://)\S+", "", str(x)) for x in text]
        text = [re.sub(r' +', ' ', str(x)) for x in text]
    if "stopwords" in processes:
        text = [remove_stopwords(x) for x in text]
    if "punctuation" in processes:
        text = [strip_punctuation(x) for x in text]
    if "numeric" in processes:
        text = [strip_numeric(x) for x in text]
    text = [x.replace('"', "") for x in text]
    text = [x.replace('©', "") for x in text]
    text = [x.replace('\n', " ") for x in text]
    text = [x.replace('\r', ".") for x in text]
    text = [x.replace('QT', " ") for x in text]
    text = [x.replace('RT', " ") for x in text]
    text = [x.replace('#', " ") for x in text]
    text = [strip_multiple_whitespaces(x) for x in text]
    text = [x.strip() for x in text]
    if "lower" in processes:
        text = [x.lower() for x in text]
    # clean_text = [nltk.sent_tokenize(x) for x in clean_text]
    self.texts[text_tag] = text
def preprocess_text(corpus=[]):
    print("Preprocessing Corpus from list data structure")
    for i in range(len(corpus)):  # iterate through the list
        corpus[i] = corpus[i].strip('\n')
        corpus[i] = strip_punctuation(corpus[i])
        corpus[i] = strip_non_alphanum(corpus[i])
        corpus[i] = strip_numeric(corpus[i])
    return corpus
def raw_text_preprocess(raw):
    raw = re.sub(r"http\S+", "", raw)              # remove URLs
    raw = strip_non_alphanum(raw).lower().strip()  # keep letters only, lowercase
    raw = split_alphanum(raw)                      # space between letters and digits
    raw = strip_short(raw, minsize=2)              # drop one-character words
    raw = strip_numeric(raw)                       # drop digits
    raw = ViTokenizer.tokenize(raw)                # Vietnamese word segmentation
    return raw
def clean(sx):
    sx = strip_tags(sx)
    sx = strip_numeric(sx)
    sx = re.sub(r'\n', ' ', sx)
    sx = re.sub(r'\[', '', sx)
    sx = re.sub(r'\]', '', sx)
    sx = strip_multiple_whitespaces(sx)
    return sx
def _normalize(s):
    s = s.lower()
    # Expand contractions before stripping punctuation.
    for k, v in contractions.items():
        s = s.replace(k, v)
    return strip_multiple_whitespaces(
        strip_non_alphanum(strip_numeric(strip_punctuation(
            strip_tags(s))))).split()
def preprocessing(corpus):
    for document in corpus:
        doc = strip_numeric(document)
        doc = remove_stopwords(doc)
        doc = strip_short(doc, 3)
        # doc = stem_text(doc)
        doc = strip_punctuation(doc)
        doc = strip_tags(doc)
        yield gensim.utils.tokenize(doc, lower=True)
def topWords(billText, numberWords):
    text = remove_stopwords(billText)
    text = strip_numeric(text)
    text = strip_short(text, minsize=2)
    words = re.findall(r'\w+', text)
    topW = collections.Counter(words).most_common(numberWords)
    return topW
def gen_wf(text):
    words = nltk.tokenize.word_tokenize(strip_numeric(remove_stopwords(text)))
    resultwords = [word.lower() for word in words
                   if word.lower() not in stopwords and len(word) > 2]
    fdist = nltk.FreqDist(resultwords)
    return dict(fdist)
def preprocess_mail(mail):
    mail = re.sub(r"https\S+", "", mail)  # remove URLs
    # Remove non-letter characters and lowercase everything.
    mail = strip_non_alphanum(mail).lower().strip()
    mail = split_alphanum(mail)  # add a space between letters and digits
    # Keep only words of at least 2 characters; drop one-letter words.
    mail = strip_short(mail, minsize=2)
    mail = strip_numeric(mail)
    # mail = ViTokenizer.tokenize(mail)
    return mail
def preprocess_text(corpus, field_name='Comment'):
    print("Preprocessing Corpus from pandas data frame")
    for index, row in corpus.iterrows():  # iterate through rows in the dataframe
        line = row[field_name].strip('\n')
        line = strip_punctuation(line)
        line = strip_non_alphanum(line)
        line = strip_numeric(line)
        line = strip_multiple_whitespaces(line)
        line = strip_short(line)
        # write the cleaned line back into the dataframe
        corpus.at[index, field_name] = line  # set value at row/column in the corpus dataframe
    return corpus
def preprocess(self, data):
    data = [s.lower() for s in data]
    data = [parser.remove_stopwords(s) for s in data]
    data = [parser.strip_numeric(s) for s in data]
    data = [tokenizer.tokenize(s) for s in data]
    data = [[token for token in doc if len(token) > 1] for doc in data]
    data = [[lemmatizer.lemmatize(word) for word in doc] for doc in data]
    return data
def preprocess_text(text):
    text = parse_html_v2(text)
    text = text.lower()
    text = remove_links_content(text)
    text = remove_emails(text)
    text = remove_special_tags(text)            # remove content between {}
    text = remove_punctuation(text)             # remove all punctuation
    text = split_alphanum(text)                 # add space between word and numeric
    text = strip_numeric(text)                  # remove digits
    text = strip_non_alphanum(text)             # remove non-alphabetic characters
    text = strip_short(text, minsize=2)         # remove words with length < minsize
    text = remove_multiple_space(text).strip()  # collapse whitespace and strip
    text = ViTokenizer.tokenize(text)           # Vietnamese word segmentation
    return text
def import_data(file, row_content, x):
    content = []
    label = []
    with open(file, 'r') as content_1:
        csv_reader = csv.reader(content_1)
        for row in csv_reader:
            row_new = remove_stopwords(row[row_content])
            row_new = strip_numeric(row_new)
            # row_new = strip_non_alphanum(row_new)
            row_new = strip_short(row_new, minsize=3)
            content.append(row_new)
    # Every cleaned row gets the same label x.
    for i in range(len(content)):
        label.append(x)
    return content, label
def remove_non_plain(document):
    """Replaces urls, @usernames, #tags, emojis and numbers with a ' ' (space).
    Also removes accents and punctuation, then collapses redundant whitespace
    and lowercases all characters.

    :param document: string
    :return: processed unicode string
    """
    document = to_unicode(document)
    document = non_plain_re.sub(' ', document)
    document = proc.strip_non_alphanum(document)
    document = proc.strip_numeric(document)
    document = proc.strip_multiple_whitespaces(document)
    document = deaccent(document)
    return document.lower()
def clean_text(x: str) -> str:
    """
    :param x: raw string
    :return x: cleaned string
    """
    x = x.lower()
    x = re.sub('ssense|exclusive', '', x)
    x = strip_non_alphanum(x)
    x = strip_numeric(x)
    x = strip_short(x, minsize=2)
    x = remove_stopwords(x)
    x = strip_punctuation(x)
    x = strip_multiple_whitespaces(x)
    return x
def read_data(data_source_path):
    corpus = []
    for filename in os.listdir(data_source_path):
        if filename.endswith(".zip"):
            filename = os.path.join(data_source_path, filename)
            # Extract the first file enclosed in the zip archive as a list of words.
            with zipfile.ZipFile(filename) as f:
                data = preprocessing.remove_stopwords(
                    f.read(f.namelist()[0]).lower())
                data = preprocessing.strip_multiple_whitespaces(data)
                data = preprocessing.strip_numeric(data)
                # data = preprocessing.split_alphanum(data)
                # data = f.read(f.namelist()[0])
                data = tf.compat.as_str(data).split()
                # data = preprocessing(data)
                corpus.append(data)
    return corpus
def prep_text_czech(self, text):
    res = preprocessing.strip_punctuation(text.lower())
    if self.settings['strip_nums']:
        res = preprocessing.strip_numeric(res)
    if self.settings['use_lemmatizer']:
        res = " ".join(
            [czech_lemmatizer.lemmatize(word) for word in res.split()])
    res = " ".join(
        [word for word in res.split() if word not in cz_stopwords])
    if self.settings['strip_short']:
        res = preprocessing.strip_short(res, minsize=3)
    if self.settings['use_stemmer']:
        res = " ".join(
            [czech_stemmer.cz_stem(word) for word in res.split()])
    return res
def wordcount(text):
    '''Calculates the post length after removing http addresses, numbers
    and multiple whitespaces.

    input: text: a string
    returns: the adjusted word count.
    '''
    text = preprocess.strip_numeric(text)  # get rid of numbers
    p = re.compile(r'(http.*\s)|(http.*$)')
    text = p.sub('', text)  # get rid of http addresses
    p = re.compile(r'[^\x00-\x7F]+')
    text = p.sub('', text)  # get rid of non-ASCII characters
    text = preprocess.strip_multiple_whitespaces(text)
    words = text.split()
    count = len(words)
    return count
def testStripNumeric(self):
    self.assertEqual(strip_numeric("salut les amis du 59"), "salut les amis du ")