def ALLCAPS(text):
    '''Calculates the number of ALL CAPS words at the start of the message
    after removing http addresses, numbers and multiple whitespaces.

    input: text: a string
    returns: the number of ALL CAPS words at the start of the message
    '''
    text = preprocess.strip_numeric(text)  # get rid of numbers
    p = re.compile(r'(http.*\s)|(http.*$)')
    text = p.sub('', text)  # get rid of http addresses
    p = re.compile(r'[^\x00-\x7F]+')
    text = p.sub('', text)  # get rid of non-ASCII characters (emoji etc.)
    text = preprocess.strip_multiple_whitespaces(text)
    words = text.split()
    ALLCAPScount = 0
    for w in words:
        if not w.isupper():
            break
        ALLCAPScount = ALLCAPScount + 1
    if ALLCAPScount:
        # do not count a single leading 'A' as an ALL CAPS word
        if words[ALLCAPScount - 1] == 'A':
            ALLCAPScount = ALLCAPScount - 1
    return ALLCAPScount
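# Usage sketch for ALLCAPS() (not part of the original module): `re` and `preprocess`
# are not defined in the snippet above, so this assumes `preprocess` is gensim's
# preprocessing module; the original file may import them differently.
import re
from gensim.parsing import preprocessing as preprocess  # assumed alias

# "URGENT READ" are the two leading ALL CAPS words; the URL and the digits are stripped first.
print(ALLCAPS("URGENT READ this now http://example.com 123"))  # -> 2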
def preprocessing(text):
    '''Preprocesses a text using standard gensim techniques: removes stopwords,
    strips short words (1-2 characters), strips numbers, strips http addresses,
    strips Unicode from emoji etc., lowercases everything, strips extra spaces,
    punctuation and non-alphanumeric symbols. Also performs stemming.

    input: text: a string
    returns: the preprocessed string
    '''
    text = text.lower()
    text = preprocess.remove_stopwords(text)  # remove stop words
    text = preprocess.strip_short(text)  # get rid of short words
    text = preprocess.strip_numeric(text)  # get rid of numbers
    p = re.compile(r'(http.*\s)|(http.*$)')
    text = p.sub('', text)  # get rid of http addresses
    p = re.compile(r'[^\x00-\x7F]+')
    text = p.sub('', text)  # get rid of non-ASCII characters (emoji etc.)
    text = preprocess.strip_multiple_whitespaces(text)
    text = preprocess.strip_punctuation(text)
    text = preprocess.strip_non_alphanum(text)
    text = preprocess.remove_stopwords(text)
    text = preprocess.strip_short(text)
    # stemming
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    text = ' '.join(stemmed_words)
    return text
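# Usage sketch for preprocessing() above, under the same assumed `re`/`preprocess`
# imports as the ALLCAPS sketch; `stemmer` is also undefined in the snippet and is
# assumed here to be an NLTK PorterStemmer, which may differ from the original module.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()  # assumed stemmer
print(preprocessing("Check THIS out! Visit http://example.com, it has 100 pictures of running dogs"))
# yields something like "check visit pictur run dog", depending on the stop word list and stemmer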
def get_text_sentences(filepath, sbd_model):
    """Reads a file, splits it into sentences with splitta, and tokenizes each sentence."""
    tokens_by_sentence = []
    with codecs.open(filepath, encoding='utf8') as f:
        raw_text = f.read()
    # raw_text = raw_text.lower()
    raw_text = strip_multiple_whitespaces(raw_text)
    sentences = splitta.sbd.sbd_text(sbd_model, raw_text, do_tok=False)
    for s in sentences:
        new_s = strip_punctuation(s)
        tokens_by_sentence.append(list(utils.tokenize(new_s, deacc=True, lowercase=True)))
    # print raw_text
    # for filt in self.preprocess:
    #     raw_text = filt(raw_text)
    # text = list(utils.tokenize(raw_text, deacc=True, lowercase=True))
    return sentences, tokens_by_sentence
def wordcount(text):
    '''Calculates post length after removing http addresses, numbers and
    multiple whitespaces.

    input: text: a string
    returns: the adjusted wordcount
    '''
    text = preprocess.strip_numeric(text)  # get rid of numbers
    p = re.compile(r'(http.*\s)|(http.*$)')
    text = p.sub('', text)  # get rid of http addresses
    p = re.compile(r'[^\x00-\x7F]+')
    text = p.sub('', text)  # get rid of non-ASCII characters
    text = preprocess.strip_multiple_whitespaces(text)
    words = text.split()
    count = len(words)
    return count
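# Usage sketch for wordcount(), under the same assumed `re`/`preprocess` imports as
# the ALLCAPS sketch above: the URL and the number are not counted.
print(wordcount("Look at this awesome site http://example.com 2019"))  # -> 5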
def testStripMultipleWhitespaces(self):
    self.assertEqual(strip_multiple_whitespaces("salut les\r\nloulous!"),
                     "salut les loulous!")
def chatbot_interface(interaction, word2vec_model, fasttext_model, ptlkb64_model, glove300_model, numberbatch_model):
    """ Function used to run the chatbot interface """
    # Flag to indicate if classification should be used (1) or not (0)
    classification_flag = 1

    # Flag to indicate if the binary classifier should be used (1) or not (0)
    binary_classifier_flag = 1

    # choose if stopwords should be removed from the user interaction
    process_interaction_toggle = 0

    # choose if pre-selection should not be used (0), used with word embeddings (1) or used with Whoosh (2)
    pre_selection_toggle = 2

    # parameters used to tune the selection of more than one response
    sr_alpha = 0.1
    sr_beta = 3

    # TODO: The STS model class can't perform feature selection but can use a model that uses it already.
    # location of the STS model
    model = STSModel()
    model.load_model('model_0905_SVR_R_pos_adv-dependency_parsing-word2vec-ptlkb-numberbatch')

    if classification_flag:
        print("The classifier is being used.")

        # read the different class sets
        class_1, class_2, class_3 = read_class_set()

        # transform the class sets from lists to dataframes
        class_1_df = pd.DataFrame(class_1, columns=['text'])
        class_2_df = pd.DataFrame(class_2, columns=['text'])
        class_3_df = pd.DataFrame(class_3, columns=['text'])

    faqs_variants_load_path = os.path.join(ROOT_PATH, 'datasets', 'AIA-BDE_v2.0.txt')

    with open(faqs_variants_load_path) as faqs_file:
        faqs_variants_corpus = faqs_file.read().splitlines()

    faqs_variants_corpus = [line.replace('\t', '') for line in faqs_variants_corpus]
    faqs_variants_corpus = [line.split(':', 1) for line in faqs_variants_corpus]

    # add the original question to a different list to improve the conversational presentation of a response
    position = 0
    faqs_variants_questions = []
    for element in faqs_variants_corpus:
        if element[0] == 'P' and element[1] not in faqs_variants_questions:
            faqs_variants_questions.append(element[1])
            position += 1

    # add the original answer to a different list to improve the conversational presentation of a response
    position = 0
    faqs_variants_answers = []
    for element in faqs_variants_corpus:
        if element[0] == 'R' and element[1] not in faqs_variants_answers:
            faqs_variants_answers.append(element[1])
            position += 1

    faqs_variants_corpus = [line for line in faqs_variants_corpus if len(line) == 2 and line[1]]
    faqs_variants_corpus = [[line[0], strip_non_alphanum(line[1])] if line[0] != 'R' else [line[0], line[1]]
                            for line in faqs_variants_corpus]
    faqs_variants_corpus = [[line[0].rstrip(), line[1].rstrip()] if line[0] != 'R' else [line[0], line[1]]
                            for line in faqs_variants_corpus]
    faqs_variants_corpus = [[line[0], strip_multiple_whitespaces(line[1])] if line[0] != 'R' else [line[0], line[1]]
                            for line in faqs_variants_corpus]
    faqs_variants_corpus = [[line[0], line[1].lower()] if line[0] != 'R' else [line[0], line[1]]
                            for line in faqs_variants_corpus]

    # pair each question with its answer
    position = 0
    corpus = []
    for element in faqs_variants_corpus:
        if element[0] == 'P':
            corpus.append([element[1]])
        if element[0] == 'R':
            corpus[position].extend([element[1]])
            position += 1

    aux_list_of_questions = [phrases[0] for phrases in corpus]
    aux_df = pd.DataFrame(faqs_variants_questions, columns=['text'])

    # remove duplicate sentences from the aux_list_of_questions in order for Whoosh to work
    clean_aux_list_of_questions = []
    for pair in corpus:
        if pair not in clean_aux_list_of_questions:
            clean_aux_list_of_questions.append(pair)

    if process_interaction_toggle:
        print("The original sentence was: {}".format(interaction))
        stp = set(stopwords.words('portuguese') + list(punctuation))
        interaction = ' '.join([word for word in interaction.split(' ') if word not in stp])
        print("The sentence after removing stopwords and punctuation: {}".format(interaction))

    unprocessed_corpus = []

    if classification_flag:
        # apply the classifier before using the STS model
        if binary_classifier_flag:
            predicted_class = corre_para_frase_bin(interaction)
            print("Saí daqui")  # debug print
            if predicted_class == 0:
                print("The provided interaction is out of domain!\n")
        else:
            predicted_class = corre_para_frase_multi(interaction)
            if predicted_class == 1:
                print("The provided interaction belongs to class 1!\n")
                aux_df = class_1_df
                aux_list_of_questions = class_1
            elif predicted_class == 2:
                print("The provided interaction belongs to class 2!\n")
                aux_df = class_2_df
                aux_list_of_questions = class_2
            elif predicted_class == 3:
                print("The provided interaction belongs to class 3!\n")
                aux_df = class_3_df
                aux_list_of_questions = class_3
            else:
                print("The provided interaction is out of domain!\n")

    if predicted_class == 1:
        if 'response' not in aux_df:
            aux_df.insert(1, 'response', interaction)
        else:
            aux_df['response'] = interaction

        if pre_selection_toggle != 2:
            for j in range(len(faqs_variants_questions)):
                if pre_selection_toggle == 1:
                    unprocessed_corpus.append([faqs_variants_questions[j], interaction])
                else:
                    unprocessed_corpus.extend([faqs_variants_questions[j], interaction])

        if pre_selection_toggle == 1:
            corpus_pairs, indexes = pre_selection(unprocessed_corpus, fasttext_model, position)
            if corpus_pairs is None:
                index_path = os.path.join(ROOT_PATH, 'indexers', 'Whoosh', 'indexes', 'cobaia_chitchat_v1.5')
                query_response = qwi.query_indexer(interaction, index_path)
                if (query_response[0] is None) or (not query_response[0]):
                    response = "Desculpe, não percebi, pode colocar a sua questão de outra forma?"
                    return response
                else:
                    return response
            selected_aux_df = aux_df.iloc[indexes]
            selected_aux_df = selected_aux_df.reset_index(drop=True)
        else:
            if pre_selection_toggle == 2:
                pre_selection_index_path = os.path.join(ROOT_PATH, 'indexers', 'Whoosh', 'indexes',
                                                        'FAQs_no_analyser_AIA-BDE_v2.0')
                query_response = qwi.query_indexer(interaction, pre_selection_index_path)
                options_docnumbers = query_response[2]

                if len(options_docnumbers) == 0:
                    response = "Desculpe, não percebi, pode colocar a sua questão de outra forma?"
                    return response
                else:
                    possible_variants_questions = []
                    possible_variants_answers = []
                    for pos, elem in enumerate(options_docnumbers):
                        unprocessed_corpus.extend([faqs_variants_questions[elem], interaction])
                        possible_variants_questions.append(faqs_variants_questions[elem])
                        possible_variants_answers.append(faqs_variants_answers[elem])
                    corpus_pairs = unprocessed_corpus
                    selected_aux_df = aux_df

        element_features = model.extract_multiple_features(corpus_pairs, 0,
                                                           word2vec_mdl=word2vec_model,
                                                           fasttext_mdl=fasttext_model,
                                                           ptlkb_mdl=ptlkb64_model,
                                                           glove_mdl=glove300_model,
                                                           numberbatch_mdl=numberbatch_model)

        predicted_similarity = model.predict_similarity(element_features)
        predicted_similarity = predicted_similarity.tolist()

        highest_match = max(predicted_similarity)
        selectable_range = (max(predicted_similarity) - min(predicted_similarity)) * sr_alpha

        if sr_beta > len(predicted_similarity):
            tmp_sr_beta = len(predicted_similarity)
            sr_beta_range = tmp_sr_beta
            possible_matches = n_max_elements(predicted_similarity, tmp_sr_beta)
        else:
            sr_beta_range = sr_beta
            possible_matches = n_max_elements(predicted_similarity, sr_beta)

        highest_match_index = predicted_similarity.index(max(predicted_similarity))

        if pre_selection_toggle == 2:
            response = ("Se a sua pergunta foi: %s \nR: %s\n" %
                        (possible_variants_questions[highest_match_index],
                         possible_variants_answers[highest_match_index]))
            for i in range(1, sr_beta_range):
                if abs(highest_match - possible_matches[i]) <= selectable_range:
                    response += ("Também poderá estar interessado em: %s\nR: %s\n" %
                                 (possible_variants_questions[predicted_similarity.index(possible_matches[i])],
                                  possible_variants_answers[predicted_similarity.index(possible_matches[i])]))
            return response
        else:
            # should be index 1, for testing purposes it is 0
            response = ("Se a sua pergunta foi: %s \nR: %s\n" %
                        (faqs_variants_questions[highest_match_index],
                         faqs_variants_answers[highest_match_index]))
            for i in range(1, sr_beta):
                if abs(highest_match - possible_matches[i]) <= selectable_range:
                    response += ("Também poderá estar interessado em: %s\nR: %s\n" %
                                 (faqs_variants_questions[predicted_similarity.index(possible_matches[i])],
                                  faqs_variants_answers[predicted_similarity.index(possible_matches[i])]))
            return response
    else:
        # the query search returns a list of phrases with the highest matches, which will be used
        # with the similarity model in order to evaluate which answer should be returned to the user
        index_path = os.path.join(ROOT_PATH, 'indexers', 'Whoosh', 'indexes', 'cobaia_chitchat_v1.5')
        query_response = qwi.query_indexer(interaction, index_path, 1)
        print(query_response[0])
        print(query_response[1])

        if (query_response[0] is None) or (not query_response[0]):
            response = "Desculpe, não percebi, pode colocar a sua questão de outra forma?"
            return response
        else:
            '''
            unprocessed_answers = []

            aux_qwi = pd.DataFrame(query_response[0], columns=['text'])
            if 'response' not in aux_qwi:
                aux_qwi.insert(1, 'response', interaction)
            else:
                aux_qwi['response'] = interaction

            for k in range(len(query_response[0])):
                unprocessed_answers.extend([faqs_variants_questions[k], interaction])

            # element_features_qwi = extract_features(0, unprocessed_answers, aux_qwi, word2vec_mdl=word2vec_model, fasttext_mdl=fasttext_model, ptlkb64_mdl=ptlkb64_model, glove300_mdl=glove300_model, numberbatch_mdl=numberbatch_model, f_selection=converted_mask)
            element_features_qwi = model.extract_multiple_features(unprocessed_answers, 0, word2vec_mdl=word2vec_model, fasttext_mdl=fasttext_model, ptlkb_mdl=ptlkb64_model, glove_mdl=glove300_model, numberbatch_mdl=numberbatch_model)

            predicted_similarity_qwi = model.predict_similarity(element_features_qwi)
            predicted_similarity_qwi = predicted_similarity_qwi.tolist()
            print(predicted_similarity_qwi)

            highest_match_index_qwi = predicted_similarity_qwi.index(max(predicted_similarity_qwi))

            return query_response[1][highest_match_index_qwi]
            '''
            return query_response[1][0]
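# n_max_elements() is called in chatbot_interface() but not defined in this excerpt.
# A minimal sketch consistent with how it is used (the n largest similarity scores,
# highest first) could look like this; the original implementation may differ.
def n_max_elements(values, n):
    """Return the n largest values from `values`, highest first."""
    return sorted(values, reverse=True)[:n]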
args = parser.parse_args()

#
# train fasttext
#
from gensim.models.fasttext import *
from gensim.test.utils import datapath
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_short, strip_multiple_whitespaces
import gensim

with open(args.in_file_plain, "r", encoding="utf8") as in_file_plain:
    corpus = in_file_plain.read().splitlines()

clean_corpus = []
for line in corpus:
    clean_corpus.append(strip_multiple_whitespaces(strip_short(strip_punctuation(line))).split())


def gen():
    for line in clean_corpus:
        yield line


#model = gensim.models.FastText(size=300, workers=50, min_count=3, window=7)
model = gensim.models.FastText.load_fasttext_format(args.pretrained_model)
model.workers = 50

# build the vocabulary
model.build_vocab(sentences=clean_corpus, update=True)

# train the model
model.train(
    sentences=clean_corpus,
    epochs=100,
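# Hedged sketch, not part of the original script: the model.train() call above is cut
# off in this excerpt. A complete call for continued training in gensim 3.x usually
# also passes total_examples, along these lines:
model.train(sentences=clean_corpus,
            total_examples=len(clean_corpus),
            epochs=100)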
# strip_punctuation: Replace punctuation characters with spaces in `s`
for i in range(0, len(df['content'])):
    regex = strip_punctuation(str(df['content2'][i]))
    df['content2'][i] = regex


# In[17]:

# test
df['content2'][20]


# In[18]:

# strip_multiple_whitespaces: Remove repeating whitespace characters (spaces, tabs, line breaks) from `s`
# and turn tabs & line breaks into spaces
for i in range(0, len(df['content'])):
    regex = strip_multiple_whitespaces(str(df['content2'][i]))
    df['content2'][i] = regex


# In[19]:

# test
df['content2'][20]


# In[20]:

# Transform all letters to lower case ones
for i in range(0, len(df['content'])):
    regex = (str(df['content2'][i])).lower()
    df['content2'][i] = regex


# In[21]:
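# Side note (not from the original notebook): the chained assignment pattern
# df['content2'][i] = ... used above can trigger pandas' SettingWithCopyWarning.
# An equivalent vectorised form of the three loops would be, for example:
from gensim.parsing.preprocessing import strip_punctuation, strip_multiple_whitespaces

df['content2'] = (df['content2'].astype(str)
                  .apply(strip_punctuation)
                  .apply(strip_multiple_whitespaces)
                  .str.lower())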
def __call__(self, doc):
    striped = prep.strip_punctuation(doc)
    striped = prep.strip_tags(striped)
    striped = prep.strip_multiple_whitespaces(striped).lower()
    return striped
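# Hedged usage sketch: __call__ above belongs to a preprocessor class that is not
# shown here. A hypothetical minimal wrapper (SimplePreprocessor is not the original
# class name) would be used like this, with `prep` assumed to be gensim's
# preprocessing module.
from gensim.parsing import preprocessing as prep  # assumed alias

class SimplePreprocessor:
    def __call__(self, doc):
        striped = prep.strip_punctuation(doc)
        striped = prep.strip_tags(striped)
        striped = prep.strip_multiple_whitespaces(striped).lower()
        return striped

print(SimplePreprocessor()("Hello,   World!"))  # -> "hello world " (punctuation replaced by spaces, then collapsed)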
def clean_raw_content(textIn):
    cleaner = textIn.replace("\\n", "")
    cleaner = strip_tags(cleaner)
    cleaner = strip_multiple_whitespaces(cleaner)
    cleaner = cleaner.lower()
    return cleaner
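# Minimal usage sketch for clean_raw_content(), assuming gensim's strip_tags and
# strip_multiple_whitespaces are imported at module level (not shown in the snippet).
from gensim.parsing.preprocessing import strip_tags, strip_multiple_whitespaces

raw = "<div>Some raw\\n  HTML   content</div>"
print(clean_raw_content(raw))  # -> "some raw html content"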
file_dir = os.path.join('C:\\', 'Users', 'cruze', 'Documents', 'CS664')
inputfile = os.path.join(file_dir, 'train_E6oV3lV.csv')
df = pd.read_csv(inputfile)

from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation, strip_non_alphanum, strip_numeric, strip_multiple_whitespaces, stem

messages = df.iloc[:, 2]
temp = []
for msg in messages:
    string = remove_stopwords(msg)
    string = strip_punctuation(string)
    string = strip_non_alphanum(string)
    string = strip_numeric(string)
    string = strip_multiple_whitespaces(string)
    string = stem(string)
    temp.append(string)

df = pd.DataFrame({'tweet': temp, 'class': df.iloc[:, 1]})

##-----------------------------------------------------------------------------
#df.iloc[:, -1].value_counts()

from sklearn.metrics import confusion_matrix, precision_score, f1_score
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Dropout
from keras.models import Sequential
#from keras.regularizers import l2
def preprocessing(text, tokenization=0, rm_stopwords=0, numbers_to_text=0, to_tfidf=0):
    """ Function used to preprocess the training data """
    train_data = pd.DataFrame(columns=['text', 'response'])

    prep_0 = [strip_non_alphanum(line) for line in text]
    prep_1 = [line for line in prep_0 if line.rstrip()]
    prep_2 = [strip_multiple_whitespaces(line) for line in prep_1]
    prep_3 = [line.lower() for line in prep_2]

    if to_tfidf == 1:
        # when using tf-idf, removes single character words given that they are ignored by sklearn's TfidfVectorizer
        prep_3 = [' '.join([word for word in line.split() if len(word) > 1]) for line in prep_3]

    if tokenization == 1:
        prep_3 = [line.split(' ') for line in prep_3]
        # removes whitespaces from the list
        prep_3 = [list(filter(None, line)) for line in prep_3]
    else:
        prep_3 = [line[:-1] if line[-1] == " " else line for line in prep_3]

    if numbers_to_text == 1 and tokenization == 1:
        # convert all numbers to integers and convert these numbers to their written form
        temp_prep = []
        for sentence in prep_3:
            temporary_sentence = []
            for word in sentence:
                if str(word).isdigit():
                    converted_words = num2words(int(word), to='cardinal', lang='pt').split(' ')
                    if to_tfidf == 1 and rm_stopwords == 0:
                        converted_words = [word for word in converted_words if word != 'e']
                    temporary_sentence.extend(converted_words)
                else:
                    temporary_sentence.append(word)
            temp_prep.append(temporary_sentence)
        prep_3 = temp_prep
    elif numbers_to_text == 1 and tokenization == 0:
        # convert all numbers to integers and convert these numbers to their written form
        temp_prep = []
        for sentence in prep_3:
            temporary_sentence = []
            for word in sentence.split(' '):
                if str(word).isdigit():
                    converted_words = num2words(int(word), to='cardinal', lang='pt').split(' ')
                    if to_tfidf == 1 and rm_stopwords == 0:
                        converted_words = [word for word in converted_words if word != 'e']
                    temporary_sentence.extend(converted_words)
                else:
                    temporary_sentence.append(word)
            temporary_sentence = ' '.join(temporary_sentence)
            temp_prep.append(temporary_sentence)
        prep_3 = temp_prep

    if rm_stopwords == 1:
        stp = set(stopwords.words('portuguese') + list(punctuation))
        if tokenization == 1:
            prep_3 = [[word for word in sentence if word not in stp] for sentence in prep_3]
        elif tokenization == 0:
            prep_3 = [' '.join([word for word in sentence.split(' ') if word not in stp]) for sentence in prep_3]

    tmp = pd.DataFrame({'text': prep_3[::2], 'response': prep_3[1::2]})
    train_data = train_data.append(tmp[['text', 'response']], ignore_index=True)

    return train_data
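# Hedged usage sketch: preprocessing() above expects `text` to be a list of
# alternating question/answer lines. The imports below are assumptions about the
# original module (gensim, NLTK stopwords, num2words, pandas), and the function's
# use of DataFrame.append implies an older pandas release.
import pandas as pd
from string import punctuation
from gensim.parsing.preprocessing import strip_non_alphanum, strip_multiple_whitespaces
from nltk.corpus import stopwords
from num2words import num2words

lines = ["Qual é o horário de atendimento?",           # question
         "O atendimento funciona das 9 às 17 horas."]  # answer
print(preprocessing(lines, tokenization=0, rm_stopwords=1, numbers_to_text=1))
# returns a one-row DataFrame with 'text' and 'response' columns, numbers spelled out in Portuguese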
def Removenewlines(self):
    self.processedtext = strip_multiple_whitespaces(self.processedtext)
    print(self.processedtext)
def strip_whitespaces(inStr):
    """Filters out multiple whitespaces."""
    filtered_string = strip_multiple_whitespaces(inStr)
    return filtered_string
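# Minimal usage sketch for strip_whitespaces(): gensim's strip_multiple_whitespaces
# collapses runs of spaces, tabs and line breaks into single spaces.
from gensim.parsing.preprocessing import strip_multiple_whitespaces

print(strip_whitespaces("hello\t\tworld  \n again"))  # -> "hello world again"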