def ALLCAPS(text):
    '''Calculates the number of ALL CAPS words at the start of the message
     after removing http addresses, numbers and multiple whitespaces

    input: 
        text: a string
    returns: 
        the number of ALL CAPS words at the start of the message
    '''
    text = preprocess.strip_numeric(text) # get rid of numbers
    p = re.compile(r'http\S+') # get rid of http addresses
    text = p.sub('', text)
    p = re.compile(r'[^\x00-\x7F]+') # get rid of non-ASCII characters (emoji etc.)
    text = p.sub('', text)
    text = preprocess.strip_multiple_whitespaces(text)
    words = text.split()
    ALLCAPScount = 0

    for w in words:
        if not w.isupper():
            break
        ALLCAPScount += 1

    # a trailing single 'A' (likely the article) should not count as an ALL CAPS word
    if ALLCAPScount:
        if words[ALLCAPScount - 1] == 'A':
            ALLCAPScount -= 1

    return ALLCAPScount
def preprocessing(text):
    '''Preprocesses a text using standard gensim techniques:
    removes stopwords, strips short words (1-2 characters), strips numbers,
    strips http addresses, strips non-ASCII characters (emoji etc.), lowercases everything,
    and strips extra spaces, punctuation and non-alphanumeric symbols. Also performs stemming.

    input: 
        text: a string
    returns: 
        the preprocessed string.
    '''
    text = text.lower()
    text = preprocess.remove_stopwords(text) # remove stop words
    text = preprocess.strip_short(text) # get rid of short words
    text = preprocess.strip_numeric(text) # get rid of numbers
    p = re.compile(r'http\S+') # get rid of http addresses
    text = p.sub('', text)
    p = re.compile(r'[^\x00-\x7F]+') # get rid of non-ASCII characters (emoji etc.)
    text = p.sub('', text)
    text = preprocess.strip_multiple_whitespaces(text)
    text = preprocess.strip_punctuation(text)
    text = preprocess.strip_non_alphanum(text)
    text = preprocess.remove_stopwords(text)
    text = preprocess.strip_short(text)
    # stemming
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    text = ' '.join(stemmed_words)

    return text
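The helpers above rely on names that are imported elsewhere in their module. A minimal setup and usage sketch, assuming gensim's preprocessing module is aliased as `preprocess` and an NLTK Porter stemmer is bound to `stemmer` (the stemmer choice is an assumption; the original may use a different one):

import re

from gensim.parsing import preprocessing as preprocess
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()  # assumed; the original module may use another stemmer

print(ALLCAPS('HELLO THERE friend, see http://example.com'))    # -> 2
print(preprocessing('Running 3 tests at http://example.com!!'))  # -> 'run test' (roughly)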
Example #3
def get_text_sentences(filepath, sbd_model):
    tokens_by_sentence = []
    with codecs.open(filepath, encoding='utf8') as f:
        raw_text = f.read()
        #raw_text = raw_text.lower()
        raw_text = strip_multiple_whitespaces(raw_text)
        sentences = splitta.sbd.sbd_text(sbd_model, raw_text, do_tok=False)
        for s in sentences:
            new_s = strip_punctuation(s)
            tokens_by_sentence.append(list(utils.tokenize(new_s, deacc=True, lowercase=True)))
        #print raw_text
        #for filt in self.preprocess:
        #    raw_text = filt(raw_text)
        #text = list(utils.tokenize(raw_text, deacc=True, lowercase=True))
    return sentences, tokens_by_sentence
def wordcount(text):
    '''Calculate post length after removing http addresses, 
       numbers and multiple whitespaces

    input: 
        text: a string
    returns: 
        the adjusted wordcount.
    '''
    text = preprocess.strip_numeric(text) # get rid of numbers
    p = re.compile(r'http\S+') # get rid of http addresses
    text = p.sub('', text)
    p = re.compile(r'[^\x00-\x7F]+') # get rid of non-ASCII characters (emoji etc.)
    text = p.sub('', text)
    text = preprocess.strip_multiple_whitespaces(text)
    words = text.split()
    count = len(words)
    return count
Example #5
 def testStripMultipleWhitespaces(self):
     self.assertEqual(strip_multiple_whitespaces("salut  les\r\nloulous!"), "salut les loulous!")
Example #6
def chatbot_interface(interaction, word2vec_model, fasttext_model,
                      ptlkb64_model, glove300_model, numberbatch_model):
    """ Function used to run the chatbot interface """
    # Flag to indicate if classification should be used (1) or not (0)
    classification_flag = 1

    # Flag to indicate if the binary classifier should be used (1) or not (0)
    binary_classifier_flag = 1

    # choose if stopwords should be removed from the user interaction
    process_interaction_toggle = 0

    # choose if pre-selection should not be used (0), used with word embeddings (1) or used with Whoosh (2)
    pre_selection_toggle = 2

    # parameters used to tune the selection of more than one response
    sr_alpha = 0.1
    sr_beta = 3

    # TODO: The STS model class can't perform feature selection itself, but it can load a model that already uses it.

    # load the STS model
    model = STSModel()
    model.load_model(
        'model_0905_SVR_R_pos_adv-dependency_parsing-word2vec-ptlkb-numberbatch'
    )

    # assumed default: treat the interaction as in-domain when the classifier is
    # disabled, so that predicted_class is always defined for the check further below
    predicted_class = 1

    if classification_flag:
        print("The classifier is being used.")

        # read the different class sets
        class_1, class_2, class_3 = read_class_set()

        # transform the class sets from lists to dataframes
        class_1_df = pd.DataFrame(class_1, columns=['text'])
        class_2_df = pd.DataFrame(class_2, columns=['text'])
        class_3_df = pd.DataFrame(class_3, columns=['text'])

    faqs_variants_load_path = os.path.join(ROOT_PATH, 'datasets',
                                           'AIA-BDE_v2.0.txt')

    with open(faqs_variants_load_path) as faqs_file:
        faqs_variants_corpus = faqs_file.read().splitlines()

    faqs_file.close()

    faqs_variants_corpus = [
        line.replace('\t', '') for line in faqs_variants_corpus
    ]
    faqs_variants_corpus = [
        line.split(':', 1) for line in faqs_variants_corpus
    ]

    # add the original question to a different list to improve the conversational presentation of a response
    position = 0
    faqs_variants_questions = []

    for element in faqs_variants_corpus:
        if element[0] == 'P' and element[1] not in faqs_variants_questions:
            faqs_variants_questions.append(element[1])
            position += 1

    # add the original answer to a different list to improve the conversational presentation of a response
    position = 0
    faqs_variants_answers = []

    for element in faqs_variants_corpus:
        if element[0] == 'R' and element[1] not in faqs_variants_answers:
            faqs_variants_answers.append(element[1])
            position += 1

    faqs_variants_corpus = [
        line for line in faqs_variants_corpus if len(line) == 2 and line[1]
    ]
    faqs_variants_corpus = [[line[0], strip_non_alphanum(line[1])]
                            if line[0] != 'R' else [line[0], line[1]]
                            for line in faqs_variants_corpus]
    faqs_variants_corpus = [[line[0].rstrip(), line[1].rstrip()]
                            if line[0] != 'R' else [line[0], line[1]]
                            for line in faqs_variants_corpus]
    faqs_variants_corpus = [
        [line[0], strip_multiple_whitespaces(line[1])]
        if line[0] != 'R' else [line[0], line[1]]
        for line in faqs_variants_corpus
    ]
    faqs_variants_corpus = [
        [line[0], line[1].lower()] if line[0] != 'R' else [line[0], line[1]]
        for line in faqs_variants_corpus
    ]

    position = 0
    corpus = []

    for element in faqs_variants_corpus:
        if element[0] == 'P':
            corpus.append([element[1]])

        if element[0] == 'R':
            corpus[position].extend([element[1]])

            position += 1

    aux_list_of_questions = [phrases[0] for phrases in corpus]
    aux_df = pd.DataFrame(faqs_variants_questions, columns=['text'])

    # remove duplicate question/answer pairs from the corpus in order for Whoosh to work.
    clean_aux_list_of_questions = []

    for pair in corpus:
        if pair not in clean_aux_list_of_questions:
            clean_aux_list_of_questions.append(pair)

    if process_interaction_toggle:
        print("The original sentence was: {}".format(interaction))
        stp = set(stopwords.words('portuguese') + list(punctuation))
        interaction = ' '.join(
            [word for word in interaction.split(' ') if word not in stp])
        print("The sentenced after removing stopwords and punctuation: {}".
              format(interaction))

    unprocessed_corpus = []

    if classification_flag:
        # apply the classifier before using the STS model
        if binary_classifier_flag:
            predicted_class = corre_para_frase_bin(interaction)
            print("Saí daqui")
            if predicted_class == 0:
                print("The provided interaction is out of domain!\n")
        else:
            predicted_class = corre_para_frase_multi(interaction)

            if predicted_class == 1:
                print("The provided interaction belongs to class 1!\n")
                aux_df = class_1_df
                aux_list_of_questions = class_1
            elif predicted_class == 2:
                print("The provided interaction belongs to class 2!\n")
                aux_df = class_2_df
                aux_list_of_questions = class_2
            elif predicted_class == 3:
                print("The provided interaction belongs to class 3!\n")
                aux_df = class_3_df
                aux_list_of_questions = class_3
            else:
                print("The provided interaction is out of domain!\n")

    if predicted_class == 1:
        if 'response' not in aux_df:
            aux_df.insert(1, 'response', interaction)
        else:
            aux_df['response'] = interaction

        if pre_selection_toggle != 2:
            for j in range(len(faqs_variants_questions)):
                if pre_selection_toggle == 1:
                    unprocessed_corpus.append(
                        [faqs_variants_questions[j], interaction])
                else:
                    unprocessed_corpus.extend(
                        [faqs_variants_questions[j], interaction])

        if pre_selection_toggle == 1:
            corpus_pairs, indexes = pre_selection(unprocessed_corpus,
                                                  fasttext_model, position)

            if corpus_pairs is None:
                index_path = os.path.join(ROOT_PATH, 'indexers', 'Whoosh',
                                          'indexes', 'cobaia_chitchat_v1.5')

                query_response = qwi.query_indexer(interaction, index_path)

                if (query_response[0] is None) or (not query_response[0]):
                    response = "Desculpe, não percebi, pode colocar a sua questão de outra forma?"
                    return response
                else:
                    # likely intended to return the top indexer answer; the
                    # original returned an undefined name here
                    return query_response[1][0]

            selected_aux_df = aux_df.iloc[indexes]
            selected_aux_df = selected_aux_df.reset_index(drop=True)
        else:
            if pre_selection_toggle == 2:
                pre_selection_index_path = os.path.join(
                    ROOT_PATH, 'indexers', 'Whoosh', 'indexes',
                    'FAQs_no_analyser_AIA-BDE_v2.0')

                query_response = qwi.query_indexer(interaction,
                                                   pre_selection_index_path)
                options_docnumbers = query_response[2]

                if len(options_docnumbers) == 0:
                    response = "Desculpe, não percebi, pode colocar a sua questão de outra forma?"
                    return response
                else:
                    possible_variants_questions = []
                    possible_variants_answers = []

                    for elem in options_docnumbers:
                        unprocessed_corpus.extend(
                            [faqs_variants_questions[elem], interaction])
                        possible_variants_questions.append(
                            faqs_variants_questions[elem])
                        possible_variants_answers.append(
                            faqs_variants_answers[elem])

            corpus_pairs = unprocessed_corpus
            selected_aux_df = aux_df

        element_features = model.extract_multiple_features(
            corpus_pairs,
            0,
            word2vec_mdl=word2vec_model,
            fasttext_mdl=fasttext_model,
            ptlkb_mdl=ptlkb64_model,
            glove_mdl=glove300_model,
            numberbatch_mdl=numberbatch_model)

        predicted_similarity = model.predict_similarity(element_features)
        predicted_similarity = predicted_similarity.tolist()

        highest_match = max(predicted_similarity)

        selectable_range = (max(predicted_similarity) -
                            min(predicted_similarity)) * sr_alpha

        if sr_beta > len(predicted_similarity):
            tmp_sr_beta = len(predicted_similarity)
            sr_beta_range = tmp_sr_beta
            possible_matches = n_max_elements(predicted_similarity,
                                              tmp_sr_beta)
        else:
            sr_beta_range = sr_beta
            possible_matches = n_max_elements(predicted_similarity, sr_beta)

        highest_match_index = predicted_similarity.index(
            max(predicted_similarity))

        if pre_selection_toggle == 2:
            response = ("Se a sua pergunta foi: %s \nR: %s\n" %
                        (possible_variants_questions[highest_match_index],
                         possible_variants_answers[highest_match_index]))

            for i in range(1, sr_beta_range):
                if abs(highest_match -
                       possible_matches[i]) <= selectable_range:
                    response += (
                        "Também poderá estar interessado em: %s\nR: %s\n" %
                        (possible_variants_questions[
                            predicted_similarity.index(possible_matches[i])],
                         possible_variants_answers[predicted_similarity.index(
                             possible_matches[i])]))

            return response
        else:
            #should be index 1, for testing purposes it is 0
            response = ("Se a sua pergunta foi: %s \nR: %s\n" %
                        (faqs_variants_questions[highest_match_index],
                         faqs_variants_answers[highest_match_index]))

            for i in range(1, sr_beta_range):
                if abs(highest_match -
                       possible_matches[i]) <= selectable_range:
                    response += (
                        "Também poderá estar interessado em: %s\nR: %s\n" %
                        (faqs_variants_questions[predicted_similarity.index(
                            possible_matches[i])],
                         faqs_variants_answers[predicted_similarity.index(
                             possible_matches[i])]))

            return response
    else:
        # the query search returns a list of phrases with the highest matches,
        # which are then used with the similarity model to decide which answer
        # should be returned to the user
        index_path = os.path.join(ROOT_PATH, 'indexers', 'Whoosh', 'indexes',
                                  'cobaia_chitchat_v1.5')

        query_response = qwi.query_indexer(interaction, index_path, 1)
        print(query_response[0])
        print(query_response[1])
        if (query_response[0] is None) or (not query_response[0]):
            response = "Desculpe, não percebi, pode colocar a sua questão de outra forma?"
            return response
        else:
            '''
			unprocessed_answers = []
			aux_qwi = pd.DataFrame(query_response[0], columns=['text'])

			if 'response' not in aux_qwi:
				aux_qwi.insert(1, 'response', interaction)
			else:
				aux_qwi['response'] = interaction

			for k in range(len(query_response[0])):
				unprocessed_answers.extend([faqs_variants_questions[k], interaction])

			# element_features_qwi = extract_features(0, unprocessed_answers, aux_qwi, word2vec_mdl=word2vec_model, fasttext_mdl=fasttext_model, ptlkb64_mdl=ptlkb64_model, glove300_mdl=glove300_model, numberbatch_mdl=numberbatch_model, f_selection=converted_mask)

			element_features_qwi = model.extract_multiple_features(unprocessed_answers, 0, word2vec_mdl=word2vec_model, fasttext_mdl=fasttext_model, ptlkb_mdl=ptlkb64_model, glove_mdl=glove300_model, numberbatch_mdl=numberbatch_model)

			predicted_similarity_qwi = model.predict_similarity(element_features_qwi)
			predicted_similarity_qwi = predicted_similarity_qwi.tolist()
			print(predicted_similarity_qwi)

			highest_match_index_qwi = predicted_similarity_qwi.index(max(predicted_similarity_qwi))

			return query_response[1][highest_match_index_qwi]
			'''
            return query_response[1][0]
Example #7
args = parser.parse_args()

#
# train fasttext
# 

from gensim.models.fasttext import *
from gensim.test.utils import datapath
from gensim.parsing.preprocessing import preprocess_string,strip_punctuation,strip_short,strip_multiple_whitespaces
import gensim

with open(args.in_file_plain,"r",encoding="utf8") as in_file_plain:
    corpus = in_file_plain.read().splitlines()
    clean_corpus = []
    for line in corpus:
        clean_corpus.append(strip_multiple_whitespaces(strip_short(strip_punctuation(line))).split())

def gen():
    for line in clean_corpus:
        yield line

#model = gensim.models.FastText(size=300,workers=50,min_count=3,window=7)
model = gensim.models.FastText.load_fasttext_format(args.pretrained_model)
model.workers = 50

# build the vocabulary
model.build_vocab(sentences=clean_corpus,update=True)

# train the model
model.train(
    sentences=clean_corpus, epochs=100,
    # total_examples is required by gensim's train(); the value below is an
    # assumed completion, since the original snippet is cut off at this point
    total_examples=len(clean_corpus))
Example #8
# strip_punctuation: replace punctuation characters with spaces in `s`
for i in range(0, len(df['content'])):
    regex = strip_punctuation(str(df['content2'][i]))
    df.loc[i, 'content2'] = regex

# In[17]:

#test
df['content2'][20]

# In[18]:

# strip_multiple_whitespaces: remove repeating whitespace characters (spaces, tabs,
# line breaks) from `s` and turn tabs and line breaks into spaces
for i in range(0, len(df['content'])):
    regex = strip_multiple_whitespaces(str(df['content2'][i]))
    df.loc[i, 'content2'] = regex

# In[19]:

#test
df['content2'][20]

# In[20]:

# transform all letters to lower case
for i in range(0, len(df['content'])):
    regex = (str(df['content2'][i])).lower()
    df.loc[i, 'content2'] = regex
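The three per-row loops above can also be collapsed into vectorized pandas operations, which avoids chained assignment entirely. A minimal sketch, assuming `df['content2']` already exists as in the cells above:

from gensim.parsing.preprocessing import strip_punctuation, strip_multiple_whitespaces

# same three steps, applied column-wise: punctuation, whitespace, lowercasing
df['content2'] = (
    df['content2']
    .astype(str)
    .apply(strip_punctuation)
    .apply(strip_multiple_whitespaces)
    .str.lower()
)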

# In[21]:
Example #9
 def __call__(self, doc):
     striped = prep.strip_punctuation(doc)
     striped = prep.strip_tags(striped)
     striped = prep.strip_multiple_whitespaces(striped).lower()
     return striped
Example #10
 def testStripMultipleWhitespaces(self):
     self.assertEqual(strip_multiple_whitespaces("salut  les\r\nloulous!"),
                      "salut les loulous!")
Example #11
def clean_raw_content(textIn):
    cleaner = textIn.replace("\\n", "")
    cleaner = strip_tags(cleaner)
    cleaner = strip_multiple_whitespaces(cleaner)
    cleaner = cleaner.lower()
    return cleaner
Example #12
file_dir = os.path.join('C:\\', 'Users', 'cruze', 'Documents', 'CS664')
inputfile = os.path.join(file_dir, 'train_E6oV3lV.csv')
df = pd.read_csv(inputfile)

from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation, strip_non_alphanum, strip_numeric, strip_multiple_whitespaces, stem

messages = df.iloc[:, 2]

temp = []

for msg in messages:
    string = remove_stopwords(msg)
    string = strip_punctuation(string)
    string = strip_non_alphanum(string)
    string = strip_numeric(string)
    string = strip_multiple_whitespaces(string)
    string = stem(string)

    temp.append(string)

df = pd.DataFrame({'tweet': temp, 'class': df.iloc[:, 1]})

##-----------------------------------------------------------------------------

#df.iloc[:, -1].value_counts()

from sklearn.metrics import confusion_matrix, precision_score, f1_score
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Dropout
from keras.models import Sequential
#from keras.regularizers import l2
Example #13
File: tools.py  Project: ZPedroP/ASAPPpy
def preprocessing(text,
                  tokenization=0,
                  rm_stopwords=0,
                  numbers_to_text=0,
                  to_tfidf=0):
    """ Function used to preprocess the training data """
    train_data = pd.DataFrame(columns=['text', 'response'])

    prep_0 = [strip_non_alphanum(line) for line in text]
    prep_1 = [line for line in prep_0 if line.rstrip()]
    prep_2 = [strip_multiple_whitespaces(line) for line in prep_1]
    prep_3 = [line.lower() for line in prep_2]

    if to_tfidf == 1:
        # when using tf-idf, remove single-character words, since they are ignored by sklearn's TfidfVectorizer
        prep_3 = [
            ' '.join([word for word in line.split() if len(word) > 1])
            for line in prep_3
        ]

    if tokenization == 1:
        prep_3 = [line.split(' ') for line in prep_3]
        # remove empty strings left over from splitting
        prep_3 = [list(filter(None, line)) for line in prep_3]
    else:
        prep_3 = [line[:-1] if line[-1] == " " else line for line in prep_3]

    if numbers_to_text == 1 and tokenization == 1:
        # convert digit tokens to integers and then to their written (cardinal) form
        temp_prep = []
        for sentence in prep_3:
            temporary_sentence = []
            for word in sentence:
                if str(word).isdigit():
                    converted_words = num2words(int(word),
                                                to='cardinal',
                                                lang='pt').split(' ')
                    if to_tfidf == 1 and rm_stopwords == 0:
                        converted_words = [
                            word for word in converted_words if word != 'e'
                        ]
                    temporary_sentence.extend(converted_words)
                else:
                    temporary_sentence.append(word)
            temp_prep.append(temporary_sentence)

        prep_3 = temp_prep
    elif numbers_to_text == 1 and tokenization == 0:
        # convert digit tokens to integers and then to their written (cardinal) form
        temp_prep = []
        for sentence in prep_3:
            temporary_sentence = []
            for word in sentence.split(' '):
                if str(word).isdigit():
                    converted_words = num2words(int(word),
                                                to='cardinal',
                                                lang='pt').split(' ')
                    if to_tfidf == 1 and rm_stopwords == 0:
                        converted_words = [
                            word for word in converted_words if word != 'e'
                        ]
                    temporary_sentence.extend(converted_words)
                else:
                    temporary_sentence.append(word)
            temporary_sentence = ' '.join(temporary_sentence)
            temp_prep.append(temporary_sentence)
        prep_3 = temp_prep

    if rm_stopwords == 1:
        stp = set(stopwords.words('portuguese') + list(punctuation))
        if tokenization == 1:
            prep_3 = [[word for word in sentence if word not in stp]
                      for sentence in prep_3]
        elif tokenization == 0:
            prep_3 = [
                ' '.join(
                    [word for word in sentence.split(' ') if word not in stp])
                for sentence in prep_3
            ]

    tmp = pd.DataFrame({'text': prep_3[::2], 'response': prep_3[1::2]})
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    train_data = pd.concat([train_data, tmp[['text', 'response']]],
                           ignore_index=True)

    return train_data
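A minimal usage sketch for this helper, assuming the module-level imports of tools.py (the gensim filters, NLTK's Portuguese stopwords, string.punctuation and num2words) are available. The input list here is made up for illustration and must alternate question and response lines, which is what the `prep_3[::2]` / `prep_3[1::2]` pairing at the end relies on:

raw_lines = [
    'Qual é o horário de atendimento?',     # question
    'O atendimento funciona das 9 às 17.',  # response
    'Quantos dias demora a entrega?',       # question
    'A entrega demora 3 dias úteis.',       # response
]

train_data = preprocessing(raw_lines, tokenization=0, rm_stopwords=1,
                           numbers_to_text=1, to_tfidf=0)
print(train_data)  # DataFrame with 'text' and 'response' columns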
Example #14
    def Removenewlines(self):
        self.processedtext = strip_multiple_whitespaces(self.processedtext)

        print(self.processedtext)
Example #15
def strip_whitespaces(inStr):
    """Filters out multiple whitespaces."""
    filtered_string = strip_multiple_whitespaces(inStr)
    return filtered_string