def process_sentence(text, objects=False):
    '''
    Quick-and-dirty text preprocessing to fix some misspelled words
    and lemmatize
    '''
    text = text.lower()
    old_text = text
    
    text = text.replace('1', 'one').replace('2','two').replace(
        '3','three').replace('4','four').replace('5','five').replace('6','six').replace(
        '.','').replace('contains', 'contain').replace(
        'which','').replace('are there','there are').replace(
        'there is', '').replace('ablue', 'a blue').replace(
        'corner','edge').replace('wall', 'edge').replace('yelow', 'yellow').replace(
        'below','beneath').replace(
        'brick','block').replace('leats','least').replace('is touching', 'touching')
    text = re.sub(r'colour([\W])', 'color ', text)
    text = re.sub(r'colored([\W])', 'color ', text)
    text = re.sub(r'coloured([\W])', 'color ', text)
    text = text.split(' ')
    text = map(correction, [t for t in text if t])
    text = [lemmatizer.lemmatize(x) if not x in [u'as',u'wall'] else x for x in text]
    text = ' '.join(text)
    if 'that' in text:
        text = text.replace('that', '')
    if 'contain' in text or 'ha ' in text:  # 'ha ' is likely what the lemmatizer leaves of 'has'
        text = text.replace('contain', 'with').replace('ha ','with ')
    text = re.sub(r'(^|\W)a([\W])', ' one ', text)
    text = re.sub(r'(^)ll ', ' ', text)
    text = re.sub(r'(^)t ', 'at ', text)
    text = ' '.join([t for t in text.split(' ') if t])
    text = text.replace('based', 'base')
    return text
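A minimal usage sketch: process_sentence relies on module-level helpers not shown in this snippet (re, a spell-checker named correction, and an NLTK lemmatizer named lemmatizer), so the stand-ins below are hypothetical.

import re
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
correction = lambda w: w  # hypothetical identity stand-in for the real spell corrector

print(process_sentence('There are 2 yelow blocks touching the wall.'))
# with the identity corrector this prints: 'there are two yellow block touching the edge'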
Example #2
def preprocessing():
    PATH = '../input/text-classification-20/'
    punct = ",./-'?!#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’'
    dataset_length = 5331
    neg = pd.DataFrame(index=range(dataset_length), columns=['label', 'text'])
    infile = open(PATH + 'rt-polarity.neg', encoding='utf-8', errors='ignore')
    textlines = infile.readlines()
    for i in range(dataset_length):
        text = textlines[i].strip('\n').strip()
        for p in punct:
            text = text.replace(p, ' ')
        neg.loc[i, 'text'] = text
        neg.loc[i, 'label'] = 0
    pos = pd.DataFrame(index=range(dataset_length), columns=['label', 'text'])
    infile = open(PATH + 'rt-polarity.pos', encoding='utf-8', errors='ignore')
    textlines = infile.readlines()
    for i in range(dataset_length):
        text = textlines[i].strip('\n').strip()
        for p in punct:
            text = text.replace(p, ' ')
        pos.loc[i, 'text'] = text
        pos.loc[i, 'label'] = 1

    train_df = pd.concat([neg, pos], ignore_index=True)
    train_df.to_csv('Sentiment_dataset.csv', index=False, encoding='utf-8')

    x_train = train_df['text'].astype(str)
    y_train = train_df['label'].values.astype(int)
    # print(np.isnan(x_train).any())
    return x_train, y_train
Example #3
def subchar(text):
    text = text.replace("á", "a")
    text = text.replace("ó", "o")
    text = text.replace("é", "e")
    text = text.replace("í", "i")
    text = text.replace("ú", "u")
    return text
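A quick usage example; only the five lowercase accented vowels are mapped, so uppercase accents and other diacritics such as ñ pass through unchanged.

print(subchar("canción única"))  # -> 'cancion unica'
print(subchar("Ávila"))          # -> 'Ávila' (uppercase accent is not substituted)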
Example #4
def preprocess(text):

    text = text.lower()
    list_happen = [
        "😊", "❤️", "😁", "😄", "😆", "😍", "🤣", "😂", "🤩", "😚", "😋", '😜', "😝", "🤗",
        ":)", ":}", "^^", ";)", "👌", "=))", "😅", "👍", "👍🏻", "💕", "❤", "👏", "💟",
        "<3", ":D", ":P", "^_^", "😉", "✌️"
    ]
    list_sad = [
        "😡", "🤔", "🤨", "😐", "😏", "😒", "😶", "🙄", "😌", "😔", "🤕", "🤒", "👿", "🤬",
        "😤", '😫', "😩", "😭", ":(", "😈", "-_-", "👎"
    ]
    # map emoji and emoticons to Vietnamese sentiment tokens ("vui" = happy, "buồn" = sad)
    for happen in list_happen:
        text = text.replace(happen, " vui")
    for sad in list_sad:
        text = text.replace(sad, " buồn")

    text = re.sub(
        '[\n!,.?@#?!.,#$%\()*+-/:;<=>@[\\]^_`{|}~`"""“”’∞θ÷α•−β∅³π‘₹´°£€\×™√²—–&]',
        '', text)
    #     text = preprocess1(text)
    text = ViTokenizer.tokenize(text)
    # emoticons = re.findall(r"(?:|;|=)(?:-)?(?:\)\(|D|P)", text)
    # text = re.sub(r"[\W]+", " ", text.lower()) + " ".join(emoticons).replace('-', '')
    # text = re.sub("\n", ' ', text)
    return text
Example #5
def preprocessFastText(text):
    text = text.replace("' ", " ' ")
    signs = set(';:,.?!\'“”‘’\"')
    prods = set(text) & signs  # punctuation marks actually present in the text
    if not prods:
        return text  # no punctuation to pad; return unchanged

    for sign in prods:
        text = text.replace(sign, ' {} '.format(sign))  # pad each mark with spaces
    return text
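Illustrative call: every punctuation mark found in the string is padded with spaces, which can leave doubled whitespace for a downstream tokenizer to collapse.

print(preprocessFastText("Hello, world!"))
# -> 'Hello ,  world ! '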
Example #6
def preprocessFastText(text):
    text = text.replace("' ", " ' ")
    signs = set(',.:;"?!')
    prods = set(text) & signs
    if not prods:
        return text

    for sign in prods:
        text = text.replace(sign, ' {} '.format(sign) )
    return text
Example #7
def clean_preprocess(text, punct = punct, mapping = punct_mapping, mispell = mispell_dict):
    for p in mapping:
        text = text.replace(p, mapping[p])
    for p in punct:
        text = text.replace(p, ' ')
    specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}  # other special characters still left to handle at the end
    for s in specials:
        text = text.replace(s, specials[s])
    for word in mispell.keys():
        text = text.replace(word, mispell[word])
    return text
Example #8
def preprocess(text):
    # text = re.sub('[\n!,.?@#]', '', text)
    text = text.lower()
    list_happen = ["😊","❤️","😁","😄","😆","😍","🤣","😂","🤩","😚","😋",'😜',"😝","🤗",":)",":}","^^",";)",
    "👌","=))","😅","👍","👍🏻","💕","❤","👏","💟","<3",":D",":P","^_^","😉","✌️"]
    list_sad = ["😡","🤔","🤨","😐","😏","😒","😶","🙄","😌","😔","🤕","🤒","👿","🤬","😤",'😫',"😩","😭",":(","😈","-_-","👎"]
    # map emoji and emoticons to Vietnamese sentiment tokens ("vui" = happy, "tệ" = bad)
    for happen in list_happen:
        text = text.replace(happen, "vui")
    for sad in list_sad:
        text = text.replace(sad, "tệ")
    # text = ViTokenizer.tokenize(text)
    return text
Example #9
def clean_special_chars(text, punct, mapping):
    '''
    credits to: https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings 
    credits to: https://www.kaggle.com/anebzt/quora-preprocessing-model
    input: current text, punctuations, punctuation mapping
    output: cleaned text
    '''
    for p in mapping:
        text = text.replace(p, mapping[p])
    for p in punct:
        text = text.replace(p, f' {p} ') 
    return text
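A hypothetical call with a small punct string and a quote-normalizing mapping (the notebooks this snippet credits pass much larger tables).

sample_punct = '#&'
sample_mapping = {'‘': "'", '’': "'"}
print(clean_special_chars("it’s #1 R&D", sample_punct, sample_mapping))
# -> "it's  # 1 R & D"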
Example #10
def clean_special_chars(text, punct, mapping):
    for p in mapping:
        text = text.replace(p, mapping[p])
    
    for p in punct:
        text = text.replace(p, f' {p} ')
    
    specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}  # other special characters still left to handle at the end
    for s in specials:
        text = text.replace(s, specials[s])
    
    return text
Example #11
def preprocess(text):
    s_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }
    specials = ["’", "‘", "´", "`"]
    p_mapping = {"_":" ", "`":" "}    
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
    
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([s_mapping[t] if t in s_mapping else t for t in text.split(" ")])
    for p in p_mapping:
        text = text.replace(p, p_mapping[p])    
    for p in punct:
        text = text.replace(p, f' {p} ')     
    return text
Example #12
def tokenize(text):

    # replace urls with a placeholder token
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    # Normalize text
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    #tokenize
    tokens = word_tokenize(text)

    # Remove stop words
    tokens = [w for w in tokens if w not in stopwords.words("english")]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).strip()
        clean_tokens.append(clean_tok)

    return ' '.join(clean_tokens)
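tokenize depends on a module-level url_regex plus NLTK's word_tokenize, stopwords, and WordNetLemmatizer. A self-contained sketch with a hypothetical url_regex, assuming the relevant NLTK corpora are downloaded:

import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

url_regex = r'https?://\S+'  # hypothetical stand-in for the module-level pattern

print(tokenize("Check https://example.com for the latest updates!"))
# -> 'check urlplaceholder latest update'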
Example #13
def preprocess_text(text):
    tokens = TOKENIZER.tokenize(text.replace("'", " ' "))
    return ' '.join([
        tok for tok in tokens
        if tok not in ENGLISH_STOP_WORDS and len(tok) > 1 and
        not all(char.isdigit() or char in ",.:-[]'`" for char in tok)
    ])
def clean_contractions(text, mapping):
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join(
        [mapping[t] if t in mapping else t for t in text.split(" ")])
    return text
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words
    # (lower-casing is left commented out below)
    wiki_reg = r'https?://en.wikipedia.org/[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]'
    url_reg = r'https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]'
    ip_reg = '\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
    WIKI_LINK = ' WIKI_LINK '
    URL_LINK = ' URL_LINK '
    IP_LINK = ' IP_LINK '
    # replace links with placeholder tokens
    c = re.findall(wiki_reg, text)
    for u in c:
        text = text.replace(u, WIKI_LINK)
    c = re.findall(url_reg, text)
    for u in c:
        text = text.replace(u, URL_LINK)
    c = re.findall(ip_reg, text)
    for u in c:
        text = text.replace(u, IP_LINK)

    # Regex to remove all Non-Alpha Numeric and space
    special_character_removal = re.compile(r'[^A-Za-z\d!?*\' ]', re.IGNORECASE)
    # regex to replace all numerics
    replace_numbers = re.compile(r'\d+', re.IGNORECASE)

    # text = text.lower().split()
    text = text.split()
    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]

    text = " ".join(text)
    # Remove Special Characters
    text = special_character_removal.sub('', text)
    # Replace Numbers
    text = replace_numbers.sub('NUMBERREPLACER', text)
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    # Return the cleaned text
    return text
Example #16
 def text_to_data(self, text, author):
     # remove newlines and numbers
     text = text.replace('\n', " ")
     text = re.sub(r'[0-9]+', '', text)
     sent_tokenize_list = sent_tokenize(text)
     total_arr = [(x, author) for x in sent_tokenize_list]
     vocab_count = len(set(text.split(' ')))
     return total_arr, vocab_count
Example #17
def clean_text(text):
    text = remove_urls(text)
    text = remove_users(text.replace('@ ', '@'))
    # text = remove_hash_tags(text)    
    text = re.sub(r'\s+', ' ', text).strip()    
    text = ''.join(filter(lambda x: x in printable, text))
    text = re.sub(r"([\w/'+$\s-]+|[^\w/'+$\s-]+)\s*", r"\1 ", text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()
Example #18
def replace_num(text):
    import re
    p1 = r"[0-9]+"  # regex matching runs of digits
    pattern1 = re.compile(p1)  # compile the regular expression
    res = pattern1.findall(text)
    res.sort(key=lambda i: len(i), reverse=True)
    if len(res) > 0:  # if any digit runs were found
        for i in range(len(res)):
            text = text.replace(res[i], '')  # strip each digit run from the text
    return text
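Usage example: each run of digits is removed, longest matches first so shorter runs are not left as fragments.

print(replace_num("room 101, floor 12"))  # -> 'room , floor '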
Example #19
def clean_contractions(text, mapping):
    '''
    credits to: https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings 
    credits to: https://www.kaggle.com/anebzt/quora-preprocessing-model
    input: current text, contraction mappings
    output: modify the comments to use the base form from contraction mapping
    '''
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")])
    return text
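Illustrative call with a tiny mapping; the lookup is per whitespace-separated token and case-sensitive.

small_mapping = {"don't": "do not", "it's": "it is"}
print(clean_contractions("It’s fine, we don’t mind", small_mapping))
# -> "It's fine, we do not mind"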
Example #20
def clean_text(text):
    # clean latex maths
    text = p.sub(' [ math ] ', text)
    # clean invisible chars
    text = p_space.sub(r'', text)
    # clean punctuations
    for punct in punct_mapping:
        if punct in text:
            text = text.replace(punct, punct_mapping[punct])
    tokens = []
    for token in text.split():
        # replace contractions & correct misspells
        token = mispell_dict.get(token.lower(), token)
        tokens.append(token)
    text = ' '.join(tokens)
    return text
Example #21
def clean_text(text):    
    #fixing apostrope
    text = text.replace("’", "'")
    #to lower
    text = text.lower()
    #remove \n
    text = re.sub("\\n","",text)

    # remove leaky elements like ip,user
    text = re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}","",text)
    
    
    #Split the sentences into words
    words = tokenizer.tokenize(text)
    # (') apostrophe replacement (i.e. you're --> you are)
    # (basic dictionary lookup: master dictionary present in a hidden block of code)
    words = [fill[word] if word in fill else word for word in words]
    #words = [lem.lemmatize(word, "v") for word in words]
    #words = [i for i in text.split() if i not in eng_stopwords]
    text = " ".join(words)
    return text
def preprocessing(path, label):
    path = './20_newsgroups/alt.atheism/'  # note: overrides the path argument
    files = os.listdir(path)
    punct = "/-'?!#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&' + '\n\r\t'
    columns = ["label", "text"]
    dataset = pd.DataFrame(index=range(1000), columns=columns)
    i = 0
    for file in files:  # iterate over the files in the folder

        if not os.path.isdir(file):  # only open entries that are not directories
            f = os.path.basename(file)
            print(f)  # print the file name
            paths = path + f
            infile = open(paths, encoding='latin-1', errors='ignore')  # 'ANSI' is not a valid Python codec; latin-1 accepts any byte
            text = infile.read()
            text = text.split('\n\n', maxsplit=1)[1]

            for p in punct:
                text = text.replace(p, ' ')
            text = re.sub('\s+', ' ', text)
            dataset.loc[i, 'text'] = text
            dataset.loc[i, 'label'] = label
            i += 1
    return dataset
Example #23
def remove_useless_symbols(text):
    symbols = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&' + '\n'
    for symbol in symbols:
        text = text.replace(symbol, ' ')
    return text
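Quick example: each listed symbol (including newlines) becomes a space, which can leave runs of whitespace to collapse later.

print(remove_useless_symbols("hello\nworld-wide (test)"))
# -> 'hello world wide  test '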
Example #24
def remove_hash_tags(text):
    hts = [part[1:] for part in text.split() if part.startswith('#')]
    for ht in hts:
        text = text.replace('#' + ht, '')  # drop the '#' together with the tag text
    return text
Example #25
def clean_special_chars(text, punct):
    for p in punct:
        text = text.replace(p, ' ')
    return text
Example #26
def remove_urls(text):
    urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
    for u in urls:
        text = text.replace(u, '')
    return text
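Usage example; matched URLs are deleted outright, leaving the surrounding whitespace in place.

print(remove_urls("see https://example.com/page?id=1 for details"))
# -> 'see  for details'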
Example #27
def remove_users(text):
    users = re.findall("@([a-z0-9_]+)", text, re.I)
    for u in users:
        text = text.replace('@' + u, '')
    return text
def clean_special_chars(text, punct, mapping):
    for p in mapping:
        text = text.replace(p, mapping[p])
    for p in punct:
        text = text.replace(p, f' {p} ')
    return text
def text_parse(text, remove_stopwords=False, stem_words=False):
    wiki_reg = r'https?://en.wikipedia.org/[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]'
    url_reg = r'https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]'
    url_reg2 = r'www.[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]'
    ip_reg = '\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
    WIKI_LINK = ' WIKILINKREPLACER '
    URL_LINK = ' URLLINKREPLACER '
    IP_LINK = ' IPLINKREPLACER '
    # normalize line endings to '. ' before clearing links; the dot is escaped
    # so only an optional trailing period is consumed, not the last character
    endline = re.compile(r'\.?\n', re.IGNORECASE)
    text = endline.sub('. ', text)

    c = re.findall(wiki_reg, text)
    for u in c:
        text = text.replace(u, WIKI_LINK)
    c = re.findall(url_reg, text)
    for u in c:
        text = text.replace(u, URL_LINK)
    c = re.findall(url_reg2, text)
    for u in c:
        text = text.replace(u, URL_LINK)
    c = re.findall(ip_reg, text)
    for u in c:
        text = text.replace(u, IP_LINK)

    bad_word_dict = get_bad_word_dict()
    # Regex to remove all Non-Alpha Numeric and space
    special_character_removal = re.compile(r'[^A-Za-z\d!?*\'.,; ]',
                                           re.IGNORECASE)
    # regex to replace all numerics
    replace_numbers = re.compile(r'\b\d+\b', re.IGNORECASE)
    text = text.lower().split()
    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    text = " ".join(text)
    # Remove Special Characters
    text = special_character_removal.sub(' ', text)
    for k, v in bad_word_dict.items():
        # bad_reg = re.compile('[!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n ]'+ re.escape(k) +'[!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n ]')
        bad_reg = re.compile('[\W]?' + re.escape(k) + '[\W]|[\W]' +
                             re.escape(k) + '[\W]?')
        text = bad_reg.sub(' ' + v + ' ', text)
        '''
        bad_reg = re.compile('[\W]'+ re.escape(k) +'[\W]?')
        text = bad_reg.sub(' '+ v, text)
        bad_reg = re.compile('[\W]?'+ re.escape(k) +'[\W]')
        text = bad_reg.sub(v + ' ', text)
        '''

    # Replace Numbers
    text = replace_numbers.sub('NUMBERREPLACER', text)
    text = text.split()
    text = " ".join(text)

    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    # rake parsing
    text = rake_parse(text)
    return text
Example #30
def clean_punct(text):
    text = str(text)
    for punct in PUNCTS:
        text = text.replace(punct, ' {} '.format(punct))

    return text
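PUNCTS is expected at module level; a hypothetical example:

PUNCTS = ['?', '!', ',']  # hypothetical stand-in for the module-level list
print(clean_punct("Really?! Yes, really."))
# -> 'Really ?  !  Yes ,  really.'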