Example #1
 def find_npnp_patterns(self, wordpair, doc):
     # TODO Make an easier switch from head and full NP?
     # For each sentence that contains both words of the pair, build a pattern:
     # if the NPs are adjacent, <X><Y> or <Y><X>; otherwise <X>...<Y> with the intervening text captured.
     # X and Y are NPs, but only their heads are matched here.
     # X = anaphor, Y = antecedent
     ret = []
     for s in doc.sentences:
         tokens = [tok.token for tok in s.words]
         if wordpair.anaphor.token in tokens and wordpair.antecedent.token in tokens:
             pattern_str1 = '(' + re.escape(wordpair.anaphor.token) + ')(.*)(' + re.escape(wordpair.antecedent.token) + ')'
             pattern_str2 = '(' + re.escape(wordpair.antecedent.token) + ')(.*)(' + re.escape(wordpair.anaphor.token) + ')'
             pattern1 = re.compile(pattern_str1)
             pattern2 = re.compile(pattern_str2)
             sent_str = ' '.join([w.token for w in s.words])
             match1 = re.search(pattern1, sent_str)
             match2 = re.search(pattern2, sent_str)
             if match1:
                 print(match1.group(1, 2, 3))
                 ret.append(NPNP('<X>(' + match1.group(2) + ')<Y>'))
             if match2:
                 print(match2.group(1, 2, 3))
                 ret.append(NPNP('<Y>(' + match2.group(2) + ')<X>'))
     return ret
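For reference, a minimal standalone sketch of the pattern construction above, using a hypothetical anaphor/antecedent pair and a toy sentence (no WordPair or Document objects involved):

import re

# hypothetical tokens: anaphor "company", antecedent "firm"
anaphor, antecedent = "company", "firm"
sentence = "the firm said the company will expand"

# same idea as above: try the antecedent-first ordering and wrap the
# intervening text; re.escape guards against regex metacharacters in tokens
m = re.search('(' + re.escape(antecedent) + ')(.*)(' + re.escape(anaphor) + ')', sentence)
if m:
    print('<Y>(' + m.group(2) + ')<X>')  # -> <Y>( said the )<X>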
Example #2
def get_emojis_pattern():
    try:
        # UCS-4
        emojis_pattern = re.compile(u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
    except re.error:
        # UCS-2
        emojis_pattern = re.compile(
            u'([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|([\uD83D][\uDE80-\uDEFF])')
    return emojis_pattern
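A minimal usage sketch for the helper above, assuming the goal is simply to strip emoji from a string:

text = u'good morning \U0001F600\U0001F680 world'
clean = get_emojis_pattern().sub('', text)
print(clean)  # code points in the covered ranges are removed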
Example #3
def get_emojis_pattern():
    #emojis_pattern= re.compile(u'['u'\U0001F300-\U0001F64F' u'\U0001F680-\U0001F6FF'u'\u2600-\u26FF\u2700-\u27BF]+',re.UNICODE)

    emojis_pattern = re.compile("["
               u"\U0001F600-\U0001F64F"  # emoticons
               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
               u"\U0001F680-\U0001F6FF"  # transport & map symbols
               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
               u"\U00002500-\U00002BEF"  # chinese char
              u"\U00002702-\U000027B0"
              u"\U00002702-\U000027B0"
             u"\U000024C2-\U0001F251"
             u"\U0001f926-\U0001f937"
                u"\U00010000-\U0010ffff"
                u"\u2640-\u2642"
                u"\u2600-\u2B55"
                u"\u200d"
                u"\u23cf"
                u"\u23e9"
                u"\u231a"
                u"\ufe0f"  # dingbats
                u"\u3030"
                "]+", re.UNICODE)

    return emojis_pattern
Example #4
    def tokenize(self):
        filename = r'../static/data/news_threeDays_crawling.csv'
        with open(filename, 'r', encoding='utf-8') as f:
            self.texts = f.read()
        texts = self.texts.replace('\n', '')
        tokenizer = re.compile(r'[^ㄱ-힣]')
        self.texts = tokenizer.sub(' ', texts)
        self.tokens = word_tokenize(self.texts)
        _arr = []
        for token in self.tokens:
            token_pos = self.okt.pos(token)
            _ = [txt_tag[0] for txt_tag in token_pos if txt_tag[1] == 'Noun']
            if len("".join(_)) > 1:
                _arr.append("".join(_))
        self.noun_tokens = " ".join(_arr)

        filename = r'../static/data/stopwords.txt'
        with open(filename, 'r', encoding='utf-8') as f:
            self.stopword = f.read().split()  # split into a list so the membership check below tests whole words
        print(type(self.stopword))
        self.noun_tokens = word_tokenize(self.noun_tokens)
        self.noun_tokens = [text for text in self.noun_tokens
                            if text not in self.stopword]
        keyword_list = self.noun_tokens
        self.freqtxt = pd.Series(dict(FreqDist(keyword_list))).sort_values(ascending=False)
        c2 = collections.Counter(keyword_list)
        a = c2.most_common(50)
        file = open('../static/data/news_threeDays_mining.csv', 'w', encoding='utf-8', newline='')
        print(file.name)
        csvfile = csv.writer(file)
        for row in a:
            csvfile.writerow(row)
        file.close()
        return file
Example #5
def text_cleaning(text):
    stop = stopwords.words('english') + [
        "would", "could", "also", "one", "ha", "can't", "it's", "i've", "u",
        "it", "us", "we", "t", "s"
    ]  # define stopwords list

    # cleaning
    if pd.isnull(text):
        return ""
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)  # remove URLs (not only at the start of the text)
    text = text.lower()  # to lowercase
    text = ''.join([i for i in text if not i.isdigit()])  # remove digits
    # text = text.replace('[^\w\s#@/:%.,_-]', '', flags=re.UNICODE)  # remove unicodes and emojis
    text = re.sub(r'(.)\1+', r'\1\1', text)  # collapse runs of a repeated character to at most two ("sooo" -> "soo")
    unis_emojis_pattern = re.compile(
        pattern="["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+",
        flags=re.UNICODE)
    text = unis_emojis_pattern.sub(r' ', text)
    text = re.sub(r'[^\w\s]', ' ', text)  # remove punctuations
    text = ' '.join(x for x in text.split()
                    if x not in stop)  # remove stopwords
    return text
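A quick way to exercise text_cleaning, assuming the NLTK stopword corpus has been downloaded and pandas/NLTK are imported as in the surrounding module:

sample = "Sooo happy!!! 😀 Check it out, we could not believe it 123"
print(text_cleaning(sample))
# digits, emoji, punctuation and the listed stopwords are stripped,
# and repeated characters are collapsed ("sooo" -> "soo")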
Example #6
def parseText(text):
    words = word_tokenize(text, language=QuickViewExtractor.LANGUAGE)

    regex = re.compile('[^a-zA-Z0-9]')
    words = [regex.sub('', w).lower() for w in words]
    words = [w for w in words if w]

    return words
Example #7
def get_negations_pattern():
    negations_ = {"isn't": "is not", "can't": "can not", "couldn't": "could not", "hasn't": "has not",
                  "hadn't": "had not", "won't": "will not",
                  "wouldn't": "would not", "aren't": "are not",
                  "haven't": "have not", "doesn't": "does not", "didn't": "did not",
                  "don't": "do not", "shouldn't": "should not", "wasn't": "was not", "weren't": "were not",
                  "mightn't": "might not",
                  "mustn't": "must not"}
    return re.compile(r'\b(' + '|'.join(negations_.keys()) + r')\b')
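get_negations_pattern only compiles the alternation; a hedged sketch of how such a pattern is usually paired with the same mapping to expand contractions (the dict is re-declared here because the function does not expose it):

import re

negations = {"isn't": "is not", "can't": "can not", "won't": "will not",
             "don't": "do not", "didn't": "did not"}  # subset of the mapping above
pattern = re.compile(r'\b(' + '|'.join(map(re.escape, negations)) + r')\b')

text = "i can't believe they didn't call"
print(pattern.sub(lambda m: negations[m.group(1)], text))
# -> i can not believe they did not call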
Example #8
 def parse_tweets(self, tweets):
     regex = re.compile(
         '[,#@`:)(\[\]\'%^~=&*+/;<>{}|!?._]|http[,#@`\-:)(\[\]\'%^=&_*+/;<>{}|.!?a-z]*'
     )
     named_entities_tree = ''
     for tweet in tweets:
         text = str.lower(str(tweet.processed_text))
         text = regex.sub('', text)
         current_tree = self.parse(pos_tag(word_tokenize(text)))
         named_entities_tree += str(current_tree)
     return named_entities_tree
Example #9
def get_emojis_pattern():
    return re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE)
Example #10
def preprocess_text(old_text):
    """preprocess given text and return str"""
    new_text = re.sub(r'https?:\/\/.*[\r\n]*', '', old_text)  # remove URL before
    new_text = re.sub(r'<[^>]+>', '', new_text)  # remove html (line breaks etc.)
    new_text = re.sub(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", '',
                      new_text)  # remove email addresses (anchors dropped so embedded addresses are caught)
    new_text = re.sub(r'#', '', new_text)  # remove hash sign from hashtags, hashtag itself remains
    new_text = re.sub(r'@[^\s]+', '', new_text)  # deletes @-mentions  # TODO: I think it has no influence
    new_text = re.sub("[^a-zA-Z]", " ", new_text)  # remove remaining special characters
    words = new_text.lower().split()  # do lowercase, split into words
    #words = [word for word in words if not word in stopwords_english]  # remove stop words -> makes results worse
    #words = [stemmer.stem(word) for word in words]  # stemming -> leads to worse results
    #words = [lemma.lemmatize(word) for word in words]  # lemmatization -> leads to worse results
    # join words list back to one tweet
    return " ".join(words)
Example #11
    def text_tokenizer(self, text: str):
        text = re.sub(r'\S*@\S*\s?', '', text)  # remove emails
        text = re.sub(r'^https?://.*[\r\n]*', '', text,
                      flags=re.MULTILINE)  # remove websites

        words = word_tokenize(text, 'english')
        words = list(filter(lambda word: len(word) >= self.min_length, words))

        # text = (list(map(lambda x: self.stemmer.stem(x), words)))
        tokens = (list(map(lambda x: self.lemmatizer.lemmatize(x), words)))
        p = re.compile('[a-zA-Z]+')
        filtered_tokens = list(
            filter(
                lambda token: p.match(token) and len(token) >= self.min_length,
                tokens))

        return filtered_tokens
Example #12
def preprocess_tweet(text):
    new_tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', text)  # remove URL
    new_tweet = re.sub(r'<[^>]+>', '',
                       new_tweet)  # remove html (line breaks etc.)
    new_tweet = re.sub(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", '',
                       new_tweet)  # remove email addresses (anchors dropped so embedded addresses are caught)
    new_tweet = re.sub(r'#', '', new_tweet)  # remove hash sign from hashtags
    new_tweet = re.sub("[^a-zA-Z]", " ",
                       new_tweet)  # remove remaining special characters
    words = new_tweet.lower().split()  # do lowercase, split into words
    words = [word for word in words
             if not word in stopwords_english]  # remove stop words
    words = [stemmer.stem(word) for word in words]  # stemming
    words = [lemma.lemmatize(word) for word in words]  # lemmatization
    # join words list back to one tweet
    return " ".join(words)
Example #13
def get_mentions_pattern():
    return re.compile(r'@\w*')
Example #14
def get_twitter_reserved_words_pattern():
    return re.compile(r'(RT|rt|FAV|fav|VIA|via)')
Example #15
def get_blank_spaces_pattern():
    return re.compile(r'\s{2,}|\t')
Example #16
def get_single_letter_words_pattern():
    return re.compile(r'(?<![\w\-])\w(?![\w\-])')
Example #17
def get_hashtags_pattern():
    return re.compile(r'#\w*')
Example #18
def get_hashtags_pattern():
    return re.compile(r'#([^\s]+)')
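Taken together, the small pattern helpers in Examples #13 to #18 are typically chained into a single cleaning pass; a minimal sketch, assuming the functions are defined as shown above:

tweet = 'RT @user check #python now a b   great'
cleaned = tweet
for pattern, repl in [
        (get_mentions_pattern(), ''),                 # strip @-mentions
        (get_hashtags_pattern(), ''),                 # strip hashtags
        (get_twitter_reserved_words_pattern(), ''),   # strip RT / FAV / VIA
        (get_single_letter_words_pattern(), ''),      # strip stray single letters
        (get_blank_spaces_pattern(), ' ')]:           # collapse runs of whitespace
    cleaned = pattern.sub(repl, cleaned)
print(cleaned.strip())  # -> 'check now great'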
Example #19
# @end AccessText

# @begin PreprocessFile @desc To preprocess the text data
# @in stopwords
# @in regexr @as regular_expression
# @in TextRead
# @out dictionary
# @out train_corpus
# @out test_corpus
en_stopwords = set(stopwords.words('english'))
de_stopwords = set(stopwords.words('german'))
fr_stopwords = set(stopwords.words('french'))
stopwords = en_stopwords | de_stopwords | fr_stopwords

regexr = re.compile(r'([a-z])\w+')

file_tokens = (word.lower() for word in text_file)
clean_file = [word for word in file_tokens if word not in stopwords]
# calculate word frequencies
word_frequency = defaultdict(int)
for text in text_file:
    for token in text:
        word_frequency[token] += 1
# only keep words that occur more than once
processed_corpus = [[token for token in text if word_frequency[token] > 1]
                    for text in text_file]
# associate each word in the corpus with a unique integer ID
dictionary = corpora.Dictionary(processed_corpus)
corpus = [dictionary.doc2bow(text) for text in processed_corpus]
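The Dictionary/doc2bow step above maps every token to an integer id and every document to a list of (id, count) pairs; a small sketch of what that looks like on a toy corpus, assuming gensim is installed:

from gensim import corpora

toy_corpus = [["cat", "sat", "mat"], ["cat", "ran"], ["dog", "sat"]]
dictionary = corpora.Dictionary(toy_corpus)
print(dictionary.token2id)                         # e.g. {'cat': 0, 'mat': 1, 'sat': 2, ...}
print(dictionary.doc2bow(["cat", "cat", "dog"]))   # [(cat_id, 2), (dog_id, 1)]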
Example #20
 def remove_numbers(self):
     """remove any numbers"""
     pattern = re.compile(r'[0-9]+')
     self.text = re.sub(pattern=pattern, repl='', string=self.text)
     return self
Example #21
import string

import nltk
from nltk.corpus import stopwords
from nltk import re

MIN_YEAR = 1900
MAX_YEAR = 2100

emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE)
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@.""-,`'


def get_url_patern():
    return re.compile(
        r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))'
        r'[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})')


def get_emojis_pattern():
    try:
        # UCS-4
        emojis_pattern = re.compile(
            u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
    except re.error:
        # UCS-2
        emojis_pattern = re.compile(
            u'([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|([\uD83D][\uDE80-\uDEFF])')
    return emojis_pattern
Example #22
def tokenize(s):
    return tokens_re.findall(s)


def preprocess(s):
    tokens = tokenize(s)
    return tokens


regex_str = [
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
    r'(?:[\w_]+)',  # other words
    r'(?:\S)'  # anything else
]
tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')',
                       re.VERBOSE | re.IGNORECASE)
print("Token compilation completed")

punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + [
    'rt', 'via', '…', 'trump', 'donaldtrump', 'therealdonaldtrump',
    'president', "trump's", 'donald'
]
print("Stopword list construction completed")
Example #23
 def remove_punctuation(self):
     """remove any punctutations"""
     pattern = re.compile(r'[^\w\s]')
     self.text = re.sub(pattern=pattern, repl='', string=self.text)
     return self
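remove_numbers and remove_punctuation above (and remove_twitter_handle in Example #28 below) each end with return self, which is what allows the calls to be chained fluently; a minimal wrapper illustrating the idea (the class name and constructor are hypothetical):

import re

class TextCleaner:
    """Hypothetical holder for the chained cleaning methods shown above."""
    def __init__(self, text):
        self.text = text

    def remove_numbers(self):
        self.text = re.sub(r'[0-9]+', '', self.text)
        return self

    def remove_punctuation(self):
        self.text = re.sub(r'[^\w\s]', '', self.text)
        return self

print(TextCleaner("3 cats, 2 dogs!").remove_numbers().remove_punctuation().text)
# -> " cats  dogs"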
Example #24
def clean_text(text):
    import nltk
    nltk.download('stopwords')
    nltk.download('wordnet')

    # split into words by white space
    words = text.split()
    # remove punctuation from each word
    import string
    table = str.maketrans('', '', string.punctuation)
    text = [w.translate(table) for w in words]
    text = " ".join(text)
    #print(stripped[:100])

    ## Remove puncuation
    #text = text.translate(string.punctuation)

    ########################################################################################
    # replace urls
    re_url = re.compile(
        r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*",
        re.MULTILINE | re.UNICODE)
    # replace ips
    re_ip = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

    # replace URLs
    text = re_url.sub("URL", text)

    # replace IPs
    text = re_ip.sub("IPADDRESS", text)
    ####################################################################

    ## Convert words to lower case and split them
    text = text.lower().split()

    ## Remove stop words
    #stops = set(stopwords.words("english"))
    #text = [w for w in text if not w in stops and len(w) >= 3]

    text = " ".join(text)
    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    ## Stemming
    text = text.split()
    #stemmer = SnowballStemmer('english')
    #stemmed_words = [stemmer.stem(word) for word in text]
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
    text = " ".join(lemmatized_words)

    return text
Example #25
 def __init__(self, pattern):
     Pattern.__init__(self, pattern)
     self.is_anaphor_first = pattern.startswith(
         '<X>')  # Track whether anaphor is first
     self.regex_pattern = re.compile(
         self.pattern.replace('<X>', r'(\S+)').replace('<Y>', r'(\S+)'))
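Stripped of the Pattern base class, the constructor above simply turns a template such as '<X> acquired <Y>' into a capturing regex; a minimal sketch with a hypothetical template and sentence:

import re

template = '<X> acquired <Y>'  # hypothetical NPNP-style template
regex = re.compile(template.replace('<X>', r'(\S+)').replace('<Y>', r'(\S+)'))
m = regex.search('Yesterday Acme acquired Globex outright')
if m:
    print(m.group(1), '->', m.group(2))  # -> Acme -> Globex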
Example #26
def get_url_patern():
    return re.compile(
        r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))'
        r'[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})')
Example #27
plt.hist(non_spam_emph, label='Not spam')
plt.legend(loc='upper right')
plt.title('Emphasized words count - Spam VS not-spam')
fig3.show()

print('\nURLs exists in %s/%s of the spam docs' % (sum(i > 0 for i in spam_urls), spam_docs_count))
print('URLs exists in %s/%s of the non-spam docs' % (sum(i > 0 for i in non_spam_urls), non_spam_docs_count))

# Pre Processing data
lengths = 0
stem_it = True
sw = clean_sw()
max_features = 800
test_size = .2
# Clean repeating chars - looooooooooooooooooove -> love
pattern = re.compile(r"(.)\1{2,}", re.DOTALL)

for idx, doc in enumerate(x_train_):
    doc = strip_url(doc)
    doc = is_long_number(doc)
    doc = pattern.sub(r"\1", doc)
    doc = convert_emphesize(doc)
    tokens = [english_stemmer(w) for w in text_to_word_sequence(doc, filters=filters, lower=True)]
    x_train_[idx] = [w for w in tokens if w not in sw]
    lengths += len(x_train_[idx])

max_len = round(lengths / idx)
# Maybe I should get the average length of a spam document VS a non spam document
print('Average document length: %s\n' % max_len)

x_train, x_test, y_train, y_test = train_test_split(x_train_, y_train, test_size=test_size)
Example #28
 def remove_twitter_handle(self):
     """remove twitter handles"""
     pattern = re.compile(r'RT')
     self.text = re.sub(pattern=pattern, repl='', string=self.text)
     return self
Example #29
def get_twitter_reserved_words_pattern():
    return re.compile(r'(RT|FAV|VIA)')
Example #30
def get_arabic_pattern():
    #return re.compile('[\u0627-\u064a]')
    return re.compile('[\u0600-\u06ff\u0750-\u077f\u08a0-\u08ff]+')
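A small check of the Arabic pattern above; the three ranges cover the basic Arabic block plus the Arabic Supplement and Arabic Extended-A blocks:

mixed = u'hello مرحبا world سلام'
print(get_arabic_pattern().findall(mixed))  # -> ['مرحبا', 'سلام']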