def find_npnp_patterns(self, wordpair, doc):
    # TODO Make an easier switch from head and full NP?
    # check each sentence and find both words in word pair.
    # if they are both in the sentence, then create a pattern.
    # Creating pattern: if adjacent, <X><Y> or <Y><X>, if not then <X>blahblah<Y> where X and Y are NPs.
    # X and Y are NPs but we want to extract the heads of X and Y.
    # X = anaphor, Y = antecedent
    ret = []
    for s in doc.sentences:
        tokens = [tok.token for tok in s.words]
        if wordpair.anaphor.token in tokens and wordpair.antecedent.token in tokens:
            pattern_str1 = '(' + wordpair.anaphor.token + ')(.*)(' + wordpair.antecedent.token + ')'
            pattern_str2 = '(' + wordpair.antecedent.token + ')(.*)(' + wordpair.anaphor.token + ')'
            pattern1 = re.compile(pattern_str1)
            pattern2 = re.compile(pattern_str2)
            sent_str = ' '.join([w.token for w in s.words])
            match1 = re.search(pattern1, sent_str)
            match2 = re.search(pattern2, sent_str)
            if match1:
                print(match1.group(1, 2, 3))
                ret.append(NPNP('<X>(' + match1.group(2) + ')<Y>'))
            if match2:
                print(match2.group(1, 2, 3))
                ret.append(NPNP('<Y>(' + match2.group(2) + ')<X>'))
    return ret
def get_emojis_pattern():
    try:
        # UCS-4
        emojis_pattern = re.compile(
            u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
    except re.error:
        # UCS-2
        emojis_pattern = re.compile(
            u'([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|([\uD83D][\uDE80-\uDEFF])')
    return emojis_pattern
def get_emojis_pattern():
    # emojis_pattern = re.compile(u'[' u'\U0001F300-\U0001F64F' u'\U0001F680-\U0001F6FF' u'\u2600-\u26FF\u2700-\u27BF]+', re.UNICODE)
    emojis_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # box drawing, geometric shapes & misc symbols
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # variation selector
        u"\u3030"
        "]+",
        re.UNICODE)
    return emojis_pattern
def tokenize(self):
    filename = r'../static/data/news_threeDays_crawling.csv'
    with open(filename, 'r', encoding='utf-8') as f:
        self.texts = f.read()
    texts = self.texts.replace('\n', '')
    tokenizer = re.compile(r'[^ㄱ-힣]')  # keep Hangul characters only
    self.texts = tokenizer.sub(' ', texts)
    self.tokens = word_tokenize(self.texts)
    _arr = []
    for token in self.tokens:
        token_pos = self.okt.pos(token)
        _ = [txt_tag[0] for txt_tag in token_pos if txt_tag[1] == 'Noun']
        if len("".join(_)) > 1:
            _arr.append("".join(_))
    self.noun_tokens = " ".join(_arr)
    filename = r'../static/data/stopwords.txt'
    with open(filename, 'r', encoding='utf-8') as f:
        self.stopword = f.read().split()  # split the stopword file into a list of words
    self.noun_tokens = word_tokenize(self.noun_tokens)
    self.noun_tokens = [text for text in self.noun_tokens if text not in self.stopword]
    keyword_list = self.noun_tokens
    self.freqtxt = pd.Series(dict(FreqDist(keyword_list))).sort_values(ascending=False)
    c2 = collections.Counter(keyword_list)
    a = c2.most_common(50)
    with open('../static/data/news_threeDays_mining.csv', 'w', encoding='utf-8', newline='') as file:
        csvfile = csv.writer(file)
        for row in a:
            csvfile.writerow(row)
    return file
def text_cleaning(text):
    # define stopwords list
    stop = stopwords.words('english') + [
        "would", "could", "also", "one", "ha", "can't", "it's", "i've",
        "u", "it", "us", "we", "t", "s"
    ]
    # cleaning
    if pd.isnull(text):
        return ""
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text)  # remove URLs
    text = text.lower()  # to lowercase
    text = ''.join([i for i in text if not i.isdigit()])  # remove digits
    # text = text.replace('[^\w\s#@/:%.,_-]', '', flags=re.UNICODE)  # remove unicodes and emojis
    text = re.sub(r'(.)\1+', r'\1\1', text)  # collapse repeated characters to at most two
    unis_emojis_pattern = re.compile(
        pattern="["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+",
        flags=re.UNICODE)
    text = unis_emojis_pattern.sub(r' ', text)
    text = re.sub(r'[^\w\s]', ' ', text)  # remove punctuation
    text = ' '.join(x for x in text.split() if x not in stop)  # remove stopwords
    return text
def parseText(text):
    words = word_tokenize(text, language=QuickViewExtractor.LANGUAGE)
    regex = re.compile('[^a-zA-Z0-9]')
    words = [regex.sub('', w).lower() for w in words]
    words = [w for w in words if w]
    return words
def get_negations_pattern():
    negations_ = {"isn't": "is not", "can't": "can not", "couldn't": "could not",
                  "hasn't": "has not", "hadn't": "had not", "won't": "will not",
                  "wouldn't": "would not", "aren't": "are not", "haven't": "have not",
                  "doesn't": "does not", "didn't": "did not", "don't": "do not",
                  "shouldn't": "should not", "wasn't": "was not", "weren't": "were not",
                  "mightn't": "might not", "mustn't": "must not"}
    return re.compile(r'\b(' + '|'.join(negations_.keys()) + r')\b')
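# The function above compiles a pattern over the contraction keys but discards the expansion
# values. A minimal usage sketch (the expand_negations helper and sample text are hypothetical,
# not part of the source) pairs the same dictionary with the pattern to expand the contractions:
import re

def expand_negations(text):
    negations_ = {"isn't": "is not", "can't": "can not", "don't": "do not"}  # abbreviated mapping
    pattern = re.compile(r'\b(' + '|'.join(negations_.keys()) + r')\b')
    # replace each matched contraction with its expansion from the dict
    return pattern.sub(lambda m: negations_[m.group(1)], text)

print(expand_negations("i can't believe it, don't go"))  # -> "i can not believe it, do not go"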
def parse_tweets(self, tweets):
    regex = re.compile(
        '[,#@`:)(\[\]\'%^~=&*+/;<>{}|!?._]|http[,#@`\-:)(\[\]\'%^=&_*+/;<>{}|.!?a-z]*'
    )
    named_entities_tree = ''
    for tweet in tweets:
        text = str.lower(str(tweet.processed_text))
        text = regex.sub('', text)
        current_tree = self.parse(pos_tag(word_tokenize(text)))
        named_entities_tree += str(current_tree)
    return named_entities_tree
def get_emojis_pattern():
    return re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE)
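# Minimal usage sketch, assuming the get_emojis_pattern() above is in scope; the sample text is made up.
emoji_re = get_emojis_pattern()
print(emoji_re.sub('', u"good trip \U0001F680\U0001F600!"))  # emojis stripped -> "good trip !"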
def preprocess_text(old_text):
    """preprocess given text and return str"""
    new_text = re.sub(r'https?:\/\/.*[\r\n]*', '', old_text)  # remove URLs first
    new_text = re.sub(r'<[^>]+>', '', new_text)  # remove html (line breaks etc.)
    new_text = re.sub(re.compile(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)"), '', new_text)  # remove email
    new_text = re.sub(r'#', '', new_text)  # remove hash sign from hashtags, hashtag itself remains
    new_text = re.sub(r'@[^\s]+', '', new_text)  # delete @-mentions TODO I think it has no influence
    new_text = re.sub("[^a-zA-Z]", " ", new_text)  # remove remaining special characters
    words = new_text.lower().split()  # lowercase, split into words
    # words = [word for word in words if not word in stopwords_english]  # remove stop words -> makes results worse
    # words = [stemmer.stem(word) for word in words]  # stemming -> leads to worse results
    # words = [lemma.lemmatize(word) for word in words]  # lemmatization -> leads to worse results
    # join words list back to one tweet
    return " ".join(words)
def text_tokenizer(self, text: str):
    text = re.sub(r'\S*@\S*\s?', '', text)  # remove emails
    text = re.sub(r'^https?://.*[\r\n]*', '', text, flags=re.MULTILINE)  # remove websites
    words = word_tokenize(text, 'english')
    words = list(filter(lambda word: len(word) >= self.min_length, words))
    # text = (list(map(lambda x: self.stemmer.stem(x), words)))
    tokens = list(map(lambda x: self.lemmatizer.lemmatize(x), words))
    p = re.compile('[a-zA-Z]+')
    filtered_tokens = list(
        filter(
            lambda token: p.match(token) and len(token) >= self.min_length,
            tokens))
    return filtered_tokens
def preprocess_tweet(text):
    new_tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', text)  # remove URL
    new_tweet = re.sub(r'<[^>]+>', '', new_tweet)  # remove html (line breaks etc.)
    new_tweet = re.sub(re.compile(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)"), '', new_tweet)  # remove email
    new_tweet = re.sub(r'#', '', new_tweet)  # remove hash sign from hashtags
    new_tweet = re.sub("[^a-zA-Z]", " ", new_tweet)  # remove remaining special characters
    words = new_tweet.lower().split()  # lowercase, split into words
    words = [word for word in words if not word in stopwords_english]  # remove stop words
    words = [stemmer.stem(word) for word in words]  # stemming
    words = [lemma.lemmatize(word) for word in words]  # lemmatization
    # join words list back to one tweet
    return " ".join(words)
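# Hypothetical setup sketch for the module-level names preprocess_tweet relies on
# (stopwords_english, stemmer, lemma); the originals are not shown in this snippet.
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

stopwords_english = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemma = WordNetLemmatizer()

# Made-up example call: the URL (and the rest of that line) is stripped first,
# then stop words are removed and the remaining words are stemmed and lemmatized.
print(preprocess_tweet("Loving the new #NLP models! https://example.com @someone"))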
def get_mentions_pattern():
    return re.compile(r'@\w*')


def get_twitter_reserved_words_pattern():
    return re.compile(r'(RT|rt|FAV|fav|VIA|via)')


def get_blank_spaces_pattern():
    return re.compile(r'\s{2,}|\t')


def get_single_letter_words_pattern():
    return re.compile(r'(?<![\w\-])\w(?![\w\-])')


def get_hashtags_pattern():
    return re.compile(r'#\w*')


def get_hashtags_pattern():
    return re.compile(r'#([^\s]+)')
# @end AccessText

# @begin PreprocessFile @desc To preprocess the text data
# @in stopwords
# @in regexr @as regular_expression
# @in TextRead
# @out dictionary
# @out train_corpus
# @out test_corpus

en_stopwords = set(stopwords.words('english'))
de_stopwords = set(stopwords.words('german'))
fr_stopwords = set(stopwords.words('french'))
stopwords = en_stopwords | de_stopwords | fr_stopwords

regexr = re.compile(r'([a-z])\w+')

file_tokens = (word.lower() for word in text_file)
clean_file = [word for word in file_tokens if word not in stopwords]

# calculate word frequencies
word_frequency = defaultdict(int)
for text in text_file:
    for token in text:
        word_frequency[token] += 1

# only keep words that occur more than once
processed_corpus = [[token for token in text if word_frequency[token] > 1]
                    for text in text_file]

# associate each word in the corpus with a unique integer ID
dictionary = corpora.Dictionary(processed_corpus)
corpus = [dictionary.doc2bow(text) for text in processed_corpus]
def remove_numbers(self):
    """remove any numbers"""
    pattern = re.compile(r'[0-9]+')
    self.text = re.sub(pattern=pattern, repl='', string=self.text)
    return self
import string

import nltk
from nltk.corpus import stopwords
from nltk import re

MIN_YEAR = 1900
MAX_YEAR = 2100

emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE)

my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@.""-,`'


def get_url_patern():
    return re.compile(
        r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))'
        r'[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})')


def get_emojis_pattern():
    try:
        # UCS-4
        emojis_pattern = re.compile(
            u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
    except re.error:
        # UCS-2
        emojis_pattern = re.compile(
            u'([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|([\uD83D][\uDE80-\uDEFF])')
    return emojis_pattern
def tokenize(s):
    return tokens_re.findall(s)


def preprocess(s):
    tokens = tokenize(s)
    return tokens


regex_str = [
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
    r'(?:[\w_]+)',  # other words
    r'(?:\S)'  # anything else
]

tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)
print("Token compilation completed")

punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + [
    'rt', 'via', '…', 'trump', 'donaldtrump', 'therealdonaldtrump',
    'president', "trump's", 'donald'
]
print("Stopword list construction completed")
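# Hypothetical smoke test for tokenize() above; the tweet text is made up.
sample = "RT @user: Check https://example.com #NLP 42"
print(tokenize(sample))
# -> the mention, URL, hashtag, plain words and the number come out as separate tokens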
def remove_punctuation(self):
    """remove any punctuation"""
    pattern = re.compile(r'[^\w\s]')
    self.text = re.sub(pattern=pattern, repl='', string=self.text)
    return self
def clean_text(text):
    import nltk
    nltk.download('stopwords')
    nltk.download('wordnet')

    # split into words by white space
    words = text.split()

    # remove punctuation from each word
    import string
    table = str.maketrans('', '', string.punctuation)
    text = [w.translate(table) for w in words]
    text = " ".join(text)
    # print(stripped[:100])
    ## Remove punctuation
    # text = text.translate(string.punctuation)
    ########################################################################################
    # replace urls
    re_url = re.compile(
        r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*",
        re.MULTILINE | re.UNICODE)
    # replace ips
    re_ip = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

    # replace URLs
    text = re_url.sub("URL", text)
    # replace IPs
    text = re_ip.sub("IPADDRESS", text)

    ####################################################################
    ## Convert words to lower case and split them
    text = text.lower().split()

    ## Remove stop words
    # stops = set(stopwords.words("english"))
    # text = [w for w in text if not w in stops and len(w) >= 3]
    text = " ".join(text)

    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    ## Stemming
    text = text.split()
    # stemmer = SnowballStemmer('english')
    # stemmed_words = [stemmer.stem(word) for word in text]
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
    text = " ".join(lemmatized_words)

    return text
def __init__(self, pattern):
    Pattern.__init__(self, pattern)
    # Track whether anaphor is first
    self.is_anaphor_first = pattern.startswith('<X>')
    self.regex_pattern = re.compile(
        self.pattern.replace('<X>', r'(\S+)').replace('<Y>', r'(\S+)'))
def get_url_patern():
    return re.compile(
        r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))'
        r'[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})')
plt.hist(non_spam_emph, label='Not spam')
plt.legend(loc='upper right')
plt.title('Emphasized words count - Spam VS not-spam')
fig3.show()

print('\nURLs exist in %s/%s of the spam docs' % (sum(i > 0 for i in spam_urls), spam_docs_count))
print('URLs exist in %s/%s of the non-spam docs' % (sum(i > 0 for i in non_spam_urls), non_spam_docs_count))

# Pre-processing data
lengths = 0
stem_it = True
sw = clean_sw()
max_features = 800
test_size = .2

# Clean repeating chars - looooooooooooooooooove -> love
pattern = re.compile(r"(.)\1{2,}", re.DOTALL)

for idx, doc in enumerate(x_train_):
    doc = strip_url(doc)
    doc = is_long_number(doc)
    doc = pattern.sub(r"\1", doc)
    doc = convert_emphesize(doc)
    tokens = [english_stemmer(w) for w in text_to_word_sequence(doc, filters=filters, lower=True)]
    x_train_[idx] = [w for w in tokens if w not in sw]
    lengths += len(x_train_[idx])

max_len = round(lengths / idx)  # Maybe I should get the average length of a spam document VS a non-spam document
print('Average document length: %s\n' % max_len)

x_train, x_test, y_train, y_test = train_test_split(x_train_, y_train,
def remove_twitter_handle(self):
    """remove twitter handles"""
    pattern = re.compile(r'RT')  # note: this only strips the literal retweet marker "RT"
    self.text = re.sub(pattern=pattern, repl='', string=self.text)
    return self
def get_twitter_reserved_words_pattern():
    return re.compile(r'(RT|FAV|VIA)')
def get_arabic_pattern():
    # return re.compile('[\u0627-\u064a]')
    return re.compile('[\u0600-\u06ff\u0750-\u077f\u08a0-\u08ff]+')