Example #1
from nltk.tokenize import RegexpTokenizer


def get_list(text):  # Tokenize the string and lowercase every token
    tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')  # words, dollar amounts, or any other non-space run
    tokens = tokenizer.tokenize(str(text))
    return [token.lower() for token in tokens]
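A quick sanity check of the pattern above (hypothetical input): dollar amounts survive as single tokens, while trailing punctuation becomes its own token.

print(get_list("Prices rose to $12.50 today!"))
# expected: ['prices', 'rose', 'to', '$12.50', 'today', '!']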
Example #2
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer


def filter_words(text_file):
    text = read_text(text_file)  # read_text: helper defined elsewhere in the project
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    pos_list = nltk.pos_tag(tokens)  # list of (word, POS tag) tuples
    filtered_words = [
        w for w in pos_list if w[0] not in stopwords.words('english')
    ]
    return filtered_words
Example #3
    def tokenization(self):
        """ Tokenize the contents of the posts and remove the strings that contains punctuations,
            numbers or only single letter

        :return: a dataframe which each row becomes list of tokens
        """

        tqdm.pandas()
        tokenizer = RegexpTokenizer(r'[a-zA-Z]{2,}')
        tokens_df = self.text_df.progress_apply(
            lambda x: tokenizer.tokenize(x.lower()))
        return tokens_df
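The same pattern can be illustrated outside the class on a throwaway pandas Series (hypothetical data; plain .apply is used instead of progress_apply so tqdm is not required): single letters, digits and punctuation are dropped.

import pandas as pd
from nltk.tokenize import RegexpTokenizer

text_df = pd.Series(["I have 2 cats!", "A well-known fact."])
tokenizer = RegexpTokenizer(r'[a-zA-Z]{2,}')
print(text_df.apply(lambda x: tokenizer.tokenize(x.lower())).tolist())
# expected: [['have', 'cats'], ['well', 'known', 'fact']]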
Example #4
import pickle

from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer


def inference(model_dir, text):
    tokenizer = RegexpTokenizer(r'[a-zA-Z]{2,}')
    tokens = tokenizer.tokenize(text.lower())
    text = ' '.join(tokens)

    # load the fitted tf-idf vectorizer
    with open(model_dir + "tfidf_transformer.pkl", 'rb') as f:
        transformer = pickle.load(f)

    # create a new TfidfVectorizer with the old vocabulary
    vectorizer = TfidfVectorizer(stop_words='english',
                                 ngram_range=(1, 2),
                                 lowercase=True,
                                 vocabulary=transformer.vocabulary_)

    vec = vectorizer.fit_transform([text])

    # load the trained classifier from file
    with open(model_dir + "bigram_SVM.dat", 'rb') as f:
        model = pickle.load(f)
    y_pred = model.predict(vec)
    return y_pred[0]  # 1: STRESS
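A hedged usage sketch: the directory name and input text below are placeholders, and model_dir must already contain the pickled tfidf_transformer.pkl and bigram_SVM.dat files referenced above.

label = inference("models/", "I can't sleep and everything feels overwhelming")
print("stress" if label == 1 else "no stress")  # per the comment above, 1 means STRESS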
Example #5
lines = file.readlines()

# skip the front matter before the "PROLOGUE" heading
for index, line in enumerate(lines):
  if "PROLOGUE" in line:
    lines = lines[index+1:]

sentences = []
for line in lines:
  if len(line) > 1:
    for sentence in sent_tokenize(line):
      sentences.append(sentence)

#print(sentences)

stop_words = set(stopwords.words("english"))
tokenizer = RegexpTokenizer(r'\w+')
filtered_sentences = []

for sentence in sentences:
  sent = []
  for word in word_tokenize(" ".join(tokenizer.tokenize(str(sentence)))):
    if word not in stop_words:
      sent.append(word.lower())
  filtered_sentences.append(sent)  # keep token lists: Word2Vec expects lists of words, not joined strings

#print(filtered_sentences[:100])

model = gensim.models.Word2Vec(filtered_sentences)
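Once trained on the token lists, the model can be queried for nearest neighbours; a minimal sketch assuming gensim 4.x and that the chosen word actually occurs often enough in the corpus (words below Word2Vec's min_count, 5 by default, get no vector).

if 'king' in model.wv:
  print(model.wv.most_similar('king', topn=5))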
Example #6
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import RegexpTokenizer

stop_words = stopwords.words("english")
tokenizer = RegexpTokenizer(r"\w+")


def semantic_info(word, lemma, context):
    #context = set(w for w in context
    #              if w != word
    #              and w != lemma)

    context.discard(word)
    context.discard(lemma)

    return simplified_lesk(lemma, context)


def synsets_for(word):
    return wn.synsets(word)


def remove_stopwords(sentence):
    return set(w for w in sentence if w not in stop_words)


def simplified_lesk(lemma, context):
    synsets = synsets_for(lemma)

    best_sense = synsets[0]
Example #7
f = open('data.txt')
documents = f.readlines()
f.close()

# stoplist = set('for a of the and to in'.split())
# texts = [[word for word in document.lower().split() if word not in stoplist]
#          for document in documents]

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from string import punctuation

tokenizer = RegexpTokenizer(r'\w+')
custom_set = set(stopwords.words('english') + list(punctuation))

texts = []
for sentence in documents:
    sentence = sentence.lower()
    tokens = tokenizer.tokenize(sentence)
    filtered_words = [w for w in tokens if w not in custom_set]
    texts.append(filtered_words)

from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1] for text in texts]

from pprint import pprint  # pretty-printer
from gensim import corpora
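The snippet stops right after these imports; a minimal sketch of the usual next step (building an id-to-token Dictionary and a bag-of-words corpus), under the assumption that this is where the code was heading:

dictionary = corpora.Dictionary(texts)                  # map each token to an integer id
corpus = [dictionary.doc2bow(text) for text in texts]   # bag-of-words vectors per document
pprint(corpus[:2])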
Example #8
'''
Created on 06/05/2013

@author: Rodrigo
'''

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

english_stops = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\s+', gaps=True)  # split on whitespace instead of matching tokens
print([w for w in tokenizer.tokenize("This is not a common book") if w not in english_stops])
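For comparison, gaps=True makes the pattern act as a delimiter rather than as a token matcher; a quick check with the corrected import above (outputs shown as comments):

splitter = RegexpTokenizer(r'\s+', gaps=True)  # pattern marks the gaps between tokens
matcher = RegexpTokenizer(r'\w+')              # pattern marks the tokens themselves
print(splitter.tokenize("This is not a common book"))  # ['This', 'is', 'not', 'a', 'common', 'book']
print(matcher.tokenize("Don't stop!"))                 # ['Don', 't', 'stop']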
Example #9
from docopt import docopt

from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer

from languagemodeling.ngram import NGram, AddOneNGram, InterpolatedNGram


if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the data
    pattern = r'''(?ix)    # set flag to allow verbose regexps
          (?:sr\.|sra\.)      # Spanish honorifics, e.g. sr., sra.
        | (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
        | \w+(?:-\w+)*        # words with optional internal hyphens
        | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
        | \.\.\.            # ellipsis
        | [][.,;"'?():-_`]  # these are separate tokens; includes ], [
    '''
    tokenizer = RegexpTokenizer(pattern)

    root = '.'
    corpus = PlaintextCorpusReader(root, r'books\.txt', word_tokenizer=tokenizer)

    sents = corpus.sents()

    # train the model
    n = int(opts['-n'])

    if opts['-m'] == 'addone':
        model = AddOneNGram(n, sents)
    elif opts['-m'] == 'inter':
        gamma = opts['-g']
        if gamma is None:
            model = InterpolatedNGram(n, sents, None, False)