Example #1
from nltk.tokenize import RegexpTokenizer


def get_list(text):
    """Tokenize the text and lowercase (normalize) each token."""
    tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
    tokens = tokenizer.tokenize(str(text))
    return [token.lower() for token in tokens]
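
A quick usage sketch (the sample sentence and the expected output are illustrative, not from the original source): the pattern keeps word runs, dollar amounts, and any other non-whitespace runs as separate tokens, which get_list then lowercases.

print(get_list("Hello, the price is $3.50!"))
# expected: ['hello', ',', 'the', 'price', 'is', '$3.50', '!']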
Example #2
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer


def filter_words(text_file):
    """Read a file, tokenize it, POS-tag the tokens, and drop English stopwords."""
    text = read_text(text_file)  # read_text: external helper assumed to return the file contents
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    pos_list = nltk.pos_tag(tokens)
    stop_set = set(stopwords.words('english'))
    filtered_words = [
        w for w in pos_list if w[0] not in stop_set
    ]
    return filtered_words
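
Because nltk.pos_tag returns (token, tag) tuples, filter_words returns tagged pairs rather than bare words, and the stopword comparison is case-sensitive, so capitalised stopwords such as "The" slip through. A hedged illustration of the same pipeline on an inline string (the sentence is made up, and the exact tags depend on the tagger data NLTK has downloaded):

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

tokens = RegexpTokenizer(r'\w+').tokenize("The cat sat on the mat")
tagged = nltk.pos_tag(tokens)
print([w for w in tagged if w[0] not in stopwords.words('english')])
# e.g. [('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD'), ('mat', 'NN')]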
Example #3
    def tokenization(self):
        """ Tokenize the contents of the posts and remove the strings that contains punctuations,
            numbers or only single letter

        :return: a dataframe which each row becomes list of tokens
        """

        tqdm.pandas()
        tokenizer = RegexpTokenizer(r'[a-zA-Z]{2,}')
        tokens_df = self.text_df.progress_apply(
            lambda x: tokenizer.tokenize(x.lower()))
        return tokens_df
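
Since this method lives inside a class that holds self.text_df, here is a minimal standalone sketch of the same idea on a hypothetical pandas Series (plain apply is used instead of progress_apply, so tqdm is not needed):

import pandas as pd
from nltk.tokenize import RegexpTokenizer

posts = pd.Series(["Posted 2 times!! Great read :)", "a b c3d word"])
tokenizer = RegexpTokenizer(r'[a-zA-Z]{2,}')
print(posts.apply(lambda x: tokenizer.tokenize(x.lower())).tolist())
# expected: [['posted', 'times', 'great', 'read'], ['word']]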
Example #4
import pickle

from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer


def inference(model_dir, text):
    tokenizer = RegexpTokenizer(r'[a-zA-Z]{2,}')
    tokens = tokenizer.tokenize(text.lower())
    text = ' '.join(tokens)

    # load the fitted TF-IDF transformer so its vocabulary can be reused
    with open(model_dir + "tfidf_transformer.pkl", 'rb') as f:
        transformer = pickle.load(f)

    # create a new TfidfVectorizer with the old vocabulary
    vectorizer = TfidfVectorizer(stop_words='english',
                                 ngram_range=(1, 2),
                                 lowercase=True,
                                 vocabulary=transformer.vocabulary_)

    vec = vectorizer.fit_transform([text])

    # load the trained model from file and predict
    with open(model_dir + "bigram_SVM.dat", 'rb') as f:
        model = pickle.load(f)
    y_pred = model.predict(vec)
    return y_pred[0]  # 1: STRESS
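
One design note: fitting a fresh TfidfVectorizer on the single input document recomputes the IDF statistics from that one document, so only the vocabulary, not the learned IDF weights, is carried over from training. If the pickled object is itself a fitted TfidfVectorizer, an alternative sketch (under that assumption, using the same file names as above) is to call its transform directly:

import pickle
from nltk.tokenize import RegexpTokenizer

def inference_alt(model_dir, text):
    # assumes tfidf_transformer.pkl is a fitted TfidfVectorizer and bigram_SVM.dat a fitted classifier
    tokens = RegexpTokenizer(r'[a-zA-Z]{2,}').tokenize(text.lower())
    with open(model_dir + "tfidf_transformer.pkl", "rb") as f:
        vectorizer = pickle.load(f)
    vec = vectorizer.transform([' '.join(tokens)])  # reuses the IDF weights learned at training time
    with open(model_dir + "bigram_SVM.dat", "rb") as f:
        model = pickle.load(f)
    return model.predict(vec)[0]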
Example #5
import gensim
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize

# `lines` is assumed to hold the raw lines of the source text, read earlier in the script.
# Skip everything up to and including the "PROLOGUE" marker.
for index, line in enumerate(lines):
  if "PROLOGUE" in line:
    lines = lines[index+1:]
    break

sentences = []
for line in lines:
  if len(line) > 1:
    for sentence in sent_tokenize(line):
      sentences.append(sentence)

#print(sentences)

stop_words = set(stopwords.words("english"))
tokenizer = RegexpTokenizer(r'\w+')
filtered_sentences = []

for sentence in sentences:
  sent = []
  # tokenize, drop stopwords (case-insensitively), and lowercase the remaining words
  for word in word_tokenize(" ".join(tokenizer.tokenize(str(sentence)))):
    if word.lower() not in stop_words:
      sent.append(word.lower())
  filtered_sentences.append(" ".join(sent))

#print(filtered_sentences[:100])

# Word2Vec expects an iterable of token lists, so split the joined sentences back into tokens
model = gensim.models.Word2Vec([sentence.split() for sentence in filtered_sentences])

model.wv.key_to_index  # the learned vocabulary (gensim 4.x API)
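
Once trained, the model above is typically queried through its wv attribute; a small sketch assuming gensim 4.x and that the queried word actually occurs in the corpus:

# "night" is a hypothetical query word; replace it with any word present in the corpus
if "night" in model.wv:
  print(model.wv.most_similar("night", topn=5))  # nearest neighbours in embedding space
  print(model.wv["night"].shape)                 # the word vector, 100-dimensional by default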
Example #6
documents = f.readlines()
f.close()

# stoplist = set('for a of the and to in'.split())
# texts = [[word for word in document.lower().split() if word not in stoplist]
#          for document in documents]

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from string import punctuation

tokenizer = RegexpTokenizer(r'\w+')
custom_set = set(stopwords.words('english') + list(punctuation))

texts = []
for sentence in documents:
    tokens = tokenizer.tokenize(sentence.lower())
    filtered_words = [w for w in tokens if w not in custom_set]
    texts.append(filtered_words)

from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1] for text in texts]

from pprint import pprint  # pretty-printer
from gensim import corpora
# pprint(texts)
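
The gensim import suggests the usual next step for this kind of preprocessing, which would look roughly like the following sketch (continuing with the texts list built above):

dictionary = corpora.Dictionary(texts)                  # map each token to an integer id
corpus = [dictionary.doc2bow(text) for text in texts]   # bag-of-words vectors, one per document
pprint(corpus[:3])                                      # e.g. [[(0, 1), (1, 2)], ...]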
Example #7
'''
Created on 06/05/2013

@author: Rodrigo
'''

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

english_stops = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
print([w for w in tokenizer.tokenize("This is not a common book") if w not in english_stops])
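
With gaps=True the pattern describes the separators rather than the tokens, so this tokenizer splits on whitespace and leaves punctuation attached to the words; with the default gaps=False the pattern describes the tokens themselves. A small comparison sketch (the sample sentence is illustrative):

from nltk.tokenize import RegexpTokenizer

splitter = RegexpTokenizer(r'\s+', gaps=True)   # the pattern matches the gaps between tokens
matcher = RegexpTokenizer(r'\w+')               # the pattern matches the tokens themselves
print(splitter.tokenize("This is not a common book."))  # ['This', 'is', 'not', 'a', 'common', 'book.']
print(matcher.tokenize("This is not a common book."))   # ['This', 'is', 'not', 'a', 'common', 'book']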