from nltk.tokenize import RegexpTokenizer

def get_list(text):
    # Tokenize each string and normalize tokens to lowercase
    tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
    tokens = tokenizer.tokenize(str(text))
    return [token.lower() for token in tokens]
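A quick sanity check of the pattern above (the sample string is ours, not from the original): dollar amounts survive intact and punctuation becomes a separate token.

>>> get_list("The price is $12.40, right?")
['the', 'price', 'is', '$12.40', ',', 'right', '?']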
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

def filter_words(text_file):
    text = read_text(text_file)  # read_text: helper defined elsewhere that returns the file contents
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    pos_list = nltk.pos_tag(tokens)
    # Keep only (word, tag) pairs whose word is not an English stopword
    filtered_words = [w for w in pos_list if w[0] not in stopwords.words('english')]
    return filtered_words
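A hypothetical call; the filename is illustrative, and NLTK's tagger and stopword data must already be downloaded:

for word, tag in filter_words("sample.txt")[:10]:
    print(word, tag)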
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm

def tokenization(self):
    """
    Tokenize the contents of the posts, removing strings that contain
    punctuation or numbers and strings that are only a single letter.
    :return: a dataframe in which each row becomes a list of tokens
    """
    tqdm.pandas()
    tokenizer = RegexpTokenizer(r'[a-zA-Z]{2,}')
    tokens_df = self.text_df.progress_apply(
        lambda x: tokenizer.tokenize(x.lower()))
    return tokens_df
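For reference, the [a-zA-Z]{2,} pattern keeps only alphabetic runs of two or more letters, so digits, punctuation, and single letters are dropped (the sample input is ours):

>>> RegexpTokenizer(r'[a-zA-Z]{2,}').tokenize("i paid $5 for 2 books!")
['paid', 'for', 'books']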
import pickle
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

def inference(model_dir, text):
    tokenizer = RegexpTokenizer(r'[a-zA-Z]{2,}')
    tokens = tokenizer.tokenize(text.lower())
    text = ' '.join(tokens)

    # Load the fitted TF-IDF transformer
    transformer = pickle.load(open(model_dir + "tfidf_transformer.pkl", 'rb'))
    # Create a new TfidfVectorizer that reuses the old vocabulary
    vectorizer = TfidfVectorizer(stop_words='english',
                                 ngram_range=(1, 2),
                                 lowercase=True,
                                 vocabulary=transformer.vocabulary_)
    vec = vectorizer.fit_transform([text])

    # Load the trained model from file
    model = pickle.load(open(model_dir + "bigram_SVM.dat", "rb"))
    y_pred = model.predict(vec)
    return y_pred[0]  # 1: STRESS
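A sketch of how this might be invoked, assuming the pickled transformer and SVM exist under the given directory; the path and sample text are illustrative, and model_dir needs a trailing slash because the filenames are appended by string concatenation:

label = inference("models/", "I can't keep up with all these deadlines")
print("stress" if label == 1 else "no stress")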
import gensim
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize

lines = file.readlines()  # 'file' is an already-opened file object
# Drop everything up to and including the "PROLOGUE" line
for index, line in enumerate(lines):
    if "PROLOGUE" in line:
        lines = lines[index + 1:]
        break

sentences = []
for line in lines:
    if len(line) > 1:
        for sentence in sent_tokenize(line):
            sentences.append(sentence)

stop_words = set(stopwords.words("english"))
tokenizer = RegexpTokenizer(r'\w+')
filtered_sentences = []
for sentence in sentences:
    sent = []
    # Strip punctuation with the regexp tokenizer, then lowercase and drop stopwords
    for word in word_tokenize(" ".join(tokenizer.tokenize(str(sentence)))):
        if word not in stop_words:
            sent.append(word.lower())
    filtered_sentences.append(sent)

# Word2Vec expects an iterable of token lists, not space-joined strings
model = gensim.models.Word2Vec(filtered_sentences)
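Once trained, the model can be queried through gensim's standard API; the query word is illustrative and must occur often enough in the corpus to make it into the vocabulary:

print(model.wv.most_similar('night', topn=5))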
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import RegexpTokenizer

stop_words = stopwords.words("english")
tokenizer = RegexpTokenizer(r"\w+")

def semantic_info(word, lemma, context):
    # Drop the target word and its lemma from the disambiguation context
    context.discard(word)
    context.discard(lemma)
    return simplified_lesk(lemma, context)

def synsets_for(word):
    return wn.synsets(word)

def remove_stopwords(sentence):
    return set(w for w in sentence if w not in stop_words)

def simplified_lesk(lemma, context):
    synsets = synsets_for(lemma)
    if not synsets:
        return None
    best_sense = synsets[0]
    # Completion assumed from the standard simplified Lesk algorithm:
    # prefer the sense whose gloss shares the most words with the context.
    best_overlap = 0
    for sense in synsets:
        gloss = remove_stopwords(tokenizer.tokenize(sense.definition().lower()))
        overlap = len(gloss & context)
        if overlap > best_overlap:
            best_overlap = overlap
            best_sense = sense
    return best_sense
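A hypothetical call, with a made-up sentence as context; which Synset comes back depends on the gloss overlap:

context = remove_stopwords("the bank raised interest rates".split())
sense = semantic_info("bank", "bank", context)
print(sense, sense.definition() if sense else None)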
f = open('data.txt')
documents = f.readlines()
f.close()

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from string import punctuation

tokenizer = RegexpTokenizer(r'\w+')
custom_set = set(stopwords.words('english') + list(punctuation))

texts = []
for sentence in documents:
    tokens = tokenizer.tokenize(sentence.lower())
    filtered_words = [w for w in tokens if w not in custom_set]
    texts.append(filtered_words)

from collections import defaultdict

# Count how often each token occurs across all documents
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Keep only tokens that appear more than once
texts = [[token for token in text if frequency[token] > 1]
         for text in texts]

from pprint import pprint  # pretty-printer
from gensim import corpora
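The trailing imports suggest the original script goes on to build a gensim dictionary from these texts; a likely continuation (assumed, not part of the snippet):

dictionary = corpora.Dictionary(texts)
pprint(dictionary.token2id)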
'''
Created on 06/05/2013

@author: Rodrigo
'''
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

english_stops = set(stopwords.words('english'))
# gaps=True makes the pattern act as the separator, so this splits on whitespace
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
print([w for w in tokenizer.tokenize("This is not a common book")
       if w not in english_stops])
# prints ['This', 'common', 'book']; the capitalized 'This' is not in the lowercase stopword set
from docopt import docopt
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import PlaintextCorpusReader

from languagemodeling.ngram import NGram, AddOneNGram, InterpolatedNGram

if __name__ == '__main__':
    opts = docopt(__doc__)  # __doc__ is the script's usage docstring (not shown here)

    # load the data
    pattern = r'''(?ix)              # i: case-insensitive, x: verbose regexps
          (?:sr\.|sra\.)
        | (?:[A-Z]\.)+               # abbreviations, e.g. U.S.A.
        | \w+(?:-\w+)*               # words with optional internal hyphens
        | \$?\d+(?:\.\d+)?%?         # currency and percentages, e.g. $12.40, 82%
        | \.\.\.                     # ellipsis
        | [][.,;"'?():-_`]           # these are separate tokens; includes ], [
    '''
    tokenizer = RegexpTokenizer(pattern)
    root = '.'
    corpus = PlaintextCorpusReader(root, r'books\.txt', word_tokenizer=tokenizer)
    sents = corpus.sents()

    # train the model
    n = int(opts['-n'])
    if opts['-m'] == 'addone':
        model = AddOneNGram(n, sents)
    elif opts['-m'] == 'inter':
        gamma = opts['-g']
        if gamma is None:
            model = InterpolatedNGram(n, sents, None, False)
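As a quick illustration of what the verbose pattern yields (the sample sentence is ours):

>>> RegexpTokenizer(pattern).tokenize("Sra. García pagó $12.40...")
['Sra.', 'García', 'pagó', '$12.40', '...']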