def text_cleaner(text):
    negations_dictionary = {
        "isn't": "is not", "aren't": "are not", "wasn't": "was not",
        "weren't": "were not", "haven't": "have not", "hasn't": "has not",
        "hadn't": "had not", "won't": "will not", "wouldn't": "would not",
        "don't": "do not", "doesn't": "does not", "didn't": "did not",
        "can't": "can not", "couldn't": "could not", "shouldn't": "should not",
        "mightn't": "might not", "mustn't": "must not"
    }
    negations_pattern = re.compile(r'\b(' + '|'.join(negations_dictionary.keys()) + r')\b')
    tokenizer = WordPunctTokenizer()
    processed_text = text.lower()
    negation_handled = negations_pattern.sub(
        lambda x: negations_dictionary[x.group()], processed_text)
    processed_text = re.sub("[^A-Za-z]", ' ', negation_handled)
    words = [x for x in tokenizer.tokenize(processed_text) if len(x) > 1]
    return words
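# Usage sketch for text_cleaner (the sample sentence is illustrative only);
# assumes `import re` and `from nltk.tokenize import WordPunctTokenizer` at
# module level, as the function above requires.
print(text_cleaner("I don't think this isn't great!"))
# -> ['do', 'not', 'think', 'this', 'is', 'not', 'great']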
def __init__(self, input_size, batch_size, path_to_write):
    self.word_vectors = FastText.load(
        "D:\\Typing\\araneum_none_fasttextskipgram_300_5_2018.model")
    self.input_size = input_size
    self.tokenizer = WordPunctTokenizer()
    self.batch_size = batch_size
    self.path = path_to_write
    self.punctuations = ['.', ',', '-', '\'', '\"', '!', '?', '(', ')', ':', ';']
def filter_stop_words(text, stop_words):
    wpt = WordPunctTokenizer()
    tokenized_words = wpt.tokenize(text)
    processed_words = [word for word in tokenized_words if word not in stop_words]
    text = ' '.join([str(word) for word in processed_words])
    return text
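# Usage sketch (not part of the original source): assumes the function above is
# in scope together with `from nltk.tokenize import WordPunctTokenizer`;
# the stop-word list here is a toy stand-in.
print(filter_stop_words("this is a simple test", ["this", "is", "a"]))
# -> 'simple test'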
def clean_tweet(tweet):
    link_removed = re.sub('https?://[A-Za-z0-9./]+', '', tweet)
    number_removed = re.sub('[^a-zA-Z]', ' ', link_removed)
    lower_case_tweet = number_removed.lower()
    tok = WordPunctTokenizer()
    words = tok.tokenize(lower_case_tweet)
    clean_tweet = (' '.join(words)).strip()
    return clean_tweet
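# Usage sketch (the tweet text is made up for illustration); assumes
# `import re` and `from nltk.tokenize import WordPunctTokenizer` are in scope.
print(clean_tweet("Check this out https://t.co/abc123 #wow 100%!!"))
# -> 'check this out wow'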
def sentence2words(sentence):
    result = []
    word_punct_tokenizer = WordPunctTokenizer()
    words = word_punct_tokenizer.tokenize(sentence)
    stemmer = nltk.stem.SnowballStemmer('english')
    for word in words:
        ori_word = stemmer.stem(word)
        result.append(ori_word)
    return result
def load_task2(articles_path, labels_path, tokenizer='punct'):
    file_names, labels, spans = get_class_labels(labels_path)
    corpus = load_data(articles_path)
    tknz = WordPunctTokenizer()
    samples = []
    for span, file_name in zip(spans, file_names):
        article = corpus[file_name]
        tokenized_span = tknz.tokenize(article[span[0]:span[1]])
        samples.append(tokenized_span)
    return samples, labels, spans, file_names
def _tokenize(self, doc):
    all_tokens = []
    sentences = sent_tokenize(doc)
    tokenizer = WordPunctTokenizer()
    for sentence in sentences:
        words = tokenizer.tokenize(sentence.lower())
        words = [word for word in words if word not in punctuation]
        all_tokens.extend(words)
    return all_tokens
def _sentence_tok(delex_texts: List[str]) -> List[List[List[str]]]:
    # tokenize the texts
    sentence_tok_texts = []
    tknzr = WordPunctTokenizer()
    for text in delex_texts:
        sentences = sent_tokenize(text)
        tok_sentences = []
        for sentence in sentences:
            tok_sentences.append(tknzr.tokenize(sentence))
        sentence_tok_texts.append(tok_sentences)
    return sentence_tok_texts
def stemming_words(text):
    wpt = WordPunctTokenizer()
    words = wpt.tokenize(text)
    turkishStemmer = TurkishStemmer()
    stemmed_words = []
    for word in words:
        stemmed_words.append(turkishStemmer.stemWord(word))
    text = ' '.join([str(word) for word in stemmed_words])
    # print(stemmed_words)
    return text
def stemming_words(self, text):
    wpt = WordPunctTokenizer()
    words = wpt.tokenize(text)
    turkishStemmer = TurkishStemmer()
    stemmed_words = []
    for word in words:
        stemmed_words.append(turkishStemmer.stemWord(word))
    text = ' '.join([str(word) for word in stemmed_words])
    return text
class PunctTokenizer(object):
    def __init__(self,
                 lower=True,
                 prepend_cls=False,
                 prepend_bos=False,
                 append_eos=False,
                 stopwords=None,
                 specials=SPECIAL_TOKENS):
        self.lower = lower
        self.specials = specials  # use the passed-in specials; previously the argument was ignored
        self.pre_id = []
        self.post_id = []
        self.stopwords = stopwords
        if prepend_cls and prepend_bos:
            raise ValueError("prepend_bos and prepend_cls are"
                             " mutually exclusive")
        if prepend_cls:
            self.pre_id.append(self.specials.CLS.value)
        if prepend_bos:
            self.pre_id.append(self.specials.BOS.value)
        if append_eos:
            self.post_id.append(self.specials.EOS.value)
        self.punct = WordPunctTokenizer()

    def __call__(self, x):
        if self.lower:
            x = x.lower()
        x = self.pre_id + self.punct.tokenize(x) + self.post_id
        if self.stopwords:
            x = [w for w in x if w not in self.stopwords]
        return x
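# Usage sketch (assumption, not from the original project): PunctTokenizer needs
# a SPECIAL_TOKENS enum to exist before the class is defined; the stand-in below
# mirrors that shape so the example can run on its own.
from enum import Enum
from nltk.tokenize import WordPunctTokenizer

class SPECIAL_TOKENS(Enum):  # hypothetical stand-in for the project's enum
    CLS = "[CLS]"
    BOS = "[BOS]"
    EOS = "[EOS]"

# ... PunctTokenizer defined as above ...
tok = PunctTokenizer(prepend_cls=True, append_eos=True)
print(tok("Hello, world!"))
# -> ['[CLS]', 'hello', ',', 'world', '!', '[EOS]']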
def setupTextPreProcess():
    # import stopwords
    global stopWords
    global tokenizer
    global stemmer
    # save some space
    stopWords = [
        'wouldn', 'which', "it's", "didn't", 'her', 'she', "hasn't", "you've",
        'some', 'more', 'is', 'couldn', 'hers', 'no', 'so', 'ours', 'd',
        "shouldn't", 'off', 'was', 'about', "mightn't", 'own', 'above', 'most',
        'out', 'you', 'just', "hadn't", 'haven', 'o', 'once', 'shouldn', 'few',
        'y', 've', 'your', 'ourselves', 'why', 'does', 'me', 'at', 'mustn',
        "you'd", 'then', 'weren', 're', 'below', 'should', 'doesn', 'when',
        "she's", 'can', 'of', 'that', "weren't", 'after', 'any', "should've",
        'wasn', 't', 'very', 'these', 'yourself', 'nor', 'my', 'won', 'his',
        'both', 'same', 'yours', 'only', 'but', 'our', 'it', 'has', 'be',
        "wasn't", 'herself', 'its', 'mightn', "needn't", 'as', 'am', 'are',
        'they', 'their', 'doing', 'during', 'again', 'all', "doesn't",
        'between', 'under', 'over', "that'll", "haven't", 'this', 'will',
        'until', "couldn't", 'had', 'myself', 'because', 'than', "won't", 'he',
        'have', 'with', 'don', 'hadn', 'who', 'up', "wouldn't", 'ain', 'them',
        'i', 'through', 'aren', 'here', 'themselves', "shan't", 'if', 'what',
        'ma', 'yourselves', 'theirs', 'a', 'against', 'being', 'to', 'where',
        'on', 'having', 'himself', 'each', "you'll", "don't", 'further',
        "isn't", 'while', 'in', 'how', 'such', 'now', 'from', 'needn', 'there',
        'hasn', 'too', 'm', 'or', 'not', 'didn', 'whom', 'down', 'shan', 'll',
        "you're", 's', "mustn't", 'him', 'were', 'an', 'we', "aren't", 'by',
        'been', 'the', 'itself', 'before', 'did', 'into', 'and', 'for', 'do',
        'other', 'those', 'isn'
    ]
    tokenizer = WordPunctTokenizer()
    stemmer = PorterStemmer()
def pre_process(data):
    clean_reviews = []
    # useless symbols to strip out
    rmSignal = ['.', '#', '$', '?', '!', '%', ':/', ':', '-', '+', '/', '"']
    for comment_content in data['review']:
        texts = WordPunctTokenizer().tokenize(comment_content)  # NLTK tokenization
        text = [word.lower() for word in texts]
        # filter in a single pass instead of calling list.remove() while
        # iterating over the same list, which silently skips elements
        text = [
            word for word in text
            if word not in rmSignal               # drop symbols
            and not word.isdigit()                # drop numbers
            and not re.search("[(.|…)]+", word)   # drop ellipses the tokenizer cannot handle
        ]
        new_sentence = " ".join(text)  # change to string
        clean_reviews.append(new_sentence)
    return clean_reviews
def __init__(self):
    """Set up map."""
    self.word_tokenizer = WordPunctTokenizer()
    filename = join(split(__file__)[0], 'data', 'compounds.txt')
    self.decompound_map = {}
    with open(filename, encoding='utf-8') as fid:
        for line in fid:
            parts = line.strip().split('|')
            compound = "".join(parts)
            decompounded_parts = [
                part for part in parts if part != 's' and part != 'e'
            ]
            decompounded = " ".join(decompounded_parts)
            self.decompound_map[compound] = decompounded
class NewsgroupsReader(object):
    def __init__(self, tokenize):
        self._tokenize = tokenize
        self._tokenizer = WordPunctTokenizer()

    def get_training(self):
        return self._get_docs('datasets/20news-bydate-train')

    def get_test(self):
        return self._get_docs('datasets/20news-bydate-test')

    def _get_docs(self, path):
        doc_objects = []
        i = 0
        for category in listdir(path):
            for f in listdir(path + "/" + category):
                with codecs.open(path + "/" + category + "/" + f, 'r',
                                 encoding='latin1') as content_file:
                    text = content_file.read()
                    tokens = self._tokenizer.tokenize(text) if self._tokenize else text
                    doc_objects.append(Document(i, tokens, category))
                i += 1
        random.shuffle(doc_objects)
        return doc_objects
def nGrams(string, corpus, number, clean=True):
    global wordList
    biList = []
    triList = []
    words = WordPunctTokenizer().tokenize(string)
    stopset = set(stopwords.words('english'))
    if clean == True:
        words = [word.lower() for word in words]
    if clean == False:
        words = [word.lower() for word in words]
    word_filter = lambda w: len(w) < 2 or w.isdigit()
    bcf = BigramCollocationFinder.from_words(words)
    bcf.apply_word_filter(word_filter)
    biResult = bcf.nbest(BigramAssocMeasures.likelihood_ratio, number)
    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_word_filter(word_filter)
    triResult = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, number)
    for i in range(len(biResult)):
        if len(biResult) > 0:
            biPrint = " ".join(biResult[i])
            biList.append(biPrint)
        else:
            biList = []
    csv = open(r'db\cyttron-keywords.csv', 'a')
    if len(biList) > 1:
        csv.write('"' + ','.join(biList[:-1]) + ',' + biList[-1] + '";')
    else:
        csv.write('"' + ''.join(biList) + '";')
    csv.close()
    for i in range(len(triResult)):
        if len(triResult) > 0:
            triPrint = " ".join(triResult[i])
            triList.append(triPrint)
        else:
            triList = []
    csv = open(r'db\cyttron-keywords.csv', 'a')
    if len(triList) > 1:
        csv.write('"' + ','.join(triList[:-1]) + ',' + triList[-1] + '"\n')
    else:
        csv.write('"' + ''.join(triList) + '"\n')
    csv.close()
    print biList
    print triList
def query(self, sent):
    sent = Query.decorate_sent(sent)
    elements = WordPunctTokenizer().tokenize(sent)
    post = self.to_post(elements)
    print(post)
    result = self.calculate_post(post)
    return result
def get_tense_verb_groups(text, verbose=False):
    tokens = WordPunctTokenizer().tokenize(text)
    if verbose:
        print("*************", len(tokens))
    tokens_idx = list(WordPunctTokenizer().span_tokenize(text))
    tokens = nltk.pos_tag(tokens)
    tokens = manual_pos_correction(tokens)
    if verbose:
        print("================", len(tokens), len(tokens_idx))
        print("tokens: ", tokens)
    verb_groups = get_verbs(tokens, tokens_idx)
    if verbose:
        print("verb_groups: ", verb_groups)
    tense_verb_groups = []
    for verb_group in verb_groups:
        tense_verb_group = get_tense(verb_group, verbose=verbose)
        tense_verb_groups.append(tense_verb_group)
    return tense_verb_groups
def freqNouns(string, corpus, number):
    nouns = []  # renamed from `list` to avoid shadowing the built-in
    words = WordPunctTokenizer().tokenize(string)
    pos = nltk.pos_tag(words)
    for i in range(len(pos)):
        if len(pos[i][0]) > 1:
            if pos[i][1] == 'NN' or pos[i][1] == 'NNP':
                nouns.append(pos[i][0])
    newString = ' '.join(nouns).lower()
    freqWords(newString, corpus, number)
def cleanDoc(doc):
    # String goes in, list of words comes out
    global stopset
    stemmer = nltk.PorterStemmer()
    tokens = WordPunctTokenizer().tokenize(doc)
    clean = [
        token.lower() for token in tokens
        if token.lower() not in stopset and len(token) > 1 and token.isalnum()
    ]
    final = [stemmer.stem(word) for word in clean]
    return final
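# Usage sketch: assumes `import nltk` plus the tokenizer import, and that the
# module-level `stopset` global has been filled, e.g. from NLTK's English stop words.
from nltk.corpus import stopwords
stopset = set(stopwords.words('english'))
print(cleanDoc("The cats are running quickly through 3 gardens!"))
# expected with NLTK's Porter stemmer: ['cat', 'run', 'quickli', 'garden']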
def stemOnto(ontolist):
    stemmer = nltk.PorterStemmer()
    templist = []
    for i in range(len(ontolist)):
        templist = []
        tokens = WordPunctTokenizer().tokenize(ontolist[i][0])
        for j in range(len(tokens)):
            stem = stemmer.stem(tokens[j])
            templist.append(stem)
        ontolist[i][0] = ' '.join(templist)
    print "Stemmed", len(ontolist), "things"
def stemList(sourceList):
    stemmer = nltk.PorterStemmer()
    templist = []
    for i in range(len(sourceList)):
        templist = []
        tokens = WordPunctTokenizer().tokenize(sourceList[i])
        for j in range(len(tokens)):
            stem = stemmer.stem(tokens[j])
            templist.append(stem)
        sourceList[i] = ' '.join(templist)
        print sourceList[i]
    print "Stemmed", len(sourceList), "texts"
def get_average_embedding(embedding, review):
    """
    returns a list of word vectors for all words in review
    then averages them to return a final vector
    :param embedding: embedding object - will be either Fasttext or Word2Vec
    :param review: review text
    :return:
    """
    log.debug(f'Getting average embedding for: [{review}]')
    wpt = WordPunctTokenizer()
    word_vectors = [embedding.wv.get_vector(word) for word in wpt.tokenize(review)
                    if word in embedding.wv.vocab]
    log.debug(f'word_vector shape [{np.shape(word_vectors)}]')
    # return the average of all word vectors to come up with a final vector for the review;
    # since we are using a pre-trained embedding, we may not be able to find all the words
    if np.shape(word_vectors)[0] > 1:
        return np.average(word_vectors, axis=0)
    return None
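# Usage sketch with a toy corpus; assumes gensim 3.x (where Word2Vec takes
# `size=` and exposes `wv.vocab`, matching the function above), plus a
# module-level `import numpy as np` and `log` logger, as the function expects.
import logging
import numpy as np
from gensim.models import Word2Vec

log = logging.getLogger(__name__)
toy_corpus = [["good", "movie"], ["bad", "movie"], ["great", "film"]]
model = Word2Vec(sentences=toy_corpus, size=16, min_count=1)
vec = get_average_embedding(model, "a good movie")
print(None if vec is None else vec.shape)  # -> (16,)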
def __init__(self, thesaurus, need_deeppavlov=True, deeppavlov_model=None,
             need_syntax=True, syntax_model=None):
    self.need_deeppavlov = need_deeppavlov
    if need_deeppavlov:
        self.deeppavlov_lemma = deeppavlov_model if deeppavlov_model else build_model(
            configs.morpho_tagger.BERT.morpho_ru_syntagrus_bert, download=False)
    if need_syntax:
        self.syntax_model = syntax_model if syntax_model else build_model(
            configs.syntax.syntax_ru_syntagrus_bert, download=False)
    else:
        self.syntax_model = None
    self.tokenizer = WordPunctTokenizer()
    self.thesaurus = thesaurus
def choose_longest(mention):
    if len(mention) == 0:
        return "UNK"
    else:
        mention = [
            " ".join(
                list(WordPunctTokenizer().tokenize(re.sub('[^ ]- ', '', item))))
            for item in mention
        ]
        idx = np.argmax(np.asarray([len(a.strip().split()) for a in mention]))
        return mention[idx]
def preprocess(ftrain, ftest):
    train_data = []
    test_data = []
    word_list = []
    word_set = set()
    ftrain.readline()
    ftest.readline()
    sw = []
    f = open("D:/input/stopwords.txt")
    for line in f.readlines():
        line = line.split('\n')[0]
        sw.append(line)
    f.close()
    for line in ftrain.readlines():
        ls = line.split('\t')
        ls[2] = ls[2].lower()  # lower() returns a new string, so the result must be assigned
        word_list = WordPunctTokenizer().tokenize(ls[2])
        train_data.append([ls[0], int(ls[1]), set(word_list)])
        for word in word_list:
            word_set.add(word)
    print("Training data processed")
    for line in ftest.readlines():
        ls = line.split('\t')
        word_list = WordPunctTokenizer().tokenize(ls[1])
        # filter instead of removing from word_list while iterating over it
        word_list = [word for word in word_list if word not in sw]
        test_data.append([ls[0], 0, set(word_list)])
    print("Testing data processed")
    word_set = word_set - (word_set & set(sw))
    return train_data, test_data, word_set
def wordNetWordMatch(string):
    newString = ""
    string = WordPunctTokenizer().tokenize(string)
    for i in range(len(string)):
        currentWord = string[i].lower()
        synonyms = []
        for syn in wordnet.synsets(currentWord):
            # note: written against the old NLTK WordNet API, where `lemmas`
            # and `name` are attributes rather than methods
            for lemma in syn.lemmas:
                synonyms.append(str(lemma.name).replace('_', ' ').lower())
        synonyms = set(synonyms)
        word = ', '.join(synonyms)
        newString += word
    wordMatch(newString)
def remove_stop_words(self, sentence):
    """
    remove stop_words of sentence
    :param sentence:
    :return:
    """
    words = WordPunctTokenizer().tokenize(sentence)
    st = stopwords.words('english')
    str_list = []
    for token in words:
        if token not in st:
            str_list.append(token)
    return " ".join(str_list)
class CustomTokenizer:
    def __init__(self, unicode_to_ascii=True, punct_one_token_per_char=True):
        self.unicode_to_ascii = unicode_to_ascii
        self.punct_one_token_per_char = punct_one_token_per_char
        # \p{P} (Unicode punctuation) is not supported by the stdlib re module,
        # so the pattern is compiled with the third-party regex package
        self._re_punct = regex.compile(r"(\p{P})")
        self._tokenizer = WordPunctTokenizer()

    def tokenize(self, text):
        if self.unicode_to_ascii:
            text = unidecode(text)
        if self.punct_one_token_per_char:
            text = self._re_punct.sub(r"\1 ", text)
        return self._tokenizer.tokenize(text)
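# Usage sketch: assumes the `unidecode` and `regex` packages are installed
# (stdlib `re` does not understand the \p{P} class used by the tokenizer above).
ct = CustomTokenizer()
print(ct.tokenize("Café--ready?!"))
# unidecode folds "Café" to "Cafe" and every punctuation mark becomes its own token:
# -> ['Cafe', '-', '-', 'ready', '?', '!']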
def label_data(corpus, corpus_labels, tokenizer='punct'):
    samples, labels, spans, file_names = [], [], [], []
    tknz = tree_bank_tokenizer() if tokenizer == 'treebank' else WordPunctTokenizer()
    print('Preprocessing')
    regex = get_regex()
    for article_name in corpus:
        article_propaganda_spans = iter([])
        if corpus_labels:
            article_propaganda_spans = iter(
                sorted(corpus_labels[article_name])
            ) if article_name in corpus_labels else iter([])
        propaganda_span = next(article_propaganda_spans, [])
        split_article = corpus[article_name].split('\n')
        index_offset = 0
        first_in_span = True
        for line in split_article:
            if not line:
                index_offset += 1
                continue
            line = re.sub(regex, lambda sent: '~' * len(sent.group()), line)
            line = sub_unicode_chars(line)
            tokenized_line = tknz.tokenize(line)
            span_tokenized_line = tknz.span_tokenize(line)
            line_spans, line_labels = [], []
            for _ in tokenized_line:
                span_wo_offset = next(span_tokenized_line)
                current_span = (span_wo_offset[0] + index_offset,
                                span_wo_offset[1] + index_offset)
                if propaganda_span and current_span[0] >= propaganda_span[1]:
                    propaganda_span = next(article_propaganda_spans, [])
                    first_in_span = True
                if propaganda_span and propaganda_span[0] <= current_span[0] < propaganda_span[1]:
                    line_labels.append(1 if first_in_span else 2)
                    first_in_span = False
                else:
                    line_labels.append(0)
                line_spans.append(current_span)
            assert len(tokenized_line) == len(line_labels) == len(line_spans), \
                'Number of tokens is not equal to the number of spans or labels'
            labels.append(line_labels)
            spans.append(line_spans)
            samples.append(tokenized_line)
            file_names.append(article_name)
            index_offset += len(line) + 1
    print('Preprocessing done')
    return samples, labels, spans, file_names
# Legacy Python 2 code (unicode(), itertools.izip, dict.has_key(), print statements).
class TolstojParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.inside_dd = False
        self.bgrams = {}
        self.sorted_bgrams = []
        self.tokenizer = WordPunctTokenizer()
        self.token_count = 0

    def handle_starttag(self, tag, attrs):
        if tag == "dd":
            self.inside_dd = True
        else:
            self.inside_dd = False

    def handle_data(self, data):
        if self.inside_dd:
            tokens = self.tokenizer.tokenize(unicode(data, 'utf-8').lower())
            for t1, t2 in itertools.izip(tokens, tokens[1:]):
                self.token_count += 1
                if (t1[0] in string.punctuation) or (t2[0] in string.punctuation):
                    continue
                key = t1.encode('utf-8') + ' ' + t2.encode('utf-8')
                if self.bgrams.has_key(key):
                    self.bgrams[key] += 1
                else:
                    self.bgrams[key] = 1

    def dump_bgrams(self, output_name):
        output = open(output_name, 'wb')
        pickle.dump(self.bgrams, output)
        output.close()

    def make_sorted_bgrams(self):
        self.sorted_bgrams = sorted(self.bgrams.items(), key=lambda x: x[1], reverse=True)

    def print_sorted_bgrams(self):
        for key, count in self.sorted_bgrams:
            print key, count
from gensim import corpora, models, similarities
from nltk import WordPunctTokenizer
import re

NUM_TOPICS = 40

stopwords = open('stopwords.txt').read().split('\n')
word_re = re.compile(r'[a-z0-9\s]+')
tokenizer = WordPunctTokenizer()
tokenize = lambda text: [w.lower() for w in tokenizer.tokenize(text)
                         if re.match(word_re, w.lower()) and w.lower() not in stopwords]

id2word = corpora.Dictionary.load('dictionary.dict')
mm = corpora.MmCorpus('tfidf.mm')
lsi = models.lsimodel.LsiModel(corpus=mm, id2word=id2word, num_topics=NUM_TOPICS)
dic = corpora.Dictionary.load('dictionary.dict')


def get_topics(text, num, model=lsi):
    """ get +num+ topics for text +text+ """
    topics = []
    for t in sorted(model[dic.doc2bow(tokenize(text))],
                    key=lambda t: t[1], reverse=True)[:num]:
        topics.append([u[1] for u in lsi.show_topic(t[0])])
    return topics
# Legacy Python 2 code (unicode(), dict.has_key(), print statements).
class KareninaParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.inside_dd = False
        self.doc_id = 0
        self.token_count = 0
        self.token_sum_len = 0
        self.iindex = {}
        self.paragraphs = []
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = RussianStemmer()

    def handle_starttag(self, tag, attrs):
        if tag == "dd":
            self.inside_dd = True
            self.doc_id += 1
        else:
            self.inside_dd = False

    def handle_data(self, data):
        if self.inside_dd:
            self.paragraphs.append(data)
            terms = set()
            for token in self.tokenizer.tokenize(unicode(data.lower(), 'utf-8')):
                if token[0] in string.punctuation:
                    continue
                self.token_count += 1
                self.token_sum_len += len(token)
                term = self.stemmer.stem(token)
                if term not in terms:
                    terms.add(term)
                    if self.iindex.has_key(term):
                        self.iindex[term].append(self.doc_id)
                    else:
                        self.iindex[term] = [self.doc_id]

    def dump_iindex(self, output_name):
        output = open(output_name, 'wb')
        pickle.dump(self.iindex, output)
        output.close()

    def dump_paragraphs(self, output_name):
        output = open(output_name, 'wb')
        pickle.dump(self.paragraphs, output)
        output.close()

    def get_stat(self):
        term_sum_len = 0
        for term in self.iindex.keys():
            term_sum_len += len(term)
        term_count = len(self.iindex.keys())
        if not (term_count and self.token_count):
            self.stat = {}
        else:
            self.stat = {
                'token_count': self.token_count,
                'token_avg_len': self.token_sum_len / float(self.token_count),
                'term_count': term_count,
                'term_avg_len': term_sum_len / float(term_count)
            }
        return self.stat

    def print_iindex(self):
        for term in sorted(self.iindex.keys()):
            posting_list = self.iindex[term]
            print term
            print len(posting_list)
            print posting_list
            print '---------------------'