import nltk
from nltk.tokenize import PunktSentenceTokenizer
from termcolor import colored


def __index_content(url_id, db, soup):
    # Extract the page title and the main MediaWiki content block.
    title = soup.title.text
    if title is not None:
        # Note: the results of these encode() calls are discarded as written,
        # so title stays a unicode/str object.
        if isinstance(title, basestring):
            title.encode('utf8')
        else:
            unicode(title).encode('utf8')

    content = soup.find("div", {"id": "mw-content-text"}).text
    if isinstance(content, basestring):
        content.encode('utf8')
    else:
        unicode(content).encode('utf8')
    # content = soup.text

    # Split the content into sentences, then collect noun-phrase chunks as "hints".
    custom_tokenizer = PunktSentenceTokenizer()
    tokenized_sentences = custom_tokenizer.tokenize(unicode(content))

    page = dict()
    page["title"] = title
    hints_list = list()
    # Assign up front so the "hints" key exists even if tagging fails part-way.
    page["hints"] = hints_list
    try:
        for sentence in tokenized_sentences:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            grammar = r"""NP: {<DT|PP\$>?<JJ>*<NN>}
                              {<NNP>+}"""
            chunk_parser = nltk.RegexpParser(grammar)
            chunked = chunk_parser.parse(tagged)
            for chunk in chunked.subtrees():
                if chunk.label() == "NP":
                    line = list()
                    for each in chunk.leaves():
                        if len(each[0]) > 2:
                            line.append(each[0])
                    if len(line) > 0:
                        final_value = (" ".join(line)).lower()
                        hints_list.append(final_value)
    except Exception as e:
        print(str(e))

    db.known_urls.update_one({"_id": url_id}, {"$set": {"content": page}})
    page_content_size = len(page["hints"])
    print(colored("\t\tUpdated With Indexed Content", "yellow"))
    # current_dir = os.getcwd()
    # files_dir = current_dir + "/Originals/"
    # file_name = url_id
    # file_path = files_dir + str(file_name)
    # created_file = open(file_path, "w")
    # created_file.write(content.encode("utf-8"))
    # created_file.close()
    # print("\t\tOriginal Content Is Saved")
    return page_content_size
from typing import List, Optional, Set

import nltk


def get_nltk_sents(txt: str,
                   tokenizer: nltk.PunktSentenceTokenizer,
                   extra_abbreviations: Optional[Set[str]] = None) -> List[str]:
    # Extend the tokenizer's known abbreviations (lower-case, without the trailing
    # period) so it does not split sentences after them, then tokenize.
    if extra_abbreviations is not None:
        tokenizer._params.abbrev_types.update(extra_abbreviations)
    return tokenizer.tokenize(txt)
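# Usage sketch for get_nltk_sents (the sample sentence and the "approx"
# abbreviation below are illustrative assumptions, not from the original source):
tokenizer = nltk.PunktSentenceTokenizer()
sents = get_nltk_sents("The meeting starts at approx. 10 a.m. Please be on time.",
                       tokenizer,
                       extra_abbreviations={"approx"})
print(sents)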
class LanguageModel:
    """ N-gram model """

    def __init__(self, n_gram=2, missed_value=0.99):
        """
        :param n_gram: length of n-gram
        :param missed_value: default value for all unseen n-grams
        """
        self.n = n_gram
        self.n_grams = {}
        self.context = {}
        # SentenceTokenizer and Tokenizer are project-specific wrappers assumed
        # to be importable in the original module.
        self.sentence_tokenizer = SentenceTokenizer()
        self.tokenizer = Tokenizer()
        self.missed_value = missed_value

    def build_model(self, text):
        sentences = self.sentence_tokenizer.tokenize(text)
        words = [
            list(
                filter(
                    lambda s: s.isalpha(),
                    self.tokenizer.tokenize(sentence.strip())
                )
            )
            for sentence in sentences
        ]
        for sentence in words:
            if len(sentence) < self.n:
                key = " ".join(sentence)
                self.context.update({key: self.context.get(key, 0) + 1})
            else:
                for i in range(len(sentence) - self.n + 1):
                    context_key = " ".join(sentence[i:i + self.n - 1])
                    n_gram_key = " ".join(sentence[i:i + self.n])
                    self.context.update({context_key: self.context.get(context_key, 0) + 1})
                    self.n_grams.update({n_gram_key: self.n_grams.get(n_gram_key, 0) + 1})

    def calculate_proba(self, sentence):
        words = list(
            filter(
                lambda s: s.isalpha(),
                self.tokenizer.tokenize(sentence.strip())
            )
        )
        result = 1
        for i in range(min(self.n - 2, len(words) - 1), len(words)):
            if i < self.n - 1:
                # Probability of a sentence prefix shorter than a full n-gram.
                size = sum([val for key, val in self.context.items()
                            if len(key.split(" ")) == i + 1])
                result *= self.context.get(
                    " ".join(words[:i + 1]),
                    self.missed_value if i == self.n - 2 else 0
                ) / size
            elif i > self.n - 2:
                # Conditional probability of word i given the previous n-1 words.
                context_key = " ".join(words[i - self.n + 1:i])
                n_gram_key = " ".join(words[i - self.n + 1:i + 1])
                context_val = self.context.get(context_key, self.missed_value)
                n_gram_val = self.n_grams.get(n_gram_key, self.missed_value)
                p = n_gram_val / context_val
                result *= p
        return result
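# Usage sketch for LanguageModel: a bigram model trained on a tiny invented text.
# It assumes the SentenceTokenizer and Tokenizer classes referenced in __init__
# are available from the surrounding project.
lm = LanguageModel(n_gram=2)
lm.build_model("The cat sat on the mat. The cat ran away.")
print(lm.calculate_proba("The cat sat"))     # seen bigrams -> comparatively high score
print(lm.calculate_proba("The mat meowed"))  # unseen bigrams fall back to missed_value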
from nltk.tokenize import BlanklineTokenizer, PunktSentenceTokenizer, WhitespaceTokenizer


def tokenize_english_document(input_text):
    """
    This is a crude tokenizer for input conversations in English.
    :param input_text: raw conversation text, with speakers separated by blank lines
    :return: list of subtitles, each a list of lines
    """
    end_list = []
    block_tokenizer = BlanklineTokenizer()
    sentence_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = WhitespaceTokenizer()
    # using the 38 characters in one line rule from ITV subtitle guidelines
    characters_per_line = 38
    lines_per_subtitle = 2

    blocks = block_tokenizer.tokenize(input_text)
    for block in blocks:
        # We have one speaker
        sentences = sentence_tokenizer.tokenize(block)
        # We have the sentences
        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            reverse_words = words[::-1]

            lines = []
            current_line = ''
            line_full = False
            while reverse_words:
                word = reverse_words.pop()
                longer_line = ' '.join([current_line, word]).strip()
                if len(longer_line) > characters_per_line and len(current_line):
                    # The longer line is overreaching boundaries
                    reverse_words.append(word)
                    line_full = True
                elif len(word) >= characters_per_line:
                    # Very long words
                    current_line = longer_line
                    line_full = True
                else:
                    current_line = longer_line

                if line_full:
                    lines.append(current_line)
                    current_line = ''
                    line_full = False

                if len(lines) >= lines_per_subtitle:
                    end_list.append(lines)
                    lines = []
            if current_line:
                lines.append(current_line)
            if lines:
                end_list.append(lines)
    return end_list
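# Usage sketch for tokenize_english_document; the two-speaker dialogue is invented.
dialogue = (
    "Hello there, how are you doing on this lovely afternoon?\n"
    "\n"
    "I am doing fine, thank you very much for asking about it."
)
for subtitle in tokenize_english_document(dialogue):
    print(subtitle)  # each subtitle holds at most two lines of ~38 characters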
from collections import Counter

import nltk
import numpy as np
from nltk.tokenize import PunktSentenceTokenizer
from sklearn.base import BaseEstimator, TransformerMixin


class SentenceToVec(BaseEstimator, TransformerMixin):
    def __init__(self, stop_words, vector_len=1000):
        self.vocab = []
        self.stop_words = stop_words
        self.vector_len = vector_len
        self.tokenizer = PunktSentenceTokenizer()

    def format_word(self, word):
        if word.isdigit():
            return "0"
        elif word in self.stop_words:
            return ""
        else:
            return word.strip()

    def tokenize(self, sentence):
        res_tokens = []
        tokens_temp = self.tokenizer.tokenize(sentence)
        for tokens in tokens_temp:
            tokens = nltk.word_tokenize(tokens)
            tokens = [self.format_word(t) for t in tokens]
            res_tokens += [t for t in tokens if t]
        return res_tokens

    def fit(self, X, y=None):
        # Build the vocabulary from the most frequent non-stop-word tokens.
        self.vocab = []
        word_freq = Counter()
        for i in range(X.shape[0]):
            for w in self.tokenize(X[i]):
                if w not in self.stop_words:
                    word_freq[w] += 1
        for term, freq in word_freq.most_common():
            if len(self.vocab) < self.vector_len:
                self.vocab.append(term)
        return self

    def _vectorize(self, words):
        # Bag-of-words counts restricted to the fitted vocabulary.
        freq = Counter(words)
        vector = [freq.get(v, 0) for v in self.vocab]
        return np.array(vector)

    def transform(self, X, copy=True):
        _X = np.zeros((X.shape[0], len(self.vocab)))
        for i in range(X.shape[0]):
            _X[i] = self._vectorize(self.tokenize(X[i]))
        return _X
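# Usage sketch for SentenceToVec on a tiny corpus; the documents and stop-word
# set below are invented for illustration.
docs = np.array(["The cat sat on the mat.", "The dog chased the cat."])
vectorizer = SentenceToVec(stop_words={"the", "on"}, vector_len=10)
features = vectorizer.fit(docs).transform(docs)
print(vectorizer.vocab)  # most frequent tokens kept as the feature vocabulary
print(features.shape)    # (2, len(vectorizer.vocab))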
import re

from nltk.tokenize import PunktSentenceTokenizer


def pre_segment(doc):
    """Set sentence boundaries with nltk instead of spacy."""
    if len(str(doc.text).split()) > 3:
        tokenizer = PunktSentenceTokenizer(doc.text)
        sentences = tokenizer.tokenize(doc.text)
        for nltk_sentence in sentences:
            words = re.findall(r"[\w]+|[^\s\w]", nltk_sentence)
            # Find the span of spaCy tokens matching this nltk sentence and mark
            # its first token as a sentence start.
            for i in range(len(doc) - len(words) + 1):
                token_list = [str(token) for token in doc[i:i + len(words)]]
                if token_list == words:
                    doc[i].is_sent_start = True
                    for token in doc[i + 1:i + len(words)]:
                        token.is_sent_start = False
    return doc
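# Usage sketch: registering pre_segment so Punkt decides sentence starts before
# spaCy's parser runs. This assumes the spaCy 2.x add_pipe API; in spaCy 3 the
# function would first need to be registered with @Language.component.
import spacy

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(pre_segment, before="parser")
doc = nlp("Mr. Smith went to Washington. He stayed there for a week.")
print([sent.text for sent in doc.sents])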
from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer


def sentence_tokenizer(text):
    """
    Tokenizes sentences.
    :param text:
    :return: list of sentences (a sentence is a string)
    """
    # German abbreviations (and enumeration numbers) after which a period does
    # not end the sentence.
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = {
        'zzgl', 'prof', 'ca', 'vj', 't', 'mio', 'sro', 'lv', 'io', 'ihv', 'bzw',
        'usw', 'inkl', 'zt', 'vh', 'dr', 'entspr', 'dem', 'fort', 'co', 'kg',
        'zb', 'bspw', 'ua', 'rd', 'abs', 'etc', 'tsd', 'z.b', 'evtl',
        '1', '2', '3', '4', '5', '6', '7', '8', '9', '19', '20', '21'
    }
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    return sentence_splitter.tokenize(text)
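# Usage sketch for the German-aware sentence_tokenizer; the example text is invented.
text = "Der Umsatz stieg um ca. 5 Prozent bzw. rd. 2 Mio. Euro. Das ist erfreulich."
for sent in sentence_tokenizer(text):
    print(sent)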
def handle(self, *app_labels, **options):
    # BwogArticle and ParsedItem are Django models from the surrounding project.
    print(app_labels)
    print(options)
    for article in BwogArticle.objects.all():
        sentence_tokenizer = PunktSentenceTokenizer()
        sentences = sentence_tokenizer.tokenize(article.body)
        for sentence_index in range(len(sentences)):
            sentence = sentences[sentence_index]
            sentence_words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(sentence_words)
            for tup_index in range(len(tagged)):
                tup = tagged[tup_index]
                article_word = tup[0]
                article_tag = tup[1]
                # Store each tagged word with its position so sentences can be
                # reconstructed later.
                p = ParsedItem(content_object=article,
                               word=article_word,
                               tag=article_tag,
                               sentence_sequence=sentence_index,
                               word_sequence=tup_index)
                p.save()
                print(p)
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer


def name_ent_recog(post):
    # Train the Punkt tokenizer on a State of the Union address, then run NLTK's
    # named-entity chunker over each sentence of the post.
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = post
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    namedEnt = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt.append(nltk.ne_chunk(tagged))
            # chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP.?>*<NN>?}"""
            # # chunkGram = r"""Chunk: {<.*>+}
            # #                        }<VB.?|IN|DT>+{"""
            # chunkParser = nltk.RegexpParser(chunkGram)
            # chunked = chunkParser.parse(tagged)
            # print(chunked)
            # # print(tagged)
    except Exception as e:
        print(str(e))
    return namedEnt
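# Usage sketch for name_ent_recog; the post text is invented. Requires the usual
# NLTK data packages (punkt, state_union, the POS tagger and the ne_chunk models).
post = "Barack Obama visited Berlin last year and met Angela Merkel."
for tree in name_ent_recog(post):
    print(tree)  # nltk.Tree objects containing NE-labelled subtrees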
VBD     verb, past tense                    took
VBG     verb, gerund/present participle     taking
VBN     verb, past participle               taken
VBP     verb, sing. present, non-3d         take
VBZ     verb, 3rd person sing. present      takes
WDT     wh-determiner                       which
WP      wh-pronoun                          who, what
WP$     possessive wh-pronoun               whose
WRB     wh-adverb                           where, when
"""

train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

custom_sentence_tokenizer = PunktSentenceTokenizer(train_text=train_text)
tokenized_sentences = custom_sentence_tokenizer.tokenize(sample_text)


def process_content():
    try:
        for sentence in tokenized_sentences:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            # Chunk adverbs/verbs followed by proper nouns and an optional noun.
            chunk_gram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            chunk_parser = nltk.RegexpParser(chunk_gram)
            chunked = chunk_parser.parse(tagged)
            print(chunked)
    except Exception as e:
        print(str(e))
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 11 12:49:39 2020

@author: alex.a.murray
"""

import nltk
from nltk.corpus import state_union
from nltk import PunktSentenceTokenizer

# Train the Punkt tokenizer on the 2005 address and apply it to the 2006 one.
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))


process_content()
def tokenize_to_sentences(self, paragraph):
    tokenizer = PunktSentenceTokenizer()
    sentences = tokenizer.tokenize(paragraph)
    return sentences
VBD     verb, past tense                    took
VBG     verb, gerund/present participle     taking
VBN     verb, past participle               taken
VBP     verb, sing. present, non-3d         take
VBZ     verb, 3rd person sing. present      takes
WDT     wh-determiner                       which
WP      wh-pronoun                          who, what
WP$     possessive wh-pronoun               whose
WRB     wh-adverb                           where, when
"""

train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

custom_sentence_tokenizer = PunktSentenceTokenizer(train_text=train_text)
tokenized = custom_sentence_tokenizer.tokenize(text=sample_text)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            # Chink grammar: chunk everything, then chink verbs, prepositions
            # and determiners back out.
            chunk_gram = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT>+{"""
            chunk_parser = nltk.RegexpParser(chunk_gram)
            chunked = chunk_parser.parse(tagged)
            chunked.draw()
    except Exception as e:
        print(str(e))
raw = f.read()
lines = lib.get_dat_sgml(raw)
sys.stderr.write(str(len(lines)) + " entries\n")

p = PunktSentenceTokenizer()
for i in range(len(lines)):
    if i % 100 == 0:
        sys.stderr.write(str(i) + "/" + str(len(lines)) + "\n")
    line = dict(lines[i])
    if not ("EKYWD" in line and "EABST" in line):
        continue
    abstract = line["EABST"]
    abstract = p.tokenize(abstract)
    abstract = [word_tokenize(sentence) for sentence in abstract]
    keywords = re.split("\t", line["EKYWD"])
    keywords = [word_tokenize(keyword) for keyword in keywords]
    for sentence in abstract:
        pos_sentence = pos_tag(sentence)
        # simplify_wsj_tag comes from nltk.tag.simplify (here imported as `tag`);
        # it exists only in NLTK 2.x and was removed in NLTK 3.
        pos_sentence = [(word, tag.simplify.simplify_wsj_tag(t))
                        for word, t in pos_sentence]
        j = 0
        while j < len(sentence):
            found = False
            for k in range(len(keywords)):
                keyword = keywords[k]
                keyword_len = len(keyword)
                if keyword_len > 0 and keyword == sentence[j:j + keyword_len]:
                    for l in range(keyword_len):
                        this_word = keyword[l]
for keyword in all_keywords:
    keywords.append(word_tokenize(keyword))
sys.stderr.write("All keywords: " + str(len(all_keywords)) + "\n")

p = PunktSentenceTokenizer()
for i in range(len(lines)):
    if i % 10 == 0:
        sys.stderr.write(str(i) + "/" + str(len(lines)) + "\n")
    line = dict(lines[i])
    if not ("EKYWD" in line and "EABST" in line):
        continue
    abstract = line["EABST"]
    abstract = p.tokenize(abstract)
    abstract = [word_tokenize(sentence) for sentence in abstract]
    for sentence in abstract:
        j = 0
        while j < len(sentence):
            found = False
            for k in range(len(keywords)):
                keyword = keywords[k]
                keyword_len = len(keyword)
                if keyword_len > 0 and keyword == sentence[j:j + keyword_len]:
                    for l in range(keyword_len):
                        this_word = keyword[l]
                        out = this_word + "\t"
                        if l == 0:
                            out += "B"  # begins a keyword span
                        else:
                            out += "I"  # assumed inside tag (BIO scheme); the original snippet is truncated here
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer

example = "Hello Mr. Holmes. How are you doing? The weather is nice Holmes and Python is amazing. I hope you like it too!"

sen_list = sent_tokenize(example)
sen = sen_list[2]
print(sen)

stop_words = set(stopwords.words('english'))

'''Tokenizing and removing stop words:
words = word_tokenize(sen)
filtered_words = []
for w in words:
    if w not in stop_words:
        filtered_words.append(w)
print(filtered_words)
'''

tokenize = PunktSentenceTokenizer(sen)
tokenized = tokenize.tokenize(sen)

# Speech tagging
print(tokenized)
for i in tokenized:
    words = word_tokenize(i)
    tagged = nltk.pos_tag(words)

# Chunking
'''
Using regex here: . matches any character and ? means at most one repetition.
For further info see the tutorial on pythonprogramming.net.
'''
# RB, VB, NNP etc. are POS tags (VB = verb, ...); we select certain kinds of
# words into a chunk.
chunkgram = r"""Chunk: {<RB.?>*<VB.?>*<NNP><NN>?}"""
chunkparser = nltk.RegexpParser(chunkgram)
chunked = chunkparser.parse(tagged)
print(chunked)
from nltk import PunktSentenceTokenizer, WordPunctTokenizer
from collections import Counter

vocab_size = 1000

sentTokenizer = PunktSentenceTokenizer()
wordTokenizer = WordPunctTokenizer()

filename = 'data/formatted_movie_lines.txt'
string = open(filename, mode='r', encoding='utf8').read()
string = string.replace("'t", "")
string = string.replace("'s", "")

words = wordTokenizer.tokenize(string)
sentences = set(sentTokenizer.tokenize(string))

# Map each of the vocab_size most frequent words to its count.
vocab = dict(Counter(words).most_common(vocab_size))

sentences = [wordTokenizer.tokenize(sentence) for sentence in sentences]
new_sentences = []
with open("lines.txt", mode='w', encoding='utf8') as file:
    for sentence in sentences:
        # Write only sentences that contain none of the most frequent words.
        write = True
        for word in sentence:
            if word in vocab:
                write = False
                break
        if write:
            file.writelines(" ".join(sentence) + "\n")
            new_sentences.append(sentence)
# Representing the words with their parts of speech
import nltk
from nltk.corpus import state_union
'''
PunktSentenceTokenizer is an unsupervised ML sentence tokenizer.
It comes pretrained and we can also train it further.
'''
from nltk import PunktSentenceTokenizer

train = state_union.raw("2005-GWBush.txt")
text = state_union.raw("2006-GWBush.txt")

sentence_tokenizer = PunktSentenceTokenizer(train)
tokenized = sentence_tokenizer.tokenize(text)


def process():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))


process()
# `example_text` and `words` are defined earlier in the original script.
sentences = sent_tokenize(example_text)

for w in words:
    print(w)
print()

for s in sentences:
    print(s)
print()

# Using PunktSentenceTokenizer and training it
train_text = state_union.raw("2005-GWBush.txt")
custom_sentence_tokenizer_trained = PunktSentenceTokenizer(train_text)
sentences = custom_sentence_tokenizer_trained.tokenize(example_text)
for s in sentences:
    print(s)
print()

# Using PunktSentenceTokenizer with no training (it comes pretrained)
custom_sentence_tokenizer_untrained = PunktSentenceTokenizer()
sentences = custom_sentence_tokenizer_untrained.tokenize(example_text)
for s in sentences:
    print(s)