def parseTextToSentences(text):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'ms', 'mrs', 'prof', 'inc', 'no', 'e.g', 'i.e'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    data = text
    data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')
    sentences = []
    for para in data.split('\n'):
        if para:
            sentences.extend(sentence_splitter.tokenize(para))
    return sentences
def analyzer_results():
    essay = request.form.get('essay')
    similarity_nmf = request.form.get('similarity_nmf')
    similarity_tfidf = request.form.get('similarity_tfidf')
    # linebreak_idx = [m.start() for m in re.finditer('\n', essay)]
    s_tokenizer = PunktSentenceTokenizer()
    sentences = s_tokenizer.tokenize(essay)
    top_sentences = textrank.summarize(essay).split('\n')
    top_idx = []
    for i, sentence in enumerate(sentences):
        if sentence in top_sentences:
            top_idx.append(i)
    # Retokenize to get punctuation marks back
    sentences = s_tokenizer.tokenize(essay)
    sentences = list(enumerate(sentences))
    topics, similar_essays = processEssay(essay, similarity_nmf, similarity_tfidf, json_output=False)
    essay1 = similar_essays[0]
    essay2 = similar_essays[1]
    essay3 = similar_essays[2]
    topic1, topic2, topic3, topic4, topic5, topic6, topic7 = topics
    topic_names = ['Family', 'Music', 'Culture', 'Sport', 'Personal/Story', 'Science', 'Career']
    topic_tuples = zip(topic_names, topics)
    # Load interactive plot
    interactive_plot = interactivePlot()
    return render_template('analyzer_results.html', essay1=essay1, essay2=essay2, essay3=essay3,
                           topic_tuples=topic_tuples, sentences=sentences, top_idx=top_idx,
                           interactive_plot=interactive_plot)
def summarize_pdf(article_text):
    trainer = PunktTrainer()
    trainer.train(article_text)
    tok = PunktSentenceTokenizer(trainer.get_params())
    sentence_list = tok.tokenize(article_text)
    sentence_lists = []
    sent_list = []
    clean_sent = []
    for sent in sentence_list:
        tok = TreebankWordTokenizer()
        words = tok.tokenize(sent)
        wordss = []
        words = [ww.lower() for ww in words]
        sentence_lists.append(" ".join(words))
        for word, tag in pos_tag(words):
            if tag.startswith('NN'):
                pos = 'n'
            elif tag.startswith('VB'):
                pos = 'v'
            elif tag.startswith('RB'):
                pos = 'r'
            else:
                pos = 'a'
            stem = WordNetLemmatizer()
            w = stem.lemmatize(word, pos)
            if (w not in punc) & bool(re.search(r"[^\d]", w)):
                wordss.append(w.lower())
        clean_sent.append(' '.join(wordss))
        sent_list.append(wordss)
    return sent_list, clean_sent, sentence_lists, sentence_list
def split_sentences_to_strings(input_file, name="Movie"):
    """Transform a plain-text file into a list of strings, each string a sentence."""
    data = []
    text = str()
    with open(input_file, newline='', encoding="utf-8") as in_file:
        reader = csv.reader(in_file, delimiter=',', quotechar='"')
        next(reader)  # skip header
        for row in reader:
            if name == "Movie":
                text += row[2]
            elif name == "Financial":
                text += row[3]
    sent_detector = PunktSentenceTokenizer(train_text=text)
    with open(input_file, newline='', encoding="utf-8") as in_file:
        reader = csv.reader(in_file, delimiter=',', quotechar='"')
        next(reader)  # skip header
        for row in reader:
            cleaned_sentences = []
            if name == "Movie":
                sentences = sent_detector.tokenize(row[2].strip())
            elif name == "Financial":
                sentences = sent_detector.tokenize(row[3].strip())
            for sentence in sentences:
                words = wordpunct_tokenize(sentence)
                words_out = []
                for word in words:
                    words_out.append(clean_word(word))
                cleaned_sentence = " ".join(words_out)
                cleaned_sentence = wordpunct_tokenize(cleaned_sentence)  # remove double space
                cleaned_sentence = " ".join(cleaned_sentence)
                cleaned_sentence = wordpunct_tokenize(cleaned_sentence)  # remove double NUMBER token
                last_len = 0
                while last_len != len(cleaned_sentence):
                    last_len = len(cleaned_sentence)
                    for index, word in enumerate(cleaned_sentence):
                        if word == "NUMBER":
                            if index + 1 < len(cleaned_sentence):
                                if cleaned_sentence[index + 1] == "NUMBER":
                                    del cleaned_sentence[index + 1]
                cleaned_sentence = " ".join(cleaned_sentence)
                cleaned_sentences.append(cleaned_sentence)
            if name == "Movie":
                data.append([cleaned_sentences, row[1]])
            elif name == "Financial":
                data.append([cleaned_sentences, row[4], row[3]])
    return data
class PunktSplitter(ISplitter):
    """
    A splitter using the `PunktSentenceTokenizer
    <https://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.punkt>`_,
    the NLTK implementation of the "Unsupervised Multilingual Sentence Boundary
    Detection" algorithm (Kiss and Strunk, 2005).

    .. note::
        The default implementation uses a model trained on English sentences.
        `This kaggle resource <https://www.kaggle.com/nltkdata/punkt/version/2#>`_
        offers pretrained Punkt models for other languages as well, including German.
        In my tests though, German models performed poorly compared to the default...

    .. todo::
        Train a Punkt model for Swiss-German.
        (https://stackoverflow.com/questions/21160310/training-data-format-for-nltk-punkt)
    """

    def __init__(self, modelfile=None):
        if modelfile is not None:
            with open(modelfile, 'rb') as f:
                self.tokenizer = pickle.load(f)
        else:
            self.tokenizer = PunktSentenceTokenizer()

    def split(self, text: str) -> List[str]:
        """Split text using Punkt."""
        paragraphs = (p for p in text.split('\n') if p)
        sentences = []
        for p in paragraphs:
            sentences.extend(self.tokenizer.tokenize(p))
        return sentences
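The ``.. todo::`` above asks for a custom Punkt model. The following is a minimal sketch (not part of the original module) of how such a model could be trained and pickled so that ``PunktSplitter(modelfile=...)`` can load it; the corpus path and output filename are hypothetical placeholders.

# Sketch only: assumes a plain-text training corpus at 'swiss_german_corpus.txt'.
import pickle
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

def train_punkt_model(corpus_path, model_path):
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True  # also learn collocations, which helps with abbreviations
    with open(corpus_path, encoding='utf-8') as f:
        trainer.train(f.read(), finalize=False)
    trainer.finalize_training()
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    with open(model_path, 'wb') as out:
        pickle.dump(tokenizer, out)

# train_punkt_model('swiss_german_corpus.txt', 'punkt_gsw.pickle')
# splitter = PunktSplitter(modelfile='punkt_gsw.pickle')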
def summarize(text, ref='', lines=7):
    text = re.sub(r'\[[0-9]*\]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    clean_text = text.lower()
    clean_text = re.sub(r'\W', ' ', clean_text)
    clean_text = re.sub(r'\d', ' ', clean_text)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'i.e'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    text = text.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')
    sentences = sentence_splitter.tokenize(text)
    # sentences = nltk.sent_tokenize(text)
    stop_words = nltk.corpus.stopwords.words('english')
    word_count = {}
    for word in nltk.word_tokenize(clean_text):
        if word not in stop_words:
            word_count[word] = word_count.get(word, 0) + 1
    sentence_score = {}
    i = 0
    for s in sentences:
        for word in nltk.word_tokenize(s.lower()):
            if word in word_count.keys():
                old = sentence_score.get(s, (0, 0, i))
                i += 1
                sentence_score[s] = (old[0] + word_count[word], old[1] + 1, old[2])

    def score(pair):
        return (pair[0] - pair[2]) / pair[1]

    scores = {}
    for s in sentence_score.keys():
        if sentence_score[s][1] > 2:
            scores[s] = score(sentence_score[s])
        else:
            scores[s] = score(sentence_score[s]) - 100
    best_sentences = heapq.nlargest(lines, scores, key=scores.get)
    best_sentences.sort(key=lambda x: sentence_score[x][2])
    string = ''
    for s in best_sentences:
        if s[0] == ' ':
            s = s[1:]
        if 'refer' in s and len(scores.keys()) < 4:
            print('Please be more specific\n')
            if len(ref) > 1:
                print('Here are some suggestions:')
                for i in range(len(ref)):
                    print("=>", ref[i])
                print('\n')
            return
        print(s)
        string += s + '\n'
    return string
def get_todo_items(text):
    all_items = list()
    tokenizer = PunktSentenceTokenizer()
    sen_tokens = tokenizer.tokenize(text)
    for sen_token in sen_tokens:
        todo_items = list()
        tokens = nltk.word_tokenize(sen_token)
        tags = tagger.tag(tokens)
        stop_words = [word for (word, tag) in tags if tag in (tagVB, tagVBP)]
        ind = -1
        for word in stop_words:
            curr_ind = tokens.index(word)
            if curr_ind != 0 and tags[curr_ind - 1][1] in (tagCC, tagRB):
                to_ind = curr_ind - 1
            else:
                to_ind = curr_ind
            if ind != -1 and abs(to_ind - ind) > 1:
                todo_items.append(' '.join(tokens[ind:get_punctuation_index(tokens, ind, to_ind)]))
            elif ind != -1 and len(todo_items) > 0:
                last_ind = len(todo_items)
                todo_items[last_ind - 1] = ' '.join([todo_items[last_ind - 1], tokens[to_ind - 1]])
            ind = curr_ind
        if ind != -1 and abs(len(tokens) - ind) > 1:
            todo_items.append(' '.join(tokens[ind:get_punctuation_index(tokens, ind, len(tokens))]))
        elif ind != -1 and len(todo_items) > 0:
            last_ind = len(todo_items)
            todo_items[last_ind - 1] = ' '.join([todo_items[last_ind - 1], tokens[len(tokens) - 1]])
        all_items.extend(todo_items)
    return all_items
def fractal_representation(self):
    punkt_param = PunktParameters()
    for each_paragraph in self.paragraphs:
        buffer_p = paragraph()
        buffer_p.paragraph = each_paragraph
        buffer_p.tokens = nltk.word_tokenize(preprocess(each_paragraph))
        buffer_p.weights['words'] = FreqDist(buffer_p.tokens)
        buffer_p.weights['total'] = {'words': 0, 'sentences': 0}
        punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
        sentence_splitter = PunktSentenceTokenizer(punkt_param)
        sentences = sentence_splitter.tokenize(each_paragraph)
        for each_sentence in sentences:
            self.stotal += 1
            buffer_s = sentence()
            buffer_s.sentence = each_sentence
            buffer_s.tokens = nltk.word_tokenize(preprocess(each_sentence))
            if len(buffer_s.tokens) > 0:
                buffer_s.weights['sentence'] = FreqDist(buffer_s.tokens)
                buffer_s.weights['paragraph'] = self.calculate_relative_frequence(buffer_s.tokens, buffer_p.weights['words'])
                buffer_s.weights['document'] = self.calculate_relative_frequence(buffer_s.tokens, self.fractal.weights)
                buffer_s.weights['total'] = {}
                buffer_s.weights['total']['sentence'] = 1
                buffer_s.weights['total']['paragraph'] = sum(buffer_s.weights['paragraph'].values())
                buffer_s.weights['total']['document'] = sum(buffer_s.weights['document'].values())
                self.s_weight += buffer_s.weights['total']['document']
                buffer_p.weights['total']['sentences'] += buffer_s.weights['total']['document']
                buffer_p.sentences.append(buffer_s)
        buffer_p.weights['total']['words'] = sum(buffer_p.weights['words'].values())
        self.fractal.paragraphs.append(buffer_p)
        self.pindex += 1
def summarize(self):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(self.text)
    structure = {}
    sentence_objects = []
    for idx in range(len(sentences)):
        obj = {'text': sentences[idx], 'index': idx, 'data': {}}
        sentence_objects.append(obj)
    structure['sentences'] = sentence_objects
    self.sentencecount = len(structure['sentences'])
    structure['ordered'] = []
    structure['weights'] = {'words': FreqDist(nltk.word_tokenize(preprocess(self.text))), 'total': 0, 'transformed': 0}
    structure['weights']['total'] = sum(structure['weights']['words'].values())
    self.sentenceIndex = 0
    for each_sent in structure['sentences']:
        each_sent['data']['tokens'] = nltk.word_tokenize(preprocess(each_sent['text']))
        each_sent['data']['sinTransform'] = (1 - math.sin(self.sentenceIndex * (math.pi / self.sentencecount))) + 1
        for each_word in structure['weights']['words']:
            if each_word in each_sent['data']['tokens']:
                structure['weights']['words'][each_word] *= each_sent['data']['sinTransform']
        self.sentenceIndex += 1
    structure['weights']['transformed'] = sum(structure['weights']['words'].values())
    self.sentenceIndex = 0
    for each_sent in structure['sentences']:
        each_sent['data']['weights'] = {'words': self.calculate_relative_frequence(each_sent['data']['tokens'], structure['weights']['words']), 'total': 0}
        each_sent['data']['weights']['total'] = sum(each_sent['data']['weights']['words'].values())
        self.sentenceIndex += 1
    structure['ordered'] = sorted(structure['sentences'], key=lambda x: x['data']['weights']['total'], reverse=True)
    structure_keep = structure['ordered'][:self.quota]
    structure_keep.sort(key=lambda x: x['index'])
    for eac_sen in structure_keep:
        self.summary.append(eac_sen['text'])
def _create_data(self):
    if self.split == 'train':
        self._create_vocab()
    else:
        self._load_vocab()
    tokenizer = PunktSentenceTokenizer(preserve_case=False)
    data = defaultdict(dict)
    with open(self.raw_data_path, 'r') as file:
        for i, line in enumerate(file):
            words = tokenizer.tokenize(line)
            input = ['<sos>'] + words
            input = input[:self.max_sequence_length]
            target = words[:self.max_sequence_length - 1]
            target = target + ['<eos>']
            assert len(input) == len(target), "%i, %i" % (len(input), len(target))
            length = len(input)
            input.extend(['<pad>'] * (self.max_sequence_length - length))
            target.extend(['<pad>'] * (self.max_sequence_length - length))
            input = [self.w2i.get(w, self.w2i['<unk>']) for w in input]
            target = [self.w2i.get(w, self.w2i['<unk>']) for w in target]
            id = len(data)
            data[id]['input'] = input
            data[id]['target'] = target
            data[id]['length'] = length
    with io.open(os.path.join(self.data_dir, self.data_file), 'wb') as data_file:
        data = json.dumps(data, ensure_ascii=False)
        data_file.write(data.encode('utf8', 'replace'))
    self._load_data(vocab=False)
def _create_vocab(self):
    assert self.split == 'train', "Vocabulary can only be created for training file."
    tokenizer = PunktSentenceTokenizer(preserve_case=False)
    w2c = OrderedCounter()
    w2i = dict()
    i2w = dict()
    special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
    for st in special_tokens:
        i2w[len(w2i)] = st
        w2i[st] = len(w2i)
    with open(self.raw_data_path, 'r') as file:
        for i, line in enumerate(file):
            words = tokenizer.tokenize(line)
            w2c.update(words)
    for w, c in w2c.items():
        if c > self.min_occ and w not in special_tokens:
            i2w[len(w2i)] = w
            w2i[w] = len(w2i)
    assert len(w2i) == len(i2w)
    print("Vocabulary of %i keys created." % len(w2i))
    vocab = dict(w2i=w2i, i2w=i2w)
    with io.open(os.path.join(self.data_dir, self.vocab_file), 'wb') as vocab_file:
        data = json.dumps(vocab, ensure_ascii=False)
        vocab_file.write(data.encode('utf8', 'replace'))
    self._load_vocab()
def tokenSentence(s):
    tokenizer = PunktSentenceTokenizer()
    tokenizer.train(s)
    l = tokenizer.tokenize(s)
    s = '\n'.join(l)
    return s
def read_docx(path):
    """Read .docx (Microsoft 2007+)."""
    try:
        doc = docx.Document(path)
        punkt_param = PunktParameters()
        punkt_param.abbrev_types = set(['fig'])
        tokenizer = PunktSentenceTokenizer(punkt_param)
        body = []
        for p in doc.paragraphs:
            body += tokenizer.tokenize(clean_text(p.text))
        body = '\n'.join(body)
        tables = []
        for t in doc.tables:
            table = {'cells': []}
            for row in t.rows:
                row_elements = []
                for cell in row.cells:
                    for p in cell.paragraphs:
                        row_elements.append({'text': clean_text(p.text)})
                table['cells'].append(row_elements)
            tables.append(table)
        data = PaperData(body, tables)
    except Exception:
        logger.info('fail: %s', path)
        traceback.print_exc()
        return PaperData()
    return data
def retrieveUrlText(url):
    try:
        config = Config()
        config.request_timeout = 1000
        config.memoize_articles = False
        config.fetch_images = False
        config.browser_user_agent = 'Mozilla/5.0'
        article = Article(url, config)
        article.download(recursion_counter=5)
        if article.download_state != 2:
            return ''
        article.parse()
        articleText = article.text.replace('\n', ' ')
    except KeyboardInterrupt:
        raise
    except Exception:
        return ''
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set([
        'dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'fig', 'figs', 'chem', 'ph'
    ])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    articleSentences = validateSentences(sentence_splitter.tokenize(articleText))
    return articleSentences
def _split_sentences(self, text):
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(text)
    return sentences
def tokenize_sentences(self, untokenized_string: str):
    """Tokenize sentences by reading trained tokenizer and invoking
    ``PunktSentenceTokenizer()``.

    :type untokenized_string: str
    :param untokenized_string: A string containing one or more sentences.
    :rtype : list of strings
    """
    # load tokenizer
    assert isinstance(untokenized_string, str), \
        'Incoming argument must be a string.'
    if self.language == 'latin':
        tokenizer = super()
    elif self.language == 'greek':
        # Workaround for regex tokenizer
        self.sent_end_chars = GreekLanguageVars.sent_end_chars
        self.sent_end_chars_regex = '|'.join(self.sent_end_chars)
        self.pattern = rf'(?<=[{self.sent_end_chars_regex}])\s'
    elif self.language in INDIAN_LANGUAGES:
        self.sent_end_chars = SanskritLanguageVars.sent_end_chars
        self.sent_end_chars_regex = '|'.join(self.sent_end_chars)
        self.pattern = rf'(?<=[{self.sent_end_chars_regex}])\s'
    else:
        # Warn that NLTK Punkt is being used by default???
        tokenizer = PunktSentenceTokenizer()
    # mk list of tokenized sentences
    if self.language == 'greek' or self.language in INDIAN_LANGUAGES:
        return re.split(self.pattern, untokenized_string)
    else:
        return tokenizer.tokenize(untokenized_string)
def textrank(document):
    pst = PunktSentenceTokenizer()
    sentences = pst.tokenize(document)

    # Bag of Words
    from sklearn.feature_extraction.text import CountVectorizer
    cv = CountVectorizer()
    bow_matrix = cv.fit_transform(sentences)

    from sklearn.feature_extraction.text import TfidfTransformer
    normalized_matrix = TfidfTransformer().fit_transform(bow_matrix)

    # Mirrored matrix where the rows and columns correspond to sentences,
    # and the elements describe how similar the sentences are.
    # A score of 1 means the sentences are exactly the same.
    similarity_graph = normalized_matrix * normalized_matrix.T
    similarity_graph.toarray()

    # PageRank
    import networkx as nx
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)

    # Mapping of sentence indices to scores; use them to associate back
    # to the original sentences and sort them.
    scores = nx.pagerank(nx_graph)
    ranked = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    print(ranked[0][1])
def clean(text):
    # Returns cleaned, tokenized documents from raw HTML text.
    text = cleanmyhtml(text)
    # We need to remove things like (R-NE). There are some wacky abbreviations
    # for states, but all fall under five.
    text = re.sub(r'\w{1}\-\w{1,5}\.', '', text)
    # U.S. needs to become US or else it'll tokenize weirdly. Same with
    # H.R. (house resolution).
    text = re.sub(r'U\.S\.', 'US', text)
    text = re.sub(r'H\.R\.', 'HR', text)
    # NLTK is pretty poor at tokenizing sentences that contain ." or .'
    # We'll insert a space into these.
    text = re.sub(r'\.\"', '. \"', text)
    text = re.sub(r'\"\.', '. \'', text)
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set([
        'dr', 'reps', 'Reps', 'H.R', 'h.r', 'hr', 'HR', 'vs', 'mr', 'ms',
        'pres', 'mrs', 'prof', 'inc', 'sens', 'Sens', 'Sen', 'sen'
    ])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(text)
    return sentences
def __getlemmas(self, txt):
    '''
    Filters nouns, adjectives and verbs from the input text, lemmatizes them
    and returns them as a list of words (tokens).

    Parameters:
    @txt : The text (str format) which must be lemmatized
    '''
    lemma = WordNetLemmatizer()
    punkts = PunktParameters()
    punkts.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
    sent_tokenizer = PunktSentenceTokenizer(punkts)
    sentences = sent_tokenizer.tokenize(txt)
    lemma_tokens = []
    for sentence in sentences:
        stoken = word_tokenize(sentence)
        pos_sent = pos_tag(stoken)
        for p in pos_sent:
            if p[1].startswith('N'):
                pos = wordnet.NOUN
            elif p[1].startswith('J'):
                pos = wordnet.ADJ
            elif p[1].startswith('V'):
                pos = wordnet.VERB
            else:
                pos = None
            if pos:
                lemma_tokens.append(lemma.lemmatize(p[0].lower(), pos))
    return lemma_tokens
def nmf(document):
    # Split into sentences
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(document)
    # Term counts
    c = CountVectorizer()
    # Compute tf-idf
    bow_matrix = c.fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(bow_matrix)
    # Get all words in the bag-of-words model
    all_words = (c.get_feature_names())
    # index2word
    index2words = {v: k for k, v in c.vocabulary_.items()}
    nmf = NMF(n_components=2, random_state=27, alpha=0.1, l1_ratio=0.5).fit(normalized)
    # Weight (document-topic) matrix
    w = nmf.transform(normalized)
    # Shape of the feature (topic-term) matrix
    f = nmf.components_.shape
def plagiarismChecker():
    text = request.form['text_to_check']
    if text.lstrip().rstrip() == '':
        return render_template('input.html')
    punkt_parameters = PunktParameters()
    sentence_splitter = PunktSentenceTokenizer(punkt_parameters)
    sentences = sentence_splitter.tokenize(text)
    probability_of_plagiarism = 0
    for a_sentence in sentences:
        time.sleep(0.1)
        content = list(filter(lambda x: x in string.printable, a_sentence))
        str1 = ''.join(content)
        print(str1)
        # temp=list(content)
        # print(str(temp))
        the_term = urllib.parse.quote('+' + '"' + str1 + '"')
        page = requests.get('https://www.bing.com/search?q=' + the_term)
        print(page.url)
        if ((not "There are no results for" in page.text) and
                (not "No hay resultados para" in page.text) and
                (not "are no results for" in page.text)):
            probability_of_plagiarism += 1
    percent_plagiarised = str((probability_of_plagiarism / len(sentences)) * 100) + '%'
    return render_template('results.html', text=text, percent_plagiarised=percent_plagiarised)
def rank_sentences(text, sentence_scores, title="", n=7):
    final_sentences = []
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)
    sent_tokenizer = PunktSentenceTokenizer(trainer.get_params())
    for s in sentence_scores:
        if title == "":
            break
        else:
            sentence_scores[s] *= (1 + similarity_score(title, s))
    sc = sentence_scores.copy()
    sc = OrderedDict(sorted(sc.items(), key=lambda t: t[1], reverse=True))
    ordered_sents = dict(islice(sc.items(), n))
    proper_sentences = sent_tokenizer.tokenize(text)
    for s in proper_sentences:
        if s.lower() in ordered_sents:
            final_sentences.append(s)
    return final_sentences
def article_sentences(self, article_text):
    # take in article.text
    document = ' '.join(article_text.strip().split('\n'))
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(document)
    return sentences
def tokenise(self, sample):
    # first pass - look for poems
    verses = self.scan_for_verse(sample)
    if verses:
        self.notes.append("got {} verses".format(len(verses)))
        verses = [re.sub(r'\[\d+\]', '', v) for v in verses]
    else:
        verses = []
    # second pass - look for sentences
    text = re.sub(r'\[\d+\]', '', sample)
    text = re.sub("\r\n", ' ', text)
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(self.cf['abbreviations'])
    tokenizer = PunktSentenceTokenizer(punkt_param)
    sentences = tokenizer.tokenize(text)
    sentences = sentences[1:-1]
    self.notes.append("got {} sentences".format(len(sentences)))
    # remove any sentences which we already found as part of verses
    for s in sentences:
        matches = [v for v in verses if s[:SENTENCE_MATCH] in v]
        if matches:
            self.notes.append("found sentence {} in verses {}".format(s, matches))
            sentences.remove(s)
    verses.extend(sentences)
    return verses
def get_key_sentences(self, n=5):
    '''
    Uses a simple implementation of TextRank to extract the top N sentences from a document.

    Sources:
    - Original paper: http://acl.ldc.upenn.edu/acl2004/emnlp/pdf/Mihalcea.pdf
    - Super useful blog post: http://joshbohde.com/blog/document-summarization
    - Wikipedia: http://en.wikipedia.org/wiki/Automatic_summarization#Unsupervised_keyphrase_extraction:_TextRank
    '''
    # Tokenize the document into sentences. More NLP preprocessing should also happen here.
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(self.doc)

    # Calculate word counts and TFIDF vectors
    word_counts = CountVectorizer(min_df=0).fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(word_counts)

    # Normalized graph * its transpose yields a sentence-level similarity matrix
    similarity_graph = normalized * normalized.T

    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph)
    # Slice the top n sentences (the original indexed a single element with [n])
    return sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)[:n]
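A minimal usage sketch (not from the original source): the method returns ranked (score, sentence) pairs, so a caller could print a five-sentence summary as below. The ``Document`` wrapper class and file name are hypothetical.

# doc = Document(open('article.txt').read())   # hypothetical class exposing self.doc
# for score, sentence in doc.get_key_sentences(n=5):
#     print(round(score, 3), sentence)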
def getSentences(text):
    # returns a list of sentences tokenized by Punkt
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(text)
    return sentences
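An illustrative check (assumed, not part of the original code): because 'dr' and 'mr' are registered in abbrev_types, Punkt should not break the sentence after "Dr." or "Mr.".

# >>> getSentences("Dr. Watson visited Mr. Holmes. They spoke for an hour.")
# ['Dr. Watson visited Mr. Holmes.', 'They spoke for an hour.']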
def process(text, word_sets_folder="algorithms/data/word_sets"):
    word_sets = import_word_sets(word_sets_folder)
    nltk_model_file = open('algorithms/data/NLTK_model_data/model.txt', 'rb')
    trained = pickle.load(nltk_model_file)
    sentence_tokenizer = PunktSentenceTokenizer(trained.get_params())
    text = sentence_tokenizer.tokenize(text)
    print("Sentence tokenizer:")
    print(text)
    text = run_name_entity_recognizer(text)
    print("Name Entity Recognizer:")
    print(text)
    text = word_tokenizer(text)
    print("Word tokenizer:")
    print(text)
    text = words_clasifier(text, word_sets)
    print("Word clasifier")
    print(text)
    return text
def semafor_local(text):
    semafor = join(
        dirname(__file__),
        '../{0}/bin/runSemafor.sh'.format(config.get('semafor', 'base_dir')))
    input_file = join(
        dirname(__file__),
        '../{0}/bin/in.txt'.format(config.get('semafor', 'base_dir')))
    with open(input_file, 'w') as f:
        tokenizer = PunktSentenceTokenizer()
        sentences = tokenizer.tokenize(text)
        f.write('\n'.join(sentences))
    output_file = join(
        dirname(__file__),
        '../{0}/bin/out.txt'.format(config.get('semafor', 'base_dir')))
    if isfile(output_file):
        remove(output_file)
    process = subprocess.Popen([semafor, input_file, output_file, '1'], shell=False)
    out, err = process.communicate(text)
    if err:
        log.debug(err)
    sentences_semantics = []
    with open(output_file) as f:
        # semafor outputs an invalid JSON, with one dictionary per line
        for line in f:
            sentence_dict = json.loads(line.rstrip())
            sentences_semantics.append(sentence_dict)
    return sentences, sentences_semantics
def preprocess(phys):
    '''
    :param phys: the contents of a text file
    :return: a list of sentences, processed for searchability
    '''
    phys = phys.decode('utf-8')
    phys = re.sub('(\n)+', '. ', phys)
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(phys)
    for i in xrange(len(sentences)):
        sentence = unicode(sentences[i])
        sentence = sentence.replace('\n', ' ')
        sentence = re.sub(' +', ' ', sentence)
        sentence = re.sub(r'\d+', '', sentence)
        sentence = sentence.replace("-", " ")
        exclude = string.punctuation
        sentence = ''.join(ch for ch in sentence if ch not in exclude)
        sentence = re.sub(' +', ' ', sentence)
        sentences[i] = sentence
        # sentences[i] = sentence.encode('utf-8')
    count = 0
    for sentence in sentences:
        if sentence == ' ' or sentence == '':
            sentences.pop(count)
        count += 1
    # with open(fname.rstrip('txt')+'json', 'w') as outfile:
    #     json.dump(sentences, outfile)
    return sentences
def TextRank(document):
    # Split into sentences
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(document)
    # Term counts
    c = CountVectorizer()
    # Compute tf-idf
    bow_matrix = c.fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(bow_matrix)
    # Get all words in the bag-of-words model
    all_words = (c.get_feature_names())
    # index2word
    index2words = {v: k for k, v in c.vocabulary_.items()}
    # Indices of the top-3 keywords per sentence by tf-idf
    top_n_idx = np.argsort(normalized.todense())[:, -3:]
    # print(top_n_idx)
    # Map the indices back to the top-n keywords
    top_n_words = np.vectorize(index2words.get)(top_n_idx)
    # print(top_n_words)
    # Sentence similarity matrix
    similarity_graph = normalized * normalized.T
    # Build the graph and compute TextRank (PageRank)
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph)
    return sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
class PunktWordTokenizer(WordTokenizer):
    """Class for Punkt word tokenization."""

    def __init__(self, sent_tokenizer: object = None):
        """
        :param sent_tokenizer: sentence tokenizer class used to split the text before word tokenization
        :type sent_tokenizer: object
        """
        if sent_tokenizer:
            self.sent_tokenizer = sent_tokenizer()
        else:
            punkt_param = PunktParameters()
            self.sent_tokenizer = PunktSentenceTokenizer(punkt_param)

    def tokenize(self, text: str):
        """
        :rtype: list
        :param text: text to be tokenized into sentences
        :type text: str
        """
        sents = self.sent_tokenizer.tokenize(text)
        tokenizer = TreebankWordTokenizer()
        return [item for sublist in tokenizer.tokenize_sents(sents) for item in sublist]
def myNLTKParser(document, tagger):
    lexical_diversity = len(document) / len(set(document)) * 1.0
    punkt_param = PunktParameters()
    # if any customized abbrev
    # punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])

    # tokenize to sentences
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(document.replace('\'s', '_s'))

    # tokenize sentences to words
    word_tokens = [[w.strip() for w in nltk.word_tokenize(s)
                    if not w.strip().lower() in stopwords] for s in sentences]

    # extend tokens to bigrams and trigrams
    extended_tokens = []
    for token_list in word_tokens:
        extended_tokens.append(token_list + nltk.bigrams(token_list) + nltk.trigrams(token_list))

    # word stemmer to normalize
    p_stemmer = PorterStemmer()
    stem_tokens = []
    for token_list in word_tokens:
        stem_tokens.append([p_stemmer.stem(w) for w in token_list])

    # POS tags
    tags = [tagger.tag(a) for a in extended_tokens]
    tags_of_verbs = ['NN', 'VB', 'VBP', 'VBG']
    tags_of_interest = ['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNPS', 'NNS', 'RB', 'RBR', 'RBS']
    tags_of_noun = ['NN']
    merged_tags_uni = [
        word for sublist in tags for (word, tag) in sublist
        if tag in tags_of_verbs and isinstance(word, tuple) == False
    ]
    merged_tags_bi = [
        word for sublist in tags for (word, tag) in sublist
        if tag in tags_of_interest and isinstance(word, tuple) and len(word) == 2
    ]
    merged_tags_tri = [
        word for sublist in tags for (word, tag) in sublist
        if tag in tags_of_interest and isinstance(word, tuple) and len(word) == 3
    ]
    uni_tags_fd = nltk.FreqDist(merged_tags_uni)
    bi_tags_fd = nltk.FreqDist(merged_tags_bi)
    tri_tags_fd = nltk.FreqDist(merged_tags_tri)
    return {
        'uni_fd': uni_tags_fd.max(),
        'bi_fd': bi_tags_fd.max(),
        'tri_fd': tri_tags_fd.max(),
    }
def sentences(self):
    try:
        return self.sentences_list
    except AttributeError:
        sentence_tokenizer = SentenceTokenizer()
        self.sentences_list = sentence_tokenizer.tokenize(self.corpus)
        return self.sentences_list
def tokenize(self, text, **kwargs):
    """
    Only behavior I want to modify is this method.

    :param text:
    :param kwargs:
    """
    for x in PunktSentenceTokenizer.tokenize(self, text):
        yield x
def _punkt_sent_tokenize(text):
    '''Sentence segmentation using the NLTK PunktSentenceTokenizer.'''
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(config.tokenize_abbrev)
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    return sentence_splitter.tokenize(text)
def split_sentence(caption):
    # Initialize the sentence tokenizer
    tokenizer = PunktSentenceTokenizer()
    # Tokenize the caption
    caption_tokens = tokenizer.tokenize(caption)
    # Return a list of tokens (sentences)
    return caption_tokens
def _split_text_to_sentences(self, text):
    # splits text into sentences (uses some utilities from nltk)
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(text)
    return sentences
class TagExtractor:
    """Extracts tags from a body of text using the NLTK toolkit."""

    def __init__(self):
        """Creates a default Topia tagger and extractor."""
        self.sentence_tokenizer = PunktSentenceTokenizer()
        self.parser = nltk.RegexpParser(GRAMMAR)
        self.productions = ['NP', 'VB', 'ADV']

    def __is_just_stop_words(self, words):
        return not any([word not in STOP_WORDS for word in words])

    def extract_tags(self, text):
        """Extract tags from the text."""
        tags = {}
        for sentence in self.sentence_tokenizer.tokenize(text):
            chunks = self.__chunk_sentence(sentence)
            for production in chunks.productions():
                tag_tokens = []
                pos = production.lhs().symbol()
                if pos in self.productions:
                    for (word, x) in production.rhs():
                        # Preprocess, and potentially, filter out the word.
                        trimmed = filter_word(trim_word(word))
                        if trimmed:
                            tag_tokens.append(trimmed.lower())
                    if len(tag_tokens) > 0:
                        tag_text = string.join(tag_tokens, ' ')
                        if self.__is_just_stop_words(tag_tokens):
                            continue
                        tag = self.__lookup_tag(tags, tag_text, pos)
                        tag.increment_occurs()
                        tag.set_pos(pos)
        results = tags.values()
        results.sort(key=tag_compare_key)
        return results

    def __lookup_tag(self, tags, text, pos):
        tag = tags.get(self.__get_tag_key(text, pos))
        if not tag:
            tag = Tag(text, 0, pos)
            tags[self.__get_tag_key(text, pos)] = tag
        return tag

    def __get_tag_key(self, text, pos):
        """I want to keep the way we look up tags flexible so that I can easily
        change my mind on what uniquely identifies a tag (e.g. just the text?
        the text and the part of speech?). That is why all the logic for looking
        up tags is in this one method."""
        return text

    def __chunk_sentence(self, sentence):
        """Tokenize the sentence into words using a whitespace parser to avoid
        parsing couldn't into two tokens (could and n't). Then chunk the tokens
        according to GRAMMAR.
        """
        tokenizer = WhitespaceTokenizer()
        tokens = tokenizer.tokenize(sentence)
        pos_tagged = nltk.pos_tag(tokens)
        return self.parser.parse(pos_tagged)
def transform(self, documents):
    sentence_splitter = PunktSentenceTokenizer()
    for doc in documents:
        if not 'sentences' in doc.ext:
            doc.ext['sentences'] = [s.strip() for s in sentence_splitter.tokenize(doc.text)]
    # for doc in documents:
    #     if not 'sentences' in doc.ext:
    #         doc.ext['sentences'] = [s.strip() for s in doc.text.split('.') if s]
    return documents
def parse(text):
    """Use nltk's PunktSentenceTokenizer to convert the text string into
    a list of English-language sentences."""
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(ABBREVIATIONS)
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    return sentence_splitter.tokenize(preprocess(text))
def split_into_sentences(input_file_name, output_file_name):
    tokenizer = PunktSentenceTokenizer()
    with gzip.open(input_file_name) as input_file:
        with gzip.open(output_file_name, 'w') as sentence_file:
            for line in input_file:
                labelled_review = json.loads(line)
                tokenized_text = tokenizer.tokenize(labelled_review['text'])
                json.dump([tokenized_text, labelled_review['score']], sentence_file)
                sentence_file.write("\n")
def bayesSentiment(self, text):
    from nltk.tokenize.punkt import PunktSentenceTokenizer
    from senti_classifier import senti_classifier
    # break up text into sentences
    stzr = PunktSentenceTokenizer()
    sents = stzr.tokenize(text)
    pos_score, neg_score = senti_classifier.polarity_scores(sents)
    # print pos_score, neg_score
    return [pos_score, neg_score]
def analyse_hansard_file(filename='House of Representatives_2018_05_10_6091.xml'):
    # Word frequency analysis
    my_abbrev = ['\'m', '.', ',', '\'s', '(', ')', 'n\'t', '\'ve', ';', '$', ':', '\'', '?', '\'ll', '\'re']
    stoplist = set(stopwords.words('english') + my_abbrev)
    soup, sample = parse_hansard(filename)

    # Tokenisation, tagging, chunking
    sent_tokenizer = PunktSentenceTokenizer()
    # Stop breaking sentence at "No."
    sent_tokenizer._params.abbrev_types.add('no')
    # sentences = nltk.sent_tokenize(sample)
    # TODO: improve sentence tokenizer - still far from good
    sentences = sent_tokenizer.tokenize(sample)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    # Word frequency over all sentences
    tokens = []
    for sentence in tokenized_sentences:
        tokens += [word for word in sentence if word.lower() not in stoplist]
    display_freq(tokens)

    # Part-of-speech analysis
    tags = []
    for sentence in tagged_sentences:
        tags += sentence
    pos_analysis(tags, my_abbrev)

    # spaCy NER
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(sample)
    # Find named entities, phrases and concepts
    ne_spacy = {}
    for entity in doc.ents:
        if entity.label_ in ne_spacy:
            ne_spacy[entity.label_] += [entity.text]
        else:
            ne_spacy[entity.label_] = [entity.text]
    logger.debug("Entity number per type: %s" % {k: len(v) for k, v in ne_spacy.items()})
    for k in ne_spacy.keys():
        display_freq(ne_spacy[k], 'Named entities (%s)' % (k,), top=20)

    # Interjection analysis
    parties = {}
    all_interjections = soup.find_all('interjection')
    for interjection in all_interjections:
        # Can be either a party or a role (Speaker, President, etc, ...)
        party = interjection.party.text or interjection.find('name', role='metadata').text
        if party in parties:
            parties[party] = parties[party] + 1
        else:
            parties[party] = 1
    logger.debug("%s interjections: %s" % (len(all_interjections), parties))
class Sent_Tokenizer():
    def __init__(self):
        with open(TREETAGGER_ABBREVIATIONLIST, mode='r', encoding='utf-8') as f:
            abbr = set([l.strip('.\n') for l in f.readlines()])
        punkt_param = PunktParameters()
        punkt_param.abbrev_types = abbr
        self.tokenizer = PunktSentenceTokenizer(punkt_param)

    def tokenize(self, text):
        return self.tokenizer.tokenize(text)
class ReviewItem:
    def __init__(self, review, rating):
        self.tok = PunktSentenceTokenizer()
        self.rating = rating
        self.review = review

    def words(self):
        return word_tokenize(self.review)

    def sents(self):
        return [word_tokenize(sent) for sent in self.tok.tokenize(self.review)]
def textrank(self, document):
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(document)
    bow_matrix = CountVectorizer().fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(bow_matrix)
    similarity_graph = normalized * normalized.T
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph)
    return sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
def __init__(self, document):
    self.document = document
    self.sumLength = 10
    self.weights = {}
    self.invWeights = {}
    self.sumIndex = {}
    self.summary = {}
    tokenizer = PunktSentenceTokenizer()
    self.sentences = [sentence.lower() for sentence in tokenizer.tokenize(document)]
def keyword_sentiment():
    # take in the input
    word = sys.argv[1]
    date_diff = int(sys.argv[2])

    # create a sentence tokenizer
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', '1', '2', '3', '4', '5', '6', '7', '8',
                                    '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20'])
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)

    # calculate the barrier date
    DD = datetime.timedelta(days=date_diff)
    barrier_date = datetime.datetime.now() - DD

    # make connection to db and fetch tweets (and respective sentiment) above the barrier_date
    db = MySQLdb.connect(host="localhost", user="******", passwd="{2qGq(22+5iU", db="Insights")
    cur = db.cursor()
    sql = "SELECT Phrase,Sentiment FROM Phrases WHERE `Date`>'" + str(barrier_date) + "';"
    cur.execute(sql)
    total_sentiment = 0
    total_count = 0

    # locate tweets which contain the keyword, tokenize them into sentences
    for row in cur.fetchall():
        if row[0].lower().find(word.lower()) != -1:
            sentences = sent_tokenizer.tokenize(row[0])
            # if a single sentence then just take the sentiment from db
            if len(sentences) == 1:
                total_sentiment = total_sentiment + float(row[1])
                total_count = total_count + 1
            # else add together sentiment of sentences and keep the count
            else:
                for sentence in sentences:
                    blob = TextBlob(sentence)
                    total_sentiment = total_sentiment + int(blob.sentiment.polarity * 1000) / 1000.0
                    if sentence.lower().find(word.lower()) != -1:
                        total_count = total_count + 1

    # json the total_sentiment/count and count
    if total_count != 0:
        json_array = [{"sentiment": int(total_sentiment / total_count * 1000) / 1000.0, "count": total_count}]
    else:
        json_array = [{"sentiment": 0, "count": 0}]

    # close the connection to the db
    db.close()

    # print the json
    print(json.dumps(json_array))
def build_doc2vec_model(save_file=False):
    client = MongoClient()
    db = client['metacritic']
    coll = db['steam_games']
    all_games = list(coll.find({'user_review': {"$exists": "true"},
                                'total_user_reviews': {'$ne': 0},
                                'game_name': {'$not': re.compile("Demo")}}))
    plv = PunktSentenceTokenizer()
    # stemmer = PorterStemmer()
    labeled_sentences = []
    for game in all_games:
        game_name = game['game_name']
        user_data = game['user_review']
        # critic_data = game['critic_review']
        user_reviews = user_data['reviews']
        for user_review in user_reviews:
            review = user_review['review']
            review = review.encode('ascii', 'replace')
            review = str(review).translate(string.maketrans("", ""), string.punctuation)
            review_sentence = [sentence.split() for sentence in plv.tokenize(review.lower())]
            if len(review_sentence) == 0:
                continue
            else:
                review_sentence = review_sentence[0]
            # stemmed_sentence = []
            # for word in review_sentence[0]:
            #     stemmed_sentence.append(stemmer.stem(word))
            sentence = doc2vec.LabeledSentence(words=review_sentence, labels=[game_name])
            labeled_sentences.append(sentence)
    model = Doc2Vec(alpha=0.025, min_alpha=0.025, workers=4)  # , train_words=False, train_lbls=True)
    model.build_vocab(labeled_sentences)
    for epoch in range(10):
        model.train(labeled_sentences)
        model.alpha -= 0.002  # decrease the learning rate
        model.min_alpha = model.alpha  # fix the learning rate, no decay
    if save_file:
        with open('data/model.pkl', 'wb') as f_model:
            pickle.dump(model, f_model)
    else:
        return model
def splitIntoSentences2(file_name):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    fp = open(file_name)
    data = fp.read()
    data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')
    sentences = []
    for para in data.split('\n'):
        if para:
            sentences.extend(sentence_splitter.tokenize(para))
    # print '\n-----\n'.join(sentences)
    return sentences
def test_tokenize(self):
    train = "\n".join(itertools.imap(strip_tags, itertools.chain(*(speech['text'] for speech in self.speeches[0:10]))))
    print train
    tokenizer = PunktSentenceTokenizer(train)
    sents = tokenizer.tokenize(strip_tags(self.speeches[0]['text'][0]))
    sents = tokenize_sents(strip_tags(self.speeches[0]['text'][0]))
    self.assertEqual(len(sents), 3)
def loadCorpus(self, path):
    for encoding in self.__encodings:
        try:
            self.__path = path
            fileName = codecs.open(self.__path, 'r', encoding=encoding)
            self.__rawText = fileName.read()
            break
        except UnicodeDecodeError:
            encoding = ''
            continue
    if encoding != '':
        self.initFields()
        # SENTENCES
        # more abbreviations with dots
        punkt_param = PunktParameters()
        punkt_param.abbrev_types = set(['dr', 'vs', 'n', 'v', 'etc', 'art', 'p', 'Cost', 'ss', 'pag'])
        sentence_splitter = PunktSentenceTokenizer(punkt_param)
        text = re.sub(ur'[\'\<\>`’]', ' ', self.__rawText)
        # text = re.sub('(\d+)', r' \1 ', text)
        sentences = sentence_splitter.tokenize(text)
        # TOKENS
        self.__tokens = [[token, ''] for token in list(itertools.chain(*[customWordtokenize(sent) for sent in sentences]))]
        wordTokenizer = RegexpTokenizer('[a-zA-Z0-9\xe0\xe1\xe8\xe9\xec\xed\xf2\xf3\xf9\xfa]+')
        # wordTokenizer = RegexpTokenizer('[\w]+')
        sentences = [wordTokenizer.tokenize(sent.lower()) for sent in sentences if len(wordTokenizer.tokenize(sent)) > 0]
        words = list(itertools.chain(*sentences))
        self.__words = words
        self.__sentences = sentences
        self.__avgSentLength = round(np.mean([len(sent) for sent in sentences]), 3)
        self.__avgWordLength = round(np.mean([len(word) for word in words]), 3)
        self.__freqDist = FreqDist(words)
        self.__wordCount = len(words)
        self.__lexicalDiversity = round(len(self.__freqDist.items()) / float(len(words)), 5)
        # resetting members
        self.__concordanceIndex = None
        self.__bigrams = None
    return encoding
def get_reviews(games_df):
    plv = PunktSentenceTokenizer()
    reviews = games_df.excerpt.tolist()
    sentences = []
    for review in reviews:
        review = review.encode('ascii', 'replace')
        review = str(review).translate(string.maketrans("", ""), string.punctuation)
        review_sentence = [sentence.split() for sentence in plv.tokenize(review.lower())]
        if len(review_sentence) == 0:
            sentences.append([])
        else:
            sentences.extend(review_sentence)
    return sentences
def getSentences(paragraph):
    unicode_data = paragraph.decode("utf-8")
    data = "".join([i if ord(i) < 128 else "" for i in unicode_data])
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    punkt_params = PunktParameters()
    punkt_params.abbrev_types = set(['al', "inc", "mr", "dr", "mrs", "prof"])
    splitter = PunktSentenceTokenizer(punkt_params)
    sentences = splitter.tokenize(data)
    sentences1 = filter_list(sentences)
    # print sentences1, "\n----------------------------------------------------------------------------"
    return sentences1
class BasePunktWordTokenizer(BaseWordTokenizer):
    """Base class for Punkt word tokenization."""

    def __init__(self, language: str = None, sent_tokenizer: object = None):
        """
        :param language: language for sentence tokenization
        :type language: str
        """
        self.language = language
        super().__init__(language=self.language)
        if sent_tokenizer:
            self.sent_tokenizer = sent_tokenizer()
        else:
            punkt_param = PunktParameters()
            self.sent_tokenizer = PunktSentenceTokenizer(punkt_param)

    def tokenize(self, text: str):
        """
        :rtype: list
        :param text: text to be tokenized into sentences
        :type text: str
        """
        sents = self.sent_tokenizer.tokenize(text)
        tokenizer = TreebankWordTokenizer()
        return [item for sublist in tokenizer.tokenize_sents(sents) for item in sublist]