def summarize_pdf(article_text):
    # Train a Punkt sentence splitter on the article itself, then tokenize it.
    trainer = PunktTrainer()
    trainer.train(article_text)
    tok = PunktSentenceTokenizer(trainer.get_params())
    sentence_list = tok.tokenize(article_text)

    sentence_lists = []
    sent_list = []
    clean_sent = []
    word_tokenizer = TreebankWordTokenizer()
    lemmatizer = WordNetLemmatizer()
    for sent in sentence_list:
        words = [w.lower() for w in word_tokenizer.tokenize(sent)]
        sentence_lists.append(" ".join(words))
        clean_words = []
        for word, tag in pos_tag(words):
            # Map the Penn Treebank tag to a WordNet POS for lemmatization.
            if tag.startswith('NN'):
                pos = 'n'
            elif tag.startswith('VB'):
                pos = 'v'
            elif tag.startswith('RB'):
                pos = 'r'
            else:
                pos = 'a'
            w = lemmatizer.lemmatize(word, pos)
            # Keep tokens that are not punctuation and not purely digits.
            if w not in punc and re.search(r"[^\d]", w):
                clean_words.append(w.lower())
        clean_sent.append(' '.join(clean_words))
        sent_list.append(clean_words)
    return sent_list, clean_sent, sentence_lists, sentence_list
def train_punkt(ctx, input, output, abbr, colloc):
    """Train Punkt sentence splitter using sentences in input."""
    click.echo('chemdataextractor.tokenize.train_punkt')
    import pickle
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
    punkt = PunktTrainer()
    # Set these to True to include collocations more leniently, then increase
    # MIN_COLLOC_FREQ to restrict again.
    # punkt.INCLUDE_ALL_COLLOCS = True
    # punkt.INCLUDE_ABBREV_COLLOCS = True
    # punkt.MIN_COLLOC_FREQ = 1
    # Don't train on titles. They may contain abbreviations, but basically
    # never have actual sentence boundaries.
    for fin in input:
        click.echo('Training on %s' % fin.name)
        sentences = fin.read()  # .replace('.\n', '. \n\n')
        punkt.train(sentences, finalize=False, verbose=True)
    punkt.finalize_training(verbose=True)
    if abbr:
        abbreviations = abbr.read().strip().split('\n')
        click.echo('Manually adding abbreviations: %s' % abbreviations)
        punkt._params.abbrev_types.update(abbreviations)
    if colloc:
        collocations = [tuple(l.split('. ', 1)) for l in colloc.read().strip().split('\n')]
        click.echo('Manually adding collocs: %s' % collocations)
        punkt._params.collocations.update(collocations)
    model = PunktSentenceTokenizer(punkt.get_params())
    pickle.dump(model, output, protocol=pickle.HIGHEST_PROTOCOL)
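# A minimal usage sketch for the model pickled by train_punkt above; the
# file name and sample text are hypothetical stand-ins.
import pickle

with open('punkt_chem.pickle', 'rb') as f:
    chem_tokenizer = pickle.load(f)

print(chem_tokenizer.tokenize('The mixture was stirred at approx. 25 C. It was then filtered.'))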
def constructor(self):
    # Train on the corpus's own word tokens rather than raw text.
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.INCLUDE_ABBREV_COLLOCS = True
    trainer.train_tokens(self.words())
    params = trainer.get_params()
    return PunktSentenceTokenizer(params)
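# For reference, a self-contained sketch of the same train_tokens() path,
# with a made-up token list standing in for self.words(); Punkt needs far
# more text than this to learn useful parameters.
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

demo_trainer = PunktTrainer()
demo_trainer.INCLUDE_ALL_COLLOCS = True
demo_trainer.train_tokens('Dr. Brown arrived early . He left at noon .'.split())
demo_tokenizer = PunktSentenceTokenizer(demo_trainer.get_params())
print(demo_tokenizer.tokenize('Dr. Brown spoke. Everyone listened.'))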
def rank_sentences(text, sentence_scores, title="", n=7):
    final_sentences = []
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)
    sent_tokenizer = PunktSentenceTokenizer(trainer.get_params())
    # Boost each sentence's score by its similarity to the title, if one is given.
    for s in sentence_scores:
        if not title:
            break
        sentence_scores[s] *= (1 + similarity_score(title, s))
    # Keep the n highest-scoring sentences.
    sc = OrderedDict(sorted(sentence_scores.items(), key=lambda t: t[1], reverse=True))
    ordered_sents = dict(islice(sc.items(), n))
    # Emit them in document order, with original casing.
    proper_sentences = sent_tokenizer.tokenize(text)
    for s in proper_sentences:
        if s.lower() in ordered_sents:
            final_sentences.append(s)
    return final_sentences
def train_sentence_tokenizer(self: object, text: str):
    """Train a sentence tokenizer."""
    language_punkt_vars = PunktLanguageVars
    # Set punctuation
    if self.punctuation:
        if self.strict:
            language_punkt_vars.sent_end_chars = self.punctuation + self.strict_punctuation
        else:
            language_punkt_vars.sent_end_chars = self.punctuation
    # Configure the trainer before training so the colloc flags take effect;
    # lang_vars expects an instance, and the second positional argument of
    # PunktTrainer is verbose, not lang_vars.
    trainer = PunktTrainer(lang_vars=language_punkt_vars())
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.INCLUDE_ABBREV_COLLOCS = True
    trainer.train(text)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    # Set abbreviations
    if self.abbreviations:
        for abbreviation in self.abbreviations:
            tokenizer._params.abbrev_types.add(abbreviation)
    return tokenizer
def train(src, tgt):
    with open(src, 'r', encoding='utf-8') as infile, \
            open(tgt, 'wb') as sent_tokenizer:
        contents = infile.read()
        language_punkt_vars = PunktLanguageVars
        # language_punkt_vars.sent_end_chars = tuple(args.end_chars)
        print("# Training sent tokenizer")
        # Configure the trainer before training so the colloc flags take effect.
        trainer = PunktTrainer(lang_vars=language_punkt_vars())
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.INCLUDE_ABBREV_COLLOCS = True
        trainer.train(contents)
        tokenizer = PunktSentenceTokenizer(trainer.get_params())
        # Known abbreviations that should not end a sentence.
        for abbrev in ('brgy', 'sen', 'supt', 'rep', 'dr', 'col', 'sec', 'mt',
                       'asst', 'mr', 'c/insp', 'sta', 'sto'):
            tokenizer._params.abbrev_types.add(abbrev)
        pickle.dump(tokenizer, sent_tokenizer)
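# Hypothetical round trip for train() above; both file names and the sample
# text are stand-ins.
import pickle

train('news_corpus.txt', 'filipino_punkt.pickle')
with open('filipino_punkt.pickle', 'rb') as f:
    filipino_tok = pickle.load(f)

print(filipino_tok.tokenize('Sen. Cruz visited brgy. San Roque. Residents welcomed him.'))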
def get_nltk_sent_tokenizer(container, lang):
    assert lang in ["zh", "en"], "Unknown language."
    trainer = PunktTrainer()
    if isinstance(container, Container):
        article_paths = container.get_all_article_paths(
            root_dir="../processed_data/crawler/nejm/articles/",
            ext=lang)
    elif isinstance(container, list):
        print("{} Articles.".format(len(container)))
        article_paths = container
    else:
        raise ValueError("Cannot parse container with class {}".format(
            container.__class__))
    missing_count = 0
    for path in article_paths:
        try:
            article = get_article_as_lowercase_string(path)
            trainer.train(text=article, finalize=False)
        except FileNotFoundError:
            print("{} not found.".format(path))
            missing_count += 1
    print("{} articles not found.".format(missing_count))
    trainer.finalize_training()
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    return tokenizer
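# Hypothetical call exercising the list branch of get_nltk_sent_tokenizer;
# the paths are placeholders for files produced by the original crawler.
article_paths = ['articles/0001.en', 'articles/0002.en']
en_tokenizer = get_nltk_sent_tokenizer(article_paths, 'en')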
def get_tokenizer(training_text):
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(training_text)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    tokenizer._params.abbrev_types.update(ABBREVIATIONS)
    return tokenizer
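# Sketch of exercising get_tokenizer; ABBREVIATIONS is defined elsewhere in
# the original module, so a stand-in set and a hypothetical corpus file are
# used here.
ABBREVIATIONS = {'fig', 'eq', 'sec'}

tok = get_tokenizer(open('corpus.txt', encoding='utf-8').read())
print(tok.tokenize('See fig. 3 below. The proof is in sec. 2.'))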
def get_tokenizer(self, xml, abbrevWordList, spentSplitList):
    # class BulletPointLangVars(PunktLanguageVars):
    #     sent_end_chars = ('?', '!')
    #     for i in range(len(spentSplitList)):
    #         sent_end_chars = sent_end_chars + tuple(spentSplitList[i])
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    train_data = 'sss'  # placeholder training text; no real corpus is used here
    trainer.train(train_data)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    # tokenizer = PunktSentenceTokenizer(trainer.get_params(), lang_vars=BulletPointLangVars())
    # Add sentence-splitting exceptions (abbreviations).
    rule['ABBREV_WORDS'].extend(abbrevWordList)
    for i in rule['ABBREV_WORDS']:
        tokenizer._params.abbrev_types.add(i)
    return tokenizer
def trainSentenceTokenizer(self):
    text = ""
    for file_id in gutenberg.fileids():
        text += gutenberg.raw(file_id)
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    tokenizer._params.abbrev_types.add('dr')
    tokenizer._params.abbrev_types.add('fig')
    return tokenizer
class PunktTokenizer:

    def __init__(self):
        self.modelfile = 'punkt_tokenizer.pk'
        if os.path.exists(self.modelfile):
            self.tokenizer = self.punkt_tokenize_load()
        else:
            # Train from scratch on the Gutenberg corpus, then cache the model.
            self.trainer = PunktTrainer()
            text = ""
            for file_id in gutenberg.fileids():
                text += gutenberg.raw(file_id)
            self.trainer.INCLUDE_ALL_COLLOCS = True
            self.trainer.train(text)
            self.tokenizer = PunktSentenceTokenizer(self.trainer.get_params())
            # Titles, months, and other abbreviations that should not split sentences.
            self.tokenizer._params.abbrev_types.update([
                'dr', 'mr', 'mrs', 'miss', 'ms', 'no',
                'jan', 'feb', 'mar', 'apr', 'may', 'jun',
                'aug', 'sep', 'oct', 'nov', 'dec',
            ])
            with open(self.modelfile, mode='wb') as fout:
                pickle.dump(self.tokenizer, fout, protocol=pickle.HIGHEST_PROTOCOL)

    def punkt_tokenize_load(self):
        with open(self.modelfile, mode='rb') as fin:
            return pickle.load(fin)

    def punkt_tokenizer_add_rule(self, word):
        self.tokenizer._params.abbrev_types.add(word)

    def punkt_sentence_tokenizer(self, sentences):
        return self.tokenizer.tokenize(sentences)
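# Short usage sketch for the class above (names follow the corrected spelling);
# the first run trains on Gutenberg and caches the pickle, later runs load it.
pt = PunktTokenizer()
pt.punkt_tokenizer_add_rule('prof')
print(pt.punkt_sentence_tokenizer('Prof. Lee met Dr. Kim in Jan. 2020. They talked.'))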
def get_sentence_tokenizer(language):
    """Return the sentence tokenizer callable."""
    pickle_path = 'sentence_tokenizer.pickle'
    try:
        with open(pickle_path, 'rb') as input_file:
            sentence_tokenizer = load(input_file)
    except FileNotFoundError:
        data_file_paths = []
        sentences = []
        try:
            # Get the paths to each file the bot will be trained with
            corpus_files = list_corpus_files('core.corpus.{language}'.format(
                language=language.ENGLISH_NAME.lower()))
        except LookupError:
            # Fall back to English sentence splitting rules if a language is not supported
            corpus_files = list_corpus_files('core.corpus.{language}'.format(
                language=languages.ENG.ENGLISH_NAME.lower()))
        data_file_paths.extend(corpus_files)
        for corpus, _categories, _file_path in load_corpus(*data_file_paths):
            for conversation in corpus:
                for text in conversation:
                    sentences.append(text.upper())
                    sentences.append(text.lower())
        trainer = PunktTrainer()
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.train('\n'.join(sentences))
        sentence_tokenizer = PunktSentenceTokenizer(trainer.get_params())
        # Pickle the sentence tokenizer for future use
        with open(pickle_path, 'wb') as output_file:
            dump(sentence_tokenizer, output_file, -1)
    return sentence_tokenizer
def trainSentenceTokenizer():
    """
    Trains a custom sentence tokenizer using Punkt. At the moment it performs
    worse than the plain English one (most likely because there is not much data).
    """
    collection = database["crawled-data"]
    text = ""
    for record in collection.find({ABSTRACT_DOCUMENT: {"$ne": None}}):
        text += record[ABSTRACT_DOCUMENT] + " "
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.INCLUDE_ABBREV_COLLOCS = True
    trainer.train(text)
    model = nltk.PunktSentenceTokenizer(trainer.get_params())
    with open("latvianPunkt2.pickle", mode='wb') as fout:
        pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)
def score_sentences(text, word_scores, unique):
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)
    sent_score = {}
    sent_tokenizer = PunktSentenceTokenizer(trainer.get_params())
    sentences = sent_tokenizer.tokenize(text.lower())
    for s in sentences:
        words = clean_text(s)
        sent_score[s] = 0
        for w in words:
            w = lemmatizer.lemmatize(w)
            if w in unique:
                sent_score[s] += word_scores[w]
    return sent_score
def main():
    opts, args = getopt.getopt(sys.argv[1:], 'l:', [])
    lang = None
    for o, a in opts:
        if o == '-l':
            lang = a
    if lang is None:
        print("Must pass -l language on the command line!", file=sys.stderr)
        sys.exit(1)
    if lang == 'en':
        print("Don't train for -l en! We are using the pre-trained punkt tokenizer from NLTK.",
              file=sys.stderr)
        sys.exit(1)
    lang_vars = MyPunktLanguageVars()
    trainer = PunktTrainer(lang_vars=lang_vars)
    train(trainer, lang)
    trainer.finalize_training(verbose=True)
    tokenizer = PunktSentenceTokenizer(trainer.get_params(), lang_vars=lang_vars)
    # Pickles must be written in binary mode.
    with open('LingwoNLP/punkt-' + lang + '.pickle', 'wb') as f:
        pickle.dump(tokenizer, f)
def train_tokenizer(trainfile, abbreviationfile, modelfile):
    k = 0
    skipped_ = 0
    custom_ = 0
    punkt = PunktTrainer()
    with codecs.open(trainfile, encoding='utf-8') as input_:
        for sentence in input_:
            k += 1
            if k % 100 == 0:
                print('trained from sentences: ' + str(k))
            try:
                punkt.train(sentence, finalize=False, verbose=False)
            except Exception:
                skipped_ += 1
    if abbreviationfile != '':
        with codecs.open(abbreviationfile, encoding='utf-8') as abbreviations_:
            for abbr in abbreviations_:
                try:
                    # Wrap each abbreviation in a mini-sentence so Punkt sees
                    # it followed by a period.
                    punkt.train('Start ' + abbr + '. End.', finalize=False, verbose=False)
                    custom_ += 1
                except Exception:
                    pass
    punkt.finalize_training(verbose=False)
    model = PunktSentenceTokenizer(punkt.get_params())
    with open(modelfile, mode='wb') as model_output:
        pickle.dump(model, model_output, protocol=pickle.HIGHEST_PROTOCOL)
    print('')
    print(str(skipped_) + ' sentences skipped')
    print(str(custom_) + ' custom abbreviations added')
def get_V(self, topics_file_name, other_file):
    if other_file:
        path = topics_file_name
    else:
        path = 'OpinosisDataset1.0_0/topics/{}'.format(topics_file_name)
    with open(path, encoding="utf8", errors='ignore') as f:
        text = f.read()
    # Get the X_train_counts and X_train_tf
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    X = tokenizer.tokenize(text)
    bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                        token_pattern=r'\b\w+\b', min_df=1)
    X_train_counts = bigram_vectorizer.fit_transform(X)
    tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    return X_train_counts, X_train_tf, tokenizer, bigram_vectorizer
def create_sentences(text_file, min_sentence_len):
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    with open(text_file, "r") as input_file:
        paragraphs = input_file.read()
    trainer.train(paragraphs)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    # print(tokenizer._params.abbrev_types)
    sentences = []
    with open(text_file, "r") as input_file:
        for line in input_file.readlines():
            sentences.extend(tokenizer.tokenize(line))
    with open("dataset/sentences.txt", "a") as out_file:
        for sentence in sentences:
            if len(sentence) > min_sentence_len:
                out_file.write(sentence + "\n\n")
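# Hypothetical invocation; the corpus path and length cutoff are placeholders.
create_sentences('dataset/raw_text.txt', min_sentence_len=40)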
def train_punktsent(trainfile, modelfile):
    """
    Trains an unsupervised NLTK Punkt SENTENCE tokenizer.
    *trainfile* is the filename for the input file.
    *modelfile* is the filename for the model output file.
    """
    punkt = PunktTrainer()
    try:
        with codecs.open(trainfile, 'r', 'utf8') as fin:
            punkt.train(fin.read(), finalize=False, verbose=False)
    except KeyboardInterrupt:
        print('KeyboardInterrupt: Stopping the reading of the dump early!')
    # HACK: Adds abbreviations from rb_tokenizer.
    abbrv_sent = " ".join([i.strip() for i in
                           codecs.open('abbrev.lex', 'r', 'utf8').readlines()])
    abbrv_sent = "Start " + abbrv_sent + " End."
    punkt.train(abbrv_sent, finalize=False, verbose=False)
    # Finalize and output the trained model.
    punkt.finalize_training(verbose=True)
    model = PunktSentenceTokenizer(punkt.get_params())
    with open(modelfile, mode='wb') as fout:
        pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)
    return model
def build_sentence_model(text, extra_abbrevs=None):
    """
    Build a sentence model from text with optional extra abbreviations to include.
    :param text: training text
    :param extra_abbrevs: iterable of extra abbreviations, with or without trailing periods
    :return: a PunktSentenceTokenizer
    """
    # Setup Punkt trainer
    punkt_trainer = PunktTrainer()
    punkt_trainer.train(text, verbose=False, finalize=False)
    punkt_trainer.finalize_training(verbose=False)
    # Extract parameters from trainer
    punkt_params = punkt_trainer.get_params()
    # Add any extras if passed
    if extra_abbrevs is not None:
        for abbrev in extra_abbrevs:
            punkt_params.abbrev_types.add(abbrev.strip(".").lower())
    # Return model instantiated with new parameters
    return PunktSentenceTokenizer(punkt_params)
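# Quick usage sketch; `document` is assumed to hold text loaded elsewhere.
model = build_sentence_model(document, extra_abbrevs=['Sec.', 'No.'])
print(model.tokenize('Under Sec. 12, No. 3 applies. The rest is unchanged.'))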
# cursor = bills.find({}, {"text_versions": 1}, no_cursor_timeout=True, limit=10000)
cursor = bills.find({"congress": {"$in": ["114", "113"]}}, {"text_versions": 1},
                    no_cursor_timeout=True)

# Train trainer
pbar = ProgressBar(maxval=cursor.count()).start()
for i, line in enumerate(cursor):
    text = next(iter(line['text_versions'].values()))
    trainer.train(text, finalize=False, verbose=False)
    pbar.update(i)
pbar.finish()

print("Finalizing training...")
trainer.finalize_training(verbose=True)
print("Training done.")

# Include custom parameters
params = trainer.get_params()
# params.collocations = params.collocations | extra_collocations
# params.sent_starters = params.sent_starters | extra_sentence_starters
with open('sentence_tokenizer_params.pickle', 'wb') as f:
    pickle.dump(params, f, protocol=pickle.HIGHEST_PROTOCOL)
print("Params: %s" % repr(params))

# Create tokenizer
tokenizer = PunktSentenceTokenizer(params)

# Dump pickled tokenizer
with open("sentence_tokenizer.pickle", mode='wb') as f:
    pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)
# coding: utf-8
import codecs
from sys import argv, exit

from nltk.tokenize.punkt import PunktTrainer, PunktSentenceTokenizer

if len(argv) != 3:
    print("Usage: %s <TRAINING_CORPUS> <SENTENCES_TO_SPLIT>" % __file__)
    exit(1)

training = ''.join(codecs.open(argv[1], 'rb', 'utf-8').readlines())
trainer = PunktTrainer()
trainer.train(training, verbose=True)
tokenizer = PunktSentenceTokenizer(trainer.get_params(), verbose=True)

text = ''.join(codecs.open(argv[2], 'rb', 'utf-8').readlines())
sentences = tokenizer.tokenize(text)
codecs.open('split', 'wb', 'utf-8').writelines([s + '\n' for s in sentences])
PunktLanguageVars.sent_end_chars = ('.', ';', ';')
PunktLanguageVars.internal_punctuation = (',', '·', ':')

trainer = PunktTrainer(lang_vars=PunktLanguageVars())
trainer.INCLUDE_ALL_COLLOCS = True
trainer.INCLUDE_ABBREV_COLLOCS = True

corpus_dir = 'tesserae' + os.sep + 'texts' + os.sep + 'grc'
file_extension = 'tess'

# Obtain all the files to parse by traversing through the directory
file_names = sorted(list({
    current_path + os.sep + current_file_name
    for current_path, current_dir_names, current_file_names in os.walk(corpus_dir)
    for current_file_name in current_file_names
    if current_file_name.endswith('.' + file_extension)}))

counter = 1
for file_name in file_names:
    file_text = file_parsers[file_extension](file_name)
    trainer.train(file_text, verbose=False, finalize=False)
    print_progress_bar(counter, len(file_names))
    counter += 1

# `lang` and `file_parsers` are defined earlier in the original script.
with open(lang + '.pickle', 'wb') as pickle_file:
    pickle_file.write(pickle.dumps(PunktSentenceTokenizer(trainer.get_params())))

# params = trainer.get_params()
# tkzr = PunktSentenceTokenizer(params)
#
# s = 'test test test test test. test test test test. test test. test test; test test test.'
# s = 'test test test. test test test test test; test test. test test'
# print(tkzr.tokenize(s))
# print(TokenizeSentence('greek').tokenize_sentences(s))
class SimpleVectorizer:
    """Learns a dictionary from a given set of tokenized documents, and
    provides a user-friendly interface for subsequent conversions from
    tokens --> integer sequences.
    """

    # Tokens (and their IDs) always placed at the top of any vocab list.
    # Note: These are counted as part of the vocabulary.
    _PAD_TOKEN = '_PAD'
    _UNK_TOKEN = '_UNK'
    START_VOCAB = OrderedDict([(_PAD_TOKEN, 0), (_UNK_TOKEN, 1)])

    # Tokens with global frequency < FREQ_CUTOFF will be ignored entirely.
    # Specifically, we won't even insert UNK for tokens that occurred less than
    # this number of times across the entire set of calls to update().
    FREQ_CUTOFF = 0

    def __init__(self, vocab_size=None):
        self.vocab_size = vocab_size
        # Tokenization tools and other private attributes.
        self._sent_tokenizer = PunktSentenceTokenizer()
        self._sent_trainer = PunktTrainer()
        self._word_tokenizer = WordTokenizer()
        self._index_to_dfreq = None
        self._is_finalized = False
        # Number of times a given word has been seen across entire corpus.
        self.word_to_freq = OrderedCounter()
        # Number of docs that contained word w.
        self.word_to_dfreq = OrderedCounter()
        # Number of documents trained on so far.
        self.num_docs = 0
        # Dicts that will be filled when fitting documents.
        # word_index: w => i (index into vocabulary)
        # index_docs: i => doc_counts (doc_freq for word with index i).
        self.word_to_index = OrderedDict()

    def truncate_vocab(self, new_vocab_size=None):
        # If not given, set to exact number of unique tokens we've seen.
        new_vocab_size = new_vocab_size or len(self.word_to_index)
        logger.debug(f'Truncating vocab size from {self.vocab_size} '
                     f'to {new_vocab_size}.')
        if new_vocab_size > len(self.word_to_index):
            raise ValueError(
                'truncate_vocab received new vocab larger than previous.')
        self.vocab_size = new_vocab_size
        self.word_to_index = OrderedDict(
            (k, v) for k, v in self.word_to_index.items()
            if v < self.vocab_size)
        assert len(self.word_to_index) == self.vocab_size, '{} != {}'.format(
            len(self.word_to_index), self.vocab_size)

    @property
    def vocab(self):
        vocab = list(self.word_to_index.keys())
        expected_len = self.vocab_size
        if self.vocab_size is not None and len(vocab) != expected_len:
            raise RuntimeError(
                'Vectorizer\'s word_to_index dictionary has unexpected number'
                ' of entries: {}. It should have {}.'.format(
                    len(vocab), expected_len))
        return vocab

    @util.timed_function()
    def update(self, tokenized_docs):
        if self._is_finalized:
            raise RuntimeError('Vectorizer has been finalized. Update prohibited.')
        # Compute frequency statistics.
        self.word_to_freq.update(Vectorizer.get_word_dict(
            'word_to_freq', tokenized_docs))
        self.word_to_dfreq.update(Vectorizer.get_word_dict(
            'word_to_dfreq', tokenized_docs))
        logger.info('Longest sequence in docs has {} tokens.'.format(
            len(max(tokenized_docs, key=len))))
        logger.info(f'Num unique tokens seen thus far:'
                    f' {len(self.word_to_freq)}')
        logger.info(f'Num tokens total seen thus far:'
                    f' {sum(self.word_to_freq.values())}')
        if self.vocab_size is None:
            common_words_sorted = util.lmap(
                itemgetter(0), self.word_to_freq.most_common())
        else:
            common_words_sorted = util.lmap(
                itemgetter(0), self.word_to_freq.most_common(
                    self.vocab_size - len(self.START_VOCAB)))
        assert '_PAD' not in common_words_sorted, [
            l for l in tokenized_docs if '_PAD' in l][:2]
        assert '_UNK' not in common_words_sorted
        word_start = len(self.START_VOCAB)
        word_end = self.vocab_size or (word_start + len(common_words_sorted))
        self.word_to_index = OrderedDict(
            list(self.START_VOCAB.items()) +
            list(zip(common_words_sorted, range(word_start, word_end))))

    def finalize_updates(self):
        """Teardown operations after last call to .update, as determined by user."""
        self._finalize_frequency_dicts()
        self._finalize_sent_tokenizer()
        self._is_finalized = True

    def _finalize_frequency_dicts(self):
        # Formally insert entries for START_VOCAB in self.word_to_freq,
        # and align items with self.word_to_index.
        unk_freqs = sum([f for w, f in self.word_to_freq.items()
                         if w not in self.word_to_index])
        self.word_to_freq = OrderedCounter(
            [(self._PAD_TOKEN, None), (self._UNK_TOKEN, unk_freqs)] +
            [(w, self.word_to_freq[w]) for w in self.word_to_index])
        # Not obvious how to do analogous procedure for doc freqs, so set both
        # special token counts to None.
        self.word_to_dfreq = OrderedCounter(
            [(self._PAD_TOKEN, None), (self._UNK_TOKEN, None)] +
            [(w, self.word_to_dfreq[w]) for w in self.word_to_index])

    def _finalize_sent_tokenizer(self):
        """Re-instantiate sentence tokenizer to ensure it has updated params."""
        self._sent_tokenizer = PunktSentenceTokenizer(
            self._sent_trainer.get_params())

    def tokens_to_vector(self, tokens):
        """Converts list of word tokens to list of integer ids."""
        sent_vec = []
        for word in tokens:
            if self.is_common_unknown(word):
                sent_vec.append(self.START_VOCAB['_UNK'])
            elif self.is_in_vocabulary(word):
                sent_vec.append(self.word_to_index.get(word))
        return sent_vec

    def is_in_vocabulary(self, word):
        return word in self.word_to_index

    def is_common_unknown(self, word):
        return not self.is_in_vocabulary(word) and \
            self.word_to_freq.get(word, 0) >= self.FREQ_CUTOFF

    def detokenize(self, tokens):
        return self._word_tokenizer.detokenize(tokens)

    @util.listify
    def sent_tokenize(self, docs):
        """
        Args:
            docs: str or list(str)
        Returns:
            docs, with each entry tokenized into sentence strings.
        """
        return self._sent_tokenizer.tokenize_sents(docs)

    @util.listify
    def word_tokenize(self, docs, parallel=True):
        return self._word_tokenizer.tokenize_docs(docs, parallel=parallel)
from nltk.tokenize.punkt import PunktTrainer
import pickle

PUNCTUATION = (';', '.', '!', '?')

trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True

with open('./corpus.txt', 'r') as fs:
    text = fs.read()
    trainer.train(text, verbose=True)

params = trainer.get_params()

with open('./egs/punkt_tokenize/vi.pkl', 'wb') as fs:
    pickle.dump(params, fs)
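# Note that the script above pickles PunktParameters rather than a tokenizer;
# a tokenizer can be rebuilt from the stored parameters like this.
import pickle
from nltk.tokenize.punkt import PunktSentenceTokenizer

with open('./egs/punkt_tokenize/vi.pkl', 'rb') as fs:
    params = pickle.load(fs)

vi_tokenizer = PunktSentenceTokenizer(params)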
text = ""
for file_id in gutenberg.fileids():
    text += gutenberg.raw(file_id)
print(len(text))

soup = BeautifulSoup(
    open("D:\\YK Python\\xmltodict\\LUMNLRB3.BL23899175.xml").read(),
    'html.parser')

from pprint import pprint
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(text)
tokenizer = PunktSentenceTokenizer(trainer.get_params())

sentences = soup.get_text(' ')
sentence_list = tokenizer.tokenize(sentences)

from pymongo import MongoClient
client = MongoClient('mongodb://localhost:27017/')
db = client['nlp']
coll = db['Keywords_list']
extracted_sentences = []
# coding: utf-8
import codecs
import sys

from nltk.tokenize.punkt import PunktTrainer, PunktSentenceTokenizer

training = ''.join(
    codecs.open('IT-TrainingCorpus.txt', 'rb', 'utf-8').readlines())
trainer = PunktTrainer()
trainer.train(training, verbose=True)
tokenizer = PunktSentenceTokenizer(trainer.get_params(), verbose=True)

text = ''.join(codecs.open(sys.argv[1], 'rb', 'utf-8').readlines())
sentences = tokenizer.tokenize(text)
clean = [s for s in sentences if s.find('<strong>') != -1]
codecs.open('clean-gold', 'wb', 'utf-8').writelines([s + '\n' for s in clean])
train = False
if train:
    with gzip.open("en_corp", 'rt', encoding='utf-8') as encorp, \
            gzip.open("de_corp", 'rt', encoding='utf-8') as decorp:
        text_en = encorp.read()
        text_de = decorp.read()
    trainer_en = PunktTrainer()
    trainer_en.INCLUDE_ALL_COLLOCS = True
    trainer_en.train(text_en)
    trainer_de = PunktTrainer()
    trainer_de.INCLUDE_ALL_COLLOCS = True
    trainer_de.train(text_de)
    tokenizer_en = PunktSentenceTokenizer(trainer_en.get_params())
    tokenizer_de = PunktSentenceTokenizer(trainer_de.get_params())
else:
    # tokenizer_en = PunktSentenceTokenizer()
    # tokenizer_de = PunktSentenceTokenizer()
    # nltk.download('punkt')
    tokenizer_en = nltk.data.load('tokenizers/punkt/english.pickle')
    tokenizer_de = nltk.data.load('tokenizers/punkt/german.pickle')

mismatch = 0
with open(sys.argv[1]) as filtered:
    for line in filtered:
        tabs = line.split('\t')
        line_src = tabs[2]
        line_tgt = tabs[3]
        sent_src = tokenizer_en.tokenize(line_src)
from cltk.utils.file_operations import open_pickle
from nltk.tokenize.punkt import PunktLanguageVars, PunktTrainer, PunktSentenceTokenizer
from extract_features import parse_tess

PunktLanguageVars.sent_end_chars = ('.', ';', ';')
PunktLanguageVars.internal_punctuation = (',', '·', ':')

text = parse_tess('tesserae/texts/grc/xenophon.anabasis.tess')

new_xeno_trainer = PunktTrainer()
# new_xeno_trainer.INCLUDE_ALL_COLLOCS = True
# new_xeno_trainer.INCLUDE_ABBREV_COLLOCS = True
new_xeno_trainer.train(text)
new_xeno_params = new_xeno_trainer.get_params()
tess_xeno_params = open_pickle('tokenizers/ancient_greek.pickle')._params

print(new_xeno_params.abbrev_types)
print(new_xeno_params.abbrev_types == tess_xeno_params.abbrev_types)
print()
print(new_xeno_params.collocations)
print(new_xeno_params.collocations == tess_xeno_params.collocations)
print()
print(new_xeno_params.sent_starters)
print(new_xeno_params.sent_starters == tess_xeno_params.sent_starters)
print()
print(new_xeno_params.ortho_context)
print(new_xeno_params.ortho_context == tess_xeno_params.ortho_context)
print()

'''
I got the internal PunktParameters object from the cltk pickle file that was
trained on Xenophon's Anabasis
(https://github.com/cltk/greek_training_set_sentence_cltk/blob/master/training_sentences.txt),
and I also got the internal PunktParameters object from a PunktTrainer that I
created from training on Xenophon's Anabasis from the tesserae corpus
(https://github.com/tesserae/tesserae/blob/master/texts/grc/xenophon.anabasis.tess).