def summarize_pdf(article_text):
    # Train a Punkt sentence splitter on the article itself, then tokenize it.
    trainer = PunktTrainer()
    trainer.train(article_text)
    tok = PunktSentenceTokenizer(trainer.get_params())
    sentence_list = tok.tokenize(article_text)

    sentence_lists = []
    sent_list = []
    clean_sent = []
    word_tokenizer = TreebankWordTokenizer()
    lemmatizer = WordNetLemmatizer()
    for sent in sentence_list:
        words = [w.lower() for w in word_tokenizer.tokenize(sent)]
        sentence_lists.append(" ".join(words))
        clean_words = []
        for word, tag in pos_tag(words):
            # Map the Penn Treebank tag to a WordNet POS for lemmatization.
            if tag.startswith('NN'):
                pos = 'n'
            elif tag.startswith('VB'):
                pos = 'v'
            elif tag.startswith('RB'):
                pos = 'r'
            else:
                pos = 'a'
            w = lemmatizer.lemmatize(word, pos)
            # Keep tokens that are not punctuation and not purely digits.
            if w not in punc and re.search(r"[^\d]", w):
                clean_words.append(w.lower())
        clean_sent.append(' '.join(clean_words))
        sent_list.append(clean_words)
    return sent_list, clean_sent, sentence_lists, sentence_list
def train_punkt(ctx, input, output, abbr, colloc):
    """Train Punkt sentence splitter using sentences in input."""
    click.echo('chemdataextractor.tokenize.train_punkt')
    import pickle
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
    punkt = PunktTrainer()
    # Set these to True to include collocations more leniently, then increase
    # MIN_COLLOC_FREQ to restrict again.
    # punkt.INCLUDE_ALL_COLLOCS = True
    # punkt.INCLUDE_ABBREV_COLLOCS = True
    # punkt.MIN_COLLOC_FREQ = 1
    # Don't train on titles. They may contain abbreviations, but basically
    # never have actual sentence boundaries.
    for fin in input:
        click.echo('Training on %s' % fin.name)
        sentences = fin.read()  # .replace('.\n', '. \n\n')
        punkt.train(sentences, finalize=False, verbose=True)
    punkt.finalize_training(verbose=True)
    if abbr:
        abbreviations = abbr.read().strip().split('\n')
        click.echo('Manually adding abbreviations: %s' % abbreviations)
        punkt._params.abbrev_types.update(abbreviations)
    if colloc:
        collocations = [tuple(l.split('. ', 1)) for l in colloc.read().strip().split('\n')]
        click.echo('Manually adding collocs: %s' % collocations)
        punkt._params.collocations.update(collocations)
    model = PunktSentenceTokenizer(punkt.get_params())
    pickle.dump(model, output, protocol=pickle.HIGHEST_PROTOCOL)
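# A minimal usage sketch for the model pickled by train_punkt above; the
# file name and sample text are hypothetical stand-ins.
import pickle

with open('punkt_chem.pickle', 'rb') as f:
    chem_tokenizer = pickle.load(f)

print(chem_tokenizer.tokenize('The mixture was stirred at approx. 25 C. It was then filtered.'))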
def constructor(self):
    # Train on the corpus's own word tokens rather than raw text.
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.INCLUDE_ABBREV_COLLOCS = True
    trainer.train_tokens(self.words())
    params = trainer.get_params()
    return PunktSentenceTokenizer(params)
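# For reference, a self-contained sketch of the same train_tokens() path,
# with a made-up token list standing in for self.words(); Punkt needs far
# more text than this to learn useful parameters.
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

demo_trainer = PunktTrainer()
demo_trainer.INCLUDE_ALL_COLLOCS = True
demo_trainer.train_tokens('Dr. Brown arrived early . He left at noon .'.split())
demo_tokenizer = PunktSentenceTokenizer(demo_trainer.get_params())
print(demo_tokenizer.tokenize('Dr. Brown spoke. Everyone listened.'))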
def rank_sentences(text, sentence_scores, title="", n=7):
    final_sentences = []
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)
    sent_tokenizer = PunktSentenceTokenizer(trainer.get_params())
    # Boost each sentence's score by its similarity to the title, if one is given.
    for s in sentence_scores:
        if not title:
            break
        sentence_scores[s] *= (1 + similarity_score(title, s))
    # Keep the n highest-scoring sentences.
    sc = OrderedDict(sorted(sentence_scores.items(), key=lambda t: t[1], reverse=True))
    ordered_sents = dict(islice(sc.items(), n))
    # Emit them in document order, with original casing.
    proper_sentences = sent_tokenizer.tokenize(text)
    for s in proper_sentences:
        if s.lower() in ordered_sents:
            final_sentences.append(s)
    return final_sentences
def train_sentence_tokenizer(self: object, text: str):
    """Train a sentence tokenizer."""
    language_punkt_vars = PunktLanguageVars
    # Set punctuation
    if self.punctuation:
        if self.strict:
            language_punkt_vars.sent_end_chars = self.punctuation + self.strict_punctuation
        else:
            language_punkt_vars.sent_end_chars = self.punctuation
    # Configure the trainer before training so the colloc flags take effect;
    # lang_vars expects an instance, and the second positional argument of
    # PunktTrainer is verbose, not lang_vars.
    trainer = PunktTrainer(lang_vars=language_punkt_vars())
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.INCLUDE_ABBREV_COLLOCS = True
    trainer.train(text)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    # Set abbreviations
    if self.abbreviations:
        for abbreviation in self.abbreviations:
            tokenizer._params.abbrev_types.add(abbreviation)
    return tokenizer
def train(src, tgt):
    with open(src, 'r', encoding='utf-8') as infile, \
            open(tgt, 'wb') as sent_tokenizer:
        contents = infile.read()
        language_punkt_vars = PunktLanguageVars
        # language_punkt_vars.sent_end_chars = tuple(args.end_chars)
        print("# Training sent tokenizer")
        # Configure the trainer before training so the colloc flags take effect.
        trainer = PunktTrainer(lang_vars=language_punkt_vars())
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.INCLUDE_ABBREV_COLLOCS = True
        trainer.train(contents)
        tokenizer = PunktSentenceTokenizer(trainer.get_params())
        # Known abbreviations that should not end a sentence.
        for abbrev in ('brgy', 'sen', 'supt', 'rep', 'dr', 'col', 'sec', 'mt',
                       'asst', 'mr', 'c/insp', 'sta', 'sto'):
            tokenizer._params.abbrev_types.add(abbrev)
        pickle.dump(tokenizer, sent_tokenizer)
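# Hypothetical round trip for train() above; both file names and the sample
# text are stand-ins.
import pickle

train('news_corpus.txt', 'filipino_punkt.pickle')
with open('filipino_punkt.pickle', 'rb') as f:
    filipino_tok = pickle.load(f)

print(filipino_tok.tokenize('Sen. Cruz visited brgy. San Roque. Residents welcomed him.'))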
def get_nltk_sent_tokenizer(container, lang):
    assert lang in ["zh", "en"], "Unknown language."
    trainer = PunktTrainer()
    if isinstance(container, Container):
        article_paths = container.get_all_article_paths(
            root_dir="../processed_data/crawler/nejm/articles/",
            ext=lang)
    elif isinstance(container, list):
        print("{} Articles.".format(len(container)))
        article_paths = container
    else:
        raise ValueError("Cannot parse container with class {}".format(
            container.__class__))
    missing_count = 0
    for path in article_paths:
        try:
            article = get_article_as_lowercase_string(path)
            trainer.train(text=article, finalize=False)
        except FileNotFoundError:
            print("{} not found.".format(path))
            missing_count += 1
    print("{} articles not found.".format(missing_count))
    trainer.finalize_training()
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    return tokenizer
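# Hypothetical call exercising the list branch of get_nltk_sent_tokenizer;
# the paths are placeholders for files produced by the original crawler.
article_paths = ['articles/0001.en', 'articles/0002.en']
en_tokenizer = get_nltk_sent_tokenizer(article_paths, 'en')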
def get_tokenizer(training_text):
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(training_text)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    tokenizer._params.abbrev_types.update(ABBREVIATIONS)
    return tokenizer
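# Sketch of exercising get_tokenizer; ABBREVIATIONS is defined elsewhere in
# the original module, so a stand-in set and a hypothetical corpus file are
# used here.
ABBREVIATIONS = {'fig', 'eq', 'sec'}

tok = get_tokenizer(open('corpus.txt', encoding='utf-8').read())
print(tok.tokenize('See fig. 3 below. The proof is in sec. 2.'))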
def get_tokenizer(self, xml, abbrevWordList, spentSplitList):
    # class BulletPointLangVars(PunktLanguageVars):
    #     sent_end_chars = ('?', '!')
    #     for i in range(len(spentSplitList)):
    #         sent_end_chars = sent_end_chars + tuple(spentSplitList[i])
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    train_data = 'sss'  # placeholder training text; no real corpus is used here
    trainer.train(train_data)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    # tokenizer = PunktSentenceTokenizer(trainer.get_params(), lang_vars=BulletPointLangVars())
    # Add sentence-splitting exceptions (abbreviations).
    rule['ABBREV_WORDS'].extend(abbrevWordList)
    for i in rule['ABBREV_WORDS']:
        tokenizer._params.abbrev_types.add(i)
    return tokenizer
def trainSentenceTokenizer(self):
    text = ""
    for file_id in gutenberg.fileids():
        text += gutenberg.raw(file_id)
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    tokenizer._params.abbrev_types.add('dr')
    tokenizer._params.abbrev_types.add('fig')
    return tokenizer
class PunktTokenizer:

    def __init__(self):
        self.modelfile = 'punkt_tokenizer.pk'
        if os.path.exists(self.modelfile):
            self.tokenizer = self.punkt_tokenize_load()
        else:
            # Train from scratch on the Gutenberg corpus, then cache the model.
            self.trainer = PunktTrainer()
            text = ""
            for file_id in gutenberg.fileids():
                text += gutenberg.raw(file_id)
            self.trainer.INCLUDE_ALL_COLLOCS = True
            self.trainer.train(text)
            self.tokenizer = PunktSentenceTokenizer(self.trainer.get_params())
            # Titles, months, and other abbreviations that should not split sentences.
            self.tokenizer._params.abbrev_types.update([
                'dr', 'mr', 'mrs', 'miss', 'ms', 'no',
                'jan', 'feb', 'mar', 'apr', 'may', 'jun',
                'aug', 'sep', 'oct', 'nov', 'dec',
            ])
            with open(self.modelfile, mode='wb') as fout:
                pickle.dump(self.tokenizer, fout, protocol=pickle.HIGHEST_PROTOCOL)

    def punkt_tokenize_load(self):
        with open(self.modelfile, mode='rb') as fin:
            return pickle.load(fin)

    def punkt_tokenizer_add_rule(self, word):
        self.tokenizer._params.abbrev_types.add(word)

    def punkt_sentence_tokenizer(self, sentences):
        return self.tokenizer.tokenize(sentences)
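# Short usage sketch for the class above (names follow the corrected spelling);
# the first run trains on Gutenberg and caches the pickle, later runs load it.
pt = PunktTokenizer()
pt.punkt_tokenizer_add_rule('prof')
print(pt.punkt_sentence_tokenizer('Prof. Lee met Dr. Kim in Jan. 2020. They talked.'))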
def get_sentence_tokenizer(language):
    """Return the sentence tokenizer callable."""
    pickle_path = 'sentence_tokenizer.pickle'
    try:
        with open(pickle_path, 'rb') as input_file:
            sentence_tokenizer = load(input_file)
    except FileNotFoundError:
        data_file_paths = []
        sentences = []
        try:
            # Get the paths to each file the bot will be trained with
            corpus_files = list_corpus_files('core.corpus.{language}'.format(
                language=language.ENGLISH_NAME.lower()))
        except LookupError:
            # Fall back to English sentence splitting rules if a language is not supported
            corpus_files = list_corpus_files('core.corpus.{language}'.format(
                language=languages.ENG.ENGLISH_NAME.lower()))
        data_file_paths.extend(corpus_files)
        for corpus, _categories, _file_path in load_corpus(*data_file_paths):
            for conversation in corpus:
                for text in conversation:
                    sentences.append(text.upper())
                    sentences.append(text.lower())
        trainer = PunktTrainer()
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.train('\n'.join(sentences))
        sentence_tokenizer = PunktSentenceTokenizer(trainer.get_params())
        # Pickle the sentence tokenizer for future use
        with open(pickle_path, 'wb') as output_file:
            dump(sentence_tokenizer, output_file, -1)
    return sentence_tokenizer
def trainSentenceTokenizer():
    """
    Trains a custom sentence tokenizer using Punkt. At the moment it performs
    worse than the plain English one (most likely because there is not much data).
    """
    collection = database["crawled-data"]
    text = ""
    for record in collection.find({ABSTRACT_DOCUMENT: {"$ne": None}}):
        text += record[ABSTRACT_DOCUMENT] + " "
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.INCLUDE_ABBREV_COLLOCS = True
    trainer.train(text)
    model = nltk.PunktSentenceTokenizer(trainer.get_params())
    with open("latvianPunkt2.pickle", mode='wb') as fout:
        pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)
def score_sentences(text, word_scores, unique):
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)
    sent_score = {}
    sent_tokenizer = PunktSentenceTokenizer(trainer.get_params())
    sentences = sent_tokenizer.tokenize(text.lower())
    for s in sentences:
        words = clean_text(s)
        sent_score[s] = 0
        for w in words:
            w = lemmatizer.lemmatize(w)
            if w in unique:
                sent_score[s] += word_scores[w]
    return sent_score
def main():
    opts, args = getopt.getopt(sys.argv[1:], 'l:', [])
    lang = None
    for o, a in opts:
        if o == '-l':
            lang = a
    if lang is None:
        print("Must pass -l language on the command line!", file=sys.stderr)
        sys.exit(1)
    if lang == 'en':
        print("Don't train for -l en! We are using the pre-trained punkt tokenizer from NLTK.",
              file=sys.stderr)
        sys.exit(1)
    lang_vars = MyPunktLanguageVars()
    trainer = PunktTrainer(lang_vars=lang_vars)
    train(trainer, lang)
    trainer.finalize_training(verbose=True)
    tokenizer = PunktSentenceTokenizer(trainer.get_params(), lang_vars=lang_vars)
    # Pickles must be written in binary mode.
    with open('LingwoNLP/punkt-' + lang + '.pickle', 'wb') as f:
        pickle.dump(tokenizer, f)
def train_tokenizer(trainfile, abbreviationfile, modelfile):
    k = 0
    skipped_ = 0
    custom_ = 0
    punkt = PunktTrainer()
    with codecs.open(trainfile, encoding='utf-8') as input_:
        for sentence in input_:
            k += 1
            if k % 100 == 0:
                print('trained from sentences: ' + str(k))
            try:
                punkt.train(sentence, finalize=False, verbose=False)
            except Exception:
                skipped_ += 1
    if abbreviationfile != '':
        with codecs.open(abbreviationfile, encoding='utf-8') as abbreviations_:
            for abbr in abbreviations_:
                try:
                    # Wrap each abbreviation in a mini-sentence so Punkt sees
                    # it followed by a period.
                    punkt.train('Start ' + abbr + '. End.', finalize=False, verbose=False)
                    custom_ += 1
                except Exception:
                    pass
    punkt.finalize_training(verbose=False)
    model = PunktSentenceTokenizer(punkt.get_params())
    with open(modelfile, mode='wb') as model_output:
        pickle.dump(model, model_output, protocol=pickle.HIGHEST_PROTOCOL)
    print('')
    print(str(skipped_) + ' sentences skipped')
    print(str(custom_) + ' custom abbreviations added')
def get_V(self, topics_file_name, other_file):
    if other_file:
        path = topics_file_name
    else:
        path = 'OpinosisDataset1.0_0/topics/{}'.format(topics_file_name)
    with open(path, encoding="utf8", errors='ignore') as f:
        text = f.read()
    # Get the X_train_counts and X_train_tf
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    X = tokenizer.tokenize(text)
    bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                        token_pattern=r'\b\w+\b', min_df=1)
    X_train_counts = bigram_vectorizer.fit_transform(X)
    tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    return X_train_counts, X_train_tf, tokenizer, bigram_vectorizer
def create_sentences(text_file, min_sentence_len):
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    with open(text_file, "r") as input_file:
        paragraphs = input_file.read()
    trainer.train(paragraphs)
    tokenizer = PunktSentenceTokenizer(trainer.get_params())
    # print(tokenizer._params.abbrev_types)
    sentences = []
    with open(text_file, "r") as input_file:
        for line in input_file.readlines():
            sentences.extend(tokenizer.tokenize(line))
    with open("dataset/sentences.txt", "a") as out_file:
        for sentence in sentences:
            if len(sentence) > min_sentence_len:
                out_file.write(sentence + "\n\n")
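# Hypothetical invocation; the corpus path and length cutoff are placeholders.
create_sentences('dataset/raw_text.txt', min_sentence_len=40)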
def train_punktsent(trainfile, modelfile):
    """
    Trains an unsupervised NLTK Punkt SENTENCE tokenizer.
    *trainfile* is the filename for the input file.
    *modelfile* is the filename for the model output file.
    """
    punkt = PunktTrainer()
    try:
        with codecs.open(trainfile, 'r', 'utf8') as fin:
            punkt.train(fin.read(), finalize=False, verbose=False)
    except KeyboardInterrupt:
        print('KeyboardInterrupt: Stopping the reading of the dump early!')
    # HACK: Adds abbreviations from rb_tokenizer.
    abbrv_sent = " ".join([i.strip() for i in
                           codecs.open('abbrev.lex', 'r', 'utf8').readlines()])
    abbrv_sent = "Start " + abbrv_sent + " End."
    punkt.train(abbrv_sent, finalize=False, verbose=False)
    # Finalize and output the trained model.
    punkt.finalize_training(verbose=True)
    model = PunktSentenceTokenizer(punkt.get_params())
    with open(modelfile, mode='wb') as fout:
        pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)
    return model
def build_sentence_model(text, extra_abbrevs=None):
    """
    Build a sentence model from text with optional extra abbreviations to include.
    :param text: training text
    :param extra_abbrevs: iterable of extra abbreviations, with or without trailing periods
    :return: a PunktSentenceTokenizer
    """
    # Setup Punkt trainer
    punkt_trainer = PunktTrainer()
    punkt_trainer.train(text, verbose=False, finalize=False)
    punkt_trainer.finalize_training(verbose=False)
    # Extract parameters from trainer
    punkt_params = punkt_trainer.get_params()
    # Add any extras if passed
    if extra_abbrevs is not None:
        for abbrev in extra_abbrevs:
            punkt_params.abbrev_types.add(abbrev.strip(".").lower())
    # Return model instantiated with new parameters
    return PunktSentenceTokenizer(punkt_params)
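# Quick usage sketch; `document` is assumed to hold text loaded elsewhere.
model = build_sentence_model(document, extra_abbrevs=['Sec.', 'No.'])
print(model.tokenize('Under Sec. 12, No. 3 applies. The rest is unchanged.'))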
# cursor = bills.find({}, {"text_versions": 1}, no_cursor_timeout=True, limit=10000)
cursor = bills.find({"congress": {"$in": ["114", "113"]}}, {"text_versions": 1},
                    no_cursor_timeout=True)

# Train trainer
pbar = ProgressBar(maxval=cursor.count()).start()
for i, line in enumerate(cursor):
    text = next(iter(line['text_versions'].values()))
    trainer.train(text, finalize=False, verbose=False)
    pbar.update(i)
pbar.finish()

print("Finalizing training...")
trainer.finalize_training(verbose=True)
print("Training done.")

# Include custom parameters
params = trainer.get_params()
# params.collocations = params.collocations | extra_collocations
# params.sent_starters = params.sent_starters | extra_sentence_starters
with open('sentence_tokenizer_params.pickle', 'wb') as f:
    pickle.dump(params, f, protocol=pickle.HIGHEST_PROTOCOL)
print("Params: %s" % repr(params))

# Create tokenizer
tokenizer = PunktSentenceTokenizer(params)

# Dump pickled tokenizer
with open("sentence_tokenizer.pickle", mode='wb') as f:
    pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)
# coding: utf-8
import codecs
from sys import argv, exit

from nltk.tokenize.punkt import PunktTrainer, PunktSentenceTokenizer

if len(argv) != 3:
    print("Usage: %s <TRAINING_CORPUS> <SENTENCES_TO_SPLIT>" % __file__)
    exit(1)

training = ''.join(codecs.open(argv[1], 'rb', 'utf-8').readlines())
trainer = PunktTrainer()
trainer.train(training, verbose=True)
tokenizer = PunktSentenceTokenizer(trainer.get_params(), verbose=True)

text = ''.join(codecs.open(argv[2], 'rb', 'utf-8').readlines())
sentences = tokenizer.tokenize(text)
codecs.open('split', 'wb', 'utf-8').writelines([s + '\n' for s in sentences])
PunktLanguageVars.sent_end_chars = ('.', ';', ';')
PunktLanguageVars.internal_punctuation = (',', '·', ':')

trainer = PunktTrainer(lang_vars=PunktLanguageVars())
trainer.INCLUDE_ALL_COLLOCS = True
trainer.INCLUDE_ABBREV_COLLOCS = True

corpus_dir = 'tesserae' + os.sep + 'texts' + os.sep + 'grc'
file_extension = 'tess'

# Obtain all the files to parse by traversing through the directory
file_names = sorted(list({
    current_path + os.sep + current_file_name
    for current_path, current_dir_names, current_file_names in os.walk(corpus_dir)
    for current_file_name in current_file_names
    if current_file_name.endswith('.' + file_extension)}))

counter = 1
for file_name in file_names:
    file_text = file_parsers[file_extension](file_name)
    trainer.train(file_text, verbose=False, finalize=False)
    print_progress_bar(counter, len(file_names))
    counter += 1

# `lang` and `file_parsers` are defined earlier in the original script.
with open(lang + '.pickle', 'wb') as pickle_file:
    pickle_file.write(pickle.dumps(PunktSentenceTokenizer(trainer.get_params())))

# params = trainer.get_params()
# tkzr = PunktSentenceTokenizer(params)
#
# s = 'test test test test test. test test test test. test test. test test; test test test.'
# s = 'test test test. test test test test test; test test. test test'
# print(tkzr.tokenize(s))
# print(TokenizeSentence('greek').tokenize_sentences(s))
class SimpleVectorizer:
    """Learns a dictionary from a given set of tokenized documents, and
    provides a user-friendly interface for subsequent conversions from
    tokens --> integer sequences.
    """

    # Tokens (and their IDs) always placed at the top of any vocab list.
    # Note: These are counted as part of the vocabulary.
    _PAD_TOKEN = '_PAD'
    _UNK_TOKEN = '_UNK'
    START_VOCAB = OrderedDict([(_PAD_TOKEN, 0), (_UNK_TOKEN, 1)])

    # Tokens with global frequency < FREQ_CUTOFF will be ignored entirely.
    # Specifically, we won't even insert UNK for tokens that occurred less than
    # this number of times across the entire set of calls to update().
    FREQ_CUTOFF = 0

    def __init__(self, vocab_size=None):
        self.vocab_size = vocab_size
        # Tokenization tools and other private attributes.
        self._sent_tokenizer = PunktSentenceTokenizer()
        self._sent_trainer = PunktTrainer()
        self._word_tokenizer = WordTokenizer()
        self._index_to_dfreq = None
        self._is_finalized = False
        # Number of times a given word has been seen across entire corpus.
        self.word_to_freq = OrderedCounter()
        # Number of docs that contained word w.
        self.word_to_dfreq = OrderedCounter()
        # Number of documents trained on so far.
        self.num_docs = 0
        # Dicts that will be filled when fitting documents.
        # word_index: w => i (index into vocabulary)
        # index_docs: i => doc_counts (doc_freq for word with index i).
        self.word_to_index = OrderedDict()

    def truncate_vocab(self, new_vocab_size=None):
        # If not given, set to exact number of unique tokens we've seen.
        new_vocab_size = new_vocab_size or len(self.word_to_index)
        logger.debug(f'Truncating vocab size from {self.vocab_size} '
                     f'to {new_vocab_size}.')
        if new_vocab_size > len(self.word_to_index):
            raise ValueError(
                'truncate_vocab received new vocab larger than previous.')
        self.vocab_size = new_vocab_size
        self.word_to_index = OrderedDict(
            (k, v) for k, v in self.word_to_index.items()
            if v < self.vocab_size)
        assert len(self.word_to_index) == self.vocab_size, '{} != {}'.format(
            len(self.word_to_index), self.vocab_size)

    @property
    def vocab(self):
        vocab = list(self.word_to_index.keys())
        expected_len = self.vocab_size
        if self.vocab_size is not None and len(vocab) != expected_len:
            raise RuntimeError(
                'Vectorizer\'s word_to_index dictionary has unexpected number'
                ' of entries: {}. It should have {}.'.format(
                    len(vocab), expected_len))
        return vocab

    @util.timed_function()
    def update(self, tokenized_docs):
        if self._is_finalized:
            raise RuntimeError('Vectorizer has been finalized. Update prohibited.')
        # Compute frequency statistics.
        self.word_to_freq.update(Vectorizer.get_word_dict(
            'word_to_freq', tokenized_docs))
        self.word_to_dfreq.update(Vectorizer.get_word_dict(
            'word_to_dfreq', tokenized_docs))
        logger.info('Longest sequence in docs has {} tokens.'.format(
            len(max(tokenized_docs, key=len))))
        logger.info(f'Num unique tokens seen thus far:'
                    f' {len(self.word_to_freq)}')
        logger.info(f'Num tokens total seen thus far:'
                    f' {sum(self.word_to_freq.values())}')
        if self.vocab_size is None:
            common_words_sorted = util.lmap(
                itemgetter(0), self.word_to_freq.most_common())
        else:
            common_words_sorted = util.lmap(
                itemgetter(0), self.word_to_freq.most_common(
                    self.vocab_size - len(self.START_VOCAB)))
        assert '_PAD' not in common_words_sorted, [
            l for l in tokenized_docs if '_PAD' in l][:2]
        assert '_UNK' not in common_words_sorted
        word_start = len(self.START_VOCAB)
        word_end = self.vocab_size or (word_start + len(common_words_sorted))
        self.word_to_index = OrderedDict(
            list(self.START_VOCAB.items()) +
            list(zip(common_words_sorted, range(word_start, word_end))))

    def finalize_updates(self):
        """Teardown operations after last call to .update, as determined by user."""
        self._finalize_frequency_dicts()
        self._finalize_sent_tokenizer()
        self._is_finalized = True

    def _finalize_frequency_dicts(self):
        # Formally insert entries for START_VOCAB in self.word_to_freq,
        # and align items with self.word_to_index.
        unk_freqs = sum([f for w, f in self.word_to_freq.items()
                         if w not in self.word_to_index])
        self.word_to_freq = OrderedCounter(
            [(self._PAD_TOKEN, None), (self._UNK_TOKEN, unk_freqs)] +
            [(w, self.word_to_freq[w]) for w in self.word_to_index])
        # Not obvious how to do analogous procedure for doc freqs, so set both
        # special token counts to None.
        self.word_to_dfreq = OrderedCounter(
            [(self._PAD_TOKEN, None), (self._UNK_TOKEN, None)] +
            [(w, self.word_to_dfreq[w]) for w in self.word_to_index])

    def _finalize_sent_tokenizer(self):
        """Re-instantiate sentence tokenizer to ensure it has updated params."""
        self._sent_tokenizer = PunktSentenceTokenizer(
            self._sent_trainer.get_params())

    def tokens_to_vector(self, tokens):
        """Converts list of word tokens to list of integer ids."""
        sent_vec = []
        for word in tokens:
            if self.is_common_unknown(word):
                sent_vec.append(self.START_VOCAB['_UNK'])
            elif self.is_in_vocabulary(word):
                sent_vec.append(self.word_to_index.get(word))
        return sent_vec

    def is_in_vocabulary(self, word):
        return word in self.word_to_index

    def is_common_unknown(self, word):
        return not self.is_in_vocabulary(word) and \
            self.word_to_freq.get(word, 0) >= self.FREQ_CUTOFF

    def detokenize(self, tokens):
        return self._word_tokenizer.detokenize(tokens)

    @util.listify
    def sent_tokenize(self, docs):
        """
        Args:
            docs: str or list(str)
        Returns:
            docs, with each entry tokenized into sentence strings.
        """
        return self._sent_tokenizer.tokenize_sents(docs)

    @util.listify
    def word_tokenize(self, docs, parallel=True):
        return self._word_tokenizer.tokenize_docs(docs, parallel=parallel)
from nltk.tokenize.punkt import PunktTrainer
import pickle

PUNCTUATION = (';', '.', '!', '?')

trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True

with open('./corpus.txt', 'r') as fs:
    text = fs.read()
    trainer.train(text, verbose=True)

params = trainer.get_params()

with open('./egs/punkt_tokenize/vi.pkl', 'wb') as fs:
    pickle.dump(params, fs)
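# Note that the script above pickles PunktParameters rather than a tokenizer;
# a tokenizer can be rebuilt from the stored parameters like this.
import pickle
from nltk.tokenize.punkt import PunktSentenceTokenizer

with open('./egs/punkt_tokenize/vi.pkl', 'rb') as fs:
    params = pickle.load(fs)

vi_tokenizer = PunktSentenceTokenizer(params)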
text = ""
for file_id in gutenberg.fileids():
    text += gutenberg.raw(file_id)
print(len(text))

soup = BeautifulSoup(
    open("D:\\YK Python\\xmltodict\\LUMNLRB3.BL23899175.xml").read(),
    'html.parser')

from pprint import pprint
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(text)
tokenizer = PunktSentenceTokenizer(trainer.get_params())

sentences = soup.get_text(' ')
sentence_list = tokenizer.tokenize(sentences)

from pymongo import MongoClient
client = MongoClient('mongodb://localhost:27017/')
db = client['nlp']
coll = db['Keywords_list']
extracted_sentences = []
# coding: utf-8
import codecs
import sys

from nltk.tokenize.punkt import PunktTrainer, PunktSentenceTokenizer

training = ''.join(
    codecs.open('IT-TrainingCorpus.txt', 'rb', 'utf-8').readlines())
trainer = PunktTrainer()
trainer.train(training, verbose=True)
tokenizer = PunktSentenceTokenizer(trainer.get_params(), verbose=True)

text = ''.join(codecs.open(sys.argv[1], 'rb', 'utf-8').readlines())
sentences = tokenizer.tokenize(text)
clean = [s for s in sentences if s.find('<strong>') != -1]
codecs.open('clean-gold', 'wb', 'utf-8').writelines([s + '\n' for s in clean])
train = False
if train:
    with gzip.open("en_corp", 'rt', encoding='utf-8') as encorp, \
            gzip.open("de_corp", 'rt', encoding='utf-8') as decorp:
        text_en = encorp.read()
        text_de = decorp.read()
    trainer_en = PunktTrainer()
    trainer_en.INCLUDE_ALL_COLLOCS = True
    trainer_en.train(text_en)
    trainer_de = PunktTrainer()
    trainer_de.INCLUDE_ALL_COLLOCS = True
    trainer_de.train(text_de)
    tokenizer_en = PunktSentenceTokenizer(trainer_en.get_params())
    tokenizer_de = PunktSentenceTokenizer(trainer_de.get_params())
else:
    # tokenizer_en = PunktSentenceTokenizer()
    # tokenizer_de = PunktSentenceTokenizer()
    # nltk.download('punkt')
    tokenizer_en = nltk.data.load('tokenizers/punkt/english.pickle')
    tokenizer_de = nltk.data.load('tokenizers/punkt/german.pickle')

mismatch = 0
with open(sys.argv[1]) as filtered:
    for line in filtered:
        tabs = line.split('\t')
        line_src = tabs[2]
        line_tgt = tabs[3]
        sent_src = tokenizer_en.tokenize(line_src)
from cltk.utils.file_operations import open_pickle
from nltk.tokenize.punkt import PunktLanguageVars, PunktTrainer, PunktSentenceTokenizer
from extract_features import parse_tess

PunktLanguageVars.sent_end_chars = ('.', ';', ';')
PunktLanguageVars.internal_punctuation = (',', '·', ':')

text = parse_tess('tesserae/texts/grc/xenophon.anabasis.tess')

new_xeno_trainer = PunktTrainer()
# new_xeno_trainer.INCLUDE_ALL_COLLOCS = True
# new_xeno_trainer.INCLUDE_ABBREV_COLLOCS = True
new_xeno_trainer.train(text)
new_xeno_params = new_xeno_trainer.get_params()
tess_xeno_params = open_pickle('tokenizers/ancient_greek.pickle')._params

print(new_xeno_params.abbrev_types)
print(new_xeno_params.abbrev_types == tess_xeno_params.abbrev_types)
print()
print(new_xeno_params.collocations)
print(new_xeno_params.collocations == tess_xeno_params.collocations)
print()
print(new_xeno_params.sent_starters)
print(new_xeno_params.sent_starters == tess_xeno_params.sent_starters)
print()
print(new_xeno_params.ortho_context)
print(new_xeno_params.ortho_context == tess_xeno_params.ortho_context)
print()

'''
I got the internal PunktParameters object from the cltk pickle file that was
trained on Xenophon's Anabasis
(https://github.com/cltk/greek_training_set_sentence_cltk/blob/master/training_sentences.txt),
and I also got the internal PunktParameters object from a PunktTrainer that I
created from training on Xenophon's Anabasis from the tesserae corpus
(https://github.com/tesserae/tesserae/blob/master/texts/grc/xenophon.anabasis.tess).