def load(cls, path, fields, tokenizer_lang, tokenizer_dir, use_gpu,
         verbose=True, max_sent_length=math.inf):
    tokenizer = Tokenizer(lang=tokenizer_lang, dir=tokenizer_dir,
                          use_gpu=use_gpu, verbose=verbose)
    sentences = []
    fields = [field if field is not None else Field(str(i))
              for i, field in enumerate(fields)]
    with open(path, 'r') as f:
        lines = []
        for line in tokenizer.format(tokenizer.predict(f.read())):
            line = line.strip()
            if not line:
                if len(lines) > max_sent_length:
                    logger.info('Discarded sentence longer than max_sent_length: %d',
                                len(lines))
                    lines = []
                    continue
                sentences.append(Sentence(fields, lines))
                lines = []
            else:
                if not line.startswith('#'):
                    # pad with placeholder columns up to the CoNLL field count
                    line = '{}\t{}'.format(
                        line,
                        '\t'.join('_' for _ in range(len(CoNLL._fields) - len(line.split('\t')))))
                    assert len(CoNLL._fields) == len(line.split('\t')), \
                        '{} - {} vs {}'.format(line, len(CoNLL._fields), len(line.split('\t')))
                lines.append(line)
    return cls(fields, sentences)
def test_corpus_load(self):
    tokenizer = Tokenizer(**self.args)
    raw_text_file = '/project/piqasso/Collection/IWPT20/train-dev/UD_Italian-ISDT/it_isdt-ud-dev.txt'
    with open(raw_text_file) as fin:
        for line in tokenizer.format(tokenizer.predict(fin.read())):
            if line and not line.startswith('#'):
                assert len(line.split('\t')) == 2, line
def test_corpus_load(self):
    tokenizer = Tokenizer(**self.args)
    sin = io.StringIO(
        "Un corazziere contro Scalfaro. L'attore le disse baciami o torno a riprendermelo.")
    for line in tokenizer.format(tokenizer.predict(sin.read())):
        if line and not line.startswith('#'):
            assert len(line.split('\t')) == 10, line
def test_ner(crf, test_sent):
    from tokenizer.tokenizer import Tokenizer
    token = Tokenizer()
    token.run()
    arr_featurized_sent = []
    postaged_sent = ViPosTagger.postagging(token.predict(test_sent))
    print(postaged_sent)
    test_arr = []
    for i in range(len(postaged_sent[0])):
        test_arr.append((postaged_sent[0][i], postaged_sent[1][i]))
    print(test_arr)
    featurized_sent = sent2features(test_arr)
    arr_featurized_sent.append(featurized_sent)
    predict = crf.predict(arr_featurized_sent)
    # return (token, tag), predicted_label pairs for the single input sentence
    return list(zip(test_arr, predict[0]))
def first_stats():
    tokenizer = Tokenizer()
    tokenizer.run()
    question_vocabulary = Vocabulary()
    questions = load_questions()
    question_list = []
    cc = 0
    for question in questions:
        # print(question)
        if cc % 10 == 0:
            print("\r%s" % cc, end='')
        cc += 1
        sen = tokenizer.predict(question)
        sen = sen.lower()
        tokens = question_vocabulary.get_sentence_token_ids(sen)
        question_list.append(tokens)
    print("\n Saving...")
    question_vocabulary.save(Q_VOCAB_NAME)
    utils.pickle_save(question_list, "question_tokens.dat")
    print("Done")
def load(cls, path, fields, tokenizer_lang, tokenizer_dir,
         verbose=True, max_sent_length=math.inf):
    tokenizer = Tokenizer(lang=tokenizer_lang, dir=tokenizer_dir, verbose=verbose)
    sentences = []
    fields = [field if field is not None else Field(str(i))
              for i, field in enumerate(fields)]
    with open(path, 'r') as f:
        lines = []
        for line in tokenizer.format(tokenizer.predict(f.read())):
            line = line.strip()
            if not line:
                if len(lines) > max_sent_length:
                    logger.info('Discarded sentence longer than max_sent_length: %d',
                                len(lines))
                    lines = []
                    continue
                sentences.append(Sentence(fields, lines))
                lines = []
            else:
                if not line.startswith('#'):
                    # append empty columns up to the CoNLL field count
                    line += '\t_' * (len(CoNLL._fields) - len(line.split('\t')))
                lines.append(line)
    return cls(fields, sentences)
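# Hypothetical usage sketch for the load() classmethod above. The Corpus class
# name, field list, file path, and tokenizer model directory are assumptions
# and not taken from the snippets; only the keyword parameters
# (tokenizer_lang, tokenizer_dir, max_sent_length) mirror the signature shown.
def example_corpus_load():
    fields = [Field('form'), None, None]          # placeholder field spec
    corpus = Corpus.load(
        'data/it_isdt-ud-dev.txt',                # raw text input, not CoNLL-U
        fields,
        tokenizer_lang='it',
        tokenizer_dir='models/tokenizer-it',      # assumed model location
        max_sent_length=200)
    return corpus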
def test_tokenize(self):
    tokenizer = Tokenizer(**self.args)
    sentences = tokenizer.predict(
        'Domani vorrei andare al mare.Speriamo faccia bel tempo.')
    self.assertEqual(len(sentences), 2)
def test_tokenize(self):
    tokenizer = Tokenizer(self.args['lang'])
    sentences = tokenizer.predict(
        'Ha chiamato il dr. Rossi.Vuole salutarti.')
    self.assertEqual(len(sentences), 2)
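# The test methods above reference a self.args fixture that is not shown.
# A minimal sketch of a compatible unittest setUp follows; every value is a
# placeholder, and only the parameter names (lang, dir, use_gpu, verbose)
# appear elsewhere in these snippets.
import unittest

class TokenizerTest(unittest.TestCase):
    def setUp(self):
        self.args = {
            'lang': 'it',
            'dir': 'models/tokenizer-it',   # placeholder model directory
            'use_gpu': False,
            'verbose': False,
        }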