def tag(tagger, args):
  """Tag the words of each input line with the given tagger."""
  for l in args.input:
    words = l.strip().split()
    line_annotations = [u"{:<16}{:<5}".format(w, p) for w, p in tagger.annotate(words)]
    _print(u"\n".join(line_annotations))
    _print(u"")

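# Sketch of how the generic tag() handler above can be bound to a concrete
# tagger. POSTagger comes from polyglot.tag; wrapping it in a small subcommand
# handler like this is an assumption about how the CLI wires things up, not
# part of the original module.
def pos_tag(args):
  """Tag words with their part-of-speech tags (illustrative wrapper)."""
  from polyglot.tag import POSTagger
  tag(POSTagger(lang=args.lang), args)
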
def ner_chunk(args):
  """Chunk named entities."""
  chunker = NEChunker(lang=args.lang)
  for l in args.input:
    words = l.strip().split()
    line_annotations = [u"{}\t{}".format(w, p) for w, p in chunker.annotate(words)]
    _print(u"\n".join(line_annotations))

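# Illustrative sketch (not part of the CLI): chunking one sentence directly
# with the same NEChunker.annotate() call the handler above uses. The example
# sentence and the language choice are arbitrary.
def _ner_chunk_example():
  chunker = NEChunker(lang="en")
  return list(chunker.annotate(u"Barack Obama visited Paris".split()))
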
def morphemes(args):
  """Segment words according to their morphemes."""
  morfessor = load_morfessor_model(lang=args.lang)
  for l in args.input:
    words = l.strip().split()
    morphemes = [(w, u"_".join(morfessor.viterbi_segment(w)[0])) for w in words]
    line_annotations = [u"{:<16}{:<5}".format(w, p) for w, p in morphemes]
    _print(u"\n".join(line_annotations))
    _print(u"")

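# Illustrative sketch (not part of the CLI): Morfessor's viterbi_segment()
# returns a (segments, score) pair, which is why the handler above keeps
# index [0] before joining with underscores. The example word is arbitrary.
def _morphemes_example():
  model = load_morfessor_model(lang="en")
  segments, score = model.viterbi_segment(u"unconditionally")
  return u"_".join(segments)
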
def transliterate(args):
  """Transliterate words according to the target language."""
  t = Transliterator(source_lang=args.lang, target_lang=args.target)
  for l in args.input:
    words = l.strip().split()
    line_annotations = [u"{:<16}{:<16}".format(w, t.transliterate(w)) for w in words]
    _print(u"\n".join(line_annotations))
    _print(u"")

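# Illustrative sketch (not part of the CLI): transliterating a single word with
# the same Transliterator API the handler above uses. The language pair and
# example word are arbitrary choices for the sketch.
def _transliterate_example():
  t = Transliterator(source_lang="en", target_lang="ru")
  return t.transliterate(u"london")
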
def segment(args):
  """Segment each input line into sentences and/or word tokens."""
  lang = args.lang
  w_tokenizer = WordTokenizer(locale=lang)
  s_tokenizer = SentenceTokenizer(locale=lang)
  if args.only_sent:
    for l in args.input:
      seq = Sequence(l)
      if not seq.empty():
        _print(u'\n'.join(s_tokenizer.transform(seq)))
  elif args.only_word:
    for l in args.input:
      seq = Sequence(l)
      if not seq.empty():
        _print(u' '.join(w_tokenizer.transform(seq)))
  else:
    for l in args.input:
      seq = Sequence(l)
      sents = s_tokenizer.transform(seq)
      words = w_tokenizer.transform(seq)
      for tokenized_sent in words.split(sents):
        if not tokenized_sent.empty():
          _print(u' '.join(tokenized_sent.tokens()))

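# Sketch of the argparse wiring the segment() handler assumes. Only the
# attribute names it reads (lang, input, only_sent, only_word) come from the
# handler itself; the subcommand name, help strings, and the assumption that
# --lang and --input are registered on the top-level parser are illustrative.
def _add_segment_parser(subparsers):
  parser = subparsers.add_parser("tokenize", help="Tokenize text into sentences and words.")
  parser.add_argument("--only-sent", dest="only_sent", action="store_true",
                      help="Print one sentence per line, without word tokenization.")
  parser.add_argument("--only-word", dest="only_word", action="store_true",
                      help="Print word tokens without sentence segmentation.")
  parser.set_defaults(func=segment)
  return parser
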
def cat(args):
  """Concatenate the content of the input file."""
  for l in args.input:
    _print(l.strip())

def detect(args):
  """Detect the language of each line."""
  for l in args.input:
    if l.strip():
      _print(u"{:<20}{}".format(Detector(l).language.name, l.strip()))

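# Illustrative sketch (not part of the CLI): the Detector object exposes the
# detected language via .language, whose .name attribute is what the handler
# above prints next to each line; .code is also available. The example text
# is arbitrary.
def _detect_example():
  detector = Detector(u"Bonjour tout le monde")
  return detector.language.name, detector.language.code
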