def highlight_text(args, parser): """Outputs the result of highlighting a text.""" tokenizer = utils.get_tokenizer(args) corpus = utils.get_corpus(args) output_dir = os.path.abspath(args.output) if os.path.exists(output_dir): parser.exit(status=3, message='Output directory already exists, ' 'aborting.\n') os.makedirs(output_dir, exist_ok=True) if args.ngrams: if args.label is None or len(args.label) != len(args.ngrams): parser.error('There must be as many labels as there are files ' 'of n-grams') report = tacl.NgramHighlightReport(corpus, tokenizer) ngrams = [] for ngram_file in args.ngrams: ngrams.append(utils.get_ngrams(ngram_file)) minus_ngrams = [] if args.minus_ngrams: minus_ngrams = utils.get_ngrams(args.minus_ngrams) report.generate(args.output, args.base_name, ngrams, args.label, minus_ngrams) else: report = tacl.ResultsHighlightReport(corpus, tokenizer) report.generate(args.output, args.base_name, args.results)
def search_texts(args, parser): """Searches texts for presence of n-grams.""" store = utils.get_data_store(args) corpus = utils.get_corpus(args) catalogue = utils.get_catalogue(args) store.validate(corpus, catalogue) ngrams = [] for ngram_file in args.ngrams: ngrams.extend(utils.get_ngrams(ngram_file)) store.search(catalogue, ngrams, sys.stdout)