def ngram_intersection(args, parser):
    """Output the results of performing an intersection query.

    Validates the corpus against the catalogue before querying, then
    writes the intersection results to standard output.
    """
    data_store = utils.get_data_store(args)
    corpus = utils.get_corpus(args)
    catalogue = utils.get_catalogue(args)
    data_store.validate(corpus, catalogue)
    data_store.intersection(catalogue, sys.stdout)
def main():
    """Entry point for the paternity-test command-line tool.

    Builds the argument parser, collects the data store, catalogue and
    tokenizer from the parsed arguments, and runs a
    ``taclextra.paternity.PaternityTest``. Any failure during processing
    is reported through ``parser.error`` (which prints the message and
    exits with a non-zero status).
    """
    parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG,
                                     formatter_class=ParagraphFormatter)
    utils.add_db_arguments(parser)
    utils.add_tokenizer_argument(parser)
    utils.add_query_arguments(parser)
    parser.add_argument('parent', help=PARENT_LABEL_HELP,
                        metavar='PARENT_LABEL')
    parser.add_argument('child', help=CHILD_LABEL_HELP,
                        metavar='CHILD_LABEL')
    parser.add_argument('unrelated', help=UNRELATED_LABEL_HELP,
                        metavar='UNRELATED_LABEL')
    parser.add_argument('max_works', help=MAX_WORKS_HELP, metavar='MAXIMUM',
                        type=int)
    parser.add_argument('output_dir', help=OUTPUT_DIR_HELP,
                        metavar='DIRECTORY')
    args = parser.parse_args()
    catalogue = utils.get_catalogue(args)
    data_store = utils.get_data_store(args)
    tokenizer = utils.get_tokenizer(args)
    try:
        test = taclextra.paternity.PaternityTest(
            data_store, catalogue, tokenizer, args.parent, args.child,
            args.unrelated, args.max_works, args.output_dir)
        test.process()
    except Exception as e:
        # parser.error interpolates its message with %s, so passing the
        # exception object happens to work, but an explicit str() is
        # clearer and consistent with the other entry points in this
        # project. Broad Exception is acceptable here: this is the
        # top-level CLI boundary and the error is reported, not swallowed.
        parser.error(str(e))
def lifetime_report(args, parser):
    """Generate a lifetime report.

    Reads the results file named in ``args.results`` and writes the
    generated report into the (created if necessary) output directory.
    """
    catalogue = utils.get_catalogue(args)
    tokenizer = utils.get_tokenizer(args)
    results = tacl.Results(args.results, tokenizer)
    report_dir = os.path.abspath(args.output)
    os.makedirs(report_dir, exist_ok=True)
    tacl.LifetimeReport().generate(report_dir, catalogue, results, args.label)
def generate_ngrams(args, parser):
    """Add n-grams data to the data store.

    A catalogue is optional; when none is supplied, n-grams are added
    for the whole corpus.
    """
    data_store = utils.get_data_store(args)
    corpus = utils.get_corpus(args)
    catalogue = utils.get_catalogue(args) if args.catalogue else None
    data_store.add_ngrams(corpus, args.min_size, args.max_size, catalogue)
def generate_ngrams(args, parser):
    """Add n-grams data to the data store.

    The data store need not already exist (``must_exist=False``); a
    catalogue is optional and, when absent, n-grams are added for the
    whole corpus.
    """
    data_store = utils.get_data_store(args, must_exist=False)
    corpus = utils.get_corpus(args)
    catalogue = utils.get_catalogue(args) if args.catalogue else None
    data_store.add_ngrams(corpus, args.min_size, args.max_size, catalogue)
def search_texts(args, parser):
    """Search texts for the presence of n-grams.

    Collects the n-grams from every file listed in ``args.ngrams`` and
    writes the search results to standard output.
    """
    data_store = utils.get_data_store(args)
    corpus = utils.get_corpus(args)
    catalogue = utils.get_catalogue(args)
    data_store.validate(corpus, catalogue)
    ngrams = [ngram for path in args.ngrams
              for ngram in utils.get_ngrams(path)]
    data_store.search(catalogue, ngrams, sys.stdout)
def ngram_diff(args, parser):
    """Output the results of performing a diff query.

    Runs an asymmetric diff when ``args.asymmetric`` names a label,
    otherwise a full (symmetric) diff; results go to standard output.
    """
    data_store = utils.get_data_store(args)
    corpus = utils.get_corpus(args)
    catalogue = utils.get_catalogue(args)
    tokenizer = utils.get_tokenizer(args)
    data_store.validate(corpus, catalogue)
    if args.asymmetric:
        data_store.diff_asymmetric(catalogue, args.asymmetric, tokenizer,
                                   sys.stdout)
    else:
        data_store.diff(catalogue, tokenizer, sys.stdout)
def main():
    """Entry point for the lifetime reporting command-line tool.

    Parses the command-line arguments and runs a
    ``lifetime.LifetimeReporter`` over the requested output directory.
    """
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    utils.add_db_arguments(parser)
    utils.add_tokenizer_argument(parser)
    utils.add_query_arguments(parser)
    parser.add_argument('output', help=HELP_OUTPUT, metavar='DIRECTORY')
    args = parser.parse_args()
    data_store = utils.get_data_store(args)
    catalogue = utils.get_catalogue(args)
    tokenizer = utils.get_tokenizer(args)
    out_dir = os.path.abspath(args.output)
    reporter = lifetime.LifetimeReporter(data_store, catalogue, tokenizer,
                                         out_dir)
    reporter.process()
def main():
    """Entry point for the JitC report command-line tool.

    Validates the catalogue and corpus, then generates a
    ``jitc.JitCReport`` into the output directory. An existing output
    directory is reused (with a logged warning) rather than regenerated.
    """
    parser = generate_parser()
    args = parser.parse_args()
    if hasattr(args, 'verbose'):
        utils.configure_logging(args.verbose, logger)
    store = utils.get_data_store(args)
    corpus = utils.get_corpus(args)
    catalogue = utils.get_catalogue(args)
    tokenizer = utils.get_tokenizer(args)
    try:
        check_catalogue(catalogue, args.label)
    except Exception as err:
        parser.error(str(err))
    store.validate(corpus, catalogue)
    output_dir = os.path.abspath(args.output)
    if os.path.exists(output_dir):
        logger.warning('Output directory already exists; any results therein '
                       'will be reused rather than regenerated.')
    os.makedirs(output_dir, exist_ok=True)
    report = jitc.JitCReport(store, corpus, tokenizer)
    report.generate(output_dir, catalogue, args.label)
def main():
    """Entry point for the paired intersector command-line tool.

    Builds the argument parser, gathers the corpus, tokenizer and
    catalogue, then intersects all catalogued pairs via
    ``paired_intersector.PairedIntersector``.
    """
    parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG,
                                     formatter_class=ParagraphFormatter)
    parser.add_argument('--min_size', default=1, help=MINIMUM_HELP,
                        metavar='MINIMUM', type=int)
    parser.add_argument('--max_size', default=10, help=MAXIMUM_HELP,
                        metavar='MAXIMUM', type=int)
    utils.add_common_arguments(parser)
    utils.add_db_arguments(parser)
    utils.add_corpus_arguments(parser)
    utils.add_query_arguments(parser)
    parser.add_argument('output_dir', help='Path to output directory',
                        metavar='DIRECTORY')
    parser.add_argument('tracker_path', help='Path to tracking file',
                        metavar='TRACKING')
    args = parser.parse_args()
    logger = logging.getLogger('taclextra')
    if hasattr(args, 'verbose'):
        utils.configure_logging(args.verbose, logger)
    corpus = utils.get_corpus(args)
    # No persistent data store is created when an in-memory database is
    # requested; the intersector receives None in that case.
    data_store = None if args.db == 'memory' else utils.get_data_store(args)
    tokenizer = utils.get_tokenizer(args)
    catalogue = utils.get_catalogue(args)
    intersector = paired_intersector.PairedIntersector(
        data_store, corpus, tokenizer, catalogue, args.output_dir,
        args.tracker_path, args.min_size, args.max_size)
    intersector.intersect_all()