def process_corpora(conf_list, backend, reg_dir, variant, replace):
    """
    Install corpora described by a list of configuration files.

    Each file is loaded into an InstallJson record. If `replace` is set,
    the corpus is first removed from the backend. A corpus the backend
    already contains is only reported; otherwise its configuration is
    saved (along with the size returned by get_corpus_size) and, when a
    registry file of the same name exists, that file is handed over to
    parse_registry.

    arguments:
    conf_list -- an iterable of paths to corpus configuration files
    backend -- an object providing remove_corpus, contains_corpus and
               save_corpus_config (it is also passed to parse_registry)
    reg_dir -- a directory where registry files are searched for
    variant -- an optional subdirectory of reg_dir to take the registry
               file from; a falsy value means reg_dir itself
    replace -- if true then an existing corpus record is removed first
    """
    log = logging.getLogger(__name__)
    for conf_file in conf_list:
        log.info('Processing {0}'.format(conf_file))
        with open(conf_file) as fr:
            conf = InstallJson()
            conf.update(fr)

            if replace:
                log.info(
                    'Removing existing record (including registry) for {0}.'.format(conf.ident))
                backend.remove_corpus(conf.ident)

            if backend.contains_corpus(conf.ident):
                log.info('Corpus {0} already present - skipping.'.format(conf.ident))
                continue

            backend.save_corpus_config(conf, reg_dir, get_corpus_size(conf.ident, reg_dir))
            log.info('Saved config for {0}.'.format(conf.ident))

            # NOTE(review): the source layout was collapsed onto one line; the
            # registry import below is assumed to belong to the "corpus not yet
            # present" branch (an existing corpus is "skipped") - please confirm.
            reg_path = (os.path.join(reg_dir, variant, conf.ident) if variant
                        else os.path.join(reg_dir, conf.ident))
            if not os.path.isfile(reg_path):
                continue
            enc = infer_encoding(reg_path)
            with open(reg_path) as fr2:
                parse_registry(fr2, variant=variant, backend=backend, encoding=enc)
def process_corpora(conf_list, backend, reg_dir, variant, replace):
    """
    Install corpora described by a list of configuration files.

    Each file is loaded into an InstallJson record. If `replace` is set,
    the corpus is first removed from the backend. A corpus the backend
    already contains is only reported; otherwise its configuration is
    saved (along with the size returned by get_corpus_size) and, when a
    registry file of the same name exists, that file is handed over to
    parse_registry.

    arguments:
    conf_list -- an iterable of paths to corpus configuration files
    backend -- an object providing remove_corpus, contains_corpus and
               save_corpus_config (it is also passed to parse_registry)
    reg_dir -- a directory where registry files are searched for
    variant -- an optional subdirectory of reg_dir to take the registry
               file from; a falsy value means reg_dir itself
    replace -- if true then an existing corpus record is removed first
    """
    log = logging.getLogger(__name__)
    for conf_file in conf_list:
        log.info('Processing {0}'.format(conf_file))
        with open(conf_file) as fr:
            conf = InstallJson()
            conf.update(fr)

            if replace:
                log.info(
                    'Removing existing record (including registry) for {0}.'.format(conf.ident))
                backend.remove_corpus(conf.ident)

            if backend.contains_corpus(conf.ident):
                log.info('Corpus {0} already present - skipping.'.format(conf.ident))
                continue

            backend.save_corpus_config(conf, reg_dir, get_corpus_size(conf.ident, reg_dir))
            log.info('Saved config for {0}.'.format(conf.ident))

            # NOTE(review): the source layout was collapsed onto one line; the
            # registry import below is assumed to belong to the "corpus not yet
            # present" branch (an existing corpus is "skipped") - please confirm.
            reg_path = (os.path.join(reg_dir, variant, conf.ident) if variant
                        else os.path.join(reg_dir, conf.ident))
            if not os.path.isfile(reg_path):
                continue
            enc = infer_encoding(reg_path)
            with open(reg_path) as fr2:
                parse_registry(fr2, variant=variant, backend=backend, encoding=enc)
def process_directory(dir_path, variant, backend, auto_align, verbose):
    """
    Import every registry file located directly in a directory and
    store alignments between the imported corpora.

    Each regular file in the directory is passed to parse_registry
    (with an encoding obtained from infer_encoding). A file whose import
    fails is logged and skipped so the remaining files are still
    processed. Once all the files are done, alignments are written via
    backend.save_corpus_alignments, but only for corpora whose
    'created_rt' value returned by parse_registry is truthy.

    arguments:
    dir_path -- a directory containing registry files
    variant -- an optional subdirectory of dir_path to process instead
               of dir_path itself; also passed to parse_registry
    backend -- passed to parse_registry; must also provide
               save_corpus_alignments(corpus_id, aligned_ids)
    auto_align -- if true then each imported corpus is aligned with all
                  the other corpora imported from the directory and the
                  'aligned' values reported by parse_registry are ignored
    verbose -- if true then a traceback is printed for each failed file
    """
    if variant:
        dir_path = os.path.join(dir_path, variant)
    log = logging.getLogger(__name__)
    aligned = {}
    id_map = {}
    created_rt = {}
    for item in os.listdir(dir_path):
        fpath = os.path.join(dir_path, item)
        if not os.path.isfile(fpath):
            continue
        enc = infer_encoding(fpath)
        with open(fpath) as fr:
            try:
                ans = parse_registry(fr, variant=variant, backend=backend, encoding=enc)
                corpus_id = ans['corpus_id']
                created_rt[corpus_id] = ans['created_rt']
                if not auto_align:
                    aligned[corpus_id] = ans['aligned']
                # filled in both modes - the auto-align branch below reads it
                id_map[corpus_id] = corpus_id
            except Exception as ex:
                # deliberate best-effort: one broken file must not stop the import
                log.error(ex)
                if verbose:
                    import traceback
                    # print_exc() reads the active exception by itself; its first
                    # positional argument is a frame limit, not the exception
                    traceback.print_exc()

    aligned_ids_map = defaultdict(list)
    if auto_align:
        ids = set(id_map.values())
        for k in ids:
            aligned_ids_map[k] = list(ids - {k})
    else:
        for src_id, targets in aligned.items():
            for target in targets:
                try:
                    # the entry for src_id is created even if the lookup fails
                    aligned_ids_map[src_id].append(id_map[target])
                except KeyError:
                    log.warning('Ignored alignment {0} --> {1}'.format(src_id, target))

    for corpus_id, aligned_ids in aligned_ids_map.items():
        if created_rt.get(corpus_id, False):
            backend.save_corpus_alignments(corpus_id, aligned_ids)
def process_directory(dir_path, variant, backend, auto_align, verbose):
    """
    Import every registry file located directly in a directory and
    store alignments between the imported corpora.

    Each regular file in the directory is passed to parse_registry
    (with an encoding obtained from infer_encoding). A file whose import
    fails is logged and skipped so the remaining files are still
    processed. Once all the files are done, alignments are written via
    backend.save_corpus_alignments, but only for corpora whose
    'created_rt' value returned by parse_registry is truthy.

    arguments:
    dir_path -- a directory containing registry files
    variant -- an optional subdirectory of dir_path to process instead
               of dir_path itself; also passed to parse_registry
    backend -- passed to parse_registry; must also provide
               save_corpus_alignments(corpus_id, aligned_ids)
    auto_align -- if true then each imported corpus is aligned with all
                  the other corpora imported from the directory and the
                  'aligned' values reported by parse_registry are ignored
    verbose -- if true then a traceback is printed for each failed file
    """
    if variant:
        dir_path = os.path.join(dir_path, variant)
    log = logging.getLogger(__name__)
    aligned = {}
    id_map = {}
    created_rt = {}
    for item in os.listdir(dir_path):
        fpath = os.path.join(dir_path, item)
        if not os.path.isfile(fpath):
            continue
        enc = infer_encoding(fpath)
        with open(fpath) as fr:
            try:
                ans = parse_registry(fr, variant=variant, backend=backend, encoding=enc)
                corpus_id = ans['corpus_id']
                created_rt[corpus_id] = ans['created_rt']
                if not auto_align:
                    aligned[corpus_id] = ans['aligned']
                # filled in both modes - the auto-align branch below reads it
                id_map[corpus_id] = corpus_id
            except Exception as ex:
                # deliberate best-effort: one broken file must not stop the import
                log.error(ex)
                if verbose:
                    import traceback
                    # print_exc() reads the active exception by itself; its first
                    # positional argument is a frame limit, not the exception
                    traceback.print_exc()

    aligned_ids_map = defaultdict(list)
    if auto_align:
        ids = set(id_map.values())
        for k in ids:
            aligned_ids_map[k] = list(ids - {k})
    else:
        for src_id, targets in aligned.items():
            for target in targets:
                try:
                    # the entry for src_id is created even if the lookup fails
                    aligned_ids_map[src_id].append(id_map[target])
                except KeyError:
                    log.warning('Ignored alignment {0} --> {1}'.format(src_id, target))

    for corpus_id, aligned_ids in aligned_ids_map.items():
        if created_rt.get(corpus_id, False):
            backend.save_corpus_alignments(corpus_id, aligned_ids)
'-l', '--auto-align', metavar='AUTO_ALIGN', action='store_const', const=True, help='Align all the corpus in a directory automatically') parser.add_argument( '-v', '--verbose', action='store_const', const=True, help='Provide more information during processing (especially errors)') args = parser.parse_args() import settings settings.load(args.conf_path) backend = WritableBackend(MySQLConf(settings)) if os.path.isdir(args.rpath): process_directory(args.rpath, None, backend, args.auto_align, args.verbose) if args.variant: process_directory(args.rpath, args.variant, backend, args.auto_align, args.verbose) else: with open(args.rpath) as fr: parse_registry(fr, backend=backend, variant=args.variant, encoding=args.encoding if args.encoding else infer_encoding(args.rpath))
# NOTE(review): the statement below is the orphaned tail of a definition that
# starts outside this line (it matches the last statement of
# process_directory); it is kept verbatim - confirm where it belongs.
backend.save_corpus_alignments(corpus_id, aligned_ids)


if __name__ == '__main__':
    # Script entry point: imports a single registry file or all the registry
    # files of a directory (plus, optionally, its variant subdirectory).
    parser = argparse.ArgumentParser(description='Import a Manatee registry file(s)')
    parser.add_argument('rpath', metavar='REGISTRY_PATH', type=str)
    parser.add_argument('conf_path', metavar='CONFPATH', type=str)
    parser.add_argument('-e', '--encoding', metavar='ENCODING', type=str, default=None)
    parser.add_argument(
        '-a', '--variant', metavar='VARIANT', type=str,
        help='A subdirectory containing (restricted) variants of corpora')
    parser.add_argument(
        '-l', '--auto-align', metavar='AUTO_ALIGN', action='store_const', const=True,
        help='Align all the corpus in a directory automatically')
    parser.add_argument(
        '-v', '--verbose', action='store_const', const=True,
        help='Provide more information during processing (especially errors)')
    args = parser.parse_args()

    # imported only after the command line has been parsed successfully
    import settings
    settings.load(args.conf_path)
    backend = WritableBackend(MySQLConf(settings))

    if not os.path.isdir(args.rpath):
        # a single registry file; an explicit --encoding wins over detection
        with open(args.rpath) as fr:
            enc = args.encoding or infer_encoding(args.rpath)
            parse_registry(fr, backend=backend, variant=args.variant, encoding=enc)
    else:
        # the directory itself always goes first, its variant (if any) second
        variants = [None]
        if args.variant:
            variants.append(args.variant)
        for curr_variant in variants:
            process_directory(args.rpath, curr_variant, backend, args.auto_align,
                              args.verbose)