def extractWords(text, size, newWordsOnly=False):
    """Mine words from raw text (new-word discovery).

    :param text: the text to extract words from
    :param size: the number of words to extract
    :param newWordsOnly: if True, return only words missing from the dictionary
    :return: a list of extracted words
    """
    words = HanLP.extractWords(text, size, newWordsOnly)
    return words
def main():
    """Command-line entry point for pyhanlp.

    Builds an argparse CLI with one subcommand per task (segment, newwords,
    parse, serve, update) and dispatches to the corresponding HanLP call.
    Exits the process via exit()/die() on --version, bad config, or errors.
    """
    # No arguments at all: show help instead of doing nothing.
    if len(sys.argv) == 1:
        sys.argv.append('--help')
    arg_parser = argparse.ArgumentParser(description='HanLP: Han Language Processing v{}'.format(HANLP_JAR_VERSION))
    arg_parser.add_argument('-v', '--version', required=False, action='store_true',
                            help='show installed versions of HanLP')
    task_parser = arg_parser.add_subparsers(dest="task", help='which task to perform?')
    # `segment` subcommand: word segmentation, with an exclusive --tag/--no-tag pair.
    segment_parser = task_parser.add_parser(name='segment', help='word segmentation')
    tag_parser = segment_parser.add_mutually_exclusive_group(required=False)
    tag_parser.add_argument('--tag', dest='tag', action='store_true', help='show part-of-speech tags')
    tag_parser.add_argument('--no-tag', dest='tag', action='store_false', help='don\'t show part-of-speech tags')
    segment_parser.set_defaults(tag=True)  # POS tags shown unless --no-tag is given
    segment_parser.add_argument('-a', '--algorithm', type=str, default='viterbi',
                                help='algorithm of segmentation e.g. perceptron')
    # `newwords` subcommand: new-word discovery over a corpus file.
    newwords_parser = task_parser.add_parser(name='newwords', help='recognize new words')
    newwords_parser.add_argument('corpus', help='corpus file path')
    # `parse` subcommand: dependency parsing of stdin lines.
    parse_parser = task_parser.add_parser(name='parse', help='dependency parsing')
    # `serve` subcommand: HTTP API server (Python 3 only — checked at dispatch).
    server_parser = task_parser.add_parser(name='serve', help='start http server', description='A http server for HanLP')
    server_parser.add_argument('--port', type=int, default=8765)
    # `update` subcommand: refresh the HanLP jar and data packages.
    update_parser = task_parser.add_parser(name='update', help='update jar and data of HanLP')

    def add_args(p):
        # Shared option: every data-processing task may point at a custom hanlp.properties.
        p.add_argument("--config", default=PATH_CONFIG, help='path to hanlp.properties')
        # p.add_argument("--action", dest="action", default='predict',
        # help='Which action (train, test, predict)?')

    add_args(newwords_parser)
    add_args(segment_parser)
    add_args(parse_parser)

    # -v/--version is handled before parse_args so it works without a subcommand.
    if '-v' in sys.argv or '--version' in sys.argv:
        print('jar {}: {}'.format(HANLP_JAR_VERSION, HANLP_JAR_PATH))
        data_version = hanlp_installed_data_version()
        # '自定义' is the runtime label for a custom (non-managed) data install.
        print('data {}: {}'.format(data_version if data_version else '自定义', HANLP_DATA_PATH))
        print('config : {}'.format(os.path.join(STATIC_ROOT, 'hanlp.properties')))
        exit(0)
    args = arg_parser.parse_args()

    def eprint(*args, **kwargs):
        # print() redirected to stderr, keeping stdout clean for task output.
        print(*args, file=sys.stderr, **kwargs)

    def die(msg):
        # Report a fatal error on stderr and terminate with a non-zero status.
        eprint(msg)
        exit(1)

    # Point the JVM side at the user-supplied properties file before any model loads.
    if hasattr(args, 'config') and args.config:
        if os.path.isfile(args.config):
            JClass('com.hankcs.hanlp.utility.Predefine').HANLP_PROPERTIES_PATH = args.config
        else:
            die('Can\'t find config file {}'.format(args.config))

    if args.task == 'segment':
        segmenter = None
        try:
            segmenter = HanLP.newSegment(args.algorithm)
        except JavaException as e:
            # Map the Java exception class to a user-facing CLI error.
            if e.javaClass() == JClass('java.lang.IllegalArgumentException'):
                die('invalid algorithm {}'.format(args.algorithm))
            elif e.javaClass() == JClass('java.lang.RuntimeException'):
                die('failed to load required model')
        # Lexical analyzers expose analyze(); they need POS tagging disabled explicitly.
        is_lexical_analyzer = hasattr(segmenter, 'analyze')
        if not args.tag:
            if is_lexical_analyzer:
                segmenter.enablePartOfSpeechTagging(False)
                JClass('com.hankcs.hanlp.HanLP$Config').ShowTermNature = False
            else:
                # NOTE(review): both branches set ShowTermNature = False; the split
                # only matters for the extra enablePartOfSpeechTagging call above.
                JClass('com.hankcs.hanlp.HanLP$Config').ShowTermNature = False
        # Stream mode: one input line in, one space-joined segmented line out.
        for line in sys.stdin:
            line = line.strip()
            print(' '.join(term.toString() for term in segmenter.seg(any2utf8(line))))
    elif args.task == 'newwords':
        if not os.path.exists(args.corpus) or not os.path.isfile(args.corpus):
            die('corpus不存在或不是文件')
        with open(args.corpus, 'r') as corpus_file:
            # Extract up to 10000 dictionary-absent words with their frequencies.
            new_words = [
                (word_info.text, word_info.frequency)
                for word_info in list(HanLP.extractWords(
                    any2utf8(corpus_file.read()), 10000, True))
            ]
            for word in new_words:
                print('{} {}'.format(word[0], word[1]))
    elif args.task == 'parse':
        # Stream mode: print the dependency parse of each stdin line.
        for line in sys.stdin:
            line = line.strip()
            print(HanLP.parseDependency(any2utf8(line)))
    elif args.task == 'serve':
        if PY == 3:
            from pyhanlp import server
            server.run(port=args.port)
        else:
            die('现在server.py暂时不支持Python2,欢迎参与移植')
    elif args.task == 'update':
        # '手动安装' marks a manually-installed data set that auto-update must not touch.
        if hanlp_installed_data_version() == '手动安装':
            die('手动配置不支持自动升级,若要恢复自动安装,请清除HANLP相关环境变量')
        else:
            from pyhanlp.static import update_hanlp
            update_hanlp()