예제 #1
0
파일: api.py 프로젝트: wisonwang/pyhanlp
def extractWords(text, size, newWordsOnly=False):
    """
    Extract words from a text by delegating to ``HanLP.extractWords``.

    :param text: the text to extract words from
    :param size: the number of words to extract
    :param newWordsOnly: whether to extract only words that are not
        already in the dictionary
    :return: the result of ``HanLP.extractWords`` (a list of words)
    """
    extracted = HanLP.extractWords(text, size, newWordsOnly)
    return extracted
예제 #2
0
def main():
    """
    Command-line entry point for HanLP.

    Builds an argument parser with the sub-commands ``segment``,
    ``newwords``, ``parse``, ``serve`` and ``update``, then dispatches on
    the selected task:

    * ``segment``  -- segment every line read from stdin and print the terms
      separated by spaces (``--tag`` / ``--no-tag`` toggles the
      part-of-speech output, ``-a`` selects the algorithm).
    * ``newwords`` -- read the given corpus file and print each extracted
      word together with its frequency.
    * ``parse``    -- print the dependency parse of every line read from
      stdin.
    * ``serve``    -- start the HTTP server on ``--port`` (Python 3 only).
    * ``update``   -- update the HanLP jar and data.

    ``-v`` / ``--version`` prints the jar, data and config locations and
    exits with status 0.  When invoked without any argument, ``--help`` is
    appended to ``sys.argv``.

    Fatal errors are reported on stderr and terminate the process with
    status 1 (via ``SystemExit``).
    """
    if len(sys.argv) == 1:
        sys.argv.append('--help')

    arg_parser = argparse.ArgumentParser(description='HanLP: Han Language Processing v{}'.format(HANLP_JAR_VERSION))
    arg_parser.add_argument('-v', '--version', required=False, action='store_true',
                            help='show installed versions of HanLP')
    task_parser = arg_parser.add_subparsers(dest="task", help='which task to perform?')
    segment_parser = task_parser.add_parser(name='segment', help='word segmentation')
    tag_parser = segment_parser.add_mutually_exclusive_group(required=False)
    tag_parser.add_argument('--tag', dest='tag', action='store_true', help='show part-of-speech tags')
    tag_parser.add_argument('--no-tag', dest='tag', action='store_false', help='don\'t show part-of-speech tags')
    segment_parser.set_defaults(tag=True)
    segment_parser.add_argument('-a', '--algorithm', type=str, default='viterbi',
                                help='algorithm of segmentation e.g. perceptron')
    newwords_parser = task_parser.add_parser(name='newwords', help='recognize new words')
    newwords_parser.add_argument('corpus', help='corpus file path')
    parse_parser = task_parser.add_parser(name='parse', help='dependency parsing')
    server_parser = task_parser.add_parser(name='serve', help='start http server',
                                           description='A http server for HanLP')
    server_parser.add_argument('--port', type=int, default=8765)
    # Registered for its side effect only; the sub-parser takes no options.
    task_parser.add_parser(name='update', help='update jar and data of HanLP')

    def add_args(p):
        # Options shared by the sub-commands that load HanLP models.
        p.add_argument("--config", default=PATH_CONFIG,
                       help='path to hanlp.properties')

    add_args(newwords_parser)
    add_args(segment_parser)
    add_args(parse_parser)

    # Handled before parse_args() so that -v works without a sub-command.
    if '-v' in sys.argv or '--version' in sys.argv:
        print('jar  {}: {}'.format(HANLP_JAR_VERSION, HANLP_JAR_PATH))
        data_version = hanlp_installed_data_version()
        print('data {}: {}'.format(data_version if data_version else '自定义', HANLP_DATA_PATH))
        print('config    : {}'.format(os.path.join(STATIC_ROOT, 'hanlp.properties')))
        # sys.exit instead of the site-provided exit(), which is not
        # guaranteed to exist (e.g. under ``python -S``).
        sys.exit(0)

    args = arg_parser.parse_args()

    def eprint(*messages, **kwargs):
        # print() to stderr; parameter renamed so it no longer shadows the
        # parsed ``args`` namespace of the enclosing function.
        print(*messages, file=sys.stderr, **kwargs)

    def die(msg):
        # Report a fatal error and terminate with a non-zero status.
        eprint(msg)
        sys.exit(1)

    if hasattr(args, 'config') and args.config:
        if os.path.isfile(args.config):
            JClass('com.hankcs.hanlp.utility.Predefine').HANLP_PROPERTIES_PATH = args.config
        else:
            die('Can\'t find config file {}'.format(args.config))

    if args.task == 'segment':
        try:
            segmenter = HanLP.newSegment(args.algorithm)
        except JavaException as e:
            java_class = e.javaClass()
            if java_class == JClass('java.lang.IllegalArgumentException'):
                die('invalid algorithm {}'.format(args.algorithm))
            elif java_class == JClass('java.lang.RuntimeException'):
                die('failed to load required model')
            # Any other Java exception used to be swallowed here, leaving
            # the segmenter unset and causing an unrelated AttributeError
            # further down; surface the real error instead.
            raise

        if not args.tag:
            # Only segmenters exposing analyze() are asked to switch their
            # tagging off; the global display flag is cleared in every case.
            if hasattr(segmenter, 'analyze'):
                segmenter.enablePartOfSpeechTagging(False)
            JClass('com.hankcs.hanlp.HanLP$Config').ShowTermNature = False
        for line in sys.stdin:
            line = line.strip()
            print(' '.join(term.toString() for term in segmenter.seg(any2utf8(line))))
    elif args.task == 'newwords':
        # isfile() is already False for paths that do not exist.
        if not os.path.isfile(args.corpus):
            die('corpus不存在或不是文件')
        # NOTE(review): the file is opened with the platform default
        # encoding; confirm corpora are expected in that encoding.
        with open(args.corpus, 'r') as corpus_file:
            corpus_text = corpus_file.read()
        # The file is closed before the extraction runs.
        for word_info in HanLP.extractWords(any2utf8(corpus_text), 10000, True):
            print('{} {}'.format(word_info.text, word_info.frequency))
    elif args.task == 'parse':
        for line in sys.stdin:
            line = line.strip()
            print(HanLP.parseDependency(any2utf8(line)))
    elif args.task == 'serve':
        if PY == 3:
            from pyhanlp import server
            server.run(port=args.port)
        else:
            die('现在server.py暂时不支持Python2,欢迎参与移植')
    elif args.task == 'update':
        # NOTE(review): the version printout above treats a falsy data
        # version as a custom install; confirm that
        # hanlp_installed_data_version() can actually return this sentinel.
        if hanlp_installed_data_version() == '手动安装':
            die('手动配置不支持自动升级,若要恢复自动安装,请清除HANLP相关环境变量')
        else:
            from pyhanlp.static import update_hanlp
            update_hanlp()