# NOTE(review): this line is a whitespace-mangled fragment: an `install_jar`
# helper (download-once cache into STATIC_ROOT), two module-level install
# calls, JVM class handles (LinearSVMClassifier, IOUtil — "载入分类器" = "load
# classifier", "保存模型的工具" = "model persistence utility"), and a trailing
# `return LinearSVMClassifier(IOUtil.readObjectFrom(model_file_name))` that
# belongs to a function whose `def` header is NOT visible here
# (`model_file_name` is undefined in this view) — do not treat the return as
# top-level code; confirm the enclosing function before reformatting.
def install_jar(name, url): dst = os.path.join(STATIC_ROOT, name) if os.path.isfile(dst): return dst download(url, dst) return dst install_jar('text-classification-svm-1.0.2.jar', 'http://file.hankcs.com/bin/text-classification-svm-1.0.2.jar') install_jar('liblinear-1.95.jar', 'http://file.hankcs.com/bin/liblinear-1.95.jar') # 载入分类器 LinearSVMClassifier = SafeJClass( 'com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier') # 保存模型的工具 IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil') return LinearSVMClassifier(IOUtil.readObjectFrom(model_file_name))
def url_recognition():
    """Demonstrate URL recognition with HanLP's URLTokenizer.

    Segments a sample Chinese text containing several URLs, prints the
    full term list, then prints only the terms whose nature (part of
    speech) is ``xu`` — the tag URLTokenizer assigns to URL terms.

    No parameters; no return value (output goes to stdout).
    """
    # Sample text mixing prose with http/https/www URLs and bare domains.
    text = """HanLP的项目地址是https://github.com/hankcs/HanLP, 发布地址是https://github.com/hankcs/HanLP/releases, 我有时候会在www.hankcs.com上面发布一些消息, 我的微博是http://weibo.com/hankcs/,会同步推送hankcs.com的新闻。 听说.中国域名开放申请了,但我并没有申请hankcs.中国,因为穷…… """
    # JVM class handles are loaded via the pyhanlp bridge.
    Nature = SafeJClass("com.hankcs.hanlp.corpus.tag.Nature")
    URLTokenizer = SafeJClass("com.hankcs.hanlp.tokenizer.URLTokenizer")
    term_list = URLTokenizer.segment(text)
    print(term_list)
    for term in term_list:
        # Nature.xu marks URL terms recognized by URLTokenizer.
        if term.nature == Nature.xu:
            print(term.word)
def divisionTrainData(trainDataPath, classificationPath):
    """Split a labelled training file into per-class directories.

    Each line of *trainDataPath* after the header is expected to be
    tab-separated: sample id, text, label.  Samples with label '0' are
    appended to one file per id under ``positive``, everything else
    under ``negetive`` (sic — directory name kept for compatibility).
    NOTE(review): label '0' landing in "positive" looks inverted for a
    sentiment corpus — confirm against the data set's label convention.

    :param trainDataPath: path to the UTF-8 TSV training file.
    :param classificationPath: root directory for the class folders.
    """
    positivePath = os.path.join(classificationPath, 'positive')
    negetivePath = os.path.join(classificationPath, 'negetive')
    # Create the root and class directories on first run.
    for directory in (classificationPath, positivePath, negetivePath):
        if not os.path.isdir(directory):
            os.mkdir(directory)
    # Route every sample into its class directory, one .txt file per id.
    with open(trainDataPath, 'r', encoding='utf-8') as fin:
        fin.readline()  # skip the header line
        for line in fin:
            fields = line.strip('\n').split('\t')
            target = positivePath if fields[2] == '0' else negetivePath
            # 'a+' keeps appending if the same id appears more than once;
            # the context manager guarantees the handle is closed.
            with open(os.path.join(target, fields[0] + '.txt'),
                      'a+', encoding='utf-8') as fout:
                fout.write(fields[1])
    print('成功加载训练集。')


##########################################################################################
# Classifier classes loaded through the HanLP JVM bridge.
IClassifier = JClass('com.hankcs.hanlp.classification.classifiers.IClassifier')
NaiveBayesClassifier = JClass(
    'com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
LinearSVMClassifier = SafeJClass(
    'com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier')
# Tokenizer classes.
ITokenizer = JClass('com.hankcs.hanlp.classification.tokenizers.ITokenizer')
HanLPTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.HanLPTokenizer')
BigramTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.BigramTokenizer')
##########################################################################################

if __name__ == '__main__':
    divisionTrainData(TRAIN_DATA_PATH, CLASSIFICATION_DATA_PATH)
    classifier = NaiveBayesClassifier()
    classifier.train(CLASSIFICATION_DATA_PATH)
    print(classifier.classify("我去挂机了"))
def install_jar(name, filepath, url):
    """Download *url* into ``filepath/name`` unless it already exists.

    :param name: jar file name.
    :param filepath: destination directory.
    :param url: download URL.
    :return: absolute path of the (possibly pre-existing) jar.
    """
    dst = os.path.join(filepath, name)
    if os.path.isfile(dst):
        # Already downloaded — skip the network round trip.
        return dst
    download(url, dst)
    return dst


install_jar('text-classification-svm-1.0.2.jar', PROJECT_PATH,
            'http://file.hankcs.com/bin/text-classification-svm-1.0.2.jar')
install_jar('liblinear-1.95.jar', PROJECT_PATH,
            'http://file.hankcs.com/bin/liblinear-1.95.jar')

##########################################################################################
# Classifier class loaded through the HanLP JVM bridge.
LinearSVMClassifier = SafeJClass(
    'com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier')
# Utility used to persist the trained model.
IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')
# Tokenizer class.
BigramTokenizer = JClass(
    'com.hankcs.hanlp.classification.tokenizers.BigramTokenizer')
##########################################################################################

if __name__ == '__main__':
    divisionTrainData(TRAIN_DATA_PATH, CLASSIFICATION_DATA_PATH)
    classifier = LinearSVMClassifier()
    classifier.train(CLASSIFICATION_DATA_PATH)
    # Save the model.  The Java accessor is getModel() — the original
    # lowercase getmodel() would raise AttributeError at runtime (see the
    # sibling script that calls classifier.getModel()).
    model = classifier.getModel()
# NOTE(review): whitespace-mangled fragment.  The leading
# `fin.readline() for sentence in ...` loop is the TAIL of a data-splitting
# function whose `def` header (and the definitions of `fin`, `positivePath`,
# `negetivePath`) is outside this view — do not reformat without it.  The
# per-sample files are opened without `with`; prefer a context manager when
# the enclosing function is reconstructed.  The remainder loads HanLP
# classifier classes ("载入分类器" = "load classifiers") and ends with an
# empty __main__ guard.
fin.readline() for sentence in fin.readlines(): sentence = sentence.strip('\n') sentence = sentence.split('\t') if (sentence[2] == '0'): pf = open(os.path.join(positivePath, sentence[0] + '.txt'), 'a+', encoding='utf-8') pf.write(sentence[1]) pf.close() else: nf = open(os.path.join(negetivePath, sentence[0] + '.txt'), 'a+', encoding='utf-8') nf.write(sentence[1]) nf.close() print('成功加载训练集。') ########################################################################################## # 载入分类器 IClassifier = JClass('com.hankcs.hanlp.classification.classifiers.IClassifier') NaiveBayesClassifier = JClass( 'com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier') LinearSVMClassifier = SafeJClass( 'com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier') ########################################################################################## if __name__ == '__main__': pass
# NOTE(review): whitespace-mangled fragment, truncated at BOTH edges.
# `download(url, dst) return dst` is the tail of an install_jar-style helper
# whose `def` header is outside this view, and `dataPreprocessing` is cut off
# right after opening its input file — reconstruct only with the missing
# context in hand.  The middle section loads HanLP JVM classes
# ("载入分词器" = "load tokenizer", "载入分类器" = "load classifier",
# "保存模型的工具" = "model persistence utility",
# "对数据集进行预处理" = "preprocess the data set").
download(url, dst) return dst install_jar('text-classification-svm-1.0.2.jar', PROJECT_PATH, 'http://file.hankcs.com/bin/text-classification-svm-1.0.2.jar') install_jar('liblinear-1.95.jar', PROJECT_PATH, 'http://file.hankcs.com/bin/liblinear-1.95.jar') ########################################################################################## # 载入分词器 BigramTokenizer = JClass( 'com.hankcs.hanlp.classification.tokenizers.BigramTokenizer') # 载入分类器 LinearSVMClassifier = SafeJClass( 'com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier') # 保存模型的工具 IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil') FileDataSet = JClass('com.hankcs.hanlp.classification.corpus.FileDataSet') MemoryDataSet = JClass('com.hankcs.hanlp.classification.corpus.MemoryDataSet') Evaluator = JClass( 'com.hankcs.hanlp.classification.statistics.evaluations.Evaluator') ########################################################################################## # 对数据集进行预处理 def dataPreprocessing(dataPath): print("开始修正数据......") fi = open(dataPath, "r", encoding="utf-8")
# NOTE(review): whitespace-mangled fragment, truncated at BOTH edges (tail of
# an install_jar-style helper at the start; `dataPreprocessing` cut off at the
# end).  BUG to fix when the full function is in view:
# `open(dataPath, "r", "utf-8")` passes "utf-8" into open()'s third
# positional parameter, which is `buffering` (an int) — this raises TypeError
# at runtime.  It must be `open(dataPath, "r", encoding="utf-8")`, as the
# sibling fragment on the previous line already does.
download(url, dst) return dst install_jar('text-classification-svm-1.0.2.jar', PROJECT_PATH, 'http://file.hankcs.com/bin/text-classification-svm-1.0.2.jar') install_jar('liblinear-1.95.jar', PROJECT_PATH, 'http://file.hankcs.com/bin/liblinear-1.95.jar') ########################################################################################## # 载入分词器 BigramTokenizer = JClass( 'com.hankcs.hanlp.classification.tokenizers.BigramTokenizer') # 载入分类器 LinearSVMClassifier = SafeJClass( 'com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier') # 保存模型的工具 IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil') FileDataSet = JClass('com.hankcs.hanlp.classification.corpus.FileDataSet') MemoryDataSet = JClass('com.hankcs.hanlp.classification.corpus.MemoryDataSet') Evaluator = JClass( 'com.hankcs.hanlp.classification.statistics.evaluations.Evaluator') ########################################################################################## # 对数据集进行预处理 def dataPreprocessing(dataPath): fi = open(dataPath, "r", "utf-8") fi.readline()
# NOTE(review): whitespace-mangled fragment, truncated at BOTH edges.  The
# opening `print/return dest_path` logic (download + unzip + cleanup) is the
# tail of an ensure_data-style helper whose `def` header is outside this
# view, and `train_or_load_classifier` is cut off right after
# `classifier.getModel()`.  Two things to confirm when the full file is in
# view: (1) `train_or_load_classifier(path)` trains on the module-level
# `ChnSentiCorp_path` rather than its own `path` parameter — verify that is
# intentional; (2) the trained `model` is assigned but what happens to it is
# cut off here (presumably saved via IOUtil — confirm).
print("dest_path:" + dest_path) if os.path.exists(dest_path): return dest_path if data_url.endswith('.zip'): dest_path += '.zip' download(data_url, dest_path) if data_url.endswith('.zip'): with zipfile.ZipFile(dest_path, "r") as archive: archive.extractall(root_path) remove_file(dest_path) dest_path = dest_path[:-len('.zip')] print("dest_path:" + dest_path) return dest_path NaiveBayesClassifier = SafeJClass( 'com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier') IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil') # sogou_corpus_path = ensure_data('搜狗文本分类语料库迷你版', # 'http://hanlp.linrunsoft.com/release/corpus/sogou-text-classification-corpus-mini.zip') ChnSentiCorp_path = ensure_data('酒店评论情感分析', \ 'http://hanlp.linrunsoft.com/release/corpus/ChnSentiCorp.zip') def train_or_load_classifier(path): model_path = path + '.ser' if os.path.isfile(model_path): return NaiveBayesClassifier(IOUtil.readObjectFrom(model_path)) classifier = NaiveBayesClassifier() classifier.train(ChnSentiCorp_path) model = classifier.getModel()
# NOTE(review): whitespace-mangled fragment, truncated at the end — the
# `print("%s: %s, seg: %s" %` inside MyThread.run() is cut mid-expression,
# so the class is incomplete in this view.  Content: a Python-2 encoding
# shim, absl/threading imports, a CRFLexicalAnalyzer handle loaded OUTSIDE
# the thread bodies via SafeJClass (the Chinese comment says: "use SafeJClass
# outside the thread body to import the class name thread-safely"), and a
# worker thread that repeatedly analyzes a fixed sentence.  `while
# self.counter:` never changes `self.counter` in the visible code — the
# decrement is presumably in the truncated part; confirm before assuming an
# infinite loop.
if sys.version_info[0] < 3: reload(sys) sys.setdefaultencoding("utf-8") # raise "Must be using Python 3" from absl import flags # absl-py from absl import logging # absl-py FLAGS = flags.FLAGS import unittest import threading import time from pyhanlp import HanLP, SafeJClass # 在线程体外部用SafeJClass线程安全地引入类名 CRFLexicalAnalyzer = SafeJClass( "com.hankcs.hanlp.model.crf.CRFLexicalAnalyzer") class MyThread(threading.Thread): def __init__(self, name, counter, analyzer): threading.Thread.__init__(self) self.thread_name = name self.counter = counter self.analyzer = analyzer def run(self): print("Starting " + self.thread_name) while self.counter: time.sleep(1) sentence = self.analyzer.analyze("商品和服务") print("%s: %s, seg: %s" %