Example #1
import os

import pandas as pd
from nltk.tokenize.stanford_segmenter import StanfordSegmenter


def stanford_seg(path, stage='train'):
    df_dir = os.path.join(path, '{}.csv'.format(stage))
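    # note: error_bad_lines was deprecated in pandas 1.3 and removed in 2.0 (use on_bad_lines='skip' there)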
    data = pd.read_csv(df_dir, error_bad_lines=False, dtype=object)
    data = data.dropna(axis=0, how='any').reset_index(drop=True)
    data_dir = '/home/trueto/stanford_segmenter/'
    seg = StanfordSegmenter(path_to_jar=data_dir + 'stanford-segmenter.jar',
                            java_class='edu.stanford.nlp.ie.crf.CRFClassifier',
                            path_to_sihan_corpora_dict=data_dir + "data",
                            path_to_model=data_dir + 'data/pku.gz',
                            path_to_dict=data_dir + "data/dict-chris6.ser.gz")
    columns = data.columns
    for column in columns:
        if column in ['question1', 'question2']:
            column_file = os.path.join(path, 'cut',
                                       '{}_{}.txt'.format(stage, column))
            data[column].to_csv(column_file, index=False)
            cut_file = os.path.join(path, 'cut',
                                    '{}_{}_cut.txt'.format(stage, column))
            with open(cut_file, 'w') as f:
                f.write(seg.segment_file(column_file))
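# A hypothetical usage sketch for stanford_seg(); the data directory below is a
# placeholder, and the '<path>/cut' sub-folder is assumed to exist beforehand.
if __name__ == '__main__':
    stanford_seg('/path/to/data', stage='train')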
Example #2
import re
import string
from os import path

from nltk.tokenize.stanford_segmenter import StanfordSegmenter


class SegmentWorker:
    def __init__(self):
        file_path = path.realpath(__file__)
        dir_path = path.dirname(file_path)
        self.path_to_jar = path.join(dir_path, 'stanford-segmenter-3.9.2.jar')
        self.path_to_model = path.join(dir_path, 'data/ctb.gz')  # pku.gz
        self.path_to_dict = path.join(dir_path, 'data/dict-chris6.ser.gz')
        self.path_to_sihan_corpora_dict = path.join(dir_path, 'data/')
        self.seg = StanfordSegmenter(
            path_to_jar=self.path_to_jar,
            java_class='edu.stanford.nlp.ie.crf.CRFClassifier',
            path_to_model=self.path_to_model,
            path_to_dict=self.path_to_dict,
            path_to_sihan_corpora_dict=self.path_to_sihan_corpora_dict)

    def seg_file(self, file_to_segment):
        """segment a file and return the result string"""
        seg_result = self.seg.segment_file(file_to_segment)

        translator = str.maketrans('', '', string.digits)
        seg_result = seg_result.translate(translator)
        seg_result = re.sub('[\\\\.!/_,$%^*(+\\"\']+|[+—!,:;。?、~@#¥%…&*()]+',
                            '', seg_result)
        # print(seg_result)
        return seg_result

    def seg_file2list(self, file_to_segment):
        """segment a text file and return array of tokens"""
        seg_result = self.seg_file(file_to_segment)
        # print(seg_result)
        return seg_result.split()

    def seg_file2file(self, origin_file, dest_file):
        """segment a text file and write result tokens to another file"""
        seg_result = self.seg_file(origin_file)
        seg_result = re.sub('\\s+', ' ', seg_result)
        # print(seg_result)
        with open(dest_file, 'w', encoding='UTF-8') as f:
            f.write(seg_result)
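# A hypothetical usage sketch for SegmentWorker; the file names are placeholders,
# and the segmenter jar/model files are expected to sit next to this script.
worker = SegmentWorker()
print(worker.seg_file2list('input_zh.txt'))
worker.seg_file2file('input_zh.txt', 'input_zh_seg.txt')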
Example #3
## Download (run once to fetch NLTK data)
# import nltk
# nltk.download()

from nltk.tokenize.stanford_segmenter import StanfordSegmenter
segmenter = StanfordSegmenter(
    path_to_jar="stanford-segmenter-3.6.0.jar",
    path_to_slf4j="slf4j-api.jar",
    path_to_sihan_corpora_dict="./data",
    path_to_model="./data/pku.gz",
    path_to_dict="./data/dict-chris6.ser.gz")
sentence = u"这是斯坦福中文分词器测试"
segmenter.segment(sentence)
# >>> u'\u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5\n'
segmenter.segment_file("test.simp.utf8")
# >>> u'\u9762\u5bf9 \u65b0 \u4e16\u7eaa \uff0c \u4e16\u754c \u5404\u56fd ...

# English test
# import nltk
# text = 'i am a good boy.you are a bad girl'
# sens = nltk.sent_tokenize(text)
# print(sens)
# words = []
# for sent in sens:
#     words.append(nltk.word_tokenize(sent))
# for line in words:
#     print(line)
#
# tags = []
# for tokens in words:
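# The commented-out English test above stops before the tagging step; a minimal
# self-contained sketch of where it appears to lead (using nltk.pos_tag, which is
# an assumption here and requires the punkt and averaged_perceptron_tagger data).
import nltk
words = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize('i am a good boy.you are a bad girl')]
tags = [nltk.pos_tag(tokens) for tokens in words]
print(tags)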
Example #4
import os
import time

from nltk.tokenize.stanford_segmenter import StanfordSegmenter
start = time.time()

jar = '/Users/sinansmac/Public/StanfordNLP/stanford-segmenter-2017-06-09/stanford-segmenter.jar'
api_jar = '/Users/sinansmac/Public/StanfordNLP/stanford-parser-full-2017-06-09/slf4j-api.jar'
# dict = '/Users/sinansmac/Public/StanfordNLP/stanford-segmenter-2017-06-09/data/dict-chris6.ser.gz'

seg = StanfordSegmenter(path_to_jar=jar, path_to_slf4j=api_jar)
seg.default_config('zh')

# sent = u'这是斯坦福中文分词器测试'
# print(seg.segment(sent))

fp = "chinese.txt"
tokenstr = seg.segment_file(fp)
token_ls = tokenstr.split()  # split on whitespace; list(tokenstr) would yield single characters, not tokens
print(len(token_ls), '\n', tokenstr, '\n', token_ls)

# with open('chinese_tokens.txt', 'a') as writef:
#     for line in token_ls:
#         writef.write(line.rstrip().split())

# print(tokens, '\n', type(tokens)) # class 'str'

end = time.time()

print("process time:", round(end - start))
Example #5
File: NLTK.py  Project: LeonHanml/Python
# from nltk.tokenize.stanford import
from nltk.tokenize.stanford_segmenter import StanfordSegmenter

path = "D:/www/data/nlpsoftware/stanford-segmenter"
segmenter = StanfordSegmenter(
    path_to_jar=path + "/stanford-segmenter.jar",
    path_to_sihan_corpora_dict=path + "/data",
    path_to_model=path + "/data/pku.gz",
    path_to_dict=path + "/data/dict-chris6.ser.gz",
    java_class='edu.stanford.nlp.ie.crf.CRFClassifier')
#
sentence = u"这是斯坦福中文分词器测试"
sentence = u"工信处女干事每月经过   下属   科室都要亲口交代24口交换机等技术性器件的安装工作"

segmenter.tokenize_sents(u"工信处")
result = segmenter.segment(sentence)
result2 = segmenter.segment_file(
    "D:/www/data/nlpdata/icwb2-data/testing/pku_test.utf8")
clean_content = "D:\\www\\data\\Weibo Data\\Weibo Data\\nlp/clean_content.txt"
# clean_content_out="D:\\www\\data\\Weibo Data\\Weibo Data\\nlp/clean_content_out.txt"
# result3 = segmenter.segment_file(clean_content)
print(type(result2))

# with open(clean_content_out,'wb+') as f:
#     f.writelines([(s+"\r\n").encode('utf8') for s in  clean_content_out])
print(result2)
# outfile = open("D:/www/data/nlpsoftware/outfile.txt",'w')
# outfile.write(result)
# outfile.close()
#
# stanford_postagger="D:\\www\\data/nlpsoftware/stanford-postagger-full-2017-06-09\\stanford-postagger.jar"
# stanford_ner="D:\\www\\data/nlpsoftware/stanford-ner-2017-06-09\\stanford-ner.jar"
# classifiers="D:\\www\\data\\nlpsoftware\\stanford-ner\\classifiers\\"
Example #6
            f.write('\n###############################################################################\n')
        f.close()

    # Segments file
    JAVA_PATH = 'C:/java/bin/java.exe'
    os.environ['JAVA_HOME'] = JAVA_PATH
    segmenter = StanfordSegmenter(
        path_to_jar='stanford-segmenter-2018-02-27/stanford-segmenter-3.9.1.jar',
        path_to_sihan_corpora_dict='./stanford-segmenter-2018-02-27/data',
        java_class='edu.stanford.nlp.ie.crf.CRFClassifier',
        path_to_model='stanford-segmenter-2018-02-27/data/pku.gz',
        path_to_dict='stanford-segmenter-2018-02-27/data/dict-chris6.ser.gz')

    with open('segmented_chinese.txt', mode='w', encoding='utf-8') as f:
        print('Starting segmentation')
        result = segmenter.segment_file('chinese_all.txt')
        print('now writing')
        f.write(result)

    # Separates into list
    documents_seg = []
    with open('segmented_chinese.txt', 'r', encoding='utf-8') as f:
        doc = ''
        for d in f.readlines():
            if '###############################################################################' in d:
                documents_seg.append(doc)
                doc = ''
            else:
                doc = ' '.join([doc, d])
Example #7
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tokenize.stanford import CoreNLPTokenizer
# from nltk.tokenize.stanford import

path = "D:/www/data/nlpsoftware/stanford-segmenter"
segmenter = StanfordSegmenter(
    path_to_jar=path + "/stanford-segmenter.jar",
    path_to_sihan_corpora_dict=path + "/data",
    path_to_model=path + "/data/pku.gz",
    path_to_dict=path + "/data/dict-chris6.ser.gz",
    java_class='edu.stanford.nlp.ie.crf.CRFClassifier')
#
# sentence = u"这是斯坦福中文分词器测试"

# result = segmenter.segment(sentence)
# result2 = segmenter.segment_file("D:/www/data/nlpdata/icwb2-data/testing/pku_test.utf8")
clean_content = "D:\\www\\data\\Weibo Data\\Weibo Data\\nlp/clean_content.txt"
clean_content_out = "D:\\www\\data\\Weibo Data\\Weibo Data\\nlp/clean_content_out.txt"
result3 = segmenter.segment_file(clean_content)
print(result3)

with open(clean_content_out, 'wb+') as f:
    f.write(result3.encode('utf8'))
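# A hypothetical follow-up: read the segmented output back and split each line
# into a token list (the path comes from clean_content_out above).
with open(clean_content_out, 'r', encoding='utf8') as f:
    token_lines = [line.split() for line in f]
print(len(token_lines))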