示例#1
0
# encoding=utf8
__author__ = 'wang'

import sys
sys.path.append('/home/jdwang/PycharmProjects/corprocessor/bosonnlp')
import bosonnlp.toolSet as bosennlp_toolset

import os
import logging

# Module-level setup: one logging level is shared by the bosonnlp wrapper
# and by the root logger configuration below.
logging_level = logging.INFO
# Instantiate the project's bosonnlp wrapper, passing it the logging level.
bosennlp_model = bosennlp_toolset.bosonnlp(
    logging_level=logging_level
)

# Configure the root logger to emit "<time> : <level> : <message>" records.
# NOTE(review): basicConfig is a no-op when the root logger already has
# handlers; if the bosonnlp constructor above sets up logging itself, this
# call may have no effect -- confirm the intended order.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging_level)

# Hard-coded, machine-specific corpus locations (absolute paths).
# presumably the input corpus; not read anywhere in the visible code.
originalCorpusDir = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/originalCorpusAfterClean/20160404/'
# Directory that clean_repeat_sentences() creates when it is missing.
outputCorpusDir = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/clean_repeated_sentences/20160404/'
# presumably where bosonnlp-segmented output lives; unused in the visible
# code. The misspelled name ("crpus") is kept because other code may use it.
segmented_crpus_dir = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/segmentedCorpus/bosonnlp/20160404/'

def clean_repeat_sentences():
    """Prepare the output directory for the repeated-sentence cleaning step.

    Logs a start message and makes sure ``outputCorpusDir`` exists. When the
    directory is missing, a notice is printed and the directory is created
    together with any missing parent directories.

    Raises:
        OSError: if the directory cannot be created.

    NOTE(review): the visible body ends after the directory setup; the
    trailing comment suggests a file-listing step was meant to follow.
    """
    logging.info( 'clean repeat sentences...')
    # Not used in the visible body; kept because the function appears to be
    # unfinished and a later step may rely on it.
    from subprocess import call
    if not os.path.exists(outputCorpusDir):
        # Single-argument parenthesized print: same output on Python 2,
        # and also valid syntax on Python 3.
        print("the outputCorpusDir/"+outputCorpusDir+" is not existed!\nCreate dir...")
        # makedirs (not mkdir): the target is several levels deep and mkdir
        # raises OSError when an intermediate directory does not exist.
        os.makedirs(outputCorpusDir)
    # TODO: get all the file list in the dir
# Path of the tab-separated test file written at the end of this script.
output_file = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/testCorpus/20160403/ch2r_test_file.csv'

# Each input line is cut on '|'; from every piece only the text before the
# first ',' is kept, and the kept parts are glued back into one string.
with open(segmented_merge_file, 'r') as fin:
    sentences = [
        ''.join([piece.split(',')[0] for piece in raw_line.split('|')])
        for raw_line in fin
    ]

# One label per line, with surrounding whitespace removed.
with open(correct_label, 'r') as fin:
    labels = [raw_line.strip() for raw_line in fin]

# Both lists must have the same length, otherwise pandas rejects the dict.
test = pd.DataFrame({'origin_sentence': sentences, 'label_name': labels})

print('segment the file..')

boson_model = boson_tool.bosonnlp(logging_level=logging.INFO)


def _segment_as_utf8(text):
    # Pass the sentence through boson_model.Seg and encode the result as UTF-8.
    return boson_model.Seg(text).encode('utf8')


test['sentence'] = test['origin_sentence'].apply(_segment_as_utf8)

print(test.head())
print('save the test file in %s..' % output_file)
test.to_csv(output_file, sep='\t', index=None)