# encoding=utf8 __author__ = 'wang' import sys sys.path.append('/home/jdwang/PycharmProjects/corprocessor/bosonnlp') import bosonnlp.toolSet as bosennlp_toolset import os import logging # create a bosennlp object # set the level of logging logging_level = logging.INFO bosennlp_model = bosennlp_toolset.bosonnlp( logging_level=logging_level ) logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging_level) originalCorpusDir = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/originalCorpusAfterClean/20160404/' outputCorpusDir = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/clean_repeated_sentences/20160404/' segmented_crpus_dir = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/segmentedCorpus/bosonnlp/20160404/' def clean_repeat_sentences(): logging.info( 'clean repeat sentences...') from subprocess import call if not os.path.exists(outputCorpusDir): print "the outputCorpusDir/"+outputCorpusDir+" is not existed!\nCreate dir..." os.mkdir(outputCorpusDir) # print originalCorpusDir # get all the file list in the dir
# Build a tab-separated test file: pair each segmented sentence with its
# correct label, re-segment with BosonNLP, and save as CSV/TSV.
# NOTE(review): `segmented_merge_file`, `correct_label`, `pd`, `boson_tool`
# and `logging` are defined/imported elsewhere in this file — verify before
# running this chunk standalone.
output_file = '/home/jdwang/PycharmProjects/corprocessor/coprocessor/Corpus/testCorpus/20160403/ch2r_test_file.csv'

# Reconstruct each original sentence from the segmented merge file.
# Input line format appears to be 'word,tag|word,tag|...' — each '|'-separated
# item keeps only the part before the first comma (the word) — TODO confirm
# against the file that produced segmented_merge_file.
sentences = []
with open(segmented_merge_file,'r') as fin:
    for line in fin:
        line = ''.join([item.split(',')[0] for item in line.split('|')])
        sentences.append(line)
        # print line

# Read the gold labels, one per line, aligned by position with `sentences`.
# NOTE(review): no length check — assumes both files have the same line count.
labels = []
with open(correct_label,'r') as fin:
    for line in fin:
        line = line.strip()
        labels.append(line)

# Assemble sentence/label pairs into a DataFrame.
test = pd.DataFrame({
    'origin_sentence':sentences,
    'label_name':labels
})

print 'segment the file..'
# Project-local BosonNLP wrapper; Seg() presumably returns a unicode string,
# which is then encoded to utf-8 bytes — TODO confirm Seg's return type.
boson_model = boson_tool.bosonnlp(logging_level = logging.INFO)
test['sentence'] = test['origin_sentence'].apply(lambda x: boson_model.Seg(x).encode('utf8'))
# print test.info()
print test.head()

print 'save the test file in %s..'%(output_file)
# Tab-separated despite the .csv extension; index=None suppresses the row index.
test.to_csv(output_file,sep='\t',index=None)