Exemplo n.º 1
0
def jieba_seg(filepath):
    """Segment each row of a one-column TSV file and write the tokens out.

    Every row of ``filepath`` is read as tab-delimited data, its single field
    is decoded from UTF-8 and passed through
    ``string_process.remove_characters``, ``segment().jieba_segment`` and
    ``string_process.remove_invalid_string`` in that order, and the result is
    written as one tab-delimited, UTF-8 encoded row to ``filepath + '.seg'``.

    Args:
        filepath: Path of the input file. The output file is created next to
            it with a ``.seg`` suffix appended.

    Raises:
        AssertionError: If a row does not contain exactly one field.
    """
    cnt = Counter()
    cnt['line'] = 0

    segmentor = segment()

    with open(filepath + '.seg', 'wb') as outputfile:
        writer = unicodecsv.writer(outputfile, delimiter='\t', encoding='utf-8')

        # The Python 2 csv module expects files opened in binary mode; this
        # also mirrors how the output file is opened above.
        with open(filepath, 'rb') as inputfile:
            for transaction in csv.reader(inputfile, delimiter='\t'):
                # Count per row; previously the counter was bumped only once,
                # before the loop, so it never reflected the input size.
                cnt['line'] += 1
                assert len(transaction) == 1, "\n%s" % (str(transaction))

                cleanedstr = string_process.remove_characters(
                    unicode(transaction[0], 'utf-8'))
                segmentres = segmentor.jieba_segment(cleanedstr)
                cleanedres = string_process.remove_invalid_string(segmentres)
                writer.writerow(cleanedres)

    # Report the real number of processed rows once everything is written.
    logging.info('line count: %d', cnt['line'])
Exemplo n.º 2
0
def convert_to_str(string):
    """Return the sorted, filtered segments of ``string`` joined by ``|``.

    The input is segmented with the module-level ``segmentor``, the segments
    are passed through ``string_process.remove_invalid_string``, and whatever
    that returns is sorted and joined into one unicode string with ``|``.
    """
    tokens = string_process.remove_invalid_string(
        segmentor.jieba_segment(string)
    )
    return u'|'.join(sorted(tokens))