예제 #1
0
def filter_to_kw(filename, beg, end, id):
    """Rewrite the comments of a slice of a CSV file as extracted keywords.

    Data rows of ``filename`` whose zero-based index lies in ``[beg, end)``
    are written to
    ``../../paper/data/dianping/comment.kw/comment.keyword.<beg>``.
    The ``content`` column of each row is replaced by the space-joined
    results of ``DishOpinionExtractor`` and ``EnvirServExtractor``; when both
    produce nothing, the original ``content`` text is kept.

    Args:
        filename: Path of the input CSV. It must provide the columns
            ``_id``, ``content``, ``shop_id``, ``user_id`` and ``star``.
        beg: Index of the first data row to process (inclusive, 0-based).
        end: Index at which processing stops (exclusive).
        id: Numeric worker identifier, used only in log messages.
    """
    logging.info('info:%s,%s,%s-%s' % (id, filename, beg, end))
    dish_extractor = DishOpinionExtractor()
    envir_service_extractor = EnvirServExtractor()
    fieldnames = ['_id', 'content', 'shop_id', 'user_id', 'star']
    out_path = '../../paper/data/dianping/comment.kw/comment.keyword.%s' % beg

    # Context managers guarantee both files are closed (and the output is
    # flushed) even when an extractor raises half-way through.
    with open(out_path, 'w') as fout:
        writer = csv.DictWriter(fout, fieldnames=fieldnames)
        writer.writeheader()
        with open(filename) as fin:
            # enumerate() advances the row index on every row, including the
            # skipped ones. The previous hand-maintained counter was only
            # bumped after the `beg` check, so any beg > 0 skipped every row.
            for index, line in enumerate(csv.DictReader(fin)):
                if index < beg:
                    continue
                if index >= end:
                    break

                seen = index + 1
                if seen % 500 == 0:
                    logging.info('process:%d:%d' % (id, seen))

                # Presumably each extractor yields sequences of strings;
                # every sequence is flattened to space-separated words.
                text = line['content']
                opinion = dish_extractor.extract(text)
                envir_service = envir_service_extractor.extract(text)
                pieces = [' '.join(each) for each in opinion]
                pieces.extend(' '.join(each) for each in envir_service)
                new_comment = ''.join(' %s' % piece for piece in pieces)

                row = dict(
                    (key, line[key]) for key in fieldnames if key != 'content')
                # Fall back to the raw comment when nothing was extracted.
                row['content'] = new_comment if new_comment.strip() else text
                writer.writerow(row)
예제 #2
0
def filter_to_kw(filename, beg, end, id):
    """Resume keyword extraction for a slice of a comment CSV file.

    Data rows of ``filename`` are numbered from 1; rows numbered in
    ``[beg, end)`` belong to this worker. Their ``content`` column is
    replaced by the space-joined results of ``DishOpinionExtractor`` and
    ``EnvirServExtractor`` (the original text is kept when both produce
    nothing) and the rows are appended to
    ``../../paper/data/dianping/comment.kw/comment.keyword.<beg>``.

    Args:
        filename: Path of the input CSV. It must provide the columns
            ``_id``, ``content``, ``shop_id``, ``user_id`` and ``star``.
        beg: Number of the first data row to process (inclusive, 1-based).
        end: Row number at which processing stops (exclusive).
        id: Numeric worker identifier; used in log/progress messages and in
            the hard-coded guard below.
    """
    # Local import so the block does not depend on the file's import list.
    import os

    logging.info('info:%s,%s,%s-%s' % (id, filename, beg, end))
    # NOTE(review): hard-coded guard -- every worker except id 9 returns
    # immediately. Looks like a one-off re-run of a single worker; confirm
    # before reusing this function.
    if id != 9:
        return
    dish_extractor = DishOpinionExtractor()
    envir_service_extractor = EnvirServExtractor()

    fieldnames = ['_id', 'content', 'shop_id', 'user_id', 'star']
    out_path = '../../paper/data/dianping/comment.kw/comment.keyword.%s' % beg

    # `with` closes (and flushes) the output even if an extractor raises;
    # the previous code never closed it.
    with open(out_path, 'a') as fout:
        writer = csv.DictWriter(fout, fieldnames=fieldnames)
        # The file is opened in append mode, so emit the header only when
        # nothing has been written yet; otherwise a resumed run would insert
        # a second header row in the middle of the data.
        if os.path.getsize(out_path) == 0:
            writer.writeheader()

        process_line_count = 0
        with open(filename) as fin:
            for count, line in enumerate(csv.DictReader(fin), 1):
                if count < beg:
                    continue
                if count >= end:
                    break
                process_line_count += 1
                if process_line_count % 500 == 0:
                    logging.info('process:%d:%lf%%' %
                                 (id, 1.0 * process_line_count /
                                  (end - beg) * 100))
                # NOTE(review): hard-coded resume point -- the first 160651
                # in-range rows are skipped, presumably because an earlier
                # run already wrote them. Confirm before reusing.
                if process_line_count < 160652:
                    continue

                text = line['content']
                opinion = dish_extractor.extract(text)
                envir_service = envir_service_extractor.extract(text)
                # Presumably each extractor yields sequences of strings.
                pieces = [' '.join(each) for each in opinion]
                pieces.extend(' '.join(each) for each in envir_service)
                new_comment = ''.join(' %s' % piece for piece in pieces)

                row = dict(
                    (key, line[key]) for key in fieldnames if key != 'content')
                # Fall back to the raw comment when nothing was extracted.
                row['content'] = new_comment if new_comment.strip() else text
                writer.writerow(row)
    # Single-argument call form prints identically on Python 2 and 3.
    print('process %d finish' % id)
예제 #3
0
def filter_to_kw(filename, beg, end, id):
    """Append keyword-only versions of a range of CSV comments to a file.

    The data rows of ``filename`` are counted starting at 1. For each row
    whose number falls in ``[beg, end)``, the ``content`` field is replaced
    with the words returned by ``DishOpinionExtractor`` and
    ``EnvirServExtractor`` (joined by spaces); if neither returns anything
    the original ``content`` is kept. Results are appended to
    ``../../paper/data/dianping/comment.kw/comment.keyword.<beg>``.

    Args:
        filename: Input CSV path. Rows must contain the columns ``_id``,
            ``content``, ``shop_id``, ``user_id`` and ``star``.
        beg: First row number to handle (inclusive, 1-based).
        end: Row number where handling stops (exclusive).
        id: Numeric worker identifier, used for logging, the final progress
            message, and the hard-coded worker guard.
    """
    # Imported locally so this function is self-contained.
    import os

    logging.info('info:%s,%s,%s-%s' % (id, filename, beg, end))
    # NOTE(review): only worker 9 is allowed to proceed. This appears to be
    # a temporary guard for re-running one failed worker -- verify.
    if id != 9:
        return
    dish_extractor = DishOpinionExtractor()
    envir_service_extractor = EnvirServExtractor()

    out_path = '../../paper/data/dianping/comment.kw/comment.keyword.%s' % beg
    fieldnames = ['_id', 'content', 'shop_id', 'user_id', 'star']
    other_fields = [key for key in fieldnames if key != 'content']

    # Decide before opening whether a header is needed: in append mode an
    # unconditional writeheader() would add a duplicate header row after the
    # data already written by a previous run.
    needs_header = (not os.path.exists(out_path)
                    or os.path.getsize(out_path) == 0)

    # The output handle used to be left open; `with` closes it reliably.
    with open(out_path, 'a') as fout:
        writer = csv.DictWriter(fout, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()

        count = 0
        process_line_count = 0
        with open(filename) as fin:
            for line in csv.DictReader(fin):
                count += 1
                if count < beg:
                    continue
                if count >= end:
                    break
                process_line_count += 1
                if process_line_count % 500 == 0:
                    logging.info('process:%d:%lf%%' % (id, 1.0 * process_line_count / (end - beg) * 100))
                # NOTE(review): magic resume offset -- in-range rows before
                # the 160652nd are skipped, presumably already processed by
                # an earlier run. Verify before reuse.
                if process_line_count < 160652:
                    continue

                original = line['content']
                opinion = dish_extractor.extract(original)
                envir_service = envir_service_extractor.extract(original)
                # Presumably both results are iterables of string sequences.
                words = [' %s' % ' '.join(each) for each in opinion]
                words += [' %s' % ' '.join(each) for each in envir_service]
                new_comment = ''.join(words)

                row = {}
                for key in other_fields:
                    row[key] = line[key]
                # Keep the raw comment when no keyword was extracted.
                if new_comment.strip():
                    row['content'] = new_comment
                else:
                    row['content'] = original
                writer.writerow(row)
    # Parenthesised single argument: same output on Python 2 and Python 3.
    print('process %d finish' % id)