def filter_to_kw(filename, beg, end, id):
    """Write a keyword-only copy of one slice of a comment CSV.

    The input is read with ``csv.DictReader``. For every data row whose
    0-based position is in ``[beg, end)`` the ``content`` column is passed
    to ``DishOpinionExtractor.extract`` and ``EnvirServExtractor.extract``;
    each item they yield is joined with spaces and the pieces are
    concatenated into the new ``content``. When nothing is extracted the
    original ``content`` is kept. The other columns are copied as-is.

    Args:
        filename: Path of the input CSV. It must provide the columns
            ``_id``, ``content``, ``shop_id``, ``user_id`` and ``star``.
        beg: 0-based index of the first data row to process. Also used as
            the suffix of the output file name.
        end: 0-based index one past the last data row to process.
        id: Worker identifier; only used in log messages. (The name shadows
            the builtin but is kept so existing callers keep working.)

    Side effects:
        Creates or truncates
        ``../../paper/data/dianping/comment.kw/comment.keyword.<beg>``
        (relative to the current working directory) and writes a header row
        followed by one row per processed comment.
    """
    logging.info('info:%s,%s,%s-%s', id, filename, beg, end)
    dish_extractor = DishOpinionExtractor()
    envir_service_extractor = EnvirServExtractor()
    fieldnames = ['_id', 'content', 'shop_id', 'user_id', 'star']
    out_path = '../../paper/data/dianping/comment.kw/comment.keyword.%s' % beg

    # Both files are managed by ``with`` so the output is flushed and closed
    # even when an extractor raises part-way through.
    with open(out_path, 'w') as fout, open(filename) as fin:
        writer = csv.DictWriter(fout, fieldnames=fieldnames)
        writer.writeheader()

        # The row position must advance for *every* row read, including the
        # skipped ones; otherwise a slice with beg > 0 is never reached.
        for index, line in enumerate(csv.DictReader(fin)):
            if index < beg:
                continue
            if index >= end:
                break

            row_number = index + 1
            if row_number % 500 == 0:
                logging.info('process:%d:%d', id, row_number)

            content = line['content']
            extracted = list(dish_extractor.extract(content))
            extracted.extend(envir_service_extractor.extract(content))
            new_comment = ''.join(' %s' % ' '.join(each) for each in extracted)

            row = {key: line[key] for key in fieldnames if key != 'content'}
            # Fall back to the raw comment when no keyword was extracted.
            row['content'] = new_comment if new_comment.strip() else content
            writer.writerow(row)
def filter_to_kw(filename, beg, end, id, only_id=9, resume_from=160652):
    """Resume writing the keyword-only copy of one slice of a comment CSV.

    Rows of ``filename`` are numbered from 1. Rows whose number is in
    ``[beg, end)`` belong to this slice; within the slice, the first
    ``resume_from - 1`` rows are skipped and the rest are processed. For a
    processed row the ``content`` column is passed to
    ``DishOpinionExtractor.extract`` and ``EnvirServExtractor.extract``;
    each item they yield is joined with spaces and the pieces are
    concatenated into the new ``content``. When nothing is extracted the
    original ``content`` is kept. The other columns are copied as-is.

    Args:
        filename: Path of the input CSV. It must provide the columns
            ``_id``, ``content``, ``shop_id``, ``user_id`` and ``star``.
        beg: First 1-based row number of the slice. Also used as the suffix
            of the output file name.
        end: 1-based row number one past the end of the slice.
        id: Worker identifier. (The name shadows the builtin but is kept so
            existing callers keep working.)
        only_id: The function returns immediately unless ``id`` equals this
            value. Pass ``None`` to run for every worker. Defaults to ``9``,
            the value that used to be hard-coded.
        resume_from: 1-based position inside the slice of the first row to
            write. Defaults to ``160652``, the value that used to be
            hard-coded. Pass ``0`` or ``1`` to write the whole slice.

    Side effects:
        Appends to
        ``../../paper/data/dianping/comment.kw/comment.keyword.<beg>``
        (relative to the current working directory) and prints a completion
        message to stdout.
    """
    logging.info('info:%s,%s,%s-%s', id, filename, beg, end)
    if only_id is not None and id != only_id:
        return

    dish_extractor = DishOpinionExtractor()
    envir_service_extractor = EnvirServExtractor()
    fieldnames = ['_id', 'content', 'shop_id', 'user_id', 'star']
    out_path = '../../paper/data/dianping/comment.kw/comment.keyword.%s' % beg

    # Append mode keeps whatever an earlier, interrupted run already wrote.
    # ``with`` guarantees the output is flushed and closed on every exit path.
    with open(out_path, 'a') as fout, open(filename) as fin:
        writer = csv.DictWriter(fout, fieldnames=fieldnames)

        # Only emit the header for a brand-new (empty) file; writing it again
        # on resume would leave a stray header row in the middle of the data.
        fout.seek(0, 2)
        if fout.tell() == 0:
            writer.writeheader()

        process_line_count = 0
        for count, line in enumerate(csv.DictReader(fin), 1):
            if count < beg:
                continue
            if count >= end:
                break

            process_line_count += 1
            if process_line_count % 500 == 0:
                logging.info('process:%d:%lf%%', id,
                             1.0 * process_line_count / (end - beg) * 100)
            # Rows before the resume point are counted for progress reporting
            # but not written again.
            if process_line_count < resume_from:
                continue

            content = line['content']
            extracted = list(dish_extractor.extract(content))
            extracted.extend(envir_service_extractor.extract(content))
            new_comment = ''.join(' %s' % ' '.join(each) for each in extracted)

            row = {key: line[key] for key in fieldnames if key != 'content'}
            # Fall back to the raw comment when no keyword was extracted.
            row['content'] = new_comment if new_comment.strip() else content
            writer.writerow(row)

    print('process %d finish' % id)
def filter_to_kw(filename, beg, end, id, only_id=9, resume_from=160652):
    """Resume writing the keyword-only copy of one slice of a comment CSV.

    Rows of ``filename`` are numbered from 1. Rows whose number is in
    ``[beg, end)`` belong to this slice; within the slice, the first
    ``resume_from - 1`` rows are skipped and the rest are processed. For a
    processed row the ``content`` column is passed to
    ``DishOpinionExtractor.extract`` and ``EnvirServExtractor.extract``;
    each item they yield is joined with spaces and the pieces are
    concatenated into the new ``content``. When nothing is extracted the
    original ``content`` is kept. The other columns are copied as-is.

    Args:
        filename: Path of the input CSV. It must provide the columns
            ``_id``, ``content``, ``shop_id``, ``user_id`` and ``star``.
        beg: First 1-based row number of the slice. Also used as the suffix
            of the output file name.
        end: 1-based row number one past the end of the slice.
        id: Worker identifier. (The name shadows the builtin but is kept so
            existing callers keep working.)
        only_id: The function returns immediately unless ``id`` equals this
            value. Pass ``None`` to run for every worker. Defaults to ``9``,
            the value that used to be hard-coded.
        resume_from: 1-based position inside the slice of the first row to
            write. Defaults to ``160652``, the value that used to be
            hard-coded. Pass ``0`` or ``1`` to write the whole slice.

    Side effects:
        Appends to
        ``../../paper/data/dianping/comment.kw/comment.keyword.<beg>``
        (relative to the current working directory) and prints a completion
        message to stdout.
    """
    logging.info('info:%s,%s,%s-%s', id, filename, beg, end)
    if only_id is not None and id != only_id:
        return

    dish_extractor = DishOpinionExtractor()
    envir_service_extractor = EnvirServExtractor()
    fieldnames = ['_id', 'content', 'shop_id', 'user_id', 'star']
    out_path = '../../paper/data/dianping/comment.kw/comment.keyword.%s' % beg

    # Append mode keeps whatever an earlier, interrupted run already wrote.
    # ``with`` guarantees the output is flushed and closed on every exit path.
    with open(out_path, 'a') as fout, open(filename) as fin:
        writer = csv.DictWriter(fout, fieldnames=fieldnames)

        # Only emit the header for a brand-new (empty) file; writing it again
        # on resume would leave a stray header row in the middle of the data.
        fout.seek(0, 2)
        if fout.tell() == 0:
            writer.writeheader()

        process_line_count = 0
        for count, line in enumerate(csv.DictReader(fin), 1):
            if count < beg:
                continue
            if count >= end:
                break

            process_line_count += 1
            if process_line_count % 500 == 0:
                logging.info('process:%d:%lf%%', id,
                             1.0 * process_line_count / (end - beg) * 100)
            # Rows before the resume point are counted for progress reporting
            # but not written again.
            if process_line_count < resume_from:
                continue

            content = line['content']
            extracted = list(dish_extractor.extract(content))
            extracted.extend(envir_service_extractor.extract(content))
            new_comment = ''.join(' %s' % ' '.join(each) for each in extracted)

            row = {key: line[key] for key in fieldnames if key != 'content'}
            # Fall back to the raw comment when no keyword was extracted.
            row['content'] = new_comment if new_comment.strip() else content
            writer.writerow(row)

    print('process %d finish' % id)