def remove_html_tag():
    """Strip HTML markup from the content of every active estate record.

    Connects to MySQL, selects estates with ``status == 1``, and replaces
    each record's raw HTML content with the plain text extracted by ``bs``
    (presumably BeautifulSoup — confirm against the file's imports).
    Records whose content fails to parse are marked ``'[extract_error]'``;
    records that yield no text are marked ``'[no_cleaned_text]'``.
    Each record is committed individually so progress survives a crash.
    """
    db.bind('mysql', **settings.get('DB'))
    db.generate_mapping()
    print('connect to db!')
    with db_session:
        print('select items!')
        estates = EstateEntity.select(lambda e: e.status == 1)
        for estate in estates:
            print('===> process: ', estate.url)
            try:
                extract_content = bs(estate.content).text
            # FIX: was a bare `except:`, which also swallows SystemExit and
            # KeyboardInterrupt; narrowed to Exception so Ctrl-C still works.
            except Exception:
                estate.content = '[extract_error]'
                commit()
                print('extract_error')
                continue
            if extract_content:
                estate.content = extract_content
                commit()
                print('done')
            else:
                estate.content = '[no_cleaned_text]'
                commit()
                print('no_cleaned_text')
def filter_seg_freq():
    """Aggregate per-estate segmentation frequencies into one global total.

    Connects to MySQL, walks every estate in id order, parses each record's
    ``seg_freq`` JSON, and sums the counts per token into ``total_seg_freq``,
    which is printed at the end.
    """
    db.bind('mysql', **settings.get('DB'))
    db.generate_mapping()
    print('connect to db!')
    with db_session:
        print('select items!')
        estates = EstateEntity.select().order_by(EstateEntity.id)
        # FIX: total_seg_freq was printed below but never defined (the
        # aggregation was left as a @todo), so this function raised
        # NameError at runtime. Accumulate the per-record frequencies here.
        total_seg_freq = {}
        for estate in estates:
            raw_json = estate.seg_freq
            seg_freq = json.loads(raw_json)
            # assumes seg_freq decodes to a {token: count} mapping —
            # TODO confirm against the code that writes estate.seg_freq
            for token, count in seg_freq.items():
                total_seg_freq[token] = total_seg_freq.get(token, 0) + count
        print('total_seg_freq: ', total_seg_freq)
    print('done')
def export_all_content():
    """Export url/website/published_at/content for every estate to a CSV.

    Connects to MySQL, writes a header row to ``csvfile_path``, then streams
    every estate record (ordered by id) into the file, logging each id.

    NOTE(review): the file is opened in ``'wb'`` and content is encoded by
    hand — that is the Python 2 csv idiom; on Python 3 this would need
    ``open(csvfile_path, 'w', newline='')`` and no manual encode. Confirm
    the target interpreter before changing it.
    """
    db.bind('mysql', **settings.get('DB'))
    db.generate_mapping()
    print('connect to db!')
    with open(csvfile_path, 'wb') as csvfile:
        print('open csvfile!')
        writer = csv.writer(csvfile)
        print('write table head')
        writer.writerow(['url', 'website', 'published_at', 'content'])
        with db_session:
            print('select items!')
            for record in EstateEntity.select().order_by(EstateEntity.id):
                writer.writerow([
                    record.url,
                    record.website,
                    record.published_at,
                    record.content.encode('utf-8'),
                ])
                print('id: ', record.id)