def remove_html_tag():
    """Replace the stored content of status-1 estates with its tag-free text.

    Binds the database, selects every ``EstateEntity`` row whose ``status``
    equals 1 and overwrites ``content`` with ``bs(content).text``
    (presumably BeautifulSoup -- confirm against the file's imports).

    Rows that cannot be processed are tagged instead of aborting the run:

    * ``'[extract_error]'``   -- the parser raised an exception
    * ``'[no_cleaned_text]'`` -- the parser returned empty text

    Every row is committed individually.
    """
    db.bind('mysql', **settings.get('DB'))
    db.generate_mapping()
    print('connect to db!')

    with db_session:
        print('select items!')
        estates = EstateEntity.select(lambda e: e.status == 1)
        for estate in estates:
            print('===> process: ', estate.url)
            try:
                plain_text = bs(estate.content).text
            except Exception:
                # Deliberately best-effort: tag the row and keep going.
                # ``Exception`` (not a bare except) so that Ctrl-C and
                # SystemExit still stop the job.
                new_content, outcome = '[extract_error]', 'extract_error'
            else:
                if plain_text:
                    new_content, outcome = plain_text, 'done'
                else:
                    new_content, outcome = '[no_cleaned_text]', 'no_cleaned_text'

            estate.content = new_content
            commit()
            print(outcome)
def extract_content_with_readability():
    """Retry content extraction for rows a previous pass left without text.

    Binds the database, selects every ``EstateQ2Entity`` row whose
    ``content`` equals ``'[no_cleaned_text]'`` and replaces it with
    ``Document(estate.html).summary()`` (presumably readability's
    ``Document`` -- confirm against the file's imports).

    Rows that cannot be processed are tagged instead of aborting the run:

    * ``'[extract_error]'``   -- the extractor raised an exception
    * ``'[no_cleaned_text]'`` -- the extractor returned an empty summary

    Every row is committed individually.
    """
    db.bind('mysql', **settings.get('DB'))
    db.generate_mapping()
    print('connect to db!')

    with db_session:
        print('select items!')
        estates = EstateQ2Entity.select(lambda e: e.content == '[no_cleaned_text]')
        for estate in estates:
            print('===> process: ', estate.url)
            try:
                summary = Document(estate.html).summary()
            except Exception:
                # Deliberately best-effort: tag the row and keep going.
                # ``Exception`` (not a bare except) so that Ctrl-C and
                # SystemExit still stop the job.
                new_content, outcome = '[extract_error]', 'extract_error'
            else:
                if summary:
                    new_content, outcome = summary, 'done'
                else:
                    new_content, outcome = '[no_cleaned_text]', 'no_cleaned_text'

            estate.content = new_content
            commit()
            print(outcome)
def mark_topic():
    """Compute and store a topic bitmask for every row whose topic is unset.

    For each ``EstateQ2Entity`` row with ``topic == None`` (visited in id
    order) the JSON in ``seg_freq`` is decoded and handed to
    ``check_topic1`` .. ``check_topic11``.  Each checker that returns a
    truthy value contributes its ``TOPICn`` flag, OR-ed into the result.
    The mask (0 when nothing matched) is written to ``estate.topic`` and
    committed row by row.
    """
    db.bind('mysql', **settings.get('DB'))
    db.generate_mapping()
    print('connect to db!')

    # (checker, flag) pairs, applied in this order for every row.
    topic_rules = (
        (check_topic1, TOPIC1),
        (check_topic2, TOPIC2),
        (check_topic3, TOPIC3),
        (check_topic4, TOPIC4),
        (check_topic5, TOPIC5),
        (check_topic6, TOPIC6),
        (check_topic7, TOPIC7),
        (check_topic8, TOPIC8),
        (check_topic9, TOPIC9),
        (check_topic10, TOPIC10),
        (check_topic11, TOPIC11),
    )

    with db_session:
        print('select items!')
        pending = EstateQ2Entity.select(lambda e: e.topic == None).order_by(EstateQ2Entity.id)
        for estate in pending:
            seg_freq = json.loads(estate.seg_freq)

            topic = 0
            for matches, flag in topic_rules:
                if matches(seg_freq):
                    topic = topic | flag

            estate.topic = topic
            commit()

            if topic != 0:
                print('!!! topic: ', topic)
            print('id: ', estate.id)
def filter_seg_freq():
    """Walk every EstateEntity row in id order and decode its ``seg_freq`` JSON.

    Unfinished: the decoded value is not used yet (see the todo marker in
    the loop).

    NOTE(review): ``total_seg_freq`` is not assigned anywhere in this
    function; presumably it is a module-level name or an accumulator the
    todo was meant to introduce -- confirm, otherwise the final report
    raises NameError.
    """
    db.bind('mysql', **settings.get('DB'))
    db.generate_mapping()
    print('connect to db!')

    with db_session:
        print('select items!')
        for estate in EstateEntity.select().order_by(EstateEntity.id):
            # Decoded but not consumed yet; decoding still fails loudly on
            # rows whose seg_freq is not valid JSON.
            seg_freq = json.loads(estate.seg_freq)
            # @todo:

        print('total_seg_freq: ', total_seg_freq)
        print('done')
def export_all_content():
    """Dump url, website, published_at and content of every EstateEntity to CSV.

    The output path is taken from ``csvfile_path`` (defined outside this
    function).  A header row is written first, then one row per entity in
    id order, with ``content`` encoded to UTF-8 bytes.

    NOTE(review): the file is opened in binary mode (``'wb'``) and content
    is pre-encoded, which matches the Python 2 ``csv`` convention; Python 3's
    ``csv.writer`` expects a text-mode file -- confirm the target interpreter
    before porting.
    """
    db.bind('mysql', **settings.get('DB'))
    db.generate_mapping()
    print('connect to db!')

    with open(csvfile_path, 'wb') as csvfile:
        print('open csvfile!')
        writer = csv.writer(csvfile)

        print('write table head')
        writer.writerow(['url', 'website', 'published_at', 'content'])

        with db_session:
            print('select items!')
            ordered = EstateEntity.select().order_by(EstateEntity.id)
            for estate in ordered:
                writer.writerow([
                    estate.url,
                    estate.website,
                    estate.published_at,
                    estate.content.encode('utf-8'),
                ])
                print('id: ', estate.id)
def export_csv():
    """Dump the metadata columns of every EstateQ2Entity row to CSV.

    The output path is taken from ``csvfile_path`` (defined outside this
    function).  A header row is written first, then one row per entity in
    id order.

    NOTE(review): the file is opened in binary mode (``'wb'``), which
    matches the Python 2 ``csv`` convention; Python 3's ``csv.writer``
    expects a text-mode file -- confirm the target interpreter before
    porting.
    """
    db.bind('mysql', **settings.get('DB'))
    db.generate_mapping()
    print('connect to db!')

    with open(csvfile_path, 'wb') as csvfile:
        print('open csvfile!')
        writer = csv.writer(csvfile)

        print('write table head')
        # Field list noted by the original author:
        #   url, website, location, published_at, content, seg_freq, topic
        # 'content' is not part of this export.
        writer.writerow(['url', 'website', 'location', 'published_at', 'seg_freq', 'topic'])

        with db_session:
            print('select items!')
            ordered = EstateQ2Entity.select().order_by(EstateQ2Entity.id)
            for estate in ordered:
                writer.writerow([
                    estate.url,
                    estate.website,
                    estate.location,
                    estate.published_at,
                    estate.seg_freq,
                    estate.topic,
                ])
                print('id: ', estate.id)
def items_segment():
    """Run ``segment`` over the content of every row and store the result.

    Selects all ``EstateQ2Entity`` rows whose ``content`` is not NULL, in id
    order.  Rows with empty content are skipped untouched.  For the rest,
    the value returned by ``segment(estate.content)`` is written to
    ``seg_freq``; if ``segment`` raises, the row's ``status`` is set to 2
    instead.  Every change is committed row by row.
    """
    db.bind("mysql", **settings.get("DB"))
    db.generate_mapping()
    print("connect to db!")

    with db_session:
        print("select items!")
        estates = EstateQ2Entity.select(lambda e: e.content != None).order_by(EstateQ2Entity.id)
        for estate in estates:
            if not estate.content:
                # Non-NULL but empty content: nothing to segment.
                continue

            try:
                seg_freq_res = segment(estate.content)
            except Exception:
                # Deliberately best-effort: flag the row (status 2) and keep
                # going.  ``Exception`` (not a bare except) so that Ctrl-C
                # and SystemExit still stop the job.
                estate.status = 2
                commit()
                continue

            estate.seg_freq = seg_freq_res
            commit()
            print("==> id: ", estate.id)
def __init__(self):
    """Bind ``db`` to MySQL using the ``DB`` settings and generate its mapping."""
    db_options = settings.get('DB')
    db.bind('mysql', **db_options)
    db.generate_mapping()
def extract_content():
    """Fill in ``content`` for rows that have none, using Goose on the raw HTML.

    Command line: ``<script> [start_id end_id]``.  When both ids are given
    and ``start_id`` is non-zero, only rows with
    ``start_id <= id < end_id`` are processed; otherwise every row whose
    ``content`` is NULL is processed.

    Each row gets exactly one of the following, committed individually:

    * the extracted ``cleaned_text``
    * ``"[extract_error]"``   -- ``goose.extract`` raised an exception
    * ``"[no_cleaned_text]"`` -- extraction succeeded but produced no text
    """
    if len(sys.argv) >= 3:
        start_id = int(sys.argv[1])
        end_id = int(sys.argv[2])
        print("start_id: ", start_id, " end_id: ", end_id)
    else:
        start_id = 0
        end_id = 0

    db.bind("mysql", **settings.get("DB"))
    db.generate_mapping()
    print("connect to db!")

    goose = Goose({"stopwords_class": StopWordsChinese})

    with db_session:
        print("select items!")
        # The lambdas stay in this function so that start_id / end_id are
        # resolved from this scope when the query is built.
        if start_id != 0:
            estates = EstateQ2Entity.select(
                lambda e: e.content is None and e.id >= start_id and e.id < end_id
            )
        else:
            estates = EstateQ2Entity.select(lambda e: e.content is None)

        for estate in estates:
            print("===> process: ", estate.url)
            try:
                article = goose.extract(raw_html=estate.html)
            except Exception:
                # Deliberately best-effort: tag the row and keep going.
                # ``Exception`` (not a bare except) so that Ctrl-C and
                # SystemExit still stop the job.
                estate.content = "[extract_error]"
                commit()
                print("extract_error")
                continue

            if article.cleaned_text:
                estate.content = article.cleaned_text
                commit()
                print("done")
            else:
                estate.content = "[no_cleaned_text]"
                commit()
                print("no_cleaned_text")