def main():
    """Copy untranslated song lyrics from ``song`` into ``res_lrc`` as plain text.

    Every ``(song_id, song_lrc)`` row is read and ``song_lrc`` is parsed as
    JSON. A row is skipped when the ``lrc``/``tlyric`` lyric keys are missing
    or when the translated lyric (``tlyric``) is non-empty. For the remaining
    rows each lyric line has everything up to its first ``]`` and first
    ``】`` removed, lines that still contain a colon are dropped, and the rest
    is joined with single spaces. Non-empty results are inserted into
    ``res_lrc``, committed, and printed.

    The database connection is always closed, even if a query or the JSON
    parsing raises.
    """
    GET_SQL = "SELECT song_id,song_lrc FROM song"
    PUT_SQL = "INSERT INTO res_lrc(song_id,lrc) VALUES(%s,%s)"
    # Half-width and full-width colon. The full-width one is written as an
    # escape so it cannot be silently folded into the ASCII colon again.
    # Presumably colon lines are credits/metadata ("lyricist: ...") — confirm.
    colons = (":", "\uff1a")

    db = DBUtil.getConnection()
    try:
        cursor = db.cursor()
        cursor.execute(GET_SQL)
        for row in cursor.fetchall():
            song_id, raw_lrc = row[0], row[1]
            lrc_json = json.loads(raw_lrc)
            try:
                lines = lrc_json['lrc']['lyric'].strip().split('\n')
                translated = lrc_json['tlyric']['lyric']
            except KeyError:
                # No original or no translation section: nothing to extract.
                continue
            if translated:
                # Only songs without a translated lyric are kept.
                continue

            kept = []
            for line in lines:
                # Drop the leading "[...]" tag, then a leading "【...】" tag.
                # NOTE(review): only the first "]" is consumed, so a line with
                # several "[mm:ss]" tags still contains ":" and is dropped below.
                line = line[line.find(']') + 1:].strip()
                line = line[line.find('】') + 1:].strip()
                if any(colon in line for colon in colons):
                    continue
                kept.append(line)

            words = " ".join(kept).strip()
            if words:
                cursor.execute(PUT_SQL, (song_id, words))
                db.commit()
                print(song_id, words)
    finally:
        db.close()
import jieba import re from util import DBUtil, SYSUtil db = DBUtil.getConnection() cursor = db.cursor() stop_file = open('static/stop-words.txt') # '../static/' if run as __main__ stop_words = stop_file.read().strip().split() stop_file.close() GET_SQL = "SELECT song_id,lrc FROM res_lrc" PUT_SQL = "INSERT INTO res_word(song_id, word, cnt) VALUES(%s, %s, %s)" def checkChinese(word): return re.match('[\u4E00-\u9FA5\uF900-\uFA2D]', word) def work(lrc): cut_res = jieba.cut(lrc) res_list = [] for item in cut_res: item = item.strip() if item in stop_words or not item or not checkChinese(item): continue res_list.append(item) return res_list def count(words):