def title():
    """Flag every unprocessed row of ``original_qdfuns_article`` by its title.

    Selects the rows whose ``title_flag`` is NULL, passes each title to
    ``baidudetector.url`` and writes ``'0'`` into ``title_flag`` when that
    call returns a truthy value, ``'1'`` otherwise.

    The connection is released in a ``finally`` clause so that it is closed
    even if the detector or one of the SQL statements raises.
    """
    select_sql = "SELECT article_id,title FROM original_qdfuns_article WHERE title_flag IS NULL"
    db = sql_operation.getcon()
    try:
        results = sql_operation.baseselect(db, select_sql)
        for row in results:
            # Descriptive names: the originals shadowed the builtin ``id``
            # and this function's own name.
            article_id = row[0]
            article_title = row[1]
            flag = '0' if baidudetector.url(article_title) else '1'
            update_sql = "UPDATE original_qdfuns_article SET title_flag = '%s' WHERE (`article_id`='%d')" % (
                flag, article_id)
            sql_operation.baseoperation(db, update_sql)
    finally:
        sql_operation.closecon(db)
def key_extract():
    """Append selected keywords from ``t_relevant_search`` to a text file.

    Runs a fixed query for keywords matching both ``'%么%'`` and ``'%php%'``,
    then appends each keyword as one UTF-8 line to ``keyword_relevant.txt``
    (opened in append mode) and echoes it to stdout.

    The file is managed by a ``with`` block and the connection by
    ``try``/``finally`` so both are released when the query, the write or the
    print raises.
    """
    select_sql = "SELECT keyword,key_list FROM t_relevant_search WHERE keyword LIKE '%么%' and keyword LIKE '%php%'"
    db = sql_operation.getcon()
    try:
        # The file is opened before the query runs, as in the original order.
        with open("keyword_relevant.txt", "a", encoding='utf-8') as key_file:
            results = sql_operation.baseselect(db, select_sql)
            for row in results:
                # Only the first selected column is used; key_list is ignored.
                keyword = row[0]
                key_file.write(keyword + '\n')
                print(keyword)
    finally:
        sql_operation.closecon(db)
def sqltofile():
    """Load random HTTPS proxies from ``t_proxy_info`` and pass them to ``thread``.

    The query returns at most 5000 randomly ordered rows of
    ``(ip, port, type)`` with ``type = 'HTTPS'``. Each row is copied into a
    plain list, the connection is closed, the resulting list of lists is
    printed and then handed to ``thread``.

    The connection is closed in a ``finally`` clause so it is released even
    if the query raises.
    """
    select_sql = "SELECT ip,`port`,type FROM t_proxy_info Where type = 'HTTPS' Order By rand() Limit 5000"
    db = sql_operation.getcon()
    try:
        results = sql_operation.baseselect(db, select_sql)
        # Copy each row into a list, as the original nested append loop did.
        proxys = [list(row) for row in results]
    finally:
        sql_operation.closecon(db)
    print(proxys)
    thread(proxys)
def sqlETL():
    """Populate ``t_relevant_search_key`` from ``t_relevant_search.key_list``.

    For every ``(id, key_list)`` row of ``t_relevant_search`` the key list is
    split on ``'---'``. A key that already exists in ``t_relevant_search_key``
    gets ``'@' + id`` appended to its ``keyword_id_list``. A key that does not
    exist yet is inserted with ``keyword_id_list`` set to the id.

    The connection is closed in a ``finally`` clause so it is released even
    if a statement raises part-way through.
    """
    select_sql = "SELECT id,key_list from t_relevant_search"
    select_key_sql = "SELECT `key` from t_relevant_search_key"
    db = sql_operation.getcon()
    try:
        results = sql_operation.baseselect(db, select_sql)
        for row in results:
            keyword_id = row[0]
            key_list = row[1].split('---')
            # Existing keys are re-read for every row because earlier rows
            # may have inserted new ones. A set gives O(1) membership tests.
            known_keys = {
                key_row[0]
                for key_row in sql_operation.baseselect(db, select_key_sql)
            }
            for key in key_list:
                # NOTE(review): values are interpolated into the SQL text
                # unescaped, so a key containing a quote breaks the statement.
                # Parameterise once sql_operation's API is confirmed to
                # accept bound parameters.
                if key in known_keys:
                    update_sql = "update t_relevant_search_key " \
                                 "set keyword_id_list = CONCAT(keyword_id_list,'@','%s') where `key` = '%s'" \
                                 % (keyword_id, key)
                    print('-----------------')
                    sql_operation.baseoperation(db, update_sql)
                else:
                    insert_sql = "insert into t_relevant_search_key (`key`,keyword_id_list) value ('%s','%s')" \
                                 % (key, keyword_id)
                    sql_operation.baseoperation(db, insert_sql)
    finally:
        sql_operation.closecon(db)
def updatetosql():
    """Fill in details for up to 10 random unprocessed Stack Overflow questions.

    Selects ``(title, url)`` for 10 random rows of
    ``t_stackoverflow_question`` with ``flag = '0'``. For each row it calls
    ``item_html(url)`` and writes the returned fields back, setting ``flag``
    to ``'1'``.

    Column mapping from the ``item_html`` result: ``tags`` = item[4],
    ``views`` = item[1], ``answers_num`` = item[2], ``asked_time`` = item[3],
    ``last_active_time`` = item[5], ``question_content`` = item[6],
    ``answers_contetnt`` = item[7] joined with ``"[split]"``. The row to
    update is matched on ``title`` = item[0], not on the title returned by
    the SELECT.

    The connection is closed in a ``finally`` clause so it is released even
    if ``item_html`` or a statement raises.
    """
    select_sql = "SELECT title,url FROM t_stackoverflow_question WHERE flag = '0' Order By rand() limit 10"
    db = sql_operation.getcon()
    try:
        results = sql_operation.baseselect(db, select_sql)
        for row in results:
            url = row[1]
            item = item_html(url)
            ansers_text = "[split]".join(item[7])
            # NOTE(review): the item_html values are interpolated into the
            # SQL text unescaped, so a single quote in any field breaks the
            # statement. Switch to bound parameters once sql_operation's API
            # is confirmed to support them.
            updatesql = "UPDATE `t_stackoverflow_question` " \
                        "SET `tags`='%s', `views`='%s', `answers_num`='%s', `asked_time`='%s', `last_active_time`='%s', `question_content`='%s', `answers_contetnt`='%s' , `flag` = '1'" \
                        "WHERE (`title`='%s') " \
                        % (item[4], item[1], item[2], item[3], item[5], item[6], ansers_text, item[0],)
            sql_operation.baseoperation(db, updatesql)
    finally:
        sql_operation.closecon(db)
def content():
    """Set ``content_flag`` on articles of ``original_python_article``.

    Processes rows whose ``content_flag`` is NULL or 0 and whose
    ``title_flag`` is 0. Each article URL is printed and passed to
    ``circleCheck``. ``content_flag`` is set to ``'0'`` when that call
    returns a truthy value, ``'1'`` otherwise.

    The connection is closed in a ``finally`` clause so it is released even
    if ``circleCheck`` or a statement raises.
    """
    select_sql = "SELECT article_id,url FROM original_python_article WHERE (content_flag IS NULL or content_flag = 0) and title_flag = 0"
    db = sql_operation.getcon()
    try:
        results = sql_operation.baseselect(db, select_sql)
        for row in results:
            article_id = row[0]
            url = row[1]
            print('当前所处理的文章url: ' + url)
            flag = '0' if circleCheck(url) else '1'
            update_sql = "UPDATE `original_python_article` SET `content_flag`='%s' WHERE (`article_id`='%d')" % (
                flag, article_id)
            sql_operation.baseoperation(db, update_sql)
    finally:
        sql_operation.closecon(db)
def sql_search():
    """Start one ``page_html`` thread per related-search keyword.

    Reads up to 50 ``(keyword, relevant_search)`` rows with ``flag = '0'``
    from ``t_relevant_search``, then issues an UPDATE that sets ``flag`` to
    ``'1'`` on up to 50 rows with ``flag = '0'``. Each ``relevant_search``
    value is split on ``"-----"`` and every piece is passed to ``page_html``
    in its own thread. The function waits for all threads before closing the
    connection.

    NOTE(review): the SELECT and the UPDATE each apply their own ``Limit 50``
    with no ORDER BY, so nothing here ties the updated rows to the selected
    ones. Confirm this is acceptable.
    """
    fetch_sql = "select keyword,relevant_search from t_relevant_search where flag = '0' Limit 50"
    mark_sql = "update t_relevant_search set flag = '1' where flag = '0' Limit 50"

    db = sql_operation.getcon()
    rows = sql_operation.baseselect(db, fetch_sql)
    sql_operation.baseoperation(db, mark_sql)

    workers = []
    for row in rows:
        related_terms = row[1].split("-----")
        for keyword in related_terms:
            worker = threading.Thread(target=page_html, args=[keyword])
            workers.append(worker)
            worker.start()

    # Block the main thread until every worker thread has finished.
    for worker in workers:
        worker.join()

    sql_operation.closecon(db)
def titleandintro():
    """Set ``title_flag`` on articles from their title and intro.

    Processes rows of ``original_python_article`` whose ``title_flag`` is NULL
    or 0. ``title_flag`` becomes ``'0'`` only when ``baidudetector.url``
    returns a truthy value for both the title and the intro. Otherwise it
    becomes ``'1'``. The intro is only checked when the title check is
    truthy, because ``and`` short-circuits.

    The connection is closed in a ``finally`` clause so it is released even
    if the detector or a statement raises.
    """
    select_sql = "SELECT article_id,title,intro FROM original_python_article WHERE title_flag IS NULL or title_flag = 0"
    db = sql_operation.getcon()
    try:
        results = sql_operation.baseselect(db, select_sql)
        for row in results:
            article_id = row[0]
            title = row[1]
            intro = row[2]
            if baidudetector.url(title) and baidudetector.url(intro):
                flag = '0'
            else:
                flag = '1'
            update_sql = "UPDATE `original_python_article` SET `title_flag`='%s' WHERE (`article_id`='%d')" % (
                flag, article_id)
            sql_operation.baseoperation(db, update_sql)
    finally:
        sql_operation.closecon(db)