from predict_execution_en_review import *
from db.almaden import Sql

db = Sql("datacast2")
contents_row = db.select('crawling_status_youtube_view', '*', 'contents_status="GF"')

## pred_config
## predict
for row in contents_row:
    keyword = row['keyword']
    contents_id = row['contents_id']
    n_reply_crawled = row['n_reply_crawled']
    if n_reply_crawled is not None and n_reply_crawled > 0:
        # Claim the contents, then run the English-review prediction.
        db.update_one('crawl_contents', 'crawl_status', 'SI', 'contents_id', contents_id)
        obj_predict = Predict(keyword=row['keyword'], channel='youtube', contents_id=contents_id)
        obj_predict.predict()
    else:
        # Nothing crawled: mark the contents as finished without prediction.
        db.update_one('crawl_contents', 'crawl_status', 'SF', 'contents_id', contents_id)

# Re-connect and mark the last processed contents as finished.
db = Sql("datacast2")
db.update_one('crawl_contents', 'crawl_status', 'SF', 'contents_id', contents_id)
from predict_execution_blog import *
from db.almaden import Sql

db = Sql("datacast2")
blog_channel_list = str(('naverblog', 'instagram', 'GooglePlay'))
review_channel_list = str(('navershopping', 'youtube'))
task_row = db.select(
    '''
    crawl_request AS cr
    JOIN crawl_request_task AS crt ON cr.request_id=crt.request_id
    JOIN request_batch AS rb ON rb.batch_id = cr.batch_id
    JOIN crawl_task AS ct ON crt.task_id=ct.task_id
    ''',
    'rb.batch_id as batch_id,cr.request_id as request_id,ct.task_id as task_id,cr.keyword as keyword,ct.n_crawl,ct.crawl_status as crawl_status,ct.channel as channel',
    'ct.crawl_status="GF" and ct.channel in %s' % (blog_channel_list))

## pred_config
obj_pred_config = Pred_config()

## predict
for row in task_row:
    task_id = row['task_id']
    channel = row['channel']
    # Re-read the task to confirm it is still 'GF' before claiming it.
    real_task = db.select('crawl_task', '*', 'task_id=%s' % (task_id))
    real_crawl_status = real_task[0]['crawl_status']
    if real_crawl_status == 'GF':
        db.update_one('crawl_task', 'crawl_status', 'SI', 'task_id', task_id)
        # The original file is truncated mid-call here; the remaining keyword
        # arguments and the predict() call below are assumptions based on the
        # sibling scripts in this repo.
        obj_predict = Predict(obj_pred_config, task_id=task_id,
                              keyword=row['keyword'], channel=channel)
        obj_predict.predict()
from predict_execution_en_review import *
from db.almaden import Sql

db = Sql("datacast2")
contents_row = db.select('crawling_status_youtube_view', '*', 'contents_status="GF" and is_channel=1')

## pred_config
## predict
for row in contents_row:
    keyword = row['keyword']
    contents_id = row['contents_id']
    n_reply_crawled = row['n_reply_crawled']
    if n_reply_crawled is not None and n_reply_crawled > 0:
        # Claim the contents, then run the English-review prediction.
        db.update_one('crawl_contents', 'crawl_status', 'SI', 'contents_id', contents_id)
        obj_predict = Predict(keyword=row['keyword'], channel='youtube', contents_id=contents_id)
        obj_predict.predict()
    else:
        # Nothing crawled: mark the contents as finished without prediction.
        db.update_one('crawl_contents', 'crawl_status', 'SF', 'contents_id', contents_id)

# Re-connect and mark the last processed contents as finished.
db = Sql("datacast2")
db.update_one('crawl_contents', 'crawl_status', 'SF', 'contents_id', contents_id)
from predict_execution_en_blog import *
from db.almaden import Sql

db = Sql("datacast2")
key = 'sonnen home battery'
cha = 'twitter'
request_row = db.select('crawl_request', '*', f'crawl_status="GF" and keyword="{key}" and channel="{cha}"')

## pred_config
## predict
for row in request_row:
    obj_predict = Predict(keyword=row['keyword'], channel=row['channel'])
    obj_predict.predict()
    # Mark every youtube task for this keyword as finished once prediction is done.
    task_ids = db.select('crawl_task', '*', f'keyword="{row["keyword"]}" and channel="youtube"')
    for task in task_ids:
        db.update_one('crawl_task', 'crawl_status', 'SF', 'task_id', task['task_id'])
from predict_execution_review import *
from db.almaden import Sql

db = Sql("datacast2")
# select * from crawl_contents where contents_status='GF'
contents_row = db.select('crawling_status_youtube_view', '*', 'contents_status="GF"')

## pred_config
obj_pred_config = Pred_config()

## predict
for row in contents_row:
    keyword = row['keyword']
    contents_id = row['contents_id']
    n_reply_crawled = row['n_reply_crawled']
    print(keyword, contents_id, n_reply_crawled)
    # Re-read the contents row to confirm it is still 'GF' before claiming it.
    real_crawl_status = db.select('crawling_status_youtube_view', '*', 'contents_id=%s' % (contents_id))
    real_crawl_status = real_crawl_status[0]['contents_status']
    if real_crawl_status == 'GF':
        if n_reply_crawled is not None and n_reply_crawled > 0:
            db.update_one('crawl_contents', 'crawl_status', 'SI', 'contents_id', contents_id)
            obj_predict = Predict(obj_pred_config, keyword=keyword, contents_id=contents_id)
            obj_predict.predict()
        else:
            # Nothing crawled: mark the contents as finished without prediction.
            db.update_one('crawl_contents', 'crawl_status', 'SF', 'contents_id', contents_id)

# Re-connect and mark the last processed contents as finished.
db = Sql("datacast2")
db.update_one('crawl_contents', 'crawl_status', 'SF', 'contents_id', contents_id)
# Start time
start_time = time.time()

# Multithreaded path (the 200k-count case):
# use a Pool to run the prediction function in parallel.
if __name__ == '__main__':
    process_list = []
    task_list = []
    db = Sql("datacast2")
    task_row = db.select(
        '''
        crawl_request AS cr
        JOIN crawl_request_task AS crt ON cr.request_id=crt.request_id
        JOIN request_batch AS rb ON rb.batch_id = cr.batch_id
        JOIN crawl_task AS ct ON crt.task_id=ct.task_id
        ''',
        'rb.batch_id as batch_id,cr.request_id as request_id,ct.task_id as task_id,cr.keyword as keyword,ct.n_crawl,ct.crawl_status as crawl_status,ct.channel as channel',
        'ct.crawl_status="GF" and ct.channel !="navershopping" and rb.batch_id=57 limit 6'
    )

    ## pred_config
    obj_pred_config = Pred_config()

    ## predict
    # Multithreading with Pool
    for row in task_row:
        task_id = row['task_id']
        channel = row['channel']
        # Re-read the task to confirm it is still 'GF' before claiming it.
        real_task_status = db.select('crawl_task', '*', 'task_id=%s' % (task_id))
        if real_task_status[0]['crawl_status'] == 'GF':
            # The original file is truncated mid-call here; closing the
            # update_one call with task_id is an assumption, and the worker
            # dispatch that presumably followed is not reconstructed.
            db.update_one('crawl_task', 'crawl_status', 'SI', 'task_id', task_id)
from predict_execution_review import *
from db.almaden import Sql

db = Sql("datacast2")
contents_row = db.select('crawling_status_navershopping_view', '*', 'contents_status="GF"')

## pred_config
obj_pred_config = Pred_config()

## predict
for row in contents_row:
    keyword = row['keyword']
    contents_id = row['contents_id']
    # Claim the contents before checking whether there is anything to predict.
    db.update_one('crawl_contents', 'crawl_status', 'SI', 'contents_id', contents_id)
    n_reply_crawled = row['n_reply_crawled']
    print(keyword, contents_id, n_reply_crawled)
    if n_reply_crawled is not None and n_reply_crawled > 0:
        obj_predict = Predict(obj_pred_config, keyword=keyword, contents_id=contents_id)
        obj_predict.predict()
    else:
        # Nothing crawled: mark the contents as finished without prediction.
        db.update_one('crawl_contents', 'crawl_status', 'SF', 'contents_id', contents_id)

# Re-connect and mark the last processed contents as finished.
db = Sql("datacast2")
db.update_one('crawl_contents', 'crawl_status', 'SF', 'contents_id', contents_id)
import pandas as pd
import kss

from db.almaden import Sql
# cleanse_text and cleanse_sentence are assumed to be provided by a local
# preprocessing module in this repo.


class Data:
    def __init__(self, dbName='dalmaden'):
        self.db = Sql(dbName)
        self.df = pd.DataFrame()

    def _load_(self, channel, keyword, fromDate, toDate, tablename='cdata'):
        where_str = f"keyword='{keyword}' and channel='{channel}' and post_date between '{fromDate}' and '{toDate}'"
        ldf = self.db.select(tablename, "*", where_str, asDataFrame=True)
        return ldf

    def addData(self, channel, keyword, fromDate, toDate, tablename='cdata',
                unique=True, drop_by=['keyword', 'url']):
        nrows0 = self.df.shape[0]
        ldf = self._load_(channel, keyword, fromDate, toDate, tablename)
        print(ldf)
        nrowsldf = ldf.shape[0]
        self.df = self.df.append(ldf)
        addednRows = nrowsldf
        droppednRows = 0
        if unique:
            self.drop_duplicates(subset=drop_by)
            addednRows = self.df.shape[0] - nrows0
            droppednRows = nrowsldf - addednRows
        print(f'addData : added {addednRows} rows (dropped {droppednRows} rows)')

    def drop_duplicates(self, subset=None):
        self.df = self.df.drop_duplicates(subset=subset)

    def shape(self):
        return self.df.shape

    def get_df(self, *colnames, by_sentence=''):
        '''
        :param colnames: column names (str)
        :param by_sentence: name of the text column to split into sentences
        :return: DataFrame
        '''
        df_documents = self.df.loc[:, list(colnames)]
        if len(by_sentence) > 0:
            df_sentences = pd.DataFrame()
            nrows = df_documents.shape[0]
            for i in range(nrows):
                if i % 100 == 0:
                    print(f"loader : Getting Sentences {i}/{nrows}")
                row = df_documents.iloc[i]
                text = row[by_sentence]
                if len(text) > 0:
                    text = cleanse_text(text)
                    # Sentences over 300 characters show up frequently; needs a check.
                    sentences = kss.split_sentences(text)
                    for s in sentences:
                        s = cleanse_sentence(s)
                        if len(s) > 0:
                            row_temp = row.copy()
                            row_temp[by_sentence] = s
                            df_sentences = df_sentences.append(row_temp)
                else:
                    continue
            print(f"loader : Getting DataFrame Done {nrows} Documents to {df_sentences.shape[0]} Sentences")
            return df_sentences
        else:
            return df_documents
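
# A minimal usage sketch for the Data loader above, assuming the cdata table
# exposes 'keyword', 'url', and 'contents' columns. The database name,
# keyword, channel names, and date range are placeholders, not values from
# this repo.
if __name__ == '__main__':
    data = Data(dbName='datacast2')
    # Pull two channels for the same keyword; duplicates are dropped by (keyword, url).
    data.addData('naverblog', 'sonnen home battery', '2021-01-01', '2021-12-31')
    data.addData('youtube', 'sonnen home battery', '2021-01-01', '2021-12-31')
    print(data.shape())
    # Split the 'contents' text column into one row per sentence.
    df_sentences = data.get_df('keyword', 'url', 'contents', by_sentence='contents')
    print(df_sentences.head())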