# 예제 #1 (Example #1)
from predict_execution_en_review import *
from db.almaden import Sql

db = Sql("datacast2")

# Contents whose crawl finished ("GF") and are awaiting prediction.
contents_row = db.select('crawling_status_youtube_view', '*', 'contents_status="GF"')

for row in contents_row:
    contents_id = row['contents_id']
    n_reply_crawled = row['n_reply_crawled']

    if n_reply_crawled is not None and n_reply_crawled > 0:
        # Claim the row ("SI" = prediction in progress) before running the model.
        db.update_one('crawl_contents', 'crawl_status', 'SI', 'contents_id', contents_id)
        obj_predict = Predict(keyword=row['keyword'], channel='youtube', contents_id=contents_id)
        obj_predict.predict()
    else:
        # No replies were crawled: nothing to predict, mark finished ("SF") now.
        db.update_one('crawl_contents', 'crawl_status', 'SF', 'contents_id', contents_id)

    # Reconnect — presumably to avoid a stale connection after a long
    # predict() run (TODO confirm) — then mark the row finished ("SF").
    db = Sql("datacast2")
    db.update_one('crawl_contents', 'crawl_status', 'SF', 'contents_id', contents_id)
# 예제 #2 (Example #2)
from predict_execution_blog import *
from db.almaden import Sql
db = Sql("datacast2")

# Channel groups rendered as SQL tuple literals via str(), e.g.
# "('naverblog', 'instagram', 'GooglePlay')" — interpolated into an IN (...) clause below.
blog_channel_list = str(('naverblog', 'instagram', 'GooglePlay'))
review_channel_list = str(('navershopping', 'youtube'))

# Finished ("GF") blog-channel tasks joined with their request and batch rows.
task_row = db.select(
    '''
crawl_request AS cr JOIN crawl_request_task AS crt ON cr.request_id=crt.request_id
JOIN request_batch AS rb ON rb.batch_id = cr.batch_id
JOIN crawl_task AS ct ON crt.task_id=ct.task_id
''',
    'rb.batch_id as batch_id,cr.request_id as request_id,ct.task_id as task_id,cr.keyword as keyword,ct.n_crawl,ct.crawl_status as crawl_status,ct.channel as channel',
    'ct.crawl_status="GF" and ct.channel in %s' % (blog_channel_list))
## prediction config
obj_pred_config = Pred_config()

## predict
for row in task_row:

    task_id = row['task_id']
    channel = row['channel']

    # Re-read the task's live status to avoid racing another worker on the same task.
    real_task = db.select('crawl_task', '*', 'task_id=%s' % (task_id))
    real_crawl_status = real_task[0]['crawl_status']

    if real_crawl_status == 'GF':
        # Claim the task ("SI" = prediction in progress).
        db.update_one('crawl_task', 'crawl_status', 'SI', 'task_id', task_id)
        obj_predict = Predict(obj_pred_config,
                              task_id=task_id,
                              # NOTE(review): the source dump is truncated here —
                              # the remainder of this Predict(...) call is missing.
from predict_execution_en_review import *
from db.almaden import Sql

db = Sql("datacast2")

# Channel-level YouTube contents whose crawl finished ("GF").
contents_row = db.select('crawling_status_youtube_view', '*',
                         'contents_status="GF" and is_channel=1')

for row in contents_row:
    contents_id = row['contents_id']
    n_reply_crawled = row['n_reply_crawled']

    if n_reply_crawled is not None and n_reply_crawled > 0:
        # Claim the row ("SI" = prediction in progress) before running the model.
        db.update_one('crawl_contents', 'crawl_status', 'SI', 'contents_id',
                      contents_id)
        obj_predict = Predict(keyword=row['keyword'],
                              channel='youtube',
                              contents_id=contents_id)
        obj_predict.predict()
    else:
        # No replies were crawled: nothing to predict, mark finished ("SF") now.
        db.update_one('crawl_contents', 'crawl_status', 'SF',
                      'contents_id', contents_id)

    # Reconnect — presumably to avoid a stale connection after a long
    # predict() run (TODO confirm) — then mark the row finished ("SF").
    db = Sql("datacast2")
    db.update_one('crawl_contents', 'crawl_status', 'SF', 'contents_id',
                  contents_id)
# 예제 #4 (Example #4)
from predict_execution_en_blog import *
from db.almaden import Sql

db = Sql("datacast2")

# One-off run: predict for a single keyword/channel pair whose crawl finished ("GF").
key = 'sonnen home battery'
cha = 'twitter'
# NOTE(review): f-string SQL is acceptable for these hard-coded constants, but
# never interpolate untrusted input here.
request_row = db.select('crawl_request', '*',
                        f'crawl_status="GF" and keyword="{key}" and channel="{cha}"')

for row in request_row:
    obj_predict = Predict(keyword=row['keyword'], channel=row['channel'])
    obj_predict.predict()
    # BUG FIX: the task lookup was hard-coded to channel="youtube" even though
    # the requests above are filtered on `cha` ('twitter'); match the request's
    # own channel so the right tasks get marked finished.
    task_ids = db.select('crawl_task', '*',
                         f'keyword="{row["keyword"]}" and channel="{row["channel"]}"')
    for task in task_ids:
        db.update_one('crawl_task', 'crawl_status', 'SF', 'task_id', task['task_id'])
# 예제 #5 (Example #5)
from predict_execution_review import *
from db.almaden import Sql

db = Sql("datacast2")

# Equivalent of: select * from crawl_contents where contents_status='GF'
# (read through the youtube status view).
contents_row = db.select('crawling_status_youtube_view', '*', 'contents_status="GF"')

obj_pred_config = Pred_config()

for row in contents_row:
    keyword = row['keyword']
    contents_id = row['contents_id']
    n_reply_crawled = row['n_reply_crawled']
    print(keyword, contents_id, n_reply_crawled)

    # Re-read the live status to avoid racing another worker on the same row.
    real_crawl_status = db.select('crawling_status_youtube_view', '*',
                                  'contents_id=%s' % (contents_id))
    real_crawl_status = real_crawl_status[0]['contents_status']
    if real_crawl_status == 'GF':
        if n_reply_crawled is not None and n_reply_crawled > 0:
            # Claim the row ("SI" = prediction in progress) before predicting.
            db.update_one('crawl_contents', 'crawl_status', 'SI', 'contents_id', contents_id)
            obj_predict = Predict(obj_pred_config, keyword=keyword, contents_id=contents_id)
            obj_predict.predict()
        else:
            # No replies were crawled: nothing to predict, mark finished ("SF").
            db.update_one('crawl_contents', 'crawl_status', 'SF', 'contents_id', contents_id)
        # Reconnect — presumably to avoid a stale connection after a long
        # predict() run (TODO confirm) — then mark the row finished ("SF").
        db = Sql("datacast2")
        db.update_one('crawl_contents', 'crawl_status', 'SF', 'contents_id', contents_id)
# 예제 #6 (Example #6)
# start time
start_time = time.time()

# When using multithreading (200k-count workload)
# Run the function in parallel with a Pool

if __name__ == '__main__':
    process_list = []
    task_list = []
    db = Sql("datacast2")
    # Finished ("GF") tasks for batch 57, excluding navershopping, capped at 6.
    task_row = db.select(
        '''
    crawl_request AS cr JOIN crawl_request_task AS crt ON cr.request_id=crt.request_id
    JOIN request_batch AS rb ON rb.batch_id = cr.batch_id
    JOIN crawl_task AS ct ON crt.task_id=ct.task_id
    ''',
        'rb.batch_id as batch_id,cr.request_id as request_id,ct.task_id as task_id,cr.keyword as keyword,ct.n_crawl,ct.crawl_status as crawl_status,ct.channel as channel',
        'ct.crawl_status="GF" and ct.channel !="navershopping" and rb.batch_id=57 limit 6'
    )
    ## prediction config
    obj_pred_config = Pred_config()
    ## predict
    # # using a multithreading Pool
    for row in task_row:
        task_id = row['task_id']
        channel = row['channel']
        # Re-read the live status to avoid racing another worker on the same task.
        real_task_status = db.select('crawl_task', '*',
                                     'task_id=%s' % (task_id))
        if real_task_status[0]['crawl_status'] == 'GF':
            # Claim the task ("SI" = prediction in progress).
            db.update_one('crawl_task', 'crawl_status', 'SI', 'task_id',
                          # NOTE(review): the source dump is truncated here — the
                          # remainder of this update_one(...) call is missing.
from predict_execution_review import *
from db.almaden import Sql

db = Sql("datacast2")

# Navershopping contents whose crawl finished ("GF") and need prediction.
contents_row = db.select('crawling_status_navershopping_view', '*',
                         'contents_status="GF"')
obj_pred_config = Pred_config()

for row in contents_row:
    keyword = row['keyword']
    contents_id = row['contents_id']
    # Claim the row first ("SI" = prediction in progress) so other workers skip it.
    db.update_one('crawl_contents', 'crawl_status', 'SI', 'contents_id',
                  contents_id)
    n_reply_crawled = row['n_reply_crawled']
    print(keyword, contents_id, n_reply_crawled)
    if n_reply_crawled is not None and n_reply_crawled > 0:
        obj_predict = Predict(obj_pred_config,
                              keyword=keyword,
                              contents_id=contents_id)
        obj_predict.predict()
    else:
        # No replies were crawled: nothing to predict, mark finished ("SF").
        db.update_one('crawl_contents', 'crawl_status', 'SF',
                      'contents_id', contents_id)
    # Reconnect — presumably to avoid a stale connection after a long
    # predict() run (TODO confirm) — then mark the row finished ("SF").
    db = Sql("datacast2")
    db.update_one('crawl_contents', 'crawl_status', 'SF', 'contents_id',
                  contents_id)
# 예제 #8 (Example #8)
class Data:
    """In-memory accumulator for crawled rows loaded from the database.

    Rows are appended into ``self.df`` (a pandas DataFrame), optionally
    de-duplicated, and ``get_df`` can explode documents into one row per
    sentence for downstream text processing.
    """

    def __init__(self, dbName='dalmaden'):
        # NOTE(review): default 'dalmaden' looks like a typo (the module is
        # db.almaden and other scripts use 'datacast2') — confirm before
        # relying on the default; kept unchanged for backward compatibility.
        self.db = Sql(dbName)
        self.df = pd.DataFrame()  # accumulated rows across addData() calls

    def _load_(self, channel, keyword, fromDate, toDate, tablename='cdata'):
        """Select rows for one channel/keyword/date window as a DataFrame."""
        # NOTE(review): values are interpolated straight into SQL; this is only
        # safe for trusted inputs — parameterize if keyword can be user-supplied.
        where_str = (f"keyword='{keyword}' and channel='{channel}' "
                     f"and post_date between '{fromDate}' and '{toDate}'")
        return self.db.select(tablename, "*", where_str, asDataFrame=True)

    def addData(self,
                channel,
                keyword,
                fromDate,
                toDate,
                tablename='cdata',
                unique=True,
                drop_by=None):
        """Load matching rows and append them to ``self.df``.

        :param unique: drop duplicate rows after appending
        :param drop_by: columns used to identify duplicates; defaults to
            ['keyword', 'url'] (declared as None to avoid the
            mutable-default-argument pitfall)
        """
        if drop_by is None:
            drop_by = ['keyword', 'url']
        nrows0 = self.df.shape[0]
        ldf = self._load_(channel, keyword, fromDate, toDate, tablename)
        print(ldf)
        nrowsldf = ldf.shape[0]

        # DataFrame.append was removed in pandas 2.0 — pd.concat is the
        # supported equivalent (default keeps original indices, like append did).
        self.df = pd.concat([self.df, ldf])
        addednRows = nrowsldf
        droppednRows = 0

        if unique:
            self.drop_duplicates(subset=drop_by)
            addednRows = self.df.shape[0] - nrows0
            droppednRows = nrowsldf - addednRows
        print(
            f'addData : added {addednRows} rows (dropped {droppednRows} rows)')

    def drop_duplicates(self, subset=None):
        """Drop duplicate rows in-place, optionally keyed on ``subset`` columns."""
        self.df = self.df.drop_duplicates(subset=subset)

    def shape(self):
        """Return (n_rows, n_cols) of the accumulated DataFrame."""
        return self.df.shape

    def get_df(self, *colnames, by_sentence=''):
        """Return the selected columns, optionally split into sentences.

        :param colnames: column names (str) to keep
        :param by_sentence: name of the text column to split into sentences;
            empty string disables splitting
        :return: DataFrame (one row per document, or per sentence when
            ``by_sentence`` is set)
        """
        df_documents = self.df.loc[:, list(colnames)]
        if len(by_sentence) == 0:
            return df_documents

        # Collect sentence rows in a list and build the DataFrame once —
        # appending row-by-row inside the loop is quadratic.
        sentence_rows = []
        nrows = df_documents.shape[0]
        for i in range(nrows):
            if i % 100 == 0:
                print(f"loader : Getting Sentences {i}/{nrows}")
            row = df_documents.iloc[i]
            text = row[by_sentence]
            if len(text) == 0:
                continue
            text = cleanse_text(text)
            # NOTE(review): kss frequently returns sentences longer than 300
            # chars — needs checking (carried over from the original comment).
            sentences = kss.split_sentences(text)
            for s in sentences:
                s = cleanse_sentence(s)
                if len(s) > 0:
                    row_temp = row.copy()
                    row_temp[by_sentence] = s
                    sentence_rows.append(row_temp)
        df_sentences = pd.DataFrame(sentence_rows)
        print(
            f"loader : Getting DataFrame Done {nrows} Documents to {df_sentences.shape[0]} Sentences"
        )
        return df_sentences