def __init__(self,
                 pred_config: Pred_config,
                 keyword=None,
                 contents_id=None):
        self.pred_config = pred_config
        self.engine = create_engine(
            ("mysql+pymysql://{}:{}@{}:{}/{}?charset=utf8mb4").format(
                'root', 'robot369', '1.221.75.76', 3306, 'datacast2'))
        self.args = self.pred_config.get_args()

        ## Set the device to use (CUDA or CPU)
        self.device = self.pred_config.get_device()

        ## Set the batch size (how many inputs the model processes at once)
        self.batch_size = self.pred_config.batch_size

        ## Load the model
        self.model = self.pred_config.load_model(self.args, self.device)

        ## Load the tokenizer
        self.tokenizer = self.pred_config.load_tokenizer()
        self.nlp = Mecab()
        self.keyword = keyword
        self.contents_id = contents_id
        self.db = Sql("datacast2")
Example #2
 def __init__(self,keyword,channel,contents_id):
     self.engine = create_engine(("mysql+pymysql://{}:{}@{}:{}/{}?charset=utf8mb4").format('root','robot369',
                                                                                                 '1.221.75.76',3306,'datacast2'))
     self.db = Sql("datacast2")
     self.keyword = keyword
     self.channel = channel
     self.contents_id = contents_id
Example #3
class Predict:
    def __init__(self, pred_config:Pred_config,keyword=None,contents_id=None):
        self.pred_config = pred_config
        self.engine = create_engine(("mysql+pymysql://{}:{}@{}:{}/{}?charset=utf8mb4").format('root','robot369',
                                                                                                '10.96.5.179',3306,'datacast2'))
        self.args = self.pred_config.get_args()

        ## Set the device to use (CUDA or CPU)
        self.device = self.pred_config.get_device()

        ## Set the batch size (how many inputs the model processes at once)
        self.batch_size = self.pred_config.batch_size

        ## Load the model
        self.model = self.pred_config.load_model(self.args, self.device)

        ## Load the tokenizer
        self.tokenizer = self.pred_config.load_tokenizer()
        self.nlp = Mecab()
        self.keyword = keyword
        self.contents_id = contents_id
        self.db = Sql("datacast2")
    def verbs(self,phrase):
        """Verbs extractor."""
        verbs = ['VV']
        tagged = self.nlp.pos(phrase)
        return [s for s, t in tagged if t in verbs]

    def adjs(self, phrase):
        """Adjective/interjection extractor."""
        adjs = ['VA', 'IC']
        tagged = self.nlp.pos(phrase)
        return [s for s, t in tagged if t in adjs]

    def read(self):
        # conn = pymysql.connect(host='1.221.75.76', user='******', password='******', database='datacast')
        # curs = conn.cursor(pymysql.cursors.DictCursor)
        # sql_select_sentence = 'select * from analysis_sentence'
        # curs.execute(sql_select_sentence)
        # rows = curs.fetchall()
        ## Read the sentence table into a pandas DataFrame


        print('sql:',"SELECT ct.channel,cc.contents_id,cs.text from crawl_task as ct join crawl_contents as cc on ct.task_id=cc.task_id JOIN crawl_sentence AS cs ON cs.contents_id = cc.contents_id "
            "WHERE cc.contents_id=\'%s\' and ct.keyword=\'%s\'" % (self.contents_id ,self.keyword))
        # df_sentence_rows = pd.read_sql("SELECT ct.task_id,ct.channel,cc.contents_id,cc.text,cc.url from crawl_task as ct join crawl_contents as cc on ct.task_id=cc.task_id WHERE ct.keyword=\'%s\' limit %d,%d;"%(self.keyword,start_num,chunk_size),self.engine)
        df_sentence_rows = pd.read_sql(
            "SELECT ct.keyword,ct.channel,cc.contents_id as contents_id,cs.sentence_id as sentence_id, cs.text as sentence from crawl_task as ct join crawl_contents as cc on ct.task_id=cc.task_id JOIN crawl_sentence AS cs ON cs.contents_id = cc.contents_id "
            "WHERE cc.contents_id=\'%s\' and ct.keyword=\'%s\'" % (
            self.contents_id,self.keyword),
            self.engine)

        return df_sentence_rows

    def convert_input_sentence_to_tensor_dataset(self,df_sentence_rows,cls_token_segment_id=0,
                                             pad_token_segment_id=0,
                                             sequence_a_segment_id=0,
                                             mask_padding_with_zero=True):
        tokenizer = self.tokenizer
        args = self.args


        cls_token = tokenizer.cls_token
        sep_token = tokenizer.sep_token
        pad_token_id = tokenizer.pad_token_id

        all_input_ids = []
        all_attention_mask = []
        all_token_type_ids = []

        ### Read the input sentences
        ### and convert them into TensorDataset format
        for index in df_sentence_rows.index:
            sentence = df_sentence_rows.at[index, 'sentence']

            tokens = tokenizer.tokenize(sentence)

            # Account for [CLS] and [SEP]
            special_tokens_count = 2
            # Truncate sentences longer than the maximum sequence length.
            if len(tokens) > args.max_seq_len - special_tokens_count:
                tokens = tokens[:(args.max_seq_len - special_tokens_count)]

            # Add [SEP] token
            tokens += [sep_token]
            token_type_ids = [sequence_a_segment_id] *len(tokens)

            # Add [CLS] token
            tokens = [cls_token] + tokens
            token_type_ids = [cls_token_segment_id] + token_type_ids
            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
            attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
            # Zero-pad up to the sequence length.
            padding_length = args.max_seq_len - len(input_ids)
            input_ids = input_ids+([pad_token_id] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

            all_input_ids.append(input_ids)
            all_attention_mask.append(attention_mask)
            all_token_type_ids.append(token_type_ids)

        # Change to Tensor
        all_input_ids = torch.tensor(all_input_ids, dtype=torch.long)
        all_attention_mask = torch.tensor(all_attention_mask, dtype=torch.long)
        all_token_type_ids = torch.tensor(all_token_type_ids, dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids)
        return dataset

    def predict(self):
        ## Arguments saved during fine-tuning (training_args.bin)
        args = self.args

        ## Set the device to use (CUDA or CPU)
        device = self.device

        ## Set the batch size (how many inputs the model processes at once)
        batch_size = self.batch_size

        ## Load the model
        model = self.model
        logger.info(args)

        ## Fetch the sentences to analyze for sentiment
        df_sentence_data_rows = self.read()

        dataset = self.convert_input_sentence_to_tensor_dataset(df_sentence_data_rows)

        # Run the model over the dataset to produce outputs
        # Predict
        sampler = SequentialSampler(dataset)
        data_loader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
        preds = None
        probs = None
        print(type(data_loader),len(data_loader))
        for index,batch in enumerate(tqdm(data_loader, desc="Prediction")):
            batch = tuple(t.to(device) for t in batch)
            with torch.no_grad():
                inputs = {"input_ids": batch[0],
                          "attention_mask": batch[1],
                          "labels": None}
                if args.model_type != "distilkobert":
                    inputs["token_type_ids"] = batch[2]
                outputs = model(**inputs)
                logits = outputs[0]

                if preds is None:
                    preds = logits.detach().cpu().numpy()
                    probs = np.exp(logits.detach().cpu().numpy())/ (1 + np.exp(logits.detach().cpu().numpy()))
                else:
                    preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                    probs = np.append(probs, np.exp(logits.detach().cpu().numpy())/ (1 + np.exp(logits.detach().cpu().numpy())), axis=0)
        preds = np.argmax(preds, axis=1).tolist()
        prob_max_index = np.argmax(probs, axis=-1)
        maximum_probs = probs[np.arange(probs.shape[0]), prob_max_index]
        # maximum_probs = maximum_probs.tolist()
        # maximum_probs = list([round(maximum_prob,2) if pred==1 else round(maximum_prob,2)*(-1) for pred,maximum_prob in zip(preds,maximum_probs)])
        df_sentence_data_rows['positiveness'] = preds

        # Update each sentence row in the DB
        for idx in tqdm(df_sentence_data_rows.index, desc="sentence_analysis&db_update"):
            try:
                sentence_id = df_sentence_data_rows.at[idx,'sentence_id']
                sentence = df_sentence_data_rows.at[idx,'sentence']
                positiveness = df_sentence_data_rows.at[idx,'positiveness']
                nouns = list(set(self.nlp.nouns(sentence)))
                nouns = json.dumps(nouns,ensure_ascii=False)

                verbs = list(set(self.verbs(sentence)))
                verbs = json.dumps(verbs,ensure_ascii=False)

                adjs = list(set(self.adjs(sentence)))
                adjs = json.dumps(adjs,ensure_ascii=False)
                self.db.update_multi_column("crawl_sentence",
                                     update_dict={"nouns":nouns,"verbs":verbs,"adjs":adjs,"positiveness":float(positiveness)},
                                     where_dict={"sentence_id":float(sentence_id)})
            except Exception as e:
                print(e)
                continue
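For reference, a minimal standalone sketch (NumPy only, with made-up 2-class logits) of the sigmoid-and-argmax step that predict() applies to the model outputs before writing positiveness:

import numpy as np

# Hypothetical logits for three sentences, two classes (illustration only).
logits = np.array([[ 2.1, -1.3],
                   [-0.4,  0.9],
                   [ 0.2,  0.1]])

probs = np.exp(logits) / (1 + np.exp(logits))        # element-wise sigmoid, as in predict()
preds = np.argmax(logits, axis=1)                    # predicted class index per sentence
max_probs = probs[np.arange(probs.shape[0]), np.argmax(probs, axis=-1)]

print(preds.tolist())   # [0, 1, 0]
print(max_probs)        # confidence of the chosen class for each sentence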
Example #4
from predict_execution_en_review import *
from db.almaden import Sql
db = Sql("datacast2")

contents_row = db.select('crawling_status_youtube_view','*','contents_status="GF"')
## pred_config

##predict
for row in contents_row:
    keyword= row['keyword']
    contents_id = row['contents_id']
    n_reply_crawled = row['n_reply_crawled']

    if n_reply_crawled is not None and n_reply_crawled > 0:
        db.update_one('crawl_contents','crawl_status','SI','contents_id',contents_id)
        obj_predict = Predict(keyword=row['keyword'], channel='youtube', contents_id=contents_id)
        obj_predict.predict()
    else:
        task_ids = db.update_one('crawl_contents', 'crawl_status', 'SF', 'contents_id', contents_id)
        pass
    db = Sql("datacast2")
    db.update_one('crawl_contents', 'crawl_status', 'SF', 'contents_id', contents_id)
Example #5
from predict_execution_blog import *
from db.almaden import Sql
db = Sql("datacast2")

blog_channel_list = str(('naverblog', 'instagram', 'GooglePlay'))
review_channel_list = str(('navershopping', 'youtube'))

task_row = db.select(
    '''
crawl_request AS cr JOIN crawl_request_task AS crt ON cr.request_id=crt.request_id
JOIN request_batch AS rb ON rb.batch_id = cr.batch_id
JOIN crawl_task AS ct ON crt.task_id=ct.task_id
''',
    'rb.batch_id as batch_id,cr.request_id as request_id,ct.task_id as task_id,cr.keyword as keyword,ct.n_crawl,ct.crawl_status as crawl_status,ct.channel as channel',
    'ct.crawl_status="GF" and ct.channel in %s' % (blog_channel_list))
## pred_config
obj_pred_config = Pred_config()

##predict
for row in task_row:

    task_id = row['task_id']
    channel = row['channel']

    real_task = db.select('crawl_task', '*', 'task_id=%s' % (task_id))
    real_crawl_status = real_task[0]['crawl_status']

    if real_crawl_status == 'GF':
        db.update_one('crawl_task', 'crawl_status', 'SI', 'task_id', task_id)
        obj_predict = Predict(obj_pred_config,
                              task_id=task_id,
Example #6
from predict_execution_en_blog import *
from db.almaden import Sql
db = Sql("datacast2")
key = 'sonnen home battery'
cha = 'twitter'
request_row = db.select('crawl_request','*',f'crawl_status="GF" and keyword="{key}" and channel="{cha}"')
## pred_config

##predict
for row in request_row:
    obj_predict = Predict(keyword=row['keyword'],channel=row['channel'])
    obj_predict.predict()
    task_ids = db.select('crawl_task','*',f'keyword="{row["keyword"]}" and channel="youtube"')
    for task in task_ids:
        db.update_one('crawl_task','crawl_status','SF','task_id',task['task_id'])
Example #7
import multiprocessing
import time
from predict_execution_blog import *
from db.almaden import Sql

# Start time
start_time = time.time()

# When using multithreading (200k count)
# Run function calls in parallel using a Pool

if __name__ == '__main__':
    process_list = []
    task_list = []
    db = Sql("datacast2")
    task_row = db.select(
        '''
    crawl_request AS cr JOIN crawl_request_task AS crt ON cr.request_id=crt.request_id
    JOIN request_batch AS rb ON rb.batch_id = cr.batch_id
    JOIN crawl_task AS ct ON crt.task_id=ct.task_id
    ''',
        'rb.batch_id as batch_id,cr.request_id as request_id,ct.task_id as task_id,cr.keyword as keyword,ct.n_crawl,ct.crawl_status as crawl_status,ct.channel as channel',
        'ct.crawl_status="GF" and ct.channel !="navershopping" and rb.batch_id=57 limit 6'
    )
    ## pred_config
    obj_pred_config = Pred_config()
    ##predict
    # Use a Pool for parallel execution
    for row in task_row:
        task_id = row['task_id']
        channel = row['channel']
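The snippet above is cut off before the Pool is actually used; below is a minimal sketch, not the author's continuation, of the multiprocessing.Pool pattern its comments describe. The run_task worker is hypothetical, and the Predict(pred_config, task_id=..., channel=...) call is assumed from the truncated loop in Example #5:

from multiprocessing import Pool
from db.almaden import Sql
from predict_execution_blog import *   # assumed to provide Pred_config and Predict, as above

def run_task(task):
    """Hypothetical worker: run sentiment prediction for one crawl task."""
    task_id, channel = task
    pred_config = Pred_config()                      # load args/tokenizer/model inside the worker
    Predict(pred_config, task_id=task_id, channel=channel).predict()
    return task_id

if __name__ == '__main__':
    db = Sql("datacast2")
    rows = db.select('crawl_task', 'task_id,channel', 'crawl_status="GF" limit 6')
    jobs = [(row['task_id'], row['channel']) for row in rows]
    with Pool(processes=2) as pool:                  # run per-task predictions in parallel
        for done_id in pool.imap_unordered(run_task, jobs):
            print('predicted task', done_id)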
Example #8
 def __init__(self, dbName='dalmaden'):
     self.db = Sql(dbName)
     self.df = pd.DataFrame()
Example #9
class Data:
    def __init__(self, dbName='dalmaden'):
        self.db = Sql(dbName)
        self.df = pd.DataFrame()

    def _load_(self, channel, keyword, fromDate, toDate, tablename='cdata'):
        where_str = f"keyword='{keyword}' and channel='{channel}' and post_date between '{fromDate}' and '{toDate}'"
        ldf = self.db.select(tablename, "*", where_str, asDataFrame=True)
        return ldf

    def addData(self,
                channel,
                keyword,
                fromDate,
                toDate,
                tablename='cdata',
                unique=True,
                drop_by=['keyword', 'url']):
        nrows0 = self.df.shape[0]
        ldf = self._load_(channel, keyword, fromDate, toDate, tablename)
        print(ldf)
        nrowsldf = ldf.shape[0]

        self.df = self.df.append(ldf)
        addednRows = nrowsldf
        droppednRows = 0

        if unique:
            self.drop_duplicates(subset=drop_by)
            addednRows = self.df.shape[0] - nrows0
            droppednRows = nrowsldf - addednRows
        print(
            f'addData : added {addednRows} rows (dropped {droppednRows} rows)')

    def drop_duplicates(self, subset=None):
        self.df = self.df.drop_duplicates(subset=subset)

    def shape(self):
        return self.df.shape

    def get_df(self, *colnames, by_sentence=''):
        '''
        :param colnames: column names (str)
        :param by_sentence: name of the text column to split into sentences
        :return: DataFrame
        '''
        df_documents = self.df.loc[:, list(colnames)]
        if len(by_sentence) > 0:
            df_sentences = pd.DataFrame()
            nrows = df_documents.shape[0]
            for i in range(nrows):
                if i % 100 == 0:
                    print(f"loader : Getting Sentences {i}/{nrows}")
                row = df_documents.iloc[i]
                text = row[by_sentence]
                if len(text) > 0:
                    text = cleanse_text(text)
                    sentences = kss.split_sentences(
                        text)  # texts over 300 characters are common... needs checking
                    for s in sentences:
                        s = cleanse_sentence(s)
                        if len(s) > 0:
                            row_temp = row.copy()
                            row_temp[by_sentence] = s
                            df_sentences = df_sentences.append(row_temp)
                else:
                    continue
            print(
                f"loader : Getting DataFrame Done {nrows} Documents to {df_sentences.shape[0]} Sentences"
            )
            return df_sentences
        else:
            return df_documents
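A minimal usage sketch for the Data loader above (the keyword, channel, and date range are made up; assumes the cdata table and the cleanse/kss helpers the class relies on are available):

data = Data(dbName='dalmaden')
data.addData('naverblog', '전기차', '2021-01-01', '2021-03-31')
data.addData('youtube', '전기차', '2021-01-01', '2021-03-31')
print(data.shape())                                  # (rows, cols) after dropping keyword/url duplicates

# One row per sentence of the 'text' column, for downstream sentence-level analysis.
df_sentences = data.get_df('channel', 'url', 'text', by_sentence='text')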
Example #10
class Predict:
    def __init__(self,keyword,channel,contents_id):
        self.engine = create_engine(("mysql+pymysql://{}:{}@{}:{}/{}?charset=utf8mb4").format('root','robot369',
                                                                                                    '1.221.75.76',3306,'datacast2'))
        self.db = Sql("datacast2")
        self.keyword = keyword
        self.channel = channel
        self.contents_id = contents_id
    # def nouns(self,phrase):
    #     """Nouns extractor."""
    #     verbs = ['VV']
    #     tagged = pos_tag(word_tokenize(phrase))
    #     return [s for s, t in tagged if t in verbs]
    #
    # def verbs(self,phrase):
    #     """Verbs extractor."""
    #     verbs = ['VV']
    #     tagged = self.nlp.pos(phrase)
    #     return [s for s, t in tagged if t in verbs]
    #
    # def adjs(self,phrase):
    #
    #     """Adjs extractor."""
    #     adjs = ['VA','IC']
    #     tagged = self.nlp.pos(phrase)
    #     return [s for s, t in tagged if t in adjs]

    def read(self):
        # conn = pymysql.connect(host='1.221.75.76', user='******', password='******', database='datacast')
        # curs = conn.cursor(pymysql.cursors.DictCursor)
        # sql_select_sentence = 'select * from analysis_sentence'
        # curs.execute(sql_select_sentence)
        # rows = curs.fetchall()
        ## Read the sentence table into a pandas DataFrame

        print('sql:',
              "SELECT ct.channel,cc.contents_id,cs.text from crawl_task as ct join crawl_contents as cc on ct.task_id=cc.task_id JOIN crawl_sentence AS cs ON cs.contents_id = cc.contents_id "
              "WHERE cc.contents_id=\'%s\' and ct.keyword=\'%s\'" % (self.contents_id, self.keyword))
        df_sentence_rows = pd.read_sql(
            "SELECT ct.keyword,ct.channel,cc.contents_id as contents_id,cs.sentence_id as sentence_id, cs.text as sentence from crawl_task as ct join crawl_contents as cc on ct.task_id=cc.task_id JOIN crawl_sentence AS cs ON cs.contents_id = cc.contents_id "
            "WHERE cc.contents_id=\'%s\' and ct.keyword=\'%s\'" % (
            self.contents_id,self.keyword),
            self.engine)
        return df_sentence_rows


    def predict(self):
        df_sentence_data_rows = self.read()
        sid = SentimentIntensityAnalyzer()
        for idx in tqdm(df_sentence_data_rows.index, desc="sentence_analysis&db_update"):
            try:
                sentence_id = df_sentence_data_rows.at[idx,'sentence_id']
                sentence = df_sentence_data_rows.at[idx,'sentence']
                korean = re.compile('[\u3131-\u3163\uac00-\ud7a3]+')
                sentence = re.sub(korean,"",sentence)
                sentence = sentence.lower()
                nouns = [p[0] for p in pos_tag(word_tokenize(sentence), tagset='universal') if p[1] in ['NOUN']]
                nouns = list(filter(lambda x: (x not in stopwords) and all(stop not in x for stop in stop_list), nouns))
                nouns = json.dumps(nouns, ensure_ascii=False)

                verbs = [p[0] for p in pos_tag(word_tokenize(sentence),tagset='universal') if p[1] in ['VERB']]
                verbs = list(filter(lambda x: (x not in stopwords) and all(stop not in x for stop in stop_list),verbs))
                verbs = json.dumps(verbs, ensure_ascii=False)

                adjs = [p[0] for p in pos_tag(word_tokenize(sentence),tagset='universal') if p[1] in ['ADJ']]
                adjs = list(filter(lambda x: (x not in stopwords) and all(stop not in x for stop in stop_list),adjs))
                adjs = json.dumps(adjs, ensure_ascii=False)

                pos = sid.polarity_scores(sentence)
                pos = 1 if pos['compound']>=0 else 0

                self.db.update_multi_column("crawl_sentence",
                                            update_dict={"nouns": nouns, "verbs": verbs, "adjs": adjs,
                                                         "positiveness": float(pos)},
                                            where_dict={"sentence_id": float(sentence_id)})
            except Exception as e:
                print(e)
                continue
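For reference, a minimal standalone sketch of the scoring rule this predict() applies per sentence, assuming NLTK's VADER SentimentIntensityAnalyzer (the example sentence is made up; the >= 0 threshold is the one used above):

from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
scores = sid.polarity_scores("the battery lasts long and charges fast")
# polarity_scores returns 'neg', 'neu', 'pos' and a 'compound' score in [-1, 1]
positiveness = 1 if scores['compound'] >= 0 else 0   # same mapping as predict()
print(scores['compound'], positiveness)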