Example #1
def get_documents(params, size, index, scroll_id=None):
    queryObj = Query(params)

    if not scroll_id:
        # Initial search: open a scroll context that lives for one day.
        es_uri = "/" + index + "/doc/_search?scroll=1d"
        request = queryObj.get_documents_query()
        request['size'] = size
    else:
        # Follow-up call: fetch the next page of an existing scroll.
        es_uri = "/_search/scroll"
        request = {"scroll": "1d", "scroll_id": scroll_id}

    logger.debug("get_documents() ==> request : ")
    for k, v in request.items():
        logger.debug("\t{} : {}".format(k, v))

    es_conn = hc.HTTPConnection(es_ip, es_port, timeout=60)
    es_conn.request("POST", es_uri, json.dumps(request),
                    {"Content-type": "application/json"})
    result = json.loads(es_conn.getresponse().read())

    if 'hits' in result:
        logger.debug("[get_documents] result['hits']['total'] >>> %d" %
                     int(result['hits']['total']))
    else:
        logger.debug("[get_documents] result ::: " + str(result))

    return result
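Since the scroll context stays open for a day, a caller can keep passing the returned _scroll_id back in. A minimal pagination sketch, assuming the module-level es_ip, es_port, logger, and Query used above; fetch_all is a hypothetical helper, not part of the original module:

def fetch_all(params, index, size=1000):
    # Hypothetical helper: drain every page of a scrolled search.
    docs = []
    result = get_documents(params, size, index)  # initial search opens the scroll
    while result.get('hits', {}).get('hits'):
        docs.extend(result['hits']['hits'])
        scroll_id = result.get('_scroll_id')
        if not scroll_id:
            break
        result = get_documents(params, size, index, scroll_id)  # next page
    return docs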
Example #2
    def __init__(self, params):
        self.compare = params['compare_yn'] == 'Y'
        self.start_date = re.sub(r"[-:\s]", "", params['start_date'])[:8]
        self.end_date = re.sub(r"[-:\s]", "", params['end_date'])[:8]
        self.seq = params['seq']
        self.reg_dt = re.sub(r"[-:\s]", "", params['reg_dt'])
        self.report_type = db.get_exceltype_name(params['type_cd'])  # e.g. RSP -> 리포트_소셜모니터링_추이분석
        self.project_name = db.get_project_name(params['project_seq'])
        # '전체' = all channels, '채널일부' = a subset of channels
        self.channel = '전체' if not params['channels'] or params['channels'] == 'all' else '채널일부'

        # Dataset seqs mapped to names, e.g. "6^7^15" -> "신라면,안성탕면,짜파게티"
        self.dataset_names = ",".join(
            db.get_dataset_name(x) if db.get_dataset_name(x) is not None else 'unknown'
            for x in params['datasets'].split("^")
        ) if params['datasets'] else ''
        # Replace characters that are illegal in Windows file names.
        if os.name == 'nt' and re.search(r'[\/\\"*?<>|]', self.dataset_names):
            self.dataset_names = re.sub(r'[\/\\"*?<>|]', "_", self.dataset_names)

        self.queryObj = Query()

        # '동일기간비교' = same-period comparison, '해당기간' = the period itself
        compare_yn = "동일기간비교" if params['compare_yn'] == 'Y' else "해당기간"

        if not params['datasets']:  # search-trend report
            self.file_name = "_".join([str(self.seq), self.report_type, self.start_date, self.end_date, compare_yn]) + ".xlsx"
        else:  # social-monitoring report
            if len(params['datasets'].split("^")) > 1:
                self.file_name = "_".join([str(self.seq), self.report_type, self.channel, self.start_date, self.end_date, compare_yn]) + ".xlsx"
            else:
                self.file_name = "_".join([str(self.seq), self.report_type + "(" + self.dataset_names + ")", self.channel, self.start_date, self.end_date, compare_yn]) + ".xlsx"

        self.logger.info("=======================================================================================")
        for k, v in params.items():
            self.logger.info(k + " :\t\t" + str(v))
        self.logger.info("=======================================================================================")
Example #3
def get_documents_count(params, index):
    queryObj = Query(params)

    es_uri = "/" + index + "/doc/_count"
    request = queryObj.get_documents_query()

    es_conn = hc.HTTPConnection(es_ip, es_port, timeout=60)
    es_conn.request("POST", es_uri, json.dumps(request),
                    {"Content-type": "application/json"})
    result = json.loads(es_conn.getresponse().read())

    if 'count' in result:
        return result['count']
    else:
        logger.error("[get_documents_count] %s" % str(result))
        return -1
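A hypothetical call, with a params dict shaped like the one in the __main__ block of Example #9; -1 signals an error response:

# Illustrative only: count documents matching the query.
params = {
    "start_date": "2018-01-01T00:00:00",
    "end_date": "2018-12-31T23:59:59",
    "project_seq": 176,
    "compare_yn": "N",
    "channels": "all",
    "datasets": "2852",
}
total = get_documents_count(params, "documents")
if total >= 0:
    print("matching documents:", total)
else:
    print("count request failed")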
Example #4
def get_request_query(params, scroll_id=None):
    queryObj = Query(params)

    if not scroll_id:
        request = {"query": {"bool": {}}}
    else:
        request = {"scroll": "1d", "scroll_id": scroll_id}

    if "query" in request:
        filters = []
        # Include the project sequence.
        filters.append(queryObj.get_project_seq_query())
        # Even if several project seqs come in, they all share the same filter
        # keyword, so only the first project_seq is used.
        filters.append(
            queryObj.get_project_filter_query(
                params['project_seqs'].split(",")[0]))

        # Target channels.
        if "channels" in params and params["channels"] and params["channels"] != 'all':
            filters.append(queryObj.get_channel_query())

        # Target period.
        if "start_date" in params and "end_date" in params:
            filters.append(queryObj.get_period_query(params['mode']))

        request["query"]["bool"]["filter"] = filters
        request["query"]["bool"]["must"] = queryObj.get_total_dataset_query(
            params['project_seqs'])

    logger.debug("[get_request_query] Query >>> %s " % json.dumps(request))

    return request
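For orientation, the initial (non-scroll) request ends up shaped roughly like the skeleton below; the actual sub-clauses are whatever the Query helper methods return, so every value here is a placeholder, not the helpers' real output:

# Placeholder shape of the generated request body (illustrative values only).
example_request = {
    "query": {
        "bool": {
            "filter": [
                {"term": {"project_seq": 176}},       # get_project_seq_query() (placeholder)
                {"term": {"filter_keyword": "..."}},  # get_project_filter_query() (placeholder)
                # plus optional channel and period clauses
            ],
            "must": {},  # get_total_dataset_query() result goes here
        }
    }
}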
Example #5
    def __init__(self, params):
        self.mode = params['mode']
        self.compare = params['compare_yn'] == 'Y'

        self.start_date = re.sub(r"[-:T\s]", "", params['start_date'])[:12]
        self.end_date = re.sub(r"[-:T\s]", "", params['end_date'])[:12]
        self.reg_dt = re.sub(r"[-:T\s]", "", params['reg_dt'])

        # Dataset seqs mapped to names, e.g. "6^7^15" -> "신라면,안성탕면,짜파게티"
        self.dataset_names = ",".join(
            db.get_dataset_name(x) if db.get_dataset_name(x) is not None else 'unknown'
            for x in str(params['datasets']).split("^")
        ) if params['datasets'] else ''
        self.query = Query(params)

        if self.mode == MODE_DOCUMENTS:
            self.file_name = "_".join(["SNS", self.dataset_names, self.start_date, self.end_date]) + ".xlsx"
        elif self.mode == MODE_TOPICS:  # 화제어 = topics
            self.file_name = "_".join(["화제어", self.dataset_names, self.start_date, self.end_date]) + ".xlsx"
        elif self.mode == MODE_EMOTIONS:  # 감성분석 = sentiment analysis
            self.file_name = "_".join(["감성분석", self.dataset_names, self.start_date, self.end_date]) + ".xlsx"
        elif self.mode == MODE_TREND:  # 연관검색어 = related search terms
            self.file_name = "_".join(["연관검색어", str(params['project_seq']), self.start_date, self.end_date]) + ".xlsx"
Example #6
    def __init__(self, params):
        self.seq = params['seq']
        self.compare = params['compare_yn'] == 'Y'

        self.start_date = re.sub(r"[-:T\s]", "", params['start_date'])[:12]
        self.end_date = re.sub(r"[-:T\s]", "", params['end_date'])[:12]
        self.reg_dt = re.sub(r"[-:T\s]", "", params['reg_dt'])

        # Dataset seqs mapped to names, e.g. "6^7^15" -> "신라면,안성탕면,짜파게티"
        self.dataset_names = ",".join(
            db.get_dataset_name(x) if db.get_dataset_name(x) is not None else 'unknown'
            for x in str(params['datasets']).split("^")
        ) if params['datasets'] else ''
        self.query = Query(params)

        self.file_name = "B-%d-%s-I-C.SCD" % (self.seq, get_current_datetime())
Example #7
def get_request_query(params, scroll_id=None):
    queryObj = Query(params)

    if not scroll_id:
        request = {"query": {"bool": {}}}
    else:
        request = {"scroll": "1d", "scroll_id": scroll_id}

    if "query" in request:
        filters = []
        # Include the project sequence.
        filters.append(queryObj.get_project_seq_query())
        # Even if several project seqs come in, they all share the same filter
        # keyword, so only the first project_seq is used.
        filters.append(
            queryObj.get_project_filter_query(
                params['project_seqs'].split(",")[0]))

        # Target channels.
        if "channels" in params and params["channels"] and params["channels"] != 'all':
            filters.append(queryObj.get_channel_query())

        # Target period.
        if "start_date" in params and "end_date" in params:
            filters.append(queryObj.get_period_query(params['mode']))

        request["query"]["bool"]["filter"] = filters
        request["query"]["bool"]["must"] = queryObj.get_total_dataset_query(
            params['project_seqs'])
        '''
        request["query"]["bool"]["must"] = {
            "bool" : {
                "should" : [
                    {
                        "query_string": {
                            "fields": ["doc_title^100", "doc_content"],
                            "query" : "신한은행",
                            "default_operator" : "AND",
                            "tie_breaker" : 0.0
                        }
                    }
                ]
            }
        }
        '''

    logger.debug("[get_request_query] Query >>> %s " % json.dumps(request))

    return request
Example #8
def get_documents(params, size, index, scroll_id=None):
    queryObj = Query(params)

    if not scroll_id:
        es_uri = "/" + index + "/doc/_search?scroll=1d"
        request = {"size": size, "query": {"bool": {}}}
    else:
        es_uri = "/_search/scroll"
        request = {"scroll": "1d", "scroll_id": scroll_id}

    if "query" in request:
        filters = []
        # Include the project sequence.
        filters.append(queryObj.get_project_seq_query())
        filters.append(queryObj.get_project_filter_query(params['project_seq']))

        # Target channels.
        if "channels" in params and params["channels"] and params["channels"] != 'all':
            filters.append(queryObj.get_channel_query())

        # Target period.
        if "start_date" in params and "end_date" in params:
            filters.append(queryObj.get_period_query())

        request["query"]["bool"]["filter"] = filters

        # Keywords included in the datasets.
        if "datasets" in params and params["datasets"]:
            request["query"]["bool"]["must"] = queryObj.get_dataset_query(
                params['project_seq'], params["datasets"])

    logger.debug("[get_documents] Query >>> %s " % json.dumps(request))

    es_conn = hc.HTTPConnection(es_ip, es_port, timeout=60)
    es_conn.request("POST", es_uri, json.dumps(request),
                    {"Content-type": "application/json"})
    result = json.loads(es_conn.getresponse().read())

    if 'hits' in result:
        logger.debug("[get_documents] result['hits']['total'] >>> %d" %
                     int(result['hits']['total']))
    else:
        logger.debug("[get_documents] result ::: " + str(result))

    return result
Example #9
                params['project_seq'], params["datasets"])

    logger.debug("[get_documents] Query >>> %s " % json.dumps(request))

    es_conn = hc.HTTPConnection(es_ip, es_port, timeout=60)
    es_conn.request("POST", es_uri, json.dumps(request),
                    {"Content-type": "application/json"})
    result = json.loads(es_conn.getresponse().read())

    if 'hits' in result:
        logger.debug("[get_documents] result['hits']['total'] >>> %d" %
                     int(result['hits']['total']))
    else:
        logger.debug("[get_documents] result ::: " + str(result))

    return result


if __name__ == '__main__':
    params = {
        "start_date": "2018-01-01T00:00:00",
        "end_date": "2018-12-31T23:59:59",
        "project_seq": 176,
        "compare_yn": "N",
        "channels": "all",
        "datasets": "2852"
    }
    queryObj = Query(params)

    #print(queryObj.ALL_TOPICS_LIST("신한금융지주"))
    print(get_documents(params, 10, "documents"))
Example #10
class ReportKDICDocuments:
    mode = ""
    seq = -1
    reg_dt = ""
    report_day = ""
    report_time = ""
    report_type = ""
    project_name = ""
    channel = ""
    start_date = ""
    end_date = ""
    dataset_names = ""
    query = None
    compare = ''
    save_path = ""

    file_name = ""
    file_path = ""

    #BASE_EXCEL_DIRECTORY='/data/dmap-data/dmap-excel'
    conf = Config()
    BASE_EXCEL_DIRECTORY=conf.get_report_home()

    HEADER_FORMAT = {
        'bold' : True,
        'font_size' : 9,
        'bg_color' : '#F2F2F2',
        'align' : 'center',
        'border' : 1
    }
    DEFAULT_FORMAT = {
        'font_size' : 9,
        'border' : 1
    }

    def __init__(self, params):
        self.mode = params['mode']
        self.compare = params['compare_yn'] == 'Y'

        self.start_date = re.sub(r"[-:T\s]", "", params['start_date'])[:12]
        self.end_date = re.sub(r"[-:T\s]", "", params['end_date'])[:12]
        self.reg_dt = re.sub(r"[-:T\s]", "", params['reg_dt'])

        # Dataset seqs mapped to names, e.g. "6^7^15" -> "신라면,안성탕면,짜파게티"
        self.dataset_names = ",".join(
            db.get_dataset_name(x) if db.get_dataset_name(x) is not None else 'unknown'
            for x in str(params['datasets']).split("^")
        ) if params['datasets'] else ''
        self.query = Query(params)

        if self.mode == MODE_DOCUMENTS:
            self.file_name = "_".join(["SNS", self.dataset_names, self.start_date, self.end_date]) + ".xlsx"
        elif self.mode == MODE_TOPICS:  # 화제어 = topics
            self.file_name = "_".join(["화제어", self.dataset_names, self.start_date, self.end_date]) + ".xlsx"
        elif self.mode == MODE_EMOTIONS:  # 감성분석 = sentiment analysis
            self.file_name = "_".join(["감성분석", self.dataset_names, self.start_date, self.end_date]) + ".xlsx"
        elif self.mode == MODE_TREND:  # 연관검색어 = related search terms
            self.file_name = "_".join(["연관검색어", str(params['project_seq']), self.start_date, self.end_date]) + ".xlsx"


    def get_file_name(self):
        return self.file_name

    def create_file_path(self, path):
        self.file_path = path
        return file_util.search_create_directory( self.file_path )
        # if mode == 'documents':
        #     '''
        #     - documents end up in the Social directory under the report folder.
        #     '''
        #     self.file_path = os.path.join(self.BASE_EXCEL_DIRECTORY, self.reg_dt, 'raw')
        #     return file_util.search_create_directory( self.file_path )
        # else:
        #     '''
        #     - topics end up in the Social_topics directory under the report folder.
        #     '''
        #     self.file_path = os.path.join(self.BASE_EXCEL_DIRECTORY, self.reg_dt, 'topic')
        #     return file_util.search_create_directory( self.file_path )


    def topics_list(self, params):
        worksheet = self.workbook.add_worksheet("화제어(%s)" % "~".join([params['start_date'][0:10], params['end_date'][0:10]]))
        # Header: the date format must be YYYYMMDD.
        # Columns: 날짜=date, 순위=rank, 화제어=topic, 문서수=doc count, 연관어=related term.
        worksheet.write(0, 0, '날짜', self.header)
        worksheet.write(0, 1, '순위', self.header)
        worksheet.write(0, 2, '화제어', self.header)
        worksheet.write(0, 3, '문서수', self.header)
        worksheet.write(0, 4, '연관어', self.header)
        worksheet.write(0, 5, '문서수', self.header)

        # Data
        result_topic = es.get_aggregations(self.query.ALL_TOPICS_LIST(params['dataset_name']), params, Query.INDEX_TOPICS)
        row = 0
        seq = 0  # rank of the topic
        #topics_date = params['start_date'][0:10].replace('-','')

        for bucket0 in result_topic['aggregations']['my_aggs0']['buckets']:
            for bucket1 in bucket0['my_aggs1']['buckets']:
                topic = re.sub(r"[\+=\-/]", "", str(bucket1['key']))
                seq += 1

                topics_date = bucket0['key_as_string']

                if len(bucket1['my_aggs2']['buckets']) > 0:
                    for bucket2 in bucket1['my_aggs2']['buckets']:
                        # worksheet.write(1+row, 0, params['start_date'][0:10].replace('-',''), self.default)
                        worksheet.write(1+row, 0, re.sub("-", "", topics_date[:topics_date.find("T")]), self.default)
                        worksheet.write(1+row, 1, seq, self.default)
                        worksheet.write(1+row, 2, re.sub(r"[\[\]]", "", topic), self.default)
                        worksheet.write(1+row, 3, bucket1['doc_count'], self.default)
                        worksheet.write(1+row, 4, bucket2['key'], self.default)
                        worksheet.write(1+row, 5, bucket2['doc_count'], self.default)
                        #worksheet.write(1+row, 6, verb_list, self.default)
                        row += 1

                else:
                    worksheet.write(1+row, 0, re.sub("-", "", topics_date[:topics_date.find("T")]), self.default)
                    worksheet.write(1+row, 1, seq, self.default)
                    worksheet.write(1+row, 2, re.sub(r"[\[\]]", "", topic), self.default)
                    worksheet.write(1+row, 3, bucket1['doc_count'], self.default)
                    worksheet.write(1+row, 4, '', self.default)
                    worksheet.write(1+row, 5, '', self.default)
                    #worksheet.write(1+row, 6, '', self.default)
                    row += 1

        logger.info("<%s> Total Topics : %d" % (self.dataset_names, row))



    def emotions_per_causes(self, params):
        worksheet = self.workbook.add_worksheet("감성분석(%s)" % "~".join([params['start_date'][0:10], params['end_date'][0:10]]))

        # Header: the date format must be YYYYMMDD.
        # Columns: 날짜=date, 채널1-3=channel levels, 대/중/소분류=major/middle/minor
        # category, 긍부정=positive/negative, 문서수=doc count.
        worksheet.write(0, 0, '날짜', self.header)
        worksheet.write(0, 1, '채널1', self.header)
        worksheet.write(0, 2, '채널2', self.header)
        worksheet.write(0, 3, '채널3', self.header)
        worksheet.write(0, 4, '대분류', self.header)
        worksheet.write(0, 5, '중분류', self.header)
        worksheet.write(0, 6, '소분류', self.header)
        worksheet.write(0, 7, '긍부정', self.header)
        worksheet.write(0, 8, '문서수', self.header)

        # Data
        qdsl = self.query.EMOTIONS_PER_CAUSES()
        result = es.get_aggregations(copy.copy(qdsl), params, INDEX_EMOTIONS)
        #total = result['hits']['total']
        total = 0
        row = 0
        #emotions_date = params['start_date'][0:10].replace('-','')

        for bucket0 in result['aggregations']['my_aggs0']['buckets']:
            for bucket1 in bucket0['my_aggs1']['buckets']:
                for bucket2 in bucket1['my_aggs2']['buckets']:
                    for bucket5 in bucket2['my_aggs3']['my_aggs4']['my_aggs5']['buckets']:
                        # 2018.01.11 When the name contains "(주)", we must check whether
                        # the name without "(주)" is included, hence the substring match.
                        if params['dataset_name'].find(bucket2['key']) >= 0:
                            depth_level = bucket1['key'].split(">")

                            #worksheet.write(1+row, 0, emotions_date, self.default)
                            emotions_date = bucket0['key_as_string']
                            worksheet.write(1+row, 0, re.sub("-", "", emotions_date[:emotions_date.find("T")]), self.default)
                            worksheet.write(1+row, 1, re.sub(r"[\[\]]", "", depth_level[0]) if len(depth_level) >= 1 else '', self.default)
                            worksheet.write(1+row, 2, re.sub(r"[\[\]]", "", depth_level[1]) if len(depth_level) >= 2 else '', self.default)
                            worksheet.write(1+row, 3, re.sub(r"[\[\]]", "", depth_level[2]) if len(depth_level) >= 3 else '', self.default)
                            worksheet.write(1+row, 4, bucket2['key'], self.default)
                            worksheet.write(1+row, 5, '', self.default)
                            worksheet.write(1+row, 6, '', self.default)
                            worksheet.write(1+row, 7, bucket5['key'], self.default)
                            worksheet.write(1+row, 8, bucket5['doc_count'], self.default)

                            total += int(bucket5['doc_count'])
                            row += 1

        # Total row ('합계' = total), only for single-dataset reports.
        if len(params['datasets'].split("^")) == 1:
            worksheet.write(row+1, 0, '합계', self.header)
            worksheet.write(row+1, 1, '', self.header)
            worksheet.write(row+1, 2, '', self.header)
            worksheet.write(row+1, 3, '', self.header)
            worksheet.write(row+1, 4, '', self.header)
            worksheet.write(row+1, 5, '', self.header)
            worksheet.write(row+1, 6, '', self.header)
            worksheet.write(row+1, 7, '', self.header)
            worksheet.write(row+1, 8, total, self.header)

        logger.info("<%s> Total Emotions : %d" % (self.dataset_names, row))
        
        
        

    # Original documents
    def create_documents_list(self, params, index):
        # Fetch patterns that, when present in title or content, exclude the document.
        project_filter_keywords = db.get_project_filter_keywords(params['project_seq'])

        EXCLUDE_PATTERNS = None
        if project_filter_keywords and 'regex_filter_keywords' in project_filter_keywords:
            EXCLUDE_PATTERNS = re.compile("(?i)(" + re.sub(",", "|", project_filter_keywords['regex_filter_keywords'].strip()) + ")")

        size = 10000  # paging size

        # Start the search.
        result = es.get_documents(params, size, index, "")

        # Create the sheet ('원문' = original documents).
        worksheet = self.workbook.add_worksheet("원문(%s)(0)" % "~".join([params['start_date'][0:10], params['end_date'][0:10]]))

        # Excel header
        worksheet.write(0, 0, 'ID', self.header)
        worksheet.write(0, 1, '게시일', self.header)
        worksheet.write(0, 2, '작성자', self.header)
        worksheet.write(0, 3, 'URL', self.header)
        worksheet.write(0, 4, '제목', self.header)
        worksheet.write(0, 5, '내용', self.header)
        worksheet.write(0, 6, '채널1', self.header)
        worksheet.write(0, 7, '채널2', self.header)
        worksheet.write(0, 8, '채널3', self.header)
        worksheet.write(0, 9, '정확도', self.header)  # accuracy (score) column

        logger.info("<%s> Total Documents : %d" % (self.dataset_names, result["hits"]["total"]))

        # Excel body
        if "hits" in result and result["hits"]["total"] > 0:
            row = 0
            for this_result in result["hits"]["hits"]:
                doc_id       = this_result["_id"]
                doc_datetime = this_result["_source"]["doc_datetime"]
                doc_writer   = re.sub(r"[\+=\-/]", "", str(this_result["_source"]["doc_writer"]))
                doc_url      = this_result["_source"]["doc_url"]
                doc_title    = re.sub(r"[\+=\-/]", "", str(this_result["_source"]["doc_title"]))
                doc_content  = re.sub(r"[\+=\-/]", "", str(this_result["_source"]["doc_content"]))
                depth1_nm    = this_result["_source"]["depth1_nm"]
                depth2_nm    = this_result["_source"]["depth2_nm"]
                depth3_nm    = this_result["_source"]["depth3_nm"]
                score        = this_result["_score"]

                # 2018.04.05 Skip rows whose title or content matches an exclude pattern.
                if EXCLUDE_PATTERNS is not None and (EXCLUDE_PATTERNS.search(doc_title) is not None or EXCLUDE_PATTERNS.search(doc_content) is not None):
                    continue

                row += 1
                worksheet.write(row, 0, doc_id, self.default)
                worksheet.write(row, 1, doc_datetime, self.default)
                worksheet.write(row, 2, doc_writer, self.default)
                worksheet.write(row, 3, doc_url, self.default)
                worksheet.write(row, 4, doc_title, self.default)
                worksheet.write(row, 5, doc_content, self.default)
                worksheet.write(row, 6, depth1_nm, self.default)
                worksheet.write(row, 7, depth2_nm, self.default)
                worksheet.write(row, 8, depth3_nm, self.default)
                worksheet.write(row, 9, score, self.default)

            # If the result count exceeds one page, keep paging with the scroll API.
            # Each page goes to a fresh worksheet since the volume is expected to be large.
            if "hits" in result and result["hits"]["total"] > size:
                for page in range(1, math.ceil(result["hits"]["total"]/size)):  # 1, 2, ...
                    worksheet = self.workbook.add_worksheet("원문(%s)(%d)" % ("~".join([params['start_date'][0:10], params['end_date'][0:10]]), page))
                    row = 0  # restart row numbering on each new worksheet
                    # Excel header
                    worksheet.write(0, 0, 'ID', self.header)
                    worksheet.write(0, 1, '게시일', self.header)
                    worksheet.write(0, 2, '작성자', self.header)
                    worksheet.write(0, 3, 'URL', self.header)
                    worksheet.write(0, 4, '제목', self.header)
                    worksheet.write(0, 5, '내용', self.header)
                    worksheet.write(0, 6, '채널1', self.header)
                    worksheet.write(0, 7, '채널2', self.header)
                    worksheet.write(0, 8, '채널3', self.header)
                    worksheet.write(0, 9, '정확도', self.header)  # accuracy (score) column

                    scrolled_result = es.get_documents(params, size, index, scroll_id=result["_scroll_id"])
                    for this_result in scrolled_result["hits"]["hits"]:
                        doc_id       = this_result["_id"]
                        doc_datetime = this_result["_source"]["doc_datetime"]
                        doc_writer   = re.sub(r"[\+=\-/]", "", str(this_result["_source"]["doc_writer"]))
                        doc_url      = this_result["_source"]["doc_url"]
                        doc_title    = re.sub(r"[\+=\-/]", "", str(this_result["_source"]["doc_title"]))
                        doc_content  = re.sub(r"[\+=\-/]", "", str(this_result["_source"]["doc_content"]))
                        depth1_nm    = this_result["_source"]["depth1_nm"]
                        depth2_nm    = this_result["_source"]["depth2_nm"]
                        depth3_nm    = this_result["_source"]["depth3_nm"]
                        score        = this_result["_score"]

                        # 2018.04.05 Skip rows whose title or content matches an exclude pattern.
                        if EXCLUDE_PATTERNS is not None and (EXCLUDE_PATTERNS.search(doc_title) is not None or EXCLUDE_PATTERNS.search(doc_content) is not None):
                            continue

                        row += 1
                        worksheet.write(row, 0, doc_id, self.default)
                        worksheet.write(row, 1, doc_datetime, self.default)
                        worksheet.write(row, 2, doc_writer, self.default)
                        worksheet.write(row, 3, doc_url, self.default)
                        worksheet.write(row, 4, doc_title, self.default)
                        worksheet.write(row, 5, doc_content, self.default)
                        worksheet.write(row, 6, depth1_nm, self.default)
                        worksheet.write(row, 7, depth2_nm, self.default)
                        worksheet.write(row, 8, depth3_nm, self.default)
                        worksheet.write(row, 9, score, self.default)

                    # After the last page is processed, clear the scroll.
                    if page == math.ceil(result["hits"]["total"]/size)-1:
                        if result["_scroll_id"]:
                            es.clear_scroll(result["_scroll_id"])



    def make_trend_report(self, params):
        logger.info("============================= \"make_trend_report\" starts.")

        today = re.sub("[-]", "", params['start_date'][0:10])

        worksheet = self.workbook.add_worksheet("연관어(%s)" % "~".join([params['start_date'][0:10], params['end_date'][0:10]]))

        # Header: the date format must be YYYYMMDD.
        # Columns: 날짜=date, 시간=time, 검색그룹=search group, 검색아이템=search item,
        # 검색키워드=search keyword, 키워드=keyword.
        worksheet.write(0, 0, '날짜', self.header)
        worksheet.write(0, 1, '시간', self.header)
        worksheet.write(0, 2, '검색그룹', self.header)
        worksheet.write(0, 3, '검색아이템', self.header)
        worksheet.write(0, 4, '검색키워드', self.header)
        worksheet.write(0, 5, '키워드', self.header)

        # Data
        result = db.get_data_for_report_trend(params['project_seq'], today)
        for idx, row in enumerate(result, 1):
            worksheet.write(idx, 0, row[0], self.default)
            worksheet.write(idx, 1, row[1], self.default)
            worksheet.write(idx, 2, row[2], self.default)
            worksheet.write(idx, 3, row[3], self.default)
            worksheet.write(idx, 4, row[4], self.default)
            worksheet.write(idx, 5, row[5], self.default)




    def create_report(self, params):
        self.workbook = xlsxwriter.Workbook(os.path.join(self.file_path.replace("/", os.path.sep), self.file_name), options={'strings_to_urls': False, 'strings_to_numbers': True} )
        self.header = self.workbook.add_format(self.HEADER_FORMAT)
        self.default = self.workbook.add_format(self.DEFAULT_FORMAT)

        if self.mode == MODE_TOPICS:
            self.topics_list(params)
        elif self.mode == MODE_DOCUMENTS:
            self.create_documents_list(params, INDEX_DOCUMENTS)
        elif self.mode == MODE_EMOTIONS:
            self.emotions_per_causes(params)
        elif self.mode == MODE_TREND:
            self.make_trend_report(params)

        self.close_workbook()

    def close_workbook(self):
        self.workbook.close()
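A hypothetical end-to-end run, assuming the module defines the MODE_* constants and that params carries the keys the methods above read; the path and values are illustrative:

# Illustrative usage sketch, not part of the original module.
params = {
    "mode": MODE_TOPICS,
    "compare_yn": "N",
    "start_date": "2018-01-01T00:00:00",
    "end_date": "2018-12-31T23:59:59",
    "reg_dt": "2018-12-31T23:59:59",
    "project_seq": 176,
    "datasets": "2852",
    "dataset_name": "신한금융지주",
}
report = ReportKDICDocuments(params)
report.create_file_path("/tmp/dmap-excel")  # hypothetical output directory
report.create_report(params)
print("wrote", report.get_file_name())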
Example #11
def get_documents(params, size, index, scroll_id=None):
    queryObj = Query(params)

    if not scroll_id:
        es_uri = "/" + index + "/doc/_search?scroll=1d"
        request = {"size": size, "query": {"bool": {"must": []}}}
    else:
        es_uri = "/_search/scroll"
        request = {"scroll": "1d", "scroll_id": scroll_id}

    must = []
    # Include the project sequence.
    must.append(get_project_seq_query(params))

    # Target channels.
    if "channels" in params and params["channels"] and params["channels"] != 'all':
        must.append(get_channel_query(params))

    # Target period.
    if "start_date" in params and "end_date" in params:
        must.append(get_period_query(params))

    # Keywords included in the datasets, e.g. 신라면,삼양라면,안성탕면
    if "datasets" in params and params["datasets"]:
        if len(params["datasets"].split("^")) > 1:
            should = []
            for dataset in params["datasets"].split("^"):
                should.append(
                    queryObj.get_dataset_query(params['project_seq'], dataset))

            must.append({"bool": {"should": should}})
        else:
            must.append(
                queryObj.get_dataset_query(params['project_seq'],
                                           params["datasets"]))

    # elif params["type_cd"] == "CCT002":  # social monitoring - document statistics
    # elif params["type_cd"] == "CCT003":  # social monitoring - sentiment analysis
    # .....
    # Reshape the request as needed for each type code.

    if "query" in request:
        request["query"]["bool"]["must"] = must

    logger.debug("get_documents() ==> request : ")
    for k, v in request.items():
        logger.debug("\t{} : {}".format(k, v))

    es_conn = hc.HTTPConnection(es_ip, es_port, timeout=60)
    es_conn.request("POST", es_uri, json.dumps(request),
                    {"Content-type": "application/json"})
    result = json.loads(es_conn.getresponse().read())

    if 'hits' in result:
        logger.debug("[get_documents] result['hits']['total'] >>> %d" %
                     int(result['hits']['total']))
    else:
        logger.debug("[get_documents] result ::: " + str(result))

    return result
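Every initial search above opens a scroll context that lives for a day ("scroll=1d"); Example #10 calls es.clear_scroll to release contexts early. A minimal sketch of such a helper, assuming the same es_ip/es_port module configuration (the clear-scroll endpoint itself is standard Elasticsearch):

def clear_scroll(scroll_id):
    # Release a scroll context instead of letting it expire after a day.
    es_conn = hc.HTTPConnection(es_ip, es_port, timeout=60)
    es_conn.request("DELETE", "/_search/scroll",
                    json.dumps({"scroll_id": [scroll_id]}),
                    {"Content-type": "application/json"})
    return json.loads(es_conn.getresponse().read())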