Пример #1
0
    def create_documents_list(self, params, index):
        """Export every document returned by the search to a UTF-8 text file.

        The first page is fetched with ``es.get_documents``; when the reported
        total exceeds one page, the remaining pages are fetched through the
        scroll id. For each hit, one CRLF-terminated ``<FIELD>value`` line is
        written per entry of ``FIELDS_DOCUMENTS``. The ``doc_id`` entry is
        written as ``<DOCID>`` and taken from the hit's ``_id``; a field that
        is missing from ``_source`` is written as the string ``"null"``.

        The output file is ``self.file_path``/``self.file_name`` and is only
        created when the search reports at least one hit.

        Args:
            params: Search parameters, passed unchanged to ``es.get_documents``.
            index: Index name, passed unchanged to ``es.get_documents``.
        """
        size = 10000  # page size

        def write_hits(out, hits):
            """Write the configured fields of every hit in ``hits`` to ``out``."""
            for hit in hits:
                for field in FIELDS_DOCUMENTS:
                    if field == 'doc_id':
                        # The document id lives on the hit itself, not in _source.
                        out.write("<DOCID>%s" % hit["_id"])
                    else:
                        source = hit["_source"]
                        val = source[field] if field in source else "null"
                        out.write("<%s>%s" % (field, val))
                    out.write("\r\n")

        # Start the search
        result = es.get_documents(params, size, index, "")

        if "hits" in result and result["hits"]["total"] > 0:
            total = result["hits"]["total"]
            out_path = os.path.join(self.file_path, self.file_name)

            # The context manager closes the file even if a page fails mid-export.
            with codecs.open(out_path, 'w', 'utf-8') as scdfile:
                write_hits(scdfile, result["hits"]["hits"])

                # When there are more results than fit in one page, keep
                # paging through them with the scroll id.
                if total > size:
                    scroll_id = result["_scroll_id"]
                    for _ in range(1, math.ceil(total / size)):
                        scrolled_result = es.get_documents(
                            params, size, index, scroll_id=scroll_id)
                        # Only the most recently returned scroll id should be
                        # used; fall back to the previous one if the wrapper
                        # does not return it.
                        scroll_id = scrolled_result.get("_scroll_id") or scroll_id
                        write_hits(scdfile, scrolled_result["hits"]["hits"])

                    # Release the scroll once the last page has been processed.
                    if scroll_id:
                        es.clear_scroll(scroll_id)
Пример #2
0
    def create_documents_list(self, params, index):
        """Write every document matching ``params`` to worksheets, one per page.

        The hit count is fetched first with ``es.get_count``. The documents
        are then pulled one page at a time with ``es.get_list``, feeding the
        ``_scroll_id`` of each response into the next request. Each page gets
        its own worksheet: a header row from ``self.DOCUMENTS_FIELDS_KOREAN``
        followed by one row per hit with the columns of
        ``self.DOCUMENTS_FIELDS``. A ``parent.child`` field name is looked up
        as a nested value. Missing values are written as the string
        ``"null"``. Nothing is written when the count is not positive.

        Args:
            params: Search parameters. ``start_date`` and ``end_date`` are
                read for the sheet name; the whole object is handed to
                ``self.queryObj.get_documents_query``.
            index: Index name used to build the ``_count`` and ``_search``
                paths.
        """
        page_size = 10000  # paging size

        # Start the search: only the number of hits is requested here.
        count_path = "/" + index + "/doc/_count"
        total_hits = es.get_count(count_path,
                                  self.queryObj.get_documents_query(params))

        query_for_log = self.queryObj.get_documents_query(params)
        self.logger.debug(
            "[ReportStatistics][create_documents_list] %s" % query_for_log)

        if not total_hits > 0:
            return

        # Results are always fetched through scroll paging; every page goes
        # to a fresh worksheet because the volume is expected to be large.
        search_path = "/" + index + "/doc/_search"
        page_count = math.ceil(total_hits / page_size)
        last_page = page_count - 1
        scroll_id = None

        for page in range(page_count):  # 0, 1, 2, ....
            date_span = "~".join(
                [params['start_date'][0:10], params['end_date'][0:10]])
            worksheet = self.workbook.add_worksheet(
                "원문(%s)(%d)" % (date_span, page + 1))

            # The query is rebuilt for every request, exactly as the count
            # request above does.
            scrolled_result = es.get_list(
                search_path, self.queryObj.get_documents_query(params),
                page_size, scroll_id)
            scroll_id = scrolled_result['_scroll_id']

            # Excel header row
            for header_col, title in enumerate(self.DOCUMENTS_FIELDS_KOREAN):
                worksheet.write(0, header_col, title, self.header)

            # Data rows start right below the header.
            hits = scrolled_result["hits"]["hits"]
            for sheet_row, hit in enumerate(hits, start=1):
                for col, field in enumerate(self.DOCUMENTS_FIELDS):
                    if "." in field:
                        parent, child = field.split(".")
                        source = hit["_source"]
                        if parent in source and child in source[parent]:
                            cell = source[parent][child]
                        else:
                            cell = "null"
                    else:
                        source = hit["_source"]
                        cell = source[field] if field in source else "null"
                    worksheet.write(sheet_row, col, cell, self.default)

            # Clear the scroll once the last page has been processed.
            if page == last_page:
                if ('_scroll_id' in scrolled_result
                        and scrolled_result["_scroll_id"]):
                    es.clear_scroll(scroll_id)
Пример #3
0
    def create_documents_list(self, params, index):
        """Write every document matching ``params`` to worksheets, one per page.

        The column set depends on the index name: names starting with
        ``'documents'`` use ``self.DOCUMENTS_FIELDS`` and
        ``self.DOCUMENTS_FIELDS_KOREAN``; any other index uses
        ``self.EMOTIONS_FIELDS`` and ``self.EMOTIONS_FIELDS_KOREAN``.

        The hit count comes from ``es.get_documents_count``. Pages are then
        fetched with ``es.get_documents``, feeding the ``_scroll_id`` of each
        response into the next request. Each page gets its own worksheet
        with a header row followed by one row per hit. A ``parent.child``
        field name is looked up as a nested value. Missing values are
        written as the string ``"null"``. Nothing is written when the count
        is not positive.

        Args:
            params: Search parameters. ``start_date`` and ``end_date`` are
                read for the sheet name; the whole object is handed to the
                ``es`` helpers.
            index: Index name; its prefix selects the column set.
        """
        page_size = 10000  # paging size

        if index.startswith('documents'):
            header_titles = self.DOCUMENTS_FIELDS_KOREAN
        else:
            header_titles = self.EMOTIONS_FIELDS_KOREAN
        if index.startswith('documents'):
            field_names = self.DOCUMENTS_FIELDS
        else:
            field_names = self.EMOTIONS_FIELDS

        # Start the search: only the number of hits is requested here.
        total_hits = es.get_documents_count(params, index)
        if not total_hits > 0:
            return

        # Results are always fetched through scroll paging; every page goes
        # to a fresh worksheet because the volume is expected to be large.
        page_count = math.ceil(total_hits / page_size)
        scroll_id = None

        for page in range(page_count):  # 0, 1, 2, ....
            date_span = "~".join(
                [params['start_date'][0:10], params['end_date'][0:10]])
            worksheet = self.workbook.add_worksheet(
                "원문(%s)(%d)" % (date_span, page + 1))

            scrolled_result = es.get_documents(params, page_size, index,
                                               scroll_id)
            scroll_id = scrolled_result['_scroll_id']

            # Excel header row
            for header_col, title in enumerate(header_titles):
                worksheet.write(0, header_col, title, self.header)

            # Data rows start right below the header.
            for hit_no, hit in enumerate(scrolled_result["hits"]["hits"]):
                sheet_row = hit_no + 1
                for col, field in enumerate(field_names):
                    if "." not in field:
                        source = hit["_source"]
                        cell = source[field] if field in source else "null"
                    else:
                        parent, child = field.split(".")
                        source = hit["_source"]
                        found = parent in source and child in source[parent]
                        cell = source[parent][child] if found else "null"
                    worksheet.write(sheet_row, col, cell, self.default)

            # Clear the scroll once the last page has been processed.
            is_last_page = page == page_count - 1
            if is_last_page and '_scroll_id' in scrolled_result \
                    and scrolled_result["_scroll_id"]:
                es.clear_scroll(scroll_id)
Пример #4
0
 def create_documents_list(self, params, index):
     """Export the documents matching ``params`` to worksheets, one per page.

     ``es.get_count`` supplies the number of hits. ``es.get_list`` is then
     called once per page, feeding the ``_scroll_id`` of each response into
     the next request. Each page is written to its own worksheet: a header
     row from ``self.DOCUMENTS_FIELDS_KOREAN``, then one row per hit with
     the columns of ``self.DOCUMENTS_FIELDS``. Missing values are written
     as the string ``"null"``. Nothing is written when the count is not
     positive.

     Args:
         params: Search parameters. ``start_date`` and ``end_date`` are
             read for the sheet name; the whole object is handed to
             ``self.queryObj.get_documents_query``.
         index: Index name used to build the ``_count`` and ``_search``
             paths.
     """
     def cell_value(hit, field):
         """Return the value of ``field`` for ``hit``, or ``"null"`` if absent."""
         if "." in field:
             # "parent.child" names address a value nested one level deep.
             outer, inner = field.split(".")
             source = hit["_source"]
             if outer not in source:
                 return "null"
             nested = source[outer]
             return nested[inner] if inner in nested else "null"
         source = hit["_source"]
         return source[field] if field in source else "null"

     size = 10000  # paging size

     # Start the search: only the number of hits is requested here.
     totalCount = es.get_count(
         "/" + index + "/doc/_count",
         self.queryObj.get_documents_query(params),
     )

     self.logger.debug(
         "[ReportStatistics][create_documents_list] %s"
         % self.queryObj.get_documents_query(params)
     )

     if totalCount > 0:
         scroll_id = None
         pages = math.ceil(totalCount / size)

         # Results are always fetched through scroll paging; every page
         # goes to a fresh worksheet because the volume is expected to be
         # large.
         for page in range(pages):  # 0, 1, 2, ....
             period = "~".join([params['start_date'][0:10], params['end_date'][0:10]])
             sheet_title = "원문(%s)(%d)" % (period, page + 1)
             worksheet = self.workbook.add_worksheet(sheet_title)

             scrolled_result = es.get_list(
                 "/" + index + "/doc/_search",
                 self.queryObj.get_documents_query(params),
                 size,
                 scroll_id,
             )
             scroll_id = scrolled_result['_scroll_id']

             # Excel header row
             for colidx, caption in enumerate(self.DOCUMENTS_FIELDS_KOREAN):
                 worksheet.write(0, colidx, caption, self.header)

             # Data rows start right below the header.
             for row, hit in enumerate(scrolled_result["hits"]["hits"]):
                 for col, field in enumerate(self.DOCUMENTS_FIELDS):
                     worksheet.write(row + 1, col, cell_value(hit, field), self.default)

             # Clear the scroll once the last page has been processed.
             if page != pages - 1:
                 continue
             if '_scroll_id' in scrolled_result and scrolled_result["_scroll_id"]:
                 es.clear_scroll(scroll_id)
Пример #5
0
    def create_documents_list(self, params, index):
        """Export the documents returned by the search to worksheets.

        The first page from ``es.get_documents`` goes to a sheet suffixed
        ``(0)``. When the reported total exceeds one page, the remaining
        pages are fetched through the scroll id and written to sheets
        suffixed ``(1)``, ``(2)``, and so on. Every sheet starts with the
        same header row, and its data rows begin directly below it.

        Writer, title and content have the characters ``+ = - /`` removed
        before they are written. A document whose cleaned title or content
        matches one of the project's filter patterns is skipped. The
        patterns come from the comma-separated ``regex_filter_keywords``
        value returned by ``db.get_project_filter_keywords``.

        Args:
            params: Search parameters. ``project_seq``, ``start_date`` and
                ``end_date`` are read here; the whole object is also passed
                to ``es.get_documents``.
            index: Index name, passed unchanged to ``es.get_documents``.
        """
        size = 10000  # page size
        header_titles = ('ID', '게시일', '작성자', 'URL', '제목', '내용',
                         '채널1', '채널2', '채널3', '정확도')  # last column is the score

        # NOTE(review): presumably stripped so the spreadsheet does not treat
        # the text as a formula; confirm with the report owner.
        unwanted_chars = re.compile(r"[\+=\-/]")

        # Fetch the patterns that exclude a document when they occur in its
        # title or content.
        project_filter_keywords = db.get_project_filter_keywords(params['project_seq'])

        exclude_patterns = None
        if project_filter_keywords and 'regex_filter_keywords' in project_filter_keywords:
            raw_keywords = (project_filter_keywords['regex_filter_keywords'] or "").strip()
            # An empty keyword list would compile to a pattern that matches
            # every document, so it is treated as "no filter".
            if raw_keywords:
                exclude_patterns = re.compile("(?i)(" + raw_keywords.replace(",", "|") + ")")

        def add_sheet(page_no):
            """Create the worksheet for ``page_no`` and write its header row."""
            date_span = "~".join([params['start_date'][0:10], params['end_date'][0:10]])
            sheet = self.workbook.add_worksheet("원문(%s)(%d)" % (date_span, page_no))
            for col, title in enumerate(header_titles):
                sheet.write(0, col, title, self.header)
            return sheet

        def write_hits(sheet, hits):
            """Write the non-excluded hits to ``sheet``, starting below the header."""
            row = 0  # restarted for every sheet so rows directly follow the header
            for hit in hits:
                source = hit["_source"]
                doc_title = unwanted_chars.sub("", str(source["doc_title"]))
                doc_content = unwanted_chars.sub("", str(source["doc_content"]))
                values = (
                    hit["_id"],
                    source["doc_datetime"],
                    unwanted_chars.sub("", str(source["doc_writer"])),
                    source["doc_url"],
                    doc_title,
                    doc_content,
                    source["depth1_nm"],
                    source["depth2_nm"],
                    source["depth3_nm"],
                    hit["_score"],
                )

                # 2018.04.05: skip rows whose title or content contains one of
                # the excluded patterns.
                if exclude_patterns is not None and (
                        exclude_patterns.search(doc_title) is not None
                        or exclude_patterns.search(doc_content) is not None):
                    continue

                row += 1
                for col, value in enumerate(values):
                    sheet.write(row, col, value, self.default)

        # Start the search
        result = es.get_documents(params, size, index, "")

        # Sheet for the first page
        worksheet = add_sheet(0)

        logger.info("<%s> Total Documents : %d" % (self.dataset_names, result["hits"]["total"]))

        # Excel body
        if "hits" in result and result["hits"]["total"] > 0:
            total = result["hits"]["total"]
            write_hits(worksheet, result["hits"]["hits"])

            # When there are more results than fit in one page, keep paging
            # with the scroll id; each page gets a new sheet because the
            # volume is expected to be large.
            if total > size:
                scroll_id = result["_scroll_id"]
                for page in range(1, math.ceil(total / size)):
                    worksheet = add_sheet(page)

                    scrolled_result = es.get_documents(params, size, index, scroll_id=scroll_id)
                    # Only the most recently returned scroll id should be
                    # used; fall back to the previous one if the wrapper does
                    # not return it.
                    scroll_id = scrolled_result.get("_scroll_id") or scroll_id
                    write_hits(worksheet, scrolled_result["hits"]["hits"])

                # Release the scroll once the last page has been processed.
                if scroll_id:
                    es.clear_scroll(scroll_id)