示例#1
0
def process_from_json(file_path, nlp_model):
    """
    Read one crawled record from a json file, run knowledge extraction
    on it and store the result in elasticsearch.
    :param file_path: path of the json file holding a single record
    :param nlp_model: NLP model handed through to documentExtraction
    :return: None (errors are logged, never raised)
    """
    try:
        es = esConnector(url=ES_URL, index=ES_INDEX, doc_type=ES_DOC_TYPE)
        with open(file_path, 'rb') as f:
            record = json.loads(f.read())
        document_model = documentExtraction(record, nlp_model)
        # Skip documents whose title is already indexed.
        if es.check_info_exist(document_model.title):
            logger.info('doc %s exist in es, skip' % document_model.title)
            return
        logger.info('begin extract doc %s...' % document_model.title)
        document_info = document_model.extract_knowledge_from_record()
        # An empty dict signals extraction failure; do not store it.
        if document_info:
            es.insert_single_info(document_info)
        else:
            logger.warn('extract document info failed ,skip es store')
    except Exception as e:
        logger.error(
            'document extraction process from json file failed for %s' %
            str(e))
示例#2
0
 def _run(self):
     """
     Main crawler entry point: walk every listing page of the current
     category, collect notice links, fetch each notice's detail info
     and persist it.
     :return: None
     """
     self.notice_link_list = list()
     self.title_base_url = self.base_url + '/' + self.category
     for page in range(0, self.page):
         # First listing page is index.htm; later ones are index_<n>.htm.
         if page == 0:
             url = self.title_base_url + '/' + 'index.htm'
         else:
             url = self.title_base_url + '/' + 'index_%d.htm' % page
         logger.info('searching gov finance notice link on page %d' %
                     (page + 1))
         response = self.get(url)
         page_soup = BeautifulSoup(response, 'html5lib')
         # debug 2018-9-5
         # The 'caijingshidian' (financial viewpoint) column uses a
         # different td class name ('xiaxu') than the other columns ('ZITI').
         if self.category == 'caijingshidian':
             notice_tag_list = page_soup.find_all('td',
                                                  attrs={'class': 'xiaxu'})
         else:
             notice_tag_list = page_soup.find_all('td',
                                                  attrs={'class': 'ZITI'})
         for notice_tag in notice_tag_list:
             title = notice_tag.attrs.get('title')
             time_str = self._search_time_from_title(title)
             logger.info('notice publish time is %s' % time_str)
             if title:
                 pass
             else:
                 # No title attribute: nothing useful to fetch for this tag.
                 logger.warning('searching notice title failed')
                 continue
             notice_info_tag = notice_tag.find('a')
             link = notice_info_tag.attrs.get('href')
             if link:
                 logger.info('searching notice info for %s' % title)
                 self.notice_link_list.append(link)
                 link_info, is_exist = self.search_link_info(link)
                 if link_info and not is_exist:
                     link_info['publishTime'] = time_str
                     self.save_notice_info(link_info)
                 elif is_exist:
                     # NOTE(review): already-existing links are still
                     # re-saved here -- confirm save_notice_info upserts,
                     # otherwise this duplicates records.  Also assumes
                     # link_info is not None when is_exist is True.
                     link_info['publishTime'] = time_str
                     self.save_notice_info(link_info)
                     logger.info('link info is existed')
                     continue
                 else:
                     logger.warn('searching link info failed')
             else:
                 logger.warning('get notice link failed for %s' % title)
             # Sleep 5 seconds between notices to throttle requests.
             logger.info('crawler sleeping for 5s...')
             time.sleep(5)
         # Sleep 2 seconds between listing pages.
         logger.info('crawler sleeping for 2s...')
         time.sleep(2)
示例#3
0
 def _extract_identify_from_doc(self):
     """
     从文件正文中提取出文件标示编号,
     目前的文件编号格式为:财会〔2018〕20号
     :return:
     """
     try:
         doc = self.content
         identify_list = re.findall(self.identify_pattern, doc)
         if len(identify_list):
             return identify_list
         else:
             logger.warn('doc do not have file identify')
             return []
     except Exception, e:
         logger.error('extract file identify from doc failed for %s' %
                      str(e))
示例#4
0
 def _create_doc_node(self, result_info):
     """
     Create one graph node per document found in the ES query result,
     skipping documents whose node already exists.
     :param result_info: iterable of document dicts returned by ES
     :return: None (errors are logged, never raised)
     """
     try:
         for doc_info in result_info:
             analysis = self._doc_info_analysis(doc_info)
             if not analysis:
                 # Analysis produced nothing usable for this document.
                 logger.warn('analysis doc info failed ,skip...')
                 continue
             if self.neo4j_db.check_node_exist(analysis):
                 logger.info('node is existed, skip')
             else:
                 self.neo4j_db.create_doc_node(analysis)
                 logger.info('create node...')
     except Exception as e:
         logger.error('create doc node failed for %s' % str(e))
示例#5
0
    def _search_time_from_title(self, title):
        """

        :param title:
        :return:
        """
        try:
            pattern = re.compile('('.decode('utf-8') + u'(.*)' +
                                 ')'.decode('utf-8'))
            for str in re.findall(pattern, title):
                try:
                    datetime.datetime.strptime(str, '%Y-%m-%d')
                    return str
                except:
                    continue
            logger.warn('do not find time str..')
            return ''
        except Exception, e:
            logger.error('searching time string failed for %s' % str(e))
            return ''
示例#6
0
 def __get_content_title(self):
     """
     Populate self.title, self.content and self.type in one place,
     since they are needed several times downstream.  Reads either the
     raw notice record (when there is no file_name) or a previously
     converted attachment file on disk.
     :return: None; on any exception all three attributes reset to ''
     """
     try:
         if not self.file_name:
             # No attachment: take title/content straight from the record.
             self.title = self.record.get('noticeTitle', '')
             self.content = self.record.get('noticeContent', '')
             self.type = 'notice'
         else:
             # Strip spaces from the file name; the file-format
             # conversion step removed spaces as well.
             self.file_name = self.__pre_deal_with_str(self.file_name)
             # Title is the name without its extension (if it has one).
             if len(self.file_name.split('.')) >= 2:
                 self.title = self.file_name.split('.')[-2]
             else:
                 self.title = self.file_name
             file_type = self.file_name.split('.')[-1]
             # Converted artefacts: spreadsheets -> csv, everything else -> txt.
             if file_type in ['xls', 'xlsx']:
                 trans_file_type = 'csv'
             else:
                 trans_file_type = 'txt'
             # Replace the original suffix with the converted one.
             trans_file_name = self.file_name[:-1 *
                                              (len(file_type) +
                                               1)] + '.' + trans_file_type
             if os.path.isfile(os.path.join(FILE_PATH, trans_file_name)):
                 logger.info('reading file %s' % trans_file_name)
                 with open(os.path.join(FILE_PATH, trans_file_name),
                           'r') as f:
                     self.content = f.read()
                     # NOTE(review): type records the ORIGINAL extension,
                     # not the converted one -- confirm that is intended.
                     self.type = file_type
             else:
                 logger.warn('file %s do not have trans file' %
                             trans_file_name)
                 self.content = ''
                 self.type = ''
     except Exception, e:
         logger.error('get content and title string failed for %s' % str(e))
         self.title = ''
         self.content = ''
         self.type = ''
示例#7
0
    def search_link_info(self, notice_link):
        """
        Fetch the full text of a notice from its link and download its
        attachment files.
        :param notice_link: absolute URL or site-relative link ('./...')
        :return: (info_dict, is_exist) -- info_dict holds the parsed
            notice fields or None on failure; is_exist is currently
            always False (the duplicate check below is commented out)
        """
        try:
            # Relative links are rebuilt against the category base URL.
            if notice_link.startswith('http'):
                pass
            else:
                notice_link = self.title_base_url + notice_link[1:]
            # generate for attachment file url
            # (link minus its last path segment, kept for building
            # attachment URLs below)
            notice_baseurl = notice_link[0:(len(notice_link.split('/')[-1]) +
                                            1) * -1]

            response = self.get(notice_link)
            notice_soup = BeautifulSoup(response, 'html5lib')
            title_tag = notice_soup.find('td', attrs={'class': 'font_biao1'})
            main_tag = notice_soup.find('div', attrs={'class': 'TRS_Editor'})
            attachment_tag = notice_soup.find('span', attrs={'id': 'appendix'})
            title = self._get_tag_string(title_tag).strip()
            # debug 2018-9-12
            # file name without space
            title = title.replace(' ', '')
            # if self._check_info_exist(title):
            #     return None, True
            logger.info('notice title is %s' % title)
            # notice doc search
            doc_tag_list = main_tag.find_all('p')
            doc_content = ''
            doc_identify = ''
            doc_attachment = ''
            # The source site uses one <p> tag per line, so a newline is
            # appended to the content for every tag.
            # 2018-9-4 cc
            for doc_tag in doc_tag_list:
                # Centered paragraphs carry the document identifier.
                if doc_tag.attrs.get('align') == 'center':
                    doc_content += self._get_tag_string(doc_tag) + '\n'
                    doc_identify += self._get_tag_string(doc_tag).strip()
                # elif doc_tag.attrs.get('align') == 'justify':
                #     doc_content += self._get_tag_string(doc_tag)
                elif doc_tag.attrs.get('align') == 'right':
                    # Right-aligned paragraphs list the attachments.
                    doc_content += self._get_tag_string(doc_tag) + '\n'
                    doc_attachment += self._get_tag_string(
                        doc_tag).strip() + '\n'
                else:
                    doc_content += self._get_tag_string(doc_tag) + '\n'

            # attachment file search
            attachment_file_list = attachment_tag.find_all('a')
            attachment_file_name_list = list()
            attachment_file_link_list = list()
            # Some attachment names lack a suffix, so the suffix has to
            # be recovered from the link itself.
            # 2018-9-6 debug
            for attachment_file_tag in attachment_file_list:
                attachment_file_name = ''
                _attachment_link = attachment_file_tag.attrs.get('href')
                try:
                    file_type = _attachment_link.split('.')[-1]
                except:
                    logger.warn('search file type failed')
                    file_type = ''
                _attachment_file_name = self._get_tag_string(
                    attachment_file_tag).strip()
                # Strip a leading label before a full-width or ascii
                # colon, e.g. u'附件:name.pdf' -> u'name.pdf'.
                if ':' in _attachment_file_name:
                    attachment_file_name = _attachment_file_name.split(':')[-1]
                elif ':' in _attachment_file_name:
                    attachment_file_name = _attachment_file_name.split(':')[-1]
                else:
                    attachment_file_name = _attachment_file_name
                # add file attachment type
                try:
                    attachment_file_type = attachment_file_name.split('.')[-1]
                except:
                    attachment_file_type = ''
                # Append the suffix from the link when the name's own
                # suffix is not a recognized file type.
                if attachment_file_type not in ['pdf', 'doc', 'docx', 'xls', 'xlsx', 'zip']\
                        and file_type != '':
                    attachment_file_name = attachment_file_name + '.' + file_type

                # _attachment_link format './P020180828399303596996.pdf'
                attachment_file_link = notice_baseurl + _attachment_link[1:]
                # saving file
                self.save_attachement_file(attachment_file_link,
                                           attachment_file_name)
                attachment_file_name_list.append(attachment_file_name)
                attachment_file_link_list.append(attachment_file_link)
            return {
                'noticeTitle': title,
                'noticeContent': doc_content,
                'noticeIdentify': doc_identify,
                'noticeAttachment': doc_attachment,
                'noticeLink': notice_link,
                'attachmentFileList': attachment_file_name_list,
                'attachmentLinkList': attachment_file_link_list,
                'category': self.category,
                'filePath': SAVING_PATH,
                'location': self.location
            }, False
        except Exception, e:
            logger.error('searching link info failed for %s' % str(e))
            return None, False
示例#8
0
def _extract_and_store(es, record, nlp_model, file_name=None):
    """
    Run knowledge extraction for one record (optionally for one of its
    attachment files) and store the result in elasticsearch, skipping
    documents whose title is already indexed.
    :param es: esConnector instance
    :param record: mongo record dict
    :param nlp_model: NLP model handed through to documentExtraction
    :param file_name: attachment file name, or None for the notice body
    :return: None
    """
    if file_name is None:
        document_model = documentExtraction(record, nlp_model)
    else:
        document_model = documentExtraction(record, nlp_model,
                                            file_name=file_name)
    if es.check_info_exist(document_model.title):
        logger.info('doc %s exist in es, skip' % document_model.title)
        return
    logger.info('begin extract doc %s...' % document_model.title)
    document_info = document_model.extract_knowledge_from_record()
    # An empty dict signals extraction failure; do not store it.
    if document_info:
        es.insert_single_info(document_info)
    else:
        logger.warn('extract document info failed ,skip es store')


def main_process(nlp_model):
    """
    Main entry: iterate every record in mongo, extract knowledge from
    the notice body and from each of its attachment files, and store
    everything in elasticsearch.
    :param nlp_model: NLP model handed through to documentExtraction
    :return: None (errors are logged, never raised)
    """
    try:
        mongo = dbConnector(MONGODB_SERVER, MONGODB_PORT, MONGODB_DB,
                            MONGODB_COLLECTION)
        es = esConnector(url=ES_URL, index=ES_INDEX, doc_type=ES_DOC_TYPE)
        # no_cursor_timeout: extracting one record can exceed mongo's
        # default cursor timeout, so keep the cursor alive and close it
        # explicitly in the finally clause (previously it leaked when an
        # exception was raised mid-iteration).
        cursor = mongo.collection.find(no_cursor_timeout=True)
        try:
            # The notice body is processed for every record; the
            # per-attachment loop simply does nothing for records
            # without attachments (this collapses the three duplicated
            # extract-and-store copies of the original code).
            for record in cursor:
                _extract_and_store(es, record, nlp_model)
                for file_name in record.get('attachmentFileList', []):
                    _extract_and_store(es, record, nlp_model,
                                       file_name=file_name)
        finally:
            cursor.close()
    except Exception as e:
        logger.error('document extract failed for %s' % str(e))
示例#9
0
def trans_file_from_db(trans_path):
    """
    Convert every attachment file recorded in mongo to plain text (txt)
    or csv under trans_path, using unoconv and pdftotext; print the
    names of files that could not be converted.
    :param trans_path: directory the converted files are moved to
    :return: None (errors are logged, never raised)
    """
    try:
        mongo_db = dbConnector(MONGODB_SERVER, MONGODB_PORT, MONGODB_DB,
                               MONGODB_COLLECTION)
        # All shell commands run from the attachment saving directory.
        # NOTE(review): file names are interpolated directly into shell
        # command strings; a crafted file name could inject commands.
        # Consider subprocess.call([...]) with an argument list instead.
        path_command = 'cd %s &&' % SAVING_PATH
        failed_list = list()
        for record in mongo_db.collection.find():
            for file_name in record.get('attachmentFileList', []):
                logger.info('begin to trans file %s' % file_name)
                # File names with spaces break the shell commands below,
                # so rename the file on disk first (the mongoDB
                # attachmentFileList entry keeps the original name).
                if ' ' in file_name:
                    logger.info('file name has space string, trans file name')
                    os.system(path_command + "mv '%s' %s" %
                              (file_name, file_name.replace(' ', '')))
                    file_name = file_name.replace(' ', '')
                # Name without the trailing '.<ext>'.
                base_name = file_name[:(len(file_name.split('.')[-1]) + 1) *
                                      -1]
                if file_name.endswith(('.doc', '.docx')):
                    os.system(path_command + 'unoconv -f txt %s' % file_name)
                    os.system(path_command + 'mv %s.txt %s' %
                              (base_name, trans_path))
                elif file_name.endswith(('.xls', '.xlsx')):
                    os.system(path_command + 'unoconv -f csv %s' % file_name)
                    os.system(path_command + 'mv %s.csv %s' %
                              (base_name, trans_path))
                elif file_name.endswith('.pdf'):
                    os.system(path_command +
                              'pdftotext -nopgbrk %s %s/%s.txt' %
                              (file_name, trans_path, base_name))
                elif file_name.endswith(('.rar', '.zip', '.gz')):
                    # Archive formats are not handled yet; the list of
                    # recognized archive suffixes is incomplete.
                    pass
                else:
                    logger.warn(
                        'file type is not recognized; file name is %s' %
                        file_name)
                    # Unknown extension: try each converter in turn.
                    logger.info('trying trans file with unconv txt')
                    result = os.system(path_command +
                                       'unoconv -f txt %s' % file_name)
                    if not result:
                        os.system(path_command + 'mv %s.txt %s' %
                                  (base_name, trans_path))
                        continue
                    logger.warn('trans file with unconv txt failed')
                    logger.info('trying trans file with unconv csv')
                    result = os.system(path_command +
                                       'unoconv -f csv %s' % file_name)
                    if not result:
                        os.system(path_command + 'mv %s.csv %s' %
                                  (base_name, trans_path))
                        continue
                    logger.warn('trans file with unconv csv failed')
                    logger.info('trying trans file with pdftotext')
                    # Bug fix: the flag was misspelled '-pgnobrk' here;
                    # the real pdftotext option (used for .pdf above)
                    # is '-nopgbrk'.
                    result = os.system(path_command +
                                       'pdftotext -nopgbrk %s %s/%s.txt' %
                                       (file_name, trans_path, base_name))
                    if not result:
                        continue
                    logger.warn('trans file with pdftotext failed')
                    failed_list.append(file_name)
        # Report files that could not be converted by any tool.
        for file_name in failed_list:
            print(file_name)
    except Exception as e:
        logger.error('file trans failed for %s' % str(e))
示例#10
0
 # 测试
 thunlp_model = thulac.thulac(seg_only=False, model_path=THUNLP_MODEL_PATH, \
                              user_dict=THUNLP_USER_DIC_PATH)
 mongo = dbConnector(MONGODB_SERVER, MONGODB_PORT, MONGODB_DB,
                     MONGODB_COLLECTION)
 es = esConnector(url='localhost:9200', index='test', doc_type='finace')
 for record in mongo.collection.find().batch_size(1):
     if not len(record.get('attachmentFileList', [])):
         document_model = documentExtraction(record, thunlp_model)
         if not es.check_info_exist(document_model.title):
             logger.info('begin extract doc %s...' % document_model.title)
             document_info = document_model.extract_knowledge_from_record()
             if len(document_info.keys()):
                 es.insert_single_info(document_info)
             else:
                 logger.warn('extract document info failed ,skip es store')
         else:
             logger.info('doc %s exist in es, skip' % document_model.title)
     else:
         document_model = documentExtraction(record, thunlp_model)
         if not es.check_info_exist(document_model.title):
             logger.info('begin extract doc %s...' % document_model.title)
             document_info = document_model.extract_knowledge_from_record()
             if len(document_info.keys()):
                 es.insert_single_info(document_info)
             else:
                 logger.warn('extract document info failed ,skip es store')
         else:
             logger.info('doc %s exist in es, skip' % document_model.title)
         for file_name in record.get('attachmentFileList', []):
             document_model = documentExtraction(record,