예제 #1
0
def process_from_json(file_path, nlp_model):
    """
    Extract knowledge from one JSON record file and store it in Elasticsearch.

    The record is skipped when a document with the same title already exists
    in the index, and nothing is stored when the extraction result is empty.
    Every failure is logged and swallowed, so this function never raises.

    :param file_path: path of the JSON file holding a single record
    :param nlp_model: NLP model handed through to ``documentExtraction``
    :return: None
    """
    try:
        es = esConnector(url=ES_URL, index=ES_INDEX, doc_type=ES_DOC_TYPE)
        with open(file_path, 'rb') as f:
            record = json.load(f)
        document_model = documentExtraction(record, nlp_model)
        if es.check_info_exist(document_model.title):
            logger.info('doc %s exist in es, skip' % document_model.title)
            return
        logger.info('begin extract doc %s...' % document_model.title)
        document_info = document_model.extract_knowledge_from_record()
        # Truthiness covers both an empty dict and an unexpected None.
        if document_info:
            es.insert_single_info(document_info)
        else:
            logger.warn('extract document info failed ,skip es store')
    except Exception as e:
        logger.error(
            'document extraction process from json file failed for %s' %
            str(e))
예제 #2
0
 def rule_notice_attach(self, source_info):
     """
     Rule: link a notice to the documents stored for its attachment files.

     For every entry of ``_source.attachment_file`` the last file extension
     is stripped and the remaining name is looked up by title. Every other
     document whose title equals that name exactly becomes a link target.

     :param source_info: search hit; its ``_id`` and ``_source`` are read
     :return: ``(True, 'attach', links)`` when at least one link was found,
         otherwise ``(False, '', [])`` (also returned on any error)
     """
     try:
         link_info = list()
         info = source_info.get('_source', {})
         source_id = source_info.get('_id', '')
         # `or []` also covers a key that is present but holds None.
         for attachment_file in info.get('attachment_file') or []:
             # Strip only the last extension. A name without any dot is
             # kept whole rather than being sliced down to ''.
             search_name = attachment_file.rsplit('.', 1)[0]
             _id_list, _title_list = self.es_db.search_id_from_title(search_name)
             for _id, _title in zip(_id_list, _title_list):
                 if _id != source_id and _title == search_name:
                     link_info.append({
                         'source': source_id,
                         'target': _id,
                         'sourceType': 'id',
                         'targetType': 'id'
                     })
         if link_info:
             return True, 'attach', link_info
         return False, '', []
     except Exception as e:
         logger.error('searching attach relation attach failed for %s' % str(e))
         return False, '', []
예제 #3
0
 def rule_file_from(self, source_info):
     """
     Rule: link an attachment document to the notice it belongs to.

     The title stored under ``_source.parrent_file`` is looked up and every
     other document whose title equals it exactly becomes a link target.

     :param source_info: search hit; its ``_id`` and ``_source`` are read
     :return: ``(True, 'from', links)`` when at least one link was found,
         otherwise ``(False, '', [])`` (also returned on any error)
     """
     try:
         info = source_info.get('_source', {})
         source_id = source_info.get('_id', '')
         # `or ''` also covers a key that is present but holds None.
         search_name = info.get('parrent_file') or ''
         if not search_name:
             return False, '', []
         _id_list, _title_list = self.es_db.search_id_from_title(search_name)
         link_info = [
             {
                 'source': source_id,
                 'target': _id,
                 'sourceType': 'id',
                 'targetType': 'id'
             }
             for _id, _title in zip(_id_list, _title_list)
             if _id != source_id and _title == search_name
         ]
         if link_info:
             return True, 'from', link_info
         return False, '', []
     except Exception as e:
         logger.error('searching attach relation from failed for %s' % str(e))
         return False, '', []
예제 #4
0
    def rule_doc_entity(self, source_info):
        """
        Rule: link a document to every entity recorded for it.

        The ``entity_name``, ``entity_org`` and ``entity_loc`` lists of
        ``_source`` are concatenated in that order and each entry becomes
        one link whose target is the entity text itself.

        :param source_info: search hit; its ``_id`` and ``_source`` are read
        :return: ``(True, 'include', links)`` when at least one entity exists,
            otherwise ``(False, '', [])`` (also returned on any error)
        """
        try:
            info = source_info.get('_source', {})
            source_id = source_info.get('_id', '')
            entity_list = (info.get('entity_name', []) +
                           info.get('entity_org', []) +
                           info.get('entity_loc', []))
            link_info = [
                {
                    'source': source_id,
                    'target': seg,
                    'sourceType': 'id',
                    'targetType': 'seg'
                }
                for seg in entity_list
            ]
            if link_info:
                return True, 'include', link_info
            return False, '', []
        except Exception as e:
            logger.error('searching entity relation failed for %s' % str(e))
            return False, '', []
예제 #5
0
 def run(self):
     """
     Start the crawler by delegating to ``self._run()``.

     Any exception escaping ``_run`` is logged and the process is
     terminated with exit status 1.

     :return: None
     """
     logger.info('begin crawler..')
     try:
         self._run()
     except Exception as e:
         logger.error('star crawler failed for %s, stop crawler' % str(e))
         sys.exit(1)
예제 #6
0
    def rule_doc_quote(self, source_info):
        """
        Rule: link a document to the documents it quotes.

        Two lookups feed the result: documents found through the source's
        ``identify`` value, and documents found through each distinct file
        name listed in ``quote_title`` / ``quote_content``. The source
        document itself is never used as a target.

        :param source_info: search hit; its ``_id`` and ``_source`` are read
        :return: ``(True, 'quote', links)`` when at least one link was found,
            otherwise ``(False, '', [])`` (also returned on any error)
        """
        try:
            link_info = list()
            info = source_info.get('_source', {})
            source_id = source_info.get('_id', '')
            source_identify = info.get('identify', '')
            # Distinct quoted file names, first occurrence order preserved.
            source_quote_file = list()
            for item in info.get('quote_title', []) + info.get('quote_content', []):
                if item not in source_quote_file:
                    source_quote_file.append(item)
            # Lookup by identify; skipped when the document carries none.
            if source_identify:
                _id_list = self.es_db.search_id_list_from_identify(source_identify)
            else:
                _id_list = []
            for _id in _id_list:
                if _id != source_id:
                    link_info.append({
                        'source': source_id,
                        'target': _id,
                        'sourceType': 'id',
                        'targetType': 'id'
                    })
            # Lookup by each quoted file name.
            for quote_file in source_quote_file:
                for _id in self.es_db.search_id_list_from_filename(quote_file):
                    if _id != source_id:
                        link_info.append({
                            'source': source_id,
                            'target': _id,
                            'sourceType': 'id',
                            'targetType': 'id'
                        })
            if link_info:
                return True, 'quote', link_info
            return False, '', []
        except Exception as e:
            logger.error('searching attach relation quote failed for %s' % str(e))
            return False, '', []
예제 #7
0
 def _save_json(self, content, file_path):
     """
     Serialize ``content`` as indented JSON and write it to ``file_path``.

     Non-ASCII characters are written as-is (``ensure_ascii=False``) and the
     file is always UTF-8 encoded. Failures are logged, never raised.

     :param content: JSON-serializable object
     :param file_path: destination path; an existing file is overwritten
     :return: None
     """
     try:
         payload = json.dumps(content, ensure_ascii=False, indent=4)
         # The file is opened in binary mode, so hand it bytes explicitly
         # instead of relying on an implicit (ASCII) text-to-bytes
         # conversion, which fails for non-ASCII content.
         if not isinstance(payload, bytes):
             payload = payload.encode('utf-8')
         with open(file_path, 'wb') as f:
             f.write(payload)
     except Exception as e:
         logger.error('write json file failed for %s' % str(e))
예제 #8
0
 def _extract_keyword_from_doc(self):
     """
     Extract keywords from the document (title followed by content).

     :return: whatever ``TextSummary4Seg.top_n_seg(5)`` yields, or ``[]``
         when extraction fails (the failure is logged)
     """
     try:
         doc = self.title + self.content
         # The numeric arguments are passed through unchanged; their
         # meaning is defined by TextSummary4Seg, not by this method.
         key_word_model = TextSummary4Seg(doc, 6, 0.85, 700, self.model)
         return key_word_model.top_n_seg(5)
     except Exception as e:
         logger.error('extract key word from doc failed for %s' % str(e))
         return []
예제 #9
0
 def _load_json(self, file_path):
     """
     Load and parse a JSON file.

     :param file_path: path of the JSON file
     :return: the parsed object, or ``None`` when the file cannot be read
         or parsed (the failure is logged)
     """
     try:
         with open(file_path, 'rb') as f:
             return json.load(f)
     except Exception as e:
         logger.error('load json file failed for %s' % str(e))
         return None
예제 #10
0
 def _extract_abstract_from_doc(self, seperated=False):
     """
     Extract an abstract from the document (title followed by content).

     The abstract is extractive (sentences picked from the text), not
     generated.

     :param seperated: intended to request per-paragraph summarization;
         currently not read by this method
     :return: whatever ``TextSummary4Sentence.top_n_sentence(3)`` yields,
         or ``[]`` when extraction fails (the failure is logged)
     """
     try:
         doc = self.title + self.content
         # The numeric arguments are passed through unchanged; their
         # meaning is defined by TextSummary4Sentence, not by this method.
         key_sentence_model = TextSummary4Sentence(doc, 700, 0.85,
                                                   self.model)
         return key_sentence_model.top_n_sentence(3)
     except Exception as e:
         logger.error('extract abstract from doc failed for %s' % str(e))
         return []
예제 #11
0
    def _create_entity_node(self, result_info):
        """
        Create one graph node per distinct entity found in the search hits.

        For every hit the ``entity_name``, ``entity_org`` and ``entity_loc``
        lists of ``_source`` are processed in that order. An entity text
        that was already created during this call is skipped, whatever its
        type. Failures are logged and abort the remaining work.

        :param result_info: iterable of search hits, each with ``_source``
        :return: None
        """
        # (source field, node entity_type, log message) per entity kind.
        entity_kinds = (
            ('entity_name', 'name', 'create name entity node of %s'),
            ('entity_org', 'org', 'create organization entity node of %s'),
            ('entity_loc', 'loc', 'create location entity node of %s'),
        )
        try:
            created = set()
            for doc_info in result_info:
                info = doc_info['_source']
                for field, entity_type, message in entity_kinds:
                    for seg in info.get(field, []):
                        if seg in created:
                            continue
                        self.neo4j_db.create_entity_node({
                            'entity_type': entity_type,
                            'seg': seg
                        })
                        logger.info(message % seg)
                        created.add(seg)
        except Exception as e:
            logger.error('create entity node failed for %s' % str(e))
예제 #12
0
 def save_attachement_file(self, attachment_file_link,
                           attachment_file_name):
     """
     Download an attachment and store it under ``SAVING_PATH``.

     Failures are logged, never raised.

     :param attachment_file_link: URL passed to ``self.get``
     :param attachment_file_name: file name used below ``SAVING_PATH``
     :return: None
     """
     try:
         # Fetch first so no file is created when the download fails.
         response = self.get(attachment_file_link)
         # NOTE(review): attachment_file_name is joined unvalidated; if it
         # can come from crawled pages, confirm it cannot contain path
         # separators or '..'.
         target_path = os.path.join(SAVING_PATH, attachment_file_name)
         with open(target_path, 'wb') as f:
             logger.info('saving file %s' % attachment_file_name)
             f.write(response)
     except Exception as e:
         logger.error('saving attachment file failed for %s' % str(e))
예제 #13
0
 def _extract_public_org_2(self):
     """
     Extract the second-level publishing department for the central
     Ministry of Finance.

     The first match of ``self.link_pattern`` in the record's
     ``noticeLink`` is used as the key into ``CENTER_DEPARTMENT``.

     :return: the department name, or ``''`` when the link does not match,
         the key is unknown, or an error occurs (errors are logged)
     """
     try:
         link = self.record.get('noticeLink') or ''
         matches = re.findall(self.link_pattern, link)
         if not matches:
             # A link without the expected prefix is a normal "no
             # department" case, not an error.
             return ''
         return CENTER_DEPARTMENT.get(matches[0], '')
     except Exception as e:
         logger.error('extract public organization level 2 failed for %s' %
                      str(e))
         return ''
예제 #14
0
 def _extract_filename_from_title(self):
     """
     Collect the file names mentioned in the title.

     :return: distinct matches of ``self.file_pattern`` in ``self.title``
         in order of first appearance, or ``[]`` on error (logged)
     """
     try:
         filename_list = list()
         for name in re.findall(self.file_pattern, self.title):
             if name not in filename_list:
                 filename_list.append(name)
         return filename_list
     except Exception as e:
         logger.error('find file name from title failed for %s' % str(e))
         return []
예제 #15
0
 def _extract_filename_from_doc(self):
     """
     Collect the file names mentioned in the document body.

     :return: distinct matches of ``self.file_pattern`` in ``self.content``
         in order of first appearance, or ``[]`` on error (logged)
     """
     try:
         filename_list = list()
         for name in re.findall(self.file_pattern, self.content):
             if name not in filename_list:
                 filename_list.append(name)
         return filename_list
     except Exception as e:
         logger.error('find file name from doc failed for %s' % str(e))
         return []
예제 #16
0
 def _check_info_exist(self, title):
     """
     Tell whether a notice with this title is already stored.

     :param title: value matched against the ``noticeTitle`` field
     :return: True when at least one matching document exists; False when
         none exists or the lookup fails (failures are logged)
     """
     try:
         result = self.mongo.collection.find({'noticeTitle': title})
         try:
             # Only an empty result means "not stored"; any other failure
             # is handled (and logged) by the outer handler.
             result[0]
         except IndexError:
             return False
         return True
     except Exception as e:
         logger.error('check title failed for %s' % str(e))
         return False
예제 #17
0
 def build_graph_by_id(self, id):
     """
     Build the graph nodes for a single document.

     The document is fetched by id and its document node and entity nodes
     are created. Failures are logged, never raised.

     :param id: id of the document to fetch
     :return: None
     """
     try:
         doc_result = self.es_db.search_doc_by_id(id)
         doc_result_info = doc_result['hits']['hits']
         self._create_doc_node(doc_result_info)
         # Entity nodes are created once; a second identical call on the
         # same hits was removed.
         self._create_entity_node(doc_result_info)
         # NOTE(review): unlike initial(), no node relationships are
         # created here - confirm whether _create_node_relationship was
         # intended as the final step.
     except Exception as e:
         logger.error('build graph by id failed for %s' % str(e))
예제 #18
0
    def initial(self):
        """
        Main entry point for building the graph database.

        Documents are read from Elasticsearch (one request with
        ``size=10000``); document nodes, entity nodes and the relationships
        produced by ``self.rule_list`` are then created in that order.
        Failures are logged, never raised.

        :return: None
        """
        try:
            result = self.es_db.search_all(size=10000)
            result_info = result['hits']['hits']
            self._create_doc_node(result_info)
            self._create_entity_node(result_info)
            self._create_node_relationship(result_info, self.rule_list)
        except Exception as e:
            logger.error('build graph failed for %s' % str(e))
예제 #19
0
    def rule_doc_trans(self, source_info):
        """
        Rule: link a forwarding document to the documents it forwards.

        Applies only when the title contains the marker '转发'. The marker
        is removed from the title, the remainder is looked up by title, and
        every other document returned becomes a link target.

        :param source_info: search hit; its ``_id`` and ``_source`` are read
        :return: ``(True, 'trans', links)`` when at least one link was found,
            otherwise ``(False, '', [])`` (also returned on any error)
        """
        try:
            info = source_info.get('_source', {})
            source_id = source_info.get('_id', '')
            title = info.get('title', '')
            if '转发' not in title:
                return False, '', []
            search_title = title.replace('转发', '')
            _id_list, _ = self.es_db.search_id_from_title(search_title)
            link_info = [
                {
                    'source': source_id,
                    'target': _id,
                    'sourceType': 'id',
                    'targetType': 'id'
                }
                for _id in _id_list
                if _id != source_id
            ]
            if link_info:
                return True, 'trans', link_info
            return False, '', []
        except Exception as e:
            logger.error('searching doc trans relationship failed for %s' % str(e))
            return False, '', []
예제 #20
0
def preprocess_for_data():
    """
    Turn the raw sentence files into batched model input.

    Steps: load the train/dev/test sentence files, convert the tag schema,
    create (or load) the char/tag mapping file, convert sentences into
    id sequences and wrap them in batch managers.

    :return: ``(train_manager, dev_manager, test_manager)``; ``None`` when
        any step fails (the failure is logged)
    """
    try:
        train_sentences = load_sentence_file(FLAGS.train_file, FLAGS.zeros)
        dev_sentences = load_sentence_file(FLAGS.dev_file, FLAGS.zeros)
        test_sentences = load_sentence_file(FLAGS.test_file, FLAGS.zeros)
        # change tag schema in sentence
        # NOTE(review): dev_sentences is not converted while train and test
        # are - confirm this is intentional.
        trans_tag_schema(train_sentences, FLAGS.tag_schema)
        trans_tag_schema(test_sentences, FLAGS.tag_schema)
        # loading/writing mapping file
        if not os.path.isfile(FLAGS.map_file):
            logger.info('mapping file does not exist, create mapping file')
            if FLAGS.pre_emb:
                # This branch used to be an empty stub that left char_to_id
                # undefined; fail with a message that names the real cause.
                raise NotImplementedError(
                    'building the char mapping with FLAGS.pre_emb set '
                    'is not implemented')
            _, id_to_char, char_to_id = char_mapping(
                train_sentences, FLAGS.lower)
            _, id_to_tag, tag_to_id = tag_mapping(train_sentences)
            with open(FLAGS.map_file, 'wb') as f:
                # notice pickle file format with py2 and py3
                pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
        else:
            logger.info('loading mapping file')
            with open(FLAGS.map_file, 'rb') as f:
                # pickle.load executes whatever the file describes; the map
                # file must only ever be one written by this program.
                char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

        # prepare model data set
        # format data  --- [[char_list, char_id_list, seg_id_list, tags_id_list],[]]
        #     seg_id_list example: [X/XX/XXX/XXXX] -> [0 /1 3 /1 2 3 /1 2 2 3]
        train_data = prepare_model_data(train_sentences, char_to_id, tag_to_id,
                                        FLAGS.lower)
        dev_data = prepare_model_data(dev_sentences, char_to_id, tag_to_id,
                                      FLAGS.lower)
        test_data = prepare_model_data(test_sentences, char_to_id, tag_to_id,
                                       FLAGS.lower)
        train_manager = BatchManager(train_data, FLAGS.batch_size)
        dev_manager = BatchManager(dev_data, 100)
        test_manager = BatchManager(test_data, 100)
        return train_manager, dev_manager, test_manager

    except Exception as e:
        logger.error('pre-process for train string failed for %s' % str(e))
예제 #21
0
def train():
    """
    Train the NER model.

    Prepares the data, loads the mapping file, loads or builds the model
    config, then runs ``FLAGS.max_epoch`` epochs, logging the mean loss
    every ``FLAGS.steps_check`` steps and saving the model after each
    epoch. Failures are logged, never raised.

    :return: None
    """
    try:
        # limit GPU memory
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True

        managers = preprocess_for_data()
        if managers is None:
            # preprocess_for_data logs its own failure and returns None;
            # stop here instead of failing on the tuple unpacking below.
            logger.error('training model process failed for %s' %
                         'pre-processing returned no data')
            return
        train_manager, dev_manager, test_manager = managers
        logger.info('loading mapping file')
        with open(FLAGS.map_file, 'rb') as f:
            # The map file is written by preprocess_for_data; pickle.load
            # must never be pointed at an untrusted file.
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
        make_path(FLAGS)
        if os.path.isfile(FLAGS.config_file):
            config = load_config(FLAGS.config_file)
        else:
            config = build_config(char_to_id, tag_to_id)
            save_config(config, FLAGS.config_file)

        steps_per_epoch = train_manager.len_data
        with tf.Session(config=tf_config) as sess:
            model = initial_ner_model(sess, NER_MODEL, FLAGS.ckpt_path,
                                      load_word2vec, config, id_to_char)
            logger.info("start training NER model")
            loss = []
            # epoch iterate
            for _ in range(FLAGS.max_epoch):
                for batch in train_manager.iter_batch(shuffle=True):
                    step, batch_loss = model.run_step(sess, True, batch)
                    loss.append(batch_loss)
                    if step % FLAGS.steps_check == 0:
                        iteration = step // steps_per_epoch + 1
                        logger.info("iteration:{} step:{}/{}, "
                                    "NER loss:{:>9.6f}".format(
                                        iteration, step % steps_per_epoch,
                                        steps_per_epoch, np.mean(loss)))
                        # restart the running average after each report
                        loss = []
                save_model(sess, model, FLAGS.ckpt_path)
                # TODO: evaluate on dev_manager / test_manager after each
                # epoch and only keep the best checkpoint.
    except Exception as e:
        logger.error('training model process failed for %s' % str(e))
예제 #22
0
 def _extract_identify_from_doc(self):
     """
     Extract the document identifier numbers from the body text.

     The identifier format currently looks like: 财会〔2018〕20号

     :return: all matches of ``self.identify_pattern`` in ``self.content``;
         ``[]`` when there is none or when extraction fails (logged)
     """
     try:
         identify_list = re.findall(self.identify_pattern, self.content)
         if identify_list:
             return identify_list
         logger.warn('doc do not have file identify')
         return []
     except Exception as e:
         logger.error('extract file identify from doc failed for %s' %
                      str(e))
         # Always hand back a list so callers never receive None.
         return []
예제 #23
0
    def save_notice_info(self, notice_info):
        """
        Store a notice in MongoDB, keyed by its ``noticeTitle``.

        A notice whose title is not stored yet is inserted; otherwise the
        stored document is updated with ``$set``. Failures are logged,
        never raised.

        :param notice_info: dict with at least the key ``noticeTitle``
        :return: None
        """
        try:
            title = notice_info['noticeTitle']
            if not self._check_info_exist(title):
                logger.info('insert notice info...')
                self.mongo.collection.insert_one(notice_info)
            else:
                logger.info('update notice info...')
                self.mongo.collection.find_one_and_update(
                    {'noticeTitle': title},
                    {'$set': notice_info})
        except Exception as e:
            logger.error('mongoDB store notice info failed for %s' % str(e))
예제 #24
0
 def _create_node_relationship(self, result_info, rule_list):
     """
     Create the relationships between nodes by applying every rule to
     every search hit.

     Each rule is called with one hit and must return
     ``(is_match, relationship_type, relationship_info)``; matching
     results are handed to ``self.neo4j_db.create_relation``. Failures are
     logged and abort the remaining work.

     :param result_info: iterable of search hits
     :param rule_list: iterable of rule callables
     :return: None
     """
     try:
         for source_info in result_info:
             # begin match rules
             logger.info('extract file with id %s' % str(source_info.get('_id','')))
             for rule in rule_list:
                 is_match, relationship_type, relationship_info = rule(source_info)
                 if is_match:
                     logger.info('matching rule %s'%rule.__name__)
                     self.neo4j_db.create_relation(relationship_type, relationship_info)
     except Exception as e:
         logger.error('extract relationship between nodes failed for %s' % str(e))
예제 #25
0
 def _create_doc_node(self, result_info):
     """
     Create the document nodes of the graph.

     Each hit is analysed with ``self._doc_info_analysis``; hits whose
     analysis is empty are skipped, and a node is only created when
     ``self.neo4j_db.check_node_exist`` reports it missing. Failures are
     logged and abort the remaining work.

     :param result_info: iterable of search hits
     :return: None
     """
     try:
         for doc_info in result_info:
             doc_analysis = self._doc_info_analysis(doc_info)
             if not doc_analysis:
                 logger.warn('analysis doc info failed ,skip...')
                 continue
             if self.neo4j_db.check_node_exist(doc_analysis):
                 logger.info('node is existed, skip')
                 continue
             self.neo4j_db.create_doc_node(doc_analysis)
             logger.info('create node...')
     except Exception as e:
         logger.error('create doc node failed for %s' %str(e))
예제 #26
0
    def _analysis_table_data(self, table_info, zb_key, reg_key):
        """
        Flatten one table response into a list of data records, saving
        each record through ``self._save_data`` as it is built.

        Shape of ``table_info`` as read by this method::

            {
                'returndata': {
                    'datanodes': [
                        {
                            'code': 'zb.A080101_reg.110000_sj.2017',
                            'data': {'data': 5430.79, ...},
                            'wds': [{'wdcode': 'zb', 'valuecode': 'A080101'},
                                    {'wdcode': 'reg', 'valuecode': '110000'},
                                    {'wdcode': 'sj', 'valuecode': '2017'}]
                        },
                    ],
                    'wdnodes': [{'nodes': [indicator nodes]}, ...]
                }
            }

        ``wds[0]`` is read as the indicator code and ``wds[2]`` as the year.

        :param table_info: response dict shaped as above
        :param zb_key: stored as ``mainKey`` of every record
        :param reg_key: stored as ``location`` of every record
        :return: list of record dicts, or ``[]`` when analysis fails
            (the failure is logged)
        """
        try:
            returndata = table_info['returndata']
            # Indicator metadata keyed by indicator code.
            zb_node_dict = dict()
            for zb_node in returndata['wdnodes'][0]['nodes']:
                zb_node_dict[zb_node['code']] = {
                    'name': zb_node['cname'],
                    'des': zb_node.get('exp', '') + zb_node.get('memo', ''),
                    # NOTE(review): 'unit' is filled from 'exp', the same
                    # field that starts 'des' - confirm this is the
                    # intended source field.
                    'unit': zb_node.get('exp', '')
                }
            data_list = list()
            for _data_info in returndata['datanodes']:
                wds = _data_info['wds']
                zb_code = wds[0]['valuecode']
                zb_node = zb_node_dict[zb_code]
                data_info = {
                    'id': zb_code,
                    'mainKey': zb_key,
                    'location': reg_key,
                    'key': zb_node['name'],
                    'value': _data_info['data']['data'],
                    'year': wds[2]['valuecode'],
                    'unit': zb_node['unit']
                }
                self._save_data(data_info)
                data_list.append(data_info)
            return data_list

        except Exception as e:
            logger.error('analysis table data failed for %s' % str(e))
            # Always hand back a list so callers never receive None.
            return []
예제 #27
0
 def extract_knowledge_from_record(self):
     """
     Main extraction routine: build the knowledge document for this record.

     Combines fields copied from ``self.record`` with the results of the
     individual extractors (identifiers, quoted file names, entities,
     keywords, abstract). Keywords and abstract are skipped for
     spreadsheet types ('xls', 'xlsx').

     :return: the knowledge dict, or ``{}`` when extraction fails
         (the failure is logged)
     """
     try:
         entity_list = self._extract_entity_from_record()
         location = self.record.get('location', '')
         is_spreadsheet = self.type in ['xls', 'xlsx']
         knowledge_body = {
             # The time field must not be empty when stored in es, so a
             # default date is used when the crawled record has none
             # (missing, empty or null). It could be extracted from
             # content_attach instead; not done for now.  cc 2018-09-23
             'publish_time': self.record.get('publishTime') or '2018-08-01',
             'publish_location': location,
             'publish_org': LOCATION_ORG_DICT.get(location, ''),
             'publish_org_2': self._extract_public_org_2(),
             'title': self.title,
             'category': self.record.get('category', ''),
             'classify': '',
             'content': self.content,
             'identify': self.record.get('noticeIdentify', ''),
             'content_identify': self._extract_identify_from_doc(),
             'content_attach': self.record.get('noticeAttachment', ''),
             'quote_title': self._extract_filename_from_title(),
             'quote_content': self._extract_filename_from_doc(),
             'entity_loc': [item[0] for item in entity_list if item[1] == 'ns'],
             'entity_org': [item[0] for item in entity_list if item[1] == 'ni'],
             'entity_name': [item[0] for item in entity_list if item[1] == 'np'],
             # A record read from a file is an attachment itself: it has a
             # parent notice instead of an attachment list.
             'attachment_file': self.record.get('attachmentFileList', []) if not self.file_name else [],
             'parrent_file': self.record.get('noticeTitle', '') if self.file_name else '',
             'key_word': (
                 [] if is_spreadsheet
                 else [item[0] for item in self._extract_keyword_from_doc()]),
             'abstract': (
                 [] if is_spreadsheet
                 else [item[0] for item in self._extract_abstract_from_doc()]),
             'data_key': [],
             'data': {}
         }
         return knowledge_body
     except Exception as e:
         logger.error('extract knowledge from record failed for %s' %
                      str(e))
         return {}
예제 #28
0
    def _search_time_from_title(self, title):
        """
        Find a date written in parentheses inside a title.

        Every parenthesized group (ASCII or full-width parentheses) is
        tried in order; the first whose content parses as ``%Y-%m-%d`` is
        returned.

        :param title: title text to search
        :return: the date string, or ``''`` when none is found or an error
            occurs (both cases are logged)
        """
        try:
            # Literal parentheses of either width around a group without
            # nested parentheses, so each group is tested on its own.
            pattern = re.compile(u'[(\uff08]([^()\uff08\uff09]*)[)\uff09]')
            for candidate in re.findall(pattern, title):
                try:
                    datetime.datetime.strptime(candidate, '%Y-%m-%d')
                except ValueError:
                    # not a date, try the next parenthesized group
                    continue
                return candidate
            logger.warn('do not find time str..')
            return ''
        except Exception as e:
            logger.error('searching time string failed for %s' % str(e))
            return ''
예제 #29
0
 def _check_info_exist(self, id, year, reg):
     """
     Tell whether a data record is already stored.

     :param id: value matched against the ``id`` field
     :param year: value matched against the ``year`` field
     :param reg: value matched against the ``location`` field
     :return: True when at least one matching document exists; False when
         none exists or the lookup fails (failures are logged)
     """
     try:
         result = self.mongo.collection.find({
             'id': id,
             'year': year,
             'location': reg
         })
         try:
             # Only an empty result means "not stored"; any other failure
             # is handled (and logged) by the outer handler.
             result[0]
         except IndexError:
             return False
         return True
     except Exception as e:
         logger.error('check info failed for %s' % str(e))
         return False
예제 #30
0
 def __get_content_title(self):
     """
     Set ``self.title``, ``self.content`` and ``self.type`` once, because
     they are needed by several extractors.

     Without ``self.file_name`` the values come from the record
     (``noticeTitle`` / ``noticeContent``, type 'notice'). With a file
     name, the title is the name without its last extension and the
     content is read from the converted file below ``FILE_PATH``
     ('.csv' for xls/xlsx, '.txt' otherwise); when that file is missing,
     content and type are set to ''. On any error all three attributes are
     set to '' and the failure is logged.

     :return: None
     """
     try:
         if not self.file_name:
             self.title = self.record.get('noticeTitle', '')
             self.content = self.record.get('noticeContent', '')
             self.type = 'notice'
             return
         # Remove the blanks from the file name; the file conversion step
         # removed them as well.
         self.file_name = self.__pre_deal_with_str(self.file_name)
         # Split on the LAST dot only, so 'a.b.pdf' yields title 'a.b' and
         # the title agrees with the converted file's name. A name without
         # any dot has no extension and is kept whole.
         stem, dot, file_type = self.file_name.rpartition('.')
         if not dot:
             stem, file_type = self.file_name, ''
         self.title = stem
         if file_type in ['xls', 'xlsx']:
             trans_file_type = 'csv'
         else:
             trans_file_type = 'txt'
         trans_file_name = stem + '.' + trans_file_type
         trans_file_path = os.path.join(FILE_PATH, trans_file_name)
         if os.path.isfile(trans_file_path):
             logger.info('reading file %s' % trans_file_name)
             with open(trans_file_path, 'r') as f:
                 self.content = f.read()
             self.type = file_type
         else:
             logger.warn('file %s do not have trans file' %
                         trans_file_name)
             self.content = ''
             self.type = ''
     except Exception as e:
         logger.error('get content and title string failed for %s' % str(e))
         self.title = ''
         self.content = ''
         self.type = ''