Example #1
def process_from_json(file_path, nlp_model):
    """
    Read data from a JSON file
    :param file_path: json file path
    :param nlp_model:
    :return:
    """
    try:
        # mongo = dbConnector(MONGODB_SERVER, MONGODB_PORT, MONGODB_DB, MONGODB_COLLECTION)
        es = esConnector(url=ES_URL, index=ES_INDEX, doc_type=ES_DOC_TYPE)
        with open(file_path, 'rb') as f:
            string = f.read()
            record = json.loads(string)
        document_model = documentExtraction(record, nlp_model)
        if not es.check_info_exist(document_model.title):
            logger.info('begin extract doc %s...' % document_model.title)
            document_info = document_model.extract_knowledge_from_record()
            if len(document_info.keys()):
                es.insert_single_info(document_info)
            else:
                logger.warn('extract document info failed, skip es store')
        else:
            logger.info('doc %s exist in es, skip' % document_model.title)
    except Exception as e:
        logger.error(
            'document extraction process from json file failed for %s' %
            str(e))
Example #2
 def _doc_info_analysis(self, doc_info):
     """
     Analyze doc_info and extract the doc's attributes
     :param doc_info:
     :return: node_info
     node_type: node type, notice or file
     id: the id in es
     title: the file name
     """
     try:
         info = doc_info['_source']
         # type of the stored doc
         if len(info.get('parrent_file', [])):
             node_type = 'file'
         else:
             node_type = 'doc'
         node_id = doc_info['_id']
         title = info.get('title', '')
         location = info.get('publish_location', '')
         return {
             'node_name': 'notice',
             'node_type': node_type,
             'id': node_id,
             'title': title,
             'location': location
         }
     except Exception as e:
         logger.info('analysis doc info failed for %s' % str(e))
         return None
Example #3
    def attention_layer_op(self):
        """
        define attention layer
        :return:
        """
        with tf.name_scope('attention'), tf.variable_scope('attention'):
            attention_w = tf.Variable(tf.truncated_normal([2 * self.hidden_size, self.attention_size], stddev=0.1),
                                      name='attention_w')
            attention_b = tf.Variable(tf.constant(0.1, shape=[self.attention_size]), name='attention_b')
            u_list = []
            for t in list(range(self.sequence_length)):
                u_t = tf.tanh(tf.matmul(self.hidden_outputs[t], attention_w) + attention_b)
                u_list.append(u_t)
            u_w = tf.Variable(tf.truncated_normal([self.attention_size, 1], stddev=0.1), name='attention_uw')
            attn_z = []
            for t in list(range(self.sequence_length)):
                z_t = tf.matmul(u_list[t], u_w)
                attn_z.append(z_t)
            # transform to batch_size * sequence_length
            attn_zconcat = tf.concat(attn_z, axis=1)
            self.alpha = tf.nn.softmax(attn_zconcat)
            # transform to sequence_length * batch_size * 1 , same rank as outputs
            alpha_trans = tf.reshape(tf.transpose(self.alpha, [1, 0]), [self.sequence_length, -1, 1])
            self.attention_output = tf.reduce_sum(self.hidden_outputs * alpha_trans, 0)
            logger.info('attention layer output shape is %s' % self.attention_output.shape)

        with tf.name_scope("output"):
            # outputs shape: (sequence_length, batch_size, 2*rnn_size)
            W = tf.Variable(tf.truncated_normal([2 * self.hidden_size, self.num_classes], stddev=0.1), name='W')
            b = tf.Variable(tf.zeros([self.num_classes]), name='b')
            self.l2_loss += tf.nn.l2_loss(W)
            self.l2_loss += tf.nn.l2_loss(b)
            self.logits = tf.nn.xw_plus_b(self.attention_output, W, b, name="logits")
            self.prob = tf.nn.softmax(self.logits, name='prob')
            self.predictions = tf.argmax(self.prob, 1, name="predictions")
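The two per-timestep loops above compute u_t = tanh(H_t W + b) and the unnormalized score z_t = u_t u_w one step at a time. For reference, a vectorized sketch of the same scoring, assuming hidden_outputs is a [sequence_length, batch_size, 2*hidden_size] tensor (as the final reduce_sum suggests); this is an illustration, not the author's code.

import tensorflow as tf

def attention_scores(hidden_outputs, attention_w, attention_b, u_w):
    # hidden_outputs: [T, B, 2H]; attention_w: [2H, A]; attention_b: [A]; u_w: [A, 1]
    u = tf.tanh(tf.tensordot(hidden_outputs, attention_w, axes=1) + attention_b)  # [T, B, A]
    z = tf.squeeze(tf.tensordot(u, u_w, axes=1), axis=-1)                         # [T, B]
    return tf.nn.softmax(tf.transpose(z))                                         # [B, T], same shape as self.alpha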
Example #4
 def run(self):
     """
     """
     logger.info('begin crawler..')
     try:
         self._run()
     except Exception as e:
         logger.error('start crawler failed for %s, stopping crawler' % str(e))
         sys.exit(1)
Example #5
 def cut_doc(self):
     """
     Segment the document text into words
     :return:
     """
     logger.info('document text is not segmented, using thunlp for word segmentation')
     # self.thunlp_model = thulac.thulac(seg_only=False, model_path=THUNLP_MODEL_PATH, \
     #                          user_dict=THUNLP_USER_DIC_PATH)
     doc_seg = self.thunlp_model.cut(self.doc)
     # keep the raw segmentation result for combining adjacent key words into phrases
     self.origin_doc_seg = doc_seg
     doc_seg_clear = self._clear_seg_list(doc_seg)
     return doc_seg_clear
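The _clear_seg_list helper is not shown in this listing. A minimal standalone sketch of what it might do, assuming thulac's cut() returns [word, pos] pairs and that cleaning means dropping empty tokens and punctuation (tagged 'w' by thulac); the exact filtering rules are assumptions.

def clear_seg_list(doc_seg):
    # doc_seg: list of [word, pos] pairs as returned by thulac's cut()
    cleared = []
    for word, pos in doc_seg:
        word = word.strip()
        if not word:
            continue
        if pos == 'w':  # thulac tags punctuation as 'w'
            continue
        cleared.append([word, pos])
    return cleared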
Example #6
 def lstm():
     # define forward cell
     with tf.name_scope('fw_rnn'), tf.variable_scope('fw_rnn'):
         logger.info(tf.get_variable_scope().name)
         lstm_fw_cell_list = [tf.contrib.rnn.LSTMCell(self.hidden_size) for _ in list(range(self.num_layer))]
         lstm_fw_cell_m = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.MultiRNNCell(lstm_fw_cell_list),
                                                        output_keep_prob=self.dropout_keep_prob)
     # define backward cell
     with tf.name_scope('bw_rnn'), tf.variable_scope('bw_rnn'):
         logger.info(tf.get_variable_scope().name)
         lstm_bw_cell_list = [tf.contrib.rnn.LSTMCell(self.hidden_size) for _ in list(range(self.num_layer))]
         lstm_bw_cell_m = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.MultiRNNCell(lstm_bw_cell_list),
                                                        output_keep_prob=self.dropout_keep_prob)
         return lstm_fw_cell_m, lstm_bw_cell_m
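For context, the forward and backward cells returned here are typically fed into a bidirectional RNN whose per-timestep outputs become the hidden_outputs consumed by the attention layer. A standalone sketch of that wiring with the TF 1.x API; the placeholder shape and the embedding_dim / hidden_size values are illustrative assumptions.

import tensorflow as tf

embedding_dim, hidden_size = 128, 100  # illustrative values
inputs = tf.placeholder(tf.float32, [None, None, embedding_dim])  # [batch, time, embedding_dim]
fw_cell = tf.contrib.rnn.LSTMCell(hidden_size)
bw_cell = tf.contrib.rnn.LSTMCell(hidden_size)
(out_fw, out_bw), _ = tf.nn.bidirectional_dynamic_rnn(fw_cell, bw_cell, inputs, dtype=tf.float32)
hidden_outputs = tf.concat([out_fw, out_bw], axis=2)  # [batch, time, 2*hidden_size]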
Example #7
    def _add_summary(self, sess, vocab_processor):
        """
        TensorBoard visualization
        :param sess:
        :return:
        """
        # Keep track of gradient values and sparsity (optional)
        grad_summaries = []
        for g, v in self.grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.summary.histogram(
                    "{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.summary.scalar(
                    "{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.summary.merge(grad_summaries)

        # Output directory for models and summaries
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(
            os.path.join(os.path.curdir, "runs", timestamp))
        logger.info("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.summary.scalar("loss", self.loss)
        acc_summary = tf.summary.scalar("accuracy", self.accuracy)

        # Train Summaries
        self.train_summary_op = tf.summary.merge(
            [loss_summary, acc_summary, grad_summaries_merged])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        self.train_summary_writer = tf.summary.FileWriter(
            train_summary_dir, sess.graph)

        # Dev summaries
        self.dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        self.dev_summary_writer = tf.summary.FileWriter(
            dev_summary_dir, sess.graph)

        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        self.checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        # Write vocabulary
        vocab_processor.save(os.path.join(out_dir, "vocab"))
Example #8
 def train_step(self, sess, x_batch, y_batch):
     """
     A single training step
     """
     feed_dict = {
         self.input_x: x_batch,
         self.input_y: y_batch,
         self.dropout_keep_prob: self.dropout
     }
     # scores, predictions = sess.run([self.scores, self.predictions], feed_dict)
     _, step, summaries, loss, accuracy = sess.run(
         [self.train_op, self.global_step, self.train_summary_op, self.loss, self.accuracy], feed_dict)
     time_str = datetime.datetime.now().isoformat()
     logger.info("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
     self.train_summary_writer.add_summary(summaries, step)
Example #9
 def dev_step(self, sess, x_batch, y_batch):
     """
     Evaluates model on a dev set
     """
     feed_dict = {
         self.input_x: x_batch,
         self.input_y: y_batch,
         self.dropout_keep_prob: 1.0
     }
     step, summaries, loss, accuracy = sess.run(
         [self.global_step, self.dev_summary_op, self.loss, self.accuracy],
         feed_dict)
     time_str = datetime.datetime.now().isoformat()
     logger.info("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
     self.dev_summary_writer.add_summary(summaries, step)
Example #10
 def save_attachement_file(self, attachment_file_link,
                           attachment_file_name):
     """
     Save the attachment file
     :param attachment_file_link:
     :return:
     """
     try:
         response = self.get(attachment_file_link)
         with open(os.path.join(SAVING_PATH, attachment_file_name),
                   'wb') as f:
             logger.info('saving file %s' % attachment_file_name)
             f.write(response)
     except Exception as e:
         logger.error('saving attachment file failed for %s' % str(e))
Example #11
def preprocess_for_data():
    """
    Pre-processing pipeline that converts text into model input

    :return:
    """
    try:
        train_sentences = load_sentence_file(FLAGS.train_file, FLAGS.zeros)
        dev_sentences = load_sentence_file(FLAGS.dev_file, FLAGS.zeros)
        test_sentences = load_sentence_file(FLAGS.test_file, FLAGS.zeros)
        # change tag schema in sentence
        trans_tag_schema(train_sentences, FLAGS.tag_schema)
        trans_tag_schema(test_sentences, FLAGS.tag_schema)
        # loading/writing mapping file
        if not os.path.isfile(FLAGS.map_file):
            logger.info('mapping file does not exist, create mapping file')
            if FLAGS.pre_emb:
                pass
            else:
                char_count_dic, id_to_char, char_to_id = char_mapping(
                    train_sentences, FLAGS.lower)
            tag_count_dic, id_to_tag, tag_to_id = tag_mapping(train_sentences)
            with open(FLAGS.map_file, 'wb') as f:
                # notice pickle file format with py2 and py3
                pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
        else:
            logger.info('loading mapping file')
            with open(FLAGS.map_file, 'rb') as f:
                char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

        # prepare model data set
        # format data  --- [[char_list, char_id_list, seg_id_list, tags_id_list],[]]
        #     seg_id_list example: [X/XX/XXX/XXXX] -> [0 /1 3 /1 2 3 /1 2 2 3]
        train_data = prepare_model_data(train_sentences, char_to_id, tag_to_id,
                                        FLAGS.lower)
        dev_data = prepare_model_data(dev_sentences, char_to_id, tag_to_id,
                                      FLAGS.lower)
        test_data = prepare_model_data(test_sentences, char_to_id, tag_to_id,
                                       FLAGS.lower)
        train_manager = BatchManager(train_data, FLAGS.batch_size)
        dev_manager = BatchManager(dev_data, 100)
        test_manager = BatchManager(test_data, 100)
        return train_manager, dev_manager, test_manager

    except Exception as e:
        logger.error('pre-processing for training data failed for %s' % str(e))
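The seg_id format noted in the comments inside preprocess_for_data (a single-character word maps to 0; a longer word maps to 1 for its first character, 2 for middle characters, 3 for the last) can be sketched as a small helper; the function name is an assumption, and the real project presumably derives these ids inside prepare_model_data.

def seg_feature_ids(words):
    # words: segmented word list, e.g. ['X', 'XX', 'XXX', 'XXXX'] -> [0, 1, 3, 1, 2, 3, 1, 2, 2, 3]
    ids = []
    for word in words:
        if len(word) == 1:
            ids.append(0)
        else:
            ids.extend([1] + [2] * (len(word) - 2) + [3])
    return ids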
Example #12
    def save_notice_info(self, notice_info):
        """

        :param notice_info:
        :return:
        """
        try:
            if not self._check_info_exist(notice_info['noticeTitle']):
                logger.info('insert notice info...')
                self.mongo.collection.insert_one(notice_info)
            else:
                logger.info('update notice info...')
                self.mongo.collection.find_one_and_update(
                    {'noticeTitle': notice_info['noticeTitle']},
                    {'$set': notice_info})

        except Exception as e:
            logger.error('mongoDB store notice info failed for %s' % str(e))
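The _check_info_exist method called above is not included in this listing. A plausible stand-in against pymongo, written as a standalone helper and assuming existence is decided by the notice title (hypothetical, not the project's actual code).

def check_notice_exist(collection, notice_title):
    # hypothetical stand-in for self._check_info_exist: True if a notice
    # with this title is already stored in the given pymongo collection
    return collection.find_one({'noticeTitle': notice_title}) is not None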
Example #13
 def _create_doc_node(self, result_info):
     """
     Create document nodes in the graph
     :param result_info: query result from es
     :return:
     """
     try:
         for doc_info in result_info:
             doc_analysis = self._doc_info_analysis(doc_info)
             if doc_analysis:
                 if not self.neo4j_db.check_node_exist(doc_analysis):
                     self.neo4j_db.create_doc_node(doc_analysis)
                     logger.info('create node...')
                 else:
                     logger.info('node already exists, skip')
             else:
                 logger.warn('analysis of doc info failed, skip...')
     except Exception as e:
         logger.error('create doc node failed for %s' % str(e))
Example #14
 def _create_node_relationship(self, result_info, rule_list):
     """
     Create relationships between nodes according to the rules
     :param result_info:
     :return:
     """
     try:
         for source_info in result_info:
             # begin match rules
             logger.info('extract file with id %s' % str(source_info.get('_id','')))
             for rule in rule_list:
                 is_match, relationship_type, relationship_info = rule(source_info)
                 if is_match:
                     logger.info('matching rule %s'%rule.__name__)
                     self.neo4j_db.create_relation(relationship_type, relationship_info)
                 else:
                     pass
     except Exception as e:
         logger.error('extract relationship between nodes failed for %s' % str(e))
Example #15
 def __get_content_title(self):
     """
     Fetch content and title together, since they are needed several times
     :return:
     """
     try:
         if not self.file_name:
             self.title = self.record.get('noticeTitle', '')
             self.content = self.record.get('noticeContent', '')
             self.type = 'notice'
         else:
             # strip spaces from the file name; spaces were removed during file format conversion
             self.file_name = self.__pre_deal_with_str(self.file_name)
             if len(self.file_name.split('.')) >= 2:
                 self.title = self.file_name.split('.')[-2]
             else:
                 self.title = self.file_name
             file_type = self.file_name.split('.')[-1]
             if file_type in ['xls', 'xlsx']:
                 trans_file_type = 'csv'
             else:
                 trans_file_type = 'txt'
             trans_file_name = self.file_name[:-1 *
                                              (len(file_type) +
                                               1)] + '.' + trans_file_type
             if os.path.isfile(os.path.join(FILE_PATH, trans_file_name)):
                 logger.info('reading file %s' % trans_file_name)
                 with open(os.path.join(FILE_PATH, trans_file_name),
                           'r') as f:
                     self.content = f.read()
                     self.type = file_type
             else:
                 logger.warn('file %s does not have a trans file' %
                             trans_file_name)
                 self.content = ''
                 self.type = ''
     except Exception as e:
         logger.error('get content and title string failed for %s' % str(e))
         self.title = ''
         self.content = ''
         self.type = ''
Example #16
 def _save_data(self, info):
     """
     Save data info
     :param info:
     :return:
     """
     try:
         if not self._check_info_exist(info['id'], info['year'],
                                       info['location']):
             logger.info('insert notice info...')
             self.mongo.collection.insert_one(info)
         else:
             logger.info('update notice info...')
             self.mongo.collection.find_one_and_update(
                 {
                     'id': info['id'],
                     'year': info['year'],
                     'location': info['location']
                 }, {'$set': info})
     except Exception as e:
         logger.error('mongoDB save data info failed for %s' % str(e))
Example #17
 def train(self, vocab_processor, x_train, y_train, x_dev, y_dev, pre_embeddings=None, checkpoint_file=None):
     """
     model train process
     :return:
     """
     saver = tf.train.Saver(tf.global_variables(), max_to_keep=self.num_checkpoints)
     # GPU assign
     tf_config = tf.ConfigProto()
     tf_config.gpu_options.allow_growth = False
     with tf.Session(config=tf_config) as sess:
         sess.run(self.init_op)
         self._add_summary(sess, vocab_processor)
         # using pre trained embeddings
         if IS_PRETRAINED_EMBEDDING:
             sess.run(self._word_embeddings.assign(pre_embeddings))
             del pre_embeddings
         # restore model
         if IS_MIDDLE_MODEL:
             saver.restore(sess, checkpoint_file.model_checkpoint_path)
         # Generate batches
         batches = batch_iter(list(zip(x_train, y_train)), self.batch_size, self.num_epochs)
         # Training loop. For each batch...
         for batch in batches:
             x_batch, y_batch = zip(*batch)
             self.train_step(sess, x_batch, y_batch)
             current_step = tf.train.global_step(sess, self.global_step)
             if current_step % self.evaluate_every == 0:
                 logger.info("Evaluation:")
                 self.dev_step(sess, x_dev, y_dev)
                 logger.info("")
             if current_step % self.checkpoint_every == 0:
                 path = saver.save(sess, self.checkpoint_prefix, global_step=current_step)
                 logger.info("Saved model checkpoint to {}\n".format(path))
Example #18
    def train(self, vocab_processor, x_train, y_train, x_dev, y_dev):
        """
        model train process
        :return:
        """
        saver = tf.train.Saver(tf.global_variables(),
                               max_to_keep=self.num_checkpoints)

        with tf.Session() as sess:
            sess.run(self.init_op)
            self._add_summary(sess, vocab_processor)
            # Generate batches
            batches = batch_iter(list(zip(x_train, y_train)), self.batch_size,
                                 self.num_epochs)
            # Training loop. For each batch...
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                self.train_step(sess, x_batch, y_batch)
                current_step = tf.train.global_step(sess, self.global_step)
                if current_step % self.evaluate_every == 0:
                    logger.info("Evaluation:")
                    self.dev_step(sess, x_dev, y_dev)
                    logger.info("")
                if current_step % self.checkpoint_every == 0:
                    path = saver.save(sess,
                                      self.checkpoint_prefix,
                                      global_step=current_step)
                    logger.info("Saved model checkpoint to {}\n".format(path))
Example #19
 def run(self):
     """
     Main function
     :return:
     """
     try:
         wds_info = [{"wdcode": "reg", "valuecode": ""}]
         dfwds_info = [{
             "wdcode": "zb",
             "valuecode": ""
         }, {
             "wdcode": "sj",
             "valuecode": "LAST20"
         }]
         for needed_key in self.needed_info:
             dfwds_info[0]['valuecode'] = needed_key['id']
             zb_key = needed_key['name']
             needed_key_info_list = list()
             for _reg in self.reg_info:
                 wds_info[0]['valuecode'] = _reg['code']
                 reg_key = _reg['name']
                 logger.info('analysis %s data info' % reg_key)
                 self.post_data['wds'] = json.dumps(wds_info)
                 self.post_data['dfwds'] = json.dumps(dfwds_info)
                 response = requests.post(self.data_url,
                                          data=self.post_data)
                 result = json.loads(response.content)
                 data_list = self._analysis_table_data(
                     result, zb_key, reg_key)
                 data_reg_info = {
                     'location': _reg['name'],
                     'key': zb_key,
                     'data': data_list
                 }
                 needed_key_info_list.append(data_reg_info)
             self._save_json(needed_key_info_list,
                             '../data/%s.json' % zb_key)
     except Exception as e:
         logger.error('crawler main process failed for %s' % str(e))
Example #20
    def _create_entity_node(self, result_info):
        """
        Create entity nodes in the graph
        :param result_info:
        :return:
        """
        try:
            entity_cache_list = list()
            for doc_info in result_info:
                info = doc_info['_source']
                entity_name = info.get('entity_name', [])
                entity_org = info.get('entity_org', [])
                entity_loc = info.get('entity_loc', [])
                for seg in entity_name:
                    if seg not in entity_cache_list:
                        entity_info = {
                            'entity_type': 'name',
                            'seg': seg
                        }
                        self.neo4j_db.create_entity_node(entity_info)
                        logger.info('create name entity node of %s' % seg)
                        entity_cache_list.append(seg)
                    else:
                        continue
                for seg in entity_org:
                    if seg not in entity_cache_list:
                        entity_info = {
                            'entity_type': 'org',
                            'seg': seg
                        }
                        self.neo4j_db.create_entity_node(entity_info)
                        logger.info('create organization entity node of %s' % seg)
                        entity_cache_list.append(seg)
                    else:
                        continue

                for seg in entity_loc:
                    if seg not in entity_cache_list:
                        entity_info = {
                            'entity_type': 'loc',
                            'seg': seg
                        }
                        self.neo4j_db.create_entity_node(entity_info)
                        logger.info('create location entity node of %s' % seg)
                        entity_cache_list.append(seg)
                    else:
                        continue

        except Exception as e:
            logger.error('create entity node failed for %s' % str(e))
Example #21
def train():
    """
    model train process
    :return:
    """
    logger.info('Loading train data...')
    # Load train data
    x_text, y = load_data_and_labels(TRAIN_DATA_PATH_POS, TRAIN_DATA_PATH_NEG)
    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))
    vocab_size = len(vocab_processor.vocabulary_)
    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]
    del x, y, x_shuffled, y_shuffled
    logger.info("Vocabulary Size: {:d}".format(len(
        vocab_processor.vocabulary_)))
    logger.info("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

    model = TextCNN(sequence_length=x_train.shape[1],
                    num_classes=y_train.shape[1],
                    dropout=dropout,
                    vocab_size=vocab_size,
                    embedding_dim=embedding_dim,
                    filter_sizes=filter_sizes,
                    num_filters=num_filters,
                    l2_reg_lambda=l2_reg_lambda,
                    optimizer=optimizer,
                    lr=lr,
                    grad_clip=grad_clip,
                    num_checkpoints=num_checkpoints,
                    batch_size=batch_size,
                    num_epochs=num_epochs,
                    evaluate_every=evaluate_every,
                    checkpoint_every=checkpoint_every)

    model.build_graph()
    model.train(vocab_processor, x_train, y_train, x_dev, y_dev)
Example #22
def train():
    """
    Training module
    :return:
    """
    try:
        # limit GPU memory
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True

        train_manager, dev_manager, test_manager = preprocess_for_data()
        logger.info('loading mapping file')
        with open(FLAGS.map_file, 'rb') as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
        make_path(FLAGS)
        if os.path.isfile(FLAGS.config_file):
            config = load_config(FLAGS.config_file)
        else:
            config = build_config(char_to_id, tag_to_id)
            save_config(config, FLAGS.config_file)
        #
        steps_per_epoch = train_manager.len_data
        with tf.Session(config=tf_config) as sess:
            model = initial_ner_model(sess, NER_MODEL, FLAGS.ckpt_path,
                                      load_word2vec, config, id_to_char)
            logger.info("start training NER model")
            loss = []
            # epoch iterate
            for i in range(FLAGS.max_epoch):
                for batch in train_manager.iter_batch(shuffle=True):
                    step, batch_loss = model.run_step(sess, True, batch)
                    loss.append(batch_loss)
                    if step % FLAGS.steps_check == 0:
                        iteration = step // steps_per_epoch + 1
                        logger.info("iteration:{} step:{}/{}, "
                                    "NER loss:{:>9.6f}".format(
                                        iteration, step % steps_per_epoch,
                                        steps_per_epoch, np.mean(loss)))
                        loss = []
                save_model(sess, model, FLAGS.ckpt_path)
                # evaluate result for stop epoch iter
                # best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
                # if best:
                #     save_model(sess, model, FLAGS.ckpt_path, logger)
                # evaluate(sess, model, "test", test_manager, id_to_tag, logger)
    except Exception as e:
        logger.error('training model process failed for %s' % str(e))
Example #23
    def search_link_info(self, notice_link):
        """
        Fetch the full text via the notice link and download the attachments
        :param notice_link:
        :return:
        """
        try:
            if not notice_link.startswith('http'):
                notice_link = self.title_base_url + notice_link[1:]
            # generate for attachment file url
            notice_baseurl = notice_link[0:(len(notice_link.split('/')[-1]) +
                                            1) * -1]

            response = self.get(notice_link)
            notice_soup = BeautifulSoup(response, 'html5lib')
            title_tag = notice_soup.find('td', attrs={'class': 'font_biao1'})
            main_tag = notice_soup.find('div', attrs={'class': 'TRS_Editor'})
            attachment_tag = notice_soup.find('span', attrs={'id': 'appendix'})
            title = self._get_tag_string(title_tag).strip()
            # debug 2018-9-12
            # file name without space
            title = title.replace(' ', '')
            # if self._check_info_exist(title):
            #     return None, True
            logger.info('notice title is %s' % title)
            # notice doc search
            doc_tag_list = main_tag.find_all('p')
            doc_content = ''
            doc_identify = ''
            doc_attachment = ''
            # the source site breaks notice content into lines with p tags,
            # so a newline is appended when storing the content
            # 2018-9-4 cc
            for doc_tag in doc_tag_list:
                if doc_tag.attrs.get('align') == 'center':
                    doc_content += self._get_tag_string(doc_tag) + '\n'
                    doc_identify += self._get_tag_string(doc_tag).strip()
                # elif doc_tag.attrs.get('align') == 'justify':
                #     doc_content += self._get_tag_string(doc_tag)
                elif doc_tag.attrs.get('align') == 'right':
                    doc_content += self._get_tag_string(doc_tag) + '\n'
                    doc_attachment += self._get_tag_string(
                        doc_tag).strip() + '\n'
                else:
                    doc_content += self._get_tag_string(doc_tag) + '\n'

            # attachment file search
            attachment_file_list = attachment_tag.find_all('a')
            attachment_file_name_list = list()
            attachment_file_link_list = list()
            # some attachments lack a file extension in their name, so it must be taken from the link
            # 2018-9-6 debug
            for attachment_file_tag in attachment_file_list:
                attachment_file_name = ''
                _attachment_link = attachment_file_tag.attrs.get('href')
                try:
                    file_type = _attachment_link.split('.')[-1]
                except:
                    logger.warn('search file type failed')
                    file_type = ''
                _attachment_file_name = self._get_tag_string(
                    attachment_file_tag).strip()
                if ':' in _attachment_file_name:
                    attachment_file_name = _attachment_file_name.split(':')[-1]
                elif ':' in _attachment_file_name:
                    attachment_file_name = _attachment_file_name.split(':')[-1]
                else:
                    attachment_file_name = _attachment_file_name
                # add file attachment type
                try:
                    attachment_file_type = attachment_file_name.split('.')[-1]
                except:
                    attachment_file_type = ''
                if attachment_file_type not in ['pdf', 'doc', 'docx', 'xls', 'xlsx', 'zip']\
                        and file_type != '':
                    attachment_file_name = attachment_file_name + '.' + file_type

                # _attachment_link format './P020180828399303596996.pdf'
                attachment_file_link = notice_baseurl + _attachment_link[1:]
                # saving file
                self.save_attachement_file(attachment_file_link,
                                           attachment_file_name)
                attachment_file_name_list.append(attachment_file_name)
                attachment_file_link_list.append(attachment_file_link)
            return {
                'noticeTitle': title,
                'noticeContent': doc_content,
                'noticeIdentify': doc_identify,
                'noticeAttachment': doc_attachment,
                'noticeLink': notice_link,
                'attachmentFileList': attachment_file_name_list,
                'attachmentLinkList': attachment_file_link_list,
                'category': self.category,
                'filePath': SAVING_PATH,
                'location': self.location
            }, False
        except Exception as e:
            logger.error('searching link info failed for %s' % str(e))
            return None, False
Example #24
def trans_file_from_db(trans_path):
    """

    :return:
    """
    try:
        mongo_db = dbConnector(MONGODB_SERVER, MONGODB_PORT, MONGODB_DB,
                               MONGODB_COLLECTION)
        # change to the file storage directory
        # os.system('cd %s' % SAVING_PATH)
        path_command = 'cd %s &&' % SAVING_PATH
        failed_list = list()
        for record in mongo_db.collection.find():
            file_list = record.get('attachmentFileList', [])
            for file_name in file_list:
                logger.info('begin to trans file %s' % file_name)
                # file name has space string , failed with shell command
                # remind with mongoDB attachment file list link
                if ' ' in file_name:
                    logger.info('file name has space string, trans file name')
                    os.system(path_command + "mv '%s' %s" %
                              (file_name, file_name.replace(' ', '')))
                    file_name = file_name.replace(' ', '')
                base_name = file_name[:(len(file_name.split('.')[-1]) + 1) *
                                      -1]
                if file_name.endswith('.doc') or file_name.endswith('.docx'):
                    os.system(path_command + 'unoconv -f txt %s' % file_name)
                    os.system(path_command + 'mv %s.txt %s' %
                              (base_name, trans_path))
                elif file_name.endswith('.xls') or file_name.endswith('.xlsx'):
                    os.system(path_command + 'unoconv -f csv %s' % file_name)
                    os.system(path_command + 'mv %s.csv %s' %
                              (base_name, trans_path))
                elif file_name.endswith('.pdf'):
                    os.system(path_command +
                              'pdftotext -nopgbrk %s %s/%s.txt' %
                              (file_name, trans_path, base_name))
                # archive file types are not fully handled
                # currently includes rar, zip, gz
                elif file_name.endswith('.rar') or file_name.endswith(
                        '.zip') or file_name.endswith('.gz'):
                    pass
                else:
                    logger.warn(
                        'file type is not recognized; file name is %s' %
                        file_name)
                    # trying trans doc/docx file
                    logger.info('trying trans file with unoconv txt')
                    result = os.system(path_command +
                                       'unoconv -f txt %s' % file_name)
                    if not result:
                        os.system(path_command + 'mv %s.txt %s' %
                                  (base_name, trans_path))
                        continue
                    else:
                        logger.warn('trans file with unoconv txt failed')
                    # trying trans xls/xlsx file
                    logger.info('trying trans file with unoconv csv')
                    result = os.system(path_command +
                                       'unoconv -f csv %s' % file_name)
                    if not result:
                        os.system(path_command + 'mv %s.csv %s' %
                                  (base_name, trans_path))
                        continue
                    else:
                        logger.warn('trans file with unoconv csv failed')
                    # trying trans pdf file
                    logger.info('trying trans file with pdftotext')
                    result = os.system(path_command +
                                       'pdftotext -nopgbrk %s %s/%s.txt' %
                                       (file_name, trans_path, base_name))
                    if not result:
                        continue
                    else:
                        logger.warn('trans file with pdftotext failed')
                    failed_list.append(file_name)
        # print the names of files that could not be converted
        for file_name in failed_list:
            print(file_name)
    except Exception as e:
        logger.error('file trans failed for %s' % str(e))
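The conversion pipeline above drives unoconv and pdftotext through os.system command strings. For reference, the same unoconv invocation expressed with subprocess; the file name and working directory are hypothetical placeholders, not values from the project.

import subprocess

# convert one .doc file to txt inside the saving directory (hypothetical paths)
subprocess.call(['unoconv', '-f', 'txt', 'example.doc'], cwd='/path/to/saving_dir')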
Example #25
# coding=utf-8
"""
@ license: Apache Licence
@ github: invoker4zoo
@ author: invoker/cc
@ wechart: whatshowlove
@ software: PyCharm
@ file: web_server.py
@ time: $18-9-25 6:25 PM
"""

import tornado.ioloop
import tornado.web
from handler import *
from tool.logger import logger

HandlerList = [
    (r"/main", MainSearchHandler),
    (r"/search/query", QuerySearchHandler),
    (r"/search/id", IdSearchHandler),
]

if __name__ == '__main__':
    application = tornado.web.Application(HandlerList)
    serverPort = 8080
    application.listen(serverPort)
    logger.info('server start at port %d' % serverPort)
    tornado.ioloop.IOLoop.instance().start()
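The three handlers are imported from the handler package and are not part of this listing. For orientation, a hypothetical minimal handler in the same style; the argument name and response shape are assumptions.

import tornado.web

class QueryExampleHandler(tornado.web.RequestHandler):
    # hypothetical sketch; the real QuerySearchHandler lives in the handler package
    def get(self):
        query = self.get_argument('q', default='')
        self.write({'status': 'ok', 'query': query})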
Example #26
# coding=utf-8
"""
@ license: Apache Licence
@ github: invoker4zoo
@ author: invoker/cc
@ wechart: whatshowlove
@ software: PyCharm
@ file: process.py
@ time: $18-9-25 4:34 PM
"""
import sys
sys.path.append('..')
from tool.logger import logger
from config.config import *
import thulac
from document_extraction import main_process
from knowledge_extraction_sample import buildGraph

if __name__ == '__main__':
    logger.info('loading nlp model')
    # thunlp_model = thulac.thulac(seg_only=False, model_path=THUNLP_MODEL_PATH, \
    #                              user_dict=THUNLP_USER_DIC_PATH)
    # logger.info('begin document extraction...')
    # main_process(thunlp_model)
    logger.info('begin knowledge extraction...')
    process = buildGraph()
    process.initial()
Example #27
def test():
    """
    model test process
    :return:
    """
    logger.info('Loading test data...')
    # Load test data
    x_text, y = load_data_and_labels(TEST_DATA_PATH_POS, TEST_DATA_PATH_NEG)
    # Load vocabulary
    vocab_processor = learn.preprocessing.VocabularyProcessor.restore(
        '../process/runs/1548669564/vocab')
    x_test = np.array(list(vocab_processor.transform(x_text)))
    y_test = y
    logger.info('test data: {}'.format(len(x_test)))
    # Load train model
    # ckpt_file = tf.train.latest_checkpoint('..\\process\\runs\\1548399694\\checkpoints\\')
    # logger.info('model path is %s' % ckpt_file)

    # testing
    graph = tf.Graph()
    with graph.as_default():
        tf_config = tf.ConfigProto()
        # Misc Parameters
        tf_config.allow_soft_placement = True
        tf_config.log_device_placement = False
        with tf.Session(config=tf_config) as sess:
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{}.meta".format(
                "..\\process\\runs\\1548669564\\checkpoints\\model-4100"))
            saver.restore(
                sess, "..\\process\\runs\\1548669564\\checkpoints\\model-4100")

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name(
                "output/predictions").outputs[0]

            # Generate batches for one epoch
            batches = batch_iter(list(x_test), batch_size, 1, shuffle=False)

            # Collect the predictions here
            all_predictions = []
            for x_test_batch in batches:
                batch_predictions = sess.run(predictions, {
                    input_x: x_test_batch,
                    dropout_keep_prob: 1.0
                })
                all_predictions = np.concatenate(
                    [all_predictions, batch_predictions])

    # Print accuracy if y_test is defined
    if y_test is not None:
        correct_predictions = float(
            sum(np.argmax(y_test, 1) == all_predictions))
        logger.info("Total number of test examples: {}".format(len(y_test)))
        logger.info("Accuracy: {:g}".format(correct_predictions /
                                            float(len(y_test))))

    # Save the test result to a csv
    predictions_human_readable = np.column_stack(
        (np.array(x_text), all_predictions))
    output_path = "../data/test_data/prediction.csv"
    logger.info("Saving evaluation to {0}".format(output_path))
    with open(output_path, 'w') as f:
        csv.writer(f).writerows(predictions_human_readable)
Example #28
def train():
    """
    model train process
    :return:
    """
    logger.info('Loading train data...')
    # load train data
    x_text, y = load_data_and_labels(TRAIN_DATA_PATH)
    # build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))
    vocab_size = len(vocab_processor.vocabulary_)
    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    del x, y, x_shuffled, y_shuffled
    logger.info("Vocabulary Size: {:d}".format(vocab_size))
    logger.info("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    # define pretrained embedding
    embedding_dim = EMBEDDING_DIM
    embeddings = None
    if IS_PRETRAINED_EMBEDDING:
        embeddings = np.load(PRETRAINED_EMBEDDING_PATH)
        logger.info("embedding shape {}".format(embeddings.shape))
        vocab_size = embeddings.shape[0]
        embedding_dim = embeddings.shape[1]
    # load saved middle model last time
    ckpt = None
    if IS_MIDDLE_MODEL:
        assert os.path.isdir(MIDDLE_PATH), '{} must be a directory'.format(MIDDLE_PATH)
        ckpt = tf.train.get_checkpoint_state(MIDDLE_PATH)
        assert ckpt, 'No checkpoint found'
        assert ckpt.model_checkpoint_path, 'No model path found in checkpoint'

    model = BiRNN(embedding_dim=embedding_dim,
                  hidden_size=hidden_size,
                  num_layer=num_layer,
                  vocab_size=vocab_size,
                  attention_size=attention_size,
                  sequence_length=x_train.shape[1],
                  num_classes=y_train.shape[1],
                  grad_clip=grad_clip,
                  lr=lr,
                  l2_reg_lambda=l2_reg_lambda,
                  dropout=dropout,
                  optimizer=optimizer,
                  num_checkpoints=num_checkpoints,
                  batch_size=batch_size,
                  num_epochs=num_epochs,
                  evaluate_every=evaluate_every,
                  checkpoint_every=checkpoint_every)

    model.build_graph()
    model.train(vocab_processor, x_train, y_train, x_dev, y_dev, pre_embeddings=embeddings, checkpoint_file=ckpt)
Example #29
def main_process(nlp_model):
    """
    main function
    :return:
    """
    try:
        mongo = dbConnector(MONGODB_SERVER, MONGODB_PORT, MONGODB_DB,
                            MONGODB_COLLECTION)
        es = esConnector(url=ES_URL, index=ES_INDEX, doc_type=ES_DOC_TYPE)
        cursor = mongo.collection.find(no_cursor_timeout=True)
        for record in cursor:
            # for record in mongo.collection.find().batch_size(1):
            if not len(record.get('attachmentFileList', [])):
                document_model = documentExtraction(record, nlp_model)
                if not es.check_info_exist(document_model.title):
                    logger.info('begin extract doc %s...' %
                                document_model.title)
                    document_info = document_model.extract_knowledge_from_record(
                    )
                    if len(document_info.keys()):
                        es.insert_single_info(document_info)
                    else:
                        logger.warn(
                            'extract document info failed, skip es store')
                else:
                    logger.info('doc %s exist in es, skip' %
                                document_model.title)
            else:
                document_model = documentExtraction(record, nlp_model)
                if not es.check_info_exist(document_model.title):
                    logger.info('begin extract doc %s...' %
                                document_model.title)
                    document_info = document_model.extract_knowledge_from_record(
                    )
                    if len(document_info.keys()):
                        es.insert_single_info(document_info)
                    else:
                        logger.warn(
                            'extract document info failed, skip es store')
                else:
                    logger.info('doc %s exist in es, skip' %
                                document_model.title)
                for file_name in record.get('attachmentFileList', []):
                    document_model = documentExtraction(record,
                                                        nlp_model,
                                                        file_name=file_name)
                    if not es.check_info_exist(document_model.title):
                        logger.info('begin extract doc %s...' %
                                    document_model.title)
                        document_info = document_model.extract_knowledge_from_record(
                        )
                        if len(document_info.keys()):
                            es.insert_single_info(document_info)
                        else:
                            logger.warn(
                                'extract document info failed, skip es store')
                    else:
                        logger.info('doc %s exist in es, skip' %
                                    document_model.title)
        cursor.close()
    except Exception as e:
        logger.error('document extract failed for %s' % str(e))
Example #30
 def _run(self):
     """
     Main function that starts the crawler
     :return:
     """
     self.notice_link_list = list()
     self.title_base_url = self.base_url + '/' + self.category
     for page in range(0, self.page):
         if page == 0:
             url = self.title_base_url + '/' + 'index.htm'
         else:
             url = self.title_base_url + '/' + 'index_%d.htm' % page
         logger.info('searching gov finance notice link on page %d' %
                     (page + 1))
         response = self.get(url)
         page_soup = BeautifulSoup(response, 'html5lib')
         # debug 2018-9-5
         # the tag class name of the caijingshidian section differs from the other sections
         if self.category == 'caijingshidian':
             notice_tag_list = page_soup.find_all('td',
                                                  attrs={'class': 'xiaxu'})
         else:
             notice_tag_list = page_soup.find_all('td',
                                                  attrs={'class': 'ZITI'})
         for notice_tag in notice_tag_list:
             title = notice_tag.attrs.get('title')
             time_str = self._search_time_from_title(title)
             logger.info('notice publish time is %s' % time_str)
             if not title:
                 logger.warning('searching notice title failed')
                 continue
             notice_info_tag = notice_tag.find('a')
             link = notice_info_tag.attrs.get('href')
             if link:
                 logger.info('searching notice info for %s' % title)
                 self.notice_link_list.append(link)
                 link_info, is_exist = self.search_link_info(link)
                 if link_info and not is_exist:
                     link_info['publishTime'] = time_str
                     self.save_notice_info(link_info)
                 elif is_exist:
                     link_info['publishTime'] = time_str
                     self.save_notice_info(link_info)
                     logger.info('link info is existed')
                     continue
                 else:
                     logger.warn('searching link info failed')
             else:
                 logger.warning('get notice link failed for %s' % title)
             # sleep 5 seconds
             logger.info('crawler sleeping for 5s...')
             time.sleep(5)
         # sleep 2 seconds
         logger.info('crawler sleeping for 2s...')
         time.sleep(2)