Example #1
    def segment(self, pic, pipe, text, lower=False, use_jieba=False):
        text = util.as_text(text)
        sentences = self.ss.segment(
            text)  # Split the text into sentences at end-of-sentence delimiters.
        words_no_filter = self.ws.segment_sentences(
            pic,
            pipe,
            sentences=sentences,
            lower=lower,
            use_stop_words=False,
            use_speech_tags_filter=False,
            use_jieba=use_jieba)
        words_no_stop_words = self.ws.segment_sentences(
            pic,
            pipe,
            sentences=sentences,
            lower=lower,
            use_stop_words=True,
            use_speech_tags_filter=False,
            use_jieba=use_jieba)

        words_all_filters = self.ws.segment_sentences(
            pic,
            pipe,
            sentences=sentences,
            lower=lower,
            use_stop_words=True,
            use_speech_tags_filter=True,
            use_jieba=use_jieba)

        return util.AttrDict(sentences=sentences,
                             words_no_filter=words_no_filter,
                             words_no_stop_words=words_no_stop_words,
                             words_all_filters=words_all_filters)
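
Note: every example in this listing builds its result with util.AttrDict, whose implementation is not shown. A common convention is a dict subclass that also exposes its keys as attributes; the sketch below assumes that convention and is illustrative only, not the actual util module.

class AttrDict(dict):
    """Dictionary whose keys can also be read and written as attributes.

    Minimal sketch of what util.AttrDict typically looks like (assumed).
    """

    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError as exc:
            raise AttributeError(name) from exc

    def __setattr__(self, name, value):
        self[name] = value


# Usage mirrors the examples: keyword construction plus attribute access.
result = AttrDict(sentences=['a', 'b'], words_no_filter=[['a'], ['b']])
assert result.sentences == ['a', 'b']
result.words_all_filters = []  # attribute assignment stores a new key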
Example #2
def sort_sentences(sentences, words, sim_func=default_sentence_similarity):
    """ 将句子按关键程度进行排序 """
    sorted_sentences = {}
    graph = similarity_matrix(words, sim_func)
    nx_graph = nx.from_numpy_matrix(graph)
    scores = nx.pagerank(nx_graph)
    average_score = sum(scores.values()) / len(scores)
    for index, score in scores.items():
        feature_score = util.clue_score(words[index]) * average_score + score
        if len(words[index]) < 8:
            feature_score = 0
        item = util.AttrDict(sentence=sentences[index],
                             weight=feature_score,
                             words=util.clean_stop_words(words[index]))
        sorted_sentences[index] = item
    return sorted_sentences
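
The helpers similarity_matrix and default_sentence_similarity belong to the surrounding project and are not shown here. A TextRank-style word-overlap similarity is a common choice for this step; the names overlap_similarity and build_similarity_matrix below are hypothetical stand-ins, and the sketch is illustrative rather than the project's actual implementation.

import math

import networkx as nx
import numpy as np


def overlap_similarity(words1, words2):
    # TextRank-style similarity: count of shared words, normalized by the
    # log lengths of the two sentences (hypothetical stand-in for
    # default_sentence_similarity).
    if len(words1) <= 1 or len(words2) <= 1:
        return 0.0
    shared = len(set(words1) & set(words2))
    return shared / (math.log(len(words1)) + math.log(len(words2)))


def build_similarity_matrix(words, sim_func=overlap_similarity):
    # Symmetric pairwise similarity matrix over tokenized sentences
    # (hypothetical stand-in for similarity_matrix).
    n = len(words)
    matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            matrix[i, j] = matrix[j, i] = sim_func(words[i], words[j])
    return matrix


# PageRank over the resulting weighted graph, as in sort_sentences above.
words = [['textrank', 'ranks', 'sentences'], ['sentences', 'form', 'a', 'graph']]
scores = nx.pagerank(nx.from_numpy_array(build_similarity_matrix(words)))
print(scores)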
Example #3
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    tf.enable_v2_behavior()

    config_file = most_recent_file(FLAGS.experiment_path, r'config.yaml')
    assert config_file
    with open(config_file, 'r') as f:
        config = util.AttrDict(**yaml.safe_load(f.read()))
    logging.info('Config:\n%s', pprint.pformat(config))
    env = gym.make(config.env)
    cls = globals()[config.policy]
    policy = cls(config)
    # Initialize policy
    policy.argmax(np.expand_dims(env.reset(), 0))

    # Load checkpoint.
    # Assuming policy is a keras.Model instance.
    logging.info('policy variables: %s',
                 [v.name for v in policy.trainable_variables])
    ckpt = tf.train.Checkpoint(policy=policy)
    ckpt_file = most_recent_file(FLAGS.experiment_path, r'model.ckpt-[0-9]+')
    if ckpt_file:
        ckpt_file = re.findall('^(.*/model.ckpt-[0-9]+)', ckpt_file)[0]
        logging.info('Checkpoint file: %s', ckpt_file)
        ckpt.restore(ckpt_file).assert_consumed()
    else:
        raise RuntimeError('No checkpoint found')

    summary_writer = tf.summary.create_file_writer(FLAGS.experiment_path,
                                                   flush_millis=10000)

    logging.info('Starting Evaluation')
    it = (range(FLAGS.num_episodes)
          if FLAGS.num_episodes >= 0 else itertools.count())
    for ep in it:
        memory = replay.Memory()
        sample_episode(env, policy, memory, max_episode_length=200)
        logging.info(ep)
        with summary_writer.as_default(), summary.always_record_summaries():
            summary.scalar('return', memory.observed_rewards().sum(), step=ep)
            summary.scalar('length',
                           memory.observed_rewards().shape[-1],
                           step=ep)

    logging.info('DONE')
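
most_recent_file is a project helper that is not shown in this snippet. Based on how it is called (a directory plus a filename regex), a plausible sketch follows; the real helper may differ, for example by sorting checkpoints by step number instead of modification time.

import os
import re


def most_recent_file(directory, pattern):
    # Return the newest file under `directory` whose name matches `pattern`,
    # or None if nothing matches (hypothetical sketch).
    matches = []
    for root, _, files in os.walk(directory):
        for name in files:
            if re.search(pattern, name):
                matches.append(os.path.join(root, name))
    if not matches:
        return None
    return max(matches, key=os.path.getmtime)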
Example #4
    def segment(self, text, lower=False):
        '''
        Returns a util.AttrDict containing:
        sentences: list of sentences, split on the configured delimiters
        words_no_filter: nested list of tokenized sentences
        words_no_stop_words: same, with stop words removed
        words_all_filters: same, with stop words removed and with words whose POS tags are not in allow_speech_tags removed
        '''
        # text = util.as_text(text)
        sentences = self.ss.segment(text)  # a filter
        sentences_list = list(sentences)
        words_res = self.ws.segment_sentences(sentences_list)
        # The three word lists share the same structure; they differ only in
        # stop-word removal and POS filtering.
        # TODO: returning lists may not be necessary; prefer generators where possible.
        return util.AttrDict(
            sentences=sentences_list,
            words_no_filter=[list(i) for i in words_res[0]],
            words_no_stop_words=[list(i) for i in words_res[1]],
            words_all_filters=[list(i) for i in words_res[2]]
        )
Example #5
    def segment(self, text, lower = False):
        sentences = self.ss.segment(text)
        words_no_filter = self.ws.segment_sentences(sentences=sentences, 
                                                    lower = lower, 
                                                    use_stop_words = False,
                                                    use_speech_tags_filter = False)
        words_no_stop_words = self.ws.segment_sentences(sentences=sentences, 
                                                    lower = lower, 
                                                    use_stop_words = True,
                                                    use_speech_tags_filter = False)

        words_all_filters = self.ws.segment_sentences(sentences=sentences, 
                                                    lower = lower, 
                                                    use_stop_words = True,
                                                    use_speech_tags_filter = True)

        return util.AttrDict(
                    sentences           = sentences, 
                    words_no_filter     = words_no_filter, 
                    words_no_stop_words = words_no_stop_words, 
                    words_all_filters   = words_all_filters
                )
Example #6
    def sentences_classify(self, sorted_scores):
        # Category keys: '方法' (method), '目的' (purpose), '结果' (result), '其他' (other).
        category_sentence = {'方法': [], '目的': [], '结果': [], '其他': []}
        for i, score in enumerate(self.sentences_scores()):
            sentence = ','.join(self.content_sentences[i])
            words = []
            for short_words in self.content_words[i]:
                words += short_words
            category = max(score, key=lambda x: score[x])
            max_score = score[category]
            item = util.AttrDict(sentence=sentence,
                                 sorted_score=sorted_scores[i]['weight'],
                                 words=words,
                                 category_score=max_score,
                                 category=category)
            if max_score >= 1:
                category_sentence[category].append(item)
            else:
                category_sentence['其他'].append(item)
        for category in category_sentence:
            category_sentence[category].sort(
                key=lambda item: item['sorted_score'], reverse=True)
        return category_sentence
Example #7
import json
import os

from bottle import install, get, request, response, run, HTTPError
import bottle_pgpool
import psycopg2.pool
import util

### API endpoints


api_v1 = '/api/v1'
ep = util.AttrDict(
    index = os.path.join(api_v1, 'index'),
    venues = os.path.join(api_v1, 'venues'),
    venue = os.path.join(api_v1, 'venues/<id:int>'),
    venue_nearby = os.path.join(api_v1, 'venues/<id:int>/nearby'),
    categories = os.path.join(api_v1, 'categories'),
    category = os.path.join(api_v1, 'categories/<id:int>'),
    category_venues = os.path.join(api_v1, 'categories/<id:int>/venues'),
    zips = os.path.join(api_v1, 'zips'),
    zip = os.path.join(api_v1, 'zips/<zip>'),
    zip_venues = os.path.join(api_v1, 'zips/<zip>/venues'),
)


### setup


config = util.read_config("api")
log = util.config_logging(config).getLogger("server")
# connection pool
pool = psycopg2.pool.ThreadedConnectionPool(
    minconn=1,
Example #8
parser.add_argument("--gumbel_sample",
                    action="store_true",
                    help="turn on random selection of path during training")
parser.add_argument("--max_chunk_vocab_size",
                    default=10000,
                    type=int,
                    help="size of chunk vocab")
parser.add_argument("--test_samples",
                    default=1,
                    type=int,
                    help="number of samples to take")
parser.add_argument("--concat_context_vector",
                    action="store_true",
                    help="concat context vector instead of initializing")

args = util.AttrDict(vars(parser.parse_args()))
print "Args:", args

if args.ptb:
    DATA_LOC = 'data/ptb'
    DATA_VIEW = 'word'
elif args.zh:
    DATA_LOC = 'data/zh'
    DATA_VIEW = 'char'
else:
    DATA_LOC = 'data/en/bpe'
    DATA_VIEW = 'word'

args.train_data = DATA_LOC + '/train'
args.valid_data = DATA_LOC + '/valid'
args.test_data = DATA_LOC + '/test'
Example #9
import util

config = util.AttrDict(**{
    # Default values for command line arguments
    'data_dir' : '/tmp/zf/data',
    'pgn_dir' : '/tmp/zf/pgns',
    'model_dir' : '/tmp/zf/model',

    'optimizer' : 'Adam',
    'learning_rate' : 1e-3,
    'l2_scale' : 1e-3,
    'num_epochs' : 1,
    'batch_size' : 32,

    'filters' : 32,
    'modules' : 0,

    # Not parsed from command line
    'input_height' : 8,
    'input_width' : 8,
    'input_channels' : 26,

    'classes_shape' : (8, 8, 8, 8),

    # Defined below
    'input_shape' : None,
    'input_total' : None,

    'n_classes' : None
})

config.input_shape = (config.input_height, config.input_width, config.input_channels)
Example #10
import util
import tensorflow as tf
import data_util
import dual_learning
from pprint import pprint

# parameters
params = util.AttrDict()
params.seq2seq = util.AttrDict(
        max_len_A = 21,
        max_len_B = 21,
        ckpt_path_AB = 'en_fr',
        ckpt_path_BA = 'fr_en',
        emb_dim = 1024,
        num_layers = 1,
        batch_size = 32,
        steps = 100000,
        beam_size = 2,
        alpha = 0.5,
        ratio_dual = 0.5
        )
params.lm_a = util.AttrDict(
        model_name = 'lm_a',
        load_model = '../cv2/A/epoch022_6.8524.model',
        train_dir = '../cv2/A',
        rnn_size = 650,
        highway_layers = 2,
        char_embed_size = 30,
        kernels = '[1,2,3,4,5,6,7]',
        kernel_features = '[50,100,150,200,200,200,200]',
Example #11
def model_fn(features, labels, mode, params):
    # Training flag
    training = (mode == tf.estimator.ModeKeys.TRAIN)

    # Extract and concatenate features for input
    inputs = features['image']

    # Get unscaled log probabilities
    with tf.variable_scope('inference',
                           reuse=params.get('reuse', False),
                           custom_getter=collection_getter):
        policy, value = inference(inputs,
                                  filters=params['filters'],
                                  modules=params['modules'],
                                  n_classes=params['n_classes'],
                                  training=training)

    # Add summaries to weights
    for var in tf.trainable_variables():
        tf.summary.histogram(var.name.split(':')[0] + '_summary', var)

    # Specification
    spec = util.AttrDict(mode=mode,
                         features=features,
                         predictions=(policy, value))

    # Return early inference specification
    if mode == tf.estimator.ModeKeys.PREDICT:
        return spec

    with tf.variable_scope('losses'):
        # Value loss
        value_loss = tf.losses.mean_squared_error(labels=labels['value'],
                                                  predictions=value,
                                                  weights=1.0 / 4.0)

        policy_loss = tf.losses.softmax_cross_entropy(
            onehot_labels=labels['policy'], logits=policy, weights=1.0)

        # Get l2 regularization loss
        l2_loss = tf.contrib.layers.apply_regularization(
            tf.contrib.layers.l2_regularizer(params['l2_scale']))
        tf.losses.add_loss(l2_loss)

        # Total loss
        loss = tf.losses.get_total_loss(add_regularization_losses=False)

        # Add total loss to loss collection
        tf.add_to_collection(tf.GraphKeys.LOSSES, loss)

    # Add summaries for losses
    for loss_tensor in tf.get_collection(tf.GraphKeys.LOSSES):
        tf.summary.scalar(
            loss_tensor.name.split(':')[0] + '_summary', loss_tensor)

    spec.labels = labels
    spec.loss = loss
    spec.eval_metric_ops = util.AttrDict()

    # Return early evaluation specification
    if mode == tf.estimator.ModeKeys.EVAL:
        return spec

    # Get global step for training op
    global_step = tf.train.get_global_step()

    with tf.variable_scope('train'):
        # Get optimizer function
        optimizer_fn = {
            'Adam': tf.train.AdamOptimizer,
            'RMSProp': tf.train.RMSPropOptimizer,
            'GradientDescent': tf.train.GradientDescentOptimizer
        }[params.get('optimizer', 'Adam')]

        optimizer = optimizer_fn(params['learning_rate'])

        # Compute gradients and add summaries
        grads_and_tvars = optimizer.compute_gradients(spec.loss)

        # Create train operation
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.apply_gradients(grads_and_tvars,
                                                 global_step=global_step)

    # Add summaries for gradients
    with tf.variable_scope('gradients'):
        tf.contrib.training.add_gradients_summaries(grads_and_tvars)

    spec.train_op = train_op

    # Return full train specification
    return spec
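
Note that this model_fn returns a util.AttrDict rather than a tf.estimator.EstimatorSpec, so it cannot be handed to tf.estimator.Estimator directly. One way to bridge the two, assuming only the fields shown above, is a thin adapter like the following sketch; the project's actual wiring may differ.

import tensorflow as tf


def estimator_model_fn(features, labels, mode, params):
    # Adapter (assumed): translate the AttrDict spec produced by model_fn
    # above into the EstimatorSpec that tf.estimator.Estimator expects.
    spec = model_fn(features, labels, mode, params)
    policy, value = spec.predictions
    return tf.estimator.EstimatorSpec(
        mode=spec.mode,
        predictions={'policy': policy, 'value': value},
        loss=spec.get('loss'),
        train_op=spec.get('train_op'),
        eval_metric_ops=spec.get('eval_metric_ops'))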