Example #1
def article_iterator(tokenizer, final_desired_size):
    """ Iterate through the provided filename + tokenize"""
    assert os.path.exists(args.input_fn)
    with open(args.input_fn, "r") as f:
        for l_no, l in enumerate(f):
            if l_no % args.num_folds == args.fold:
                article = json.loads(l)
                tokens = []
                query = tokenization.convert_to_unicode(article["query"])
                query_tokens = tokenizer.tokenize(query)
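                # content budget: total size minus the query and two [SEP] markers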
                c_size = final_desired_size - len(query_tokens) - 2
                content = tokenization.convert_to_unicode(article["content"])
                content_tokens = tokenizer.tokenize(content)
                if len(content_tokens) > c_size:
                    content_tokens = content_tokens[:c_size]
                tokens.extend(content_tokens)
                tokens.append("[SEP]")
                tokens.extend(query_tokens)
                tokens.append("[SEP]")
                input_ids = tokenizer.convert_tokens_to_ids(tokens)

                article['input_ids'] = input_ids
                article['inst_index'] = (l_no // args.num_folds)

                if article['inst_index'] < 100:
                    print('---\nINPUT{}. {}\n---\nTokens: {}\n'.format(
                        article['inst_index'], tokens, input_ids),
                          flush=True)

                if len(article['input_ids']) <= 10:  # min size of article
                    continue
                yield article
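The l_no % args.num_folds == args.fold test shards one JSONL file across parallel jobs, and l_no // args.num_folds renumbers each shard densely. A standalone sketch of that arithmetic, with made-up data:

lines = ["line %d" % i for i in range(10)]
num_folds, fold = 3, 1
shard = [l for l_no, l in enumerate(lines) if l_no % num_folds == fold]
inst_index = [l_no // num_folds for l_no, l in enumerate(lines)
              if l_no % num_folds == fold]
assert shard == ["line 1", "line 4", "line 7"]
assert inst_index == [0, 1, 2]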
Example #2
def article_iterator(tokenizer):
    """ Iterate through the provided filename + tokenize"""
    assert os.path.exists(args.input_fn)
    for (dirpath, dirnames, filenames) in os.walk(args.input_fn):
        for filename in filenames:
            with open(os.path.join(dirpath, filename), 'r') as f:
                for l_no, l in enumerate(f):
                    if l_no % args.num_folds == args.fold:
                        article = json.loads(l)

                        line = tokenization.convert_to_unicode(
                            article['text'])  # for news2016zh text body
                        tokens = tokenizer.tokenize(line)
                        input_ids = tokenizer.convert_tokens_to_ids(tokens)

                        article['input_ids'] = input_ids

                        article['inst_index'] = (l_no // args.num_folds)
                        if article['inst_index'] < 100:
                            print('---\nINPUT{}. {}\n---\nTokens: {}\n'.format(
                                article['inst_index'], tokens, input_ids),
                                  flush=True)
                        if len(article['input_ids']) <= 64:  # min size of article
                            continue
                        yield article
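Unlike Example #1, this variant treats args.input_fn as a directory and walks it recursively. A minimal runnable sketch of the os.walk traversal (the "data" directory name is made up):

import os

# visit every file under a hypothetical data/ tree, as the iterator above does
for dirpath, dirnames, filenames in os.walk("data"):
    for filename in filenames:
        print(os.path.join(dirpath, filename))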
Example #3
def gen(text, length):
    line = tokenization.convert_to_unicode(text)
    encoded = tokenizer.tokenize(line)
    context_formatted = []
    context_formatted.extend(encoded)
    # Format context end

    gens = []
    gens_raw = []
    gen_probs = []

    for chunk_i in range(num_chunks):
        tokens_out, probs_out = sess.run(
            [tokens, probs],
            feed_dict={
                initial_context: [context_formatted] * batch_size_per_chunk,
                eos_token: args.eos_token,
                min_len: int(length),
                p_for_topp: top_p[chunk_i]
            })

        for t_i, p_i in zip(tokens_out, probs_out):
            extraction = extract_generated_target(output_tokens=t_i,
                                                  tokenizer=tokenizer)
            gens.append(extraction['extraction'])

    # strip [UNK] and WordPiece '##' markers, then hard-wrap to 70-char chunks
    l = re.findall('.{1,70}', gens[0].replace('[UNK]', '').replace('##', ''))
    result = "".join(l)
    return result
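The final re.findall('.{1,70}', ...) call is just a hard line-wrap: it splits the cleaned string into chunks of at most 70 characters. A self-contained illustration with made-up input:

import re

text = "x" * 150
chunks = re.findall('.{1,70}', text)
assert [len(c) for c in chunks] == [70, 70, 10]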
Example #4
def article_iterator(tokenizer, filename):
    """ Iterate through the provided filename + tokenize"""
    assert os.path.exists(args.input_fn)
    count = 0
    print("dirpath, " + os.path.join(filename))
    with open(os.path.join(filename), 'rb') as f:
        for l_no, l in enumerate(f):
            if l_no % args.num_folds == args.fold:
                print("type: ", str(type(l)))
                print("l: " + str(len(l)) + " l_no: " , (str(l_no)))
                l = str(l, encoding = "gbk",errors='ignore')
                article = json.loads(l)
                line = tokenization.convert_to_unicode(
                    article['text'])  # for news2016zh text body
                print("line>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>0 ", l)
                count += 1
                #tokens = tokenizer.tokenize(line)
                #input_ids = [tokenizer.tokenize(token) for token in tokens]
                #input_ids = tf.keras.preprocessing.sequence.pad_sequences(
                #                    [tokenizer.convert_tokens_to_ids(t) 
                #                    for t in input_ids],args.max_seq_length)
                tokens = tokenizer.tokenize(line)
                input_ids = tokenizer.convert_tokens_to_ids(tokens)
                print("line>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>1 ", input_ids)
                article['input_ids'] = input_ids
                print("count...............................: ", count)
                article['inst_index'] = (l_no // args.num_folds)
                # if article['inst_index'] < 100:
                #     print('---\nINPUT{}. {}\n---\nTokens: {}\n'.format(article['inst_index'],
                #                                                     tokens,
                #                                                     input_ids
                #                                                     ), flush=True)
                if len(article['input_ids']) <= 12:  # min size of article
                    continue
                yield article
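This variant opens the file in binary mode and decodes each line as GBK before json.loads. A minimal illustration of that decode step (the sample bytes are made up):

import json

raw = '{"text": "你好"}'.encode("gbk")  # hypothetical GBK-encoded JSONL line
l = str(raw, encoding="gbk", errors="ignore")
article = json.loads(l)
assert article["text"] == "你好"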
Example #5
def generate_text(text, ratio=0.75):

    output_lst = []
    prob_lst = []
    with graph.as_default():

        line = tokenization.convert_to_unicode(text)
        line = clean(line)
        print(line)

        bert_tokens = tokenizer.tokenize(line)
        encoded = tokenizer.convert_tokens_to_ids(bert_tokens)
        encoded_prefix = encoded[0:int(len(encoded) * ratio)]
        print("=encoded length== ", len(encoded), '==context length==',
              len(encoded_prefix))
        context_formatted = []
        context_formatted.extend(encoded_prefix)
        start = time.time()
        if len(encoded) > 5:
            print("===process===")
            for i in range(args.samples):
                print("Sample,", i + 1, " of ", args.samples)
                # Format context end
                gens = []
                gens_raw = []
                gen_probs = []
                for chunk_i in range(num_chunks):
                    tokens_out, probs_out = sess.run(
                        [tokens, probs],
                        feed_dict={
                            initial_context:
                            [context_formatted] * batch_size_per_chunk,
                            eos_token: args.eos_token,
                            min_len: int(len(encoded) * 0.85),
                            max_len: len(encoded) + 10,
                            p_for_topp: top_p[chunk_i],
                            k_for_topk: 1000
                        })

                    for t_i, p_i in zip(tokens_out, probs_out):
                        extraction = extract_generated_target(
                            output_tokens=t_i, tokenizer=tokenizer)
                        gens.append(extraction['extraction'])
                        gen_probs.append(p_i)

                # l = re.findall('.{1,70}', gens[0].replace('[UNK]', '').replace('##', ''))
                l = decode(gens[0])
                output_lst.append(l)
                prob_lst.append(gen_probs)
            print(time.time() - start)
            return line, output_lst, prob_lst, bert_tokens
        else:
            return None, None, None, None
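generate_text keeps the first ratio of the encoded ids as the generation context. A worked example of that slicing, with illustrative values:

encoded = list(range(12))               # 12 token ids
ratio = 0.75
prefix = encoded[0:int(len(encoded) * ratio)]
assert prefix == list(range(9))         # int(12 * 0.75) == 9 ids kept as context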
Example #6
def process(self, inputs):
    seq_text = tokenization.convert_to_unicode(inputs[self.first_sequence])  # corresponding ODPS table column
    [input_ids, p_for_topp, eos_token,
     min_len, max_len, k_for_topk] = convert_to_single_example(
        tokenizer=self.tokenizer, text=seq_text, max_seq_length=self.sequence_length)
    ret = {key: np.array([val]) for key, val in inputs.items()}
    ret["initial_context"] = np.array([input_ids])  # shape [1, seq_len]
    ret["p_for_topp"] = np.array([p_for_topp])
    ret["eos_token"] = np.array(eos_token)
    ret["min_len"] = np.array(min_len)
    ret["max_len"] = np.array(max_len)
    ret["k_for_topk"] = np.array(k_for_topk)
    return ret
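A minimal, self-contained sketch of the batching convention above: the id sequence is wrapped to shape [1, seq_len], while scalar knobs such as eos_token become 0-d arrays (the values here are illustrative, not from the source):

import numpy as np

input_ids = [101, 2769, 102]  # hypothetical token ids
ret = {
    "initial_context": np.array([input_ids]),  # shape (1, 3)
    "eos_token": np.array(102),                # shape ()
}
assert ret["initial_context"].shape == (1, 3)
assert ret["eos_token"].shape == ()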
Example #7
File: demo1.py  Project: r2dh4t/Sense_art
    tokens, probs = sample(news_config=news_config,
                           initial_context=initial_context,
                           eos_token=eos_token,
                           min_len=min_len,
                           ignore_ids=None,
                           p_for_topp=p_for_topp,
                           do_topk=False)

    saver = tf.train.Saver()
    saver.restore(sess, args.ckpt_fn)
    print('🍺Model loaded. \nInput something please:⬇️')
    text = input()
    while text != "":
        for i in range(args.samples):
            print("Sample,", i + 1, " of ", args.samples)
            line = tokenization.convert_to_unicode(text)
            bert_tokens = tokenizer.tokenize(line)
            encoded = tokenizer.convert_tokens_to_ids(bert_tokens)
            context_formatted = []
            context_formatted.extend(encoded)
            # Format context end

            gens = []
            gens_raw = []
            gen_probs = []

            for chunk_i in range(num_chunks):
                tokens_out, probs_out = sess.run(
                    [tokens, probs],
                    feed_dict={
                        initial_context:
Example #8
def predict():

    ##### temporarily silence TF deprecation warnings
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    # macOS-specific setting; comment this out on other systems
    os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.DEBUG)
    from tensorflow.python.util import deprecation
    deprecation._PRINT_DEPRECATION_WARNINGS = False
    try:
        from tensorflow.python.util import module_wrapper as deprecation
    except ImportError:
        from tensorflow.python.util import deprecation_wrapper as deprecation
    deprecation._PER_MODULE_WARNING_LIMIT = 0
    #####

    parser = argparse.ArgumentParser(description='Contextual generation (i.e., given some metadata we will generate articles)')
    parser.add_argument(
        '-metadata_fn',
        dest='metadata_fn',
        type=str,
        help='Path to a JSONL containing metadata',
    )
    parser.add_argument(
        '-out_fn',
        dest='out_fn',
        type=str,
        help='Out jsonl, which will contain the completed jsons',
    )
    parser.add_argument(
        '-input',
        dest='input',
        type=str,
        help='Text to complete',
    )
    parser.add_argument(
        '-model_config_fn',
        dest='model_config_fn',
        default='configs/mega.json',
        type=str,
        help='Configuration JSON for the model',
    )
    parser.add_argument(
        '-model_ckpt',
        dest='model_ckpt',
        default='model.ckpt-220000',
        type=str,
        help='checkpoint file for the model',
    )
    parser.add_argument(
        '-target',
        dest='target',
        default='article',
        type=str,
        help='What to generate for each item in metadata_fn. can be article (body), title, etc.',
    )
    parser.add_argument(
        '-batch_size',
        dest='batch_size',
        default=1,
        type=int,
        help='How many things to generate per context. will split into chunks if need be',
    )
    parser.add_argument(
        '-num_folds',
        dest='num_folds',
        default=1,
        type=int,
        help='Number of folds. useful if we want to split up a big file into multiple jobs.',
    )
    parser.add_argument(
        '-fold',
        dest='fold',
        default=0,
        type=int,
        help='which fold we are on. useful if we want to split up a big file into multiple jobs.'
    )
    parser.add_argument(
        '-max_batch_size',
        dest='max_batch_size',
        default=None,
        type=int,
        help='max batch size. You can leave this out and we will infer one based on the number of hidden layers',
    )
    parser.add_argument(
        '-top_p',
        dest='top_p',
        default=0.95,
        type=float,
        help='p to use for top-p sampling. if this isn\'t None, use it for everything'
    )
    parser.add_argument(
        '-min_len',
        dest='min_len',
        default=1024,
        type=int,
        help='min length of sample',
    )
    parser.add_argument(
        '-eos_token',
        dest='eos_token',
        default=60000,
        type=int,
        help='eos token id',
    )
    parser.add_argument(
        '-samples',
        dest='samples',
        default=5,
        type=int,
        help='num_samples',
    )

    def extract_generated_target(output_tokens, tokenizer):
        """
        Given some tokens that were generated, extract the target
        :param output_tokens: [num_tokens] array that was generated
        :param tokenizer: tokenizer used to encode and decode the tokens
        :return:
        """
        # No start-token filtering here; keep the whole generated sequence
        assert output_tokens.ndim == 1

        start_ind = 0
        end_ind = output_tokens.shape[0]

        return {
            'extraction': tokenization.printable_text(''.join(tokenizer.convert_ids_to_tokens(output_tokens))),
            'start_ind': start_ind,
            'end_ind': end_ind,
        }

    # args = parser.parse_args()
    args, unknown = parser.parse_known_args()
    proj_root_path = os.path.dirname(os.path.realpath(__file__))
    vocab_file_path = os.path.join(proj_root_path, "tokenization/clue-vocab.txt")

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file_path, do_lower_case=True)
    news_config = GroverConfig.from_json_file(args.model_config_fn)

    # We might have to split the batch into multiple chunks if the batch size is too large
    default_mbs = {12: 32, 24: 16, 48: 3}
    max_batch_size = args.max_batch_size if args.max_batch_size is not None else default_mbs[news_config.num_hidden_layers]

    # factorize args.batch_size = (num_chunks * batch_size_per_chunk) s.t. batch_size_per_chunk < max_batch_size
    num_chunks = int(np.ceil(args.batch_size / max_batch_size))
    batch_size_per_chunk = int(np.ceil(args.batch_size / num_chunks))
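    # Worked example: batch_size=5, max_batch_size=3 gives
    # num_chunks = ceil(5/3) = 2 and batch_size_per_chunk = ceil(5/2) = 3,
    # so 2 chunks of 3 cover the 5 requested generations.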

    # This controls the top p for each generation.
    top_p = np.ones((num_chunks, batch_size_per_chunk), dtype=np.float32) * args.top_p

    tf_config = tf.ConfigProto(allow_soft_placement=True)

    with tf.Session(config=tf_config, graph=tf.Graph()) as sess:
        initial_context = tf.placeholder(tf.int32, [batch_size_per_chunk, None])
        p_for_topp = tf.placeholder(tf.float32, [batch_size_per_chunk])
        eos_token = tf.placeholder(tf.int32, [])
        min_len = tf.placeholder(tf.int32, [])
        tokens, probs = sample(news_config=news_config, initial_context=initial_context,
                            eos_token=eos_token, min_len=min_len, ignore_ids=None, p_for_topp=p_for_topp,
                            do_topk=False)

        saver = tf.train.Saver()
        saver.restore(sess, args.model_ckpt)

        '''
        If this is deployed to the web, none of the print calls are needed;
        replace input() with the message posted from the web page;
        the while loop is unnecessary;
        pass the final "\n".join(l) back via a parameter and render it in the page;
        the main parameters (number of samples, length) should be entered by the
        user on the web page, or hard-coded here -- defaults exist.

        Open issues:
        there are 5 samples, and the loop below predicts 5 times; how should
        those 5 results be displayed on the page?
        min_len has no effect: even at 1024 it still produces articles of only
        one or two hundred characters.
        '''

        # print('🍺Model loaded. \nInput something please:⬇️')

        text = ''  # avoid a NameError on non-POST requests
        if request.method == 'POST':
            text = request.form['message']
            # data = [text]  # from the original spam detection demo; unclear whether it is needed here

        final_result = []  # one entry per sample
        for i in range(args.samples):
            # print("Sample,", i + 1, " of ", args.samples)
            line = tokenization.convert_to_unicode(text)
            bert_tokens = tokenizer.tokenize(line)
            encoded = tokenizer.convert_tokens_to_ids(bert_tokens)
            context_formatted = []
            context_formatted.extend(encoded)
            # Format context end

            gens = []
            gens_raw = []
            gen_probs = []

            for chunk_i in range(num_chunks):
                tokens_out, probs_out = sess.run([tokens, probs],
                                                feed_dict={initial_context: [context_formatted] * batch_size_per_chunk,
                                                            eos_token: args.eos_token, min_len: args.min_len,
                                                            p_for_topp: top_p[chunk_i]})

                for t_i, p_i in zip(tokens_out, probs_out):
                    extraction = extract_generated_target(output_tokens=t_i, tokenizer=tokenizer)
                    gens.append(extraction['extraction'])

            l = re.findall('.{1,70}', gens[0].replace('[UNK]', '').replace('##', ''))
            # the joined string below is what ultimately gets returned
            # print("\n".join(l))
            # on returning from inside a for loop, see:
            # https://stackoverflow.com/questions/44564414/how-to-use-a-return-statement-in-a-for-loop
            final_result.append("\n".join(l))


    return render_template('result.html', prediction=final_result)