Example #1
def get():
    prefix = request.args.get('text', '')
    
    length = args['length']  # generation length, e.g. 101
    temperature = 1
    topk = 8
    topp = 0

    context_tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(prefix))
    l = len(context_tokens)
    out = common.sample_sequence(
        model=model, length=length,
        context=context_tokens,
        temperature=temperature, top_k=topk, top_p=topp, device=device,
        unk_idx=unk_idx
    )
    out = out.tolist()
    text = tokenizer.convert_ids_to_tokens(out[0])
    text[:l] = list(prefix)
    for i, item in enumerate(text[:-1]):  # make sure adjacent English words are separated by a space
        if common.is_word(item) and common.is_word(text[i+1]):
            text[i] = item + ' '
    for i, item in enumerate(text):
        if item == '[MASK]':
            text[i] = ''
        if item == '[CLS]' or item == '[SEP]':
            text[i] = '\n'
    text = ''.join(text).replace('##', '').strip()
    
    return Response(json.dumps({'status': "ok", 'message': text, 'request': prefix},
                               ensure_ascii=False), mimetype="application/json")
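The handler relies on module-level state (args, model, tokenizer, device, unk_idx) and on Flask's request/Response objects, none of which appear in the snippet. Below is a minimal sketch of what that surrounding setup might look like, reusing the default paths from Example #10 further down; the route path, the args dict, and the common import are assumptions rather than code from the original project.

# Hypothetical module-level setup for the get() handler above (sketch only;
# route path, args dict and helper module are assumptions).
import json
import torch
from flask import Flask, request, Response
from transformers import GPT2LMHeadModel, BertTokenizer
import common  # project helper providing sample_sequence() and is_word()

app = Flask(__name__)
args = {'length': 101}  # configuration dict read by get()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = BertTokenizer(vocab_file='cache/bud_vocab.txt')
model = GPT2LMHeadModel.from_pretrained('models/bud').to(device).eval()
unk_idx = open('cache/bud_vocab.txt').read().split('\n').index('[UNK]')

# (def get(): ... from above goes here)
app.add_url_rule('/generate', 'generate', get, methods=['GET'])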
Example #2
def _t_conv(srcdata):
    '''
    T is an invertible transform composed of the non-linear transform τ and
    the linear transform L, i.e. T(.) = L(τ(.)).
    '''
    if not common.is_word(srcdata):
        raise Exception('Source data is not in the word range [0x00000000 - 0xFFFFFFFF]; cannot apply the transform')

    return _line_conv_L(_no_line_conv(srcdata))


def _tn_conv(srcdata):
    '''
    T' is an invertible transform composed of the non-linear transform τ and
    the linear transform L', i.e. T'(.) = L'(τ(.)).
    '''
    if not common.is_word(srcdata):
        raise Exception('Source data is not in the word range [0x00000000 - 0xFFFFFFFF]; cannot apply the transform')

    return _line_conv_LN(_no_line_conv(srcdata))
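Both transforms depend on two helpers from the SM4 project's common module that are not shown on this page: is_word, which checks that a value fits in an unsigned 32-bit word, and lrotate, which rotates a 32-bit word left (note that the common.is_word used in Examples #1 and #10 is a different helper that tests whether a token is an English word). A minimal sketch of how these two might be implemented, with names matching the calls above but bodies assumed:

def is_word(data):
    # True if data is an integer inside the 32-bit word range [0x00000000, 0xFFFFFFFF].
    return isinstance(data, int) and 0x00000000 <= data <= 0xFFFFFFFF

def lrotate(word, bits):
    # Circular left rotation of a 32-bit word by `bits` positions
    # (the <<< operator in the docstrings below), masked back to 32 bits.
    bits %= 32
    return ((word << bits) | (word >> (32 - bits))) & 0xFFFFFFFF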
Example #4
def _line_conv_LN(srcdata):
    '''
    Linear transform L': C = L'(B) = B ^ (B <<< 13) ^ (B <<< 23)
    B: a 32-bit word
    '''
    if not common.is_word(srcdata):
        raise Exception('Source data is not in the word range [0x00000000 - 0xFFFFFFFF]; cannot apply the transform')

    return srcdata ^ common.lrotate(srcdata, 13) ^ common.lrotate(srcdata, 23)
Example #5
def _line_conv_L(srcdata):
    '''
    Linear transform L: C = L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24)
    B: a 32-bit word
    '''
    if not common.is_word(srcdata):
        raise Exception('Source data is not in the word range [0x00000000 - 0xFFFFFFFF]; cannot apply the transform')

    return srcdata ^ common.lrotate(srcdata, 2) ^ common.lrotate(srcdata, 10) ^ common.lrotate(srcdata, 18) ^ common.lrotate(srcdata, 24)


def _line_conv_LN(srcdata):
    '''
    Linear transform L': C = L'(B) = B ^ (B <<< 13) ^ (B <<< 23)
    B: a 32-bit word
    '''
    if not common.is_word(srcdata):
        raise Exception('Source data is not in the word range [0x00000000 - 0xFFFFFFFF]; cannot apply the transform')

    return srcdata ^ common.lrotate(srcdata, 13) ^ common.lrotate(srcdata, 23)
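As a quick illustrative check of the two rotations (values worked out by hand from the formulas above, assuming the 32-bit lrotate sketched earlier), take B = 0x00000001, whose single set bit simply moves to each rotation position:

# L(B)  = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24)
#       = 0x00000001 ^ 0x00000004 ^ 0x00000400 ^ 0x00040000 ^ 0x01000000
assert _line_conv_L(0x00000001) == 0x01040405

# L'(B) = B ^ (B <<< 13) ^ (B <<< 23)
#       = 0x00000001 ^ 0x00002000 ^ 0x00800000
assert _line_conv_LN(0x00000001) == 0x00802001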
Example #8
def _no_line_conv(srcdata):
    '''
    Non-linear transform τ: converts one word into another word by passing
    each byte through the S-box.
    '''
    if not common.is_word(srcdata):
        raise Exception('Source data is not in the word range [0x00000000 - 0xFFFFFFFF]; cannot apply the transform')

    fourth = srcdata & 0x000000FF           # least-significant byte
    third = (srcdata & 0x0000FF00) >> 8
    second = (srcdata & 0x00FF0000) >> 16
    first = (srcdata & 0xFF000000) >> 24    # most-significant byte

    return (SM4_SBOX[first] << 24) | (SM4_SBOX[second] << 16) | (SM4_SBOX[third] << 8) | (SM4_SBOX[fourth])
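SM4_SBOX, which is not shown on this page, is assumed to be the standard 256-entry SM4 S-box lookup table mapping each byte value to its substitute. The same per-byte substitution can also be written with a short loop; a sketch of an equivalent formulation:

def _no_line_conv_loop(srcdata):
    # Equivalent sketch: extract each of the four bytes, substitute it through
    # SM4_SBOX, and reassemble the 32-bit word in the same byte order.
    result = 0
    for shift in (24, 16, 8, 0):
        result |= SM4_SBOX[(srcdata >> shift) & 0xFF] << shift
    return result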
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0',
                        type=str,
                        required=False,
                        help='generation device')
    parser.add_argument('--length',
                        default=100,
                        type=int,
                        required=False,
                        help='length of generated text')
    parser.add_argument('--batch_size',
                        default=1,
                        type=int,
                        required=False,
                        help='generation batch size')
    parser.add_argument('--nsamples',
                        default=1,
                        type=int,
                        required=False,
                        help='number of samples to generate')
    parser.add_argument('--temperature',
                        default=1,
                        type=float,
                        required=False,
                        help='generation temperature')
    parser.add_argument('--topk',
                        default=8,
                        type=int,
                        required=False,
                        help='top-k: sample from the k most likely tokens')
    parser.add_argument('--topp',
                        default=0,
                        type=float,
                        required=False,
                        help='top-p: cumulative probability for nucleus sampling')
    parser.add_argument('--model_config',
                        default='config/model_config.json',
                        type=str,
                        required=False,
                        help='model config file')
    parser.add_argument('--tokenizer_path',
                        default='cache/bud_vocab.txt',
                        type=str,
                        required=False,
                        help='path to the vocabulary file')
    parser.add_argument('--model_path',
                        default='models/bud',
                        type=str,
                        required=False,
                        help='path to the model')
    parser.add_argument('--prefix',
                        default='如是我闻',
                        type=str,
                        required=False,
                        help='prefix with which the generated text begins')
    parser.add_argument('--no_wordpiece',
                        action='store_true',
                        help='do not use WordPiece tokenization')
    parser.add_argument('--segment', action='store_true', help='tokenize Chinese at the word level')

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.no_wordpiece:
        from tokenizations import tokenization_bert_without_wordpiece as tokenization_bert
    elif args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program uses
    length = args.length
    batch_size = args.batch_size
    nsamples = args.nsamples
    temperature = args.temperature
    topk = args.topk
    topp = args.topp

    unk_idx = open(args.tokenizer_path).read().split('\n').index('[UNK]')
    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()

    if length == -1:
        length = model.config.n_ctx // 2
    elif length > model.config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         model.config.n_ctx)

    while True:
        raw_text = args.prefix
        context_tokens = tokenizer.convert_tokens_to_ids(
            tokenizer.tokenize(raw_text))
        generated = 0
        for _ in range(nsamples // batch_size):
            out = common.sample_sequence(model=model,
                                         length=length,
                                         context=context_tokens,
                                         temperature=temperature,
                                         top_k=topk,
                                         top_p=topp,
                                         device=device,
                                         unk_idx=unk_idx)
            out = out.tolist()

            for _ in range(batch_size):
                generated += 1
                text = tokenizer.convert_ids_to_tokens(out[0])

                for i, item in enumerate(text[:-1]):  # make sure adjacent English words are separated by a space
                    if common.is_word(item) and common.is_word(text[i + 1]):
                        text[i] = item + ' '

                for i, item in enumerate(text):
                    if item == '[MASK]':
                        text[i] = ''
                    if item == '[CLS]' or item == '[SEP]':
                        text[i] = '\n'
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                text = ''.join(text).replace('##', '').strip()
                print(text)
        print("=" * 80)