Example #1
    def _log(self, msg: str):
        """Method to log detailed information of the actor's actions"""

        log(env=self.env,
            actor_name=self.__class__.__name__,
            condition=self.condition,
            msg=msg)
Example #2
def main():
    device = torch.device(f'cuda:{args.gpu}')
    data_list = load_persona_chat()
    tokenizer = PolyEncoderTokenizer.from_pretrained()
    if args.poly:
        model = PretrainedPolyEncoder.from_pretrained()
    else:
        model = PretrainedBiEncoder.from_pretrained()

    history_size = model.opt['history_size']
    text_truncate = model.opt['text_truncate']
    model.to(device)
    model.eval()
    for param in model.parameters():
        param.requires_grad = False

    log(f'Loading LM model from {args.lm_model_dir}')
    lm_model = PolyEncoderLM.from_pretrained(checkpoint=args.lm_model_dir)
    lm_model.to(device)
    lm_model.eval()
    for param in lm_model.parameters():
        param.requires_grad = False
    eval_lm_model = SentenceScorer(device)

    if args.fp16:
        from apex import amp
        model, lm_model = amp.initialize([model, lm_model])

    for data in data_list:
        query = '\n'.join(data[0] + data[1][-history_size:])
        candidates = data[2]
        truth = candidates[-1]
        query_ids = torch.tensor(tokenizer.encode(query, max_length=text_truncate), device=device).unsqueeze(0)
        candidates = torch.tensor(tokenizer.batch_encode_plus(
            candidates, pad_to_max_length=True)['input_ids'], device=device)
        output = model.forward(ctxt_input_ids=query_ids, cand_input_ids=candidates)
        scores = output[0].squeeze()
        if args.nature:
            collision, score = gen_natural_collision(
                query, truth, model, tokenizer, device, lm_model, eval_lm_model, scores.max())
        else:
            collision, score = gen_aggressive_collision(query, scores.max(), model, tokenizer, device, lm_model)

        # get the rank of collision
        scores = scores.cpu().tolist()
        scores = np.asarray([score] + scores)
        n = len(scores)
        ranks = np.empty(n)
        ranks[np.argsort(-scores)] = np.arange(n)

        lm_perp = eval_lm_model.perplexity(collision)
        msg = f'Input={query}\n' \
              f'Ground truth response={truth}\n' \
              f'Collision={collision}\n' \
              f'Collision similarity score={score}\n' \
              f'Rank={ranks[0]}\n' \
              f'LM perp={lm_perp.item()}\n'
        log(msg)
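
The rank computation above is a compact NumPy idiom: scattering np.arange(n) through np.argsort(-scores) assigns each score its 0-based rank in descending order. A minimal standalone sketch:

import numpy as np

scores = np.asarray([0.2, 0.9, 0.5])
n = len(scores)
ranks = np.empty(n)
# argsort(-scores) lists indices from highest to lowest score;
# scattering 0..n-1 through it gives each element its descending rank
ranks[np.argsort(-scores)] = np.arange(n)
print(ranks)  # [2. 0. 1.]: 0.9 has rank 0, 0.5 rank 1, 0.2 rank 2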
Example #3
 def writeCSV(self, df, file):
     """Escrever CSV.
     
     Args:
         df: Dataframe.
         file: Nome do arquivo CSV.
     """
     log("INFO", "[writeCSV] Salvando o CSV.")
     df.repartition(1).write.csv(file, mode='overwrite')
Example #4
    def wordCount(self, wordListDF):
        """Cria dataframe com a contagem de palavras

        Args:
            wordListDF: Dataframe com uma coluna chamada 'word'.

        Returns:
            DataFrame: Dataframe contendo 'word' e 'count'.
        """
        log("INFO", "[wordCount] Contando as palavras.")
        return wordListDF.groupBy('word').count()
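
A minimal usage sketch of the count above (hypothetical data; assumes a local SparkSession):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
wordListDF = spark.createDataFrame([("spark",), ("word",), ("spark",)], ["word"])
# one row per distinct word: spark -> 2, word -> 1
wordListDF.groupBy('word').count().show()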
Example #5
    def read_file_and_store_on_datalake(self, spark_session, output_base_dir):
        """
            Read a traditional JSON ([{"a":1, "b":2}, {...}]) file
            and store it in the data lake

            Parameters
            ----------

            spark_session: SparkSession
                The SparkSession of the application

            output_base_dir: str
                The datalake location

        """
        input_filename = self._args[2]
        file = self._input_data_dir + input_filename
        output_data_name = self._args[3]
        output_path = output_base_dir + "/" + output_data_name
        schema_json = self._args[4]
        date_ops = json.loads(self._args[5]) if len(self._args) == 6 else None

        ##################################
        #
        ## Parse schema JSON to a Spark schema
        #
        ##################################
        schema = get_schema(schema_json)
        log(spark_session).info("Schema: " + str(schema))

        try:
            df = spark_session.read.schema(schema).json(file)

            log(spark_session).info("Number of rows: " + str(df.count()))

            df = convert_date_using_data_ops_schema(df, date_ops)
            df = define_date_columns_in_df(df, date_ops)
            df = change_whitespace_on_columns_by_underscore(df)

            try:
                df \
                    .write \
                    .format("parquet") \
                    .partitionBy("year", "month", "day") \
                    .mode("append") \
                    .save(output_path)

            except Exception as e:
                log(spark_session).error("Error on writing to the data lake... ")
                log(spark_session).error(e)

        except Exception as e:
            log(spark_session).error(e)
Example #6
 def readCSV(self, file):
     """Ler CSV.
     
     Args:
         file: Nome do arquivo CSV.
     
     Returns:
         DataFrame: Conteúdo do CSV.
     """
     log("INFO", f"[readCSV] Lendo arquivo CSV: {file}.")
     dfEnd = self.spark.read.csv(file, header=True)
     return dfEnd
Example #7
    def joinDataFrames(self, df, newRow):
        """Faz o Join de dois DataFrames.

        Args:
            df: Dataframe.
        
        Returns:
            DataFrame: Dataframe completo.
        """
        log("INFO", "[joinDataFrame] Fazendo o Join dos DataFrames.")
        dfEnd = df.union(newRow)
        return dfEnd
Example #8
 def removePunctuation(self, column):
     """Remover pontuação do dataframe
     
     Args:
         column: Coluna para remover as pontuações.
     
     Returns:
         Registros sem as pontuações.
     """
     log("INFO",
         f"[removePunctuation] Removendo pontuação da coluna: {column}")
     return trim(lower(regexp_replace(column, '[^\sa-zA-Z0-9]',
                                      ''))).alias('value')
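
A minimal sketch of the same transformation on a one-row DataFrame (assumes an existing SparkSession named spark):

from pyspark.sql.functions import col, lower, regexp_replace, trim

df = spark.createDataFrame([("Hello, World!",)], ["value"])
# keep only letters, digits and whitespace, then lowercase and trim
df.select(trim(lower(regexp_replace(col("value"), r"[^\sa-zA-Z0-9]",
                                    ""))).alias("value")).show()  # "hello world"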
Example #9
 def writeCSV(self, df, file):
     """Escrever CSV.
     
     Args:
         df: Dataframe.
         file: Nome do arquivo CSV.
     """
     log("INFO", "[writeCSV] Salvando o CSV.")
     dfResultToCSV = df.withColumn("lista_pedidos",
                                   df['lista_pedidos'].cast("string"))
     dfResultToCSV.repartition(1).write.csv(file,
                                            mode='overwrite',
                                            header=True)
Example #10
 def readCSV(self, file):
     """Ler CSV.
     
     Args:
         file: Nome do arquivo CSV.
     
     Returns:
         DataFrame: Conteúdo do CSV.
     """
     log("INFO", f"[readCSV] Lendo arquivo CSV: {file}")
     dfEnd = self.spark.read.text(file).select(
         self.removePunctuation(col('value')))
     return dfEnd
Example #11
def main():
    device = torch.device(f'cuda:{args.gpu}')
    target_q_doc, query_scores, bm25_q_doc, best_query_sent, queries = prepare_data_and_scores(
        args.model_name, args.data_name)
    model_path = os.path.join(args.model_dir, args.model_name)
    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
    model = BertForConcatNextSentencePrediction.from_pretrained(model_path)
    model.to(device)
    model.eval()
    for param in model.parameters():
        param.requires_grad = False

    log(f'Loading LM model from {args.lm_model_dir}')
    lm_model = BertForLM.from_pretrained(args.lm_model_dir)
    lm_model.to(device)
    lm_model.eval()
    for param in lm_model.parameters():
        param.requires_grad = False
    eval_lm_model = SentenceScorer(device)

    if args.fp16:
        from apex import amp
        model, lm_model = amp.initialize([model, lm_model])

    for qid in queries:
        query = queries[qid]
        best = best_query_sent[qid]
        best_score = best[0]
        best_sent = ' '.join(best[1:])

        old_scores = query_scores[qid][::-1]
        if args.nature:
            collision, new_score, collision_cands = gen_natural_collision(
                query, best_sent, model, tokenizer, device, lm_model,
                best_score, eval_lm_model)
        else:
            collision, new_score, collision_cands = gen_aggressive_collision(
                query, best_sent, model, tokenizer, device, best_score,
                lm_model)

        lm_perp = eval_lm_model.perplexity(collision)
        msg = f'Query={query}\n' \
              f'Best true sentences={best_sent}\n' \
              f'Best similarity score={best_score}\n' \
              f'Collision={collision}\n' \
              f'Similarity score={new_score}\n' \
              f'LM perp={lm_perp.item()}\n'
        log(msg)

        if args.verbose:
            log('---Rank shifts for less relevant documents---')
            weighted_new_score = sum(BIRCH_ALPHAS) * new_score
            for did in bm25_q_doc[qid]:
                new_score = bm25_q_doc[qid][
                    did] * BIRCH_GAMMA + weighted_new_score * (1 - BIRCH_GAMMA)
                old_rank, old_score = target_q_doc[qid][did]
                new_rank = 1000 - bisect.bisect_left(old_scores, new_score)
                log(f'Query id={qid}, Doc id={did}, '
                    f'old score={old_score:.2f}, new score={new_score:.2f}, old rank={old_rank}, new rank={new_rank}'
                    )
Example #12
    def countOrders(self, df):
        """Calcular a quantidade de todos os pedidos por cliente.

        Args:
            df: Dataframe
        
        Returns:
            Dataframe.
        """
        log("INFO",
            f"[countOrders] Contando a quantidade de pedidos por cliente.")
        dfEnd = (df.groupBy(col('codigo_cliente')).agg(
            count('data_pedido').alias('numero_pedidos')).orderBy(
                "numero_pedidos", ascending=False))
        return dfEnd
Example #13
def perturb_logits(
        unpert_logits,
        stepsize=0.01,
        target_model_wrapper=None,
        num_iterations=3,
        kl_scale=0.01,
        temperature=1.0,
        device="cuda",
        verbose=False,
        logit_mask=0.,
):
    # Generate the initial perturbation
    grad_accumulator = np.zeros(unpert_logits.shape, dtype=np.float32)
    perturbation = to_var(grad_accumulator, device=device)
    optimizer = torch.optim.Adam([perturbation], lr=stepsize)

    # accumulate perturbations for num_iterations
    for i in range(num_iterations):
        optimizer.zero_grad()
        # Compute hidden using perturbed past
        logits = unpert_logits * temperature + perturbation + logit_mask
        probs = torch.softmax(logits / temperature, -1)
        unpert_probs = torch.softmax(unpert_logits, -1)

        loss = torch.scalar_tensor(0.0).to(device)
        loss_list = []

        if target_model_wrapper is not None:
            discrim_loss = target_model_wrapper(probs)
            if verbose and i % 2 == 0:
                log(f"Iteration {i + 1}, pplm_discrim_loss: {discrim_loss.data.cpu().numpy()}")
            loss += discrim_loss
            loss_list.append(discrim_loss)

        if kl_scale > 0.0:
            unpert_probs = unpert_probs + SMALL_CONST * (unpert_probs <= SMALL_CONST).float().to(device).detach()
            correction = SMALL_CONST * (probs <= SMALL_CONST).float().to(device).detach()
            corrected_probs = probs + correction.detach()
            kl_loss = kl_scale * (corrected_probs * (corrected_probs / unpert_probs).log()).sum()
            loss += kl_loss

        # compute gradients
        loss.backward()
        optimizer.step()

    # apply the accumulated perturbations to the original logits
    pert_logits = unpert_logits * temperature + perturbation
    return pert_logits
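
A hedged usage sketch of perturb_logits with a toy target wrapper (assumes the helpers to_var and SMALL_CONST from this module are available; run on CPU to stay self-contained):

import torch

unpert_logits = torch.randn(1, 10)

def toy_target(probs):
    # stand-in discriminator: loss shrinks as the probability of token 0 grows
    return -torch.log(probs[:, 0] + 1e-9).mean()

pert_logits = perturb_logits(unpert_logits, stepsize=0.1,
                             target_model_wrapper=toy_target,
                             num_iterations=5, device="cpu")
# the perturbed distribution should put more mass on token 0
print(torch.softmax(pert_logits, -1)[0, 0].item())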
Example #14
    def createDataFrameWords(self, df):
        """Criar dataframe com uma palavra por linha.

        Args:
            df: Dataframe.
        
        Returns:
            DataFrame: Dataframe contendo as palavras.
        """
        log(
            "INFO",
            "[createDataFrameWords] Criando DataFrame com uma palavra por linha."
        )
        dfWords = (df.select(explode(split(
            df.value, ' ')).alias('word')).where(col('word') != ''))
        return dfWords
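
A minimal sketch of the explode/split pattern (assumes an existing SparkSession named spark):

from pyspark.sql.functions import col, explode, split

df = spark.createDataFrame([("a b  c",)], ["value"])
# split on ' ' yields ["a", "b", "", "c"]; the where clause drops the empty token
df.select(explode(split(df.value, ' ')).alias('word')) \
  .where(col('word') != '').show()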
Example #15
    def read_data_from_socket_and_store_on_datalake(self, spark):
        """
            Read streaming JSON data from a socket
            and store it in the data lake

            Parameters
            ----------

            spark: SparkSession
                The SparkSession of the application

        """
        streaming_name = self._args[1]
        schema_json = self._args[2]
        date_ops = json.loads(self._args[3]) if len(self._args) == 4 else None

        output_dir = get_path_by_day(self._output_base_dir, streaming_name)

        ##################################
        #
        ## Parse schema JSON to a Spark schema
        #
        ##################################
        schema = get_schema(schema_json)
        log(spark).info("Schema: " + str(schema))

        try:
            df = spark \
                .readStream \
                .format("socket") \
                .option("host", "localhost") \
                .option("port", 9999) \
                .load()

            df = df \
                .select(from_json("value", schema).alias("json_data")) \
                .selectExpr("json_data.*")

            df.printSchema()

            if date_ops:
                df = convert_date_using_data_ops_schema(df, date_ops)

            try:
                df.writeStream \
                    .format("parquet") \
                    .option("path", output_dir) \
                    .trigger(processingTime=self._streaming_output_interval) \
                    .start() \
                    .awaitTermination()

            except Exception as e:
                log(spark).error("Error on writing database... ")
                log(spark).error(e)

        except Exception as e:
            log(spark).error(e)
Example #16
    def filterAgeUnderThirty(self, df):
        """Filtrar os clientes que são menores de 30 anos e compraram na Black Friday.

        Args:
            df: Dataframe
        
        Returns:
            Dataframe.
        """
        log(
            "INFO",
            f"[filterAgeUnderThirty] Filtrando os clientes com idade inferior a 30 anos."
        )
        dfStaging = df.dropDuplicates(['codigo_cliente'])
        dfStaging = dfStaging.filter(df.idade < 30)
        dfEnd = dfStaging.select("codigo_cliente", "idade")
        return dfEnd
Example #17
    def joinDataframes(self, dfOrders, dfCount, dfOrdersList, dfAge):
        """Faz o Join de dois DataFrames.

        Args:
            df: Dataframe.
        
        Returns:
            DataFrame: Dataframe completo.
        """
        log("INFO", "[joinDataFrame] Fazendo o Join dos DataFrames.")
        innerJoinAgeOrders = dfOrders.join(dfCount, ["codigo_cliente"],
                                           "inner")
        innerJoinStaging = innerJoinAgeOrders.join(dfOrdersList,
                                                   ["codigo_cliente"], "inner")
        innerJoinEnd = innerJoinStaging.join(dfAge, ["codigo_cliente"],
                                             "inner")
        return innerJoinEnd
Example #18
def main():
    device = torch.device(f'cuda:{args.gpu}')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = ExtSummarizer(CONFIG, torch.load(PRESUMM_MODEL_PATH))
    model.to(device)
    model.eval()
    for param in model.parameters():
        param.requires_grad = False

    if args.fp16:
        from apex import amp
        model = amp.initialize(model)

    eval_lm_model = SentenceScorer(device)
    lm_model = BertForLM.from_pretrained(args.lm_model_dir)
    lm_model.to(device)
    lm_model.eval()
    for param in lm_model.parameters():
        param.requires_grad = False

    data = load_ext_sum_data()
    for ex in data:
        ex = preprocess(ex)
        src, segs, clss, src_sent_labels, src_txt, tgt_txt = ex
        if int(len(src_sent_labels) * args.insert_pos) == 0:
            # too short to insert collision into the article
            continue

        truth = [
            src_txt[j] for j in range(len(src_sent_labels))
            if src_sent_labels[j] == 1
        ]
        truth = ' '.join(truth)
        if args.nature:
            collision, score, rank = gen_natural_collision(
                ex, model, tokenizer, device, lm_model, eval_lm_model)
        else:
            collision, score, rank = gen_aggressive_collision(
                ex, model, tokenizer, device, lm_model)

        lm_perp = eval_lm_model.perplexity(collision)
        msg = f'Ground truth summary={truth}\n' \
              f'Collision={collision}\n' \
              f'Score={score}\n' \
              f'Rank={rank}\n' \
              f'LM perp={lm_perp.item()}\n'
        log(msg)
Example #19
    def filterCustomerOrders(self, df):
        """Filtrar clientes com mais de duas compras nos dias da Black Friday

        Args:
            df: Dataframe
        
        Returns:
            Dataframe.
        """
        log(
            "INFO",
            f"[filterCustomerOrders] Filtrando clientes que fizeram duas compras nos dias da Black Friday."
        )
        dfStaging = (df.groupBy(col('codigo_cliente')).agg(
            count('data_pedido').alias('numero_pedidos')).orderBy(
                "numero_pedidos", ascending=False))
        dfEnd = dfStaging.filter(dfStaging.numero_pedidos > 2)
        return dfEnd
Example #20
    def filterShoppingBlackFriday(self, df):
        """Filtrar as compras realizadas nas últimas 3 Black Fridays.

        Args:
            df: Dataframe
        
        Returns:
            Dataframe.
        """
        log(
            "INFO",
            f"[filterShoppingBlackFriday] Filtrando as compras das Black Fridays."
        )
        bf_date = [
            "2017-11-24", "2017-11-25", "2017-11-26", "2018-11-23",
            "2018-11-24", "2018-11-25", "2019-11-29", "2019-11-30",
            "2019-12-01"
        ]
        return df.filter(col("data_pedido_date").isin(bf_date))
Example #21
 def createAgeDateColumns(self, df):
     """Criar colunas de idade do cliente e data do pedido.
     
     Args:
         df: Dataframe
     
     Returns:
         Dataframe.
     """
     log(
         "INFO",
         f"[createAgeDateColumns] Criando colunas de idade e data do pedido."
     )
     return (df.withColumn(
         "data_pedido_date",
         date_format(
             from_unixtime(col('data_pedido')),
             "yyyy-MM-dd")).withColumn('idade', (months_between(
                 current_date(), col('data_nascimento_cliente')) / 12).cast(
                     IntegerType())))
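
A minimal sketch of the age derivation used above (hypothetical birth date; assumes an existing SparkSession named spark):

from pyspark.sql.functions import col, current_date, months_between, to_date
from pyspark.sql.types import IntegerType

df = spark.createDataFrame([("1990-05-01",)], ["data_nascimento_cliente"])
df = df.withColumn("data_nascimento_cliente",
                   to_date(col("data_nascimento_cliente")))
# months between the dates, divided by 12 and truncated, gives the age in years
df.withColumn("idade", (months_between(
    current_date(), col("data_nascimento_cliente")) / 12).cast(
        IntegerType())).show()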
Example #22
    def createDataFrameLength(self, df, flag):
        """Cria um DataFrame com o tamanho das palavras e aplica um filtro.

        Args:
            df: Dataframe.
        
        Returns:
            DataFrame.
        """
        log(
            "INFO",
            "[createDataFrameLength] Criando uma coluna com o tamanho de cada palavra."
        )
        dfStaging = df.withColumn("length", length("word"))
        if flag == "smaller":
            dfEnd = dfStaging.filter(length(col('word')) <= 10)
        elif flag == "bigger":
            dfEnd = dfStaging.filter(length(col('word')) > 10)
        else:
            dfEnd = None
        return dfEnd
Example #23
    def createOrderList(self, df):
        """Criando coluna com a lista dos pedidos.
           A coluna lista dos pedidos é formada por um array de arrays. 
           Com os valores de codigo do pedido e a data do pedido.

        Args:
            df: Dataframe
        
        Returns:
            Dataframe.
        """
        log("INFO", f"[createOrderList] Criando a lista dos pedidos.")
        dfStaging = df.withColumn(
            "codigo_pedido_data",
            array(
                concat(col('codigo_pedido'), lit(', '),
                       col('data_pedido_date'))))
        dfStaging = dfStaging.drop('data_nascimento_cliente', 'data_pedido',
                                   'codigo_pedido', 'idade')
        dfEnd = dfStaging.groupBy('codigo_cliente').agg(
            collect_list(col('codigo_pedido_data')).alias('lista_pedidos'))
        return dfEnd
Example #24
def main():
    device = torch.device(f'cuda:{args.gpu}')

    model_dir = os.path.join(args.model_dir, args.task_name.lower())
    tokenizer = BertTokenizer.from_pretrained(model_dir)
    log(f'Loading model from {model_dir}')

    model = BertForConcatSequenceClassification.from_pretrained(model_dir)
    model.to(device)
    model.eval()
    for param in model.parameters():
        param.requires_grad = False

    eval_lm_model = SentenceScorer(device)
    lm_model = BertForLM.from_pretrained(args.lm_model_dir)
    lm_model.to(device)
    lm_model.eval()
    for param in lm_model.parameters():
        param.requires_grad = False

    if args.fp16:
        from apex import amp
        model, lm_model = amp.initialize([model, lm_model])

    log(f'Loading data from {args.task_name.upper()}')
    data = glue_processors[args.task_name.lower()]().get_dev_examples(
        model_dir)

    n = 0
    for inputs in data:
        if inputs.label == '1':
            n += 1
            if args.nature:
                collision, score = gen_natural_collision(
                    inputs.text_a,
                    inputs.text_b,
                    model,
                    tokenizer,
                    device,
                    lm_model=lm_model,
                    eval_lm_model=eval_lm_model)
            else:
                collision, score = gen_aggressive_collision(inputs.text_a,
                                                            model,
                                                            tokenizer,
                                                            device,
                                                            lm_model=lm_model)

            lm_perp = eval_lm_model.perplexity(collision)
            msg = f'Input={inputs.text_a}\n' \
                  f'Ground truth paraphrase={inputs.text_b}\n' \
                  f'Collision={collision}\n' \
                  f'Confidence of being paraphrase={score}\n' \
                  f'LM perp={lm_perp.item()}\n'
            log(msg)
Example #25
def gen_aggressive_collision(inputs_a,
                             inputs_b,
                             model,
                             tokenizer,
                             device,
                             margin=None,
                             lm_model=None):
    word_embedding = model.get_input_embeddings().weight.detach()
    if lm_model is not None:
        lm_word_embedding = lm_model.get_input_embeddings().weight.detach()

    vocab_size = word_embedding.size(0)
    input_mask = torch.zeros(vocab_size, device=device)
    filters = find_filters(inputs_a,
                           model,
                           tokenizer,
                           device,
                           k=args.num_filters)
    best_ids = get_inputs_filter_ids(inputs_b, tokenizer)
    input_mask[best_ids] = -1e9
    remove_tokens = add_single_plural(inputs_a, tokenizer)
    if args.verbose:
        log(','.join(remove_tokens))

    remove_ids = tokenizer.convert_tokens_to_ids(remove_tokens)
    remove_ids.append(tokenizer.vocab['.'])
    input_mask[remove_ids] = -1e9
    num_filters_ids = tokenizer.convert_tokens_to_ids(filters)
    input_mask[num_filters_ids] = -1e9
    sub_mask = get_sub_masks(tokenizer, device)

    input_ids = tokenizer.encode(inputs_a)
    input_ids = torch.tensor(input_ids, device=device).unsqueeze(0)
    # the masks above prevent outputting the num_filters neighbor words
    seq_len = args.seq_len
    batch_input_ids = torch.cat([input_ids] * args.topk, 0)
    stopwords_mask = create_constraints(seq_len, tokenizer, device)

    def relaxed_to_word_embs(x):
        # convert relaxed inputs to word embedding by softmax attention
        masked_x = x + input_mask + sub_mask
        if args.regularize:
            masked_x += stopwords_mask
        p = torch.softmax(masked_x / args.stemp, -1)
        x = torch.mm(p, word_embedding)
        # add the embedding for SEP
        x = torch.cat([x, word_embedding[tokenizer.sep_token_id].unsqueeze(0)])
        return p, x.unsqueeze(0)

    def get_lm_loss(p):
        x = torch.mm(p.detach(), lm_word_embedding).unsqueeze(0)
        return lm_model(inputs_embeds=x, one_hot_labels=p.unsqueeze(0))[0]

    # some constants
    sep_tensor = torch.tensor([tokenizer.sep_token_id] * args.topk,
                              device=device)
    batch_sep_embeds = word_embedding[sep_tensor].unsqueeze(1)
    labels = torch.ones((1, ), dtype=torch.long, device=device)
    repetition_penalty = 1.0

    best_collision = None
    best_score = -1e9
    prev_score = -1e9
    collision_cands = []

    var_size = (seq_len, vocab_size)
    z_i = torch.zeros(*var_size, requires_grad=True, device=device)
    for it in range(args.max_iter):
        optimizer = torch.optim.Adam([z_i], lr=args.lr)
        for j in range(args.perturb_iter):
            optimizer.zero_grad()
            # relaxation
            p_inputs, inputs_embeds = relaxed_to_word_embs(z_i)
            # forward to BERT with relaxed inputs
            loss, cls_logits, _ = model(input_ids,
                                        inputs_embeds=inputs_embeds,
                                        next_sentence_label=labels)
            if margin is not None:
                loss += torch.sum(torch.relu(margin - cls_logits[:, 1]))

            if args.beta > 0.:
                lm_loss = get_lm_loss(p_inputs)
                loss = args.beta * lm_loss + (1 - args.beta) * loss

            loss.backward()
            optimizer.step()
            if args.verbose and (j + 1) % 10 == 0:
                log(f'It{it}-{j + 1}, loss={loss.item()}')

        # detach to free GPU memory
        z_i = z_i.detach()

        _, topk_tokens = torch.topk(z_i, args.topk)
        probs_i = torch.softmax(z_i / args.stemp, -1).unsqueeze(0).expand(
            args.topk, seq_len, vocab_size)

        output_so_far = None
        # beam search left to right
        for t in range(seq_len):
            t_topk_tokens = topk_tokens[t]
            t_topk_onehot = torch.nn.functional.one_hot(
                t_topk_tokens, vocab_size).float()
            next_clf_scores = []
            for j in range(args.num_beams):
                next_beam_scores = torch.zeros(tokenizer.vocab_size,
                                               device=device) - 1e9
                if output_so_far is None:
                    context = probs_i.clone()
                else:
                    output_len = output_so_far.shape[1]
                    beam_topk_output = output_so_far[j].unsqueeze(0).expand(
                        args.topk, output_len)
                    beam_topk_output = torch.nn.functional.one_hot(
                        beam_topk_output, vocab_size)
                    context = torch.cat([
                        beam_topk_output.float(), probs_i[:,
                                                          output_len:].clone()
                    ], 1)
                context[:, t] = t_topk_onehot
                context_embeds = torch.einsum('blv,vh->blh', context,
                                              word_embedding)
                context_embeds = torch.cat([context_embeds, batch_sep_embeds],
                                           1)
                clf_logits = model(input_ids=batch_input_ids,
                                   inputs_embeds=context_embeds)[0]
                clf_scores = clf_logits[:, 1].detach().float()
                next_beam_scores.scatter_(0, t_topk_tokens, clf_scores)
                next_clf_scores.append(next_beam_scores.unsqueeze(0))

            next_clf_scores = torch.cat(next_clf_scores, 0)
            next_scores = next_clf_scores + input_mask + sub_mask

            if args.regularize:
                next_scores += stopwords_mask[t]

            if output_so_far is None:
                next_scores[1:] = -1e9

            if output_so_far is not None and repetition_penalty > 1.0:
                lm_model.enforce_repetition_penalty_(next_scores, 1,
                                                     args.num_beams,
                                                     output_so_far,
                                                     repetition_penalty)

            # re-organize to group the beam together
            # (we are keeping top hypothesis across beams)
            next_scores = next_scores.view(
                1, args.num_beams *
                vocab_size)  # (batch_size, num_beams * vocab_size)
            next_scores, next_tokens = torch.topk(next_scores,
                                                  args.num_beams,
                                                  dim=1,
                                                  largest=True,
                                                  sorted=True)
            # next batch beam content
            next_sent_beam = []
            for beam_token_rank, (beam_token_id,
                                  beam_token_score) in enumerate(
                                      zip(next_tokens[0], next_scores[0])):
                # get beam and token IDs
                beam_id = beam_token_id // vocab_size
                token_id = beam_token_id % vocab_size
                next_sent_beam.append((beam_token_score, token_id, beam_id))

            next_batch_beam = next_sent_beam
            # sanity check / prepare next batch
            assert len(next_batch_beam) == args.num_beams
            beam_tokens = torch.tensor([x[1] for x in next_batch_beam],
                                       device=device)
            beam_idx = torch.tensor([x[2] for x in next_batch_beam],
                                    device=device)

            # re-order batch
            if output_so_far is None:
                output_so_far = beam_tokens.unsqueeze(1)
            else:
                output_so_far = output_so_far[beam_idx, :]
                output_so_far = torch.cat(
                    [output_so_far, beam_tokens.unsqueeze(1)], dim=-1)

        pad_output_so_far = torch.cat(
            [output_so_far, sep_tensor[:args.num_beams].unsqueeze(1)], 1)
        concat_input_ids = torch.cat(
            [batch_input_ids[:args.num_beams], pad_output_so_far], 1)
        token_type_ids = torch.cat([
            torch.zeros_like(batch_input_ids[:args.num_beams]),
            torch.ones_like(pad_output_so_far)
        ], 1)
        clf_logits = model(input_ids=concat_input_ids,
                           token_type_ids=token_type_ids)[0]
        actual_clf_scores = clf_logits[:, 1]
        sorter = torch.argsort(actual_clf_scores, -1, descending=True)
        if args.verbose:
            decoded = [
                f'{actual_clf_scores[i].item():.4f}, '
                f'{tokenizer.decode(output_so_far[i].cpu().tolist())}'
                for i in sorter
            ]
            log(f'It={it}, margin={margin if margin else 0:.4f}, query={inputs_a} | ' +
                ' | '.join(decoded))

        valid_idx = sorter[0]
        valid = False
        for idx in sorter:
            valid, _ = valid_tokenization(output_so_far[idx], tokenizer)
            if valid:
                valid_idx = idx
                break

        # re-initialize z_i
        curr_best = output_so_far[valid_idx]
        next_z_i = torch.nn.functional.one_hot(curr_best, vocab_size).float()
        eps = 0.1
        next_z_i = (next_z_i *
                    (1 - eps)) + (1 - next_z_i) * eps / (vocab_size - 1)
        z_i = torch.nn.Parameter(torch.log(next_z_i), True)

        curr_score = actual_clf_scores[valid_idx].item()
        if valid and curr_score > best_score:
            best_score = curr_score
            best_collision = tokenizer.decode(curr_best.cpu().tolist())

        if curr_score <= prev_score:
            break
        prev_score = curr_score

    return best_collision, best_score, collision_cands
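
The relaxed_to_word_embs helper above is the core continuous relaxation: a temperature softmax turns the logit matrix z_i into soft one-hot rows, which are mixed with the embedding table so gradients can flow back into z_i. A minimal standalone sketch (shapes are hypothetical):

import torch

vocab_size, hidden, seq_len, temp = 10, 4, 3, 1.0
word_embedding = torch.randn(vocab_size, hidden)
z_i = torch.zeros(seq_len, vocab_size, requires_grad=True)

p = torch.softmax(z_i / temp, -1)  # (seq_len, vocab_size) soft one-hots
x = torch.mm(p, word_embedding)    # (seq_len, hidden) relaxed embeddings
x.sum().backward()                 # gradients reach z_i through the softmax
print(z_i.grad.shape)              # torch.Size([3, 10])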
Example #26
    def read_file_and_store_on_postgresql(self, spark_session, postgresql_access_dict):
        """
            Read a traditional JSON ([{"a":1, "b":2}, {...}]) file 
            and store it into a postgresql database 

            Parameters
            ----------

            spark_session: SparkSession
                The SparkSession of the application

            postgresql_access_dict: dict
                The dictionary with the data necessary to 
                access the postgresql. The json must contain
                {"database":..., "username":..., "password":...}
        """

        input_filename = self._args[2]
        file = self._input_data_dir + input_filename
        tbl_name = self._args[3]
        schema_json = self._args[4]
        date_ops = json.loads(self._args[5]) if len(self._args) == 6 else None

        ##################################
        #
        ## Parse schema JSON to a Spark schema
        #
        ##################################
        schema = get_schema(schema_json)
        log(spark_session).info("Schema: " + str(schema))

        try:
            rdd = spark_session \
                .sparkContext \
                .textFile(file) \
                .map(lambda x: json.loads(x)) \
                .flatMap(lambda x: x)

            df = spark_session.createDataFrame(rdd, schema)

            if date_ops:
                df = convert_date_using_data_ops_schema(df, date_ops)

            log(spark_session).info("Number of rows: " + str(df.count()))

            try:
                df \
                    .write \
                    .format("jdbc") \
                    .option("url", "jdbc:postgresql:" + postgresql_access_dict["database"]) \
                    .option("dbtable", "public." + tbl_name) \
                    .option("user", postgresql_access_dict["username"]) \
                    .option("password", postgresql_access_dict["password"]) \
                    .mode("overwrite") \
                    .save()
            except Exception as e:
                log(spark_session).error("Error on writing database... ")
                log(spark_session).error(e)

        except Exception as e:
            log(spark_session).error(e)
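
A hedged sketch of reading the table back with Spark's built-in JDBC source (same connection values as in the write path above):

df = spark_session.read \
    .format("jdbc") \
    .option("url", "jdbc:postgresql:" + postgresql_access_dict["database"]) \
    .option("dbtable", "public." + tbl_name) \
    .option("user", postgresql_access_dict["username"]) \
    .option("password", postgresql_access_dict["password"]) \
    .load()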
Example #27
def gen_natural_collision(inputs_a,
                          inputs_b,
                          model,
                          tokenizer,
                          device,
                          lm_model,
                          margin=None,
                          eval_lm_model=None):
    input_mask = torch.zeros(tokenizer.vocab_size, device=device)
    filters = find_filters(inputs_a,
                           model,
                           tokenizer,
                           device,
                           k=args.num_filters)
    best_ids = get_inputs_filter_ids(inputs_b, tokenizer)
    input_mask[best_ids] = -1e9

    num_filters_ids = tokenizer.convert_tokens_to_ids(filters)
    input_mask[num_filters_ids] = -1e9
    remove_tokens = add_single_plural(inputs_a, tokenizer)
    if args.verbose:
        log(','.join(remove_tokens))
    remove_ids = tokenizer.convert_tokens_to_ids(remove_tokens)
    input_mask[remove_ids] = -1e9
    input_mask[tokenizer.convert_tokens_to_ids(['.', '@', '='])] = -1e9
    unk_ids = tokenizer.encode('<unk>', add_special_tokens=False)
    input_mask[unk_ids] = -1e9

    filter_ids = [
        tokenizer.vocab[w] for w in tokenizer.vocab if not w.isalnum()
    ]
    first_mask = torch.zeros_like(input_mask)
    first_mask[filter_ids] = -1e9

    collision_init = tokenizer.convert_tokens_to_ids([BOS_TOKEN])
    start_idx = 1
    num_beams = args.num_beams
    repetition_penalty = 5.0
    curr_len = len(collision_init)

    # scores for each sentence in the beam
    beam_scores = torch.zeros((num_beams, ), dtype=torch.float, device=device)
    beam_scores[1:] = -1e9

    output_so_far = torch.tensor([collision_init] * num_beams, device=device)
    past = None
    vocab_size = tokenizer.vocab_size
    topk = args.topk
    input_ids = tokenizer.encode(inputs_a)

    input_ids = torch.tensor(input_ids, device=device).unsqueeze(0)
    batch_input_ids = torch.cat([input_ids] * topk, 0)
    sep_tensor = torch.tensor([tokenizer.sep_token_id] * topk, device=device)

    is_first = True
    word_embedding = model.get_input_embeddings().weight.detach()
    batch_sep_embeds = word_embedding[sep_tensor].unsqueeze(1)
    batch_labels = torch.ones((num_beams, ), dtype=torch.long, device=device)

    def classifier_loss(p, context):
        context = torch.nn.functional.one_hot(context, len(word_embedding))
        one_hot = torch.cat([context.float(), p.unsqueeze(1)], 1)
        x = torch.einsum('blv,vh->blh', one_hot, word_embedding)
        # add embeddings for SEP
        x = torch.cat([x, batch_sep_embeds[:num_beams]], 1)
        cls_loss = model(batch_input_ids[:num_beams],
                         inputs_embeds=x,
                         next_sentence_label=batch_labels)[0]
        return cls_loss

    best_score = -1e9
    best_collision = None
    collision_cands = []

    while (curr_len - start_idx) < args.seq_len:
        model_inputs = lm_model.prepare_inputs_for_generation(output_so_far,
                                                              past=past)
        outputs = lm_model(**model_inputs)
        present = outputs[1]
        # (batch_size * num_beams, vocab_size)
        next_token_logits = outputs[0][:, -1, :]
        lm_scores = torch.log_softmax(next_token_logits, dim=-1)

        if args.perturb_iter > 0:
            # perturb internal states of LM
            def target_model_wrapper(p):
                return classifier_loss(p,
                                       output_so_far.detach()[:, start_idx:])

            next_token_logits = perturb_logits(
                next_token_logits,
                args.lr,
                target_model_wrapper,
                num_iterations=args.perturb_iter,
                kl_scale=args.kl_scale,
                temperature=args.stemp,
                device=device,
                verbose=args.verbose,
                logit_mask=input_mask,
            )

        if repetition_penalty > 1.0:
            lm_model.enforce_repetition_penalty_(next_token_logits, 1,
                                                 num_beams, output_so_far,
                                                 repetition_penalty)
        next_token_logits = next_token_logits / args.stemp

        # (batch_size * num_beams, vocab_size)
        next_lm_scores = lm_scores + beam_scores[:, None].expand_as(lm_scores)
        _, topk_tokens = torch.topk(next_token_logits, topk)
        # get target model score here
        next_clf_scores = []
        for i in range(num_beams):
            next_beam_scores = torch.zeros(tokenizer.vocab_size,
                                           device=device) - 1e9
            if output_so_far.shape[1] > start_idx:
                curr_beam_topk = output_so_far[i,
                                               start_idx:].unsqueeze(0).expand(
                                                   topk,
                                                   output_so_far.shape[1] -
                                                   start_idx)
                # (topk, curr_len + next_token + sep)
                curr_beam_topk = torch.cat([
                    curr_beam_topk, topk_tokens[i].unsqueeze(1),
                    sep_tensor.unsqueeze(1)
                ], 1)
            else:
                curr_beam_topk = torch.cat(
                    [topk_tokens[i].unsqueeze(1),
                     sep_tensor.unsqueeze(1)], 1)
            concat_input_ids = torch.cat([batch_input_ids, curr_beam_topk], 1)
            token_type_ids = torch.cat([
                torch.zeros_like(batch_input_ids),
                torch.ones_like(curr_beam_topk),
            ], 1)
            clf_logits = model(input_ids=concat_input_ids,
                               token_type_ids=token_type_ids)[0]
            clf_scores = torch.log_softmax(clf_logits, -1)[:, 1].detach()
            next_beam_scores.scatter_(0, topk_tokens[i], clf_scores.float())
            next_clf_scores.append(next_beam_scores.unsqueeze(0))
        next_clf_scores = torch.cat(next_clf_scores, 0)

        if is_first:
            next_clf_scores += beam_scores[:, None].expand_as(lm_scores)
            next_clf_scores += first_mask
            is_first = False

        next_scores = (
            1 - args.beta) * next_clf_scores + args.beta * next_lm_scores
        next_scores += input_mask

        # re-organize to group the beam together
        # (we are keeping top hypothesis across beams)
        next_scores = next_scores.view(num_beams * vocab_size)
        next_lm_scores = next_lm_scores.view(num_beams * vocab_size)
        next_scores, next_tokens = torch.topk(next_scores,
                                              num_beams,
                                              largest=True,
                                              sorted=True)
        next_lm_scores = next_lm_scores[next_tokens]
        # next batch beam content
        next_sent_beam = []
        for beam_token_rank, (beam_token_id, beam_token_score) in enumerate(
                zip(next_tokens, next_lm_scores)):
            # get beam and token IDs
            beam_id = beam_token_id // vocab_size
            token_id = beam_token_id % vocab_size
            next_sent_beam.append((beam_token_score, token_id, beam_id))

        next_batch_beam = next_sent_beam

        # sanity check / prepare next batch
        assert len(next_batch_beam) == num_beams
        beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
        beam_tokens = output_so_far.new([x[1] for x in next_batch_beam])
        beam_idx = output_so_far.new([x[2] for x in next_batch_beam])

        # re-order batch
        output_so_far = output_so_far[beam_idx, :]
        output_so_far = torch.cat(
            [output_so_far, beam_tokens.unsqueeze(1)], dim=-1)

        # sanity check
        pad_output_so_far = torch.cat([
            output_so_far[:, start_idx:], sep_tensor[:num_beams].unsqueeze(1)
        ], 1)
        concat_input_ids = torch.cat(
            [batch_input_ids[:num_beams], pad_output_so_far], 1)
        token_type_ids = torch.cat([
            torch.zeros_like(batch_input_ids[:num_beams]),
            torch.ones_like(pad_output_so_far)
        ], 1)
        clf_logits = model(input_ids=concat_input_ids,
                           token_type_ids=token_type_ids)[0]
        actual_clf_scores = clf_logits[:, 1]
        sorter = torch.argsort(actual_clf_scores, -1, descending=True)
        if args.verbose:
            decoded = [
                f'{actual_clf_scores[i].item():.4f}, '
                f'{tokenizer.decode(output_so_far[i, start_idx:].cpu().tolist())}'
                for i in sorter
            ]
            log(f'Margin={margin if margin else 0:.4f}, query={inputs_a} | ' +
                ' | '.join(decoded))

        if curr_len > args.min_len:
            valid_idx = sorter[0]
            valid = False
            for idx in sorter:
                valid, _ = valid_tokenization(output_so_far[idx, start_idx:],
                                              tokenizer)
                if valid:
                    valid_idx = idx
                    break

            curr_score = actual_clf_scores[valid_idx].item()
            curr_collision = tokenizer.decode(
                output_so_far[valid_idx, start_idx:].cpu().tolist())
            collision_cands.append((curr_score, curr_collision))
            if valid and curr_score > best_score:
                best_score = curr_score
                best_collision = curr_collision

            if args.verbose:
                lm_perp = eval_lm_model.perplexity(curr_collision)
                log(f'LM perp={lm_perp.item()}')

        # re-order internal states
        past = lm_model._reorder_cache(present, beam_idx)
        # update current length
        curr_len = curr_len + 1

    return best_collision, best_score, collision_cands
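
The beam bookkeeping above flattens the (num_beams, vocab_size) score matrix, takes a global top-k, and recovers the beam and token ids by integer division and modulo. A minimal sketch:

import torch

num_beams, vocab_size = 3, 7
scores = torch.randn(num_beams, vocab_size).view(num_beams * vocab_size)
top_scores, top_ids = torch.topk(scores, num_beams, largest=True, sorted=True)
beam_ids = top_ids // vocab_size   # which beam each candidate extends
token_ids = top_ids % vocab_size   # which token gets appended to that beam
print(list(zip(beam_ids.tolist(), token_ids.tolist())))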
Example #28
def gen_aggressive_collision(ex, model, tokenizer, device, lm_model=None):
    src, segs, clss, src_sent_labels, src_txt, tgt_txt = ex

    word_embedding = model.bert.model.get_input_embeddings().weight.detach()
    if lm_model is not None:
        lm_word_embedding = lm_model.get_input_embeddings().weight.detach()

    vocab_size = word_embedding.size(0)
    src_ids = torch.tensor(src, device=device)
    src_embeds = word_embedding[src_ids]

    sub_mask = get_sub_masks(tokenizer, device)
    input_mask = torch.zeros(vocab_size, device=device)
    src_tokens = [
        w for w in tokenizer.convert_ids_to_tokens(src)
        if w.isalpha() and w not in STOPWORDS
    ]
    input_mask[tokenizer.convert_tokens_to_ids(src_tokens)] = -1e9
    seq_len = args.seq_len
    stopwords_mask = create_constraints(seq_len, tokenizer, device)

    def relaxed_to_word_embs(x):
        # convert relaxed inputs to word embedding by softmax attention
        masked_x = x + input_mask + sub_mask
        if args.regularize:
            masked_x += stopwords_mask

        p = torch.softmax(masked_x / args.stemp, -1)
        x = torch.mm(p, word_embedding)
        # add embeddings for CLS and SEP
        x = torch.cat([
            word_embedding[tokenizer.cls_token_id].unsqueeze(0), x,
            word_embedding[tokenizer.sep_token_id].unsqueeze(0)
        ])
        return p, x.unsqueeze(0)

    def get_lm_loss(p):
        x = torch.mm(p.detach(), lm_word_embedding).unsqueeze(0)
        return lm_model(inputs_embeds=x, one_hot_labels=p.unsqueeze(0))[0]

    # some constants
    sep_tensor = torch.tensor([tokenizer.sep_token_id] * args.topk,
                              device=device)
    batch_sep_emb = word_embedding[sep_tensor].unsqueeze(1)
    cls_tensor = torch.tensor([tokenizer.cls_token_id] * args.topk,
                              device=device)
    batch_cls_emb = word_embedding[cls_tensor].unsqueeze(1)

    label = int(len(clss) * args.insert_pos)
    labels = torch.tensor([label], device=device)
    batch_prefix_ids, batch_prefix_emb, batch_src_ids, batch_src_emb, mask_cls, batch_segs, batch_new_clss = \
        get_input_constant(label, seq_len, src_ids, src_embeds, segs, clss, device)
    prefix_embeds = batch_prefix_emb[0]
    src_embeds = batch_src_emb[0]
    type_token_ids = batch_segs[0]
    new_clss = batch_new_clss[0]

    loss_fn = torch.nn.CrossEntropyLoss()

    best_collision = None
    best_score = -1e9
    best_rank = -1
    prev_score = -1e9

    var_size = (seq_len, vocab_size)
    z_i = torch.zeros(*var_size, requires_grad=True, device=device)

    for it in range(args.max_iter):
        optimizer = torch.optim.Adam([z_i], lr=args.lr)

        for j in range(args.perturb_iter):
            optimizer.zero_grad()
            # relaxation
            p_inputs, inputs_embeds = relaxed_to_word_embs(z_i)
            # forward to BERT with relaxed inputs
            inputs_embeds = torch.cat([
                prefix_embeds.unsqueeze(0), inputs_embeds,
                src_embeds.unsqueeze(0)
            ], 1)
            scores = model(None,
                           type_token_ids,
                           new_clss,
                           None,
                           mask_cls,
                           inputs_embeds,
                           output_logits=True)
            loss = loss_fn(scores, labels)
            scores = scores.squeeze()
            loss += torch.max(scores) - scores[label]
            if args.beta > 0.:
                lm_loss = get_lm_loss(p_inputs)
                loss = args.beta * lm_loss + (1 - args.beta) * loss

            loss.backward()
            optimizer.step()
            if args.verbose and (j + 1) % 10 == 0:
                log(f'It{it}-{j + 1}, loss={loss.item()}')

        # detach to free GPU memory
        z_i = z_i.detach()

        _, topk_tokens = torch.topk(z_i, args.topk)
        probs_i = torch.softmax(z_i / args.stemp, -1).unsqueeze(0).expand(
            args.topk, seq_len, vocab_size)

        output_so_far = None
        # beam search left to right
        for t in range(seq_len):
            t_topk_tokens = topk_tokens[t]
            t_topk_onehot = torch.nn.functional.one_hot(
                t_topk_tokens, vocab_size).float()
            next_clf_scores = []
            for j in range(args.num_beams):
                next_beam_scores = torch.zeros(tokenizer.vocab_size,
                                               device=device) - 1e9
                if output_so_far is None:
                    context = probs_i.clone()
                else:
                    output_len = output_so_far.shape[1]
                    beam_topk_output = output_so_far[j].unsqueeze(0).expand(
                        args.topk, output_len)
                    beam_topk_output = torch.nn.functional.one_hot(
                        beam_topk_output, vocab_size)
                    context = torch.cat([
                        beam_topk_output.float(), probs_i[:,
                                                          output_len:].clone()
                    ], 1)
                context[:, t] = t_topk_onehot
                context_emb = torch.einsum('blv,vh->blh', context,
                                           word_embedding)

                context_emb = torch.cat(
                    [batch_cls_emb, context_emb, batch_sep_emb], 1)
                inputs_emb = torch.cat(
                    [batch_prefix_emb, context_emb, batch_src_emb], 1)
                scores = model(None,
                               batch_segs,
                               batch_new_clss,
                               None,
                               mask_cls,
                               inputs_emb,
                               output_logits=True)
                clf_scores = scores[:, label].detach().float()
                next_beam_scores.scatter_(0, t_topk_tokens, clf_scores)
                next_clf_scores.append(next_beam_scores.unsqueeze(0))

            next_clf_scores = torch.cat(next_clf_scores, 0)
            next_scores = next_clf_scores + input_mask + sub_mask
            if args.regularize:
                next_scores += stopwords_mask[t]

            if output_so_far is None:
                next_scores[1:] = -1e9

            # re-organize to group the beam together
            # (we are keeping top hypothesis across beams)
            next_scores = next_scores.view(
                1, args.num_beams *
                vocab_size)  # (batch_size, num_beams * vocab_size)
            next_scores, next_tokens = torch.topk(next_scores,
                                                  args.num_beams,
                                                  dim=1,
                                                  largest=True,
                                                  sorted=True)
            # next batch beam content
            next_sent_beam = []
            for beam_token_rank, (beam_token_id,
                                  beam_token_score) in enumerate(
                                      zip(next_tokens[0], next_scores[0])):
                # get beam and token IDs
                beam_id = beam_token_id // vocab_size
                token_id = beam_token_id % vocab_size
                next_sent_beam.append((beam_token_score, token_id, beam_id))

            next_batch_beam = next_sent_beam

            # sanity check / prepare next batch
            assert len(next_batch_beam) == args.num_beams
            beam_tokens = torch.tensor([x[1] for x in next_batch_beam],
                                       device=device)
            beam_idx = torch.tensor([x[2] for x in next_batch_beam],
                                    device=device)

            # re-order batch
            if output_so_far is None:
                output_so_far = beam_tokens.unsqueeze(1)
            else:
                output_so_far = output_so_far[beam_idx, :]
                output_so_far = torch.cat(
                    [output_so_far, beam_tokens.unsqueeze(1)], dim=-1)

        pad_output_so_far = torch.cat([
            cls_tensor[:args.num_beams].unsqueeze(1), output_so_far,
            sep_tensor[:args.num_beams].unsqueeze(1)
        ], 1)
        concat_input_ids = torch.cat([
            batch_prefix_ids[:args.num_beams], pad_output_so_far,
            batch_src_ids[:args.num_beams]
        ], 1)
        actual_scores = model.forward(concat_input_ids,
                                      batch_segs[:args.num_beams],
                                      batch_new_clss[:args.num_beams], None,
                                      mask_cls, None).squeeze()
        actual_clf_scores = actual_scores[:, label].detach()
        top_scores, top_labels = torch.topk(actual_scores,
                                            actual_scores.shape[-1])
        sorter = torch.argsort(actual_clf_scores, -1, descending=True)
        if args.verbose:
            decoded = [
                f'{actual_clf_scores[i].item():.4f}, '
                f'{tokenizer.decode(output_so_far[i].cpu().tolist())}'
                for i in sorter
            ]
            log(f'It={it}, margin={top_scores[:, 2].max().item()} | ' +
                ' | '.join(decoded))

        valid_idx = sorter[0]
        valid = False
        for idx in sorter:
            valid, _ = valid_tokenization(output_so_far[idx], tokenizer)
            if valid:
                valid_idx = idx
                break

        # re-initialize z_i
        curr_best = output_so_far[valid_idx]
        next_z_i = torch.nn.functional.one_hot(curr_best, vocab_size).float()
        eps = 0.1
        next_z_i = (next_z_i *
                    (1 - eps)) + (1 - next_z_i) * eps / (vocab_size - 1)
        z_i = torch.nn.Parameter(torch.log(next_z_i), True)

        curr_score = actual_clf_scores[valid_idx].item()
        curr_collision = tokenizer.decode(curr_best.cpu().tolist())
        curr_rank = (top_labels[valid_idx] == label).nonzero().squeeze().item()
        if valid and curr_score > best_score:
            best_score = curr_score
            best_collision = curr_collision
            best_rank = curr_rank

        if prev_score == curr_score:
            break
        prev_score = curr_score

    return best_collision, best_score, best_rank
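
The z_i re-initialization above is label smoothing in log space: put 1 - eps on the current best token, spread eps over the rest, and take the log so the next softmax round starts near that one-hot. A minimal sketch:

import torch

vocab_size, eps = 8, 0.1
curr_best = torch.tensor([3, 1])  # hypothetical current best token ids
one_hot = torch.nn.functional.one_hot(curr_best, vocab_size).float()
next_z_i = one_hot * (1 - eps) + (1 - one_hot) * eps / (vocab_size - 1)
z_i = torch.log(next_z_i)         # softmax(z_i) recovers the smoothed dist
print(torch.softmax(z_i, -1).argmax(-1))  # tensor([3, 1])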
Example #29
def gen_natural_collision(ex,
                          model,
                          tokenizer,
                          device,
                          lm_model,
                          eval_lm_model=None):
    src, segs, clss, src_sent_labels, src_txt, tgt_txt = ex
    word_embedding = model.bert.model.get_input_embeddings().weight.detach()
    collision_init = tokenizer.convert_tokens_to_ids([BOS_TOKEN])

    start_idx = 1
    num_beams = args.num_beams
    repetition_penalty = 5.0
    curr_len = len(collision_init)

    # scores for each sentence in the beam
    beam_scores = torch.zeros((num_beams, ), dtype=torch.float, device=device)
    beam_scores[1:] = -1e9

    output_so_far = torch.tensor([collision_init] * num_beams, device=device)
    past = None
    vocab_size = tokenizer.vocab_size
    topk = args.topk
    src_ids = torch.tensor(src, device=device)
    src_embeds = word_embedding[src_ids]

    sub_mask = get_sub_masks(tokenizer, device)
    filter_ids = [
        tokenizer.vocab[w] for w in tokenizer.vocab if not w.isalnum()
    ]
    first_mask = torch.zeros_like(sub_mask)
    first_mask[filter_ids] = -1e9
    input_mask = torch.zeros(vocab_size, device=device)
    src_tokens = [
        w for w in tokenizer.convert_ids_to_tokens(src)
        if w.isalpha() and w not in STOPWORDS
    ]
    input_mask[tokenizer.convert_tokens_to_ids(src_tokens)] = -1e9
    input_mask[tokenizer.convert_tokens_to_ids(['.', '@', '='])] = -1e9
    unk_ids = tokenizer.encode('<unk>', add_special_tokens=False)
    input_mask[unk_ids] = -1e9

    sep_tensor = torch.tensor([tokenizer.sep_token_id] * topk, device=device)
    cls_tensor = torch.tensor([tokenizer.cls_token_id] * topk, device=device)

    is_first = True
    batch_sep_emb = word_embedding[sep_tensor].unsqueeze(1)
    batch_cls_emb = word_embedding[cls_tensor].unsqueeze(1)
    label = int(len(clss) * args.insert_pos)
    labels = torch.tensor([label] * num_beams, device=device)
    loss_fn = torch.nn.CrossEntropyLoss()

    def classifier_loss(p, context, pre_emb, src_emb, type_token_ids, new_clss,
                        mask):
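        # Differentiable surrogate: mix the relaxed next-token distribution p
        # with the hard one-hot context, then go through the embedding matrix
        # so the classifier loss can back-propagate into p.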
        context = torch.nn.functional.one_hot(context, len(word_embedding))
        one_hot = torch.cat([context.float(), p.unsqueeze(1)], 1)
        x = torch.einsum('blv,vh->blh', one_hot, word_embedding)
        # add embeddings for SEP
        x = torch.cat(
            [batch_cls_emb[:num_beams], x, batch_sep_emb[:num_beams]], 1)
        inputs_embeds = torch.cat([pre_emb, x, src_emb], 1)
        scores = model(None,
                       type_token_ids,
                       new_clss,
                       None,
                       mask,
                       inputs_embeds,
                       output_logits=True)
        loss = loss_fn(scores, labels)
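        # Margin term: shrink the gap between the current top-scoring sentence
        # and the target sentence at index `label`.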
        loss += torch.mean(torch.max(scores, 1)[0] - scores[:, label])
        return loss

    best_collision = None
    best_score = -1e9
    best_rank = -1

    while curr_len < args.seq_len:
        seq_len = curr_len - start_idx + 1
        batch_prefix_ids, batch_prefix_emb, batch_src_ids, batch_src_emb, mask_cls, batch_segs, batch_new_clss = \
            get_input_constant(label, seq_len, src_ids, src_embeds, segs, clss, device)
        model_inputs = lm_model.prepare_inputs_for_generation(output_so_far,
                                                              past=past)
        outputs = lm_model(**model_inputs)
        present = outputs[1]
        # (batch_size * num_beams, vocab_size)
        next_token_logits = outputs[0][:, -1, :]
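        # Accumulate running beam log-probabilities: each beam's score plus
        # the new token's log-prob under the LM.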
        lm_scores = torch.log_softmax(next_token_logits, dim=-1)
        next_lm_scores = lm_scores + beam_scores[:, None].expand_as(lm_scores)

        if args.perturb_iter > 0:
            # Gradient-based perturbation of the LM's next-token logits toward
            # lower classifier loss, with a KL term keeping them close to the
            # original distribution
            def target_model_wrapper(p):
                return classifier_loss(p,
                                       output_so_far.detach()[:, start_idx:],
                                       batch_prefix_emb[:num_beams],
                                       batch_src_emb[:num_beams],
                                       batch_segs[:num_beams],
                                       batch_new_clss[:num_beams], mask_cls)

            next_token_logits = perturb_logits(
                next_token_logits,
                args.lr,
                target_model_wrapper,
                num_iterations=args.perturb_iter,
                kl_scale=args.kl_scale,
                temperature=args.stemp,
                device=device,
                verbose=args.verbose,
                logit_mask=input_mask,
            )

        if repetition_penalty > 1.0:
            lm_model.enforce_repetition_penalty_(next_token_logits, 1,
                                                 num_beams, output_so_far,
                                                 repetition_penalty)

        next_token_logits = next_token_logits / args.stemp
        # (batch_size * num_beams, vocab_size)
        _, topk_tokens = torch.topk(next_token_logits, topk)

        # Score each beam's top-k continuations with the target summarizer:
        # wrap prefix + candidate token in CLS/SEP, concatenate with the
        # document, and read off the target sentence's log-probability
        next_clf_scores = []
        for i in range(num_beams):
            next_beam_scores = torch.zeros(tokenizer.vocab_size,
                                           device=device) - 1e9
            if output_so_far.shape[1] > start_idx:
                curr_beam_topk = output_so_far[i, start_idx:].unsqueeze(0).expand(
                    topk, output_so_far.shape[1] - start_idx)
                # (topk, curr_len + next_token + sep)
                curr_beam_topk = torch.cat([
                    cls_tensor.unsqueeze(1), curr_beam_topk,
                    topk_tokens[i].unsqueeze(1),
                    sep_tensor.unsqueeze(1)
                ], 1)
            else:
                curr_beam_topk = torch.cat([
                    cls_tensor.unsqueeze(1), topk_tokens[i].unsqueeze(1),
                    sep_tensor.unsqueeze(1)
                ], 1)
            concat_input_ids = torch.cat(
                [batch_prefix_ids, curr_beam_topk, batch_src_ids], 1)
            scores = model(concat_input_ids, batch_segs, batch_new_clss, None,
                           mask_cls, None)
            clf_scores = torch.log_softmax(scores, -1)[:, label].detach()
            next_beam_scores.scatter_(0, topk_tokens[i], clf_scores)
            next_clf_scores.append(next_beam_scores.unsqueeze(0))
        next_clf_scores = torch.cat(next_clf_scores, 0)

        if is_first:
            next_clf_scores += beam_scores[:, None].expand_as(lm_scores)
            next_clf_scores += first_mask
            is_first = False

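        # beta trades off LM fluency (next_lm_scores) against the target
        # model's score for the sentence (next_clf_scores).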
        next_scores = (
            1 - args.beta) * next_clf_scores + args.beta * next_lm_scores
        next_scores += input_mask

        # flatten to (num_beams * vocab_size) so we keep the top hypotheses
        # across all beams jointly
        next_scores = next_scores.view(num_beams * vocab_size)
        next_lm_scores = next_lm_scores.view(num_beams * vocab_size)
        next_scores, next_tokens = torch.topk(next_scores,
                                              num_beams,
                                              largest=True,
                                              sorted=True)
        next_lm_scores = next_lm_scores[next_tokens]
        # next batch beam content
        next_sent_beam = []
        for beam_token_rank, (beam_token_id, beam_token_score) in enumerate(
                zip(next_tokens, next_lm_scores)):
            # get beam and token IDs
            beam_id = beam_token_id // vocab_size
            token_id = beam_token_id % vocab_size
            next_sent_beam.append((beam_token_score, token_id, beam_id))

        next_batch_beam = next_sent_beam
        # sanity check / prepare next batch
        assert len(next_batch_beam) == num_beams
        beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
        beam_tokens = output_so_far.new([x[1] for x in next_batch_beam])
        beam_idx = output_so_far.new([x[2] for x in next_batch_beam])

        # re-order batch
        output_so_far = output_so_far[beam_idx, :]
        output_so_far = torch.cat(
            [output_so_far, beam_tokens.unsqueeze(1)], dim=-1)

        # re-score the fully grown beams with the target model
        pad_output_so_far = torch.cat([
            cls_tensor[:num_beams].unsqueeze(1), output_so_far[:, start_idx:],
            sep_tensor[:num_beams].unsqueeze(1)
        ], 1)
        concat_input_ids = torch.cat([
            batch_prefix_ids[:num_beams], pad_output_so_far,
            batch_src_ids[:num_beams]
        ], 1)
        actual_scores = model.forward(concat_input_ids, batch_segs[:num_beams],
                                      batch_new_clss[:num_beams], None,
                                      mask_cls, None)
        top_scores, top_labels = torch.topk(actual_scores,
                                            actual_scores.shape[-1])
        actual_clf_scores = actual_scores[:, label].detach()
        sorter = torch.argsort(actual_clf_scores, -1, descending=True)
        if args.verbose:
            decoded = [
                f'{actual_clf_scores[i].item():.4f}, '
                f'{tokenizer.decode(output_so_far[i, start_idx:].cpu().tolist())}'
                for i in sorter
            ]
            log(f'Margin={top_scores[:, 2].max().item()} | ' +
                ' | '.join(decoded))

        # re-order the LM's cached key/value states to follow surviving beams
        past = lm_model._reorder_cache(present, beam_idx)
        # update current length
        curr_len = curr_len + 1

        if curr_len > args.min_len:
            valid_idx = sorter[0]
            valid = False
            for idx in sorter:
                valid, _ = valid_tokenization(output_so_far[idx, start_idx:],
                                              tokenizer)
                if valid:
                    valid_idx = idx
                    break
            curr_score = actual_clf_scores[valid_idx].item()
            curr_collision = tokenizer.decode(
                output_so_far[valid_idx, start_idx:].cpu().tolist())
            curr_rank = (
                top_labels[valid_idx] == label).nonzero().squeeze().item()
            if valid and curr_score > best_score:
                best_score = curr_score
                best_collision = curr_collision
                best_rank = curr_rank

            if args.verbose and eval_lm_model is not None:
                lm_perp = eval_lm_model.perplexity(curr_collision)
                log(f'LM perp={lm_perp.item()}')

    return best_collision, best_score, best_rank
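
A hypothetical driver loop for gen_natural_collision, for illustration only; load_examples, model, tokenizer, device, lm_model, and eval_lm_model are stand-ins for whatever the surrounding script provides, not names confirmed by the repository:

# Each ex unpacks as (src, segs, clss, src_sent_labels, src_txt, tgt_txt).
for ex in load_examples():  # hypothetical loader
    collision, score, rank = gen_natural_collision(
        ex, model, tokenizer, device, lm_model, eval_lm_model=eval_lm_model)
    log(f'Collision={collision}\nScore={score}\nRank={rank}\n')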