def parse_file_full_embeddings_tapas(fname, outfilename):
    # Assumes json, pandas (pd), torch, the transformers TAPAS classes, and the
    # module-level helpers _tbl, SPLIT_WORD and device are available.
    model_name = 'google/tapas-base'
    tokenizer = TapasTokenizer.from_pretrained(model_name)
    config = TapasConfig.from_pretrained('google/tapas-base-finetuned-wikisql-supervised')
    model = TapasForQuestionAnswering.from_pretrained(model_name, config=config).to(device)

    final_dict = {}
    with open(fname) as f:
        data = list(f)

    print("Num Examples: {}".format(len(data)))
    for i, line in enumerate(data):
        #print(line)
        result = json.loads(line)
        tbl_id = result['table_id']
        table_string = ' '.join(_tbl(result))
        table_list = table_string.split(SPLIT_WORD)
        table_list_filtered = [token for token in table_list if token != '']
        dict_index = {key : [] for key in table_list_filtered}
        table = pd.DataFrame(dict_index)

        query = [' '.join(result['question']['words'])]
        inputs = tokenizer(table=table, queries=query, return_tensors="pt").to(device)
        with torch.no_grad():
            out = model(**inputs)[0].tolist()
        final_dict[tbl_id] = out

        if i % 200 == 0:
            print("Num Examples Done: {}".format(i))

    with open(outfilename, 'w') as outfile:
        json.dump(final_dict, outfile)
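
A minimal sketch of how the parser might be driven. The file names are placeholders, and _tbl and SPLIT_WORD must come from the surrounding module:

import json

import pandas as pd
import torch
from transformers import TapasConfig, TapasForQuestionAnswering, TapasTokenizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

parse_file_full_embeddings_tapas('dev.jsonl', 'dev_embeddings.json')  # placeholder paths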
Example #2
def load_model():
    print('downloading model')
    model_name = 'google/tapas-base-finetuned-wtq'
    model = TapasForQuestionAnswering.from_pretrained(model_name)
    tokenizer = TapasTokenizer.from_pretrained(model_name)
    print('model downloaded')
    return model, tokenizer
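
A quick smoke test of the returned pair; the table is toy data, and note that TapasTokenizer expects every cell to be a string:

import pandas as pd

model, tokenizer = load_model()
table = pd.DataFrame({'City': ['Paris', 'London'], 'Population': ['2141000', '8982000']})
inputs = tokenizer(table=table, queries=['What is the population of London?'], return_tensors='pt')
outputs = model(**inputs)  # cell-selection logits plus aggregation logits
print(outputs.logits.shape, outputs.logits_aggregation.shape)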
Example #3
def default_tokenizer(self):
    return TapasTokenizer.from_pretrained(
        "google/tapas-base-finetuned-wtq")
Example #4
def korean_table_question_answering_example():
	from transformers import pipeline
	from transformers import TapasConfig, TapasForQuestionAnswering, TapasTokenizer
	import pandas as pd
	# REF [site] >> https://github.com/monologg/KoBERT-Transformers
	from tokenization_kobert import KoBertTokenizer

	data_dict = {
		# Columns: actor, age, number of film appearances, date of birth.
		'배우': ['송광호', '최민식', '설경구'],
		'나이': ['54', '58', '53'],
		'출연작품수': ['38', '32', '42'],
		'생년월일': ['1967/02/25', '1962/05/30', '1967/05/14'],
	}
	data_df = pd.DataFrame.from_dict(data_dict)

	if False:
		# Show the data frame.
		from IPython.display import display, HTML
		display(data_df)
		#print(HTML(data_df.to_html()).data)

	query = '최민식씨의 나이는?'  # "How old is Choi Min-sik?"

	# REF [site] >> https://huggingface.co/monologg
	pretrained_model_name = 'monologg/kobert'
	#pretrained_model_name = 'monologg/distilkobert'

	if False:
		# Not working: 'monologg/kobert' is a plain BERT checkpoint, not a TAPAS model.

		table_pipeline = pipeline(
			'table-question-answering',
			model=pretrained_model_name,
			tokenizer=KoBertTokenizer.from_pretrained(pretrained_model_name)
		)
	elif False:
		# Not working: the BERT checkpoint lacks the TAPAS-specific weights.

		#config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True, select_one_column=False)
		#model = TapasForQuestionAnswering.from_pretrained(pretrained_model_name, config=config)
		model = TapasForQuestionAnswering.from_pretrained(pretrained_model_name)

		table_pipeline = pipeline(
			'table-question-answering',
			model=model,
			tokenizer=KoBertTokenizer.from_pretrained(pretrained_model_name)
		)
	else:
		# Not correctly working: 'monologg/kobert' was never trained for TAPAS,
		# so the answers are unreliable.

		model = TapasForQuestionAnswering.from_pretrained(pretrained_model_name)

		table_pipeline = pipeline(
			'table-question-answering',
			model=model,
			tokenizer=TapasTokenizer.from_pretrained(pretrained_model_name)
		)

	answer = table_pipeline(table=data_dict, query=query)
	#answer = table_pipeline(table=data_df, query=query)
	print('Answer: {}.'.format(answer))
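
Why the first two branches fail: TAPAS does not consume a table as plain text. TapasTokenizer flattens the table and attaches seven parallel token-type ids per token (segment, column, row, ranks, numeric relation), which the model's embedding layer requires; KoBertTokenizer produces none of that, and 'monologg/kobert' carries no TAPAS weights, so at best the last branch runs with an English vocabulary that maps most Korean text to [UNK]. A small probe, run in the same scope as the example above, makes the extra structure visible (the shapes, not the answers, are the point):

tok = TapasTokenizer.from_pretrained('google/tapas-base-finetuned-wtq')
enc = tok(table=data_df, queries=[query], return_tensors='pt')
print(enc['token_type_ids'].shape)  # (1, seq_len, 7): seven structure ids per token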
Example #5
import gradio as gr
import pandas as pd
from transformers import TapasTokenizer, TapasForQuestionAnswering, TapasConfig

model_name = 'google/tapas-base-finetuned-wtq'
model = TapasForQuestionAnswering.from_pretrained(model_name)
tokenizer = TapasTokenizer.from_pretrained(model_name)

df_table = pd.read_csv("df_table.csv")
# TapasTokenizer expects every cell to be a string.
df_table = df_table.astype(str)


def predict(table, queries):
    inputs = tokenizer(table=table, queries=queries,
                       padding='max_length', truncation=True,
                       return_tensors="pt")
    outputs = model(**inputs)
    predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
        inputs,
        outputs.logits.detach(),
        outputs.logits_aggregation.detach())

    # let's print out the results:
    id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}
    aggregation_predictions_string = [id2aggregation[x]
                                      for x in predicted_aggregation_indices]
    answers = []
    for coordinates in predicted_answer_coordinates:
        if len(coordinates) == 1:
            # only a single cell:
            answers.append(table.iat[coordinates[0]])
        else:
            # multiple cells: join the selected cell values
            cell_values = [table.iat[coordinate] for coordinate in coordinates]
            answers.append(", ".join(cell_values))

    # Combine the aggregation operator with the selected cells
    # (standard TAPAS decoding recipe from the transformers docs).
    results = []
    for answer, agg in zip(answers, aggregation_predictions_string):
        results.append(answer if agg == "NONE" else f"{agg} > {answer}")
    return results
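
The snippet stops before the Gradio wiring; a minimal sketch of how predict could be exposed, with the interface layout assumed:

def answer(query):
    # Run a single query against the pre-loaded table.
    return predict(df_table, [query])[0]

demo = gr.Interface(fn=answer, inputs='text', outputs='text')
demo.launch()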
Example #6
def convert_tf_checkpoint_to_pytorch(task, reset_position_index_per_cell,
                                     tf_checkpoint_path, tapas_config_file,
                                     pytorch_dump_path):
    # Initialize the PyTorch model. Assumes the Tapas* model classes,
    # TapasConfig, TapasTokenizer and load_tf_weights_in_tapas are imported
    # from transformers. To convert a checkpoint that uses absolute position
    # embeddings, set reset_position_index_per_cell to False.

    # initialize configuration from json file
    config = TapasConfig.from_json_file(tapas_config_file)
    # set absolute/relative position embeddings parameter
    config.reset_position_index_per_cell = reset_position_index_per_cell

    # set remaining parameters of TapasConfig as well as the model based on the task
    if task == "SQA":
        model = TapasForQuestionAnswering(config=config)
    elif task == "WTQ":
        # run_task_main.py hparams
        config.num_aggregation_labels = 4
        config.use_answer_as_supervision = True
        # hparam_utils.py hparams
        config.answer_loss_cutoff = 0.664694
        config.cell_selection_preference = 0.207951
        config.huber_loss_delta = 0.121194
        config.init_cell_selection_weights_to_zero = True
        config.select_one_column = True
        config.allow_empty_column_selection = False
        config.temperature = 0.0352513

        model = TapasForQuestionAnswering(config=config)
    elif task == "WIKISQL_SUPERVISED":
        # run_task_main.py hparams
        config.num_aggregation_labels = 4
        config.use_answer_as_supervision = False
        # hparam_utils.py hparams
        config.answer_loss_cutoff = 36.4519
        config.cell_selection_preference = 0.903421
        config.huber_loss_delta = 222.088
        config.init_cell_selection_weights_to_zero = True
        config.select_one_column = True
        config.allow_empty_column_selection = True
        config.temperature = 0.763141

        model = TapasForQuestionAnswering(config=config)
    elif task == "TABFACT":
        model = TapasForSequenceClassification(config=config)
    elif task == "MLM":
        model = TapasForMaskedLM(config=config)
    elif task == "INTERMEDIATE_PRETRAINING":
        model = TapasModel(config=config)
    else:
        raise ValueError(f"Task {task} not supported.")

    print(f"Building PyTorch model from configuration: {config}")
    # Load weights from tf checkpoint
    load_tf_weights_in_tapas(model, config, tf_checkpoint_path)

    # Save pytorch-model (weights and configuration)
    print(f"Save PyTorch model to {pytorch_dump_path}")
    model.save_pretrained(pytorch_dump_path)

    # Save tokenizer files. tf_checkpoint_path ends with "model.ckpt"
    # (10 characters); stripping it yields the directory containing vocab.txt.
    print(f"Save tokenizer files to {pytorch_dump_path}")
    tokenizer = TapasTokenizer(vocab_file=tf_checkpoint_path[:-10] + "vocab.txt",
                               model_max_length=512)
    tokenizer.save_pretrained(pytorch_dump_path)

    print("Used relative position embeddings:",
          model.config.reset_position_index_per_cell)
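
A hypothetical invocation; every path is a placeholder, and the checkpoint file must be named model.ckpt so the vocab.txt lookup above resolves to the checkpoint's directory:

convert_tf_checkpoint_to_pytorch(
    task='WTQ',
    reset_position_index_per_cell=True,   # relative position embeddings
    tf_checkpoint_path='/path/to/wtq/model.ckpt',
    tapas_config_file='/path/to/wtq/tapas_config.json',
    pytorch_dump_path='/path/to/output_dir',
)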