Exemplo n.º 1
0
    def __init__(self, model_path: str, questions_path: str, preprocessing: Preprocessing, identifier: str,
                 *preprocessor_args, use_cuda=True, limit_posts=None):
        """Wire up the preprocessor, the preprocessed post parser and the embedding model.

        Args:
            model_path: Handed to ``SentenceTransformer`` as the model to load.
            questions_path: Handed to ``DataReaderRecord`` as its first argument.
            preprocessing: Selects the substituer: ``LATEX`` -> ``BlankSubstituer``,
                ``PREFIX`` -> ``PolishSubstituer``, ``INFIX`` -> ``InfixSubstituer``.
            identifier: Stored unchanged as ``self.identifier``.
            *preprocessor_args: Forwarded to ``PolishSubstituer`` / ``InfixSubstituer``;
                not used for ``LATEX``.
            use_cuda: When truthy the model is placed on ``"cuda"``, otherwise on ``"cpu"``.
            limit_posts: Forwarded to ``DataReaderRecord`` as ``limit_posts``.

        Raises:
            NotImplementedError: If ``preprocessing`` equals none of the three
                supported members.
        """
        # Resolve the substituer first; an unsupported mode fails before any data is read.
        if preprocessing == Preprocessing.LATEX:
            substituer = BlankSubstituer()
        elif preprocessing == Preprocessing.PREFIX:
            substituer = PolishSubstituer(*preprocessor_args)
        elif preprocessing == Preprocessing.INFIX:
            substituer = InfixSubstituer(*preprocessor_args)
        else:
            supported = Preprocessing.enum_members
            raise NotImplementedError("No default preprocessor. Pick one from %s" % supported)
        self.preprocessor = substituer

        # Read the posts, then let the chosen preprocessor transform the reader's parser.
        record_reader = DataReaderRecord(questions_path, limit_posts=limit_posts)
        self.parser = self.preprocessor.process_parser(record_reader.post_parser)

        if use_cuda:
            self.device = "cuda"
        else:
            self.device = "cpu"

        self.model = SentenceTransformer(model_path, device=self.device)
        self.identifier = identifier
from ARQMathCode.post_reader_record import DataReaderRecord
from arqmath_eval import get_topics, get_judged_documents, get_ndcg
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os

from preproc.question_answer.blank_substituer import BlankSubstituer
from preproc.question_answer.infix_substituer import InfixSubstituer
from preproc.question_answer.external_substituer import ExternalSubstituer
from preproc.question_answer.polish_substituer import PolishSubstituer

# Device string for this module; fixed to CUDA here.
device = "cuda"

# Hard-coded, machine-specific absolute path handed to DataReaderRecord below.
clef_home_directory_file_path = '/home/xstefan3/arqmath/data/Collection_v1.0'
# Module-level reader, constructed at import time. get_questions() looks up
# questions through dr.post_parser.map_questions.
dr = DataReaderRecord(clef_home_directory_file_path)


def get_questions(all_questions_ids, preproc="blank"):
    # prefix has a priority, go for infix only if prefix=False
    all_questions_raw = dict([(int(qid),
                               dr.post_parser.map_questions[int(qid)])
                              for qid in all_questions_ids])
    if preproc == "blank":
        postprocessor = BlankSubstituer()
    elif preproc == "prefix":
        postprocessor = PolishSubstituer(
            '/home/xstefan3/arqmath/data/Collection_v1.0/formula_prefix.V1.0.tsv'
        )
    elif preproc == "infix":
        postprocessor = InfixSubstituer('???')
Exemplo n.º 3
0
from preproc.question_answer.polish_substituer import PolishSubstituer
from preproc.question_answer.blank_substituer import BlankSubstituer
from question_answer.utils import examples_from_questions_tup
from sentence_transformers import SentenceTransformer, losses, SentencesDataset
from scripts.loader_to_tsv import dump_to_tsv

from sentence_transformers.evaluation import IREvaluator
# from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Device string passed to SentenceTransformer below; this script runs on CPU.
device = "cpu"

# Load the sentence-embedding model from a hard-coded, machine-specific path.
model = SentenceTransformer('/data/arqmath/models/train_sampled_eval16',
                            device=device)

# Hard-coded collection location handed to DataReaderRecord.
clef_home_directory_file_path = '/data/arqmath/ARQMath_CLEF2020/Collection_v1.0'
# limit_posts=1000 presumably caps how many posts are read -- TODO confirm
# against DataReaderRecord.
dr = DataReaderRecord(clef_home_directory_file_path, limit_posts=1000)

# Alternative preprocessors are kept below as commented-out code; only the
# PolishSubstituer assignment is active.
# postprocessor = UniquePrefixSubstituer('/data/arqmath/ARQMath_CLEF2020/Collection/formula_prefix.V0.2.tsv',
#                                        "/home/michal/Documents/projects/arqmath/compubert/question_answer/out/0_BERT/vocab.txt")
postprocessor = PolishSubstituer(
    '/data/arqmath/ARQMath_CLEF2020/Collection_v1.0/formula_prefix.V1.0.tsv')
# postprocessor = BlankSubstituer()

# Run every question known to the reader through the preprocessor and
# materialize the result as a list.
postproc_questions = list(
    postprocessor.process_questions(dr.post_parser.map_questions))
# postprocessor.extend_sbert_vocab(model)

# Turn the preprocessed questions into examples via examples_from_questions_tup.
all_examples = list(examples_from_questions_tup(postproc_questions))
# NOTE(review): the commented-out line below is identical to the active line above.
# all_examples = list(examples_from_questions_tup(postproc_questions))
examples_len = len(all_examples)
Exemplo n.º 4
0
 def data_reader():
     """Return a new ``DataReaderRecord`` constructed with the argument ``'/data'``."""
     reader = DataReaderRecord('/data')
     return reader