Example #1
def main(args):
    # Read the dataset
    df = pd.read_csv(args.file)
    # Build the encoder: BERT token embeddings + mean pooling over tokens
    embedder = BertWrapper(args.model_path, max_seq_length=256)
    pooler = PoolingLayer(embedder.get_word_embedding_dimension(),
                          pooling_mode_mean_tokens=True,
                          pooling_mode_cls_token=False,
                          pooling_mode_max_tokens=False,
                          layer_to_use=args.layer)
    model = SentenceEncoder(modules=[embedder, pooler])
    model.eval()

    evaluator = EmbeddingSimilarityEvaluator(
        main_similarity=SimilarityFunction.COSINE)

    # Optionally convert Traditional Chinese to Simplified before encoding
    if args.t2s:
        df["text_1"] = df["text_1"].apply(convert_t2s)
        df["text_2"] = df["text_2"].apply(convert_t2s)

    # Encode both columns in a single pass, then split back into pairs
    tmp = model.encode(df["text_1"].tolist() + df["text_2"].tolist(),
                       batch_size=16,
                       show_progress_bar=True)
    embeddings1, embeddings2 = tmp[:df.shape[0]], tmp[df.shape[0]:]

    spearman_score = evaluator(embeddings1,
                               embeddings2,
                               labels=df["similarity"].values)
    print(spearman_score)

    # Cosine similarity is one minus the paired cosine distance
    preds = 1 - paired_cosine_distances(embeddings1, embeddings2)
    df["pred"] = preds
    df.to_csv("cache/annotated_zero_shot_pred.csv", index=False)
    print(f"Pred {pd.Series(preds).describe()}")
    return preds, df["similarity"].values
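A minimal invocation sketch for Example #1, assuming the function above is in scope. The attribute names mirror exactly what main() reads (args.file, args.model_path, args.layer, args.t2s); the file and model paths are hypothetical.

from argparse import Namespace

args = Namespace(
    file="data/sts_dev.csv",  # hypothetical CSV with text_1, text_2, similarity columns
    model_path="models/bert-base-chinese",  # hypothetical checkpoint directory
    layer=-1,  # pool token embeddings from the last transformer layer
    t2s=False,  # skip Traditional-to-Simplified conversion
)
preds, labels = main(args)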
Example #2
def load_model(model_path: str, linear_transform):
    model_path_ = Path(model_path)
    # A serialized SentenceEncoder folder contains modules.json; otherwise
    # build the encoder from a raw BERT checkpoint.
    if (model_path_ / "modules.json").exists():
        encoder = SentenceEncoder(str(model_path))
        encoder[1].pooling_mode_mean_tokens = True
        encoder[1].pooling_mode_cls_token = False
        print(encoder[1].get_config_dict())
    else:
        embedder = BertWrapper(
            model_path,
            max_seq_length=256
        )
        pooler = PoolingLayer(
            embedder.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False,
            layer_to_use=-1
        )
        encoder = SentenceEncoder(modules=[
            embedder, pooler
        ])
    model = SentencePairCosineSimilarity(
        encoder, linear_transform=linear_transform
    )
    if linear_transform:
        # Seed the learnable rescaling of the cosine similarity
        model.scaler.data = torch.tensor([0.6]).to(model.encoder.device)
        model.shift.data = torch.tensor([0.3]).to(model.encoder.device)
    return model
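A hedged usage sketch for Example #2's load_model; the checkpoint path is hypothetical. With linear_transform=True, the returned model rescales the raw cosine similarity using the manually seeded scaler (0.6) and shift (0.3).

model = load_model("models/bert-base-chinese", linear_transform=True)
model.eval()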
Example #3
def main(args):
    encoder = SentenceEncoder(model_path=args.model_path)
    encoder.eval()
    if APEX and args.amp and (not args.torchscript):
        encoder = amp.initialize(encoder, opt_level=args.amp)
    if args.torchscript:
        if args.amp:
            # For TorchScript, cast to FP16 directly instead of using apex amp
            encoder[0].bert = encoder[0].bert.half()
        # Trace the BERT module with fixed-shape dummy inputs (batch 8, seq 256)
        traced_model = torch.jit.trace(
            encoder[0].bert,
            (torch.zeros(8, 256).long().cuda(),
             torch.zeros(8, 256).long().cuda(),
             torch.ones(8, 256).long().cuda())
        )
        encoder[0].bert = traced_model
        assert isinstance(encoder[0].bert, torch.jit.TopLevelTracedModule)
    encoder.max_seq_length = 256
    print(encoder[1].get_config_dict())
    encoder[1].pooling_mode_cls_token = False
    encoder[1].pooling_mode_mean_tokens = True
    print(encoder[1].get_config_dict())

    preds, _ = raw(args, encoder)

    print(f"Pred {pd.Series(preds).describe()}")
Example #4
def load_model(model_path):
    embedder = BertWrapper(model_path, max_seq_length=256, do_lower_case=False)
    pooler = PoolingLayer(embedder.get_word_embedding_dimension(),
                          pooling_mode_mean_tokens=True,
                          pooling_mode_cls_token=False,
                          pooling_mode_max_tokens=False,
                          layer_to_use=-1)
    encoder = SentenceEncoder(modules=[embedder, pooler])
    model = SentencePairNliClassification(encoder)
    return model
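A usage sketch for Example #4 (the path is hypothetical). do_lower_case=False preserves the checkpoint's cased vocabulary, and the returned SentencePairNliClassification model scores sentence pairs with NLI-style labels.

model = load_model("models/bert-base-chinese-nli")
model.eval()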
Example #5
def load_model(model_path, linear_transform):
    embedder = BertWrapper(model_path, max_seq_length=128)
    pooler = PoolingLayer(embedder.get_word_embedding_dimension(),
                          pooling_mode_mean_tokens=True,
                          pooling_mode_cls_token=False,
                          pooling_mode_max_tokens=False,
                          layer_to_use=-1)
    encoder = SentenceEncoder(modules=[embedder, pooler])
    model = SentencePairCosineSimilarity(encoder,
                                         linear_transform=linear_transform)
    return model
Example #6
def main(args):
    embedder = BertWrapper(args.model_path, max_seq_length=256)
    pooler = PoolingLayer(embedder.get_word_embedding_dimension(),
                          pooling_mode_mean_tokens=True,
                          pooling_mode_cls_token=False,
                          pooling_mode_max_tokens=False,
                          layer_to_use=args.layer)
    encoder = SentenceEncoder(modules=[embedder, pooler])
    model = SentencePairCosineSimilarity(encoder, linear_transform=False)
    model.eval()

    # print("\n".join([name for name, _ in model.named_parameters()]))

    dataset = LcqmcDataset(embedder.tokenizer, filename=args.filename)
    loader = DataLoader(
        dataset,
        # Sort examples by the longer text of each pair to minimize padding
        sampler=SortSampler(
            dataset,
            key=lambda x: max(len(dataset.text_1[x]), len(dataset.text_2[x]))),
        collate_fn=partial(collate_pairs,
                           pad=0,
                           opening_id=embedder.cls_token_id,
                           closing_id=embedder.sep_token_id,
                           truncate_length=embedder.max_seq_length),
        batch_size=16)
    preds, references = [], []
    with torch.no_grad():
        for features, labels in tqdm(loader):
            for name in features:
                features[name] = features[name].to(encoder.device)
            preds.append(model(features).cpu().numpy())
            references.append(labels.cpu().numpy())

    preds = np.concatenate(preds)
    references = np.concatenate(references)
    spearman_score = spearmanr(preds, references)
    print(f"Spearman: {spearman_score.correlation:.4f}")

    print(f"Pred Min: {np.min(preds)}, {np.max(preds)}")
    if args.threshold == -1:
        # Sweep thresholds on a 0.05 grid and keep the most accurate one
        best_thres, best_acc = -1, -1
        for threshold in np.arange(0.05, 1, 0.05):
            binarized = (preds > threshold).astype("int")
            acc = (binarized == references).sum() / len(references)
            if acc > best_acc:
                best_acc = acc
                best_thres = threshold
        print(f"Best acc: {best_acc:.4f} @ {best_thres:.2f}")
    else:
        binarized = (preds > args.threshold).astype("int")
        acc = (binarized == references).sum() / len(references)
        print(f"Acc: {acc:.4f} @ {args.threshold:.2f}")
Example #7
def main(model_name,
         mean_pooling: bool = True,
         max_pooling: bool = False,
         cls_token: bool = False,
         max_seq_length: int = 256,
         model_type: Optional[str] = None,
         output_folder: Optional[str] = None):
    if output_folder is None:
        output_folder = f"models/{model_name.split('/')[-1]}"
        print(f"Using the default output folder: {output_folder}")
    output_path = Path(output_folder)
    output_path.mkdir(parents=True)
    embedder = TransformerWrapper(model_name,
                                  max_seq_length=max_seq_length,
                                  model_type=model_type)
    pooler = PoolingLayer(embedder.get_word_embedding_dimension(),
                          pooling_mode_mean_tokens=mean_pooling,
                          pooling_mode_cls_token=cls_token,
                          pooling_mode_max_tokens=max_pooling,
                          layer_to_use=-1)
    encoder = SentenceEncoder(modules=[embedder, pooler])
    encoder.save(output_path.resolve())
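This converter can be invoked directly (the CLI wiring is an assumption; the model name is hypothetical). Note that output_path.mkdir(parents=True) raises FileExistsError when the folder already exists, so a previous export is never silently overwritten.

main("bert-base-chinese",
     mean_pooling=True,
     max_seq_length=256,
     output_folder="models/bert-base-chinese")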
Example #8
def load_model(model_path: str,
               dropout: float,
               n_classes: int = 4) -> SentenceClassification:
    embedder = BertWrapper(model_path, max_seq_length=256, do_lower_case=False)
    pooler = PoolingLayer(embedder.get_word_embedding_dimension(),
                          pooling_mode_mean_tokens=True,
                          pooling_mode_cls_token=False,
                          pooling_mode_max_tokens=False,
                          layer_to_use=-1)
    encoder = SentenceEncoder(modules=[embedder, pooler])
    model = SentenceClassification(encoder,
                                   n_classes=n_classes,
                                   dropout=dropout)
    return model
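A usage sketch for Example #8; the path and hyperparameters are illustrative. n_classes sets the width of the classification head placed on top of the mean-pooled sentence embedding.

model = load_model("models/bert-base-chinese", dropout=0.1, n_classes=4)
model.eval()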
Example #9
File: common.py Project: ceshine/oggdo
def load_encoder(model_path,
                 model_type,
                 max_length,
                 do_lower_case,
                 mean_pooling=True,
                 cls=False,
                 max_pooling=False,
                 expand_to_dimension: int = -1):
    embedder = TransformerWrapper(model_path,
                                  max_seq_length=max_length,
                                  do_lower_case=do_lower_case,
                                  model_type=model_type)
    pooler = PoolingLayer(embedder.get_word_embedding_dimension(),
                          pooling_mode_mean_tokens=mean_pooling,
                          pooling_mode_cls_token=cls,
                          pooling_mode_max_tokens=max_pooling,
                          layer_to_use=-1,
                          expand_to_dimension=expand_to_dimension)
    encoder = SentenceEncoder(modules=[embedder, pooler])
    return encoder
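A usage sketch for load_encoder, assuming the encode() API shown in Example #1; the checkpoint path and the model_type value are assumptions.

encoder = load_encoder("models/bert-base-chinese",
                       model_type="bert",
                       max_length=256,
                       do_lower_case=False)
embeddings = encoder.encode(["first sentence", "second sentence"],
                            batch_size=16, show_progress_bar=False)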
Example #10
def load_model(model_path, model_type, do_lower_case):
    embedder = TransformerWrapper(
        model_path,
        max_seq_length=256,
        do_lower_case=do_lower_case,
        model_type=model_type
    )
    pooler = PoolingLayer(
        embedder.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False,
        layer_to_use=-1
    )
    encoder = SentenceEncoder(modules=[
        embedder, pooler
    ])
    model = SentencePairCosineSimilarity(
        encoder, linear_transform=False
    )
    return model
Example #11
def main(model_path: str = typer.Argument("streamlit-model/")):
    global MODEL
    MODEL = SentenceEncoder(model_path, device="cpu").eval()
    print(f"Listening to port {PORT}")
    uvicorn.run(APP, host='0.0.0.0', port=PORT)
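Example #11 relies on the module-level PORT, APP, and MODEL globals (defined in Example #12) and is presumably dispatched with typer; the entry-point wiring below is an assumption, not shown in the source.

if __name__ == "__main__":
    typer.run(main)  # assumed entry point; typer parses model_path from argv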
Example #12
import os
from typing import List, Optional

import typer
import uvicorn
from opencc import OpenCC
from fastapi import FastAPI
from pydantic import BaseModel, Field, constr

from oggdo.encoder import SentenceEncoder

os.environ["TOKENIZERS_PARALLELISM"] = "false"

PORT = int(os.environ.get("PORT", "8666"))
APP = FastAPI()
T2S = OpenCC('t2s')
MODEL: Optional[SentenceEncoder] = None
# Optionally preload the encoder at import time from the MODEL env var
if os.environ.get("MODEL", None):
    MODEL = SentenceEncoder(os.environ["MODEL"], device="cpu").eval()

app = APP


class TextInput(BaseModel):
    text: str = Field(
        None,
        title="The piece of text you want to create embeddings for.",
        max_length=384)
    t2s: bool = False


class BatchTextInput(BaseModel):
    text_batch: List[str] = Field(
        [], title="Pieces of text you want to create embeddings for.")
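The snippet ends before any route definitions. Below is a plausible embedding endpoint, sketched under the assumption that MODEL exposes the encode() API used in Example #1; the route path and response shape are hypothetical.

@APP.post("/embed")
def embed(inp: TextInput):
    assert MODEL is not None, "set the MODEL env var or load a model first"
    # Optionally convert Traditional Chinese to Simplified, mirroring Example #1
    text = T2S.convert(inp.text) if inp.t2s else inp.text
    vector = MODEL.encode([text], batch_size=1, show_progress_bar=False)[0]
    return {"vector": vector.tolist()}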