Example no. 1
from ungol.common import logger

import attr
import nltk
from tqdm import tqdm as _tqdm

import re
import os
import sys
import enum
import functools
import multiprocessing as mp

from typing import Tuple
from typing import Generator


log = logger.get('index.setup')
tqdm = functools.partial(_tqdm, ncols=80)


# ---

sys.path.append('lib/CharSplit')
import char_split  # noqa

# ---


#  multiprocessing infrastructure:
#
#  Topology:
#                          rq                        wq
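
The topology diagram is cut off in this excerpt. Below is a minimal sketch of a read-queue/write-queue worker pool of the kind the comment suggests; the function names, the poison-pill protocol, and the placeholder transformation are assumptions, not taken from the source:

def _worker(rq: mp.Queue, wq: mp.Queue):
    # hypothetical worker: consume items from rq, push results to wq
    while True:
        item = rq.get()
        if item is None:  # poison pill: shut down
            break
        wq.put(item.upper())  # stand-in for the real transformation

def run_pool(items: Tuple[str, ...], n_workers: int = 4):
    rq, wq = mp.Queue(), mp.Queue()
    procs = [mp.Process(target=_worker, args=(rq, wq))
             for _ in range(n_workers)]

    for p in procs:
        p.start()
    for item in items:
        rq.put(item)
    for _ in procs:
        rq.put(None)  # one pill per worker

    results = [wq.get() for _ in items]
    for p in procs:
        p.join()

    return results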
Example no. 2
from ungol.common import logger
from ungol.common import util as ucu

from tqdm import tqdm as _tqdm
from nltk.tokenize import sent_tokenize as tok_sent
from nltk.tokenize import word_tokenize as tok_wrd

import pickle
import pathlib
import argparse
import functools
import multiprocessing as mp
from collections import defaultdict

dumpr = ucu.load_module('dumpr.common')


log = logger.get('sentemb.prep')
tqdm = functools.partial(_tqdm, ncols=80)


#
#  use dumpr to produce sentence files
#


class Writer:

    def __init__(self, f_out: str):
        self._f_out = f_out

    def __enter__(self):
        self._fd_out = open(self._f_out, mode='w')
        return self
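
The excerpt cuts off inside __enter__; with the added return self, usage would follow the standard context-manager pattern. A hypothetical call site (the full class presumably defines __exit__ and a proper write API, neither of which is shown):

# hypothetical usage of the truncated Writer; writes one sentence per line
with Writer('sentences.txt') as writer:
    for sent in tok_sent('First sentence. Second sentence.'):
        writer._fd_out.write(sent + '\n')  # stand-in for the real write API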
Example no. 3
# -*- coding: utf-8 -*-

from ungol.common import logger
from ungol.index import index as uii
from ungol.similarity import measures as usm

import numpy as np
from tabulate import tabulate

import enum
from typing import Tuple

log = logger.get('similarity.rhwmd')


class Strategy(enum.Enum):

    # selecting max(score(d1, d2), score(d2, d1))
    MAX = enum.auto()

    # selecting min(score(d1, d2), score(d2, d1))
    MIN = enum.auto()

    # only use score(ds, dl), where ds = argmin(|d1|, |d2|)
    # and dl = argmax(|d1|, |d2|)
    ADAPTIVE_SMALL = enum.auto()

    # only use score(dl, ds), where ds = argmin(|d1|, |d2|)
    # and dl = argmax(|d1|, |d2|)
    ADAPTIVE_BIG = enum.auto()
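
A minimal sketch of how these strategies could be applied; apply_strategy is a hypothetical helper (score is any asymmetric document scorer, and |d| is taken as len(d)), written only to illustrate the selection rules commented above:

def apply_strategy(strategy: Strategy, d1, d2, score) -> float:
    if strategy is Strategy.MAX:
        return max(score(d1, d2), score(d2, d1))
    if strategy is Strategy.MIN:
        return min(score(d1, d2), score(d2, d1))

    # adaptive variants: order the documents by length first
    ds, dl = sorted((d1, d2), key=len)
    if strategy is Strategy.ADAPTIVE_SMALL:
        return score(ds, dl)
    if strategy is Strategy.ADAPTIVE_BIG:
        return score(dl, ds)

    raise ValueError(f'unknown strategy: {strategy}')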
Example no. 4
from ungol.common import logger

import attr
import torch
from tqdm import tqdm as _tqdm

import sys
import pickle
import pathlib
import argparse
import functools
import multiprocessing as mp

from typing import List
from typing import Dict
from typing import Union
from typing import Tuple
from typing import Generator

# ---

log = logger.get('models.analyze')
tqdm = functools.partial(_tqdm, ncols=80, disable=False)

# ---

DEV_CPU = torch.device('cpu')
DEV_GPU = torch.device('cuda')

RETAINED = 2000  # the k in k-NN
BUF_SIZE = 4000  # file chunk size: BUF_SIZE * RETAINED * 4 bytes = 32 MB per chunk

# ---


@attr.s(frozen=True)
class Neighbour:
Example no. 5
from ungol.common import logger
from ungol.models import embcompr as ume  # assumed path for the Compressor module

import attr
import h5py
import torch
import numpy as np
from tqdm import tqdm as _tqdm

import math
import pathlib
import argparse
import functools

from typing import Any
from typing import Dict
from typing import Tuple

log = logger.get('models.embcodr')
tqdm = functools.partial(_tqdm, ncols=80, disable=False)

# --- external interface


def create_codes(compr: ume.Compressor, batch: torch.Tensor,
                 components: int) -> np.ndarray:

    onehot = compr.encoder(batch)

    codemap = onehot.nonzero()[:, 2].to(dtype=torch.uint8)
    codes = codemap.view(-1, components).numpy()

    assert codes.shape[1] == components
    return codes
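
A toy demonstration of the one-hot-to-code conversion performed above, with a hand-built tensor standing in for the encoder output:

# shape (batch=2, components=3, codebook=4); one-hot along the last axis
onehot = torch.zeros(2, 3, 4)
onehot[0, 0, 1] = onehot[0, 1, 3] = onehot[0, 2, 0] = 1
onehot[1, 0, 2] = onehot[1, 1, 2] = onehot[1, 2, 3] = 1

# nonzero() lists indices in row-major order, so column 2 holds the
# codebook index of each component, in batch order
codemap = onehot.nonzero()[:, 2].to(dtype=torch.uint8)
codes = codemap.view(-1, 3).numpy()
assert codes.tolist() == [[1, 3, 0], [2, 2, 3]]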
Example no. 6
import argparse

import attr
import h5py
import torch
import configobj
import numpy as np

from typing import Dict
from typing import Tuple
from typing import Generator

from ungol.common import logger
from ungol.common import util as ucu

log = logger.get('common.embed')


class Embed:
    """
    The embedding provider interface to be implemented.

    """

    CHUNK_SIZE = 8192

    @property
    def name(self) -> str:
        raise NotImplementedError()

    @property
    def dimensions(self) -> int:
        # the excerpt is cut off after the decorator; a dimensions property
        # mirroring the Redux interface is an assumed continuation
        raise NotImplementedError()
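
A hedged sketch of a provider implementing this interface. Only name and CHUNK_SIZE come from the excerpt above; the h5py backing file, its "embedding" dataset, and the chunks method are assumptions:

class H5Embed(Embed):

    def __init__(self, f_h5: str):
        # hypothetical: embeddings stored as a single h5py dataset
        self._fd = h5py.File(f_h5, mode='r')

    @property
    def name(self) -> str:
        return 'h5'

    def chunks(self) -> Generator[np.ndarray, None, None]:
        # hypothetical iteration in CHUNK_SIZE-row slices
        ds = self._fd['embedding']
        for pos in range(0, len(ds), Embed.CHUNK_SIZE):
            yield ds[pos:pos + Embed.CHUNK_SIZE]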
Example no. 7
from ungol.common import logger

import attr
import torch

import os
import math
import enum
import pathlib
import multiprocessing as mp

from typing import Any
from typing import List
from typing import Tuple
from typing import Union
from typing import Callable

# ---

log = logger.get('models.stats')

DEV_CPU = torch.device('cpu')
DEV_GPU = torch.device('cuda')

# ---


class Kind(enum.Enum):
    train = enum.auto()
    valid = enum.auto()
    flush = enum.auto()


@attr.s(frozen=True)
class Update:
Example no. 8
from ungol.common import logger
from ungol.retrieval import common  # assumed: the module defining Topic

from bs4 import BeautifulSoup as bs
from tqdm import tqdm as _tqdm

import sys
import functools
import multiprocessing as mp

from typing import Dict
from typing import Tuple
from typing import Generator
from typing import Collection

# ---

sys.path.append('lib/CharSplit')
import char_split  # noqa

# ---

log = logger.get('retrieval.setup')
tqdm = functools.partial(_tqdm, ncols=80)

# ---


def _parse_topics(
        files: Collection[str]) -> Generator[common.Topic, None, None]:

    for xml in files:
        with open(xml, mode='r', encoding='utf-8') as f:
            topics_raw = f.read()

        soup = bs(topics_raw, 'xml')

        print('')
Example no. 9
from ungol.common import logger

import requests
import numpy as np
from tabulate import tabulate
from tqdm import tqdm as _tqdm

import time
import json
from pprint import pformat
from collections import defaultdict

from typing import Callable
from typing import Collection

# ---

log = logger.get('retrieval.experiment')


def tqdm(*args, **kwargs):
    yield from _tqdm(*args, ncols=80, **kwargs)


# --- experiments


class Task(dict):
    """
    Maps    doc_id -> bool
    Named:    task -> flag

    Currently depending on elasticsearch - could also
Example no. 10
from ungol.common import logger
from ungol.similarity import rhwmd as usr  # assumed: the module defining Strategy
from ungol.similarity import scores as uss  # assumed path for the scoring module

import random
import pickle
import pathlib
import argparse
import functools
import collections

from tabulate import tabulate

from typing import List
from typing import Tuple
from typing import Union
from typing import Generator

log = logger.get('retrieval.evaluate')
Stat = collections.namedtuple('Stat', ('name', 'f_dataset', 'f_name', 'stat'))

#  ---

UNGOL_STRATS = {
    'min': usr.Strategy.MIN,
    'max': usr.Strategy.MAX,
    'adaptive-small': usr.Strategy.ADAPTIVE_SMALL,
    'adaptive-big': usr.Strategy.ADAPTIVE_BIG,
    'sum': usr.Strategy.SUM,
}

UNGOL_SCORERS = {
    'rhwmd': uss.rhwmd,
    'bm25': uss.bm25,
Example no. 11
"""

A collection of different similarity and distance measure implementations.
Sometimes batched or gpu accelerated variants exist.

"""

from ungol.common import logger

import numpy as np


log = logger.get('similarity.measures')


# def m_cosine(train_data, test_data, tqdm=lambda x: x, max_k=100):
#     dists, train, test = None, None, None

#     try:
#         train = torch.from_numpy(train_data).to(device=DEV)
#         test  = torch.from_numpy(test_data).to(device=DEV)

#         train /= train.norm(dim=1).unsqueeze(1)
#         test /= test.norm(dim=1).unsqueeze(1)

#         dists = torch.stack([
#             (1-train.matmul(t).squeeze())
#             for t in tqdm(test)])

#         topkek = dists.topk(k=max_k, largest=False, dim=1)
#         sortdists, sortindices = map(lambda t: t.cpu().numpy(), topkek)
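
A small working numpy counterpart to the torch-based draft above; a sketch of a batched cosine distance measure, not taken from the source:

def m_cosine_np(train: np.ndarray, test: np.ndarray) -> np.ndarray:
    # rows are vectors; returns the (n_test x n_train) cosine distance matrix
    train = train / np.linalg.norm(train, axis=1, keepdims=True)
    test = test / np.linalg.norm(test, axis=1, keepdims=True)
    return 1 - test @ train.T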
Example no. 12
from ungol.common import logger

import torch
import numpy as np
from tqdm import tqdm as _tqdm

import math
import pathlib
import functools

from typing import Any
from typing import Dict
from typing import List
from typing import Generator


# ---

log = logger.get('models.training')
tqdm = functools.partial(_tqdm, ncols=80, disable=False)

# ---

DEV_CPU = torch.device('cpu')
DEV_GPU = torch.device('cuda')

# ---


#
#   TRAINING INFRASTRUCTURE
#

Example no. 13
from ungol.common import logger
from ungol.common import util as ucu

import numpy as np

from typing import Any
from typing import Set
from typing import Dict
from typing import Tuple
from typing import Collection


# conditional imports

skdecomp = ucu.load_module('sklearn.decomposition')
sent2vec = ucu.load_module('sent2vec')
infersent = ucu.load_module('infersent.models')

log = logger.get('sentemb.redux')


class Redux:

    @property
    def name(self) -> str:
        raise NotImplementedError()

    @property
    def dimensions(self) -> int:
        """
        Output dimensionality.
        """
        raise NotImplementedError
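
A hedged sketch of a concrete reducer on top of the conditionally imported sklearn.decomposition; the class and its fit method are assumptions illustrating the interface, not code from the source:

class PCARedux(Redux):

    def __init__(self, dimensions: int):
        assert skdecomp is not None, 'sklearn is not available'
        self._pca = skdecomp.PCA(n_components=dimensions)
        self._dimensions = dimensions

    @property
    def name(self) -> str:
        return 'pca'

    @property
    def dimensions(self) -> int:
        return self._dimensions

    def fit(self, embeds: np.ndarray) -> np.ndarray:
        # hypothetical fit/transform protocol
        return self._pca.fit_transform(embeds)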
Example no. 14
from ungol.common import logger

import torch
import numpy as np

import os
import json
import queue
import pathlib
import functools
import collections
import multiprocessing as mp
from datetime import datetime

from typing import Tuple

# ---

log = logger.get('models.models')

# ---

DEV_CPU = torch.device('cpu')
DEV_GPU = torch.device('cuda')

log.warn('enabling cudnn benchmark')
torch.backends.cudnn.benchmark = True

# ---


class ModelException(Exception):
    """
Example no. 15
from ungol.common import logger

import attr

import pickle
import random
import multiprocessing as mp
from collections import defaultdict

import elasticsearch as es
from wmd import WMD as WMDR
from gensim.models.fasttext import FastText

from typing import Any
from typing import List
from typing import Dict
from typing import Tuple
from typing import Callable

log = logger.get('retrieval.clients')

# ---


@attr.s
class Result:

    doc_id: str = attr.ib()
    score: float = attr.ib()


class Client:
    @property
    def time(self) -> float:
        return sum(self._times) / len(self._times)
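
The time property averages per-request durations collected in self._times. A hypothetical sketch of how such timings could be recorded (the decorator and the _times initialisation are assumptions, not shown in the excerpt):

import time
import functools

def _timed(fn):
    # append the elapsed seconds of each call to self._times
    @functools.wraps(fn)
    def wrapped(self, *args, **kwargs):
        tstart = time.time()
        result = fn(self, *args, **kwargs)
        self._times.append(time.time() - tstart)
        return result
    return wrapped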
Example no. 16
from ungol.common import logger

from tqdm import tqdm as _tqdm

import os
import pickle
import pathlib
import functools
import itertools
import multiprocessing as mp

log = logger.get('sentemb.common')
tqdm = functools.partial(_tqdm, ncols=80)

SENT_MIN_LEN = 2
SENT_MAX_LEN = 40

F_ARRS = 'sentences.arrs'
F_TOKS = 'tokens.txt'
F_VOCAB = 'vocab.pickle'
F_COUNTS = 'counts.pickle'

# ---


def get_vocabs(f_in: str, prefix: str = None):
    log.info(f'loading vocabulary files from "{f_in}" (prefix={prefix})')

    if prefix is None:
        prefix = ''
    else:
Example no. 17
from ungol.common import logger
from ungol.sentemb import common as usc
from ungol.sentemb import redux as usr

import attr
import h5py
from tqdm import tqdm as _tqdm

import pickle
import pathlib
import argparse
import functools

from typing import Set
from typing import Dict

log = logger.get('sentemb.training')
tqdm = functools.partial(_tqdm, ncols=80)

# FIXME: make an option
BATCH_SIZE = 2048


def btqdm(iterable, *args, step: int = 1, **kwargs):
    # advance the bar by <step> units per yielded item (e.g. one batch of
    # <step> sentences); iterating the bar itself would additionally count
    # +1 per item and overstate progress
    bar = tqdm(*args, **kwargs)
    for x in iterable:
        yield x
        bar.update(step)
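
A toy usage of btqdm, advancing the bar by one batch worth of sentences per yielded batch:

# hypothetical call site: ten batches of BATCH_SIZE sentences each
batches = [list(range(BATCH_SIZE)) for _ in range(10)]
for batch in btqdm(batches, total=10 * BATCH_SIZE, step=BATCH_SIZE):
    pass  # stand-in for real batch processing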


@attr.s
class Stats:
Example no. 18
#   Reference Code: https://github.com/zomux/neuralcompressor
#

from ungol.common import logger
from ungol.common import embed as uce
from ungol.models import models as umm
from ungol.models import training as umt

import re
import argparse
import functools

import torch
from tqdm import tqdm as _tqdm

log = logger.get('models.embcompr')
tqdm = functools.partial(_tqdm, ncols=80, disable=False)

# ---

DEV_CPU = torch.device('cpu')
DEV_GPU = torch.device('cuda')

# ---


def _print_examples(compressor, training):
    vocab = list(training.ember.vocab.keys())[1000:1010]
    dist = torch.nn.PairwiseDistance()

    for word in vocab: