Example #1
    def __init__(self, lang):
        """
		:param lang: language code of precomputed fasttext encodings, see
		https://fasttext.cc/docs/en/crawl-vectors.html
		"""

        import fasttext
        import fasttext.util

        super().__init__()
        self._lang = lang

        download_path = make_cache_path() / 'models'
        download_path.mkdir(exist_ok=True, parents=True)

        filename = "cc.%s.300.bin" % self._lang
        if not (download_path / filename).exists():
            os.chdir(download_path)

            with tqdm(desc="Downloading " + self.name,
                      total=10000,
                      bar_format='{desc:<30}{percentage:3.2f}%|{bar:40}'
                      ) as pbar:
                with contextlib.redirect_stdout(ProgressParser(pbar)):
                    filename = fasttext.util.download_model(self._lang,
                                                            if_exists='ignore')

        with tqdm(desc="Opening " + self.name,
                  total=1,
                  bar_format='{l_bar}{bar}') as pbar:
            ft = fasttext.load_model(str(download_path / filename))
            pbar.update(1)

        self._ft = ft
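
For reference, a minimal sketch of the plain fasttext calls this constructor wraps, assuming only that the fasttext package is installed; download_model writes cc.<lang>.300.bin into the current working directory, which is why the snippet above changes directory first.

import fasttext
import fasttext.util

# fetches cc.en.300.bin (plus the .gz archive) into the current directory
fasttext.util.download_model("en", if_exists="ignore")
ft = fasttext.load_model("cc.en.300.bin")
print(ft.get_dimension())  # 300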
Example #2
def extract_numberbatch(path, languages):
	# e.g. extract_numberbatch("/path/to/numberbatch-19.08.txt", ["en", "de"])
	# then use KeyedVectors.load()

	path = Path(path)
	languages = set(languages)

	pattern = re.compile(r"^/c/([a-z]+)/")

	with open(path, "r") as f:
		num_lines, num_dimensions = [int(x) for x in f.readline().split()]
		vectors = collections.defaultdict(lambda: {
			"keys": [],
			"vectors": []
		})

		for _ in tqdm(range(num_lines)):
			line = f.readline()
			m = pattern.match(line)
			if m:
				lang = m.group(1)
				if lang in languages:
					line = line[len(m.group(0)):]
					cols = line.split()
					key = cols[0]
					if key.isalpha():
						record = vectors[lang]
						record["keys"].append(key)
						record["vectors"].append(
							np.array([float(x) for x in cols[1:]]))

	for lang, record in vectors.items():
		wv = gensim.models.KeyedVectors(num_dimensions)
		wv.add_vectors(record["keys"], record["vectors"])
		wv.save(str(path.parent / f"{path.stem}-{lang}.kv"))
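
A hypothetical round trip with the helper above, assuming it is importable from the surrounding module; the numberbatch path is the same placeholder used in the comment.

from gensim.models import KeyedVectors

extract_numberbatch("/path/to/numberbatch-19.08.txt", ["en", "de"])

# each requested language gets its own .kv file next to the source file
wv_en = KeyedVectors.load("/path/to/numberbatch-19.08-en.kv")
print(wv_en.most_similar("cat", topn=3))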
Example #3
        def gen_spans():
            with tqdm(desc="Encoding", total=i_spans[-1],
                      disable=not pbar) as pbar_instance:

                for doc in docs:
                    spans = list(doc.spans(partition))
                    yield doc, spans
                    pbar_instance.update(len(spans))
Example #4
	def create_encoder(self, session):
		normalizers = session.normalizers
		normalizer = normalizers['text'].to_callable()
		key = json.dumps({
			'emb': self.unique_name,
			'nrm': normalizer.ident,
			'sampling': self._embedding_sampling,
			'tfm': [t.name for t in self._transforms]
		})

		loaded = self._loaded.get(key)
		if loaded is None:
			name = self.unique_name

			dat_path = self._cache.get(key)

			#dat_path = normalized_cache_path / f"{name}-{normalizer.name}-{self._embedding_sampling}.dat"

			if dat_path and dat_path.exists():
				with tqdm(desc="Opening " + self.name, total=1,  bar_format='{l_bar}{bar}',
						  disable=not CachedWordEmbedding.pbar_on_open) as pbar:
					with open(dat_path.with_suffix('.json'), 'r') as f:
						data = json.loads(f.read())
					tokens = data['tokens']
					vectors_mmap = np.memmap(
						dat_path, dtype=np.float32, mode='r', shape=tuple(data['shape']))
					pbar.update(1)
			else:
				tokens, vectors = self._load()
				tokens, vectors = normalize_word2vec(
					self.name, tokens, vectors, normalizer.unpack(), self._embedding_sampling)

				for t in self._transforms:
					vectors = t.apply(Vectors(vectors)).unmodified

				dat_path = self._cache.create_new_data_path()

				vectors_mmap = np.memmap(
					dat_path, dtype=np.float32, mode='w+', shape=vectors.shape)
				vectors_mmap[:, :] = vectors[:, :]
				vectors = None

				with open(dat_path.with_suffix('.json'), 'w') as f:
					f.write(json.dumps({
						'tokens': tokens,
						'shape': tuple(vectors_mmap.shape)
					}))

				self._cache.put(key, dat_path)

			loaded = CachedWordEmbedding.Encoder(
				self, name, tokens, vectors_mmap)
			self._loaded[key] = loaded

		return loaded
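
The caching branch above boils down to a write-then-reopen np.memmap with a JSON sidecar holding the shape; a minimal standalone sketch of that pattern (file names are illustrative):

import json
import numpy as np

vectors = np.random.rand(1000, 300).astype(np.float32)

# write pass: copy the vectors into a disk-backed array
mm = np.memmap("vectors.dat", dtype=np.float32, mode="w+", shape=vectors.shape)
mm[:, :] = vectors
mm.flush()

# the sidecar stores the shape needed to reopen the raw file later
with open("vectors.json", "w") as f:
    json.dump({"shape": list(vectors.shape)}, f)

# read pass: map the same data read-only without loading it into RAM
with open("vectors.json", "r") as f:
    shape = tuple(json.load(f)["shape"])
vectors_ro = np.memmap("vectors.dat", dtype=np.float32, mode="r", shape=shape)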
Example #5
    def make(root, normalization, docs):
        builder = FlavorBuilder(root, normalization.normalizers)

        with tqdm(total=2 * len(docs),
                  desc=f"Adding Normalization '{normalization.name}'") as pbar:
            for stage in (FlavorBuilder.Stage.PREFLIGHT,
                          FlavorBuilder.Stage.ADD):
                builder.set_stage(stage)
                for unique_id, doc in docs.items():
                    builder.add(unique_id, doc)
                    pbar.update(1)
Example #6
def normalize_word2vec(name, tokens, embeddings, normalizer, sampling='nearest'):
	if sampling not in ('nearest', 'average'):
		raise ValueError(f'Expected "nearest" or "average", got "{sampling}"')

	embeddings = embeddings.astype(np.float32)

	f_mask = np.zeros((embeddings.shape[0],), dtype=bool)
	f_tokens = []
	token_to_ids = dict()

	for i, t in enumerate(tqdm(tokens, desc=f"Normalizing tokens in {name}")):
		nt = normalizer(t)
		if nt is None:
			continue
		if sampling != 'average' and nt != t:
			continue
		indices = token_to_ids.get(nt)
		if indices is None:
			token_to_ids[nt] = [i]
			f_tokens.append(nt)
			f_mask[i] = True
		else:
			indices.append(i)

	if sampling == 'average':
		for indices in tqdm(token_to_ids.values(), desc=f"Merging tokens in {name}", total=len(token_to_ids)):
			if len(indices) > 1:
				i = indices[0]
				embeddings[i] = np.mean(embeddings[indices], axis=0)

	f_embeddings = embeddings[f_mask]
	embeddings = None

	assert f_embeddings.shape[0] == len(f_tokens)

	return f_tokens, f_embeddings
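
A hypothetical call, assuming normalize_word2vec from above is in scope; the normalizer can be any callable that maps a raw token to its normal form, or returns None to drop it.

import numpy as np

tokens = ["Cat", "cat", "Dog", "42"]
embeddings = np.random.rand(len(tokens), 8)

def lower_or_none(t):
    # illustrative normalizer: lowercase alphabetic tokens, drop everything else
    return t.lower() if t.isalpha() else None

f_tokens, f_embeddings = normalize_word2vec(
    "toy", tokens, embeddings, lower_or_none, sampling="average")
# "Cat"/"cat" collapse into a single averaged row and "42" is dropped:
# f_tokens == ["cat", "dog"], f_embeddings.shape == (2, 8)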
Example #7
    def __init__(self, session, vocab, corpus):
        self._vocab = vocab
        self._docs = []

        with corpus.flavor_cache(session.normalization.name) as flavor_cache:

            with tqdm(desc="Preparing Documents", total=len(corpus)) as pbar:

                def prepare_doc(doc):
                    pbar.update(1)
                    return doc.prepare(corpus, flavor_cache, session)

                with concurrent.futures.ThreadPoolExecutor(
                        max_workers=2) as executor:
                    self._docs = list(executor.map(prepare_doc, corpus.docs))
Example #8
def download(url, path, force_download=False):
	path = Path(path)
	download_path = path / urllib.parse.urlparse(url).path.split("/")[-1]
	is_zip = download_path.suffix == ".zip"

	if is_zip:
		result_path = path / download_path.stem
	else:
		result_path = download_path

	if result_path.exists() and not force_download:
		return result_path

	with tqdm(desc="Downloading " + url, unit='iB', unit_scale=True) as pbar:
		response = requests.get(url, stream=True)

		total_length = int(response.headers.get('content-length', 0))
		pbar.reset(total=total_length)

		try:
			with open(download_path, "wb") as f:
				for data in response.iter_content(chunk_size=4096):
					pbar.update(len(data))
					f.write(data)
		except:
			download_path.unlink(missing_ok=True)
			raise

	if download_path != result_path:
		extracted = []
		with zipfile.ZipFile(download_path, 'r') as zf:
			for zi in zf.infolist():
				if zi.filename[-1] == '/':
					continue
				zi.filename = os.path.basename(zi.filename)
				p = zf.extract(zi, result_path.parent)
				extracted.append(Path(p))

		if len(extracted) == 1:
			extracted[0].rename(result_path)

		download_path.unlink()

	return result_path if result_path.exists() else None
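
Hypothetical usage of the helper above, assuming it is in scope; the URL and cache directory are placeholders. A .zip download is unpacked and the extracted path returned, while any other file is returned as downloaded.

result = download("https://example.com/data/embeddings.zip", "/tmp/cache")
if result is not None:
    print("saved to", result)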
Example #9
    def save(self, path):
        path = Path(path)
        path.mkdir(exist_ok=True)

        offset = 0
        session = self._partition.session
        corpus_vec = self.corpus_vec.unmodified

        for doc in tqdm(session.documents, desc="Saving"):
            size = doc.n_spans(self._partition)
            np.save(str(path / (doc.caching_name + ".npy")),
                    corpus_vec[offset:offset + size],
                    allow_pickle=False)
            offset += size

        with open(path / "index.json", "w") as f:
            f.write(
                json.dumps({
                    'class': self.__class__.__name__,
                    'partition': self._partition.to_args()
                }))
Example #10
def load_glove_txt(csv_path):
	tokens = []
	with open(csv_path, "r") as f:
		text = f.read()

	lines = text.split("\n")
	n_rows = len(lines)
	n_cols = len(lines[0].strip().split()) - 1

	embeddings = np.empty(
		shape=(n_rows, n_cols), dtype=np.float32)

	for line in tqdm(lines, desc="Importing " + str(csv_path)):
		values = line.strip().split()
		if values:
			t = values[0]
			if t:
				embeddings[len(tokens), :] = values[1:]
				tokens.append(t)

	embeddings = embeddings[:len(tokens), :]

	return tokens, embeddings
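
A hypothetical call, assuming a GloVe text file (for example glove.6B.50d.txt from the Stanford GloVe downloads) is available locally:

tokens, embeddings = load_glove_txt("glove.6B.50d.txt")
print(len(tokens), embeddings.shape)  # e.g. 400000 (400000, 50)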
Example #11
def extraction_tqdm(tokens, name):
	return tqdm(tokens, desc=f"Extracting {name}", disable=len(tokens) < 5000)
Example #12
    def __init__(self, path, mutable=False):
        path = Path(path)
        if not path.exists():
            path.mkdir()
        elif not path.is_dir():
            raise ValueError(f"expected directory path, got '{path}'")
        self._path = path

        self._documents_path = path / "documents"
        self._flavors_path = path / "flavors"
        self._flavors_path.mkdir(exist_ok=True)

        if not (path / "corpus.h5").exists():
            mutable = True

        self._corpus_h5 = h5py.File(path / "corpus.h5",
                                    "a" if mutable else "r")
        self._documents_group = self._corpus_h5.require_group("documents")
        #self._flavors_group = self._corpus_h5.require_group("flavors")

        self._corpus_sql = sqlite3.connect(path / "corpus.db")
        self._mutable = mutable

        with self._corpus_sql:
            self._corpus_sql.execute('''
				CREATE TABLE IF NOT EXISTS text(
					unique_id TEXT PRIMARY KEY,
					content TEXT,
					content_hash TEXT)''')
            self._corpus_sql.execute('''
				CREATE INDEX IF NOT EXISTS text_hash ON text(content_hash);
			''')

        data = threading.local()

        def create_catalog():
            return EmbeddingCatalog(path / "embeddings.db")

        def load_doc(unique_id):
            if 'catalog' in data.__dict__:
                embedding_catalog = data.catalog
            else:
                embedding_catalog = create_catalog()
                data.catalog = embedding_catalog

            p = self._documents_path / unique_id
            doc = Document.load_from_corpus(unique_id, p, self._corpus_sql,
                                            self._documents_group[unique_id],
                                            embedding_catalog)

            return unique_id, doc

        unique_ids = list(self._documents_group.keys())
        self._docs = {}

        self._doc_to_unique_id = {}
        self._unique_id_to_index = {}
        self._ordered_docs = []
        self._catalog = create_catalog()

        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
            for unique_id, doc in tqdm(executor.map(load_doc, unique_ids),
                                       total=len(unique_ids),
                                       desc="Opening Corpus",
                                       disable=len(unique_ids) < 1):

                self._add_doc(unique_id, doc)
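
The constructor above uses a thread-local slot so that each ThreadPoolExecutor worker lazily builds and then reuses its own EmbeddingCatalog; a minimal standalone sketch of that pattern:

import concurrent.futures
import threading

data = threading.local()

def get_resource():
    # stand-in for EmbeddingCatalog(...); created once per worker thread
    if not hasattr(data, "resource"):
        data.resource = object()
    return data.resource

with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    ids = list(executor.map(lambda _: id(get_resource()), range(8)))

print(set(ids))  # at most two distinct ids, one per worker thread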
Example #13
    def cache_contextual_embeddings(self):
        for doc in tqdm(self.documents, desc="Loading Vectors"):
            doc.cache_contextual_embeddings()
Example #14
    def _make_doc(self,
                  md,
                  partitions,
                  loc_ax,
                  locations,
                  show_progress=True,
                  extra_metadata=None):

        pipe = self._nlp.pipe(partitions,
                              batch_size=self._batch_size,
                              disable=['ner',
                                       'lemmatizer'])  # check nlp.pipe_names

        contextual_vectors = dict((e.name, []) for e in self._embeddings)
        texts = []
        tokens = []

        sents = {'start': [], 'end': [], 'loc': []}

        text_len = 0

        encoders = [e.create_encoder(None) for e in self._embeddings]

        for location, doc in tqdm(zip(locations, pipe),
                                  total=len(locations),
                                  desc=f'Importing {md.origin}',
                                  disable=not show_progress):

            if len(doc) < 1:
                continue

            doc_json = doc.to_json()

            partition_tokens = doc_json['tokens']
            for token in partition_tokens:
                token['start'] += text_len
                token['end'] += text_len
            tokens.extend(partition_tokens)

            for sent in doc_json['sents']:
                sents['start'].append(text_len + sent['start'])
                sents['end'].append(text_len + sent['end'])
                sents['loc'].append(location)

            texts.append(doc_json['text'])
            text_len += len(doc_json['text'])

            for e in encoders:
                v = e.encode([doc])
                if len(v.shape) != 3:
                    raise ValueError(
                        f'expected shape (a, b, c), got {v.shape}')
                if v.shape[0] != 1:
                    raise ValueError(f"expected one vector, got {v.shape[0]}")
                v = v[0]
                if v.shape[0] != len(doc):
                    raise ValueError(
                        f'expected ({len(doc)}, ...), got {v.shape}')
                if v.shape[1] == 0:
                    raise ValueError(f'doc "{doc}" got illegal encoding: {v}')
                contextual_vectors[e.name].append(v)

        if not tokens:
            return None

        spans = {
            'sentence': compile_spans(sents, tokens, loc_ax),
            'document': compile_doc_spans(tokens)
        }

        if extra_metadata is None:
            extra_metadata = {}

        extended_metadata = dict(**md._asdict(),
                                 **extra_metadata,
                                 loc_ax=loc_ax)

        from vectorian.corpus.document import Document, InternalMemoryDocumentStorage

        emb_by_name = dict((e.name, e) for e in self._embeddings)

        def transformed(k, v):
            v = Vectors(np.vstack(v))

            embedding = emb_by_name[k]
            if embedding.transform:
                v = embedding.transform.apply(v)

            return ProxyVectorsRef(v)

        contextual_embeddings = dict(
            (k, transformed(k, v)) for k, v in contextual_vectors.items())

        return Document(
            InternalMemoryDocumentStorage(extended_metadata, ''.join(texts),
                                          make_tokens_dict(tokens), spans),
            contextual_embeddings)
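
A minimal sketch of the offset bookkeeping done in the import loop above, using a blank spaCy pipeline (the real code runs a full model with 'ner' and 'lemmatizer' disabled):

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

text_len = 0
tokens = []
for doc in nlp.pipe(["First chunk. ", "Second chunk."]):
    doc_json = doc.to_json()
    for token in doc_json["tokens"]:
        # shift per-chunk character offsets into one global coordinate system
        token["start"] += text_len
        token["end"] += text_len
    tokens.extend(doc_json["tokens"])
    text_len += len(doc_json["text"])

print(tokens[-1])  # last token, with offsets relative to the concatenated text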