def run_tag_protocol_xml():
    config_filename: str = aj("./tests/test_data/test_config.yml")
    cfg: Config = load_typed_config(config_filename)
    tagger: pyriksprot.ITagger = TaggerRegistry.get(
        tagger_cls=StanzaTagger,
        model=cfg.stanza_dir,
        dehyphen_opts=dict(word_frequency_filename=cfg.word_frequency.fullname, **cfg.dehyphen.opts),
        use_gpu=False,
    )
    input_filename: str = jj("tests", "test_data", "fake", "prot-1958-fake.xml")
    output_filename: str = jj("tests", "output", "prot-1958-fake.zip")
    pyriksprot.tag_protocol_xml(
        input_filename,
        output_filename,
        tagger,
        storage_format="json",
    )
    assert os.path.isfile(output_filename)

def test_snakemake_execute():
    config_filename = aj("./tests/test_data/test_config.yml")
    cfg: Config = load_typed_config(config_name=config_filename)
    snakefile = jj('workflow', 'Snakefile')
    rmtree(cfg.annotated_folder, ignore_errors=True)
    makedirs(cfg.annotated_folder, exist_ok=True)
    success = snakemake.snakemake(
        snakefile,
        config=dict(config_filename=config_filename),
        debug=True,
        # workdir=workdir,
        keep_target_files=True,
        cores=1,
        verbose=True,
    )
    assert success
    source_files: List[str] = glob.glob(
        jj(cfg.data_folder, 'riksdagen-corpus/corpus/**/prot*.xml'), recursive=True
    )
    for filename in source_files:
        document_name: str = strip_path_and_extension(filename)
        target_dir: str = jj(cfg.annotated_folder, document_name.split('-')[1])
        assert isfile(jj(target_dir, f"{document_name}.zip"))

def store(self, folder: str):
    """Stores the corpus used in training."""
    os.makedirs(folder, exist_ok=True)
    if isinstance(self.corpus, pc.VectorizedCorpus):
        corpus: pc.VectorizedCorpus = self.corpus
    elif isinstance(self.corpus, corpora.Sparse2Corpus):
        corpus: pc.VectorizedCorpus = pc.VectorizedCorpus(
            bag_term_matrix=self.corpus.sparse.tocsr().T,
            token2id=self.token2id,
            document_index=self.document_index,
        )
    else:
        raise NotImplementedError(f"type: {type(self.corpus)} save not implemented")

    assert len(self.document_index) == corpus.data.shape[0], 'bug check: corpus transpose needed?'

    utility.write_json(jj(folder, VECTORIZER_ARGS_FILENAME), data=self.vectorizer_args or {})
    utility.write_json(jj(folder, CORPUS_OPTIONS_FILENAME), data=self.corpus_options or {})

    corpus.dump(tag='train', folder=folder)

def to_sqlite(self, folder: str, data: InferredTopicsData) -> None:
    """Create an sqlite cache for data"""
    with sqlite3.connect(jj(folder, "inferred_topics_data.sqlite")) as db:
        data.dictionary.to_sql("dictionary", db, if_exists="replace")
        data.document_index.to_sql("document_index", db, if_exists="replace")
        data.document_topic_weights.to_sql("document_topic_weights", db, if_exists="replace")
        data.topic_token_overview.to_sql("topic_token_overview", db, if_exists="replace")
        data.topic_token_weights.to_sql("topic_token_weights", db, if_exists="replace")

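# A minimal read-back sketch for the cache created by to_sqlite above. The
# helper name `read_cached_table` is hypothetical (not part of the codebase);
# the file and table names match those written by to_sqlite.
def read_cached_table(folder: str, table: str = "document_topic_weights") -> pd.DataFrame:
    with sqlite3.connect(jj(folder, "inferred_topics_data.sqlite")) as db:
        return pd.read_sql_query(f"SELECT * FROM {table}", db)
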
def setup_parlaclarin_repository(
    test_protocols: List[str],
    root_path: str = DEFAULT_ROOT_PATH,
    repository_name: str = "riksdagen-corpus",
) -> List[str]:
    """Create a minimal ParlaClarin XML Git repository"""
    repository_folder: str = jj(root_path, repository_name)
    corpus_folder: str = jj(repository_folder, "corpus")
    source_filenames: List[str] = []

    rmtree(repository_folder, ignore_errors=True)
    init_repository(repository_folder, True)
    makedirs(corpus_folder, exist_ok=True)

    for filename in test_protocols:
        year_specifier = filename.split('-')[1]
        target_subfolder = jj(corpus_folder, year_specifier)
        makedirs(target_subfolder, exist_ok=True)
        url = f'{GITHUB_SOURCE_URL}/{year_specifier}/{filename}'
        download_url(url=url, target_folder=target_subfolder, filename=filename)
        source_filenames.append(jj(target_subfolder, filename))

    return source_filenames

def test_snakemake_word_frequency():
    test_protocols: List[str] = [
        'prot-1936--ak--8.xml',
        'prot-197778--160.xml',
    ]
    workdir = aj("./tests/output/work_folder")
    config_filename = aj("./tests/test_data/test_config_output.yml")

    rmtree(workdir, ignore_errors=True)
    makedirs(workdir, exist_ok=True)
    makedirs(jj(workdir, "logs"), exist_ok=True)

    setup_parlaclarin_repository(test_protocols, workdir, "riksdagen-corpus")
    setup_work_folder_for_tagging_with_stanza(workdir)

    snakefile = jj('workflow', 'Snakefile')
    snakemake.snakemake(
        snakefile,
        config=dict(config_filename=config_filename, processes=4),
        debug=True,
        # workdir=workdir,
        keep_target_files=True,
        cores=1,
        verbose=True,
        targets=['word_frequency'],
    )
    assert isfile(jj(workdir, "riksdagen-corpus-term-frequencies.pkl"))

def explode_pickle(folder: str) -> None:
    filename: str = jj(folder, 'train_document_index.csv.gz')
    if isfile(filename):
        # already exploded into individual metadata files
        return
    corpus: pc.VectorizedCorpus = pc.VectorizedCorpus.load(folder=folder, tag='train')
    corpus.store_metadata(tag='train', folder=folder, mode='files')

def ensure_models_folder(target_relative_folder: str):
    source_folder = jj(DEFAULT_DATA_FOLDER, target_relative_folder)
    target_folder = jj(TEST_DATA_FOLDER, target_relative_folder)
    if not isdir(target_folder):
        if isdir(source_folder):
            # os.symlink(src, dst): create a link at `target_folder` pointing to `source_folder`
            symlink(source_folder, target_folder)

def test_word_frequency_file_path():
    cfg: Config = load_typed_config("test_config.yml")
    cfg.data_folder = jj("tests", "output")
    result = jj(cfg.work_folders.data_folder, cfg.word_frequency.filename)
    expected_path: str = jj("tests", "output", "riksdagen-corpus-term-frequencies.pkl")
    assert result == expected_path
    assert cfg.word_frequency.fullname == expected_path

def load_model(folder: str) -> Any:
    """Load a topic model from pickled file."""
    if not InferredModel.exists(folder):
        raise FileNotFoundError(f"no model found in folder {folder}")
    for filename in ["topic_model.pickle.pbz2", "topic_model.pickle"]:
        if isfile(jj(folder, filename)):
            return utility.unpickle_from_file(jj(folder, filename))
    return None

def convert_topic_tokens(folder: str, source_filename: str = None) -> pd.DataFrame:
    mallet_folder: str = jj(folder, "mallet")
    target_filename = jj(folder, 'topic_token_weights.zip')
    if isfile(target_filename):
        return None

    source_filename = source_filename or probe_filenames(
        mallet_folder,
        ["topicwordweights.txt.gz", "topicwordweights.txt", "topicwordweights.zip"],
    )
    id2token: dict[int, str] = pd.read_json(jj(folder, "topic_model_id2token.json.gz"), typ="series")
    token2id: dict[str, int] = {v: k for k, v in id2token.items()}

    ttw: pd.DataFrame = pd.read_csv(
        source_filename,
        names=['topic_id', 'token', 'weight'],
        dtype={'topic_id': np.int16, 'weight': np.float64},
        header=None,
        sep='\t',
    )
    ttw = ttw[ttw.weight > ttw.weight.min()]  # pylint: disable=no-member
    ttw['token_id'] = ttw.token.apply(token2id.get)
    ttw.drop(columns='token', inplace=True)
    ttw['topic_id'] = ttw.topic_id.astype(np.int16)
    ttw['token_id'] = ttw.token_id.astype(np.int32)
    # ttw['weight'] = ttw.weight.astype(np.float32)
    ttw = ttw[['topic_id', 'token_id', 'weight']].reset_index(drop=True)
    # ttw.to_feather(jj(folder, "topic_token_weights.feather"))
    ttw.to_csv(
        target_filename,
        sep='\t',
        compression=dict(method='zip', archive_name="topic_token_weights.csv"),
        header=True,
    )
    return ttw

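# A minimal sketch of reading the converted weights back. Pandas infers zip
# compression from the extension and can read the archive directly since it
# holds a single CSV member; index_col=0 skips the index column written above.
ttw: pd.DataFrame = pd.read_csv(jj(folder, "topic_token_weights.zip"), sep='\t', index_col=0)
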
def load(
    *,
    folder: str,
    filename_fields: pu.FilenameFieldSpecs = None,
    slim: bool = False,
    verbose: bool = False,
):
    """Loads previously stored aggregate"""
    if not isfile(jj(folder, "topic_token_weights.zip")):
        """Materialize zip files from pickled data before loading"""
        PickleUtility.explode(source=folder, target_folder=folder)

    document_index: pd.DataFrame = (
        pd.read_feather(jj(folder, "documents.feather")).rename_axis('document_id')
        if isfile(jj(folder, "documents.feather"))
        else pc.load_document_index(
            jj(folder, 'documents.zip'), filename_fields=filename_fields, **CSV_OPTS
        ).set_index('document_id', drop=True)
    )
    data: InferredTopicsData = InferredTopicsData(
        dictionary=smart_load(jj(folder, 'dictionary.zip'), feather_pipe=pu.set_index, columns='token_id'),
        document_index=document_index,
        topic_token_weights=smart_load(jj(folder, 'topic_token_weights.zip')),
        document_topic_weights=smart_load(jj(folder, 'document_topic_weights.zip')),
        topic_token_overview=smart_load(jj(folder, 'topic_token_overview.zip'), feather_pipe=pu.set_index, columns='topic_id'),
        topic_diagnostics=smart_load(jj(folder, 'topic_diagnostics.zip'), missing_ok=True, feather_pipe=pu.set_index, columns='topic_id'),
        token_diagnostics=smart_load(jj(folder, 'token_diagnostics.zip'), missing_ok=True),
    )

    # HACK: Handle renamed column:
    data.document_index = fix_renamed_columns(data.document_index)
    assert "year" in data.document_index.columns

    data.topic_token_overview = data.load_topic_labels(folder, **CSV_OPTS)
    data.slim_types()
    if slim:
        data.slimmer()
    if verbose:
        data.log_usage(total=True)
    return data

def load(*, tag: str, folder: str) -> IVectorizedCorpus:
    """Loads corpus with tag `tag` in folder `folder`

    Raises `FileNotFoundError` if any of the two files containing metadata and matrix doesn't exist.

    Two files are loaded based on specified `tag`:

        {tag}_vectorizer_data.pickle    Contains metadata `token2id`, `document_index` and `overridden_term_frequency`
        {tag}_vector_data.[npz|npy]     Contains the document-term matrix (numpy or sparse format)

    Parameters
    ----------
    tag : str
        Corpus identifier (prefixed to filename)
    folder : str, optional
        Corpus folder to look in, by default './output'

    Returns
    -------
    VectorizedCorpus
        Loaded corpus
    """
    data: dict = load_metadata(tag=tag, folder=folder)

    token2id: Mapping = data.get("token2id")

    """Load TF override, convert if in older (dict) format"""
    overridden_term_frequency: np.ndarray = (
        data.get("term_frequency", None)
        or data.get("overridden_term_frequency", None)
        or data.get("term_frequency_mapping", None)
        or data.get("token_counter", None)
    )
    if isinstance(overridden_term_frequency, dict):
        fg = {v: k for k, v in token2id.items()}.get
        overridden_term_frequency = np.array([overridden_term_frequency[fg(i)] for i in range(0, len(token2id))])

    """Document-term-matrix"""
    if os.path.isfile(jj(folder, f"{tag}_vector_data.npz")):
        bag_term_matrix = scipy.sparse.load_npz(jj(folder, f"{tag}_vector_data.npz"))
    else:
        bag_term_matrix = np.load(jj(folder, f"{tag}_vector_data.npy"), allow_pickle=True).item()

    return create_corpus_instance(
        bag_term_matrix,
        token2id=token2id,
        document_index=data.get("document_index"),
        overridden_term_frequency=overridden_term_frequency,
    )

def test_expand_target_files():
    source_folder: str = jj("tests", "output", "corpus")
    target_folder: str = jj("tests", "output", "annotated")
    create_test_source_tree(source_folder, TEST_DUMMY_FILENAMES)
    target_files = expand_target_files(source_folder, "xml", target_folder, "zip")
    assert set(target_files) == {
        jj("tests", "output", "annotated", filename.split('-')[1], f"{filename}.zip")
        for filename in TEST_DUMMY_FILENAMES
    }

def convert_document_index(folder: str) -> None:
    target_filename: str = jj(folder, "documents.zip")
    source_filename: str = jj(folder, "train_document_index.csv.gz")
    if isfile(target_filename):
        return
    explode_pickle(folder)
    di: pd.DataFrame = (
        pd.read_csv(source_filename, sep=';', index_col=0).set_index('document_name', drop=False).rename_axis('')
    )
    di.to_csv(
        target_filename,
        sep='\t',
        compression=dict(method='zip', archive_name="document_index.csv"),
        header=True,
    )

def test_expand_call_arguments():
    target_folder = nj("/data/riksdagen_corpus_data/riksdagen-corpus-exports/speech_xml")
    source_folder = nj("/data/riksdagen_corpus_data/riksdagen-corpus/corpus/")
    extension = "xml"
    years, basenames = glob_wildcards(jj(source_folder, "{year}", f"{{file}}.{extension}"))
    filenames = expand(
        jj(target_folder, '{year}', f'{{basename}}.{extension}'),
        zip,
        year=years,
        basename=basenames,
    )
    assert len(filenames) == len(years)

def load_topic_labels(self, folder: str, **csv_opts: dict) -> pd.DataFrame:
    tto: pd.DataFrame = self.topic_token_overview
    if isfile(jj(folder, "topic_token_overview_label.csv")):
        labeled_tto: pd.DataFrame = pd.read_csv(jj(folder, 'topic_token_overview_label.csv'), **csv_opts)
        if self.is_satisfied_topic_token_overview(labeled_tto):
            # logger.info(f"labeled file loaded from: {folder}")
            tto = labeled_tto
    if 'label' not in tto.columns:
        tto['label'] = tto['document_id'].astype(str) if 'document_id' in tto.columns else tto.index.astype(str)
    return tto

def setup_work_folder_for_tagging_with_sparv(root_path: str):
    """Write a default Sparv config file (NOT USED)"""

    """Target folder for extracted speeches"""
    speech_folder: str = jj(root_path, "riksdagen-corpus-export", "speech-xml")

    """Create target folder for extracted speeches"""
    rmtree(speech_folder, ignore_errors=True)
    makedirs(speech_folder, exist_ok=True)

    """Target folder for PoS tagged speeches"""
    makedirs(speech_folder, exist_ok=True)

    makedirs(jj(root_path, "sparv"), exist_ok=True)
    shutil.copyfile("tests/test_data/sparv_config.yml", jj(root_path, "sparv", "config.yaml"))

def test_load_dumped_corpus(mode: str, vectorized_corpus: VectorizedCorpus):
    tag: str = f'{str(uuid.uuid1())[:6]}'
    folder: str = jj(OUTPUT_FOLDER, tag)
    os.makedirs(folder, exist_ok=True)

    vectorized_corpus.dump(tag=tag, folder=folder, compressed=True, mode=mode)

    assert VectorizedCorpus.dump_exists(tag=tag, folder=folder)
    assert VectorizedCorpus.find_tags(folder) == [tag]

    loaded_corpus: VectorizedCorpus = VectorizedCorpus.load(tag=tag, folder=folder)
    assert (vectorized_corpus.term_frequency == loaded_corpus.term_frequency).all()
    assert vectorized_corpus.document_index.to_dict() == loaded_corpus.document_index.to_dict()
    assert vectorized_corpus.token2id == loaded_corpus.token2id

    loaded_options: dict = VectorizedCorpus.load_options(tag=tag, folder=folder)
    assert loaded_options == dict()

    VectorizedCorpus.dump_options(tag=tag, folder=folder, options=dict(apa=1))
    loaded_options = VectorizedCorpus.load_options(tag=tag, folder=folder)
    assert loaded_options == dict(apa=1)

    VectorizedCorpus.remove(tag=tag, folder=folder)
    assert not VectorizedCorpus.dump_exists(tag=tag, folder=folder)
    assert not VectorizedCorpus.find_tags(folder)

    shutil.rmtree(folder)

def test_load_stored_metadata_simple(mode: str):
    tag: str = f'{uuid.uuid1()}'
    folder: str = jj(OUTPUT_FOLDER, tag)
    os.makedirs(folder, exist_ok=True)

    document_index_str: str = (
        ";filename;year;document_name;document_id\n"
        "a;a.txt;2019;a;0\n"
        "b;b.txt;2019;b;1\n"
        "c;c.txt;2019;c;2\n"
        "d;d.txt;2020;d;3\n"
        "e;e.txt;2020;e;4\n"
    )
    token2id: dict = dict(x=0, y=1, z=2)
    overridden_term_frequency = np.arange(3)
    metadata = {
        'document_index': load_document_index(filename=StringIO(document_index_str), sep=';'),
        'token2id': token2id,
        'overridden_term_frequency': overridden_term_frequency,
    }

    store_metadata(tag=tag, folder=folder, mode=mode, **metadata)
    metadata_loaded = load_metadata(tag=tag, folder=folder)

    assert metadata_loaded['document_index'].to_csv(sep=';') == document_index_str
    assert metadata_loaded['token2id'] == token2id
    assert (metadata_loaded['overridden_term_frequency'] == overridden_term_frequency).all()

    shutil.rmtree(folder)

def dump(
    self: IVectorizedCorpusProtocol,
    *,
    tag: str,
    folder: str,
    compressed: bool = True,
    mode: Literal['bundle', 'files'] = 'files',
) -> IVectorizedCorpus:
    """Store corpus to disk.

    The DTM is stored in `folder` with files prefixed with tag `tag`:

        {tag}_vectorizer_data.pickle    Bundle with `token2id`, `document_index` and `overridden_term_frequency`
        {tag}_document_index.csv.gz     Document index as compressed CSV (if mode is `files`)
        {tag}_token2id.json.gz          Vocabulary as compressed JSON (if mode is `files`)
        {tag}_term_frequency.npy        Term frequency to use, overrides TF sums in DTM (if mode is `files`)
        {tag}_vector_data.[npz|npy]     The document-term matrix (numpy or sparse format)

    Parameters
    ----------
    tag : str, optional
        String to be prepended to file name, set to timestamp if None
    folder : str, optional
        Target folder, by default './output'
    compressed : bool, optional
        Specifies if matrix is stored as .npz or .npy, by default .npz
    mode : str, optional, values 'bundle' or 'files'
        Specifies if metadata should be bundled in a pickle file or stored as individual compressed files.
    """
    tag = tag or time.strftime("%Y%m%d_%H%M%S")

    store_metadata(tag=tag, folder=folder, mode=mode, **self.metadata)

    if compressed:
        assert scipy.sparse.issparse(self.bag_term_matrix)
        scipy.sparse.save_npz(jj(folder, f"{tag}_vector_data"), self.bag_term_matrix, compressed=True)
    else:
        np.save(jj(folder, f"{tag}_vector_data.npy"), self.bag_term_matrix, allow_pickle=True)

    return self

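# A minimal dump/load round-trip sketch for the API above; the tag and folder
# values are illustrative, and `corpus` stands for any existing VectorizedCorpus.
corpus.dump(tag='train', folder='./output', compressed=True, mode='files')
loaded: VectorizedCorpus = VectorizedCorpus.load(tag='train', folder='./output')
assert (corpus.term_frequency == loaded.term_frequency).all()
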
def write_payload(folder: str, payload: DocumentPayload) -> DocumentPayload:
    filename: str = jj(folder, replace_extension(payload.filename, ".feather"))
    payload.content.reset_index(drop=True).to_feather(filename, compression="lz4")
    return payload

def load_token2id(folder: str) -> pc.Token2Id:
    dictionary: pd.DataFrame = smart_load(jj(folder, 'dictionary.zip'), feather_pipe=pu.set_index, columns='token_id')
    token2id: pc.Token2Id = pc.Token2Id(data={t: i for (t, i) in zip(dictionary.token, dictionary.index)})
    return token2id

def convert_overview(folder: str) -> None:
    target_filename: str = jj(folder, 'topic_token_overview.zip')
    if isfile(target_filename):
        return
    source_filename: str = probe_filenames(jj(folder, "mallet"), ["topickeys.txt.gz", "topickeys.txt", "topickeys.zip"])
    df: pd.DataFrame = pd.read_csv(source_filename, sep='\t', names=['topic_id', 'alpha', 'tokens']).set_index('topic_id')
    df.to_csv(
        target_filename,
        sep='\t',
        compression=dict(method='zip', archive_name="topic_token_overview.csv"),
        header=True,
    )

def convert_dictionary(folder: str) -> None:
    target_filename: str = jj(folder, "dictionary.zip")
    source_filename: str = jj(folder, "train_token2id.json.gz")
    if isfile(target_filename):
        return
    explode_pickle(folder)
    token2id: pd.Series = pd.read_json(source_filename, typ='series')
    dictionary: pd.DataFrame = pd.DataFrame(data=dict(token_id=token2id, token=token2id.index, dfs=0)).set_index('token_id')
    dictionary.to_csv(
        target_filename,
        sep='\t',
        compression=dict(method='zip', archive_name="dictionary.csv"),
        header=True,
    )

def write_document_index(folder: str, document_index: pd.DataFrame):
    if document_index is None:
        return
    _sanitize_document_index(document_index)
    filename = jj(folder, FEATHER_DOCUMENT_INDEX_NAME)
    document_index.reset_index(drop=True).to_feather(filename, compression="lz4")

def load_id2token(folder: str) -> Mapping[int, str]:
    """Loads vocabulary from file"""
    filename: str = jj(folder, InferredModel.ID2TOKEN_FILENAME)
    if not os.path.isfile(filename):
        """Backward compatibility: read dictionary.zip"""
        return InferredTopicsData.load_token2id(folder).id2token
    with gzip.open(filename, 'rb') as f:
        json_str = f.read().decode(encoding='utf-8')
    return {int(k): v for k, v in json.loads(json_str).items()}

def convert_document_topics(
    folder: str,
    source_filename: str = None,
    normalize: bool = True,
    epsilon: float = 0.005,
) -> pd.DataFrame:
    """Converts a 2.0.8+ MALLET doc-topics file into a data frame stored in FEATHER format."""
    mallet_folder: str = jj(folder, "mallet")
    target_filename: str = jj(folder, 'document_topic_weights.zip')
    if isfile(target_filename):
        return
    source_filename = source_filename or probe_filenames(
        mallet_folder,
        [
            "doctopics.txt.infer.gz",
            "doctopics.txt.infer.zip",
            "doctopics.txt.infer",
            "doctopics.txt.gz",
            "doctopics.txt.zip",
            "doctopics.zip",
            "doctopics.txt",
        ],
    )
    dtw: pd.DataFrame = doctopics_to_dataframe(source_filename, normalize, epsilon)
    dtw.to_feather(jj(mallet_folder, "doctopics.feather"))

    di: pd.DataFrame = pd.read_csv(jj(folder, "documents.zip"), sep='\t').set_index('document_id', drop=False)
    dtw = dtw.merge(di[['year']], left_on='document_id', right_index=True, how='left')
    dtw['year'] = dtw.year.astype(np.int16)
    dtw.to_csv(
        target_filename,
        sep='\t',
        compression=dict(method='zip', archive_name="document_topic_weights.csv"),
        header=True,
    )

def load(folder: str) -> TrainingCorpus:
    """Loads a training corpus from pickled file."""

    """Load from vectorized corpus if it exists"""
    if pc.VectorizedCorpus.dump_exists(tag='train', folder=folder):
        corpus: pc.VectorizedCorpus = pc.VectorizedCorpus.load(tag='train', folder=folder)
        return TrainingCorpus(
            corpus=corpus,
            document_index=corpus.document_index,
            token2id=pc.Token2Id(data=corpus.token2id),
            corpus_options=utility.read_json(jj(folder, CORPUS_OPTIONS_FILENAME), default={}),
            vectorizer_args=utility.read_json(jj(folder, VECTORIZER_ARGS_FILENAME), default={}),
        )
    return None

def store_options(self, folder: str):
    filename: str = jj(folder, InferredModel.OPTIONS_FILENAME)
    options: dict = {'method': self.method, **self.options}
    os.makedirs(folder, exist_ok=True)
    with open(filename, 'w') as fp:
        json.dump(options, fp, indent=4, default=lambda o: f"<<non-serializable: {type(o).__qualname__}>>")

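# A small illustration (hypothetical values) of the `default=` fallback used in
# store_options: objects json cannot serialize are rendered as a marker string
# instead of raising TypeError.
print(json.dumps({'method': 'gensim_lda', 'engine': object()},
                 default=lambda o: f"<<non-serializable: {type(o).__qualname__}>>"))
# -> {"method": "gensim_lda", "engine": "<<non-serializable: object>>"}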