def _create_dissim_mat(arr, dissim_measure, force_singlethread=False, n_chunks=200, silent=False, metainf=None, continue_from=None):
    # return squareform(np.apply_along_axis(cos_to_normangdiff, 0, pdist(arr, metric="cosine")))
    # assert np.allclose(np.hstack([cdist(arr, arr[i*10:(i+1)*10], "cosine") for i in range(10)]), squareform(tmp))
    if dissim_measure in ["cosine", "norm_ang_dist"]:
        dist_func = "cosine"
    else:
        dist_func = dissim_measure
    metainf = {}
    tmp = []
    RAM_PER_CORE = 15  # TODO dependent on the dataset
    if not force_singlethread and get_ncpu(ram_per_core=RAM_PER_CORE) > 1:  # max. 1 thread per RAM_PER_CORE GB of RAM
        # with WorkerPool(get_ncpu(ram_per_core=10), arr, pgbar="Creating dissimilarity matrix" if not silent else None) as pool:
        #     tmp = pool.work(list(np.array_split(arr, n_chunks)), lambda arr, chunk: cdist(arr, chunk, dist_func))
        with Interruptible(np.array_split(arr, n_chunks), [tmp], metainf, shutdown_time=210, continue_from=continue_from, contains_mp=True) as iter:
            with WorkerPool(get_ncpu(ram_per_core=RAM_PER_CORE), arr, pgbar="Creating dissimilarity matrix" if not silent else None, comqu=iter.comqu) as pool:
                tmpres, interrupted = pool.work(iter, lambda arr, chunk: cdist(arr, chunk, dist_func))
            tmp = iter.notify([tmpres], exception=interrupted)[0]
        if iter.interrupted:
            return tmp, metainf
    else:
        print("Running interruptible with one process")
        with Interruptible(np.array_split(arr, n_chunks), tmp, metainf, continue_from=continue_from, pgbar=None if silent else "Creating dissimilarity matrix") as iter:
            for chunk in iter:  # np.array_split(arr, n_chunks) if silent else tqdm(np.array_split(arr, n_chunks), desc="Creating dissimilarity matrix")
                tmp.append(cdist(arr, chunk, dist_func))
        if iter.interrupted:
            return tmp, metainf
    assert np.allclose(np.hstack(tmp), np.hstack(tmp).T), "The matrix must be symmetric!"
    res = np.hstack(tmp)
    if dissim_measure == "norm_ang_dist":
        flat = squareform(np.hstack(tmp), checks=False)  # dunno why this one fails even though np.hstack(tmp) == np.hstack(tmp).T
        res = squareform(np.apply_along_axis(cos_to_normangdiff, 0, flat))
    assert np.allclose(np.diagonal(res), 0, atol=1e-10) or np.allclose(np.diagonal(res), 1, atol=1e-10), "Diagonal must be 1 or 0!"
    assert np.allclose(res, res.T), "The matrix must be symmetric!"
    return res, metainf
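# Illustrative sketch (not part of the original pipeline): how a cosine-distance matrix can be
# turned into normalized angular distances. The arccos(1 - d) / pi conversion is an assumption
# about what `cos_to_normangdiff` computes; the data and this helper's name are made up.
def _demo_norm_ang_dissim():
    import numpy as np
    from scipy.spatial.distance import pdist, squareform
    pts = np.random.default_rng(0).random((6, 4))                  # 6 items, 4 dimensions
    cos_dists = pdist(pts, metric="cosine")                        # condensed vector, d = 1 - cos(theta)
    norm_ang = np.arccos(np.clip(1 - cos_dists, -1, 1)) / np.pi    # normalized angular difference in [0, 1]
    return squareform(norm_ang)                                    # square, symmetric, zero diagonal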
def pmi(doc_term_matrix, positive=False, verbose=False, descriptions=None):  # PMI as defined by DESC15
    logger.info("Calculating PMIs...")
    arr = doc_term_matrix.as_csr()
    total_words = arr.sum()
    arr = arr / total_words  # now arr is p_{et}
    words_per_doc = arr.sum(axis=0)  # p_{e*}
    ges_occurs_per_term = arr.sum(axis=1)  # p_{*t}
    prod = ges_occurs_per_term * words_per_doc
    # I'd like to scipy.sparse.csr.csr_matrix(...) here, but that conversion kills my RAM completely..
    res = arr / prod
    res[np.isnan(res)] = 0
    del arr
    del prod
    gc.collect()
    res = np.log1p(res)  # DESC15 say it's just the log, but if we take the log all the values 0<val<1 are negative and [i for i in res[doc_term_matrix.reverse_term_dict["building"]].tolist()[0] if i > 0] becomes a much smaller number
    if positive:
        res[res < 0] = 0.0
    assert not np.isnan(res).any(), "There are NaNs in the PPMI!"
    quantifications = csr_to_list(res.T)
    del res
    gc.collect()
    if verbose:
        print("The counting that'll come now will take long and is only there because you're verbose")
        print_quantification(doc_term_matrix, quantifications, descriptions)
    return quantifications
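# Illustrative sketch (not part of the original pipeline): (P)PMI on a tiny dense term-by-document
# count matrix, mirroring the computation above but with plain numpy and the standard log instead
# of log1p. The 3x4 counts and this helper's name are made up.
def _demo_ppmi():
    import numpy as np
    counts = np.array([[2., 0., 1., 0.],     # rows: terms, columns: documents
                       [0., 3., 0., 1.],
                       [1., 1., 1., 1.]])
    p_et = counts / counts.sum()             # joint probability p(e, t)
    p_e = p_et.sum(axis=0, keepdims=True)    # marginal per document
    p_t = p_et.sum(axis=1, keepdims=True)    # marginal per term
    with np.errstate(divide="ignore", invalid="ignore"):
        pmi_mat = np.log(p_et / (p_t * p_e))
    pmi_mat[~np.isfinite(pmi_mat)] = 0       # zero counts -> -inf, set to 0
    return np.maximum(pmi_mat, 0)            # positive PMI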
def create_isomap(dissim_mat, embed_dimensions, neighbor_factor=2, **kwargs):
    # https://scikit-learn.org/stable/modules/manifold.html#multidimensional-scaling says Isomap is better suited than MDS, but DESC15 say they compared it and it's worse ([15] of [DESC15])!
    n_neighbors = min(max(5, dissim_mat.shape[0] // neighbor_factor), dissim_mat.shape[0] - 1)
    print(f"Running Isomap with {get_ncpu(ignore_debug=True)} jobs for max {n_neighbors} neighbors.")
    embedding = Isomap(n_jobs=get_ncpu(ignore_debug=True), n_neighbors=n_neighbors, n_components=embed_dimensions, metric="precomputed", **kwargs)
    isomap = embedding.fit(dissim_mat)
    return isomap
def run_lsi(pp_descriptions, filtered_dcm, verbose):
    """as in [VISR12: 4.2.1]"""
    if verbose:
        filtered_dcm.show_info(descriptions=pp_descriptions)
    if get_setting("DCM_QUANT_MEASURE") != "binary":
        logger.warn("VISR12 say it works best with binary!")
    orig_len = len(filtered_dcm.dtm)
    filtered_dcm.add_pseudo_keyworddocs()
    # https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html
    svd = TruncatedSVD(n_components=100, random_state=get_setting("RANDOM_SEED"))
    transformed = svd.fit_transform(filtered_dcm.as_csr().T)
    desc_psdoc_dists = cdist(transformed[:orig_len], transformed[orig_len:], "cosine")
    already_keywords = [[ind, j[0]] for ind, elem in enumerate(filtered_dcm.dtm[:orig_len]) for j in elem]  # we don't gain information from those that are close but already keywords
    desc_psdoc_dists[list(zip(*already_keywords))] = np.inf
    WHICH_LOWEST = 30
    nth_lowest = np.partition(desc_psdoc_dists.min(axis=1), WHICH_LOWEST)[WHICH_LOWEST]  # https://stackoverflow.com/a/43171216/5122790
    good_fits = np.where(desc_psdoc_dists.min(axis=1) < nth_lowest)[0]
    for ndesc, keyword in zip(good_fits, np.argmin(desc_psdoc_dists[good_fits], axis=1)):
        assert not filtered_dcm.all_terms[keyword] in pp_descriptions._descriptions[ndesc]
        print(f"*b*{filtered_dcm.all_terms[keyword]}*b*", pp_descriptions._descriptions[ndesc])
        print()
def show_data_info(ctx):
    from derive_conceptualspace.cli.args_from_filename import LAST_RESULT
    ctx.obj[LAST_RESULT] = ctx.obj["json_persister"].load(None, LAST_RESULT)  # TODO: make the LAST_RESULT ONE THING used also in Snakefile and args_from_filename
    show_data_info_base(ctx)
    print()
def get_file_config(base_dir, filepath, dirname_vars):
    if not isfile(filepath) and isfile(join(base_dir, filepath)):
        filepath = join(base_dir, filepath)
    try:
        with open(filepath) as rfile:
            used_conf = next(ijson.items(rfile, "used_influentials"))  # next(ijson.items(rfile, "used_config"))[0]
            rfile.seek(0)
            used_files = next(ijson.items(rfile, "loaded_files"))
    except Exception as e:
        print(f"Error for {filepath}")
        raise e
    used_conf = {k: v for k, v in used_conf.items() if k not in set(settings.MAY_DIFFER_IN_DEPENDENCIES) - set(standardize_config_name(i) for i in dirname_vars)}
    all_used_conf = dict(ChainMap(used_conf, *(i["metadata"].get("used_influentials", {}) for i in used_files.values())))
    return {k: float(v) if isinstance(v, decimal.Decimal) else v for k, v in all_used_conf.items()}
def get_ncpu(ram_per_core=None, ignore_debug=False):
    import psutil
    if not ignore_debug and get_setting("DEBUG"):
        return 1
    ncpus = get_setting("N_CPUS")
    if os.getenv("NSLOTS"):
        if not os.getenv(f"{ENV_PREFIX}shutups_nslots"):
            print("This machine has been given NSLOTS and it is", os.getenv("NSLOTS"))
            os.environ[f"{ENV_PREFIX}shutups_nslots"] = "1"
        ncpus = max(int(os.environ["NSLOTS"]) - 1, 1)  # "To ensure that your job is scheduled on a host you are advised not to request more than $NCPU - 1 parallel environments."
    if "GOTO_NUM_THREADS" in os.environ:  # see https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#threads
        print(f"Snakemake restricts the #Threads to {os.environ['GOTO_NUM_THREADS']}")
        ncpus = min(ncpus, int(os.environ["GOTO_NUM_THREADS"]))
    if ram_per_core:  # TODO if I'm on the grid, I should have an env-var with the assigned RAM and use that instead!
        ncpus = min(ncpus, round(psutil.virtual_memory().total / 1024 / 1024 / 1024 / ram_per_core))
        if "SGE_SMK_mem" in os.environ and os.environ["SGE_SMK_mem"].endswith("G"):
            ncpus = min(ncpus, round(int(os.environ["SGE_SMK_mem"][:-1]) / ram_per_core))  # max. 1 thread per X GB RAM
    return ncpus
def init_context(self, load_envfile=False, load_conffile=True, ignore_envs=False):  # works for both a click-Context and my custom one
    if not self._initialized:
        # first of all, load settings from env-vars and, if you have it by then, from the config-file
        if not ignore_envs:
            if load_envfile and os.environ.get(ENV_PREFIX+"_"+"ENV_FILE"):
                load_dotenv(os.environ.get(ENV_PREFIX+"_"+"ENV_FILE"))
            relevant_envvars = {k[len(ENV_PREFIX)+1:]: v for k, v in os.environ.items() if k.startswith(ENV_PREFIX+"_")}
            for param, val in relevant_envvars.items():
                if param.startswith("CONF_FORCE_"):  # that's how snakemake enforces the config-file; in that situation the conf-file has higher prio than the env-var
                    self.set_config(param[len("CONF_FORCE_"):], val, "smk_wildcard")
                else:
                    self.set_config(param, val, "env_vars")
        if self.get_config("conf_file"):
            if load_conffile:
                self.read_configfile()
            else:
                print("The env-vars contain the path to a config-file, but it intentionally isn't loaded!")
        self.obj["dataset_class"] = dataset_specifics.load_dataset_class(self.get_config("dataset"))
        if hasattr(self.obj["dataset_class"], "configs"):
            for param, val in self.obj["dataset_class"].configs.items():
                self.set_config(param, val, "dataset_class")
        if hasattr(self.obj["dataset_class"], "init"):
            self.obj["dataset_class"].init(self)
        CustomIO.init(self)
        self.obj["json_persister"] = JsonPersister(self, settings.DIR_STRUCT)
        self.set_debug()
        if self.has_config("base_dir", include_default=False):
            os.chdir(self.get_config("base_dir"))
        self._init_time = datetime.now()
        self._initialized = True
def extract_candidateterms_keybert_preprocessed(descriptions, max_ngram, faster_keybert=False, verbose=False, **kwargs):
    from keybert import KeyBERT  # lazily loaded as it needs tensorflow/torch which takes some time to init
    model_name = "paraphrase-MiniLM-L6-v2" if faster_keybert else "paraphrase-mpnet-base-v2"
    print(f"Using model {model_name}")
    candidateterms = []
    kw_model = KeyBERT(model_name)
    descs = descriptions._descriptions if not get_setting("DEBUG") else descriptions._descriptions[:get_setting("DEBUG_N_ITEMS")]
    for desc in tqdm(descs, desc="Running KeyBERT on descriptions"):
        stopwords = get_stopwords(desc.lang)
        candidates = set()
        for nwords in range(1, max_ngram):
            n_candidates = kw_model.extract_keywords(desc.processed_as_string(), keyphrase_ngram_range=(1, nwords), stop_words=stopwords)
            candidates |= set(i[0] for i in n_candidates)
        candidates = list(candidates)
        if (ct := extract_coursetype(desc)) and ct not in candidates:
            candidates += [ct]
        candidateterms.append(candidates)
    return candidateterms, {"model_name": model_name}  # callers expect (candidateterms, metainf); the metainf payload here is an assumption
def cli(ctx):
    """
    You can call this pipeline in many ways: with correct env-vars already set, with a provided `--env-file`,
    with a provided `--conf-file`, with command-line args (at the appropriate sub-command), or with default values.
    If multiple values are given for a setting, the precedence order is:
    command-line-args > env-vars (--env-file > pre-existing) > conf-file > dataset_class > defaults
    """
    print("Starting up at", datetime.now().strftime("%d.%m.%Y, %H:%M:%S"))
    setup_logging(ctx.get_config("log"), ctx.get_config("logfile"))
    ctx.init_context()  # after this point, no new env-vars should be set anymore (they are not considered)
def _filter_step2(dtm, used_terms_set, verbose=False, descriptions=None):
    all_terms_new = dict(enumerate([v for k, v in dtm.all_terms.items() if k in used_terms_set]))
    all_terms_new_rev = {v: k for k, v in all_terms_new.items()}
    dtm_translator = {k: all_terms_new_rev[v] for k, v in dtm.all_terms.items() if k in used_terms_set}
    doc_term_matrix = [[[dtm_translator.get(ind), num] for ind, num in doc if ind in used_terms_set] for doc in dtm.dtm]
    if descriptions:
        if get_setting("DO_SANITYCHECKS"):
            expected_bows = {ndoc: {all_terms_new[elem]: count for elem, count in doc} for ndoc, doc in enumerate(doc_term_matrix[:10])}
            assert all(all(v == descriptions._descriptions[i].bow()[k] for k, v in expected_bows[i].items() if not " " in k) for i in range(10))
            assert all(all(v == descriptions._descriptions[i].count_phrase(k) for k, v in expected_bows[i].items() if not " " in k) for i in range(10))
            assert all(all_terms_new[ind] in descriptions._descriptions[ndoc]
                       for ndoc, doc in enumerate(tqdm(doc_term_matrix, desc="Cross-checking filtered DCM with Descriptions [sanity-check]"))
                       for ind, count in doc)
        if verbose:
            shown = []
            for n_keyphrases in [0, 1, 20]:
                items = [[descriptions._descriptions[i], [all_terms_new[j[0]] for j in e]] for i, e in enumerate(doc_term_matrix) if len(e) <= n_keyphrases]
                if items:
                    print(f"Documents with max {n_keyphrases} keyphrases ({len(items)}):\n "
                          + "\n ".join(f"{i[0]}: {', '.join(i[1])}" for i in [j for j in items if j[0] not in shown][:5]))
                    shown += [i[0] for i in items]
    return DocTermMatrix(dtm=doc_term_matrix, all_terms=all_terms_new, quant_name="count", verbose=verbose)
def create_doc_cand_matrix(postprocessed_candidates, descriptions, verbose=False):
    postprocessed_candidates, changeds_dict = postprocessed_candidates.values()
    if get_setting("DEBUG"):
        maxlen = min(len(postprocessed_candidates), len(descriptions), get_setting("DEBUG_N_ITEMS"))
        postprocessed_candidates = postprocessed_candidates[:maxlen]
        descriptions._descriptions = descriptions._descriptions[:maxlen]
    else:
        assert len(postprocessed_candidates) == len(descriptions)
    assert all(cand in desc for ndesc, desc in enumerate(descriptions._descriptions) for cand in postprocessed_candidates[ndesc])
    all_phrases = list(set(flatten(postprocessed_candidates)))
    if get_setting("DEBUG"):
        all_phrases = all_phrases[:get_setting("DEBUG_N_ITEMS")]
    # if I used gensim for this, it would be `dictionary,doc_term_matrix = corpora.Dictionary(descriptions), [dictionary.doc2bow(doc) for doc in descriptions]`
    # dictionary = corpora.Dictionary([all_phrases])
    dtm = [sorted([(nphrase, desc.count_phrase(phrase)) for nphrase, phrase in enumerate(all_phrases) if phrase in desc], key=lambda x: x[0])
           for ndesc, desc in enumerate(tqdm(descriptions._descriptions, desc="Creating Doc-Cand-Matrix"))]  # TODO instead of this I could use sklearn again
    if get_setting("DO_SANITYCHECKS"):
        assert all([n for n, i in enumerate(descriptions._descriptions) if term in i]
                   == [n for n, i in enumerate(dtm) if all_phrases.index(term) in [j[0] for j in i]]
                   for term in random.sample(all_phrases, 5))
    doc_term_matrix = DocTermMatrix(dtm=dtm, all_terms=all_phrases, verbose=verbose, quant_name="count")
    if get_setting("DO_SANITYCHECKS"):
        assert all(len([i for i in descriptions._descriptions if term in i]) == len([i for i in doc_term_matrix.term_quants(term) if i > 0])
                   for term in random.sample(all_phrases, 5))  # TODO why do I even need to filter this uhm err
    if verbose and get_setting("EXTRACTION_METHOD") != "all":
        print("The 25 terms that are most often detected as candidate terms (incl. their #detections):",
              ", ".join(f"{k} ({v})" for k, v in sorted(dict(Counter(flatten(postprocessed_candidates))).items(), key=lambda x: x[1], reverse=True)[:25]))
    return doc_term_matrix
def get_defaultsetting(key, silent=False, default_false=False):
    if "DEFAULT_" + key not in globals():
        if not default_false:
            raise ValueError(f"You didn't provide a value for {key} and there is no default-value!")
        else:
            return False
    default = globals()["DEFAULT_" + key]
    if key not in NON_INFLUENTIAL_CONFIGS and not silent:
        if not os.getenv(f"{ENV_PREFIX}shutups_{key}"):
            print(f"returning {key} from default: *b*{default}*b*")
            os.environ[f"{ENV_PREFIX}shutups_{key}"] = "1"
    return default
def plot_perweightingalgo(averaged, detailed, lambda1, do_print=True):
    # TODO what's missing:
    #  * maybe also plot the standard deviation, maybe make a scatterplot for all of the param-combis instead of a simple barplot (or overlay both!)
    # TODO[i]: HAVE to add the number of samples here, and SHOULD add info on which other configs (including which dataset!) are given
    # TODO[i]: the names of the actual kappa-funcs still sound pretty bad
    title = f"Number of candidate-directions with κ ≥ {lambda1} per weighting-algorithm,\n averaged over {round(len(detailed)/len(averaged))} parameter-combinations each (TODOsInHere!)"
    averaged = pd.DataFrame(averaged)
    if do_print:
        print("\n**" + title.replace("\n", "") + ":**\n\n", averaged)
    ax = averaged.plot(kind="bar", logy=True)
    ax.set_xticklabels([i._text.replace("kappa_", "").replace("_", " ") for i in ax.get_xticklabels()], ha="right", rotation=45)
    plt.tight_layout()
    plt.subplots_adjust(top=0.88)
    plt.title(title)
    plt.show()
def create_candidate_svms(dcm, embedding, descriptions, verbose, continue_from=None):
    # TODO I am still not sure whether I am calculating with vectors somewhere where I should be working with points
    if hasattr(embedding, "embedding_"):
        embedding = embedding.embedding_
    decision_planes = {}
    metrics = {}
    terms = list(dcm.all_terms.values())
    metainf = {}
    if get_setting("DEBUG"):
        maxlen = min(len(terms), len(embedding), get_setting("DEBUG_N_ITEMS"), len(dcm.dtm))
        working_inds = [nterm for nterm, term in enumerate(terms[:maxlen]) if np.array(dcm.term_quants(term)[:maxlen], dtype=bool).std()]  # those with >1 class
        term_inds = unique(flatten([j[0] for j in dcm.dtm[i]] for i in working_inds))
        terms = [dcm.all_terms[i] for i in term_inds]
        embedding = embedding[working_inds]
        ind_translator = {v: k for k, v in enumerate(term_inds)}
        dcm = DocTermMatrix([[[ind_translator[j[0]], j[1]] for j in dcm.dtm[i]] for i in working_inds], {ind_translator[i]: dcm.all_terms[i] for i in term_inds}, dcm.quant_name)
        print(f"Debug-Mode: Running for {len(working_inds)} Items and {len(terms)} Terms.")
        # warnings.warn("PRECOMMIT there's stuff here!")
        # assert all(i in terms for i in ['nature', 'ceiling', 'engine', 'athlete', 'seafood', 'shadows', 'skyscrapers', 'b737', 'monument', 'baby', 'sign', 'marine', 'iowa', 'field', 'buy', 'military', 'lounge', 'factory', 'road', 'education', '13thcentury', 'people', 'wait', 'travel', 'tunnel', 'treno', 'wings', 'hot', 'background', 'vintage', 'farmhouse', 'technology', 'building', 'horror', 'realestate', 'crane', 'slipway', 'ruin', 'national', 'morze'])
        # terms = ['nature', 'ceiling', 'engine', 'athlete', 'seafood', 'shadows', 'skyscrapers', 'b737', 'monument', 'baby', 'sign', 'marine', 'iowa', 'field', 'buy', 'military', 'lounge', 'factory', 'road', 'education', '13thcentury', 'people', 'wait', 'travel', 'tunnel', 'treno', 'wings', 'hot', 'background', 'vintage', 'farmhouse', 'technology', 'building', 'horror', 'realestate', 'crane', 'slipway', 'ruin', 'national', 'morze']
        # assert len([i for i in descriptions._descriptions if 'nature' in i]) == len([i for i in dcm.term_quants('nature') if i > 0])
        # print(f"Running only for the terms {terms}")
    else:
        assert all(len([i for i in descriptions._descriptions if term in i]) == len([i for i in dcm.term_quants(term) if i > 0]) for term in random.sample(terms, 5))
    if get_setting("DO_SANITYCHECKS"):
        assert all(dcm.term_quants(terms[i]) == list(dcm.as_csr()[i, :].toarray().squeeze()) for i in random.sample(range(len(terms)), 5))
    quants_s = dcm.as_csr().toarray().tolist()  # [dcm.term_quants(term) for term in tqdm(terms, desc="Counting Terms")]
    ncpu = get_ncpu(ram_per_core=10)  # TODO: make ram_per_core dependent on dataset-size
    if ncpu == 1:  # TODO Interruptible: for ncpu==1, I'm adding direct key-value-pairs, in the ncpu>1 version I'm appending to a list -> they are incompatible!
        with Interruptible(zip(terms, quants_s), ([], decision_planes, metrics), metainf, continue_from=continue_from, pgbar="Creating Candidate SVMs [1 proc]", total=len(terms), name="SVMs") as iter:
            for term, quants in iter:  # in tqdm(zip(terms, quants_s), desc="Creating Candidate SVMs", total=len(terms))
                cand_mets, decision_plane, term = create_candidate_svm(embedding, term, quants, classifier=get_setting("CLASSIFIER"), descriptions=descriptions, quant_name=dcm.quant_name)
                metrics[term] = cand_mets
                decision_planes[term] = decision_plane
    else:
        print(f"Starting Multiprocessed with {ncpu} CPUs")
        with Interruptible(zip(terms, quants_s), [None, [], None], metainf, continue_from=continue_from, contains_mp=True, name="SVMs", total=len(quants_s)) as iter:
            with tqdm(total=iter.n_elems, desc=f"Creating Candidate SVMs [{ncpu} procs]") as pgbar, ThreadPool(ncpu, comqu=iter.comqu) as p:
                res, interrupted = p.starmap(create_candidate_svm, zip(repeat(embedding, iter.n_elems), repeat("next_0"), repeat("next_1"), repeat(get_setting("CLASSIFIER")), repeat(False), repeat(None), repeat(dcm.quant_name), repeat(pgbar)), draw_from=iter.iterable)
            _, res, _ = iter.notify([None, res, None], exception=interrupted)
        if interrupted is not False:
            return quants_s, res, None, metainf
        for cand_mets, decision_plane, term in res:
            metrics[term] = cand_mets
            decision_planes[term] = decision_plane
    assert set(terms) == set(metrics.keys())
    if (didnt_converge := len([1 for i in metrics.values() if i and not i["did_converge"]])):
        warnings.warn(f"{didnt_converge} of the {len(metrics)} SVMs did not converge!", sklearn.exceptions.ConvergenceWarning)
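# Illustrative sketch (not part of the original pipeline): what fitting one "candidate SVM" boils
# down to -- a linear classifier separating the entities that contain a term from those that don't,
# whose coefficient vector can be read as a direction in the embedding space. Uses sklearn's
# LinearSVC on made-up data; the real `create_candidate_svm` may differ in details.
def _demo_candidate_direction():
    import numpy as np
    from sklearn.svm import LinearSVC
    rng = np.random.default_rng(0)
    embedding = rng.normal(size=(50, 3))           # 50 entities in a 3-dimensional space
    quants = (embedding[:, 0] > 0.0).astype(int)   # pretend the term occurs for some entities
    svm = LinearSVC(class_weight="balanced", max_iter=10000).fit(embedding, quants)
    direction = svm.coef_[0] / np.linalg.norm(svm.coef_[0])  # unit normal of the decision plane
    return direction, svm.intercept_[0]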
def show_close_descriptions(dissim_mat, descriptions, is_embedding=False, num=10, title="Dissim-Mat"):
    # closest_entries = list(zip(*np.where(dissim_mat==min(dissim_mat[dissim_mat>0]))))
    # closest_entries = set(tuple(sorted(i)) for i in closest_entries)
    # print(f"Closest Nonequal Descriptions: \n", "\n".join(["*b*"+("*b* & *b*".join([descriptions._descriptions[i].title for i in j]))+"*b*" for j in closest_entries]))
    print(f"Closest {num} Descriptions in {title}:")
    if is_embedding:
        dissim_mat = _create_dissim_mat(dissim_mat, get_setting("DISSIM_MEASURE"), force_singlethread=len(dissim_mat) < 500, silent=len(dissim_mat) < 500)[0]
    is_dissim = np.allclose(np.diagonal(dissim_mat), 0, atol=1e-10)
    assert is_dissim, "TODO now it's a similarity matrix"
    min_vals = sorted(squareform(dissim_mat))[:num]
    min_indices = np.where(np.isin(dissim_mat, min_vals))
    min_indices = [(i, j) for i, j in zip(*min_indices) if i != j]
    min_indices = list({j: None for j in [tuple(sorted(i)) for i in min_indices]}.keys())  # remove duplicates ("aircraft cabin and airplane cabin" vs "airplane cabin and aircraft cabin")
    for first, second in min_indices[:num]:
        print(f"  *b*{descriptions._descriptions[first].title}*b* and *b*{descriptions._descriptions[second].title}*b*")
def __init__(self, dtm, all_terms, quant_name, verbose=False):
    self.includes_pseudodocs = False
    self.dtm = dtm
    self.all_terms = {n: elem for n, elem in enumerate(all_terms)} if isinstance(all_terms, list) else all_terms
    self.quant_name = quant_name
    # if "all_terms" in kwargs and "descriptions" in kwargs: assert hasattr(kwargs["descriptions"][0], "bow")
    #     for desc in kwargs["descriptions"]: self.dtm.append([[self.reverse_term_dict[k], v] for k, v in desc.bow().items()])
    assert set(self.all_terms) == set(flatten([[elem[0] for elem in row] for row in self.dtm]))
    if verbose:
        print(f"Loaded Doc-Term-Matrix with {len(self.dtm)} documents and {len(self.all_terms)} terms.")
        self.show_info()
def json_load(fname, **kwargs):  # assert_meta=(), return_meta=False,
    try:
        if isinstance(fname, str):
            with open(fname, "r") as rfile:
                tmp = json.load(rfile, **kwargs)
        else:  # then it may be a sacred opened resource (https://sacred.readthedocs.io/en/stable/apidoc.html#sacred.Experiment.open_resource)
            tmp = json.load(fname, **kwargs)
        return npify_rek(tmp)
    except json.decoder.JSONDecodeError as e:
        print(f"{fname} doesn't work!")
        raise json.decoder.JSONDecodeError(msg=f"NAME:{fname}|||MSG:{e.msg}", doc=e.doc, pos=e.pos) from e
    except Exception as e:
        print(f"{fname} doesn't work!")
        raise e
def classify_shallowtree_multi(clusters, embedding, descriptions, dataset_class, classes=None, verbose=False, **kwargs):
    results = {}
    for classes in (([classes] if isinstance(classes, str) else classes) or descriptions.additionals_names):
        for test_percentage_crossval in [0.33, 0.5, 4, 5]:
            for one_vs_rest in [True, False]:
                for dt_depth in [1, 2, 3, None]:
                    for balance_classes in [True, False]:
                        results[(classes, test_percentage_crossval, one_vs_rest, dt_depth, balance_classes)] = {}
                        for metric in ["accuracy", "f1"]:
                            print("==" * 50)
                            score = classify_shallowtree(clusters, embedding, descriptions, dataset_class, one_vs_rest, dt_depth, test_percentage_crossval,
                                                         classes, verbose=verbose, return_features=False, balance_classes=balance_classes,
                                                         metric=metric, also_unweighted=True, **kwargs)
                            results[(classes, test_percentage_crossval, one_vs_rest, dt_depth, balance_classes)][metric] = score
    df = pd.DataFrame(results, columns=pd.MultiIndex.from_tuples([i for i in results.keys()], names=("classes", "test%/Xval", "1vsRest", "Tree-Depth", "balanced")))
    with pd.option_context('display.max_rows', 51, 'display.max_columns', 20, 'display.expand_frame_repr', False, 'display.max_colwidth', 20, 'display.float_format', '{:.4f}'.format):
        print(df)
    return df
def __init__(self, **kwargs):
    assert kwargs.keys() == self.OPTION_LETTER.keys()
    if kwargs["use_skcountvec"]:
        must_override = [i for i in kwargs.keys() - self.SKCOUNTVEC_SUPPORTS - {"use_skcountvec"} if kwargs[i]]
        if must_override:
            print(f"Must overwrite the following PP-Components to False as the SKLearn-CountVectorizer doesn't support them: {', '.join(must_override)}")
            raise Exception("No can do!")
        kwargs = {k: False if k in must_override else v for k, v in kwargs.items()}
    self.di = kwargs
def main():
    DATASET = "siddata"
    args = parse_command_line_args()
    setup_logging()
    load_envfiles(DATASET)
    ctx = SnakeContext.loader_context(silent=False)
    descriptions = ctx.load("pp_descriptions")
    res = extract_classes(descriptions, args.classes, ctx.obj["dataset_class"], use_name=args.named)
    fname = join(ctx.p.in_dir, "fb_classifier", f"{DATASET}_{args.classes}{'_named' if args.named else ''}.csv")
    res.reset_index().to_csv(fname)
    print(f"Saved under {fname}.")
def list_paramcombis(ctx):
    from derive_conceptualspace.cli.args_from_filename import LAST_RESULT  # TODO get rid of this entirely.
    # TODO this should ONLY consider command-line-args as config to compare the candidates to
    candidates = [join(path, name)[len(ctx.p.in_dir):] for path, subdirs, files in os.walk(join(ctx.p.in_dir, "")) for name in files if name.startswith(f"{LAST_RESULT}.json")]  # TODO better LAST_RESULT
    candidates = [i for i in candidates if i.startswith(ctx.p.get_subdir({i: ctx.get_config(i) for i in ["DEBUG", "DATASET", "LANGUAGE"]})[0])]
    for cand in candidates:
        print(cand)
def set_config(self, key, val, source, silent=False):  # this is only a suggestion, it will only be finally set once it's accessed!
    key, val = standardize_config(key, val)
    if key in self.used_configs and val != self.used_configs[key]:
        raise ValueError(fmt(f"{source} is trying to overwrite config {key} with *r*{val}*r*, but it was already used with value *b*{self.used_configs[key]}*b*!"))
    self.toset_configs.append([key, val, source])
    existing_configs = list(zip(*[i for i in self.toset_configs if i[0] == key and i[2] not in ["defaults", "smk_args"]]))
    if existing_configs and len(set(existing_configs[1])) > 1 and existing_configs[0][0] not in settings.MAY_DIFFER_IN_DEPENDENCIES:
        # TODO this has become a mess. I originally only wanted this warning for dependency, but then expanded it for force and now it's BS. Overhaul this!!
        ordered_args = sorted(list(zip(*existing_configs[::-1][:2])), key=lambda x: CONF_PRIORITY.index(re.sub(r'\[.*?\]', '', x[0])))
        ordered_args = dict(sorted({v: k for k, v in list({v: k for k, v in ordered_args[::-1]}.items())}.items(), key=lambda x: CONF_PRIORITY.index(re.sub(r'\[.*?\]', '', x[0]))))  # per value only keep the highest-priority thing that demanded it
        if "dependency" in ordered_args and ordered_args["dependency"] != ordered_args.get("force", ordered_args["dependency"]):
            raise ValueError(f"A Dependency requires {existing_configs[0][0]} to be {dict(ordered_args)['dependency']} but your other config demands {[v for k, v in ordered_args.items() if k != 'dependency'][0]}")
        # if "dataset_class" in ordered_args and bool([k for k, v in ordered_args.items() if v != ordered_args["dataset_class"]]):  # if something of higher prio overwrites dataset_class
        #     raise ValueError(f"dataset_class requires {existing_configs[0][0]} to be {dict(ordered_args)['dataset_class']} but it will be overwritten by {[k for k, v in ordered_args.items() if v != ordered_args['dataset_class']]}")
        ordered_args = list(ordered_args.items())
        if f"{existing_configs[0][0]} from {ordered_args[1][1]} to {ordered_args[0][1]}" not in self._given_warnings:
            self._given_warnings.append(f"{existing_configs[0][0]} from {ordered_args[1][1]} to {ordered_args[0][1]}")
            if not (silent or (hasattr(self, "silent") and self.silent)):
                print(f"{ordered_args[1][0]} demanded config {existing_configs[0][0]} to be *r*{ordered_args[1][1]}*r*, but {ordered_args[0][0]} overwrites it to *b*{ordered_args[0][1]}*b*")
def extract_classes(descriptions, classes, dataset_class, use_name=False):
    # TODO: merge this with the content of the very same thing in derive_conceptualspace.evaluate.shallow_trees.classify_shallowtree
    if classes is None:
        classes = descriptions.additionals_names[0]
    if classes in descriptions.additionals_names:
        catnames = None
        if hasattr(dataset_class, "CATNAMES") and classes in dataset_class.CATNAMES:
            catnames = dataset_class.CATNAMES.get(classes)
        hascat = [n for n, i in enumerate(descriptions._descriptions) if i._additionals[classes] is not None]
        getcat = lambda i: descriptions._descriptions[i]._additionals[classes]
    elif hasattr(dataset_class, "get_custom_class"):
        getcat, hascat, catnames = dataset_class.get_custom_class(classes, descriptions, verbose=True)
    else:
        raise Exception(f"The class {classes} does not exist!")
    if catnames and use_name:
        orig_getcat = getcat
        getcat = lambda x: catnames.get(int(orig_getcat(x)), orig_getcat(x))
    else:
        orig_getcat = getcat
        getcat = lambda x: int(orig_getcat(x)) - 1  # labels 0-9 instead of 1-10
    print(f"Using classes from {classes} - {len(hascat)}/{len(descriptions)} entities have a class")
    cats = {i: getcat(i) for i in hascat}
    print(f"Labels ({len(set(cats.values()))} classes):", ", ".join(f"*b*{k}*b*: {v}" for k, v in Counter(cats.values()).items()))
    return pd.DataFrame({descriptions._descriptions[i].title: [descriptions._descriptions[i].unprocessed_text, getcat(i)] for i in hascat}, index=["text", "class"]).T
def term_freqs(self, verbose=False):
    """the number of documents containing a word, for all words"""
    if not hasattr(self, "_term_freqs"):
        # occurences = [set(i[0] for i in doc) for doc in self.dtm]
        # self._term_freqs = {term: sum(term in doc for doc in occurences) for term in tqdm(list(self.all_terms.keys()), desc="Calculating Doc-Frequencies")}
        self._term_freqs = dict(enumerate(self.as_csr(binary=True).sum(axis=1).squeeze().tolist()[0]))
        if verbose:
            most_freq = sorted(self._term_freqs.items(), key=lambda x: x[1], reverse=True)[:5]
            print("Most frequent terms:", ", ".join([f"{self.all_terms[term]} ({num})" for term, num in most_freq]))
    return self._term_freqs
def create_languages_file(raw_descriptions, columns, json_persister, dataset_class, declare_silent=False, pp_components=None, proc_descs=None):
    if isinstance(columns, str):
        columns = [columns]
    results = {}
    for col in columns:
        try:
            languages = json_persister.load(None, f"{col}_languages", loader=lambda langs: langs, silent=declare_silent)
        except FileNotFoundError:
            if proc_descs is None:
                proc_descs = dataset_class.preprocess_raw_file(raw_descriptions, pp_components=PPComponents.from_str(pp_components))
            langs = get_langs(proc_descs[col], assert_len=False, pgbar_name=f"Getting Language of {col}")
            langs = {i[col]: langs[i[col]] for _, i in proc_descs.iterrows() if not pd.isna(i[col])}
            json_persister.save(f"{col}_languages.json", langs=langs, ignore_confs=["DEBUG", "PP_COMPONENTS", "TRANSLATE_POLICY", "LANGUAGE"])
            languages = json_persister.load(None, f"{col}_languages", loader=lambda langs: langs, silent=declare_silent)
        else:
            print(f"Languages-file for {col} already exists!")
        results[col] = languages
    return results
def create_mds(dissim_mat, embed_dimensions, metric=True, init_from_isomap=True):
    max_iter = 10000 if not get_setting("DEBUG") else 100
    if not init_from_isomap:
        warnings.warn("sklearn's MDS is broken!! Have to init from something, don't f*****g ask why!")
        n_inits = math.ceil((max(get_ncpu()*2, (10 if not get_setting("DEBUG") else 3)))/get_ncpu())*get_ncpu()  # minimally 10, maximally ncpu*2, but in any case a multiple of ncpu
        print(f"Running {'non-' if not metric else ''}metric MDS {n_inits} times with {get_ncpu(ignore_debug=True)} jobs for max {max_iter} iterations.")
        embedding = MDS(n_components=embed_dimensions, dissimilarity="precomputed", metric=metric,
                        # TODO with metric=True it always breaks after the second step if n_components >> 2 (well, with metric=False too ^^)
                        n_jobs=get_ncpu(ignore_debug=True), verbose=1 if get_setting("VERBOSE") else 0, n_init=n_inits, max_iter=max_iter)
        mds = embedding.fit(dissim_mat)
    else:
        print(f"Running {'non-' if not metric else ''}metric MDS with {get_ncpu(ignore_debug=True)} jobs for max {max_iter} iterations, initialized from Isomap-Embeddings")
        embedding = MDS(n_components=embed_dimensions, dissimilarity="precomputed", metric=metric, n_jobs=get_ncpu(ignore_debug=True),
                        verbose=1 if get_setting("VERBOSE") else 0, n_init=1, max_iter=max_iter)
        try:
            isomap_init = create_isomap(dissim_mat, embed_dimensions, neighbor_factor=25).embedding_
        except ValueError:  # "There are significant negative eigenvalues..."
            isomap_init = np.random.random((len(dissim_mat), embed_dimensions))*0.01
        mds = embedding.fit(dissim_mat, init=isomap_init)
    return mds
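# Illustrative sketch (not part of the original pipeline): initializing a precomputed-dissimilarity
# MDS from an Isomap embedding, mirroring the default branch above but on made-up euclidean
# distances. The data, dimensionality, and this helper's name are assumptions for illustration.
def _demo_mds_from_isomap_init():
    import numpy as np
    from scipy.spatial.distance import pdist, squareform
    from sklearn.manifold import MDS, Isomap
    pts = np.random.default_rng(0).random((30, 5))
    dists = squareform(pdist(pts))                                        # square dissimilarity matrix
    init = Isomap(n_neighbors=5, n_components=2, metric="precomputed").fit(dists).embedding_
    mds = MDS(n_components=2, dissimilarity="precomputed", n_init=1, max_iter=300)
    return mds.fit_transform(dists, init=init)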
def read_configfile(self):
    if self.get_config("conf_file"):
        fname = self.get_config("conf_file")
        if not isfile(fname):
            fname = join(os.getenv(f"{ENV_PREFIX}_CONFIGDIR", dirname(settings.__file__)), fname)
        with open(fname, "r") as rfile:
            config = yaml.load(rfile, Loader=yaml.SafeLoader)
        if config.get("__perdataset__"):
            if config["__perdataset__"].get(self.get_config("dataset"), {}):
                config.update(config.get("__perdataset__", {}).get(self.get_config("dataset"), {}))
            del config["__perdataset__"]
        for k, v in config.items():
            if isinstance(v, list):  # IDEA: if v is a list and a cmd-arg or env-var has a value consistent with it, take that value
                overwriters = [i[1:] for i in self.toset_configs if i[0] == standardize_config_name(k) and CONF_PRIORITY.index(re.sub(r'\[.*?\]', '', i[2])) < CONF_PRIORITY.index("conf_file")]
                if overwriters and len(set([i[0] for i in overwriters])) > 1:
                    # assert len(overwriters) == 1 and overwriters[0][0] in v, "TODO: do this"
                    self.set_config(k, overwriters[0][0], "conf_file")
                else:
                    self.set_config(k, v[0], "conf_file")
            else:
                self.set_config(k, v, "conf_file")
        if not self.silent:
            print(f"Config-File {fname} loaded.")
def run_lsi_gensim(pp_descriptions, filtered_dcm, verbose=False):
    """as in [VISR12: 4.2.1]"""
    # TODO options here:
    #  * if it should filter AFTER the LSI
    if verbose:
        filtered_dcm.show_info(descriptions=pp_descriptions)
    if get_setting("DCM_QUANT_MEASURE") != "binary":
        logger.warn("VISR12 say it works best with binary!")
    filtered_dcm.add_pseudo_keyworddocs()
    dictionary = corpora.Dictionary([list(filtered_dcm.all_terms.values())])
    print("Start creating the LSA-Model with MORE topics than terms...")
    lsamodel_manytopics = LsiModel(filtered_dcm.dtm, num_topics=len(filtered_dcm.all_terms) * 2, id2word=dictionary)
    print("Start creating the LSA-Model with FEWER topics than terms...")
    lsamodel_lesstopics = LsiModel(filtered_dcm.dtm, num_topics=len(filtered_dcm.all_terms) // 10, id2word=dictionary)
    print()
    import matplotlib.cm
    import matplotlib.pyplot as plt
    # TODO use the mpl_tools here as well to also save the plot!
    plt.imshow(lsamodel_lesstopics.get_topics()[:100, :200], vmin=lsamodel_lesstopics.get_topics().min(),
               vmax=lsamodel_lesstopics.get_topics().max(), cmap=matplotlib.cm.get_cmap("coolwarm"))
    plt.show()
def extract_candidateterms(pp_descriptions, extraction_method, max_ngram, verbose=False, **kwargs):
    if extraction_method == "keybert":
        candidateterms, metainf = extract_candidateterms_keybert_nopp(pp_descriptions, max_ngram, get_setting("faster_keybert"), verbose=verbose, **kwargs)
    elif extraction_method == "pp_keybert":
        candidateterms, metainf = extract_candidateterms_keybert_preprocessed(pp_descriptions, max_ngram, get_setting("faster_keybert"), verbose=verbose, **kwargs)
    elif extraction_method in ["tfidf", "tf", "all", "ppmi"]:
        candidateterms, metainf = extract_candidateterms_quantific(pp_descriptions, max_ngram, quantific=extraction_method, verbose=verbose, **kwargs)
    else:
        raise NotImplementedError()
    flattened = set(flatten(candidateterms))
    print("Unique Terms I found: ",
          ", ".join([f"{k+1}-grams: {v}" for k, v in sorted(Counter([i.count(" ") for i in flattened]).items(), key=lambda x: x[0])]),
          "| sum:", len(flattened))
    metainf["n_candidateterms"] = len(flattened)
    return candidateterms, metainf