Example #1
def descriptions_from_bow(descs, languages, translations, translate_policy):
    if translate_policy != "onlyorig" or languages != "en":
        raise NotImplementedError()
    desc_list = DescriptionList(add_title=False,
                                add_subtitle=False,
                                translate_policy=translate_policy,
                                additionals_names=list(
                                    descs["classes"].keys()))
    if get_setting("DEBUG"):
        descs["vecs"] = dict(
            list(descs["vecs"].items())[:get_setting("DEBUG_N_ITEMS")])
    for name, bow in descs["vecs"].items():
        desc_list.add(
            Description(lang=languages,
                        text=None,
                        title=name,
                        subtitle=None,
                        orig_textlang=None,
                        bow=bow,
                        additionals={
                            k: v.get(name)
                            for k, v in descs["classes"].items()
                        }))
    desc_list.proc_steps.append("bow")
    return desc_list
 def preprocess_raw_file(df, pp_components, min_ges_nwords=20):
     """loads the given Siddata-Style CSV into a pandas-dataframe, already performing some processing like
         dropping duplicates"""
     #TODO in exploration I also played around with Levenhsthein-distance etc!
     #remove those for which the Name (exluding stuff in parantheses) is equal...
     assert isinstance(df, pd.DataFrame)
     df = df.reset_index().drop(columns=["Unnamed: 0", "index"])
     # df = df[~df['description'].isnull()]
     df = df[df["description"] != "[]"]
     if get_setting("DEBUG"):
         df = df[:get_setting("DEBUG_N_ITEMS") * 2]
     df = Dataset.merge_multidescs(df, pp_components)
     df.loc[:, 'ges_nwords'] = df["description"].str.count(" ").fillna(0)
     df["subtitle"] = df["subtitle"] + df[
         "subject"]  #TODO: maybe have an extra pp_comp for this?
     if pp_components.add_title:
         df.loc[:, 'ges_nwords'] += df["title"].str.count(" ").fillna(0)
     if pp_components.add_subtitle:
         df.loc[:, 'ges_nwords'] += df["subtitle"].str.count(" ").fillna(0)
     df = df[df["ges_nwords"] >= min_ges_nwords]
     with pd.option_context('mode.chained_assignment',
                            None):  #TODO publisher to get uni
         for column in ["title", "description", "subtitle"]:
             df.loc[:, column] = df[column].copy().str.strip()
     return df
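# A minimal, self-contained sketch of the word-count filter used above, on made-up toy data:
# `str.count(" ")` is used as a cheap proxy for the number of words per row, and rows below a
# threshold are dropped. The column names and the threshold here are illustrative assumptions.
import pandas as pd

toy = pd.DataFrame({"title": ["Intro to NLP", "Seminar"],
                    "description": ["A course about natural language processing methods.", "short"]})
toy["ges_nwords"] = toy["description"].str.count(" ").fillna(0) + toy["title"].str.count(" ").fillna(0)
print(toy[toy["ges_nwords"] >= 5][["title", "ges_nwords"]])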
def run_lsi(pp_descriptions, filtered_dcm, verbose):
    """as in [VISR12: 4.2.1]"""
    if verbose:
        filtered_dcm.show_info(descriptions=pp_descriptions)
        if get_setting("DCM_QUANT_MEASURE") != "binary":
            logger.warning("VISR12 say it works best with binary!")
    orig_len = len(filtered_dcm.dtm)
    filtered_dcm.add_pseudo_keyworddocs()
    # https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html
    svd = TruncatedSVD(n_components=100,
                       random_state=get_setting("RANDOM_SEED"))
    transformed = svd.fit_transform(filtered_dcm.as_csr().T)
    desc_psdoc_dists = cdist(transformed[:orig_len], transformed[orig_len:],
                             "cosine")
    already_keywords = [
        [ind, j[0]] for ind, elem in enumerate(filtered_dcm.dtm[:orig_len])
        for j in elem
    ]  # we don't gain information from those that are close but already keywords
    desc_psdoc_dists[list(zip(*already_keywords))] = np.inf
    WHICH_LOWEST = 30
    kth_lowest = np.partition(desc_psdoc_dists.min(axis=1), WHICH_LOWEST)[
        WHICH_LOWEST]  # https://stackoverflow.com/a/43171216/5122790
    good_fits = np.where(desc_psdoc_dists.min(axis=1) < kth_lowest)[0]
    for ndesc, keyword in zip(good_fits,
                              np.argmin(desc_psdoc_dists[good_fits], axis=1)):
        assert not filtered_dcm.all_terms[
            keyword] in pp_descriptions._descriptions[ndesc]
        print(f"*b*{filtered_dcm.all_terms[keyword]}*b*",
              pp_descriptions._descriptions[ndesc])
    print()
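# Stand-alone sketch (random data) of the two numpy/scipy building blocks of run_lsi: cosine
# distances between SVD-projected descriptions and pseudo-keyword-docs via cdist, and
# np.partition to grab the k-th lowest best-match distance without a full sort.
import numpy as np
from scipy.spatial.distance import cdist

rng = np.random.default_rng(0)
desc_vecs, kw_vecs = rng.random((50, 10)), rng.random((20, 10))
dists = cdist(desc_vecs, kw_vecs, "cosine")          # shape (50, 20)
k = 5
kth_lowest = np.partition(dists.min(axis=1), k)[k]   # k-th smallest of the per-row minima
good_fits = np.where(dists.min(axis=1) < kth_lowest)[0]
print(len(good_fits), "descriptions have an unusually close pseudo-keyword-doc")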
def create_candidate_svm(embedding, term, quants, classifier, plot_svm=False, descriptions=None, quant_name=None, pgbar=None, **kwargs):
    #!! term is only used for visualization, and it must stay that way for CLUSTER_DIRECTION_ALGO = "reclassify" !
    bin_labels = np.array(quants, dtype=bool) # Ensure that regardless of quant_measure this is correct binary classification labels
    # (tmp := len(quants)/(2*np.bincount(bin_labels)))[0]/tmp[1] is roughly equal to bin_labels.mean() so balancing is good
    if classifier == "SVM":
        svm = sklearn.svm.LinearSVC(class_weight="balanced", loss="hinge", max_iter=20000)
    elif classifier == "SVM_square":
        svm = sklearn.svm.LinearSVC(dual=False, class_weight="balanced") #squared-hinge instead of hinge (but fastest!)
    elif classifier == "SVM2":
        warnings.warn("Using an SVM Implementation that's slower for this kind of data!")
        svm = sklearn.svm.SVC(kernel="linear", class_weight="balanced", decision_function_shape="ovo")  #slower than LinearSVC, don't use!
        # see https://stackoverflow.com/q/33843981/5122790, https://stackoverflow.com/q/35076586/5122790
    else:
        raise NotImplementedError(f"Demanded classifier {classifier} not implemented!")
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        svm.fit(embedding, bin_labels)
        if w: assert issubclass(w[0].category, (sklearn.exceptions.ConvergenceWarning, DeprecationWarning))
        no_converge = (bool(w) and issubclass(w[0].category, sklearn.exceptions.ConvergenceWarning))
    tn, fp, fn, tp = confusion_matrix(bin_labels, svm.predict(embedding)).ravel()
    res = {"accuracy": (tp + tn) / len(quants), "precision": tp / (tp + fp), "recall": tp / (tp + fn), "did_converge": not no_converge}
    res["f_one"] = 2 * (res["precision"] * res["recall"]) / (res["precision"] + res["recall"])
    #now, in [DESC15:4.2.1], they compare the "ranking induced by \vec{v_t} with the number of times the term occurs in the entity's documents" with Cohen's Kappa.

    #see notebooks/proof_of_concept/get_svm_decisionboundary.ipynb#Checking-projection-methods-&-distance-measures-from-point-to-projection for the ranking
    decision_plane = NDPlane(svm.coef_[0], svm.intercept_[0])  #don't even need the plane class here
    dist = lambda x, plane: np.dot(plane.normal, x) + plane.intercept
    distances = [dist(point, decision_plane) for point in embedding]
    assert np.allclose(distances, svm.decision_function(embedding)) #see https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC.decision_function, https://stats.stackexchange.com/a/14881
    distances /= np.linalg.norm(svm.coef_[0]) #TODO: add the links and this normalization to the distances-notebook
    #sanity check: do most of the points with label=0 have the same sign `np.count_nonzero(np.sign(np.array(distances)[bin_labels])+1)
    # bin_labels, np.array((np.sign(np.array(distances))+1)/2, dtype=bool)
    # quant_ranking = np.zeros(quants.shape); quant_ranking[np.where(quants > 0)] = np.argsort(quants[quants > 0])
    #TODO Cohen's kappa has a sample_weight parameter!! DESC15 write they select Kappa "due to its tolerance to class imbalance." -> Does that mean I have to set the weight?!
    kappa_weights = get_setting("KAPPA_WEIGHTS") if get_setting("KAPPA_WEIGHTS") != "None" else None
    res["kappa_rank2rank_dense"]  = cohen_kappa(rankdata(quants, method="dense"), rankdata(distances, method="dense"), weights=kappa_weights) #if there are 14.900 zeros, the next is a 1
    res["kappa_rank2rank_min"] = cohen_kappa(rankdata(quants, method="min"), rankdata(distances, method="dense"), weights=kappa_weights) #if there are 14.900 zeros, the next one is a 14.901
    res["kappa_bin2bin"]    = cohen_kappa(bin_labels, [i > 0 for i in distances], weights=kappa_weights)
    res["kappa_digitized"]  = cohen_kappa(np.digitize(quants, np.histogram_bin_edges(quants)[1:]), np.digitize(distances, np.histogram_bin_edges(distances)[1:]), weights=kappa_weights)
    res["ndcg_all"] = ndcg_score(np.array([quants]), np.expand_dims(distances,0))
    res["ndcg_onlypos"] = ndcg_score(np.array([quants]), np.expand_dims(distances, 0), k=np.count_nonzero(np.array(quants)))
    nonzero_indices = np.where(np.array(quants) > 0)[0]
    q2, d2 = np.array(quants)[nonzero_indices], np.array(distances)[nonzero_indices]
    with nullcontext(): #warnings.catch_warnings(): #TODO get rid of what causes the NaNs here!!!
        # warnings.filterwarnings('ignore', r'invalid value encountered in true_divide')
        if quant_name == "count":  # in DESC15 they write "measure the correlation between the ranking induced by \vec{vt} and the number of times t appears in the documents associated with each entity", so maybe compare ranking to count?!
            # res["kappa_count2rank"] = cohen_kappa(quants, rankdata(distances, method="dense"), weights=kappa_weights)
            res["kappa_count2rank_onlypos"] = cohen_kappa(q2, rankdata(d2, method="dense"), weights=kappa_weights)
        res["kappa_rank2rank_onlypos_dense"] = cohen_kappa(rankdata(q2, method="dense"), rankdata(d2, method="dense"), weights=kappa_weights)
        res["kappa_rank2rank_onlypos_min"] = cohen_kappa(rankdata(q2, method="min"), rankdata(d2, method="min"), weights=kappa_weights)
        res["kappa_rank2rank_onlypos_max"] = cohen_kappa(rankdata(q2, method="max"), rankdata(d2, method="max"), weights=kappa_weights)
        # res["kappa_digitized_onlypos_1"] = cohen_kappa(np.digitize(q2, np.histogram_bin_edges(quants)[1:]), np.digitize(d2, np.histogram_bin_edges(distances)[1:]), weights=kappa_weights)
        #one ^ has as histogram-bins what it would be for ALL data, two only for the nonzero-ones
        res["kappa_digitized_onlypos_2"] = cohen_kappa(np.digitize(q2, np.histogram_bin_edges(q2)[1:]), np.digitize(d2, np.histogram_bin_edges(d2)[1:]), weights=kappa_weights)
    if plot_svm and descriptions is not None:
        display_svm(embedding, np.array(bin_labels, dtype=int), svm, term=term, descriptions=descriptions, name=term+" "+(", ".join(f"{k}: {round(v, 3)}" for k, v in res.items())), quants=quants, distances=distances, **kwargs)
    if pgbar is not None:
        pgbar.update(1)
    return res, decision_plane, term
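# Stand-alone check (synthetic data) of the geometric claim in create_candidate_svm: for a
# linear SVM, decision_function(x) equals w·x + b, and dividing by ||w|| gives the signed
# euclidean distance of x to the separating hyperplane.
import numpy as np
import sklearn.svm
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
svm = sklearn.svm.LinearSVC(class_weight="balanced", max_iter=20000).fit(X, y)
raw = X @ svm.coef_[0] + svm.intercept_[0]
assert np.allclose(raw, svm.decision_function(X))
signed_dist = raw / np.linalg.norm(svm.coef_[0])  # signed point-to-hyperplane distance
print(signed_dist[:3])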
def create_candidate_svms(dcm, embedding, descriptions, verbose, continue_from=None):
    #TODO I am still not sure whether I am calculating with vectors somewhere when I should be working with points
    if hasattr(embedding, "embedding_"): embedding = embedding.embedding_
    decision_planes = {}
    metrics = {}
    terms = list(dcm.all_terms.values())
    metainf = {}
    if get_setting("DEBUG"):
        maxlen = min(len(terms), len(embedding), get_setting("DEBUG_N_ITEMS"), len(dcm.dtm))
        working_inds = [nterm for nterm, term in enumerate(terms[:maxlen]) if np.array(dcm.term_quants(term)[:maxlen], dtype=bool).std()] #those with >1 class
        term_inds = unique(flatten([j[0] for j in dcm.dtm[i]] for i in working_inds))
        terms = [dcm.all_terms[i] for i in term_inds]
        embedding = embedding[working_inds]
        ind_translator = {v: k for k, v in enumerate(term_inds)}
        dcm = DocTermMatrix([[[ind_translator[j[0]],j[1]] for j in dcm.dtm[i]] for i in working_inds],
                            {ind_translator[i]: dcm.all_terms[i] for i in term_inds}, dcm.quant_name)
        print(f"Debug-Mode: Running for {len(working_inds)} Items and {len(terms)} Terms.")
        # warnings.warn("PRECOMMIT there's stuff here!")
        # assert all(i in terms for i in ['nature', 'ceiling', 'engine', 'athlete', 'seafood', 'shadows', 'skyscrapers', 'b737', 'monument', 'baby', 'sign', 'marine', 'iowa', 'field', 'buy', 'military', 'lounge', 'factory', 'road', 'education', '13thcentury', 'people', 'wait', 'travel', 'tunnel', 'treno', 'wings', 'hot', 'background', 'vintage', 'farmhouse', 'technology', 'building', 'horror', 'realestate', 'crane', 'slipway', 'ruin', 'national', 'morze'])
        # terms = ['nature', 'ceiling', 'engine', 'athlete', 'seafood', 'shadows', 'skyscrapers', 'b737', 'monument', 'baby', 'sign', 'marine', 'iowa', 'field', 'buy', 'military', 'lounge', 'factory', 'road', 'education', '13thcentury', 'people', 'wait', 'travel', 'tunnel', 'treno', 'wings', 'hot', 'background', 'vintage', 'farmhouse', 'technology', 'building', 'horror', 'realestate', 'crane', 'slipway', 'ruin', 'national', 'morze']
        # assert len([i for i in descriptions._descriptions if 'nature' in i]) == len([i for i in dcm.term_quants('nature') if i > 0])
        # print(f"Running only for the terms {terms}")
    else:
        assert all(len([i for i in descriptions._descriptions if term in i]) == len([i for i in dcm.term_quants(term) if i > 0]) for term in random.sample(terms, 5))
    if get_setting("DO_SANITYCHECKS"):
        assert all(dcm.term_quants(terms[i]) == list(dcm.as_csr()[i,:].toarray().squeeze()) for i in random.sample(range(len(terms)), 5))

    quants_s = dcm.as_csr().toarray().tolist()  # [dcm.term_quants(term) for term in tqdm(terms, desc="Counting Terms")]
    ncpu = get_ncpu(ram_per_core=10) #TODO: make ram_per_core dependent on dataset-size
    if ncpu == 1:  #TODO Interruptible: for ncpu==1, I'm adding direct key-value-pairs, in the ncpu>1 version I'm appending to a list -> they are incompatible!
        with Interruptible(zip(terms, quants_s), ([], decision_planes, metrics), metainf, continue_from=continue_from, pgbar="Creating Candidate SVMs [1 proc]", total=len(terms), name="SVMs") as iter:
            for term, quants in iter: #in tqdm(zip(terms, quants_s), desc="Creating Candidate SVMs", total=len(terms))
                cand_mets, decision_plane, term = create_candidate_svm(embedding, term, quants, classifier=get_setting("CLASSIFIER"), descriptions=descriptions, quant_name=dcm.quant_name)
                metrics[term] = cand_mets
                decision_planes[term] = decision_plane
    else:
        print(f"Starting Multiprocessed with {ncpu} CPUs")
        with Interruptible(zip(terms, quants_s), [None, [], None], metainf, continue_from=continue_from, contains_mp=True, name="SVMs", total=len(quants_s)) as iter:
            with tqdm(total=iter.n_elems, desc=f"Creating Candidate SVMs [{ncpu} procs]") as pgbar, ThreadPool(ncpu, comqu=iter.comqu) as p:
                res, interrupted = p.starmap(create_candidate_svm, zip(repeat(embedding, iter.n_elems), repeat("next_0"), repeat("next_1"), repeat(get_setting("CLASSIFIER")), repeat(False), repeat(None), repeat(dcm.quant_name), repeat(pgbar)), draw_from=iter.iterable)
            _, res, _ = iter.notify([None, res, None], exception=interrupted)
            if interrupted is not False:
                return quants_s, res, None, metainf
        for cand_mets, decision_plane, term in res:
            metrics[term] = cand_mets
            decision_planes[term] = decision_plane
        assert set(terms) == set(metrics.keys())
    if (didnt_converge := len([1 for i in metrics.values() if i and not i["did_converge"]])):
        warnings.warn(f"{didnt_converge} of the {len(metrics)} SVMs did not converge!", sklearn.exceptions.ConvergenceWarning)
def run_lsi_gensim(pp_descriptions, filtered_dcm, verbose=False):
    """as in [VISR12: 4.2.1]"""
    # TODO options here:
    # * if it should filter AFTER the LSI

    if verbose:
        filtered_dcm.show_info(descriptions=pp_descriptions)
        if get_setting("DCM_QUANT_MEASURE") != "binary":
            logger.warning("VISR12 say it works best with binary!")

    filtered_dcm.add_pseudo_keyworddocs()
    dictionary = corpora.Dictionary([list(filtered_dcm.all_terms.values())])
    print("Start creating the LSA-Model with MORE topics than terms...")
    lsamodel_manytopics = LsiModel(filtered_dcm.dtm,
                                   num_topics=len(filtered_dcm.all_terms) * 2,
                                   id2word=dictionary)
    print("Start creating the LSA-Model with FEWER topics than terms...")
    lsamodel_lesstopics = LsiModel(filtered_dcm.dtm,
                                   num_topics=len(filtered_dcm.all_terms) //
                                   10,
                                   id2word=dictionary)
    print()
    import matplotlib.cm
    import matplotlib.pyplot as plt
    # TODO use the mpl_tools here as well to also save plot!
    plt.imshow(lsamodel_lesstopics.get_topics()[:100, :200],
               vmin=lsamodel_lesstopics.get_topics().min(),
               vmax=lsamodel_lesstopics.get_topics().max(),
               cmap=matplotlib.cm.get_cmap("coolwarm"))
    plt.show()
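# Minimal, hypothetical gensim usage for the pipeline the snippet above relies on: build a
# Dictionary from tokenized texts, convert them to bag-of-words tuples, and fit an LsiModel.
# The toy texts are made up; this only illustrates the API shape, not the project's data flow.
from gensim import corpora
from gensim.models import LsiModel

texts = [["svd", "topic", "model"], ["topic", "model", "corpus"], ["svd", "matrix"]]
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(t) for t in texts]
lsi = LsiModel(bow_corpus, num_topics=2, id2word=dictionary)
print(lsi.get_topics().shape)  # (num_topics, num_terms)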
Example #7
def get_name_dict(clusters, cluster_reprs, clus_rep_algo=None):
    clus_rep_algo = clus_rep_algo or get_setting("CLUS_REP_ALGO")
    if clus_rep_algo.startswith("top"):
        topwhat = int(clus_rep_algo.split("_")[1])
        return {k: ",".join(([k]+v)[:topwhat]) for k, v in clusters.items()}
    elif clus_rep_algo in list(cluster_reprs.values())[0].keys():
        return {k: v[clus_rep_algo] for k, v in cluster_reprs.items()}
    raise NotImplementedError()
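# Hypothetical usage of the "top_N" policy from get_name_dict: each cluster is named by its
# representative term plus the first N-1 member terms. The cluster contents are made up.
toy_clusters = {"lecture": ["seminar", "course", "tutorial"], "math": ["algebra", "analysis"]}
names = {k: ",".join(([k] + v)[:3]) for k, v in toy_clusters.items()}  # clus_rep_algo == "top_3"
print(names)  # {'lecture': 'lecture,seminar,course', 'math': 'math,algebra,analysis'}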
Example #8
def create_dissim_mat(descriptions: DescriptionList,
                      quantification_measure,
                      verbose=False,
                      **interrupt_kwargs):
    #Options here: get_setting("NGRAMS_IN_EMBEDDING"), get_setting("DISSIM_MAT_ONLY_PARTNERED")
    if get_setting("DEBUG"):
        descriptions._descriptions = descriptions._descriptions[:get_setting(
            "DEBUG_N_ITEMS")]
    dtm, metainf = descriptions.generate_DocTermMatrix(
        min_df=2 if get_setting("DISSIM_MAT_ONLY_PARTNERED") else 1,
        max_ngram=get_setting("MAX_NGRAM")
        if get_setting("NGRAMS_IN_EMBEDDING") else None,
        do_tfidf=quantification_measure
        if quantification_measure in ["tfidf", "tf"] else None)
    assert any(
        " " in i
        for i in dtm.all_terms.values()) == (get_setting("NGRAMS_IN_EMBEDDING")
                                             and get_setting("MAX_NGRAM") > 1)
    quantification = dtm.apply_quant(
        quantification_measure, descriptions=descriptions,
        verbose=verbose) if not metainf.get("sklearn_tfidf") else dtm
    # this is now \textbf{v}_e with all e's as rows
    #cannot use PPMIs directly, because a) too sparse, and b) we need a geometric representation with euclidean properties (betweenness, parallelism, ..)
    assert all(
        len(set((lst := [i[0] for i in dtm]))) == len(lst)
        for dtm in quantification.dtm)
def create_mds(dissim_mat, embed_dimensions, metric=True, init_from_isomap=True):
    max_iter = 10000 if not get_setting("DEBUG") else 100
    if not init_from_isomap:
        warnings.warn("sklearn's MDS is broken!! Have to init from something, don't f*****g ask why!")
        n_inits = math.ceil((max(get_ncpu()*2, (10 if not get_setting("DEBUG") else 3)))/get_ncpu())*get_ncpu() # minimally 10, maximally ncpu*2, but in any case a multiple of ncpu
        print(f"Running {'non-' if not metric else ''}metric MDS {n_inits} times with {get_ncpu(ignore_debug=True)} jobs for max {max_iter} iterations.")
        embedding = MDS(n_components=embed_dimensions, dissimilarity="precomputed",
                        metric=metric, #TODO with metric=True it always breaks after the second step if n_components>>2 (well, with metric=False too^^)
                        n_jobs=get_ncpu(ignore_debug=True), verbose=1 if get_setting("VERBOSE") else 0, n_init=n_inits, max_iter=max_iter)
        mds = embedding.fit(dissim_mat)
    else:
        print(f"Running {'non-' if not metric else ''}metric MDS with {get_ncpu(ignore_debug=True)} jobs for max {max_iter} iterations, initialized from Isomap-Embeddings")
        embedding = MDS(n_components=embed_dimensions, dissimilarity="precomputed", metric=metric,
                        n_jobs=get_ncpu(ignore_debug=True), verbose=1 if get_setting("VERBOSE") else 0, n_init=1, max_iter=max_iter)
        try:
            isomap_init = create_isomap(dissim_mat, embed_dimensions, neighbor_factor=25).embedding_
        except ValueError: #There are significant negative eigenvalues...
            isomap_init = np.random.random((len(dissim_mat), embed_dimensions))*0.01
        mds = embedding.fit(dissim_mat, init=isomap_init)
    return mds
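# Minimal sketch of sklearn's MDS on a precomputed dissimilarity matrix with an explicit init,
# mirroring the init_from_isomap branch above; here the init is random noise instead of an
# Isomap embedding, purely to keep the example self-contained.
import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn.manifold import MDS

points = np.random.default_rng(0).random((30, 5))
dissim = squareform(pdist(points, "cosine"))          # symmetric, zero diagonal
init = np.random.default_rng(1).random((30, 2)) * 0.01
mds = MDS(n_components=2, dissimilarity="precomputed", n_init=1, max_iter=300).fit(dissim, init=init)
print(mds.embedding_.shape, mds.stress_)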
def run_preprocessing_funcs(descriptions: DescriptionList,
                            components,
                            word_tokenizer=None):
    #components: sent_tokenize=True, lemmatize=True, remove_stopwords=True, convert_lower=True, remove_diacritics=True, remove_punctuation=True
    # TODO use TreeTagger? https://textmining.wp.hs-hannover.de/Preprocessing.html#Alternative:-Treetagger
    # https://textmining.wp.hs-hannover.de/Preprocessing.html#Satzerkennung-und-Tokenization
    assert components.convert_lower, "Stopwords are lower-case so not converting is not allowed (bad for german...!)"
    if components.remove_htmltags:
        descriptions.process_all(
            lambda data: re.compile(r'<.*?>').sub('', data), "remove_htmltags")
    if components.sent_tokenize:
        if not get_setting("USE_STANZA"):
            descriptions.process_all(
                nltk_sent_tokenize,
                "sent_tokenize",
                indiv_kwargs=dict(
                    language=lambda desc: NLTK_LAN_TRANSLATOR[desc.lang])
            )  #nltk suuucks!! sent_tokenize(*, language=german) even splits "...am Ende des 2. Semesters", or, even worse, "Relevante Probleme wie z.B. Lautierungsregeln", but if there's no space after a dot it DOESN'T split obvious sentences! Very visible in the description "!! FÄLLT AB 15.11. AUS !! Lektürekurs Spanisch I (Gruppe A und B)." #TODO!
            #TODO maybe write a small rule-based postprocessor that handles common cases like... * "z.B." is not split * "\d+\. noun" (e.g. "2. Semester") is not split * things like "15.11." (i.e. dates) are not split, ...
        else:
            logging.getLogger('stanza').setLevel(logging.ERROR)
            import stanza
            if len(descriptions.languages) > 1: raise NotImplementedError()
            nlp = stanza.Pipeline(lang='de', processors='tokenize')
            fn = lambda txt: [i._text for i in nlp(txt).sentences]
            descriptions.process_all(fn,
                                     "sent_tokenize",
                                     pgbar="Stanza Sentence-Tokenizing")
    if components.convert_lower:
        convert_lower_all(descriptions)
    #tokenization will happen anyway!
    if not components.lemmatize:
        word_tokenize_all(descriptions,
                          word_tokenizer=word_tokenizer,
                          remove_stopwords=components.remove_stopwords)
    else:
        word_tokenize_all(descriptions,
                          word_tokenizer=word_tokenizer,
                          remove_stopwords=False)
        lemmatize_all(descriptions, components.convert_lower,
                      components.remove_punctuation)
        if components.remove_stopwords:
            descriptions.process_all(
                lambda txt, stopwords:
                [[lemma for lemma in sent if lemma not in stopwords]
                 for sent in txt],
                "remove_stopwords",
                indiv_kwargs=dict(
                    stopwords=lambda desc: get_stopwords(desc.lang)))
    if components.remove_diacritics:
        remove_diacritics_all(descriptions)
    if components.remove_punctuation:
        remove_punctuation_all(descriptions)
    return descriptions
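# Small sketch of two of the steps above on a made-up snippet: the html-tag-stripping regex
# and nltk's sentence tokenizer with an explicit language (requires nltk's punkt data, fetched
# below). The comment in run_preprocessing_funcs reports that the German model mis-splits
# after abbreviations like "z.B." in exactly this kind of sentence.
import re
import nltk
nltk.download("punkt", quiet=True)
from nltk.tokenize import sent_tokenize

raw = "<p>Relevante Probleme wie z.B. Lautierungsregeln. Zweiter Satz.</p>"
no_html = re.compile(r"<.*?>").sub("", raw)
print(sent_tokenize(no_html, language="german"))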
Example #11
 def _filter_step2(dtm, used_terms_set, verbose=False, descriptions=None):
     all_terms_new = dict(
         enumerate(
             [v for k, v in dtm.all_terms.items() if k in used_terms_set]))
     all_terms_new_rev = {v: k for k, v in all_terms_new.items()}
     dtm_translator = {
         k: all_terms_new_rev[v]
         for k, v in dtm.all_terms.items() if k in used_terms_set
     }
     doc_term_matrix = [[[dtm_translator.get(ind), num] for ind, num in doc
                         if ind in used_terms_set] for doc in dtm.dtm]
     if descriptions:
         if get_setting("DO_SANITYCHECKS"):
             expected_bows = {
                 ndoc: {all_terms_new[elem]: count
                        for elem, count in doc}
                 for ndoc, doc in enumerate(doc_term_matrix[:10])
             }
             assert all(
                 all(v == descriptions._descriptions[i].bow()[k]
                     for k, v in expected_bows[i].items() if not " " in k)
                 for i in range(10))
             assert all(
                 all(v == descriptions._descriptions[i].count_phrase(k)
                     for k, v in expected_bows[i].items() if not " " in k)
                 for i in range(10))
             assert all(
                 all_terms_new[ind] in descriptions._descriptions[ndoc]
                 for ndoc, doc in enumerate(
                     tqdm(
                         doc_term_matrix,
                         desc=
                         "Cross-checking filtered DCM with Descriptions [sanity-check]"
                     )) for ind, count in doc)
         if verbose:
             shown = []
             for n_keyphrases in [0, 1, 20]:
                 items = [[
                     descriptions._descriptions[i],
                     [all_terms_new[j[0]] for j in e]
                 ] for i, e in enumerate(doc_term_matrix)
                          if len(e) <= n_keyphrases]
                 if items:
                     print(
                         f"Documents with max {n_keyphrases} keyphrases ({len(items)}):\n  "
                         + "\n  ".join(
                             f"{i[0]}: {', '.join(i[1])}" for i in
                             [j
                              for j in items if j[0] not in shown][:5][:5]))
                     shown += [i[0] for i in items]
     return DocTermMatrix(dtm=doc_term_matrix,
                          all_terms=all_terms_new,
                          quant_name="count",
                          verbose=verbose)
Example #12
def extract_coursetype(desc, coursetypes=None):
    raise NotImplementedError("Hard TODO: move this to dataset_spefics.siddata")
    coursetypes = coursetypes or get_setting("COURSE_TYPES")
    for type in coursetypes:
        if any(i in desc.unprocessed_text.lower() for i in [f"this {type}"]):
            return type
    counts = {i: desc.bow().get(i, 0) for i in coursetypes}
    if any(i > 0 for i in counts.values()):
        return max(counts.items(), key=lambda x:x[1])[0]
    counts = {i: desc.unprocessed_text.lower().count(i) for i in coursetypes}
    if any(i > 0 for i in counts.values()):
        return max(counts.items(), key=lambda x:x[1])[0]
    return None
def join_clusters_reclassify(clusters, dcm, embedding, verbose=False):
    if hasattr(embedding, "embedding_"): embedding = embedding.embedding_
    all_cand_mets = {}
    cluster_directions = {}
    for k, v in tqdm(clusters.items(), desc="Reclassifying Clusters"):
        embed = embedding
        dtm = DocTermMatrix.submat_forterms(dcm, [k] + v)
        combined_quants = dtm.as_csr().toarray().sum(axis=0)
        if any(i < get_setting("CANDIDATE_MIN_TERM_COUNT") or i > dtm.n_docs-get_setting("CANDIDATE_MIN_TERM_COUNT") for i in Counter(np.array(combined_quants, dtype=bool)).values()):
            #TODO have an option for doing this GENERALLY for the SVMs (and plot in 3D)
            c0_inds = np.where(combined_quants <= np.percentile(combined_quants, 30))[0]
            c1_inds = np.where(combined_quants >= np.percentile(combined_quants, 70))[0]
            used_inds = sorted(list(set(c0_inds)|set(c1_inds)))
            embed = embedding[used_inds]
            if verbose:
                print(f"For cluster {k}, the distribution is {dict(Counter(np.array(combined_quants, dtype=bool)))}, so we'll take the most distinct {get_setting('MOST_DISTINCT_PERCENT')}% ({len(c0_inds)} entities per class)")
            combined_quants = [combined_quants[i] if i in c1_inds else 0 for i in used_inds]
        cand_mets, decision_plane, _ = create_candidate_svm(embed, f"cluster:{k}", combined_quants, get_setting("CLASSIFIER"), quant_name=dtm.quant_name)
        all_cand_mets[k] = cand_mets
        cluster_directions[k] = decision_plane
    if verbose:
        print(f"Scores for {get_setting('CLASSIFIER_SUCCMETRIC')} per cluster:", ", ".join(f"{k}: {v[get_setting('CLASSIFIER_SUCCMETRIC')]:.2f}" for k, v in all_cand_mets.items()))
    return cluster_directions
def show_close_descriptions(dissim_mat, descriptions, is_embedding=False, num=10, title="Dissim-Mat"):
    # closest_entries = list(zip(*np.where(dissim_mat==min(dissim_mat[dissim_mat>0]))))
    # closest_entries = set(tuple(sorted(i)) for i in closest_entries)
    # print(f"Closest Nonequal Descriptions: \n", "\n".join(["*b*"+("*b* & *b*".join([descriptions._descriptions[i].title for i in j]))+"*b*" for j in closest_entries]))
    print(f"Closest {num} Descriptions in {title}:")
    if is_embedding:
        dissim_mat = _create_dissim_mat(dissim_mat, get_setting("DISSIM_MEASURE"), force_singlethread=len(dissim_mat)<500, silent=len(dissim_mat)<500)[0]
    is_dissim = np.allclose(np.diagonal(dissim_mat), 0, atol=1e-10)
    assert is_dissim, "TODO now it's a similarity matrix"
    min_vals = sorted(squareform(dissim_mat))[:num]
    min_indices = np.where(np.isin(dissim_mat, min_vals))
    min_indices = [(i,j) for i,j in zip(*min_indices) if i!=j]
    min_indices = list({j: None for j in [tuple(sorted(i)) for i in min_indices]}.keys()) #remove duplicates ("aircraft cabin and airplane cabin" and "airplane cabin and aircraft cabin")
    for first, second in min_indices[:num]:
        print(f"  *b*{descriptions._descriptions[first].title}*b* and *b*{descriptions._descriptions[second].title}*b*")
def create_embedding(dissim_mat, embed_dimensions, embed_algo, verbose=False, pp_descriptions=None):
    dtm, dissim_mat = dissim_mat
    if get_setting("DEBUG"):
        dissim_mat = dissim_mat[:get_setting("DEBUG_N_ITEMS"), :get_setting("DEBUG_N_ITEMS")]
    is_dissim = np.allclose(np.diagonal(dissim_mat), 0, atol=1e-10)
    if not is_dissim:
        print("Seems like you had a similarity matrix, not a dissimilarity matrix! Fixing it.")
        assert np.allclose(np.diagonal(dissim_mat), 1, atol=1e-10)
        assert dissim_mat.min() >= 0 and dissim_mat.max() <= 1
        dissim_mat = 1-dissim_mat
    if embed_algo == "mds":
        embed = create_mds(dissim_mat, embed_dimensions)
    elif embed_algo == "tsne":
        embed = create_tsne(dissim_mat, embed_dimensions)
    elif embed_algo == "isomap":
        embed = create_isomap(dissim_mat, embed_dimensions)
    else:
        raise NotImplementedError(f"Algorithm {embed_algo} is not implemented!")
    if verbose and pp_descriptions is not None:
        show_close_descriptions(embed.embedding_, pp_descriptions, is_embedding=True, num=10, title=f"Embedding-Distances ({get_setting('DISSIM_MEASURE')})")
    if hasattr(embed, "dissimilarity_matrix_") and np.allclose(embed.dissimilarity_matrix_, dissim_mat):
        print("Dropping the dissim-mat from the embedding - it only bloats and is the same as in the previous step.")
        embed.dissimilarity_matrix_ = None
    return embed
def get_stopwords(language,
                  include_desc15_stopwords=True,
                  include_custom=True,
                  include_withoutdiacritics=True):
    if language in NLTK_LAN_TRANSLATOR:
        language = NLTK_LAN_TRANSLATOR[language]
    assert language in NLTK_LAN_TRANSLATOR.values(
    ), f"Cannot deal with language {language}"
    stopwords = set(nlstopwords.words(language))
    if include_desc15_stopwords and language == "english":
        stopwords |= load_desc15_stopwords()
    if include_custom and language == "english":
        stopwords |= set(get_setting("CUSTOM_STOPWORDS"))
    if include_withoutdiacritics:
        stopwords |= set(strip_accents_unicode(i) for i in stopwords)
    return tuple(stopwords)
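# Quick illustration of the include_withoutdiacritics option above: sklearn's
# strip_accents_unicode produces the accent-free variants that get added to the stopword set.
from sklearn.feature_extraction.text import strip_accents_unicode

stopwords = {"für", "über", "schön"}
stopwords |= {strip_accents_unicode(w) for w in stopwords}
print(sorted(stopwords))  # the originals plus 'fur', 'uber', 'schon'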
def select_salient_terms(metrics, decision_planes, dcm, embedding, prim_lambda, sec_lambda, metricname, verbose=False):
    #TODO waitwaitwait. Am I 100% sure that the intercepts of the decision_planes are irrelevant?!
    #TODO what about those with high negative kappa? Just take the absolute value and consider them (ALREADY IN THE PREVIOUS STEP, IF SO)
    print(f"Calculated Metrics: {list(list(metrics.values())[0].keys())}")
    print(f"Lambda1: {prim_lambda}, Lambda2: {sec_lambda}, compareto-metric: {metricname}")
    metrics = sorted(list({k: v[metricname] for k, v in metrics.items()}.items()), key=lambda x:x[1], reverse=True)
    get_tlambda = lambda metrics, lamb: [i[0] for i in metrics if i[1] >= prim_lambda]
    get_tlambda2 = lambda metrics, lamb1objs, seclamb: [i[0] for i in metrics if i[1] >= sec_lambda and i[0] not in lamb1objs]
    candidates = get_tlambda(metrics, prim_lambda)
    salient_directions = [metrics[0][0],]
    n_terms = min(len(candidates), get_setting("NDIMS_NCANDS_FACTOR")*len(decision_planes[salient_directions[0]].coef)) #2 in [DESC15]
    if get_setting("DEBUG"): n_terms = min(n_terms, 15)
    comparer = Comparer(decision_planes, vec_cos)
    #DESC15: "as the ith term, we select the term t minimising max_{j<i}cos(v_t_j, v_t) - In other words, we repeatedly select the term which is least similar to the terms that have already been selected"
    for nterm in tqdm(range(1, n_terms), desc="Finding Salient Directions"):
        cands = set(candidates)-set(salient_directions)
        compares = {cand: min(comparer(cand, compareto) for compareto in salient_directions) for cand in cands}
        #vec_cos(decision_planes[next(iter(cands))].normal, decision_planes[salient_directions[0]].normal)
        salient_directions.append(max(compares.items(), key=lambda x:x[1])[0])
    print(f"Found {len(salient_directions)} salient directions: {', '.join(salient_directions)}")
    compare_vecs = [decision_planes[term].normal for term in salient_directions]
    clusters = {term: [] for term in salient_directions}
    #TODO optionally instead do the cluster-assignment with k-means!
    nongreats = get_tlambda2(metrics, salient_directions, sec_lambda)
    if get_setting("DEBUG"): nongreats = nongreats[:2000]
    for term in tqdm(nongreats, desc="Associating the rest to Clusters"):
        # "we then associate with each term d_i a Cluster C_i containing all terms from T^{0.1} which are more similar to d_i than to any of the
        # other directions d_j." TODO: experiment with thresholds, if it's extremely dissimilar to ALL just effing discard it!
        clusters[salient_directions[np.argmin([vec_cos(decision_planes[term].normal, vec2) for vec2 in compare_vecs])]].append(term)
    # TODO maybe have a smart weighting function that takes into account the kappa-score of the term and/or the closeness to the original clustercenter (to threshold which cluster they are added to)

    #TODO an option here to either take mean, or only main-one, or smartly-weighted (I think DESC15 did only main-one)
    if get_setting("CLUSTER_DIRECTION_ALGO") == "mean":
        cluster_directions = join_clusters_average(clusters, decision_planes)
    elif get_setting("CLUSTER_DIRECTION_ALGO") == "main":
        cluster_directions = {term: decision_planes[term] for term in clusters.keys()}
    elif get_setting("CLUSTER_DIRECTION_ALGO") == "reclassify":
        cluster_directions = join_clusters_reclassify(clusters, dcm, embedding, verbose=verbose)
    else:
        raise NotImplementedError("TODO: weighted and others")
        #missing: weighted-by-kappa-averaged, weighted-by-distance-to-center-averaged (cosine, cosine+coef)
    #regarding mean-algorithm: taking the mean of the respective orthogonals seems reasonable, it's the mean direction. However we also care for the actual position of the
    # hyperplane (to get the actual ranking-wrt-this-feature), which is specified by orthogonal+intercept... and simply averaging the intercepts of its cluster components seems really stupid.
    #  that however gives us another way to weight which-candidates-may-cluster: the closer the orthogonals (cosine-dist) AND the closer their intercepts, the more we want to have them in a cluster.

    return clusters, cluster_directions
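# Self-contained sketch (random vectors) of the DESC15-style greedy selection quoted above:
# start from the best-scoring direction and repeatedly add the candidate whose worst-case
# cosine similarity to the already-selected directions is smallest. This mirrors the quoted
# "minimise max cos" rule; the actual code above works with distances via Comparer/vec_cos.
import numpy as np

rng = np.random.default_rng(0)
directions = {f"t{i}": rng.normal(size=5) for i in range(20)}  # hypothetical term directions
cos_sim = lambda a, b: np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

selected = ["t0"]  # assume t0 had the best kappa score
while len(selected) < 4:
    cands = set(directions) - set(selected)
    worst_sim = {c: max(cos_sim(directions[c], directions[s]) for s in selected) for c in cands}
    selected.append(min(worst_sim, key=worst_sim.get))
print(selected)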
Example #18
 def show_info(self, descriptions=None):
     occurs_in = [set(j[0] for j in i) if i else set() for i in self.dtm]
     num_occurences = [
         sum([term_ind in i for i in occurs_in])
         for term_ind in tqdm(range(len(self.all_terms)),
                              desc="Counting Occurences [verbose]")
     ]
     show_hist(
         num_occurences,
         f"Docs per Keyword ({self.n_docs} docs, {len(self.all_terms)} terms)",
         xlabel="# Documents the Keyword appears in",
         ylabel="Count (log scale)",
         cutoff_percentile=98,
         log=True)
     above_threshold = len([
         i for i in num_occurences
         if i >= get_setting("CANDIDATE_MIN_TERM_COUNT", silent=True)
     ])
     sorted_canditerms = sorted(
         [[ind, elem] for ind, elem in enumerate(num_occurences)],
         key=lambda x: x[1],
         reverse=True)
     print(
         f"Found {len(self.all_terms)} candidate Terms, {above_threshold} ({round(above_threshold/len(self.all_terms)*100)}%) of which occur in at least {get_setting('CANDIDATE_MIN_TERM_COUNT', silent=True)} descriptions."
     )
     print(
         "The 25 terms that occur in the most descriptions (incl the #descriptions they occur in):",
         ", ".join([
             f"{self.all_terms[ind]} ({occs})"
             for ind, occs in sorted_canditerms[:25]
         ]))
     if descriptions is not None:
         max_ind = np.unravel_index(self.as_csr().argmax(),
                                    self.as_csr().shape)
         print(
             f"Max value: Term *b*{self.all_terms[max_ind[0]]}*b* has value *b*{dict(self.dtm[max_ind[1]])[max_ind[0]]:.3f}*b* for doc *b*{descriptions._descriptions[max_ind[1]].title}*b*"
         )
Example #19
def get_countvec(pp_components, max_ngram, language, min_df=1):
    if isinstance(pp_components, str):
        pp_components = PPComponents.from_str(pp_components)
    if pp_components.remove_stopwords and get_setting(
            "TRANSLATE_POLICY") == "origlang":
        raise NotImplementedError(
            "Cannot deal with per-language-stopwords when using sklearn's CountVectorizer!"
        )
    cnt = CountVectorizer(
        strip_accents="unicode" if pp_components.remove_diacritics else None,
        lowercase=pp_components.convert_lower,
        ngram_range=(1, max_ngram),
        min_df=
        min_df,  #If 2, every term has a "partner", making the dissimilarity-matrix more compact
        stop_words=get_stopwords(language)
        if pp_components.remove_stopwords else
        None,  #TODO see https://scikit-learn.org/stable/modules/feature_extraction.html#stop-words
    )
    # I cannot set min_df and max_df, as I need all words for the dissimilarity-matrix!
    # TODO when I set preprocessor here I can override the preprocessing (strip_accents and lowercase) stage while preserving tokenizing and n-grams generation steps
    # TODO check how many steps this already saves me - keyword extraction, basis for the dissim_matrix, ...? (Corollary: check which min_df/max_df ranges make sense for the dissim_matrix)
    # TODO I can merge this and the old one: If the PPComponents-Entry is uppercase, use a subcomponent of the countvectorizer instead of original one
    #  (it's both tokenization and occurence counting in one class, see https://scikit-learn.org/stable/modules/feature_extraction.html#common-vectorizer-usage)
    return cnt
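# Minimal sketch of what the configured CountVectorizer does on hypothetical input: accent
# stripping, lowercasing, 1-2-grams, and stopword removal, yielding the term counts used
# downstream for the dissimilarity matrix.
from sklearn.feature_extraction.text import CountVectorizer

docs = ["Über natural language processing", "Natural language models and processing"]
cnt = CountVectorizer(strip_accents="unicode", lowercase=True,
                      ngram_range=(1, 2), min_df=1, stop_words=["and"])
X = cnt.fit_transform(docs)
print(cnt.get_feature_names_out())
print(X.toarray())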
Example #20
        if quantification_measure in ["tfidf", "tf"] else None)
    assert any(
        " " in i
        for i in dtm.all_terms.values()) == (get_setting("NGRAMS_IN_EMBEDDING")
                                             and get_setting("MAX_NGRAM") > 1)
    quantification = dtm.apply_quant(
        quantification_measure, descriptions=descriptions,
        verbose=verbose) if not metainf.get("sklearn_tfidf") else dtm
    # this is now \textbf{v}_e with all e's as rows
    #cannot use PPMIs directly, because a) too sparse, and b) we need a geometric representation with euclidean properties (betweenness, parallelism, ..)
    assert all(
        len(set((lst := [i[0] for i in dtm]))) == len(lst)
        for dtm in quantification.dtm)
    dissim_mat, metainf = create_dissimilarity_matrix(
        quantification.as_csr(),
        dissim_measure=get_setting("dissim_measure"),
        metainf=metainf,
        **interrupt_kwargs)
    if metainf.get("NEWLY_INTERRUPTED"):
        return quantification, dissim_mat, metainf
    assert np.allclose(
        dissim_mat, dissim_mat.T
    )  #if so it's a correct dissimilarity-matrix and we can do squareform to compress
    metainf["is_dissim"] = np.allclose(np.diagonal(dissim_mat), 0, atol=1e-10)
    if verbose:
        show_close_descriptions(dissim_mat, descriptions)
    dissim_mat = squareform(dissim_mat,
                            checks=True)  #saves > 50% storage space!
    return quantification, dissim_mat, metainf

    #TODO: When I calculate PPMI here, relative to all documents and all possible terms, is it relevant/unintended that
def postprocess_candidateterms(candidate_terms, descriptions,
                               extraction_method):
    """
    In this method I'll try to fix candidate-terms and check if they are really in the descriptions they claim to be.
    To count the descriptions they are in, I'll both generate a new doc-term-matrix with the respective ngram
    AND check if it's in the literal text of the description, such that after this, I can safely forget the original descriptions and focus on DTMs.
    """
    if get_setting("DEBUG"):
        maxlen = min(len(candidate_terms), len(descriptions._descriptions),
                     get_setting("DEBUG_N_ITEMS"))
        descriptions._descriptions = descriptions._descriptions[:maxlen]
        candidate_terms = candidate_terms[:maxlen]
    assert len(candidate_terms) == len(
        descriptions
    ), f"Candidate Terms: {len(candidate_terms)}, Descriptions: {len(descriptions)}"
    flattened = set(flatten(candidate_terms))
    print(
        "Extracted Unique Terms: ", ", ".join([
            f"{k+1}-grams: {v}"
            for k, v in sorted(Counter([i.count(" ")
                                        for i in flattened]).items(),
                               key=lambda x: x[0])
        ]), "| sum:", len(flattened))
    print(
        "Most often extracted Terms:", ", ".join(
            f"{i[0]} ({i[1]} times)"
            for i in sorted(list(Counter(flatten(candidate_terms)).items()),
                            key=lambda x: x[1],
                            reverse=True)[:5]))
    max_found_ngram = max(i.count(" ") for i in flatten(candidate_terms)) + 1
    dtm = descriptions.generate_DocTermMatrix(
        min_df=1, max_ngram=max_found_ngram)[
            0]  #TODO check if this works for all parameter-combis

    postprocessed_candidates = [[] for _ in candidate_terms]
    fails, changeds, toolong, ignores = set(), set(), set(), set()

    if extraction_method == "keybert":
        from derive_conceptualspace.create_spaces.preprocess_descriptions import PPComponents, get_countvec
        assert PPComponents.from_str(
            descriptions.recover_settings["pp_components"]).use_skcountvec
        #this is my attempt to reproduce the preprocessing for the terms from keybert (as noted in some TODO somewhere) - TODO do the non-skcountvec-method as well!!
        cnt = get_countvec(**descriptions.recover_settings,
                           max_ngram=1,
                           min_df=1)
        processor = lambda cand: " ".join(cnt.build_analyzer()(cand))
        try_edit_fns = (
            processor, strip_accents_unicode, fix_cand, lambda x: x.lower()
        )  #all PERMUTATIONS of these will be tried, that's a combinatorial explosion!
    else:
        try_edit_fns = ()
    all_edit_fns = flatten([
        list(permutations(try_edit_fns, i + 1))
        for i in range(len(try_edit_fns))
    ])

    for desc_ind, desc in enumerate(
            tqdm(descriptions._descriptions,
                 desc="Checking extracted candidates per-description")):
        term_counts = {
            dtm.all_terms[ind]: count
            for ind, count in dtm.dtm[desc_ind]
        }
        for cand in candidate_terms[desc_ind]:
            if cand.count(" ") + 1 > (get_setting("MAX_NGRAM") or 1):
                toolong.add(cand)
                continue
            if "xxMA_SENTBORDERxx" in cand:
                ignores.add(cand)
                continue
            cond, ncand = check_cand(cand, desc, edit_fns=all_edit_fns)
            if cond:
                if not extracted_literally():
                    assert term_counts[ncand] == desc.count_phrase(
                        ncand
                    )  #!!this shows that the DTM contains exactly the bow!!
                    if cand != ncand:
                        changeds.add((cand, ncand))
                postprocessed_candidates[desc_ind].append(ncand)
            else:
                fails.add(cand)

    if extracted_literally():
        assert not changeds and not toolong

    # changeds are for example when extract_coursetype extracted "seminar" from a description because it says "hauptseminar".
    # we can use that to make a mapping saying that a description containing the latter is defined to count as positive sample for the former.
    changeds_dict = {k: [] for k, vs in changeds}
    for k, v in changeds:
        changeds_dict[k].append(v)

    for desc_ind, desc in enumerate(
            tqdm(descriptions._descriptions,
                 desc="Checking a second time " +
                 ("(quickly)"
                  if descriptions.proc_steps == ["bow"] else "(slowly)"))):
        desc_txt = desc.processed_as_string(allow_shorten=True)
        desc_dtm = set(i[0] for i in dtm.dtm[desc_ind])
        for cand in postprocessed_candidates[desc_ind]:
            assert cand in desc
            if not descriptions.proc_steps == [
                    "bow"
            ]:  #superfluous check if it was created solely from the bow
                assert cand in desc_txt
            assert dtm.reverse_term_dict[cand] in desc_dtm

    if toolong:
        print(
            f"Had to drop {len(toolong)} out of {len(flatten(candidate_terms))} (non-unique) candidates because they were too long."
        )
    if ignores:
        print(
            f"Had to drop {len(ignores)} out of {len(flatten(candidate_terms))} (non-unique) candidates because they were across sentence borders."
        )
    print(
        f"Had to drop {len(fails)} out of {len(flatten(candidate_terms))} (non-unique) candidates"
        + (f" and edit {len(changeds)}." if changeds else "."))
    print(
        "Postprocessed Unique Terms: ", ", ".join([
            f"{k+1}-grams: {v}" for k, v in sorted(Counter(
                [i.count(" ")
                 for i in set(flatten(postprocessed_candidates))]).items(),
                                                   key=lambda x: x[0])
        ]), "| sum:", len(set(flatten(postprocessed_candidates))))
    return postprocessed_candidates, changeds_dict
def extracted_literally():
    #some extraction-methods did extract literally, in which case I want to assert that no changes need to be done.
    return get_setting("EXTRACTION_METHOD") not in ["pp_keybert", "keybert"]
Example #23
def translate_text(text, target="en", charlim=490000, origlans=None):
    # and I can still use the data from THIS call!!
    """Translates text into the target language. Target must be an ISO 639-1 language code.
    See https://g.co/cloud/translate/v2/translate-reference#supported_languages
    Text can also be a sequence of strings, in which case this method will return a sequence of results for each text.
    """
    if not os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = get_setting(
            "GOOGLE_CREDENTIALS_FILE")
    print(f"Translate-Charlim set to {charlim}")
    BYTELIM = int(
        204800 *
        0.9)  #if a request is bigger than this, the Google API will raise an Error!
    SEGLIM = 128  #https://github.com/googleapis/google-cloud-python/issues/5425#issuecomment-562745220
    TEXT_LEN_LIM = 2800  #google, this is getting ridiculous.
    SUMMED_TEXT_LEN_LIM = 100000  #102423 was too long...
    if isinstance(text, six.binary_type):
        text = text.decode("utf-8")
    translate_client = translate.Client()

    origtext = copy.deepcopy(text)
    if isinstance(text, (list, set, tuple)):
        accumulated_chars = list(accumulate([len(i) for i in text]))
        if accumulated_chars[-1] >= charlim:
            limit = [i > charlim for i in accumulated_chars].index(True)
            print(
                f"Have to drop {len(accumulated_chars)-limit} of {len(accumulated_chars)} elements!"
            )
            if accumulated_chars[0] <= charlim:
                text = text[:limit - 1 if limit >= 0 else None]
            else:
                print("Limit already reached!")
                return

    prelimtext = copy.deepcopy(text)
    res = []
    rest = []
    while True:
        while len("|".join(text).encode('utf-8')) > BYTELIM or len(
                text) > SEGLIM:
            rest.insert(0, text[-1])
            text = text[:-1]
        if any([len(i) > TEXT_LEN_LIM for i in text]):
            print("F**k you google.")
            flatten = lambda l: [item for sublist in l for item in sublist]
            # splitfn = lambda txt, maxlen: [txt[i:i+maxlen] for i in range(0, len(txt)+maxlen, maxlen) if txt[i:i+maxlen]]
            # split_text = [i.split(". ") if len(i) > TEXT_LEN_LIM else [i] for i in text]
            split_text = [
                nltk.sent_tokenize(i) if len(i) > TEXT_LEN_LIM else [i]
                for i in text
            ]  #sent_tokenize(x, language=origlans[n]) but whatever
            assert all(len(sent) <= TEXT_LEN_LIM for part in split_text for sent in part)  #check the individual split pieces, not the number of sentences
            longer_index = {
                ind: len(elem) - 1
                for ind, elem in enumerate(split_text) if len(elem) > 1
            }
            #now we merge the split sentences until they are all text-len-lim long
            for ind in longer_index.keys():
                lens = [len(i) for i in split_text[ind]]
                index_mapper = {0: 0}  # startindex: nwords
                indexmappernum = 0
                for num, elem in enumerate(lens):
                    assert elem <= TEXT_LEN_LIM, "one sentence is already too long."
                    if index_mapper[indexmappernum] + elem >= TEXT_LEN_LIM:
                        indexmappernum = num
                        index_mapper[indexmappernum] = 0
                    index_mapper[indexmappernum] += elem
                indices = list(
                    index_mapper.keys()) + [len(split_text[ind]) + 1]
                indices = [(indices[i], indices[i + 1])
                           for i in range(len(indices) - 1)]
                split_text[ind] = [
                    "".join(split_text[ind][i1:i2]) for i1, i2 in indices
                ]
                longer_index[ind] = len(split_text[ind]) - 1
            text = [i[0] if isinstance(i, list) else i for i in split_text]
            latterparts = flatten([
                i[1:] for i in split_text if isinstance(i, list) and len(i) > 1
            ])
            assert len(latterparts) <= SEGLIM, "f**k this."
            assert all(len(i) < TEXT_LEN_LIM for i in text)
            assert all(len(i) < TEXT_LEN_LIM for i in latterparts)
            assert sum([
                len(i) for i in text
            ]) <= SUMMED_TEXT_LEN_LIM, "geez google what the actual f**k"
            assert sum([
                len(i) for i in latterparts
            ]) <= SUMMED_TEXT_LEN_LIM, "geez google what the actual f**k"
            try:
                translations = translate_client.translate(
                    text, target_language=target)
                translations2 = translate_client.translate(
                    latterparts, target_language=target)
            except Exception:
                failed = True
            else:
                failed = False
            assert sum(longer_index.values()) == len(translations2)
            latterparts_iter = iter(translations2)
            for index, ntranslations in longer_index.items():
                for i in range(ntranslations):
                    translations[index]["translatedText"] += next(
                        latterparts_iter)["translatedText"]
            translated = translations
        else:
            assert all(len(i) < TEXT_LEN_LIM for i in text)
            try:
                translated = translate_client.translate(text,
                                                        target_language=target)
            except Exception:
                failed = True
            else:
                failed = False
        if not failed:
            assert len(translated) == len(text)
            res.extend(translated)
            assert len(res) + len(rest) == len(prelimtext)
            if rest:
                text = rest
                rest = []
            else:
                assert len(prelimtext) == len(res)
                break
        else:
            break

    # print(u"Text: {}".format(result["input"]))
    # print(u"Translation: {}".format(result["translatedText"]))
    # print(u"Detected source language: {}".format(result["detectedSourceLanguage"]))
    return [html.unescape(i["translatedText"]) for i in res]
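# Toy version of the character-limit bookkeeping at the top of translate_text: accumulate the
# text lengths and cut the list off before the running total exceeds the limit (slightly
# simplified compared to the function above, which additionally drops one extra element).
from itertools import accumulate

texts = ["aaaa", "bbbbbb", "cc", "ddddddddd", "e"]
charlim = 15
running = list(accumulate(len(t) for t in texts))       # [4, 10, 12, 21, 22]
if running[-1] >= charlim:
    limit = [r > charlim for r in running].index(True)  # first index over the limit
    texts = texts[:limit]
print(texts, sum(len(t) for t in texts))                # ['aaaa', 'bbbbbb', 'cc'] 12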
Example #24
def dtm_loader(doc_term_matrix):
    dtm = DocTermMatrix.fromstruct(doc_term_matrix[1][1])
    if get_setting("DEBUG"):
        if len(dtm.dtm) > get_setting("DEBUG_N_ITEMS"):
            warnings.warn("len(dtm) > DEBUG_N_ITEMS!!")
    return dtm
def classify(input,
             target,
             axnames,
             catnames,
             dt_depth,
             test_percentage_crossval,
             metric,
             do_plot=False,
             features_outvar=None,
             balance_classes=True,
             do_render=False,
             shuffle=False):
    # input[:, 99] = (target == "Shops&Services"); axnames[99] = "is_shop"
    # input[:, 98] = (target == "Food"); axnames[98] = "is_food"
    metric = "accuracy" if metric == "acc" else metric  #https://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values
    kwargs = dict(class_weight="balanced") if balance_classes else {}
    clf = DecisionTreeClassifier(random_state=get_setting("RANDOM_SEED"),
                                 max_depth=dt_depth,
                                 **kwargs)
    if test_percentage_crossval > 1:
        if metric == "f1" and len(catnames) > 2:
            metric = "f1_micro"
        cv = test_percentage_crossval if not shuffle else StratifiedKFold(
            n_splits=test_percentage_crossval, shuffle=True)
        #see "cv" parameter of cross_val_score at https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html
        if isinstance(metric, (list, tuple)):
            if len(catnames) > 2:
                metric = [i if i != "f1" else "f1_macro"
                          for i in metric]  #f1_micro would be = accuracy
            return None, None, cross_validate(clf,
                                              input,
                                              target,
                                              cv=cv,
                                              scoring=metric)
        else:
            scores = cross_val_score(clf, input, target, cv=cv, scoring=metric)
        score = scores.mean()
        clf.fit(input, target)
        if metric == "accuracy":
            assert get_score(clf, input, target, metric) == np.array([
                res == target[i] for i, res in enumerate(clf.predict(input))
            ]).mean()
        else:
            get_score(clf,
                      input,
                      target,
                      metric,
                      is_multiclass=len(catnames) >
                      2)  #have to to be able to plot_tree
        # print(f"Doing {test_percentage_crossval}-fold cross-validation. Best Score: {scores.max():.2f}, Mean: {score}:.2f")
    elif test_percentage_crossval == 0:
        warnings.warn(
            "Using the full data as training set without a test-set!")
        clf.fit(input, target)
        score = scores = get_score(clf,
                                   input,
                                   target,
                                   metric,
                                   is_multiclass=len(catnames) > 2)
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            input, target, test_size=test_percentage_crossval
        )  #TODO: stratify? https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
        clf.fit(X_train, y_train)
        score = scores = get_score(clf,
                                   X_test,
                                   y_test,
                                   metric,
                                   is_multiclass=len(catnames) > 2)
    if features_outvar is not None:
        features_outvar.append(clf)
    if do_plot:
        if catnames: assert len(clf.classes_) == len(catnames)
        return score, plot_tree(clf,
                                axnames,
                                (catnames or [str(i) for i in clf.classes_]),
                                do_render=do_render), scores
    return score, None, scores
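A minimal usage sketch for `classify`, assuming the project helpers it relies on (`get_setting`, `get_score`, `plot_tree`) are importable and a random seed is configured; the toy feature matrix, labels, and axis names below are invented purely for illustration:

import numpy as np

# hypothetical toy data: 200 samples, 5 features, 3 classes
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
y = rng.choice(["Food", "Shops&Services", "Other"], size=200)
axnames = [f"dim_{i}" for i in range(X.shape[1])]
catnames = sorted(set(y))

# 5-fold cross-validation with a depth-3 tree, accuracy as the metric
score, _, scores = classify(X, y, axnames, catnames,
                            dt_depth=3,
                            test_percentage_crossval=5,
                            metric="acc")
print(f"mean accuracy over the folds: {score:.3f}")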
Example #26
0
 def generate_DocTermMatrix(self, min_df=1, max_ngram=None, do_tfidf=None):
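     # Three paths: (1) preprocessing already produced bag-of-words -> build the
     # matrix directly from the stored BoWs; (2) recover_settings exists -> rebuild
     # the sklearn CountVectorizer used during preprocessing; (3) otherwise fit a
     # fresh CountVectorizer on the processed texts. `do_tfidf` optionally chains a
     # TfidfTransformer on top of the counts.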
     if self.proc_steps[-1] == "bow":
         assert max_ngram in [None, 1], "Can't do!"
         print(
             "Preprocessing already produced a bag-of-words, so the `max_ngram` config has no effect!"
         )
         forbid_setting("max_ngram")
         all_words = dict(
             enumerate(
                 set(flatten(i.bow().keys() for i in self._descriptions))))
         rev = {v: k for k, v in all_words.items()}
         dtm = [[[rev[k], v] for k, v in i.bow().items()]
                for i in self._descriptions]
         dtm = DocTermMatrix(dtm=dtm,
                             all_terms=all_words,
                             quant_name="count")
         if min_df > 1:
             dtm = DocTermMatrix.filter(
                 dtm,
                 min_df,
                 use_n_docs_count=get_setting("CANDS_USE_NDOCS_COUNT"),
                 verbose=get_setting("VERBOSE"),
                 descriptions=self)
         return dtm, {"ngrams_in_embedding": False}
     elif hasattr(self, "recover_settings"):
         from derive_conceptualspace.create_spaces.preprocess_descriptions import PPComponents, get_countvec
         pp_comps = PPComponents.from_str(
             self.recover_settings["pp_components"])
         if pp_comps.use_skcountvec:
             cnt = get_countvec(**self.recover_settings,
                                max_ngram=(max_ngram or 1),
                                min_df=min_df)
             fit_base = lambda: self.unprocessed_texts(
                 remove_htmltags=pp_comps.remove_htmltags)
         else:
             raise NotImplementedError()
     else:
         cnt = CountVectorizer(
             strip_accents=None,
             lowercase=False,
             stop_words=None,
             ngram_range=(1, (max_ngram or 1)),
             min_df=min_df,
             token_pattern=r"(?u)(\b\w\w+\b|\b\d\b)"
         )  #this token_pattern allows for single-digit-numbers
         fit_base = lambda: self.iter("processed_as_string",
                                      insert_sentborder=("sent_tokenize" in
                                                         self.proc_steps))
         #if we sent_tokenize, something like "2. Semester" in the original becomes, thanks to nltk's suckiness, [["bla", "2"], ["Semester", "blub"]]. The vectorizer shouldn't find n-grams across sentence borders, so we insert detectable strings that we can remove later.
         # TODO If I can do sent_tokenize for the CountVectorizer I need to update this here as well!
     if do_tfidf is not None:
         #https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html
         # count = Pipeline([("count", cnt)]).fit(fit_base()) #to test if count already dies, otherwise we could try #https://discuss.analyticsvidhya.com/t/tfidf-on-sklearn-library-is-giving-me-a-huge-file-and-memory-error/78448/2
         # see also https://stackoverflow.com/a/64996792/5122790
         pipe = Pipeline([
             ("count", cnt),
             ("tfidf", TfidfTransformer(use_idf=(do_tfidf == "tfidf")))
         ]).fit(fit_base())
         aslist, all_words = csr_to_list(pipe.transform(fit_base()),
                                         pipe["count"].vocabulary_)
         return DocTermMatrix(dtm=aslist,
                              all_terms=all_words,
                              quant_name=do_tfidf), {
                                  "ngrams_in_embedding":
                                  any(" " in i for i in all_words.values()),
                                  "sklearn_tfidf": True
                              }
     X = cnt.fit_transform(fit_base())
     aslist, all_words = csr_to_list(X, cnt.vocabulary_)
     return DocTermMatrix(dtm=aslist,
                          all_terms=all_words,
                          quant_name="count"), {
                              "ngrams_in_embedding":
                              any(" " in i for i in all_words.values())
                          }
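`csr_to_list` is used above but not included in this listing. Going by how its return values are consumed, a hypothetical minimal version (the name and exact contract are an assumption, not the project's actual code) would turn a scipy CSR matrix plus the CountVectorizer vocabulary into per-document `[[term_id, value], ...]` lists and an `{id: term}` map:

from scipy.sparse import csr_matrix

def csr_to_list(csr, vocab):
    # hypothetical sketch, not the project's actual implementation
    csr = csr_matrix(csr)
    dtm = [[[int(ind), val] for ind, val in zip(row.indices, row.data)]
           for row in csr]
    all_terms = {int(v): k for k, v in vocab.items()}  # vocab is {term: index}
    return dtm, all_terms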
def create_tsne(dissim_mat, embed_dimensions):
    embedding = TSNE(n_components=embed_dimensions, random_state=get_setting("RANDOM_SEED"), metric="precomputed")
    tsne = embedding.fit(dissim_mat)
    return tsne
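A usage sketch for `create_tsne`, with a dissimilarity matrix built from random points purely for illustration (and assuming `get_setting("RANDOM_SEED")` is configured); with `metric="precomputed"` the fitted estimator exposes the coordinates as `embedding_`. Depending on the scikit-learn version, `init="random"` may also need to be passed alongside a precomputed metric:

import numpy as np
from scipy.spatial.distance import pdist, squareform

points = np.random.default_rng(0).normal(size=(50, 10))
dissim_mat = squareform(pdist(points, metric="cosine"))  # 50x50, zero diagonal

tsne = create_tsne(dissim_mat, embed_dimensions=2)
coords = tsne.embedding_  # shape (50, 2)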
Example #28
0
def preprocess_descriptions_full(raw_descriptions,
                                 dataset_class,
                                 pp_components,
                                 for_language,
                                 translate_policy,
                                 languages,
                                 translations=None,
                                 verbose=True):
    #TODO should I assert a minimal number of PP-Components? If I don't word-tokenize it all doesn't make much sense, does it?
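    # Rough flow: parse the PPComponents string, let the dataset class load and clean
    # the raw file, then either build a DescriptionList directly from precomputed BoWs
    # ("preprocessed_bow") or create a bare DescriptionList and run the sklearn
    # CountVectorizer / custom preprocessing pipeline; finally drop descriptions with
    # fewer than MIN_WORDS_PER_DESC words.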
    pp_components = PPComponents.from_str(pp_components)
    print("The following Pre-Processings will be performed:",
          ", ".join([k for k, v in pp_components.di.items() if v]))
    descriptions = dataset_class.preprocess_raw_file(raw_descriptions,
                                                     pp_components)
    if get_setting("preprocessed_bow", default_false=True):
        descriptions = descriptions_from_bow(descriptions, languages,
                                             translations, translate_policy)
        if len(raw_descriptions["vecs"]) > len(descriptions):
            warnings.warn(
                f"Because of the min-words-per-desc setting, {len(raw_descriptions['vecs'])-len(descriptions)} of the original items needed to be removed!"
            )
    else:
        if get_setting("DEBUG"):
            descriptions = descriptions[:get_setting(
                "DEBUG_N_ITEMS"
            )]  #pd.DataFrame([descriptions.iloc[key] for key in random.sample(range(len(descriptions)), k=get_setting("DEBUG_N_ITEMS"))])
        if isinstance(languages, str):
            languages = {
                k: {k2: languages
                    for k2 in descriptions[k]}
                if set(descriptions[k]) != {''} else None
                for k in descriptions.keys()
            }
        descriptions = create_bare_desclist(
            languages,
            translations,
            for_language,
            list(descriptions["title"]),
            list(descriptions["description"]),
            [i if str(i) != "nan" else None for i in descriptions["subtitle"]],
            translate_policy,
            pp_components=pp_components,
            assert_all_translated=False,
            additionals={
                i: [
                    j if not (isinstance(j, float) and math.isnan(j)) else None
                    for j in descriptions[i]
                ]
                for i in dataset_class.additionals
            } if pp_components.add_additionals else None)
        if pp_components.use_skcountvec:
            descriptions = pp_descriptions_countvec(descriptions,
                                                    pp_components,
                                                    for_language)
        else:
            descriptions = preprocess_descriptions(descriptions, pp_components)
    descriptions = descriptions.filter_words(
        min_words=get_setting("MIN_WORDS_PER_DESC"))
    if verbose:
        show_hist([i.n_words() for i in descriptions._descriptions],
                  "Words per Description",
                  xlabel="Number of Words")
    return descriptions, {"n_samples": len(descriptions)}
        for cand_mets, decision_plane, term in res:
            metrics[term] = cand_mets
            decision_planes[term] = decision_plane
        assert set(terms) == set(metrics.keys())
    if (didnt_converge := len([1 for i in metrics.values() if i and not i["did_converge"]])):
        warnings.warn(f"{didnt_converge} of the {len(metrics)} SVMs did not converge!", sklearn.exceptions.ConvergenceWarning)
    if verbose:
        df = pd.DataFrame(metrics).T
        df.columns = df.columns.str.replace("kappa", "k").str.replace("rank2rank", "r2r").str.replace("bin2bin", "b2b").str.replace("f_one", "f1").str.replace("digitized", "dig")
        for metricname in df.columns:
            print(f"\nAverage *r*{metricname}*r*: {df[metricname].mean():.5f}")
            with pd.option_context('display.max_rows', 11, 'display.max_columns', 20, 'display.expand_frame_repr', False, 'display.max_colwidth', 20, 'display.float_format', '{:.4f}'.format):
                print(str(df.sort_values(by=metricname, ascending=False)[:10]).replace(metricname, f"*r*{metricname}*r*"))
        if embedding.shape[1] == 3 and IS_INTERACTIVE:
            best_elem = max(metrics.items(), key=lambda x:(x[1] or {}).get("f_one",0))
            create_candidate_svm(embedding, best_elem[0], dcm.term_quants(best_elem[0]), classifier=get_setting("CLASSIFIER"), quant_name=dcm.quant_name, plot_svm=True, descriptions=descriptions)
            while (another := input("Another one to display: ").strip()) != "":
                if "," in another:
                    highlight = [i.strip() for i in another.split(",")[1:]]
                    another = another.split(",")[0].strip()
                else:
                    highlight = []
                create_candidate_svm(embedding, another, dcm.term_quants(another), classifier=get_setting("CLASSIFIER"), quant_name=dcm.quant_name, plot_svm=True, descriptions=descriptions, highlight=highlight)
    return quants_s, decision_planes, metrics, metainf


class Comparer:
    def __init__(self, decision_planes, compare_fn):
        self.decision_planes = decision_planes
        self.already_compared = {}
        self.compare_fn = compare_fn
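    # The listing cuts off after __init__; the `already_compared` dict suggests
    # memoized pairwise comparisons of decision planes. A speculative sketch of
    # such a method (the key scheme and the call into compare_fn are assumptions,
    # not the project's actual code):
    def compare(self, term1, term2):
        key = tuple(sorted((term1, term2)))
        if key not in self.already_compared:
            self.already_compared[key] = self.compare_fn(
                self.decision_planes[term1], self.decision_planes[term2])
        return self.already_compared[key]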