def test_make_factor_cxts(self):
    fct1 = fca.Concept({'g1', 'g3'}, {'m2', 'm1'})
    fct2 = fca.Concept({'g2'}, {'m2', 'm3'})
    cxt_objs_fcts, cxt_fcts_atts = fca.factors.make_factor_cxts((fct1, fct2))
    print(cxt_objs_fcts)
    assert cxt_objs_fcts == fca.Context([[1, 0], [0, 1], [1, 0]],
                                        ['g1', 'g2', 'g3'], ['f0', 'f1'])
    assert cxt_fcts_atts == fca.Context([[1, 1, 0], [0, 1, 1]],
                                        ['f0', 'f1'], ['m1', 'm2', 'm3'])
def _oplus(D_objs: Set[str], y: str, cxt: 'fca.Context',
           U: Set[Tuple[str, str]]):
    """Return the object-attribute pairs of the concept obtained by adding
    attribute ``y`` to a factor with extent ``D_objs`` that are still
    uncovered, i.e. still in ``U``."""
    yprime = cxt.get_attribute_extent(y)
    Dy_prime = D_objs & yprime
    Dy_primeprime = cxt.oprime(Dy_prime)
    cpt = fca.Concept(extent=Dy_prime, intent=Dy_primeprime)
    # keep only the pairs of the candidate concept that are not yet covered
    result = set(cpt.pairs()) & U
    return result
def get_sense(factor):
    # ``doc2preds`` and ``n_sense_indicators`` come from the enclosing scope
    docs = list(factor.extent)
    term_rank = dict()
    common_terms = factor.intent
    for term in common_terms:
        term_rank[term] = sum(doc2preds[doc].index(term) for doc in docs)
    sense_ids = sorted(term_rank, key=lambda x: term_rank[x])[:n_sense_indicators]
    sense = fca.Concept(docs, set(sense_ids))
    return sense
def algorithm2_weighted(cxt, fidelity=1):
    """
    Algorithm 2 from

    article{
    title = "Discovery of optimal factors in binary data via a novel method of matrix decomposition",
    journal = "Journal of Computer and System Sciences",
    volume = "76", number = "1", pages = "3 - 20", year = "2010",
    doi = "http://dx.doi.org/10.1016/j.jcss.2009.05.002",
    url = "http://www.sciencedirect.com/science/article/pii/S0022000009000415",
    author = "Radim Belohlavek and Vilem Vychodil"}

    Extension: fidelity of coverage - stop once the given fraction of crosses
    is covered by the factors.
    """
    len_objs_initial = len(cxt.objects)
    len_atts_initial = len(cxt.attributes)

    def score(obj_att_pairs):
        objs = {x[0] for x in obj_att_pairs}
        atts = {x[1] for x in obj_att_pairs}
        return len(objs) * len(atts) / (len_objs_initial * len_atts_initial)

    U = set(cxt.object_attribute_pairs)
    len_initial = len(U)
    while (len_initial - len(U)) / len_initial < fidelity:
        D = set()
        V = 0
        to_remove = set()
        while True:
            D_objs = cxt.aprime(D)  # extent of the attribute set D
            ls_measures = [(score(_oplus(D_objs, j, cxt, U)), j)
                           for j in set(cxt.attributes) - D]
            if ls_measures:
                maxDj = max(ls_measures, key=lambda x: x[0])
            else:
                maxDj = (0, None)
            if maxDj[0] > V:
                j_score, j = maxDj
                Dj = D | {j}
                C = cxt.aprime(Dj)
                D = cxt.oprime(C)
                to_remove = set(itertools.product(C, D)) & U
                V = len(to_remove)
            else:
                break
        if len(to_remove) == 0:
            raise Exception(
                f'Algorithm stuck, something went wrong, pairs left {len(U)}')
        U -= to_remove
        yield fca.Concept(C, D), j_score
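# Usage sketch (not part of the original module): a minimal example of
# consuming the generator above, assuming an ``fca.Context`` built from a
# boolean cross table plus object and attribute names, as in the tests.
# Each yielded item is a factor concept together with its weighted score.
#
#     cxt = fca.Context([[1, 1, 0], [1, 1, 1], [0, 1, 1]],
#                       ['g1', 'g2', 'g3'], ['m1', 'm2', 'm3'])
#     for factor, factor_score in algorithm2_weighted(cxt, fidelity=0.9):
#         print(sorted(factor.extent), sorted(factor.intent), factor_score)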
def end_element(name):
    global cs, new_intent, new_extent, new_meta
    global new_obj, new_attr, buffer
    if name == "object":
        if new_obj:
            d_objects[new_obj] = buffer
            objects.append(buffer)
            new_obj = None
            buffer = ""
    elif name == "attribute":
        if new_attr:
            d_attributes[new_attr] = buffer
            attributes.append(buffer)
            new_attr = None
            buffer = ""
    elif name == "concept":
        new_concept = fca.Concept(new_extent, new_intent)
        new_concept.meta = new_meta
        cs.append(new_concept)
        new_extent = []
        new_intent = []
        new_meta = {}
def algorithm2_w_condition(cxt, fidelity: float = 1,
                           allow_repeatitions=True,
                           min_atts_and_objs=3,
                           objs_ge_atts=False):
    """
    Algorithm 2 from

    article{
    title = "Discovery of optimal factors in binary data via a novel method of matrix decomposition",
    journal = "Journal of Computer and System Sciences",
    volume = "76", number = "1", pages = "3 - 20", year = "2010",
    doi = "http://dx.doi.org/10.1016/j.jcss.2009.05.002",
    url = "http://www.sciencedirect.com/science/article/pii/S0022000009000415",
    author = "Radim Belohlavek and Vilem Vychodil"}

    :param objs_ge_atts: whether the number of objects must be greater than or
        equal to the number of attributes in every output factor
    :param min_atts_and_objs: minimum number of attributes and objects in the
        output factors
    :param fidelity: stop when this fraction of the crosses in the table is
        covered
    :param allow_repeatitions: if False, exclude the attributes and objects of
        already obtained factors from further consideration; they may still
        appear in the closures of later factors
    """

    def good_factor(cpt: 'fca.Concept'):
        if objs_ge_atts:
            return len(cpt.extent) >= len(cpt.intent) >= min_atts_and_objs
        else:
            return (len(cpt.extent) >= min_atts_and_objs and
                    len(cpt.intent) >= min_atts_and_objs)

    U = set(cxt.object_attribute_pairs)
    len_initial = len(U)
    removed_atts = set()
    removed_objs = set()
    if not len_initial:
        return
    while (len_initial - len(U)) / len_initial < fidelity:
        D = set()
        C = set(cxt.objects)
        V = 0
        to_remove = set()
        available_atts = {x[1] for x in U} - removed_atts
        while True:
            Dprime = cxt.aprime(D)
            ls_measures = [(len(_oplus(Dprime, j, cxt, U)), j)
                           for j in available_atts - D]
            if not ls_measures:
                return
            maxDj = max(ls_measures, key=lambda x: x[0])
            if maxDj[0] > V or not good_factor(cpt=fca.Concept(C, D)):
                # update the values
                j_score, j = maxDj
                Dj = D | {j}
                C = cxt.aprime(Dj)
                if len(C) < min_atts_and_objs or not (available_atts - D):
                    # early restart: this factor cannot satisfy the conditions
                    U = {u for u in U if u[1] not in Dj}
                    removed_atts |= Dj
                    break
                D = cxt.oprime(C)
                to_remove_U = set(itertools.product(C, D)) & U
                V = len(to_remove_U)
                if not allow_repeatitions:
                    to_remove = (set(itertools.product(C, cxt.attributes)) |
                                 set(itertools.product(cxt.objects, D))) & U
                else:
                    to_remove = to_remove_U
            elif good_factor(cpt=fca.Concept(C, D)):
                if len(to_remove) == 0:
                    raise Exception(
                        f'Algorithm stuck, something went wrong, pairs left '
                        f'{len(U)}')
                U -= to_remove
                yield (fca.Concept(C, D),
                       len(to_remove) / len_initial,
                       (len_initial - len(U)) / len_initial)
                break
            else:
                assert False
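# Usage sketch (illustrative only): the conditioned variant yields, for each
# factor, the fraction of crosses covered by that factor and the cumulative
# coverage so far, so a caller can stop early once enough of the table is
# explained. ``cxt`` is assumed to be an ``fca.Context`` as above.
#
#     for factor, covered, total_covered in algorithm2_w_condition(
#             cxt, fidelity=0.95, min_atts_and_objs=2):
#         print(len(factor.extent), len(factor.intent), covered, total_covered)
#         if total_covered >= 0.8:
#             break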
def test_norris(self):
    cl = fca.ConceptLattice(self.small_cxt)
    assert fca.Concept(self.small_cxt.objects, []) in cl
    assert fca.Concept([], self.small_cxt.attributes) in cl
    assert len(cl) > 2
def induce(contexts: List[str],
           target_start_end_tuples: List[Tuple[int, int]],
           titles: List[str] = None,
           target_pos: str = None,
           n_sense_descriptors=5,
           lang='eng',
           top_n_pred=100,
           min_number_contexts_for_fca_clustering=3,
           min_sub_len=3,
           verbose=False,
           logger=None) -> List[fca.Concept]:
    """
    Induce the sense(s) of the target from a collection of contexts.

    This function always returns a result. If the proper clustering does not
    produce any factors, then the most common predictions are output.

    :param contexts: the contexts themselves
    :param target_start_end_tuples: the (start index, end index) pairs
        indicating the target in the respective context.
        len(contexts) == len(target_start_end_tuples)
    :param titles: titles of the contexts
    :param target_pos: the desired part of speech of predictions
    :param n_sense_descriptors: how many sense indicators - a subset of all
        predictions - are output for each sense
    :param lang: language, used for POS tagging and lemmatization of predictions
    :param top_n_pred: how many predictions are produced for each context
    :param min_number_contexts_for_fca_clustering: minimum number of contexts
        required to try the FCA clustering. With only 1 or 2 contexts it often
        does not make sense to cluster.
    :param min_sub_len: min length of a produced substitute
    """
    if logger is None:
        logger = local_logger
    if not len(contexts) == len(target_start_end_tuples):
        raise ValueError(f'Length of contexts {len(contexts)} is not equal to '
                         f'the length of start and end indices list '
                         f'{len(target_start_end_tuples)}.')
    subs = iter_substitutes(
        contexts, target_start_end_tuples,
        titles=titles,
        th_substitute_len=min_sub_len,
        top_n=top_n_pred,
        target_pos=target_pos,
        lang=lang,
    )
    if verbose:
        subs = tqdm(subs, total=len(contexts))
    predicted = {
        title: top_pred_m + top_pred_unm
        for title, top_pred_m, top_pred_unm in subs
    }
    senses = []
    target_phrase_in_first_context = contexts[0][
        target_start_end_tuples[0][0]:target_start_end_tuples[0][1]]
    if len(contexts) >= min_number_contexts_for_fca_clustering:
        senses = fca_cluster(predicted, n_sense_indicators=n_sense_descriptors)
        logger.debug(
            f'For {target_phrase_in_first_context} with {len(contexts)} '
            f'contexts fca_cluster produced {len(senses)} senses.')
    if not senses:  # fca_cluster did not produce results
        all_predicted = sum(predicted.values(), [])
        top_predicted = [
            x[0] for x in Counter(all_predicted).most_common(
                min(top_n_pred, n_sense_descriptors))
        ]
        senses = [
            fca.Concept(intent=top_predicted, extent=list(predicted.keys()))
        ]
        logger.debug(
            f'For {target_phrase_in_first_context} with {len(contexts)} '
            f'contexts the most common {len(top_predicted)} predictions are '
            f'taken as sense indicators.')
    return senses
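# Usage sketch (hypothetical inputs): each context is paired with the character
# span of the target expression, so len(contexts) == len(target_start_end_tuples).
# With fewer than ``min_number_contexts_for_fca_clustering`` contexts the FCA
# clustering is skipped and a single sense built from the most common
# predictions is returned.
#
#     contexts = ['The bank approved the loan.', 'We sat on the river bank.']
#     spans = [(4, 8), (20, 24)]
#     senses = induce(contexts, spans, n_sense_descriptors=5, top_n_pred=100)
#     for sense in senses:
#         print(sense.extent, sense.intent)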