def compute_groups(corpus, stoplist_id=None, overwrite_id=None):
    """
    1) Use a stemmer/lemmatizer to group forms if they have same stem/lemma
    2) Create an empty GROUPLIST node (for a list of "synonym" ngrams)
    3) Save the list to DB (list node + each grouping as listnode - ngram1 - ngram2)

    @param corpus:       the corpus node whose documents' ngrams are grouped
    @param stoplist_id:  optional id of a STOPLIST node; its ngrams are
                         excluded from grouping
    @param overwrite_id: optional id of a pre-existing GROUPLIST node to
                         overwrite instead of creating a new one
    @return: the id of the GROUPLIST node the groups were saved to
    """
    # we will need the ngrams of the stoplist to filter
    stop_ngrams_ids = {}
    if stoplist_id is not None:
        for row in session.query(NodeNgram.ngram_id).filter(
                NodeNgram.node_id == stoplist_id).all():
            stop_ngrams_ids[row[0]] = True

    # 1) compute stems/lemmas and group if same stem/lemma
    stemmers = prepare_stemmers(corpus)
    print("# STEMMERS LOADED", stemmers)
    supported_stemmers_lang = [lang for lang in corpus.hyperdata["languages"]
                               if lang != "__unknown__"]
    print("#SUPPORTED STEMMERS LANGS", supported_stemmers_lang)

    # dict {lg => {ngrams_todo}}
    todo_ngrams_per_lg = defaultdict(set)

    # res dict { commonstem: {ngram_1:freq_1, ngram_2:freq_2, ngram_3:freq_3} }
    my_groups = defaultdict(Counter)

    # preloop per doc to sort ngrams by language
    for doc in corpus.children('DOCUMENT'):
        if doc.id not in corpus.hyperdata['skipped_docs']:
            if ('language_iso2' in doc.hyperdata) \
                    and doc.hyperdata['language_iso2'] in supported_stemmers_lang:
                lgid = doc.hyperdata['language_iso2']
            else:
                # unsupported language: ngrams still processed, but with the
                # fallback "__unknown__" stemmer; the doc is flagged in status
                lgid = "__unknown__"
                doc.status("NGRAMS_GROUPS",
                           error="Error: unsupported language for stemming")
                doc.save_hyperdata()

            # doc.ngrams is an sql query (ugly but useful intermediate step)
            # FIXME: move the counting and stoplist filtering up here
            for ngram_pack in doc.ngrams.all():
                todo_ngrams_per_lg[lgid].add(ngram_pack)

    # --------------------
    # long loop per ngrams
    for (lgid, todo_ngs) in todo_ngrams_per_lg.items():
        # fun: word::str => stem::str
        stem_it = stemmers[lgid].stem
        for ng in todo_ngs:
            doc_wei = ng[0]
            ngram = ng[1]          # Ngram obj

            # skip if in STOPLIST
            # (BUGFIX: was a bare `next`, which is a no-op expression in
            #  Python — stoplisted ngrams were never actually skipped)
            if ngram.id in stop_ngrams_ids:
                continue

            lexforms = [lexunit for lexunit in resplit(r'\W+', ngram.terms)]

            # STEM IT, and this term's stems will become a new grouping key...
            stemseq = " ".join([stem_it(lexfo) for lexfo in lexforms])

            # ex:
            # groups['post'] = {'poste':3, 'poster':5, 'postés':2...}
            # groups['copper engrav'] = {'copper engraving':3, 'coppers engraver':1...}
            my_groups[stemseq][ngram.id] += doc_wei

    del todo_ngrams_per_lg

    # now serializing all groups to a list of couples
    ng_couples = []
    addcouple = ng_couples.append
    for grped_ngramids in my_groups.values():
        if len(grped_ngramids) > 1:
            # first find most frequent term in the counter
            winner_id = grped_ngramids.most_common(1)[0][0]
            for ngram_id in grped_ngramids:
                if ngram_id != winner_id:
                    addcouple((winner_id, ngram_id))
    del my_groups

    # 2) the list node
    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
    else:
        # or create the new node and get its id
        the_group = corpus.add_child(
            typename="GROUPLIST",
            name="Group (src:%s)" % corpus.name[0:10]
        )
        session.add(the_group)
        session.commit()
        the_id = the_group.id

    # 3) Save each grouping couple to DB thanks to Translations.save() table
    ndngng_list = Translations(
        [(sec, prim) for (prim, sec) in ng_couples],
        just_items=True
    )

    # ...referring to the list node we just got
    ndngng_list.save(the_id)

    return the_id
def merge_ngramlists(new_lists=None, onto_corpus=None, del_originals=None):
    """
    Integrates an external terms table to the current one:
       - merges groups (using group_union() function)
       - resolves conflicts if terms belong in different lists
          > map wins over both other types
          > main wins over stop
          > stop never wins   £TODO STOP wins over candidates from main

    @param new_lists: a dict of *new* imported lists with format:
                        {'stop':      UnweightedList,
                         'main':      UnweightedList,
                         'map':       UnweightedList,
                         'groupings': Translations }
                      if any of those lists is absent it is considered empty

    @param onto_corpus: a corpus node to get the *old* lists

    @param del_originals: an array of original wordlists to ignore
                          and delete during the merge
                          possible values : ['stop','main','map']

            par exemple
            del_originals = ['stop','main'] => effacera la stoplist
                                               et la mainlist
                                           mais pas la maplist qui sera fusionnée
                       (les éléments de la map list seront remis dans la main à la fin)

    @return: a log string (lines joined by newline) describing the merge

    NB: Uses group_tools.group_union() to merge the synonym links.
        Uses ngrams_addition.index_new_ngrams() to also add new ngrams to the docs
    """
    # BUGFIX: avoid mutable default arguments ({} and []) shared across calls
    new_lists = {} if new_lists is None else new_lists
    del_originals = [] if del_originals is None else del_originals

    # log to send back to client-side (lines will be joined)
    my_log = []

    # the tgt node arg has to be a corpus here
    if not hasattr(onto_corpus, "typename") or onto_corpus.typename != "CORPUS":
        raise TypeError("IMPORT: 'onto_corpus' argument must be a Corpus Node")

    # our list shortcuts will be 0,1,2 (aka lid), by order of precedence
    linfos = [
        {'key': 'stop', 'name': "STOPLIST"},    # lid = 0
        {'key': 'main', 'name': "MAINLIST"},    # lid = 1
        {'key': 'map',  'name': "MAPLIST"}      # lid = 2
    ]

    # ======== Index the new ngrams in the docs =========
    all_possibly_new_ngram_ids = []
    collect = all_possibly_new_ngram_ids.append
    for lid, info in enumerate(linfos):
        list_type = info['key']
        if list_type in new_lists:
            for ng_id in new_lists[list_type].items:
                collect(ng_id)

    from gargantext.util.toolchain.main import t
    print("MERGE DEBUG: starting index_new_ngrams", t())
    n_added = index_new_ngrams(all_possibly_new_ngram_ids, onto_corpus)
    print("MERGE DEBUG: finished index_new_ngrams", t())

    my_log.append("MERGE: added %i new ngram occurrences in docs" % n_added)

    # ======== Get the old lists =========
    # DB nodes stored with same indices 0,1,2 (resp. stop, miam and map)
    old_lists = {}

    # find target ids of the list node objects
    tgt_nodeids = [
        onto_corpus.children("STOPLIST").first().id,   # £todo via parent project?
        onto_corpus.children("MAINLIST").first().id,
        onto_corpus.children("MAPLIST").first().id
    ]
    old_group_id = onto_corpus.children("GROUPLIST").first().id

    # retrieve old data into old_lists[list_type]...
    # ----------------------------------------------
    for lid, linfo in enumerate(linfos):
        list_type = linfo['key']
        if list_type not in del_originals:
            # NB can't use UnweightedList(tgt_nodeids[lid])
            #    because we need to include out-of-list subforms
            list_ngrams_q = query_list(tgt_nodeids[lid],
                                       groupings_id=old_group_id)
            old_lists[list_type] = UnweightedList(list_ngrams_q.all())
        else:
            # ...or use empty objects if replacing old list
            # ----------------------------------------------
            old_lists[list_type] = UnweightedList()
            msg = "MERGE: ignoring old %s which will be overwritten" % linfo['name']
            print(msg)
            my_log.append(msg)

    # ======== Merging all involved ngrams =========
    # all memberships with resolved conflicts of interfering memberships
    resolved_memberships = {}

    for list_set in [old_lists, new_lists]:
        for lid, info in enumerate(linfos):
            list_type = info['key']
            # if you don't want to merge one list just don't put it in new_lists
            if list_type in list_set:
                # we use the fact that lids are ordered ints...
                for ng_id in list_set[list_type].items:
                    if ng_id not in resolved_memberships:
                        resolved_memberships[ng_id] = lid
                    else:
                        # ...now resolving is simply taking the max
                        # stop < main < map
                        resolved_memberships[ng_id] = max(
                            lid, resolved_memberships[ng_id])

    # now each ngram is only in its most important list
    # -------------------------------------------------
    # NB temporarily map items are not in main anymore
    #    but we'll copy it at the end
    # NB temporarily all subforms were treated separately
    #    from mainforms but we'll force them into same list
    #    after we merge the groups
    del old_lists

    # ======== Merging old and new groups =========
    # get the arcs already in the target DB (directed couples)
    previous_links = session.query(
        NodeNgramNgram.ngram1_id,
        NodeNgramNgram.ngram2_id
    ).filter(NodeNgramNgram.node_id == old_group_id).all()

    n_links_previous = len(previous_links)

    # same format for the new arcs (Translations ~~~> array of couples)
    translated_imported_links = []
    add_link = translated_imported_links.append
    n_links_added = 0
    # BUGFIX: guard the 'groupings' access — per the docstring an absent
    # list must be treated as empty, but the old code raised KeyError here
    if 'groupings' in new_lists:
        for (y, x) in new_lists['groupings'].items.items():
            add_link((x, y))
            n_links_added += 1
    del new_lists

    # group_union: joins 2 different synonym-links lists into 1 new list
    new_links = group_union(previous_links, translated_imported_links)
    del previous_links
    del translated_imported_links

    n_links_after = len(new_links)

    merged_group = Translations([(y, x) for (x, y) in new_links])
    del new_links

    # ======== Overwrite old data with new =========
    merged_group.save(old_group_id)

    msg = "MERGE: groupings %i updated (links before/added/after: %i/%i/%i)" % (
        old_group_id, n_links_previous, n_links_added, n_links_after)
    my_log.append(msg)
    print(msg)

    # ======== Target list(s) append data =========
    # if list 2 => write in both tgt_data_lists [1,2]
    # lists 0 or 1 => straightforward targets [0] or [1]
    merged_results = {
        'stop': UnweightedList(),
        'main': UnweightedList(),
        'map':  UnweightedList()
    }

    for (ng_id, winner_lid) in resolved_memberships.items():
        ## 1) using the new groups
        # normal case if not a subform
        if ng_id not in merged_group.items:
            target_lid = winner_lid
        # inherit case if is a subform
        else:
            mainform_id = merged_group.items[ng_id]
            # inherited winner
            try:
                target_lid = resolved_memberships[mainform_id]
            except KeyError:
                target_lid = winner_lid
                print("MERGE: WARN ng_id %i has incorrect mainform %i ?" % (
                    ng_id, mainform_id))

        ## 2) map => map + main
        if target_lid == 2:
            todo_lids = [1, 2]
        else:
            todo_lids = [target_lid]

        ## 3) storage
        for lid in todo_lids:
            list_type = linfos[lid]['key']
            merged_results[list_type].items.add(ng_id)

    # ======== Overwrite old data with new =========
    for lid, info in enumerate(linfos):
        tgt_id = tgt_nodeids[lid]
        list_type = info['key']
        result = merged_results[list_type]
        result.save(tgt_id)
        msg = "MERGE: %s %i updated (new size: %i)" % (
            info['name'], tgt_id, len(merged_results[list_type].items))
        my_log.append(msg)
        print(msg)

    # return a log
    return ("\n".join(my_log))