def patch(self, request):
    """
    A copy of POST (merging list) but with the source being an internal corpus_id.

    Params in request.GET:
        onto_corpus: the corpus whose lists are getting patched
        from_corpus: the corpus from which we take the source lists to merge in
        todo:        a comma-separated list of the list types
                     ("map", "main", "stop") to merge in

    Returns a JsonHttpResponse with a 'log' key (200) on success,
    or an 'err' key (400) if the merge raised, or a plain 401 response
    when the user is not authenticated / not the corpus owner.
    """
    # NOTE(review): is_authenticated is called as a method — old-Django style;
    # keep as-is to match the rest of the project.
    if not request.user.is_authenticated():
        res = HttpResponse("Unauthorized")
        res.status_code = 401
        return res

    params = get_parameters(request)

    # the corpus with the target lists to be patched
    corpus_id = int(params.pop("onto_corpus"))
    corpus_node = cache.Node[corpus_id]

    # only the corpus owner may patch its lists
    if request.user.id != corpus_node.user_id:
        res = HttpResponse("Unauthorized")
        res.status_code = 401
        return res

    # maps the short names used in the request to DB node typenames
    list_types = {'map': 'MAPLIST', 'main': 'MAINLIST', 'stop': 'STOPLIST'}

    # internal DB retrieve source_lists
    source_corpus_id = int(params.pop("from_corpus"))
    source_node = cache.Node[source_corpus_id]

    todo_lists = params.pop("todo").split(',')  # ex: ['map', 'stop']
    source_lists = {}
    for key in todo_lists:
        source_lists[key] = UnweightedList(
            source_node.children(list_types[key]).first().id
        )

    # add the groupings too
    source_lists['groupings'] = Translations(
        source_node.children("GROUPLIST").first().id
    )

    # attempt to merge and send response
    try:
        # merge the source_lists onto those of the target corpus
        log_msg = merge_ngramlists(source_lists, onto_corpus=corpus_node)
        return JsonHttpResponse({
            'log': log_msg,
        }, 200)
    except Exception as e:
        return JsonHttpResponse({
            'err': str(e),
        }, 400)
def get(self, request):
    """
    Retrieve the data for the terms table of a corpus.

    Two request forms are accepted:
      - simple:  ?corpus=<id> [&scoring=<id>] [&head=<k>]
                 (all list ids are read from the corpus node's children)
      - custom:  ?mainlist=<id>&scoring=<id> [&groups=<id>]
                 [&stoplist=<id>] [&maplist=<id>] [&head=<k>]

    Returns a JsonHttpResponse with:
        ngraminfos:  {ngram_id: [term, weight]}
        listmembers: {'maplist': [ids], 'stoplist': [ids]}
        links:       grouping links (mainform => subforms)
        nodeids:     the resolved list/score node ids

    Raises ValidationException when neither request form is satisfied.
    """
    parameters = get_parameters(request)
    glance_limit = None
    mainlist_id = None
    scores_id = None
    groups_id = None
    other_list_ids = {'maplist': None, 'stoplist': None}

    # 1) retrieve a mainlist_id and other lists
    ##########################################

    # simple request: just refers to the parent corpus
    # ------------------------------------------------
    if "corpus" in parameters:
        corpus_id = parameters['corpus']
        corpus = cache.Node[corpus_id]
        # with a corpus_id, the explicit scoring pointer is optional
        if "scoring" in parameters:
            scores_id = parameters['scoring']
        else:
            scores_id = corpus.children('OCCURRENCES').first().id
        # retrieve the family of lists that have corpus as parent
        mainlist_id = corpus.children('MAINLIST').first().id
        groups_id = corpus.children('GROUPLIST').first().id
        other_list_ids['stoplist'] = corpus.children('STOPLIST').first().id
        other_list_ids['maplist'] = corpus.children('MAPLIST').first().id

    # custom request: refers to each list individually
    # -------------------------------------------------
    elif "mainlist" in parameters and "scoring" in parameters:
        mainlist_id = parameters['mainlist']
        scores_id = parameters['scoring']
        groups_id = None
        if 'groups' in parameters:
            # FIX: was parameters['scoring'] — the scoring id was silently
            # used as the grouplist id whenever 'groups' was provided
            groups_id = parameters['groups']
        for k in ['stoplist', 'maplist']:
            if k in parameters:
                other_list_ids[k] = parameters[k]

    # or request has an error
    # -----------------------
    else:
        raise ValidationException(
            "Either a 'corpus' parameter or 'mainlist' & 'scoring' params are required"
        )

    # 2) get the infos for each list
    ################################
    ngraminfo = {}     # ngram details sorted per ngram id
    linkinfo = {}      # ngram groups sorted per ngram id
    listmembers = {}   # ngram ids sorted per list name

    if "head" in parameters:
        # head <=> only mainlist AND only k top ngrams
        glance_limit = int(parameters['head'])
        mainlist_query = query_list(mainlist_id, details=True,
                                    pagination_limit=glance_limit,
                                    scoring_metric_id=scores_id)
    else:
        # infos for all ngrams from mainlist
        mainlist_query = query_list(mainlist_id, details=True,
                                    scoring_metric_id=scores_id)

    # infos for grouped ngrams, absent from mainlist
    hidden_ngrams_query = query_grouped_ngrams(groups_id, details=True)

    # infos for stoplist terms, absent from mainlist
    stop_ngrams_query = query_list(other_list_ids['stoplist'], details=True,
                                   scoring_metric_id=scores_id)

    # and for the other lists (stop and map)
    # no details needed here, just the member ids
    for li in other_list_ids:
        li_elts = query_list(other_list_ids[li], details=False).all()
        # simple array of ngram_ids
        listmembers[li] = [ng[0] for ng in li_elts]

    # and the groupings
    if groups_id:
        links = Translations(groups_id)
        linkinfo = links.groups

    # list of ngrams which need detailed info
    ngrams_which_need_detailed_info = []
    if "head" in parameters:
        # head triggered simplified form: just the top of the mainlist
        # TODO add maplist membership
        ngrams_which_need_detailed_info = mainlist_query.all()
    else:
        ngrams_which_need_detailed_info = (mainlist_query.all()
                                           + hidden_ngrams_query.all()
                                           + stop_ngrams_query.all())

    # the output form of details is:
    # ngraminfo[id] => [term, weight]
    for ng in ngrams_which_need_detailed_info:
        ng_id = ng[0]
        ngraminfo[ng_id] = ng[1:]
        # NB the client js will sort mainlist ngs from hidden ngs after ajax
        #    using linkinfo (otherwise needs redundant listmembers for main)

    return JsonHttpResponse({
        'ngraminfos': ngraminfo,
        'listmembers': listmembers,
        'links': linkinfo,
        'nodeids': {
            'mainlist': mainlist_id,
            'maplist': other_list_ids['maplist'],
            'stoplist': other_list_ids['stoplist'],
            'groups': groups_id,
            'scores': scores_id,
        }
    })
def compute_groups(corpus, stoplist_id=None, overwrite_id=None):
    """
    1) Use a stemmer/lemmatizer to group forms if they have same stem/lemma
    2) Create an empty GROUPLIST node (for a list of "synonym" ngrams)
    3) Save the list to DB (list node + each grouping as listnode - ngram1 - ngram2)

    @param corpus:       the corpus node whose documents' ngrams are grouped
    @param stoplist_id:  optional node id of a stoplist; its ngrams are
                         excluded from grouping
    @param overwrite_id: optional pre-existing GROUPLIST node id to reuse
                         instead of creating a new node
    @return the id of the GROUPLIST node that was written
    """
    stop_ngrams_ids = {}
    # we will need the ngrams of the stoplist to filter
    if stoplist_id is not None:
        # NB: renamed loop var (was `id`, shadowing the builtin)
        for row in session.query(NodeNgram.ngram_id).filter(
                NodeNgram.node_id == stoplist_id).all():
            stop_ngrams_ids[row[0]] = True

    # 1) compute stems/lemmas
    #    and group if same stem/lemma
    stemmers = prepare_stemmers(corpus)
    print("# STEMMERS LOADED", stemmers)
    supported_stemmers_lang = [lang for lang in corpus.hyperdata["languages"]
                               if lang != "__unknown__"]
    print("#SUPPORTED STEMMERS LANGS", supported_stemmers_lang)

    # todo dict {lg => {ngrams_todo} }
    todo_ngrams_per_lg = defaultdict(set)

    # res dict { commonstem: {ngram_1:freq_1 ,ngram_2:freq_2 ,ngram_3:freq_3} }
    my_groups = defaultdict(Counter)

    # preloop per doc to sort ngrams by language
    for doc in corpus.children('DOCUMENT'):
        if doc.id not in corpus.hyperdata['skipped_docs']:
            if ('language_iso2' in doc.hyperdata) and doc.hyperdata['language_iso2'] \
                    in supported_stemmers_lang:
                lgid = doc.hyperdata['language_iso2']
            else:
                lgid = "__unknown__"
                doc.status("NGRAMS_GROUPS", error="Error: unsupported language for stemming")
                doc.save_hyperdata()

            # doc.ngrams is an sql query (ugly but useful intermediate step)
            # FIXME: move the counting and stoplist filtering up here
            for ngram_pack in doc.ngrams.all():
                todo_ngrams_per_lg[lgid].add(ngram_pack)

    # --------------------
    # long loop per ngrams
    for (lgid, todo_ngs) in todo_ngrams_per_lg.items():
        # fun: word::str => stem::str
        stem_it = stemmers[lgid].stem
        for ng in todo_ngs:
            doc_wei = ng[0]
            ngram = ng[1]     # Ngram obj

            # skip if in STOPLIST
            # FIX: was a bare `next` statement, which is a no-op expression,
            # so stoplisted ngrams were never actually filtered out
            if ngram.id in stop_ngrams_ids:
                continue

            lexforms = [lexunit for lexunit in resplit(r'\W+', ngram.terms)]

            # STEM IT, and this term's stems will become a new grouping key...
            stemseq = " ".join([stem_it(lexfo) for lexfo in lexforms])

            # ex:
            # groups['post'] = {'poste':3, 'poster':5, 'postés':2...}
            # groups['copper engrav'] = {'copper engraving':3, 'coppers engraver':1...}
            my_groups[stemseq][ngram.id] += doc_wei

    del todo_ngrams_per_lg

    # now serializing all groups to a list of couples
    ng_couples = []
    addcouple = ng_couples.append
    for grped_ngramids in my_groups.values():
        if len(grped_ngramids) > 1:
            # first find most frequent term in the counter
            winner_id = grped_ngramids.most_common(1)[0][0]
            for ngram_id in grped_ngramids:
                if ngram_id != winner_id:
                    addcouple((winner_id, ngram_id))
    del my_groups

    # 2) the list node
    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
    # or create the new id
    else:
        the_group = corpus.add_child(
            typename="GROUPLIST",
            name="Group (src:%s)" % corpus.name[0:10]
        )
        # and save the node
        session.add(the_group)
        session.commit()
        the_id = the_group.id

    # 3) Save each grouping couple to DB thanks to Translations.save() table
    ndngng_list = Translations(
        [(sec, prim) for (prim, sec) in ng_couples],
        just_items=True
    )
    # ...referring to the list node we just got
    ndngng_list.save(the_id)

    return the_id
def merge_ngramlists(new_lists=None, onto_corpus=None, del_originals=None):
    """
    Integrates an external terms table to the current one:
       - merges groups (using group_union() function)
       - resolves conflicts if terms belong in different lists
          > map wins over both other types
          > main wins over stop
          > stop never wins   £TODO STOP wins over candidates from main

    @param new_lists: a dict of *new* imported lists with format:
                        {'stop':     UnweightedList,
                         'main':     UnweightedList,
                         'map':      UnweightedList,
                         'groupings': Translations }
                      if any of those lists is absent it is considered empty

    @param onto_corpus: a corpus node to get the *old* lists

    @param del_originals: an array of original wordlists to ignore
                          and delete during the merge
                          possible values : ['stop','main','map']

            e.g. del_originals = ['stop','main'] will erase the stoplist
            and the mainlist but not the maplist, which will be merged
            (the maplist elements are copied back into main at the end)

    NB: Uses group_tools.group_union() to merge the synonym links.
        Uses ngrams_addition.index_new_ngrams() to also add new ngrams
        to the docs of the corpus.

    @return a log string (joined lines) to send back to the client
    """
    # avoid mutable default arguments (were `{}` and `[]`)
    if new_lists is None:
        new_lists = {}
    if del_originals is None:
        del_originals = []

    # log to send back to client-side (lines will be joined)
    my_log = []

    # the tgt node arg has to be a corpus here
    if not hasattr(onto_corpus, "typename") or onto_corpus.typename != "CORPUS":
        raise TypeError("IMPORT: 'onto_corpus' argument must be a Corpus Node")

    # our list shortcuts will be 0,1,2 (aka lid)
    # by order of precedence
    linfos = [
        {'key': 'stop', 'name': "STOPLIST"},   # lid = 0
        {'key': 'main', 'name': "MAINLIST"},   # lid = 1
        {'key': 'map',  'name': "MAPLIST"}     # lid = 2
    ]

    # ======== Index the new ngrams in the docs =========
    all_possibly_new_ngram_ids = []
    collect = all_possibly_new_ngram_ids.append
    for lid, info in enumerate(linfos):
        list_type = info['key']
        if list_type in new_lists:
            for ng_id in new_lists[list_type].items:
                collect(ng_id)

    from gargantext.util.toolchain.main import t
    print("MERGE DEBUG: starting index_new_ngrams", t())
    n_added = index_new_ngrams(all_possibly_new_ngram_ids, onto_corpus)
    print("MERGE DEBUG: finished index_new_ngrams", t())

    my_log.append("MERGE: added %i new ngram occurrences in docs" % n_added)

    # ======== Get the old lists =========
    old_lists = {}
    # DB nodes stored with same indices 0,1,2 (resp. stop, miam and map)
    # find target ids of the list node objects
    tgt_nodeids = [
        onto_corpus.children("STOPLIST").first().id,   # £todo via parent project?
        onto_corpus.children("MAINLIST").first().id,
        onto_corpus.children("MAPLIST").first().id
    ]
    old_group_id = onto_corpus.children("GROUPLIST").first().id

    # retrieve old data into old_lists[list_type]...
    # ----------------------------------------------
    for lid, linfo in enumerate(linfos):
        list_type = linfo['key']
        if list_type not in del_originals:
            # NB can't use UnweightedList(tgt_nodeids[lid])
            #    because we need to include out-of-list subforms
            list_ngrams_q = query_list(tgt_nodeids[lid], groupings_id=old_group_id)
            old_lists[list_type] = UnweightedList(list_ngrams_q.all())
        else:
            # ...or use empty objects if replacing old list
            # ----------------------------------------------
            old_lists[list_type] = UnweightedList()
            msg = "MERGE: ignoring old %s which will be overwritten" % linfo['name']
            print(msg)
            my_log.append(msg)

    # ======== Merging all involved ngrams =========
    # all memberships with resolved conflicts of interfering memberships
    resolved_memberships = {}

    for list_set in [old_lists, new_lists]:
        for lid, info in enumerate(linfos):
            list_type = info['key']
            # if you don't want to merge one list just don't put it in new_lists
            if list_type in list_set:
                # we use the fact that lids are ordered ints...
                for ng_id in list_set[list_type].items:
                    if ng_id not in resolved_memberships:
                        resolved_memberships[ng_id] = lid
                    else:
                        # ...now resolving is simply taking the max
                        # stop < main < map
                        resolved_memberships[ng_id] = max(
                            lid, resolved_memberships[ng_id])

    # now each ngram is only in its most important list
    # -------------------------------------------------
    # NB temporarily map items are not in main anymore
    #    but we'll copy it at the end
    # NB temporarily all subforms were treated separately
    #    from mainforms but we'll force them into same list
    #    after we merge the groups
    del old_lists

    # ======== Merging old and new groups =========
    # get the arcs already in the target DB (directed couples)
    previous_links = session.query(
        NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id).filter(
            NodeNgramNgram.node_id == old_group_id).all()
    n_links_previous = len(previous_links)

    # same format for the new arcs (Translations ~~~> array of couples)
    translated_imported_links = []
    add_link = translated_imported_links.append
    n_links_added = 0
    # FIX: the docstring promises any absent list is treated as empty,
    # but accessing new_lists['groupings'] unconditionally raised KeyError
    if 'groupings' in new_lists:
        for (y, x) in new_lists['groupings'].items.items():
            add_link((x, y))
            n_links_added += 1
    del new_lists

    # group_union: joins 2 different synonym-links lists into 1 new list
    new_links = group_union(previous_links, translated_imported_links)
    del previous_links
    del translated_imported_links

    n_links_after = len(new_links)
    merged_group = Translations([(y, x) for (x, y) in new_links])
    del new_links

    # ======== Overwrite old data with new =========
    merged_group.save(old_group_id)
    msg = "MERGE: groupings %i updated (links before/added/after: %i/%i/%i)" % (
        old_group_id, n_links_previous, n_links_added, n_links_after)
    my_log.append(msg)
    print(msg)

    # ======== Target list(s) append data =========
    # if list 2 => write in both tgt_data_lists [1,2]
    # lists 0 or 1 => straightforward targets [0] or [1]
    merged_results = {
        'stop': UnweightedList(),
        'main': UnweightedList(),
        'map': UnweightedList()
    }
    for (ng_id, winner_lid) in resolved_memberships.items():
        ## 1) using the new groups
        # normal case if not a subform
        if ng_id not in merged_group.items:
            target_lid = winner_lid
        # inherit case if is a subform
        else:
            mainform_id = merged_group.items[ng_id]
            # inherited winner
            try:
                target_lid = resolved_memberships[mainform_id]
            except KeyError:
                target_lid = winner_lid
                print("MERGE: WARN ng_id %i has incorrect mainform %i ?" %
                      (ng_id, mainform_id))

        ## 2) map => map + main
        if target_lid == 2:
            todo_lids = [1, 2]
        else:
            todo_lids = [target_lid]

        ## 3) storage
        for lid in todo_lids:
            list_type = linfos[lid]['key']
            merged_results[list_type].items.add(ng_id)

    # ======== Overwrite old data with new =========
    for lid, info in enumerate(linfos):
        tgt_id = tgt_nodeids[lid]
        list_type = info['key']
        result = merged_results[list_type]
        result.save(tgt_id)
        msg = "MERGE: %s %i updated (new size: %i)" % (
            info['name'], tgt_id, len(merged_results[list_type].items))
        my_log.append(msg)
        print(msg)

    # return a log
    return ("\n".join(my_log))
def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
                      group_delimiter=DEFAULT_CSV_DELIM_GROUP):
    '''
    This function reads a CSV of an ngrams table for a Corpus,
    then it converts old ngram_ids to those of the current DB
       (and adds to DB any unknown ngrams)
    then recreates an equivalent set of MAINLIST, MAPLIST, STOPLIST + GROUPS

    Input example:
       status  |  label         |forms
       --------+----------------+---------------------
        map    | water table    |water tables
        map    | water supply   |water-supply|&|water supplies
        stop   | wastewater     |

    The title line is mandatory.
    The label will correspond to our DB mainform type.

    Variants:
    ----------
    For user accessibility, we allow different formats using equivalence rules:

    1) It is implicit that the label string is also one of the forms
       therefore the input example table is equivalent to this "verbose" table:
           status  |  label         |forms
           --------+----------------+---------------------
            map    | water table    |water table|&|water tables
            map    | water supply   |water supply|&|water-supply|&|water supplies
            stop   | wastewater     |wastewater

    2) The default status is map and the status column is optional
       thus, if we ignore "wastewater", the input table is also equivalent to:
               label         |forms
              ---------------+---------------------
               water table   |water tables
               water supply  |water-supply|&|water supplies

    3) From DB point of view, both "forms that are labels" and "other forms"
       are finally saved just as ngrams.
       So the input table is also equivalent to:
           status  |  label          |forms
           --------+-----------------+---------------------
            map    | water table     |water tables
            map    | water tables    |
            map    | water supply    |water-supply|&|water supplies
            map    | water supplies  |
            map    | water-supply    |
            stop   | wastewater      |

    Output: 3 x UnweightedList + 1 x Translations

    @param the_file          a local filename or file contents (list of lines)
                             or a filehandle-like
    @param delimiter         a character used as separator in the CSV
    @param group_delimiter   a character used as grouped subforms separator
                             (in the last column)

    The retrieval of ngram_ids works in 2 steps:
        => look up each term str in the DB with bulk_insert_ifnotexists
           (creates absent ngrams if necessary)
        => use the new ids to map the relations involving the old ones

    NB: the creation of MAINLIST also adds all elements from the MAPLIST

    NB: To merge the imported lists into a corpus node's lists,
        chain this function with merge_ngramlists()
    '''
    # ---------------
    #  ngram storage
    # ---------------
    # main storage for the ngrams by list
    imported_nodes_ngrams = {'stop': [], 'main': [], 'map': []}

    # and all the terms (for unique and for dbdata bulk_insert)
    imported_unique_ngramstrs = {}

    # and for the imported_grouping list of couples [(str1,str1),(str1,str2)..]
    imported_groupings = []
    # /!\ imported_grouping contains the subforms' terms themselves
    #     (that will have to be translated to ngram_ids for the target db)

    # =============== READ CSV ===============
    if isinstance(the_file, list):
        fname = 'imported_file'
        contents = the_file
    else:
        if isinstance(the_file, str):
            # FIX: open in binary mode — we .decode("UTF-8") the result below,
            # which fails on a text-mode str in Python 3
            fh = open(the_file, "rb")
            fname = the_file
        elif callable(getattr(the_file, "read", None)):
            fh = the_file
            # FIX: was `fname = the_file` (stored the object itself);
            # use the handle's name for readable log messages
            fname = getattr(the_file, "name", "imported_file")
        else:
            raise TypeError("IMPORT: the_file argument has unknown type %s"
                            % type(the_file))
        # reading all directly b/c csv.reader takes only lines or a real fh in bytes
        # and we usually have a "false" fh (uploadedfile.InMemoryUploadedFile) in strings
        # (but we checked its size before!)
        contents = fh.read().decode("UTF-8").split("\n")
        # end of CSV read
        fh.close()

    ngrams_csv_rows = reader(contents, delimiter=delimiter,
                             quoting=QUOTE_MINIMAL)

    # for stats
    n_read_lines = 0
    n_total_ng = 0
    n_added_ng = 0
    n_group_relations = 0

    # columntype => int
    columns = {}

    # load CSV + initial checks
    for i, csv_row in enumerate(ngrams_csv_rows):
        # fyi
        n_read_lines += 1

        # headers
        if i == 0:
            n_cols = len(csv_row)
            for j, colname in enumerate(csv_row):
                if colname in ['label', 'status', 'forms']:
                    columns[colname] = j
                # skip empty columns
                elif match(r'^\s*$', colname):
                    pass
                else:
                    raise ValueError(
                        'Wrong header "%s" on line %i (only possible headers are "label", "forms" and "status")'
                        % (colname, n_read_lines))
            if 'label' not in columns:
                raise ValueError(
                    'CSV must contain at least one column with the header "label"'
                )
            # FIX: without this continue the header row fell through and
            # (when no "status" column exists) the literal word "label"
            # was imported as a map-list term
            continue

        if not len(csv_row):
            continue

        # mandatory column
        this_row_label = str(csv_row[columns['label']])

        # other columns or their default values
        if 'status' in columns:
            this_list_type = str(csv_row[columns['status']])
        else:
            this_list_type = 'map'

        if 'forms' in columns:
            this_row_forms = str(csv_row[columns['forms']])
        else:
            this_row_forms = ''

        # string normalizations
        this_row_label = normalize_forms(normalize_chars(this_row_label))

        # --- term checking
        if not len(this_row_label) > 0:
            print("IMPORT WARN: (skip line) empty term at CSV %s:l.%i"
                  % (fname, i))
            continue

        # --- check correct list type
        if not this_list_type in ['stop', 'main', 'map']:
            print("IMPORT WARN: (skip line) wrong list type at CSV %s:l.%i"
                  % (fname, i))
            continue

        # subforms can be duplicated (in forms and another label)
        # but we must take care of unwanted other duplicates too
        if this_row_label in imported_unique_ngramstrs:
            print("TODO IMPORT DUPL: (skip line) term appears more than once at CSV %s:l.%i"
                  % (fname, i))

        # ================= Store the data ====================
        # the ngram census
        imported_unique_ngramstrs[this_row_label] = True

        # and the "list to ngram" relation
        imported_nodes_ngrams[this_list_type].append(this_row_label)

        # ====== Store synonyms from the import (if any) ======
        if len(this_row_forms) != 0:
            for raw_term_str in this_row_forms.split(group_delimiter):
                # each subform is also like an ngram declaration
                term_str = normalize_forms(normalize_chars(raw_term_str))
                imported_unique_ngramstrs[term_str] = True
                imported_nodes_ngrams[this_list_type].append(term_str)

                # the optional repeated mainform doesn't interest us
                # because we already have it via the label
                if term_str != this_row_label:
                    # save links
                    imported_groupings.append((this_row_label, term_str))

    # ======== ngram save + id lookup =========
    n_total_ng = len(imported_unique_ngramstrs)

    # prepare data format
    imported_ngrams_dbdata = []
    for ngram_str in imported_unique_ngramstrs:
        # DB needs the number of separate words
        n_words = 1 + len(findall(r' ', ngram_str))
        imported_ngrams_dbdata.append((ngram_str, n_words))

    # returns a dict {term => id} and a count of inserted ones
    #                          -------------------------
    (new_ngrams_ids, n_added_ng) = bulk_insert_ifnotexists(
        #                        -------------------------
        model=Ngram,
        uniquekey='terms',
        fields=('terms', 'n'),
        data=imported_ngrams_dbdata,
        do_stats=True)
    del imported_ngrams_dbdata

    # new_ngrams_ids contains a direct mapping ng_str => new_id
    del imported_unique_ngramstrs

    # ======== Import into lists =========
    # 3 x abstract lists + 1 translations
    result = {
        'map': UnweightedList(),
        'main': UnweightedList(),
        'stop': UnweightedList(),
        'groupings': Translations()
    }

    for list_type in imported_nodes_ngrams:
        for ng_str in imported_nodes_ngrams[list_type]:
            new_id = new_ngrams_ids[ng_str]
            # add to the abstract list
            result[list_type].items.add(new_id)

        # for main also add map elements
        if list_type == 'main':
            for ng_str in imported_nodes_ngrams['map']:
                new_id = new_ngrams_ids[ng_str]
                result['main'].items.add(new_id)

    # ======== Synonyms =========
    for (x_str, y_str) in imported_groupings:
        new_mainform_id = new_ngrams_ids[x_str]
        new_subform_id = new_ngrams_ids[y_str]
        # /!\ Translations use (subform => mainform) order
        result['groupings'].items[new_subform_id] = new_mainform_id
        n_group_relations += 1

    # ------------------------------------------------------------------
    print("IMPORT: read %i lines from the CSV" % n_read_lines)
    print("IMPORT: read %i terms (%i added and %i already existing)"
          % (n_total_ng, n_added_ng, n_total_ng - n_added_ng))
    print("IMPORT: read %i grouping relations" % n_group_relations)

    return result
def filterMatrix(matrix, mapList_id, groupList_id):
    """
    Restrict a cooccurrence matrix to the map-list terms,
    expanded through their grouped (synonym) forms.

    @param matrix:       a cooccurrence matrix supporting `&`
    @param mapList_id:   node id of the MAPLIST
    @param groupList_id: node id of the GROUPLIST
    @return the filtered cooccurrence matrix
    """
    map_terms = UnweightedList(mapList_id)
    groupings = Translations(groupList_id)
    # expand the maplist with its groupings, then intersect with the matrix
    return matrix & (map_terms * groupings)