# NB: the gargantext-internal helpers used below (session, NodeNgram,
#     NodeNgramNgram, Translations, UnweightedList, prepare_stemmers,
#     group_union, index_new_ngrams, query_list) are assumed to be imported
#     elsewhere in this module; only the stdlib imports are added here.
from collections import defaultdict, Counter
from re import split as resplit


def compute_groups(corpus, stoplist_id=None, overwrite_id=None):
    """
    1) Use a stemmer/lemmatizer to group forms if they have same stem/lemma
    2) Create an empty GROUPLIST node (for a list of "synonym" ngrams)
    3) Save the list to DB (list node + each grouping as listnode - ngram1 - ngram2)
    """

    stop_ngrams_ids = set()
    # we will need the ngrams of the stoplist to filter
    if stoplist_id is not None:
        for (ngram_id,) in session.query(NodeNgram.ngram_id).filter(NodeNgram.node_id == stoplist_id).all():
            stop_ngrams_ids.add(ngram_id)


    # 1) compute stems/lemmas
    #    and group if same stem/lemma
    stemmers = prepare_stemmers(corpus)
    print("# STEMMERS LOADED", stemmers)
    supported_stemmers_lang = [lang for lang in corpus.hyperdata["languages"] if lang != "__unknown__"]

    print("#SUPPORTED STEMMERS LANGS", supported_stemmers_lang)
    # todo dict {lg => {ngrams_todo} }
    todo_ngrams_per_lg = defaultdict(set)

    # res dict { commonstem: {ngram_1:freq_1 ,ngram_2:freq_2 ,ngram_3:freq_3} }
    my_groups = defaultdict(Counter)

    # preloop per doc to sort ngrams by language
    for doc in corpus.children('DOCUMENT'):
        if doc.id not in corpus.hyperdata['skipped_docs']:
            if ('language_iso2' in doc.hyperdata) and doc.hyperdata['language_iso2'] \
                                                    in supported_stemmers_lang:
                lgid = doc.hyperdata['language_iso2']

            else:
                lgid = "__unknown__"
                doc.status("NGRAMS_GROUPS", error="Error: unsupported language for stemming")
                doc.save_hyperdata()
                #corpus.hyperdata["skipped_docs"].append(doc.id)
                #corpus.save_hyperdata()
            # doc.ngrams is an SQL query (ugly but useful intermediate step)
            # FIXME: move the counting and stoplist filtering up here
            for ngram_pack in doc.ngrams.all():   # (weight, Ngram) couples
                todo_ngrams_per_lg[lgid].add(ngram_pack)

    # --------------------
    # long loop per ngrams
    for (lgid,todo_ngs) in todo_ngrams_per_lg.items():
        # fun: word::str => stem::str
        stem_it = stemmers[lgid].stem

        for ng in todo_ngs:
            doc_wei = ng[0]      # occurrence weight of this ngram in the doc
            ngram   = ng[1]      # Ngram obj

            # skip if in STOPLIST
            if ngram.id in stop_ngrams_ids:
                continue

            lexforms = resplit(r'\W+', ngram.terms)

            # STEM IT, and this term's stems will become a new grouping key...
            stemseq = " ".join([stem_it(lexfo) for lexfo in lexforms])

            # ex:
            # groups['post'] = {'poste':3, 'poster':5, 'postés':2...}
            # groups['copper engrav'] = {'copper engraving':3, 'coppers engraver':1...}
            my_groups[stemseq][ngram.id] += doc_wei

    del todo_ngrams_per_lg

    # now serializing all groups to a list of couples
    ng_couples = []
    addcouple = ng_couples.append
    for grped_ngramids in my_groups.values():
        if len(grped_ngramids) > 1:
            # first find most frequent term in the counter
            winner_id = grped_ngramids.most_common(1)[0][0]

            for ngram_id in grped_ngramids:
                if ngram_id != winner_id:
                    addcouple((winner_id, ngram_id))

    del my_groups

    # 2) the list node
    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
    # or create the new id
    else:
        the_group =  corpus.add_child(
            typename  = "GROUPLIST",
            name = "Group (src:%s)" % corpus.name[0:10]
        )

        # and save the node
        session.add(the_group)
        session.commit()
        the_id = the_group.id

    # 3) Save each grouping couple to the DB via a Translations object
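    # NB: ng_couples are (mainform, subform) but Translations is keyed by the
    #     subform, hence the (sec, prim) inversion below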
    ndngng_list = Translations(
        [(sec, prim) for (prim, sec) in ng_couples],
        just_items=True
    )

    # ...referring to the list node we just got
    ndngng_list.save(the_id)

    return the_id
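

# Example usage of compute_groups (sketch; `corpus` and `stoplist_node` are
# hypothetical pre-existing nodes):
#
#   grouplist_id = compute_groups(corpus, stoplist_id=stoplist_node.id)
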
def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
    """
    Integrates an external terms table to the current one:
       - merges groups (using group_union() function)
       - resolves conflicts if terms belong in different lists
          > map wins over both other types
          > main wins over stop
          > stop never wins   (£TODO: make stop win over candidates from main)

    @param new_lists:     a dict of *new* imported lists with format:
                                {'stop':     UnweightedList,
                                 'main':     UnweightedList,
                                 'map':      UnweightedList,
                                 'groupings': Translations }

                   if any of those lists is absent it is considered empty

    @param onto_corpus:   a corpus node to get the *old* lists

    @param del_originals: an array of original wordlists to ignore
                          and delete during the merge
                          possible values : ['stop','main','map']

            for example:
            del_originals = ['stop','main'] => will delete the stoplist
                                               and the mainlist
                                               but not the maplist, which will
                                               be merged (the maplist elements
                                               are put back into the mainlist
                                               at the end)

    NB: Uses group_tools.group_union() to merge the synonym links.
        Uses ngrams_addition.index_new_ngrams() to also add new ngrams to the docs
    """
    # log to send back to client-side (lines will be joined)
    my_log = []

    # the tgt node arg has to be a corpus here
    if not hasattr(onto_corpus,
                   "typename") or onto_corpus.typename != "CORPUS":
        raise TypeError("IMPORT: 'onto_corpus' argument must be a Corpus Node")

    # for stats
    added_nd_ng = 0  # number of added list elements

    # our list shortcuts will be 0,1,2 (aka lid)
    # by order of precedence
    linfos = [
        {
            'key': 'stop',
            'name': "STOPLIST"
        },  # lid = 0
        {
            'key': 'main',
            'name': "MAINLIST"
        },  # lid = 1
        {
            'key': 'map',
            'name': "MAPLIST"
        }  # lid = 2
    ]
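    # (conflicts are resolved below simply as max(lid): stop < main < map)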

    # ======== Index the new ngrams in the docs =========
    all_possibly_new_ngram_ids = []
    collect = all_possibly_new_ngram_ids.append
    for lid, info in enumerate(linfos):
        list_type = info['key']
        if list_type in new_lists:
            for ng_id in new_lists[list_type].items:
                collect(ng_id)

    from gargantext.util.toolchain.main import t
    print("MERGE DEBUG: starting index_new_ngrams", t())
    n_added = index_new_ngrams(all_possibly_new_ngram_ids, onto_corpus)
    print("MERGE DEBUG: finished index_new_ngrams", t())

    my_log.append("MERGE: added %i new ngram occurrences in docs" % n_added)

    # ======== Get the old lists =========
    old_lists = {}

    # DB nodes stored with same indices 0,1,2 (resp. stop, main and map)
    # find target ids of the list node objects
    tgt_nodeids = [
        onto_corpus.children(
            "STOPLIST").first().id,  # £todo via parent project?
        onto_corpus.children("MAINLIST").first().id,
        onto_corpus.children("MAPLIST").first().id
    ]

    old_group_id = onto_corpus.children("GROUPLIST").first().id

    # retrieve old data into old_lists[list_type]...
    # ----------------------------------------------
    for lid, linfo in enumerate(linfos):
        list_type = linfo['key']
        if list_type not in del_originals:

            # NB can't use UnweightedList(tgt_nodeids[lid])
            # because we need to include out-of-list subforms
            list_ngrams_q = query_list(tgt_nodeids[lid],
                                       groupings_id=old_group_id)
            old_lists[list_type] = UnweightedList(list_ngrams_q.all())
        else:
            # ...or use empty objects if replacing old list
            # ----------------------------------------------
            old_lists[list_type] = UnweightedList()
            msg = "MERGE: ignoring old %s which will be overwritten" % linfo[
                'name']
            print(msg)
            my_log.append(msg)

    # ======== Merging all involved ngrams =========

    # all memberships, with conflicts between interfering memberships resolved
    resolved_memberships = {}

    for list_set in [old_lists, new_lists]:
        for lid, info in enumerate(linfos):
            list_type = info['key']
            # if you don't want to merge one list just don't put it in new_lists
            if list_type in list_set:
                # we use the fact that lids are ordered ints...
                for ng_id in list_set[list_type].items:
                    if ng_id not in resolved_memberships:
                        resolved_memberships[ng_id] = lid
                    else:
                        # ...now resolving is simply taking the max
                        # stop < main < map
                        resolved_memberships[ng_id] = max(
                            lid, resolved_memberships[ng_id])
    # now each ngram is only in its most important list
    # -------------------------------------------------
    # NB temporarily map items are not in main anymore
    #    but we'll copy them at the end
    # NB temporarily all subforms were treated separately
    #    from mainforms but we'll force them into the same list
    #    after we merge the groups

    del old_lists

    # ======== Merging old and new groups =========
    # get the arcs already in the target DB (directed couples)
    previous_links = session.query(
        NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id).filter(
            NodeNgramNgram.node_id == old_group_id).all()

    n_links_previous = len(previous_links)

    # same format for the new arcs (Translations object => array of couples)
    translated_imported_links = []
    add_link = translated_imported_links.append
    n_links_added = 0
    # the docstring allows an absent 'groupings' list (treated as empty)
    if 'groupings' in new_lists:
        # groupings.items maps subform_id => mainform_id
        for (y, x) in new_lists['groupings'].items.items():
            add_link((x, y))
            n_links_added += 1
    del new_lists

    # group_union: joins 2 different synonym-links lists into 1 new list
    new_links = group_union(previous_links, translated_imported_links)
    del previous_links
    del translated_imported_links

    n_links_after = len(new_links)

    merged_group = Translations([(y, x) for (x, y) in new_links])
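    # (merged_group.items now maps each subform id to its mainform id)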
    del new_links

    # ======== Overwrite old data with new =========

    merged_group.save(old_group_id)

    msg = "MERGE: groupings %i updated (links before/added/after: %i/%i/%i)" % (
        old_group_id, n_links_previous, n_links_added, n_links_after)
    my_log.append(msg)
    print(msg)

    # ======== Target list(s) append data =========
    # an ngram resolved to list 2 (map) is written to both targets [1,2]
    # an ngram resolved to list 0 or 1 goes to its single target [0] or [1]

    merged_results = {
        'stop': UnweightedList(),
        'main': UnweightedList(),
        'map': UnweightedList()
    }

    for (ng_id, winner_lid) in resolved_memberships.items():

        ## 1) using the new groups
        # normal case if not a subform
        if ng_id not in merged_group.items:
            target_lid = winner_lid
        # inherit case if is a subform
        else:
            mainform_id = merged_group.items[ng_id]
            # inherited winner
            try:
                target_lid = resolved_memberships[mainform_id]
            except KeyError:
                target_lid = winner_lid
                print("MERGE: WARN ng_id %i has incorrect mainform %i ?" %
                      (ng_id, mainform_id))

        ## 2) map => map + main
        if target_lid == 2:
            todo_lids = [1, 2]
        else:
            todo_lids = [target_lid]

        ## 3) storage
        for lid in todo_lids:
            list_type = linfos[lid]['key']
            merged_results[list_type].items.add(ng_id)

    # print("IMPORT: added %i elements in the lists indices" % added_nd_ng)

    # ======== Overwrite old data with new =========
    for lid, info in enumerate(linfos):
        tgt_id = tgt_nodeids[lid]
        list_type = info['key']
        result = merged_results[list_type]
        result.save(tgt_id)

        msg = "MERGE: %s %i updated (new size: %i)" % (
            info['name'], tgt_id, len(merged_results[list_type].items))
        my_log.append(msg)
        print(msg)

    # return a log
    return ("\n".join(my_log))