def corpus_list(corpus_id,
                list_types=ALL_LIST_TYPES,
                with_synonyms=False,
                with_count=False):
    # Link between a GROUPLIST, a normal form (ngram1), and a synonym (ngram2)
    NNN = NodeNgramNgram

    # Get the list type from the Node type -- as in CSV export
    list_type = (case([(Node.typename == 'MAINLIST', 'main'),
                       (Node.typename == 'MAPLIST', 'map'),
                       (Node.typename == 'STOPLIST', 'stop')]).label('type'))

    # We will retrieve each ngram as the following tuple:
    entities = (list_type, Ngram.terms.label('ng'))

    if with_count:
        entities += (Ngram.id.label('id'), )

    # First, get ngrams from wanted lists
    ngrams = _ngrams(corpus_id, list_types, entities)

    # Secondly, exclude "synonyms" (grouped ngrams that are not normal forms).
    # We have to exclude synonyms first because data is inconsistent and some
    # of them can be both in GROUPLIST and in MAIN/MAP/STOP lists. We want to
    # take synonyms from GROUPLIST only -- see below.
    Groups = aliased(Node, name='groups')
    query = (ngrams.outerjoin(
        Groups, (Groups.parent_id == corpus_id) &
        (Groups.typename == 'GROUPLIST')).outerjoin(
            NNN, (NNN.node_id == Groups.id) &
            (NNN.ngram2_id == Ngram.id)).filter(NNN.ngram1_id == None))

    # If `with_synonyms` is True, add them from GROUPLIST: this is the reliable
    # source for them
    if with_synonyms:
        Synonym = aliased(Ngram)
        ent = (list_type, Synonym.terms.label('ng'), Synonym.id.label('id'))
        synonyms = (ngrams.with_entities(*ent).filter(
            NNN.ngram1_id == Ngram.id, NNN.ngram2_id == Synonym.id,
            NNN.node_id == Groups.id, Groups.parent_id == corpus_id,
            Groups.typename == 'GROUPLIST'))
        query = query.union(synonyms)

    # Again, data is inconsistent: MAINLIST may intersect with MAPLIST and
    # we don't want that
    if 'main' in list_types and 'map' not in list_types:
        # Exclude MAPLIST ngrams from MAINLIST
        query = query.except_(_ngrams(corpus_id, 'map', entities))

    if with_count:
        N = query.subquery()
        return (session.query(N.c.type, N.c.ng, NodeNodeNgram.score).join(
            Node, (Node.parent_id == corpus_id) &
            (Node.typename == 'OCCURRENCES')).outerjoin(
                NodeNodeNgram, (NodeNodeNgram.ngram_id == N.c.id) &
                (NodeNodeNgram.node1_id == Node.id) &
                (NodeNodeNgram.node2_id == corpus_id)))

    # Return found ngrams sorted by list type, and then alphabetically
    return query.order_by('type', 'ng')
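
# A minimal standalone sketch (hypothetical in-memory data, no DB) of the same
# list-composition logic as corpus_list above: synonyms are kept out of the
# normal output, 'main' excludes 'map' when the map list is not requested, and
# with_synonyms=True re-adds each synonym under its mainform's list type.
def compose_lists(main, map_, stop, groups,
                  list_types=('main', 'map', 'stop'), with_synonyms=False):
    """main/map_/stop: sets of terms; groups: {mainform: set of synonyms}."""
    synonyms = {s for syns in groups.values() for s in syns}
    wanted = {'map': map_, 'stop': stop,
              'main': main - (map_ if 'map' not in list_types else set())}
    out = [(lt, t) for lt in list_types for t in wanted[lt] if t not in synonyms]
    if with_synonyms:
        type_of = {t: lt for lt in list_types for t in wanted[lt]}
        out += [(type_of[m], s) for m, syns in groups.items()
                if m in type_of for s in syns]
    return sorted(out)

# compose_lists(main={'cat', 'dog'}, map_={'cat'}, stop={'the'},
#               groups={'cat': {'cats'}}, list_types=('main', 'stop'))
# -> [('main', 'dog'), ('stop', 'the')]   ('cat' excluded: it is in the map list)
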
def query_groups(groupings_id, details=False):
    """
    Listing of couples (mainform,   subform)
                 aka   (ngram1_id, ngram2_id)

    Parameter:
      - details: if False, just send the array of couples
                 if True, send quadruplets with (ngram1_id, term1, ngram2_id, term2)
    """
    if not details:
        # simple contents
        query = session.query(NodeNgramNgram.ngram1_id,
                              NodeNgramNgram.ngram2_id)
    else:
        # detailed contents (id + terms)
        Ngram1 = aliased(Ngram)
        Ngram2 = aliased(Ngram)
        query = (session.query(
            NodeNgramNgram.ngram1_id,
            Ngram1.terms,
            NodeNgramNgram.ngram2_id,
            Ngram2.terms,
        ).join(Ngram1, NodeNgramNgram.ngram1_id == Ngram1.id).join(
            Ngram2, NodeNgramNgram.ngram2_id == Ngram2.id))

    # main filter
    # -----------
    query = query.filter(NodeNgramNgram.node_id == groupings_id)

    return query
def nodes(parent=None,
          group_by='typename',
          order_by='typename',
          has_child='check'):
    if group_by or has_child is not None:
        select = [
            func.min(Node.id).label('id'),
            func.min(Node.name).label('name'),
            func.min(Node.typename).label('typename'),
            func.count(Node.id).label('cnt')
        ]
    else:
        select = [
            Node.id.label('id'),
            Node.name.label('name'),
            Node.typename.label('typename'),
            literal_column('1').label('cnt')
        ]

    if has_child is not None:
        N = aliased(Node)
        select.append(func.count(N.id).label('children'))
    else:
        select.append(literal_column('NULL').label('children'))

    parent_id = getattr(parent, 'id', parent)
    q = session.query(*select).filter_by(parent_id=parent_id) \
               .group_by(getattr(Node, group_by if group_by else 'id'))

    if has_child is not None:
        q = q.outerjoin(N, N.parent_id == Node.id).group_by(N.parent_id)

    return q.order_by(order_by)
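
# Hypothetical usage sketch for nodes() (relies on the session/models of this
# module, shown for illustration only):
#
#     for row in nodes(parent=project_node):
#         print(row.typename, row.cnt, row.children)
#
# With the default group_by='typename', each row aggregates the children of
# `parent` that share a typename; `children` counts their own sub-nodes.
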
def query_groups(groupings_id, details=False, sort=False):
    """
    Listing of couples (mainform,   subform)
                 aka   (ngram1_id, ngram2_id)

    Parameter:
      - details: if False, just send the array of couples
                 if True, send quadruplets with (ngram1_id, term1, ngram2_id, term2)
      - sort: order results by terms of ngram1 then ngram2
    """
    if details or sort:
        Ngram1, Ngram2 = Ngram, aliased(Ngram)

    if not details:
        # simple contents
        columns = (NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id)
    else:
        # detailed contents (id + terms)
        columns = (Ngram1.id, Ngram1.terms, Ngram2.id, Ngram2.terms)

    query = session.query(*columns)

    if details or sort:
        query = (query.join(Ngram1,
                            NodeNgramNgram.ngram1_id == Ngram1.id).join(
                                Ngram2, NodeNgramNgram.ngram2_id == Ngram2.id))

    if sort:
        query = query.order_by(Ngram1.terms, Ngram2.terms)

    # main filter
    # -----------
    query = query.filter(NodeNgramNgram.node_id == groupings_id)

    return query
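
# Hypothetical usage sketch for query_groups (same session/models as above):
#
#     for ng1_id, term1, ng2_id, term2 in query_groups(grouplist_id,
#                                                      details=True, sort=True):
#         print("%s  <-  %s" % (term1, term2))   # mainform <- subform
#
# With details=False only the (ngram1_id, ngram2_id) couples are returned.
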
def contacts(self):
    """get all contacts in relation with the user"""
    Friend = aliased(User)
    query = (session
        .query(Friend)
        .join(Contact, Contact.user2_id == Friend.id)
        .filter(Contact.user1_id == self.id)
    )
    return query.all()
    def get(self, request, corpus_id, doc_id):
        """
        Get all ngrams for a doc id, sorted by list

        usual route: /annotations/documents/<docid>

        NB1 : we are within a doc only
        NB2 : MAINLIST items are actually MAINLIST without MAP items
        NB3 : mostly the mainforms are in the lists, but a doc can contain subforms
              => if we simply joined on ngram_id, we would filter out the subforms
              => so we join on a value filled by a case switch:
                    (the ngram itself, or its mainform if one exists)
        """
        corpus_id = int(corpus_id)
        doc_id = int(doc_id)

        # our results: ngrams within a doc and a list + weights in the doc
        doc_ngram_list = []
        doc_ngram_list_add = doc_ngram_list.append
        lists = {}

        corpus_nod = cache.Node[corpus_id]
        doc_nod = cache.Node[doc_id]
        # scores_nod = corpus_nod.children(typename="OCCURRENCES").first()
        groups_nod = corpus_nod.children(typename="GROUPLIST").first()

        # synonyms sub table for outerjoins
        Syno = (session.query(
            NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id).filter(
                NodeNgramNgram.node_id == groups_nod.id).subquery())

        # maplist_ids to filter map ngrams from mainlist
        maplist_ids = {}

        # NB must do mainlist after map for filtering map items out of main
        for list_type in ['MAPLIST', 'STOPLIST', 'MAINLIST']:
            list_nod = corpus_nod.children(typename=list_type).first()
            list_id = list_nod.id
            lists["%s" % list_id] = list_type

            ListsTable = aliased(NodeNgram)

            mainform_id = case([(Syno.c.ngram1_id != None, Syno.c.ngram1_id),
                                (Syno.c.ngram1_id == None, Ngram.id)])

            q = (session
                    # ngrams from the doc_id
                    .query(NodeNgram.weight, Ngram, mainform_id)
                    # debug
                    #.query(NodeNgram.weight, Ngram.terms, Ngram.id, Syno.c.ngram1_id, mainform_id)
                    .select_from(NodeNgram)
                    .join(Ngram)
                    .filter(NodeNgram.node_id == doc_id)

                    # add mainforms next to their subforms
                    .outerjoin(Syno,
                                Syno.c.ngram2_id == Ngram.id)

                    # filter mainforms on the list we want
                    .join(ListsTable,
                            #  possible that mainform is in list
                            #  and not the subform
                            ListsTable.ngram_id == mainform_id
                        )
                    .filter(ListsTable.node_id == list_id)
                )

            # add to results (and optional filtering)
            for (w, obj, mainform_id) in q.all():

                ngram_id = obj.id

                # boolean if needed
                # is_subform = (ngram_id != mainform_id)

                # special filtering case
                # when MAINLIST requested we actually want MAIN without MAP
                if list_type == "MAPLIST":
                    maplist_ids[ngram_id] = True
                if list_type == "MAINLIST":
                    if ngram_id in maplist_ids:
                        # skip object
                        continue

                if mainform_id == ngram_id:
                    group = None
                else:
                    group = mainform_id
                # normal case
                doc_ngram_list_add((ngram_id, obj.terms, group, w, list_id))

        # debug
        # print("annotations.views.NgramList.doc_ngram_list: ", doc_ngram_list)
        data = {
            '%s' % corpus_id: {
                '%s' % doc_id: [
                    {
                        'uuid': ngram_id,
                        'group': group,  # the mainform if there is a group
                        'text': ngram_text,
                        'occs': ngram_occs,
                        'list_id': list_id,
                    } for (ngram_id, ngram_text, group, ngram_occs,
                           list_id) in doc_ngram_list
                ],
                'lists':
                lists
            }
        }

        # alternative format for transmitting the "annotations", sorted by list then ngram_id
        # { 'corpus_id' : {
        #    list_id_stop: {term_stop1: {term_data}, term_stop2: {term_data}..},
        #    list_id_miam: {term_miam1: {term_data}, term_miam2: {term_data}..},
        #    list_id_map:  {term_map1:  {term_data}, term_map2:  {term_data}..},
        #   }
        #   'lists' : {"list_id" : "list_type" ... }
        # }

        # NB 3rd possibility: the uniqueness of ngram_text could also allow us to use it
        #    as the key, which could speed up later lookups (frequent checks whether a term exists)
        return Response(data)
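
# Minimal standalone sketch (hypothetical ids, no DB) of the "mainform if one
# exists, else the ngram itself" switch that the SQL `case` above implements,
# together with the MAP-before-MAIN filtering done in the result loop.
synonyms = {101: 100}            # subform_id -> mainform_id (from GROUPLIST)
maplist, mainlist = {100}, {100, 102}

def mainform_of(ngram_id):
    return synonyms.get(ngram_id, ngram_id)

doc_ngrams = [101, 102]          # ngrams actually present in the document
for list_type, wanted in (("MAPLIST", maplist), ("MAINLIST", mainlist - maplist)):
    for ng in doc_ngrams:
        if mainform_of(ng) in wanted:
            group = None if mainform_of(ng) == ng else mainform_of(ng)
            print(list_type, ng, "group:", group)
# MAPLIST 101 group: 100    (subform kept because its mainform is in the map list)
# MAINLIST 102 group: None
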
def do_maplist(corpus,
               overwrite_id=None,
               mainlist_id=None,
               specclusion_id=None,
               genclusion_id=None,
               grouplist_id=None,
               limit=DEFAULT_MAPLIST_MAX,
               genclusion_part=DEFAULT_MAPLIST_GENCLUSION_RATIO,
               monograms_part=DEFAULT_MAPLIST_MONOGRAMS_RATIO):
    '''
    According to Genericity/Specificity and mainlist

    Parameters:
      - mainlist_id (starting point, already cleaned of stoplist terms)
      - specclusion_id (ngram inclusion by cooc specificity -- ranking factor)
      - genclusion_id (ngram inclusion by cooc genericity -- ranking factor)
      - grouplist_id (filtering grouped ones)
      - overwrite_id: optional id of a preexisting MAPLIST node to overwrite

      + 3 params to modulate the terms choice
        - limit for the amount of picked terms
        - monograms_part: a ratio of terms with only one lexical unit to keep
                          (multigrams quota = limit * (1-monograms_part))
        - genclusion_part: the share of terms to pick via the genericity ranking
                           (speclusion quota = limit * (1-genclusion_part))
    '''

    if not (mainlist_id and specclusion_id and genclusion_id and grouplist_id):
        raise ValueError(
            "Please provide mainlist_id, specclusion_id, genclusion_id and grouplist_id"
        )

    quotas = {'topgen': {}, 'topspec': {}}
    genclusion_limit = round(limit * genclusion_part)
    speclusion_limit = limit - genclusion_limit
    quotas['topgen']['monograms'] = round(genclusion_limit * monograms_part)
    quotas['topgen'][
        'multigrams'] = genclusion_limit - quotas['topgen']['monograms']
    quotas['topspec']['monograms'] = round(speclusion_limit * monograms_part)
    quotas['topspec'][
        'multigrams'] = speclusion_limit - quotas['topspec']['monograms']

    print("MAPLIST quotas:", quotas)

    #dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)

    MainlistTable = aliased(NodeNgram)

    IsSubform = (session
                            # we want only secondary terms (ngram2)
                            # to be able to filter them out
                            .query(NodeNgramNgram.ngram2_id)
                            .filter(NodeNgramNgram.node_id == grouplist_id)
                            .subquery()
                         )

    ScoreSpec = aliased(NodeNgram)
    ScoreGen = aliased(NodeNgram)

    # ngram with both ranking factors spec and gen
    query = (
        session.query(ScoreSpec.ngram_id, ScoreSpec.weight,
                      ScoreGen.weight, Ngram.n).join(
                          Ngram, Ngram.id == ScoreSpec.ngram_id).join(
                              ScoreGen,
                              ScoreGen.ngram_id == ScoreSpec.ngram_id).filter(
                                  ScoreSpec.node_id == specclusion_id).filter(
                                      ScoreGen.node_id == genclusion_id)

        # we want only terms within mainlist
        .join(MainlistTable, Ngram.id == MainlistTable.ngram_id).filter(
            MainlistTable.node_id == mainlist_id)

        # we remove all ngrams matching an ngram2_id from the synonyms
        .outerjoin(IsSubform,
                   IsSubform.c.ngram2_id == ScoreSpec.ngram_id).filter(
                       IsSubform.c.ngram2_id == None)

        # specificity-ranked
        .order_by(desc(ScoreSpec.weight)))

    # format in scored_ngrams array:
    # -------------------------------
    # [(37723,    8.428, 14.239,   3    ),   etc]
    #   ngramid   wspec   wgen    nwords
    scored_ngrams = query.all()
    n_ngrams = len(scored_ngrams)

    if n_ngrams == 0:
        raise ValueError("No ngrams in cooc table ?")
        #return
    # results, with same structure as quotas
    chosen_ngrams = {
        'topgen': {
            'monograms': [],
            'multigrams': []
        },
        'topspec': {
            'monograms': [],
            'multigrams': []
        }
    }

    # specificity and genericity are rather reverse-correlated
    # but occasionally they can have common ngrams (same ngram well ranked in both)
    # => we'll use a lookup table to check if we didn't already get it
    already_gotten_ngramids = {}

    # 2 loops to fill spec-clusion then gen-clusion quotas
    #   (1st loop uses order from DB, 2nd loop uses our own sort at end of 1st)
    for rkr in ['topspec', 'topgen']:
        got_enough_mono = False
        got_enough_multi = False
        all_done = False
        i = -1
        while ((not all_done)
               and (not (got_enough_mono and got_enough_multi))):
            # retrieve sorted ngram number i
            i += 1
            (ng_id, wspec, wgen, nwords) = scored_ngrams[i]

            # before any continue case, we check the next i for max reached
            all_done = (i + 1 >= n_ngrams)

            if ng_id in already_gotten_ngramids:
                continue

            # NB: nwords could be replaced by a simple search on r' '
            if nwords == 1:
                if got_enough_mono:
                    continue
                else:
                    # add ngram to results and lookup
                    chosen_ngrams[rkr]['monograms'].append(ng_id)
                    already_gotten_ngramids[ng_id] = True
            # multi
            else:
                if got_enough_multi:
                    continue
                else:
                    # add ngram to results and lookup
                    chosen_ngrams[rkr]['multigrams'].append(ng_id)
                    already_gotten_ngramids[ng_id] = True

            got_enough_mono = (len(chosen_ngrams[rkr]['monograms']) >=
                               quotas[rkr]['monograms'])
            got_enough_multi = (len(chosen_ngrams[rkr]['multigrams']) >=
                                quotas[rkr]['multigrams'])

        # at the end of the first loop we just need to sort all by the second ranker (gen)
        scored_ngrams = sorted(scored_ngrams,
                               key=lambda ng_infos: ng_infos[2],
                               reverse=True)

    obtained_spec_mono = len(chosen_ngrams['topspec']['monograms'])
    obtained_spec_multi = len(chosen_ngrams['topspec']['multigrams'])
    obtained_gen_mono = len(chosen_ngrams['topgen']['monograms'])
    obtained_gen_multi = len(chosen_ngrams['topgen']['multigrams'])
    obtained_total = obtained_spec_mono   \
                    + obtained_spec_multi \
                    + obtained_gen_mono   \
                    + obtained_gen_multi
    print("MAPLIST: top_spec_monograms =", obtained_spec_mono)
    print("MAPLIST: top_spec_multigrams =", obtained_spec_multi)
    print("MAPLIST: top_gen_monograms =", obtained_gen_mono)
    print("MAPLIST: top_gen_multigrams =", obtained_gen_multi)
    print("MAPLIST: kept %i ngrams in total " % obtained_total)

    obtained_data = chosen_ngrams['topspec']['monograms']      \
                    + chosen_ngrams['topspec']['multigrams']   \
                    + chosen_ngrams['topgen']['monograms']     \
                    + chosen_ngrams['topgen']['multigrams']

    # NEW MAPLIST NODE
    # -----------------
    # saving the parameters of the analysis in the Node JSON
    new_hyperdata = {
        'corpus': corpus.id,
        'limit': limit,
        'monograms_part': monograms_part,
        'genclusion_part': genclusion_part,
    }
    if overwrite_id:
        # overwrite pre-existing node
        the_maplist = cache.Node[overwrite_id]
        the_maplist.hyperdata = new_hyperdata
        the_maplist.save_hyperdata()
        session.commit()
        the_id = overwrite_id
    else:
        # create a new maplist node
        the_maplist = corpus.add_child(name="Maplist (in %i)" % corpus.id,
                                       typename="MAPLIST",
                                       hyperdata=new_hyperdata)
        session.add(the_maplist)
        session.commit()
        the_id = the_maplist.id

    # create UnweightedList object and save (=> new NodeNgram rows)
    datalist = UnweightedList(obtained_data)

    # save
    datalist.save(the_id)

    # dbg.show('MapList computed')

    return the_id
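
# Worked example of the quota arithmetic used at the top of do_maplist
# (pure Python, no DB; the numbers below are hypothetical, not the module's
# DEFAULT_* constants):
limit, genclusion_part, monograms_part = 100, 0.5, 0.2
genclusion_limit = round(limit * genclusion_part)            # 50
speclusion_limit = limit - genclusion_limit                  # 50
example_quotas = {
    'topgen':  {'monograms': round(genclusion_limit * monograms_part)},   # 10
    'topspec': {'monograms': round(speclusion_limit * monograms_part)},   # 10
}
example_quotas['topgen']['multigrams'] = genclusion_limit - example_quotas['topgen']['monograms']     # 40
example_quotas['topspec']['multigrams'] = speclusion_limit - example_quotas['topspec']['monograms']   # 40
print(example_quotas)
# {'topgen': {'monograms': 10, 'multigrams': 40}, 'topspec': {'monograms': 10, 'multigrams': 40}}
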
def countCooccurrences(corpus_id=None,
                       cooc_id=None,
                       field1='ngrams',
                       field2='ngrams',
                       start=None,
                       end=None,
                       mapList_id=None,
                       groupList_id=None,
                       distance=None,
                       bridgeness=None,
                       n_min=1,
                       n_max=None,
                       limit=1000,
                       isMonopartite=True,
                       threshold=3,
                       save_on_db=True,
                       reset=True):
    '''
    Compute the cooccurrence matrix and save it, returning NodeNgramNgram.node_id.
    For the moment, lists of parameters are not supported because lists need to
    be merged beforehand.
    corpus           :: Corpus

    mapList_id       :: Int
    groupList_id     :: Int

    start :: TimeStamp -- example: '2010-05-30 02:00:00+02'
    end   :: TimeStamp
    limit :: Int

    '''
    # FIXME remove the lines below after factorization of parameters
    parameters = dict()
    parameters['field1'] = field1
    parameters['field2'] = field2

    # Get corpus as Python object
    corpus = session.query(Node).filter(Node.id == corpus_id).first()

    # Get node of the Graph
    if not cooc_id:

        cooc_id = (session.query(Node.id).filter(
            Node.typename == "COOCCURRENCES", Node.name == "GRAPH EXPLORER",
            Node.parent_id == corpus.id).first())
        if not cooc_id:
            coocNode = corpus.add_child(typename="COOCCURRENCES",
                                        name="GRAPH (in corpus %s)" %
                                        corpus.id)

            session.add(coocNode)
            session.commit()
            cooc_id = coocNode.id
        else:
            cooc_id = int(cooc_id[0])

    # when cooc_id preexisted, but we want to continue  (reset = True)
    #    (to give new contents to this cooc_id)
    elif reset:
        print("GRAPH #%s ... Counting new cooccurrences data." % cooc_id)
        session.query(NodeNgramNgram).filter(
            NodeNgramNgram.node_id == cooc_id).delete()
        session.commit()

    # when cooc_id preexisted and we just want to load it (reset = False)
    else:
        print("GRAPH #%s ... Loading cooccurrences computed already." %
              cooc_id)
        cooc = session.query(NodeNgramNgram.ngram1_id,
                             NodeNgramNgram.ngram2_id,
                             NodeNgramNgram.weight).filter(
                                 NodeNgramNgram.node_id == cooc_id).all()
        return (int(cooc_id), WeightedMatrix(cooc))

    NodeNgramX = aliased(NodeNgram)

    # Simple Cooccurrences
    cooc_score = func.count(NodeNgramX.node_id).label('cooc_score')

    # A kind of Euclidean distance cooccurrences
    #cooc_score = func.sqrt(func.sum(NodeNgramX.weight * NodeNgramY.weight)).label('cooc_score')

    if isMonopartite:
        NodeNgramY = aliased(NodeNgram)

        cooc_query = (session.query(
            NodeNgramX.ngram_id, NodeNgramY.ngram_id,
            cooc_score).join(Node, Node.id == NodeNgramX.node_id).join(
                NodeNgramY, NodeNgramY.node_id == Node.id).filter(
                    Node.parent_id == corpus.id, Node.typename == "DOCUMENT"))
    else:
        NodeNgramY = aliased(NodeNgram)

        cooc_query = (session.query(
            NodeHyperdataNgram.ngram_id, NodeNgramY.ngram_id,
            cooc_score).join(Node, Node.id == NodeHyperdataNgram.node_id).join(
                NodeNgramY, NodeNgramY.node_id == Node.id).join(
                    Hyperdata,
                    Hyperdata.id == NodeHyperdataNgram.hyperdata_id).filter(
                        Node.parent_id == corpus.id,
                        Node.typename == "DOCUMENT").filter(
                            Hyperdata.name == field1))

    # Size of the ngrams between n_min and n_max
    if n_min is not None or n_max is not None:
        if isMonopartite:
            NgramX = aliased(Ngram)
            cooc_query = cooc_query.join(NgramX,
                                         NgramX.id == NodeNgramX.ngram_id)

        NgramY = aliased(Ngram)
        cooc_query = cooc_query.join(NgramY, NgramY.id == NodeNgramY.ngram_id)

    if n_min is not None:
        cooc_query = (cooc_query.filter(NgramY.n >= n_min))
        if isMonopartite:
            cooc_query = cooc_query.filter(NgramX.n >= n_min)

    if n_max is not None:
        cooc_query = (cooc_query.filter(NgramY.n <= n_max))
        if isMonopartite:
            cooc_query = cooc_query.filter(NgramX.n <= n_max)

    # Cooc between the dates start and end
    if start is not None:
        #date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
        # TODO : more precise date format here (the day is currently the smallest grain).
        date_start = datetime.strptime(str(start), "%Y-%m-%d")
        date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S")

        Start = aliased(NodeHyperdata)
        cooc_query = (cooc_query.join(Start, Start.node_id == Node.id).filter(
            Start.key == 'publication_date').filter(
                Start.value_utc >= date_start_utc))

        parameters['start'] = date_start_utc

    if end is not None:
        # TODO : more precise date format here (the day is currently the smallest grain).
        date_end = datetime.strptime(str(end), "%Y-%m-%d")
        date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")

        End = aliased(NodeHyperdata)

        cooc_query = (cooc_query.join(End, End.node_id == Node.id).filter(
            End.key == 'publication_date').filter(
                End.value_utc <= date_end_utc))

        parameters['end'] = date_end_utc

    if isMonopartite:
        # Cooc is symmetric, take only the main cooccurrences and cut at the limit
        cooc_query = cooc_query.filter(
            NodeNgramX.ngram_id < NodeNgramY.ngram_id)

    cooc_query = cooc_query.having(cooc_score >= threshold)

    if isMonopartite:
        cooc_query = cooc_query.group_by(NodeNgramX.ngram_id,
                                         NodeNgramY.ngram_id)
    else:
        cooc_query = cooc_query.group_by(NodeHyperdataNgram.ngram_id,
                                         NodeNgramY.ngram_id)

    # Order according to some scores
    # If ordering is really needed, use Ordered Index (faster)
    #cooc_query = cooc_query.order_by(desc('cooc_score'))

    matrix = WeightedMatrix(cooc_query)

    print("GRAPH #%s Filtering the matrix with Map and Group Lists." % cooc_id)
    cooc = filterMatrix(matrix, mapList_id, groupList_id)

    parameters['MapList_id'] = str(mapList_id)
    parameters['GroupList_id'] = str(groupList_id)

    # TODO factorize savings on db
    if save_on_db:
        # Saving the cooccurrences
        cooc.save(cooc_id)
        print("GRAPH #%s ... Node Cooccurrence Matrix saved" % cooc_id)

        # Saving the parameters
        print("GRAPH #%s ... Parameters saved in Node." % cooc_id)
        coocNode = session.query(Node).filter(Node.id == cooc_id).first()

        coocNode.hyperdata["parameters"] = dict()
        coocNode.hyperdata["parameters"] = parameters
        coocNode.save_hyperdata()
        session.commit()

        #data = cooc2graph(coocNode.id, cooc, distance=distance, bridgeness=bridgeness)
    else:
        return cooc

    return (coocNode.id, cooc)
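
# Hypothetical usage sketch for countCooccurrences (relies on the session and
# models of this module; the ids are placeholders):
#
#     cooc_id, cooc_matrix = countCooccurrences(corpus_id=corpus.id,
#                                               mapList_id=maplist_id,
#                                               groupList_id=grouplist_id,
#                                               start='2015-01-01',
#                                               end='2016-01-01',
#                                               threshold=3,
#                                               save_on_db=True)
#
# The return shape depends on the flags: with a preexisting cooc_id and
# reset=False the stored matrix is loaded and returned as (cooc_id,
# WeightedMatrix); with save_on_db=False only the filtered matrix is returned.
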
def get_graph(request=None,
              corpus=None,
              field1='ngrams',
              field2='ngrams',
              mapList_id=None,
              groupList_id=None,
              cooc_id=None,
              type='node_link',
              start=None,
              end=None,
              distance='conditional',
              bridgeness=5,
              threshold=1,
              isMonopartite=True,
              saveOnly=True):
    '''
    Get_graph : main steps:
    0) Check the parameters

    get_graph :: GraphParameters -> Either (Dic Nodes Links) (Dic State Length)
        where type Length = Int

    get_graph first checks the parameters and return either graph data or a dict with
    state "type" with an integer to indicate the size of the parameter
    (maybe we could add a String in that step to factor and give here the error message)

    1) compute_graph (see function above)
    2) return graph

    '''
    overwrite_node_contents = False

    # Case where the graph has already been computed
    if cooc_id is not None:
        print("GRAPH#%d ... Loading data already computed." % int(cooc_id))
        node = session.query(Node).filter(Node.id == cooc_id).first()

        # Structure of the Node.hyperdata[distance][bridgeness]
        # All parameters (but distance and bridgeness)
        # are in Node.hyperdata["parameters"]

        # Check distance of the graph
        if node.hyperdata.get(distance, None) is not None:
            graph = node.hyperdata[distance]

            # Check bridgeness of the graph
            if graph.get(str(bridgeness), None) is not None:
                return graph[str(bridgeness)]

    # new graph: we give it an empty node with new id and status
    elif saveOnly:
        # NB: we do creation already here (instead of same in countCooccurrences)
        #     to guarantee a unique ref id to the saveOnly graph (async generation)
        new_node = corpus.add_child(typename="COOCCURRENCES",
                                    name="GRAPH (in corpus %s)" % corpus.id)

        session.add(new_node)
        session.commit()
        cooc_id = new_node.id
        cooc_name = new_node.name
        cooc_date = new_node.date
        # and the empty content will need redoing by countCooccurrences
        overwrite_node_contents = True
        print("GRAPH #%d ... Created new empty data node for saveOnly" %
              int(cooc_id))

    # Case where the graph has not been computed yet
    # First, check the parameters

    # Case of mapList not big enough
    # ==============================

    # if we do not have any mapList_id already
    if mapList_id is None:
        mapList_id = session.query(
            Node.id).filter(Node.typename == "MAPLIST").first()[0]

    mapList_size = session.query(NodeNgram).filter(
        NodeNgram.node_id == mapList_id).count()

    if mapList_size < graph_constraints['mapList']:
        # Do not compute the graph if mapList is not big enough
        return {'state': "mapListError", "length": mapList_size}

    # Instantiate query for case of corpus not big enough
    # ===================================================
    corpus_size_query = (session.query(Node).filter(
        Node.typename == "DOCUMENT").filter(Node.parent_id == corpus.id))

    # Filter corpus by date if any start date
    # ---------------------------------------
    if start is not None:
        #date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
        date_start = datetime.strptime(str(start), "%Y-%m-%d")
        date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S")

        Start = aliased(NodeHyperdata)
        corpus_size_query = (corpus_size_query.join(
            Start, Start.node_id == Node.id).filter(
                Start.key == 'publication_date').filter(
                    Start.value_utc >= date_start_utc))

    # Filter corpus by date if any end date
    # -------------------------------------
    if end is not None:
        date_end = datetime.strptime(str(end), "%Y-%m-%d")
        date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")

        End = aliased(NodeHyperdata)

        corpus_size_query = (corpus_size_query.join(
            End, End.node_id == Node.id).filter(
                End.key == 'publication_date').filter(
                    End.value_utc <= date_end_utc))

    # Finally, test whether the corpus is big enough
    # --------------------------------
    corpus_size = corpus_size_query.count()

    if saveOnly is not None and saveOnly == "True":
        scheduled(compute_graph)(
            corpus_id=corpus.id,
            cooc_id=cooc_id
            #, field1="ngrams", field2="ngrams"
            ,
            start=start,
            end=end,
            mapList_id=mapList_id,
            groupList_id=groupList_id,
            isMonopartite=True,
            threshold=threshold,
            distance=distance,
            bridgeness=bridgeness,
            save_on_db=True,
            reset=overwrite_node_contents
            #, limit=size
        )

        return {
            "state": "saveOnly",
            "target_id": cooc_id,
            "target_name": cooc_name,
            "target_date": cooc_date
        }

    elif corpus_size > graph_constraints['corpusMax']:
        # Then compute cooc asynchronously with celery
        scheduled(compute_graph)(
            corpus_id=corpus.id,
            cooc_id=cooc_id
            #, field1="ngrams", field2="ngrams"
            ,
            start=start,
            end=end,
            mapList_id=mapList_id,
            groupList_id=groupList_id,
            isMonopartite=True,
            threshold=threshold,
            distance=distance,
            bridgeness=bridgeness,
            save_on_db=True,
            reset=overwrite_node_contents
            #, limit=size
        )
        # Dict to inform user that corpus maximum is reached
        # then graph is computed asynchronously
        return {"state": "corpusMax", "length": corpus_size}

    elif corpus_size <= graph_constraints['corpusMin']:
        # Do not compute the graph if corpus is not big enough
        return {"state": "corpusMin", "length": corpus_size}

    else:
        # If graph_constraints are ok then compute the graph in live
        data = compute_graph(
            corpus_id=corpus.id,
            cooc_id=cooc_id
            #, field1="ngrams", field2="ngrams"
            ,
            start=start,
            end=end,
            mapList_id=mapList_id,
            groupList_id=groupList_id,
            isMonopartite=True,
            threshold=threshold,
            distance=distance,
            bridgeness=bridgeness,
            save_on_db=True,
            reset=overwrite_node_contents
            #, limit=size
        )

    # case when 0 coocs are observed (usually b/c not enough ngrams in maplist)

    if len(data) == 0:
        print("GRAPH #   ... GET_GRAPH: 0 coocs in matrix")
        data = {'nodes': [], 'links': []}  # empty data

    return data
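
# For reference, the non-graph return values of get_graph are plain state
# dicts (the values shown here are hypothetical):
#
#     {'state': 'mapListError', 'length': 12}        # map list too small
#     {'state': 'corpusMin',    'length': 7}         # corpus too small
#     {'state': 'corpusMax',    'length': 250000}    # computed asynchronously
#     {'state': 'saveOnly', 'target_id': 123,
#      'target_name': 'GRAPH (in corpus 42)', 'target_date': '...'}
#
# Any other return value is the graph itself, e.g. {'nodes': [...], 'links': [...]}.
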
def compute_tfidf_local(corpus,
                        on_list_id=None,
                        groupings_id=None,
                        overwrite_id=None):
    """
    Calculates tfidf similarity of each (doc, ngram) couple, within the current corpus

    Parameters:
      - the corpus itself
      - groupings_id: optional synonym relations to add all subform counts
                      with their mainform's counts
      - on_list_id: mainlist or maplist type, to constrain the input ngrams
      - overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
                   (the Node and its previous NodeNodeNgram rows will be replaced)
    """

    # All docs of this corpus
    docids_subquery = (session.query(
        Node.id).filter(Node.parent_id == corpus.id).filter(
            Node.typename == "DOCUMENT").subquery())

    # N
    total_docs = session.query(docids_subquery).count()

    # define the counted form
    if not groupings_id:
        ngform_id = NodeNgram.ngram_id
    else:
        Syno = (session.query(
            NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id).filter(
                NodeNgramNgram.node_id == groupings_id).subquery())

        ngform_id = case([(Syno.c.ngram1_id != None, Syno.c.ngram1_id),
                          (Syno.c.ngram1_id == None, NodeNgram.ngram_id)])

    # tf for each couple (number of rows = N docs X M ngrams)
    tf_doc_query = (
        session.query(
            ngform_id,
            NodeNgram.node_id,
            func.sum(NodeNgram.weight).label("tf"),  # tf: occurrences
        )

        # select within docs of current corpus
        .join(docids_subquery, docids_subquery.c.id == NodeNgram.node_id))

    if groupings_id:
        tf_doc_query = (tf_doc_query.outerjoin(
            Syno, Syno.c.ngram2_id == NodeNgram.ngram_id))
        # now when we group_by, the ngram2 freqs will be added to their ngram1

    if on_list_id:
        Miamlist = aliased(NodeNgram)
        tf_doc_query = (tf_doc_query.join(
            Miamlist, Miamlist.ngram_id == ngform_id).filter(
                Miamlist.node_id == on_list_id))

    # execute query to do our tf sum
    tf_per_doc = tf_doc_query.group_by(NodeNgram.node_id, ngform_id).all()

    # ex: [(128371, 9732, 1.0),
    #      (128383, 9740, 1.0),
    #      (128373, 9731, 1.0),
    #      (128376, 9734, 1.0),
    #      (128372, 9731, 1.0),
    #      (128383, 9733, 1.0),
    #      (128383, 9735, 1.0),
    #      (128389, 9734, 1.0),
    #      (8624, 9731, 1.0),
    #      (128382, 9740, 1.0),
    #      (128383, 9739, 1.0),
    #      (128383, 9736, 1.0),
    #      (128378, 9735, 1.0),
    #      (128375, 9733, 4.0),
    #      (128383, 9732, 1.0)]
    #        ^ ^     ^^    ^^
    #       ngram   doc   freq in this doc

    # simultaneously count docs with given term (number of rows = M ngrams)

    ndocswithngram = {}
    for triple in tf_per_doc:
        ng = triple[0]
        doc = triple[1]
        if ng in ndocswithngram:
            ndocswithngram[ng] += 1
        else:
            ndocswithngram[ng] = 1

    # print(ndocswithngram)

    # store for use in formula
    # { ngram_id => log(nd) }
    log_nd_lookup = {
        ng: log(nd_count)
        for (ng, nd_count) in ndocswithngram.items()
    }

    # ---------------------------------------------------------
    tfidfs = {}
    log_tot_docs = log(total_docs)
    for (ngram_id, node_id, tf) in tf_per_doc:
        log_nd = log_nd_lookup[ngram_id]
        # tfidfs[ngram_id] = tf * log(total_docs/nd)
        tfidfs[node_id, ngram_id] = tf * (log_tot_docs - log_nd)
    # ---------------------------------------------------------

    if overwrite_id:
        the_id = overwrite_id
        session.query(NodeNodeNgram).filter(
            NodeNodeNgram.node1_id == the_id).delete()
        session.commit()
    else:
        # create the new TFIDF-CORPUS node
        tfidf_node = corpus.add_child()
        tfidf_node.typename = "TFIDF-CORPUS"
        tfidf_node.name = "tfidf-sims-corpus (in:%s)" % corpus.id
        session.add(tfidf_node)
        session.commit()
        the_id = tfidf_node.id

    # reflect that in NodeNodeNgrams
    # £TODO replace bulk_insert by something like WeightedIndex.save()
    bulk_insert(NodeNodeNgram, ('node1_id', 'node2_id', 'ngram_id', 'score'),
                ((the_id, node_id, ngram_id, tfidfs[node_id, ngram_id])
                 for (node_id, ngram_id) in tfidfs))

    return the_id
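
# Worked example of the tf-idf formula used above (pure Python, no DB), on a
# few hypothetical (ngram_id, doc_id, tf) triples over a 3-document corpus:
from math import log

example_tf_per_doc = [(9731, 1, 2.0), (9731, 2, 1.0), (9732, 1, 1.0)]
example_total_docs = 3

# nd: number of docs containing each ngram
example_nd = {}
for ng, doc, tf in example_tf_per_doc:
    example_nd[ng] = example_nd.get(ng, 0) + 1

example_tfidfs = {(doc, ng): tf * (log(example_total_docs) - log(example_nd[ng]))
                  for (ng, doc, tf) in example_tf_per_doc}
print(example_tfidfs)
# {(1, 9731): 2*ln(3/2) ≈ 0.81, (2, 9731): ln(3/2) ≈ 0.41, (1, 9732): ln(3) ≈ 1.10}
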
def compute_ti_ranking(corpus,
                       groupings_id=None,
                       count_scope="local",
                       termset_scope="local",
                       overwrite_id=None):
    """
    Calculates the tfidf ranking within the given scope
                             ----------
       via the cumulated tfidf weight per ngram ng_i
       (or per mainform ng_i' if groups are given),
       summed across the docs d_j of the scope:

           rank(ng_i) = Sum_j(tf_ij) * ln(N / |{d_j : ng_i ∈ d_j}|)

    Parameters:
      - the corpus itself (or corpus_id)
      - groupings_id: optional id of a GROUPLIST node for these ngrams
                        IF absent the ti weights are the sums for each ngram
                        IF present they're the sums for each ngram's mainform

      - count_scope: {"local" or "global"}
         - local  <=> frequencies counted in the current corpus
         - global <=> frequencies counted in all corpora of this type

        when the count_scope is global, there is another parameter:
          - termset_scope: {"local" or "global"}
             - local <=> output list of terms limited to the current corpus
               (SELECT DISTINCT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
             - global <=> output list of terms found in global doc scope
                                                    !!!! (many more terms)

      - overwrite_id: optional id of a pre-existing XXXX node for this corpus
                   (the Node and its previous NodeNodeNgram rows will be replaced)
    """
    # validate string params
    if count_scope not in ["local", "global"]:
        raise ValueError(
            "compute_ti_ranking: count_scope param allowed values: 'local', 'global'"
        )
    if termset_scope not in ["local", "global"]:
        raise ValueError(
            "compute_ti_ranking: termset_scope param allowed values: 'local', 'global'"
        )
    if count_scope == "local" and termset_scope == "global":
        raise ValueError(
            "compute_ti_ranking: the termset_scope param can be 'global' iff count_scope param is 'global' too."
        )

    # get corpus
    if type(corpus) == int:
        corpus_id = corpus
        corpus = cache.Node[corpus_id]
    elif type(corpus) == str and match(r'\d+$', corpus):
        corpus_id = int(corpus)
        corpus = cache.Node[corpus_id]
    else:
        # assuming Node class
        corpus_id = corpus.id

    # prepare sqla mainform vs ngram selector
    ngform_i = None

    if not groupings_id:
        ngform_i = NodeNgram.ngram_id

    else:
        # prepare translations
        syno = (session.query(
            NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id).filter(
                NodeNgramNgram.node_id == groupings_id).subquery())
        # see the detailed comment in compute_occs() + TODO factorize

        ngform_i = case([(syno.c.ngram1_id != None, syno.c.ngram1_id),
                         (syno.c.ngram1_id == None, NodeNgram.ngram_id)
                         #     condition               value
                         ])

    # MAIN QUERY SKELETON
    tf_nd_query = (
        session.query(
            # NodeNgram.ngram_id
            # or similar if grouping ngrams under their mainform
            ngform_i.label("counted_ngform"),

            # the tfidf elements
            # ------------------
            func.sum(NodeNgram.weight),  # tf: same as occurrences
            # -----------------------
            func.count(NodeNgram.node_id)  # nd: n docs with term
            # --------------------
        ).group_by("counted_ngform")

        # count_scope to specify in which doc nodes to count
        # -----------
        # .join(countdocs_subquery,
        #       countdocs_subquery.c.id == NodeNgram.node_id)

        # optional termset_scope: if we'll restrict the ngrams
        #          -------------
        # .join(termset_subquery,
        #       termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)

        # optional translations to bring the subform's replacement
        #          ------------
        # .outerjoin(syno,
        #           syno.c.ngram2_id == NodeNgram.ngram_id)
    )

    # TUNING THE QUERY

    if groupings_id:
        tf_nd_query = tf_nd_query.outerjoin(
            syno, syno.c.ngram2_id == NodeNgram.ngram_id)

    # local <=> within this corpus
    if count_scope == "local":
        # All docs of this corpus
        countdocs_subquery = (session.query(
            Node.id).filter(Node.typename == "DOCUMENT").filter(
                Node.parent_id == corpus_id).subquery())

        # no need to independently restrict the ngrams
        tf_nd_query = tf_nd_query.join(
            countdocs_subquery, countdocs_subquery.c.id == NodeNgram.node_id)
        # ---

    # global <=> within all corpora of this source
    elif count_scope == "global":
        this_source_type = corpus.resources()[0]['type']

        CorpusNode = aliased(Node)

        # All docs **in all corpora of the same source**
        countdocs_subquery = (
            session.query(Node.id).filter(Node.typename == "DOCUMENT")

            # join on parent_id with selected corpora nodes
            .join(CorpusNode, CorpusNode.id == Node.parent_id).filter(
                CorpusNode.typename == "CORPUS")
            # TODO index corpus_sourcetype in DB
            .filter(CorpusNode.hyperdata['resources'][0]['type'].astext == str(
                this_source_type)).subquery())

        if termset_scope == "global":
            # both scopes are the same: no need to independently restrict the ngrams
            tf_nd_query = tf_nd_query.join(
                countdocs_subquery,
                countdocs_subquery.c.id == NodeNgram.node_id)
            # ---

        elif termset_scope == "local":

            # All unique terms...
            termset_subquery = (
                session.query(distinct(NodeNgram.ngram_id).label("uniq_ngid"))
                # ... in the original corpus
                .join(Node).filter(Node.typename == "DOCUMENT").filter(
                    Node.parent_id == corpus_id).subquery())

            # only case with independent restrictions on docs and terms
            tf_nd_query = (tf_nd_query.join(
                countdocs_subquery,
                countdocs_subquery.c.id == NodeNgram.node_id).join(
                    termset_subquery,
                    termset_subquery.c.uniq_ngid == NodeNgram.ngram_id))
            # ---

    # M
    total_docs = session.query(countdocs_subquery).count()
    log_tot_docs = log(total_docs)

    # result
    tf_nd = tf_nd_query.all()

    # -------------- "sommatoire" sur mot i ----------------
    tfidfsum = {}
    for (ngram_i, tf_i, nd_i) in tf_nd:
        # tfidfsum[ngram_i] = tf_i * log(total_docs/nd_i)
        tfidfsum[ngram_i] = tf_i * (log_tot_docs - log(nd_i))
    # ------------------------------------------------------

    # N, for information
    total_ngramforms = len(tfidfsum)

    if overwrite_id:
        the_id = overwrite_id
        session.query(NodeNodeNgram).filter(
            NodeNodeNgram.node1_id == the_id).delete()
        session.commit()
    else:
        # create the new TFIDF-XXXX node to get an id
        tir_nd = corpus.add_child()
        if count_scope == "local":
            tir_nd.typename = "TIRANK-CORPUS"
            tir_nd.name = "ti rank (%i ngforms in corpus:%s)" % (
                total_ngramforms, corpus_id)
        elif count_scope == "global":
            tir_nd.typename = "TIRANK-GLOBAL"
            tir_nd.name = "ti rank (%i ngforms %s in corpora of sourcetype:%s)" % (
                total_ngramforms, ("from corpus %i" % corpus_id) if
                (termset_scope == "local") else "", this_source_type)

        session.add(tir_nd)
        session.commit()
        the_id = tir_nd.id

    # TODO 1 discuss use and find new typename
    # TODO 2 release these 2 typenames TFIDF-CORPUS and TFIDF-GLOBAL
    # TODO 3 recreate them elsewhere in their sims (WeightedIndex) version
    # TODO 4 requalify this here as a NodeNgram
    # then TODO 5 use WeightedList.save() !

    # reflect that in NodeNodeNgrams
    bulk_insert(NodeNodeNgram, ('node1_id', 'node2_id', 'ngram_id', 'score'),
                ((the_id, corpus_id, ng, tfidfsum[ng]) for ng in tfidfsum))

    return the_id
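
# Worked example of the ranking sum above (pure Python, no DB): the score of a
# counted form i is Sum_j(tf_ij) * ln(N / nd_i). Hypothetical rows of
# (ngram_id, summed tf, n docs with the term), with N = 4 counted docs:
from math import log

example_tf_nd = [(9731, 5.0, 2), (9732, 1.0, 1)]
example_total_docs = 4
example_tfidfsum = {ng: tf * (log(example_total_docs) - log(nd))
                    for (ng, tf, nd) in example_tf_nd}
print(example_tfidfsum)   # {9731: 5*ln(2) ≈ 3.47, 9732: ln(4) ≈ 1.39}
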
def compute_coocs(corpus,
                  overwrite_id=None,
                  just_pass_result=True,   # just return the WeightedMatrix
                                           #    (don't write to DB)
                  threshold=DEFAULT_COOC_THRESHOLD,
                  groupings_id=None,
                  on_list_id=None,
                  stoplist_id=None,
                  start=None,
                  end=None,
                  symmetry_filter=False,
                  diagonal_filter=True):
    """
    Count how often some extracted terms appear
    together in a small context (document)
    throughout a larger context (corpus).

             [NodeNgram]                       [NodeNgramNgram]

    node_id | ngram_id | weight       ngram1_id | ngram2_id | score |
    --------+----------+--------      ----------+-----------+-------+
     MyDocA |      487 |      1   =>        487 |       294 |     2 |
     MyDocA |      294 |      3
     MyDocB |      487 |      1
     MyDocB |      294 |      4

    Fill that info in DB:
      - a *new* COOCCURRENCES node
      - and all corresponding NodeNgramNgram rows

    worst-case complexity ~ O(N²/2) with N = number of ngrams

    If a mainlist is provided, we filter doc ngrams to those also in the list.

    Parameters:
      - the corpus node
      - overwrite_id: id of a pre-existing COOCCURRENCES node for this corpus
                     (all hyperdata and previous NodeNgramNgram rows will be replaced)
      - threshold: on output cooc count (previously called hapax)
      - groupings_id: optional synonym relations to add all subform counts
                      with their mainform's counts
      - on_list_id: mainlist or maplist type, to constrain the input ngrams
      - stoplist_id: stoplist for filtering input ngrams
                     (normally unnecessary if a mainlist is already provided)
      - start, end: provide one or both temporal limits to filter on doc date
                    NB the expected type of parameter value is datetime.datetime
                        (string is also possible but format must follow
                          this convention: "2001-01-01" aka "%Y-%m-%d")
      - symmetry_filter: prevent calculating where ngram1_id  > ngram2_id
      - diagonal_filter: prevent calculating where ngram1_id == ngram2_id


     (deprecated parameters)
      - field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
      - isMonopartite: ?? used a nodes_hyperdata_ngrams table ???

    basic idea for one doc
    ======================
    each pair of ngrams sharing same doc (node_id)
        SELECT idxa.ngram_id, idxb.ngram_id
        FROM nodes_ngrams AS idxa
        ---------------------------------
        JOIN nodes_ngrams AS idxb
        ON idxa.node_id = idxb.node_id      <== that's cooc
        ---------------------------------
        AND idxa.ngram_id <> idxb.ngram_id   (diagonal_filter)
        AND idxa.node_id = MY_DOC ;

    on entire corpus
    =================
    coocs for each doc :
      - each given pair like (termA, termB) will likely appear several times
        => we do GROUP BY (Xindex.ngram_id, Yindex.ngram_id)
      - we count unique appearances of the pair (cooc)


    """

    #   - TODO cvalue_id: allow a metric as additional  input filter
    #   - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
    #   - TODO weighted: if False normal cooc to be saved as result
    #                    if True  weighted cooc (experimental)

    # /!\ big combinatorial complexity /!\
    # for 8439 rows in the nodes_ngrams index, of which 1442 have occ > 1
    #  1,859,408 rows for the plain cooc query
    #     71,134 rows when restricting to ngrams with occ > 1 (weight)

    # 2 x the occurrence index table
    Xindex = aliased(NodeNgram)
    Yindex = aliased(NodeNgram)

    # for debug (1/4)
    # Xngram = aliased(Ngram)
    # Yngram = aliased(Ngram)

    # 1) prepare definition of counted forms
    if not groupings_id:

        # no groupings => the counted forms are the ngrams
        Xindex_ngform_id = Xindex.ngram_id
        Yindex_ngform_id = Yindex.ngram_id

    # groupings: see the detailed comment in compute_occs() + TODO factorize
    else:
        # prepare translations
        Xsyno = (session.query(
            NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id).filter(
                NodeNgramNgram.node_id == groupings_id).subquery())

        # further use as anonymous tables prevents doing Ysyno = Xsyno
        Ysyno = (session.query(
            NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id).filter(
                NodeNgramNgram.node_id == groupings_id).subquery())

        # groupings => define the counted form depending on the existence of a synonym
        Xindex_ngform_id = case([
            (Xsyno.c.ngram1_id != None, Xsyno.c.ngram1_id),
            (Xsyno.c.ngram1_id == None, Xindex.ngram_id)
            #     condition               value
        ])

        Yindex_ngform_id = case([(Ysyno.c.ngram1_id != None,
                                  Ysyno.c.ngram1_id),
                                 (Ysyno.c.ngram1_id == None, Yindex.ngram_id)])
        # ---

    # 2) BASE DB QUERY

    # cooccurrences columns definition ----------------
    ucooc = func.count(Xindex_ngform_id).label("ucooc")
    # NB could be X or Y in this line
    #    (we're counting grouped rows and just happen to do it on this column)
    base_query = (
        session.query(
            Xindex_ngform_id, Yindex_ngform_id, ucooc

            # for debug (2/4)
            # , Xngram.terms.label("w_x")
            # , Yngram.terms.label("w_y")
        ).join(Yindex,
               Xindex.node_id == Yindex.node_id)  # <- by definition of cooc
        .join(Node, Node.id == Xindex.node_id)  # <- b/c within corpus
        .filter(Node.parent_id == corpus.id)  # <- b/c within corpus
        .filter(Node.typename == "DOCUMENT")  # <- b/c within corpus
    )

    # outerjoin the synonyms if needed
    if groupings_id:
        base_query = (
            base_query.outerjoin(
                Xsyno,  # <- synonyms for Xindex.ngrams
                Xsyno.c.ngram2_id == Xindex.ngram_id).outerjoin(
                    Ysyno,  # <- synonyms for Yindex.ngrams
                    Ysyno.c.ngram2_id == Yindex.ngram_id))

    # 3) counting clause in any case
    coocs_query = (
        base_query.group_by(
            Xindex_ngform_id,
            Yindex_ngform_id  # <- what we're counting
            # for debug (3/4)
            # ,"w_x", "w_y"
        )

        # for debug (4/4)
        # .join(Xngram, Xngram.id == Xindex_ngform_id)
        # .join(Yngram, Yngram.id == Yindex_ngform_id)
        .order_by(ucooc))

    # 4) INPUT FILTERS (reduce N before O(N²))
    if on_list_id:
        # £TODO different lists, or one list for x and all ngrams for y
        #       as it would allow expanding the list to its nearest neighbours (MacLachlan)
        #       (with a rectangular matrix)

        m1 = aliased(NodeNgram)
        m2 = aliased(NodeNgram)

        coocs_query = (coocs_query.join(
            m1, m1.ngram_id == Xindex_ngform_id).join(
                m2, m2.ngram_id == Yindex_ngform_id).filter(
                    m1.node_id == on_list_id).filter(m2.node_id == on_list_id))

    if stoplist_id:
        s1 = (session.query(NodeNgram.ngram_id).filter(
            NodeNgram.node_id == stoplist_id).subquery())

        # further use as anonymous tables prevents doing s2 = s1
        s2 = (session.query(NodeNgram.ngram_id).filter(
            NodeNgram.node_id == stoplist_id).subquery())

        coocs_query = (
            coocs_query.outerjoin(s1,
                                  s1.c.ngram_id == Xindex_ngform_id).outerjoin(
                                      s2, s2.c.ngram_id == Yindex_ngform_id)

            # equivalent to NOT IN stoplist
            .filter(s1.c.ngram_id == None).filter(s2.c.ngram_id == None))

    if diagonal_filter:
        # don't compute ngram with itself
        coocs_query = coocs_query.filter(Xindex_ngform_id != Yindex_ngform_id)

    if start or end:
        Time = aliased(NodeHyperdata)

        coocs_query = (coocs_query.join(Time,
                                        Time.node_id == Xindex.node_id).filter(
                                            Time.key == "publication_date"))

        if start:
            if not isinstance(start, datetime):
                try:
                    start = datetime.strptime(start, '%Y-%m-%d')
                except ValueError:
                    raise TypeError(
                        "'start' param expects a datetime object or a '%Y-%m-%d' string"
                    )

            # the filtering by start limit
            coocs_query = coocs_query.filter(Time.value_utc >= start)

        if end:
            if not isinstance(end, datetime):
                try:
                    end = datetime.strptime(end, '%Y-%m-%d')
                except ValueError:
                    raise TypeError(
                        "'end' param expects a datetime object or a '%Y-%m-%d' string"
                    )

            # the filtering by end limit
            coocs_query = coocs_query.filter(Time.value_utc <= end)

    if symmetry_filter:
        # a filter taking the symmetry into account
        #  -> halves the work !!
        #  -> but retrieval will be more costly, via OR queries like:
        #       WHERE ngram1 = my_ngram OR ngram2 = my_ngram
        coocs_query = coocs_query.filter(Xindex_ngform_id < Yindex_ngform_id)

    # 5) OUTPUT FILTERS
    # ------------------
    # threshold
    # £TODO adjust COOC_THRESHOLD a posteriori:
    # ex: sometimes 2 sometimes 4 depending on sparsity
    print("COOCS: filtering pairs under threshold:", threshold)
    coocs_query = coocs_query.having(ucooc >= threshold)

    # 6) EXECUTE QUERY
    # ----------------
    #  => storage in our matrix structure
    matrix = WeightedMatrix(coocs_query.all())
    #                      -------------------

    # fyi
    shape_0 = len({pair[0] for pair in matrix.items})
    shape_1 = len({pair[1] for pair in matrix.items})
    print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))

    if just_pass_result:
        return matrix
    else:
        # 5) SAVE
        # --------
        # saving the parameters of the analysis in the Node JSON
        new_hyperdata = {'corpus': corpus.id, 'threshold': threshold}

        if overwrite_id:
            # overwrite pre-existing id
            the_cooc = cache.Node[overwrite_id]
            the_cooc.hyperdata = new_hyperdata
            the_cooc.save_hyperdata()
            session.commit()
            the_id = overwrite_id
        else:
            # create the new cooc node
            the_cooc = corpus.add_child(
                typename="COOCCURRENCES",
                name="Coocs (in:%s)" % corpus.name[0:10],
                hyperdata=new_hyperdata,
            )
            session.add(the_cooc)
            session.commit()

            the_id = the_cooc.id

        # ==> save all NodeNgramNgram with link to new cooc node id
        matrix.save(the_id)

        return the_id
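
# Standalone illustration (pure Python, no DB) of the counting that the query
# in compute_coocs performs, using the small example from its docstring:
# ngrams 487 and 294 appear together in MyDocA and in MyDocB, so the pair's
# cooccurrence score is 2 (diagonal and symmetry filters applied).
from collections import Counter
from itertools import combinations

example_doc_ngrams = {'MyDocA': {487, 294}, 'MyDocB': {487, 294}}
example_coocs = Counter()
for ngram_set in example_doc_ngrams.values():
    for a, b in combinations(sorted(ngram_set), 2):
        example_coocs[(a, b)] += 1
print(example_coocs)   # Counter({(294, 487): 2})
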