Example #1
0
def index_hyperdata(corpus):
    bulk_insert(
        table=NodeHyperdata,
        fields=('node_id', 'key', 'value_int', 'value_flt', 'value_utc',
                'value_str', 'value_txt'),
        data=_nodes_hyperdata_generator(corpus),
    )
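The generator _nodes_hyperdata_generator is not shown here; given the fields tuple above, each yielded row presumably lines up with those seven columns, with only the value_* slot matching the hyperdata type filled in. A minimal, hypothetical sketch (simplified, no value_utc/value_txt handling):

def _nodes_hyperdata_generator_sketch(corpus):
    # Hypothetical sketch only: the real generator lives elsewhere in the codebase.
    # Row layout follows the fields tuple:
    # (node_id, key, value_int, value_flt, value_utc, value_str, value_txt)
    for doc in corpus.children('DOCUMENT'):
        for key, value in doc.hyperdata.items():
            if isinstance(value, int):
                yield (doc.id, key, value, None, None, None, None)
            elif isinstance(value, float):
                yield (doc.id, key, None, value, None, None, None)
            else:
                yield (doc.id, key, None, None, None, str(value), None)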
Example #2
0
    def put(self, request, corpus_id, check_each_doc=True):
        if not request.user.is_authenticated():
            # can't use @requires_auth because of positional 'self' within class
            return HttpResponse('Unauthorized', status=401)

        # user is ok
        fav_node = self._get_fav_node(corpus_id)

        response = {}

        if fav_node is None:
            response = {
                'warning': "No favorites node is defined for this corpus ('%s')"
                           % self.corpus.name,
                'count_added': 0,
            }
        else:
            req_params = validate(get_parameters(request), {
                'docs': list,
                'default': ""
            })
            nodeids_to_add = [
                int(did) for did in req_params['docs'].split(',')
            ]

            if check_each_doc:
                # check that these really are documents of the right corpus
                # a bit slow => disable by default?
                known_docs_q = (session.query(
                    Node.id).filter(Node.parent_id == corpus_id).filter(
                        Node.typename == 'DOCUMENT'))
                lookup = {
                    known_doc.id: True
                    for known_doc in known_docs_q.all()
                }
                # debug
                # print("lookup hash", lookup)
                rejected_list = []
                for doc_node_id in nodeids_to_add:
                    if (doc_node_id not in lookup):
                        rejected_list.append(doc_node_id)
                if len(rejected_list):
                    raise ValidationException(
                        "Error on some requested docs: %s (Only nodes of type 'doc' AND belonging to corpus %i can be added to favorites.)"
                        % (str(rejected_list), int(corpus_id)))

            # add them
            bulk_insert(NodeNode, ('node1_id', 'node2_id', 'score'),
                        ((fav_node.id, doc_node_id, 1.0)
                         for doc_node_id in nodeids_to_add))

            # todo count really added (here: counts input param not result)
            response = {'count_added': len(nodeids_to_add)}

        return JsonHttpResponse(response)
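For reference, the only input this view needs is the docs parameter, a comma-separated list of document node ids; the response is one of the two dicts built above (the route shown is only a placeholder, not taken from this code):

# PUT <this view's URL for the corpus>?docs=123,456
#   -> {"count_added": 2}
# PUT on a corpus that has no favorites node
#   -> {"warning": "No favorites node is defined for this corpus ('...')", "count_added": 0}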
Example #3
0
 def save(self, node_id=None):
     from gargantext.models import NodeNgram
     if node_id is None:
         if hasattr(self, 'id'):
             node_id = self.id
         else:
             raise ValueError('Please mention an ID to save the node.')
     # delete previous data
     session.query(NodeNgram).filter(NodeNgram.node_id == node_id).delete()
     session.commit()
     # insert new data
     bulk_insert(NodeNgram, ('node_id', 'ngram_id', 'weight'),
                 ((node_id, key, 1.0) for key in self.items))
Example #4
0
 def save(self, node_id=None):
     from gargantext.models import NodeNgramNgram
     if node_id is None:
         if hasattr(self, 'id'):
             node_id = self.id
         else:
             raise ValueError('Please mention an ID to save the node.')
     # delete previous data
     session.query(NodeNgramNgram).filter(
         NodeNgramNgram.node_id == node_id).delete()
     session.commit()
     # insert new data
     print("WeightedMatrix bulk_insert start")
     bulk_insert(NodeNgramNgram,
                 ('node_id', 'ngram1_id', 'ngram2_id', 'weight'),
                 ((node_id, key1, key2, value)
                  for key1, key2, value in self))
     print("WeightedMatrix bulk_insert stop")
Example #5
0
    def put(self, request):
        """
        Add some group elements to a group node
          => adds new couples from GroupsBuffer._to_add of terms view

        TODO see use of util.lists.Translations

        Parameters are all in the url (for symmetry with DELETE method)
           api/ngramlists/groups?node=783&1228[]=891,1639
                                     => creates 1228 - 891
                                            and 1228 - 1639

        general format is:   mainform_id[]=subform_id1,subform_id2 etc
                                     => creates mainform_id - subform_id1
                                            and mainform_id - subform_id2

        NB: also checks if the couples exist before because the ngram table
            will send the entire group (old existing links + new links)
        """
        # from the url
        params = get_parameters(request)
        # the node param is unique
        group_node = params.pop('node')
        # the others params are links to change
        couples = self.links_to_couples(params)

        # debug
        # print("==couples from url =================================++++=")
        # print(couples)

        # local version of "insert if not exists" -------------------->8--------
        # (1) check already existing elements
        check_query = (session.query(NodeNgramNgram).filter(
            NodeNgramNgram.node_id == group_node).filter(
                tuple_(NodeNgramNgram.ngram1_id,
                       NodeNgramNgram.ngram2_id).in_(couples)))

        existing = {}
        for synonyms in check_query.all():
            existing[(synonyms.ngram1_id, synonyms.ngram2_id)] = True

        # debug
        #print("==existing")
        #print(existing)

        # (2) compute difference locally
        couples_to_add = [(mform, sform) for (mform, sform) in couples
                          if (mform, sform) not in existing]

        # debug
        # print("== couples_to_add =================================++++=")
        # print(couples_to_add)

        # (3) add new groupings
        bulk_insert(NodeNgramNgram,
                    ('node_id', 'ngram1_id', 'ngram2_id', 'weight'),
                    ((group_node, mainform, subform, 1.0)
                     for (mainform, subform) in couples_to_add))

        # ------------------------------------------------------------>8--------

        return JsonHttpResponse({
            'count_added': len(couples_to_add),
        }, 200)
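The links_to_couples helper is not shown here; judging from the URL format documented in the docstring, it presumably expands each remaining parameter into (mainform_id, subform_id) pairs, roughly like this sketch (name kept, internals assumed):

def links_to_couples_sketch(params):
    # Hypothetical sketch: after params.pop('node'), params looks like {'1228[]': '891,1639'}.
    couples = []
    for mainform_key, subforms_csv in params.items():
        mainform_id = int(mainform_key.rstrip('[]'))
        for subform_id in subforms_csv.split(','):
            couples.append((mainform_id, int(subform_id)))
    return couples

# links_to_couples_sketch({'1228[]': '891,1639'})  =>  [(1228, 891), (1228, 1639)]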
Example #6
0
def index_new_ngrams(ngram_ids, corpus, keys=('title', 'abstract')):
    """
    Find occurrences of some ngrams for every document of the given corpus.
    + insert them in the NodeNgram table.

    @param ngram_ids: a list of ids for Ngram objects
                      (we assume they already went through normalizations
                       and they were already added to Ngrams table
                       and optionally to some of the lists like MAPLIST)

            (but we can't know if they were previously indexed in the corpus)

    @param corpus: the CORPUS node

    @param keys: the hyperdata fields to index

    # FIXME too slow: index_new_ngrams should be faster via tsvector on DB
    """

    # retrieve *all* the ngrams from our list
    # (even if some relations may be already indexed
    #  b/c they were perhaps not extracted in all docs
    #   => we'll use already_indexed later)
    todo_ngrams = (session.query(Ngram).filter(Ngram.id.in_(ngram_ids)).all())

    # initialize result dict
    node_ngram_to_write = {}

    # loop through the docs and their text fields
    for (i, doc) in enumerate(corpus.children('DOCUMENT')):

        if (i % 100 == 0):
            print('CORPUS #%d: [%s] ngrams_addition: doc %i' %
                  (corpus.id, t(), i))
            print()

        # a new empty counting subdict
        node_ngram_to_write[doc.id] = {}

        for key in keys:
            # a text field
            text = doc.hyperdata.get(key, None)

            if not isinstance(text, str):
                # print("WARN: doc %i has no text in field %s" % (doc.id, key))
                continue

            for ngram in todo_ngrams:
                # build regexp : "british" => r'\bbritish\b'
                ngram_re = r'\b%s\b' % ngram.terms

                # --------------------------------------- find ---
                n_occs = len(findall(ngram_re, text, IGNORECASE))
                # -----------------------------------------------

                # save the count results
                if n_occs > 0:
                    if ngram.id not in node_ngram_to_write[doc.id]:
                        node_ngram_to_write[doc.id][ngram.id] = n_occs
                    else:
                        node_ngram_to_write[doc.id][ngram.id] += n_occs

    # debug
    # print("new node_ngrams before filter:", node_ngram_to_write)

    # check the relations we won't insert (those that were already indexed)
    # NB costly but currently impossible with bulk_insert_ifnotexists
    #                                         b/c double uniquekey
    already_indexed = (session.query(
        NodeNgram.node_id,
        NodeNgram.ngram_id).join(Node, Node.id == NodeNgram.node_id).filter(
            Node.parent_id == corpus.id).filter(
                Node.typename == 'DOCUMENT').all())
    filter_out = {(nd_id, ng_id) for (nd_id, ng_id) in already_indexed}
    # POSSIBLE update those that are filtered out if wei_previous != wei

    # integrate all at the end
    my_new_rows = []
    add_new_row = my_new_rows.append
    for doc_id in node_ngram_to_write:
        for ngram_id in node_ngram_to_write[doc_id]:
            if (doc_id, ngram_id) not in filter_out:
                wei = node_ngram_to_write[doc_id][ngram_id]
                add_new_row([doc_id, ngram_id, wei])

    del node_ngram_to_write

    # debug
    # print("new node_ngrams after filter:", my_new_rows)

    bulk_insert(table=NodeNgram,
                fields=('node_id', 'ngram_id', 'weight'),
                data=my_new_rows)

    # bulk_insert_ifnotexists(
    #     model = NodeNgram,
    #     uniquekey = ('node_id','ngram_id'),        <= currently impossible
    #     fields = ('node_id', 'ngram_id', 'weight'),
    #     data = my_new_rows
    # )

    n_added = len(my_new_rows)
    print("index_new_ngrams: added %i new NodeNgram rows" % n_added)

    return n_added
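A typical call, assuming the ngrams were already inserted in the Ngrams table as the docstring requires (new_ngrams below is an assumed list of already-inserted Ngram objects):

n_added = index_new_ngrams([ng.id for ng in new_ngrams],
                           corpus,
                           keys=('title', 'abstract'))
print("%i new (doc, ngram) rows indexed" % n_added)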
Example #7
0
def compute_tfidf_local(corpus,
                        on_list_id=None,
                        groupings_id=None,
                        overwrite_id=None):
    """
    Calculates tfidf similarity of each (doc, ngram) couple, within the current corpus

    Parameters:
      - the corpus itself
      - groupings_id: optional synonym relations to add all subform counts
                      with their mainform's counts
      - on_list_id: mainlist or maplist type, to constrain the input ngrams
      - overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
                   (the Node and its previous NodeNodeNgram rows will be replaced)
    """

    # All docs of this corpus
    docids_subquery = (session.query(
        Node.id).filter(Node.parent_id == corpus.id).filter(
            Node.typename == "DOCUMENT").subquery())

    # N: total number of docs in this corpus
    total_docs = session.query(docids_subquery).count()

    # define the counted form
    if not groupings_id:
        ngform_id = NodeNgram.ngram_id
    else:
        Syno = (session.query(
            NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id).filter(
                NodeNgramNgram.node_id == groupings_id).subquery())

        ngform_id = case([(Syno.c.ngram1_id != None, Syno.c.ngram1_id),
                          (Syno.c.ngram1_id == None, NodeNgram.ngram_id)])

    # tf for each couple (number of rows = N docs X M ngrams)
    tf_doc_query = (
        session.query(
            ngform_id,
            NodeNgram.node_id,
            func.sum(NodeNgram.weight).label("tf"),  # tf: occurrences
        )

        # select within docs of current corpus
        .join(docids_subquery, docids_subquery.c.id == NodeNgram.node_id))

    if groupings_id:
        tf_doc_query = (tf_doc_query.outerjoin(
            Syno, Syno.c.ngram2_id == NodeNgram.ngram_id))
        # now when we'll group_by the ngram2 freqs will be added to ngram1

    if on_list_id:
        Miamlist = aliased(NodeNgram)
        tf_doc_query = (tf_doc_query.join(
            Miamlist, Miamlist.ngram_id == ngform_id).filter(
                Miamlist.node_id == on_list_id))

    # execute query to do our tf sum
    tf_per_doc = tf_doc_query.group_by(NodeNgram.node_id, ngform_id).all()

    # ex: [(128371, 9732, 1.0),
    #      (128383, 9740, 1.0),
    #      (128373, 9731, 1.0),
    #      (128376, 9734, 1.0),
    #      (128372, 9731, 1.0),
    #      (128383, 9733, 1.0),
    #      (128383, 9735, 1.0),
    #      (128389, 9734, 1.0),
    #      (8624, 9731, 1.0),
    #      (128382, 9740, 1.0),
    #      (128383, 9739, 1.0),
    #      (128383, 9736, 1.0),
    #      (128378, 9735, 1.0),
    #      (128375, 9733, 4.0),
    #      (128383, 9732, 1.0)]
    #        ^ ^     ^^    ^^
    #       ngram   doc   freq in this doc

    # simultaneously count docs with given term (number of rows = M ngrams)

    ndocswithngram = {}
    for triple in tf_per_doc:
        ng = triple[0]
        doc = triple[1]
        if ng in ndocswithngram:
            ndocswithngram[ng] += 1
        else:
            ndocswithngram[ng] = 1

    # print(ndocswithngram)

    # store for use in formula
    # { ngram_id => log(nd) }
    log_nd_lookup = {
        ng: log(nd_count)
        for (ng, nd_count) in ndocswithngram.items()
    }

    # ---------------------------------------------------------
    tfidfs = {}
    log_tot_docs = log(total_docs)
    for (ngram_id, node_id, tf) in tf_per_doc:
        log_nd = log_nd_lookup[ngram_id]
        # tfidfs[ngram_id] = tf * log(total_docs/nd)
        tfidfs[node_id, ngram_id] = tf * (log_tot_docs - log_nd)
    # ---------------------------------------------------------

    if overwrite_id:
        the_id = overwrite_id
        session.query(NodeNodeNgram).filter(
            NodeNodeNgram.node1_id == the_id).delete()
        session.commit()
    else:
        # create the new TFIDF-CORPUS node
        tfidf_node = corpus.add_child()
        tfidf_node.typename = "TFIDF-CORPUS"
        tfidf_node.name = "tfidf-sims-corpus (in:%s)" % corpus.id
        session.add(tfidf_node)
        session.commit()
        the_id = tfidf_node.id

    # reflect that in NodeNodeNgrams
    # £TODO replace bulk_insert by something like WeightedIndex.save()
    bulk_insert(NodeNodeNgram, ('node1_id', 'node2_id', 'ngram_id', 'score'),
                ((the_id, node_id, ngram_id, tfidfs[node_id, ngram_id])
                 for (node_id, ngram_id) in tfidfs))

    return the_id
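A quick numeric check of the weighting used above: tf * (log_tot_docs - log_nd) is just tf * log(total_docs / nd). With made-up numbers:

from math import log

total_docs = 100   # docs in the corpus
nd = 4             # docs containing this ngram
tf = 3.0           # occurrences of the ngram in one given doc

tfidf = tf * (log(total_docs) - log(nd))   # == tf * log(total_docs / nd)
print(round(tfidf, 3))                     # 9.657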
Example #8
0
def compute_occs(
    corpus,
    overwrite_id=None,
    groupings_id=None,
):
    """
    Calculates sum of occs per ngram (or per mainform if groups) within corpus
                 (used as info in the ngrams table view)

    ? optimize ?  OCCS here could be calculated simultaneously within TFIDF-CORPUS loop

    ? use cases ?
       => not the main score shown to users (their intuition maps rather to the nb of docs containing the word)
       => but is the main weighting value for any NLP task

    Parameters:
        - overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
                     (the Node and its previous NodeNodeNgram rows will be replaced)
        - groupings_id: optional id of a GROUPLIST node for these ngrams
                        IF absent the occurrences are the sums for each ngram
                        IF present they're the sums for each ngram's mainform
    """
    #  simple case : no groups
    #                ---------
    #    (the occurrences are the sums for each ngram)
    if not groupings_id:

        # NodeNgram index
        occs_q = (
            session.query(
                NodeNgram.ngram_id,
                func.sum(NodeNgram.weight)  # <== OCCURRENCES
            )
            # filter docs within corpus
            .join(Node).filter(Node.parent_id == corpus.id).filter(
                Node.typename == "DOCUMENT")

            # for the sum
            .group_by(NodeNgram.ngram_id))

    #   difficult case: with groups
    #                   ------------
    # (the occurrences are the sums for each ngram's mainform)
    else:
        # sub-SELECT the synonyms of this GROUPLIST id (for OUTER JOIN later)
        syn = (session.query(
            NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id).filter(
                NodeNgramNgram.node_id == groupings_id).subquery())

        # NodeNgram index with additional subform => mainform replacement
        occs_q = (
            session.query(
                # intermediate columns for debug
                # -------------------------------
                # NodeNgram.node_id,        # document
                # NodeNgram.ngram_id,       # <= the occurring ngram
                # NodeNgram.weight,         # <= its frequency in doc
                # syn.c.ngram1_id           # mainform
                # syn.c.ngram2_id,          # subform

                # ngram to count aka counted_form
                # ----------------------------------
                #     either NodeNgram.ngram_id as before
                #         or mainform if it exists
                case([(syn.c.ngram1_id != None, syn.c.ngram1_id)],
                     else_=NodeNgram.ngram_id).label("counted_form"),

                # the sum itself
                # --------------
                func.sum(NodeNgram.weight)  # <== OCCURRENCES
            )
            # this brings the mainform if NodeNgram.ngram_id has one in syn
            .outerjoin(syn, syn.c.ngram2_id == NodeNgram.ngram_id)

            # filter docs within corpus
            .join(Node).filter(Node.parent_id == corpus.id).filter(
                Node.typename == "DOCUMENT")

            # for the sum
            .group_by("counted_form"))

    occ_sums = occs_q.all()
    # example result = [(1970, 1.0), (2024, 2.0),  (259, 2.0), (302, 1.0), ... ]
    #                    ^^^^  ^^^
    #                ngram_id   sum_wei
    #                   OR
    #              counted_form

    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
        session.query(NodeNodeNgram).filter(
            NodeNodeNgram.node1_id == the_id).delete()
        session.commit()
    else:
        # create the new OCCURRENCES node
        occnode = corpus.add_child(typename="OCCURRENCES",
                                   name="occ_sums (in:%s)" % corpus.id)
        session.add(occnode)
        session.commit()
        the_id = occnode.id

    # £TODO  make it NodeNgram instead NodeNodeNgram ! and rebase :/
    #        (idem ti_ranking)
    bulk_insert(NodeNodeNgram, ('node1_id', 'node2_id', 'ngram_id', 'score'),
                ((the_id, corpus.id, res[0], res[1]) for res in occ_sums))

    return the_id
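What the outerjoin + case() achieve, shown on plain Python data: subform occurrences are folded into their mainform's sum (ids below are made up):

# Plain-Python illustration of the grouped counting, not the SQL itself.
syn = {9740: 9731}                                     # subform -> mainform (from the GROUPLIST node)
doc_ngrams = [(9740, 2.0), (9731, 1.0), (9733, 4.0)]   # (ngram_id, weight) rows

occs = {}
for ngram_id, weight in doc_ngrams:
    counted_form = syn.get(ngram_id, ngram_id)         # mainform if grouped, else the ngram itself
    occs[counted_form] = occs.get(counted_form, 0.0) + weight

print(occs)   # {9731: 3.0, 9733: 4.0}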
Example #9
0
def compute_ti_ranking(corpus,
                       groupings_id=None,
                       count_scope="local",
                       termset_scope="local",
                       overwrite_id=None):
    """
    Calculates tfidf ranking within a given scope,
    via weighting of the cumulated tfidf per ngram ng_i
    (or per mainform ng_i' if groups), across some docs d_j:

        rank(ng_i) = Sum_j(tf_ij) * ln(N / nd_i)

        where nd_i = |{d_j : ng_i appears in d_j}|  (nb of docs containing ng_i)
              N    = total nb of docs in the counting scope

    Parameters:
      - the corpus itself (or corpus_id)
      - groupings_id: optional id of a GROUPLIST node for these ngrams
                        IF absent the ti weights are the sums for each ngram
                        IF present they're the sums for each ngram's mainform

      - count_scope: {"local" or "global"}
         - local  <=> frequencies counted in the current corpus
         - global <=> frequencies counted in all corpora of this type

        when the count_scope is global, there is another parameter:
          - termset_scope: {"local" or "global"}
             - local <=> output list of terms limited to the current corpus
               (SELECT DISTINCT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
             - global <=> output list of terms found in global doc scope
                                                    !!!! (many more terms)

      - overwrite_id: optional id of a pre-existing XXXX node for this corpus
                   (the Node and its previous Node NodeNgram rows will be replaced)
    """
    # validate string params
    if count_scope not in ["local", "global"]:
        raise ValueError(
            "compute_ti_ranking: count_scope param allowed values: 'local', 'global'"
        )
    if termset_scope not in ["local", "global"]:
        raise ValueError(
            "compute_ti_ranking: termset_scope param allowed values: 'local', 'global'"
        )
    if count_scope == "local" and termset_scope == "global":
        raise ValueError(
            "compute_ti_ranking: the termset_scope param can be 'global' iff count_scope param is 'global' too."
        )

    # get corpus
    if type(corpus) == int:
        corpus_id = corpus
        corpus = cache.Node[corpus_id]
    elif type(corpus) == str and match(r'\d+$', corpus):
        corpus_id = int(corpus)
        corpus = cache.Node[corpus_id]
    else:
        # assuming Node class
        corpus_id = corpus.id

    # prepare sqla mainform vs ngram selector
    ngform_i = None

    if not groupings_id:
        ngform_i = NodeNgram.ngram_id

    else:
        # prepare translations
        syno = (session.query(
            NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id).filter(
                NodeNgramNgram.node_id == groupings_id).subquery())
        # cf. detailed comment in compute_occs() + TODO: factor this out

        ngform_i = case([(syno.c.ngram1_id != None, syno.c.ngram1_id),
                         (syno.c.ngram1_id == None, NodeNgram.ngram_id)
                         #     condition               value
                         ])

    # MAIN QUERY SKELETON
    tf_nd_query = (
        session.query(
            # NodeNgram.ngram_id
            # or similar if grouping ngrams under their mainform
            ngform_i.label("counted_ngform"),

            # the tfidf elements
            # ------------------
            func.sum(NodeNgram.weight),  # tf: same as occurrences
            # -----------------------
            func.count(NodeNgram.node_id)  # nd: n docs with term
            # --------------------
        ).group_by("counted_ngform")

        # count_scope to specify in which doc nodes to count
        # -----------
        # .join(countdocs_subquery,
        #       countdocs_subquery.c.id == NodeNgram.node_id)

        # optional termset_scope: if we'll restrict the ngrams
        #          -------------
        # .join(termset_subquery,
        #       termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)

        # optional translations to bring the subform's replacement
        #          ------------
        # .outerjoin(syno,
        #           syno.c.ngram2_id == NodeNgram.ngram_id)
    )

    # TUNING THE QUERY

    if groupings_id:
        tf_nd_query = tf_nd_query.outerjoin(
            syno, syno.c.ngram2_id == NodeNgram.ngram_id)

    # local <=> within this corpus
    if count_scope == "local":
        # All docs of this corpus
        countdocs_subquery = (session.query(
            Node.id).filter(Node.typename == "DOCUMENT").filter(
                Node.parent_id == corpus_id).subquery())

        # no need to independently restrict the ngrams
        tf_nd_query = tf_nd_query.join(
            countdocs_subquery, countdocs_subquery.c.id == NodeNgram.node_id)
        # ---

    # global <=> within all corpora of this source
    elif count_scope == "global":
        this_source_type = corpus.resources()[0]['type']

        CorpusNode = aliased(Node)

        # All docs **in all corpora of the same source**
        countdocs_subquery = (
            session.query(Node.id).filter(Node.typename == "DOCUMENT")

            # join on parent_id with selected corpora nodes
            .join(CorpusNode, CorpusNode.id == Node.parent_id).filter(
                CorpusNode.typename == "CORPUS")
            # TODO index corpus_sourcetype in DB
            .filter(CorpusNode.hyperdata['resources'][0]['type'].astext == str(
                this_source_type)).subquery())

        if termset_scope == "global":
            # both scopes are the same: no need to independently restrict the ngrams
            tf_nd_query = tf_nd_query.join(
                countdocs_subquery,
                countdocs_subquery.c.id == NodeNgram.node_id)
            # ---

        elif termset_scope == "local":

            # All unique terms...
            termset_subquery = (
                session.query(distinct(NodeNgram.ngram_id).label("uniq_ngid"))
                # ... in the original corpus
                .join(Node).filter(Node.typename == "DOCUMENT").filter(
                    Node.parent_id == corpus_id).subquery())

            # only case of independent restrictions on docs and terms
            tf_nd_query = (tf_nd_query.join(
                countdocs_subquery,
                countdocs_subquery.c.id == NodeNgram.node_id).join(
                    termset_subquery,
                    termset_subquery.c.uniq_ngid == NodeNgram.ngram_id))
            # ---

    # total number of docs in the counting scope (N in the docstring formula)
    total_docs = session.query(countdocs_subquery).count()
    log_tot_docs = log(total_docs)

    # result
    tf_nd = tf_nd_query.all()

    # -------------- "sommatoire" sur mot i ----------------
    tfidfsum = {}
    for (ngram_i, tf_i, nd_i) in tf_nd:
        # tfidfsum[ngram_i] = tf_i * log(total_docs/nd_i)
        tfidfsum[ngram_i] = tf_i * (log_tot_docs - log(nd_i))
    # ------------------------------------------------------

    # nb of distinct ngram forms (for info)
    total_ngramforms = len(tfidfsum)

    if overwrite_id:
        the_id = overwrite_id
        session.query(NodeNodeNgram).filter(
            NodeNodeNgram.node1_id == the_id).delete()
        session.commit()
    else:
        # create the new TFIDF-XXXX node to get an id
        tir_nd = corpus.add_child()
        if count_scope == "local":
            tir_nd.typename = "TIRANK-CORPUS"
            tir_nd.name = "ti rank (%i ngforms in corpus:%s)" % (
                total_ngramforms, corpus_id)
        elif count_scope == "global":
            tir_nd.typename = "TIRANK-GLOBAL"
            tir_nd.name = "ti rank (%i ngforms %s in corpora of sourcetype:%s)" % (
                total_ngramforms, ("from corpus %i" % corpus_id) if
                (termset_scope == "local") else "", this_source_type)

        session.add(tir_nd)
        session.commit()
        the_id = tir_nd.id

    # TODO 1 discuss use and find new typename
    # TODO 2 release these 2 typenames TFIDF-CORPUS and TFIDF-GLOBAL
    # TODO 3 recreate them elsewhere in their sims (WeightedIndex) version
    # TODO 4 requalify this here as a NodeNgram
    # then TODO 5 use WeightedList.save() !

    # reflect that in NodeNodeNgrams
    bulk_insert(NodeNodeNgram, ('node1_id', 'node2_id', 'ngram_id', 'score'),
                ((the_id, corpus_id, ng, tfidfsum[ng]) for ng in tfidfsum))

    return the_id
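The three scope combinations accepted by the validation at the top of the function, as a usage sketch (the local count / global termset combination raises ValueError):

compute_ti_ranking(corpus)                                                  # local counts, local terms (defaults)
compute_ti_ranking(corpus, count_scope="global", termset_scope="local")    # counts over all same-source corpora,
                                                                            # terms restricted to this corpus
compute_ti_ranking(corpus, count_scope="global", termset_scope="global")   # counts and terms over all
                                                                            # same-source corpora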