def patch(self, request):
    """
    A copy of POST (merging list) but with the source being an internal corpus_id.

    Params in request.GET:
        onto_corpus: the corpus whose lists are getting patched
        from_corpus: the corpus from which we take the source lists to merge in
        todo:        a comma-separated list of the list types
                     ("map", "main", "stop") to merge in

    Returns a JsonHttpResponse with a 'log' key (200) on success,
    or an 'err' key (400) if the merge raised, or a plain 401 response
    when the user is not authenticated / not the corpus owner.
    """
    # NOTE(review): is_authenticated is called as a method — old-Django style;
    # keep as-is to match the rest of the project.
    if not request.user.is_authenticated():
        res = HttpResponse("Unauthorized")
        res.status_code = 401
        return res

    params = get_parameters(request)

    # the corpus with the target lists to be patched
    corpus_id = int(params.pop("onto_corpus"))
    corpus_node = cache.Node[corpus_id]

    # only the corpus owner may patch its lists
    if request.user.id != corpus_node.user_id:
        res = HttpResponse("Unauthorized")
        res.status_code = 401
        return res

    # maps the short names used in the request to DB node typenames
    list_types = {'map': 'MAPLIST', 'main': 'MAINLIST', 'stop': 'STOPLIST'}

    # internal DB retrieve source_lists
    source_corpus_id = int(params.pop("from_corpus"))
    source_node = cache.Node[source_corpus_id]

    todo_lists = params.pop("todo").split(',')  # ex: ['map', 'stop']
    source_lists = {}
    for key in todo_lists:
        source_lists[key] = UnweightedList(
            source_node.children(list_types[key]).first().id
        )

    # add the groupings too
    source_lists['groupings'] = Translations(
        source_node.children("GROUPLIST").first().id
    )

    # attempt to merge and send response
    try:
        # merge the source_lists onto those of the target corpus
        log_msg = merge_ngramlists(source_lists, onto_corpus=corpus_node)
        return JsonHttpResponse({
            'log': log_msg,
        }, 200)
    except Exception as e:
        return JsonHttpResponse({
            'err': str(e),
        }, 400)
def get(self, request):
    """
    Retrieve the data for the terms table of a corpus.

    Two request forms are accepted:
      - simple:  ?corpus=<id> [&scoring=<id>] [&head=<k>]
                 (all list ids are read from the corpus node's children)
      - custom:  ?mainlist=<id>&scoring=<id> [&groups=<id>]
                 [&stoplist=<id>] [&maplist=<id>] [&head=<k>]

    Returns a JsonHttpResponse with:
        ngraminfos:  {ngram_id: [term, weight]}
        listmembers: {'maplist': [ids], 'stoplist': [ids]}
        links:       grouping links (mainform => subforms)
        nodeids:     the resolved list/score node ids

    Raises ValidationException when neither request form is satisfied.
    """
    parameters = get_parameters(request)
    glance_limit = None
    mainlist_id = None
    scores_id = None
    groups_id = None
    other_list_ids = {'maplist': None, 'stoplist': None}

    # 1) retrieve a mainlist_id and other lists
    ##########################################

    # simple request: just refers to the parent corpus
    # ------------------------------------------------
    if "corpus" in parameters:
        corpus_id = parameters['corpus']
        corpus = cache.Node[corpus_id]
        # with a corpus_id, the explicit scoring pointer is optional
        if "scoring" in parameters:
            scores_id = parameters['scoring']
        else:
            scores_id = corpus.children('OCCURRENCES').first().id
        # retrieve the family of lists that have corpus as parent
        mainlist_id = corpus.children('MAINLIST').first().id
        groups_id = corpus.children('GROUPLIST').first().id
        other_list_ids['stoplist'] = corpus.children('STOPLIST').first().id
        other_list_ids['maplist'] = corpus.children('MAPLIST').first().id

    # custom request: refers to each list individually
    # -------------------------------------------------
    elif "mainlist" in parameters and "scoring" in parameters:
        mainlist_id = parameters['mainlist']
        scores_id = parameters['scoring']
        groups_id = None
        if 'groups' in parameters:
            # FIX: was parameters['scoring'] — the scoring id was silently
            # used as the grouplist id whenever 'groups' was provided
            groups_id = parameters['groups']
        for k in ['stoplist', 'maplist']:
            if k in parameters:
                other_list_ids[k] = parameters[k]

    # or request has an error
    # -----------------------
    else:
        raise ValidationException(
            "Either a 'corpus' parameter or 'mainlist' & 'scoring' params are required"
        )

    # 2) get the infos for each list
    ################################
    ngraminfo = {}     # ngram details sorted per ngram id
    linkinfo = {}      # ngram groups sorted per ngram id
    listmembers = {}   # ngram ids sorted per list name

    if "head" in parameters:
        # head <=> only mainlist AND only k top ngrams
        glance_limit = int(parameters['head'])
        mainlist_query = query_list(mainlist_id, details=True,
                                    pagination_limit=glance_limit,
                                    scoring_metric_id=scores_id)
    else:
        # infos for all ngrams from mainlist
        mainlist_query = query_list(mainlist_id, details=True,
                                    scoring_metric_id=scores_id)

    # infos for grouped ngrams, absent from mainlist
    hidden_ngrams_query = query_grouped_ngrams(groups_id, details=True)

    # infos for stoplist terms, absent from mainlist
    stop_ngrams_query = query_list(other_list_ids['stoplist'], details=True,
                                   scoring_metric_id=scores_id)

    # and for the other lists (stop and map)
    # no details needed here, just the member ids
    for li in other_list_ids:
        li_elts = query_list(other_list_ids[li], details=False).all()
        # simple array of ngram_ids
        listmembers[li] = [ng[0] for ng in li_elts]

    # and the groupings
    if groups_id:
        links = Translations(groups_id)
        linkinfo = links.groups

    # list of ngrams which need detailed info
    ngrams_which_need_detailed_info = []
    if "head" in parameters:
        # head triggered simplified form: just the top of the mainlist
        # TODO add maplist membership
        ngrams_which_need_detailed_info = mainlist_query.all()
    else:
        ngrams_which_need_detailed_info = (mainlist_query.all()
                                           + hidden_ngrams_query.all()
                                           + stop_ngrams_query.all())

    # the output form of details is:
    # ngraminfo[id] => [term, weight]
    for ng in ngrams_which_need_detailed_info:
        ng_id = ng[0]
        ngraminfo[ng_id] = ng[1:]
        # NB the client js will sort mainlist ngs from hidden ngs after ajax
        #    using linkinfo (otherwise needs redundant listmembers for main)

    return JsonHttpResponse({
        'ngraminfos': ngraminfo,
        'listmembers': listmembers,
        'links': linkinfo,
        'nodeids': {
            'mainlist': mainlist_id,
            'maplist': other_list_ids['maplist'],
            'stoplist': other_list_ids['stoplist'],
            'groups': groups_id,
            'scores': scores_id,
        }
    })
def compute_groups(corpus, stoplist_id=None, overwrite_id=None):
    """
    1) Use a stemmer/lemmatizer to group forms if they have same stem/lemma
    2) Create an empty GROUPLIST node (for a list of "synonym" ngrams)
    3) Save the list to DB (list node + each grouping as listnode - ngram1 - ngram2)

    @param corpus:       the corpus node whose documents' ngrams are grouped
    @param stoplist_id:  optional node id of a stoplist; its ngrams are
                         excluded from grouping
    @param overwrite_id: optional pre-existing GROUPLIST node id to reuse
                         instead of creating a new node
    @return the id of the GROUPLIST node that was written
    """
    stop_ngrams_ids = {}
    # we will need the ngrams of the stoplist to filter
    if stoplist_id is not None:
        # NB: renamed loop var (was `id`, shadowing the builtin)
        for row in session.query(NodeNgram.ngram_id).filter(
                NodeNgram.node_id == stoplist_id).all():
            stop_ngrams_ids[row[0]] = True

    # 1) compute stems/lemmas
    #    and group if same stem/lemma
    stemmers = prepare_stemmers(corpus)
    print("# STEMMERS LOADED", stemmers)
    supported_stemmers_lang = [lang for lang in corpus.hyperdata["languages"]
                               if lang != "__unknown__"]
    print("#SUPPORTED STEMMERS LANGS", supported_stemmers_lang)

    # todo dict {lg => {ngrams_todo} }
    todo_ngrams_per_lg = defaultdict(set)

    # res dict { commonstem: {ngram_1:freq_1 ,ngram_2:freq_2 ,ngram_3:freq_3} }
    my_groups = defaultdict(Counter)

    # preloop per doc to sort ngrams by language
    for doc in corpus.children('DOCUMENT'):
        if doc.id not in corpus.hyperdata['skipped_docs']:
            if ('language_iso2' in doc.hyperdata) and doc.hyperdata['language_iso2'] \
                    in supported_stemmers_lang:
                lgid = doc.hyperdata['language_iso2']
            else:
                lgid = "__unknown__"
                doc.status("NGRAMS_GROUPS", error="Error: unsupported language for stemming")
                doc.save_hyperdata()

            # doc.ngrams is an sql query (ugly but useful intermediate step)
            # FIXME: move the counting and stoplist filtering up here
            for ngram_pack in doc.ngrams.all():
                todo_ngrams_per_lg[lgid].add(ngram_pack)

    # --------------------
    # long loop per ngrams
    for (lgid, todo_ngs) in todo_ngrams_per_lg.items():
        # fun: word::str => stem::str
        stem_it = stemmers[lgid].stem
        for ng in todo_ngs:
            doc_wei = ng[0]
            ngram = ng[1]     # Ngram obj

            # skip if in STOPLIST
            # FIX: was a bare `next` statement, which is a no-op expression,
            # so stoplisted ngrams were never actually filtered out
            if ngram.id in stop_ngrams_ids:
                continue

            lexforms = [lexunit for lexunit in resplit(r'\W+', ngram.terms)]

            # STEM IT, and this term's stems will become a new grouping key...
            stemseq = " ".join([stem_it(lexfo) for lexfo in lexforms])

            # ex:
            # groups['post'] = {'poste':3, 'poster':5, 'postés':2...}
            # groups['copper engrav'] = {'copper engraving':3, 'coppers engraver':1...}
            my_groups[stemseq][ngram.id] += doc_wei

    del todo_ngrams_per_lg

    # now serializing all groups to a list of couples
    ng_couples = []
    addcouple = ng_couples.append
    for grped_ngramids in my_groups.values():
        if len(grped_ngramids) > 1:
            # first find most frequent term in the counter
            winner_id = grped_ngramids.most_common(1)[0][0]
            for ngram_id in grped_ngramids:
                if ngram_id != winner_id:
                    addcouple((winner_id, ngram_id))
    del my_groups

    # 2) the list node
    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
    # or create the new id
    else:
        the_group = corpus.add_child(
            typename="GROUPLIST",
            name="Group (src:%s)" % corpus.name[0:10]
        )
        # and save the node
        session.add(the_group)
        session.commit()
        the_id = the_group.id

    # 3) Save each grouping couple to DB thanks to Translations.save() table
    ndngng_list = Translations(
        [(sec, prim) for (prim, sec) in ng_couples],
        just_items=True
    )
    # ...referring to the list node we just got
    ndngng_list.save(the_id)

    return the_id
def merge_ngramlists(new_lists=None, onto_corpus=None, del_originals=None):
    """
    Integrates an external terms table to the current one:
       - merges groups (using group_union() function)
       - resolves conflicts if terms belong in different lists
          > map wins over both other types
          > main wins over stop
          > stop never wins   £TODO STOP wins over candidates from main

    @param new_lists: a dict of *new* imported lists with format:
                        {'stop':     UnweightedList,
                         'main':     UnweightedList,
                         'map':      UnweightedList,
                         'groupings': Translations }
                      if any of those lists is absent it is considered empty

    @param onto_corpus: a corpus node to get the *old* lists

    @param del_originals: an array of original wordlists to ignore
                          and delete during the merge
                          possible values : ['stop','main','map']

            e.g. del_originals = ['stop','main'] will erase the stoplist
            and the mainlist but not the maplist, which will be merged
            (the maplist elements are copied back into main at the end)

    NB: Uses group_tools.group_union() to merge the synonym links.
        Uses ngrams_addition.index_new_ngrams() to also add new ngrams
        to the docs of the corpus.

    @return a log string (joined lines) to send back to the client
    """
    # avoid mutable default arguments (were `{}` and `[]`)
    if new_lists is None:
        new_lists = {}
    if del_originals is None:
        del_originals = []

    # log to send back to client-side (lines will be joined)
    my_log = []

    # the tgt node arg has to be a corpus here
    if not hasattr(onto_corpus, "typename") or onto_corpus.typename != "CORPUS":
        raise TypeError("IMPORT: 'onto_corpus' argument must be a Corpus Node")

    # our list shortcuts will be 0,1,2 (aka lid)
    # by order of precedence
    linfos = [
        {'key': 'stop', 'name': "STOPLIST"},   # lid = 0
        {'key': 'main', 'name': "MAINLIST"},   # lid = 1
        {'key': 'map',  'name': "MAPLIST"}     # lid = 2
    ]

    # ======== Index the new ngrams in the docs =========
    all_possibly_new_ngram_ids = []
    collect = all_possibly_new_ngram_ids.append
    for lid, info in enumerate(linfos):
        list_type = info['key']
        if list_type in new_lists:
            for ng_id in new_lists[list_type].items:
                collect(ng_id)

    from gargantext.util.toolchain.main import t
    print("MERGE DEBUG: starting index_new_ngrams", t())
    n_added = index_new_ngrams(all_possibly_new_ngram_ids, onto_corpus)
    print("MERGE DEBUG: finished index_new_ngrams", t())

    my_log.append("MERGE: added %i new ngram occurrences in docs" % n_added)

    # ======== Get the old lists =========
    old_lists = {}
    # DB nodes stored with same indices 0,1,2 (resp. stop, miam and map)
    # find target ids of the list node objects
    tgt_nodeids = [
        onto_corpus.children("STOPLIST").first().id,   # £todo via parent project?
        onto_corpus.children("MAINLIST").first().id,
        onto_corpus.children("MAPLIST").first().id
    ]
    old_group_id = onto_corpus.children("GROUPLIST").first().id

    # retrieve old data into old_lists[list_type]...
    # ----------------------------------------------
    for lid, linfo in enumerate(linfos):
        list_type = linfo['key']
        if list_type not in del_originals:
            # NB can't use UnweightedList(tgt_nodeids[lid])
            #    because we need to include out-of-list subforms
            list_ngrams_q = query_list(tgt_nodeids[lid], groupings_id=old_group_id)
            old_lists[list_type] = UnweightedList(list_ngrams_q.all())
        else:
            # ...or use empty objects if replacing old list
            # ----------------------------------------------
            old_lists[list_type] = UnweightedList()
            msg = "MERGE: ignoring old %s which will be overwritten" % linfo['name']
            print(msg)
            my_log.append(msg)

    # ======== Merging all involved ngrams =========
    # all memberships with resolved conflicts of interfering memberships
    resolved_memberships = {}

    for list_set in [old_lists, new_lists]:
        for lid, info in enumerate(linfos):
            list_type = info['key']
            # if you don't want to merge one list just don't put it in new_lists
            if list_type in list_set:
                # we use the fact that lids are ordered ints...
                for ng_id in list_set[list_type].items:
                    if ng_id not in resolved_memberships:
                        resolved_memberships[ng_id] = lid
                    else:
                        # ...now resolving is simply taking the max
                        # stop < main < map
                        resolved_memberships[ng_id] = max(
                            lid, resolved_memberships[ng_id])

    # now each ngram is only in its most important list
    # -------------------------------------------------
    # NB temporarily map items are not in main anymore
    #    but we'll copy it at the end
    # NB temporarily all subforms were treated separately
    #    from mainforms but we'll force them into same list
    #    after we merge the groups
    del old_lists

    # ======== Merging old and new groups =========
    # get the arcs already in the target DB (directed couples)
    previous_links = session.query(
        NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id).filter(
            NodeNgramNgram.node_id == old_group_id).all()
    n_links_previous = len(previous_links)

    # same format for the new arcs (Translations ~~~> array of couples)
    translated_imported_links = []
    add_link = translated_imported_links.append
    n_links_added = 0
    # FIX: the docstring promises any absent list is treated as empty,
    # but accessing new_lists['groupings'] unconditionally raised KeyError
    if 'groupings' in new_lists:
        for (y, x) in new_lists['groupings'].items.items():
            add_link((x, y))
            n_links_added += 1
    del new_lists

    # group_union: joins 2 different synonym-links lists into 1 new list
    new_links = group_union(previous_links, translated_imported_links)
    del previous_links
    del translated_imported_links

    n_links_after = len(new_links)
    merged_group = Translations([(y, x) for (x, y) in new_links])
    del new_links

    # ======== Overwrite old data with new =========
    merged_group.save(old_group_id)
    msg = "MERGE: groupings %i updated (links before/added/after: %i/%i/%i)" % (
        old_group_id, n_links_previous, n_links_added, n_links_after)
    my_log.append(msg)
    print(msg)

    # ======== Target list(s) append data =========
    # if list 2 => write in both tgt_data_lists [1,2]
    # lists 0 or 1 => straightforward targets [0] or [1]
    merged_results = {
        'stop': UnweightedList(),
        'main': UnweightedList(),
        'map': UnweightedList()
    }
    for (ng_id, winner_lid) in resolved_memberships.items():
        ## 1) using the new groups
        # normal case if not a subform
        if ng_id not in merged_group.items:
            target_lid = winner_lid
        # inherit case if is a subform
        else:
            mainform_id = merged_group.items[ng_id]
            # inherited winner
            try:
                target_lid = resolved_memberships[mainform_id]
            except KeyError:
                target_lid = winner_lid
                print("MERGE: WARN ng_id %i has incorrect mainform %i ?" %
                      (ng_id, mainform_id))

        ## 2) map => map + main
        if target_lid == 2:
            todo_lids = [1, 2]
        else:
            todo_lids = [target_lid]

        ## 3) storage
        for lid in todo_lids:
            list_type = linfos[lid]['key']
            merged_results[list_type].items.add(ng_id)

    # ======== Overwrite old data with new =========
    for lid, info in enumerate(linfos):
        tgt_id = tgt_nodeids[lid]
        list_type = info['key']
        result = merged_results[list_type]
        result.save(tgt_id)
        msg = "MERGE: %s %i updated (new size: %i)" % (
            info['name'], tgt_id, len(merged_results[list_type].items))
        my_log.append(msg)
        print(msg)

    # return a log
    return ("\n".join(my_log))
def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
                      group_delimiter=DEFAULT_CSV_DELIM_GROUP):
    '''
    This function reads a CSV of an ngrams table for a Corpus,
    then it converts old ngram_ids to those of the current DB
       (and adds to DB any unknown ngrams)
    then recreates an equivalent set of MAINLIST, MAPLIST, STOPLIST + GROUPS

    Input example:
       status  |  label         |forms
       --------+----------------+---------------------
        map    | water table    |water tables
        map    | water supply   |water-supply|&|water supplies
        stop   | wastewater     |

    The title line is mandatory.
    The label will correspond to our DB mainform type.

    Variants:
    ----------
    For user accessibility, we allow different formats using equivalence rules:

    1) It is implicit that the label string is also one of the forms
       therefore the input example table is equivalent to this "verbose" table:
           status  |  label         |forms
           --------+----------------+---------------------
            map    | water table    |water table|&|water tables
            map    | water supply   |water supply|&|water-supply|&|water supplies
            stop   | wastewater     |wastewater

    2) The default status is map and the status column is optional
       thus, if we ignore "wastewater", the input table is also equivalent to:
               label         |forms
              ---------------+---------------------
               water table   |water tables
               water supply  |water-supply|&|water supplies

    3) From DB point of view, both "forms that are labels" and "other forms"
       are finally saved just as ngrams.
       So the input table is also equivalent to:
           status  |  label          |forms
           --------+-----------------+---------------------
            map    | water table     |water tables
            map    | water tables    |
            map    | water supply    |water-supply|&|water supplies
            map    | water supplies  |
            map    | water-supply    |
            stop   | wastewater      |

    Output: 3 x UnweightedList + 1 x Translations

    @param the_file          a local filename or file contents (list of lines)
                             or a filehandle-like
    @param delimiter         a character used as separator in the CSV
    @param group_delimiter   a character used as grouped subforms separator
                             (in the last column)

    The retrieval of ngram_ids works in 2 steps:
        => look up each term str in the DB with bulk_insert_ifnotexists
           (creates absent ngrams if necessary)
        => use the new ids to map the relations involving the old ones

    NB: the creation of MAINLIST also adds all elements from the MAPLIST

    NB: To merge the imported lists into a corpus node's lists,
        chain this function with merge_ngramlists()
    '''
    # ---------------
    #  ngram storage
    # ---------------
    # main storage for the ngrams by list
    imported_nodes_ngrams = {'stop': [], 'main': [], 'map': []}

    # and all the terms (for unique and for dbdata bulk_insert)
    imported_unique_ngramstrs = {}

    # and for the imported_grouping list of couples [(str1,str1),(str1,str2)..]
    imported_groupings = []
    # /!\ imported_grouping contains the subforms' terms themselves
    #     (that will have to be translated to ngram_ids for the target db)

    # =============== READ CSV ===============
    if isinstance(the_file, list):
        fname = 'imported_file'
        contents = the_file
    else:
        if isinstance(the_file, str):
            # FIX: open in binary mode — we .decode("UTF-8") the result below,
            # which fails on a text-mode str in Python 3
            fh = open(the_file, "rb")
            fname = the_file
        elif callable(getattr(the_file, "read", None)):
            fh = the_file
            # FIX: was `fname = the_file` (stored the object itself);
            # use the handle's name for readable log messages
            fname = getattr(the_file, "name", "imported_file")
        else:
            raise TypeError("IMPORT: the_file argument has unknown type %s"
                            % type(the_file))
        # reading all directly b/c csv.reader takes only lines or a real fh in bytes
        # and we usually have a "false" fh (uploadedfile.InMemoryUploadedFile) in strings
        # (but we checked its size before!)
        contents = fh.read().decode("UTF-8").split("\n")
        # end of CSV read
        fh.close()

    ngrams_csv_rows = reader(contents, delimiter=delimiter,
                             quoting=QUOTE_MINIMAL)

    # for stats
    n_read_lines = 0
    n_total_ng = 0
    n_added_ng = 0
    n_group_relations = 0

    # columntype => int
    columns = {}

    # load CSV + initial checks
    for i, csv_row in enumerate(ngrams_csv_rows):
        # fyi
        n_read_lines += 1

        # headers
        if i == 0:
            n_cols = len(csv_row)
            for j, colname in enumerate(csv_row):
                if colname in ['label', 'status', 'forms']:
                    columns[colname] = j
                # skip empty columns
                elif match(r'^\s*$', colname):
                    pass
                else:
                    raise ValueError(
                        'Wrong header "%s" on line %i (only possible headers are "label", "forms" and "status")'
                        % (colname, n_read_lines))
            if 'label' not in columns:
                raise ValueError(
                    'CSV must contain at least one column with the header "label"'
                )
            # FIX: without this continue the header row fell through and
            # (when no "status" column exists) the literal word "label"
            # was imported as a map-list term
            continue

        if not len(csv_row):
            continue

        # mandatory column
        this_row_label = str(csv_row[columns['label']])

        # other columns or their default values
        if 'status' in columns:
            this_list_type = str(csv_row[columns['status']])
        else:
            this_list_type = 'map'

        if 'forms' in columns:
            this_row_forms = str(csv_row[columns['forms']])
        else:
            this_row_forms = ''

        # string normalizations
        this_row_label = normalize_forms(normalize_chars(this_row_label))

        # --- term checking
        if not len(this_row_label) > 0:
            print("IMPORT WARN: (skip line) empty term at CSV %s:l.%i"
                  % (fname, i))
            continue

        # --- check correct list type
        if not this_list_type in ['stop', 'main', 'map']:
            print("IMPORT WARN: (skip line) wrong list type at CSV %s:l.%i"
                  % (fname, i))
            continue

        # subforms can be duplicated (in forms and another label)
        # but we must take care of unwanted other duplicates too
        if this_row_label in imported_unique_ngramstrs:
            print("TODO IMPORT DUPL: (skip line) term appears more than once at CSV %s:l.%i"
                  % (fname, i))

        # ================= Store the data ====================
        # the ngram census
        imported_unique_ngramstrs[this_row_label] = True

        # and the "list to ngram" relation
        imported_nodes_ngrams[this_list_type].append(this_row_label)

        # ====== Store synonyms from the import (if any) ======
        if len(this_row_forms) != 0:
            for raw_term_str in this_row_forms.split(group_delimiter):
                # each subform is also like an ngram declaration
                term_str = normalize_forms(normalize_chars(raw_term_str))
                imported_unique_ngramstrs[term_str] = True
                imported_nodes_ngrams[this_list_type].append(term_str)

                # the optional repeated mainform doesn't interest us
                # because we already have it via the label
                if term_str != this_row_label:
                    # save links
                    imported_groupings.append((this_row_label, term_str))

    # ======== ngram save + id lookup =========
    n_total_ng = len(imported_unique_ngramstrs)

    # prepare data format
    imported_ngrams_dbdata = []
    for ngram_str in imported_unique_ngramstrs:
        # DB needs the number of separate words
        n_words = 1 + len(findall(r' ', ngram_str))
        imported_ngrams_dbdata.append((ngram_str, n_words))

    # returns a dict {term => id} and a count of inserted ones
    #                          -------------------------
    (new_ngrams_ids, n_added_ng) = bulk_insert_ifnotexists(
        #                        -------------------------
        model=Ngram,
        uniquekey='terms',
        fields=('terms', 'n'),
        data=imported_ngrams_dbdata,
        do_stats=True)
    del imported_ngrams_dbdata

    # new_ngrams_ids contains a direct mapping ng_str => new_id
    del imported_unique_ngramstrs

    # ======== Import into lists =========
    # 3 x abstract lists + 1 translations
    result = {
        'map': UnweightedList(),
        'main': UnweightedList(),
        'stop': UnweightedList(),
        'groupings': Translations()
    }

    for list_type in imported_nodes_ngrams:
        for ng_str in imported_nodes_ngrams[list_type]:
            new_id = new_ngrams_ids[ng_str]
            # add to the abstract list
            result[list_type].items.add(new_id)

        # for main also add map elements
        if list_type == 'main':
            for ng_str in imported_nodes_ngrams['map']:
                new_id = new_ngrams_ids[ng_str]
                result['main'].items.add(new_id)

    # ======== Synonyms =========
    for (x_str, y_str) in imported_groupings:
        new_mainform_id = new_ngrams_ids[x_str]
        new_subform_id = new_ngrams_ids[y_str]
        # /!\ Translations use (subform => mainform) order
        result['groupings'].items[new_subform_id] = new_mainform_id
        n_group_relations += 1

    # ------------------------------------------------------------------
    print("IMPORT: read %i lines from the CSV" % n_read_lines)
    print("IMPORT: read %i terms (%i added and %i already existing)"
          % (n_total_ng, n_added_ng, n_total_ng - n_added_ng))
    print("IMPORT: read %i grouping relations" % n_group_relations)

    return result
def filterMatrix(matrix, mapList_id, groupList_id):
    """
    Restrict a cooccurrence matrix to the map-list terms,
    expanded through their grouped (synonym) forms.

    @param matrix:       a cooccurrence matrix supporting `&`
    @param mapList_id:   node id of the MAPLIST
    @param groupList_id: node id of the GROUPLIST
    @return the filtered cooccurrence matrix
    """
    map_terms = UnweightedList(mapList_id)
    groupings = Translations(groupList_id)
    # expand the maplist with its groupings, then intersect with the matrix
    return matrix & (map_terms * groupings)