Example #1
0
File: core.py  Project: X5GON/lamapi
def get_resource_modelsdshknn(parameters):
    """Return a resource's k-nearest-neighbour view enriched with descriptions.

    Args:
        parameters: dict with keys ``'id'`` (the reference resource id),
            ``'max_resources'`` (number of neighbours to fetch) and
            ``'max_concepts'`` (cap on concepts/keywords per resource, also
            used as the number of globally most common concepts).

    Returns:
        dict with ``'reference'`` (description of the query resource itself)
        and ``'neighbors'`` (descriptions of the remaining k-NN resources).
    """
    knn_params = {
        "resource_id": parameters['id'],
        "n_neighbors": parameters['max_resources'],
        "remove_duplicates": True,
        # Flags kept as in the original call so the knn payload keeps the
        # keys consumed below (projected_matrix, vectors, distances).
        "return_reduction": True,
        "return_matrix": True,
        "return_vectors": True,
        "return_dist": True,
    }
    # Get Knn resources in the wikifier feature space.
    knn = knn_wikifier_res(**knn_params)
    resources_ids = knn["neighbors"]
    # Get resources descriptions (concepts + keywords experiments).
    resources_needed_infos = get_resource_description(
        resources_ids,
        {"concepts": EXP_IDS['wikifier']['SIMPLE']['experiment_id'],
         "keywords": EXP_IDS['text2tfidf']['SIMPLE']['experiment_id']},
        max_concepts=parameters['max_concepts'],
        max_keywords=parameters['max_concepts'])
    # Aggregate concept weights over all neighbour vectors.
    conceptsmax = Counter()
    for vec in knn["vectors"]:
        conceptsmax += Counter(vec)
    sumpr = sum(conceptsmax.values())
    # Normalised weights of the globally most common concepts.
    # Guard against an empty / zero-weight neighbourhood (previously raised
    # ZeroDivisionError when no vectors were returned).
    if sumpr:
        most_common_concepts = [
            (c, v / sumpr)
            for c, v in conceptsmax.most_common(parameters['max_concepts'])]
    else:
        most_common_concepts = []
    # Fill up the missings needed infos: ContextualResourceDescription
    resources_final_infos = []
    for i, rid in enumerate(resources_ids):
        res_infos = resources_needed_infos.get(int(rid), dict())
        if res_infos:
            # Estimated duration from word count and a words-per-minute rate.
            res_infos['duration'] = 60 * (res_infos['len_word'] / __wpm()["Slide"])
            res_infos['difficulty'] = wikification2con_per_sec(
                res_infos['len_char'], res_infos['len_concepts'])
            # Low-dimensional projection of this resource (for plotting).
            res_infos['projection'] = knn["projected_matrix"][i]
            # For each globally common concept url, collect the matching
            # concept entries of this particular resource.
            common_wk_in_res = [
                [dict(label=c_f["title"],
                      url=c_f["url"],
                      value=c_f["norm_pageRank"])
                 for c_f in res_infos["wikifier_full"]["value"]["concepts"]
                 if c_f['url'] == c]
                for c, _ in most_common_concepts]
            res_infos['common_wikifier'] = [c[0] for c in common_wk_in_res
                                            if len(c) > 0]
            res_infos['distance'] = knn["distances"][i]
            # Full payloads were only needed for the computations above.
            del res_infos['keywords_full']
            del res_infos['wikifier_full']
        resources_final_infos.append(res_infos)
    # NOTE(review): assumes the first neighbour returned is the query resource
    # itself; raises IndexError when the k-NN result is empty — confirm the
    # upstream guarantee before relying on it.
    return {
        "reference": resources_final_infos[0],
        "neighbors": resources_final_infos[1:],
    }
Example #2
0
File: core.py  Project: X5GON/lamapi
def get_resource_difficulty(resource_ids):
    """Return a difficulty score for each requested resource id.

    The score is produced by ``wikification2con_per_sec`` from the resource's
    content length and its number of wikifier concepts (presumably a
    concepts-per-second density — confirm against the helper's definition).
    """
    lengths = get_experimental_contents(resource_ids)
    wikifications = get_experimental_features(
        resource_ids, [__DEFAULT_EXPID_SETTING["SIMPLE"]])
    scores = []
    for entry in get_valid_resources(lengths, wikifications):
        scores.append({
            "resource_id": entry[0],
            "value": wikification2con_per_sec(entry[1],
                                              len(entry[2]['concepts'])),
        })
    return scores
Example #3
0
File: core.py  Project: X5GON/lamapi
def get_resource_modelsdshsearch(search_infos):
    """Run a text search against the X5GON platform API and return
    dashboard-formatted descriptions of the top results.

    Args:
        search_infos: dict of search options; recognised keys are 'q'
            (query text), 'type', 'provider', 'orig_lang',
            'available_langs', 'max_resources' and 'max_concepts'.

    Returns:
        {'result': [<resource description dict>, ...]}
    """
    # The X5GON API is available at:
    PLATFORM_URL = "https://platform.x5gon.org/api/v1"
    # initialise the endpoint
    search_endpoint = "/search?{}"

    # search_endpoint parameters
    search_e_p = dict(
        text=search_infos['q'] if 'q' in search_infos else '',
        type=",".join(search_infos['type']) if 'type' in search_infos else '',
        provider=",".join(map(str, search_infos.get('provider', []))),
        orig_lang=",".join(map(str, search_infos.get('orig_lang', []))),
        available_langs=",".join(
            map(str, search_infos.get('available_langs', []))),
        max_resources=search_infos.get('max_resources', 20),
        max_concepts=search_infos.get('max_concepts', 20),
        page='1')
    # NOTE(review): only text/type/page are forwarded in the query string;
    # provider/orig_lang/available_langs are assembled above but never sent —
    # confirm whether that filtering was meant to reach the platform.
    search_e_s = "text={text}&type={type}&page={page};"

    # Execute X5GON search endpoint
    response = requests.get(
        PLATFORM_URL + search_endpoint.format(search_e_s.format(**search_e_p)))
    r_json = response.json()

    # Return modelsdsh needed output format
    resources_ids = [
        d['material_id']
        for d in r_json['rec_materials'][:search_e_p['max_resources']]
    ]
    # disc_rsltndvcts = {x['id']: x['result']['value'] for x in get_experimental_features(resources_ids,
    #                                                                                     [globals()[f"experiment_id_{model_type}"]],
    #                                                                                     order_needed=False)}
    # res_set, vects_set = zip(*((x, disc_rsltndvcts.get(x, {})) for x in resources_ids if x in disc_rsltndvcts))
    # res_set, vects_set = list(res_set), list(vects_set)
    # NOTE(review): `model_type` is not defined anywhere in this function —
    # presumably a module-level global; verify, otherwise this line raises
    # NameError at runtime.
    disc_rsltfd = checkrmv_duplicates(resources_ids, model_type)
    resources_ids = disc_rsltfd[:search_e_p['max_resources']]
    # Fetch descriptions (concepts + keywords experiments) for the survivors.
    resources_needed_infos = get_resource_description(
        resources_ids, {
            "concepts": EXP_IDS['wikifier']['SIMPLE']['experiment_id'],
            "keywords": EXP_IDS['text2tfidf']['SIMPLE']['experiment_id']
        },
        max_concepts=search_e_p['max_concepts'],
        max_keywords=search_e_p['max_concepts'])
    resources_final_infos = []
    for i, rid in enumerate(resources_ids):
        res_infos = resources_needed_infos.get(int(rid), dict())
        # Resources with no description in the DB are silently dropped.
        if res_infos:
            # Estimated duration from word count and a words-per-minute rate.
            res_infos['duration'] = 60 * (res_infos['len_word'] /
                                          __wpm()["Slide"])
            res_infos['difficulty'] = wikification2con_per_sec(
                res_infos['len_char'], res_infos['len_concepts'])
            # res_infos['difficulty'] = tfidf2technicity( res_infos['keywords_full'] )
            # Full payloads are not needed in the response.
            del res_infos['keywords_full']
            del res_infos['wikifier_full']
            resources_final_infos.append(res_infos)
    return {'result': resources_final_infos}
Example #4
0
File: core.py  Project: X5GON/lamapi
def enrich_pd_resources(pd_resources):
    """Enrich a list of resource ids with descriptions and display metadata.

    Args:
        pd_resources: iterable of resource ids (int-convertible).

    Returns:
        list parallel to ``pd_resources`` of enriched description dicts
        (possibly empty dicts when nothing is found for an id).
    """
    resources_needed_infos = get_resource_description(pd_resources,
                                                      {"concepts": EXP_IDS['wikifier']['SIMPLE']['experiment_id'],
                                                       "keywords": EXP_IDS['text2tfidf']['SIMPLE']['experiment_id']},
                                                       max_concepts=5,
                                                       max_keywords=5)
    res_metadata = get_resource_metadata(pd_resources)
    resources_final_infos = []
    for i, rid in enumerate(pd_resources):
        res_infos = resources_needed_infos.get(int(rid), dict())
        res_metainfos = res_metadata.get(int(rid), dict())
        # To avoid exceptions on not-found ids taken/computed by rec_algo from models files (res_infos/res_metadata)
        try:
            if res_infos or res_metainfos:
                if res_infos:
                    # Difficulty only when a non-zero char length is known.
                    res_infos['difficulty'] = wikification2con_per_sec(res_infos['len_char'], res_infos['len_concepts']) if ('len_char' in res_infos and res_infos['len_char'] != 0) else 0
                    if 'keywords_full' in res_infos:
                        del res_infos['keywords_full']
                    if 'wikifier_full' in res_infos:
                        del res_infos['wikifier_full']
                    res_infos['concepts'] = res_infos['wikifier'] if 'wikifier' in res_infos else []
                else:
                    # In case there is no info returned from exp_results query
                    res_infos['id'] = res_metainfos['id']
                    res_infos['orig_lang'] = res_metainfos['orig_lang']
                    res_infos['provider'] = res_metainfos['provider']
                    res_infos['difficulty'] = ''
                    res_infos['keywords'] = []
                    res_infos['concepts'] = []
                # Display fields come from the metadata record; None → ''.
                res_infos['title'] = res_metainfos['title'] if res_metainfos['title'] is not None else ''
                # Description trimmed to the first 150 whitespace-separated words.
                res_infos['description'] = ' '.join(res_metainfos['description'].split()[:150]) if res_metainfos['description'] is not None else ''
                res_infos['duration'] = f"~ {res_metainfos['len_word'] / __wpm()['Slide']} mins"
                res_infos['url'] = res_metainfos['url']
                res_infos['author'] = ", ".join(res_metainfos['authors']) if res_metainfos['authors'] is not None else ''
                res_infos['date'] = res_metainfos['date'] if res_metainfos['date'] != '' else ''
                res_infos['mediatype'] = res_metainfos['type']
                res_infos['mimetype'] = res_metainfos['mimetype']
                res_infos['license'] = res_metainfos['license'] if res_metainfos['license'] is not None else ''
                # This is to make sure that there are no '"' or non-printable chars that can break the xmls
                res_infos['title'] = ''.join(filter(lambda x: x in string.printable, res_infos['title'].replace('"',"")))
                res_infos['description'] = ''.join(filter(lambda x: x in string.printable, res_infos['description'].replace('"',"")))
        except KeyError as e:
            # Best-effort enrichment: a missing metadata key leaves the entry
            # partially filled rather than failing the whole batch.
            print(e)
            print(traceback.format_exc())
        resources_final_infos.append(res_infos)
    return resources_final_infos
Example #5
0
def format_resources(insertions, max_concepts):
    """Expand a list of inserted resource ids into description payloads.

    Args:
        insertions: list of resource ids; ``None`` entries are kept as-is
            (they mark positions with no inserted resource).
        max_concepts: cap on concepts returned per resource description.

    Returns:
        A list parallel to ``insertions``: ``None`` where the input was
        ``None``, otherwise ``{'resource': <description>, 'confidence': 0.5}``.
    """
    # Identity comparison with None (PEP 8) instead of `!= None`.
    added_ids = [rid for rid in insertions if rid is not None]
    if not added_ids:
        return insertions
    resources_needed_infos = get_resource_description(
        added_ids,
        {"concepts": EXP_IDS['wikifier']['SIMPLE']['experiment_id'],
         "keywords": EXP_IDS['text2tfidf']['SIMPLE']['experiment_id']},
        max_concepts=max_concepts)

    resources_final_infos = {}
    for rid in added_ids:
        res_infos = resources_needed_infos.get(int(rid), dict())
        if res_infos:
            # Estimated duration from word count and a words-per-minute rate.
            res_infos['duration'] = 60 * (res_infos['len_word'] / __wpm()["Slide"])
            res_infos['difficulty'] = wikification2con_per_sec(
                res_infos['len_char'], res_infos['len_concepts'])
            # Full payloads were only needed for the computations above.
            del res_infos['keywords_full']
            del res_infos['wikifier_full']
        resources_final_infos[rid] = res_infos
    # NOTE(review): confidence is a hard-coded placeholder (0.5).
    return [None if v is None
            else {'resource': resources_final_infos[v], 'confidence': 0.5}
            for v in insertions]
Example #6
0
File: core.py  Project: X5GON/lamapi
def get_resource_difficulty(resource_texts):
    """Score difficulty for raw texts via wikifier concept density.

    For each text, difficulty is ``wikification2con_per_sec`` applied to the
    text length and the number of concepts found by a SIMPLE wikification.
    """
    scores = []
    for text in resource_texts:
        concepts = wikification(text, wikification_type='SIMPLE')['concepts']
        scores.append({
            "resource_text": text,
            "value": wikification2con_per_sec(len(text), len(concepts)),
        })
    return scores
Example #7
0
def enrich_playlist_items(playlist_infos):
    """Enrich playlist items in place with X5GON resource metadata.

    For every item that carries an X5GON id, missing display fields (title,
    description, duration, url, author, date, mediatype, license) are filled
    from the resource metadata store, and difficulty/keywords/concepts are
    filled from the experiment-result descriptions.

    Args:
        playlist_infos: dict with a 'playlist_items' list; each item has
            'x5gon_id', 'material_id' and assorted display fields.

    Returns:
        The same ``playlist_infos`` dict, mutated in place.
    """
    # Keep each item's original position so updates land on the right entry.
    pst_items_ix = [{
        'x5gon_id': item['x5gon_id'],
        'xlearn_id': item['material_id'],
        'item_ix': i
    } for i, item in enumerate(playlist_infos['playlist_items'])]
    # Only items actually linked to an X5GON resource can be enriched.
    x5gon_items_ix = [
        item for item in pst_items_ix
        if item['x5gon_id'] not in [None, 'null', '']
    ]
    resources_ids = [item['x5gon_id'] for item in x5gon_items_ix]
    resources_needed_infos = get_resource_description(
        resources_ids, {
            "concepts": EXP_IDS['wikifier']['SIMPLE']['experiment_id'],
            "keywords": EXP_IDS['text2tfidf']['SIMPLE']['experiment_id']
        },
        max_concepts=5,
        max_keywords=5)
    res_metadata = get_resource_metadata(resources_ids)
    resources_final_infos = []
    for i, pstx5item in enumerate(x5gon_items_ix):
        rid = pstx5item['x5gon_id']
        rix = pstx5item['item_ix']
        res_infos = resources_needed_infos.get(int(rid), dict())
        res_metainfos = res_metadata.get(int(rid), dict())
        # The following metadata will be fetched directly from the playlist infos
        # when already present; DB metadata only fills the gaps.
        if 'title' not in playlist_infos['playlist_items'][
                rix] or playlist_infos['playlist_items'][rix]['title'] is None:
            playlist_infos['playlist_items'][rix]['title'] = res_metainfos[
                'title'] if res_metainfos['title'] is not None else ''
        if 'description' not in playlist_infos['playlist_items'][
                rix] or playlist_infos['playlist_items'][rix][
                    'description'] is None:
            # Description trimmed to the first 150 whitespace-separated words.
            playlist_infos['playlist_items'][rix]['description'] = ' '.join(
                res_metainfos['description'].split()
                [:150]) if res_metainfos['description'] is not None else ''
        if 'duration' not in playlist_infos['playlist_items'][
                rix] or playlist_infos['playlist_items'][rix]['duration'] in [
                    None, ''
                ]:
            # Rough reading time from word count and a words-per-minute rate.
            playlist_infos['playlist_items'][rix][
                'duration'] = f"~ {res_metainfos['len_word'] / __wpm()['Slide']} mins" if 'len_word' in res_metainfos else "~ unknown"
        playlist_infos['playlist_items'][rix]['url'] = res_metainfos[
            'url'] if 'url' in res_metainfos else playlist_infos[
                'playlist_items'][rix]['url']
        playlist_infos['playlist_items'][rix]['author'] = ", ".join(
            res_metainfos['authors']) if (
                'authors' in res_metainfos
                and res_metainfos['authors'] is not None) else ''
        playlist_infos['playlist_items'][rix][
            'date'] = res_metainfos['date'] if (
                'date' in res_metainfos
                and res_metainfos['date'] not in ['', None]) else (
                    playlist_infos['playlist_items'][rix]['date']
                    if 'date' in playlist_infos['playlist_items'][rix] else '')
        playlist_infos['playlist_items'][rix][
            'mediatype'] = res_metainfos['type'] if (
                'type' in res_metainfos and res_metainfos['type'] != ''
            ) else playlist_infos['playlist_items'][rix]['mediatype']
        playlist_infos['playlist_items'][rix]['license'] = res_metainfos[
            'license'] if ('license' in res_metainfos
                           and res_metainfos['license'] is not None) else ''
        # This is to make sure that there are no '"' or non-printable chars that can break the xmls
        playlist_infos['playlist_items'][rix]['title'] = ''.join(
            filter(
                lambda x: x in string.printable,
                playlist_infos['playlist_items'][rix]['title'].replace(
                    '"', "")))
        playlist_infos['playlist_items'][rix]['description'] = ''.join(
            filter(
                lambda x: x in string.printable,
                playlist_infos['playlist_items'][rix]['description'].replace(
                    '"', "")))
        if res_infos:
            res_infos['difficulty'] = wikification2con_per_sec(
                res_infos['len_char'], res_infos['len_concepts'])
            # Full payloads were only needed for the computation above.
            del res_infos['keywords_full']
            del res_infos['wikifier_full']
            # update only the metadata of the found oers in db
            playlist_infos['playlist_items'][rix]['difficulty'] = res_infos[
                'difficulty']
            playlist_infos['playlist_items'][rix]['keywords'] = ", ".join([
                keyword['label']
                for i, keyword in enumerate(res_infos['keywords'])
            ])
            playlist_infos['playlist_items'][rix]['concepts'] = res_infos[
                'wikifier']
        else:
            # No experiment results for this resource: blank placeholders.
            playlist_infos['playlist_items'][rix]['difficulty'] = ''
            playlist_infos['playlist_items'][rix]['keywords'] = ''
            playlist_infos['playlist_items'][rix]['concepts'] = []
        resources_final_infos.append(res_infos)
    return playlist_infos