from collections import Counter
import string
import traceback

import requests

# Project-local helpers (knn_wikifier_res, get_resource_description, EXP_IDS,
# wikification2con_per_sec, __wpm, ...) are assumed to be imported elsewhere
# in this module.


def get_resource_modelsdshknn(parameters):
    tmp_parameters = {}
    tmp_parameters["resource_id"] = parameters['id']
    tmp_parameters["n_neighbors"] = parameters['max_resources']
    tmp_parameters["remove_duplicates"] = True
    tmp_parameters["return_reduction"] = True
    tmp_parameters["return_matrix"] = True
    tmp_parameters["return_vectors"] = True
    tmp_parameters["return_dist"] = True
    # Get the knn resources
    knn = knn_wikifier_res(**tmp_parameters)
    resources_ids = knn["neighbors"]
    matrix = knn["matrix"]
    # Get the resources' descriptions
    resources_needed_infos = get_resource_description(
        resources_ids,
        {"concepts": EXP_IDS['wikifier']['SIMPLE']['experiment_id'],
         "keywords": EXP_IDS['text2tfidf']['SIMPLE']['experiment_id']},
        max_concepts=parameters['max_concepts'],
        max_keywords=parameters['max_concepts'])
    # Aggregate the concepts over all neighbour vectors
    conceptsmax = Counter()
    conceptsmostcommon = Counter()
    for v in knn["vectors"]:
        conceptsmax += Counter(v)
        conceptsmostcommon += Counter(v.keys())
    sumpr = sum(conceptsmax.values())
    # Get the most common concepts, normalised by the total concept weight
    most_common_concepts = [(c, v / sumpr)
                            for c, v in conceptsmax.most_common(parameters['max_concepts'])]
    # Fill in the missing needed infos: ContextualResourceDescription
    resources_final_infos = []
    for i, rid in enumerate(resources_ids):
        res_infos = resources_needed_infos.get(int(rid), dict())
        if res_infos:
            res_infos['duration'] = 60 * (res_infos['len_word'] / __wpm()["Slide"])
            res_infos['difficulty'] = wikification2con_per_sec(res_infos['len_char'],
                                                               res_infos['len_concepts'])
            # res_infos['difficulty'] = tfidf2technicity(res_infos['keywords_full'])
            res_infos['projection'] = knn["projected_matrix"][i]
            # For each globally common concept, find its occurrence in this resource
            common_wk_in_res = [[dict(label=c_f["title"],
                                      url=c_f["url"],
                                      value=c_f["norm_pageRank"])
                                 for c_f in filter(lambda c_res: c_res['url'] == c,
                                                   res_infos["wikifier_full"]["value"]["concepts"])]
                                for c, v_c in most_common_concepts]
            res_infos['common_wikifier'] = [c[0] for c in common_wk_in_res if len(c) > 0]
            res_infos['distance'] = knn["distances"][i]
            del res_infos['keywords_full']
            del res_infos['wikifier_full']
            resources_final_infos.append(res_infos)
    return {
        "reference": resources_final_infos[0],
        "neighbors": resources_final_infos[1:],
        # "matrix": matrix,
        # "top": knn["top"],
        # "distances": knn["distances"],
        # "concepts_max": [(c, v / sumpr) for c, v in conceptsmax.most_common(10)],
        # "distri": list(conceptsmax.items()),
        # "concepts_most_common": conceptsmostcommon.most_common(10),
        # "variance_ratio_": knn["variance_ratio_"].tolist() if knn["variance_ratio_"] is not None else None
    }
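# Usage sketch for get_resource_modelsdshknn. The id and parameter values are
# hypothetical, and the call assumes the knn/description services used above are
# reachable; _demo_modelsdshknn is an illustrative helper, not part of the API.
def _demo_modelsdshknn():
    payload = get_resource_modelsdshknn({"id": 42,
                                         "max_resources": 10,
                                         "max_concepts": 5})
    # The reference entry is the query resource itself; the rest are its
    # neighbours, each carrying the fields computed above.
    print(payload["reference"]["difficulty"])
    for neighbor in payload["neighbors"]:
        print(neighbor["distance"], neighbor["difficulty"], neighbor["duration"])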
def get_resource_difficulty(resource_ids):
    res_lens = get_experimental_contents(resource_ids)
    res_wks = get_experimental_features(resource_ids, [__DEFAULT_EXPID_SETTING["SIMPLE"]])
    res_valid = get_valid_resources(res_lens, res_wks)
    return [{"resource_id": res[0],
             "value": wikification2con_per_sec(res[1], len(res[2]['concepts']))}
            for res in res_valid]
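# Usage sketch for get_resource_difficulty: the ids are hypothetical and must
# exist in the experimental-content store; _demo_resource_difficulty is
# illustrative only.
def _demo_resource_difficulty():
    for entry in get_resource_difficulty([42, 1337]):
        print(entry["resource_id"], entry["value"])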
def get_resource_modelsdshsearch(search_infos):
    # The X5GON API is available at:
    PLATFORM_URL = "https://platform.x5gon.org/api/v1"
    # Initialise the endpoint
    search_endpoint = "/search?{}"
    # search_endpoint parameters
    search_e_p = dict(
        text=search_infos['q'] if 'q' in search_infos else '',
        type=",".join(search_infos['type']) if 'type' in search_infos else '',
        provider=",".join(map(str, search_infos.get('provider', []))),
        orig_lang=",".join(map(str, search_infos.get('orig_lang', []))),
        available_langs=",".join(map(str, search_infos.get('available_langs', []))),
        max_resources=search_infos.get('max_resources', 20),
        max_concepts=search_infos.get('max_concepts', 20),
        page='1')
    search_e_s = "text={text}&type={type}&page={page};"
    # Execute the X5GON search endpoint
    response = requests.get(
        PLATFORM_URL + search_endpoint.format(search_e_s.format(**search_e_p)))
    r_json = response.json()
    # Return the modelsdsh needed output format
    resources_ids = [d['material_id']
                     for d in r_json['rec_materials'][:search_e_p['max_resources']]]
    # disc_rsltndvcts = {x['id']: x['result']['value']
    #                    for x in get_experimental_features(resources_ids,
    #                                                       [globals()[f"experiment_id_{model_type}"]],
    #                                                       order_needed=False)}
    # res_set, vects_set = zip(*((x, disc_rsltndvcts.get(x, {})) for x in resources_ids if x in disc_rsltndvcts))
    # res_set, vects_set = list(res_set), list(vects_set)
    # Remove duplicate resources; model_type is expected to be defined at module level
    disc_rsltfd = checkrmv_duplicates(resources_ids, model_type)
    resources_ids = disc_rsltfd[:search_e_p['max_resources']]
    resources_needed_infos = get_resource_description(
        resources_ids,
        {"concepts": EXP_IDS['wikifier']['SIMPLE']['experiment_id'],
         "keywords": EXP_IDS['text2tfidf']['SIMPLE']['experiment_id']},
        max_concepts=search_e_p['max_concepts'],
        max_keywords=search_e_p['max_concepts'])
    resources_final_infos = []
    for i, rid in enumerate(resources_ids):
        res_infos = resources_needed_infos.get(int(rid), dict())
        if res_infos:
            res_infos['duration'] = 60 * (res_infos['len_word'] / __wpm()["Slide"])
            res_infos['difficulty'] = wikification2con_per_sec(res_infos['len_char'],
                                                               res_infos['len_concepts'])
            # res_infos['difficulty'] = tfidf2technicity(res_infos['keywords_full'])
            del res_infos['keywords_full']
            del res_infos['wikifier_full']
            resources_final_infos.append(res_infos)
    return {'result': resources_final_infos}
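# Usage sketch for get_resource_modelsdshsearch: a free-text query against the
# public X5GON search endpoint. Only 'q' is strictly needed; the other keys fall
# back to the defaults set in search_e_p. _demo_modelsdshsearch is illustrative only.
def _demo_modelsdshsearch():
    out = get_resource_modelsdshsearch({"q": "machine learning",
                                        "max_resources": 5,
                                        "max_concepts": 5})
    for res in out['result']:
        print(res['difficulty'], res['duration'])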
def enrich_pd_resources(pd_resources):
    resources_needed_infos = get_resource_description(
        pd_resources,
        {"concepts": EXP_IDS['wikifier']['SIMPLE']['experiment_id'],
         "keywords": EXP_IDS['text2tfidf']['SIMPLE']['experiment_id']},
        max_concepts=5,
        max_keywords=5)
    res_metadata = get_resource_metadata(pd_resources)
    resources_final_infos = []
    for i, rid in enumerate(pd_resources):
        res_infos = resources_needed_infos.get(int(rid), dict())
        res_metainfos = res_metadata.get(int(rid), dict())
        # Avoid exceptions on ids taken/computed by rec_algo from the model files
        # but missing from res_infos/res_metadata
        try:
            if res_infos or res_metainfos:
                if res_infos:
                    res_infos['difficulty'] = (wikification2con_per_sec(res_infos['len_char'],
                                                                        res_infos['len_concepts'])
                                               if ('len_char' in res_infos
                                                   and res_infos['len_char'] != 0) else 0)
                    if 'keywords_full' in res_infos:
                        del res_infos['keywords_full']
                    if 'wikifier_full' in res_infos:
                        del res_infos['wikifier_full']
                    res_infos['concepts'] = res_infos['wikifier'] if 'wikifier' in res_infos else []
                else:
                    # In case there is no info returned from the exp_results query
                    res_infos['id'] = res_metainfos['id']
                    res_infos['orig_lang'] = res_metainfos['orig_lang']
                    res_infos['provider'] = res_metainfos['provider']
                    res_infos['difficulty'] = ''
                    res_infos['keywords'] = []
                    res_infos['concepts'] = []
                res_infos['title'] = res_metainfos['title'] if res_metainfos['title'] is not None else ''
                res_infos['description'] = (' '.join(res_metainfos['description'].split()[:150])
                                            if res_metainfos['description'] is not None else '')
                res_infos['duration'] = f"~ {res_metainfos['len_word'] / __wpm()['Slide']} mins"
                res_infos['url'] = res_metainfos['url']
                res_infos['author'] = (", ".join(res_metainfos['authors'])
                                       if res_metainfos['authors'] is not None else '')
                res_infos['date'] = res_metainfos['date'] if res_metainfos['date'] != '' else ''
                res_infos['mediatype'] = res_metainfos['type']
                res_infos['mimetype'] = res_metainfos['mimetype']
                res_infos['license'] = res_metainfos['license'] if res_metainfos['license'] is not None else ''
                # Strip double quotes and non-printable chars so they cannot break the XMLs
                res_infos['title'] = ''.join(filter(lambda x: x in string.printable,
                                                    res_infos['title'].replace('"', "")))
                res_infos['description'] = ''.join(filter(lambda x: x in string.printable,
                                                          res_infos['description'].replace('"', "")))
        except KeyError as e:
            print(e)
            print(traceback.format_exc())
        resources_final_infos.append(res_infos)
    return resources_final_infos
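# Usage sketch for enrich_pd_resources: the ids are hypothetical. Resources missing
# from the experiment results are still returned, filled from their raw metadata,
# so the fields are read defensively here. _demo_enrich_pd_resources is
# illustrative only.
def _demo_enrich_pd_resources():
    for res in enrich_pd_resources([42, 1337]):
        print(res.get('title'), res.get('difficulty'), res.get('duration'))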
def format_resources(insertions, max_concepts):
    added_ids = [c for c in insertions if c is not None]
    if not added_ids:
        return insertions
    resources_needed_infos = get_resource_description(
        added_ids,
        {"concepts": EXP_IDS['wikifier']['SIMPLE']['experiment_id'],
         "keywords": EXP_IDS['text2tfidf']['SIMPLE']['experiment_id']},
        max_concepts=max_concepts)
    resources_final_infos = {}
    for rid in added_ids:
        res_infos = resources_needed_infos.get(int(rid), dict())
        if res_infos:
            res_infos['duration'] = 60 * (res_infos['len_word'] / __wpm()["Slide"])
            res_infos['difficulty'] = wikification2con_per_sec(res_infos['len_char'],
                                                               res_infos['len_concepts'])
            # res_infos['difficulty'] = tfidf2technicity(res_infos['keywords_full'])
            del res_infos['keywords_full']
            del res_infos['wikifier_full']
        # Keep an (possibly empty) entry for every id so the lookup below cannot fail
        resources_final_infos[rid] = res_infos
    return [None if v is None else {'resource': resources_final_infos[v], 'confidence': 0.5}
            for v in insertions]
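# Usage sketch for format_resources: None entries mark slots where nothing was
# inserted and are passed through unchanged. The ids are hypothetical.
def _demo_format_resources():
    formatted = format_resources([42, None, 1337], max_concepts=5)
    for entry in formatted:
        print(None if entry is None else (entry['confidence'],
                                          entry['resource'].get('difficulty')))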
def get_resource_text_difficulty(resource_texts):
    # Text-based variant of the id-based get_resource_difficulty above: wikify each
    # raw text and relate its length to the number of concepts found
    return [{"resource_text": res,
             "value": wikification2con_per_sec(
                 len(res),
                 len(wikification(res, wikification_type='SIMPLE')['concepts']))}
            for res in resource_texts]
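# Usage sketch for the text-based difficulty: no database lookup is needed, but the
# wikification call still requires the wikifier service to be reachable. The sample
# text is hypothetical and _demo_text_difficulty is illustrative only.
def _demo_text_difficulty():
    texts = ["Gradient descent iteratively minimises a differentiable loss function."]
    for entry in get_resource_text_difficulty(texts):
        print(entry["resource_text"][:40], entry["value"])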
def enrich_playlist_items(playlist_infos):
    pst_items_ix = [{'x5gon_id': item['x5gon_id'],
                     'xlearn_id': item['material_id'],
                     'item_ix': i}
                    for i, item in enumerate(playlist_infos['playlist_items'])]
    x5gon_items_ix = [item for item in pst_items_ix
                      if item['x5gon_id'] not in [None, 'null', '']]
    resources_ids = [item['x5gon_id'] for item in x5gon_items_ix]
    resources_needed_infos = get_resource_description(
        resources_ids,
        {"concepts": EXP_IDS['wikifier']['SIMPLE']['experiment_id'],
         "keywords": EXP_IDS['text2tfidf']['SIMPLE']['experiment_id']},
        max_concepts=5,
        max_keywords=5)
    res_metadata = get_resource_metadata(resources_ids)
    resources_final_infos = []
    for i, pstx5item in enumerate(x5gon_items_ix):
        rid = pstx5item['x5gon_id']
        rix = pstx5item['item_ix']
        res_infos = resources_needed_infos.get(int(rid), dict())
        res_metainfos = res_metadata.get(int(rid), dict())
        item = playlist_infos['playlist_items'][rix]
        # The following metadata are fetched directly from the playlist infos when
        # present, and filled in from the resource metadata otherwise
        if 'title' not in item or item['title'] is None:
            item['title'] = res_metainfos['title'] if res_metainfos['title'] is not None else ''
        if 'description' not in item or item['description'] is None:
            item['description'] = (' '.join(res_metainfos['description'].split()[:150])
                                   if res_metainfos['description'] is not None else '')
        if 'duration' not in item or item['duration'] in [None, '']:
            item['duration'] = (f"~ {res_metainfos['len_word'] / __wpm()['Slide']} mins"
                                if 'len_word' in res_metainfos else "~ unknown")
        item['url'] = res_metainfos['url'] if 'url' in res_metainfos else item['url']
        item['author'] = (", ".join(res_metainfos['authors'])
                          if ('authors' in res_metainfos
                              and res_metainfos['authors'] is not None) else '')
        item['date'] = (res_metainfos['date']
                        if ('date' in res_metainfos
                            and res_metainfos['date'] not in ['', None])
                        else (item['date'] if 'date' in item else ''))
        item['mediatype'] = (res_metainfos['type']
                             if ('type' in res_metainfos and res_metainfos['type'] != '')
                             else item['mediatype'])
        item['license'] = (res_metainfos['license']
                           if ('license' in res_metainfos
                               and res_metainfos['license'] is not None) else '')
        # Strip double quotes and non-printable chars so they cannot break the XMLs
        item['title'] = ''.join(filter(lambda x: x in string.printable,
                                       item['title'].replace('"', "")))
        item['description'] = ''.join(filter(lambda x: x in string.printable,
                                             item['description'].replace('"', "")))
        if res_infos:
            res_infos['difficulty'] = wikification2con_per_sec(res_infos['len_char'],
                                                               res_infos['len_concepts'])
            del res_infos['keywords_full']
            del res_infos['wikifier_full']
            # Update only the metadata of the oers found in the db
            item['difficulty'] = res_infos['difficulty']
            item['keywords'] = ", ".join([keyword['label']
                                          for keyword in res_infos['keywords']])
            item['concepts'] = res_infos['wikifier']
        else:
            item['difficulty'] = ''
            item['keywords'] = ''
            item['concepts'] = []
        resources_final_infos.append(res_infos)
    return playlist_infos
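# Usage sketch for enrich_playlist_items: a minimal playlist dict with hypothetical
# ids. Only items with a usable x5gon_id are enriched; the second item is left
# untouched. _demo_enrich_playlist_items is an illustrative helper, not part of the API.
def _demo_enrich_playlist_items():
    playlist = {"playlist_items": [
        {"x5gon_id": 42, "material_id": "m-1", "url": "", "mediatype": ""},
        {"x5gon_id": None, "material_id": "m-2", "url": "", "mediatype": ""},
    ]}
    enriched = enrich_playlist_items(playlist)
    for item in enriched["playlist_items"]:
        print(item.get("title"), item.get("difficulty"), item.get("duration"))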