def __knn_continuousDoc2vec(part_vector, n_neighbors=20, return_vectors=False, return_dist=False, resource_id=None):
    """Return the nearest part-level neighbours of a continuous-doc2vec part vector.

    Parameters
    ----------
    part_vector : doc2vec vector of one resource part; used as the KNN query.
    n_neighbors : number of neighbouring parts to retrieve.
    return_vectors : when True, also fetch the neighbours' vectors from the DB.
    return_dist : when True, include the neighbour distances in the result.
    resource_id : kept for signature compatibility with the sibling KNN
        helpers; currently unused here.

    Returns a dict with "neighbors" (resource id -> list of part ids),
    "distances" and "vectors" (each None unless requested).
    """
    model = load_model('ccknlt')[0]
    ids = load_model('ccknlt')[1]
    # BUG FIX: the original queried an undefined name `resource_vector`
    # (NameError) instead of the `part_vector` parameter.
    dists, neigh_index = model.kneighbors([part_vector],
                                          return_distance=True,
                                          n_neighbors=int(n_neighbors))
    dists = dists.tolist()[0]
    neigh_partids = (ids["index2id"][i] for i in neigh_index[0])
    # Neighbour ids are encoded "<resource_id>p<part_id>": group parts per resource.
    neigh_ids = {}
    for partid in neigh_partids:
        rid, part = partid.split("p")
        # BUG FIX: was `neigh_ids.get(rid, []) + partid`, i.e. list + str (TypeError).
        neigh_ids.setdefault(rid, []).append(part)
    vectors = None
    if return_vectors:
        vectors = [part_vector]
        dvect = {}  # BUG FIX: was used without ever being initialised (NameError).
        gen = get_experimental_features(list(neigh_ids.keys()), [experiment_id],
                                        order_needed=False)
        for d in gen:
            rid = d["id"]
            for part_id in neigh_ids[rid]:
                # NOTE(review): assumes d["result"]["value"] is indexable by the
                # part id parsed from the "<rid>p<part>" key — confirm DB schema.
                dvect[f"{rid}p{part_id}"] = d["result"]["value"][part_id]
        # BUG FIX: the original looked up dvect[rid] although the keys are
        # "<rid>p<part_id>"; iterate over each resource's parts instead.
        for rid, parts in neigh_ids.items():
            for part_id in parts:
                vectors.append(dvect[f"{rid}p{part_id}"])
    return {
        "neighbors": neigh_ids,
        "distances": dists if return_dist else None,
        "vectors": vectors if return_vectors else None,
    }
def __preprocess(text, phrase="x5gon", dfxp2text=False, **kwargs):
    """Clean a raw text before feature extraction.

    Optionally strips DFXP/XML tags, then (depending on *phrase*) rewrites the
    text with the matching phrase models, and finally delegates to the shared
    ``preprocess`` with any extra keyword arguments.
    """
    # phrase-family -> model keys used to build the phraser pair
    phrase_models = {
        "x5gon": ("teanlt2r", "teanlt3r"),
        "wikipedia": ("teaalt2r", "teaalt3r"),
    }
    cleaned = remove_tags(text) if dfxp2text else text
    model_keys = phrase_models.get(phrase) if phrase else None
    if model_keys is not None:
        cleaned = text2phrase(cleaned, [load_model(k) for k in model_keys])
    return preprocess(cleaned, **kwargs)
def wikifier_to_vector(resource_wikifier):
    """Given a list of concepts with pageRanks, return a sparse vector
    representing the wikifier.

    Each concept dict must provide "url" (looked up in the concept map) and
    "pageRank" (stored at the concept's index). Concepts missing from the map
    are counted and silently skipped.
    """
    # CLEANUP: the original called load_model('wrknlt') three times and bound
    # `model`, `ids` and `top` without ever using them; only the concept map
    # (index 1 of the model tuple) is needed here.
    concept_map = load_model('wrknlt')[1]
    resource_vector = dok_matrix((1, len(concept_map["index2concept"])),
                                 dtype=np.float32)
    not_found = 0
    for r in resource_wikifier:
        try:
            ind = concept_map["concept2index"][r["url"]]
            resource_vector[0, ind] = r['pageRank']
        except KeyError:
            # Concept unknown to the trained map: ignore it.
            not_found += 1
    return resource_vector
def generate_possible(sequence, resource_wikifiers):
    """For each consecutive pair in *sequence*, generate the candidate
    resources found inside the wikifier-space "balls" around both endpoints.

    sequence : ordered list of resource ids (keys of *resource_wikifiers*).
    resource_wikifiers : dict mapping resource id -> wikifier concept list
        (the format accepted by wikifier_to_vector).
    Returns one candidate collection per consecutive pair.
    """
    # Distance between each consecutive pair of resources in the sequence.
    dist_to_next = [distance_wikifiers(resource_wikifiers[sequence[i]], resource_wikifiers[sequence[i+1]]) for i in range(len(sequence)-1)]
    # Ball radius per position: endpoints use their single adjacent distance,
    # inner positions use the larger of their two adjacent distances.
    dist_balls = [dist_to_next[i] if i == 0 else (max(dist_to_next[i],dist_to_next[i-1]) if i < len(sequence)-1 else dist_to_next[i-1]) for i in range(len(sequence))]
    # Sparse wikifier vector for every resource we know about.
    vectors = {c_id:wikifier_to_vector(wk) for c_id,wk in resource_wikifiers.items()}
    model, concept_map, ids = load_model('wrknlt')
    # All model neighbours within each position's radius.
    balls = [model.radius_neighbors([vectors[sequence[i]]], dist_balls[i]) for i in range(len(sequence))]
    # Combine the balls of every consecutive pair into candidate sets.
    return [generate_possible_of_two_balls(balls[i],balls[i+1], sequence, ids["index2id"]) for i in range(len(sequence)-1)]
def get_resource_difficulty(resource_texts):
    """Estimate a difficulty ("technicity") score for each input text.

    Computes tf-idf dicts for all texts with the [1-2]-gram model, then maps
    each one through ``tfidf2technicity``. Returns one record per text with
    the original text and its score.
    """
    tfidf_model = load_model('tfmllt[1-2]-grams')
    tfidf_per_text = interpolates(resource_texts, tfidf_model,
                                  return_format="dict")
    results = []
    for text, tfidf in zip(resource_texts, tfidf_per_text):
        results.append({
            "resource_text": text,
            "value": tfidf2technicity(tfidf),
        })
    return results
def reordonize(modeltype, vec1, vec2):
    """Predict the ordering of two resource vectors with the chosen model.

    modeltype "cw2order" uses the cwikifier-vector heuristic; "rnn2order"
    loads the RNN ordering model and applies it to cdoc2vec vectors.
    """
    if modeltype == "cw2order":
        # based on cwikifier vectors
        return rzecw2order(vec1, vec2)
    if modeltype == "rnn2order":
        # based on cdoc2vec vectors
        return rzernn2order(load_model('rrmllt'), vec1, vec2)
    # NOTE(review): any other modeltype silently yields None — confirm callers
    # expect that rather than an explicit error.
    return None
def __knn_doc2vec(resource_vector, n_neighbors=20, return_reduction=False, return_vectors=False, return_dist=False, return_matrix=False, reduction_type="PCA", remove_duplicates=False, resource_id=None):
    """KNN over doc2vec vectors, with optional distance matrix / reduction.

    resource_vector : the query doc2vec vector.
    n_neighbors : number of neighbours requested from the KNN model.
    return_reduction / return_vectors / return_dist / return_matrix : toggle
        the optional fields of the returned dict.
    reduction_type : passed to dimension_reduction (e.g. "PCA").
    remove_duplicates : drop near-identical neighbours via filter_knn.
    resource_id : id of the query resource; used to keep the resource itself
        when duplicate filtering is active.
    """
    knnmodel = load_model('dcknlt')
    model = knnmodel[0]
    # model.set_params(n_jobs=1)
    # print(model.get_params())
    ids = knnmodel[1]
    dists, neigh_index = model.kneighbors([resource_vector], return_distance=True, n_neighbors=int(n_neighbors))
    dists = dists.tolist()[0]
    neigh_ids = [ids["index2id"][i] for i in neigh_index[0]]
    if resource_id is not None and remove_duplicates:
        # Keep neighbours farther than epsilon, plus the query resource itself.
        dists, neigh_ids = zip(*((d, oerid) for d, oerid in zip(dists, neigh_ids) if d > 1e-4 or oerid == resource_id))
    if return_vectors or return_matrix or return_reduction or remove_duplicates:
        # Query vector first, then the neighbour vectors fetched from the DB.
        vectors = [resource_vector]
        gen = get_experimental_features(neigh_ids, [experiment_id], order_needed=False)
        dvect = {d["id"]: d["result"]["value"] for d in gen}
        for rid in neigh_ids:
            # To avoid the missing vectors in db but found in models files
            if rid in dvect:
                vectors.append(dvect[rid])
        # print("len vector", len(vectors))
        if return_matrix or return_reduction or remove_duplicates:
            matrix = pairwise_distances(vectors, metric="cosine")
        if remove_duplicates:
            matrix, vectors, dists, neigh_ids = filter_knn(matrix, vectors, dists, neigh_ids)
        if return_reduction:
            # Project onto at most len(vectors) - 1 components.
            reductor = dimension_reduction(reduction_type, len(vectors) - 1)
            matrix_projected = reductor.fit_transform(vectors)
    # print(len(neigh_ids))
    # The conditional expressions below only touch matrix / matrix_projected /
    # reductor when the corresponding flag is set, so they are defined when read.
    return {
        "top": None,
        "neighbors": neigh_ids,
        "distances": dists if return_dist else None,
        "vectors": vectors if return_vectors else None,
        "matrix": matrix.tolist() if return_matrix else None,
        "projected_matrix": matrix_projected.tolist() if return_reduction else None,
        "variance_ratio_": reductor.explained_variance_ratio_ if return_reduction and\
            (reduction_type=="PCA" or\
             reduction_type=="TruncatedSVD" or\
             reduction_type=="SparsePCA"
             )else None
    }
def continuous_doc2vec_model_update_DB(resume: bool = __DEFAULT_RESUME_SETTING, exp_id: int = __DEFAULT_EXPID_SETTING):
    """Compute continuous-doc2vec vectors for all resources and store them.

    resume : when True, only resources not yet computed for *exp_id* are
        processed, and inserts are non-updating; when False, results overwrite
        existing rows (update=True).
    exp_id : experiment id the results are recorded under.

    Results are flushed to the DB in batches of 1000 records.
    """
    model = load_model('ccmllt')
    lids = list(get_all_resource_ids())
    if resume:
        # Restrict to the ids that have no computed result yet.
        lids_computed = list(get_all_computed_resource_ids(exp_id))
        print(f"We are talking about global nbr of resources: {len(lids)}")
        print(
            f"We are talking about nbr of computed resources: {len(lids_computed)}"
        )
        lids = list(set(lids) - set(lids_computed))
        print(
            f"We are talking about nbr of tobe_computed resources: {len(lids)}"
        )
        print("Some ids samples from DB that will be computed:")
        print(lids[0:100])
    # lids = lids[0:1002]
    chunk = 0        # records accumulated since the last DB flush
    records = {}     # rid -> result payload for the current batch
    batch_size = 1000
    for text, rid in ((t["content_raw"], t['id'])
                      for t in tqdm.tqdm(get_experimental_contents(
                          lids, order_needed=False, return_content_raw=True),
                          total=len(lids),
                          desc="continuousdoc2vec done")):
        try:
            if rid in model[0]:
                # Vector already known to the trained model: recover by id.
                records[rid] = {
                    'value': recover_vectors(rid, model),
                    'interpolate': False
                }
            else:
                # Unknown resource: infer the vector from its raw text.
                records[rid] = {
                    'value': recover_vectors(text, model),
                    'interpolate': True
                }
        except Exception as error:
            print("ErrorFATAL:", rid)
            print(error)
            # NOTE(review): this error record is never inserted because the
            # exception is re-raised immediately below — confirm fail-fast is
            # the intended policy here.
            records[rid] = {"value": {"error": str(error)}}
            raise error
        chunk += 1
        if chunk == batch_size:
            # Flush a full batch and start a new one.
            print("One part submitted to DB:")
            print(records.keys())
            insert_experiment_result(exp_id, records.items(), update=not resume)
            chunk = 0
            records = {}
    if chunk > 0 and chunk < batch_size:
        # Flush the trailing partial batch.
        print("Last part submitted to DB:")
        print(records.keys())
        insert_experiment_result(exp_id, records.items(), update=not resume)
def get_knn_one_resource(resource_wikifier, n_neighbors, forbidden_ids=None):
    """Given a resource wikifier (list of concepts), return the ids of the
    nearest neighbors.

    resource_wikifier : list of {"url": ..., "pageRank": ...} concept dicts.
    n_neighbors : number of neighbours requested from the KNN model.
    forbidden_ids : optional iterable of resource ids to exclude from the
        result (defaults to none excluded).

    Near-zero-distance neighbours (the resource itself / exact duplicates)
    are filtered out. Returns a (possibly empty) tuple of ids.
    """
    # FIX: `forbidden_ids=[]` was a mutable default argument; also use a set
    # for O(1) membership tests instead of scanning a list per neighbour.
    forbidden = set(forbidden_ids) if forbidden_ids else set()
    model, concept_map, ids = load_model('wrknlt')
    resource_vector = dok_matrix((1, len(concept_map["index2concept"])),
                                 dtype=np.float32)
    not_found = 0
    for concept in resource_wikifier:
        try:
            ind = concept_map["concept2index"][concept["url"]]
            resource_vector[0, ind] = concept["pageRank"]
        except KeyError:
            # Concept unknown to the trained map: ignore it.
            not_found += 1
    dists, _neigh_index = model.kneighbors(resource_vector,
                                           n_neighbors=n_neighbors,
                                           return_distance=True)
    dists = dists.tolist()[0]
    neigh_ids = [ids["index2id"][i] for i in _neigh_index[0]]
    kept = [(d, oerid) for d, oerid in zip(dists, neigh_ids)
            if d > 1e-4 and oerid not in forbidden]
    # ROBUSTNESS: the original `zip(*generator)` raised ValueError when every
    # neighbour was filtered out; return an empty tuple instead.
    if not kept:
        return ()
    _dists, neigh_ids = zip(*kept)
    return neigh_ids
def knn_doc2vec_text(resource_text, **kwargs):
    """Infer a doc2vec vector for *resource_text* and run the doc2vec KNN.

    Extra keyword arguments are forwarded unchanged to __knn_doc2vec.
    """
    doc2vec_model = load_model('ccmllt')
    query_vector = __interpolate(resource_text, model=doc2vec_model)
    return __knn_doc2vec(resource_vector=query_vector, **kwargs)
def __knn_tfidf(resource_tfidf, n_neighbors=20, return_reduction=False, return_vectors=False, return_dist=False, return_matrix=False, reduction_type="TruncatedSVD", remove_duplicates=False, resource_id=None):
    """KNN over tf-idf vectors, with optional distance matrix / reduction.

    resource_tfidf : dict keyword -> tf-idf weight for the query resource.
    n_neighbors : number of neighbours requested from the KNN model.
    return_reduction / return_vectors / return_dist / return_matrix : toggle
        the optional fields of the returned dict.
    reduction_type : passed to dimension_reduction (e.g. "TruncatedSVD").
    remove_duplicates : drop near-identical neighbours via filter_knn.
    resource_id : id of the query resource; used to keep the resource itself
        when duplicate filtering is active.

    Returns a dict with neighbours, the count of unknown query keywords, and
    the optional distances / vectors / matrices / top component keywords.
    """
    # CLEANUP: load the model tuple once instead of three load_model calls.
    model, ids, keyword_map = load_model('tfknlt[1-2]-grams')
    top = None
    # Sparse query vector in the model's keyword space.
    resource_vector = dok_matrix((1, len(keyword_map["index2keyword"])),
                                 dtype=np.float32)
    keyword_not_found = 0
    for k, v in resource_tfidf.items():
        try:
            ind = keyword_map["keyword2index"][k]
            resource_vector[0, ind] = v
        except KeyError:
            # Keyword unknown to the trained map: count and skip.
            keyword_not_found += 1
    dists, neigh_index = model.kneighbors(resource_vector,
                                          n_neighbors=n_neighbors,
                                          return_distance=True)
    dists = dists.tolist()[0]
    neigh_ids = [ids["index2id"][i] for i in neigh_index[0]]
    if resource_id is not None and remove_duplicates:
        # Keep neighbours farther than epsilon, plus the query resource itself.
        dists, neigh_ids = zip(*((d, oerid)
                                 for d, oerid in zip(dists, neigh_ids)
                                 if d > 1e-4 or oerid == resource_id))
    if return_vectors or return_matrix or return_reduction or remove_duplicates:
        # Rebuild the neighbours' sparse tf-idf vectors from the DB.
        gen = get_experimental_features(neigh_ids, [experiment_id],
                                        order_needed=False)
        neighbors_tfidf = {d["id"]: d["result"]["value"] for d in gen}
        neighbors_vector = dok_matrix(
            (len(neighbors_tfidf), len(keyword_map["index2keyword"])),
            dtype=np.float32)
        for i, rid in enumerate(neigh_ids):
            for k, v in neighbors_tfidf[rid].items():
                ind = keyword_map["keyword2index"][k]
                neighbors_vector[i, ind] = v
        # Query vector stacked on top of the neighbour vectors.
        vectors = vstack((resource_vector, neighbors_vector))
        if return_matrix or return_reduction or remove_duplicates:
            matrix = pairwise_distances(vectors, metric="cosine")
        if remove_duplicates:
            matrix, vectors, dists, neigh_ids = filter_knn(matrix, vectors,
                                                           dists, neigh_ids)
        if return_reduction:
            reductor = dimension_reduction(reduction_type, vectors.shape[0] - 1)
            matrix_projected = reductor.fit_transform(vectors)
            # CLEANUP: removed leftover debug print of components_.shape.
            # For each component: (keyword, weight) pairs, heaviest first,
            # zero weights dropped.
            top = []
            for i in range(reductor.components_.shape[0]):
                ranked = sorted(enumerate(reductor.components_[i].tolist()),
                                key=lambda x: x[1], reverse=True)
                named = ((keyword_map["index2keyword"][j], w)
                         for j, w in ranked)
                top.append([pair for pair in named if pair[1] != 0])
        if return_vectors:
            # JSON-friendly form: one keyword->weight dict per row.
            json_vectors = [{} for _ in range(vectors.shape[0])]
            for (row, col), v in vectors.todok().items():
                json_vectors[row][keyword_map["index2keyword"][col]] = float(v)
    return {"neighbors": neigh_ids,
            "keyword_not_found": keyword_not_found,
            "distances": dists if return_dist else None,
            "vectors": json_vectors if return_vectors else None,
            "matrix": matrix.tolist() if return_matrix else None,
            "projected_matrix": matrix_projected.tolist() if return_reduction else None,
            "top": top,
            "variance_ratio_": reductor.explained_variance_ratio_
            if return_reduction and (reduction_type == "PCA"
                                     or reduction_type == "TruncatedSVD"
                                     or reduction_type == "SparsePCA")
            else None}
def knn_tfidf_text(resource_text, **kwargs):
    """Compute the tf-idf dict of *resource_text* and run the tf-idf KNN.

    Extra keyword arguments are forwarded unchanged to __knn_tfidf.
    """
    tfidf_model = load_model('tfmllt[1-2]-grams')
    query_tfidf = interpolates([resource_text], tfidf_model,
                               return_format="dict")[0]
    return __knn_tfidf(resource_tfidf=query_tfidf, **kwargs)
def knn_doc2vec_list(res_ids, n_neighbors=20, return_reduction=False, return_vectors=False, return_dist=False, return_matrix=False, reduction_type="PCA", remove_duplicates=False, resource_id=None):
    """Batch variant of the doc2vec KNN: one neighbour list per input id.

    res_ids : resource ids whose stored doc2vec vectors are fetched and
        queried; ids with no stored vector are silently dropped (input order
        is preserved otherwise).
    Other parameters mirror __knn_doc2vec, applied per query.
    """
    # assume not rendering resources with unfound knns: just keep the order
    nd_vectors = {
        d["id"]: d["result"]["value"]
        for d in get_experimental_features(res_ids, [experiment_id],
                                           order_needed=False)
    }
    known = [(x, nd_vectors[x]) for x in res_ids if x in nd_vectors]
    # ROBUSTNESS: `zip(*...)` on an empty sequence raised; return an empty
    # payload instead when no input id has a stored vector.
    if not known:
        return {
            "top": None,
            "neighbors": [],
            "distances": [] if return_dist else None,
            "vectors": [] if return_vectors else None,
            "matrix": [] if return_matrix else None,
            "projected_matrix": [] if return_reduction else None,
            "variance_ratio_": None,
        }
    res_ids, res_vectors = (list(t) for t in zip(*known))
    knnmodel = load_model('dcknlt')
    model = knnmodel[0]
    ids = knnmodel[1]
    dists, neigh_index = model.kneighbors(res_vectors, return_distance=True,
                                          n_neighbors=int(n_neighbors))
    dists = dists.tolist()
    neigh_ids = [[ids["index2id"][i] for i in neigix] for neigix in neigh_index]
    vectors, matrix, matrix_projected = [], [], []
    if return_vectors or return_matrix or return_reduction or remove_duplicates:
        for ix, vecrq in enumerate(neigh_ids):
            gen = get_experimental_features(vecrq, [experiment_id],
                                            order_needed=False)
            dvect = {d["id"]: d["result"]["value"] for d in gen}
            # Skip vectors present in the model files but missing from the DB.
            ngvects = [dvect[rid] for rid in vecrq if rid in dvect]
            vectors.append(ngvects)
            if return_matrix or return_reduction or remove_duplicates:
                matrix.append(pairwise_distances(ngvects, metric="cosine"))
            if remove_duplicates:
                # BUG FIX: the original wrapped this in `for j in dists[ix]:`
                # with an unused loop variable, calling filter_knn once per
                # distance value; a single call filters the neighbourhood.
                matrix[ix], vectors[ix], dists[ix], neigh_ids[ix] = filter_knn(
                    matrix[ix], vectors[ix], dists[ix], neigh_ids[ix])
            if return_reduction:
                reductor = dimension_reduction(reduction_type, len(ngvects) - 1)
                matrix_projected.append(reductor.fit_transform(ngvects))
    return {
        "top": None,
        "neighbors": neigh_ids,
        "distances": dists if return_dist else None,
        "vectors": vectors if return_vectors else None,
        # BUG FIX: matrix / matrix_projected are Python lists of per-query
        # ndarrays; the original called .tolist() on the lists themselves,
        # which raises AttributeError. Convert element-wise instead.
        "matrix": [m.tolist() for m in matrix] if return_matrix else None,
        "projected_matrix": [p.tolist() for p in matrix_projected]
        if return_reduction else None,
        # NOTE(review): only the LAST query's reductor ratio is reported —
        # confirm whether a per-query list was intended.
        "variance_ratio_": reductor.explained_variance_ratio_
        if return_reduction and (reduction_type == "PCA"
                                 or reduction_type == "TruncatedSVD"
                                 or reduction_type == "SparsePCA")
        else None,
    }