Пример #1
0
def __knn_continuousDoc2vec(part_vector,
                            n_neighbors=20,
                            return_vectors=False,
                            return_dist=False,
                            resource_id=None):
    """Find the resource parts closest to ``part_vector``.

    Args:
        part_vector: query vector handed to the knn model's ``kneighbors``.
        n_neighbors: number of neighbours to request (coerced to ``int``).
        return_vectors: when true, also fetch the vectors of the neighbours.
        return_dist: when true, include the neighbour distances.
        resource_id: not used by this function.

    Returns:
        A dict with the keys:
        ``"neighbors"``: mapping resource id -> list of neighbouring part ids;
        ``"distances"``: list of distances, or ``None`` unless ``return_dist``;
        ``"vectors"``: ``[part_vector, *neighbour vectors]``, or ``None``
        unless ``return_vectors``.
    """
    # Load the model bundle once: item 0 is the knn model, item 1 the id maps.
    knnmodel = load_model('ccknlt')
    model = knnmodel[0]
    ids = knnmodel[1]
    dists, neigh_index = model.kneighbors([part_vector],
                                          return_distance=True,
                                          n_neighbors=int(n_neighbors))
    dists = dists.tolist()[0]

    # Model ids are split on "p" into (resource id, part id); group the
    # part ids by the resource they belong to.
    neigh_ids = {}
    for i in neigh_index[0]:
        rid, part_id = ids["index2id"][i].split("p")
        neigh_ids.setdefault(rid, []).append(part_id)

    vectors = None
    if return_vectors:
        vectors = [part_vector]
        gen = get_experimental_features(list(neigh_ids), [experiment_id],
                                        order_needed=False)
        dvect = {}
        for d in gen:
            # The grouping keys are strings (they come from ``str.split``),
            # so normalise the id before the lookup.
            rid = str(d["id"])
            for part_id in neigh_ids.get(rid, []):
                # NOTE(review): part_id is a string here; this presumes the
                # stored value is keyed by string part ids - confirm.
                dvect[f"{rid}p{part_id}"] = d["result"]["value"][part_id]
        for rid, part_ids in neigh_ids.items():
            for part_id in part_ids:
                key = f"{rid}p{part_id}"
                # Skip parts the model knows about but the feature fetch
                # did not return.
                if key in dvect:
                    vectors.append(dvect[key])
    return {
        "neighbors": neigh_ids,
        "distances": dists if return_dist else None,
        "vectors": vectors,
    }
Пример #2
0
def __preprocess(text, phrase="x5gon", dfxp2text=False, **kwargs):
    """Optionally strip tags and merge phrases, then run ``preprocess``.

    Args:
        text: raw text to process.
        phrase: ``"x5gon"`` or ``"wikipedia"`` selects the pair of phrase
            models passed to ``text2phrase``; any other value skips that step.
        dfxp2text: when true, ``remove_tags`` is applied to the text first.
        **kwargs: forwarded unchanged to ``preprocess``.

    Returns:
        Whatever ``preprocess`` returns for the prepared text.
    """
    prepared = remove_tags(text) if dfxp2text else text
    if phrase == "x5gon":
        phrasers = [load_model("teanlt2r"), load_model("teanlt3r")]
        prepared = text2phrase(prepared, phrasers)
    elif phrase == "wikipedia":
        phrasers = [load_model("teaalt2r"), load_model("teaalt3r")]
        prepared = text2phrase(prepared, phrasers)
    return preprocess(prepared, **kwargs)
Пример #3
0
def wikifier_to_vector(resource_wikifier):
    """Given a list of concepts with pageRanks, return a vector representing the wikifier.

    Args:
        resource_wikifier: iterable of mappings with the keys ``"url"`` and
            ``"pageRank"``.

    Returns:
        A ``(1, number of known concepts)`` float32 ``dok_matrix`` whose
        column for each known concept url holds that concept's pageRank.
    """
    # Only the concept map (item 1 of the model bundle) is needed here, so
    # load the bundle a single time.
    concept_map = load_model('wrknlt')[1]
    concept2index = concept_map["concept2index"]
    resource_vector = dok_matrix((1, len(concept_map["index2concept"])),
                                 dtype=np.float32)
    for r in resource_wikifier:
        try:
            ind = concept2index[r["url"]]
            resource_vector[0, ind] = r['pageRank']
        except KeyError:
            # Concepts unknown to the model (or entries lacking a key) do
            # not contribute to the vector.
            continue
    return resource_vector
Пример #4
0
def generate_possible(sequence, resource_wikifiers):
    """Build, for each consecutive pair in ``sequence``, the candidates
    returned by ``generate_possible_of_two_balls``.

    Args:
        sequence: ordered ids; each must be a key of ``resource_wikifiers``.
        resource_wikifiers: mapping id -> wikifier (list of concepts).

    Returns:
        A list with one entry per consecutive pair of ``sequence``.
    """
    length = len(sequence)

    # Distance between each element and its successor.
    gaps = []
    for pos in range(length - 1):
        current = resource_wikifiers[sequence[pos]]
        following = resource_wikifiers[sequence[pos + 1]]
        gaps.append(distance_wikifiers(current, following))

    # Radius of the ball around each element: the larger of the distances
    # to its two neighbours in the sequence (the ends have only one).
    radii = []
    for pos in range(length):
        if pos == 0:
            radius = gaps[pos]
        elif pos < length - 1:
            radius = max(gaps[pos], gaps[pos - 1])
        else:
            radius = gaps[pos - 1]
        radii.append(radius)

    vectors = {}
    for c_id, wk in resource_wikifiers.items():
        vectors[c_id] = wikifier_to_vector(wk)

    model, _, ids = load_model('wrknlt')

    balls = []
    for pos in range(length):
        balls.append(model.radius_neighbors([vectors[sequence[pos]]],
                                            radii[pos]))

    possibilities = []
    for pos in range(length - 1):
        possibilities.append(
            generate_possible_of_two_balls(balls[pos], balls[pos + 1],
                                           sequence, ids["index2id"]))
    return possibilities
Пример #5
0
def get_resource_difficulty(resource_texts):
    """Score each text with ``tfidf2technicity``.

    Args:
        resource_texts: sequence of texts to score.

    Returns:
        A list of ``{"resource_text": text, "value": score}`` dicts, one per
        (text, tf-idf) pair.
    """
    tfidf_model = load_model('tfmllt[1-2]-grams')
    tfidfs = interpolates(resource_texts, tfidf_model, return_format="dict")
    difficulties = []
    for text, tfidf in zip(resource_texts, tfidfs):
        difficulties.append({
            "resource_text": text,
            "value": tfidf2technicity(tfidf),
        })
    return difficulties
Пример #6
0
def reordonize(modeltype, vec1, vec2):
    """Dispatch an ordering request to the implementation named by ``modeltype``.

    Args:
        modeltype: ``"cw2order"`` or ``"rnn2order"``.
        vec1: first vector, passed through to the chosen implementation.
        vec2: second vector, passed through to the chosen implementation.

    Returns:
        The result of the chosen implementation, or ``None`` when
        ``modeltype`` is not one of the two supported names.
    """
    if modeltype == "cw2order":
        # based on cwikifier vectors
        return rzecw2order(vec1, vec2)
    if modeltype == "rnn2order":
        # based on cdoc2vec vectors
        order_model = load_model('rrmllt')
        return rzernn2order(order_model, vec1, vec2)
    return None
Пример #7
0
def __knn_doc2vec(resource_vector,
                  n_neighbors=20,
                  return_reduction=False,
                  return_vectors=False,
                  return_dist=False,
                  return_matrix=False,
                  reduction_type="PCA",
                  remove_duplicates=False,
                  resource_id=None):
    """Find the nearest neighbours of a doc2vec vector.

    Args:
        resource_vector: query vector handed to the knn model.
        n_neighbors: number of neighbours to request (coerced to ``int``).
        return_reduction: when true, project the vectors with
            ``dimension_reduction(reduction_type, ...)``.
        return_vectors: when true, return the query and neighbour vectors.
        return_dist: when true, return the neighbour distances.
        return_matrix: when true, return the pairwise cosine distance matrix.
        reduction_type: name passed to ``dimension_reduction``.
        remove_duplicates: when true, run ``filter_knn`` over the results.
        resource_id: id of the queried resource; together with
            ``remove_duplicates`` it enables the near-zero-distance filter.

    Returns:
        A dict with the keys ``"top"`` (always ``None``), ``"neighbors"``,
        ``"distances"``, ``"vectors"``, ``"matrix"``, ``"projected_matrix"``
        and ``"variance_ratio_"``; entries that were not requested are
        ``None``.
    """
    knnmodel = load_model('dcknlt')
    model = knnmodel[0]
    ids = knnmodel[1]
    dists, neigh_index = model.kneighbors([resource_vector],
                                          return_distance=True,
                                          n_neighbors=int(n_neighbors))
    dists = dists.tolist()[0]
    neigh_ids = [ids["index2id"][i] for i in neigh_index[0]]

    if resource_id is not None and remove_duplicates:
        # Keep a neighbour when it is farther than 1e-4 from the query, or
        # when it is the queried resource itself.
        kept = [(d, oerid) for d, oerid in zip(dists, neigh_ids)
                if d > 1e-4 or oerid == resource_id]
        # ``zip(*[])`` yields nothing to unpack, so the case where every
        # neighbour is filtered out must be handled explicitly.
        dists, neigh_ids = zip(*kept) if kept else ((), ())

    need_matrix = return_matrix or return_reduction or remove_duplicates
    if return_vectors or need_matrix:
        vectors = [resource_vector]
        gen = get_experimental_features(neigh_ids, [experiment_id],
                                        order_needed=False)
        dvect = {d["id"]: d["result"]["value"] for d in gen}
        for rid in neigh_ids:
            # To avoid the missing vectors in db but found in models files
            # NOTE(review): a skipped id stays in neigh_ids/dists, so these
            # can become longer than vectors[1:] - confirm filter_knn copes.
            if rid in dvect:
                vectors.append(dvect[rid])
    if need_matrix:
        matrix = pairwise_distances(vectors, metric="cosine")
    if remove_duplicates:
        matrix, vectors, dists, neigh_ids = filter_knn(matrix, vectors, dists,
                                                       neigh_ids)
    if return_reduction:
        reductor = dimension_reduction(reduction_type, len(vectors) - 1)
        matrix_projected = reductor.fit_transform(vectors)

    # Only these reduction types are asked for an explained variance ratio.
    with_variance = return_reduction and reduction_type in (
        "PCA", "TruncatedSVD", "SparsePCA")
    return {
        "top": None,
        "neighbors": neigh_ids,
        "distances": dists if return_dist else None,
        "vectors": vectors if return_vectors else None,
        "matrix": matrix.tolist() if return_matrix else None,
        "projected_matrix": (matrix_projected.tolist()
                             if return_reduction else None),
        "variance_ratio_": (reductor.explained_variance_ratio_
                            if with_variance else None),
    }
Пример #8
0
def continuous_doc2vec_model_update_DB(resume: bool = __DEFAULT_RESUME_SETTING,
                                       exp_id: int = __DEFAULT_EXPID_SETTING):
    """Compute a vector record per resource and store the records as results
    of experiment ``exp_id``, 1000 resources at a time.

    Args:
        resume: when true, resources that already have a result for
            ``exp_id`` are skipped and results are inserted with
            ``update=False``; otherwise every resource is processed and
            inserted with ``update=True``.
        exp_id: experiment id the results are stored under.

    Raises:
        Exception: any error raised while computing a resource's vectors is
            printed and then re-raised, aborting the run.
    """
    model = load_model('ccmllt')
    lids = list(get_all_resource_ids())
    if resume:
        lids_computed = list(get_all_computed_resource_ids(exp_id))
        print(f"We are talking about global nbr of resources: {len(lids)}")
        print(
            f"We are talking about nbr of computed resources: {len(lids_computed)}"
        )
        lids = list(set(lids) - set(lids_computed))
        print(
            f"We are talking about nbr of tobe_computed resources: {len(lids)}"
        )
    print("Some ids samples from DB that will be computed:")
    print(lids[0:100])

    batch_size = 1000
    pending = 0
    records = {}
    contents = get_experimental_contents(lids,
                                         order_needed=False,
                                         return_content_raw=True)
    progress = tqdm.tqdm(contents,
                         total=len(lids),
                         desc="continuousdoc2vec done")
    for item in progress:
        text, rid = item["content_raw"], item['id']
        try:
            # Ids found in model[0] are recovered by id; everything else
            # is recovered from the raw text and flagged as interpolated.
            known = rid in model[0]
            source = rid if known else text
            records[rid] = {
                'value': recover_vectors(source, model),
                'interpolate': not known
            }
        except Exception as error:
            print("ErrorFATAL:", rid)
            print(error)
            records[rid] = {"value": {"error": str(error)}}
            raise error
        pending += 1
        if pending == batch_size:
            print("One part submitted to DB:")
            print(records.keys())
            insert_experiment_result(exp_id,
                                     records.items(),
                                     update=not resume)
            pending = 0
            records = {}
    # Flush whatever is left over from the last, incomplete batch.
    if 0 < pending < batch_size:
        print("Last part submitted to DB:")
        print(records.keys())
        insert_experiment_result(exp_id, records.items(), update=not resume)
Пример #9
0
def get_knn_one_resource(resource_wikifier, n_neighbors, forbidden_ids=()):
    """Given a resource wikifier (list of concepts), return the ids of the nearest neighbors.

    Args:
        resource_wikifier: iterable of mappings with the keys ``"url"`` and
            ``"pageRank"``.
        n_neighbors: number of neighbours requested from the knn model.
        forbidden_ids: container of ids that must not appear in the result.

    Returns:
        A tuple of neighbour ids whose distance to the query is greater than
        1e-4 and which are not in ``forbidden_ids``; empty when no neighbour
        qualifies.
    """
    model, concept_map, ids = load_model('wrknlt')
    concept2index = concept_map["concept2index"]
    resource_vector = dok_matrix((1, len(concept_map["index2concept"])),
                                 dtype=np.float32)
    for concept in resource_wikifier:
        try:
            ind = concept2index[concept["url"]]
            resource_vector[0, ind] = concept["pageRank"]
        except KeyError:
            # Concepts unknown to the model do not contribute to the vector.
            continue
    dists, _neigh_index = model.kneighbors(resource_vector,
                                           n_neighbors=n_neighbors,
                                           return_distance=True)
    dists = dists.tolist()[0]
    neigh_ids = [ids["index2id"][i] for i in _neigh_index[0]]
    # Build the result directly rather than via ``zip(*...)``, which cannot
    # be unpacked when every neighbour is filtered out.
    return tuple(oerid for d, oerid in zip(dists, neigh_ids)
                 if d > 1e-4 and oerid not in forbidden_ids)
Пример #10
0
def knn_doc2vec_text(resource_text, **kwargs):
    """Turn ``resource_text`` into a vector and run ``__knn_doc2vec`` on it.

    Args:
        resource_text: text passed to ``__interpolate``.
        **kwargs: forwarded unchanged to ``__knn_doc2vec``.

    Returns:
        The result of ``__knn_doc2vec`` for the computed vector.
    """
    doc2vec_model = load_model('ccmllt')
    text_vector = __interpolate(resource_text, model=doc2vec_model)
    return __knn_doc2vec(resource_vector=text_vector, **kwargs)
Пример #11
0
def __knn_tfidf(resource_tfidf,
                n_neighbors=20,
                return_reduction=False,
                return_vectors=False,
                return_dist=False,
                return_matrix=False,
                reduction_type="TruncatedSVD",
                remove_duplicates=False,
                resource_id=None):
    """Find the nearest neighbours of a tf-idf keyword mapping.

    Args:
        resource_tfidf: mapping keyword -> tf-idf weight of the query.
        n_neighbors: number of neighbours to request (coerced to ``int``).
        return_reduction: when true, project the vectors with
            ``dimension_reduction(reduction_type, ...)`` and fill ``"top"``.
        return_vectors: when true, return the vectors as keyword dicts.
        return_dist: when true, return the neighbour distances.
        return_matrix: when true, return the pairwise cosine distance matrix.
        reduction_type: name passed to ``dimension_reduction``.
        remove_duplicates: when true, run ``filter_knn`` over the results.
        resource_id: id of the queried resource; together with
            ``remove_duplicates`` it enables the near-zero-distance filter.

    Returns:
        A dict with the keys ``"neighbors"``, ``"keyword_not_found"``,
        ``"distances"``, ``"vectors"``, ``"matrix"``, ``"projected_matrix"``,
        ``"top"`` and ``"variance_ratio_"``; entries that were not requested
        are ``None``.
    """
    # Load the bundle once: knn model, id maps and keyword maps.
    knnmodel = load_model('tfknlt[1-2]-grams')
    model = knnmodel[0]
    ids = knnmodel[1]
    keyword_map = knnmodel[2]
    index2keyword = keyword_map["index2keyword"]
    keyword2index = keyword_map["keyword2index"]
    n_keywords = len(index2keyword)

    top = None
    resource_vector = dok_matrix((1, n_keywords), dtype=np.float32)
    keyword_not_found = 0
    for k, v in resource_tfidf.items():
        try:
            ind = keyword2index[k]
        except KeyError:
            keyword_not_found += 1
        else:
            resource_vector[0, ind] = v
    dists, neigh_index = model.kneighbors(resource_vector,
                                          n_neighbors=int(n_neighbors),
                                          return_distance=True)
    dists = dists.tolist()[0]
    neigh_ids = [ids["index2id"][i] for i in neigh_index[0]]

    if resource_id is not None and remove_duplicates:
        # Keep a neighbour when it is farther than 1e-4 from the query, or
        # when it is the queried resource itself.
        kept = [(d, oerid) for d, oerid in zip(dists, neigh_ids)
                if d > 1e-4 or oerid == resource_id]
        # ``zip(*[])`` yields nothing to unpack, so the case where every
        # neighbour is filtered out must be handled explicitly.
        dists, neigh_ids = zip(*kept) if kept else ((), ())

    need_matrix = return_matrix or return_reduction or remove_duplicates
    if return_vectors or need_matrix:
        gen = get_experimental_features(neigh_ids, [experiment_id],
                                        order_needed=False)
        neighbors_tfidf = {d["id"]: d["result"]["value"] for d in gen}
        # One row per neighbour, in neigh_ids order, so rows stay aligned
        # with neigh_ids/dists. A neighbour whose features were not
        # returned keeps an all-zero row instead of raising KeyError.
        neighbors_vector = dok_matrix((len(neigh_ids), n_keywords),
                                      dtype=np.float32)
        for i, rid in enumerate(neigh_ids):
            for k, v in neighbors_tfidf.get(rid, {}).items():
                neighbors_vector[i, keyword2index[k]] = v
        vectors = vstack((resource_vector, neighbors_vector))
    if need_matrix:
        matrix = pairwise_distances(vectors, metric="cosine")
    if remove_duplicates:
        matrix, vectors, dists, neigh_ids = filter_knn(matrix, vectors, dists,
                                                       neigh_ids)
    if return_reduction:
        reductor = dimension_reduction(reduction_type, vectors.shape[0] - 1)
        matrix_projected = reductor.fit_transform(vectors)
        print(reductor.components_.shape)
        # For each component, list (keyword, weight) pairs by decreasing
        # weight, leaving out the zero weights.
        top = []
        for component in reductor.components_:
            ranked = sorted(enumerate(component.tolist()),
                            key=lambda x: x[1],
                            reverse=True)
            top.append([(index2keyword[ind], weight)
                        for ind, weight in ranked if weight != 0])
    if return_vectors:
        # Convert each sparse row to a {keyword: weight} dict.
        json_vectors = [{} for _ in range(vectors.shape[0])]
        for (row, col), v in vectors.todok().items():
            json_vectors[row][index2keyword[col]] = float(v)

    # Only these reduction types are asked for an explained variance ratio.
    with_variance = return_reduction and reduction_type in (
        "PCA", "TruncatedSVD", "SparsePCA")
    return {
        "neighbors": neigh_ids,
        "keyword_not_found": keyword_not_found,
        "distances": dists if return_dist else None,
        "vectors": json_vectors if return_vectors else None,
        "matrix": matrix.tolist() if return_matrix else None,
        "projected_matrix": (matrix_projected.tolist()
                             if return_reduction else None),
        "top": top,
        "variance_ratio_": (reductor.explained_variance_ratio_
                            if with_variance else None),
    }
Пример #12
0
def knn_tfidf_text(resource_text, **kwargs):
    """Compute the tf-idf of ``resource_text`` and run ``__knn_tfidf`` on it.

    Args:
        resource_text: text passed to ``interpolates``.
        **kwargs: forwarded unchanged to ``__knn_tfidf``.

    Returns:
        The result of ``__knn_tfidf`` for the computed tf-idf.
    """
    tfidf_model = load_model('tfmllt[1-2]-grams')
    # interpolates works on a batch; wrap the single text and unwrap the result.
    batch = interpolates([resource_text], tfidf_model, return_format="dict")
    text_tfidf = batch[0]
    return __knn_tfidf(resource_tfidf=text_tfidf, **kwargs)
Пример #13
0
def knn_doc2vec_list(res_ids,
                     n_neighbors=20,
                     return_reduction=False,
                     return_vectors=False,
                     return_dist=False,
                     return_matrix=False,
                     reduction_type="PCA",
                     remove_duplicates=False,
                     resource_id=None):
    """Find the nearest neighbours of several resources at once.

    Resources of ``res_ids`` without a stored vector are left out; the
    remaining ones keep their relative order, and every list in the result
    has one entry per remaining resource.

    Args:
        res_ids: ids of the resources to query.
        n_neighbors: number of neighbours to request (coerced to ``int``).
        return_reduction: when true, project each neighbourhood with
            ``dimension_reduction(reduction_type, ...)``.
        return_vectors: when true, return the neighbour vectors.
        return_dist: when true, return the neighbour distances.
        return_matrix: when true, return the pairwise cosine distance
            matrix of each neighbourhood.
        reduction_type: name passed to ``dimension_reduction``.
        remove_duplicates: when true, run ``filter_knn`` on each
            neighbourhood.
        resource_id: not used by this function.

    Returns:
        A dict with the keys ``"top"`` (always ``None``), ``"neighbors"``,
        ``"distances"``, ``"vectors"``, ``"matrix"``, ``"projected_matrix"``
        and ``"variance_ratio_"``; entries that were not requested are
        ``None``.
    """
    # assume not rendering resources with unfound knns: just keep the order
    nd_vectors = {
        d["id"]: d["result"]["value"]
        for d in get_experimental_features(res_ids, [experiment_id],
                                           order_needed=False)
    }
    res_vectors = [nd_vectors[x] for x in res_ids if x in nd_vectors]

    need_matrix = return_matrix or return_reduction or remove_duplicates
    # Only these reduction types are asked for an explained variance ratio.
    with_variance = return_reduction and reduction_type in (
        "PCA", "TruncatedSVD", "SparsePCA")

    dists = []
    neigh_ids = []
    vectors = []
    matrix = []
    matrix_projected = []
    variance_ratios = []

    # Without any stored vector there is nothing to query; the empty
    # lists above are returned as-is.
    if res_vectors:
        knnmodel = load_model('dcknlt')
        model = knnmodel[0]
        ids = knnmodel[1]
        dists, neigh_index = model.kneighbors(res_vectors,
                                              return_distance=True,
                                              n_neighbors=int(n_neighbors))
        dists = dists.tolist()
        neigh_ids = [[ids["index2id"][i] for i in neigix]
                     for neigix in neigh_index]

    if return_vectors or need_matrix:
        for ix, vecrq in enumerate(neigh_ids):
            gen = get_experimental_features(vecrq, [experiment_id],
                                            order_needed=False)
            dvect = {d["id"]: d["result"]["value"] for d in gen}
            # To avoid the missing vectors in db but found in models files
            ngvects = [dvect[rid] for rid in vecrq if rid in dvect]
            vectors.append(ngvects)
            if need_matrix:
                matrix.append(pairwise_distances(ngvects, metric="cosine"))
            if remove_duplicates:
                # NOTE(review): filter_knn is applied once per original
                # distance, as before - confirm the repetition is intended.
                for _ in dists[ix]:
                    matrix[ix], vectors[ix], dists[ix], neigh_ids[
                        ix] = filter_knn(matrix[ix], vectors[ix], dists[ix],
                                         neigh_ids[ix])
            if return_reduction:
                reductor = dimension_reduction(reduction_type,
                                               len(ngvects) - 1)
                matrix_projected.append(reductor.fit_transform(ngvects))
                if with_variance:
                    variance_ratios.append(
                        reductor.explained_variance_ratio_)

    # ``matrix`` and ``matrix_projected`` are plain lists holding one array
    # per query, so the arrays are converted one by one.
    return {
        "top": None,
        "neighbors": neigh_ids,
        "distances": dists if return_dist else None,
        "vectors": vectors if return_vectors else None,
        "matrix": [m.tolist() for m in matrix] if return_matrix else None,
        "projected_matrix": ([p.tolist() for p in matrix_projected]
                             if return_reduction else None),
        "variance_ratio_": variance_ratios if with_variance else None,
    }