Пример #1
0
 def analogy(self, x, y, z, n=10):
     """
     Answer the analogy "y is to ??? what z is to x".

     Each term is encoded with ``dpu.encode_cell`` and the query is handed
     to ``self.M.analogy`` with ``x`` and ``y`` as positive terms and ``z``
     as the negative term.

     :param x: positive term; paired with ``z`` in the known relation
     :param y: positive term; the term whose counterpart is being sought
     :param z: negative term; paired with ``x`` in the known relation
     :param n: number of candidates requested from the model; defaults to
         10, the value that was previously hard-coded
     :return: the output of ``self.M.generate_response`` converted to a
         plain list via ``.tolist()``
     """
     x = dpu.encode_cell(x)
     y = dpu.encode_cell(y)
     z = dpu.encode_cell(z)
     # x and y pull the result towards them, z pushes it away.
     indexes, metrics = self.M.analogy(pos=[x, y], neg=[z], n=n)
     return self.M.generate_response(indexes, metrics).tolist()
Пример #2
0
 def concept_qa(self, entity, relation, attribute, n=20, simf=SIMF.COSINE):
     """
     Rank the neighbours of an entity by their closeness to a column vector.

     The entity is encoded with ``dpu.encode_cell`` and its ``n`` nearest
     neighbours are fetched from ``self.M`` using the metric selected by
     ``simf``. Each neighbour is then compared with the vector stored at
     ``self.RE[relation]["columns"][attribute]`` and scored as
     ``1 - distance``.

     :param entity: raw entity value to look up
     :param relation: key into ``self.RE``
     :param attribute: key into ``self.RE[relation]["columns"]``
     :param n: number of neighbours requested from the model
     :param simf: ``SIMF.COSINE`` or ``SIMF.EUCLIDEAN``; with any other
         value no neighbour query is issued and the distance is taken as 0
     :return: list of ``(neighbour, similarity)`` tuples sorted by
         similarity, highest first
     """
     query = dpu.encode_cell(entity)
     if simf == SIMF.COSINE:
         indexes, metrics = self.M.cosine(query, n=n)
     elif simf == SIMF.EUCLIDEAN:
         indexes, metrics = self.M.euclidean(query, n=n)
     else:
         indexes, metrics = [], []
     neighbours = self.M.generate_response(indexes, metrics).tolist()
     vec_attribute = self.RE[relation]["columns"][attribute]

     def _distance_to_attribute(vector):
         # Same metric selection as the neighbour query above.
         if simf == SIMF.COSINE:
             return cosine(vector, vec_attribute)
         if simf == SIMF.EUCLIDEAN:
             return euclidean(vector, vec_attribute)
         return 0

     ranked = []
     for neighbour, _score in neighbours:
         # Original author's note: the neighbour vector needs no
         # normalization here because it is already normalized.
         neighbour_vec = self.M.get_vector(neighbour)
         ranked.append((neighbour, 1 - _distance_to_attribute(neighbour_vec)))
     ranked.sort(key=lambda pair: pair[1], reverse=True)
     return ranked
Пример #3
0
 def entity_to_attribute(self, entities, n=2, simf=SIMF.COSINE):
     """
     Pair every entity with its top-ranked columns.

     :param entities: iterable of raw entity values
     :param n: forwarded to ``self.topk_columns`` as ``k``
     :param simf: similarity function forwarded to ``self.topk_columns``
     :return: list of ``(encoded_entity, topk)`` tuples, one per input
         entity, in input order
     """
     def _rank_columns(raw_entity):
         # Encode first: the model is queried with the encoded form, and
         # the encoded form is also what ends up in the result tuple.
         key = dpu.encode_cell(raw_entity)
         vector = self.M.get_vector(key)
         return key, self.topk_columns(vector, k=n, simf=simf)

     return [_rank_columns(raw_entity) for raw_entity in entities]
Пример #4
0
 def vector_for_entity(self, cell=None, attribute=None, table=None):
     """
     Fetch the vector for a cell, a table, or a table's attribute.

     The arguments are checked in this order, and the first truthy one
     decides the lookup:

     * ``cell``: vector from ``self.M.get_vector``
     * ``table`` with ``attribute``: ``self.RE[table]["columns"][attribute]``
     * ``table`` alone: ``self.RE[table]["vector"]``
     * ``attribute`` alone: unsupported; prints a message and returns None

     All keys are encoded with ``dpu.encode_cell`` before use.

     :return: the vector found, or None when nothing was requested or the
         request is unsupported
     """
     if cell:
         cell_key = dpu.encode_cell(cell)
         return self.M.get_vector(cell_key)

     if table:
         table_key = dpu.encode_cell(table)
         if not attribute:
             return self.RE[table_key]["vector"]
         column_key = dpu.encode_cell(attribute)
         return self.RE[table_key]["columns"][column_key]

     if attribute:
         # An attribute without its table cannot be resolved yet; the
         # encode call is kept so the behaviour matches the lookup paths.
         attribute = dpu.encode_cell(attribute)
         print("Not supported yet!")
         return

     return None
Пример #5
0
def _read_columns_from_dataframe(df, columns, format="TXT"):
    """
    Yield the encoded cells of a dataframe, column by column.

    Every value is passed through ``dpu.encode_cell``; values that encode
    to the string ``'nan'`` are dropped.

    :param df: dataframe-like object indexable by column name
    :param columns: iterable of column names to read, in order
    :param format: accepted but not used by this function
    :return: generator of encoded cell values
    """
    for column_name in columns:
        for raw_value in df[column_name]:
            encoded = dpu.encode_cell(raw_value)
            # Filtering nan upstream would probably be more efficient.
            if encoded != 'nan':
                yield encoded
Пример #6
0
def _read_rows_from_dataframe(df, columns):
    """
    Yield the encoded cells of a dataframe, row by row.

    Rows come from ``df.iterrows()``; within each row the cells are visited
    in the order given by ``columns``. Every value is passed through
    ``dpu.encode_cell``, and values that encode to ``'nan'`` are dropped.

    :param df: dataframe exposing ``iterrows()``
    :param columns: iterable of column names to read from each row
    :return: generator of encoded cell values
    """
    for _, record in df.iterrows():
        for column_name in columns:
            cleaned = dpu.encode_cell(record[column_name])
            # Filtering nan upstream would probably be more efficient.
            if cleaned != 'nan':
                yield cleaned
Пример #7
0
 def topk_similar_vectors(self, input_string, k=10, simf=SIMF.COSINE):
     """
     Return the ``k`` model entries most similar to ``input_string``.

     :param input_string: raw value; encoded with ``dpu.encode_cell``
         before the model is queried
     :param k: number of results requested from the model
     :param simf: ``SIMF.COSINE`` or ``SIMF.EUCLIDEAN``; any other value
         results in empty index/metric lists being passed on
     :return: the output of ``self.M.generate_response`` converted to a
         plain list via ``.tolist()``
     """
     query = dpu.encode_cell(input_string)
     if simf == SIMF.COSINE:
         indexes, metrics = self.M.cosine(query, n=k)
     elif simf == SIMF.EUCLIDEAN:
         indexes, metrics = self.M.euclidean(query, n=k)
     else:
         indexes, metrics = [], []
     return self.M.generate_response(indexes, metrics).tolist()
Пример #8
0
def _read_rows_from_dataframe(df, columns, format="TXT"):
    """
    Yield dataframe content row by row, as cells or as whole rows.

    Every cell is passed through ``dpu.encode_cell``. What is yielded
    depends on ``format``:

    * ``"TXT"``: each encoded cell individually, skipping cells that
      encode to ``'nan'``
    * ``"CSV"``: one list per row, with ``""`` in place of ``'nan'`` cells

    Any other ``format`` value yields nothing.

    :param df: dataframe exposing ``iterrows()``
    :param columns: iterable of column names to read from each row
    :param format: ``"TXT"`` (default) or ``"CSV"``
    :return: generator of encoded cells or of row lists
    """
    for _, record in df.iterrows():
        cleaned_row = []
        for column_name in columns:
            cleaned = dpu.encode_cell(record[column_name])
            # Filtering nan upstream would probably be more efficient.
            if cleaned != 'nan':
                cleaned_row.append(cleaned)
                if format == "TXT":
                    yield cleaned
            else:
                # Keep the column position in CSV rows with a blank.
                cleaned_row.append("")
        if format == "CSV":
            yield cleaned_row
Пример #9
0
def row_avg_composition(path, we_model):
    """
    Compute one averaged vector per row of a CSV file.

    The file is read with pandas using ``latin1`` encoding. Each cell is
    encoded with ``dpu.encode_cell`` and looked up via
    ``we_model.get_vector``; lookups that raise ``KeyError`` are counted
    and skipped. The vectors found for a row are averaged along axis 0.

    NOTE(review): when no cell of a row has a vector, the mean is taken
    over an empty array; per numpy's documented behaviour that should give
    nan with a RuntimeWarning. Confirm that callers expect this.

    :param path: path of the CSV file
    :param we_model: object exposing ``get_vector(key)`` that raises
        ``KeyError`` for unknown keys
    :return: tuple ``(row_we_dict, missing_words)``, a dict mapping each row
        index to its mean vector and the number of failed lookups
    """
    df = pd.read_csv(path, encoding='latin1')
    columns = df.columns
    missing_words = 0
    row_we_dict = {}
    for row_index, record in df.iterrows():
        vectors = []
        for column_name in columns:
            token = dpu.encode_cell(record[column_name])
            try:
                vector = we_model.get_vector(token)
            except KeyError:
                missing_words += 1
            else:
                vectors.append(vector)
        row_we_dict[row_index] = np.mean(np.asarray(vectors), axis=0)
    return row_we_dict, missing_words
Пример #10
0
def column_avg_composition(path, we_model):
    """
    Compute one averaged vector per column of a CSV file.

    The file is read with pandas using ``latin1`` encoding. Each value of a
    column is encoded with ``dpu.encode_cell`` and looked up via
    ``we_model.get_vector``; lookups that raise ``KeyError`` are counted
    and skipped. The vectors found for a column are averaged along axis 0.

    NOTE(review): when no value of a column has a vector, the mean is taken
    over an empty array; per numpy's documented behaviour that should give
    nan with a RuntimeWarning. Confirm that callers expect this.

    :param path: path of the CSV file
    :param we_model: object exposing ``get_vector(key)`` that raises
        ``KeyError`` for unknown keys
    :return: tuple ``(column_we, missing_words)``, a dict mapping each
        column name to its mean vector and the number of failed lookups
    """
    df = pd.read_csv(path, encoding='latin1')
    missing_words = 0
    column_we = {}
    for column_name in df.columns:
        vectors = []
        for raw_value in df[column_name]:
            token = dpu.encode_cell(raw_value)
            try:
                vector = we_model.get_vector(token)
            except KeyError:
                missing_words += 1
            else:
                vectors.append(vector)
        column_we[column_name] = np.mean(np.asarray(vectors), axis=0)
    return column_we, missing_words
Пример #11
0
def _read_columns_from_dataframe(df, columns):
    """
    Yield every cell of the selected columns, encoded, column by column.

    Unlike a filtering reader, nothing is dropped: each value is passed
    through ``dpu.encode_cell`` and yielded as is.

    :param df: dataframe-like object indexable by column name
    :param columns: iterable of column names to read, in order
    :return: generator of encoded cell values
    """
    for column_name in columns:
        yield from (dpu.encode_cell(raw_value) for raw_value in df[column_name])
Пример #12
0
 def similarity_between(self, entity1, entity2, simf=SIMF.COSINE):
     """
     Compute the similarity between two entities.

     Both entities are encoded with ``dpu.encode_cell``, resolved to
     vectors through ``self.M.get_vector``, and compared with
     ``self.similarity_between_vectors``.

     :param entity1: first raw entity value
     :param entity2: second raw entity value
     :param simf: similarity function forwarded to
         ``self.similarity_between_vectors``
     :return: whatever ``self.similarity_between_vectors`` returns
     """
     # Encode both keys before touching the model, as the lookups need them.
     first_key, second_key = dpu.encode_cell(entity1), dpu.encode_cell(entity2)
     first_vec, second_vec = (self.M.get_vector(first_key),
                              self.M.get_vector(second_key))
     return self.similarity_between_vectors(first_vec, second_vec, simf=simf)