Example #1
 def analogy(self, x, y, z):
     """
     y is to ??? what z is to x
     :param x:
     :param y:
     :param z:
     :return:
     """
     x = dpu.encode_cell(x)
     y = dpu.encode_cell(y)
     z = dpu.encode_cell(z)
     indexes, metrics = self.M.analogy(pos=[x, y], neg=[z], n=10)
     res = self.M.generate_response(indexes, metrics).tolist()
     return res
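A hypothetical call could look like the sketch below; `api` stands in for an instance of the class that defines `analogy`, and the three cell values are placeholders that may or may not exist in a given trained model. With `pos=[x, y]` and `neg=[z]`, the returned candidates approximate the vector x + y - z.

# Hypothetical usage sketch: "madrid is to spain as paris is to ???"
# api is a placeholder for an instance of the class shown above.
results = api.analogy("spain", "paris", "madrid")
for answer, score in results:
    print(answer, score)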
Example #2
def evaluate_table_attributes(api, args, table_df, entity_attribute, table_name, target_attribute, ranking_size=10, debug=True):
    """
    Given a table dataframe (pandas), an entity attribute and a target attribute, makes questions and records the
    position of the found answers
    :param api: relational_emb api object
    :param args: arguments passed to the program
    :param table_df: dataframe holding the table of interest (pandas dataframe)
    :param entity_attribute: attribute in table_df from where to draw entities
    :param target_attribute: attribute in table_df for which we want to predict the answer
    :param ranking_size: the size of the ranking
    :return:
    """
    should_sample = args.sample

    evaluation_results = defaultdict(int)
    num_questions = 0
    key_error = 0

    qs = 0
    # Iterate rows of table to draw entity and target_attribute
    for _, el in table_df.iterrows():
        if should_sample:
            if random.randint(1, 10) > 1:
                continue
        qs += 1
        if (qs % 100) == 0:
            print("#q: " + str(qs))
        entity = dpu.encode_cell(el[entity_attribute])
        ground_truth = dpu.encode_cell(el[target_attribute])
        try:
            ranking_result = api.concept_qa(entity, table_name, target_attribute, n=ranking_size)
            # Record the position at which the right answer appears, if it appears at all
            for index, entry in enumerate(ranking_result):
                answer, score = entry
                found = (answer == ground_truth)
                if found:
                    evaluation_results[index] += 1
                    break
            num_questions += 1  # One more question
        except KeyError:
            key_error += 1

    # Only the first position where an answer appears was recorded; accumulate so that
    # evaluation_results[i] counts the hits within the top i+1 positions
    total_hits = 0
    for index in range(ranking_size):
        evaluation_results[index] += total_hits
        total_hits = evaluation_results[index]

    return evaluation_results, num_questions, key_error
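Because `evaluation_results` holds cumulative hit counts per rank after the final loop, a small post-processing step turns them into hit rates. The sketch below is hypothetical: the attribute and table names are placeholders, and `api`, `args` and `table_df` are assumed to be set up as elsewhere in the program.

# Hypothetical sketch: convert cumulative hits into hit-rate@k.
# The entity attribute, table name and target attribute below are placeholders.
results, num_questions, key_errors = evaluate_table_attributes(
    api, args, table_df, "employee", "directory.csv", "office")
if num_questions > 0:
    for k in range(10):
        print("hit-rate@" + str(k + 1) + ": " + str(results[k] / num_questions))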
Example #3
def window_column(paths, output_file, debug=False):
    try:
        os.remove(output_file)
    except FileNotFoundError:
        print("Creating new file for writing data")

    total = len(paths)
    current = 0
    # Open the output file once so the handle is flushed and closed properly
    with open(output_file, 'a', newline='') as out_file:
        writer = csv.writer(out_file,
                            delimiter=',',
                            quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        for path in paths:
            if debug:
                print(str(current) + "/" + str(total))
            current += 1
            df = pd.read_csv(path, encoding='latin1')
            # Process valid relations only
            if not dpu.valid_relation(df):
                continue
            # One output row per column, keeping only valid, encoded cells
            for c in df.columns:
                col_data = df[c]
                row = [
                    dpu.encode_cell(cell_value) for cell_value in col_data
                    if dpu.valid_cell(cell_value)
                ]
                if len(row) > 0:
                    writer.writerow(row)
            # TODO: why is it necessary to indicate end of relation?
            writer.writerow(["~R!RR*~"])
Example #4
def column_avg_unique_composition(df, we_model):
    column_we = dict()
    columns = df.columns
    missing_words = 0
    for c in columns:
        col_wes = []
        value = df[c].unique()
        for el in value:
            # Check validity of cell
            if not dpu.valid_cell(el):
                continue
            el = dpu.encode_cell(el)
            if " " in el:
                els = el.split(" ")
                vector = we_model.get_vector(els[0])
                missing_words_mini = 0
                for ee in range(1, len(els)):
                    try:
                        vector += we_model.get_vector(els[ee])  # add the ee-th token, not always the second
                    except KeyError:
                        missing_words += 1
                        missing_words_mini += 1
                vector /= (len(els) - missing_words_mini)
            else:
                try:
                    vector = we_model.get_vector(el)
                except KeyError:
                    missing_words += 1
                    continue
            col_wes.append(vector)
        col_wes = np.asarray(col_wes)
        col_we = np.mean(col_wes, axis=0)
        column_we[c] = col_we
    return column_we, missing_words
Example #5
def row_avg_composition(df, we_model):
    missing_words = 0
    row_we_dict = dict()
    columns = df.columns
    for i, row in df.iterrows():
        row_wes = []
        for c in columns:
            # Check validity of cell
            if not dpu.valid_cell(row[c]):
                continue
            el = dpu.encode_cell(row[c])
            if " " in el:
                els = el.split(" ")
                vector = we_model.get_vector(els[0])
                missing_words_mini = 0
                for ee in range(1, len(els)):
                    try:
                        vector += we_model.get_vector(els[ee])  # add the ee-th token, not always the second
                    except KeyError:
                        missing_words += 1
                        missing_words_mini += 1
                vector /= (len(els) - missing_words_mini)
            else:
                try:
                    vector = we_model.get_vector(el)
                except KeyError:
                    missing_words += 1
                    continue
            row_wes.append(vector)
        row_wes = np.asarray(row_wes)
        row_we = np.mean(row_wes, axis=0)
        row_we_dict[i] = row_we
    return row_we_dict, missing_words
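Examples #4 and #5 repeat the same token-averaging logic for multi-word cells. A hypothetical helper, not part of the original code, could hold that logic in one place and keep the per-token indexing from drifting out of sync:

import numpy as np

def avg_word_vector(el, we_model):
    # Hypothetical helper: average the embeddings of an encoded cell's tokens,
    # counting tokens that are missing from the vocabulary.
    vectors = []
    missing = 0
    for token in el.split(" "):
        try:
            vectors.append(we_model.get_vector(token))
        except KeyError:
            missing += 1
    if not vectors:
        return None, missing
    return np.mean(np.asarray(vectors), axis=0), missing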
Example #6
 def vector_for_entity(self, cell=None, attribute=None, table=None):
     vec = None
     if cell:
         cell = dpu.encode_cell(cell)
         vec = self.M.get_vector(cell)
     elif table:
         table = dpu.encode_cell(table)
         if attribute:
             attribute = dpu.encode_cell(attribute)
             vec = self.RE[table]["columns"][attribute]
         else:
             vec = self.RE[table]["vector"]
     elif attribute:
         attribute = dpu.encode_cell(attribute)
         print("Not supported yet!")
         return
     return vec
Example #7
 def entity_to_attribute(self, entities, n=2, simf=SIMF.COSINE):
     res = []
     for entity in entities:
         entity = dpu.encode_cell(entity)
         vec_e = self.M.get_vector(entity)
         topk = self.topk_columns(vec_e, k=n, simf=simf)
         res.append((entity, topk))
     return res
Example #8
def _read_columns_from_dataframe(df, columns):
    for c in columns:
        data_values = df[c]
        for cell_value in data_values:
            # We check the cell value is valid before continuing
            if not dpu.valid_cell(cell_value):
                continue
            cell_value = dpu.encode_cell(cell_value)
            yield cell_value
Example #9
 def topk_similar_vectors(self, input_string, k=10, simf=SIMF.COSINE):
     el = dpu.encode_cell(input_string)
     indexes = []
     metrics = []
     if simf == SIMF.COSINE:
         indexes, metrics = self.M.cosine(el, n=k)
     elif simf == SIMF.EUCLIDEAN:
         indexes, metrics = self.M.euclidean(el, n=k)
     res = self.M.generate_response(indexes, metrics).tolist()
     return res
Example #10
def _read_rows_from_dataframe(df, columns):
    for index, el in df.iterrows():
        for c in columns:
            cell_value = el[c]
            # We check the cell value is valid before continuing
            if not dpu.valid_cell(cell_value):
                continue
            # If valid, clean and format it, then yield it
            cell_value = dpu.encode_cell(cell_value)
            yield cell_value
Example #11
 def concept_qa(self, entity, relation, attribute, n=20, simf=SIMF.COSINE):
     entity = dpu.encode_cell(entity)
     if " " in entity:
         # Multi-word entity: split into tokens and query with all of them
         entity_words = entity.split(" ")
     indexes = []
     metrics = []
     if simf == SIMF.COSINE:
         if " " in entity:
             indexes, metrics = self.M.cosine_array(entity_words, n=n)
             # print(indexes)
         else:
             indexes, metrics = self.M.cosine(entity, n=n)
     elif simf == SIMF.EUCLIDEAN:
         indexes, metrics = self.M.euclidean(entity, n=n)
         # TODO: multi-word entities are not yet supported for Euclidean similarity
     res = self.M.generate_response(indexes, metrics).tolist()
     res = [(e, self.re_range_score(score)) for e, score in res]
     vec_attribute = self.RE[relation]["columns"][attribute]
     if type(vec_attribute) is not np.ndarray:
         # print(attribute)
         return []
     # vec_attribute = self.RE[relation+"."+attribute]
     candidate_attribute_sim = []
     for e, score in res:
         vec_e = self.M.get_vector(e)  # no need to normalize e --- it's already normalized
         similarity_to_attr = 0
         if simf == SIMF.COSINE:
             similarity_to_attr = np.dot(vec_e, vec_attribute)
             similarity_to_attr = self.re_range_score(similarity_to_attr)
             # distance_to_attr = cosine(vec_e, vec_attribute)
         elif simf == SIMF.EUCLIDEAN:
             similarity_to_attr = 1 - euclidean(vec_e, vec_attribute)
          # Average the candidate's similarity to the original entity with its similarity to the target attribute
         similarity = (similarity_to_attr + score) / 2
         candidate_attribute_sim.append((e, similarity))
     candidate_attribute_sim = sorted(candidate_attribute_sim, key=lambda x: x[1], reverse=True)
     return candidate_attribute_sim
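A hedged usage sketch, tying this back to the evaluation in Example #2: ask which value of a target attribute is most associated with a given entity. The entity, relation and attribute names are placeholders, and `api` is assumed to be an instance of the class that defines `concept_qa`.

# Hypothetical question: which "title" in the relation "faculty.csv"
# is most associated with the entity "john smith"?
ranking = api.concept_qa("john smith", "faculty.csv", "title", n=10)
for candidate, similarity in ranking:
    print(candidate, similarity)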
Example #12
 def _concept_qa_no_avg_rerank(self, entity, relation, attribute, n=20, simf=SIMF.COSINE):
     entity = dpu.encode_cell(entity)
     indexes = []
     metrics = []
     if simf == SIMF.COSINE:
         indexes, metrics = self.M.cosine(entity, n=n)
     elif simf == SIMF.EUCLIDEAN:
         indexes, metrics = self.M.euclidean(entity, n=n)
     res = self.M.generate_response(indexes, metrics).tolist()
     vec_attribute = self.RE[relation]["columns"][attribute]
     # vec_attribute = self.RE[relation+"."+attribute]
     candidate_attribute_sim = []
     for e, score in res:
         vec_e = self.M.get_vector(e)  # no need to normalize e --- it's already normalized
         similarity = 0
         if simf == SIMF.COSINE:
             similarity = np.dot(vec_e, vec_attribute)
             similarity = self.re_range_score(similarity)
         elif simf == SIMF.EUCLIDEAN:
             similarity = 1 - euclidean(vec_e, vec_attribute)
         candidate_attribute_sim.append((e, similarity))
     candidate_attribute_sim = sorted(candidate_attribute_sim, key=lambda x: x[1], reverse=True)
     return candidate_attribute_sim
Example #13
            df = pd.read_csv(csv_filepath, encoding='latin1')
            columns = list(df.columns.values)
            columnsize = len(columns)
            fh.write(f"D:Columns: {columns} \n")
            fh.flush()
            for index, el in df.iterrows():
                if random.randint(1, 10) > 1:
                    continue
                for i in range(3):
                    c = random.randint(0, columnsize - 1)
                    target_column = random.randint(0, columnsize - 1)
                    # Skip pairs where source and target columns coincide
                    if c == target_column:
                        continue
                    # try:
                    value = dpu.encode_cell(el.iloc[c])

                    # Only query in one direction with the testing data, and skip dates
                    if len(value) < 4 or "/" in value:
                        continue
                    expected = dpu.encode_cell(el.iloc[target_column])
                    # print(value,expected)
                    try:
                        res = api.concept_qa(value,
                                             csv_file,
                                             columns[target_column],
                                             n=RELEVANTS[-1])
                        y = 0
                        ind = 0
Example #14
 def similarity_between(self, entity1, entity2, simf=SIMF.COSINE):
     x = dpu.encode_cell(entity1)
     y = dpu.encode_cell(entity2)
     vec_x = self.M.get_vector(x)
     vec_y = self.M.get_vector(y)
     return self.similarity_between_vectors(vec_x, vec_y, simf=simf)
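A minimal usage sketch, assuming `api` exposes this method and that both placeholder entities exist in the trained vocabulary:

# Hypothetical comparison of two cell values.
sim = api.similarity_between("cambridge", "boston", simf=SIMF.COSINE)
print(sim)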