def window_column(paths, output_file, debug=False):
    """Dump every relation's columns as CSV rows into ``output_file``.

    Any existing ``output_file`` is deleted first.  Each path in ``paths`` is
    read with pandas (latin1 encoding); relations rejected by
    ``dpu.valid_relation`` are skipped.  For each remaining relation, every
    column becomes one CSV row holding the ``dpu.encode_cell`` value of each
    cell accepted by ``dpu.valid_cell``; columns with no accepted cells are
    not written.  After the columns of a relation, a single-cell sentinel row
    ``~R!RR*~`` is written.

    Args:
        paths: Sized iterable of CSV file paths, one per relation.
        output_file: Path of the CSV file to (re)create.  It is only created
            once the first valid relation is written.
        debug: When true, print a ``current/total`` progress line per path.

    Returns:
        None.
    """
    try:
        os.remove(output_file)
    except FileNotFoundError:
        print("Creating new file for writing data")

    total = len(paths)
    for current, path in enumerate(paths):
        if debug:
            print(str(current) + "/" + str(total))
        df = pd.read_csv(path, encoding='latin1')
        # Check for valid relations only
        if not dpu.valid_relation(df):
            continue
        # Open through a context manager so the handle is flushed and closed
        # after each relation (the writer used to be built on a file object
        # that was never closed).  newline='' is what the csv module expects;
        # without it an extra '\r' is emitted on platforms using '\r\n'.
        with open(output_file, 'a', newline='') as out:
            writer = csv.writer(out, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
            # Columns
            for c in df.columns:
                row = [
                    dpu.encode_cell(cell_value)
                    for cell_value in df[c]
                    if dpu.valid_cell(cell_value)
                ]
                if row:
                    writer.writerow(row)
            # TODO: why is it necessary to indicate end of relation?
            writer.writerow(["~R!RR*~"])
def compose_dataset_avg_unique(path_to_relations, we_model):
    """Build embeddings for every valid relation found in a directory.

    Each entry of ``path_to_relations`` is loaded as a latin1-encoded CSV.
    Relations rejected by ``dpu.valid_relation`` are left out of the result.

    Args:
        path_to_relations: Directory whose entries are the relation files.
        we_model: Model object forwarded unchanged to the column and row
            composition helpers.

    Returns:
        A dict keyed by directory entry name.  Each value is a dict with the
        keys ``"vector"`` (result of ``relation_column_avg_composition``),
        ``"columns"`` (first result of ``column_avg_unique_composition``) and
        ``"rows"`` (first result of ``row_avg_composition``).
    """
    embeddings = {}
    for name in os.listdir(path_to_relations):
        frame = pd.read_csv("/".join((path_to_relations, name)), encoding='latin1')
        if dpu.valid_relation(frame):
            # The second element returned by the helpers (missing words) is
            # not used here.
            column_vectors, _ = column_avg_unique_composition(frame, we_model)
            relation_vector = relation_column_avg_composition(column_vectors)
            row_vectors, _ = row_avg_composition(frame, we_model)
            embeddings[name] = {
                "vector": relation_vector,
                "columns": column_vectors,
                "rows": row_vectors,
            }
    return embeddings
def serialize_column(paths, output_file, debug=False):
    """Append the column cells of every valid relation to a text file.

    A pre-existing ``output_file`` is removed before anything is written.
    Each path is read as a latin1-encoded CSV; relations rejected by
    ``dpu.valid_relation`` are skipped.  Every value yielded by
    ``_read_columns_from_dataframe`` is written preceded by a single space.

    Args:
        paths: Sized iterable of CSV file paths, one per relation.
        output_file: Path of the text file to (re)create.
        debug: When true, print a ``current/total`` progress line per path.

    Returns:
        None.
    """
    try:
        os.remove(output_file)
    except FileNotFoundError:
        print("Creating new file for writing data")

    n_paths = len(paths)
    for position, csv_path in enumerate(paths):
        if debug:
            print("{}/{}".format(position, n_paths))
        frame = pd.read_csv(csv_path, encoding='latin1')
        # Filtering out non-valid relations
        if not dpu.valid_relation(frame):
            continue
        cells = _read_columns_from_dataframe(frame, frame.columns)
        with open(output_file, 'a') as sink:
            # Columns
            sink.writelines(" " + cell for cell in cells)