Пример #1
0
def window_column(paths, output_file, debug=False):
    """Write the columns of every valid relation in *paths* as CSV rows.

    Any existing *output_file* is removed first. Each path is then read
    with pandas (latin1 encoding); relations rejected by
    ``dpu.valid_relation`` are skipped. For every remaining relation, one
    CSV row is appended per column, holding the ``dpu.encode_cell`` value of
    each cell accepted by ``dpu.valid_cell``. Columns with no accepted cell
    produce no row. A single-cell marker row ``~R!RR*~`` is written after
    the columns of each relation.

    Args:
        paths: Sized iterable of CSV file paths to read.
        output_file: Path of the file the rows are appended to.
        debug: When true, print ``<index>/<total>`` before reading each path.
    """
    try:
        os.remove(output_file)
    except FileNotFoundError:
        print("Creating new file for writing data")

    total = len(paths)
    for current, path in enumerate(paths):
        if debug:
            print(str(current) + "/" + str(total))
        df = pd.read_csv(path, encoding='latin1')
        # Check for valid relations only
        if not dpu.valid_relation(df):
            continue
        # Open (and reliably close) the output once per relation; the file
        # is created lazily, so nothing exists if no relation is valid.
        # newline='' is what the csv module requires so that it controls
        # the line terminator itself.
        with open(output_file, 'a', newline='') as out:
            writer = csv.writer(out,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
            # Columns
            for c in df.columns:
                row = [
                    dpu.encode_cell(cell_value) for cell_value in df[c]
                    if dpu.valid_cell(cell_value)
                ]
                if row:
                    writer.writerow(row)
            # TODO: why is it necessary to indicate end of relation?
            writer.writerow(["~R!RR*~"])
Пример #2
0
def compose_dataset_avg_unique(path_to_relations, we_model):
    """Build an embedding entry for every valid relation in a directory.

    Every entry of ``os.listdir(path_to_relations)`` is read as a CSV
    (latin1 encoding). Relations rejected by ``dpu.valid_relation`` are
    skipped. For the others the column, relation and row compositions are
    computed with the module's composition helpers.

    Args:
        path_to_relations: Directory containing the relation CSV files.
        we_model: Model object forwarded to the composition helpers.

    Returns:
        A dict keyed by file name; each value is a dict with the keys
        ``"vector"``, ``"columns"`` and ``"rows"``.
    """
    embeddings = {}
    for relation in os.listdir(path_to_relations):
        csv_path = "/".join((path_to_relations, relation))
        frame = pd.read_csv(csv_path, encoding='latin1')
        if dpu.valid_relation(frame):
            # The missing-word reports returned by the helpers are not used.
            col_we, _ = column_avg_unique_composition(frame, we_model)
            rel_we = relation_column_avg_composition(col_we)
            row_we, _ = row_avg_composition(frame, we_model)
            embeddings[relation] = {
                "vector": rel_we,
                "columns": col_we,
                "rows": row_we,
            }
    return embeddings
Пример #3
0
def serialize_column(paths, output_file, debug=False):
    """Append the column values of every valid relation to a text file.

    Any existing *output_file* is removed first. Each path is read with
    pandas (latin1 encoding); relations rejected by ``dpu.valid_relation``
    are skipped. Every value yielded by ``_read_columns_from_dataframe`` is
    written to *output_file* preceded by a single space.

    Args:
        paths: Sized iterable of CSV file paths to read.
        output_file: Path of the file the values are appended to.
        debug: When true, print ``<index>/<total>`` before reading each path.
    """
    try:
        os.remove(output_file)
    except FileNotFoundError:
        print("Creating new file for writing data")

    n_paths = len(paths)
    for position, csv_path in enumerate(paths):
        if debug:
            print(f"{position}/{n_paths}")
        frame = pd.read_csv(csv_path, encoding='latin1')
        # Only valid relations are serialized
        if not dpu.valid_relation(frame):
            continue
        with open(output_file, 'a') as sink:
            # One space-prefixed token per value
            sink.writelines(
                " " + value
                for value in _read_columns_from_dataframe(frame, frame.columns)
            )