Example #1
def read(dataset_name, read_separator=";"):
    """
    Read the mappings <prediction -> number of correct answers>
    from the corresponding file of the dataset with the given name
    and return them in the format of two dicts:
        - head_prediction_2_peers: maps all "head questions" relation;tail_entity to the number of correct head answers
        - tail_prediction_2_peers: maps all "tail questions" head_entity;relation to the number of correct tail answers

    :param dataset_name: the name of the dataset for which to compute the mappings
    :param read_separator: the separator to use when reading the csv file

    :return: the computed mappings
    """

    print("Reading number of peers for training facts of dataset %s" %
          dataset_name)

    filepath = os.path.join(datasets.home_folder_for(dataset_name), FOLDER,
                            TRAIN_FACTS_WITH_PEERS_FILENAME)
    head_prediction_2_peers = defaultdict(lambda: 0)
    tail_prediction_2_peers = defaultdict(lambda: 0)

    with open(filepath, "r") as input_file:
        lines = input_file.readlines()
        for line in lines:
            line = html.unescape(
                line)  # this may be needed by YAGO, that has some &amp; stuff
            head, relation, tail, head_peers, tail_peers = line.strip().split(
                read_separator)

            head_prediction_2_peers[relation + ";" + tail] = int(head_peers)
            tail_prediction_2_peers[head + ";" + relation] = int(tail_peers)

    return head_prediction_2_peers, tail_prediction_2_peers
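
A minimal usage sketch for the reader above; the dataset name, relation and entity identifiers are illustrative placeholders rather than values from the source:

head_peers, tail_peers = read("FB15k")

# number of known correct heads for the head question <?, relation, tail>
n_correct_heads = head_peers["/people/person/nationality" + ";" + "/m/09c7w0"]

# number of known correct tails for the tail question <head, relation, ?>
n_correct_tails = tail_peers["/m/02mjmr" + ";" + "/people/person/nationality"]

# predictions that never occur in the file default to 0, thanks to the defaultdicts
assert head_peers["unseen_relation;unseen_tail"] == 0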
def read(dataset_name, read_separator=";", return_fact_2_arity=False):
    filepath = os.path.join(datasets.home_folder_for(dataset_name), FOLDER,
                            TEST_FACTS_WITH_ARITY_FILENAME)
    with open(filepath, "r") as input_file:
        lines = input_file.readlines()

    if return_fact_2_arity:
        triple_2_arity = dict()

        for line in lines:
            line = html.unescape(
                line)  # this may be needed by YAGO, that has some &amp; stuff
            head, relation, tail, arity = line.strip().split(read_separator)

            triple_2_arity[read_separator.join([head, relation,
                                                tail])] = int(arity)

        return triple_2_arity
    else:
        arity_2_triples = defaultdict(lambda: [])

        for line in lines:
            line = html.unescape(
                line)  # this may be needed by YAGO, that has some &amp; stuff
            head, relation, tail, arity = line.strip().split(read_separator)

            arity_2_triples[int(arity)].append(
                read_separator.join([head, relation, tail]))

        return arity_2_triples
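
A short sketch of the two return modes of this reader; the dataset name and the triple are illustrative, and the arity values depend on how the file was generated:

# grouped view: arity -> list of "head;relation;tail" strings
arity_2_triples = read("FB15k")
for arity in sorted(arity_2_triples):
    print("arity %d: %d test facts" % (arity, len(arity_2_triples[arity])))

# per-fact view: "head;relation;tail" -> arity
triple_2_arity = read("FB15k", return_fact_2_arity=True)
some_triple = ";".join(["/m/02mjmr", "/people/person/nationality", "/m/09c7w0"])
print(triple_2_arity.get(some_triple))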
Example #3
def read(dataset_name, read_separator=";"):
    """

    Read the file that contains the mappings
            <entity name -> in degree>
            <entity name -> out degree>
            <entity name -> overall degree>
    for the training set of a specific dataset.

    :param dataset_name: the name of the dataset for which to compute the mappings
    :param read_separator: the separator to use when reading the csv file
    :return: the computed mappings, in the order <entity -> in degree>, <entity -> out degree>, <entity -> overall degree>
    """

    print("Reading the mappings <entity name -> degree> (for in, out and overall degree) in %s training set..." % dataset_name)
    dataset_home = datasets.home_folder_for(dataset_name)
    filepath = os.path.join(dataset_home, FOLDER, FILENAME)

    mid_2_in_degree = defaultdict(lambda: 0)
    mid_2_out_degree = defaultdict(lambda: 0)
    mid_2_degree = defaultdict(lambda: 0)

    with open(filepath) as input_data:
        lines = input_data.readlines()
        for line in lines:
            line = html.unescape(line)  # this may be needed by YAGO, that has some &amp; stuff
            (mid, in_degree, out_degree, degree) = line.strip().split(read_separator)
            mid_2_in_degree[mid] = int(in_degree)
            mid_2_out_degree[mid] = int(out_degree)
            mid_2_degree[mid] = int(degree)

    return mid_2_in_degree, mid_2_out_degree, mid_2_degree
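
A minimal sketch of consuming the three degree mappings; the entity identifier is a placeholder, and the consistency check assumes the file stores the overall degree as the sum of in- and out-degree:

in_degree, out_degree, degree = read("FB15k")

entity = "/m/02mjmr"  # hypothetical entity identifier
# holds if the file was written as <entity, in degree, out degree, in + out>
assert degree[entity] == in_degree[entity] + out_degree[entity]

# entities never seen in the training set default to degree 0
assert degree["never_seen_entity"] == 0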
Example #4
def read(dataset_name, read_separator=";"):
    """

    Read the file that contains the mappings <relationship name -> list of types> for the training set of a specific dataset.

    :param dataset_name: the name of the dataset for which to compute the mappings
    :param read_separator: the separator to use when reading the csv file
    :return: the computed mappings
    """

    print(
        "Reading the mappings <relationship name -> list of types> in %s training set..."
        % dataset_name)
    dataset_home = datasets.home_folder_for(dataset_name)
    filepath = os.path.join(dataset_home, FOLDER, FILENAME)

    relation_2_types = dict()

    with open(filepath) as input_data:
        lines = input_data.readlines()
        for line in lines:
            line = html.unescape(
                line)  # this may be needed by YAGO, that has some &amp; stuff
            relation, types = line.strip().split(read_separator)
            relation_2_types[relation] = types.split(",")

    return relation_2_types
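
Each input line is expected to have the form relation_name<separator>type1,type2,..., so the returned values are plain lists of strings. A small usage sketch with a hypothetical type label:

relation_2_types = read("FB15k")

# print all relations annotated with a given (hypothetical) type label
for relation, types in relation_2_types.items():
    if "one_to_many" in types:
        print(relation)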
Example #5
def save(dataset):
    """
    Compute the mappings <relation fine class -> relations that belong to that class> in a specific dataset
    and save them in a location in the home folder of the dataset

    :param dataset: the dataset to compute the mappings for
    """

    fine_class_2_rels = compute(dataset)
    lines = []
    for fine_class in FINE_CLASSES:
        for rel in fine_class_2_rels[fine_class]:
            lines.append(";".join([rel, fine_class]) + "\n")

    dataset_home = datasets.home_folder_for(dataset.name)

    output_filepath = os.path.join(dataset_home, FILENAME)

    print(
        "Saving fine-grained relation classes for dataset %s into location %s"
        % (dataset.name, output_filepath))

    with open(output_filepath, "w") as output_file:
        output_file.writelines(lines)


# dataset = datasets.Dataset(datasets.FB15K)
# save(dataset)
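
save() writes one line per relation in the form relation;fine_class. A sketch of a matching reader, assuming the same module-level FILENAME constant and datasets helpers used above:

def read_fine_classes(dataset_name):
    # hypothetical reader for the file written by save() above;
    # each line has the form "relation;fine_class"
    fine_class_2_rels = defaultdict(lambda: [])
    filepath = os.path.join(datasets.home_folder_for(dataset_name), FILENAME)
    with open(filepath, "r") as input_file:
        for line in input_file.readlines():
            relation, fine_class = line.strip().split(";")
            fine_class_2_rels[fine_class].append(relation)
    return fine_class_2_rels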
Example #6
def save(dataset_name, write_separator=";"):
    """
    Compute the mappings < test fact -> degree class > for all the test facts of a dataset,
    and save them in a file.

    :param write_separator: the separator to use when writing the file
    :param dataset_name: the name of the dataset for which to compute the mappings
    """

    degree_class_2_facts = compute(dataset_name)
    lines = []
    for degree_class in CLASSES:
        for fact in degree_class_2_facts[degree_class]:
            head, relationship, tail = fact
            lines.append(
                write_separator.join([head, relationship, tail, degree_class])
                + "\n")

    print(
        "Saving the mappings <degree class -> list of test facts belonging to that degree class> for dataset %s ..."
        % dataset_name)
    dataset_home = datasets.home_folder_for(dataset_name)
    output_filepath = os.path.join(dataset_home, FILENAME)
    with open(output_filepath, "w") as output_file:
        output_file.writelines(lines)
Example #7
def save(dataset, read_separator=";"):
    """
    Compute the arity of each test fact of a dataset
    and save the mappings <test fact -> arity> in a file, using ";" as separator.

    :param dataset: the dataset for which to compute and save the arities
    """
    test_fact_2_arity = compute(dataset)

    print("Saving the arity for each test fact in " + dataset.name + "...")
    output_lines = []
    for test_fact in dataset.test_triples:
        key = ";".join(test_fact)
        output_lines.append(key + ";" + str(test_fact_2_arity[key]) + "\n")

    filepath = os.path.join(datasets.home_folder_for(dataset.name), FOLDER,
                            TEST_FACTS_WITH_ARITY_FILENAME)

    with open(filepath, "w") as outfile:
        outfile.writelines(output_lines)
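
The file written here is the one read back by the arity reader in Example #2, with lines of the form head;relation;tail;arity. A hypothetical round-trip sketch, where read_arity stands for that reader:

dataset = datasets.Dataset(datasets.FB15K)  # as in the commented-out examples above
save(dataset)

triple_2_arity = read_arity(dataset.name, return_fact_2_arity=True)  # Example #2's read()
for test_fact in dataset.test_triples[:5]:
    key = ";".join(test_fact)
    print(key, triple_2_arity[key])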
Example #8
def read(dataset_name, read_separator=";", return_fact_2_class=False):
    """
    Read the mappings <peer class -> test facts belonging to that class>
    from the corresponding file of the dataset with the given name
    and return them
        - either in the format  <peer class -> test facts belonging to that class >
        - or in the format  <test fact -> peer class that it belongs to >

    :param dataset_name: the name of the dataset for which to compute the mappings
    :param read_separator: the separator to use when reading the csv file
    :param return_fact_2_class: if true, return mappings in the format <test fact -> peer class that it belongs to>
                                otherwise, return mappings in the format <peer class -> test facts belonging to that class>

    :return: the computed mappings
    """

    print("Reading peer classes for test facts of dataset %s..." %
          dataset_name)

    input_filepath = os.path.join(datasets.home_folder_for(dataset_name),
                                  FOLDER, TEST_FACTS_WITH_PEERS_FILENAME)

    with open(input_filepath, "r") as input_file:
        if not return_fact_2_class:
            peer_class_2_facts = dict()
            for peer_class in PEER_CLASSES:
                peer_class_2_facts[peer_class] = []

            for line in input_file.readlines():
                line = html.unescape(
                    line
                )  # this may be needed by YAGO, that has some &amp; stuff
                head, relation, tail, peer_class = line.strip().split(
                    read_separator)
                peer_class_2_facts[peer_class].append([head, relation, tail])
            return peer_class_2_facts

        else:
            fact_2_peer_class = dict()
            for line in input_file.readlines():
                line = html.unescape(
                    line
                )  # this may be needed by YAGO, that has some &amp; stuff
                head, relation, tail, peer_class = line.strip().split(
                    read_separator)
                fact_2_peer_class[";".join([head, relation,
                                            tail])] = peer_class
            return fact_2_peer_class
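
A sketch of how the grouped view could be consumed, for instance to count how many test facts fall into each peer class (PEER_CLASSES is the module-level list used above; the dataset name is a placeholder):

peer_class_2_facts = read("FB15k")
for peer_class in PEER_CLASSES:
    print("%s: %d test facts" % (peer_class, len(peer_class_2_facts[peer_class])))

# per-fact view: "head;relation;tail" -> peer class
fact_2_peer_class = read("FB15k", return_fact_2_class=True)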
Example #9
def read(dataset_name, read_separator=";", return_fact_2_class=False):
    """
    Read the mappings <test fact -> degree class that it belongs to >
    from the corresponding file of the dataset with the given name
    and return them
        - either in the format < degree class -> test facts that belong to that degree class >
        - or in the format <test fact -> degree class that it belongs to >


    :param dataset_name: the name of the dataset for which to compute the mappings
    :param read_separator: the separator to use when reading the csv file
    :param return_fact_2_class: if true, return mappings in the format <test fact -> degree class that it belongs to>
                                otherwise, return mappings in the format <degree class -> test facts that belong to that degree class>
    :return: the computed mappings
    """

    print(
        "Reading the mappings <degree class -> list of test facts belonging to that degree class> for dataset %s ..."
        % dataset_name)
    dataset_folder = datasets.home_folder_for(dataset_name)

    with open(os.path.join(dataset_folder, FILENAME), "r") as input_file:

        if not return_fact_2_class:
            degree_class_2_facts = dict()
            for degree_class in CLASSES:
                degree_class_2_facts[degree_class] = []

            for line in input_file.readlines():
                line = html.unescape(
                    line
                )  # this may be needed by YAGO, that has some &amp; stuff
                head, relation, tail, degree_class = line.strip().split(
                    read_separator)
                degree_class_2_facts[degree_class].append(
                    [head, relation, tail])
            return degree_class_2_facts

        else:
            fact_2_class = dict()
            for line in input_file.readlines():
                line = html.unescape(
                    line
                )  # this may be needed by YAGO, that has some &amp; stuff
                head, relation, tail, degree_class = line.strip().split(
                    read_separator)
                fact_2_class[";".join([head, relation, tail])] = degree_class
            return fact_2_class
Example #10
def read(dataset_name, read_separator=";", return_fact_2_clique_size=False):
    """
    Read from the filesystem a map that associates each test triple
    to the size of the maximal clique that contains that triple.

    :return return a map <clique size -> list of facts with that clique size>

    :param dataset_name: the name of the dataset to read the mappings for
    :param read_separator: the separator to use when reading the mappings from the filesystem
    :param return_fact_2_clique_size: return a map <fact -> clique size>

    """

    print("Reading number of siblings for training facts of dataset %s" %
          dataset_name)

    filepath = os.path.join(datasets.home_folder_for(dataset_name), FOLDER,
                            TEST_FACTS_WITH_MAXIMAL_CLIQUE_SIZE_FILENAME)
    with open(filepath, "r") as input_file:
        lines = input_file.readlines()

    if return_fact_2_clique_size:
        triple_2_clique_size = dict()

        for line in lines:
            line = html.unescape(
                line)  # this may be needed by YAGO, that has some &amp; stuff
            head, relation, tail, max_clique_size = line.strip().split(
                read_separator)

            triple_2_clique_size[read_separator.join(
                [head, relation, tail])] = int(max_clique_size)

        return triple_2_clique_size
    else:
        clique_size_2_triples = defaultdict(lambda: [])

        for line in lines:
            line = html.unescape(
                line)  # this may be needed by YAGO, that has some &amp; stuff
            head, relation, tail, max_clique_size = line.strip().split(
                read_separator)

            clique_size_2_triples[int(max_clique_size)].append(
                read_separator.join([head, relation, tail]))

        return clique_size_2_triples
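
A small sketch that uses the grouped view to inspect the distribution of maximal clique sizes over the test facts (the dataset name is a placeholder):

clique_size_2_triples = read("FB15k")
for clique_size in sorted(clique_size_2_triples):
    print("clique size %d: %d test facts" %
          (clique_size, len(clique_size_2_triples[clique_size])))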
Example #11
def save(dataset):
    reified_fact_2_cvts = compute(dataset)

    print("Saving the freebase-clean CVTs for each fact in " + dataset.name +
          "...")
    output_lines = []
    for reified_fact in reified_fact_2_cvts:
        output_lines.append(";".join([reified_fact]) + ";[" +
                            ";".join(reified_fact_2_cvts[reified_fact]) +
                            "]\n")

    filepath = os.path.join(datasets.home_folder_for(dataset.name), FOLDER,
                            ALL_FACTS_WITH_CVTS_FILENAME)

    with open(filepath, "w") as outfile:
        outfile.writelines(output_lines)


#save(datasets.Dataset(datasets.FB15K_237))
Example #12
def read(dataset_name, read_separator=";", return_rel_2_class=False):
    """
    Read the mappings <relation coarse class -> relations belonging to that coarse class >
    from the corresponding file of the dataset with the given name
    and return them
        - either in the format  <relation coarse class -> relations belonging to that coarse class >
        - or in the format  <relation -> relation coarse class that it belongs to >

    :param dataset_name: the name of the dataset for which to compute the mappings
    :param read_separator: the separator to use when reading the csv file
    :param return_rel_2_class: if true, return mappings in the format <relation -> coarse class that it belongs to>
                               otherwise, return mappings in the format <coarse class -> relations belonging to that class>

    :return: the computed mappings
    """

    print("Reading coarse-grained relation classes for dataset %s" %
          dataset_name)

    dataset_home = datasets.home_folder_for(dataset_name)

    with open(os.path.join(dataset_home, FILENAME), "r") as input_file:

        if not return_rel_2_class:
            coarse_class_2_rels = dict()
            for coarse_class in COARSE_CLASSES:
                coarse_class_2_rels[coarse_class] = []

            for line in input_file.readlines():
                relation, coarse_class = line.strip().split(read_separator)
                coarse_class_2_rels[coarse_class].append(relation)
            return coarse_class_2_rels

        else:
            rel_2_class = dict()
            for line in input_file.readlines():
                relation, coarse_class = line.strip().split(read_separator)
                rel_2_class[relation] = coarse_class
            return rel_2_class


# dataset = datasets.Dataset(datasets.FB15K)
# save(dataset)
Example #13
def read(dataset, return_cvt_2_facts=False):
    """
    Read the mappings <fact -> CVTs> from the corresponding file of the given dataset
    and return them
        - either in the format <fact -> list of CVTs>
        - or in the format <CVT -> list of facts that it appears in>, if return_cvt_2_facts is True
    """

    filepath = os.path.join(datasets.home_folder_for(dataset.name), FOLDER,
                            ALL_FACTS_WITH_CVTS_FILENAME)
    with open(filepath, "r") as infile:
        lines = infile.readlines()

    if return_cvt_2_facts:
        cvt_2_facts = defaultdict(lambda: [])
        for line in lines:
            head, rel, tail, cvts = line.strip().split(";", 3)
            cvts = cvts[1:-1].split(";")
            for cvt in cvts:
                cvt_2_facts[cvt].append((head, rel, tail))
        return cvt_2_facts
    else:
        fact_2_cvts = dict()
        for line in lines:
            head, rel, tail, cvts = line.strip().split(";", 3)
            cvts = cvts[1:-1].split(";")
            fact_2_cvts[";".join([head, rel, tail])] = cvts
        return fact_2_cvts
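
The two return modes are inverse views of the same file written in Example #11: fact_2_cvts maps each "head;rel;tail" string to its CVT list, while cvt_2_facts maps each CVT back to the facts it appears in. A minimal consistency sketch with a placeholder dataset object:

dataset = datasets.Dataset(datasets.FB15K_237)  # hypothetical, as in the commented-out call above
fact_2_cvts = read(dataset)
cvt_2_facts = read(dataset, return_cvt_2_facts=True)

for fact, cvts in fact_2_cvts.items():
    head, rel, tail = fact.split(";")
    for cvt in cvts:
        assert (head, rel, tail) in cvt_2_facts[cvt]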
Example #14
def read(dataset_name, read_separator=";"):
    """
    Read the file that contains the mapping <relation name -> number of mentions>
    for the training set of a specific dataset.
    :param dataset_name: the name of the dataset to read the mappings for, from its specific, pre-computed file
    :param read_separator: the separator to use when reading the file
    :return: the read mappings
    """

    print("Reading the mapping <relation name -> number of mentions> in %s training set..." % dataset_name)
    dataset_home = datasets.home_folder_for(dataset_name)
    filepath = os.path.join(dataset_home, FOLDER, FILENAME)

    name_2_count = defaultdict(lambda: 0)
    with open(filepath) as input_data:
        lines = input_data.readlines()
        for line in lines:
            line = html.unescape(line)  # this may be needed by YAGO, that has some &amp; stuff
            (name, count) = line.strip().split(read_separator)
            name_2_count[name] = int(count)

    return name_2_count
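
A minimal usage sketch that lists the most frequently mentioned relations in the training set; the dataset name is a placeholder:

name_2_count = read("FB15k")

# top 10 relations by number of training mentions
top_relations = sorted(name_2_count.items(), key=lambda item: item[1], reverse=True)[:10]
for relation, count in top_relations:
    print("%s: %d mentions" % (relation, count))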