def collect_entities_by_type(
    relation_types: Dictionary,
    entity_configs: Dict[str, EntitySchema],
    relation_configs: List[RelationSchema],
    edge_paths: List[str],
    dynamic_relations: bool,
    lhs_col: int,
    rhs_col: int,
    rel_col: Optional[int],
    entity_min_count: int,
) -> Dict[str, Dictionary]:

    counters: Dict[str, Counter[str]] = {}
    for entity_name in entity_configs.keys():
        counters[entity_name] = Counter()

    print("Searching for the entities in the edge files...")
    for edgepath in edge_paths:
        with open(edgepath, "rt") as tf:
            for line_num, line in enumerate(tf, start=1):
                words = line.split()
                try:
                    lhs_word = words[lhs_col]
                    rhs_word = words[rhs_col]
                    rel_word = words[rel_col] if rel_col is not None else None
                except IndexError:
                    raise RuntimeError(
                        "Line %d of %s has only %d words" %
                        (line_num, edgepath, len(words))) from None

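                # In dynamic mode (or when there is no relation column) every
                # edge is counted against relation config 0, since all dynamic
                # relations share the same lhs/rhs entity types.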
                if dynamic_relations or rel_col is None:
                    rel_id = 0
                else:
                    try:
                        rel_id = relation_types.get_id(rel_word)
                    except KeyError:
                        raise RuntimeError(
                            "Could not find relation type %s in config"
                            % rel_word) from None

                counters[relation_configs[rel_id].lhs][lhs_word] += 1
                counters[relation_configs[rel_id].rhs][rhs_word] += 1

    entities_by_type: Dict[str, Dictionary] = {}
    for entity_name, counter in counters.items():
        print("Entity type %s:" % entity_name)
        print("- Found %d entities" % len(counter))
        if entity_min_count > 0:
            print("- Removing the ones with fewer than %d occurrences..." %
                  entity_min_count)
            counter = Counter(
                {k: c
                 for k, c in counter.items() if c >= entity_min_count})
            print("- Left with %d entities" % len(counter))
        print("- Shuffling them...")
        names = list(counter.keys())
        random.shuffle(names)
        entities_by_type[entity_name] = Dictionary(
            names, num_parts=entity_configs[entity_name].num_partitions)

    return entities_by_type
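
These snippets appear to be excerpts of PyTorch-BigGraph's converter utilities, so their imports are elided. A minimal sketch of what this first example alone needs (assuming Python 3.9+, where collections.Counter supports subscripted annotations such as Counter[str]; Dictionary, EntitySchema and RelationSchema come from the surrounding library):

import random
from collections import Counter
from typing import Dict, List, Optional

The later examples additionally assume names like os, h5py, numpy, torch, pathlib.Path, contextlib.ExitStack, collections.defaultdict, and the library's storage and reader classes.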
Example #2
def generate_entity_path_files(
    entity_storage: AbstractEntityStorage,
    entities_by_type: Dict[str, Dictionary],
    relation_type_storage: AbstractRelationTypeStorage,
    relation_types: Dictionary,
    dynamic_relations: bool,
) -> None:
    print(
        "Preparing counts and dictionaries for entities and relation types:")
    entity_storage.prepare()
    relation_type_storage.prepare()

    for entity_name, entities in entities_by_type.items():
        for part in range(entities.num_parts):
            print(f"- Writing count of entity type {entity_name} "
                  f"and partition {part}")
            entity_storage.save_count(entity_name, part,
                                      entities.part_size(part))
            entity_storage.save_names(entity_name, part,
                                      entities.get_part_list(part))

    if dynamic_relations:
        print("- Writing count of dynamic relations")
        relation_type_storage.save_count(relation_types.size())
        relation_type_storage.save_names(relation_types.get_list())
Example #3
def collect_entities_by_type(
    relation_types: Dictionary,
    entity_configs: Dict[str, EntitySchema],
    relation_configs: List[RelationSchema],
    edge_paths: List[Path],
    dynamic_relations: bool,
    edgelist_reader: EdgelistReader,
    entity_min_count: int,
) -> Dict[str, Dictionary]:

    counters: Dict[str, Counter[str]] = {}
    for entity_name in entity_configs.keys():
        counters[entity_name] = Counter()

    log("Searching for the entities in the edge files...")
    for edgepath in edge_paths:
        for lhs_word, rhs_word, rel_word in edgelist_reader.read(edgepath):
            if dynamic_relations or rel_word is None:
                rel_id = 0
            else:
                try:
                    rel_id = relation_types.get_id(rel_word)
                except KeyError:
                    raise RuntimeError(
                        f"Could not find relation type {rel_word} in config"
                    ) from None

            counters[relation_configs[rel_id].lhs][lhs_word] += 1
            counters[relation_configs[rel_id].rhs][rhs_word] += 1

    entities_by_type: Dict[str, Dictionary] = {}
    for entity_name, counter in counters.items():
        log(f"Entity type {entity_name}:")
        log(f"- Found {len(counter)} entities")
        if entity_min_count > 0:
            log(
                f"- Removing the ones with fewer than {entity_min_count} occurrences..."
            )
            counter = Counter(
                {k: c for k, c in counter.items() if c >= entity_min_count}
            )
            log(f"- Left with {len(counter)} entities")
        log("- Shuffling them...")
        names = list(counter.keys())
        random.shuffle(names)
        entities_by_type[entity_name] = Dictionary(
            names, num_parts=entity_configs[entity_name].num_partitions
        )

    return entities_by_type
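
Examples #3 and onward consume an EdgelistReader instead of raw column indices. Its implementation is not shown in these excerpts; a minimal compatible sketch, mirroring the inline parsing of the first example above (the class name and its details are assumptions):

from pathlib import Path
from typing import Iterable, Optional, Tuple


class TSVEdgelistReader:
    """Hypothetical reader yielding one (lhs, rhs, rel) tuple per line;
    rel is None when no relation column is configured."""

    def __init__(self, lhs_col: int, rhs_col: int, rel_col: Optional[int]) -> None:
        self.lhs_col = lhs_col
        self.rhs_col = rhs_col
        self.rel_col = rel_col

    def read(self, path: Path) -> Iterable[Tuple[str, str, Optional[str]]]:
        with path.open("rt") as tf:
            for line_num, line in enumerate(tf, start=1):
                words = line.split()
                try:
                    lhs = words[self.lhs_col]
                    rhs = words[self.rhs_col]
                    rel = words[self.rel_col] if self.rel_col is not None else None
                except IndexError:
                    raise RuntimeError(
                        f"Line {line_num} of {path} has only {len(words)} words"
                    ) from None
                yield lhs, rhs, rel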
Example #4
def collect_relation_types(
    relation_configs: List[RelationSchema],
    edge_paths: List[str],
    dynamic_relations: bool,
    rel_col: Optional[int],
    relation_type_min_count: int,
) -> Dictionary:

    if dynamic_relations:
        if rel_col is None:
            raise RuntimeError("Need to specify rel_col in dynamic mode.")
        print("Looking up relation types in the edge files...")
        counter: Counter[str] = Counter()
        for edgepath in edge_paths:
            with open(edgepath, "rt") as tf:
                for line in tf:
                    counter[line.split()[rel_col]] += 1
        print("- Found %d relation types" % len(counter))
        if relation_type_min_count > 0:
            print("- Removing the ones with fewer than %d occurrences..." %
                  relation_type_min_count)
            counter = Counter({
                k: c
                for k, c in counter.items() if c >= relation_type_min_count
            })
            print("- Left with %d relation types" % len(counter))
        print("- Shuffling them...")
        names = list(counter.keys())
        random.shuffle(names)

    else:
        names = [rconfig.name for rconfig in relation_configs]
        print("Using the %d relation types given in the config" % len(names))

    return Dictionary(names)
Example #5
def generate_edge_path_files_fast(
    edge_file_in: Path,
    edge_path_out: Path,
    edge_storage: AbstractEdgeStorage,
    entities_by_type: Dict[str, Dictionary],
    relation_types: Dictionary,
    relation_configs: List[RelationSchema],
    edgelist_reader: EdgelistReader,
) -> None:
    processed = 0
    skipped = 0

    log("Taking the fast train!")
    data = []
    for lhs_word, rhs_word, rel_word in edgelist_reader.read(edge_file_in):
        if rel_word is None:
            rel_id = 0
        else:
            try:
                rel_id = relation_types.get_id(rel_word)
            except KeyError:
                # Ignore edges whose relation type is not known.
                skipped += 1
                continue

        lhs_type = relation_configs[rel_id].lhs
        rhs_type = relation_configs[rel_id].rhs

        try:
            _, lhs_offset = entities_by_type[lhs_type].get_partition(lhs_word)
            _, rhs_offset = entities_by_type[rhs_type].get_partition(rhs_word)
        except KeyError:
            # Ignore edges whose entities are not known.
            skipped += 1
            continue

        data.append((lhs_offset, rhs_offset, rel_id))

        processed = processed + 1
        if processed % 100000 == 0:
            log(f"- Processed {processed} edges so far...")

    # zip(*data) would fail on an empty list, so fail with a clear message
    # if every edge was skipped.
    if not data:
        raise RuntimeError(f"No edges in {edge_file_in} were usable")
    lhs_offsets, rhs_offsets, rel_ids = zip(*data)
    edge_list = EdgeList(
        EntityList.from_tensor(torch.tensor(list(lhs_offsets), dtype=torch.long)),
        EntityList.from_tensor(torch.tensor(list(rhs_offsets), dtype=torch.long)),
        torch.tensor(list(rel_ids), dtype=torch.long),
    )
    edge_storage.save_edges(0, 0, edge_list)

    log(f"- Processed {processed} edges in total")
    if skipped > 0:
        log(
            f"- Skipped {skipped} edges because their relation type or "
            f"entities were unknown (either not given in the config or "
            f"filtered out as too rare)."
        )
Example #6
def collect_relation_types(
    relation_configs: List[RelationSchema],
    edge_paths: List[Path],
    dynamic_relations: bool,
    rel_col: Optional[int],
    relation_type_min_count: int,
) -> Dictionary:

    if dynamic_relations:
        if rel_col is None:
            raise RuntimeError("Need to specify rel_col in dynamic mode.")
        print("Looking up relation types in the edge files...")
        counter: Counter[str] = Counter()
        for edgepath in edge_paths:
            with edgepath.open("rt") as tf:
                for line_num, line in enumerate(tf, start=1):
                    words = line.split()
                    try:
                        rel_word = words[rel_col]
                    except IndexError:
                        raise RuntimeError(
                            f"Line {line_num} of {edgepath} has only {len(words)} words"
                        ) from None
                    counter[rel_word] += 1
        print(f"- Found {len(counter)} relation types")
        if relation_type_min_count > 0:
            print(
                f"- Removing the ones with fewer than {relation_type_min_count} occurrences..."
            )
            counter = Counter({
                k: c
                for k, c in counter.items() if c >= relation_type_min_count
            })
            print(f"- Left with {len(counter)} relation types")
        print("- Shuffling them...")
        names = list(counter.keys())
        random.shuffle(names)

    else:
        names = [rconfig.name for rconfig in relation_configs]
        print(f"Using the {len(names)} relation types given in the config")

    return Dictionary(names)
Example #7
def collect_relation_types(
    relation_configs: List[RelationSchema],
    edge_paths: List[Path],
    dynamic_relations: bool,
    edgelist_reader: EdgelistReader,
    relation_type_min_count: int,
) -> Dictionary:

    if dynamic_relations:
        print("Looking up relation types in the edge files...")
        counter: Counter[str] = Counter()
        for edgepath in edge_paths:
            for _lhs_word, _rhs_word, rel_word in edgelist_reader.read(
                    edgepath):
                if rel_word is None:
                    raise RuntimeError(
                        "Need to specify rel_col in dynamic mode.")
                counter[rel_word] += 1

        print(f"- Found {len(counter)} relation types")
        if relation_type_min_count > 0:
            print(
                f"- Removing the ones with fewer than {relation_type_min_count} occurrences..."
            )
            counter = Counter({
                k: c
                for k, c in counter.items() if c >= relation_type_min_count
            })
            print(f"- Left with {len(counter)} relation types")
        print("- Shuffling them...")
        names = list(counter.keys())
        random.shuffle(names)

    else:
        names = [rconfig.name for rconfig in relation_configs]
        print(f"Using the {len(names)} relation types given in the config")

    return Dictionary(names)
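
Taken together, the reader-based collect_relation_types above and collect_entities_by_type (example #3) form the first stage of the conversion pipeline. A usage sketch, reusing the hypothetical TSVEdgelistReader from after example #3 (paths, column indices and min-counts are illustrative; entity_configs and relation_configs would come from the parsed config):

reader = TSVEdgelistReader(lhs_col=0, rhs_col=1, rel_col=2)
edge_paths = [Path("train.tsv"), Path("valid.tsv")]

relation_types = collect_relation_types(
    relation_configs,
    edge_paths,
    dynamic_relations=True,
    edgelist_reader=reader,
    relation_type_min_count=1,
)
entities_by_type = collect_entities_by_type(
    relation_types,
    entity_configs,
    relation_configs,
    edge_paths,
    dynamic_relations=True,
    edgelist_reader=reader,
    entity_min_count=1,
)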
Example #8
def generate_entity_path_files(
    entity_path: str,
    entities_by_type: Dict[str, Dictionary],
    relation_types: Dictionary,
    dynamic_relations: bool,
) -> None:

    print("Preparing entity path %s:" % entity_path)
    for entity_name, entities in entities_by_type.items():
        for part in range(entities.num_parts):
            print("- Writing count of entity type %s and partition %d" %
                  (entity_name, part))
            with open(
                    os.path.join(
                        entity_path,
                        "entity_count_%s_%d.txt" % (entity_name, part)),
                    "wt") as tf:
                tf.write("%d" % entities.part_size(part))

    if dynamic_relations:
        print("- Writing count of dynamic relations")
        with open(os.path.join(entity_path, "dynamic_rel_count.txt"),
                  "wt") as tf:
            tf.write("%d" % relation_types.size())
Example #9
def generate_edge_path_files(
    edge_file_in: str,
    entities_by_type: Dict[str, Dictionary],
    relation_types: Dictionary,
    relation_configs: List[RelationSchema],
    dynamic_relations: bool,
    lhs_col: int,
    rhs_col: int,
    rel_col: Optional[int],
) -> None:

    basename, _ = os.path.splitext(edge_file_in)
    edge_path_out = basename + '_partitioned'

    print("Preparing edge path %s, out of the edges found in %s" %
          (edge_path_out, edge_file_in))
    os.makedirs(edge_path_out, exist_ok=True)

    num_lhs_parts = max(entities_by_type[rconfig.lhs].num_parts
                        for rconfig in relation_configs)
    num_rhs_parts = max(entities_by_type[rconfig.rhs].num_parts
                        for rconfig in relation_configs)

    print("- Edges will be partitioned in %d x %d buckets." %
          (num_lhs_parts, num_rhs_parts))

    buckets: DefaultDict[Tuple[int, int], List[Tuple[int, int, int]]] = \
        defaultdict(list)
    processed = 0
    skipped = 0

    with open(edge_file_in, "rt") as tf:
        for line in tf:
            words = line.split()
            if rel_col is None:
                rel_id = 0
            else:
                try:
                    rel_id = relation_types.get_id(words[rel_col])
                except KeyError:
                    # Ignore edges whose relation type is not known.
                    skipped += 1
                    continue

            if dynamic_relations:
                lhs_type = relation_configs[0].lhs
                rhs_type = relation_configs[0].rhs
            else:
                lhs_type = relation_configs[rel_id].lhs
                rhs_type = relation_configs[rel_id].rhs

            try:
                lhs_part, lhs_offset = \
                    entities_by_type[lhs_type].get_partition(words[lhs_col])
                rhs_part, rhs_offset = \
                    entities_by_type[rhs_type].get_partition(words[rhs_col])
            except KeyError:
                # Ignore edges whose entities are not known.
                skipped += 1
                continue

            buckets[lhs_part, rhs_part].append(
                (lhs_offset, rhs_offset, rel_id))

            processed = processed + 1
            if processed % 100000 == 0:
                print("- Processed %d edges so far..." % processed)

    print("- Processed %d edges in total" % processed)
    if skipped > 0:
        print(
            "- Skipped %d edges because their relation type or entities were "
            "unknown (either not given in the config or filtered out as too "
            "rare)." % skipped)

    for i in range(num_lhs_parts):
        for j in range(num_rhs_parts):
            print("- Writing bucket (%d, %d), containing %d edges..." %
                  (i, j, len(buckets[i, j])))
            # Reshape so that an empty bucket still yields a (0, 3) array.
            edges = np.asarray(buckets[i, j], dtype=np.int64).reshape(-1, 3)
            with h5py.File(
                    os.path.join(edge_path_out, "edges_%d_%d.h5" % (i, j)),
                    "w") as hf:
                hf.attrs["format_version"] = 1
                hf.create_dataset("lhs", data=edges[:, 0])
                hf.create_dataset("rhs", data=edges[:, 1])
                hf.create_dataset("rel", data=edges[:, 2])
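
The bucket files written above are plain HDF5, so they can be inspected directly; a minimal read-back sketch for bucket (0, 0), with edge_path_out as in the function above:

import os

import h5py

with h5py.File(os.path.join(edge_path_out, "edges_0_0.h5"), "r") as hf:
    assert hf.attrs["format_version"] == 1
    lhs = hf["lhs"][...]  # offsets of lhs entities within their partition
    rhs = hf["rhs"][...]  # offsets of rhs entities within their partition
    rel = hf["rel"][...]  # relation type ids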
Example #10
def generate_edge_path_files(
    edge_file_in: Path,
    edge_path_out: Path,
    edge_storage: AbstractEdgeStorage,
    entities_by_type: Dict[str, Dictionary],
    relation_types: Dictionary,
    relation_configs: List[RelationSchema],
    dynamic_relations: bool,
    edgelist_reader: EdgelistReader,
    n_flush_edges: int = 100000,
) -> None:
    log(
        f"Preparing edge path {edge_path_out}, "
        f"out of the edges found in {edge_file_in}"
    )
    edge_storage.prepare()

    num_lhs_parts = max(
        entities_by_type[rconfig.lhs].num_parts for rconfig in relation_configs
    )
    num_rhs_parts = max(
        entities_by_type[rconfig.rhs].num_parts for rconfig in relation_configs
    )

    log(f"- Edges will be partitioned in {num_lhs_parts} x {num_rhs_parts} buckets.")

    processed = 0
    skipped = 0
    # We use an ExitStack in order to close the dynamically-created edge appenders.
    with ExitStack() as appender_stack:
        appenders: Dict[Tuple[int, int], AbstractEdgeAppender] = {}
        data: Dict[Tuple[int, int], List[Tuple[int, int, int]]] = {}
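        # Triples are buffered per bucket and flushed in chunks of
        # n_flush_edges, so each appender call writes many edges at once.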

        for lhs_word, rhs_word, rel_word in edgelist_reader.read(edge_file_in):
            if rel_word is None:
                rel_id = 0
            else:
                try:
                    rel_id = relation_types.get_id(rel_word)
                except KeyError:
                    # Ignore edges whose relation type is not known.
                    skipped += 1
                    continue

            if dynamic_relations:
                lhs_type = relation_configs[0].lhs
                rhs_type = relation_configs[0].rhs
            else:
                lhs_type = relation_configs[rel_id].lhs
                rhs_type = relation_configs[rel_id].rhs

            try:
                lhs_part, lhs_offset = entities_by_type[lhs_type].get_partition(
                    lhs_word
                )
                rhs_part, rhs_offset = entities_by_type[rhs_type].get_partition(
                    rhs_word
                )
            except KeyError:
                # Ignore edges whose entities are not known.
                skipped += 1
                continue

            if (lhs_part, rhs_part) not in appenders:
                appenders[lhs_part, rhs_part] = appender_stack.enter_context(
                    edge_storage.save_edges_by_appending(lhs_part, rhs_part)
                )
                data[lhs_part, rhs_part] = []

            part_data = data[lhs_part, rhs_part]
            part_data.append((lhs_offset, rhs_offset, rel_id))
            if len(part_data) > n_flush_edges:
                append_to_file(part_data, appenders[lhs_part, rhs_part])
                part_data.clear()

            processed = processed + 1
            if processed % 100000 == 0:
                log(f"- Processed {processed} edges so far...")

        for (lhs_part, rhs_part), part_data in data.items():
            if len(part_data) > 0:
                append_to_file(part_data, appenders[lhs_part, rhs_part])
                part_data.clear()

    log(f"- Processed {processed} edges in total")
    if skipped > 0:
        log(
            f"- Skipped {skipped} edges because their relation type or "
            f"entities were unknown (either not given in the config or "
            f"filtered out as too rare)."
        )
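
Example #10 above calls an append_to_file helper that is not included in these excerpts. A plausible sketch, consistent with the one-edge-at-a-time appends in example #11 below (the helper's exact shape is an assumption):

def append_to_file(
    data: List[Tuple[int, int, int]], appender: AbstractEdgeAppender
) -> None:
    # Flush one buffered chunk of (lhs_offset, rhs_offset, rel_id) triples
    # to the bucket's appender as a single EdgeList.
    lhs_offsets, rhs_offsets, rel_ids = zip(*data)
    appender.append_edges(
        EdgeList(
            EntityList.from_tensor(torch.tensor(lhs_offsets, dtype=torch.long)),
            EntityList.from_tensor(torch.tensor(rhs_offsets, dtype=torch.long)),
            torch.tensor(rel_ids, dtype=torch.long),
        )
    )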
Example #11
def generate_edge_path_files(
    edge_file_in: Path,
    edge_path_out: Path,
    edge_storage: AbstractEdgeStorage,
    entities_by_type: Dict[str, Dictionary],
    relation_types: Dictionary,
    relation_configs: List[RelationSchema],
    dynamic_relations: bool,
    edgelist_reader: EdgelistReader,
) -> None:
    log(
        f"Preparing edge path {edge_path_out}, "
        f"out of the edges found in {edge_file_in}"
    )
    edge_storage.prepare()

    num_lhs_parts = max(
        entities_by_type[rconfig.lhs].num_parts for rconfig in relation_configs
    )
    num_rhs_parts = max(
        entities_by_type[rconfig.rhs].num_parts for rconfig in relation_configs
    )

    if not dynamic_relations and num_lhs_parts == 1 and num_rhs_parts == 1:
        log("Using the fast version, as there is only a single bucket")
        return generate_edge_path_files_fast(
            edge_file_in,
            edge_path_out,
            edge_storage,
            entities_by_type,
            relation_types,
            relation_configs,
            edgelist_reader,
        )

    log(f"- Edges will be partitioned in {num_lhs_parts} x {num_rhs_parts} buckets.")

    processed = 0
    skipped = 0

    # We use an ExitStack in order to close the dynamically-created edge appenders.
    with ExitStack() as appender_stack:
        appenders: Dict[Tuple[int, int], AbstractEdgeAppender] = {}
        for lhs_word, rhs_word, rel_word in edgelist_reader.read(edge_file_in):
            if rel_word is None:
                rel_id = 0
            else:
                try:
                    rel_id = relation_types.get_id(rel_word)
                except KeyError:
                    # Ignore edges whose relation type is not known.
                    skipped += 1
                    continue

            if dynamic_relations:
                lhs_type = relation_configs[0].lhs
                rhs_type = relation_configs[0].rhs
            else:
                lhs_type = relation_configs[rel_id].lhs
                rhs_type = relation_configs[rel_id].rhs

            try:
                lhs_part, lhs_offset = entities_by_type[lhs_type].get_partition(
                    lhs_word
                )
                rhs_part, rhs_offset = entities_by_type[rhs_type].get_partition(
                    rhs_word
                )
            except KeyError:
                # Ignore edges whose entities are not known.
                skipped += 1
                continue

            if (lhs_part, rhs_part) not in appenders:
                appenders[lhs_part, rhs_part] = appender_stack.enter_context(
                    edge_storage.save_edges_by_appending(lhs_part, rhs_part)
                )
            appenders[lhs_part, rhs_part].append_edges(
                EdgeList(
                    EntityList.from_tensor(
                        torch.tensor([lhs_offset], dtype=torch.long)
                    ),
                    EntityList.from_tensor(
                        torch.tensor([rhs_offset], dtype=torch.long)
                    ),
                    torch.tensor([rel_id], dtype=torch.long),
                )
            )

            processed = processed + 1
            if processed % 100000 == 0:
                log(f"- Processed {processed} edges so far...")

    log(f"- Processed {processed} edges in total")
    if skipped > 0:
        log(
            f"- Skipped {skipped} edges because their relation type or "
            f"entities were unknown (either not given in the config or "
            f"filtered out as too rare)."
        )
Example #12
def generate_edge_path_files(
    edge_file_in: Path,
    edge_path_out: Path,
    edge_storage: AbstractEdgeStorage,
    entities_by_type: Dict[str, Dictionary],
    relation_types: Dictionary,
    relation_configs: List[RelationSchema],
    dynamic_relations: bool,
    lhs_col: int,
    rhs_col: int,
    rel_col: Optional[int],
) -> None:
    print(f"Preparing edge path {edge_path_out}, "
          f"out of the edges found in {edge_file_in}")
    edge_storage.prepare()

    num_lhs_parts = max(entities_by_type[rconfig.lhs].num_parts
                        for rconfig in relation_configs)
    num_rhs_parts = max(entities_by_type[rconfig.rhs].num_parts
                        for rconfig in relation_configs)

    print(
        f"- Edges will be partitioned in {num_lhs_parts} x {num_rhs_parts} buckets."
    )

    processed = 0
    skipped = 0

    # We use an ExitStack in order to close the dynamically-created edge appenders.
    with edge_file_in.open("rt") as tf, ExitStack() as appender_stack:
        appenders: Dict[Tuple[int, int], AbstractEdgeAppender] = {}
        for line_num, line in enumerate(tf, start=1):
            words = line.split()
            try:
                lhs_word = words[lhs_col]
                rhs_word = words[rhs_col]
                rel_word = words[rel_col] if rel_col is not None else None
            except IndexError:
                raise RuntimeError(
                    f"Line {line_num} of {edge_file_in} has only {len(words)} words"
                ) from None

            if rel_col is None:
                rel_id = 0
            else:
                try:
                    rel_id = relation_types.get_id(rel_word)
                except KeyError:
                    # Ignore edges whose relation type is not known.
                    skipped += 1
                    continue

            if dynamic_relations:
                lhs_type = relation_configs[0].lhs
                rhs_type = relation_configs[0].rhs
            else:
                lhs_type = relation_configs[rel_id].lhs
                rhs_type = relation_configs[rel_id].rhs

            try:
                lhs_part, lhs_offset = \
                    entities_by_type[lhs_type].get_partition(lhs_word)
                rhs_part, rhs_offset = \
                    entities_by_type[rhs_type].get_partition(rhs_word)
            except KeyError:
                # Ignore edges whose entities are not known.
                skipped += 1
                continue

            if (lhs_part, rhs_part) not in appenders:
                appenders[lhs_part, rhs_part] = appender_stack.enter_context(
                    edge_storage.save_edges_by_appending(lhs_part, rhs_part))
            appenders[lhs_part, rhs_part].append_edges(
                EdgeList(
                    EntityList.from_tensor(
                        torch.tensor([lhs_offset], dtype=torch.long)),
                    EntityList.from_tensor(
                        torch.tensor([rhs_offset], dtype=torch.long)),
                    torch.tensor([rel_id], dtype=torch.long),
                ))

            processed = processed + 1
            if processed % 100000 == 0:
                print(f"- Processed {processed} edges so far...")

    print(f"- Processed {processed} edges in total")
    if skipped > 0:
        print(f"- Skipped {skipped} edges because their relation type or "
              f"entities were unknown (either not given in the config or "
              f"filtered out as too rare).")
Example #13
def generate_edge_path_files(
    edge_file_in: Path,
    edge_path_out: Path,
    edge_storage: AbstractEdgeStorage,
    entities_by_type: Dict[str, Dictionary],
    relation_types: Dictionary,
    relation_configs: List[RelationSchema],
    dynamic_relations: bool,
    lhs_col: int,
    rhs_col: int,
    rel_col: Optional[int],
) -> None:
    print(f"Preparing edge path {edge_path_out}, "
          f"out of the edges found in {edge_file_in}")
    edge_storage.prepare()

    num_lhs_parts = max(entities_by_type[rconfig.lhs].num_parts
                        for rconfig in relation_configs)
    num_rhs_parts = max(entities_by_type[rconfig.rhs].num_parts
                        for rconfig in relation_configs)

    print(
        f"- Edges will be partitioned in {num_lhs_parts} x {num_rhs_parts} buckets."
    )

    buckets: DefaultDict[Tuple[int, int], List[Tuple[int, int, int]]] = \
        defaultdict(list)
    processed = 0
    skipped = 0

    with edge_file_in.open("rt") as tf:
        for line_num, line in enumerate(tf, start=1):
            words = line.split()
            try:
                lhs_word = words[lhs_col]
                rhs_word = words[rhs_col]
                rel_word = words[rel_col] if rel_col is not None else None
            except IndexError:
                raise RuntimeError(
                    f"Line {line_num} of {edge_file_in} has only {len(words)} words"
                ) from None

            if rel_col is None:
                rel_id = 0
            else:
                try:
                    rel_id = relation_types.get_id(rel_word)
                except KeyError:
                    # Ignore edges whose relation type is not known.
                    skipped += 1
                    continue

            if dynamic_relations:
                lhs_type = relation_configs[0].lhs
                rhs_type = relation_configs[0].rhs
            else:
                lhs_type = relation_configs[rel_id].lhs
                rhs_type = relation_configs[rel_id].rhs

            try:
                lhs_part, lhs_offset = \
                    entities_by_type[lhs_type].get_partition(lhs_word)
                rhs_part, rhs_offset = \
                    entities_by_type[rhs_type].get_partition(rhs_word)
            except KeyError:
                # Ignore edges whose entities are not known.
                skipped += 1
                continue

            buckets[lhs_part, rhs_part].append(
                (lhs_offset, rhs_offset, rel_id))

            processed = processed + 1
            if processed % 100000 == 0:
                print(f"- Processed {processed} edges so far...")

    print(f"- Processed {processed} edges in total")
    if skipped > 0:
        print(f"- Skipped {skipped} edges because their relation type or "
              f"entities were unknown (either not given in the config or "
              f"filtered out as too rare).")

    for i in range(num_lhs_parts):
        for j in range(num_rhs_parts):
            print(f"- Writing bucket ({i}, {j}), "
                  f"containing {len(buckets[i, j])} edges...")
            edges = torch.tensor(buckets[i, j], dtype=torch.long).view((-1, 3))
            edge_storage.save_edges(
                i, j,
                EdgeList(
                    EntityList.from_tensor(edges[:, 0]),
                    EntityList.from_tensor(edges[:, 1]),
                    edges[:, 2],
                ))