def collect_entities_by_type(
    relation_types: Dictionary,
    entity_configs: Dict[str, EntitySchema],
    relation_configs: List[RelationSchema],
    edge_paths: List[str],
    dynamic_relations: bool,
    lhs_col: int,
    rhs_col: int,
    rel_col: Optional[int],
    entity_min_count: int,
) -> Dict[str, Dictionary]:
    """Count entity occurrences by reading the TSV edge files directly (by column
    index) and build a shuffled, optionally partitioned Dictionary per entity type."""
    counters: Dict[str, Counter[str]] = {}
    for entity_name in entity_configs.keys():
        counters[entity_name] = Counter()

    print("Searching for the entities in the edge files...")
    for edgepath in edge_paths:
        with open(edgepath, "rt") as tf:
            for line_num, line in enumerate(tf, start=1):
                words = line.split()
                try:
                    lhs_word = words[lhs_col]
                    rhs_word = words[rhs_col]
                    rel_word = words[rel_col] if rel_col is not None else None
                except IndexError:
                    raise RuntimeError(
                        "Line %d of %s has only %d words"
                        % (line_num, edgepath, len(words))) from None

                if dynamic_relations or rel_col is None:
                    rel_id = 0
                else:
                    try:
                        rel_id = relation_types.get_id(rel_word)
                    except KeyError:
                        raise RuntimeError(
                            "Could not find relation type in config")

                counters[relation_configs[rel_id].lhs][lhs_word] += 1
                counters[relation_configs[rel_id].rhs][rhs_word] += 1

    entities_by_type: Dict[str, Dictionary] = {}
    for entity_name, counter in counters.items():
        print("Entity type %s:" % entity_name)
        print("- Found %d entities" % len(counter))
        if entity_min_count > 0:
            print("- Removing the ones with fewer than %d occurrences..."
                  % entity_min_count)
            counter = Counter(
                {k: c for k, c in counter.items() if c >= entity_min_count})
            print("- Left with %d entities" % len(counter))
        print("- Shuffling them...")
        names = list(counter.keys())
        random.shuffle(names)
        entities_by_type[entity_name] = Dictionary(
            names, num_parts=entity_configs[entity_name].num_partitions)

    return entities_by_type
def generate_edge_path_files_fast(
    edge_file_in: Path,
    edge_path_out: Path,
    edge_storage: AbstractEdgeStorage,
    entities_by_type: Dict[str, Dictionary],
    relation_types: Dictionary,
    relation_configs: List[RelationSchema],
    edgelist_reader: EdgelistReader,
) -> None:
    """Convert all edges in one pass, keeping them in memory and saving them as a
    single (0, 0) bucket."""
    processed = 0
    skipped = 0

    log("Taking the fast train!")
    data = []
    for lhs_word, rhs_word, rel_word in edgelist_reader.read(edge_file_in):
        if rel_word is None:
            rel_id = 0
        else:
            try:
                rel_id = relation_types.get_id(rel_word)
            except KeyError:
                # Ignore edges whose relation type is not known.
                skipped += 1
                continue

        lhs_type = relation_configs[rel_id].lhs
        rhs_type = relation_configs[rel_id].rhs

        try:
            _, lhs_offset = entities_by_type[lhs_type].get_partition(lhs_word)
            _, rhs_offset = entities_by_type[rhs_type].get_partition(rhs_word)
        except KeyError:
            # Ignore edges whose entities are not known.
            skipped += 1
            continue

        data.append((lhs_offset, rhs_offset, rel_id))

        processed = processed + 1
        if processed % 100000 == 0:
            log(f"- Processed {processed} edges so far...")

    # Guard against an empty edge list: zip(*[]) would yield nothing to unpack.
    lhs_offsets, rhs_offsets, rel_ids = zip(*data) if data else ((), (), ())
    edge_list = EdgeList(
        EntityList.from_tensor(torch.tensor(list(lhs_offsets), dtype=torch.long)),
        EntityList.from_tensor(torch.tensor(list(rhs_offsets), dtype=torch.long)),
        torch.tensor(list(rel_ids), dtype=torch.long),
    )
    edge_storage.save_edges(0, 0, edge_list)

    log(f"- Processed {processed} edges in total")
    if skipped > 0:
        log(
            f"- Skipped {skipped} edges because their relation type or "
            f"entities were unknown (either not given in the config or "
            f"filtered out as too rare)."
        )
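# The reader-based functions in this section take an `edgelist_reader` whose
# read() method yields one (lhs_word, rhs_word, rel_word) tuple per edge, with
# rel_word set to None when the input has no relation column. Below is a minimal,
# illustrative sketch of such a reader for whitespace-separated files; the class
# name and constructor arguments are assumptions made for this example and are
# not necessarily how the EdgelistReader used above is implemented.
class WhitespaceEdgelistReader:
    def __init__(self, lhs_col: int, rhs_col: int, rel_col: Optional[int] = None) -> None:
        self.lhs_col = lhs_col
        self.rhs_col = rhs_col
        self.rel_col = rel_col

    def read(self, path: Path):
        # Yields (lhs_word, rhs_word, rel_word) tuples, one per line of the file.
        with path.open("rt") as tf:
            for line_num, line in enumerate(tf, start=1):
                words = line.split()
                try:
                    lhs_word = words[self.lhs_col]
                    rhs_word = words[self.rhs_col]
                    rel_word = words[self.rel_col] if self.rel_col is not None else None
                except IndexError:
                    raise RuntimeError(
                        f"Line {line_num} of {path} has only {len(words)} words"
                    ) from None
                yield lhs_word, rhs_word, rel_word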
def collect_entities_by_type(
    relation_types: Dictionary,
    entity_configs: Dict[str, EntitySchema],
    relation_configs: List[RelationSchema],
    edge_paths: List[Path],
    dynamic_relations: bool,
    edgelist_reader: EdgelistReader,
    entity_min_count: int,
) -> Dict[str, Dictionary]:
    """Count entity occurrences using the given EdgelistReader and build a
    shuffled, optionally partitioned Dictionary per entity type."""
    counters: Dict[str, Counter[str]] = {}
    for entity_name in entity_configs.keys():
        counters[entity_name] = Counter()

    log("Searching for the entities in the edge files...")
    for edgepath in edge_paths:
        for lhs_word, rhs_word, rel_word in edgelist_reader.read(edgepath):
            if dynamic_relations or rel_word is None:
                rel_id = 0
            else:
                try:
                    rel_id = relation_types.get_id(rel_word)
                except KeyError:
                    raise RuntimeError("Could not find relation type in config")

            counters[relation_configs[rel_id].lhs][lhs_word] += 1
            counters[relation_configs[rel_id].rhs][rhs_word] += 1

    entities_by_type: Dict[str, Dictionary] = {}
    for entity_name, counter in counters.items():
        log(f"Entity type {entity_name}:")
        log(f"- Found {len(counter)} entities")
        if entity_min_count > 0:
            log(
                f"- Removing the ones with fewer than {entity_min_count} occurrences..."
            )
            counter = Counter(
                {k: c for k, c in counter.items() if c >= entity_min_count}
            )
            log(f"- Left with {len(counter)} entities")
        log("- Shuffling them...")
        names = list(counter.keys())
        random.shuffle(names)
        entities_by_type[entity_name] = Dictionary(
            names, num_parts=entity_configs[entity_name].num_partitions
        )

    return entities_by_type
def generate_edge_path_files(
    edge_file_in: str,
    entities_by_type: Dict[str, Dictionary],
    relation_types: Dictionary,
    relation_configs: List[RelationSchema],
    dynamic_relations: bool,
    lhs_col: int,
    rhs_col: int,
    rel_col: Optional[int],
) -> None:
    """Bucket the edges of a TSV file by partition pair and write each bucket to
    its own HDF5 file under <basename>_partitioned."""
    basename, _ = os.path.splitext(edge_file_in)
    edge_path_out = basename + '_partitioned'

    print("Preparing edge path %s, out of the edges found in %s"
          % (edge_path_out, edge_file_in))
    os.makedirs(edge_path_out, exist_ok=True)

    num_lhs_parts = max(entities_by_type[rconfig.lhs].num_parts
                        for rconfig in relation_configs)
    num_rhs_parts = max(entities_by_type[rconfig.rhs].num_parts
                        for rconfig in relation_configs)

    print("- Edges will be partitioned in %d x %d buckets."
          % (num_lhs_parts, num_rhs_parts))

    buckets: DefaultDict[Tuple[int, int], List[Tuple[int, int, int]]] = \
        DefaultDict(list)
    processed = 0
    skipped = 0
    with open(edge_file_in, "rt") as tf:
        for line in tf:
            words = line.split()
            if rel_col is None:
                rel_id = 0
            else:
                try:
                    rel_id = relation_types.get_id(words[rel_col])
                except KeyError:
                    # Ignore edges whose relation type is not known.
                    skipped += 1
                    continue

            if dynamic_relations:
                lhs_type = relation_configs[0].lhs
                rhs_type = relation_configs[0].rhs
            else:
                lhs_type = relation_configs[rel_id].lhs
                rhs_type = relation_configs[rel_id].rhs

            try:
                lhs_part, lhs_offset = \
                    entities_by_type[lhs_type].get_partition(words[lhs_col])
                rhs_part, rhs_offset = \
                    entities_by_type[rhs_type].get_partition(words[rhs_col])
            except KeyError:
                # Ignore edges whose entities are not known.
                skipped += 1
                continue

            buckets[lhs_part, rhs_part].append(
                (lhs_offset, rhs_offset, rel_id))

            processed = processed + 1
            if processed % 100000 == 0:
                print("- Processed %d edges so far..." % processed)

    print("- Processed %d edges in total" % processed)
    if skipped > 0:
        print(
            "- Skipped %d edges because their relation type or entities were "
            "unknown (either not given in the config or filtered out as too "
            "rare)." % skipped)

    for i in range(num_lhs_parts):
        for j in range(num_rhs_parts):
            print("- Writing bucket (%d, %d), containing %d edges..."
                  % (i, j, len(buckets[i, j])))
            # Reshape so that an empty bucket still has three columns and a
            # consistent integer dtype.
            edges = np.asarray(buckets[i, j], dtype=np.int64).reshape(-1, 3)
            with h5py.File(
                    os.path.join(edge_path_out, "edges_%d_%d.h5" % (i, j)),
                    "w") as hf:
                hf.attrs["format_version"] = 1
                hf.create_dataset("lhs", data=edges[:, 0])
                hf.create_dataset("rhs", data=edges[:, 1])
                hf.create_dataset("rel", data=edges[:, 2])
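# For reference, a bucket file written by the HDF5-based variant above can be
# read back with plain h5py. This is an illustrative sketch, not part of the
# library's API: the "edges_<i>_<j>.h5" path layout and the "lhs"/"rhs"/"rel"
# dataset names simply mirror the writer above.
def read_bucket(edge_path_out: str, lhs_part: int, rhs_part: int) -> np.ndarray:
    with h5py.File(
            os.path.join(edge_path_out, "edges_%d_%d.h5" % (lhs_part, rhs_part)),
            "r") as hf:
        # Columns: lhs offset, rhs offset, relation id.
        return np.stack([hf["lhs"][...], hf["rhs"][...], hf["rel"][...]], axis=1)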
def generate_edge_path_files(
    edge_file_in: Path,
    edge_path_out: Path,
    edge_storage: AbstractEdgeStorage,
    entities_by_type: Dict[str, Dictionary],
    relation_types: Dictionary,
    relation_configs: List[RelationSchema],
    dynamic_relations: bool,
    edgelist_reader: EdgelistReader,
    n_flush_edges: int = 100000,
) -> None:
    """Bucket the edges by partition pair, buffering up to n_flush_edges per
    bucket before appending them to storage."""
    log(
        f"Preparing edge path {edge_path_out}, "
        f"out of the edges found in {edge_file_in}"
    )
    edge_storage.prepare()

    num_lhs_parts = max(
        entities_by_type[rconfig.lhs].num_parts for rconfig in relation_configs
    )
    num_rhs_parts = max(
        entities_by_type[rconfig.rhs].num_parts for rconfig in relation_configs
    )

    log(f"- Edges will be partitioned in {num_lhs_parts} x {num_rhs_parts} buckets.")

    processed = 0
    skipped = 0

    # We use an ExitStack in order to close the dynamically-created edge appenders.
    with ExitStack() as appender_stack:
        appenders: Dict[Tuple[int, int], AbstractEdgeAppender] = {}
        data: Dict[Tuple[int, int], List[Tuple[int, int, int]]] = {}
        for lhs_word, rhs_word, rel_word in edgelist_reader.read(edge_file_in):
            if rel_word is None:
                rel_id = 0
            else:
                try:
                    rel_id = relation_types.get_id(rel_word)
                except KeyError:
                    # Ignore edges whose relation type is not known.
                    skipped += 1
                    continue

            if dynamic_relations:
                lhs_type = relation_configs[0].lhs
                rhs_type = relation_configs[0].rhs
            else:
                lhs_type = relation_configs[rel_id].lhs
                rhs_type = relation_configs[rel_id].rhs

            try:
                lhs_part, lhs_offset = entities_by_type[lhs_type].get_partition(
                    lhs_word
                )
                rhs_part, rhs_offset = entities_by_type[rhs_type].get_partition(
                    rhs_word
                )
            except KeyError:
                # Ignore edges whose entities are not known.
                skipped += 1
                continue

            if (lhs_part, rhs_part) not in appenders:
                appenders[lhs_part, rhs_part] = appender_stack.enter_context(
                    edge_storage.save_edges_by_appending(lhs_part, rhs_part)
                )
                data[lhs_part, rhs_part] = []
            part_data = data[lhs_part, rhs_part]

            part_data.append((lhs_offset, rhs_offset, rel_id))
            if len(part_data) > n_flush_edges:
                append_to_file(part_data, appenders[lhs_part, rhs_part])
                part_data.clear()

            processed = processed + 1
            if processed % 100000 == 0:
                log(f"- Processed {processed} edges so far...")

        # Flush whatever is left in each bucket's buffer.
        for (lhs_part, rhs_part), part_data in data.items():
            if len(part_data) > 0:
                append_to_file(part_data, appenders[lhs_part, rhs_part])
                part_data.clear()

    log(f"- Processed {processed} edges in total")
    if skipped > 0:
        log(
            f"- Skipped {skipped} edges because their relation type or "
            f"entities were unknown (either not given in the config or "
            f"filtered out as too rare)."
        )
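# The buffered variant above calls an `append_to_file` helper that is not shown
# in this section. A minimal sketch of what it is assumed to do, following the
# same EdgeList construction pattern used elsewhere in this section: turn a batch
# of (lhs_offset, rhs_offset, rel_id) tuples into one EdgeList and hand it to the
# bucket's appender. The callers above only invoke it with non-empty batches.
def append_to_file(
    data: List[Tuple[int, int, int]], appender: AbstractEdgeAppender
) -> None:
    lhs_offsets, rhs_offsets, rel_ids = zip(*data)
    appender.append_edges(
        EdgeList(
            EntityList.from_tensor(torch.tensor(list(lhs_offsets), dtype=torch.long)),
            EntityList.from_tensor(torch.tensor(list(rhs_offsets), dtype=torch.long)),
            torch.tensor(list(rel_ids), dtype=torch.long),
        )
    )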
def generate_edge_path_files(
    edge_file_in: Path,
    edge_path_out: Path,
    edge_storage: AbstractEdgeStorage,
    entities_by_type: Dict[str, Dictionary],
    relation_types: Dictionary,
    relation_configs: List[RelationSchema],
    dynamic_relations: bool,
    edgelist_reader: EdgelistReader,
) -> None:
    """Bucket the edges by partition pair, appending them one at a time, and
    delegate to the fast path when there is a single bucket and relations are
    not dynamic."""
    log(
        f"Preparing edge path {edge_path_out}, "
        f"out of the edges found in {edge_file_in}"
    )
    edge_storage.prepare()

    num_lhs_parts = max(
        entities_by_type[rconfig.lhs].num_parts for rconfig in relation_configs
    )
    num_rhs_parts = max(
        entities_by_type[rconfig.rhs].num_parts for rconfig in relation_configs
    )
    if not dynamic_relations and num_lhs_parts == 1 and num_rhs_parts == 1:
        log("Using the fast version")
        return generate_edge_path_files_fast(
            edge_file_in,
            edge_path_out,
            edge_storage,
            entities_by_type,
            relation_types,
            relation_configs,
            edgelist_reader,
        )

    log(f"- Edges will be partitioned in {num_lhs_parts} x {num_rhs_parts} buckets.")

    processed = 0
    skipped = 0

    # We use an ExitStack in order to close the dynamically-created edge appenders.
    with ExitStack() as appender_stack:
        appenders: Dict[Tuple[int, int], AbstractEdgeAppender] = {}
        for lhs_word, rhs_word, rel_word in edgelist_reader.read(edge_file_in):
            if rel_word is None:
                rel_id = 0
            else:
                try:
                    rel_id = relation_types.get_id(rel_word)
                except KeyError:
                    # Ignore edges whose relation type is not known.
                    skipped += 1
                    continue

            if dynamic_relations:
                lhs_type = relation_configs[0].lhs
                rhs_type = relation_configs[0].rhs
            else:
                lhs_type = relation_configs[rel_id].lhs
                rhs_type = relation_configs[rel_id].rhs

            try:
                lhs_part, lhs_offset = entities_by_type[lhs_type].get_partition(
                    lhs_word
                )
                rhs_part, rhs_offset = entities_by_type[rhs_type].get_partition(
                    rhs_word
                )
            except KeyError:
                # Ignore edges whose entities are not known.
                skipped += 1
                continue

            if (lhs_part, rhs_part) not in appenders:
                appenders[lhs_part, rhs_part] = appender_stack.enter_context(
                    edge_storage.save_edges_by_appending(lhs_part, rhs_part)
                )

            appenders[lhs_part, rhs_part].append_edges(
                EdgeList(
                    EntityList.from_tensor(
                        torch.tensor([lhs_offset], dtype=torch.long)
                    ),
                    EntityList.from_tensor(
                        torch.tensor([rhs_offset], dtype=torch.long)
                    ),
                    torch.tensor([rel_id], dtype=torch.long),
                )
            )

            processed = processed + 1
            if processed % 100000 == 0:
                log(f"- Processed {processed} edges so far...")

    log(f"- Processed {processed} edges in total")
    if skipped > 0:
        log(
            f"- Skipped {skipped} edges because their relation type or "
            f"entities were unknown (either not given in the config or "
            f"filtered out as too rare)."
        )
def generate_edge_path_files(
    edge_file_in: Path,
    edge_path_out: Path,
    edge_storage: AbstractEdgeStorage,
    entities_by_type: Dict[str, Dictionary],
    relation_types: Dictionary,
    relation_configs: List[RelationSchema],
    dynamic_relations: bool,
    lhs_col: int,
    rhs_col: int,
    rel_col: Optional[int],
) -> None:
    print(f"Preparing edge path {edge_path_out}, "
          f"out of the edges found in {edge_file_in}")
    edge_storage.prepare()

    num_lhs_parts = max(entities_by_type[rconfig.lhs].num_parts
                        for rconfig in relation_configs)
    num_rhs_parts = max(entities_by_type[rconfig.rhs].num_parts
                        for rconfig in relation_configs)

    print(
        f"- Edges will be partitioned in {num_lhs_parts} x {num_rhs_parts} buckets."
    )

    processed = 0
    skipped = 0
    # We use an ExitStack in order to close the dynamically-created edge appenders.
    with edge_file_in.open("rt") as tf, ExitStack() as appender_stack:
        appenders: Dict[Tuple[int, int], AbstractEdgeAppender] = {}
        for line_num, line in enumerate(tf, start=1):
            words = line.split()
            try:
                lhs_word = words[lhs_col]
                rhs_word = words[rhs_col]
                rel_word = words[rel_col] if rel_col is not None else None
            except IndexError:
                raise RuntimeError(
                    f"Line {line_num} of {edge_file_in} has only {len(words)} words"
                ) from None

            if rel_col is None:
                rel_id = 0
            else:
                try:
                    rel_id = relation_types.get_id(rel_word)
                except KeyError:
                    # Ignore edges whose relation type is not known.
                    skipped += 1
                    continue

            if dynamic_relations:
                lhs_type = relation_configs[0].lhs
                rhs_type = relation_configs[0].rhs
            else:
                lhs_type = relation_configs[rel_id].lhs
                rhs_type = relation_configs[rel_id].rhs

            try:
                lhs_part, lhs_offset = \
                    entities_by_type[lhs_type].get_partition(lhs_word)
                rhs_part, rhs_offset = \
                    entities_by_type[rhs_type].get_partition(rhs_word)
            except KeyError:
                # Ignore edges whose entities are not known.
                skipped += 1
                continue

            if (lhs_part, rhs_part) not in appenders:
                appenders[lhs_part, rhs_part] = appender_stack.enter_context(
                    edge_storage.save_edges_by_appending(lhs_part, rhs_part))

            appenders[lhs_part, rhs_part].append_edges(
                EdgeList(
                    EntityList.from_tensor(
                        torch.tensor([lhs_offset], dtype=torch.long)),
                    EntityList.from_tensor(
                        torch.tensor([rhs_offset], dtype=torch.long)),
                    torch.tensor([rel_id], dtype=torch.long),
                ))

            processed = processed + 1
            if processed % 100000 == 0:
                print(f"- Processed {processed} edges so far...")

    print(f"- Processed {processed} edges in total")
    if skipped > 0:
        print(f"- Skipped {skipped} edges because their relation type or "
              f"entities were unknown (either not given in the config or "
              f"filtered out as too rare).")
def generate_edge_path_files(
    edge_file_in: Path,
    edge_path_out: Path,
    edge_storage: AbstractEdgeStorage,
    entities_by_type: Dict[str, Dictionary],
    relation_types: Dictionary,
    relation_configs: List[RelationSchema],
    dynamic_relations: bool,
    lhs_col: int,
    rhs_col: int,
    rel_col: Optional[int],
) -> None:
    print(f"Preparing edge path {edge_path_out}, "
          f"out of the edges found in {edge_file_in}")
    edge_storage.prepare()

    num_lhs_parts = max(entities_by_type[rconfig.lhs].num_parts
                        for rconfig in relation_configs)
    num_rhs_parts = max(entities_by_type[rconfig.rhs].num_parts
                        for rconfig in relation_configs)

    print(
        f"- Edges will be partitioned in {num_lhs_parts} x {num_rhs_parts} buckets."
    )

    buckets: DefaultDict[Tuple[int, int], List[Tuple[int, int, int]]] = \
        DefaultDict(list)
    processed = 0
    skipped = 0
    with edge_file_in.open("rt") as tf:
        for line_num, line in enumerate(tf, start=1):
            words = line.split()
            try:
                lhs_word = words[lhs_col]
                rhs_word = words[rhs_col]
                rel_word = words[rel_col] if rel_col is not None else None
            except IndexError:
                raise RuntimeError(
                    f"Line {line_num} of {edge_file_in} has only {len(words)} words"
                ) from None

            if rel_col is None:
                rel_id = 0
            else:
                try:
                    rel_id = relation_types.get_id(rel_word)
                except KeyError:
                    # Ignore edges whose relation type is not known.
                    skipped += 1
                    continue

            if dynamic_relations:
                lhs_type = relation_configs[0].lhs
                rhs_type = relation_configs[0].rhs
            else:
                lhs_type = relation_configs[rel_id].lhs
                rhs_type = relation_configs[rel_id].rhs

            try:
                lhs_part, lhs_offset = \
                    entities_by_type[lhs_type].get_partition(lhs_word)
                rhs_part, rhs_offset = \
                    entities_by_type[rhs_type].get_partition(rhs_word)
            except KeyError:
                # Ignore edges whose entities are not known.
                skipped += 1
                continue

            buckets[lhs_part, rhs_part].append(
                (lhs_offset, rhs_offset, rel_id))

            processed = processed + 1
            if processed % 100000 == 0:
                print(f"- Processed {processed} edges so far...")

    print(f"- Processed {processed} edges in total")
    if skipped > 0:
        print(f"- Skipped {skipped} edges because their relation type or "
              f"entities were unknown (either not given in the config or "
              f"filtered out as too rare).")

    for i in range(num_lhs_parts):
        for j in range(num_rhs_parts):
            print(f"- Writing bucket ({i}, {j}), "
                  f"containing {len(buckets[i, j])} edges...")
            edges = torch.tensor(buckets[i, j], dtype=torch.long).view((-1, 3))
            edge_storage.save_edges(
                i, j,
                EdgeList(
                    EntityList.from_tensor(edges[:, 0]),
                    EntityList.from_tensor(edges[:, 1]),
                    edges[:, 2],
                ))
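# A hypothetical end-to-end driver tying the two stages together: first build the
# per-type entity dictionaries, then partition and write the edges. The function
# name and argument order here are assumptions for illustration only, not an
# entry point of the library, and it targets the column-based variants of
# collect_entities_by_type and generate_edge_path_files shown in this section.
def convert_edge_file(
    edge_file_in: Path,
    edge_path_out: Path,
    edge_storage: AbstractEdgeStorage,
    entity_configs: Dict[str, EntitySchema],
    relation_configs: List[RelationSchema],
    relation_types: Dictionary,
    dynamic_relations: bool,
    lhs_col: int,
    rhs_col: int,
    rel_col: Optional[int],
    entity_min_count: int,
) -> None:
    # First pass over the file: discover entities and assign them offsets.
    entities_by_type = collect_entities_by_type(
        relation_types,
        entity_configs,
        relation_configs,
        [str(edge_file_in)],
        dynamic_relations,
        lhs_col,
        rhs_col,
        rel_col,
        entity_min_count,
    )
    # Second pass: translate each edge to offsets and write it to its bucket.
    generate_edge_path_files(
        edge_file_in,
        edge_path_out,
        edge_storage,
        entities_by_type,
        relation_types,
        relation_configs,
        dynamic_relations,
        lhs_col,
        rhs_col,
        rel_col,
    )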