def Split(self, db: ir_database.Database) -> List[np.array]:
  """Split the database.

  Extends the parent POJ-104 split with a random train/val/test split of all
  remaining (non-POJ-104) IRs, in the proportions given by `self.ratios`.

  Args:
    db: The IR database to split.

  Returns:
    A list of three ID arrays: [train, val, test].

  Raises:
    ValueError: If there are no non-POJ-104 IRs to split.
  """
  poj104 = super(TrainValTestSplitter, self).Split(db)
  # The same criteria select the IRs for both the count and the ID queries;
  # define them once so the two queries cannot disagree.
  non_poj104_filters = (
    ir_database.IntermediateRepresentation.compilation_succeeded == True,
    ~ir_database.IntermediateRepresentation.source.like("poj-104:%"),
  )
  # Get the IDs of non-POJ-104 IRs.
  with db.Session() as session:
    total_count = (
      session.query(sql.func.count(ir_database.IntermediateRepresentation.id))
      .filter(*non_poj104_filters)
      .scalar()
    )
    # Scale the train/val/test ratio to the total IR count.
    train_val_test_counts = np.floor(self.ratios * total_count).astype(np.int32)
    # Round up if there were missing values: flooring can leave a remainder of
    # up to two IRs, which are assigned to random splits.
    while train_val_test_counts.sum() < total_count:
      train_val_test_counts[random.randint(0, 2)] += 1
    assert total_count == train_val_test_counts.sum()
    app.Log(
      1,
      "Splitting %s IRs into splits: %s train, %s val, %s test",
      humanize.Commas(total_count + sum(len(s) for s in poj104)),
      humanize.Commas(train_val_test_counts[0] + len(poj104[0])),
      humanize.Commas(train_val_test_counts[1] + len(poj104[1])),
      humanize.Commas(train_val_test_counts[2] + len(poj104[2])),
    )
    # Shuffle the IDs at the database level so contiguous slices below form a
    # random partition.
    ir_ids = [
      row.id
      for row in session.query(ir_database.IntermediateRepresentation.id)
      .filter(*non_poj104_filters)
      .order_by(db.Random())
    ]
    if not ir_ids:
      raise ValueError("No results")
    # Concatenate each random slice onto the corresponding POJ-104 split.
    return [
      np.concatenate((poj104[0], ir_ids[: train_val_test_counts[0]])),
      np.concatenate(
        (
          poj104[1],
          ir_ids[train_val_test_counts[0] : sum(train_val_test_counts[:2])],
        )
      ),
      np.concatenate((poj104[2], ir_ids[sum(train_val_test_counts[:2]) :])),
    ]
def proto_db(
  request, ir_db: ir_database.Database
) -> unlabelled_graph_database.Database:
  """A test fixture which yields a graph database with random graph tuples."""
  # Gather the ID of every IR in the source database.
  with ir_db.Session() as ir_session:
    ir_ids = [
      row.id
      for row in ir_session.query(ir_database.IntermediateRepresentation.id)
    ]
  with testing_databases.DatabaseContext(
    unlabelled_graph_database.Database, request.param
  ) as db:
    # Insert one random program graph per IR.
    graphs = []
    for ir_id in ir_ids:
      graphs.append(
        unlabelled_graph_database.ProgramGraph.Create(
          proto=random_programl_generator.CreateRandomProto(
            graph_y_dimensionality=2
          ),
          ir_id=ir_id,
        )
      )
    with db.Session(commit=True) as graph_session:
      graph_session.add_all(graphs)
    yield db
def AnnotateGraphMetas(
  ir_db: ir_database.Database,
  proto_db: unlabelled_graph_database.Database,
  df: pd.DataFrame,
  ctx: progress.ProgressContext = progress.NullContext,
) -> Iterable[graph_tuple_database.GraphTuple]:
  """Add features and labels to graph metas in database.

  For each row of the dataframe, look up the matching IR and program graph,
  write the row's features/labels into the graph proto, and yield a graph
  tuple built from it.

  Args:
    ir_db: Database of intermediate representations, queried by relpath.
    proto_db: Database of unlabelled program graphs, queried by IR id.
    df: A dataframe with 'relpath', 'data:dataset_name', 'wgsize', 'transfer',
      and 'y' columns. NOTE(review): 'y' appears to be array-like (it is
      `.tolist()`-ed below) — confirm against the producing code.
    ctx: A progress context for logging per-row timing.

  Yields:
    One GraphTuple per dataframe row, annotated with features and labels.

  Raises:
    ValueError: If a row has no matching IR, or the IR has no program graph.
  """
  with ir_db.Session() as ir_session, proto_db.Session() as proto_session:
    for _, row in df.iterrows():
      relpath = row["relpath"]
      with ctx.Profile(
        2, f"Processed graph {row['relpath']}:{row['data:dataset_name']}"
      ):
        # Select the corresponding IR.
        ir_id = (
          ir_session.query(ir_database.IntermediateRepresentation.id)
          .filter(
            ir_database.IntermediateRepresentation.source
            == "pact17_opencl_devmap",
            ir_database.IntermediateRepresentation.relpath == relpath,
          )
          .scalar()
        )
        # Check that we have an exact 1:1 mapping from the opencl devmap dataset
        # to IR. (.scalar() returns None for zero rows and raises for more than
        # one, so this check covers the missing case.)
        if ir_id is None:
          raise ValueError(f"Expected one IR with relpath {relpath}")
        # Load the program graph, eagerly joining the graph data so it is
        # fetched in the same query.
        proto_row = (
          proto_session.query(unlabelled_graph_database.ProgramGraph)
          .filter(unlabelled_graph_database.ProgramGraph.ir_id == ir_id)
          .options(
            sql.orm.joinedload(unlabelled_graph_database.ProgramGraph.data)
          )
          .scalar()
        )
        if proto_row is None:
          raise ValueError(
            f"Expected one proto for relpath {relpath} with ID {ir_id}"
          )
        proto: programl_pb2.ProgramGraph = proto_row.proto
        # Add the null "selector vector" value to every node's features.
        for node in proto.node:
          node.x.append(0)
        # Add the graph-level features.
        proto.x[:] = [row["wgsize"], row["transfer"]]
        # Add 'y' graph feature as target.
        proto.y[:] = row["y"].tolist()
        # Create the graph tuple. Note the jumping through hoops with converting
        # proto -> nx -> graph_tuple, because there is currently no direct
        # proto -> graph_tuple conversion.
        graph_tuple = graph_tuple_database.GraphTuple.CreateFromGraphTuple(
          graph_tuple=graph_tuples.GraphTuple.CreateFromNetworkX(
            programl.ProgramGraphToNetworkX(proto)
          ),
          ir_id=ir_id,
        )
        yield graph_tuple
def db_with_empty_ir(request, db: ir_database.Database) -> ir_database.Database:
  """A fixture which populates the database with `request.param` empty IRs."""
  empty_ir_count = request.param
  # Build all the empty IR rows up front, then insert them in one session.
  empty_irs = [
    ir_database.IntermediateRepresentation.CreateEmpty(
      source="foo",
      relpath=str(i),
      source_language=ir_database.SourceLanguage.C,
      type=ir_database.IrType.LLVM_6_0,
      cflags="",
    )
    for i in range(empty_ir_count)
  ]
  with db.Session(commit=True) as session:
    session.add_all(empty_irs)
  return db
def Split(self, db: ir_database.Database) -> List[np.array]:
  """Get the bytecode IDs for the POJ-104 app classification experiment."""

  def GetBytecodeIds(session, filter_cb) -> np.array:
    """Return the IDs for the given filtered query."""
    query = session.query(ir_database.IntermediateRepresentation.id).filter(
      ir_database.IntermediateRepresentation.compilation_succeeded == True,
      filter_cb(),
    )
    ids = np.array([row.id for row in query], dtype=np.int32)
    if not ids.size:
      raise ValueError("No results")
    return ids

  with db.Session() as session:
    # One split per pre-assigned POJ-104 source, in train/val/test order.
    # Bind `source` as a lambda default to avoid late-binding in the loop.
    return [
      GetBytecodeIds(
        session,
        lambda source=source: (
          ir_database.IntermediateRepresentation.source == source
        ),
      )
      for source in ("poj-104:train", "poj-104:val", "poj-104:test")
    ]
def PopulateBytecodeTable(self, db: ir_database.Database, commit_every: int = 1000):
  """Populate the bytecode table from the programs dataframe.

  Each row of `self.programs_df` is compiled to bytecode in a worker process
  and the resulting protos are added to the database, committing in batches.

  Args:
    db: The database to add the bytecodes to.
    commit_every: The number of bytecodes to buffer between commits.
  """
  programs_df = self.programs_df.reset_index()
  bar = progressbar.ProgressBar()
  bar.max_value = len(programs_df)
  # Process each row of the table in parallel. Use the pool as a context
  # manager so that its worker processes are terminated when done — the
  # previous code leaked the pool.
  with multiprocessing.Pool() as pool:
    with db.Session(commit=True) as s:
      for i, proto in enumerate(
        pool.imap_unordered(
          ProcessOpenClProgramDfBytecode, [d for _, d in programs_df.iterrows()]
        )
      ):
        bar.update(i)
        s.add(
          bytecode_database.LlvmBytecode(
            **bytecode_database.LlvmBytecode.FromProto(proto)
          )
        )
        # Commit in batches to bound the size of the pending transaction.
        if not (i % commit_every):
          s.commit()
def populated_ir_db(
  ir_db: ir_database.Database, opencl_relpaths: Set[str]
) -> ir_database.Database:
  """A test fixture which yields an IR database with 256 OpenCL entries."""
  # Create random rows using OpenCL relpaths, with explicit sequential IDs
  # starting from 1.
  irs = []
  for ordinal, relpath in enumerate(opencl_relpaths, start=1):
    ir = ir_database.IntermediateRepresentation.CreateFromText(
      source="pact17_opencl_devmap",
      relpath=relpath,
      source_language=ir_database.SourceLanguage.OPENCL,
      type=ir_database.IrType.LLVM_6_0,
      cflags="",
      text=CreateRandomString(),
    )
    ir.id = ordinal
    irs.append(ir)
  with ir_db.Session(commit=True) as session:
    session.add_all(irs)
  return ir_db
def Split(self, db: ir_database.Database) -> List[np.array]:
  """Split the database."""
  with db.Session() as session:
    # Select the IDs of all successfully-compiled OpenCL devmap IRs.
    id_query = session.query(ir_database.IntermediateRepresentation.id).filter(
      ir_database.IntermediateRepresentation.compilation_succeeded == True,
      ir_database.IntermediateRepresentation.source == "pact17_opencl_devmap",
    )
    all_ids = np.array([row.id for row in id_query], dtype=np.int32)
    if not all_ids.size:
      raise ValueError("No results")
    # Partition the IDs into k folds, returning only the test indices of each.
    folds = sklearn.model_selection.KFold(self.k).split(all_ids)
    return [all_ids[test_indices] for _, test_indices in folds]