def GetBuildInfo() -> config_pb2.BuildInfo:
  """Return the build state."""
  if not _BUILD_INFO.is_file():
    raise OSError("No build_info.pbtxt. Are there runfiles?")
  return pbutil.FromFile(_BUILD_INFO, config_pb2.BuildInfo(),
                         uninitialized_okay=False)
def main(argv) -> None:
  """Main entry point."""
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')
  clone_list_path = pathlib.Path(FLAGS.clone_list or "")
  if not clone_list_path.is_file():
    raise app.UsageError('--clone_list is not a file.')
  clone_list = pbutil.FromFile(clone_list_path,
                               scrape_repos_pb2.LanguageCloneList())
  meta_files = []
  for language in clone_list.language:
    directory = pathlib.Path(language.destination_directory)
    if directory.is_dir():
      meta_files += [
          pathlib.Path(directory / f) for f in directory.iterdir()
          if IsRepoMetaFile(f)
      ]
  random.shuffle(meta_files)
  worker = AsyncWorker(meta_files)
  logging.info('Cloning %s repos from GitHub ...',
               humanize.intcomma(worker.max))
  bar = progressbar.ProgressBar(max_value=worker.max, redirect_stderr=True)
  worker.start()
  while worker.is_alive():
    bar.update(worker.i)
    worker.join(.5)
  bar.update(worker.i)
def PostprocessSampleCorpus(instance: clgen.Instance):
  """Create a corpus from the model samples and pre-process."""
  sample_dir = instance.model.SamplerCache(instance.sampler)
  # Read the sample protos and write them to a directory of content files.
  contentfiles_dir = pathlib.Path(str(sample_dir) + '.contentfiles')
  contentfiles_dir.mkdir(exist_ok=True)
  logging.info('Writing output contentfiles to %s', contentfiles_dir)
  if len(list(contentfiles_dir.iterdir())) != len(list(sample_dir.iterdir())):
    for proto_path in sample_dir.iterdir():
      sample = pbutil.FromFile(proto_path, model_pb2.Sample())
      with open(contentfiles_dir / proto_path.name, 'w') as f:
        f.write(sample.text)
  logging.info('Creating output corpus')
  output_corpus_config = corpus_pb2.Corpus()
  output_corpus_config.CopyFrom(instance.model.corpus.config)
  output_corpus_config.local_directory = str(contentfiles_dir)
  # We derive the programming language name from the input corpus directory.
  # This depends on corpuses being in directories named after their language,
  # e.g. ~/corpuses/opencl, or ~/corpuses/java.
  preprocessed_dir = pathlib.Path(
      instance.model.corpus.preprocessed.url[len('sqlite:///'):]).parent
  language = (preprocessed_dir / 'contentfiles').resolve().name
  output_corpus_config.preprocessor[:] = POSTPROCESSORS[language]
  output_corpus = corpuses.Corpus(output_corpus_config)
  try:
    output_corpus.Create()
  except errors.EmptyCorpusException:
    pass
  return output_corpus
def ConfigFromFlags() -> clgen_pb2.Instance:
  """Read a CLgen instance configuration from the --config flag."""
  config_path = pathlib.Path(FLAGS.config)
  if not config_path.is_file():
    raise app.UsageError(f"CLgen --config file not found: '{config_path}'")
  config = pbutil.FromFile(config_path, clgen_pb2.Instance())
  os.environ['PWD'] = str(config_path.parent)
  return config
def _ReadTestDataStoreFiles() -> datastore_pb2.DataStoreTestSet:
  """Read the config protos for testing.

  The datastore names are derived from the file names.

  Returns:
    A DataStoreTestSet instance.

  Raises:
    AssertionError: In case of error reading datastore configs.
  """
  paths = list(
      pathlib.Path('deeplearning/deepsmith/tests/data/datastores').iterdir())
  assert paths
  names = [p.stem for p in paths]
  protos = [pbutil.FromFile(path, datastore_pb2.DataStore()) for path in paths]
  datastore_set = datastore_pb2.DataStoreTestSet()
  for name, proto in zip(names, protos):
    # There's no graceful error handling here, but it's important that we don't
    # run tests on a datastore unless it's specifically marked as testonly.
    assert proto.testonly
    dst_proto = datastore_set.values[name]
    dst_proto.MergeFrom(proto)
  assert len(datastore_set.values) == len(protos) == len(names) == len(paths)
  return datastore_set
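# A minimal sketch of how the helper above might be exposed to tests as a
# session-scoped pytest fixture. The fixture name `datastore_set` is an
# assumption chosen for illustration, not part of the original test suite.
@pytest.fixture(scope='session')
def datastore_set() -> datastore_pb2.DataStoreTestSet:
  """Yield the set of testonly datastore configs."""
  return _ReadTestDataStoreFiles()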
def CloneFromMetafile(metafile: pathlib.Path) -> None:
  meta = pbutil.FromFile(metafile, scrape_repos_pb2.GitHubRepoMetadata())
  if not (meta.owner and meta.name):
    logging.error('Metafile missing owner and name fields %s', metafile)
    return
  clone_dir = metafile.parent / f'{meta.owner}_{meta.name}'
  logging.debug('%s', meta)
  if (clone_dir / '.git').is_dir():
    return
  # Remove anything left over from a previous attempt.
  subprocess.check_call(['rm', '-rf', str(clone_dir)])
  cmd = ['timeout', f'{FLAGS.repository_clone_timeout_minutes}m',
         '/usr/bin/git', 'clone', meta.clone_from_url, str(clone_dir)]
  logging.debug('$ %s', ' '.join(cmd))
  # Try to checkout the repository and submodules.
  p = subprocess.Popen(cmd + ['--recursive'], stdout=subprocess.PIPE,
                       stderr=subprocess.PIPE, universal_newlines=True)
  _, stderr = p.communicate()
  if p.returncode and 'submodule' in stderr:
    # Remove anything left over from a previous attempt.
    subprocess.check_call(['rm', '-rf', str(clone_dir)])
    # Try again, but this time without cloning submodules.
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         universal_newlines=True)
    _, stderr = p.communicate()
  if p.returncode:
    # Give up.
    logging.warning('\nClone failed %s:\n%s', meta.clone_from_url, stderr)
    # Remove anything left over.
    subprocess.check_call(['rm', '-rf', str(clone_dir)])
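# A minimal sketch of a serial driver for the function above, assuming a
# directory of repo meta files such as the one walked by main(). It reuses
# the IsRepoMetaFile() predicate from that entry point; the driver function
# name is an assumption for illustration.
def CloneAllInDirectory(directory: pathlib.Path) -> None:
  """Clone every repository described by a meta file in `directory`."""
  for path in directory.iterdir():
    if IsRepoMetaFile(path):
      CloneFromMetafile(path)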
def main(argv) -> None:
  """Main entry point."""
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')
  tiers = pbutil.FromFile(pathlib.Path(FLAGS.data_tiers),
                          data_tiers_pb2.DataTiers())
  for tier in tiers.directory:
    logging.info('Processing %s', tier.path)
    _SetDirectorySize(tier)
  if FLAGS.summary:
    # Print the size per directory.
    df = pd.DataFrame([{
        'Path': d.path,
        'Tier': d.tier,
        'Size': humanize.naturalsize(d.size_bytes),
        'Size (bytes)': d.size_bytes
    } for d in tiers.directory if d.size_bytes])
    df = df.sort_values(['Tier', 'Size (bytes)'], ascending=[True, False])
    print(df[['Path', 'Tier', 'Size']].to_string(index=False))
    # Print the total size per tier.
    df2 = df.groupby('Tier').sum()
    df2['Size'] = [
        humanize.naturalsize(d['Size (bytes)']) for _, d in df2.iterrows()
    ]
    df2 = df2.reset_index()
    df2 = df2.sort_values('Tier')
    print()
    print("Totals:")
    print(df2[['Tier', 'Size']].to_string(index=False))
  else:
    print(tiers)
def main(argv):
  """Main entry point."""
  if len(argv) > 1:
    raise app.UsageError("Unknown arguments '{}'".format(', '.join(argv[1:])))
  clone_list_path = pathlib.Path(FLAGS.clone_list or '')
  if not clone_list_path.is_file():
    raise app.UsageError('--clone_list is not a file.')
  clone_list = pbutil.FromFile(clone_list_path,
                               scrape_repos_pb2.LanguageCloneList())
  if not FLAGS.export_path:
    raise app.UsageError('--export_path not set.')
  export_path = pathlib.Path(FLAGS.export_path)
  export_path.mkdir(parents=True, exist_ok=True)

  # To export from contentfiles database.
  # for language in clone_list.language:
  #   d = pathlib.Path(language.destination_directory)
  #   d = d.parent / (str(d.name) + '.db')
  #   db = contentfiles.ContentFiles(d)
  #   with db.Session() as session:
  #     (export_path / language.language).mkdir(exist_ok=True)
  #     ExportDatabase(session, export_path / language.language)

  # To export from index directory.
  for language in clone_list.language:
    index_path = pathlib.Path(language.destination_directory + '.index')
    if index_path.is_dir():
      (export_path / language.language).mkdir(exist_ok=True)
      ExportIndex(index_path, export_path / language.language)
def main(argv):
  """Main entry point."""
  if len(argv) > 1:
    raise app.UsageError('Unrecognized arguments')
  # Parse flags and instantiate testing objects.
  if not FLAGS.interesting_results_dir:
    raise app.UsageError('--interesting_results_dir must be set')
  interesting_results_dir = pathlib.Path(FLAGS.interesting_results_dir)
  if interesting_results_dir.exists() and not interesting_results_dir.is_dir():
    raise app.UsageError('--interesting_results_dir must be a directory')
  logging.info('Recording interesting results in %s.',
               interesting_results_dir)
  for path in interesting_results_dir.iterdir():
    result = pbutil.FromFile(path, deepsmith_pb2.Result())
    print(f'=== BEGIN INTERESTING RESULT {path.stem} ===')
    print('Outcome:', deepsmith_pb2.Result.Outcome.Name(result.outcome))
    print()
    print('OpenCL kernel')
    print('-------------')
    print(fmt.Indent(2, result.testcase.inputs['src']))
    print()
    print('Stderr')
    print('------')
    print(fmt.Indent(2, result.outputs['stderr']))
    print()
def main(argv):
  """Main entry point."""
  if len(argv) > 1:
    raise app.UsageError("Unknown arguments: '{}'.".format(' '.join(argv[1:])))
  start_time = time.time()
  instances = [
      clgen.Instance(p) for p in pbutil.FromFile(
          pathlib.Path(FLAGS.instances), clgen_pb2.Instances()).instance
  ]
  random.shuffle(instances)
  candidate_instances = collections.deque(instances)
  logging.info('Loaded %d instances in %s ms', len(candidate_instances),
               humanize.intcomma(int((time.time() - start_time) * 1000)))
  while candidate_instances:
    instance = candidate_instances.popleft()
    with instance.Session():
      if IsEligible(instance):
        logging.info('Found an eligible candidate to work on')
        SampleModel(instance)
        PostprocessSampleCorpus(instance)
      else:
        logging.info('Candidate is ineligible')
        candidate_instances.append(instance)
        time.sleep(1)
  logging.info('Done.')
def ContentFiles(self) -> typing.Iterable[scrape_repos_pb2.ContentFile]:
  """Return an iterator over all contentfiles in the repo."""
  if self.IsIndexed():
    return (pbutil.FromFile(f, scrape_repos_pb2.ContentFile())
            for f in self.index_dir.iterdir() if f.name != 'DONE.txt')
  else:
    return []
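# A minimal usage sketch for the accessor above, assuming `repo` is an
# instance of the indexed-repo class the method belongs to; the helper name
# and the `repo` parameter are assumptions for illustration.
def CountContentFiles(repo) -> int:
  """Count the indexed content files of a repo without keeping them in memory."""
  return sum(1 for _ in repo.ContentFiles())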
def LoadPositiveNegativeProtos(path: pathlib.Path) -> PositiveNegativeDataset:
  """Load positive and negative training protos from a directory."""
  positive_protos = [
      pbutil.FromFile(p, fish_pb2.CompilerCrashDiscriminatorTrainingExample())
      for p in path.iterdir() if p.name.startswith('positive-')
  ]
  logging.info('Loaded %s positive protos',
               humanize.intcomma(len(positive_protos)))
  negative_protos = [
      pbutil.FromFile(p, fish_pb2.CompilerCrashDiscriminatorTrainingExample())
      for p in path.iterdir() if p.name.startswith('negative-')
  ]
  logging.info('Loaded %s negative protos',
               humanize.intcomma(len(negative_protos)))
  return PositiveNegativeDataset(positive_protos, negative_protos)
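# A hedged test sketch for the loader above. It assumes that the training
# example message can be serialized with no fields set, and that
# PositiveNegativeDataset is a tuple-like (positive, negative) pair as the
# constructor call above suggests; both are assumptions for illustration.
def test_LoadPositiveNegativeProtos_counts(tmp_path):
  """Test that files are split by the positive-/negative- name prefix."""
  for name in ['positive-0.pbtxt', 'positive-1.pbtxt', 'negative-0.pbtxt']:
    pbutil.ToFile(fish_pb2.CompilerCrashDiscriminatorTrainingExample(),
                  tmp_path / name)
  dataset = LoadPositiveNegativeProtos(tmp_path)
  assert len(dataset[0]) == 2
  assert len(dataset[1]) == 1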
def EpochTelemetry(self) -> typing.List[telemetry_pb2.ModelEpochTelemetry]:
  """Return the epoch telemetry files."""
  return [
      pbutil.FromFile(self.logdir / p, telemetry_pb2.ModelEpochTelemetry())
      for p in sorted(self.logdir.iterdir())
      if re.match(r'epoch_\d\d+_telemetry\.pbtxt', str(p.name))
  ]
def test_main_stop_after_train(abc_instance_file):
  """Test that --stop_after train trains the model."""
  app.FLAGS.unparse_flags()
  app.FLAGS(
      ['argv[0]', '--config', abc_instance_file, '--stop_after', 'train'])
  clgen.main([])
  instance = clgen.Instance(
      pbutil.FromFile(pathlib.Path(abc_instance_file), clgen_pb2.Instance()))
  assert instance.model.is_trained
def test_main_stop_after_corpus(abc_instance_file):
  """Test that --stop_after corpus prevents model training."""
  app.FLAGS.unparse_flags()
  app.FLAGS(
      ['argv[0]', '--config', abc_instance_file, '--stop_after', 'corpus'])
  clgen.main([])
  instance = clgen.Instance(
      pbutil.FromFile(pathlib.Path(abc_instance_file), clgen_pb2.Instance()))
  assert not instance.model.is_trained
def test_FromFile_required_fields_not_set_uninitialized_okay(suffix):
  """Test that DecodeError not raised if required fields not set."""
  with tempfile.NamedTemporaryFile(prefix='labm8_proto_', suffix=suffix) as f:
    proto_in = test_protos_pb2.AnotherTestMessage(number=1)
    pbutil.ToFile(proto_in, pathlib.Path(f.name))
    pbutil.FromFile(pathlib.Path(f.name), test_protos_pb2.TestMessage(),
                    uninitialized_okay=True)
def test_FromFile_required_fields_not_set(suffix):
  """Test that DecodeError raised if required fields not set."""
  with tempfile.NamedTemporaryFile(prefix='labm8_proto_', suffix=suffix) as f:
    pbutil.ToFile(test_protos_pb2.AnotherTestMessage(number=1),
                  pathlib.Path(f.name))
    with pytest.raises(pbutil.DecodeError) as e_info:
      pbutil.FromFile(pathlib.Path(f.name), test_protos_pb2.TestMessage())
    assert f"Required fields not set: '{f.name}'" == str(e_info.value)
def GeneratorFromFlag(config_class,
                      generator_class) -> base_generator.GeneratorServiceBase:
  """Instantiate a generator from the --generator_config flag."""
  if not pbutil.ProtoIsReadable(FLAGS.generator_config, config_class()):
    raise app.UsageError(
        f'--generator_config is not a {config_class.__name__} proto')
  config = pbutil.FromFile(pathlib.Path(FLAGS.generator_config),
                           config_class())
  return generator_class(config)
def ProtoFromFile(cls, path: pathlib.Path) -> deepsmith_pb2.Testcase:
  """Instantiate a protocol buffer testcase from file.

  Args:
    path: Path to the testcase proto file.

  Returns:
    Testcase message instance.
  """
  return pbutil.FromFile(path, deepsmith_pb2.Testcase())
def CreateInstanceProtoFromFlags() -> clgen_pb2.Instance:
  """Build a CLgen instance proto from command-line flags."""
  if FLAGS.clgen_instance:
    return pbutil.FromFile(
        pathlib.Path(FLAGS.clgen_instance), clgen_pb2.Instance())
  else:
    return clgen_pb2.Instance(
        working_dir=FLAGS.clgen_working_dir,
        model=CreateModelProtoFromFlags(),
        sampler=CreateSamplerProtoFromFlags(),
    )
def ProtoFromFile(cls, path: pathlib.Path) -> deepsmith_pb2.Result:
  """Instantiate a protocol buffer result from file.

  Args:
    path: Path to the result proto file.

  Returns:
    Result message instance.
  """
  return pbutil.FromFile(path, deepsmith_pb2.Result())
def ServiceConfigFromFlag(
    flag_name: str,
    service_config: pbutil.ProtocolBuffer) -> pbutil.ProtocolBuffer:
  """Read a service config proto from the flag with the given name."""
  if not getattr(FLAGS, flag_name):
    raise app.UsageError(f'--{flag_name} not set.')
  config_path = pathlib.Path(getattr(FLAGS, flag_name))
  if not config_path.is_file():
    cls_name = type(service_config).__name__
    raise app.UsageError(f"{cls_name} file not found: '{config_path}'.")
  return pbutil.FromFile(config_path, service_config)
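# A minimal usage sketch for the helper above, assuming a hypothetical
# --datastore_config flag that points at a DataStore proto file; the flag
# name is an assumption chosen for illustration.
datastore_config = ServiceConfigFromFlag('datastore_config',
                                         datastore_pb2.DataStore())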
def DoFlagsAction():
  """Do the action requested by the command line flags."""
  if not FLAGS.config:
    raise app.UsageError("Missing required argument: '--config'")
  config_path = pathlib.Path(FLAGS.config)
  if not config_path.is_file():
    raise app.UsageError(f"File not found: '{config_path}'")
  config = pbutil.FromFile(config_path, clgen_pb2.Instance())
  os.environ['PWD'] = str(config_path.parent)
  if FLAGS.clgen_profiling:
    prof.enable()
  instance = Instance(config)
  with instance.Session():
    if FLAGS.print_cache_path == 'corpus':
      print(instance.model.corpus.cache.path)
      return
    elif FLAGS.print_cache_path == 'model':
      print(instance.model.cache.path)
      return
    elif FLAGS.print_cache_path == 'sampler':
      print(instance.model.SamplerCache(instance.sampler))
      return
    elif FLAGS.print_cache_path:
      raise app.UsageError(
          f"Invalid --print_cache_path argument: '{FLAGS.print_cache_path}'")
    if FLAGS.print_preprocessed:
      print(instance.model.corpus.GetTextCorpus(shuffle=False))
      return
    # The default action is to sample the model.
    if FLAGS.stop_after == 'corpus':
      instance.model.corpus.Create()
    elif FLAGS.stop_after == 'train':
      instance.model.Train()
      logging.info('Model: %s', instance.model.cache.path)
    elif FLAGS.stop_after:
      raise app.UsageError(
          f"Invalid --stop_after argument: '{FLAGS.stop_after}'")
    elif FLAGS.export_model:
      instance.model.Train()
      export_dir = pathlib.Path(FLAGS.export_model)
      for path in instance.model.InferenceManifest():
        relpath = pathlib.Path(
            os.path.relpath(path, instance.model.cache.path))
        (export_dir / relpath.parent).mkdir(parents=True, exist_ok=True)
        shutil.copyfile(path, export_dir / relpath)
        print(export_dir / relpath)
    else:
      instance.model.Sample(instance.sampler, FLAGS.min_samples)
def FromFile(cls, path: pathlib.Path) -> 'DataStore':
  """Instantiate a DataStore from a config file.

  Args:
    path: Path to the datastore config proto file.

  Returns:
    A DataStore instance.
  """
  config = pbutil.FromFile(path, datastore_pb2.DataStore())
  return DataStore(config)
def PackDataPackage(package_dir: pathlib.Path) -> None:
  """Create an archive and sidecar of a package."""
  manifest = pbutil.FromFile(package_dir / 'MANIFEST.pbtxt',
                             dpack_pb2.DataPackage())
  PackageManifestIsValid(package_dir, manifest)
  archive_path = (
      package_dir / f'../{package_dir.name}.dpack.tar.bz2').resolve()
  sidecar_path = (package_dir / f'../{package_dir.name}.dpack.pbtxt').resolve()
  CreatePackageArchive(package_dir, manifest, archive_path)
  CreatePackageArchiveSidecar(archive_path, manifest, sidecar_path)
def main(argv: typing.List[str]):
  """Main entry point."""
  if len(argv) > 1:
    raise app.UsageError("Unknown arguments: '{}'.".format(' '.join(argv[1:])))
  path = pathlib.Path(FLAGS.delayed_reward_experiment_path)
  data = pbutil.FromFile(path, random_opt_pb2.DelayedRewardExperiment())
  # graph = DelayedRewardExperimentToGraph(data)
  # print(graph.ToDot())
  dot = DelayedRewardExperimentToDot(data)
  print(dot.source)
def test_ToFile_FromFile_equivalence(suffix):
  """Test that ToFile() and FromFile() are symmetrical."""
  with tempfile.TemporaryDirectory(prefix='labm8_proto_') as d:
    path = pathlib.Path(d) / f'proto{suffix}'
    proto_in = test_protos_pb2.TestMessage(string='abc', number=1)
    pbutil.ToFile(proto_in, path)
    assert path.is_file()
    proto_out = test_protos_pb2.TestMessage()
    pbutil.FromFile(path, proto_out)
    assert proto_out.string == 'abc'
    assert proto_out.number == 1
    assert proto_in == proto_out
def GetProtos(export_path: pathlib.Path, outcomes: typing.List[str],
              max_src_len: int) -> typing.List[TrainingProto]:
  """Load training protos for the given outcomes, filtered by source length."""
  paths = sorted(
      labtypes.flatten(
          [list((export_path / outcome).iterdir()) for outcome in outcomes]))
  protos = []
  for path in paths:
    proto = pbutil.FromFile(path, TrainingProto())
    if len(proto.src) > max_src_len:
      continue
    protos.append(proto)
  return protos
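# A minimal usage sketch for the loader above. The export directory, the
# outcome directory names and the 3000-character source cap are assumptions
# chosen for illustration; the real experiment may use different values.
protos = GetProtos(pathlib.Path('/tmp/fish_export'),
                   ['pass', 'build_failure'], 3000)
logging.info('Loaded %d training protos', len(protos))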
def test_config_is_valid():
  """Test that config proto is valid."""
  with tempfile.TemporaryDirectory() as d:
    config = pbutil.FromFile(
        bazelutil.DataPath(
            'phd/deeplearning/clgen/tests/data/c99/config.pbtxt'),
        clgen_pb2.Instance())
    # Change the working directory and corpus path to our bazel run dir.
    config.working_dir = d
    config.model.corpus.local_directory = str(
        bazelutil.DataPath('phd/deeplearning/clgen/tests/data/c99/src/'))
    clgen.Instance(config)
def VerifyManifest(package_dir: pathlib.Path) -> bool:
  """Verify that the MANIFEST.pbtxt file matches the contents."""
  if not (package_dir / 'MANIFEST.pbtxt').is_file():
    logging.info('%s/MANIFEST.pbtxt missing, nothing to do.', package_dir)
    return False
  manifest = pbutil.FromFile(package_dir / 'MANIFEST.pbtxt',
                             dpack_pb2.DataPackage())
  if not PackageManifestIsValid(package_dir, manifest):
    logging.error('Package %s contains errors.', package_dir)
    return False
  logging.info('%s verified. No changes to files in the manifest.',
               package_dir)
  return True
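# A minimal sketch of how the two dpack helpers might be combined: verify a
# package's manifest and only archive it when verification succeeds. The
# driver function name is an assumption chosen for illustration.
def PackIfValid(package_dir: pathlib.Path) -> None:
  """Archive a data package only when its manifest verifies cleanly."""
  if VerifyManifest(package_dir):
    PackDataPackage(package_dir)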