Example #1
File: cldrive.py Project: fivosts/clgen
 def get_execution_times_ms(
     self, src: str, dataset: str, global_size: int, local_size: int
 ) -> typing.Tuple[typing.List[int], typing.List[int], typing.List[int],
                   typing.List[int]]:
     """
 Search the database by hash and return lists of all recorded execution times.
 """
     sha = crypto.sha256_str(src + dataset + str(global_size) +
                             str(local_size))
     ctt, ckt, gtt, gkt = [], [], [], []
     with self.Session() as session:
         entry = session.query(CLDriveSample).filter_by(sha256=sha).first()
         if entry is None:
             return None
         else:
             ctt = [
                 int(x) // 1000
                 for x in entry.cpu_transfer_time_ns.split('\n')
             ]
             ckt = [
                 int(x) // 1000
                 for x in entry.cpu_kernel_time_ns.split('\n')
             ]
             gtt = [
                 int(x) // 1000
                 for x in entry.gpu_transfer_time_ns.split('\n')
             ]
             gkt = [
                 int(x) // 1000
                 for x in entry.gpu_kernel_time_ns.split('\n')
             ]
     return ctt, ckt, gtt, gkt
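Every example on this page keys its database rows and cache lookups on crypto.sha256_str from clgen's crypto module. The helper itself is not shown here; a minimal sketch of what it presumably does, assuming it simply hex-digests the UTF-8 encoding of its argument:

import hashlib

def sha256_str(string: str) -> str:
    # Hypothetical stand-in for crypto.sha256_str: hash the UTF-8 bytes
    # of the string and return the hexadecimal digest.
    return hashlib.sha256(string.encode("utf-8")).hexdigest()

assert len(sha256_str("kernel void A() {}")) == 64  # a 256-bit digest is 64 hex characters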
Example #2
File: dashboard.py Project: fivosts/clgen
def dataset(workspace: str, model_sha: str):
    global data
    global cached_models
    if data == {}:
        data = parseData()

    target_sha = crypto.sha256_str(str(workspace) + model_sha)
    current_model = cached_models[target_sha]

    datasets = []
    for d in glob.glob(str(current_model['path'] / "dataset" / "*.png")):
        png_path = pathlib.Path(d)
        dest_file = MEDIA_PATH / workspace / model_sha / "dataset" / png_path.name
        dest_file.parent.mkdir(exist_ok=True, parents=True)
        shutil.copyfile(png_path, str(dest_file))
        datasets.append({
            'name':
            png_path.stem,
            'plot':
            "/" + str(
                dest_file.relative_to(
                    pathlib.Path(flask_app.static_folder).parent))
        })
    spec_data = {
        'summary': current_model['summary'],
        'workspace': workspace,
        'model_sha': model_sha,
        'datasets': datasets,
    }
    return flask.render_template("dataset.html",
                                 data=spec_data,
                                 **GetBaseTemplateArgs())
Example #3
File: dashboard.py Project: fivosts/clgen
def parseCorpus(workspace_path):

    corpuses = []
    if (workspace_path / "corpus" / "encoded").exists():
        corpus_path = workspace_path / "corpus" / "encoded"
        for corpus_sha in corpus_path.iterdir():
            encoded_db = encoded.EncodedContentFiles("sqlite:///{}".format(
                corpus_sha / "encoded.db"),
                                                     must_exist=True)
            corpus = {
                'path':
                corpus_path / corpus_sha,
                'sha':
                str(corpus_sha.stem),
                'datapoint_count':
                encoded_db.size,
                'summary':
                "{} datapoint corpus, {}".format(encoded_db.size,
                                                 str(corpus_sha.stem)),
                'models':
                parseModels(workspace_path, str(corpus_sha.stem))
            }
            global cached_corpuses
            cached_corpuses[crypto.sha256_str(
                str(workspace_path.name) + str(corpus_sha.name))] = corpus
            corpuses.append(corpus)
    return corpuses
Example #4
File: dashboard.py Project: fivosts/clgen
def training(workspace: str, model_sha: str):
    global data
    global cached_models
    if data == {}:
        data = parseData()

    data['plots'] = []

    target_sha = crypto.sha256_str(str(workspace) + model_sha)
    for d in glob.glob(
            str(cached_models[target_sha]['path'] / "logs" / "*.png")):
        png_file = pathlib.Path(d)
        dest_file = MEDIA_PATH / workspace / model_sha / "logs" / png_file.name
        dest_file.parent.mkdir(exist_ok=True, parents=True)
        shutil.copyfile(png_file, dest_file)
        data['plots'].append("/" + str(
            dest_file.relative_to(
                pathlib.Path(flask_app.static_folder).parent)))

    data['summary'] = cached_models[target_sha]['summary']
    data['workspace'] = workspace
    data['model_sha'] = model_sha
    return flask.render_template("training.html",
                                 data=data,
                                 **GetBaseTemplateArgs())
Example #5
    def remove_identical_files(self) -> None:

        l.logger().info("Removing duplicate files from mined corpus...")
        if os.path.isfile(str(self.cache_path / "record.json")):
            with open(self.cache_path / "record.json", 'r') as f:
                data = json.load(f)
                repos = data[0]
                length = data[1]['total_files']

        cache_map = {}
        for i in range(length):
            with open(self.cache_path / "{}.cl".format(i), 'r') as f:
                cf = f.read()
                cf_hash = crypto.sha256_str(cf)
                if cf_hash not in cache_map:
                    cache_map[cf_hash] = cf

        new_path = self.cache_path / "distinct_corpus"
        new_path.mkdir(exist_ok=True, parents=True)
        for k, v in cache_map.items():
            with open(new_path / "{}.cl".format(k), 'w') as f:
                f.write(v)

        with open(new_path / "record.json", 'w') as f:
            data[1]['total_files'] = len(cache_map)
            json.dump(data, f, indent=2)
        return
Example #6
 def FromProto(cls, id: int,
               proto: model_pb2.Sample) -> typing.Dict[str, typing.Any]:
     return {
         "id":
         id,
         "sha256":
         crypto.sha256_str(proto.text),
         "train_step":
         proto.train_step,
         "encoded_text":
         proto.encoded_text,
         "sample_feed":
         proto.sample_feed,
         "text":
         proto.text,
         "sample_indices":
         proto.sample_indices,
         "encoded_sample_indices":
         proto.encoded_sample_indices,
         "feature_vector":
         proto.feature_vector,
         "num_tokens":
         proto.num_tokens,
         "compile_status":
         proto.compile_status,
         "categorical_sampling":
         proto.categorical_sampling,
         "sample_time_ms":
         proto.sample_time_ms,
         "date_added":
         datetime.datetime.strptime(proto.date_added, "%m/%d/%Y, %H:%M:%S"),
     }
Example #7
    def FromArgs(
        cls,
        tokenizer,
        id: int,
        input_feed: np.array,
        input_features: typing.Dict[str, float],
    ) -> typing.TypeVar("ActiveInput"):
        """Construt ActiveFeed table entry from argumentns."""
        str_input_feed = tokenizer.tokensToString(
            input_feed, ignore_token=tokenizer.padToken)
        if tokenizer.padToken in input_feed:
            num_tokens = np.where(input_feed == tokenizer.padToken)[0][0]
        else:
            num_tokens = len(input_feed)

        return ActiveInput(
            id=id,
            sha256=crypto.sha256_str(str_input_feed),
            input_feed=str_input_feed,
            encoded_feed=','.join([str(x) for x in input_feed]),
            input_features='\n'.join(
                ["{}:{}".format(k, v) for k, v in input_features.items()]),
            num_tokens=int(num_tokens),
            date_added=datetime.datetime.utcnow(),
        )
Example #8
def samples_distribution(data) -> None:
    freqd = {}
    for dp in data:
        gen, sam = dp.generation_id, dp.sample
        hsm = crypto.sha256_str(sam)
        if gen in freqd:
            if hsm in freqd[gen]:
                freqd[gen][hsm] += 1
            else:
                freqd[gen][hsm] = 1
        else:
            freqd[gen] = {}
            freqd[gen][hsm] = 1
    for k, v in freqd.items():
        gdict = {}
        for samp, freq in v.items():
            if freq in gdict:
                gdict[freq] += 1
            else:
                gdict[freq] = 1
        freqd[k] = (list(gdict.keys()), list(gdict.values()))
    plt.GrouppedBars(
        groups=freqd,  # Dict[Dict[int, int]]
        plot_name="freq_samples_per_gen",
        path=pathlib.Path(FLAGS.eval_cand_db).absolute().parent,
        title="Repetition of samples per generation",
        x_name="# of repetitions",
    )

    return
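The first loop above counts how many times each distinct sample (identified by its hash) occurs in every generation; the second inverts that into a frequency-of-frequencies mapping that the GrouppedBars plot consumes. A self-contained sketch of the same transform on toy data, with hashlib standing in for crypto.sha256_str and the plotting call omitted:

import hashlib

# Toy (generation_id, sample) pairs standing in for `data`.
toy = [(0, "a"), (0, "a"), (0, "b"), (1, "c")]

freqd = {}
for gen, sam in toy:
    hsm = hashlib.sha256(sam.encode("utf-8")).hexdigest()
    freqd.setdefault(gen, {})
    freqd[gen][hsm] = freqd[gen].get(hsm, 0) + 1

# Invert per generation: how many distinct samples occurred `freq` times.
for gen, counts in freqd.items():
    gdict = {}
    for freq in counts.values():
        gdict[freq] = gdict.get(freq, 0) + 1
    freqd[gen] = (list(gdict.keys()), list(gdict.values()))

print(freqd)  # {0: ([2, 1], [1, 1]), 1: ([1], [1])}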
Example #9
File: dashboard.py Project: fivosts/clgen
def corpus(workspace: str, corpus_sha: str):
    global data
    global cached_corpuses
    if data == {}:
        data = parseData()

    target_sha = crypto.sha256_str(str(workspace) + corpus_sha)
    corpus = cached_corpuses[target_sha]
    corpus_stats = []

    for d in glob.glob(str(corpus['path'] / "*.png")):
        png_path = pathlib.Path(d)
        dest_file = MEDIA_PATH / workspace / corpus_sha / png_path.name
        dest_file.parent.mkdir(exist_ok=True, parents=True)
        shutil.copyfile(png_path, str(dest_file))
        corpus_stats.append({
            'name':
            png_path.stem,
            'plot':
            "/" + str(
                dest_file.relative_to(
                    pathlib.Path(flask_app.static_folder).parent))
        })
    corpus['stats'] = corpus_stats
    print(corpus['summary'])
    return flask.render_template("corpus.html",
                                 data=corpus,
                                 **GetBaseTemplateArgs())
Example #10
 def FromArgs(cls, act_l_pf: int, act_s_dep: int, act_s_wid: int,
              feat_space: str) -> typing.TypeVar("ActiveSamplingSpecs"):
     return ActiveSamplingSpecs(
         sha256=crypto.sha256_str(
             str(act_l_pf) + str(act_s_dep) + str(act_s_wid) + feat_space),
         active_limit_per_feed=act_l_pf,
         active_search_depth=act_s_dep,
         active_search_width=act_s_wid,
         feature_space=feat_space,
     )
Example #11
File: dashboard.py Project: fivosts/clgen
def model_specs(workspace: str, model_sha: str):
    global data
    global cached_models
    if data == {}:
        data = parseData()

    target_sha = crypto.sha256_str(str(workspace) + model_sha)
    current_model = cached_models[target_sha]
    spec_data = {'config': current_model['config']}
    return flask.render_template("model_specs.html",
                                 data=spec_data,
                                 **GetBaseTemplateArgs())
Example #12
def ContentHash(src: str) -> str:
    """
  Rewrite the code with a deterministic, sequential identifier rewriter, strip whitespace
  and newlines, and compute the hash of the resulting string.

  Args:
    src: The source code to hash.

  Returns:
    256-bit hash of the normalized source code string.
  """
    rw = SequentialNormalizeIdentifiers(src)
    return crypto.sha256_str(rw.replace(" ", "").replace("\n", ""))
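The replace calls above make the hash insensitive to formatting-only differences; identifier renaming is handled by SequentialNormalizeIdentifiers, which is not shown here. A small self-contained illustration of the stripping step alone, with hashlib standing in for crypto.sha256_str:

import hashlib

def strip_and_hash(src: str) -> str:
    # Formatting-only variants collapse to the same digest once spaces
    # and newlines are removed (identifier names still matter here).
    return hashlib.sha256(src.replace(" ", "").replace("\n", "").encode("utf-8")).hexdigest()

a = "kernel void A(global int* x) {\n  x[0] = 1;\n}"
b = "kernel void A(global int* x) { x[0] = 1; }"
assert strip_and_hash(a) == strip_and_hash(b)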
Example #13
File: config.py Project: fivosts/clgen
    def __init__(
        self,
        config: typing.Union[active_learning_pb2.MLP],
        downstream_task: downstream_tasks.DownstreamTask,
    ) -> "ModelConfig":
        if isinstance(config, active_learning_pb2.MLP):
            self.name = "MLP"
        self.config = config
        self.downstream_task = downstream_task
        self.sha256 = crypto.sha256_str(str(config))

        self.num_train_steps = config.num_train_steps
        self.num_warmup_steps = config.num_warmup_steps
        self.num_epochs = 1
        self.steps_per_epoch = config.num_train_steps
        self.batch_size = config.batch_size

        self.learning_rate = config.initial_learning_rate_micros / 1e6
        self.max_grad_norm = 1.0

        if len(self.config.layer) == 0:
            raise ValueError("Layer list is empty for committee model")
        if self.config.layer[0].HasField("linear"):
            if self.config.layer[
                    0].linear.in_features != self.downstream_task.input_size:
                raise ValueError(
                    "Mismatch between committee member's input size {} and downstream task's input size {}"
                    .format(self.config.layer[0].linear.in_features,
                            self.downstream_task.input_size))
        self.layer_config = []
        for l in self.config.layer:
            if l.HasField("linear"):
                self.layer_config.append(('Linear', {
                    'in_features':
                    l.linear.in_features,
                    'out_features':
                    l.linear.out_features,
                }))
            elif l.HasField("dropout"):
                self.layer_config.append(('Dropout', {
                    'dropout_prob':
                    l.dropout.dropout_prob
                }))
            elif l.HasField("layer_norm"):
                self.layer_config.append(('LayerNorm', {
                    'layer_norm_eps':
                    l.layer_norm.layer_norm_eps
                }))
            elif l.HasField("act_fn"):
                self.layer_config.append((l.act_fn, {}))
        return
Example #14
File: cldrive.py Project: fivosts/clgen
 def FromArgs(
     cls,
     id: int,
     global_size: int,
     local_size: int,
     source: str,
     dataset: str,
     cpu_transfer_time_ns: typing.List[int],
     cpu_kernel_time_ns: typing.List[int],
     gpu_transfer_time_ns: typing.List[int],
     gpu_kernel_time_ns: typing.List[int],
     transferred_bytes: int,
     status: str,
 ) -> typing.Dict[str, typing.Any]:
     return CLDriveSample(
         **{
             "id":
             id,
             "sha256":
             crypto.sha256_str(source + dataset + str(global_size) +
                               str(local_size)),
             "global_size":
             global_size,
             "local_size":
             local_size,
             "source":
             source,
             "dataset":
             dataset,
             "cpu_transfer_time_ns":
             '\n'.join(
                 [str(int(x)) for x in cpu_transfer_time_ns if x != 'nan']),
             "cpu_kernel_time_ns":
             '\n'.join(
                 [str(int(x)) for x in cpu_kernel_time_ns if x != 'nan']),
             "gpu_transfer_time_ns":
             '\n'.join(
                 [str(int(x)) for x in gpu_transfer_time_ns if x != 'nan']),
             "gpu_kernel_time_ns":
             '\n'.join(
                 [str(int(x)) for x in gpu_kernel_time_ns if x != 'nan']),
             "transferred_bytes":
             transferred_bytes,
             "status":
             status,
             "date_added":
             datetime.datetime.utcnow(),
         })
Example #15
def IRContentHash(src: str,
                  header_file=None,
                  use_aux_headers: bool = True) -> str:
    """
  Collect optimized LLVM-IR of source code and compute its hash.

  Args:
    src: The source code to compile and hash.

  Returns:
    256-bit hash of the optimized LLVM-IR string.
  """
    bc = CompileLlvmBytecode(src,
                             header_file=header_file,
                             use_aux_headers=use_aux_headers)
    return crypto.sha256_str(''.join(bc.split('\n')[2:]))
Example #16
    def FromArgs(
        cls,
        tokenizer,
        id: int,
        input_feed: np.array,
        input_features: typing.Dict[str, float],
        sample: np.array,
        output_features: typing.Dict[str, float],
        sample_quality: float,
        target_benchmark: typing.Tuple[str, str],
        target_features: typing.Dict[str, float],
        compile_status: bool,
        generation_id: int,
    ) -> typing.TypeVar("ActiveFeed"):
        """Construt ActiveFeed table entry from argumentns."""
        str_input_feed = tokenizer.tokensToString(
            input_feed, ignore_token=tokenizer.padToken, with_formatting=True)
        str_sample = tokenizer.ArrayToCode(sample, with_formatting=True)

        num_tokens = len(sample)
        if tokenizer.padToken in sample:
            num_tokens = np.where(sample == tokenizer.padToken)[0][0]

        return ActiveFeed(
            id=id,
            sha256=crypto.sha256_str(str_input_feed + str_sample),
            input_feed=str_input_feed,
            encoded_feed=','.join([str(x) for x in input_feed]),
            input_features='\n'.join(
                ["{}:{}".format(k, v) for k, v in input_features.items()]),
            sample=str_sample,
            num_tokens=int(num_tokens),
            output_features='\n'.join([
                "{}:{}".format(k, v) for k, v in output_features.items()
            ]) if output_features else "None",
            target_benchmark="// {}\n{}".format(target_benchmark[0],
                                                target_benchmark[1]),
            target_features='\n'.join([
                "{}:{}".format(k, v) for k, v in target_features.items()
            ]) if target_features else "None",
            sample_quality=sample_quality,
            compile_status=compile_status,
            generation_id=generation_id,
            date_added=datetime.datetime.utcnow(),
        )
Example #17
File: cldrive.py Project: fivosts/clgen
 def get_entry(self, src: str, dataset: str, global_size: int,
               local_size: int) -> "CLDriveSample":
     """
 Fetch the row from the DB, if it exists.
 """
     sha = crypto.sha256_str(src + dataset + str(global_size) +
                             str(local_size))
     try:
         with self.Session() as session:
             entry = session.query(CLDriveSample).filter_by(
                 sha256=sha).first()
             if entry is not None:
                 return entry
             else:
                 return None
     except Exception as e:
         l.logger().error(e)
     return
Example #18
File: dashboard.py Project: fivosts/clgen
def sampling(workspace: str, model_sha: str):
    global data
    global cached_models
    if data == {}:
        data = parseData()

    target_sha = crypto.sha256_str(str(workspace) + model_sha)
    current_model = cached_models[target_sha]
    samplers = current_model['samplers']

    data = {
        'summary': current_model['summary'],
        'workspace': workspace,
        'model_sha': model_sha,
        'samplers': samplers,
    }
    return flask.render_template("sampling.html",
                                 data=data,
                                 **GetBaseTemplateArgs())
Example #19
File: dashboard.py Project: fivosts/clgen
def parseModels(workspace_path, corpus_sha: str):

    models = []
    if (workspace_path / "model").exists():
        for model_sha in (workspace_path / "model").iterdir():
            model_path = workspace_path / "model" / model_sha
            if (model_path / "tokenizer").exists() and pathlib.Path(
                    os.readlink(
                        model_path / "tokenizer")).parent.name == corpus_sha:
                if (model_path / "META.pbtxt").exists():
                    meta = parseMeta(model_path / "META.pbtxt")
                    model = {
                        'path':
                        model_path,
                        'sha':
                        str(model_sha.name),
                        'config':
                        meta,
                        'tokenizer':
                        tokenizers.TokenizerBase.FromFile(
                            model_path / pathlib.Path(
                                os.readlink(model_path / "tokenizer"))),
                        'training_log':
                        parseTrainLogs(model_path / "logs"),  # TODO
                        'validation':
                        parseValidationDB(model_path / "logs" /
                                          "validation_samples.db"),
                        'samplers':
                        parseSamplers(workspace_path, model_path / "samples",
                                      str(model_sha.name)),  # TODO sample_db ?
                        'summary':
                        parseModelSummary(meta)
                    }
                    global cached_models
                    cached_models[crypto.sha256_str(
                        str(workspace_path.name) +
                        str(model_sha.name))] = model
                    models.append(model)

    return models
Example #20
File: clsmith.py Project: fivosts/clgen
 def FromArgs(cls,
              id      : int,
              sample  : str,
              include : str,
              encoded_sample : str,
              compile_status : bool,
              feature_vector : str,
              num_tokens     : int,
              ) -> "CLSmithSample":
   """
   Do you want to use CLSmithDatabase as a means to store only code
   without much fuss? This function is for you!
   """
   return CLSmithSample(**{
     "id"             : id,
     "sha256"         : crypto.sha256_str(sample),
     "sample"         : sample,
     "include"        : include,
     "encoded_sample" : encoded_sample,
     "compile_status" : compile_status,
     "feature_vector" : feature_vector,
     "num_tokens"     : num_tokens,
     "date_added"     : datetime.datetime.utcnow(),
   })
Example #21
 def FromArgsLite(cls, id: int, text: str, feature_vector: str,
                  compiles: bool) -> "Sample":
     """
 Do you want to use SamplesDatabase as a means to store only code
 without much fuss? This function is for you!
 """
     return Sample(
         **{
             "id": id,
             "sha256": crypto.sha256_str(text),
             "train_step": -1,
             "encoded_text": "",
             "original_input": "",
             "sample_feed": "",
             "text": text,
             "sample_indices": "",
             "encoded_sample_indices": "",
             "compile_status": compiles,
             "feature_vector": feature_vector,
             "num_tokens": 0,
             "categorical_sampling": "False",
             "sample_time_ms": 0,
             "date_added": datetime.datetime.utcnow(),
         })
Example #22
 def OnSample(self, sample: model_pb2.Sample) -> bool:
   """Sample receive callback. Returns True if sampling should continue."""
   sample_id = crypto.sha256_str(sample.text)
   sample_path = self.cache_path / f"{sample_id}.pbtxt"
   pbutil.ToFile(sample, sample_path)
   return True
Example #23
    def FromArgs(
        cls,
        tokenizer,
        id: int,
        train_step: int,
        seen_in_training,
        original_input: typing.List[int],
        input_ids: typing.List[int],
        input_mask: typing.List[int],
        masked_lm_ids: typing.List[int],
        masked_lm_positions: typing.List[int],
        masked_lm_weights: typing.List[float],
        masked_lm_lengths: typing.List[int],
        next_sentence_labels: typing.List[int],
        masked_lm_predictions: typing.List[int],
        next_sentence_predictions: typing.List[int],
    ) -> typing.Dict[str, typing.Any]:

        str_original_input = tokenizer.tokensToString(
            original_input,
            ignore_token=tokenizer.padToken,
            with_formatting=True)
        str_input_ids = tokenizer.tokensToString(
            input_ids, ignore_token=tokenizer.padToken, with_formatting=True)
        str_masked_lm_ids = '\n'.join([
            tokenizer.decoder[x] if
            ('\n' not in tokenizer.vocab or
             ('\n' in tokenizer.vocab and x != tokenizer.vocab['\n'])) else
            '\\n' for x in masked_lm_ids
        ])
        str_masked_lm_predictions = '\n'.join([
            tokenizer.decoder[x] if
            ('\n' not in tokenizer.vocab or
             ('\n' in tokenizer.vocab and x != tokenizer.vocab['\n'])) else
            '\\n' for x in masked_lm_predictions
        ])

        return {
            "id":
            id,
            "sha256":
            crypto.sha256_str(
                str(int(train_step)) + str_original_input + str_input_ids +
                str_masked_lm_ids + str_masked_lm_predictions),
            "train_step":
            int(train_step),
            "original_input":
            str_original_input,
            "encoded_original_input":
            ','.join([str(x) for x in original_input]),
            "input_ids":
            str_input_ids,
            "encoded_input_ids":
            ','.join([str(x) for x in input_ids]),
            "input_mask":
            ','.join([str(x) for x in input_mask]),
            "masked_lm_positions":
            ','.join([str(x) for x in masked_lm_positions]),
            "masked_lm_ids":
            str_masked_lm_ids,
            "encoded_mask_lm_ids":
            ','.join([str(x) for x in masked_lm_ids]),
            "masked_lm_weights":
            ','.join([str(int(x)) for x in masked_lm_weights]),
            "masked_lm_lengths":
            ','.join([str(int(x)) for x in masked_lm_lengths if x >= 0]),
            "next_sentence_labels":
            int(next_sentence_labels),
            "masked_lm_predictions":
            str_masked_lm_predictions,
            "encoded_masked_lm_predictions":
            ','.join([str(x) for x in masked_lm_predictions]),
            "next_sentence_predictions":
            int(next_sentence_predictions),
            "num_targets":
            list(masked_lm_ids).index(tokenizer.padToken) if tokenizer.padToken
            in list(masked_lm_ids) else len(list(masked_lm_ids)),
            "seen_in_training":
            int(seen_in_training),
            "date_added":
            datetime.datetime.utcnow(),
        }
Example #24
File: cldrive.py Project: fivosts/clgen
 def add_entry(self, src: str, dataset: str, status: str, global_size: int,
               local_size: int, df: pd.DataFrame) -> None:
     """
 Add execution entries from a pandas DataFrame.
 """
     sha = crypto.sha256_str(src + dataset + str(global_size) +
                             str(local_size))
     try:
         with self.Session(commit=True) as session:
             entry = session.query(CLDriveSample).filter_by(
                 sha256=sha).first()
             if entry is None:
                 if status in {"CPU", "GPU"}:
                     idx = 0
                     transferred_bytes = float('NaN')
                     while idx < len(df.transferred_bytes) and math.isnan(
                             transferred_bytes):
                         try:
                             transferred_bytes = int(
                                 df.transferred_bytes[idx])
                         except ValueError:
                             idx += 1
                     session.add(
                         CLDriveSample.FromArgs(
                             id=self.count,
                             global_size=global_size,
                             local_size=local_size,
                             source=src,
                             dataset=dataset,
                             cpu_transfer_time_ns=list(
                                 df[df['device'].str.contains(
                                     "CPU")].transfer_time_ns),
                             cpu_kernel_time_ns=list(
                                 df[df['device'].str.contains(
                                     "CPU")].kernel_time_ns),
                             gpu_transfer_time_ns=list(
                                 df[df['device'].str.contains(
                                     "GPU")].transfer_time_ns),
                             gpu_kernel_time_ns=list(
                                 df[df['device'].str.contains(
                                     "GPU")].kernel_time_ns),
                             transferred_bytes=transferred_bytes,
                             status=status,
                         ))
                 else:
                     session.add(
                         CLDriveSample.FromArgs(
                             id=self.count,
                             global_size=global_size,
                             local_size=local_size,
                             source=src,
                             dataset=dataset,
                             cpu_transfer_time_ns=[],
                             cpu_kernel_time_ns=[],
                             gpu_transfer_time_ns=[],
                             gpu_kernel_time_ns=[],
                             transferred_bytes=-1,
                             status=status,
                         ))
                 if self._status_cache is not None:
                     assert sha not in self._status_cache, "{} should not be in DB".format(
                         sha)
                     self._status_cache[sha] = status
             elif status in {"CPU", "GPU"}:
                 assert False, "This shouldn't happen"
                 entry.cpu_transfer_time_ns = entry.cpu_transfer_time_ns + "\n" + '\n'.join(
                     [
                         str(x) for x in df[df['device'].str.contains(
                             "CPU")].transfer_time_ns
                     ])
                 entry.cpu_kernel_time_ns = entry.cpu_kernel_time_ns + "\n" + '\n'.join(
                     [
                         str(x) for x in df[df['device'].str.contains(
                             "CPU")].kernel_time_ns
                     ])
                 entry.gpu_transfer_time_ns = entry.gpu_transfer_time_ns + "\n" + '\n'.join(
                     [
                         str(x) for x in df[df['device'].str.contains(
                             "GPU")].transfer_time_ns
                     ])
                 entry.gpu_kernel_time_ns = entry.gpu_kernel_time_ns + "\n" + '\n'.join(
                     [
                         str(x) for x in df[df['device'].str.contains(
                             "GPU")].kernel_time_ns
                     ])
             session.commit()
     except Exception as e:
         raise e
     return
Example #25
File: dashboard.py Project: fivosts/clgen
def validation_samples(workspace: str, model_sha: str):
    global data
    global cached_models
    if data == {}:
        data = parseData()

    target_sha = crypto.sha256_str(str(workspace) + model_sha)
    current_model = cached_models[target_sha]
    validation = current_model['validation']

    if validation['path']:

        val_db = validation_database.ValidationDatabase(str(
            validation['path']),
                                                        must_exist=True)
        with val_db.Session() as session:
            validation['val_samples'] = session.query(
                validation_database.BERTValFile).all()
            validation['val_metrics'] = session.query(
                validation_database.ValResults).all()
            # random.shuffle(validation['val_samples'])

        for sample in validation['val_samples']:
            processed_input_ids = []
            if '[HOLE]' in sample.input_ids:
                mask_type = '[HOLE]'
            elif '[MASK]' in sample.input_ids:
                mask_type = '[MASK]'
            else:
                mask_type = ''

            if mask_type == '[HOLE]':
                input_ids = sample.input_ids.split(mask_type)
                mask_num = sample.num_targets
                for i in range(mask_num):
                    processed_input_ids += [
                        {
                            'text': input_ids[i],
                            'color': 'plain',
                            'length': len(input_ids[i]),
                        },
                        {
                            'text': mask_type,
                            'color': 'hole',
                            'length':
                            int(sample.masked_lm_lengths.split(',')[i]),
                        },
                        {
                            'text':
                            sample.masked_lm_predictions.split('\n')
                            [i].replace(' ', '[ ]').replace('\n', '\\n'),
                            'color':
                            'prediction',
                            'length':
                            1,
                        },
                        {
                            'text':
                            sample.masked_lm_ids.split('\n')[i].replace(
                                ' ', '[ ]').replace('\n', '\\n'),
                            'color':
                            'target',
                            'length':
                            1,
                        },
                    ]
                while i < len(input_ids) - 1:
                    i += 1
                    processed_input_ids.append(
                        {
                            'text': input_ids[i],
                            'color': 'plain',
                            'length': len(input_ids[i]),
                        }, )

            elif mask_type == '[MASK]':
                processed_input_ids = [{
                    'text': sample.input_ids,
                    'color': 'plain',
                }]

            sample.input_ids = processed_input_ids
    validation['summary'] = current_model['summary']
    validation['workspace'] = workspace
    validation['model_sha'] = model_sha
    return flask.render_template("validation_samples.html",
                                 data=validation,
                                 **GetBaseTemplateArgs())
Example #26
def DriveSource(src        : str,
                group_name : str,
                feats      : typing.Dict[str, float],
                cldrive_db : cldrive.CLDriveExecutions,
                ) -> typing.Generator:
  """
  For a given source code, run it through CLDrive and yield ready DataFrame rows.
  Args:
    src        : source code to process
    feats      : Grewe Feature vector of source code.
    cldrive_db : Caches cldrive executions of source code.
  """
  # for gsize in tqdm.tqdm([2**6, 2**7, 2**8, 2**10, 2**12, 2**14, 2**16, 2**18, 2**20], desc = "gsize", leave = False):
  for gsize in tqdm.tqdm([2**10, 2**12, 2**14, 2**16, 2**18, 2**20], desc = "gsize", leave = False):
    for lsize in tqdm.tqdm([2**2, 2**3, 2**4, 2**5, 2**6, 2**7, 2**8], desc = "lsize", leave = False):
      if lsize > gsize:
        continue

      sha = crypto.sha256_str(src + group_name + str(gsize) + str(lsize))
      if sha in cldrive_db.status_cache:
        cached = cldrive_db.get_entry(src, group_name, gsize, lsize)
        if cached.status in {"CPU", "GPU"}:
          yield ToDataFrameRow(
            name                 = "{}.cl".format(sha),
            grewe_feats          = feats,
            transferred_bytes    = cached.transferred_bytes,
            global_size          = gsize,
            local_size           = lsize,
            label                = cached.status,
            cpu_transfer_time_ns = sum([int(float(x)) for x in cached.cpu_transfer_time_ns.split('\n') if x != 'nan']) // len([x for x in cached.cpu_transfer_time_ns.split('\n') if x != 'nan']),
            cpu_kernel_time_ns   = sum([int(float(x)) for x in cached.cpu_kernel_time_ns.split('\n') if x != 'nan'])   // len([x for x in cached.cpu_kernel_time_ns.split('\n') if x != 'nan']),
            gpu_transfer_time_ns = sum([int(float(x)) for x in cached.gpu_transfer_time_ns.split('\n') if x != 'nan']) // len([x for x in cached.gpu_transfer_time_ns.split('\n') if x != 'nan']),
            gpu_kernel_time_ns   = sum([int(float(x)) for x in cached.gpu_kernel_time_ns.split('\n') if x != 'nan'])   // len([x for x in cached.gpu_kernel_time_ns.split('\n') if x != 'nan']),
          )
        else:
          yield None
      else:
        df, label = opencl.CLDriveDataFrame(src, num_runs = 100, gsize = gsize, lsize = lsize, timeout = 60)
        cldrive_db.add_entry(src, group_name, label, gsize, lsize, df)
        if label not in {"CPU", "GPU"}:
          yield None
        else:
          idx = 0
          transferred_bytes = float('NaN')
          while idx < len(df.transferred_bytes) and math.isnan(transferred_bytes):
            try:
              transferred_bytes = int(df.transferred_bytes[idx])
            except ValueError:
              idx += 1
          yield ToDataFrameRow(
            name                 = "{}.cl".format(sha),
            grewe_feats          = feats,
            transferred_bytes    = transferred_bytes,
            global_size          = gsize,
            local_size           = lsize,
            label                = label,
            cpu_transfer_time_ns = df[df['device'].str.contains("CPU")].transfer_time_ns.mean(),
            cpu_kernel_time_ns   = df[df['device'].str.contains("CPU")].kernel_time_ns.mean(),
            gpu_transfer_time_ns = df[df['device'].str.contains("GPU")].transfer_time_ns.mean(),
            gpu_kernel_time_ns   = df[df['device'].str.contains("GPU")].kernel_time_ns.mean(),
          )
Example #27
 def OnSample(self, sample: model_pb2.Sample) -> bool:
   """Sample receive callback. Returns True if sampling should continue."""
   sample_id = crypto.sha256_str(sample.text)
   path = self.path / f"{sample_id}.txt"
   fs.Write(path, sample.text.encode("utf-8"))
   return True
Example #28
File: dashboard.py Project: fivosts/clgen
def sample_files(workspace: str, model_sha: str, sampler_sha: str,
                 sample_db: str):

    global data
    global cached_models
    if data == {}:
        data = parseData()

    current_sampler = {}
    target_sha = crypto.sha256_str(str(workspace) + model_sha)

    for sampler in cached_models[target_sha]['samplers']:
        if sampler['sha'] == sampler_sha:
            current_sampler = sampler
            break

    db_file = current_sampler['path'] / "{}.db".format(sample_db)
    samples_db = samples_database.SamplesDatabase(
        "sqlite:///{}".format(db_file), must_exist=True)

    with samples_db.Session() as session:
        sample_files = session.query(samples_database.Sample).all()

    for sample in sample_files:
        processed_feed = []
        processed_indices = []
        if '[HOLE]' in sample.sample_feed:
            mask_type = '[HOLE]'
        elif '[MASK]' in sample.sample_feed:
            mask_type = '[MASK]'
        else:
            mask_type = ''
        sample_feed = sample.sample_feed.split(mask_type)
        sample_indices = sample.sample_indices.split('\n')
        assert len(sample_feed) - 1 == len(sample_indices), (
            "sample hole length/generation mismatch: {}, {}".format(
                len(sample_feed),
                len(sample_indices),
            ))

        prediction = sample.text

        for i in range(len(sample_feed) - 1):
            processed_feed += [
                {
                    'text': sample_feed[i],
                    'color': 'plain',
                },
                {
                    'text': mask_type,
                    'color': 'mask',
                },
            ]
            processed_indices += [
                {
                    'text': sample_feed[i],
                    'color': 'plain',
                },
                {
                    'text': mask_type,
                    'color': 'mask',
                },
                {
                    'text': sample_indices[i].replace("\\n", "\n"),
                    'color': 'prediction',
                },
            ]
        while i < len(sample_feed) - 1:
            i += 1
            processed_indices.append(
                {
                    'text': sample_feed[i],
                    'color': 'plain',
                }, )
            processed_feed.append({'text': sample_feed[i], 'color': 'plain'})
        sample.sample_indices = processed_indices
        sample.sample_feed = processed_feed

    sample_specs = {
        'summary': cached_models[target_sha]['summary'],
        'workspace': workspace,
        'model_sha': model_sha,
        'samples': sample_files,
    }
    return flask.render_template("sample_files.html",
                                 data=sample_specs,
                                 **GetBaseTemplateArgs())
Example #29
File: corpuses.py Project: fivosts/clgen
def ResolveContentId(config: typing.Union[corpus_pb2.Corpus, corpus_pb2.PreTrainCorpus]) -> str:
  """Compute the hash of the input contentfiles.

  This function resolves the unique sha1 checksum of a set of content files.

  Args:
    config: The corpus config proto.

  Returns:
    A hex encoded sha1 string.
  """
  # We can take a massive shortcut if the content ID is already set in the
  # config proto.
  if config.HasField("content_id"):
    # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor this after splitting
    # out Corpus class.
    return config.content_id
  elif config.HasField("pre_encoded_corpus_url"):
    # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor this after splitting
    # out Corpus class.
    return crypto.sha1_str(config.pre_encoded_corpus_url)

  start_time = time.time()
  if config.HasField("local_directory"):
    local_directory = ExpandConfigPath(
      config.local_directory, path_prefix=FLAGS.clgen_local_path_prefix
    )

    # After the first time we compute the hash of a directory, we write it into
    # a file. This is a shortcut to work around the fact that computing the
    # directory checksum is O(n) with respect to the number of files in the
    # directory (even if the directory is already cached by the hash cache).
    # This means that it is the responsibility of the user to delete this cached
    # file if the directory is changed.
    hash_file_path = pathlib.Path(str(local_directory) + ".sha1.txt")
    if hash_file_path.is_file():
      l.logger().info("Reading directory hash: '{}'.".format(hash_file_path))
      with open(hash_file_path) as f:
        content_id = f.read().rstrip()
    else:
      # No hash file, so compute the directory hash and create it.
      try:
        # content_id = hc.GetHash(local_directory)
        content_id = crypto.sha256_str(str(local_directory))
      except FileNotFoundError as e:
        raise ValueError(e)
      # Create the hash file in the directory so that next time we don't need
      # to reference the hash cache.
      with open(hash_file_path, "w") as f:
        print(content_id, file=f)
      l.logger().info("Wrote directory hash: '{}'.".format(hash_file_path))
  elif config.HasField("local_tar_archive"):
    # This is not an efficient means of getting the hash, as it requires always
    # unpacking the archive and reading the entire contents. It would be nicer
    # to maintain a cache which maps the mtime of tarballs to their content ID,
    # similar to how local_directory is implemented.
    content_id = GetHashOfArchiveContents(
      ExpandConfigPath(config.local_tar_archive, path_prefix=FLAGS.clgen_local_path_prefix)
    )
  elif config.HasField("bq_database"):
    content_id = crypto.sha256_str(str(config.bq_database))
  # elif config.HasField("fetch_github"):

  #   gitfile_path = ExpandConfigPath(
  #     config.fetch_github, path_prefix=FLAGS.clgen_local_path_prefix
  #   )
  #   gitfile_path.mkdir(exist_ok=True, parents=True)
  #   github_fetcher = github.GithubFetcher(gitfile_path)

  #   github_fetcher.fetch()
  #   hash_file_path = pathlib.Path(str(gitfile_path) + ".sha1.txt")
  #   if hash_file_path.is_file():
  #     l.logger().info("Reading directory hash: '{}'.".format(hash_file_path))
  #     with open(hash_file_path) as f:
  #       content_id = f.read().rstrip()
  #   else:
  #     # No hash file, so compute the directory hash and create it.
  #     try:
  #       content_id = hc.GetHash(gitfile_path)
  #     except FileNotFoundError as e:
  #       raise ValueError(e)
  #     # Create the hash file in the directory so that next time we don't need
  #     # to reference the hash cache.
  #     with open(hash_file_path, "w") as f:
  #       print(content_id, file=f)
  #     l.logger().info("Wrote directory hash: '{}'.".format(hash_file_path))
  else:
    raise NotImplementedError("Unsupported Corpus.contentfiles field value")
  return content_id
Example #30
    def FromArgs(
        cls,
        tokenizer,
        id: int,
        input_feed: np.array,
        input_ids: np.array,
        input_features: typing.Dict[str, float],
        input_score: float,
        hole_lengths: typing.List[int],
        sample: np.array,
        sample_indices: np.array,
        output_features: typing.Dict[str, float],
        sample_score: float,
        target_benchmark: typing.Tuple[str, str],
        target_features: typing.Dict[str, float],
        compile_status: bool,
        generation_id: int,
        # timestep         : int,
    ) -> typing.TypeVar("SearchCandidate"):
        """Construt SearchCandidate table entry from argumentns."""
        str_input_feed = tokenizer.tokensToString(
            input_ids, ignore_token=tokenizer.padToken, with_formatting=True)
        str_sample = tokenizer.ArrayToCode(sample, with_formatting=True)
        len_indices = len(sample_indices)
        sample_indices = tokenizer.tokensToString(
            sample_indices, ignore_token=tokenizer.padToken)

        num_tokens = len(sample)
        if tokenizer.padToken in sample:
            num_tokens = np.where(sample == tokenizer.padToken)[0][0]

        actual_length = len(input_ids) - 3
        if tokenizer.padToken in input_ids:
            actual_length = np.where(input_ids == tokenizer.padToken)[0][0] - 3

        return SearchCandidate(
            id=id,
            sha256=crypto.sha256_str(str_input_feed + str_sample +
                                     str(hole_lengths)),
            sample_sha256=crypto.sha256_str(str_sample),
            generation_id=generation_id,
            frequency=1,
            abs_hole_lengths=','.join(
                [str(hl) for hl in hole_lengths if hl >= 0]),
            rel_hole_lengths=','.join([
                str(hl / (hl + actual_length)) for hl in hole_lengths
                if hl >= 0
            ]),
            hole_ind_length=len_indices,
            input_feed=tokenizer.ArrayToCode(input_feed, with_formatting=True),
            input_ids=str_input_feed,
            encoded_input_ids=','.join([str(x) for x in input_ids]),
            input_features='\n'.join([
                "{}:{}".format(k, v) for k, v in input_features.items()
            ]) if input_features else "None",
            input_score=input_score,
            sample=str_sample,
            sample_indices=sample_indices,
            num_tokens=int(num_tokens),
            output_features='\n'.join([
                "{}:{}".format(k, v) for k, v in output_features.items()
            ]) if output_features else "None",
            sample_score=sample_score,
            target_benchmark="// {}\n{}".format(target_benchmark[0],
                                                target_benchmark[1]),
            target_features='\n'.join([
                "{}:{}".format(k, v) for k, v in target_features.items()
            ]) if target_features else "None",
            compile_status=compile_status,
            score_delta=(sample_score - input_score) /
            input_score if not math.isinf(input_score) else math.inf,
            features_delta='\n'.join([
                "{}:{}".format(k, output_features[k] - input_features[k])
                for k in input_features.keys()
                if (output_features[k] - input_features[k] != 0)
            ]) if input_features and output_features else math.inf,
            date_added=datetime.datetime.utcnow(),
        )