Example #1
def get_whatrequires(pkg, yum_conf):
    """
    Write list of packages.

    Write packages that require the current package to a file
    using dnf repoquery what-requires and --recursive commands.
    """
    # clean up dnf cache to avoid 'no more mirrors repo' error
    try:
        subprocess.check_output([
            'dnf', '--config', yum_conf, '--releasever', 'clear', 'clean',
            'all'
        ])
    except subprocess.CalledProcessError as err:
        util.print_warning("Unable to clean dnf repo: {}, {}".format(pkg, err))
        return

    try:
        out = subprocess.check_output([
            'dnf', 'repoquery', '--config', yum_conf, '--releasever', 'clear',
            '--archlist=src', '--recursive', '--queryformat=%{NAME}',
            '--whatrequires', pkg
        ]).decode('utf-8')

    except subprocess.CalledProcessError as err:
        util.print_warning(
            "dnf repoquery whatrequires for {} failed with: {}".format(
                pkg, err))
        return

    util.write_out(
        'whatrequires', '# This file contains recursive sources that '
        'require this package\n' + out)
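
A note on the shared helper: write_out itself is never defined on this page, and the snippets actually mix two different helpers of that name, a file-writing write_out(filename, content) as used here, and a variadic logging write_out(*args) used by the training examples further down. A minimal sketch of the file-writing variant, under that assumption:

def write_out(filename, content, mode="w"):
    # Assumed behaviour: write `content` to `filename` in a single call.
    with open(filename, mode) as handle:
        handle.write(content)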
Example #2
def attempt_key_import(keyid):
    print(SEPT)
    ig = InputGetter(
        '\nDo you want to attempt to import keyid {}: (y/N) '.format(keyid))
    import_key_answer = ig.get_answer()
    if import_key_answer in [None, False]:
        return False
    with cli_gpg_ctx() as ctx:
        err, _ = ctx.import_key(keyid)
        if err is not None:
            print_error(err.strerror)
            return False
        err, key_content = ctx.export_key(keyid)
        if err is not None:
            print_error(err.strerror)
            return False
        key_fullpath = PUBKEY_PATH.format(keyid)
        util.write_out(key_fullpath, key_content)
        print('\n')
        print_success('Public key id: {} was imported'.format(keyid))
        err, content = ctx.display_keyinfo(key_fullpath)
        if err is not None:
            print_error(
                'Unable to parse {}, will be removed'.format(key_fullpath))
            os.unlink(key_fullpath)
            return False
        print('\n', '\n'.join(content.split('\n')[:10]))
        ig = InputGetter(message='\nDo you want to keep this key: (Y/n) ',
                         default='y')
        if ig.get_answer() is True:
            return True
        else:
            os.unlink(key_fullpath)
    return False
Example #3
    def compute_loss(self, minibatch, processed_minibatches, minimum_updates):
        (original_aa_string, actual_coords_list, _) = minibatch

        emissions, _backbone_atoms_padded, _batch_sizes = \
            self._get_network_emissions(original_aa_string)
        actual_coords_list_padded = torch.nn.utils.rnn.pad_sequence(
            actual_coords_list)
        if self.use_gpu:
            actual_coords_list_padded = actual_coords_list_padded.cuda()
        start = time.time()
        if isinstance(_batch_sizes[0], int):
            _batch_sizes = torch.tensor(_batch_sizes)
        emissions_actual, _ = \
            calculate_dihedral_angles_over_minibatch(actual_coords_list_padded,
                                                     _batch_sizes,
                                                     self.use_gpu)
        drmsd_avg = calc_avg_drmsd_over_minibatch(_backbone_atoms_padded,
                                                  actual_coords_list_padded,
                                                  _batch_sizes)

        write_out("Angle calculation time:", time.time() - start)
        if self.use_gpu:
            emissions_actual = emissions_actual.cuda()
            drmsd_avg = drmsd_avg.cuda()
        angular_loss = calc_angular_difference(emissions, emissions_actual)

        multiplier = 0.4

        if processed_minibatches < minimum_updates * 0.4:
            multiplier = processed_minibatches / minimum_updates

        normalized_angular_loss = angular_loss / 5
        normalized_drmsd_avg = drmsd_avg / 100
        return (normalized_drmsd_avg * multiplier) + (normalized_angular_loss *
                                                      (1 - multiplier))
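
The loss weighting is easier to see in isolation. A sketch of the schedule above: the dRMSD weight ramps linearly from 0 to 0.4 over the first 40% of minimum_updates and then holds there, with the angular loss always taking the complementary weight.

def drmsd_weight(processed_minibatches, minimum_updates):
    # Linear warm-up: processed/minimum reaches exactly 0.4 when the 40%
    # threshold is crossed, so the schedule is continuous.
    if processed_minibatches < minimum_updates * 0.4:
        return processed_minibatches / minimum_updates
    return 0.4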
Example #4
def main():
    parser = argparse.ArgumentParser(description="OpenProtein version 0.1")
    parser.add_argument('--silent',
                        dest='silent',
                        action='store_true',
                        help="Don't print verbose debug statements.")
    parser.add_argument('--hide-ui',
                        dest='hide_ui',
                        action='store_true',
                        default=False,
                        help='Hide loss graph and '
                        'visualization UI while training goes on.')
    parser.add_argument('--evaluate-on-test',
                        dest='evaluate_on_test',
                        action='store_true',
                        default=False,
                        help='Run model on test data.')
    parser.add_argument('--use-gpu',
                        dest='use_gpu',
                        action='store_true',
                        default=False,
                        help='Use GPU.')
    parser.add_argument(
        '--eval-interval',
        dest='eval_interval',
        type=int,
        default=10,
        help='Evaluate model on validation set every n minibatches.')
    parser.add_argument('--min-updates',
                        dest='minimum_updates',
                        type=int,
                        default=100,
                        help='Minimum number of minibatch iterations.')
    parser.add_argument('--minibatch-size',
                        dest='minibatch_size',
                        type=int,
                        default=8,
                        help='Size of each minibatch.')
    parser.add_argument('--experiment-id',
                        dest='experiment_id',
                        type=str,
                        default="example",
                        help='Which experiment to run.')
    args, _ = parser.parse_known_args()

    if args.hide_ui:
        write_out("Live plot deactivated, see output folder for plot.")

    use_gpu = args.use_gpu

    if use_gpu and not torch.cuda.is_available():
        write_out("Error: --use-gpu was set, but no GPU is available.")
        sys.exit(1)

    if not args.hide_ui:
        # start web server
        start_dashboard_server()

    experiment = importlib.import_module("experiments." + args.experiment_id)
    experiment.run_experiment(parser, use_gpu)
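
main() delegates to experiments.<id>.run_experiment(parser, use_gpu) and uses parse_known_args() so each experiment can register its own flags. A hypothetical sketch of such a module, assuming only the call signature above (the flag and body are illustrative; write_out is the logging helper used throughout these examples):

# experiments/example.py (hypothetical)
def run_experiment(parser, use_gpu):
    parser.add_argument('--learning-rate',
                        dest='learning_rate',
                        type=float,
                        default=0.01,
                        help='Learning rate to use during training.')
    args, _ = parser.parse_known_args()
    write_out("Starting experiment with lr:", args.learning_rate)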
Example #5
def check_regression(pkg_dir, skip_tests, test_round):
    """Check the build log for test regressions using the count module."""
    if skip_tests:
        return

    log_path = os.path.join(pkg_dir, 'results', 'build.log')
    result = count.parse_log(log_path)
    if len(result) == 0 or result[0:2] == ',0':
        log_path = os.path.join(pkg_dir, 'results',
                                f"round{test_round}-build.log")
        result = count.parse_log(log_path)

    titles = [('Package', 'package name', 1), ('Total', 'total tests', 1),
              ('Pass', 'total passing', 1), ('Fail', 'total failing', 0),
              ('Skip', 'tests skipped', 0), ('XFail', 'expected fail', 0)]
    res_str = ""
    for line in result.strip('\n').split('\n'):
        s_line = line.split(',')
        for idx, title in enumerate(titles):
            if s_line[idx]:
                if (s_line[idx] != '0') or (title[2] > 0):
                    print("{}: {}".format(title[1], s_line[idx]))
                res_str += "{} : {}\n".format(title[0], s_line[idx])

    util.write_out(os.path.join(pkg_dir, "testresults"), res_str)
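
The loop assumes count.parse_log() yields one comma-separated record per line, with fields in the same order as titles. A hypothetical record, to make the indexing concrete:

# Illustrative only; real values come from count.parse_log().
line = "mypkg,120,117,2,1,0"
s_line = line.split(',')
# s_line[0] -> package name, s_line[1] -> total tests, ... s_line[5] -> expected fail
assert len(s_line) == 6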
Example #6
    def compute_loss(self, minibatch):
        (original_aa_string, actual_coords_list, _) = minibatch

        emissions, _backbone_atoms_padded, _batch_sizes = \
            self._get_network_emissions(original_aa_string)
        actual_coords_list_padded = torch.nn.utils.rnn.pad_sequence(actual_coords_list)
        if self.use_gpu:
            actual_coords_list_padded = actual_coords_list_padded.cuda()
        start = time.time()
        if isinstance(_batch_sizes[0], int):
            _batch_sizes = torch.tensor(_batch_sizes)
        emissions_actual, _ = \
            calculate_dihedral_angles_over_minibatch(actual_coords_list_padded,
                                                     _batch_sizes,
                                                     self.use_gpu)
        # drmsd_avg = calc_avg_drmsd_over_minibatch(backbone_atoms_padded,
        #                                           actual_coords_list_padded,
        #                                           batch_sizes)
        write_out("Angle calculation time:", time.time() - start)
        if self.use_gpu:
            emissions_actual = emissions_actual.cuda()
            # drmsd_avg = drmsd_avg.cuda()
        angular_loss = calc_angular_difference(emissions, emissions_actual)

        return angular_loss  # + drmsd_avg
Example #7
def write_upstream(sha, tarfile, mode="w"):
    """
    Write the upstream hash to the upstream file
    """
    write_out(os.path.join(build.download_path, "upstream"),
              os.path.join(sha, tarfile) + "\n",
              mode=mode)
Example #8
def attempt_key_import(keyid, key_fullpath):
    """Ask user to import key."""
    global IMPORTED
    print(SEPT)
    ig = InputGetter('\nDo you want to attempt to import keyid {}: (y/N) '.format(keyid))
    import_key_answer = ig.get_answer()
    if import_key_answer in [None, False]:
        return False
    with cli_gpg_ctx() as ctx:
        err, _ = ctx.import_key(keyid)
        if err is not None:
            util.print_error(err.strerror)
            return False
        err, key_content = ctx.export_key(keyid)
        if err is not None:
            util.print_error(err.strerror)
            return False
        util.write_out(key_fullpath, key_content)
        print('\n')
        util.print_success('Public key id: {} was imported'.format(keyid))
        err, content = ctx.display_keyinfo(key_fullpath)
        if err is not None:
            util.print_error('Unable to parse {}, will be removed'.format(key_fullpath))
            os.unlink(key_fullpath)
            return False
        print("\n", content)
        ig = InputGetter(message='\nDo you want to keep this key: (Y/n) ', default='y')
        if ig.get_answer() is True:
            IMPORTED = content
            return True
        else:
            os.unlink(key_fullpath)
    return False
Example #9
    def compute_loss(self, minibatch):
        (original_aa_string, actual_coords_list, _) = minibatch

        if any(np.isnan(x.cpu().detach().numpy()).any() for x in original_aa_string) or \
                any(np.isnan(x.cpu().detach().numpy()).any() for x in actual_coords_list):
            return None

        emissions, _backbone_atoms_padded, _batch_sizes = \
            self._get_network_emissions(original_aa_string)
        assert not np.isnan(emissions.cpu().detach().numpy()).any()
        actual_coords_list_padded, batch_sizes_coords = torch.nn.utils.rnn\
            .pad_packed_sequence(
                torch.nn.utils.rnn.pack_sequence(actual_coords_list))
        assert not np.isnan(
            actual_coords_list_padded.cpu().detach().numpy()).any()
        if self.use_gpu:
            actual_coords_list_padded = actual_coords_list_padded.cuda()

        start = time.time()
        emissions_actual, _ = \
            calculate_dihedral_angles_over_minibatch(actual_coords_list_padded,
                                                     batch_sizes_coords,
                                                     self.use_gpu)
        # drmsd_avg = calc_avg_drmsd_over_minibatch(backbone_atoms_padded,
        #                                           actual_coords_list_padded,
        #                                           batch_sizes)
        write_out("Angle calculation time:", time.time() - start)
        if self.use_gpu:
            emissions_actual = emissions_actual.cuda()
            # drmsd_avg = drmsd_avg.cuda()
        angular_loss = calc_angular_difference(emissions, emissions_actual)

        return angular_loss  # + drmsd_avg
Example #10
    def __iter__(self):
        data_class_map = {0: [], 1: [], 2: [], 3: []}

        for idx in self.sampler:
            data_class_map[self.dataset[idx][4]].append(idx)

        num_each_class = int(self.batch_size / 4)

        max_class_size = max(len(rows) for rows in data_class_map.values())

        batch_num = int(max_class_size / num_each_class)
        if max_class_size % num_each_class != 0:
            batch_num += 1

        batch_relative_offset = (1.0 / float(batch_num)) / 2.0
        batches = []
        for _ in range(batch_num):
            batch = []
            for _class_id, data_rows in data_class_map.items():
                int_offset = int(batch_relative_offset * len(data_rows))
                batch.extend(sample_at_index(data_rows, int_offset, num_each_class))
            batch_relative_offset += 1.0 / float(batch_num)
            batches.append(batch)

        random.shuffle(batches)

        for batch in batches:
            write_out("Using minibatch from RandomBatchClassBalancedSequentialSampler")
            yield batch
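
sample_at_index() is not shown in this snippet. A plausible sketch, assuming it takes num_each_class indices from a class's list starting at a relative offset, wrapping around so short classes still fill their quota:

def sample_at_index(data_rows, int_offset, count):
    # Hypothetical helper: pick `count` indices starting at `int_offset`,
    # wrapping around the list so every batch gets a full class quota.
    if not data_rows:
        return []
    return [data_rows[(int_offset + i) % len(data_rows)] for i in range(count)]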
Example #11
    def write_default_conf_file(name, description):
        """Write default configuration file with description to file name."""
        config_files.add(name)
        filename = os.path.join(path, name)
        if os.path.isfile(filename):
            return

        write_out(filename, wrapper.fill(description) + "\n")
Example #12
def run_on_everything():
    """Stupidly long exhaustive search of each timestamp

    Note: Since it takes more than a second to exhaust a timestamp (currently)
    This loop will not finish before needing more than a 4 byte timestamp"""
    for unix_time in range(0x00000000, 0xffffffff + 1):
        success = distribute(unix_time)
        if not success:
            write_out("ErrorLog.txt", "Failed on {}".format(unix_time))
Example #13
    def merge_samples_to_minibatch(samples):
        samples_list = list(samples)
        # sort according to length of aa sequence
        samples_list.sort(key=lambda x: len(x[7]), reverse=True)
        (aa_list, labels_list, remapped_labels_list_crf_hmm,
         remapped_labels_list_crf_marg, prot_type_list, prot_topology_list,
         prot_name, original_aa_string, original_label_string) = zip(*samples_list)
        write_out(prot_type_list)
        return (aa_list, labels_list, remapped_labels_list_crf_hmm,
                remapped_labels_list_crf_marg, prot_type_list, prot_topology_list,
                prot_name, original_aa_string, original_label_string)
Example #14
def construct_model(model_parameters, embedding_size, use_gpu, minibatch_size):
    model_type = model_parameters["architecture"]
    mixture_size = model_parameters["output_size"]
    dropout = model_parameters["dropout"]
    model = None
    soft_max_to_angle = models.soft_to_angle(mixture_size)
    if model_type == "rnn":
        model = ExampleModel(embedding_size, minibatch_size, use_gpu,
                             dropout=dropout, mixture_size=mixture_size,
                             hidden_size=model_parameters["hidden_size"])

    elif model_type == "cnn" or model_type == "cnn_angles":
        num_layers = model_parameters["layers"]
        channels = [embedding_size] + model_parameters["channels"][:num_layers-1] + [mixture_size] 
        kernels = model_parameters["kernel"] * num_layers
        paddings = model_parameters["padding"] * num_layers
        stride = model_parameters["stride"] * num_layers
        dilation = model_parameters["dilation"] * num_layers
        spatial_dropout = model_parameters["spatial_dropout"]
        layers = []
        for i in range(num_layers):
            params = (channels[i], channels[i+1], kernels[i], paddings[i], stride[i], dilation[i])
            layers.append(params)
        if model_type == "cnn_angles":
            soft_max_to_angle = None
            model = CNNBaseModelAngles(embedding_size, layers, minibatch_size, use_gpu, mixture_size=mixture_size)
        else:
            model = CNNBaseModel(embedding_size, layers, minibatch_size, use_gpu, dropout=dropout, mixture_size=mixture_size, spatial_dropout=spatial_dropout)
    
    elif model_type == "resnet":
        resnet_type = model_parameters["resnet_type"]
        kernel = model_parameters["kernel"]
        padding = model_parameters["padding"]
        stride = model_parameters["stride"]
        droprate = model_parameters["dropout"] * 5

        parameters = {
            "input_channels": embedding_size,
            "out_channels": mixture_size,
            "kernel": kernel,
            "padding": padding,
            "stride": stride,
            "use_gpu": use_gpu,
            "droprate": droprate
        }

        model_func = PResNet.name_dict.get(resnet_type, None)
        if model_func is None:
            write_out('RESNET TYPE NOT SUPPORTED, PLEASE USE A SUPPORTED TYPE '
                      '[resnet18,resnet34,resnet50,resnet101,resnet152] BY '
                      'SPECIFYING "resnet_type" IN CONFIG FILE')
            exit()

        model = model_func(**parameters)
    else:
        write_out("MODEL TYPE NOT RECOGNIZED, PLEASE USE A SUPPORTED "
                  "ARCHITECTURE IN CONFIG FILE [cnn,cnn_angles,resnet,rnn]")
        exit()
    return openprotein.BaseModel(use_gpu, mixture_size, model, soft_max_to_angle)
Example #15
def track_best_hash(oven):
    """Watches the output queue and reports the best hash found for each process
    The global best is stored"""
    best = ("1", "1", "1")
    for attempt in iter(oven.get, STOP):
        print(attempt)
        if attempt[0] < best[0]:
            print("NEW BEST")
            best = attempt

    write_out("best_found.txt", best)
Example #16
def embed(data, batch_sizes, device):

    # one-hot encoding
    start_compute_embed = time.time()
    prot_aa_list = data.unsqueeze(1)
    embed_tensor = torch.zeros(prot_aa_list.size(0), 21,
                               prot_aa_list.size(2)).to(device)  # 21 classes
    # prot_aa_list.to(device)  # should already be embedded
    input_sequences = embed_tensor.scatter_(1, prot_aa_list.data,
                                            1).transpose(1, 2)
    end = time.time()
    write_out("Embed time:", end - start_compute_embed)
    packed_input_sequences = rnn_utils.pack_padded_sequence(
        input_sequences, batch_sizes)
    return packed_input_sequences
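
The scatter_ call builds a one-hot encoding over 21 amino acid classes. A small equivalence check with assumed shapes (sequence length 7, batch 3), comparing it against torch.nn.functional.one_hot:

import torch
import torch.nn.functional as F

data = torch.randint(0, 21, (7, 3))                  # (seq_len, batch)
via_scatter = torch.zeros(7, 21, 3).scatter_(
    1, data.unsqueeze(1), 1).transpose(1, 2)         # (seq_len, batch, 21)
via_one_hot = F.one_hot(data, num_classes=21).float()
assert torch.equal(via_scatter, via_one_hot)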
Example #17
    def __init__(self, pubkey=None, home=None):
        _gpghome = home
        if _gpghome is None:
            _gpghome = tempfile.mkdtemp(prefix='tmp.gpghome')
        os.environ['GNUPGHOME'] = _gpghome
        self.args = ['gpg', '--homedir', _gpghome]
        util.write_out(os.path.join(_gpghome, 'gpg.conf'), GNUPGCONF)
        if pubkey is not None:
            args = self.args + ['--import', pubkey]
            output, err, code = self.exec_cmd(args)
            if code == -9:
                raise Exception('Command {} timeout after {} seconds'.format(
                    ' '.join(args), CMD_TIMEOUT))
            elif code != 0:
                raise Exception(err.decode('utf-8'))
        self._home = _gpghome
Example #18
def main():
    parser = argparse.ArgumentParser(description="OpenProtein version 0.1")
    parser.add_argument('--no_force_pre_processing_overwrite',
                        dest='no_force_pre_processing_overwrite',
                        action='store_false',
                        help='Do not force overwrite of existing '
                        'preprocessed files.',
                        default=True)
    args, _unknown = parser.parse_known_args()

    use_gpu = False
    if torch.cuda.is_available():
        write_out("CUDA is available, using GPU")
        use_gpu = True

    process_raw_data(
        use_gpu,
        force_pre_processing_overwrite=args.no_force_pre_processing_overwrite)
Example #19
    def embed(self, original_aa_string):
        data, batch_sizes = torch.nn.utils.rnn.pad_packed_sequence(
            torch.nn.utils.rnn.pack_sequence(original_aa_string))

        # one-hot encoding
        start_compute_embed = time.time()
        prot_aa_list = data.unsqueeze(1)
        embed_tensor = torch.zeros(prot_aa_list.size(0), 21,
                                   prot_aa_list.size(2))  # 21 classes
        if self.use_gpu:
            prot_aa_list = prot_aa_list.cuda()
            embed_tensor = embed_tensor.cuda()
        input_sequences = embed_tensor.scatter_(1, prot_aa_list.data,
                                                1).transpose(1, 2)
        end = time.time()
        write_out("Embed time:", end - start_compute_embed)
        packed_input_sequences = rnn_utils.pack_padded_sequence(
            input_sequences, batch_sizes)
        return packed_input_sequences
Example #20
def check_regression(pkg_dir):
    """Check the build log for test regressions using the count module."""
    if config.config_opts['skip_tests']:
        return

    result = count.parse_log(os.path.join(pkg_dir, "results/build.log"))
    titles = [('Package', 'package name', 1), ('Total', 'total tests', 1),
              ('Pass', 'total passing', 1), ('Fail', 'total failing', 0),
              ('Skip', 'tests skipped', 0), ('XFail', 'expected fail', 0)]
    res_str = ""
    for line in result.strip('\n').split('\n'):
        s_line = line.split(',')
        for idx, title in enumerate(titles):
            if s_line[idx]:
                if (s_line[idx] != '0') or (title[2] > 0):
                    print("{}: {}".format(title[1], s_line[idx]))
                res_str += "{} : {}\n".format(title[0], s_line[idx])

    util.write_out(os.path.join(pkg_dir, "testresults"), res_str)
Example #21
    def compute_loss(self, training_minibatch):
        _, labels_list, remapped_labels_list_crf_hmm, remapped_labels_list_crf_marg, \
        _prot_type_list, _prot_topology_list, _prot_name_list, original_aa_string, \
        _original_label_string = training_minibatch
        minibatch_size = len(labels_list)
        if self.model_mode == TMHMM3Mode.LSTM_CRF_MARG:
            labels_to_use = remapped_labels_list_crf_marg
        elif self.model_mode == TMHMM3Mode.LSTM_CRF_HMM:
            labels_to_use = remapped_labels_list_crf_hmm
        else:
            labels_to_use = labels_list
        input_sequences = [
            autograd.Variable(x) for x in self.embed(original_aa_string)
        ]

        actual_labels = torch.nn.utils.rnn.pad_sequence(
            [autograd.Variable(l) for l in labels_to_use])
        emissions, batch_sizes = self._get_network_emissions(input_sequences)
        if self.model_mode == TMHMM3Mode.LSTM:
            prediction = emissions.transpose(0, 1).contiguous().view(
                -1, emissions.size(-1))
            target = actual_labels.transpose(0, 1).contiguous().view(-1, 1)
            losses = -torch.gather(
                nn.functional.log_softmax(prediction, dim=1), dim=1,
                index=target).view(*actual_labels.transpose(0, 1).size())
            mask_expand = torch.arange(0, batch_sizes.data.max()).long() \
                .unsqueeze(0).expand(batch_sizes.size(0), batch_sizes.data.max())
            if self.use_gpu:
                mask_expand = mask_expand.cuda()
                batch_sizes = batch_sizes.cuda()
            mask = mask_expand < batch_sizes.unsqueeze(1).expand_as(
                mask_expand)
            loss = (losses * mask.float()).sum() / batch_sizes.float().sum()
        else:
            mask = self.batch_sizes_to_mask(batch_sizes)
            loss = -1 * self.crf_model(emissions, actual_labels,
                                       mask=mask) / minibatch_size
            # if the loss is this large, an invalid transition must have been found
            if float(loss) > 100000:
                for idx, batch_size in enumerate(batch_sizes):
                    last_label = None
                    for i in range(batch_size):
                        label = int(actual_labels[i][idx])
                        write_out(str(label) + ",", end='')
                        if last_label is not None and (last_label, label) \
                                not in self.allowed_transitions:
                            write_out("Error: invalid transition found")
                            write_out((last_label, label))
                            sys.exit(1)
                        last_label = label
                    write_out(" ")
        return loss
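
self.batch_sizes_to_mask is not shown here. A plausible standalone sketch, assuming it turns per-sequence lengths into a boolean (max_len, batch) mask aligned with the emissions layout:

import torch

def batch_sizes_to_mask(batch_sizes):
    # Position i is valid for sequence j iff i < length of sequence j.
    max_len = int(batch_sizes.max())
    positions = torch.arange(max_len).unsqueeze(1)   # (max_len, 1)
    return positions < batch_sizes.unsqueeze(0)      # (max_len, batch)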
Example #22
    def __iter__(self):
        data = list(self.sampler)

        batch_num = int(len(data) / self.batch_size)
        if len(data) % self.batch_size != 0:
            batch_num += 1

        batch_order = list(range(batch_num))
        random.shuffle(batch_order)

        batch = []
        for batch_id in batch_order:
            write_out("Accessing minibatch #" + str(batch_id))
            for i in range(self.batch_size):
                if i + (batch_id * self.batch_size) < len(data):
                    batch.append(data[i + (batch_id * self.batch_size)])
            yield batch
            batch = []
Example #23
def calculate_partitions(partitions_count, cluster_partitions, types):
    partition_distribution = torch.ones(
        (partitions_count, len(torch.unique(types))), dtype=torch.long)
    partition_assignments = torch.zeros(cluster_partitions.shape[0],
                                        dtype=torch.long)

    for i in torch.unique(cluster_partitions):
        cluster_positions = (cluster_partitions == i).nonzero()
        cluster_types = types[cluster_positions]
        unique_types_in_cluster, type_count = torch.unique(cluster_types,
                                                           return_counts=True)
        tmp_distribution = partition_distribution.clone()
        tmp_distribution[:, unique_types_in_cluster] += type_count
        relative_distribution = (partition_distribution.double() /
                                 tmp_distribution.double())
        min_relative_distribution_group = torch.argmin(
            torch.sum(relative_distribution, dim=1))
        partition_distribution[min_relative_distribution_group,
                               unique_types_in_cluster] += type_count
        partition_assignments[cluster_positions] = min_relative_distribution_group

    write_out("Loaded data into the following partitions")
    write_out("[[  TM  SP+TM  SP Glob]")
    write_out(partition_distribution -
              torch.ones(partition_distribution.shape, dtype=torch.long))
    return partition_assignments
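
A toy invocation with assumed shapes: cluster ids and type labels for six samples spread across four partitions.

import torch

clusters = torch.tensor([0, 0, 1, 1, 2, 2])
types = torch.tensor([0, 1, 2, 3, 0, 1])
assignments = calculate_partitions(4, clusters, types)
print(assignments)  # one partition id (0-3) per sample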
Example #24
    def compute_loss(self, minibatch):
        (original_aa_string, actual_coords_list, _, pssms, token) = minibatch

        emissions, _backbone_atoms_padded, _batch_sizes = self._get_network_emissions(
            original_aa_string, pssms, token)
        actual_coords_list_padded, batch_sizes_coords = torch.nn.utils.rnn.pad_packed_sequence(
            torch.nn.utils.rnn.pack_sequence(actual_coords_list))
        if self.use_gpu:
            actual_coords_list_padded = actual_coords_list_padded.cuda()
        start = time.time()
        emissions_actual, _ = calculate_dihedral_angles_over_minibatch(
            actual_coords_list_padded, batch_sizes_coords, self.use_gpu)
        drmsd_avg = calc_avg_drmsd_over_minibatch(_backbone_atoms_padded,
                                                  actual_coords_list_padded,
                                                  _batch_sizes)
        write_out("Angle calculation time:", time.time() - start)
        if self.use_gpu:
            emissions_actual = emissions_actual.cuda()
            drmsd_avg = drmsd_avg.cuda()
        angular_loss = calc_angular_difference(emissions, emissions_actual)

        return angular_loss, drmsd_avg
Example #25
    def embed(self, original_aa_string):
        max_len = max([s.size(0) for s in original_aa_string])
        seqs = []
        for tensor in original_aa_string:
            padding_to_add = torch.zeros(max_len-tensor.size(0)).int()
            seqs.append(torch.cat((tensor, padding_to_add)))

        data = torch.stack(seqs).transpose(0, 1)

        # one-hot encoding
        start_compute_embed = time.time()
        arange_tensor = torch.arange(21).int().repeat(
            len(original_aa_string), 1
        ).unsqueeze(0).repeat(max_len, 1, 1)
        data_tensor = data.unsqueeze(2).repeat(1, 1, 21)
        embed_tensor = (arange_tensor == data_tensor).float()

        if self.use_gpu:
            embed_tensor = embed_tensor.cuda()

        end = time.time()
        write_out("Embed time:", end - start_compute_embed)

        return embed_tensor
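
The repeat() pyramid is ordinary broadcasting in disguise; a sketch of the same comparison without materialising the repeated tensors (shapes assumed):

import torch

data = torch.randint(0, 21, (5, 2))                        # (max_len, batch)
embed_tensor = (torch.arange(21) == data.unsqueeze(-1)).float()
# embed_tensor has shape (max_len, batch, 21), matching the loop above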
Example #26
def write_prep(conf, workingdir, content):
    """Write metadata to the local workingdir when --prep-only is used."""
    if conf.urlban:
        used_url = re.sub(conf.urlban, "localhost", content.url)
    else:
        used_url = content.url

    print()
    print("Exiting after prep due to --prep-only flag")
    print()
    print("Results under ./workingdir")
    print("Source  (./workingdir/{})".format(content.tarball_prefix))
    print("Name    (./workingdir/name)    :", content.name)
    print("Version (./workingdir/version) :", content.version)
    print("URL     (./workingdir/source0) :", used_url)
    write_out(os.path.join(workingdir, "name"), content.name)
    write_out(os.path.join(workingdir, "version"), content.version)
    write_out(os.path.join(workingdir, "source0"), used_url)
Example #27
def train_model(data_set_identifier, model, train_loader, validation_loader,
                learning_rate, minibatch_size=64, eval_interval=50, hide_ui=False,
                use_gpu=False, minimum_updates=1000,
                optimizer_type='adam', restart=False):
    set_experiment_id(data_set_identifier, learning_rate, minibatch_size)

    validation_dataset_size = validation_loader.dataset.__len__()

    if use_gpu:
        model = model.cuda()

    if optimizer_type == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    elif optimizer_type == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
    elif optimizer_type == 'rmsprop':
        optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)
    else:
        optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

    if restart:
        scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=32)

    sample_num = list()
    train_loss_values = list()
    train_drmsd_values = list()
    validation_loss_values = list()
    validation_angles_loss_values = list()
    best_model_loss = 1e20
    best_model_minibatch_time = None
    best_model_path = None
    best_json_data = None
    stopping_condition_met = False
    minibatches_proccesed = 0

    while not stopping_condition_met:
        # for i in range(2):
        optimizer.zero_grad()
        model.zero_grad()
        loss_tracker = np.zeros(0)
        drmsd_tracker = np.zeros(0)
        for _minibatch_id, training_minibatch in enumerate(train_loader, 0):
            minibatches_proccesed += 1
            start_compute_loss = time.time()
            loss, drmsd_avg = model.compute_loss(training_minibatch)
            write_out("Train loss:", float(loss))
            start_compute_grad = time.time()
            loss.backward()
            loss_tracker = np.append(loss_tracker, float(loss))
            drmsd_tracker = np.append(drmsd_tracker, float(drmsd_avg))
            end = time.time()
            write_out("Loss time:", start_compute_grad - start_compute_loss, "Grad time:",
                      end - start_compute_grad)
            optimizer.step()
            if restart:
                scheduler.step()
            optimizer.zero_grad()
            model.zero_grad()

            # for every eval_interval samples, plot performance on the validation set
            if minibatches_proccesed % eval_interval == 0:

                write_out("Testing model on validation set...")

                train_loss = float(loss_tracker.mean())
                train_drmsd = float(drmsd_tracker.mean())
                loss_tracker = np.zeros(0)
                drmsd_tracker = np.zeros(0)
                validation_loss, json_data, _, validation_angles_loss = \
                    model.evaluate_model(validation_loader)

                if validation_loss < best_model_loss:
                    best_model_loss = validation_loss
                    best_model_minibatch_time = minibatches_proccesed
                    best_model_path = write_model_to_disk(model)
                    best_json_data = json_data

                write_out("Validation loss:", validation_loss, "Train loss:", train_loss, "Train drmsd:", train_drmsd)
                write_out("Best model so far (validation loss): ", best_model_loss, "at time",
                          best_model_minibatch_time)
                write_out("Best model stored at " + best_model_path)
                write_out("Minibatches processed:", minibatches_proccesed)
                sample_num.append(minibatches_proccesed)
                train_loss_values.append(train_loss)
                train_drmsd_values.append(train_drmsd)
                validation_loss_values.append(validation_loss)
                validation_angles_loss_values.append(validation_angles_loss)
                json_data["validation_dataset_size"] = validation_dataset_size
                json_data["sample_num"] = sample_num
                json_data["train_loss_values"] = train_loss_values
                json_data["train_drmsd_values"] = train_drmsd_values
                json_data["validation_loss_values"] = validation_loss_values
                json_data['validation_angles_loss_values'] = validation_angles_loss_values

                write_out(json_data)

                if not hide_ui:
                    res = requests.post('http://localhost:5000/graph', json=json_data)
                    if res.ok:
                        print(res.json())

                if minibatches_proccesed > minimum_updates and minibatches_proccesed \
                        >= best_model_minibatch_time + minimum_updates:
                    stopping_condition_met = True
                    break
    write_result_summary(best_model_loss)
    write_result_summary(json.dumps(best_json_data))
    return best_model_path
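
The stopping rule buried in the loop, stated on its own: training halts once minimum_updates minibatches have run and the validation loss has not improved for another minimum_updates minibatches.

def should_stop(minibatches_processed, best_model_minibatch_time, minimum_updates):
    # Early stopping with a patience window equal to minimum_updates.
    return (minibatches_processed > minimum_updates
            and minibatches_processed >= best_model_minibatch_time + minimum_updates)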
Example #28
def commit_to_git(config, name, success):
    """Update package's git tree for autospec managed changes."""
    path = config.download_path
    call("git init", stdout=subprocess.DEVNULL, cwd=path)

    # This config is used for setting the remote URI, so it is optional.
    if config.git_uri:
        try:
            call("git config --get remote.origin.url", cwd=path)
        except subprocess.CalledProcessError:
            upstream_uri = config.git_uri % {'NAME': name}
            call("git remote add origin %s" % upstream_uri, cwd=path)

    for config_file in config.config_files:
        call("git add %s" % config_file, cwd=path, check=False)
    for unit in config.sources["unit"]:
        call("git add %s" % unit, cwd=path)
    call("git add Makefile", cwd=path)
    call("git add upstream", cwd=path)
    call("bash -c 'shopt -s failglob; git add *.spec'", cwd=path)
    call("git add %s.tmpfiles" % name, check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add %s.sysusers" % name, check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add prep_prepend", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add pypi.json", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add build_prepend", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add make_prepend", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add install_prepend", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add install_append", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add series", check=False, stderr=subprocess.DEVNULL, cwd=path)
    # Add/remove version specific patch lists
    for filename in glob.glob('series.*'):
        base, version = filename.split('.', 1)
        if version in config.versions:
            call("git add {}".format(filename), check=False, stderr=subprocess.DEVNULL, cwd=path)
        else:
            call("git rm {}".format(filename), check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("bash -c 'shopt -s failglob; git add -f *.asc'", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("bash -c 'shopt -s failglob; git add -f *.sig'", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("bash -c 'shopt -s failglob; git add -f *.sha256'", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("bash -c 'shopt -s failglob; git add -f *.sign'", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("bash -c 'shopt -s failglob; git add -f *.pkey'", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add configure", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add configure32", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add configure64", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add configure_avx2", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add configure_avx512", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add make_check_command", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("bash -c 'shopt -s failglob; git add *.patch'", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("bash -c 'shopt -s failglob; git add *.nopatch'", check=False, stderr=subprocess.DEVNULL, cwd=path)
    for item in config.transforms.values():
        call("git add {}".format(item), check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add release", cwd=path)
    call("git add symbols", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add symbols32", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add used_libs", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add used_libs32", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add testresults", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add profile_payload", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add options.conf", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add configure_misses", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add whatrequires", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add description", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git add attrs", check=False, stderr=subprocess.DEVNULL, cwd=path)

    # remove deprecated config files
    call("git rm make_install_append", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm prep_append", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm use_clang", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm use_lto", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm use_avx2", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm fast-math", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm broken_c++", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm skip_test_suite", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm optimize_size", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm asneeded", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm broken_parallel_build", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm pgo", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm unit_tests_must_pass", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm funroll-loops", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm keepstatic", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm allow_test_failures", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm no_autostart", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm insecure_build", check=False, stderr=subprocess.DEVNULL, cwd=path)
    call("git rm conservative_flags", check=False, stderr=subprocess.DEVNULL, cwd=path)

    # add a gitignore
    ignorelist = [
        ".*~",
        "*~",
        "*.info",
        "*.mod",
        "*.swp",
        ".repo-index",
        "*.log",
        "build.log.round*",
        "*.tar.*",
        "*.tgz",
        "!*.tar.*.*",
        "*.zip",
        "*.jar",
        "*.pom",
        "*.xml",
        "commitmsg",
        "results/",
        "rpms/",
        "for-review.txt",
        ""
    ]
    write_out(os.path.join(path, '.gitignore'), '\n'.join(ignorelist))
    call("git add .gitignore", check=False, stderr=subprocess.DEVNULL, cwd=path)

    if success == 0:
        return

    call("git commit -a -F commitmsg ", cwd=path)
    call("rm commitmsg", cwd=path)
Example #29
parser.add_argument('--hide-ui', dest='hide_ui', action='store_true',
                    default=False,
                    help='Hide loss graph and visualization UI while training goes on.')
parser.add_argument('--evaluate-on-test', dest='evaluate_on_test', action='store_true',
                    default=False, help='Run model on test data.')
parser.add_argument('--eval-interval', dest='eval_interval', type=int,
                    default=5, help='Evaluate model on validation set every n minibatches.')
parser.add_argument('--min-updates', dest='minimum_updates', type=int,
                    default=5000, help='Minimum number of minibatch iterations.')
parser.add_argument('--minibatch-size', dest='minibatch_size', type=int,
                    default=1, help='Size of each minibatch.')
parser.add_argument('--learning-rate', dest='learning_rate', type=float,
                    default=0.01, help='Learning rate to use during training.')
args, unknown = parser.parse_known_args()

if args.hide_ui:
    write_out("Live plot deactivated, see output folder for plot.")

use_gpu = False
if torch.cuda.is_available():
    write_out("CUDA is available, using GPU")
    use_gpu = True

# start web server
start_dashboard_server()

process_raw_data(use_gpu, force_pre_processing_overwrite=False)

training_file = "data/preprocessed/sample.txt.hdf5"
validation_file = "data/preprocessed/sample.txt.hdf5"
testing_file = "data/preprocessed/testing.hdf5"
Example #30
def train_model(data_set_identifier, train_file, val_file, learning_rate, minibatch_size, name):
    set_experiment_id(data_set_identifier, learning_rate, minibatch_size, name)

    train_loader = contruct_dataloader_from_disk(train_file, minibatch_size, use_evolutionary=True)
    validation_loader = contruct_dataloader_from_disk(val_file, minibatch_size, use_evolutionary=True)
    validation_dataset_size = validation_loader.dataset.__len__()
    train_dataset_size = train_loader.dataset.__len__()

    embedding_size = 21
    if configs.run_params["use_evolutionary"]:
        embedding_size = 42

    # Load an existing model if given as an argument,
    # else construct a new model from the config file.
    if args.model is not None:
        model_path = "output/models/" + args.model + ".model"
        model = load_model_from_disk(model_path, use_gpu)
    else:
        model = construct_model(configs.model_params, embedding_size, use_gpu,
                                minibatch_size)

    # optimizer parameters
    betas = tuple(configs.run_params["betas"])
    weight_decay = configs.run_params["weight_decay"]
    angle_lr = configs.run_params["angles_lr"]

    if configs.model_params['architecture'] == 'cnn_angles':
        optimizer = optim.Adam(model.parameters(), betas=betas,
                               lr=learning_rate, weight_decay=weight_decay)
    else:
        optimizer = optim.Adam([
            {'params': model.model.parameters(), 'lr': learning_rate},
            {'params': model.soft_to_angle.parameters(), 'lr': angle_lr}],
            betas=betas, weight_decay=weight_decay)

    # print number of trainable parameters
    print_number_of_parameters(model)
    # For creating a summary table of the model (does not work on ExampleModel!)
    if configs.run_params["print_model_summary"]:
        if configs.model_params["architecture"] != 'rnn':
            summary(model, configs.run_params["max_sequence_length"], 2)
        else:
            write_out("DETAILED MODEL SUMMARY IS NOT SUPPORTED FOR RNN MODELS")
    
    if use_gpu:
        model = model.cuda()

    # TODO: is soft_to_angle.parameters() included here?

    sample_num = list()
    train_loss_values = list()
    validation_loss_values = list()
    rmsd_avg_values = list()
    drmsd_avg_values = list()
    break_point_values = list()

    breakpoints = configs.run_params['breakpoints']
    best_model_loss = 1e20
    best_model_train_loss = 1e20
    best_model_minibatch_time = None
    best_model_path = None
    stopping_condition_met = False
    minibatches_proccesed = 0

    loss_atoms = configs.run_params["loss_atoms"]
    start_time = time.time()
    max_time = configs.run_params["max_time"]
    C_epochs = configs.run_params["c_epochs"] # TODO: Change to parameter
    C_batch_updates = C_epochs

    while not stopping_condition_met:
        optimizer.zero_grad()
        model.zero_grad()
        loss_tracker = np.zeros(0)
        start_time_n_minibatches = time.time()
        for minibatch_id, training_minibatch in enumerate(train_loader, 0):
            minibatches_proccesed += 1
            training_minibatch = list(training_minibatch)
            primary_sequence, tertiary_positions, mask, p_id = training_minibatch[:-1]
            # Update C
            C = (1.0 if minibatches_proccesed >= C_batch_updates
                 else float(minibatches_proccesed) / C_batch_updates)

            # One-hot encode the amino acid string and concatenate PSSM values.
            amino_acids, batch_sizes = one_hot_encode(primary_sequence, 21, use_gpu)

            if configs.run_params["use_evolutionary"]:
                evolutionary = training_minibatch[-1]

                evolutionary, batch_sizes = torch.nn.utils.rnn.pad_packed_sequence(torch.nn.utils.rnn.pack_sequence(evolutionary))
                
                if use_gpu:
                    evolutionary = evolutionary.cuda()

                amino_acids = torch.cat((amino_acids, evolutionary.view(-1, len(batch_sizes) , 21)), 2)

            start_compute_loss = time.time()

            if configs.run_params["only_angular_loss"]:
                #raise NotImplementedError("Only_angular_loss function has not been implemented correctly yet.")
                loss = model.compute_angular_loss((amino_acids, batch_sizes), tertiary_positions, mask)
            else:
                loss = model.compute_loss((amino_acids, batch_sizes), tertiary_positions, mask, C=C, loss_atoms=loss_atoms)
            
            if C != 1:
                write_out("C:", C)
            write_out("Train loss:", float(loss))
            start_compute_grad = time.time()
            loss.backward()
            loss_tracker = np.append(loss_tracker, float(loss))
            end = time.time()
            write_out("Loss time:", start_compute_grad-start_compute_loss, "Grad time:", end-start_compute_grad)
            optimizer.step()
            optimizer.zero_grad()
            model.zero_grad()

            # for every eval_interval samples, plot performance on the validation set
            if minibatches_proccesed % configs.run_params["eval_interval"] == 0:
                model.eval()
                write_out("Testing model on validation set...")
                train_loss = loss_tracker.mean()
                loss_tracker = np.zeros(0)
                validation_loss, data_total, rmsd_avg, drmsd_avg = evaluate_model(
                    validation_loader, model, use_gpu, loss_atoms,
                    configs.run_params["use_evolutionary"])
                prim = data_total[0][0]
                pos = data_total[0][1]
                pos_pred = data_total[0][3]
                mask = data_total[0][4]
                pos = apply_mask(pos, mask)
                angles_pred = data_total[0][2]

                angles_pred = apply_mask(angles_pred, mask, size=3)

                pos_pred = apply_mask(pos_pred, mask)
                prim = torch.masked_select(prim, mask)

                if use_gpu:
                    pos = pos.cuda()
                    pos_pred = pos_pred.cuda()

                angles = calculate_dihedral_angels(pos, use_gpu)
                #angles_pred = calculate_dihedral_angels(pos_pred, use_gpu)
                #angles_pred = data_total[0][2] # Use angles output from model - calculate_dihedral_angels(pos_pred, use_gpu)

                write_to_pdb(get_structure_from_angles(prim, angles), "test")
                write_to_pdb(get_structure_from_angles(prim, angles_pred), "test_pred")
                if validation_loss < best_model_loss:
                    best_model_loss = validation_loss
                    best_model_minibatch_time = minibatches_proccesed
                    best_model_path = write_model_to_disk(model)

                if train_loss < best_model_train_loss:
                    best_model_train_loss = train_loss
                    best_model_train_path = write_model_to_disk(model, model_type="train")

                write_out("Validation loss:", validation_loss, "Train loss:", train_loss)
                write_out("Best model so far (validation loss): ", best_model_loss, "at time", best_model_minibatch_time)
                write_out("Best model stored at " + best_model_path)
                write_out("Best model train stored at " + best_model_train_path)
                write_out("Minibatches processed:",minibatches_proccesed)

                end_time_n_minibatches = time.time()
                n_minibatches_time_used = end_time_n_minibatches - start_time_n_minibatches
                minibatches_left = configs.run_params["max_updates"] - minibatches_proccesed
                seconds_left = int(n_minibatches_time_used *
                                   (minibatches_left / configs.run_params["eval_interval"]))

                m, s = divmod(seconds_left, 60)
                h, m = divmod(m, 60)
                write_out("Estimated time until maximum number of updates:",
                          '{:d}:{:02d}:{:02d}'.format(h, m, s))
                sample_num.append(minibatches_proccesed)
                train_loss_values.append(train_loss)
                validation_loss_values.append(validation_loss)
                rmsd_avg_values.append(rmsd_avg)
                drmsd_avg_values.append(drmsd_avg)
                
                if breakpoints and minibatches_proccesed > breakpoints[0]:
                    break_point_values.append(drmsd_avg)
                    breakpoints = breakpoints[1:]

                data = {}
                data["pdb_data_pred"] = open("output/protein_test_pred.pdb","r").read()
                data["pdb_data_true"] = open("output/protein_test.pdb","r").read()
                data["validation_dataset_size"] = validation_dataset_size
                data["sample_num"] = sample_num
                data["train_loss_values"] = train_loss_values
                data["break_point_values"] = break_point_values
                data["validation_loss_values"] = validation_loss_values
                data["phi_actual"] = list([math.degrees(float(v)) for v in angles[1:,1]])
                data["psi_actual"] = list([math.degrees(float(v)) for v in angles[:-1,2]])
                data["phi_predicted"] = list([math.degrees(float(v)) for v in angles_pred[1:,1]])
                data["psi_predicted"] = list([math.degrees(float(v)) for v in angles_pred[:-1,2]])
                data["drmsd_avg"] = drmsd_avg_values
                data["rmsd_avg"] = rmsd_avg_values
                if not configs.run_params["hide_ui"]:
                    res = requests.post('http://localhost:5000/graph', json=data)
                    if res.ok:
                        print(res.json())
                
                # Save run data
                write_run_to_disk(data)

                start_time_n_minibatches = time.time()

                # Check if maximum time is reached.
                time_used = time.time() - start_time

                time_condition = (max_time is not None and time_used > max_time)
                max_update_condition = minibatches_proccesed >= configs.run_params["max_updates"]
                min_update_condition = (minibatches_proccesed > configs.run_params["min_updates"] and minibatches_proccesed > best_model_minibatch_time * 2)

                model.train()
                # Check the stopping conditions
                if time_condition or max_update_condition or min_update_condition:
                    stopping_condition_met = True
                    break
    write_out("Best validation model found after" , best_model_minibatch_time , "minibatches.")
    write_result_summary(best_model_loss)
    return best_model_path
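
The ETA arithmetic inside the eval block, in isolation with illustrative numbers: the time spent on the last eval window is scaled by the number of windows left, then rendered as h:mm:ss.

n_minibatches_time_used = 12.5   # seconds for the last eval window (illustrative)
minibatches_left = 3000
eval_interval = 10
seconds_left = int(n_minibatches_time_used * (minibatches_left / eval_interval))
m, s = divmod(seconds_left, 60)
h, m = divmod(m, 60)
print('{:d}:{:02d}:{:02d}'.format(h, m, s))   # 1:02:30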