Example #1
    def __init__(self, downsampling_step, sequence_length):
        loading_dataset_since = time()
        extension = 'xlsx'
        self.downsampling_step = downsampling_step
        self.sequence_length = sequence_length
        #find all files
        all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
        data_pd = pd.concat(
            [pd.read_excel(f).iloc[2:, 4:] for f in all_filenames],
            ignore_index=True)  #concat all the data
        data_numpy = data_pd.to_numpy().astype(float)
        zeros_removed = remove_zeros(data_numpy)
        downsampled_data = downsample(zeros_removed, downsampling_step)
        time_series_data = split_time_series(downsampled_data, sequence_length)
        sc = StandardScaler()
        scaled_data = sc.fit_transform(time_series_data)
        scaled_data_tensor = torch.from_numpy(scaled_data)
        scaled_data_tensor_reshaped = scaled_data_tensor.unsqueeze(
            0).transpose(1, 0)
        self.len = scaled_data_tensor_reshaped.shape[0]
        self.training_data_tensor = scaled_data_tensor_reshaped
        loading_dataset_end = time()
        hours, minutes, seconds = timer(loading_dataset_since,
                                        loading_dataset_end)

        print('The length of the dataset is {}'.format(
            len(self.training_data_tensor)))
        print("Time taken {:0>2}:{:0>2}:{:05.2f}".format(
            int(hours), int(minutes), seconds))
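
The __init__ above prepares self.training_data_tensor and self.len, but the __len__ and __getitem__ methods are not shown. A minimal self-contained sketch of how such a torch.utils.data.Dataset subclass is typically completed and consumed; the class name and the random stand-in data are assumed for illustration:

import torch
from torch.utils.data import Dataset, DataLoader

class SequenceDataset(Dataset):
    """Sketch: wraps a pre-built (N, 1, sequence_length) tensor the way
    the __init__ above stores self.training_data_tensor."""

    def __init__(self, training_data_tensor):
        self.training_data_tensor = training_data_tensor
        self.len = training_data_tensor.shape[0]

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        # one scaled sequence, shape (1, sequence_length)
        return self.training_data_tensor[index]

# Usage sketch with random data standing in for the Excel-derived tensor:
loader = DataLoader(SequenceDataset(torch.randn(1000, 1, 100)),
                    batch_size=32, shuffle=True)
for batch in loader:
    print(batch.shape)  # torch.Size([32, 1, 100])
    break
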
Example #2
def fetch_comment_op_thread(comment_op, comment_keys, username, only_authored):
   # Skip comments we've already fetched
   op_key = get_key(comment_op)
   if op_key in comment_keys:
      return []
   
   post_comments = []

   # We can't fetch deleted comments from the api, so try to use the parent
   # to fill in the information missing from the operation
   comment = fetch_comment(op_key)
   if comment_is_not_found(comment):
      parent_comment = fetch_comment(get_parent_key(comment_op))
      if comment_is_not_found(parent_comment):
         print_with_timestamp('Could not find \'{}\', skipping...'.format(get_link_string(*op_key)))
         return []
      post_comments.append(make_comment_from_parent(comment_op, parent_comment))
      print_with_timestamp('Comment \'{}\' was deleted but was able to fill in information from parent'.format(get_link_string(*op_key)))
      comment = parent_comment # Now that the deleted comment was added, find the root comment from the parent

   if only_authored and comment['root_author'] != username:
      return []

   # Fetch the root comment if this comment isn't already the root
   comment_key = get_key(comment)
   root_key = get_root_key(comment)
   root_comment = comment if root_key == comment_key else fetch_comment(root_key)

   with timer('Fetching post \'{}\''.format(get_link_string(*root_key))):
      post_comments.extend(fetch_thread(root_comment, comment_keys))
   return post_comments
Example #3
def archive_user_history(username, start_date, end_date, only_authored):
   message = 'Archive history for user \'{}\' from {} to {}'.format(username, start_date, end_date)
   if only_authored:
      message += ' (only self-authored posts)'
   with timer(message):
      comment_keys = db.get_comment_keys() # Keep track of the comments we've already fetched
      for post_comments in fetch.fetch_user_history_rows(username, start_date, end_date, comment_keys, only_authored):
         if post_comments:
            insert_comments(post_comments)
Example #4
def archive_thread(author, permlink):
    with timer(f'Archive thread "{get_link_string(author, permlink)}"'):
        thread_comments = fetch.fetch_thread_rows(author, permlink)
        if thread_comments:
            insert_comments(thread_comments)
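
Examples #2, #3, #4, #9, #10, and #12 call timer(message) as a context manager. A minimal sketch of such a helper, assuming it only reports the message and the elapsed wall-clock time; the projects' actual helpers may log differently:

import time
from contextlib import contextmanager

@contextmanager
def timer(message):
    # Announce the block, run it, then report the elapsed wall-clock time.
    start = time.time()
    print(message)
    try:
        yield
    finally:
        print('{} finished in {:.2f}s'.format(message, time.time() - start))

# Usage sketch:
# with timer('Archive thread'):
#     do_work()
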
Example #5
    def __init__(self,
                 downsampling_step,
                 sequence_length,
                 train=True,
                 normalize=False):
        loading_dataset_since = time()
        extension = 'xlsx'

        self.downsampling_step = downsampling_step
        self.sequence_length = sequence_length

        #find all files and concatenate
        all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

        data = pd.concat(
            [pd.read_excel(f).iloc[2:, 4:] for f in all_filenames],
            ignore_index=True)

        #extract torque and label
        torque = data.iloc[:, 0].to_numpy().astype(float)
        label = data.iloc[:, 1].to_numpy().astype(float)

        #remove zeros from torque and label
        label = np.delete(label, np.where(torque == 0))
        torque = remove_zeros(torque)

        #expand dimension and store the zero removed data
        torque = np.expand_dims(torque, axis=1)
        label = np.expand_dims(label, axis=1)
        data = np.append(torque, label, axis=1)

        #find the normal and anomalous labeled sequences and divide the data into segments
        segmented_list = consecutive(
            (np.where(data[:, 1] == 0))[0]) + consecutive(
                (np.where(data[:, 1] == 1))[0])
        segmented_list.sort(key=lambda segment: segment[1])
        segmented_data = []
        for segments in segmented_list:
            start_index = segments[0]
            end_index = segments[-1]
            segmented_data.append(data[start_index:end_index + 1, :])

        #downsample the data and make sequences
        sequenced_data = []
        for i in range(len(segmented_data)):
            label = segmented_data[i][0, 1]
            data = downsample(segmented_data[i][:, 0], self.downsampling_step)
            data = split_time_series(data, self.sequence_length)
            if label == 0.:
                label_column = [0] * len(data)
            else:
                label_column = [1] * len(data)

            sequenced_data.append(np.column_stack((data, label_column)))

        data = np.empty((0, self.sequence_length + 1))

        for i in range(len(sequenced_data)):
            if sequenced_data[i].shape[1] == self.sequence_length + 1:
                data = np.append(data, sequenced_data[i], axis=0)

        if normalize:
            #scale the data and return the tensor output
            sc = StandardScaler()

            training_data = data[0:int(0.7 * (len(data))),
                                 0:self.sequence_length]
            testing_data = data[int(0.7 * (len(data))):,
                                0:self.sequence_length]

            training_label = data[0:int(0.7 * (len(data))), -1]
            testing_label = data[int(0.7 * (len(data))):, -1]

            sc_fit = sc.fit(training_data)

            if train:
                unlabeled_data = sc_fit.transform(training_data)
                data = np.column_stack((unlabeled_data, training_label))
            else:
                unlabeled_data = sc_fit.transform(testing_data)
                data = np.column_stack((unlabeled_data, testing_label))
        else:
            if train:
                data = data[0:int(0.7 * (len(data))), :]
            else:
                data = data[int(0.7 * (len(data))):, :]
        data = torch.from_numpy(data).unsqueeze(0).transpose(1, 0)

        self.len = data.shape[0]
        self.data = data

        loading_dataset_end = time()
        hours, minutes, seconds = timer(loading_dataset_since,
                                        loading_dataset_end)

        print('The length of the dataset is {}'.format(self.len))
        print("Time taken {:0>2}:{:0>2}:{:05.2f}".format(
            int(hours), int(minutes), seconds))
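
Examples #1 and #5 rely on the helpers remove_zeros, downsample, split_time_series, and consecutive, which are not shown. A minimal sketch of plausible NumPy implementations, consistent with how they are called above; the original helpers may differ (for instance, downsample could average windows instead of striding):

import numpy as np

def remove_zeros(signal):
    # Drop samples that are exactly zero.
    return signal[signal != 0]

def downsample(signal, step):
    # Keep every `step`-th sample.
    return signal[::step]

def split_time_series(signal, sequence_length):
    # Chop a 1-D signal into rows of length `sequence_length`,
    # discarding the incomplete tail.
    n = len(signal) // sequence_length
    return signal[:n * sequence_length].reshape(n, sequence_length)

def consecutive(indices, stepsize=1):
    # Group a sorted index array into runs of consecutive values.
    return np.split(indices, np.where(np.diff(indices) != stepsize)[0] + 1)
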
Example #6
import numpy as np
from matplotlib import pyplot as plt

from helpers import parse_args, timer

from generate_data import generate_x, find_min_max
from visualisation import visualise_x

if __name__ == '__main__':
    filename = 'task_2.log'
    args = parse_args()
    x_msg = f"X generation with N = {args.N} and M = {args.M}"
    X = timer(generate_x, filename, x_msg)(args.M, args.N)
    y_msg = "Finding optimums for X"
    YMin, YMax = timer(find_min_max, filename, y_msg)(X, args.T, args.k)

    for _ in range(args.amount_graphs):
        start = np.random.randint(0, args.N * (args.M - 1))
        visualise_x(X, start, args.N, YMin, YMax)
        plt.legend()
        plt.show()
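
Examples #6, #7, and #16 call timer(func, filename, message)(...), i.e. a higher-order helper that returns a wrapped callable. A minimal sketch under that assumption, appending the message and elapsed time to the given log file; the project's actual helper may format its log differently:

import time
from functools import wraps

def timer(func, filename, message):
    # Return a wrapper that runs `func` and appends the message and the
    # elapsed wall-clock time to `filename`.
    @wraps(func)
    def wrapped(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        elapsed = time.time() - start
        with open(filename, 'a') as log:
            log.write('{}: {:.3f}s\n'.format(message, elapsed))
        return result
    return wrapped

# Usage sketch:
# X = timer(generate_x, 'task_2.log', 'X generation')(M, N)
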
Example #7
import numpy as np
from dataset import make_data_loader
from helpers import parse_args, timer
from generate_data import generate_x, find_min_max


def sample(loader):
    for x in loader:
        pass


if __name__ == '__main__':
    args = parse_args()
    filename = 'task_3.log'
    x_msg = f"X generation with N = {args.N} and M = {args.M}"
    X = timer(generate_x, filename, x_msg)(args.M, args.N)
    y_msg = "Finding optimums for X"
    YMin, YMax = timer(find_min_max, filename, y_msg)(X, args.T, args.k)

    loader = make_data_loader(X,
                              YMin,
                              YMax,
                              N=args.N,
                              batch_size=args.batch_size,
                              num_batches=args.num_batches)
    timer(
        sample, filename,
        f"{args.num_batches} batches sampling with batch size = {args.batch_size}"
    )(loader)
Example #8
                             alpha=1.0,
                             color_one='red',
                             color_two='green')
run.plot_nmi_ari(list_of_nmi, list_of_ari, top_acc_maps_idx)
#plot testing results
run.plot_acc_test(list_of_acc_test,
                  top_acc_maps_idx_test,
                  alpha=0.8,
                  color='red')
run.plot_nmi_ari_test(list_of_nmi_test,
                      list_of_ari_test,
                      top_acc_maps_idx_test,
                      alpha=0.8)

end = time.time()

hours, minutes, seconds = helpers.timer(since, end)
print("Time taken {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes),
                                                 seconds))
tsne_since = time.time()
run.apply_TSNE(embeddings,
               labels_pred,
               list_of_centers,
               top_acc_maps_idx,
               n_components=2,
               perplexity=30.0)
tsne_end = time.time()
hours, minutes, seconds = helpers.timer(tsne_since, tsne_end)
print("Time taken {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes),
                                                 seconds))
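
Examples #1, #5, #8, and #15 unpack helpers.timer(since, end) into hours, minutes, and seconds. A minimal sketch consistent with that call pattern (assumed, not necessarily the original implementation):

def timer(start, end):
    # Split an elapsed interval (in seconds) into hours, minutes, seconds.
    elapsed = end - start
    hours, remainder = divmod(elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    return hours, minutes, seconds

# Usage sketch, matching the print format used above:
# hours, minutes, seconds = timer(since, time.time())
# print("Time taken {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
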
Example #9
def archive_thread(author, permlink):
   with timer('Archive thread \'{}\''.format(get_link_string(author, permlink))):
      thread_comments = fetch.fetch_thread_rows(author, permlink)
      if thread_comments:
         insert_comments(thread_comments)
Example #10
# COFHAE config
parser.add_argument('--skip_cofhae', type=int, default=0)
parser.add_argument('--softmax_temperature', type=float, default=1.0)
parser.add_argument('--adversarial_penalty', type=float, default=1.0)
parser.add_argument('--assignment_penalty', type=float, default=1000.0)

FLAGS = parser.parse_args()

# Set up path to save model artifacts, possibly suffixed with an experiment ID
path = FLAGS.output_dir or f"/tmp/{int(time.time())}"
os.system('mkdir -p ' + path)

with open(os.path.join(path, 'flags.json'), 'w') as f:
    f.write(json.dumps(FLAGS.__dict__))

with timer("loading data"):
    if 'chopsticks' in FLAGS.dataset:
        from chopsticks import Chopsticks
        m = re.search(r'depth(\d)_([a-z]+)', FLAGS.dataset)
        depth = int(m.group(1))
        variant = m.group(2)
        noise = 0
        if 'noise' in FLAGS.dataset:
            noise = float(re.search(r'noise([0-9\.]+)', FLAGS.dataset).group(1))
        dataset = Chopsticks(depth, variant, noise)
    elif FLAGS.dataset == 'spaceshapes':
        from spaceshapes import Spaceshapes
        dataset = Spaceshapes()
    else:
        raise ValueError(f"Unrecognized dataset {FLAGS.dataset} -- should either be 'spaceshapes' or a Chopsticks variant string with a depth and slope/inter/either/both, e.g. 'chopsticks_depth3_both' or 'chopsticks_depth2_slope'.")
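
For reference, a small illustration of the dataset-string parsing above; the exact names below, including the placement of the noise suffix, are assumed for the example:

import re

for name in ['chopsticks_depth3_both', 'chopsticks_depth2_slope_noise0.05']:
    m = re.search(r'depth(\d)_([a-z]+)', name)
    depth, variant = int(m.group(1)), m.group(2)
    noise = 0.0
    if 'noise' in name:
        noise = float(re.search(r'noise([0-9\.]+)', name).group(1))
    print(name, '->', depth, variant, noise)

# Expected output:
#   chopsticks_depth3_both -> 3 both 0.0
#   chopsticks_depth2_slope_noise0.05 -> 2 slope 0.05
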
Example #11
if __name__ == '__main__':
    logger = logger_call()
    queue, queue_out = Queue(), Queue()
    create_bd()

    links = get_links()
    start = time()
    thread_count = 5

    # Populate queue
    for link in links:
        queue.put(link)

    # Create threads for links parsing
    for _ in range(thread_count):
        t = GetData(queue)
        t.daemon = True
        t.start()

    # Create thread for data storing to db
    db_thread = StoreData(queue_out)
    db_thread.daemon = True
    db_thread.name = 'Thread-DB'
    db_thread.start()

    queue.join()
    queue_out.join()

    # Measure time spent
    logger.info("Time spent: " + timer(start, time()))
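
Example #11 assumes GetData and StoreData are threading.Thread subclasses that drain a Queue and call task_done(), so that queue.join() returns once every link has been processed. A minimal sketch of such a worker (hypothetical; the real classes also parse the pages and push results to the output queue):

import threading

class GetData(threading.Thread):
    """Sketch of a queue-draining worker thread."""

    def __init__(self, queue):
        super().__init__()
        self.queue = queue

    def run(self):
        while True:
            link = self.queue.get()
            try:
                pass  # fetch and parse `link`, put results on the output queue
            finally:
                self.queue.task_done()  # lets queue.join() unblock
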
Example #12
def MIMOSA(Z,
           num_nearest_neighbors=40,
           eig_cumsum_thresh=0.95,
           eig_decay_thresh=4,
           cos_simil_thresh=0.99,
           ransac_frac=0.6667,
           contagion_num=5,
           min_size_init=20,
           min_size_merged=2000,
           neighbor_lengthscale_mult=10):

    with timer("BallTree"):
        ball_tree = BallTree(Z)
        neighbors = ball_tree.query(Z, k=num_nearest_neighbors)[1]

    with timer("LocalSVD"):
        svd_kwargs = dict(eig_cumsum_thresh=eig_cumsum_thresh,
                          eig_decay_thresh=eig_decay_thresh,
                          cos_simil_thresh=cos_simil_thresh,
                          ransac_frac=ransac_frac)
        svds = [LocalSVD(Z[n], **svd_kwargs) for n in neighbors]

    covered = set()

    def BuildComponent(start):
        similar_neighbors = [
            n for n in neighbors[start]
            if n not in covered and svds[start].is_similar(svds[n])
        ]
        component = set([start] + similar_neighbors)
        frontier = similar_neighbors
        visits = defaultdict(int)

        while len(frontier):
            i = frontier.pop()
            for j in neighbors[i]:
                if j in covered: continue
                if j in component: continue
                if svds[i].is_similar(svds[j]):
                    visits[j] += 1
                    if visits[j] >= contagion_num:
                        component.add(j)
                        frontier.append(j)

        idx = list(component)

        return ManifoldComponent(Z[idx], [svds[i] for i in idx], idx)

    with timer("BuildComponent"):
        components = []
        for i in range(len(Z)):
            if i not in covered:
                component = BuildComponent(i)
                components.append(component)
                for j in component.index_list:
                    covered.add(j)

    with timer("MergeComponents"):
        components2 = MergeComponents(components,
                                      min_size_init=min_size_init,
                                      min_size_merged=min_size_merged)

    with timer("ConstructHierarchy"):
        hierarchy, assignments = ConstructHierarchy(
            len(Z),
            components2,
            neighbor_lengthscale_mult=neighbor_lengthscale_mult)

    return components, components2, hierarchy, assignments
Example #13
	def begin(self):
		helpers.timer(self.duration, self.endStage).start()
Example #14
	def startBidTimer(self):
		# start and announce to players
		self.bidTimer = helpers.timer(self.bidDuration, self.bidEnded)
		self.bidTimer.start()
		self.game.sendEventToAllPlayers('TimerBegin', {'duration':self.bidDuration})
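
Examples #13 and #14 call helpers.timer(duration, callback) and then .start(), which matches the interface of threading.Timer. A minimal sketch under that assumption:

import threading

def timer(duration, callback):
    # One-shot timer: call `callback` after `duration` seconds.
    return threading.Timer(duration, callback)

# Usage sketch (attribute names taken from Example #14):
# timer(self.bidDuration, self.bidEnded).start()
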
Example #15
    def train_bultmann(self,
                       tr_loader,
                       tr_dataset_length,
                       Adam=True,
                       scheduler=True):

        since = time.time()
        print('Training the network {}'.format(
            self.network.__class__.__name__))
        print('Network Architecture \n{}'.format(self.network))
        print('Network Criterion {}'.format(self.network_criterion))
        list_of_network_loss = []
        list_of_clustering_loss = []
        list_of_total_loss = []
        list_of_losses = []
        learning_rates = []
        list_of_centers = []
        list_of_ranks_of_center_distances = []
        list_of_center_distances = []

        if Adam:
            optimizer = torch.optim.Adam(self.network.parameters(),
                                         lr=self.lr,
                                         weight_decay=0.0)

        else:
            optimizer = torch.optim.SGD(self.network.parameters(),
                                        lr=self.lr,
                                        momentum=0.0,
                                        weight_decay=0.0,
                                        nesterov=False)

        if scheduler:
            scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                        step_size=1,
                                                        gamma=0.1)

        for epoch in range(self.n_epochs):

            embedded_representation = []
            batched_center_index = 0
            total_combined_loss = 0.0
            total_network_loss = 0.0
            total_clustering_loss = 0.0
            labels = np.empty((0, 1), float)

            for batch in tr_loader:

                #extract the sequence and label from the batch, make predictions, and keep the bottleneck

                sequences = batch[:, :, 0:self.sequence_length].float()
                batch_labels = batch[:, :, self.sequence_length]
                labels = np.append(labels, batch_labels.numpy(), axis=0)
                target_sequences = sequences.clone()
                predictions, bottleneck = self.network(sequences)

                embedded_representation.append(bottleneck.clone().detach())
                batch_embeddings = torch.cat(embedded_representation)

                #compute the network loss

                network_loss = self.network_criterion(predictions,
                                                      target_sequences)

                #set condition for pretrain mode

                if epoch <= self.no_of_pretrain_epochs:

                    #pretrain mode

                    clustering_loss = torch.zeros([1, 1], dtype=torch.float64)
                    combined_loss = network_loss  # + self.alpha*clustering_loss   # defining the combined loss
                    optimizer.zero_grad()

                    #calculating the gradients and taking a step with only the network loss, as the clustering loss is zero

                    combined_loss.backward(
                        retain_graph=True
                    )  # retaining the pytorch computation graph so that backward can be done twice
                    optimizer.step()

                else:

                    #joint training mode

                    clustering_loss = self.clustering_criterion(
                        bottleneck,
                        batched_center_designation[batched_center_index])
                    batched_center_index += 1  # incrementing the batched center index
                    combined_loss = (
                        1 - self.alpha
                    ) * network_loss + self.alpha * clustering_loss
                    optimizer.zero_grad()

                    #calculating the gradients but not taking step

                    combined_loss.backward(retain_graph=True)

                    #updating the weights of the clustering friendly channels wrt combined loss

                    bottleneck_layer = helpers.get_bottleneck_name(
                        self.network)

                    #train_reporter.print_grads(network)

                    with torch.no_grad():

                        for name, parameters in self.network.named_parameters():

                            if name == bottleneck_layer:

                                ranked_channels = torch.from_numpy(
                                    ranks_of_center_distances)
                                parameters.grad[torch.where(
                                    ranked_channels <=
                                    self.no_of_clustering_channels)] = 0.0

                    optimizer.step()

                    #updating the weights of the rest of the channels wrt network loss

                    optimizer.zero_grad()
                    network_loss.backward()

                    with torch.no_grad():

                        for name, parameters in self.network.named_parameters():

                            if name == bottleneck_layer:

                                ranked_channels = torch.from_numpy(
                                    ranks_of_center_distances)
                                parameters.grad[torch.where(
                                    ranked_channels >
                                    self.no_of_clustering_channels)] = 0.0

                    optimizer.step()

                total_network_loss += network_loss.item()
                total_clustering_loss += clustering_loss.item()
                total_combined_loss += combined_loss.item()
            #extract embeddings
            embeddings = batch_embeddings

            #make list of losses

            list_of_network_loss.append(total_network_loss /
                                        (tr_dataset_length / self.batch_size))
            list_of_clustering_loss.append(
                total_clustering_loss / (tr_dataset_length / self.batch_size))
            list_of_total_loss.append(total_combined_loss /
                                      (tr_dataset_length / self.batch_size))

            #make cluster update interval array

            cluster_update = np.arange(self.no_of_pretrain_epochs,
                                       self.n_epochs,
                                       self.cluster_update_interval)

            #clustering
            for update in cluster_update:

                if update == epoch:
                    print('Updating Cluster Centers')
                    center_designation_pre = []
                    cluster_label_pre = []
                    centers_pre = []
                    no_of_channels = embeddings.shape[1]

                    for i in range(no_of_channels):
                        channel = embeddings[:, i, :].numpy()
                        choice_cluster, initial_centers, cluster_ass = helpers.kmeansalter(
                            channel, self.n_clusters)
                        cluster_label_pre.append(
                            torch.from_numpy(choice_cluster).unsqueeze(
                                0).transpose(1, 0))
                        cluster_label = torch.cat(cluster_label_pre, dim=1)
                        centers_pre.append(
                            torch.from_numpy(initial_centers).unsqueeze(
                                0).transpose(1, 0))
                        centers = torch.cat(centers_pre, dim=1)
                        center_designation_pre.append(
                            cluster_ass.unsqueeze(0).transpose(1, 0))
                        center_designation = torch.cat(center_designation_pre,
                                                       dim=1)

                    batched_center_designation = list(
                        helpers.divide_batches(center_designation,
                                               self.batch_size))
                    center_distances, ranks_of_center_distances = helpers.rank_channels(
                        centers)

            print(
                'Epoch : {}/{} Network Loss : {} Clustering Loss : {} Total Loss : {}'
                .format(epoch + 1, self.n_epochs,
                        (total_network_loss /
                         (tr_dataset_length / self.batch_size)),
                        (total_clustering_loss /
                         (tr_dataset_length / self.batch_size)),
                        (total_combined_loss /
                         (tr_dataset_length / self.batch_size))))

        list_of_centers.append(centers.numpy())
        list_of_ranks_of_center_distances.append(ranks_of_center_distances)
        list_of_center_distances.append(center_distances)
        list_of_losses.append(list_of_network_loss)
        list_of_losses.append(list_of_clustering_loss)
        list_of_losses.append(list_of_total_loss)
        end = time.time()
        hours, minutes, seconds = helpers.timer(since, end)
        print("Time taken {:0>2}:{:0>2}:{:05.2f}".format(
            int(hours), int(minutes), seconds))
        return self.network, optimizer, list_of_network_loss, list_of_clustering_loss, list_of_total_loss, list_of_losses, embeddings, labels, list_of_centers, list_of_ranks_of_center_distances, list_of_center_distances
Example #16
import json
import os

import torch

from model import Model, train
from generate_data import generate_x, find_min_max
from helpers import parse_args, timer


if __name__ == '__main__':
    args = parse_args()
    config = {'hidden_size': args.hidden_size, 'num_layers': args.num_layers}
    model = Model(**config)
    print(model)
    filename = "task_4.log"
    x_msg = f"X generation with N = {args.N} and M = {args.M}"
    X = timer(generate_x, filename, x_msg)(args.M, args.N)
    y_msg = "Finding optimums for X"
    YMin, YMax = timer(find_min_max, filename, y_msg)(X, args.T, args.k)

    X_val = generate_x(1, args.val_size)
    YMin_val, YMax_val = find_min_max(X_val, args.T, args.k)[:args.N]
    X, X_val, YMin, YMin_val, YMax, YMax_val = [torch.from_numpy(x) for x in [X, X_val, YMin, YMin_val, YMax, YMax_val]]
    training_message = "Training model"
    timer(train, filename, training_message)(model, X, X_val, YMin, YMin_val, YMax, YMax_val, args.N, args.M,
                                             args.epochs, args.lr, args.num_batches, args.batch_size, args.device)
    if not os.path.exists(args.model_path):
        os.mkdir(args.model_path)
    torch.save(model.state_dict(), os.path.join(args.model_path, 'model.pkl'))
    with open(os.path.join(args.model_path, 'config.json'), 'w') as f:
        json.dump(config, f, ensure_ascii=False, indent=4)
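
For completeness, a sketch of reloading the artifacts saved in Example #16: rebuild the model from config.json and restore the weights from model.pkl. The directory name is a placeholder for args.model_path:

import json
import os

import torch

from model import Model

model_path = 'saved_model'  # placeholder for the directory passed as args.model_path
with open(os.path.join(model_path, 'config.json')) as f:
    config = json.load(f)

model = Model(**config)
model.load_state_dict(torch.load(os.path.join(model_path, 'model.pkl')))
model.eval()  # switch to inference mode
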