Example #1
0
def _solve(g, name, params):
    from grb_lazy import solve_problem as solve
    print()
    print('***  STARTING A NEW PROBLEM  ***')
    print()
    print('name:', name)
    print('params:', params)
    #
    nontriv_sccs = sum(1 for sc in strongly_connected_components(g)
                       if len(sc) > 1)
    assert nontriv_sccs == 1, nontriv_sccs
    assert g.number_of_selfloops() == 0
    #
    stats = Stats(name=name,
                  params=params,
                  is_optimal=True,
                  cost=None,
                  ILP=0,
                  node=0,
                  iter=0,
                  time=None)
    start = time()
    elims, cost, cycle_matrix = solve(g, stats)
    end = time()
    #
    stats.time = end - start
    #
    stats.cost = cost
    print_stats(g, stats)
    fname = (name + '_' + params).replace(' ', '_')
    serialize(cycle_matrix, TMP_DIR + 'cycle_matrix_' + fname + '.pkl.gz')
    serialize(elims, TMP_DIR + 'solution_' + fname + '.pkl.gz')
    #
    return stats
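
Example #1 fills in stats.time and stats.cost after construction, so the Stats object used here has to be a simple mutable record. A minimal sketch of such a record with the fields shown above (an illustrative assumption, not the project's actual class):

from dataclasses import dataclass
from typing import Optional

@dataclass
class Stats:
    """Hypothetical mutable record with the fields used in example #1."""
    name: str
    params: str
    is_optimal: bool
    cost: Optional[float]
    ILP: int
    node: int
    iter: int
    time: Optional[float]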
Example #2
0
    def __init__(self, dataset, min_tracks):
        """
        Initialize the preprocessing and statistics helpers and load the
        dataset files.
        """
        # Initialize classes
        self.pp = PreProcessing()
        self.st = Stats()

        # Define input files
        IN_DIR = os.path.join("../data", dataset)
        playlists_file = os.path.join(IN_DIR, "playlists.tsv")
        tracklist_file = os.path.join(IN_DIR, "tracklist.tsv")
        glove_embs = os.path.join("../data", "tag_embeds", dataset + ".txt")
        lastfm_tags = os.path.join("../data", "lastfm_tags", "lastfm_tags.tsv")

        # Import data
        DictEmbeds, self.DictKeyEmbeds = self.pp.import_embeddings(glove_embs)
        self.a_idx = self.pp.create_annoy_idx(DictEmbeds)
        self.DictTrackTag = self.pp.import_track_tags(lastfm_tags)
        self.playlists = self.pp.filter_playlists(
            self.pp.import_playlists(playlists_file, min_tracks))
        self.DictTrack = self.pp.import_tracklist(tracklist_file)

        # Define variables
        self.low_pDI_playlists = os.path.join(IN_DIR, 'low_pDI_playlists.tsv')
        self.high_pDI_playlists = os.path.join(IN_DIR,
                                               'high_pDI_playlists.tsv')
        self.rand_tracks_playlist = []
Example #3
0
def main():
    print('\n loading the dataset ... \n')
    print('\n done \n')
    # args.distributed = args.world_size > 1
    model = AttenVgg(input_size=args.img_size, num_class=args.num_classes)
    model = loadpartweight(model)
    LR = Learning_rate_generater('step', [40, 50], 120)
    opt = optim.SGD(model.parameters(),
                    lr=args.lr,
                    momentum=0.9,
                    weight_decay=1e-4)

    # plot network
    # vizNet(model, args.modeldir)

    # if args.distributed:
    # 	dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size,rank=0)
    # 	model.cuda()
    # 	model = torch.nn.parallel.DistributedDataParallel(model)
    model = torch.nn.DataParallel(model).cuda()
    trainloader, valloader = get_data(args)
    critertion = torch.nn.CrossEntropyLoss().cuda()
    if args.evaluate:
        evaluate(valloader, model, critertion)
        return
    if not os.path.exists(args.modeldir):
        os.mkdir(args.modeldir)
    stats = Stats(args.modeldir, start_epoch=0)
    best_prec1 = 0  # track the best validation top-1 accuracy seen so far
    for epoch in range(args.epochs):

        # if args.distributed:
        # 	train_sampler.set_epoch(epoch)
        adjust_learning_rate(opt, LR.lr_factor, epoch)
        trainObj, top1, top2 = train(trainloader, model, critertion, opt,
                                     epoch)
        valObj, prec1, prec2 = evaluate(valloader, model, critertion)
        stats._update(trainObj, top1, top2, valObj, prec1, prec2)
        filename = []
        if args.store_per_epoch:
            filename.append(
                os.path.join(args.modeldir,
                             'net-epoch-%s.pth.tar' % (epoch + 1)))
        else:
            filename.append(os.path.join(args.modeldir, 'checkpoint.pth.tar'))
        filename.append(os.path.join(args.modeldir, 'model_best.pth.tar'))
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer': opt.state_dict()
            }, is_best, filename)

        plot_curve(stats, args.modeldir, True)
        sio.savemat(os.path.join(args.modeldir, 'stats.mat'), {'data': stats})
Example #4
0
def track_popularity_analysis(playlists):
    """
    Analyze how popularity is distributed across the tracks in the dataset.
    It takes a list of playlists as input and returns:
    - Dict mapping track IDs to track popularity
    - Int representing the minimum track popularity
    - Int representing the maximum track popularity
    """
    DictTrackPop = get_track_popularity(playlists)

    # Get most popular track
    top_tPI = max(DictTrackPop.items(), key=operator.itemgetter(1))[0]

    logging.info("Most popular track <{}>, with tPI: {} ".format(
        top_tPI, DictTrackPop[top_tPI]))

    logging.info("Top tPI divided by number of playlists: {} ".format(
        DictTrackPop[top_tPI] / len(playlists)))

    # Min-Max normalization
    min_pop, max_pop = [min(DictTrackPop.values()), max(DictTrackPop.values())]
    DictTrackPop = {
        k: (v - min_pop) / (max_pop - min_pop)
        for k, v in DictTrackPop.items()
    }

    # Group into 10 bins according to tPI (tracks with tPI == 1.0 land in bin 10)
    DictTrackPopGroup = {}
    for track in DictTrackPop:
        group = int(np.floor(DictTrackPop[track] * 10))
        if group not in DictTrackPopGroup:
            DictTrackPopGroup[group] = 0
        DictTrackPopGroup[group] += 1

    # Compute Shannon and Simpson Indexes
    idx = Stats()
    logging.info("Tracks with tPI in [0.0, 0.1) (%): {}".format(
        DictTrackPopGroup[0] * 100 / len(DictTrackPop)))
    logging.info("Shannon Diversity Index: {}".format(
        idx.shannon_di(DictTrackPopGroup)))
    logging.info("Simpson Diversity Index: {}".format(
        idx.simpson_di(DictTrackPopGroup)))

    return DictTrackPop, min_pop, max_pop
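
The Stats helper above is used only through shannon_di and simpson_di. Assuming those methods take a dict of group counts, as DictTrackPopGroup is here, a sketch of how such diversity-index helpers are commonly implemented (an assumption, not the original class):

import math

class Stats:
    """Sketch of the diversity-index helpers assumed above."""

    def shannon_di(self, counts):
        # Shannon index: H = -sum(p_i * ln(p_i)) over groups with non-zero counts
        n = sum(counts.values())
        return -sum((c / n) * math.log(c / n) for c in counts.values() if c > 0)

    def simpson_di(self, counts):
        # Simpson index in the form sum(n_i * (n_i - 1)) / (N * (N - 1));
        # other variants report 1 minus this value.
        n = sum(counts.values())
        if n < 2:
            return 0.0
        return sum(c * (c - 1) for c in counts.values()) / (n * (n - 1))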
Example #5
0
    def __init__(self, mi=None, args=None):
        if args is None:
            self.args = ArgsProvider(call_from=self,
                                     define_params=self._params(),
                                     on_get_params=self._init)
        else:
            self.args = args
            self._init(args)

        # Accumulated errors.
        self.stats = defaultdict(lambda: Stats())

        self._cb = {}
        if mi is not None:
            self.model_interface = mi
Example #6
0
def train(model, dataloader, optimizer, scheduler, params):
    print("Starting training...")
    best_val_loss = 100
    #print(params.save_dir, params.tag)
    stats = Stats(params.save_dir, params.tag)
    for epoch in range(params.epoch_num):
        loss_avg = RunningAverage()
        train_data = tqdm(dataloader.data_iterator(data_type='train',
                                                   batch_size=params.batch_size),
                          total=(dataloader.size()[0] // params.batch_size))
        optimizer.zero_grad()
        model.zero_grad()
        for data, labels in train_data:
            model.train()
            data = torch.tensor(data, dtype=torch.long).to(params.device)
            labels = torch.tensor(labels, dtype=torch.long).to(params.device)

            batch_masks = (data != 0)
            output = model(data, attention_mask=batch_masks, labels=labels)

            loss = torch.mean(output[0])
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), params.max_grad_norm)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)

            optimizer.step()
            scheduler.step()
            model.zero_grad()
            optimizer.zero_grad()
            # update the average loss
            loss_avg.update(loss.item())
            train_data.set_postfix(type='TRAIN', epoch=epoch, loss='{:05.3f}'.format(loss_avg()))

        metrics = validate(model, dataloader, params)
        print('After {} epochs: F1={}, Loss={}'.format(epoch, metrics.f1(), metrics.loss))
        stats.update(metrics, epoch, loss_avg())
        stats.save()
        if epoch % params.save_freq == 0 and params.save_checkpoints:
            save_checkpoint({'epoch': epoch,
                                    'state_dict': model.state_dict(),
                                    'optim_dict': optimizer.state_dict()},
                                    is_best=False,
                                    tag=params.tag,
                                    epoch=epoch,
                                    score=metrics.f1(),
                                    checkpoint=params.save_dir)
        if metrics.loss < best_val_loss:
            best_val_loss = metrics.loss
            save_checkpoint({'epoch': epoch,
                                    'state_dict': model.state_dict(),
                                    'optim_dict': optimizer.state_dict()},
                                    is_best=True,
                                    tag=params.tag,
                                    epoch='generic',
                                    score='epic',
                                    checkpoint=params.save_dir)
Example #7
0
        env_eval, _ = make_env(args.env,
                               args.max_episode_steps,
                               add_downsampling=False,
                               downsampling_tiles_w=None,
                               downsampling_tiles_h=None,
                               downsampling_pix_values=None,
                               atari_frameskip=args.atari_frameskip)
        eval_fn = get_evaluate_fn(env_eval=env_eval,
                                  preproc_obs_fn=preproc_obs_fn,
                                  policy_NN=call_model,
                                  args=args)

    process = psutil.Process()
    memory_usage_fn = lambda: process.memory_info().rss

    stats = Stats(use_tensorboard=args.use_tensorboard, log_path=log_path)
    experience_keys = ["observations", "target_policy"]
    if args.compute_value:
        experience_keys.append("returns")

    experience_replay = ExperienceReplay(keys=experience_keys,
                                         capacity=args.replay_capacity)

    run_episode_fn = get_episode_fn(
        actor=high_level_actor if args.hierarchical else low_level_actor,
        planner=high_level_planner if args.hierarchical else low_level_planner,
        train_fn=train_fn,
        dataset=experience_replay,
        add_returns=args.compute_value,
        stats=stats,
        memory_usage_fn=memory_usage_fn,
Example #8
0
    hypers: Dict[str, Tuple[float, float]] = {
        "bostonHousing": (0.0001, 0.1),
        "concrete": (0.001, 0.1),
        "energy": (0.0001, 0.1),
        "kin8nm": (0.1, 0.1),
        "power-plant": (0.1, 1.0),
        "wine-quality-red": (0.001, 0.01),
        "yacht": (0.0001, 1.0),
        "naval-propulsion-plant": (0.001, 1.0),
        "protein-tertiary-structure": (0.0001, 0.1),
    }

    for name in hypers:
        print(name)
        (lengthscale, tau) = hypers[name]
        stats = Stats()
        for run in range(args.runs):
            train, val, test = get_pbp_sets(name,
                                            args.batch_size,
                                            get_val=args.val)

            # l = prior lengthscale, tau = model precision, N = dataset instances
            # weight regularizer = l^2 / (tau N)
            wr = lengthscale**2.0 / (tau * len(train.dataset))
            # dropout regularizer = 2 / tau N
            dr = 2 / (tau * len(train.dataset))

            for (x, y) in train:
                break

            h_dim = 50 if "protein" not in name else 100
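
To make the regularizer comments above concrete, here is a tiny worked example with hypothetical values (lengthscale 0.0001 and tau 0.1 as in the bostonHousing entry, with an assumed 506 training instances):

lengthscale, tau, n = 0.0001, 0.1, 506  # hypothetical values, for illustration only
wr = lengthscale ** 2.0 / (tau * n)     # weight regularizer l^2 / (tau * N) ~= 1.98e-10
dr = 2 / (tau * n)                      # dropout regularizer 2 / (tau * N) ~= 0.0395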
Example #9
0
            parse_chunk_async(file_name, temp_dir, start, end)
        
        total = {count_name: LogCounter(count_name) for count_name in count_names}
        
        stats.waiting()
        for (temp_file_name, job_pid, job_time) in queue:
            stats.received_job_result()
            start_reduce_time = time.time()
            
            mapper = pickle.load(open(temp_file_name, 'rb'))
            for (count_name, counter) in mapper.get_counters().items():
                total[count_name].add_counter(counter)
            os.remove(temp_file_name)
            
            stats.job_report(job_pid, job_time, time.time() - start_reduce_time)
            stats.waiting()
    
    finally:
        shutil.rmtree(temp_dir)
    
    for name in count_names:
        print(total[name].report())

if __name__ == '__main__':
    args = parse_argv()
    
    stats = Stats(args.log_file, log_each_job=args.log_each_job)
    parse_file(args.filename, args.cores, args.jobs_per_core, stats)
    
    stats.report_master_stats()
Example #10
0
    # low_level_planner = CountbasedRolloutIW(generate_successor_fn=low_level_tree_actor.generate_successor, width=low_level_width, features_name="low_level_features")

    low_level_planner.add_stop_fn(lambda tree: not interactions.within_budget())

    abstract_tree_actor = AbstractTreeActor(low_level_planner=low_level_planner,
                                            low_level_tree_actor=low_level_tree_actor)

    # high_level_planner = BFS(generate_successor_fn=abstract_tree_actor.generate_successor, features_name="high_level_features")
    # high_level_planner = IW(generate_successor_fn=abstract_tree_actor.generate_successor, width=high_level_width, features_name="high_level_features")
    high_level_planner = CountbasedRolloutIW(generate_successor_fn=abstract_tree_actor.generate_successor, width=high_level_width, features_name="high_level_features")

    high_level_planner.add_stop_fn(lambda tree: not interactions.within_budget())

    abstract_tree = abstract_tree_actor.reset()
    episode_done = False
    stats = Stats()
    abstract_tree_actor.render_tree(abstract_tree, size=None)
    while not episode_done:
        interactions.reset_budget()
        high_level_planner.initialize(tree=abstract_tree)
        high_level_planner.plan(tree=abstract_tree)

        abstract_tree_actor.compute_returns(abstract_tree, discount_factor=discount_factor, add_value=False)
        Q = compute_node_Q(node=abstract_tree.root.low_level_tree.root,
                           n_actions=env.action_space.n,
                           discount_factor=discount_factor,
                           add_value=False)
        low_level_policy = softmax(Q, temp=0)
        a = sample_pmf(low_level_policy)
        abstract_tree_nodes = len(abstract_tree)
Example #11
0
class PlaylistDI(object):
    """
    Perform playlist diversity analysis.
    """
    def __init__(self, dataset, min_tracks):
        """
        Initialize the preprocessing and statistics helpers and load the
        dataset files.
        """
        # Initialize classes
        self.pp = PreProcessing()
        self.st = Stats()

        # Define input files
        IN_DIR = os.path.join("../data", dataset)
        playlists_file = os.path.join(IN_DIR, "playlists.tsv")
        tracklist_file = os.path.join(IN_DIR, "tracklist.tsv")
        glove_embs = os.path.join("../data", "tag_embeds", dataset + ".txt")
        lastfm_tags = os.path.join("../data", "lastfm_tags", "lastfm_tags.tsv")

        # Import data
        DictEmbeds, self.DictKeyEmbeds = self.pp.import_embeddings(glove_embs)
        self.a_idx = self.pp.create_annoy_idx(DictEmbeds)
        self.DictTrackTag = self.pp.import_track_tags(lastfm_tags)
        self.playlists = self.pp.filter_playlists(
            self.pp.import_playlists(playlists_file, min_tracks))
        self.DictTrack = self.pp.import_tracklist(tracklist_file)

        # Define variables
        self.low_pDI_playlists = os.path.join(IN_DIR, 'low_pDI_playlists.tsv')
        self.high_pDI_playlists = os.path.join(IN_DIR,
                                               'high_pDI_playlists.tsv')
        self.rand_tracks_playlist = []

    def tag_distance(self, word1, word2):
        """
        Compute the pairwise cosine distance between two tags using the Annoy index.
        If either tag is not in the index, return -1.
        """
        word1 = self.pp.norm_str(word1)
        word2 = self.pp.norm_str(word2)
        try:
            dist = self.a_idx.get_distance(self.DictKeyEmbeds[word1],
                                           self.DictKeyEmbeds[word2])
        except KeyError:
            dist = -1

        return dist

    def TT_distance(self, v1, v2):
        """
        Compute the Track-Tag distance between two tracks. If the tracks do not
        have the same number of tags, return -1.
        """
        if not v1 or not v2:
            return -1

        max_len = max(len(v1), len(v2))

        # TODO: improve propagation when tag information is incomplete
        if len(v1) < max_len:
            return -1
        elif len(v2) < max_len:
            return -1

        s = 0
        for i in range(max_len):
            max_weight = max(v1[i][1], v2[i][1])

            # Pairs with zero weight or unknown tags do not contribute and are
            # excluded from the denominator.
            if max_weight == 0:
                max_len -= 1
                continue

            dist = self.tag_distance(v1[i][0], v2[i][0])
            if dist == -1:
                max_len -= 1
            else:
                s += (v1[i][1] + v2[i][1]) / float(2 * max_weight) * dist

        if max_len == 0:
            return -1

        return s / float(max_len)

    def log_results(self, results):
        """
        Print results of the diversity analysis.
        """
        logging.info("Mean pDI: {}".format(np.mean(results)))
        logging.info("Std pDI: {}".format(np.std(results)))
        logging.info("Max pDI: {}".format(max(results)))
        logging.info("Min pDI: {}".format(min(results)))
        logging.info("Gini pDI: {}".format(gini(np.array(results))))
        logging.info("QCD pDI: {}".format(self.st.qcd(results)))

    def analyze_playlist(self):
        """
        Analyze diversity of playlists from the dataset.
        """
        logging.info("Analyzing Playlists...")
        pDI = []
        pDI_idx = []
        playlist_analyzed = 0

        for c, playlist in enumerate(self.playlists):
            playlist_track_tags = []
            playlist_tracks_tags_count = 0
            for track in playlist:
                track = str(track).strip()
                try:
                    # Continue if track has at least 1 tag associated
                    if self.DictTrackTag[self.DictTrack[track]]:
                        playlist_track_tags.append(
                            self.DictTrackTag[self.DictTrack[track]])
                        playlist_tracks_tags_count += 1

                        # Randomly sample tracks (~40% chance) for the random-playlist evaluation
                        if random.randint(0, 9) > 5:
                            self.rand_tracks_playlist.append(
                                self.DictTrackTag[self.DictTrack[track]])
                # Skip if the track has no tags associated
                except KeyError:
                    pass

            # Skip playlists without complete tag information (require tags for every track)
            if playlist_tracks_tags_count >= int(1 * len(playlist)):
                playlist_analyzed += 1
                pDI_sum = 0

                tracks_comb = list(
                    itertools.combinations(playlist_track_tags, 2))

                # Iterate over a copy so that removing invalid pairs is safe
                for track_tags in list(tracks_comb):
                    dist = self.TT_distance(track_tags[0], track_tags[1])
                    if dist == -1:
                        tracks_comb.remove(track_tags)
                    else:
                        pDI_sum += dist
                if pDI_sum == 0:
                    pass
                else:
                    pDI.append(pDI_sum / float(len(tracks_comb)))
                    pDI_idx.append(c)

        self.log_results(pDI)

        logging.info("Playlists analyzed: {}/{}".format(
            playlist_analyzed, len(self.playlists)))

        return pDI, pDI_idx

    def analyze_random_playlist(self):
        """
        Analyze diversity of random playlists created
        with tracks from the dataset.
        """
        logging.info("Analyzing Random Playlists...")
        playlist_len_mean = int(np.mean([len(x) for x in self.playlists]))

        k = 0
        while k < 1:
            # Shuffle tracks at each iteration
            rand_tracks_playlist = random.sample(
                self.rand_tracks_playlist, len(self.rand_tracks_playlist))

            rand_pDI = []
            random_playlists = [
                rand_tracks_playlist[x:x + playlist_len_mean]
                for x in range(0, len(rand_tracks_playlist), playlist_len_mean)
            ]

            for el in random_playlists:
                rand_pDI_sum = 0
                tracks_comb = list(itertools.combinations(el, 2))
                # Iterate over a copy so that removing invalid pairs is safe
                for track_tags in list(tracks_comb):
                    dist = self.TT_distance(track_tags[0], track_tags[1])

                    if dist == -1:
                        tracks_comb.remove(track_tags)
                    else:
                        rand_pDI_sum += dist

                if tracks_comb:
                    if rand_pDI_sum == 0:
                        pass
                    else:
                        rand_pDI.append(rand_pDI_sum / float(len(tracks_comb)))

            self.log_results(rand_pDI)
            k += 1

    def write_playlist_qualia(self, pDI, pDI_idx):
        """
        Write out the files with the playlists used for the qualitative
        analysis.
        """
        dist_10pct = int(0.1 * len(pDI))
        # Write most similar playlists
        with open(self.low_pDI_playlists, 'w+') as outf:
            _writer = csv.writer(outf, delimiter='\t')
            for idx in sorted(range(len(pDI)),
                              key=lambda i: pDI[i],
                              reverse=False)[:dist_10pct]:
                row = [pDI[idx], self.playlists[pDI_idx[idx]]]
                _writer.writerow(row)

        # Write least similar playlists
        with open(self.high_pDI_playlists, 'w+') as outf:
            _writer = csv.writer(outf, delimiter='\t')
            for idx in sorted(range(len(pDI)),
                              key=lambda i: pDI[i],
                              reverse=True)[:dist_10pct]:
                row = [pDI[idx], self.playlists[pDI_idx[idx]]]
                _writer.writerow(row)

    def run(self):
        """
        Main function. It performs the diversity analysis on the playlists
        from the dataset, then on random playlists. Finally, it writes the
        files with the playlists used for the qualitative analysis.
        """
        pDI, pDI_idx = self.analyze_playlist()
        self.analyze_random_playlist()
        self.write_playlist_qualia(pDI, pDI_idx)
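
log_results above relies on two summary statistics that are not shown in this listing: a gini function and Stats.qcd (quartile coefficient of dispersion). A hedged sketch of how such helpers are commonly implemented (assumptions, not the project's code):

import numpy as np

def gini(x):
    # Gini coefficient via cumulative sums of the sorted values
    # (assumes non-negative, non-empty input).
    x = np.sort(np.asarray(x, dtype=float))
    n = len(x)
    cum = np.cumsum(x)
    return (n + 1 - 2 * np.sum(cum) / cum[-1]) / n

def qcd(values):
    # Quartile coefficient of dispersion: (Q3 - Q1) / (Q3 + Q1).
    q1, q3 = np.percentile(values, [25, 75])
    return (q3 - q1) / (q3 + q1)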
Example #12
0

def policy(state, q_values, stats):
    epsilon = stats.epsilon(state)
    actions = q_values[state]
    if np.random.uniform() > epsilon:
        #greedy - exploit
        return np.argmax(actions)
    #random action - explore
    return np.random.randint(4)


print('Episodes: %d' % EPISODES)
print('Biased environment: ', use_bias_env)

stats = Stats()
q_values = QValue()

win = [0]
lose = [0]
draw = [0]

wi, lo, dr, has_ace = 0, 0, 0, 0

for ep in tqdm(range(EPISODES)):

    blackjack = Blackjack_Biased(1000) if use_bias_env else Blackjack(1000)

    #get start state
    state = blackjack.reset()
    action = policy(state, q_values, stats)
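
In example #12, stats.epsilon(state) supplies the exploration rate for the epsilon-greedy policy. A common choice, assumed here purely for illustration, is to decay epsilon with the number of visits to each state:

from collections import defaultdict

class Stats:
    """Hypothetical epsilon schedule that decays with per-state visit counts."""

    def __init__(self, eps_start=1.0):
        self.eps_start = eps_start
        self.visits = defaultdict(int)

    def epsilon(self, state):
        # Each call counts as a visit; epsilon shrinks as 1, 1/2, 1/3, ...
        self.visits[state] += 1
        return self.eps_start / self.visits[state]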
Example #13
0
        "wine-quality-red": (2.5, 3.0, 3.5),
        "yacht": (0.25, 0.5, 0.75),
        "naval-propulsion-plant": (30000, 40000, 50000),
        "protein-tertiary-structure": (0.025, 0.05, 0.075),
    }

    for name in taus:
        print(name)
        best_rmse = float("inf")
        best_stats = None
        best_model = None
        best_hypers = (0.0, 0.0)

        for lengthscale in lengthscales:
            for tau in taus[name]:
                stats = Stats()
                for run in range(args.runs):
                    train, _, test = get_pbp_sets(name,
                                                  args.batch_size,
                                                  get_val=False)

                    # l = prior lengthscale, tau = model precision, N = dataset instances
                    # weight regularizer = l^2 / (tau N)
                    wr = lengthscale**2.0 / (tau * len(train.dataset))
                    # dropout regularizer = 2 / tau N
                    dr = 2 / (tau * len(train.dataset))

                    for (x, y) in train:
                        break

                    h_dim = 50 if "protein" not in name else 100
Example #14
0
    def register_stats(self, env):
        """ Creates charts/stats that we want to show. """
        env.stats['rewards'] = Stats(PlotType.REGULAR)
        env.stats['fails'] = Stats(PlotType.REGULAR)
        env.stats['heatmap'] = MatrixStats(env.height, env.width, 0)
Example #15
0
def run(dataset, print_ex):
    """
    Analyze the 10% of playlists with the highest, and
    the 10% with the lowest diversity index.
    """
    pp = PreProcessing()
    st = Stats()

    # Define input files
    IN_DIR = os.path.join("../data", dataset)
    tracklist_file = os.path.join(IN_DIR, "tracklist.tsv")
    lastfm_tags = os.path.join("../data", "lastfm_tags", "lastfm_tags.tsv")

    # Import data
    DictTrackTag = pp.import_track_tags(lastfm_tags)
    DictTrack = pp.import_tracklist(tracklist_file)
    low_pDI_playlists = os.path.join(IN_DIR, 'low_pDI_playlists.tsv')
    high_pDI_playlists = os.path.join(IN_DIR, 'high_pDI_playlists.tsv')

    results_pd = []

    for input_file in [low_pDI_playlists, high_pDI_playlists]:

        # Initialize variables
        tag_no = []
        tag_common = []
        ratio_tag_track = []
        artist_no = []
        tracks_no = []
        ratio_track_art = []
        distances = []
        print_c = 0
        playlist_c = 0

        with open(input_file, 'r') as inf:
            _reader = csv.reader(inf, delimiter='\t')

            # Iterate over playlists
            for row in _reader:
                playlist_c += 1
                dist, playlist = row
                playlist = eval(playlist)  # the playlist column stores a Python list literal; ast.literal_eval would be safer
                distances.append(float(dist))

                artistnames = set()
                total_tags = set()
                tags_list = []

                # Print playlist info
                if print_c < print_ex:
                    logging.info("Printing info new playlist...")
                    logging.info("Playlist pDI:{}".format(dist))
                    logging.info("Playlist Tracks:")

                # Iterate over playlist tracks
                for track in playlist:
                    track = str(track)
                    try:
                        artistname, trackname = DictTrack[track].split("|")
                    except ValueError:
                        continue
                    artistnames.add(artistname)
                    tags_tracks = set()

                    if DictTrack[track] in DictTrackTag:
                        for tag in DictTrackTag[DictTrack[track]]:
                            total_tags.add(pp.norm_str(tag[0]))
                            tags_tracks.add(pp.norm_str(tag[0]))

                        tags_list.append(tags_tracks)
                        if print_c < print_ex:
                            logging.info("{} {}".format(
                                DictTrack[track],
                                DictTrackTag[DictTrack[track]]))
                    else:
                        tags_list.append(set())
                        continue

                # Print playlist stats
                if print_c < print_ex:
                    logging.info("No. unique tags: {}".format(len(total_tags)))
                    logging.info("No. unique tags for tracks: {}".format(
                        len(total_tags) / float(len(playlist))))
                    logging.info("No. unique artists: {}".format(
                        len(artistnames)))
                    logging.info("No. unique tracks: {}".format(len(playlist)))
                    logging.info("No. unique tracks for artists: {}".format(
                        len(playlist) / float(len(artistnames))))

                print_c += 1

                tag_no.append(len(total_tags))
                ratio_tag_track.append(len(total_tags) / float(len(playlist)))
                artist_no.append(len(artistnames))
                tracks_no.append(len(playlist))
                ratio_track_art.append(len(playlist) / float(len(artistnames)))
                tag_common.append(set.intersection(*tags_list))

            common_tags = round(
                len([x for x in tag_common if len(x) > 1]) * 100 /
                float(playlist_c))
            single_artists = round(
                len([x for x in artist_no if x == 1]) * 100 / float(playlist_c))

            # Print playlist dataset qualitative analysis results
            logging.info("")
            logging.info(
                "## Qualitative analysis of playlists from {} file ## ".format(
                    input_file))
            logging.info("Average pDI: {}".format(np.mean(distances)))
            logging.info("Average tag count: {}".format(round(
                np.mean(tag_no))))
            logging.info("Common tags(%): {}".format(common_tags))
            logging.info("Average tag over tracks: {}".format(
                round(np.mean(ratio_tag_track))))
            logging.info("Average artist count: {}".format(
                round(np.mean(artist_no))))
            logging.info("Single-artist(%): {}".format(single_artists))
            logging.info("Average tracks count: {}".format(
                round(np.mean(tracks_no))))
            logging.info("Average tracks over artists: {}".format(
                round(np.mean(ratio_track_art))))

            # Store results for computing Percentual Difference
            results = [
                np.mean(distances),
                round(np.mean(tag_no)), common_tags,
                round(np.mean(ratio_tag_track)),
                round(np.mean(artist_no)), single_artists,
                round(np.mean(tracks_no)),
                round(np.mean(ratio_track_art))
            ]

            results_pd.append(results)

    logging.info("")
    logging.info("## Percentage Difference (PD) ## ".format(input_file))
    for c in range(0, 8):
        if c not in [2, 5]:
            logging.info(st.pdiff(results_pd[0][c], results_pd[1][c]))
        else:
            logging.info(abs(results_pd[0][c] - results_pd[1][c]))