Example #1
    def read_data(self, query, index, max_size, query_type):
        if self.es.indices.exists(index=index):
            res = self.es.search(index=index, size = max_size,
                scroll = '5m', # Keep the scroll window open for 5 minutes
                body=query, doc_type=query_type)

            logger.info({"hits":res['hits']['total']})
            return res['hits']['hits']
        else:
            return []
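Example #2 shows the analogous call pattern for read_all_data; a hedged usage sketch for read_data itself, with the argument order taken from the signature above and the esd/cfg names and _source access mirroring Examples #2 and #15:

# Hedged usage sketch based on Examples #2 and #15.
hits = esd.read_data(es_query, cfg['elk']['index'], 5000, cfg['elk']['type'])
names = []
for h in hits:
    names.extend(list(h["_source"].keys()))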
Example #2
def read_data(q):
    logger.debug(q)

    es_query = {"query": q["query"]}
    logger.info({"es_query": es_query})

    hits = esd.read_all_data(es_query, cfg['elk']['index'], cfg['elk']['type'],
                             5000)

    logger.info({"first hit": hits[0]})
    return hits
Example #3
    def read_all_data(self, query, index, query_type, scroll_size=500, reduce = [], to=30000):
        page = self.es.search(index=index, size = scroll_size,
                scroll = '5m', # Keep the scroll window open for 5 minutes
                body=query, doc_type=query_type, request_timeout=to)

        sid = page['_scroll_id']
        scroll_size = page['hits']['total']
        data = self.reduce_data(page['hits']['hits'], reduce)

        # Start scrolling
        while (scroll_size > 0):
            logger.info("Scrolling...")
            page = self.es.scroll(scroll_id = sid, scroll = '2m')
            # Update the scroll ID
            sid = page['_scroll_id']
            # Get the number of results that we returned in the last scroll
            data += self.reduce_data(page['hits']['hits'], reduce)
            scroll_size = len(page['hits']['hits'])
            logger.info({"scroll size:": str(scroll_size)})

        return data
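The reduce_data helper used above is not shown in these examples. A hypothetical sketch of what it could look like, assuming reduce is a list of _source field names to keep (an empty list meaning the hits pass through unchanged):

    # Hypothetical helper, not part of the original examples: keep only the
    # requested _source fields from each hit; pass hits through when reduce is empty.
    def reduce_data(self, hits, reduce):
        if not reduce:
            return hits
        return [{field: hit['_source'].get(field) for field in reduce}
                for hit in hits]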
Example #4
def evaluate(args, test_dataset, model, tokenizer, mode='test'):
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    eval_sampler = SequentialSampler(test_dataset)
    eval_dataloader = DataLoader(test_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    logger.info('******* Running Evaluation ********')
    logger.info(' Num examples = %d', len(test_dataset))
    logger.info(' Batch size = %d', args.eval_batch_size)

    y_true = []
    y_pred = []
    for batch in tqdm(eval_dataloader, desc='Iter'):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'segment_ids': batch[2]
        }
        yt = batch[3].view(-1, 1).to('cpu').numpy()
        with torch.no_grad():
            logits = model(**inputs)

    logits = logits.detach().cpu().numpy()
        for i, label in enumerate(yt):
            y_true.append(label[0])
            y_pred.append(1 if logits[i] >= 0.5 else 0)

    print(y_true[:15])
    print(y_pred[:15])
    logger.info(' Accuracy score: %f', metrics.accuracy_score(y_true, y_pred))
    f1 = metrics.f1_score(y_true, y_pred, average='binary')

    return f1
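Note that the model returns raw logits and training in Example #9 uses BCEWithLogitsLoss, while the threshold above is applied directly to the logit values. If thresholding at a probability of 0.5 was the intent, a hedged alternative for the conversion and prediction loop above would be:

# Hedged alternative only: threshold in probability space instead of on raw logits.
probs = torch.sigmoid(logits).detach().cpu().numpy()
for i, label in enumerate(yt):
    y_true.append(label[0])
    y_pred.append(1 if probs[i] >= 0.5 else 0)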
Example #5
def load_and_cache_examples(args,
                            processor,
                            tokenizer,
                            evaluate=False,
                            dev=False,
                            output_examples=False):
    cached_features_file = os.path.join(
        args.data_dir, 'cached_{}_{}_{}'.format(
            'eval' if evaluate else 'dev' if dev else 'train',
            list(filter(None, args.bert_model.split('/'))).pop(),
            str(args.max_seq_length)))

    # TODO: refactor the Example structure, save both Example and labels to cache
    if os.path.exists(cached_features_file
                      ) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        # label_list = processor.get_labels()

        if evaluate:
            examples, labels = processor.get_test_examples(args.data_dir)
        elif dev:
            examples, labels = processor.get_dev_examples(args.data_dir)
        else:
            examples, labels = processor.get_train_examples(args.data_dir)
        features = convert_examples_to_features(args, examples, tokenizer)

        logger.info('saving features into cache file %s', cached_features_file)
        torch.save(features, cached_features_file)

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_attention_masks = torch.tensor([f.attention_masks for f in features],
                                       dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)

    # TODO: check shape
    dataset = (all_input_ids, all_attention_masks, all_segment_ids, all_labels)

    return dataset
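The function returns a plain tuple of tensors; Example #10 wraps it in a TensorDataset before handing it to the training and evaluation loops:

# As in Example #10: turn the tuple of tensors into an indexable dataset.
from torch.utils.data import TensorDataset

train_dataset = TensorDataset(*load_and_cache_examples(args, processor, tokenizer))
dev_dataset = TensorDataset(*load_and_cache_examples(args, processor, tokenizer, dev=True))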
Example #6
    def information_leakage(self, clusters, sample_size=5000, joint_leakage=True):
        """
        Evaluate the information leakage for feature(s).

        Computes marginal KDEs for features given a site using AKDEs.
        Conditional entropy is then estimated from the distributions via monte-carlo integration.
        The conditional entropy is then used to compute the leakage for the feature(s).

        Parameters
        ----------
        clusters : list
            A list of lists; the argument is a list of clusters.
            Each cluster is a list containing the features in the cluster.
            A singular feature or cluster may be given as the parameter.
            In those instances, the data will be wrapped in additional lists to match the expected form.
        sample_size : int
            Count of total random feature samples to use for monte-carlo estimation.
        joint_leakage : bool
            Determines if the leakage of clusters should be measured jointly or individually.
            If True, the probability of samples for each cluster will be multiplied together before estimating entropy.
            Otherwise, the leakage for each cluster is measured.

        Returns
        -------
        list
            Estimated information leakage for the features/clusters.
            If ``joint_leakage`` is True, the list contains the leakage for the combined analysis.
            Otherwise, the list contains the leakages for each cluster,
            appearing in the same order as seen in ``clusters``.

        """
        # convert one feature to singular list for comparability
        if not isinstance(clusters, Iterable):
            clusters = [clusters]
        if not isinstance(clusters[0], Iterable):
            clusters = [clusters]

        self.sample_size = sample_size
        logger.debug("Measuring leakage for {}".format(clusters))

        # Shannon Entropy func: -p(x)*log2(p(x))
        h = lambda x: -x * math.log(x, 2)

        # H(C) -- compute website entropy, this represents the maximum number of bits which can be leaked
        H_C = sum([h(prior) for prior in self.website_priors if prior > 0])

        # map clusters to probability predictions for random samples
        # allows for KDE construction, sampling, and prediction to be done in parallel (if enabled)
        if self._pool is None:
            results = map(self._do_predictions, clusters)
        else:
            results = self._pool.imap(self._do_predictions, clusters)
            self._pool.close()

        # load the results as they are produced and log progress
        cluster_probs = []
        for probs in results:
            cluster_probs.append(probs)
            # print progress updates
            count = len(cluster_probs)
            if (count - 1) % max(1, int(len(clusters) * 0.05)) == 0:
                logger.info("Progress: {}/{}".format(count, len(clusters)))

        # restart pool if multiprocessing
        if self._pool is not None:
            self._pool.join()
            self._pool.restart()

        if joint_leakage:
            # multiply cluster probs to get joint probs for each sample
            # clusters are assumed to be independent from one another
            # in this way, the joint probability of all the variables is their products
            cluster_probs = np.array(cluster_probs)
            prob_sets = [np.prod(cluster_probs, axis=0)]  # shape (1, n_sites, n_samples)
        else:
            # measure leakages for each cluster independently
            prob_sets = cluster_probs  # shape (n_clusters, n_sites, n_samples)

        # compute information leakage for each cluster (or combined cluster if joint)
        leakages = []
        for i, prob_set in enumerate(prob_sets):

            # weight the probability predictions by the website priors
            # in the closed-world scenario, all are equally weighted
            probs_weighted = []
            for site, probs in enumerate(prob_set):
                probs_weighted.append(probs * self.website_priors[site])
            probs_weighted = np.array(probs_weighted)

            # transpose array so that first index represents samples, second index represent site
            probs_weighted = np.transpose(probs_weighted)

            # normalize probabilities such that the per-site probs for each sample sums to one
            # (as should be expected for conditional probabilities)
            probs_norm = []
            for probs in probs_weighted:
                norm = probs / sum(probs) if sum(probs) > 0 else probs
                probs_norm.append(norm)

            # compute entropy for each sample
            entropies = []
            for probs in probs_norm:
                entropies.append(sum([h(prob) for prob in probs if prob > 0]))

            # H(C|f) -- estimate real entropy as average of all samples
            H_CF = sum(entropies)/len(entropies)

            # I(C;f) = H(C) - H(C|f) -- compute information leakage
            leakage = H_C - H_CF
            leakages.append(leakage)

            # debug output
            logger.debug("{cluster} {l} = {c} - {cf}"
                         .format(cluster=clusters[i], l=leakage, c=H_C, cf=H_CF))

        return leakages
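As used in Example #14, this method is typically called on the clusters produced by Example #11, with joint_leakage=True collapsing the result to a single combined I(C;F) = H(C) - H(C|F) estimate in bits:

# Usage as in Example #14: joint leakage of all clusters, in bits.
leakage_joint = modeler.information_leakage(clusters=clusters,
                                            sample_size=5000,
                                            joint_leakage=True)[0]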
Example #7
    def update(self, doc, index, doc_id, doc_type='list'):
        logger.debug({"about to update": str(doc)})
        res = self.es.update(index, id=doc_id, doc_type=doc_type, body=doc)
        logger.info({"updated. results": str(res)})
        return res
Example #8
    def write(self, doc, index, doc_id, doc_type='list'):
        logger.debug({"About to write": str(doc)})
        res = self.es.index(index, id=doc_id, doc_type=doc_type, body=doc)
        logger.info({"wrote, results": str(res)})
        return res
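Examples #7 and #8 are thin wrappers around the client's update and index calls. A hedged usage sketch (the driver instance, index name, id and documents are placeholders; update expects the partial-document body the client requires):

# Hedged usage sketch only; index, id and document values are placeholders.
driver.write({"status": "new"}, index="my-index", doc_id="42")
driver.update({"doc": {"status": "done"}}, index="my-index", doc_id="42")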
Example #9
def train(args, train_dataset, dev_dataset, model, tokenizer):
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    t_total = len(train_dataloader
                  ) // args.gradient_accumulation_steps * args.num_train_epochs
    if args.warmup_ratio:
        ws = args.warmup_ratio * t_total
    else:
        ws = args.warmup_steps

    # prepare the optimizer and scheduler (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=ws,
                                     t_total=t_total)

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    best_f1 = 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc='Epoch',
                            disable=args.local_rank not in [-1, 0])

    for _ in train_iterator:
        epoch_loss = 0.0
        epoch_iteration = tqdm(train_dataloader, desc='Iter')
        for step, batch in enumerate(epoch_iteration):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)

            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'segment_ids': batch[2]
            }
            y_true = batch[3].view(-1, 1).float()

            logits = model(**inputs)
            # print(logits.type())
            # print(y_true.type())
            loss_func = BCEWithLogitsLoss()
            loss = loss_func(logits, y_true)

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)

            tr_loss += loss.item()
            epoch_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if global_step % args.dev_step == 0:
                    curr_f1 = evaluate(args,
                                       dev_dataset,
                                       model,
                                       tokenizer,
                                       mode='dev')
                    logger.info(' current f1: %f', curr_f1)
                    if curr_f1 > best_f1:
                        best_f1 = curr_f1
                        logger.info(' best f1: %f', best_f1)
                        output_file = os.path.join(
                            args.output_dir,
                            str(args.max_seq_length) + '_' +
                            str(args.train_batch_size) + '_' + args.task +
                            'best_model.bin')
                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        torch.save(model_to_save, output_file)

        logger.info('Training loss current epoch: %f', epoch_loss)

    return global_step, tr_loss / global_step
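For a concrete sense of the schedule arithmetic at the top of train(): with 10,000 batches per epoch, gradient_accumulation_steps=2 and 3 epochs, t_total comes out to 15,000 optimizer steps, and warmup_ratio=0.1 gives 1,500 warmup steps (illustrative numbers only):

# Illustrative numbers only, mirroring the computation in train().
len_dataloader = 10000
gradient_accumulation_steps = 2
num_train_epochs = 3
warmup_ratio = 0.1

t_total = len_dataloader // gradient_accumulation_steps * num_train_epochs  # 15000
ws = warmup_ratio * t_total                                                 # 1500.0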
Example #10
def main():
    parser = argparse.ArgumentParser()

    # required parameters
    parser.add_argument(
        '--data_dir',
        default=None,
        type=str,
        required=True,
        help="Dir of input data. DON'T include exact file name")
    parser.add_argument(
        '--bert_model',
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument(
        '--task',
        default=None,
        type=str,
        required=True,
        help='Used as the name of saved models and the result file')
    parser.add_argument('--output_dir', default=None, type=str, required=True)

    # Other optional parameters
    parser.add_argument("--model_type", default='bert')
    parser.add_argument("--split_ratio", default=0.25, type=float)
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--warmup_ratio",
                        default=0.0,
                        type=float,
                        help="Linear warmup over warmup_ratio.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--dev_step', type=float, default=500)
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup CUDA
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    args.device = device

    # Set seed
    set_seed(args)
    # args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES['bert']
    processor = ClfProcessor()
    # label_list = processor.get_labels()
    # num_labels = len(processor.get_labels())

    # initialize tokenizer and model from the downloaded tf checkpoint
    if args.bert_model == 'bert-base-cased':
        vocab_file = weightpath.BASE_VOCAB_FILE
        config_file = weightpath.BASE_CONFIG_FILE
        weight_file = weightpath.BASE_WEIGHTS
    elif args.bert_model == 'wwm':
        vocab_file = weightpath.WWM_VOCAB_FILE
        config_file = weightpath.WWM_CONFIG_FILE
        weight_file = weightpath.WWM_WEIGHTS
    else:
        raise ValueError(
            'Currently only supports Bert Base Cased (bert-base-cased) and Whole Word Masking Cased (wwm)'
        )

    # prepare the pretrained model and tokenizer
    tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=False)
    config = BertConfig.from_pretrained(config_file)
    model = model_class.from_pretrained('bert-base-cased')

    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)

    if args.do_train:
        train_dataset = TensorDataset(
            *load_and_cache_examples(args, processor, tokenizer))
        dev_dataset = TensorDataset(
            *load_and_cache_examples(args, processor, tokenizer, dev=True))
        global_step, tr_loss = train(args, train_dataset, dev_dataset, model,
                                     tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    if args.do_eval:
        model = torch.load(
            os.path.join(
                args.output_dir,
                str(args.max_seq_length) + '_' + str(args.train_batch_size) +
                '_' + args.task + 'best_model.bin'))
        model.to(args.device)
        test_dataset = TensorDataset(*load_and_cache_examples(
            args, processor, tokenizer, evaluate=True))
        f1 = evaluate(args, test_dataset, model, tokenizer)
        logger.info(" f1: %f", f1)
Example #11
    def cluster(self,
                features,
                checkpoint=None,
                min_samples=1,
                min_cluster_size=3):
        """
        Find clusters in provided features.

        Use the HDBSCAN algorithm to cluster the topN features based upon their pairwise mutual information.
        First fill an NxN matrix with NMI feature pair values.
        NMI values may be retrieved from the MIAnalyzer's internal cache or by doing computations anew.
        The HDBSCAN model is then fit to this distance matrix, and the identified clusters are returned.

        Parameters
        ----------
        features : list
            A list of features to cluster
        checkpoint : str
            Path to plaintext file to store feature redundancy checkpoint information.
            Do not perform checkpointing if None is used.
        min_samples : int
            The min_samples parameter to use for the HDBSCAN algorithm.
            The number of samples in a neighbourhood for a point to be considered a core point.

        min_cluster_size : int
            The min_cluster_size parameter to use for the HDBSCAN algorithm.
            The minimum size of clusters; single linkage splits that contain fewer points than this will be considered points “falling out” of a cluster rather than a cluster splitting into two new clusters.

        Returns
        -------
        list
            Nested lists where each list contains the cluster's features.
            Features that do not fall into a cluster are given their own cluster (i.e. a singleton list).
        """
        # compute pairwise MI for all topN features
        X = np.empty(shape=(len(features), len(features)),
                     dtype=float)  # distance matrix
        pairs = list(combinations_with_replacement(
            features, 2))  # all possible combinations

        # if checkpointing, read NMI calculations and save to cache
        if checkpoint is not None:
            if os.path.exists(checkpoint):
                chk_fi = open(checkpoint, 'r+')
                for line in chk_fi:
                    try:
                        if line[0] == '=':
                            a, b, c = line[1:].split(',')
                            self._nmi_cache.append(
                                ((int(a), int(b)), float(c)))
                    except:
                        pass
                chk_fi.close()
            # re-open checkpoint for appending
            chk_fi = open(checkpoint, 'a+')

        if self._nmi_cache:
            # ignore unselected features in cache
            cache = [(pair, nmi) for pair, nmi in self._nmi_cache
                     if pair[0] in features and pair[1] in features]
            # add each cached nmi to the distance matrix
            for cached_pair, nmi in cache:
                # remove cached_pair from pairs
                pairs = list(
                    filter(
                        lambda pair: pair != (cached_pair[0], cached_pair[1])
                        and pair != (cached_pair[1], cached_pair[0]), pairs))
                # add cached nmi to matrix
                i, j = features.index(cached_pair[0]), features.index(
                    cached_pair[1])
                X[i][j] = 1 - nmi
                X[j][i] = 1 - nmi

        if len(pairs) > 0:
            # map pairs to nmi
            if self._pool is None:
                results = map(self._estimate_nmi, pairs)
            else:
                results = self._pool.imap(self._estimate_nmi, pairs)
                self._pool.close()

            # fill matrix with pair nmi values
            count = 0
            for pair, nmi in zip(pairs, results):

                # print progress updates
                count += 1
                if (count - 1) % max(1, int(len(pairs) * 0.05)) == 0:
                    logger.info("Progress: {}/{}".format(count, len(pairs)))

                fidx1, fidx2 = pair
                i, j = features.index(fidx1), features.index(fidx2)
                X[i][j] = 1 - nmi
                X[j][i] = 1 - nmi

                if checkpoint is not None:
                    chk_fi.write('={},{},{}\n'.format(fidx1, fidx2, nmi))
                    chk_fi.flush()

            # restart pool if multiprocessing
            if self._pool is not None:
                self._pool.join()
                self._pool.restart()

        # verify that all values are filled
        assert not np.any(np.isnan(X))

        # use HDBSCAN to cluster our data
        labels = HDBSCAN(metric='precomputed',
                         min_samples=min_samples,
                         min_cluster_size=min_cluster_size).fit_predict(X)
        logger.debug("Found {} clusters.".format(set(labels)))

        # organize the topN features into sub-lists where
        # each sub-list contains all features in a cluster
        clusters = []
        for label in range(min(labels), max(labels) + 1):
            if label >= 0:
                cluster = [
                    features[i] for i, la in enumerate(labels) if la == label
                ]
                clusters.append(cluster)
            else:
                # treat features that do not cluster (ie. noise) each as their own independent cluster
                noise = [[features[i]] for i, la in enumerate(labels)
                         if la == label]
                clusters.extend(noise)

        logger.debug("Clusters: {}".format(labels))
        return clusters, X
Example #12
    def prune(self, features, checkpoint=None, nmi_threshold=0.9, topn=100):
        """
        Reduce the feature-set to a list of top features which are non-redundant.

        Redundancy is identified by estimating the pair-wise mutual information of features.
        The algorithm will find up to a maximum of ``topn`` non-redundant features before ending.
        If the MIAnalyzer was instantiated with a ``pool``, NMI calculations will be performed in parallel.

        Parameters
        ----------
        features : list
            Array of features from which to prune redundant features.
            Features should be pre-sorted by importance with the most important feature being at index 0.
        checkpoint : str
            Path to plaintext file to store feature redundancy checkpoint information.
            Do not perform checkpointing if None is used.
        nmi_threshold : float
            Threshold value used to identify redundant features.
            Features with NMI values greater than the threshold value are pruned.
        topn : int
            Number of features to save when pruning is performed.

        Returns
        -------
        list
            Features list having variable length up to ``topn``.
        """
        # results of NMI calculations are saved in list internal to the analyzer
        # reduces the amount of computation required in any subsequent cluster calls
        self._nmi_cache, self._mi_cache = [], dict()

        self.nmi_threshold = nmi_threshold

        # feature lists
        cleaned_features = set()  # non-redundant
        pruned_features = set()  # redundant

        # if checkpointing, open file and read any previously processed features
        if checkpoint is not None:
            if os.path.exists(checkpoint):
                checkpoint_fi = open(checkpoint, 'r+')
                for line in checkpoint_fi:
                    try:
                        if line[0] == '+':
                            feature = int(line[1:].strip())
                            cleaned_features.add(feature)
                        elif line[0] == '-':
                            feature = int(line[1:].strip())
                            pruned_features.add(feature)
                        if line[0] == '=':
                            a, b, c = line[1:].split(',')
                            self._nmi_cache.append(
                                ((int(a), int(b)), float(c)))
                    except:
                        pass
                features = list(
                    filter(
                        lambda f: f not in cleaned_features and f not in
                        pruned_features, features))
                checkpoint_fi.close()

            # re-open checkpoint for appending
            checkpoint = open(checkpoint, 'a+')

        # continue to process features until either there are no features left to process
        # or the topN features have been selected
        while features and len(cleaned_features) < topn:

            # the next most important feature
            current_feature = features.pop(0)
            logger.debug("MI analysis on feature #{}".format(current_feature))

            # for all top features, measure pair-wise mutual information to check for redundancy
            feature_pairs = zip(repeat(current_feature), cleaned_features)
            if self._pool is None or len(cleaned_features) < 2:
                results = map(self._check_redundancy, feature_pairs)
            else:  # parallel, unordered
                results = self._pool.uimap(self._check_redundancy,
                                           feature_pairs)

            # break upon first occurrence of redundancy
            is_redundant = False
            for res in results:

                # unzip results
                is_redundant, feature_pair, nmi = res

                # save feature pair with nmi in cache
                self._nmi_cache.append((feature_pair, nmi))
                if checkpoint is not None:
                    checkpoint.write('={},{},{}\n'.format(
                        feature_pair[0], feature_pair[1], nmi))
                    checkpoint.flush()

                # break loop
                if is_redundant:
                    # if the analyzer is using a process pool
                    # terminate processes and restart the pool
                    if self._pool is not None:
                        self._pool.terminate()
                        self._pool.join()
                        self._pool.restart()
                    break

            # if the current feature does not appear to be redundant with any
            # other top features, add current feature to top features list
            if not is_redundant:
                cleaned_features.add(current_feature)
                logger.info("Progress: {}/{}".format(len(cleaned_features),
                                                     min(topn, len(features))))
                if checkpoint is not None:
                    checkpoint.write('+{}\n'.format(current_feature))
                    checkpoint.flush()
            else:
                pruned_features.add(current_feature)
                if checkpoint is not None:
                    checkpoint.write('-{}\n'.format(current_feature))
                    checkpoint.flush()

        if checkpoint is not None:
            checkpoint.close()

        # return both non-redundant and redundant features
        # which feature was redundant with which is however not saved
        return list(cleaned_features), list(pruned_features)
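The _check_redundancy helper itself is not shown. Given how its result tuple (is_redundant, feature_pair, nmi) is unpacked above and the _estimate_nmi call in Example #11, a hypothetical sketch might be:

    # Hypothetical sketch, based only on how the result tuple is consumed above.
    def _check_redundancy(self, feature_pair):
        nmi = self._estimate_nmi(feature_pair)  # pairwise normalized mutual information
        return nmi >= self.nmi_threshold, feature_pair, nmi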
Example #13
def _individual_measure(modeler, pool, checkpoint):
    """
    Perform information leakage analysis for each feature one-by-one.

    The resulting leakages are saved in a plain-text ascii checkpoint file,
    which can be loaded in subsequent runs to avoid re-processing features.

    Parameters
    ----------
    modeler : WebsiteFingerprintModeler
        initialized fingerprinting engine
    pool : ProcessPool
        Pool to use for multiprocessing.
    checkpoint : str
        Path to ascii file to save individual leakage checkpoint information.

    Returns
    -------
    list
        list of leakages where the index of each leakage maps to the feature number

    """
    leakage_indiv = []

    # open a checkpoint file
    if checkpoint:
        lines = None
        if os.path.exists(checkpoint):
            with open(checkpoint, 'r') as tmp_file:
                past_leaks = [float(line) for line in tmp_file]
                lines = len(past_leaks)
                leakage_indiv = past_leaks
        tmp_file = open(checkpoint, 'a+')

    # if a pool has been provided, perform computation in parallel
    # otherwise do serial computation
    if checkpoint and lines:
        features = modeler.data.features[lines:]
    else:
        features = modeler.data.features
    if pool is None:
        proc_results = map(modeler, features)
    else:
        proc_results = pool.imap(modeler, features)
        pool.close()
    size = len(modeler.data.features)  # number of features

    logger.info("Begin individual leakage measurements.")
    # measure information leakage
    # log current progress at twenty intervals
    for leakage in proc_results:
        leakage_indiv.append(leakage[0])
        if (len(leakage_indiv) - 1) % max(1, int(size * 0.05)) == 0:
            logger.info("Progress: {}/{}".format(len(leakage_indiv), size))
        if checkpoint:
            tmp_file.write('{}\n'.format(str(leakage[0])))
            tmp_file.flush()
    logger.info("Progress: Done.")
    if pool is not None:
        pool.join()
        pool.restart()
    if checkpoint:
        tmp_file.close()
    return leakage_indiv
Example #14
def main(features_path, output_path, n_procs=0, n_samples=5000, topn=100, nmi_threshold=0.9, discrete_threshold=100000):
    """
    Run the full information leakage analysis on a processed dataset.

    Parameters
    ----------
    features_path : str
        Operating system file path to the directory containing processed feature files.
    output_path : str
        Operating system file path to the directory where analysis results should be saved.
    n_procs : int
        Number of processes to use for parallelism.
        If 0 is used, auto-detect based on number of system CPUs.
    n_samples : int
        Number of samples to use when performing monte-carlo estimation when running the fingerprint modeler.
    topn : int
        Top number of features to analyze during joint analysis.
    nmi_threshold : float
        Cut-off value for determining redundant features. Should be a value between 0 and 1.
    discrete_threshold : int
        Passed through to the WebsiteFingerprintModeler as its discrete_threshold parameter.

    Returns
    -------
    float
        Combined feature leakage (in bits)
    """
    # prepare feature dataset
    logger.info("Loading dataset.")
    feature_data = WebsiteData(features_path)
    logger.info("Loaded {} sites.".format(len(feature_data.sites)))
    logger.info("Loaded {} instances.".format(len(feature_data)))

    # create process pool
    if n_procs > 1:
        pool = Pool(n_procs)
    elif n_procs == 0:
        pool = Pool(cpu_count())
    else:
        pool = None

    # directory to save results
    outdir = output_path
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    # initialize fingerprint modeler
    modeler = WebsiteFingerprintModeler(feature_data, discrete_threshold=discrete_threshold)

    # load previous leakage measurements if possible
    indiv_path = os.path.join(outdir, 'indiv.pkl')
    if os.path.exists(indiv_path):
        with open(indiv_path, "rb") as fi:
            logger.info("Loading individual leakage measures from file.")
            leakage_indiv = dill.load(fi)

    # otherwise do individual measure
    else:
        logger.info("Begin individual feature analysis.")

        # perform individual measure with checkpointing
        chk_path = os.path.join(outdir, 'indiv_checkpoint.txt')
        leakage_indiv = _individual_measure(modeler, pool, chk_path)

        # save individual leakage to file
        logger.info("Saving individual leakage to {}.".format(indiv_path))
        with open(indiv_path, "wb") as fi:
            dill.dump(leakage_indiv, fi)

    # perform combined information leakage measurements
    # initialize MI analyzer
    analyzer = MutualInformationAnalyzer(feature_data, pool=pool)

    # sort the list of features by their individual leakage
    # we will process these features in the order of their importance during MI analysis
    logger.info("Sorting features by individual leakage.")
    tuples = list(zip(feature_data.features, leakage_indiv))
    tuples = sorted(tuples, key=lambda x: (-x[1], x[0]))
    logger.debug("Top 20:\t {}".format(tuples[:20]))
    sorted_features = list(list(zip(*tuples))[0])

    # process into list of non-redundant features
    cln_path = os.path.join(outdir, 'cleaned.pkl')
    rdn_path = os.path.join(outdir, 'redundant.pkl')
    chk_path = os.path.join(outdir, 'prune_checkpoint.txt')
    if os.path.exists(cln_path):
        logger.info("Loading top non-redundant features from file.")
        with open(cln_path, 'rb') as fi:
            cleaned = dill.load(fi)
    else:
        logger.info("Begin feature pruning.")
        cleaned, pruned = analyzer.prune(features=sorted_features,
                                         nmi_threshold=nmi_threshold,
                                         topn=topn,
                                         checkpoint=chk_path)
        with open(cln_path, 'wb') as fi:
            dill.dump(cleaned, fi)
        with open(rdn_path, 'wb') as fi:
            dill.dump(pruned, fi)

    # cluster non-redundant features
    dst_path = os.path.join(outdir, 'distance_matrix.pkl')
    cst_path = os.path.join(outdir, 'clusters.pkl')
    if os.path.exists(cst_path):
        logger.info("Loading clusters from file.")
        with open(cst_path, 'rb') as fi:
            clusters = dill.load(fi)
    else:
        logger.info("Begin feature clustering.")
        clusters, distance_matrix = analyzer.cluster(cleaned, checkpoint=chk_path)
        with open(dst_path, 'wb') as fi:
            dill.dump(distance_matrix, fi)
        with open(cst_path, 'wb') as fi:
            dill.dump(clusters, fi)

    # perform joint information leakage measurement
    logger.info('Identified {} clusters.'.format(len(clusters)))
    logger.info("Begin cluster leakage measurements.")
    modeler._pool = pool    # configure modeler to use the proc pool
    leakage_joint = modeler.information_leakage(clusters=clusters,
                                                sample_size=n_samples,
                                                joint_leakage=True)[0]

    logger.info("Final leakage results: {} bits".format(leakage_joint))
    logger.info("Finished execution.")
    return leakage_joint
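A hedged invocation sketch of the pipeline (the directory paths are placeholders):

# Hedged usage sketch only; paths are placeholders.
if __name__ == '__main__':
    leakage = main('features/', 'results/',
                   n_procs=4, n_samples=5000,
                   topn=100, nmi_threshold=0.9)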
Example #15
from ElasticSearchDrive import ElasticSearchDriver
from data_utils import cfg, logger, email_notify
import time, boto, sys, json, logging, csv, io, subprocess, glob, re, os
from logging.handlers import RotatingFileHandler

handler = RotatingFileHandler(filename=str(cfg['log']['get_csv_fn']),
                              maxBytes=int(cfg['log']['maxBytes']),
                              backupCount=int(cfg['log']['backupCount']))
logger.addHandler(handler)

formatter = logging.Formatter(
    '{"location":"%(module)s:%(lineno)d:%(funcName)s","server_time":"%(asctime)s","level":"%(levelname)s","msg":%(message)s}'
)
handler.setFormatter(formatter)
# TODO: add process number
logger.info("CSV Starting...")
logger.info(cfg)


#http://docs.ceph.com/docs/master/radosgw/s3/python/
def upload(bucket_name, key_name, dreams_str):
    conn = boto.connect_s3()
    bucket = conn.get_bucket(bucket_name)
    key = bucket.new_key(key_name)
    key.set_contents_from_string(dreams_str)


def get_names(hits):
    names = []
    for h in hits:
        names.extend(list(h["_source"].keys()))