def get_candidate_documents(model,
                            batch_size,
                            dev_set=None,
                            k_best=5,
                            max_doc_len=-1,
                            num_padding_tokens=0):
    """Get list of candidate documents for each pattern, from which best match will be selected"""
    num_patterns = model.total_num_patterns
    selected_documents = [[] for i in range(num_patterns)]

    for batch in chunked(dev_set, batch_size):
        batch_obj = Batch([x[0][0] for x in batch], model.embeddings,
                          model.to_cuda, 0, max_doc_len)
        _, scores = model.forward(batch_obj, 1, 0)
        scores = scores.data

        # Add a small random epsilon so that no two documents tie on exactly the same score
        epsilon = (torch.rand(scores.size()) - 0.5) / 10000

        scores += model.to_cuda(epsilon)

        for i in range(num_patterns):
            for j in range(batch_obj.size()):
                tup = (scores[j, i], (batch[j][0][1], batch[j][0], batch[j][1]))
                heapq.heappush(selected_documents[i], tup)
                if len(selected_documents[i]) == (k_best + 1):
                    heapq.heappop(selected_documents[i])

    return selected_documents
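
The function above keeps only the k_best highest-scoring documents per pattern by pushing onto a min-heap and popping the minimum whenever the heap grows past k_best. A minimal, self-contained sketch of that bounded-heap idiom (the names here are illustrative, not from the project):

import heapq

def top_k(scored_items, k):
    """Keep the k items with the highest score using a bounded min-heap."""
    heap = []  # min-heap: the smallest score sits at heap[0]
    for score, item in scored_items:
        heapq.heappush(heap, (score, item))
        if len(heap) > k:
            heapq.heappop(heap)  # evict the current minimum
    return sorted(heap, reverse=True)  # highest score first

# top_k([(0.2, "a"), (0.9, "b"), (0.5, "c")], k=2) -> [(0.9, 'b'), (0.5, 'c')]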
    def test_can_use_iterator_node(self):
        iterable = chunked(StringIO(data_source['mary']), chunksize=3)

        class D(BaseModel, self.Settings):
            stream = Feature(IteratorNode, store=True)
            words = Feature(Tokenizer, needs=stream, store=False)
            count = JSONFeature(WordCount, needs=words, store=True)

        _id = D.process(stream=iterable)
        doc = D(_id)
        self.assertEqual(data_source['mary'], doc.stream.read())
    def _process(self, data):
        try:
            flo = StringIO(data_source[data])
        except KeyError as e:
            if isinstance(data, str):
                flo = StringIO(data)
            else:
                raise e

        for chunk in chunked(flo, chunksize=self._chunksize):
            yield chunk
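
Here chunked() behaves like a reader that yields fixed-size pieces of a file-like object. A minimal sketch under that assumption (the real helper's signature and defaults may differ):

def chunked(flo, chunksize=4096):
    """Yield successive reads of at most chunksize from a file-like object."""
    while True:
        chunk = flo.read(chunksize)
        if not chunk:
            break
        yield chunk

# from io import StringIO
# list(chunked(StringIO("abcdefg"), chunksize=3))  # -> ['abc', 'def', 'g']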
Example #4
    def __iter__(self, flo):
        metadata, _ = NumpyMetaData.unpack(flo)
        example_size = metadata.totalsize
        chunk_size = int(example_size * self.n_examples)
        count = 0

        for chunk in chunked(flo, chunk_size):
            n_examples = len(chunk) // example_size
            yield _np_from_buffer(chunk, (n_examples, ) + metadata.shape,
                                  metadata.dtype)
            count += 1

        if count == 0:
            yield _np_from_buffer(buffer(''), (0, ) + metadata.shape,
                                  metadata.dtype)
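
Example #4 rebuilds fixed-size binary examples into arrays. A hedged sketch of what a helper like _np_from_buffer could do (an assumption, not the library's actual implementation), using numpy.frombuffer:

import numpy as np

def np_from_buffer(buf, shape, dtype):
    """Interpret raw bytes as an ndarray with the given shape and dtype."""
    return np.frombuffer(buf, dtype=dtype).reshape(shape)

# 24 bytes of float32 interpreted as a (2, 3) array:
# np_from_buffer(np.arange(6, dtype=np.float32).tobytes(), (2, 3), np.float32)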
Example #5
def create_placement_groups(job_id, node_list, partition_name):
    PLACEMENT_MAX_CNT = 150
    groups = {
        f"{cfg.slurm_cluster_name}-{partition_name}-{job_id}-{i}": nodes
        for i, nodes in enumerate(chunked(node_list, n=PLACEMENT_MAX_CNT))
    }
    reverse_groups = {node: group for group, nodes in groups.items() for node in nodes}

    model = next(iter(node_list))
    region = lkp.node_region(model)

    requests = {
        group: create_placement_request(group, region)
        for group, incl_nodes in groups.items()
    }
    done, failed = batch_execute(requests)
    if failed:
        reqs = [f"{e}" for _, e in failed.values()]
        log.fatal("failed to create placement policies: {}".format("\n".join(reqs)))
    log.info(f"created {len(done)} placement groups ({','.join(done.keys())})")
    return reverse_groups
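
In create_placement_groups above, chunked(node_list, n=PLACEMENT_MAX_CNT) splits an iterable into lists of at most n items, in the style of more_itertools.chunked. A minimal sketch under that assumption:

from itertools import islice

def chunked(iterable, n):
    """Yield successive lists of at most n items from iterable."""
    it = iter(iterable)
    while True:
        batch = list(islice(it, n))
        if not batch:
            return
        yield batch

# list(chunked(["n1", "n2", "n3", "n4", "n5"], n=2))
# -> [['n1', 'n2'], ['n3', 'n4'], ['n5']]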
Example #6
def interpret_documents(model, batch_size, dev_data, dev_text, ofile,
                        max_doc_len):
    j = 0
    with open(ofile, "w") as ofh:
        for batch_idx, chunk in enumerate(chunked(dev_data, batch_size)):
            batch = Batch([x for x, y in chunk], model.embeddings,
                          model.to_cuda)
            res, scores = model.forward(batch, 1)
            print("ss", scores.size())

            output = softmax(res).data

            predictions = [int(x) for x in argmax(output)]

            num_patts = scores.size()[1]

            diffs = np.zeros((num_patts, batch.size()))

            # Traversing all patterns.
            for i in range(num_patts):
                # Copying scores data to numpy array.
                scores_data = np.array(scores.data.numpy(), copy=True)

                # Zeroing out pattern number i across batch
                scores_data[:, i] = 0

                # Running mlp.forward() with zeroed out scores.
                forwarded = softmax(
                    model.mlp.forward(Variable(
                        torch.FloatTensor(scores_data)))).data.numpy()

                # Computing difference between forwarded scores and original scores.
                for k in range(batch.size()):
                    # diffs[i,k] = output[k, predictions[k]] - \
                    #              output[k, 1 - predictions[k]] - \
                    #              forwarded[k, predictions[k]] + \
                    #              forwarded[k, 1 - predictions[k]]

                    diffs[i, k] = (forwarded[k, 1 - predictions[k]] -
                                   output[k, 1 - predictions[k]])

            # Now, traversing documents in batch
            for i in range(batch.size()):
                # Document string
                text_str = str(" ".join(dev_text[j]).encode('utf-8'))[2:-1]

                # Top ten patterns with largest differences between leave-one-out score and original score.
                top_ten_deltas = sorted(enumerate(diffs[:, i]),
                                        key=lambda x: x[1],
                                        reverse=True)[:10]
                top_ten_neg_deltas = sorted(enumerate(diffs[:, i]),
                                            key=lambda x: x[1])[:10]
                # Top ten patterns with largest overall score (regardless of classification)
                top_ten_scores = sorted(enumerate(scores.data.numpy()[i, :]),
                                        key=lambda x: x[1],
                                        reverse=True)[:10]

                top_scoring_spans = get_top_scoring_spans_for_doc(
                    model, dev_data[j], max_doc_len)

                # Printing out everything.
                ofh.write(
                    "{}   {}   {} All in, predicted: {:>2,.3f}   All in, not-predicted: {:>2,.3f}    Leave one out: +res: {} -res: {} Patt scores: {}\n"
                    .format(
                        dev_data[j][1], predictions[i], text_str,
                        output[i, predictions[i]],
                        output[i, 1 - predictions[i]], " ".join([
                            "{}:{:>2,.3f}".format(i, x)
                            for (i, x) in top_ten_deltas
                        ]), " ".join([
                            "{}:{:>2,.3f}".format(i, x)
                            for (i, x) in top_ten_neg_deltas
                        ]), " ".join([
                            "{}:{:>2,.3f}".format(i, x)
                            for (i, x) in top_ten_scores
                        ])))
                ofh.write("Top ten deltas:\n")
                for l in top_ten_deltas:
                    s = top_scoring_spans[l[0]].display(dev_text[j])
                    ofh.write(
                        str(int(l[0])) + " " + str(s.encode('utf-8'))[2:-1] +
                        "\n")

                ofh.write("Top ten negative deltas:\n")
                for l in top_ten_neg_deltas:
                    s = top_scoring_spans[l[0]].display(dev_text[j])
                    ofh.write(
                        str(int(l[0])) + " " + str(s.encode('utf-8'))[2:-1] +
                        "\n")
                j += 1
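
The leave-one-out loop above zeroes one pattern's score column at a time, re-runs the output layer, and measures how much the probability of the non-predicted class shifts. A toy numpy sketch of that idea for a two-class model (the linear layer and softmax here are stand-ins, not the project's MLP):

import numpy as np

def softmax(z):
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def leave_one_out_deltas(scores, W, b):
    """scores: (batch, num_patterns); W: (num_patterns, 2); b: (2,)."""
    base = softmax(scores @ W + b)              # original class probabilities
    preds = base.argmax(axis=1)
    deltas = np.zeros((scores.shape[1], scores.shape[0]))
    for i in range(scores.shape[1]):            # ablate one pattern at a time
        ablated = scores.copy()
        ablated[:, i] = 0
        probs = softmax(ablated @ W + b)
        for k in range(scores.shape[0]):        # shift toward the non-predicted class
            deltas[i, k] = probs[k, 1 - preds[k]] - base[k, 1 - preds[k]]
    return deltas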
Example #7
    def _generator(self, stream, content_length):
        if not content_length:
            raise ValueError('content_length should be greater than zero')
        for chunk in chunked(stream, chunksize=self._chunksize):
            yield StringWithTotalLength(chunk, content_length)
Example #8
    def __iter__(self, flo):
        self._total_length = struct.unpack('I', flo.read(4))[0]
        for chunk in chunked(flo, self._chunksize):
            yield StringWithTotalLength(chunk, self._total_length)
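
Example #8 reads a 4-byte unsigned-int length prefix with struct.unpack before streaming the payload. A small round-trip sketch of that framing with the standard struct module (using bytes I/O):

import struct
from io import BytesIO

payload = b"hello world"
flo = BytesIO(struct.pack('I', len(payload)) + payload)  # 4-byte length prefix, then data

total_length = struct.unpack('I', flo.read(4))[0]
body = flo.read(total_length)
assert total_length == 11 and body == payload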
Example #9
    def __iter__(self, flo):
        decompressor = bz2.BZ2Decompressor()
        for chunk in chunked(flo):
            yield decompressor.decompress(chunk)
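
Example #9 decompresses a bz2 stream incrementally as chunks arrive. The same pattern with the standard bz2 module, as a self-contained sketch:

import bz2
from io import BytesIO

data = b"a fairly repetitive payload " * 100
flo = BytesIO(bz2.compress(data))

decompressor = bz2.BZ2Decompressor()
out = b""
while True:
    chunk = flo.read(64)                  # read the compressed stream in small pieces
    if not chunk:
        break
    out += decompressor.decompress(chunk)
assert out == data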
Example #10
    def __iter__(self, flo):
        for chunk in chunked(flo):
            yield chunk
Example #11
    def _process(self, data):
        flo = StringIO(data_source['mary'])
        for chunk in chunked(flo, chunksize=3):
            yield chunk
Example #12
    def _process(self, data):
        flo = StringIO(data)
        for chunk in chunked(flo, chunksize=self._chunksize):
            yield chunk
Example #13
def resume_nodes(nodelist, placement_groups=None, exclusive=False):
    """resume nodes in nodelist"""

    def ident_key(n):
        # ident here will refer to the combination of partition and group
        return "-".join(
            (
                lkp.node_partition_name(n),
                lkp.node_group_name(n),
            )
        )

    # nodelist may be an already-expanded list or a string that still needs expanding
    if isinstance(nodelist, str):
        nodelist = expand_nodelist(nodelist)

    nodes = sorted(nodelist, key=ident_key)
    if len(nodes) == 0:
        return
    # split each partition/group into chunks of at most BULK_INSERT_LIMIT nodes;
    # key each chunk by ident plus chunk index so multiple chunks don't overwrite each other
    grouped_nodes = {
        f"{ident}:{i}": chunk
        for ident, group in groupby(nodes, ident_key)
        for i, chunk in enumerate(chunked(group, n=BULK_INSERT_LIMIT))
    }
    log.debug(f"grouped_nodes: {grouped_nodes}")

    # make all bulkInsert requests and execute with batch
    inserts = {
        ident: create_instances_request(nodes, placement_groups, exclusive)
        for ident, nodes in grouped_nodes.items()
    }
    started, failed = batch_execute(inserts)
    if failed:
        failed_reqs = [f"{e}" for _, (_, e) in failed.items()]
        log.error("bulkInsert API failures: {}".format("\n".join(failed_reqs)))
        for ident, (_, exc) in failed.items():
            down_nodes(grouped_nodes[ident], exc._get_reason())

    # wait for all bulkInserts to complete and log any errors
    bulk_operations = [wait_for_operation(op) for op in started.values()]
    for bulk_op in bulk_operations:
        if "error" in bulk_op:
            error = bulk_op["error"]["errors"][0]
            log.error(
                f"bulkInsert operation error: {error['code']} operationName:'{bulk_op['name']}'"
            )

    # Fetch all insert operations from all bulkInserts. Group by error code and log
    successful_inserts, failed_inserts = separate(
        lambda op: "error" in op, get_insert_operations(bulk_operations)
    )
    # Apparently multiple errors are possible... so join with +.
    # grouped_inserts could be made into a dict, but it's not really necessary. Save some memory.
    grouped_inserts = util.groupby_unsorted(
        failed_inserts,
        lambda op: "+".join(err["code"] for err in op["error"]["errors"]),
    )
    for code, failed_ops in grouped_inserts:
        # at least one insert failure
        failed_nodes = [parse_self_link(op["targetLink"]).instance for op in failed_ops]
        hostlist = util.to_hostlist(failed_nodes)
        count = len(failed_nodes)
        log.error(
            f"{count} instances failed to start due to insert operation error: {code} ({hostlist})"
        )
        down_nodes(hostlist, code)
        if log.isEnabledFor(logging.DEBUG):
            msg = "\n".join(
                err["message"] for err in next(failed_ops)["error"]["errors"]
            )
            log.debug(f"{code} message from first node: {msg}")

    # If reconfigure enabled, create subscriptions for successfully started instances
    if lkp.cfg.enable_reconfigure and len(successful_inserts):
        started_nodes = [
            parse_self_link(op["targetLink"]).instance for op in successful_inserts
        ]
        count = len(started_nodes)
        hostlist = util.to_hostlist(started_nodes)
        log.info("create {} subscriptions ({})".format(count, hostlist))
        execute_with_futures(subscription_create, started_nodes)
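
resume_nodes splits insert operations into successes and failures with separate(pred, iterable). Assuming it behaves like more_itertools.partition (items failing the predicate first, items passing it second), a minimal sketch:

def separate(pred, iterable):
    """Return (items where pred is False, items where pred is True)."""
    false_items, true_items = [], []
    for item in iterable:
        (true_items if pred(item) else false_items).append(item)
    return false_items, true_items

ops = [{"name": "op-1"},
       {"name": "op-2", "error": {"errors": [{"code": "QUOTA_EXCEEDED"}]}}]
successful, failed = separate(lambda op: "error" in op, ops)
# successful == [{'name': 'op-1'}]; failed holds the operation that reported an error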