def start_rules_training(self):
        # Generate the tags for the value ranges
        gen_tags = gt(self.df)
        tags_ranges = gen_tags.set_tags()

        # Split the dataset into random partitions
        parts_gen = Part(self.df)
        partition_set = parts_gen.gen_partition_set()

        best_accuracy = 0
        best_ruleset = pd.DataFrame()

        # Hold each partition out once as the test set
        for i in range(len(partition_set)):
            test_set = partition_set[i]
            training_set = partition_set.copy()
            training_set.pop(i)

            training_df = pd.concat(training_set)

            # Fuzzify the held-out partition
            fuzzifier = FuzGen(test_set)
            test_df = fuzzifier.fuzzify_data(tags_ranges)

            # Learn rules from the remaining partitions
            rules_df = self.learn_rules(training_df, tags_ranges)

            classifier = Classifier(test_df, rules_df)
            classifier.classify_dataset()

            # Accuracy is the ratio of true positives to the size of the test set
            TP_value = classifier.verify_classification()
            accuracy = TP_value / len(test_df)

            print(accuracy)

            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_ruleset = rules_df

        return best_ruleset
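The loop above is plain hold-one-out cross-validation over the partition list: copy the list, pop the test fold, and concatenate the rest. A standalone sketch of just that bookkeeping with plain pandas (kfold_splits and the toy frame are illustrative, not part of the project):

import pandas as pd

def kfold_splits(df, k=5, seed=0):
    # shuffle once, then deal rows round-robin into k partitions
    shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    return [shuffled.iloc[i::k] for i in range(k)]

df = pd.DataFrame({"x": range(10), "y": range(10)})
partitions = kfold_splits(df)
for i, test_set in enumerate(partitions):
    training = partitions.copy()
    training.pop(i)                      # hold partition i out for testing
    training_df = pd.concat(training)
    assert len(training_df) + len(test_set) == len(df)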
Example #6
File: maxinet.py Project: wette/MaxiNet
    def setup(self):
        """
        Start the cluster if not yet started, assign topology parts to
        workers, and start the workers.
        """
        if not self.cluster.is_running():
            self.cluster.start()
        if not self.cluster.is_running():
            raise RuntimeError("Cluster won't start")
        self.logger.info("Clustering topology...")
        if not self.topology:
            parti = Partitioner()
            parti.loadtopo(self.origtopology)
            # Assigning shares to workers requires that the workers are
            # already started; otherwise there is no way to determine a
            # worker's workerid. Topologies are assigned to workers in
            # ascending workerid order.
            self.topology = parti.partition(self.cluster.num_workers(),
                                            self.cluster.get_worker_shares())
            self.logger.debug("Tunnels: " + str(self.topology.getTunnels()))
        subtopos = self.topology.getTopos()
        if len(subtopos) > self.cluster.num_workers():
            raise RuntimeError("Cluster does not have enough workers for the given topology")
        for workerid, subtopo in enumerate(subtopos):
            for node in subtopo.nodes():
                self.node_to_workerid[node] = workerid
                self.nodes.append(NodeWrapper(node, self.get_worker(node)))
                self.node_to_wrapper[node] = self.nodes[-1]
                if not subtopo.isSwitch(node):
                    self.hosts.append(self.nodes[-1])
                else:
                    self.switches.append(self.nodes[-1])
        self.logger.debug("Nodemapping: %s", self.node_to_workerid)
        tunnels = [[] for _ in range(len(subtopos))]
        for tunnel in self.topology.getTunnels():
            w1 = self.get_worker(tunnel[0])
            w2 = self.get_worker(tunnel[1])
            intf = self.cluster.create_tunnel(w1, w2)
            self.tunnellookup[(tunnel[0], tunnel[1])] = intf
            self.tunnellookup[(tunnel[1], tunnel[0])] = intf
            for i in range(2):
                # assumes that workerid == subtopoid
                tunnels[self.node_to_workerid[tunnel[i]]].append(
                    [intf, tunnel[i], tunnel[2]])
        for workerid, topo in enumerate(subtopos):
            worker = self.cluster.workers()[workerid]
            worker.set_switch(self.switch)
            if self.controller:
                worker.start(topo=topo, tunnels=tunnels[workerid],
                             controller=self.controller)
            else:
                worker.start(topo=topo, tunnels=tunnels[workerid])
        if config.runWith1500MTU:
            for topo in subtopos:
                for host in topo.nodes():
                    self.setMTU(host, 1450)
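The tunnels list-of-lists above groups tunnel endpoints by the worker id that owns each node. The same bookkeeping in a standalone form, using collections.defaultdict instead of preallocated lists (the values are illustrative, not MaxiNet code):

from collections import defaultdict

node_to_workerid = {"h1": 0, "s1": 0, "h2": 1}   # hypothetical mapping
tunnel_endpoints = [("tun0", "h1", {}), ("tun0", "h2", {})]

tunnels_by_worker = defaultdict(list)
for intf, node, opts in tunnel_endpoints:
    tunnels_by_worker[node_to_workerid[node]].append([intf, node, opts])

print(dict(tunnels_by_worker))
# {0: [['tun0', 'h1', {}]], 1: [['tun0', 'h2', {}]]}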
Example #7
def exp1(name, step):
    exp1_info = {}
    gzip_fname = utils.get_gzip_fname(name)
    s = network.Network.from_combined(name, gzip_fname)
    graph = s.graph
    p = Partitioner(s.graph)
    cd_s = time.time()
    cd_partition = p.community_detection()
    exp1_info['community_detection'] = duration(cd_s)
    parts = len(cd_partition)
    exp1_info['parts'] = parts
    metis_s = time.time()
    metis_partition = p.metis_partition(parts)
    exp1_info['metis'] = duration(metis_s)
    random_s = time.time()
    rnd_partition = p.random_partition(parts)
    exp1_info['rnd'] = duration(random_s)
    graph_info = GraphInfo(graph)
    partitions = [cd_partition, metis_partition, rnd_partition]
    exp1_impl(name, graph_info, p, partitions, step)
    return exp1_info
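duration is not defined in this example; judging from how it is called, it presumably returns the wall-clock seconds elapsed since a time.time() timestamp. A minimal stand-in under that assumption:

import time

def duration(start):
    # elapsed wall-clock seconds since `start` (a time.time() value)
    return time.time() - start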
Example #8
class Aggregator(object):
    """
        Aggregate events of multiple different SQS messages into S3 key lists.
    """
    def __init__(self, context, partition_set):
        self.__aggregation_sets = partition_set
        self.__aggregation_sets[c.KEY_TABLES] = {}
        self.__partitioner = Partitioner(context[c.KEY_PARTITIONS],
                                         context[c.KEY_SEPERATOR_PARTITION])
        self.__context = context
        self.__info = {}
        self.__info[c.INFO_TOTAL_BYTES] = 0
        self.__info[c.INFO_TOTAL_ROWS] = 0
        self.__info[c.INFO_TOTAL_MESSAGES] = 0
        self.__info[c.INFO_EVENTS] = {}
        self.__logger = logging.getLogger()
        self.__logger.setLevel(logging.ERROR)

    @property
    def bytes_uncompressed(self):
        return self.__info[c.INFO_TOTAL_BYTES]

    @property
    def rows(self):
        return self.__info[c.INFO_TOTAL_ROWS]

    @property
    def messages(self):
        return self.__info[c.INFO_TOTAL_MESSAGES]

    @property
    def events(self):
        return self.__info[c.INFO_EVENTS]

    @property
    def info(self):
        return self.__info

    def append_default_metrics_and_partition(self, messages):
        length = len(messages)
        util.debug_print(("Processing {} messages.").format(length))
        self.increment(self.__info, c.INFO_TOTAL_MESSAGES, length)
        for x in range(0, length):
            message = messages[x]
            self.process_message(message)

    def process_message(self, message):
        compression_mode = CompressionClassFactory.instance(
            message[c.SQS_PARAM_MESSAGE_ATTRIBUTES][
                c.SQS_PARAM_COMPRESSION_TYPE]['StringValue'])
        body = compression_mode.extract_message_body(message)
        attempts = int(message['Attributes']['ApproximateReceiveCount'])
        sensitivity_type = SensitivityClassFactory.instance(
            message[c.SQS_PARAM_MESSAGE_ATTRIBUTES][
                c.SQS_PARAM_SENSITIVITY_TYPE]['StringValue'])
        payload_type = PayloadClassFactory.instance(
            self.__context, message[c.SQS_PARAM_MESSAGE_ATTRIBUTES][
                c.SQS_PARAM_PAYLOAD_TYPE]['StringValue'], compression_mode,
            sensitivity_type)

        msg_token = "{}{}{}".format(message['MessageId'],
                                    self.__context[c.KEY_SEPERATOR_CSV],
                                    message['ReceiptHandle'])
        if attempts > self.__context[c.KEY_MAX_MESSAGE_RETRY]:
            self.__logger.error(
                "The message with message Id {} has been processed {} times.".
                format(msg_token, attempts))
        self.increment(self.__info, c.INFO_TOTAL_BYTES, len(body))

        payload_type.to_partitions(msg_token, body, self.partition,
                                   sensitivity_type,
                                   self.__partitioner.partitions)

    def partition(self, token, row, sensitivity_type):
        # schema hash: the sorted column list identifies the row schema
        columns = sorted(row.keys())
        schema_hash = hash(str(columns))
        event_name = row[metric_schema.EVENT.id]
        uuid_key = "{}{}{}".format(row[metric_schema.UUID.id],
                                   row[metric_schema.EVENT.id],
                                   row[metric_schema.SERVER_TIMESTAMP.id])
        # create the key here, as the partitioner may remove attributes when an attribute is used as a partition
        tablename, partition = self.__partitioner.extract(
            schema_hash, row, sensitivity_type)
        columns, row = self.order_and_map_to_long_name(row)

        self.increment_detailed_cloudwatch_event_information(event_name)

        if partition is None:
            self.__logger.error("Dropping metric\n{}".format(row))
            return

        if partition not in self.__aggregation_sets:
            # need to use an immutable object, as required by fastparquet for hashing
            self.__aggregation_sets[partition] = dict({})

        if tablename not in self.__aggregation_sets[c.KEY_TABLES]:
            self.__aggregation_sets[c.KEY_TABLES][tablename] = tablename

        partition_dict = self.__aggregation_sets[partition]
        if schema_hash not in partition_dict:
            partition_dict[schema_hash] = {}
            partition_dict[schema_hash][c.KEY_SET] = {}
            partition_dict[schema_hash][c.KEY_SET_COLUMNS] = columns
        partition_dict[schema_hash][c.KEY_SET][uuid_key] = row

        self.register_processed_message(partition_dict[schema_hash], token)

    def increment_detailed_cloudwatch_event_information(self, event_name):
        if self.__context.get(c.KEY_WRITE_DETAILED_CLOUDWATCH_EVENTS, False):
            self.increment(self.events, event_name, 1)

    def register_processed_message(self, schema_dict, msg_token):
        #track which messages have been processed
        if c.KEY_MSG_IDS not in schema_dict:
            schema_dict[c.KEY_MSG_IDS], schema_dict[
                c.KEY_APPENDER] = self.get_new_list_append_handler()

        if msg_token not in schema_dict[c.KEY_MSG_IDS]:
            schema_dict[c.KEY_APPENDER](msg_token)

    def get_new_list_append_handler(self):
        items = []
        return items, items.append

    def increment(self, target, key, value):
        # start missing keys at zero so the first increment is not doubled
        if key not in target:
            target[key] = 0
        target[key] += value

    def order_and_map_to_long_name(self, row):
        orderer = Order()
        ordered_columns = orderer.order_columns(row)
        ordered_dict = OrderedDict()
        ordered_columns_long_name = []
        for field in ordered_columns:
            if field not in row:
                continue
            value = row[field]
            if field in metric_schema.DICTIONARY and field in row:
                name = metric_schema.DICTIONARY[field].long_name
                ordered_dict[name] = value
                ordered_columns_long_name.append(name)
            else:
                ordered_dict[field] = value
                ordered_columns_long_name.append(field)

        return ordered_columns_long_name, ordered_dict
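The increment helper and the list/append handler above are hand-rolled counters. For reference, the standard-library equivalent of the counting part is collections.Counter, where missing keys start at zero (a side-by-side sketch, not part of this class):

from collections import Counter

info = Counter()
info["total_messages"] += 3   # no initialization needed
info["total_bytes"] += 1024
print(info["total_messages"], info["total_rows"])  # -> 3 0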
Example #9
    def start_rules_training(self):

        # Generate the tags for the value ranges
        gen_tags = gt(self.df)
        tags_ranges = gen_tags.set_tags()

        # Split the dataset into 5 random partitions
        parts_gen = Part(self.df)
        partition_set = parts_gen.gen_partition_set()

        # Initialize the best ruleset from the full dataset
        best_ruleset = self.get_initial_rules(self.df, tags_ranges)

        # Train the rules with each partition held out once as the test set
        for i in range(len(partition_set)):

            # Select the partition for the test set, using the loop index
            test_set = partition_set[i]

            # Select the training partitions by removing the test partition
            # from a copy of the partition list
            training_set = partition_set.copy()
            training_set.pop(i)  # Remove the test partition from training_set

            # Fuzzify the data from the test set
            fuzzifier = FuzGen(test_set)
            test_df = fuzzifier.fuzzify_data(tags_ranges)
            '''
            Classify each training partition against the current ruleset and,
            in each iteration, accumulate the matched rules onto the previous
            ruleset. This makes it possible to single out the best rules: the
            ones that were matched most often.
            '''
            for training_df in training_set:
                # Fuzzify the training partition
                fuzzifier = FuzGen(training_df)
                fuzzy_df = fuzzifier.fuzzify_data(tags_ranges)

                # Apply the current ruleset to the training partition
                classifier = Classifier(fuzzy_df, best_ruleset)
                classifier.classify_dataset()

                # Check the classification results: matched rules and true positives
                TP_value, matched_rules = classifier.verify_classification()

                # Concatenate the matched rules onto the current best ruleset
                best_ruleset = pd.concat([best_ruleset, matched_rules])
            '''
            Once the matched rules have been accumulated, test the ruleset on
            the test partition. Before that, filter the ruleset so that only
            one rule is kept per antecedent set, based on the match counts
            obtained during training.
            '''

            # Filter the best rules, removing repeated antecedents
            best_ruleset = self.reduce_rules(best_ruleset, tags_ranges)

            # Classify the test set with the ruleset obtained from training
            classifier = Classifier(test_df, best_ruleset)
            classifier.classify_dataset()

            # Check the classification results
            TP_value, matched_rules = classifier.verify_classification()

            # Accuracy is the ratio of true positives to the size of the test set
            accuracy = TP_value / len(test_df)

            print(f"Test {i} accuracy: {accuracy}")

        print(f"Length of minimal ruleset: {len(best_ruleset)}")

        return best_ruleset
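reduce_rules is not shown here; a hypothetical sketch of a "keep one rule per antecedent set, preferring the most-matched" reduction with plain pandas (the column names are made up for illustration):

import pandas as pd

rules = pd.DataFrame({
    "antecedents": ["A&B", "A&B", "A&B", "C"],
    "consequent":  ["x",   "y",   "y",   "x"],
})

# duplicates represent repeated matches; count them per exact rule,
# then keep the single most-matched rule for each antecedent set
counts = (rules.groupby(["antecedents", "consequent"])
               .size().reset_index(name="matches"))
reduced = counts.loc[counts.groupby("antecedents")["matches"].idxmax()]
print(reduced)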
Example #10
File: disk.py Project: k0da/kiwi-1
class Disk(DeviceProvider):
    """
        implement storage disk and partition table setup
    """
    def __init__(self, table_type, storage_provider):
        # bind the underlying block device providing class instance
        # to this object (e.g. loop) if present. This is done to guarantee
        # the correct destructor order when the device should be released.
        self.storage_provider = storage_provider

        self.partition_map = {}
        self.partition_id_map = {}
        self.partition_id = {}
        self.is_mapped = False

        self.partitioner = Partitioner(
            table_type, storage_provider
        )

        self.table_type = table_type

    def get_device(self):
        """
            Return the names of the partition devices. Note that the
            mapping requires an explicit map_partitions() call.
        """
        device_map = {}
        for partition_name, device_node in self.partition_map.iteritems():
            device_map[partition_name] = MappedDevice(
                device=device_node, device_provider=self
            )
        return device_map

    def is_loop(self):
        """
            Returns whether this disk is based on a loop device. The
            information is taken from the storage provider: if the
            storage provider is loop based, the disk is too.
        """
        return self.storage_provider.is_loop()

    def create_root_partition(self, mbsize):
        self.partitioner.create('p.lxroot', mbsize, 't.linux')
        self.__add_to_map('root')
        self.__add_to_id_map('kiwi_RootPart')
        if 'kiwi_BootPart' not in self.partition_id_map:
            self.__add_to_id_map('kiwi_BootPart')

    def create_root_lvm_partition(self, mbsize):
        self.partitioner.create('p.lxlvm', mbsize, 't.lvm')
        self.__add_to_map('root')
        self.__add_to_id_map('kiwi_RootPart')
        self.__add_to_id_map('kiwi_RootPartVol', 'LVRoot')

    def create_root_raid_partition(self, mbsize):
        self.partitioner.create('p.lxraid', mbsize, 't.raid')
        self.__add_to_map('root')
        self.__add_to_id_map('kiwi_RootPart')
        self.__add_to_id_map('kiwi_RaidPart')
        self.__add_to_id_map('kiwi_RaidDev', '/dev/md0')

    def create_boot_partition(self, mbsize):
        self.partitioner.create('p.lxboot', mbsize, 't.linux')
        self.__add_to_map('boot')
        self.__add_to_id_map('kiwi_BootPart')

    def create_efi_csm_partition(self, mbsize):
        self.partitioner.create('p.legacy', mbsize, 't.csm')
        self.__add_to_map('efi_csm')
        self.__add_to_id_map('kiwi_BiosGrub')

    def create_efi_partition(self, mbsize):
        self.partitioner.create('p.UEFI', mbsize, 't.efi')
        self.__add_to_map('efi')
        self.__add_to_id_map('kiwi_JumpPart')

    def activate_boot_partition(self):
        partition_id = None
        if 'boot' in self.partition_id:
            partition_id = self.partition_id['boot']
        elif 'root' in self.partition_id:
            partition_id = self.partition_id['root']
        if partition_id:
            self.partitioner.set_flag(partition_id, 'f.active')

    def wipe(self):
        """
            Zap (destroy) any GPT and MBR data structures if present.
            For DASD disks, create a new VTOC table.
        """
        if 'dasd' in self.table_type:
            log.debug('Initialize DASD disk with new VTOC table')
            fdasd_input = NamedTemporaryFile()
            with open(fdasd_input.name, 'w') as vtoc:
                vtoc.write('y\n\nw\nq\n')
            bash_command = ' '.join(
                [
                    'cat', fdasd_input.name, '|',
                    'fdasd', '-f', self.storage_provider.get_device()
                ]
            )
            Command.run(
                ['bash', '-c', bash_command]
            )
        else:
            log.debug('Initialize %s disk', self.table_type)
            Command.run(
                [
                    'sgdisk', '--zap-all', self.storage_provider.get_device()
                ]
            )

    def map_partitions(self):
        if self.storage_provider.is_loop():
            Command.run(
                ['kpartx', '-s', '-a', self.storage_provider.get_device()]
            )
            self.is_mapped = True
        else:
            Command.run(
                ['partprobe', self.storage_provider.get_device()]
            )

    def get_partition_id_map(self):
        return OrderedDict(
            sorted(self.partition_id_map.items())
        )

    def __add_to_id_map(self, name, value=None):
        if not value:
            value = self.partitioner.get_id()
        self.partition_id_map[name] = value

    def __add_to_map(self, name):
        device_node = None
        partition_number = format(self.partitioner.get_id())
        if self.storage_provider.is_loop():
            device_base = os.path.basename(self.storage_provider.get_device())
            device_node = ''.join(
                ['/dev/mapper/', device_base, 'p', partition_number]
            )
        else:
            device = self.storage_provider.get_device()
            if device[-1].isdigit():
                device_node = ''.join(
                    [device, 'p', partition_number]
                )
            else:
                device_node = ''.join(
                    [device, partition_number]
                )
        if device_node:
            self.partition_map[name] = device_node
            self.partition_id[name] = partition_number

    def __del__(self):
        if self.storage_provider.is_loop() and self.is_mapped:
            log.info('Cleaning up %s instance', type(self).__name__)
            try:
                Command.run(
                    ['kpartx', '-s', '-d', self.storage_provider.get_device()]
                )
            except Exception:
                log.warning(
                    'cleanup of partition device maps failed, %s still busy',
                    self.storage_provider.get_device()
                )
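A usage sketch based only on the methods shown above. The storage_provider object and the partition sizes are hypothetical; in kiwi it would be a device provider instance (e.g. a loop device), so this is an outline rather than runnable code:

disk = Disk('gpt', storage_provider)     # storage_provider: assumed given
disk.wipe()                              # zap existing GPT/MBR structures
disk.create_boot_partition(200)          # mbsize arguments are in MB
disk.create_root_partition(4096)
disk.activate_boot_partition()
disk.map_partitions()                    # kpartx or partprobe, per provider
device_map = disk.get_device()           # {'boot': MappedDevice(...), ...}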
Example #11
from mpi4py import MPI
import numpy as np
import xarray as xr

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

if rank == 0:
    data_orig = xr.open_dataarray(filepath)
    # let's first try only one var
    data = data_orig[0, :, :, :].copy()
    shape = np.shape(data)
    nx = shape[0]
    ny = shape[1]
    nz = shape[2]
else:
    data = None                 # only the root rank holds the full field
    nx = ny = nz = None         # filled in by the broadcasts below

# making the shape parameters available everywhere
nx = comm.bcast(nx, root=0)
ny = comm.bcast(ny, root=0)
nz = comm.bcast(nz, root=0)

print(nx, ny, nz)
# setting up the partitioner
# the field dimensions need to be the real ones, i.e. without the halo points
p = Partitioner(comm, [nx, ny - 2 * 2, nz - 2 * 2], num_halo=2)

# distribute the work onto the ranks
data_work = p.scatter(data)
"""
# subset more for speedup of first tests
print(f'subset even more because very large dataset')
data = data[:,::10,:,:]
"""

# create a mask of nans
mask = ~np.isnan(data_work)  # nan values have zero weight (i.e. are False)

# gapfilling the missing values with spatiotemporal mean
print('gapfilling missing values with spatiotemporal mean')
tic = datetime.now()
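The broadcast-from-root pattern used for nx, ny, nz is standard mpi4py. A self-contained sketch (run with e.g. mpirun -n 4 python sketch.py):

from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

# only the root rank knows the value; bcast returns it on every rank
shape = (120, 96, 96) if rank == 0 else None
shape = comm.bcast(shape, root=0)
print(f"rank {rank} sees shape {shape}")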
Example #12
def main(nx, ny, nz, num_iter, num_halo=2, plot_result=False):
    """Driver for apply_diffusion that sets up fields and does timings"""

    assert 0 < nx <= 1024 * 1024, 'You have to specify a reasonable value for nx'
    assert 0 < ny <= 1024 * 1024, 'You have to specify a reasonable value for ny'
    assert 0 < nz <= 1024, 'You have to specify a reasonable value for nz'
    assert 0 < num_iter <= 1024 * 1024, 'You have to specify a reasonable value for num_iter'
    assert 0 < num_halo <= 256, 'You have to specify a reasonable number of halo points'
    alpha = 1. / 32.

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    p = Partitioner(comm, [nz, ny, nx], num_halo)

    if rank == 0:
        f = np.zeros((nz, ny + 2 * num_halo, nx + 2 * num_halo))
        # Option 1: Original stencil2d-mpi during HPC4WC course:
        # f[nz // 4:3 * nz // 4, num_halo + ny // 4:num_halo + 3 * ny // 4, num_halo + nx // 4:num_halo + 3 * nx // 4] = 1.0

        # Option 2: Similar to option 1, but positive region extended towards tile edges:
        # f[nz // 10:9 * nz // 10, num_halo + ny // 10:num_halo + 9 * ny // 10, num_halo + nx // 10:num_halo + 9 * nx // 10] = 1.0

        # Option 3: One positive region in bottom-left (0-0) corner, one positive region in top-right (ny-nx) corner
        # f[nz // 4:3 * nz // 4, num_halo:num_halo + ny // 4, num_halo:num_halo + nx // 4] = 1.0
        # f[nz // 4:3 * nz // 4, num_halo + 3 * ny // 4:-num_halo, num_halo + 3 * nx // 4:-num_halo] = 1.0

        # Option 4: Positive region line prime number fraction off-center across tile:
        f[nz // 4:3 * nz // 4, num_halo + ny // 7:num_halo + 2 * ny // 7,
          num_halo:-num_halo] = 1.0

    else:
        f = np.empty(1)
    in_field = p.scatter(f)

    out_field = np.copy(in_field)

    f = p.gather(in_field)
    if rank == 0:
        np.save('in_field', f)
        if plot_result:
            plt.ioff()
            plt.imshow(f[in_field.shape[0] // 2, :, :], origin='lower')
            plt.colorbar()
            plt.savefig('in_field.png')
            plt.close()

    # warmup caches
    apply_diffusion(in_field, out_field, alpha, num_halo, p=p)

    comm.Barrier()

    # time the actual work
    tic = time.time()
    apply_diffusion(in_field,
                    out_field,
                    alpha,
                    num_halo,
                    num_iter=num_iter,
                    p=p)
    toc = time.time()

    comm.Barrier()

    if rank == 0:
        print("Elapsed time for work = {} s".format(toc - tic))

    update_halo(out_field, num_halo, p)

    f = p.gather(out_field)
    if rank == 0:
        np.save('out_field', f)
        if plot_result:
            plt.imshow(f[out_field.shape[0] // 2, :, :], origin='lower')
            plt.colorbar()
            plt.savefig('out_field.png')
            plt.close()
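The driver allocates its fields with 2 * num_halo extra points in each horizontal dimension. A tiny standalone numpy sketch of that halo/interior layout (toy sizes, not project code):

import numpy as np

num_halo = 2
nz, ny, nx = 4, 8, 8

# storage includes halo points on every horizontal boundary
field = np.zeros((nz, ny + 2 * num_halo, nx + 2 * num_halo))

# the physical (interior) domain excludes the halo
interior = field[:, num_halo:-num_halo, num_halo:-num_halo]
assert interior.shape == (nz, ny, nx)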
Example #13
def main(nx, ny, nz, num_iter, num_halo=2, plot_result=False):
    """Driver for apply_diffusion that sets up fields and does timings"""

    assert 0 < nx <= 1024 * 1024, 'You have to specify a reasonable value for nx'
    assert 0 < ny <= 1024 * 1024, 'You have to specify a reasonable value for ny'
    assert 0 < nz <= 1024, 'You have to specify a reasonable value for nz'
    assert 0 < num_iter <= 1024 * 1024, 'You have to specify a reasonable value for num_iter'
    assert 0 < num_halo <= 256, 'You have to specify a reasonable number of halo points'
    alpha = 1. / 32.

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    p = Partitioner(comm, [nz, ny, nx], num_halo)

    if rank == 0:
        f = np.zeros((nz, ny + 2 * num_halo, nx + 2 * num_halo))
        f[nz // 4:3 * nz // 4, num_halo + ny // 4:num_halo + 3 * ny // 4,
          num_halo + nx // 4:num_halo + 3 * nx // 4] = 1.0
    else:
        f = np.empty(1)
    in_field = p.scatter(f)

    out_field = np.copy(in_field)

    f = p.gather(in_field)
    if rank == 0:
        np.save('in_field', f)
        if plot_result:
            plt.ioff()
            plt.imshow(f[in_field.shape[0] // 2, :, :], origin='lower')
            plt.colorbar()
            plt.savefig('in_field.png')
            plt.close()

    # warmup caches
    apply_diffusion(in_field, out_field, alpha, num_halo, p=p)

    comm.Barrier()

    # time the actual work
    tic = time.time()
    apply_diffusion(in_field,
                    out_field,
                    alpha,
                    num_halo,
                    num_iter=num_iter,
                    p=p)
    toc = time.time()

    comm.Barrier()

    if rank == 0:
        print("Elapsed time for work = {} s".format(toc - tic))

    update_halo(out_field, num_halo, p)

    f = p.gather(out_field)
    if rank == 0:
        np.save('out_field', f)
        if plot_result:
            plt.imshow(f[out_field.shape[0] // 2, :, :], origin='lower')
            plt.colorbar()
            plt.savefig('out_field.png')
            plt.close()
Example #15
class Scraper:
    def __init__(self, url):
        self.url = url  # the url to analyze
        self.block_li = []  # the text blocks contained in the page
        self.title = ''
        # reset the recorder
        self.recorder = Recorder()
        self.recorder.reset()

    # Extract images from around and inside the body text, taking only
    # the first one, and only images that are large enough
    def get_images(self, block):
        imgs = []

        # set the starting point of the image search
        if self.title != self.parser.soup.title:
            start = self.title
        else:
            # the title is not in the body text, so extend the image
            # search range upwards (and downwards, below)
            start = block.text_list()[0]
            while start.previous:
                start = start.previous
                if not isinstance(
                        start, NavigableString) and start.name in BLOCK_TAGS:
                    break

        # set the end point of the image search
        end = block.text_list()[-1]
        while end.next:
            end = end.next
            if not isinstance(end, NavigableString) and end.name in BLOCK_TAGS:
                break

        while start != end:
            if not isinstance(start, NavigableString) and start.name == 'img':
                imgs.append(start)
            start = start.next
        return self.filter_images(imgs)

    def filter_images(self, imgs):
        srcs = []
        images = []
        for img in imgs:
            if img.has_key('src'):
                src = img['src']
                if not src.lower().startswith('http://'):
                    src = relative2absolute(self.url, src)
                    # check the image size; skip images that are too small
                try:
                    im = urlopen(src).read()
                    if len(im) > MIN_IMG_SIZE:
                        srcs.append(src)
                        #img['src'] = src
                        images.append(img)
                except IOError:
                    pass
        return (srcs, images)

    # If an image appears in the block, insert the image and the ns inside the image's <p>
    def insert_images(self, block, images):
        start = self.title
        end = block.text_list()[-1]
        behind_img = False
        #block.print_ns()

        i = 0  # index of the current text node within the block
        while start != end:
            if not isinstance(start, NavigableString):
                if start.name == 'img' and start in images:
                    src = start['src']
                    if not src.lower().startswith('http://'):
                        start['src'] = relative2absolute(self.url, src)
                    #print i,":",str(start),"[]"
                    block.insert(i, start)
                    #block.print_ns()
                    i += 1
                    behind_img = True
                elif start.name == 'br':
                    #print i,":",str(start),"[]"
                    # insert the line break
                    block.insert(i, start)
                    #block.print_ns()
                    i += 1
                elif start.name in BLOCK_TAGS:
                    behind_img = False
            # NavigableString
            elif start.string.strip():
                # already inside the body block
                if start in block.text_list():
                    #print i,":",start.string
                    i += 1
                    behind_img = False
                # not in the body block: sibling text following an image
                elif behind_img:
                    #print i,":",start.string,"[]"
                    block.insert(i, start)
                    #block.print_ns()
                    i += 1
            start = start.next

        return block

    # Run the extraction pipeline and return the extracted body text
    def get_content(self):
        # 1. extract the basic text blocks
        self.parser = Parser(self.url)
        ns_list = self.parser.ns()
        self.title = self.parser.get_title()
        # 2. split the text sequence into blocks
        self.partitioner = Partitioner()
        blocks = self.partitioner.partition(ns_list)

        # 3. select the body block; analysis info comes as a by-product
        self.judge = Judge(self.title.string, ns_list)
        res = self.judge.select(blocks, ns_list)

        flag = res['flag']
        cblock = res['block']
        confidence = res['confidence']
        detail = res['detail']
        #if flag:
        content = cblock.to_str()
        (srcs, images) = self.get_images(cblock)
        cblock = self.insert_images(cblock, images)
        content_with_format = cblock.to_str_with_format()
        #else:
        #    content = ""
        #    content_with_format = ""
        #    srcs = None
        return (flag, self.title.string.strip(), content, content_with_format,
                srcs, confidence, detail)
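A usage sketch for the class above (the URL is hypothetical, and Parser, Partitioner, Judge and Recorder must be importable from the same project):

scraper = Scraper('http://example.com/article.html')
(flag, title, content, content_with_format,
 srcs, confidence, detail) = scraper.get_content()
if flag:
    print(title)
    print('images:', srcs, 'confidence:', confidence)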