def start_rules_training(self):
    """Cross-validate rule learning over the partitions and keep the best rules.

    Each partition returned by ``Part(...).gen_partition_set()`` is used once
    as the test set while the remaining partitions are concatenated into the
    training set.  Rules are learned on the training data, the fuzzified
    test data is classified with them, and the rules set with the highest
    accuracy is kept.

    Returns:
        The rules DataFrame of the best-scoring fold, or an empty
        ``pd.DataFrame`` when no fold scores above zero.
    """
    tags_ranges = gt(self.df).set_tags()
    partition_set = Part(self.df).gen_partition_set()

    best_accuraccy = 0
    best_rulesset = pd.DataFrame()

    # Visit every partition: the former ``range(0, len(partition_set) - 1)``
    # bound never used the last partition as a test set.
    for i, test_set in enumerate(partition_set):
        training_set = [
            part for j, part in enumerate(partition_set) if j != i
        ]
        if not training_set:
            # A single partition leaves nothing to train on, and
            # pd.concat() raises on an empty sequence.
            continue
        training_df = pd.concat(training_set)

        test_df = FuzGen(test_set).fuzzify_data(tags_ranges)
        rules_df = self.learn_rules(training_df, tags_ranges)

        classifier = Classifier(test_df, rules_df)
        classifier.classify_dataset()
        TP_value = classifier.verify_classification()

        # Accuracy is the value reported by verify_classification() divided
        # by the number of rows in the fuzzified test set.
        accuraccy = TP_value / len(test_df)
        print(accuraccy)

        if accuraccy > best_accuraccy:
            best_accuraccy = accuraccy
            best_rulesset = rules_df

    return best_rulesset
def get_content(self):
    """Run the extraction pipeline for ``self.url`` and return the result.

    The parser, partitioner and judge created along the way are stored on
    the instance (``self.parser``, ``self.partitioner``, ``self.judge``),
    as is the title returned by the parser (``self.title``).

    Returns:
        tuple: ``(flag, title, content, content_with_format, srcs,
        confidence, detail)`` where ``flag``, ``confidence`` and ``detail``
        come straight from ``Judge.select``.
    """
    # 1. Extract the basic text strings and the title
    self.parser = Parser(self.url)
    text_nodes = self.parser.ns()
    self.title = self.parser.get_title()

    # 2. Partition the text strings into blocks
    self.partitioner = Partitioner()
    candidate_blocks = self.partitioner.partition(text_nodes)

    # 3. Select the body block; analysis information is a by-product
    self.judge = Judge(self.title.string, text_nodes)
    verdict = self.judge.select(candidate_blocks, text_nodes)
    flag, body_block, confidence, detail = (
        verdict[key] for key in ('flag', 'block', 'confidence', 'detail')
    )

    # NOTE(review): the block is processed regardless of ``flag``; a
    # disabled branch used to return empty content and ``srcs = None``
    # when ``flag`` was false.
    content = body_block.to_str()
    srcs, images = self.get_images(body_block)
    body_block = self.insert_images(body_block, images)
    content_with_format = body_block.to_str_with_format()

    title_text = self.title.string.strip()
    return (flag, title_text, content, content_with_format, srcs,
            confidence, detail)
def __init__(self, context, partition_set):
    """Initialise the aggregation state.

    Args:
        context: mapping read for ``c.KEY_PARTITIONS`` and
            ``c.KEY_SEPERATOR_PARTITION`` (passed to ``Partitioner``) and
            kept on the instance.
        partition_set: mapping that receives the aggregated data.  It is
            stored as-is (not copied) and gets an empty ``c.KEY_TABLES``
            entry.
    """
    self.__aggregation_sets = partition_set
    partition_set[c.KEY_TABLES] = {}

    partitions = context[c.KEY_PARTITIONS]
    separator = context[c.KEY_SEPERATOR_PARTITION]
    self.__partitioner = Partitioner(partitions, separator)
    self.__context = context

    # Running totals, all starting at zero.
    self.__info = {
        c.INFO_TOTAL_BYTES: 0,
        c.INFO_TOTAL_ROWS: 0,
        c.INFO_TOTAL_MESSAGES: 0,
    }

    # NOTE: this is the root logger, so the level change is process-wide.
    root_logger = logging.getLogger()
    self.__logger = root_logger
    root_logger.setLevel(logging.ERROR)
def __init__(self, table_type, storage_provider):
    """Set up empty partition bookkeeping and the partitioner.

    Args:
        table_type: partition table type, passed to ``Partitioner`` and
            kept on the instance.
        storage_provider: block device providing instance, passed to
            ``Partitioner`` and kept on the instance.
    """
    # Keep a reference to the underlying block device provider (e.g. loop)
    # on this object.  This guarantees the correct destructor order when
    # the device should be released.
    self.storage_provider = storage_provider

    # No partitions are known yet and nothing is mapped.
    self.partition_map, self.partition_id_map, self.partition_id = {}, {}, {}
    self.is_mapped = False

    self.partitioner = Partitioner(table_type, storage_provider)
    self.table_type = table_type
def get_content(self):
    """Extract the main content of the page at ``self.url``.

    Stores the helper objects it creates on the instance (``self.parser``,
    ``self.title``, ``self.partitioner``, ``self.judge``).

    Returns:
        tuple: ``(flag, title, content, content_with_format, srcs,
        confidence, detail)``; ``flag``, ``confidence`` and ``detail`` are
        taken unchanged from the result of ``Judge.select``.
    """
    # Step 1: extract the basic text strings
    page_parser = Parser(self.url)
    self.parser = page_parser
    strings = page_parser.ns()
    self.title = page_parser.get_title()

    # Step 2: group the text strings into blocks
    splitter = Partitioner()
    self.partitioner = splitter
    blocks = splitter.partition(strings)

    # Step 3: pick the body block (analysis info comes along as a by-product)
    selector = Judge(self.title.string, strings)
    self.judge = selector
    selection = selector.select(blocks, strings)
    flag = selection['flag']
    chosen = selection['block']
    confidence = selection['confidence']
    detail = selection['detail']

    # NOTE(review): ``flag`` is not checked here.  A disabled branch once
    # returned empty strings and ``srcs = None`` for a false flag.
    content = chosen.to_str()
    image_info = self.get_images(chosen)
    srcs, images = image_info
    chosen = self.insert_images(chosen, images)
    content_with_format = chosen.to_str_with_format()

    return (
        flag,
        self.title.string.strip(),
        content,
        content_with_format,
        srcs,
        confidence,
        detail,
    )
def setup(self):
    """
    Start the cluster if it is not running yet, partition the topology
    when none is set, register every node, create the tunnels between
    workers and start the workers.

    Raises:
        RuntimeError: if the cluster cannot be started or has fewer
            workers than there are sub-topologies.
    """
    cluster = self.cluster
    if not cluster.is_running():
        cluster.start()
    if not cluster.is_running():
        raise RuntimeError("Cluster won't start")

    self.logger.info("Clustering topology...")
    if not self.topology:
        parti = Partitioner()
        parti.loadtopo(self.origtopology)
        # Assigning shares to workers requires that the workers are already
        # started; otherwise there is no way to determine the worker id of
        # a worker.  Topologies are assigned to workers in ascending
        # worker id order.
        self.topology = parti.partition(cluster.num_workers(),
                                        cluster.get_worker_shares())
        self.logger.debug("Tunnels: "+str(self.topology.getTunnels()))

    subtopos = self.topology.getTopos()
    if len(subtopos) > cluster.num_workers():
        raise RuntimeError("Cluster does not have enough workers for given topology")

    # Register every node under the index of its sub-topology.
    for subtopo in subtopos:
        worker_id = subtopos.index(subtopo)
        for node in subtopo.nodes():
            self.node_to_workerid[node] = worker_id
            wrapper = NodeWrapper(node, self.get_worker(node))
            self.nodes.append(wrapper)
            self.node_to_wrapper[node] = wrapper
            if subtopo.isSwitch(node):
                self.switches.append(wrapper)
            else:
                self.hosts.append(wrapper)
    self.logger.debug("Nodemapping: %s",self.node_to_workerid)

    # One tunnel list per sub-topology.
    tunnels = [[] for _ in subtopos]
    for tunnel in self.topology.getTunnels():
        first, second = tunnel[0], tunnel[1]
        w1 = self.get_worker(first)
        w2 = self.get_worker(second)
        intf = cluster.create_tunnel(w1, w2)
        self.tunnellookup[(first, second)] = intf
        self.tunnellookup[(second, first)] = intf
        for endpoint in (first, second):
            # Assumes that workerid = subtopoid
            tunnels[self.node_to_workerid[endpoint]].append(
                [intf, endpoint, tunnel[2]])

    # Start each worker; the controller is only passed when one is set.
    for topo in subtopos:
        idx = subtopos.index(topo)
        cluster.workers()[idx].set_switch(self.switch)
        start_kwargs = {"topo": topo, "tunnels": tunnels[idx]}
        if self.controller:
            start_kwargs["controller"] = self.controller
        cluster.workers()[idx].start(**start_kwargs)

    if config.runWith1500MTU:
        for topo in subtopos:
            for host in topo.nodes():
                self.setMTU(host, 1450)
def exp1(name, step):
    """Partition the network ``name`` three ways and hand over to exp1_impl.

    Community detection runs first; its number of parts is then used for
    the METIS and the random partitioning.  The time of each partitioning
    is recorded via ``duration``.

    Returns:
        dict with the keys ``'community_detection'``, ``'parts'``,
        ``'metis'`` and ``'rnd'``.
    """
    net = network.Network.from_combined(name, utils.get_gzip_fname(name))
    graph = net.graph
    p = Partitioner(net.graph)

    timings = {}

    started = time.time()
    by_community = p.community_detection()
    timings['community_detection'] = duration(started)

    parts = len(by_community)
    timings['parts'] = parts

    started = time.time()
    by_metis = p.metis_partition(parts)
    timings['metis'] = duration(started)

    started = time.time()
    by_random = p.random_partition(parts)
    timings['rnd'] = duration(started)

    graph_info = GraphInfo(graph)
    exp1_impl(name, graph_info, p, [by_community, by_metis, by_random], step)
    return timings
class Aggregator(object):
    """
    Aggregate events of multiple different SQS messages into S3 key lists.
    """

    def __init__(self, context, partition_set):
        """Initialise counters and the partitioner.

        Args:
            context: mapping providing the partition configuration
                (``c.KEY_PARTITIONS``, ``c.KEY_SEPERATOR_PARTITION``) and
                the other settings read by the methods of this class.
            partition_set: mapping that receives the aggregated rows.  It
                is used directly (not copied) and gets an empty
                ``c.KEY_TABLES`` entry.
        """
        self.__aggregation_sets = partition_set
        self.__aggregation_sets[c.KEY_TABLES] = {}
        self.__partitioner = Partitioner(
            context[c.KEY_PARTITIONS], context[c.KEY_SEPERATOR_PARTITION])
        self.__context = context
        self.__info = {}
        self.__info[c.INFO_TOTAL_BYTES] = 0
        self.__info[c.INFO_TOTAL_ROWS] = 0
        self.__info[c.INFO_TOTAL_MESSAGES] = 0
        self.__info[c.INFO_EVENTS] = {}
        # NOTE: this is the root logger, so the level applies process-wide.
        self.__logger = logging.getLogger()
        self.__logger.setLevel(logging.ERROR)

    @property
    def bytes_uncompressed(self):
        """Total length of all message bodies processed so far."""
        return self.__info[c.INFO_TOTAL_BYTES]

    @property
    def rows(self):
        """Value of the ``c.INFO_TOTAL_ROWS`` counter."""
        return self.__info[c.INFO_TOTAL_ROWS]

    @property
    def messages(self):
        """Number of messages handed to this aggregator."""
        return self.__info[c.INFO_TOTAL_MESSAGES]

    @property
    def events(self):
        """Per-event-name counters (only filled when detailed events are on)."""
        return self.__info[c.INFO_EVENTS]

    @property
    def info(self):
        """The complete counters dictionary."""
        return self.__info

    def append_default_metrics_and_partition(self, messages):
        """Count the given messages and process each of them in order."""
        length = len(messages)
        util.debug_print(("Processing {} messages.").format(length))
        self.increment(self.__info, c.INFO_TOTAL_MESSAGES, length)
        for message in messages:
            self.process_message(message)

    def process_message(self, message):
        """Decode one SQS message and partition the rows it contains.

        The compression, sensitivity and payload handlers are chosen from
        the message attributes.  A message received more often than
        ``c.KEY_MAX_MESSAGE_RETRY`` is logged as an error but still
        processed.
        """
        attributes = message[c.SQS_PARAM_MESSAGE_ATTRIBUTES]
        compression_mode = CompressionClassFactory.instance(
            attributes[c.SQS_PARAM_COMPRESSION_TYPE]['StringValue'])
        body = compression_mode.extract_message_body(message)
        attempts = int(message['Attributes']['ApproximateReceiveCount'])
        sensitivity_type = SensitivityClassFactory.instance(
            attributes[c.SQS_PARAM_SENSITIVITY_TYPE]['StringValue'])
        payload_type = PayloadClassFactory.instance(
            self.__context,
            attributes[c.SQS_PARAM_PAYLOAD_TYPE]['StringValue'],
            compression_mode, sensitivity_type)

        # Token identifying the message: id and receipt handle joined by
        # the configured CSV separator.
        msg_token = "{}{}{}".format(message['MessageId'],
                                    self.__context[c.KEY_SEPERATOR_CSV],
                                    message['ReceiptHandle'])
        if attempts > self.__context[c.KEY_MAX_MESSAGE_RETRY]:
            self.__logger.error(
                "The message with message Id {} has been processed {} times.".
                format(msg_token, attempts))
        self.increment(self.__info, c.INFO_TOTAL_BYTES, len(body))
        payload_type.to_partitions(msg_token, body, self.partition,
                                   sensitivity_type,
                                   self.__partitioner.partitions)

    def partition(self, token, row, sensitivity_type):
        """Store one row in the aggregation set of its partition.

        Rows are grouped by partition and then by a hash of their sorted
        column names.  A row for which the partitioner yields no partition
        is logged and dropped.
        """
        # schema hash: derived from the sorted column names
        columns = sorted(row.keys())
        schema_hash = hash(str(columns))
        event_name = row[metric_schema.EVENT.id]
        # create the key here as the partitioner may remove attributes if
        # the attribute is used as a partition
        uuid_key = "{}{}{}".format(row[metric_schema.UUID.id],
                                   row[metric_schema.EVENT.id],
                                   row[metric_schema.SERVER_TIMESTAMP.id])
        tablename, partition = self.__partitioner.extract(
            schema_hash, row, sensitivity_type)
        columns, row = self.order_and_map_to_long_name(row)
        self.increment_detailed_cloudwatch_event_information(event_name)

        if partition is None:
            self.__logger.error("Dropping metric\n{}".format(row))
            return

        if partition not in self.__aggregation_sets:
            self.__aggregation_sets[partition] = {}

        tables = self.__aggregation_sets[c.KEY_TABLES]
        if tablename not in tables:
            tables[tablename] = tablename

        partition_dict = self.__aggregation_sets[partition]
        if schema_hash not in partition_dict:
            partition_dict[schema_hash] = {
                c.KEY_SET: {},
                c.KEY_SET_COLUMNS: columns,
            }
        partition_dict[schema_hash][c.KEY_SET][uuid_key] = row
        self.register_processed_message(partition_dict[schema_hash], token)

    def increment_detailed_cloudwatch_event_information(self, event_name):
        """Count ``event_name`` when detailed CloudWatch events are enabled."""
        if self.__context.get(c.KEY_WRITE_DETAILED_CLOUDWATCH_EVENTS, False):
            self.increment(self.events, event_name, 1)

    def register_processed_message(self, schema_dict, msg_token):
        """Record ``msg_token`` once in the message id list of ``schema_dict``."""
        # track which messages have been processed
        if c.KEY_MSG_IDS not in schema_dict:
            schema_dict[c.KEY_MSG_IDS], schema_dict[c.KEY_APPENDER] = \
                self.get_new_list_append_handler()

        if msg_token not in schema_dict[c.KEY_MSG_IDS]:
            schema_dict[c.KEY_APPENDER](msg_token)

    def get_new_list_append_handler(self):
        """Return a new empty list together with its bound ``append``."""
        items = []
        return items, items.append

    def increment(self, dict, key, value):
        """Add ``value`` to ``dict[key]``, treating a missing key as zero.

        The previous implementation seeded a missing key with ``value`` and
        then added ``value`` again, so the first occurrence counted twice.
        """
        dict[key] = dict.get(key, 0) + value

    def order_and_map_to_long_name(self, row):
        """Order the columns of ``row`` and replace known names by long names.

        Returns:
            tuple: ``(column_names, ordered_row)`` where ``ordered_row`` is
            an ``OrderedDict`` in the order given by ``Order.order_columns``
            and ``column_names`` lists its keys in the same order.  Fields
            reported by the orderer but absent from ``row`` are skipped.
        """
        ordered_columns = Order().order_columns(row)
        ordered_dict = OrderedDict()
        ordered_columns_long_name = []
        for field in ordered_columns:
            if field not in row:
                continue
            if field in metric_schema.DICTIONARY:
                name = metric_schema.DICTIONARY[field].long_name
            else:
                name = field
            ordered_dict[name] = row[field]
            ordered_columns_long_name.append(name)

        return ordered_columns_long_name, ordered_dict
def start_rules_training(self):
    """Train the rules set over every train/test split of the partitions.

    Starting from the rules returned by ``get_initial_rules`` for the full
    dataset, each partition is used once as the test set.  For every other
    partition the current rules are applied and the matched rules are
    appended to the rules set; the accumulated set is then reduced with
    ``reduce_rules`` and evaluated on the test partition.

    Returns:
        The rules set left after the last fold.
    """
    # Generate the tags for the values range
    tags_ranges = gt(self.df).set_tags()

    # Split the dataset into partitions
    partition_set = Part(self.df).gen_partition_set()

    # Initialize the best rules set using the full dataset
    best_rulesset = self.get_initial_rules(self.df, tags_ranges)

    # Train the rules with all possible combinations of training and test
    # partitions.  The former ``range(0, len(partition_set) - 1)`` bound
    # never used the last partition as a test set.
    for i, test_set in enumerate(partition_set):
        # Training set: every partition except the test partition
        training_set = [
            part for j, part in enumerate(partition_set) if j != i
        ]

        # Fuzzify the data from the test set
        test_df = FuzGen(test_set).fuzzify_data(tags_ranges)

        # Apply the rules set to each training partition and accumulate
        # the matched rules onto it, so that rules matched more often can
        # be told apart later.
        for training_df in training_set:
            # Fuzzify training partition
            fuzzy_df = FuzGen(training_df).fuzzify_data(tags_ranges)

            # Apply the current rules set to the training partition
            classifier = Classifier(fuzzy_df, best_rulesset)
            classifier.classify_dataset()

            # Only the matched rules are needed here
            _, matched_rules = classifier.verify_classification()

            # Concatenate the matched rules to the current best rules set
            best_rulesset = pd.concat([best_rulesset, matched_rules])

        # Keep only one rule per set of antecedents, based on the matches
        # obtained during training, before testing.
        best_rulesset = self.reduce_rules(best_rulesset, tags_ranges)

        # Classify the test set with the rules set obtained from training
        classifier = Classifier(test_df, best_rulesset)
        classifier.classify_dataset()
        TP_value, _ = classifier.verify_classification()

        # Accuracy: matches divided by the length of the test set
        accuraccy = TP_value / len(test_df)
        print(f"Test {i} accuraccy: {accuraccy}")
        print(f"Lenght of minimal rules set: {len(best_rulesset)}")

    return best_rulesset
class Disk(DeviceProvider):
    """
    implement storage disk and partition table setup
    """
    def __init__(self, table_type, storage_provider):
        """Set up empty partition bookkeeping and the partitioner.

        Args:
            table_type: partition table type, passed to ``Partitioner``.
            storage_provider: block device providing instance.
        """
        # bind the underlying block device providing class instance
        # to this object (e.g loop) if present. This is done to guarantee
        # the correct destructor order when the device should be released.
        self.storage_provider = storage_provider
        self.partition_map = {}
        self.partition_id_map = {}
        self.partition_id = {}
        self.is_mapped = False
        self.partitioner = Partitioner(
            table_type, storage_provider
        )
        self.table_type = table_type

    def get_device(self):
        """
        return names of partition devices, note that the mapping
        requires an explicit map() call
        """
        # items() instead of iteritems(): the latter does not exist on
        # Python 3 dictionaries, items() works on Python 2 and 3.
        device_map = {}
        for partition_name, device_node in self.partition_map.items():
            device_map[partition_name] = MappedDevice(
                device=device_node, device_provider=self
            )
        return device_map

    def is_loop(self):
        """
        returns if this disk is based on a loop device. The information
        is taken from the storage provider. If the storage provider is
        loop based the disk is it too
        """
        return self.storage_provider.is_loop()

    def create_root_partition(self, mbsize):
        """Create the root partition; it also serves as the boot
        partition id when no boot partition was registered before."""
        self.partitioner.create('p.lxroot', mbsize, 't.linux')
        self.__add_to_map('root')
        self.__add_to_id_map('kiwi_RootPart')
        if 'kiwi_BootPart' not in self.partition_id_map:
            self.__add_to_id_map('kiwi_BootPart')

    def create_root_lvm_partition(self, mbsize):
        """Create the LVM root partition and register its ids."""
        self.partitioner.create('p.lxlvm', mbsize, 't.lvm')
        self.__add_to_map('root')
        self.__add_to_id_map('kiwi_RootPart')
        self.__add_to_id_map('kiwi_RootPartVol', 'LVRoot')

    def create_root_raid_partition(self, mbsize):
        """Create the RAID root partition and register its ids."""
        self.partitioner.create('p.lxraid', mbsize, 't.raid')
        self.__add_to_map('root')
        self.__add_to_id_map('kiwi_RootPart')
        self.__add_to_id_map('kiwi_RaidPart')
        self.__add_to_id_map('kiwi_RaidDev', '/dev/md0')

    def create_boot_partition(self, mbsize):
        """Create the boot partition and register its id."""
        self.partitioner.create('p.lxboot', mbsize, 't.linux')
        self.__add_to_map('boot')
        self.__add_to_id_map('kiwi_BootPart')

    def create_efi_csm_partition(self, mbsize):
        """Create the EFI CSM (legacy) partition and register its id."""
        self.partitioner.create('p.legacy', mbsize, 't.csm')
        self.__add_to_map('efi_csm')
        self.__add_to_id_map('kiwi_BiosGrub')

    def create_efi_partition(self, mbsize):
        """Create the EFI partition and register its id."""
        self.partitioner.create('p.UEFI', mbsize, 't.efi')
        self.__add_to_map('efi')
        self.__add_to_id_map('kiwi_JumpPart')

    def activate_boot_partition(self):
        """Set the active flag on the boot partition, or on the root
        partition when no boot partition exists."""
        partition_id = None
        if 'boot' in self.partition_id:
            partition_id = self.partition_id['boot']
        elif 'root' in self.partition_id:
            partition_id = self.partition_id['root']

        if partition_id:
            self.partitioner.set_flag(partition_id, 'f.active')

    def wipe(self):
        """
        Zap (destroy) any GPT and MBR data structures if present
        For DASD disks create a new VTOC table
        """
        if 'dasd' in self.table_type:
            log.debug('Initialize DASD disk with new VTOC table')
            # The answers for fdasd are written to a temporary file which
            # is then piped into the command.
            fdasd_input = NamedTemporaryFile()
            with open(fdasd_input.name, 'w') as vtoc:
                vtoc.write('y\n\nw\nq\n')
            bash_command = ' '.join(
                [
                    'cat', fdasd_input.name, '|',
                    'fdasd', '-f', self.storage_provider.get_device()
                ]
            )
            Command.run(
                ['bash', '-c', bash_command]
            )
        else:
            log.debug('Initialize %s disk', self.table_type)
            Command.run(
                [
                    'sgdisk', '--zap-all',
                    self.storage_provider.get_device()
                ]
            )

    def map_partitions(self):
        """Make the partitions visible: kpartx for loop based storage
        (remembered in ``is_mapped``), partprobe otherwise."""
        if self.storage_provider.is_loop():
            Command.run(
                ['kpartx', '-s', '-a', self.storage_provider.get_device()]
            )
            self.is_mapped = True
        else:
            Command.run(
                ['partprobe', self.storage_provider.get_device()]
            )

    def get_partition_id_map(self):
        """Return the partition id map as OrderedDict sorted by name."""
        return OrderedDict(
            sorted(self.partition_id_map.items())
        )

    def __add_to_id_map(self, name, value=None):
        # Without an explicit value the current partitioner id is used
        if not value:
            value = self.partitioner.get_id()
        self.partition_id_map[name] = value

    def __add_to_map(self, name):
        # Derive the device node of the partition the partitioner is
        # currently at and remember it together with its number
        device_node = None
        partition_number = format(self.partitioner.get_id())
        if self.storage_provider.is_loop():
            device_base = os.path.basename(self.storage_provider.get_device())
            device_node = ''.join(
                ['/dev/mapper/', device_base, 'p', partition_number]
            )
        else:
            device = self.storage_provider.get_device()
            if device[-1].isdigit():
                # device names ending in a digit get a 'p' separator
                device_node = ''.join(
                    [device, 'p', partition_number]
                )
            else:
                device_node = ''.join(
                    [device, partition_number]
                )
        if device_node:
            self.partition_map[name] = device_node
            self.partition_id[name] = partition_number

    def __del__(self):
        # Remove the kpartx mappings created by map_partitions(); a failure
        # is only logged because the device may still be busy
        if self.storage_provider.is_loop() and self.is_mapped:
            log.info('Cleaning up %s instance', type(self).__name__)
            try:
                Command.run(
                    ['kpartx', '-s', '-d', self.storage_provider.get_device()]
                )
            except Exception:
                log.warning(
                    'cleanup of partition device maps failed, %s still busy',
                    self.storage_provider.get_device()
                )
data_orig = xr.open_dataarray(filepath)

# let's first try only one var: take index 0 along the first axis
data = data_orig[0, :, :, :].copy()

shape = np.shape(data)
nx, ny, nz = shape[0], shape[1], shape[2]

# making shape parameters available everywhere
nx, ny, nz = (comm.bcast(extent, root=0) for extent in (nx, ny, nz))
print(nx, ny, nz)

# setting up the partitioner
# the field dimensions need to be the real ones - the halo points.
p = Partitioner(comm, [nx, ny - 2 * 2, nz - 2 * 2], num_halo=2)

# distribute the work onto the ranks
data_work = p.scatter(data)

# Disabled: subset more for speedup of first tests
#   print(f'subset even more because very large dataset')
#   data = data[:,::10,:,:]

# create a mask of nans
# nan values have zero weight (i.e. are False)
mask = ~np.isnan(data_work)

# gapfilling the missing values with spatiotemporal mean
print('gapfilling missing values with spatiotemporal mean')
tic = datetime.now()
def main(nx, ny, nz, num_iter, num_halo=2, plot_result=False):
    """Set up the fields, run apply_diffusion and report the timing.

    Rank 0 builds the initial field, which is scattered to all ranks.  The
    gathered input and output fields are saved by rank 0 as ``in_field``
    and ``out_field`` (and plotted when ``plot_result`` is true).
    """
    checks = (
        (nx, 1024 * 1024, 'You have to specify a reasonable value for nx'),
        (ny, 1024 * 1024, 'You have to specify a reasonable value for ny'),
        (nz, 1024, 'You have to specify a reasonable value for nz'),
        (num_iter, 1024 * 1024,
         'You have to specify a reasonable value for num_iter'),
        (num_halo, 256,
         'Your have to specify a reasonable number of halo points'),
    )
    for value, upper, message in checks:
        assert 0 < value <= upper, message

    def _plot_level(field, level, filename):
        # Plot one horizontal slice of ``field`` and write it to disk.
        plt.imshow(field[level, :, :], origin='lower')
        plt.colorbar()
        plt.savefig(filename)
        plt.close()

    alpha = 1. / 32.

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    is_root = rank == 0

    p = Partitioner(comm, [nz, ny, nx], num_halo)

    if is_root:
        f = np.zeros((nz, ny + 2 * num_halo, nx + 2 * num_halo))
        # Option 1: Original stencil2d-mpi during HPC4WC course:
        # f[nz // 4:3 * nz // 4, num_halo + ny // 4:num_halo + 3 * ny // 4, num_halo + nx // 4:num_halo + 3 * nx // 4] = 1.0
        # Option 2: Similar to option 1, but positive region extended towards tile edges:
        # f[nz // 10:9 * nz // 10, num_halo + ny // 10:num_halo + 9 * ny // 10, num_halo + nx // 10:num_halo + 9 * nx // 10] = 1.0
        # Option 3: One positive region in bottom-left (0-0) corner, one positive region in top-right (ny-nx) corner
        # f[nz // 4:3 * nz // 4, num_halo:num_halo + ny // 4, num_halo:num_halo + nx // 4] = 1.0
        # f[nz // 4:3 * nz // 4, num_halo + 3 * ny // 4:-num_halo, num_halo + 3 * nx // 4:-num_halo] = 1.0
        # Option 4: Positive region line prime number fraction off-center across tile:
        k_range = slice(nz // 4, 3 * nz // 4)
        j_range = slice(num_halo + ny // 7, num_halo + 2 * ny // 7)
        i_range = slice(num_halo, -num_halo)
        f[k_range, j_range, i_range] = 1.0
    else:
        # Placeholder on the other ranks
        f = np.empty(1)

    in_field = p.scatter(f)
    out_field = np.copy(in_field)

    f = p.gather(in_field)
    if is_root:
        np.save('in_field', f)
        if plot_result:
            plt.ioff()
            _plot_level(f, in_field.shape[0] // 2, 'in_field.png')

    # warmup caches
    apply_diffusion(in_field, out_field, alpha, num_halo, p=p)

    comm.Barrier()

    # time the actual work
    tic = time.time()
    apply_diffusion(in_field, out_field, alpha, num_halo,
                    num_iter=num_iter, p=p)
    toc = time.time()

    comm.Barrier()

    if is_root:
        print("Elapsed time for work = {} s".format(toc - tic))

    update_halo(out_field, num_halo, p)

    f = p.gather(out_field)
    if is_root:
        np.save('out_field', f)
        if plot_result:
            _plot_level(f, out_field.shape[0] // 2, 'out_field.png')
def main(nx, ny, nz, num_iter, num_halo=2, plot_result=False):
    """Set up the fields, run apply_diffusion and report the timing.

    Rank 0 creates a field that is zero except for a block of ones in its
    centre; the field is scattered to all ranks.  The gathered input and
    output fields are saved by rank 0 as ``in_field`` and ``out_field``
    (and plotted when ``plot_result`` is true).
    """
    bounds = [
        (nx, 1024 * 1024, 'You have to specify a reasonable value for nx'),
        (ny, 1024 * 1024, 'You have to specify a reasonable value for ny'),
        (nz, 1024, 'You have to specify a reasonable value for nz'),
        (num_iter, 1024 * 1024,
         'You have to specify a reasonable value for num_iter'),
        (num_halo, 256,
         'Your have to specify a reasonable number of halo points'),
    ]
    for value, upper, message in bounds:
        assert 0 < value <= upper, message

    def _save_plot(field, level, filename):
        # Plot one horizontal slice of ``field`` and write it to disk.
        plt.imshow(field[level, :, :], origin='lower')
        plt.colorbar()
        plt.savefig(filename)
        plt.close()

    alpha = 1. / 32.

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    on_root = rank == 0

    p = Partitioner(comm, [nz, ny, nx], num_halo)

    if on_root:
        f = np.zeros((nz, ny + 2 * num_halo, nx + 2 * num_halo))
        # Ones from 1/4 to 3/4 of the extent along every axis, with the
        # horizontal axes offset by the halo width.
        k_range = slice(nz // 4, 3 * nz // 4)
        j_range = slice(num_halo + ny // 4, num_halo + 3 * ny // 4)
        i_range = slice(num_halo + nx // 4, num_halo + 3 * nx // 4)
        f[k_range, j_range, i_range] = 1.0
    else:
        # Placeholder on the other ranks
        f = np.empty(1)

    in_field = p.scatter(f)
    out_field = np.copy(in_field)

    f = p.gather(in_field)
    if on_root:
        np.save('in_field', f)
        if plot_result:
            plt.ioff()
            _save_plot(f, in_field.shape[0] // 2, 'in_field.png')

    # warmup caches
    apply_diffusion(in_field, out_field, alpha, num_halo, p=p)

    comm.Barrier()

    # time the actual work
    tic = time.time()
    apply_diffusion(in_field, out_field, alpha, num_halo,
                    num_iter=num_iter, p=p)
    toc = time.time()

    comm.Barrier()

    if on_root:
        print("Elapsed time for work = {} s".format(toc - tic))

    update_halo(out_field, num_halo, p)

    f = p.gather(out_field)
    if on_root:
        np.save('out_field', f)
        if plot_result:
            _save_plot(f, out_field.shape[0] // 2, 'out_field.png')
class Scraper:
    """Extract the main text and the related images of a web page."""

    def __init__(self,url):
        self.url = url          # URL to analyse
        self.block_li = []      # list of text blocks contained in the page
        self.title = ''
        # reset the record
        self.recorder = Recorder()
        self.recorder.reset()

    @staticmethod
    def _is_absolute_url(src):
        """Return True when ``src`` starts with an http or https scheme."""
        # https must be recognised as well, otherwise absolute https URLs
        # are treated as relative paths.
        return src.lower().startswith(('http://', 'https://'))

    def get_images(self,block):
        """Collect the images around and inside ``block``.

        The search runs from the title (or, when the title is the page's
        <title> element, from the first block-level tag before the block)
        up to the first block-level tag after the block.  Only images that
        pass ``filter_images`` are returned.

        Returns:
            tuple: ``(srcs, images)`` as returned by ``filter_images``.
        """
        imgs = []
        # set the starting point of the image search
        if self.title != self.parser.soup.title:
            start = self.title
        else:
            # the title is not in the body: extend the search range upwards
            start = block.text_list()[0]
            while start.previous:
                start = start.previous
                if not isinstance(start,NavigableString) and start.name in BLOCK_TAGS:
                    break
        # set the end point of the image search: extend downwards
        end = block.text_list()[-1]
        while end.next:
            end = end.next
            if not isinstance(end,NavigableString) and end.name in BLOCK_TAGS:
                break
        # Stop at the end of the document as well, so that a start node
        # located after ``end`` cannot run into ``None``.
        while start is not None and start != end:
            if not isinstance(start,NavigableString) and start.name == 'img':
                imgs.append(start)
            start = start.next
        return self.filter_images(imgs)

    def filter_images(self,imgs):
        """Keep the images whose downloaded size exceeds ``MIN_IMG_SIZE``.

        Images without a ``src`` attribute and images that cannot be
        downloaded (``IOError``) are skipped.

        Returns:
            tuple: ``(srcs, images)`` - the absolute URLs and the matching
            image tags, in input order.
        """
        srcs = []
        images = []
        for img in imgs:
            if not img.has_key('src'):
                continue
            src = img['src']
            if not self._is_absolute_url(src):
                src = relative2absolute(self.url,src)
            # check the image size, discard images that are too small
            try:
                response = urlopen(src)
                try:
                    data = response.read()
                finally:
                    # always release the connection
                    response.close()
            except IOError:
                # best effort: an image that cannot be fetched is ignored
                continue
            if len(data) > MIN_IMG_SIZE:
                srcs.append(src)
                images.append(img)
        return (srcs,images)

    def insert_images(self,block,images):
        """Insert the selected images into ``block``.

        Walks the document from the title to the last text of the block
        and inserts, at the current text position, every image contained
        in ``images``, every <br> and every non-empty text that follows an
        image without being part of the block already.

        Returns:
            The same ``block`` object, modified in place.
        """
        start = self.title
        end = block.text_list()[-1]
        behind_img = False
        i = 0   # index of the current text within the block
        # ``start is not None`` guards against leaving the document when
        # ``end`` is never reached.
        while start is not None and start != end:
            if not isinstance(start,NavigableString):
                if start.name == 'img' and start in images:
                    src = start['src']
                    if not self._is_absolute_url(src):
                        start['src'] = relative2absolute(self.url,src)
                    block.insert(i,start)
                    i += 1
                    behind_img = True
                elif start.name == 'br':
                    # add the line break
                    block.insert(i,start)
                    i += 1
                elif start.name in BLOCK_TAGS:
                    behind_img = False
            # NavigableString
            elif start.string.strip():
                # already part of the body block
                if start in block.text_list():
                    i += 1
                    behind_img = False
                # not part of the body block: text following an image
                elif behind_img:
                    block.insert(i,start)
                    i += 1
            start = start.next
        return block

    def get_content(self):
        """Run the whole pipeline and return the extracted body.

        Returns:
            tuple: ``(flag, title, content, content_with_format, srcs,
            confidence, detail)``; ``flag``, ``confidence`` and ``detail``
            are taken from the result of ``Judge.select``.
        """
        # 1. extract the basic text strings
        self.parser = Parser(self.url)
        ns_list = self.parser.ns()
        self.title = self.parser.get_title()
        # 2. partition the text strings into blocks
        self.partitioner = Partitioner()
        blocks = self.partitioner.partition(ns_list)
        # 3. select the body block; analysis info is a by-product
        self.judge = Judge(self.title.string,ns_list)
        res = self.judge.select(blocks,ns_list)
        flag = res['flag']
        cblock = res['block']
        confidence = res['confidence']
        detail = res['detail']
        # NOTE(review): the block is processed regardless of ``flag``; a
        # disabled branch used to return empty content for a false flag.
        content = cblock.to_str()
        (srcs,images) = self.get_images(cblock)
        cblock = self.insert_images(cblock,images)
        content_with_format = cblock.to_str_with_format()
        return (flag,self.title.string.strip(),content,content_with_format,srcs,confidence,detail)
class Scraper:
    """Extract the main text and the related images of a web page."""

    def __init__(self, url):
        self.url = url  # URL to analyse
        self.block_li = []  # list of text blocks contained in the page
        self.title = ''
        # reset the record
        self.recorder = Recorder()
        self.recorder.reset()

    @staticmethod
    def _is_absolute_url(src):
        """Return True when ``src`` starts with an http or https scheme."""
        # https must be recognised as well, otherwise absolute https URLs
        # are treated as relative paths.
        return src.lower().startswith(('http://', 'https://'))

    def get_images(self, block):
        """Collect the images around and inside ``block``.

        The search runs from the title (or, when the title is the page's
        <title> element, from the first block-level tag before the block)
        up to the first block-level tag after the block.  Only images that
        pass ``filter_images`` are returned.

        Returns:
            tuple: ``(srcs, images)`` as returned by ``filter_images``.
        """
        imgs = []
        # set the starting point of the image search
        if self.title != self.parser.soup.title:
            start = self.title
        else:
            # the title is not in the body: extend the search range upwards
            start = block.text_list()[0]
            while start.previous:
                start = start.previous
                if not isinstance(
                        start, NavigableString) and start.name in BLOCK_TAGS:
                    break
        # set the end point of the image search: extend downwards
        end = block.text_list()[-1]
        while end.next:
            end = end.next
            if not isinstance(end,
                              NavigableString) and end.name in BLOCK_TAGS:
                break
        # Stop at the end of the document as well, so that a start node
        # located after ``end`` cannot run into ``None``.
        while start is not None and start != end:
            if not isinstance(start, NavigableString) and start.name == 'img':
                imgs.append(start)
            start = start.next
        return self.filter_images(imgs)

    def filter_images(self, imgs):
        """Keep the images whose downloaded size exceeds ``MIN_IMG_SIZE``.

        Images without a ``src`` attribute and images that cannot be
        downloaded (``IOError``) are skipped.

        Returns:
            tuple: ``(srcs, images)`` - the absolute URLs and the matching
            image tags, in input order.
        """
        srcs = []
        images = []
        for img in imgs:
            if not img.has_key('src'):
                continue
            src = img['src']
            if not self._is_absolute_url(src):
                src = relative2absolute(self.url, src)
            # check the image size, discard images that are too small
            try:
                response = urlopen(src)
                try:
                    payload = response.read()
                finally:
                    # always release the connection
                    response.close()
            except IOError:
                # best effort: an image that cannot be fetched is ignored
                continue
            if len(payload) > MIN_IMG_SIZE:
                srcs.append(src)
                images.append(img)
        return (srcs, images)

    def insert_images(self, block, images):
        """Insert the selected images into ``block``.

        Walks the document from the title to the last text of the block
        and inserts, at the current text position, every image contained
        in ``images``, every <br> and every non-empty text that follows an
        image without being part of the block already.

        Returns:
            The same ``block`` object, modified in place.
        """
        start = self.title
        end = block.text_list()[-1]
        behind_img = False
        i = 0  # index of the current text within the block
        # ``start is not None`` guards against leaving the document when
        # ``end`` is never reached.
        while start is not None and start != end:
            if not isinstance(start, NavigableString):
                if start.name == 'img' and start in images:
                    src = start['src']
                    if not self._is_absolute_url(src):
                        start['src'] = relative2absolute(self.url, src)
                    block.insert(i, start)
                    i += 1
                    behind_img = True
                elif start.name == 'br':
                    # add the line break
                    block.insert(i, start)
                    i += 1
                elif start.name in BLOCK_TAGS:
                    behind_img = False
            # NavigableString
            elif start.string.strip():
                # already part of the body block
                if start in block.text_list():
                    i += 1
                    behind_img = False
                # not part of the body block: text following an image
                elif behind_img:
                    block.insert(i, start)
                    i += 1
            start = start.next
        return block

    def get_content(self):
        """Run the whole pipeline and return the extracted body.

        Returns:
            tuple: ``(flag, title, content, content_with_format, srcs,
            confidence, detail)``; ``flag``, ``confidence`` and ``detail``
            are taken from the result of ``Judge.select``.
        """
        # 1. extract the basic text strings
        self.parser = Parser(self.url)
        ns_list = self.parser.ns()
        self.title = self.parser.get_title()
        # 2. partition the text strings into blocks
        self.partitioner = Partitioner()
        blocks = self.partitioner.partition(ns_list)
        # 3. select the body block; analysis info is a by-product
        self.judge = Judge(self.title.string, ns_list)
        res = self.judge.select(blocks, ns_list)
        flag = res['flag']
        cblock = res['block']
        confidence = res['confidence']
        detail = res['detail']
        # NOTE(review): the block is processed regardless of ``flag``; a
        # disabled branch used to return empty content for a false flag.
        content = cblock.to_str()
        (srcs, images) = self.get_images(cblock)
        cblock = self.insert_images(cblock, images)
        content_with_format = cblock.to_str_with_format()
        return (flag, self.title.string.strip(), content, content_with_format,
                srcs, confidence, detail)