def process(self, filename):
    curSize = 0
    # os.stat(...)[6] is st_size, the file size in bytes
    progress = Progress(filename, 0, os.stat(filename)[6])
    for ln in open(filename):
        if ln.startswith('RF'):
            self.processRF(ln)
        curSize += len(ln)
        progress.update(curSize)
def main(argv=None):
    _init_output_directories()

    # step 1: create the negatives and positives directory
    if FLAGS.do_full_prepare:
        print('Loading labels from %s' % FLAGS.label_file)
        lr = LabelRecord()
        label_records = lr.load(FLAGS.label_file)
        all_bounding_boxes = Box.get_all_bounding_boxes(label_records)
        counter = 0

        # fill examples, originals, negatives, and positives directories
        print('Processing images...')
        for (_, v) in label_records.items():
            Progress.show_progress(counter)
            image = CXRImage.get_image_data(v.filename, FLAGS.image_path)
            basefilename = os.path.splitext(v.filename)[0]
            if v.hasBoundingBox:
                for i in range(0, v.boundingBoxes.shape[0]):
                    box = v.boundingBoxes[i, :]
                    #CXRImage.extract_center_and_write(image,box,1024,1024,FLAGS.positives_dir)
                    CXRImage.extract_anisotropic_scale_and_write(
                        image, box, FLAGS.image_size, FLAGS.image_size,
                        FLAGS.positives_dir)
                CXRImage.write_image(image, FLAGS.examples_dir,
                                     "%s.jpg" % basefilename)
            else:
                i = np.int32(
                    np.random.randint(0, all_bounding_boxes.shape[0] - 1))
                box = all_bounding_boxes[i, :]
                CXRImage.extract_anisotropic_scale_and_write(
                    image, box, FLAGS.image_size, FLAGS.image_size,
                    FLAGS.negatives_dir)
                #CXRImage.extract_center_and_write(image,box,1024,1024,FLAGS.negatives_dir)

            if v.hasBoundingBox:
                img = (CXRImage.xlate_image(image) * 255).astype(np.uint8)
                CXRImage.write_image_with_bounding_boxes(
                    img, FLAGS.originals_dir, "%s.jpg" % basefilename,
                    v.boundingBoxes)
            counter += 1

    # step 2: create the pre-training features by combining negatives and
    # positives into pre_train.tfrecord
    print('\nCreating pre-train file...')
    rec = Record(1024, 1024, 1 if FLAGS.grayscale else 3)
    total = rec.create_pre_train_file(FLAGS.positives_dir, FLAGS.negatives_dir,
                                      FLAGS.pre_train_file)
    print('\n%d files combined in %s' % (total, FLAGS.pre_train_file))
def process_tweets(self):
    from utils.twitter.tokenizer import Tweet_Tokenizer

    # for more documentation, visit:
    # https://dev.twitter.com/overview/api/tweets
    def extract_text(tweet_json):
        try:
            tweet_json["text"]
        except KeyError:
            return
        orig_text = tweet_json["text"].lower().strip()
        tweet_text = Tweet_Tokenizer.apply_regex_to_text(
            orig_text, self.replace_hashtags, self.replace_links,
            self.replace_user_refs)
        tweet_obj = Tweet(tweet_text, orig_text)

        # get, process, and package hashtags (this also stores and counts them)
        hashtags = [
            Hashtag(h["text"].lower())
            for h in tweet_json["entities"]["hashtags"]
        ]

        # connect hashtags to tweet and vice versa
        tweet_obj.register_hashtags(hashtags)
        [h.register_tweet(tweet_obj) for h in hashtags]

    if self.show_progress:
        with Progress("Extracting tweets", len(self.tweet_JSON_objs)) as up:
            # iterate over the full list of parsed tweets (not its first
            # element), matching the total passed to Progress above
            [up(extract_text(tweet_obj)) for tweet_obj in self.tweet_JSON_objs]
    else:
        print("Extracting tweets.")
        [extract_text(tweet_obj) for tweet_obj in self.tweet_JSON_objs]
def process(self, filename, minlevel, maxlevel):
    self.__minlevel = minlevel
    self.__maxlevel = maxlevel
    cursize = 0
    progress = Progress(filename, 0, os.stat(filename)[6])
    for ln in open(filename):
        if ln.startswith('AF'):
            self.processAF(ln)
        elif ln.startswith('LF'):
            self.processLF(ln)
        elif ln.startswith('LM'):
            self.processLM(ln)
        elif ln.startswith('PF'):
            self.processPF(ln)
        cursize += len(ln)
        progress.update(cursize)
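# The two line-by-line parsers above assume a console Progress helper built as
# Progress(name, start, total) and driven by update(n) with a running byte
# count. The class below is only a minimal sketch of such a helper under that
# assumption; the real implementation is not part of this file and its
# rendering details are guesses.
class _ProgressSketch:
    """Hypothetical stand-in for the Progress(name, start, total) interface."""

    def __init__(self, name, current, total):
        self.name = name
        self.total = max(total, 1)
        self.update(current)

    def update(self, current):
        # Print a simple percentage; a real implementation might draw a bar.
        percent = 100.0 * current / self.total
        print('\r%s: %5.1f%%' % (self.name, percent), end='', flush=True)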
def extract_batch(filelist, savedir, descobj, verbose=False):
    """ Extract features/descriptors from a batch of images. Single-threaded.

    This function calls an image descriptor object on a batch of images in
    order to extract each image's descriptor. If a feature/descriptor file
    already exists for the image, it is skipped. This is a single-threaded
    pipeline.

    Arguments:
        filelist:   A list of image file names, including their paths, to read
                    and extract descriptors from.
        savedir:    A directory in which to save all of the image features.
                    They are pickled objects (protocol 2) with the same name as
                    the image file. The object that is pickled is the return
                    value of descobj.extract().
        descobj:    An image descriptor object which does the actual extraction
                    work. The method called is descobj.extract(image). See
                    descriptors.Descriptor for an abstract base class.
        verbose:    bool, display progress?

    Returns:
        True if there were any errors extracting image features, False
        otherwise. If there is a problem extracting any image descriptors, a
        file "errors.log" is created in the savedir directory with a list of
        file names, error numbers and messages.
    """

    # Try to make the save path
    if not os.path.exists(savedir):
        os.mkdir(savedir)

    errflag = False

    # Set up progress updates
    nfiles = len(filelist)
    progbar = Progress(nfiles, title="Extracting descriptors", verbose=verbose)

    # Iterate through all of the images in filelist and extract features
    for i, impath in enumerate(filelist):
        errflag |= extract(impath, savedir, descobj)
        progbar.update(i)

    progbar.finished()

    if errflag:
        print('Done with errors. See the "errors.log" file in ' + savedir)

    # Return the error flag, as documented above
    return errflag
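# A hedged usage sketch for extract_batch(). The file glob, the output
# directory, and the SomeDescriptor class are illustrative assumptions; any
# concrete subclass of descriptors.Descriptor would take its place.
if __name__ == '__main__':
    import glob
    images = sorted(glob.glob('images/*.png'))   # hypothetical input images
    desc = SomeDescriptor()                      # stand-in for a descriptors.Descriptor subclass
    had_errors = extract_batch(images, 'features', desc, verbose=True)
    if had_errors:
        print('Some images failed; see features/errors.log')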
def build(self, model, grid, level):
    simplify = True
    self.__meshset = set()
    total = len(model)
    self.__model = model
    counter = 0
    progress = Progress("construct<%d>...." % level, 0, total)
    for mesh, parcel in self.__model.items():
        mainmap = MainMapBuilder()
        rect = grid.getMeshRect(mesh)
        bin = mainmap.build(parcel, rect, simplify)
        offset = self.__stream.tell()
        self.__stream.write(bin)
        self.__index_builder.registerParcel(mesh, offset, len(bin))
        counter += 1
        progress.update(counter)
def process(self, level):
    # enumerate meshes
    meshList = set()
    if level == 0:
        for meshId in self.__modelSrc.keys():
            meshList.add(meshId)
    else:
        for meshId in self.__modelSrc.keys():
            meshList.add(self.__grid.getUpperMesh(meshId))

    # construct
    counter = 0
    progress = Progress("building <%d>...." % level, 0, len(meshList))
    for meshId in meshList:
        progress.update(counter)
        counter += 1
        parcel = self.__constructParcel(meshId)
        if parcel:
            self.__modelOut.put(meshId, parcel)
    return None
def AddSequences():
    """Creates one training and one validation set."""
    errors = []

    # Generate dataset file lists.
    sequences = FindPatternFiles(FLAGS.input_dir, FLAGS.view_pattern, errors)
    num_frames = PrintSequencesInfo(sequences,
                                    'Found the following datasets and files:')

    # Sharding and randomizing sets.
    if FLAGS.max_per_shard > 0:
        sequences = ShardSequences(sequences, FLAGS.max_per_shard)
        num_frames = PrintSequencesInfo(sequences, 'After sharding:')
    tf.logging.info('')

    # Process sets.
    progress = Progress(num_frames)
    output_list = []
    for sequence in sequences:
        record_name = os.path.join(FLAGS.output_dir,
                                   '%s.tfrecord' % sequence['name'])
        if tf.gfile.Exists(record_name) and not FLAGS.overwrite:
            ok, num_frames = CheckRecord(record_name, sequence)
            if ok:
                progress.Add(num_frames)
                tf.logging.info('Skipping existing output file: %s' %
                                record_name)
                continue
            else:
                tf.logging.info('File does not match sequence, reprocessing...')
        output_dir = os.path.dirname(record_name)
        if not tf.gfile.Exists(output_dir):
            tf.logging.info('Creating output directory: %s' % output_dir)
            tf.gfile.MakeDirs(output_dir)
        output_list.append(record_name)
        tf.logging.info('Writing to ' + record_name)
        writer = tf.python_io.TFRecordWriter(record_name)
        AddSequence(sequence, writer, progress, errors)
        writer.close()
    tf.logging.info('Wrote dataset files: ' + str(output_list))
    tf.logging.info('All errors (%d): %s' % (len(errors), str(errors)))
def main():
    # load data
    mnist = input_data.read_data_sets(args.dataset_path + 'MNIST_data',
                                      one_hot=True)
    # we can access the images like this:
    #   images = mnist.train.images; images.shape = []
    #   labels = mnist.train.labels; each label is a probability distribution.
    print(mnist.train.num_examples)

    max_epoch = args.max_epoch
    max_loop_z = args.max_loop_z

    with tf.Graph().as_default():
        config = Config()
        # model = MLP(config)
        model = DCGN(config)
        tf.get_default_graph().finalize()
        progress = Progress()

        n_batch_loop = int(mnist.train.num_examples / config.batch_size)
        for epoch in range(max_epoch):
            sum_cost = 0
            progress.start_epoch(epoch, max_epoch)
            for t in range(n_batch_loop):
                # batch_X: batch_size x n_input
                # batch_y: batch_size
                batch_X, batch_y = mnist.train.next_batch(config.batch_size,
                                                          shuffle=False)
                batch_indices = np.arange(
                    t * config.batch_size,
                    t * config.batch_size + config.batch_size)
                for z_t in range(max_loop_z):
                    cost_z = model.forward_backprop_z(batch_X, batch_indices)
                    model.project_z_L2()
                    # print(cost_z)
                cost_per_sample = model.forward_backprop_theta(
                    batch_X, batch_indices)
                sum_cost += cost_per_sample
                # model.increaseBatchID()
                if t % 10 == 0:
                    progress.show(t, n_batch_loop, {})
            print("cost: {}".format(sum_cost / n_batch_loop))
            model.save(epoch, args.model_dir)
def __init__(self,
             json_txt,
             show_progress=False,
             replace_hashtags=True,
             replace_user_refs=False,
             replace_links=True):
    import json
    self.show_progress = show_progress
    self.replace_links = replace_links
    self.replace_hashtags = replace_hashtags
    self.replace_user_refs = replace_user_refs

    # parse text into JSON objects
    if self.show_progress:
        from utils.progress import Progress
        with Progress("Parsing text into JSON Object", len(json_txt)) as up:
            # up() ticks the progress bar and passes the parsed object through
            self.tweet_JSON_objs = [up(json.loads(line)) for line in json_txt]
    else:
        print("Parsing text into JSON Object.")
        self.tweet_JSON_objs = [json.loads(line) for line in json_txt]

    # extract text from tweets
    self.process_tweets()
def extract_smp(filelist, savedir, descobj, njobs=None, verbose=False):
    """ Extract features/descriptors from a batch of images. Multi-threaded.

    This function calls an image descriptor object on a batch of images in
    order to extract each image's descriptor. If a feature/descriptor file
    already exists for the image, it is skipped. This is a multi-threaded (SMP)
    pipeline suitable for running on a single computer.

    Arguments:
        filelist:   A list of image file names, including their paths, to read
                    and extract descriptors from.
        savedir:    A directory in which to save all of the image features.
                    They are pickled objects (protocol 2) with the same name as
                    the image file. The object that is pickled is the return
                    value of descobj.extract().
        descobj:    An image descriptor object which does the actual extraction
                    work. The method called is descobj.extract(image). See
                    descriptors.Descriptor for an abstract base class.
        njobs:      int, number of threads to use. If None, then the number of
                    threads is chosen to be the same as the number of cores.
        verbose:    bool, display progress?

    Returns:
        True if there were any errors extracting image features, False
        otherwise. If there is a problem extracting any image descriptors, a
        file "errors.log" is created in the savedir directory with a list of
        file names, error numbers and messages.
    """

    # Try to make the save path
    if not os.path.exists(savedir):
        os.mkdir(savedir)

    # Set up parallel job
    pool = mp.Pool(processes=njobs)

    # Iterate through all of the images in filelist and extract features
    result = pool.map_async(
        __extract_star,
        itertools.izip(filelist, itertools.repeat(savedir),
                       itertools.repeat(descobj)))

    # Set up progress updates
    nfiles = len(filelist)
    progbar = Progress(nfiles, title='Extracting descriptors', verbose=verbose)

    # Get the status
    while not result.ready() and verbose:
        approx_rem = nfiles - result._number_left * result._chunksize
        progbar.update(max(0, approx_rem))
        time.sleep(5)

    progbar.finished()

    # Get notification of errors
    errflag = any(result.get())

    pool.close()
    pool.join()

    if errflag:
        print('Done, with errors. See the "errors.log" file in ' + savedir)

    # Return the error flag, as documented above
    return errflag
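# extract_smp() maps __extract_star over (impath, savedir, descobj) tuples, but
# that helper is not shown here. A plausible minimal version, assuming it
# simply unpacks the tuple for extract() (consistent with the Python 2 vintage
# suggested by itertools.izip above):
def __extract_star(args):
    """Unpack a (filename, savedir, descobj) tuple for pool.map_async."""
    return extract(*args)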
def main(args):
    ###############################
    # TRAIN PREP
    ###############################
    print("Loading data")
    train_loader, valid_loader, data_var, input_size = \
        data.get_data(args.data_folder, args.batch_size)
    args.input_size = input_size
    args.downsample = args.input_size[-1] // args.enc_height
    args.data_variance = data_var
    print(f"Training set size {len(train_loader.dataset)}")
    print(f"Validation set size {len(valid_loader.dataset)}")

    print("Loading model")
    if args.model == 'diffvqvae':
        model = DiffVQVAE(args).to(device)
    elif args.model == 'vqvae':
        model = VQVAE(args).to(device)
    print(
        f'The model has {utils.count_parameters(model):,} trainable parameters'
    )

    optimizer = optim.Adam(model.parameters(),
                           lr=args.learning_rate,
                           amsgrad=False)

    print(f"Start training for {args.num_epochs} epochs")
    num_batches = math.ceil(
        len(train_loader.dataset) / train_loader.batch_size)
    pbar = Progress(num_batches, bar_length=10, custom_increment=True)

    # Needed for bpd
    args.KL = args.enc_height * args.enc_height * args.num_codebooks * \
        np.log(args.num_embeddings)
    args.num_pixels = np.prod(args.input_size)

    ###############################
    # MAIN TRAIN LOOP
    ###############################
    best_valid_loss = float('inf')
    train_bpd = []
    train_recon_error = []
    train_perplexity = []
    args.global_it = 0
    for epoch in range(args.num_epochs):
        pbar.epoch_start()
        train_epoch(args, vq_vae_loss, pbar, train_loader, model, optimizer,
                    train_bpd, train_recon_error, train_perplexity)
        # loss, _ = test(valid_loader, model, args)
        # pbar.print_eval(loss)
        valid_loss = evaluate(args, vq_vae_loss, pbar, valid_loader, model)
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            best_valid_epoch = epoch
            torch.save(model.state_dict(), args.save_path)
        pbar.print_end_epoch()

    print("Plotting training results")
    utils.plot_results(train_recon_error, train_perplexity,
                       "results/train.png")

    print("Evaluate and plot validation set")
    generate_samples(model, valid_loader)
class Meta(Page):
    def __init__(self, tag: str, from_top: bool = True):
        self.base_url = (f'https://archiveofourown.org/tags/'
                         f'{quote(tag).replace(".", "*d*")}/works?page=')
        tag_path = paths.tag_path(tag)
        self.progress = Progress(tag_path)
        self.last = self.progress.read()[0]
        self.path = paths.meta_path(tag)
        log_path = paths.meta_log_path(tag)
        super().__init__(tag + '_meta', log_path)
        self.from_top = self._start_from_top(from_top)

    def scrape(self) -> None:
        if self.from_top is True or self.path.is_file() is False:
            mode = 'w'
        else:
            mode = 'a'
        with open(self.path, mode) as f_out:
            pages = self._pages()
            for page, progress_num in pages:
                page_elements = self._page_elements(page)
                for element in page_elements:
                    f_out.write(json.dumps(element) + '\n')
                self.progress.write(progress_num)
        self.logger.info(f'Completed scraping "{self.page_kind}"')
        return super().scrape()

    def _pages(self) -> Generator[Tuple[BeautifulSoup, str], None, None]:
        try:
            page_num = int(self.last)
        except ValueError:
            self.logger.error(f'Last scraped value ({self.last})'
                              f' in .meta is not a number')
            raise ValueError
        if page_num == -1 or self.from_top is True:
            page_num = 1
        else:
            page_num += 1
        errors = 0
        self.logger.info(f"Scraping: {self.base_url}")
        try:
            max_pages = self._total_pages()
        except ConnectionError:
            self.logger.error(f'Base URL: {self.base_url} Not found.')
            raise ConnectionError(f"Error connecting to: {self.base_url}\n"
                                  f"Could your fandom name be incorrect?")
        except Exception as e:
            self.logger.error(f'Base URL: {self.base_url} Not found.')
            raise Exception(f"Other error: {e}")
        while errors < cfg.MAX_ERRORS and page_num <= max_pages:
            try:
                url = self.base_url + str(page_num)
                soup = self._get_soup(url)
            except HTTPError:
                # just move on to the next page
                self.logger.error(f'PAGE: {url} 404 Error. Skipping this work.'
                                  f' {cfg.MAX_ERRORS-errors} attempts left.')
                errors += 1
                time.sleep(cfg.DELAY)
                page_num += 1
                url = self.base_url + str(page_num)
            except ConnectTimeout:
                # Try again
                errors += 1
                self.logger.error(f'PAGE: {url} Not found. '
                                  f'{cfg.MAX_ERRORS-errors} attempts left.')
                time.sleep(cfg.DELAY * errors)  # wait longer after each failure
            else:
                self.logger.info(f'Scraping PAGE: {str(page_num)}')
                time.sleep(cfg.DELAY)
                yield (soup, str(page_num))
                page_num += 1
                url = self.base_url + str(page_num)

    def _page_elements(self, page: BeautifulSoup) \
            -> Generator[MetaJson, None, None]:
        """ Find each HTML element and parse out the details into a row. """
        time = datetime.datetime.now().strftime("%d/%b/%Y %H:%M")
        meta: MetaJson = {}  # type: ignore
        works = page.find_all(class_="work blurb group")
        for work in works:
            meta.update(self._get_header(work))
            meta.update(self._get_required_tags(work))
            meta.update(self._get_tags(work))
            meta.update(self._get_stats(work))
            meta['fandom'] = self._get_fandoms(work)
            meta['summary'] = self._get_summary(work)
            meta['series_part'], meta['series_name'] = self._get_series(work)
            meta['updated'] = self._get_updated(work)
            meta['scrape_date'] = time
            yield meta

    def _total_pages(self) -> int:
        ''' Make max attempts at loading the base url to get the page count. '''
        for attempts in range(cfg.MAX_ERRORS):
            try:
                soup = self._get_soup(self.base_url)
                next_element = soup.find('li', class_='next')
                max_pages = int(next_element.find_previous('li').text)
                self.logger.info(f'Attempting to scrape up to '
                                 f'{str(max_pages)} pages.')
                return max_pages
            except AttributeError:
                self.logger.info('Attempting to scrape 1 page.')
                return 1
            except ConnectTimeout:
                self.logger.error(f'Base URL: {self.base_url} Not found. '
                                  f'{cfg.MAX_ERRORS-attempts} attempts left.')
                raise ConnectTimeout
        return 0

    def _get_tags(self, meta: BeautifulSoup) -> Any:
        """Find relationships, characters, and freeforms tags."""
        tag_dict = {}  # type: Dict[str, Optional[List[str]]]
        tags = ['relationships', 'characters', 'freeforms']
        for tag in tags:
            tag_dict[tag] = self._get_tag_info(tag, meta)
        return tag_dict

    def _get_tag_info(self, category: str, meta: BeautifulSoup) -> \
            Optional[List[str]]:
        """ Find relationships, characters, and freeforms tags."""
        try:
            tag_list = meta.find_all("li", class_=category)
        except AttributeError:
            return None
        return [result.text for result in tag_list]

    def _get_required_tags(self, work: BeautifulSoup) -> Any:
        """Finds required tags."""
        req_dict = {}
        try:
            req_tags = work.find(class_='required-tags').find_all('a')
            req_dict['rating'] = req_tags[0].text
            req_dict['warnings'] = req_tags[1].text.split(',')
            req_dict['category'] = req_tags[2].text.split(',')
            req_dict['status'] = req_tags[3].text
        except Exception:
            req_dict['rating'] = None
            req_dict['warnings'] = []
            req_dict['category'] = []
            req_dict['status'] = None
        return req_dict

    def _get_stats(self, work: BeautifulSoup) -> Any:
        """ Find stats (language, published, status, date status, words,
        chapters, comments, kudos, bookmarks, hits). """
        str_categories = ['language', 'chapters']
        num_categories = [
            'collections', 'words', 'comments', 'kudos', 'bookmarks', 'hits'
        ]
        stats = {}
        for s_cat in str_categories:
            try:
                stats[s_cat] = work.find("dd", class_=s_cat).text
            except AttributeError:
                stats[s_cat] = None
        for n_cat in num_categories:
            try:
                str_num = work.find("dd", class_=n_cat).text
                stats[n_cat] = int(str_num.replace(',', ''))
            except (AttributeError, ValueError):
                stats[n_cat] = 0
        return stats

    def _get_header(self, work: BeautifulSoup) -> Any:
        '''Finds header information (work_id, title, author, gifted to user).'''
        header_dict = {}
        result = work.find('h4', class_='heading').find_all('a')
        header_dict['work_id'] = result[0].get('href').strip('/works/')
        header_dict['title'] = result[0].text
        auth_list = []
        header_text = work.find('h4', class_='heading').text
        if "Anonymous" in header_text:
            header_dict['author'] = ["Anonymous"]
        else:
            authors = work.find_all('a', rel='author')
            for author in authors:
                auth_list.append(author.text)
            header_dict['author'] = auth_list
        gift_list = []
        for link in result:
            href = link.get('href')
            if 'gifts' in href:
                gift_list.append(link.text)
        if len(gift_list) == 0:
            header_dict['gifted'] = []
        else:
            header_dict['gifted'] = gift_list
        return header_dict

    def _get_fandoms(self, work: BeautifulSoup) -> List[str]:
        """ Find the list of fandoms."""
        try:
            tag_list = work.find('h5', class_='fandoms heading').find_all('a')
            fan_list = [x.text for x in tag_list]
            return fan_list
        except AttributeError:
            return []

    def _get_summary(self, work: BeautifulSoup) -> Optional[str]:
        """ Find the summary description and return it as a string. """
        try:
            summary_string = work.find('blockquote',
                                       class_='userstuff summary')
            summary = summary_string.text.strip().replace('\n', ' ')
        except AttributeError:
            summary = None
        return summary

    def _get_updated(self, work: BeautifulSoup) -> Optional[str]:
        """ Find the update date and return it as a string. """
        try:
            date = work.find('p', class_='datetime').text
        except AttributeError:
            date = None
        return date

    def _get_series(self, work: BeautifulSoup) \
            -> Tuple[Optional[str], Optional[str]]:
        """ Find series info and return it as a tuple. """
        try:
            series = work.find('ul', class_='series')
            part = series.find('strong').text
            s_name = series.find('a').text
        except AttributeError:
            part, s_name = None, None
        return part, s_name

    def _start_from_top(self, from_top: bool) -> bool:
        if from_top is True:
            self.logger.info("Scraping from the top.")
            return True
        elif self.last == self.progress.unscraped_flag:
            self.logger.info(
                f"Last scraped unknown: {self.progress.unscraped_flag}. "
                f"Scraping from the top.")
            return True
        else:
            self.logger.info(f"Picking up from {self.last} ")
            return False
def embed_tweets_hashtags(self, tweets, hashtags):
    with Progress("Calculating hashtag and tweet embeddings",
                  len(hashtags) + len(tweets)) as up:
        [up(self.tweet_embedding(t)) for t in tweets]
        [up(self.hashtag_embedding(h)) for h in hashtags]
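# The tweet-processing snippets above use Progress as a context manager that
# yields a callable: "with Progress(title, total) as up", then up(item) per
# processed item. For the JSON-parsing comprehension in __init__ to keep the
# parsed objects, up() must also return its argument. The class below is only
# a minimal sketch of that assumed interface, not the project's actual
# utils.progress implementation.
class _ProgressContextSketch:
    def __init__(self, title, total):
        self.title = title
        self.total = max(total, 1)
        self.count = 0

    def __enter__(self):
        def update(item=None):
            self.count += 1
            print('\r%s: %d/%d' % (self.title, self.count, self.total),
                  end='', flush=True)
            return item  # pass the item through so comprehensions keep it
        return update

    def __exit__(self, exc_type, exc_value, traceback):
        print()
        return False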