def __init__(self, kromsatel_args):
    super().__init__(kromsatel_args)
    self.cleaner = NanoporeReadsCleaner(kromsatel_args)
    self.reads_fpath = self.kromsatel_args.long_read_fpath
    self.chunk_size = self.kromsatel_args.chunk_size

    num_reads_total = \
        _count_unpaired_reads_verbosely(self.reads_fpath)
    self.progress = Progress(num_reads_total)

    output_prefix = fs.rm_fastq_extention(
        os.path.basename(self.reads_fpath)
    )

    if kromsatel_args.split_output:
        self.binner = SplitUnpairedBinner(
            self.kromsatel_args.outdir_path,
            output_prefix,
            self.kromsatel_args.min_len
        )
    else:
        self.binner = SimpleUnpairedBinner(
            self.kromsatel_args.outdir_path,
            output_prefix,
            self.kromsatel_args.min_len
        )
def test_add_item_scraped():
    progress = Progress()
    progress.init()

    progress_result = progress.read_progress()
    assert progress_result['items_scraped'] == 0
    assert progress_result['total'] == 0
    assert len(progress_result['items']) == 0

    some_object = {'property': 'value'}
    progress.add_item_scraped(some_object)

    progress_result = progress.read_progress()
    assert len(progress_result['items']) == 1
def run_freida_scraping():
    browser = None
    try:
        progress = Progress()
        progress.init()
        progress.save_process_progress(False, False)

        browser = setup_browser()
        config = FreidaConfig(browser)
        url = config.initial_page
        main(browser, url, config)
        browser.quit()
    except Exception:
        if browser:
            browser.quit()
def test_save_number_items_scraped_so_far():
    progress = Progress()
    progress.init()

    total_number_items = progress.read_total_number_items()
    items_scraped_so_far = progress.read_number_items_scraped_so_far()
    assert total_number_items == 0
    assert items_scraped_so_far == 0

    progress.save_number_items_scraped_so_far(10)
    progress.save_total_number_items(5)

    items_scraped_so_far = progress.read_number_items_scraped_so_far()
    total_number_items = progress.read_total_number_items()
    assert items_scraped_so_far == 10
    assert total_number_items == 5
def run_scraping_task():
    browser = None
    try:
        progress = Progress()
        progress.init()
        progress.save_process_progress(False, False)

        # TODO: Remove if not using Selenium
        browser = setup()
        config = Config(browser)
        # TODO: Pass browser if the crawling is Selenium-based
        url = config.initial_page
        main(browser, config)
        browser.quit()
    except Exception:
        if browser:
            browser.quit()
import sys
import os

from flask_bootstrap import Bootstrap
from flask import Flask, render_template, jsonify, redirect

from src.progress import Progress
from src.crawling_threading import CrawlingThreading

# When running as a frozen (PyInstaller) bundle, templates are unpacked
# under sys._MEIPASS; otherwise they live next to this module.
if getattr(sys, 'frozen', False):
    template_folder = os.path.join(sys._MEIPASS, 'templates')
else:
    template_folder = os.path.join(os.path.dirname(__file__), 'templates')
app = Flask(__name__, template_folder=template_folder)

Bootstrap(app)

progress = Progress()
progress.init()


@app.route('/')
def app_entrypoint():
    global progress
    try:
        progress_result = progress.read_progress()
    except Exception:
        progress_result = progress
    return render_template('index.html', progress=progress_result)


@app.route('/scrape')
def scrape_beerwulf():
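    # The original handler body is not shown here. The lines below are a
    # hypothetical sketch only: they assume CrawlingThreading is a thread-like
    # wrapper with a no-argument constructor and a start() method, and that
    # the route simply kicks off crawling and redirects home. None of this is
    # confirmed by the source.
    crawling_thread = CrawlingThreading()
    crawling_thread.start()
    return redirect('/')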
def notify(self, file) -> None:
    self.queue.put(
        Progress(file, self.done.value, self.failed.value, self.files_count)
    )
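# The Progress used by notify() above looks like a message object carrying
# (file, done, failed, files_count) rather than the file-backed tracker used
# in the other snippets. A minimal sketch, assuming a plain dataclass and a
# simple queue consumer; the consumer function name report_progress is
# hypothetical, introduced only for illustration.
from dataclasses import dataclass
from queue import Queue


@dataclass
class Progress:
    file: str
    done: int
    failed: int
    files_count: int


def report_progress(queue: Queue) -> None:
    # Drain progress messages and print one status line per processed file.
    while True:
        progress = queue.get()
        processed = progress.done + progress.failed
        print(f"{processed}/{progress.files_count} processed "
              f"(last: {progress.file}, failed: {progress.failed})")
        if processed >= progress.files_count:
            break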
def test_read_total_number_items():
    progress = Progress()
    progress.init()

    total_number_items = progress.read_total_number_items()
    assert total_number_items == 0
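# A minimal sketch of the src.progress.Progress API exercised by the tests in
# this section, assuming a JSON-file-backed store. The file name
# 'progress.json' and the internal layout are assumptions for illustration,
# not the project's actual implementation.
import json


class Progress:
    _FPATH = 'progress.json'  # assumed storage location

    def init(self):
        # Reset the store to an empty, not-yet-finished state.
        self._write({'items_scraped': 0, 'total': 0, 'items': [],
                     'finished': False, 'failed': False})

    def read_progress(self):
        return self._read()

    def add_item_scraped(self, item):
        data = self._read()
        data['items'].append(item)
        self._write(data)

    def save_number_items_scraped_so_far(self, number):
        data = self._read()
        data['items_scraped'] = number
        self._write(data)

    def read_number_items_scraped_so_far(self):
        return self._read()['items_scraped']

    def save_total_number_items(self, total):
        data = self._read()
        data['total'] = total
        self._write(data)

    def read_total_number_items(self):
        return self._read()['total']

    def save_process_progress(self, finished, failed):
        # Record whether the crawl has finished and whether it failed.
        data = self._read()
        data['finished'] = finished
        data['failed'] = failed
        self._write(data)

    def _read(self):
        with open(self._FPATH) as handle:
            return json.load(handle)

    def _write(self, data):
        with open(self._FPATH, 'w') as handle:
            json.dump(data, handle)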
def crawl_all_pages_to_end(self, initial_url, file_to_save_results):
    progress = Progress()
    try:
        url_first_load = initial_url
        self.browser.get(
            "https://freida.ama-assn.org/Freida/#/programs?program=residencies&specialtiesToSearch=140"
        )
        self.config.answer_prompt_questions()

        # Wait for the page to load
        wait = WebDriverWait(self.browser, 10, ignored_exceptions=[
            StaleElementReferenceException,
            NoSuchElementException
        ])
        items_presence = EC.presence_of_element_located(
            (By.CSS_SELECTOR, self.config.next_button_css_selector))
        wait.until(items_presence)

        self.browser.execute_script('window.open("' + url_first_load + '");')
        # Switch to the new tab
        self.browser.switch_to.window(self.browser.window_handles[1])

        # Create a dataframe to hold the data, or resume from an existing file
        dataframe_file = Path(file_to_save_results)
        if not dataframe_file.exists():
            dataframe = pd.DataFrame()
        else:
            dataframe = pd.read_csv(file_to_save_results)

        jsonData = self.loadJsonContent(self.browser.page_source)

        total_number_items = self.total_number_items_to_scrape(jsonData)
        progress.save_total_number_items(total_number_items)
        progress.save_process_progress(False, False)

        for pagination_item in jsonData['solrPagination']:
            url_to_parse = urljoin(self.config.host, pagination_item["url"])
            self.browser.get(url_to_parse)
            page_json_data = self.loadJsonContent(self.browser.page_source)

            for item in page_json_data["searchResults"]:
                cleaned_item = self.extract_item_data(item)
                item_serie = pd.Series(cleaned_item, index=cleaned_item.keys())
                dataframe = dataframe.append(item_serie,
                                             ignore_index=True, sort=False)
                progress.save_number_items_scraped_so_far(dataframe.shape[0])
                progress.add_item_scraped(cleaned_item)

            progress.save_process_progress(False, False)
            dataframe.to_csv(file_to_save_results)

        progress.save_process_progress(True, False)
        return True
    except Exception as e:
        print(str(e))
        progress.save_process_progress(True, True)
        return False