Example #1
    def __init__(self, kromsatel_args):
        super().__init__(kromsatel_args)
        self.cleaner = NanoporeReadsCleaner(kromsatel_args)

        self.reads_fpath = self.kromsatel_args.long_read_fpath
        self.chunk_size = self.kromsatel_args.chunk_size

        # Count the input reads up front so cleaning progress can be
        # reported against a known total.
        num_reads_total = \
            _count_unpaired_reads_verbosely(self.reads_fpath)
        self.progress = Progress(num_reads_total)

        output_prefix = fs.rm_fastq_extention(
            os.path.basename(self.reads_fpath)
        )

        # Choose the output binner depending on whether split output was requested.
        if kromsatel_args.split_output:
            self.binner = SplitUnpairedBinner(
                self.kromsatel_args.outdir_path,
                output_prefix,
                self.kromsatel_args.min_len
            )
        else:
            self.binner = SimpleUnpairedBinner(
                self.kromsatel_args.outdir_path,
                output_prefix,
                self.kromsatel_args.min_len
            )
Example #2
def test_add_item_scraped():
    progress = Progress()
    progress.init()
    progress_result = progress.read_progress()
    assert progress_result['items_scraped'] == 0
    assert progress_result['total'] == 0
    assert len(progress_result['items']) == 0

    some_object = {'property': 'value'}
    progress.add_item_scraped(some_object)
    progress_result = progress.read_progress()
    assert len(progress_result['items']) == 1
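None of these snippets include the Progress class itself. A minimal JSON-file-backed sketch that is consistent with the calls exercised by the scraper test functions in these examples could look like the following; the method names come from the examples, while the backing file name and JSON layout are assumptions (the Progress objects in Example #1 and Example #7 are different classes from other projects):

import json


class Progress:
    """Hypothetical file-backed progress tracker (sketch only, not the real implementation)."""

    def __init__(self, path='progress.json'):
        # The storage location is an assumption; the examples never show it.
        self.path = path

    def init(self):
        # Reset to an empty record at the start of a run.
        self._write({'total': 0, 'items_scraped': 0, 'items': [],
                     'finished': False, 'failed': False})

    def read_progress(self):
        with open(self.path) as handle:
            return json.load(handle)

    def add_item_scraped(self, item):
        data = self.read_progress()
        data['items'].append(item)
        self._write(data)

    def save_total_number_items(self, total):
        data = self.read_progress()
        data['total'] = total
        self._write(data)

    def read_total_number_items(self):
        return self.read_progress()['total']

    def save_number_items_scraped_so_far(self, count):
        data = self.read_progress()
        data['items_scraped'] = count
        self._write(data)

    def read_number_items_scraped_so_far(self):
        return self.read_progress()['items_scraped']

    def save_process_progress(self, finished, failed):
        data = self.read_progress()
        data['finished'] = finished
        data['failed'] = failed
        self._write(data)

    def _write(self, data):
        with open(self.path, 'w') as handle:
            json.dump(data, handle)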
Example #3
def run_freida_scraping():
    # Pre-initialise browser so the except block cannot hit an unbound name
    # if setup_browser() itself raises.
    browser = None
    try:
        progress = Progress()
        progress.init()
        progress.save_process_progress(False, False)
        browser = setup_browser()
        config = FreidaConfig(browser)
        url = config.initial_page
        main(browser, url, config)
        browser.quit()
    except Exception:
        if browser:
            browser.quit()
Example #4
def test_save_number_items_scraped_so_far():
    progress = Progress()
    progress.init()
    total_number_items = progress.read_total_number_items()
    items_scraped_so_far = progress.read_number_items_scraped_so_far()
    assert total_number_items == 0
    assert items_scraped_so_far == 0

    progress.save_number_items_scraped_so_far(10)
    progress.save_total_number_items(5)
    items_scraped_so_far = progress.read_number_items_scraped_so_far()
    total_number_items = progress.read_total_number_items()
    assert items_scraped_so_far == 10
    assert total_number_items == 5
Example #5
def run_scraping_task():
    # Pre-initialise browser so the cleanup below cannot hit an unbound name.
    browser = None
    try:
        progress = Progress()
        progress.init()
        progress.save_process_progress(False, False)
        # TODO: Remove if not using selenium
        browser = setup()

        config = Config(browser)  # TODO: Pass browser only if the crawling is selenium based
        url = config.initial_page
        main(browser, config)
        browser.quit()
    except Exception:
        if browser:
            browser.quit()
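Both runner functions above share the same cleanup concern: the browser must be quit even when scraping fails. An alternative to pre-initialising browser to None and quitting in the except block is a try/finally block, sketched below with the setup, Config and main names taken from Example #5 (everything else is an assumption). Unlike the original except Exception, this variant lets the error propagate to the caller after cleanup.

def run_scraping_task_with_finally():
    progress = Progress()
    progress.init()
    progress.save_process_progress(False, False)

    browser = None
    try:
        browser = setup()
        config = Config(browser)
        main(browser, config)
    finally:
        # Runs whether main() returned normally or raised, so the browser
        # process is never leaked.
        if browser:
            browser.quit()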
Example #6
import sys
import os
from flask_bootstrap import Bootstrap
from flask import Flask, render_template, jsonify, redirect
from src.progress import Progress
from src.crawling_threading import CrawlingThreading

if getattr(sys, 'frozen', False):
    # Running from a frozen (PyInstaller) bundle: templates are unpacked under _MEIPASS.
    template_folder = os.path.join(sys._MEIPASS, 'templates')
else:
    template_folder = os.path.join(os.path.dirname(__file__), 'templates')
app = Flask(__name__, template_folder=template_folder)

Bootstrap(app)
progress = Progress()
progress.init()


@app.route('/')
def app_entrypoint():
    global progress
    try:
        progress_result = progress.read_progress()
    except Exception:
        progress_result = progress
    return render_template('index.html', progress=progress_result)


@app.route('/scrape')
def scrape_beerwulf():
Example #7
    def notify(self, file) -> None:
        self.queue.put(Progress(file, self.done.value, self.failed.value, self.files_count))
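Unlike the file-backed tracker in the scraper examples, this Progress is a value object built positionally from the notifier's state. A dataclass sketch consistent with that constructor call (field names are inferred from the call site and are assumptions):

from dataclasses import dataclass


@dataclass
class Progress:
    """Hypothetical progress message passed through the queue (sketch only)."""
    file: str
    done: int
    failed: int
    files_count: int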
Example #8
def test_read_total_number_items():
    progress = Progress()
    progress.init()
    total_number_items = progress.read_total_number_items()
    assert total_number_items == 0
Example #9
    def crawl_all_pages_to_end(self, initial_url, file_to_save_results):
        # Create the progress tracker before the try block so the except
        # handler below can always record a failure.
        progress = Progress()

        try:
            url_first_load = initial_url
            self.browser.get(
                "https://freida.ama-assn.org/Freida/#/programs?program=residencies&specialtiesToSearch=140"
            )
            self.config.answer_prompt_questions()

            # Wait for page to load
            wait = WebDriverWait(self.browser,
                                 10,
                                 ignored_exceptions=[
                                     StaleElementReferenceException,
                                     NoSuchElementException
                                 ])
            items_presence = EC.presence_of_element_located(
                (By.CSS_SELECTOR, self.config.next_button_css_selector))
            wait.until(items_presence)

            self.browser.execute_script('window.open("' + url_first_load +
                                        '");')

            # Switch to the new tab
            self.browser.switch_to.window(self.browser.window_handles[1])

            # Create dataframe to hold the data
            dataframe_file = Path(file_to_save_results)
            if not dataframe_file.exists():
                dataframe = pd.DataFrame()
            else:
                dataframe = pd.read_csv(file_to_save_results)

            jsonData = self.loadJsonContent(self.browser.page_source)

            total_number_items = self.total_number_items_to_scrape(jsonData)
            progress.save_total_number_items(total_number_items)
            progress.save_process_progress(False, False)

            for pagination_item in jsonData['solrPagination']:

                url_to_parse = urljoin(self.config.host,
                                       pagination_item["url"])
                self.browser.get(url_to_parse)
                page_json_data = self.loadJsonContent(self.browser.page_source)

                for item in page_json_data["searchResults"]:
                    cleaned_item = self.extract_item_data(item)
                    item_serie = pd.Series(cleaned_item,
                                           index=cleaned_item.keys())

                    dataframe = dataframe.append(item_serie,
                                                 ignore_index=True,
                                                 sort=False)
                    progress.save_number_items_scraped_so_far(
                        dataframe.shape[0])
                    progress.add_item_scraped(cleaned_item)
                    progress.save_process_progress(False, False)

                dataframe.to_csv(file_to_save_results)

            progress.save_process_progress(True, False)
            return True
        except Exception as e:
            print(str(e))
            progress.save_process_progress(True, True)
            return False
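A portability note on the snippet above: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so on a current pandas the row-accumulation step needs pandas.concat instead. A minimal equivalent of the inner-loop update, keeping the variable names used above:

import pandas as pd

# Equivalent of: dataframe = dataframe.append(item_serie, ignore_index=True, sort=False)
dataframe = pd.concat(
    [dataframe, item_serie.to_frame().T],  # wrap the Series as a one-row DataFrame
    ignore_index=True,
    sort=False,
)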