Example #2
def hello_world():
    league_id = request.args.get('league_id')
    if league_id:
        return render_template('index.html', data=scrape(league_id))
        # Stub data for testing the template without a live scrape:
        # return render_template('index.html', data=['test', 'test2', 'test3'])
    else:
        return render_template('formTemplate.html')
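
This view relies on Flask scaffolding the excerpt omits. A minimal sketch of the assumed wiring (the module layout, route path, and web_scraper module are guesses, not the original project's code):

from flask import Flask, render_template, request

from web_scraper import scrape  # assumption: where scrape() is defined

app = Flask(__name__)
app.add_url_rule('/', 'hello_world', hello_world)  # equivalent to @app.route('/')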
Example #3
def similarity_score(self, query, url):
    """Count how many of the query's words appear in the page's keywords."""
    page = scrape(Site(url, True))
    keywords = get_keywords(page.text)
    score = 0
    for word in query:
        if word in keywords:
            score += 1
    return score
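
Site, scrape, and get_keywords are the project's own helpers. A hypothetical call might look like this (searcher stands in for whatever object defines the method):

# Hypothetical usage: score candidate pages against a tokenized query.
query = "agent based search".split()
for url in ["http://example.com/a", "http://example.com/b"]:
    print(url, searcher.similarity_score(query, url))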
Example #4
File: a2.py  Project: kellisfm/Cisc_121
# Imports assumed from the full script (line_to_list is presumably defined
# elsewhere in a2.py):
import dict_maker
import json_writer
import web_scraper


def main():
    # Program execution starts here.
    url = "http://sites.cs.queensu.ca/courses/cisc121/a2/logons.txt"
    print(f"Retrieving data from {url}.")
    data = web_scraper.scrape(url)
    n = len(data)
    print(f"{n} records read.")

    # This loop converts the data from a list of strings to a list of lists.
    for i in range(n):
        data[i] = line_to_list(data[i])

    # CSV file creation
    # Open a file with the requested name, join the values in each inner
    # list with commas, and write one line per record to the CSV.
    csv_filename = "logons.csv"
    with open(csv_filename, "w") as csv_file:
        for record in data:
            csv_file.write(",".join(str(x) for x in record) + "\n")

    # JSON file creation
    # Note: Once your dict_maker.make_dict() function is working,
    # this code will create a JSON file for you.  You don't need
    # to do anything in this section.
    json_filename = "logons.json"
    dict_keys = ["user_id", "os", "pc_id", "datetime"]
    dict_list = []
    for i in range(n):
        dict_list.append(dict_maker.make_dict(dict_keys, data[i]))
    if json_writer.write_json_file(json_filename, dict_list):
        print(f"Write to {json_filename} complete.")
    else:
        print(f"Write to {json_filename} failed.")

    while True:
        # This loop checks every entry in data for the input PC id; when a
        # match also has "Linux" as its operating system, it adds 1 to the
        # Linux logon count. The count is printed after the whole list has
        # been checked.
        print("Checking the number of Linux logons for a specific PC.")
        pc_id = input("Enter the PC's ID number or just press Enter to quit: ")
        logon_count = 0
        if pc_id == "":
            break
        for i in range(n):
            if data[i][2] == pc_id and data[i][1] == "Linux":
                logon_count += 1
        print(f"PC {pc_id} was logged onto in Linux {logon_count} times.")
    print("Done.")
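
dict_maker isn't shown. Given that the call above pairs dict_keys with one record, a minimal sketch of make_dict might be:

# Hypothetical sketch of dict_maker.make_dict (not the course's actual code):
# pair each key with the value at the same position.
def make_dict(keys, values):
    if len(keys) != len(values):
        return None  # assumed error contract
    return dict(zip(keys, values))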
Example #5
def populate_web(self):
    """
    Populates the web and the inverted index keyword
    dictionary with the urls provided
    """
    occurdic = {}
    for url in self.urls:
        page = scrape(url)
        keywords = get_keywords(page.text)
        index = len(self.web)
        self.web.append(Index(index, page.title, page.url, page.links_to))

        for word in keywords:
            value = OccurrenceList()
            value.append(index)
            occurdic[word.lower()] = value
            self.keywords.add(word.lower(), value)
    self.rank_page(occurdic, len(self.web))
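
The Index and OccurrenceList classes aren't shown. The same inverted-index idea can be sketched with plain built-ins (pages and get_keywords are stand-ins for the scraped data and helper above):

# Hypothetical equivalent using built-ins: keyword -> set of page indices.
inverted = {}
for index, page in enumerate(pages):
    for word in get_keywords(page.text):
        inverted.setdefault(word.lower(), set()).add(index)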
Example #6
    # (The excerpt begins mid-call; the animation object is presumably
    # created along these lines.)
    animation = matplotlib.animation.FuncAnimation(fig,
                                                   update,
                                                   frames=num_of_iterations,
                                                   repeat=False)
    canvas.draw()


# Only run if script is called directly.
if __name__ == '__main__':
    ## Read parameters from command line
    ## (if none set, will set defaults as defined in read_cmd.py).
    parameters = read_cmd.parameters(argv)
    num_of_rabbits, lifespan, neighbourhood, num_of_iterations, animate = parameters

    ## Scrape initial x- and y-values from webpage.
    url = 'http://www.geog.leeds.ac.uk/courses/computing/practicals/python/agent-framework/part9/data.html'
    scraped_coordinates = web_scraper.scrape(url)

    ## Create environment from CSV file.
    environment = Environment("in.txt").env

    ## Create rabbits.
    rabbits = create_rabbits(environment, num_of_rabbits, scraped_coordinates,
                             lifespan)

    # Configure Tkinter.
    root = tkinter.Tk()
    root.wm_title("Model")
    fig = matplotlib.pyplot.figure(figsize=(7, 7))
    canvas = matplotlib.backends.backend_tkagg.FigureCanvasTkAgg(fig,
                                                                 master=root)
    canvas._tkcanvas.pack(side=tkinter.TOP, fill=tkinter.BOTH, expand=1)
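
The excerpt stops before the Tk event loop starts; presumably the script continues roughly like this (a guess based on the usual matplotlib-in-Tkinter pattern, not the original file):

# Hypothetical continuation: run the model, which presumably builds the
# FuncAnimation shown at the top of the excerpt, then hand control to Tk.
run_model()
tkinter.mainloop()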
Example #7
# Imports assumed from the full script (create_rabbits and run_model are
# presumably defined earlier in the same file):
from sys import argv

import agentframework
import read_cmd
import web_scraper

if __name__ == '__main__':

    ## Read parameters from command line
    ## (if none set, will set defaults as defined in read_cmd.py).
    parameters = read_cmd.parameters(argv)
    num_of_rabbits, lifespan, neighbourhood, num_of_iterations, animate = parameters
    report = f"""##### Initial Parameters #####
    Number of rabbits: {num_of_rabbits}
    Lifespan: {lifespan}
    Number of Iterations: {num_of_iterations}
    """
    print(report)

    ## Scrape initial x- and y-values from webpage.
    URL = 'http://www.geog.leeds.ac.uk/courses/computing/practicals/python/agent-framework/part9/data.html'
    scraped_coordinates = web_scraper.scrape(URL)
    print(f"You have successfuly scraped coordinates from {URL}.")

    ## Create environment from CSV file.
    file = "in.txt"
    environment = agentframework.Environment(file).env
    print(f"Environment successfully created from {file}.")

    ## Create rabbits.
    rabbits = create_rabbits(environment, num_of_rabbits, scraped_coordinates,
                             lifespan)
    print(f"You have successfuly created {num_of_rabbits} rabbits.")

    ## Run model and time it.
    print("We are now going to run the model.")
    time_result = run_model()
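
run_model isn't shown in the excerpt. A hypothetical sketch consistent with the timing call above (move and eat are assumed agentframework.Rabbit methods):

import time

def run_model():
    # Time a plain iteration loop over all rabbits.
    start = time.perf_counter()
    for _ in range(num_of_iterations):
        for rabbit in rabbits:
            rabbit.move()
            rabbit.eat()
    return time.perf_counter() - start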
Example #8
# Imports assumed from the full script:
import dict_bin_search
import dict_quad_sorts
import long_list_printer
import menu
import parse_logons
import web_scraper


def main():
    """Program execution starts here."""
    url = "http://sites.cs.queensu.ca/courses/cisc121/a2/logons.txt"
    data = web_scraper.scrape(url)
    n = len(data)
    data = parse_logons.logons_to_list_of_dicts(data)
    # The data list (and the file it came from) are in chronological
    # order. That means data is currently sorted by timestamp.
    sort_on_key = "timestamp"

    main_menu_title = "Select an operation:"
    main_menu_choices = [
        "Sort logons data", "Show a selection of the data",
        f"Search for a particular {sort_on_key}"
    ]

    sort_on_menu_title = "Select a key to sort on:"
    sort_on_menu_choices = list(data[0].keys())

    sort_menu_title = "Select a sort algorithm:"
    sort_menu_choices = [
        "Insertion sort", "Bubble sort", "Bubble sort (optimized)",
        "Selection sort"
    ]

    while True:
        choice = menu.do_menu(main_menu_title, main_menu_choices)
        if choice is None:
            return  # Exit main() (and program).
        if choice == 1:  # Sort logons data.
            while True:
                sort_on_choice = menu.do_menu(sort_on_menu_title,
                                              sort_on_menu_choices)

                if sort_on_choice is None:
                    break  # Return to main menu.
                sort_on_key = sort_on_menu_choices[sort_on_choice - 1]
                # Change last choice in main menu to reflect the new
                # sort_on_choice.
                main_menu_choices[-1] = f"Search for a particular {sort_on_key}"

                sort_choice = menu.do_menu(sort_menu_title, sort_menu_choices)
                if sort_choice is None:
                    break  # Return to main menu.

                # If we're here, we can proceed with a sort.
                print()
                if sort_choice == 1:  # Insertion sort
                    sort_function = dict_quad_sorts.insertion_sort
                elif sort_choice == 2:  # Bubble sort
                    sort_function = dict_quad_sorts.bubble_sort
                elif sort_choice == 3:  # Bubble sort (opt)
                    sort_function = dict_quad_sorts.bubble_sort_opt
                else:  # Selection sort
                    sort_function = dict_quad_sorts.selection_sort

                # Do the sort.
                print(f"Sorting on key '{sort_on_key}'...")
                sort_function(data, sort_on_key)
                print("Done.")

                # Show it worked.
                long_list_printer.print_list(data, 5)

        elif choice == 2:  # Show a selection of the data.
            long_list_printer.print_list(data, 10)

        elif choice == 3:  # Search for a specific value.
            search_val = input(f"\nSearch for what {sort_on_key}? ")
            found_at = dict_bin_search.search(data, sort_on_key, search_val)
            if found_at is None:
                print(f"{sort_on_key} {search_val} not found.")
            else:
                print(f"{sort_on_key} {search_val} found at position "\
                      f"{found_at}.")
Example #9
from web_scraper import scrape
from xlsx_writer import save_file

# Even if scrape() raises partway through, the finally clause still writes
# whatever was collected to the spreadsheet.
try:
    scrape()
finally:
    save_file()
Example #10
File: markov.py  Project: sandyg05/scripts
# Imports assumed from the full script (args and root come from argparse and
# path setup that the excerpt doesn't show; WordMapper's module is a guess):
import logging
import os
import pickle
import sys
import time

import pandas as pd

import web_scraper
import word_chainer
from word_mapper import WordMapper  # hypothetical module name


def main():
    if args.mode == "scan":
        if "books.csv" not in os.listdir(root) or "books" not in os.listdir(
                root):
            sys.exit(
                "\nThere is no data to scan. Run the script with scrape mode first."
            )
        else:
            try:
                start = time.time()
                # Creating the WordMapper object loads every word on disk into RAM.
                mapper = WordMapper()
                end = time.time()
                logging.info(
                    "{} words are loaded into memory in {} seconds.".format(
                        mapper.word_count(), (end - start)))

                mapper.map_words()  # This will create mappings for every word.
                prefixes, suffixes = WordMapper.stats()
                logging.info("{} words are mapped to {} suffixes.".format(
                    prefixes, suffixes))

                os.chdir("..")
                with open(args.output, "wb") as output:
                    pickle.dump(
                        mapper.word_mappings, output,
                        pickle.HIGHEST_PROTOCOL)  # Saving the mappings.

                logging.info("Mappings written to {} in {}.".format(
                    args.output, os.getcwd()))
            except KeyboardInterrupt:
                sys.exit(logging.error("Exiting current process."))

    elif args.mode == "scrape":
        if "books.csv" not in os.listdir(
                root):  # If books.csv doesn't exist, creating it first.
            logging.info("Writing books.csv ")
            top_book_urls = web_scraper.scrape()  # Getting the list of urls.
            df = web_scraper.to_df(top_book_urls)  # Adding them to df.
            web_scraper.save(df)  # Saving df to csv.
        else:  # books.csv exists; now download the books themselves.
            df_books = pd.read_csv("books.csv")

            # Create the books directory if it doesn't exist.
            if not os.path.exists(web_scraper.BOOKS_DIR):
                os.makedirs(web_scraper.BOOKS_DIR)

            count = 0
            # Iterate over the .csv rows, writing each book to disk.
            for i in df_books.index:
                if df_books.iloc[i, 0] + ".txt" in os.listdir(web_scraper.BOOKS_DIR):
                    logging.error("{} is already written to disk.".format(
                        df_books.iloc[i, 0]))
                else:
                    try:
                        try:
                            web_scraper.write_book(df_books.iloc[i, 1],
                                                   df_books.iloc[i, 0])
                            count += 1
                        except TypeError:
                            logging.error(
                                "Unable to write {} to the disk.\n".format(
                                    df_books.iloc[i, 0]))
                    except KeyboardInterrupt:
                        logging.info(
                            "{} books written to disk in the current process.\n"
                            .format(count))
                        sys.exit(
                            logging.error(
                                "There are {} books on disk.\n".format(
                                    len(os.listdir(web_scraper.BOOKS_DIR)))))

    elif args.mode == "chain":
        try:
            with open(args.input, "rb") as input_file:
                mapping = pickle.load(input_file)
                chain = word_chainer.chain_words(mapping, args.word)
                print(chain)
        except FileNotFoundError:
            sys.exit(
                logging.error("File {} is not found in {}.".format(
                    args.input, os.getcwd())))
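
word_chainer isn't shown. A hypothetical sketch of chain_words consistent with how it's called above, assuming the pickled mapping maps each word to a collection of observed successors (not the project's actual code):

import random

def chain_words(mapping, start_word, max_length=20):
    # Walk the word -> successors mapping, choosing a random successor
    # at each step until the chain dead-ends or reaches max_length.
    chain = [start_word]
    word = start_word
    for _ in range(max_length - 1):
        successors = mapping.get(word)
        if not successors:
            break
        word = random.choice(list(successors))
        chain.append(word)
    return " ".join(chain)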