def main(database: str, url_list_file: str):
    """Scrape words from the listed URLs, save them, then print a Word document.

    Args:
        database: Name echoed in the start-up message. It is not used to pick
            the database file; the words are always written to ``words.db``
            in this script's directory.
        url_list_file: Passed to ``url_utilities.load_urls_from_file`` to
            obtain the URLs to scan.
    """
    print("we are going to work with " + database)
    print("we are going to scan " + url_list_file)

    big_word_list = []
    for url in url_utilities.load_urls_from_file(url_list_file):
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        big_word_list.extend(
            url_utilities.scrape_page(page_contents=page_content))
    print(len(big_word_list))

    # database code
    # Make sure the db file is in the same directory as the .py file.
    # abspath guards against a bare relative __file__ (e.g. "main.py"): its
    # dirname is "" and os.chdir("") raises FileNotFoundError.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    path = os.path.join(os.getcwd(), "words.db")
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path,
                                              word_list=big_word_list)

    # Relative name: resolved against the script directory set by chdir above.
    filename = 'docfile.docx'
    text_data = read_word.getText(filename)
    print(text_data)
def main(database, url_list_file):
    """Gather words from every URL in *url_list_file* and hand them to the database helpers.

    ``database`` is forwarded to ``database_utilities.create_database``.
    Only the first 250000 collected words are passed on to
    ``database_utilities.save_words_to_database``.
    """
    print('Db: ' + database)
    print('input list: ' + url_list_file)

    collected = []
    for address in url_utilities.load_urls_from_file(url_list_file):
        print('reading: ', address)
        html = url_utilities.load_page(url=address)
        collected.extend(url_utilities.scrape_page(page_contents=html))

    database_utilities.create_database(database)
    print('length words is: ', len(collected))
    # Cap the amount of data sent to the save helper.
    capped = collected[:250000]
    database_utilities.save_words_to_database(capped)
def main(database: str, url_list_file: str):
    """Scrape words from the listed URLs and save them to ``words.db``.

    Args:
        database: Name echoed in the start-up message. It is not used to pick
            the database file; the words are always written to ``words.db``
            in this script's directory.
        url_list_file: Passed to ``url_utilities.load_urls_from_file`` to
            obtain the URLs to scan.
    """
    print("we are going to work with " + database)
    print("we are going to scan " + url_list_file)

    big_word_list = []
    for url in url_utilities.load_urls_from_file(url_list_file):
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        big_word_list.extend(url_utilities.scrape_page(page_content))

    # abspath guards against a bare relative __file__ (e.g. "main.py"): its
    # dirname is "" and os.chdir("") raises FileNotFoundError.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    path = os.path.join(os.getcwd(), "words.db")
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path,
                                              words=big_word_list)
def main(database: str, url_list_file: str):
    """Scrape words from the listed URLs and save them to ``words.db``.

    Args:
        database: Name echoed in the start-up message. It is not used to pick
            the database file; the words are always written to ``words.db``
            in this script's directory.
        url_list_file: Passed to ``url_utilities.load_urls_from_files`` to
            obtain the URLs to scan.
    """
    print('We are going to work with ' + database)
    print('We will scan ' + url_list_file)

    big_word_list = []
    for url in url_utilities.load_urls_from_files(url_list_file):
        print('Reading ' + url)
        page_content = url_utilities.load_page(url=url)
        big_word_list.extend(
            url_utilities.scrape_page(page_contents=page_content))

    # database code
    # abspath guards against a bare relative __file__ (e.g. "main.py"): its
    # dirname is "" and os.chdir("") raises FileNotFoundError.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    path = os.path.join(os.getcwd(), 'words.db')
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path,
                                              words_list=big_word_list)
def main(database: str, url_list_file: str):
    """Scrape words from the listed URLs and save them to ``words.db``.

    Args:
        database: Name echoed in the start-up message. It is not used to pick
            the database file; the words are always written to ``words.db``
            in this script's directory.
        url_list_file: Passed to ``url_utilities.load_urls_from_file`` to
            obtain the URLs to scan.
    """
    print('we are going to work with ' + database)
    print('we are going to scan ' + url_list_file)

    big_word_list = []
    for url in url_utilities.load_urls_from_file(url_list_file):
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        big_word_list.extend(
            url_utilities.scrape_page(page_contents=page_content))

    # database code - mitigating cross-platform file path issues
    # dunder __file__ gives the location of the file we're currently running.
    # abspath guards against a bare relative __file__ (e.g. "main.py"): its
    # dirname is "" and os.chdir("") raises FileNotFoundError.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    path = os.path.join(os.getcwd(), "words.db")
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path,
                                              words_list=big_word_list)
def main(database: str, url_list_file: str):
    """Scrape words from the listed URLs and save them to ``words.db``.

    Args:
        database: Name echoed in the start-up message. It is not used to pick
            the database file; the words are always written to ``words.db``
            in this script's directory.
        url_list_file: Passed to ``url_utilities.load_urls_from_file`` to
            obtain the URLs to scan.
    """
    print("We are going to work with " + database)
    print("We are going to scan " + url_list_file)

    big_word_list = []
    for url in url_utilities.load_urls_from_file(url_list_file):
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        # this gives a raw list of words
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)

    # database code
    # The script is meant to be cross-platform, so the path is built with
    # os.path rather than by hand.
    # abspath guards against a bare relative __file__ (e.g. "main.py"): its
    # dirname is "" and os.chdir("") raises FileNotFoundError.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    path = os.path.join(os.getcwd(), "words.db")
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path,
                                              words_list=big_word_list)
def test_save_words_to_database(self):
    """Saving three sample words to ``word.db`` must return a value equal to ``True``."""
    sample_words = ["word1", "word2", "word3"]
    outcome = save_words_to_database("word.db", sample_words)
    assert outcome == True