Example #1
import os

import url_utilities
import database_utilities
import read_word


def main(database: str, url_list_file: str):
    big_word_list = []
    print("we are going to work with " + database)
    print("we are going to scan " + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)

    for url in urls:
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)

    print(len(big_word_list))

    # database code
    os.chdir(os.path.dirname(__file__))
    # make sure the db file is in the same directory as the .py file
    path = os.path.join(os.getcwd(), "words.db")
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path,
                                              word_list=big_word_list)

    # read back a local Word document via the read_word helper
    filename = 'docfile.docx'
    text_data = read_word.getText(filename)
    print(text_data)
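Every example below calls into helper modules that are not shown here. A minimal sketch of what they could look like, assuming the third-party requests, beautifulsoup4 and python-docx packages (only the function names come from the calls above; everything else is an assumption, and in the course layout these would live in url_utilities.py and read_word.py rather than one file):

import re

import requests                  # assumption: requests is installed
from bs4 import BeautifulSoup    # assumption: beautifulsoup4 is installed
from docx import Document        # assumption: python-docx is installed


def load_urls_from_file(file_path):
    # one URL per non-empty line of the input file
    with open(file_path) as url_file:
        return [line.strip() for line in url_file if line.strip()]


def load_page(url):
    # download the page and return its HTML as a string
    response = requests.get(url)
    response.raise_for_status()
    return response.text


def scrape_page(page_contents):
    # strip the markup and return a raw list of lower-cased words
    text = BeautifulSoup(page_contents, "html.parser").get_text()
    return re.findall(r"[a-z]+", text.lower())


def getText(filename):
    # join the text of every paragraph in a .docx file (what read_word.getText appears to do)
    document = Document(filename)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)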
Example #2
import url_utilities
import database_utilities


def main(database, url_list_file):
    big_word_list = []
    print('Db: ' + database)
    print('input list: ' + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print('reading: ', url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)

    database_utilities.create_database(database)
    print('word list length: ', len(big_word_list))
    database_utilities.save_words_to_database(big_word_list[:250000])
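All of the examples delegate storage to a database_utilities module that is not shown, and they call it with slightly different parameter names (word_list, words, words_list). A minimal SQLite-backed sketch, using the keyword names from the later examples (everything beyond the two function names called above is an assumption):

# database_utilities.py -- hypothetical sketch built on the standard sqlite3 module
import sqlite3
from collections import Counter


def create_database(database_path):
    # create the words table if it does not already exist
    connection = sqlite3.connect(database_path)
    connection.execute(
        "CREATE TABLE IF NOT EXISTS words (word TEXT PRIMARY KEY, count INTEGER)")
    connection.commit()
    connection.close()


def save_words_to_database(database_path, words_list):
    # store one row per distinct word together with its occurrence count
    counts = Counter(words_list)
    connection = sqlite3.connect(database_path)
    connection.executemany(
        "INSERT OR REPLACE INTO words (word, count) VALUES (?, ?)",
        counts.items())
    connection.commit()
    connection.close()
    return True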
Example #3
import os

import url_utilities
import database_utilities


def main(database: str, url_list_file: str):
    big_word_list = []
    print("we are going to work with " + database)
    print("we are going to scan " + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_content)
        big_word_list.extend(words)
    os.chdir(os.path.dirname(__file__))
    path = os.path.join(os.getcwd(), "words.db")
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path,
                                              words=big_word_list)
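None of the examples show how main(database, url_list_file) is invoked. A typical command-line entry point, sketched here with argparse (the flag names and help text are assumptions, not part of the original exercise), would look like this:

import argparse


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="scrape pages and store word counts")
    parser.add_argument("-d", "--database", required=True,
                        help="database name to work with")
    parser.add_argument("-i", "--input", required=True,
                        help="file containing one URL per line")
    args = parser.parse_args()
    main(database=args.database, url_list_file=args.input)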
Example #4
import os

import url_utilities
import database_utilities


def main(database: str, url_list_file: str):
    big_word_list = []
    print('We are going to work with ' + database)
    print('We will scan ' + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print('Reading ' + url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)

    # database code
    os.chdir(os.path.dirname(__file__))
    path = os.path.join(os.getcwd(), 'words.db')
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path,
                                              words_list=big_word_list)
Example #5
import os

import url_utilities
import database_utilities


def main(database: str, url_list_file: str):
    big_word_list = []
    print('we are going to work with ' + database)
    print('we are going to scan ' + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(page_contents=page_content)
        big_word_list.extend(words)

    # database code - mitigating cross-platform file path issues
    # dunder __file__ gives the location of the file we're currently running
    os.chdir(os.path.dirname(__file__))
    path = os.path.join(os.getcwd(), "words.db")
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path,
                                              words_list=big_word_list)
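The comments in this example explain the cross-platform concern: __file__ locates the script being run, and the database file is placed next to it. The same idea, sketched with pathlib and without changing the working directory (an alternative, not what the examples use):

from pathlib import Path

# build the database path next to the current script, portably
path = Path(__file__).resolve().parent / "words.db"
database_utilities.create_database(database_path=str(path))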
Example #6
import os

import url_utilities
import database_utilities


def main(database: str, url_list_file: str):
    big_word_list = []
    print("We are going to work with " + database)
    print("We are going to scan " + url_list_file)
    urls = url_utilities.load_urls_from_file(url_list_file)
    for url in urls:
        print("reading " + url)
        page_content = url_utilities.load_page(url=url)
        words = url_utilities.scrape_page(
            page_contents=page_content)  # this gives a raw list of words
        big_word_list.extend(words)

    # database code
    # this should run cross-platform, so building the path carefully matters
    os.chdir(os.path.dirname(__file__))
    path = os.path.join(os.getcwd(), "words.db")
    database_utilities.create_database(database_path=path)
    database_utilities.save_words_to_database(database_path=path,
                                              words_list=big_word_list)

    def test_save_words_to_database(self):
        results = save_words_to_database("word.db",
                                         ["word1", "word2", "word3"])
        assert results is True
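The test above hard-codes a database file name and depends on whatever signature its local save_words_to_database has. A more self-contained variant, sketched with pytest's tmp_path fixture and the keyword arguments used in Examples #5 and #6 (both assumptions), could look like this:

import database_utilities


def test_save_words_to_database(tmp_path):
    # tmp_path is a per-test temporary directory provided by pytest
    db_path = str(tmp_path / "words.db")
    database_utilities.create_database(database_path=db_path)
    result = database_utilities.save_words_to_database(
        database_path=db_path, words_list=["word1", "word2", "word3"])
    assert result is True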