for fi in files: if os.path.getsize(root + fi) >= min_size: book = BookItem() SQL_INSERT_QUERY = "" book['author'] = fi.split('___')[0] book['title'] = fi.split('___')[1][:-4] with open(root + fi, 'r') as doc_file: book['content'] = doc_file.read() author = Author(book['author']) author_queried_id = connect_to_database.test_if_author_exists(author) """ The test_if_author_exists function returns -1 if the name of the author is not found on the database. Visit connect_to_database.py for more details. """ if author_queried_id is -1: SQL_INSERT_QUERY += author.get_author_insert_query() """ There is a checking in the Document class to see if author_queried_id is 0, which indicates the author was not found in the database
for fi in files: if os.path.getsize(root + fi) >= min_size: book = BookItem() SQL_INSERT_QUERY = "" book['author'] = fi.split('___')[0] book['title'] = fi.split('___')[1][:-4] with open(root + fi, 'r') as doc_file: book['content'] = doc_file.read() author = Author(book['author']) author_queried_id = connect_to_database.test_if_author_exists( author) """ The test_if_author_exists function returns -1 if the name of the author is not found on the database. Visit connect_to_database.py for more details. """ if author_queried_id is -1: SQL_INSERT_QUERY += author.get_author_insert_query() """ There is a checking in the Document class to see if author_queried_id is 0, which indicates the author was not found in the database In this case, the script will first insert the info of that author into the database. Then,
def process_book_item(book):
    """
    Store one crawled Gutenberg book item in the database.

    The function receives the book item produced by the Gutenberg crawler
    pipeline. It builds the author insert query (only when the author is not
    already in the database), reads the text of the book from the downloaded
    file, and finally executes the combined author + document insert query.

    Args:
        book: mapping-like book item. The keys read here are 'host_path'
            (its first element is appended to dir_path to locate the
            downloaded file), 'author', 'title', 'lang', 'loc_class',
            'rdate' and 'gutenberg_url'.

    Returns:
        False when the downloaded file cannot be found or the archive
        cannot be read; None once the insert query has been executed.
    """
    zip_path = dir_path + book['host_path'][0]
    author = Author(book['author'])
    author_queried_id = connect_to_database.test_if_author_exists(author)

    sql_insert_query = ''
    # test_if_author_exists returns -1 when the author name is not found in
    # the database (see connect_to_database.py). Compare by value: "is -1"
    # only works by accident of CPython's small-integer caching.
    if author_queried_id == -1:
        sql_insert_query += author.get_author_insert_query()

    # The file type has to be checked because Gutenberg sometimes provides a
    # plain txt file instead of a zip archive.
    try:
        if zipfile.is_zipfile(zip_path):
            with zipfile.ZipFile(zip_path, 'r') as z:
                z.extractall(path_to_store_txt)
                # Only the first archive member is read as the book content.
                content = read_file_get_content(z.namelist()[0])
        else:
            content = read_file_get_content(zip_path)
    except (NotImplementedError, zipfile.BadZipfile):
        # NotImplementedError: compression method not supported by zipfile.
        # BadZipfile: the archive passed is_zipfile but is corrupt.
        # Parenthesised single-argument print emits the same text on
        # Python 2 and stays valid syntax on Python 3.
        print("Broken zip file")
        return False
    except IOError:
        print("File not found")
        return False

    # When the author was not found, the author insert query built above runs
    # first and the document is expected to pick up the newly generated
    # author id; otherwise the id returned by test_if_author_exists is used.
    # NOTE(review): the original comment said the Document class checks for
    # author_queried_id == 0, while the lookup is documented to return -1 --
    # confirm which sentinel Document actually expects.
    sql_insert_query += Document(-1, author_queried_id, book['title'],
                                 book['lang'], book['loc_class'],
                                 book['rdate'], content,
                                 book['gutenberg_url']).get_doc_insert_query()
    connect_to_database.execute_insert_query(sql_insert_query)