# NOTE: this assumes 'data_warehouse' lives in the data_etl package alongside
# plaintext_data_etl; adjust the import if it is defined elsewhere.
from data_etl import data_warehouse


def formulate_set_paragraphs(author_id, set_size):
    """Yield the author's paragraph feature rows in batches of 'set_size'."""
    document_list = data_warehouse.get_docs_from_database_document_by_author_id(author_id)
    # Each returned row is a dict, so look up the cross-tab features by its 'doc_id'.
    paragraph_list = [data_warehouse.get_cross_tab_features_from_database_by_doc_id(doc['doc_id'])
                      for doc in document_list]
    for idx in range(0, len(paragraph_list), set_size):
        yield paragraph_list[idx:idx + set_size]
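# The slicing idiom above yields fixed-size batches, with a shorter final
# batch when the list length is not a multiple of 'set_size'. A minimal,
# self-contained sketch of the same pattern (illustrative values only):
#
#     items = list(range(7))
#     batches = [items[i:i + 3] for i in range(0, len(items), 3)]
#     # batches == [[0, 1, 2], [3, 4, 5], [6]]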
from data_etl import plaintext_data_etl
from data_etl.db_schema_classes.document import Document

# The fact-table query returns a list of rows; keep only each row's 'doc_id'.
# Use the database column name 'doc_content' to reference the content.
# See connect_to_database.py for more details.
# A set makes the membership checks below O(1) instead of O(n).
docs_in_fact = {row['doc_id'] for row in data_warehouse.get_doc_ids_from_database_fact()}

for author_id in range(8940, 8950):
    # Fetching one author's documents at a time keeps memory usage low:
    # documents are retrieved sequentially rather than all at once.
    print("do author id", author_id)
    docs = data_warehouse.get_docs_from_database_document_by_author_id(author_id)
    for doc in docs:
        # Skip documents that are already in the fact table.
        if doc['doc_id'] in docs_in_fact:
            print(doc['doc_id'], "has already been done")
            continue
        print("Dumping novel with doc_id", doc['doc_id'])
        plaintext_data_etl.read_paragraphs_and_split(
            Document(doc['doc_id'], doc['author_id'], doc['doc_title'],
                     'lang', 'loc', '1882-02-25', doc['doc_content'], 'url'))
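# A hypothetical end-to-end sketch: once the loop above has dumped an author's
# paragraphs into the fact table, the generator defined earlier can serve the
# feature rows in batches. The author id 8940 and batch size 50 are
# illustrative values, not taken from this codebase.
#
#     for paragraph_set in formulate_set_paragraphs(8940, 50):
#         print(len(paragraph_set), "paragraph feature rows in this batch")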