def import_dataset(database_id, dataset, directories, **kwargs):
    """Transfer documents and import content into database.

    Positional arguments:
    database_id -- the dict key specifying the database in django
    dataset -- an AbstractDataset
    directories -- dict returned from get_common_working_directories

    Keyword arguments:
    public -- make the dataset public (default False)
    public_documents -- make the document text available (default False)
    verbose -- print output about progress (default False)

    Return the dataset's name/identifier.
    """
    # setdefault both reads the flag and guarantees it is forwarded to
    # create_dataset via **kwargs below
    verbose = kwargs.setdefault('verbose', False)
    if verbose:
        print('Importing dataset: ' + dataset.name)

    dataset_dir = directories['dataset']

    # Optionally track how many queries each import phase issues.
    if DATABASE_OPTIMIZE_DEBUG:
        con = connections[database_id]
        query_count = len(con.queries)

    meta_types_db = get_all_metadata_types(database_id)

    if verbose:
        print('Creating dataset entry.')
    dataset_db = create_dataset(database_id, dataset, dataset_dir,
                                meta_types_db, **kwargs)

    if DATABASE_OPTIMIZE_DEBUG:
        print('Dataset and metadata query count:',
              len(con.queries) - query_count)
        query_count = len(con.queries)

    if verbose:
        print('Copying documents and creating document entries.')
    # Skip document creation if a previous (partial) import already made them.
    if not dataset_db.documents.exists():
        create_documents(database_id, dataset_db, dataset, meta_types_db,
                         verbose)

    if DATABASE_OPTIMIZE_DEBUG:
        print('Documents and metadata query count:',
              len(con.queries) - query_count)

    # Only flip the dataset visible once documents are fully in place.
    dataset_db.visible = True
    dataset_db.save()

    if verbose:
        print('Running dataset metrics.')
    run_metrics(database_id, dataset_db.name, None, BASIC_DATASET_METRICS)

    if verbose:
        print('Done importing ' + dataset.name + '.')
    return dataset.name
def import_dataset(database_id, dataset, directories, **kwargs):
    """Transfer documents and import content into database.

    Positional arguments:
    database_id -- the dict key specifying the database in django
    dataset -- an AbstractDataset
    directories -- dict returned from get_common_working_directories

    Keyword arguments:
    public -- make the dataset public (default False)
    public_documents -- make the document text available (default False)
    verbose -- print output about progress (default False)

    Return the dataset's name/identifier.
    """
    # Read the verbosity flag while also ensuring it is present in kwargs,
    # which are passed straight through to create_dataset.
    verbose = kwargs.setdefault('verbose', False)
    if verbose:
        print('Importing dataset: ' + dataset.name)

    dataset_dir = directories['dataset']

    # When query-count debugging is on, snapshot the connection's query log.
    if DATABASE_OPTIMIZE_DEBUG:
        con = connections[database_id]
        query_count = len(con.queries)

    meta_types_db = get_all_metadata_types(database_id)

    if verbose:
        print('Creating dataset entry.')
    dataset_db = create_dataset(database_id, dataset, dataset_dir,
                                meta_types_db, **kwargs)

    if DATABASE_OPTIMIZE_DEBUG:
        print('Dataset and metadata query count:',
              len(con.queries) - query_count)
        query_count = len(con.queries)

    if verbose:
        print('Copying documents and creating document entries.')
    # Idempotence guard: do not recreate documents on a re-run.
    if not dataset_db.documents.exists():
        create_documents(database_id, dataset_db, dataset, meta_types_db,
                         verbose)

    if DATABASE_OPTIMIZE_DEBUG:
        print('Documents and metadata query count:',
              len(con.queries) - query_count)

    # Mark the dataset visible only after its documents exist.
    dataset_db.visible = True
    dataset_db.save()

    if verbose:
        print('Running dataset metrics.')
    run_metrics(database_id, dataset_db.name, None, BASIC_DATASET_METRICS)

    if verbose:
        print('Done importing ' + dataset.name + '.')
    return dataset.name
def run_analysis(database_id, dataset_name, analysis, directories,
                 topic_namers=None, verbose=False):
    """Give the analysis the text for the documents allowing the bulk of the
    work to be done by the analysis. Import the tokens, topic token
    relationships, topics, etc., into the database.

    Positional Arguments:
    database_id -- the dict key specifying the database in django
    dataset_name -- the name that uniquely identifies which dataset this
                    analysis will be run on
    analysis -- an AbstractAnalysis object
    directories -- dict returned from get_common_working_directories

    Keyword Arguments:
    topic_namers -- a list of AbstractTopicNamers that take an Analysis
                    Django database object and create topic names according
                    to a naming scheme
    verbose -- if True notifications of progress will be output to the
               console

    Return the unique analysis name for the given dataset.
    """
    if verbose:
        print('Running analysis:', analysis.name)
    document_iterator = Document.objects.using(database_id).filter(
        dataset__name=dataset_name).order_by('index')
    # BUG FIX: QuerySets are immutable; prefetch_related returns a NEW
    # queryset, so the result must be reassigned or the prefetch is a no-op.
    document_iterator = document_iterator.prefetch_related(
        'dataset', 'metadata')
    analysis.run_analysis(document_iterator)

    dataset_db = Dataset.objects.using(database_id).get(name=dataset_name)
    # word types should be relatively sparse, so we load all of them into
    # memory
    word_types_db = get_all_word_types(database_id)
    meta_types_db = get_all_metadata_types(database_id)

    if verbose:
        print('Creating analysis entry.')
    analysis_db = create_analysis(database_id, dataset_db, analysis,
                                  meta_types_db)

    if verbose:
        print('Creating word type entries.')
    create_word_type_entries(database_id, analysis.get_vocab_iterator(),
                             word_types_db)

    # Idempotence guard: skip token creation on re-runs.
    if not analysis_db.tokens.exists():
        if verbose:
            print('Creating token entries.')
        create_tokens(database_id, analysis_db, word_types_db,
                      analysis.get_token_iterator(), verbose=verbose)

    if verbose:
        print('Adjusting topic heirarchy.')
    create_topic_heirarchy(database_id, analysis_db,
                           analysis.get_hierarchy_iterator())

    if not analysis_db.stopwords.exists():
        if verbose:
            print('Creating stopword entries.')
        create_stopwords(database_id, analysis_db, word_types_db,
                         analysis.stopwords)

    if not analysis_db.excluded_words.exists():
        if verbose:
            print('Creating excluded word entries.')
        create_excluded_words(database_id, analysis_db, word_types_db,
                              analysis.excluded_words)

    if verbose:
        print('Naming topics.')
    if DATABASE_OPTIMIZE_DEBUG:
        con = connections[database_id]
        query_count = len(con.queries)
    # PEP 8: identity comparison with None, not equality.
    if topic_namers is None:
        topic_namers = DEFAULT_TOPIC_NAMERS
    create_topic_names(database_id, analysis_db, topic_namers,
                       verbose=verbose)
    if DATABASE_OPTIMIZE_DEBUG:
        total_queries = len(con.queries) - query_count
        print("Namers used %d queries." % (total_queries,))
        # Dump the offending queries when the namers are query-heavy.
        if total_queries > 10:
            for query in con.queries[query_count:]:
                print(query['time'])
                print(query['sql'])

    if verbose:
        print('Running metrics.')
    run_metrics(database_id, dataset_db.name, analysis_db.name,
                BASIC_ANALYSIS_METRICS)
    return analysis.name
def run_analysis(database_id, dataset_name, analysis, directories,
                 topic_namers=None, verbose=False):
    """Give the analysis the text for the documents allowing the bulk of the
    work to be done by the analysis. Import the tokens, topic token
    relationships, topics, etc., into the database.

    Positional Arguments:
    database_id -- the dict key specifying the database in django
    dataset_name -- the name that uniquely identifies which dataset this
                    analysis will be run on
    analysis -- an AbstractAnalysis object
    directories -- dict returned from get_common_working_directories

    Keyword Arguments:
    topic_namers -- a list of AbstractTopicNamers that take an Analysis
                    Django database object and create topic names according
                    to a naming scheme
    verbose -- if True notifications of progress will be output to the
               console

    Return the unique analysis name for the given dataset.
    """
    if verbose:
        print('Running analysis:', analysis.name)
    document_iterator = Document.objects.using(database_id).filter(
        dataset__name=dataset_name).order_by('index')
    # BUG FIX: QuerySets are immutable; prefetch_related returns a NEW
    # queryset, so the result must be reassigned or the prefetch is a no-op.
    document_iterator = document_iterator.prefetch_related(
        'dataset', 'metadata')
    analysis.run_analysis(document_iterator)

    dataset_db = Dataset.objects.using(database_id).get(name=dataset_name)
    # word types should be relatively sparse, so we load all of them into
    # memory
    word_types_db = get_all_word_types(database_id)
    meta_types_db = get_all_metadata_types(database_id)

    if verbose:
        print('Creating analysis entry.')
    analysis_db = create_analysis(database_id, dataset_db, analysis,
                                  meta_types_db)

    if verbose:
        print('Creating word type entries.')
    create_word_type_entries(database_id, analysis.get_vocab_iterator(),
                             word_types_db)

    # Idempotence guard: skip token creation on re-runs.
    if not analysis_db.tokens.exists():
        if verbose:
            print('Creating token entries.')
        create_tokens(database_id, analysis_db, word_types_db,
                      analysis.get_token_iterator(), verbose=verbose)

    if verbose:
        print('Adjusting topic heirarchy.')
    create_topic_heirarchy(database_id, analysis_db,
                           analysis.get_hierarchy_iterator())

    if not analysis_db.stopwords.exists():
        if verbose:
            print('Creating stopword entries.')
        create_stopwords(database_id, analysis_db, word_types_db,
                         analysis.stopwords)

    if not analysis_db.excluded_words.exists():
        if verbose:
            print('Creating excluded word entries.')
        create_excluded_words(database_id, analysis_db, word_types_db,
                              analysis.excluded_words)

    if verbose:
        print('Naming topics.')
    if DATABASE_OPTIMIZE_DEBUG:
        con = connections[database_id]
        query_count = len(con.queries)
    # PEP 8: identity comparison with None, not equality.
    if topic_namers is None:
        topic_namers = DEFAULT_TOPIC_NAMERS
    create_topic_names(database_id, analysis_db, topic_namers,
                       verbose=verbose)
    if DATABASE_OPTIMIZE_DEBUG:
        total_queries = len(con.queries) - query_count
        print("Namers used %d queries." % (total_queries,))
        # Dump the offending queries when the namers are query-heavy.
        if total_queries > 10:
            for query in con.queries[query_count:]:
                print(query['time'])
                print(query['sql'])

    if verbose:
        print('Running metrics.')
    run_metrics(database_id, dataset_db.name, analysis_db.name,
                BASIC_ANALYSIS_METRICS)
    return analysis.name