def load_sources_from_file(filepath='manage/sources.txt'):
    """
    Load feeds from a text file.
    Each line should be the url of a source you want to add.
    """
    logger.info('Loading sources from file. This may take a while...')
    with open(filepath, 'r') as f:
        # Strip trailing newlines so each entry is a clean url.
        add_sources([line.strip() for line in f])
def evaluate():
    if os.environ.get('FLASK_ENV') == 'TESTING':
        logger.info('Preparing evaluation database...')
        db.create_all()

        evaluate_clustering()

        logger.info('Cleaning up evaluation database...')
        db.session.remove()
        db.drop_all()
    else:
        logger.error('This function must be run with FLASK_ENV=TESTING.')
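# A minimal invocation sketch (assumption: this is exposed as a manage command named
# `evaluate`; the exact command wiring is illustrative, only the FLASK_ENV requirement
# comes from the function above):
#
#     FLASK_ENV=TESTING python manage.py evaluate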
def download(force=False):
    """
    Downloads and extracts the desired DBpedia datasets.

    They are extracted to the app's `DATASETS_PATH` value.
    """
    # Get the desired dataset urls.
    dataset_urls = [dataset_url for dataset_url in get_dataset_urls()
                    if any(setname in dataset_url for setname in DESIRED_DATASETS)]

    for dataset_url in dataset_urls:
        # dc = decompressed
        dc_filepath = os.path.join(DATASETS_PATH,
                                   os.path.basename(dataset_url)[:-4])  # remove '.bz2'

        if os.path.exists(dc_filepath) and not force:
            logger.warn('File exists, not re-downloading and extracting. You can force by passing `force=True`.')
            continue

        # Download the dataset.
        logger.info('Downloading knowledge dataset from {0}'.format(dataset_url))
        filepath = gullet.download(dataset_url, '/tmp/')
        logger.info('Downloaded to {0}'.format(filepath))

        # Decompress the files.
        logger.info('Extracting to {0}'.format(dc_filepath))
        with open(dc_filepath, 'wb+') as dc_file, bz2.BZ2File(filepath, 'rb') as file:
            for data in iter(lambda: file.read(100 * 1024), b''):
                dc_file.write(data)

        # Clean up.
        os.remove(filepath)

    logger.info('Downloading and extraction complete.')
def collect():
    """
    Fetch articles from the sources,
    and save (or update) to db.
    """
    results = []

    logger.info('Fetching articles...')

    # Fetch entries for each source.
    for source in Source.query.all():
        try:
            logger.info('Fetching from {0}...'.format(source.ext_url))
            articles = feed.articles(source)

            # Only save articles we don't already have a copy of.
            for article in articles:
                if not Article.query.filter_by(ext_url=article.url).count():
                    db.session.add(article)
                    results.append(article)

        except feed.SAXException as e:
            # Error with the feed, make a note.
            logger.info('Error fetching from {0}.'.format(source.ext_url))
            source.errors += 1

    logger.info('Finished fetching articles.')

    db.session.commit()
    return results
def active():
    """
    Get info about currently executing tasks.
    """
    try:
        active_tasks = celery.control.inspect().active()
        if not active_tasks:
            logger.info('No active tasks.')
            return False
    except IOError as e:
        logger.error('Error connecting to MQ. Check that it is running.')
        return False

    # `active()` returns a dict of worker name => list of tasks,
    # so count the tasks across all workers.
    num_tasks = sum(len(tasks) for tasks in active_tasks.values())
    logger.info('There are {0} executing tasks.'.format(num_tasks))
    return active_tasks
def _iterate_pages(self):
    """
    Parses out and yields pages from the dump.
    Only yields pages that are in
    namespace=0 (i.e. articles).
    """
    for elem in self.iterate('page'):
        # Check the namespace;
        # only namespace 0 pages are articles.
        # https://en.wikipedia.org/wiki/Wikipedia:Namespace
        ns = int(self._find(elem, 'ns').text)
        if ns == 0:
            self.num_docs += 1
            yield elem

    logger.info('There are {0} docs in this dump.'.format(self.num_docs))
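# For context, a minimal sketch of the <page> element shape this generator filters on,
# as it appears in MediaWiki XML dumps (namespace 0 marks ordinary articles; talk pages,
# templates, etc. use other namespaces):
#
#     <page>
#         <title>Some article</title>
#         <ns>0</ns>
#         <revision>
#             <text>...wikitext...</text>
#         </revision>
#     </page>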
def create_sources(filepath):
    """
    Load feeds from a JSON file.

    It should consist of a dict of source name => list of feeds, like so::

        {
            "The New York Times": [
                "http://www.nytimes.com/services/xml/rss/nyt/World.xml",
                "http://www.nytimes.com/services/xml/rss/nyt/politics.xml"
            ]
        }
    """
    logger.info('Loading sources from file. This may take a while...')
    with open(filepath, 'r') as sources:
        raw_sources = json.load(sources)
    feed.add_sources(raw_sources)
def fetch_dump(self):
    """
    Downloads this instance's Wikipedia dump to replace
    this instance's current file.
    """
    # Default dump files.
    base = 'http://dumps.wikimedia.org/enwiki/latest/'
    pages = 'enwiki-latest-pages-articles.xml.bz2'

    # Build a default url if one is not specified.
    if not self.url:
        self.url = '{0}{1}'.format(base, pages)

    # Download!
    logger.info('Fetching pages dump from {0}'.format(self.url))
    self.download(self.url)
def _generate_tfidf(self, docs):
    """
    Generate the TF-IDF representations
    for all the digested docs.

    Args:
        | docs (list) -- see `_prepare_tfidf`
    """
    logger.info('Page processing complete. Generating TF-IDF representations.')
    doc_ids, corpus_counts = self._prepare_tfidf(docs)

    # Calculate TF-IDF for each of the specified docs.
    for doc_id in doc_ids:
        self._calculate_tfidf(doc_id, corpus_counts)

    logger.info('TF-IDF calculations completed!')
def collect(feed):
    """
    Fetch articles from the specified feed,
    and save to db.
    """
    try:
        logger.info('Fetching from {0}...'.format(feed.ext_url))

        def commit_article(article):
            db.session.add(article)

        get_articles(feed, commit_article)
        db.session.commit()

    except SAXException as e:
        # Error with the feed, make a note.
        logger.info('Error fetching from {0}.'.format(feed.ext_url))
        feed.errors += 1
        db.session.commit()
def digest(force=False):
    """
    Digests downloaded DBpedia `ttl` (Turtle) dumps
    using Apache Jena's `tdbloader2`.

    This digested data can then be interfaced via
    Apache Jena's Fuseki server (see `argos.core.knowledge`).

    Note: `tdbloader2` only runs properly on Unix systems.
    """
    knowledge_path = os.path.join(DATASETS_PATH, 'knodb')

    logger.info('Digesting the datasets to {0}...'.format(knowledge_path))

    if os.path.exists(knowledge_path):
        if not force:
            logger.warn('It looks like a knowledge database already exists, not rebuilding it. You can force by passing `force=True`.')
            return
        logger.warn('Existing knowledge database found. Removing...')
        shutil.rmtree(knowledge_path)

    loader_path = os.path.expanduser(os.path.join(APP['JENA_PATH'], 'bin/tdbloader2'))
    cmd = [loader_path, '--loc', knowledge_path]

    datasets = [os.path.join(DATASETS_PATH, dataset)
                for dataset in os.listdir(DATASETS_PATH)
                if dataset.endswith('.ttl')
                and any(setname in dataset for setname in DESIRED_DATASETS)]

    logger.info('Using the datasets: {0}'.format(' '.join(datasets)))
    cmd += datasets

    subprocess.call(cmd)

    logger.info('Digestion complete.')
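# The subprocess call above amounts to roughly the following shell command
# (a sketch; JENA_PATH, DATASETS_PATH, and the exact `.ttl` filenames depend on
# configuration and which DESIRED_DATASETS were downloaded):
#
#     $JENA_PATH/bin/tdbloader2 --loc $DATASETS_PATH/knodb $DATASETS_PATH/<dataset>.ttl ...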
def workers():
    """
    Get info about currently available Celery workers.
    If none are available, or there are issues connecting
    to the MQ, returns False.

    Returns:
        | dict -- dict of available workers.
        OR
        | bool -- False if no available workers, or cannot connect to MQ.
    """
    try:
        # Get info on available workers.
        workers = celery.control.inspect().stats()
        if not workers:
            logger.error('No Celery workers available.')
            return False
    except IOError as e:
        logger.error('Error connecting to MQ. Check that it is running.')
        return False

    logger.info('There are {0} workers available.'.format(len(workers)))
    return workers
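# A minimal usage sketch (assumption: the `queue_collection` task name is purely
# illustrative). Since both `workers()` and `active()` return False on failure,
# callers can guard task dispatch with a simple truthiness check:
#
#     if workers():
#         queue_collection.delay()
#     else:
#         logger.error('Not queueing: no Celery workers are available.')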
def digest(self):
    """
    Will process this instance's dump.
    """
    # Check if the specified file exists.
    if not exists(self.file):
        logger.info('Specified file {0} not found, fetching...'.format(self.file))
        self.fetch_dump()

    logger.info('Beginning digestion of pages.')

    # Process pages and collect their text content ("docs").
    docs = [self._process_page(elem) for elem in self._iterate_pages()]

    logger.info('Vectorizing the page documents...')

    # Vectorize the docs.
    doc_vecs = brain.vectorize(docs)

    # Pickle the docs to save to Mongo.
    #_doc_vecs = self.db().pickle(doc_vecs)
    #processed_name = self.url if self.url else self.file
    #self.db().add({'dump': processed_name, 'docs': _doc_vecs})
    #self.db().close()

    # Generate TF-IDF representation
    # of all docs upon completion.
    #self._generate_tfidf(docs)

    logger.info('Digestion complete!')

    if not self.silent:
        processed_name = self.url if self.url else self.file
        notify('TF-IDF calculations complete for {0}!'.format(processed_name))
def evaluate_clustering():
    """
    Evaluate the clustering algorithm.
    """
    logger.info('Constructing expected clusters and articles...')
    expected_clusters = {}
    articles = []
    all_files = []

    # Collect all appropriate files.
    for dir, subdir, files in os.walk('manage/evaluate/organized_articles'):
        for file in files:
            filepath = os.path.join(dir, file)
            name, ext = os.path.splitext(filepath)
            if ext == '.txt':
                all_files.append((dir, name, filepath))

    # Create articles for appropriate files.
    for dir, name, filepath in all_files:
        category = dir.split('/')[-1]
        with open(filepath, 'r') as f:
            article = Article(text=f.read(), title=name.split('/')[-1])
        expected_clusters.setdefault(category, []).append(article)
        articles.append(article)
        progress_bar(len(articles) / len(all_files) * 100)

    print('\n')

    logger.info('Will cluster {0} articles.'.format(len(articles)))
    logger.info('Expecting {0} clusters.'.format(len(expected_clusters.keys())))

    logger.info('Clustering...')
    p = cProfile.Profile()
    clusters = p.runcall(Event.cluster, articles, threshold=0.04, debug=True)
    logger.info('Created {0} clusters.'.format(len(clusters)))

    logger.info('Cluster composition is as follows...')
    for c in clusters:
        logger.info([m.title for m in c.members])

    logger.info('Profiling statistics from the clustering...')
    ps = pstats.Stats(p)
    ps.sort_stats('time').print_stats(10)
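# The evaluation expects a directory layout like the sketch below (the directory name
# becomes the expected cluster/category, with one plain-text article per `.txt` file);
# the specific category and file names here are illustrative only:
#
#     manage/evaluate/organized_articles/
#         some_event/
#             article_one.txt
#             article_two.txt
#         another_event/
#             article_three.txt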
def download(url, save_path, filename=None, progress=False):
    """
    Downloads a file from the specified URL.
    Will resume an existing download if the target
    server supports it (responds with the "Accept-Ranges" header).

    Args:
        | url (str)       -- url of the file to download
        | save_path (str) -- path to the directory to save the file
        | filename (str)  -- filename to save as (defaults to the name in the url)
        | progress (bool) -- output progress bar to stdout
    """
    # Strip trailing slash, if there is one.
    save_path = save_path.rstrip('\/')

    if filename is None:
        filename = url.split('/').pop()
    file = '{0}/{1}'.format(save_path, filename)

    existing_size = 0

    # If the file already exists
    # and there is not a newer file on the server...
    if os.path.exists(file) and not _expired(url, file):
        # Append to the existing file.
        outfile = open(file, 'ab')

        # Figure out how many bytes we've got.
        existing_size = outfile.tell()

        # Set up a request for only the remaining bytes.
        headers = {'Range': 'bytes={0}-'.format(existing_size)}
        req = request.Request(url, headers=headers)

    # Otherwise, create a new/overwrite existing file.
    else:
        # Create/overwrite the file.
        outfile = open(file, 'wb')
        outfile.seek(0)

        # Vanilla request.
        req = request.Request(url)

    try:
        # Get the response.
        resp = request.urlopen(req)

        # Get the total size of the content.
        total_size = float(resp.headers['Content-Length'].strip())

        # Check if the file has already been fully downloaded.
        if total_size == existing_size:
            logger.info('File already downloaded.')
            return file

        # Check that the server accepts ranges.
        # If it does not, the server will ignore the Range header,
        # and we have to start all over again.
        if existing_size > 0 and not resp.headers.get('Accept-Ranges', None):
            logger.info('Server does not allow resuming of downloads.')
            logger.info('Starting from the beginning! :D')
            outfile.close()
            outfile = open(file, 'wb')
            outfile.seek(0)
            existing_size = 0

        if progress:
            progress_bar((existing_size / total_size) * 100)

        # Pull out the chunks!
        for chunk in iter(lambda: resp.read(CHUNK), b''):
            # Write the chunk to the file.
            outfile.write(chunk)

            # Update the existing size.
            existing_size += len(chunk)
            percent_complete = (existing_size / total_size) * 100

            # Show progress.
            if progress:
                progress_bar(percent_complete)

        if progress:
            sys.stdout.write('\n')

        outfile.close()

        # Return the download's filepath.
        return file

    except request.HTTPError as e:
        logger.error('HTTP Error: {0} {1}'.format(e.code, url))
    except request.URLError as e:
        logger.error('URL Error: {0} {1}'.format(e.reason, url))
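# A minimal usage sketch for the downloader (the example url is the Wikipedia pages
# dump referenced elsewhere in this code; `CHUNK`, `_expired`, and `progress_bar` are
# module-level helpers assumed to be defined alongside this function):
#
#     path = download('http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
#                     '/tmp/', progress=True)
#     if path:
#         print('Saved to {0}'.format(path))
#
# Re-running the same call resumes a partial download when the server honors the
# Range header (advertised via "Accept-Ranges"); otherwise it starts over from byte 0.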