def get_clean_urls(table_name): raw_data = list(mongo_driver.get_all(table_name)) urls = list(filter(lambda item: 'url' in item, raw_data)) def clean_link(data): link = data['url'].lower().replace('http://', '').replace('https://', '').replace( 'www.', '').replace((' '), '') if link.endswith('/'): return link[:-1], data else: return link, data return dict(list(map(lambda item: clean_link(item), urls)))
# NOTE(review): this is a byte-for-byte duplicate definition of
# get_clean_urls above; being later in the module, it is the one that
# survives import. Consider deleting one of the two copies.
def get_clean_urls(table_name):
    """Map normalized URL -> source document for every record in *table_name*."""
    documents = [doc for doc in mongo_driver.get_all(table_name) if 'url' in doc]

    def clean_link(doc):
        # Lowercase, then strip scheme / 'www.' / spaces anywhere in the string.
        url = doc['url'].lower()
        for fragment in ('http://', 'https://', 'www.', ' '):
            url = url.replace(fragment, '')
        # Drop at most one trailing slash.
        if url.endswith('/'):
            url = url[:-1]
        return url, doc

    return dict(clean_link(doc) for doc in documents)
def flags_articles_gen():
    """Yield every document from the 'articles' collection.

    The original wrapped the cursor in enumerate() and discarded the index;
    `yield from` streams the same documents without the dead counter.
    """
    yield from mongo_driver.get_all('articles')
def corpus_gen():
    """Yield documents from 'articles_cleaned' whose 'article' field is truthy.

    Documents with an empty/missing-text 'article' value are skipped. The
    original enumerate() index was never used and has been removed.
    """
    for doc in mongo_driver.get_all('articles_cleaned'):
        if doc['article']:
            yield doc
def threadpool():
    """Drain one batch of sources through a 30-worker pool.

    Reads the module-global `batch` (set by the __main__ loop) and feeds it
    to `go` via imap_unordered, polling each result with a 10s timeout.
    Timeouts and AttributeErrors are logged and polling continues; the pool
    is closed when the batch is exhausted.
    """
    pool = Pool(30)
    results = pool.imap_unordered(go, batch)
    while True:
        try:
            results.next(timeout=10)
        except multiprocessing.context.TimeoutError:
            print('timeout!')
        except AttributeError as e:
            print(e)
        except StopIteration:
            # imap_unordered iterator exhausted -> the whole batch completed.
            print('batch finished.')
            pool.close()
            break
        except EOFError:
            # Worker pipe closed mid-read; keep polling remaining results.
            pass


if __name__ == '__main__':
    news_sources = mongo_driver.get_all('all_sources')
    while True:
        # BUG FIX: islice() never raises StopIteration at this call site — it
        # just returns a (possibly empty) iterator — so the original
        # `except StopIteration` was unreachable and the loop spun forever
        # once `news_sources` was exhausted. Materialize the slice and stop
        # explicitly when it comes back empty.
        batch = list(itertools.islice(news_sources, 90))
        if not batch:
            print('finished.')
            exit()
        threadpool()
# NOTE(review): redefines the earlier corpus_gen; being later in the module,
# this version is the one callers get. Consider deleting one of the copies.
def corpus_gen():
    """Yield documents from 'articles_cleaned' whose 'article' field is truthy.

    The unused enumerate() index from the original has been removed.
    """
    for doc in mongo_driver.get_all('articles_cleaned'):
        # TODO: the original carried a disabled extra filter
        # (`and _['flag'] != 'satire'`) — decide whether to enable or drop it.
        if doc['article']:
            yield doc