def __init__(self, File_save):
    self._config = config()['web_sites']['OSHA']
    self._queries = self._config['queries']
    self._url = self._config['url']
    self.file_save = File_save
    self._now = datetime.datetime.now().strftime('%m_%d_%Y')
    self._dir_path = os.path.dirname(os.path.realpath(__file__))
def __init__(self, Chemicals, File_save):
    self._config = config()['web_sites']['IFA']
    self._queries = self._config['queries']
    self._url = self._config['url']
    self._existing, self.chemicals = checking_existing_chemicals_in_outfile(
        File_save, Chemicals)
    self.file_save = File_save
    self._now = datetime.datetime.now().strftime('%m_%d_%Y')
def __init__(self, investment_site_uid):
    self.site = investment_site_uid
    self._config = config()['investment_sites'][investment_site_uid]
    self._credentials = credentials()['investment_sites'][investment_site_uid]
    self._browser = None
    self._home = "{}".format(self._config['url'])
def __init__(self, Chemicals, File_save):
    self._config = config()['web_sites']['NIST']
    self._queries = self._config['queries']
    self._url = self._config['url']
    self._existing, self.chemicals = checking_existing_chemicals_in_outfile(
        File_save, Chemicals)
    self.file_save = File_save
    self._now = datetime.datetime.now().strftime('%m_%d_%Y')
    self._dir_path = os.path.dirname(os.path.realpath(__file__))
def __init__(self, headless):
    options = Options()
    if headless:
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
    self._browser = webdriver.Chrome(config()['driver']['path'],
                                     chrome_options=options)
    self._browser.implicitly_wait(10)
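# Usage sketch for the headless driver setup above. The wrapper class name
# `Browser` is hypothetical (only its __init__ is shown here), and it assumes
# config()['driver']['path'] points at a local chromedriver binary:
#
#   browser_wrapper = Browser(headless=True)
#   browser_wrapper._browser.get('https://example.com')
#   print(browser_wrapper._browser.title)
#   browser_wrapper._browser.quit()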
def __init__(self, year, Files):
    self.year = year
    self._dir_path = os.path.dirname(os.path.realpath(__file__))  # Working Directory
    self._config = config()['web_sites']['TRI']
    self._queries = self._config['queries']
    self._url = self._config['url']  # Uniform Resource Locator (URL) of TRI Database
    self._TRI_File_Columns_Dictionary = {}  # TRI File Formats
    for File in Files:
        self._TRI_File_Columns_Dictionary[File] = []
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logger.info(f'Beginning scraper for {host}')
    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)
        if article:
            logger.info('Article fetched!!')
            articles.append(article)

    _save_articles(news_site_uid, articles)
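# A minimal sketch of the `_save_articles` helper called above, assuming the
# intent is to dump the fetched articles to a dated CSV; the filename pattern
# and the attribute-based column discovery are illustrative, not confirmed
# project code.
import csv
import datetime


def _save_articles(news_site_uid, articles):
    now = datetime.datetime.now().strftime('%Y_%m_%d')
    out_file_name = f'{news_site_uid}_{now}_articles.csv'
    # Use the first article's public attributes as CSV headers.
    csv_headers = [prop for prop in dir(articles[0]) if not prop.startswith('_')]
    with open(out_file_name, mode='w+', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(csv_headers)
        for article in articles:
            writer.writerow([str(getattr(article, prop)) for prop in csv_headers])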
def __init__(self, Year):
    # Specification can be found at:
    # https://rcrainfopreprod.epa.gov/rcrainfo-help/application/publicHelp/index.htm
    # Date: 3/17/2020
    # List of tables:
    #   BR_REPORTING_2001
    #   BR_REPORTING_2003
    #   BR_REPORTING_2005
    #   BR_REPORTING_2007
    #   BR_REPORTING_2009
    #   BR_REPORTING_2011
    #   BR_REPORTING_2013
    #   BR_REPORTING_2015
    #   BR_REPORTING_2017
    self._dir_path = os.path.dirname(os.path.realpath(__file__))  # Working Directory
    self.Year = Year
    self._config = config()['web_sites']['RCRAInfo']
    self._queries = self._config['queries']
    self._url = self._config['url']  # Uniform Resource Locator (URL) of RCRAInfo Database
import pymongo

from extract.common import credentials
from extract.common import config

host = config()['mongodb']['host']
user = credentials()['mongodb']['user']
password = credentials()['mongodb']['password']


class SaveProjects(object):

    def __init__(self):
        uri = "mongodb+srv://{}:{}@{}/test?retryWrites=true".format(
            user, password, host)
        client = pymongo.MongoClient(uri)
        self._db = client[config()['mongodb']['db']['name']]

    def save(self, projects):
        self._db.project.insert_many(projects)
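# Usage sketch for SaveProjects; the project dictionaries shown are
# illustrative, the real pipeline presumably builds them upstream. Note that
# pymongo's insert_many mutates the passed dicts by adding an `_id` field.
#
#   saver = SaveProjects()
#   saver.save([
#       {'name': 'example-project', 'url': 'https://example.com'},
#   ])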
import logging
import subprocess

import yaml

from extract.common import config

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Extract the list of news site names from the config file
address = 'extract/config.yaml'
news_sites_uids = list(config(address)['news_sites'].keys())


def main():
    _create()
    _extract()
    _transform()
    _load()


# Build the dictionary with the name, url and queries of each news site,
# export it to a .yaml file and move it to the extract folder
def _create():
    logger.info('Starting create process')
    subprocess.run(['python', 'main.py'], cwd='./create')
    subprocess.run(['mv', 'config.yaml', '../extract/config.yaml'], cwd='./create')


# Extract all the information from the news sites, export it by name and move
# it to the transform folder
def _extract():
    global news_sites_uids
    logger.info('Starting extract process for {}'.format(news_sites_uids))
def __init__(self, news_site_uid, url):
    self._config = config()['news_sites'][news_site_uid]
    self._queries = self._config['queries']
    self._html = None
    self._url = url
    self._visit(self._url)
def __init__(self):
    self._config = config()['web_sites']['FRS']
    self._dir_path = os.path.dirname(os.path.realpath(__file__))  # Working Directory
def _fetch_article(news_site_uid, host, link):
    article = None
    try:
        article = news.ArticlePage(news_site_uid, _build_link(host, link))
    except (HTTPError, MaxRetryError, DecodeError, ContentDecodingError,
            TimeoutError, NewConnectionError, ConnectionError):
        logger.warning('Error while fetching the article', exc_info=False)

    if article and not article.body:
        logger.warning('Invalid article. There is no body')
        return None

    return article


def _build_link(host, link):
    if is_well_formed_link.match(link):
        return link
    elif is_root_path.match(link):
        return f'{host}{link}'
    else:
        return f'{host}/{link}'


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    news_site_choice = list(config()['news_sites'].keys())
    parser.add_argument('news_site',
                        help='The news site that you want to scrape',
                        type=str,
                        choices=news_site_choice)

    args = parser.parse_args()
    _news_scraper(args.news_site)
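# The `is_well_formed_link` and `is_root_path` patterns used by _build_link are
# defined elsewhere in the module; a plausible sketch (an assumption, not the
# project's exact regexes) would be:
#
#   import re
#   is_well_formed_link = re.compile(r'^https?://.+/.+$')  # e.g. https://example.com/some/path
#   is_root_path = re.compile(r'^/.+$')                    # e.g. /some/path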