def main(start_date=None, get_indices=True, get_feeds=True, extract_feeds=True):
    cacher = edgarcache.EDGARCacher()

    if start_date is None:
        start_date = dt.date.fromordinal(dt.date.today().toordinal() - 30)

    if get_feeds:
        print("Downloading and extracting since {:%Y-%m-%d}...".format(start_date))
        if extract_feeds:
            cacher.extract_daily_feeds(start_date)
        else:
            for i_date, feed_path in cacher.download_many_feeds(start_date):
                if not feed_path:
                    # This day doesn't exist on EDGAR.
                    # Not sure why servers can't work on weekends.
                    continue
                if not os.path.exists(feed_path):
                    _logger.error("Failed to download %r file to %r.", i_date, feed_path)
                    continue
                print("Done downloading {}".format(feed_path))
        print(" Done!")

    if get_indices:
        print("Downloading and extracting indices")
        index_maker = indices.IndexMaker()
        index_maker.extract_indexes()
        print("Done")
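
# Side note: the `fromordinal(...toordinal() - 30)` arithmetic above is just
# "today minus 30 days". A minimal, stdlib-only sketch of the equivalence
# (self-contained; nothing here is pyedgar-specific):
import datetime as dt

thirty_days_ago = dt.date.today() - dt.timedelta(days=30)
assert thirty_days_ago == dt.date.fromordinal(dt.date.today().toordinal() - 30)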
def __init__(self, use_tqdm=True):
    """
    Initialize the index making object.

    use_tqdm: flag for whether or not to wrap downloads in tqdm for progress monitoring
    """
    self._downloader = edgarcache.EDGARCacher(use_tqdm=use_tqdm)
    self._get_filing_path = localstore.get_filing_path
    self._get_feed_cache_path = config.get_feed_cache_path
    self._get_index_cache_path = config.get_index_cache_path
    self._tq = _tqdm if use_tqdm else _faketqdm
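
# `_faketqdm` above is pyedgar's stand-in for tqdm when progress bars are
# disabled. Its definition isn't shown here; a minimal sketch of such a
# fallback (an assumption, not necessarily pyedgar's exact code) is an
# identity pass-through that accepts tqdm-style arguments:
def _faketqdm(iterable, *args, **kwargs):
    """Mimic tqdm's call signature, but add no progress bar."""
    return iterable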
def main(start_date=None, last_n_days=None, get_indices=True, download_feeds=True, extract_feeds=True):
    """
    Download feeds and indices.

    Examples:
        Download/extract the last 30 days of forms and all indices:
        ```python -m pyedgar.downloader -i -d -e --last-n-days 30```

        Extract only the already-downloaded last 30 days of forms, ignoring indices:
        ```python -m pyedgar.downloader -e --last-n-days 30```

    Args:
        start_date (datetime): Date to start extraction of feeds from.
            When empty, defaults to today() - last_n_days.
        last_n_days (int): If start_date is missing, extract this number of days before today. Default: 30
        get_indices (bool): Flag to download and extract index files. Default: True
        download_feeds (bool): Flag to download daily feed files since `start_date` or for `last_n_days`. Default: True
        extract_feeds (bool): Flag to extract daily feed files since `start_date` or for `last_n_days`. Default: True
    """
    rgx = re.compile(config.KEEP_REGEX, re.I) if not config.KEEP_ALL else None
    _logger.info("From Config: keep regex: %r", rgx)

    cacher = edgarcache.EDGARCacher(keep_form_type_regex=rgx,
                                    check_cik='cik' in config.FILING_PATH_FORMAT)

    if start_date is None:
        start_date = dt.date.fromordinal(dt.date.today().toordinal() - (last_n_days or 30))

    if download_feeds:
        _logger.info("Downloading since {:%Y-%m-%d}...".format(start_date))
        for _ in cacher.download_many_feeds(start_date):
            pass

    if extract_feeds:
        _logger.info("Extracting since {:%Y-%m-%d}...".format(start_date))
        for _ in cacher.extract_daily_feeds(start_date, download_first=False):
            pass

    if get_indices:
        _logger.info("Downloading and extracting indices")
        index_maker = indices.IndexMaker()
        index_maker.extract_indexes()

    _logger.info("Done")
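
# How the keep-regex filter behaves: the cacher keeps a filing when `rgx`
# matches its form type. An illustrative sketch using the sample pattern from
# the commented-out default below (the real value comes from config.KEEP_REGEX,
# and search-style matching is assumed here):
import re

sample_keep = re.compile(r'10-[KQ]|10[KQ]SB|20-F|8-K|13[FDG]|(?:14A$)', re.I)
for form in ('10-K', '8-K', 'DEF 14A', 'S-1'):
    print(form, '->', bool(sample_keep.search(form)))
# 10-K, 8-K, and DEF 14A are kept; S-1 is not.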
def __init__(self, use_tqdm=True):
    """
    Initialize the index making object.

    use_tqdm: flag for whether or not to wrap downloads in tqdm for progress monitoring
    """
    # Use the following to default to 10s, 20s, 8s, 13s, and Def 14As:
    # if keep_form_type_regex is None:
    #     keep_form_type_regex = re.compile(r'10-[KQ]|10[KQ]SB|20-F|8-K|13[FDG]|(?:14A$)')
    self._downloader = edgarcache.EDGARCacher(use_tqdm=use_tqdm)
    self._get_filing_path = localstore.get_filing_path
    self._get_feed_cache_path = config.get_feed_cache_path
    self._get_index_cache_path = config.get_index_cache_path
    self._tq = _tqdm if use_tqdm else _faketqdm
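
# Typical usage, matching how main() drives this class: build the index maker
# (assuming the class lives in pyedgar.indices, as the `indices.IndexMaker()`
# calls above suggest), then extract all EDGAR indices, here without progress bars:
from pyedgar import indices

index_maker = indices.IndexMaker(use_tqdm=False)
index_maker.extract_indexes()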
def main(start_date=None, last_n_days=30, get_indices=False, get_feeds=False, use_curl_to_download=None):
    """
    Download feeds and indices.

    Feeds will be downloaded for `start_date` through yesterday, or for the past `last_n_days` days.

    Examples:
        Download/extract the last 30 days of forms and all indices:
        ```python -m pyedgar.downloader -i -d --last-n-days 30```

        Download and extract the last 7 days of forms:
        ```python -m pyedgar.downloader -d --last-n-days 7```

    Args:
        start_date (date): Date to start extraction of feeds from.
            When empty, defaults to today() - last_n_days.
        last_n_days (int): If start_date is missing, extract this number of days before today. Default: 30
        get_indices (bool): Flag to download and extract index files. Default: False
        get_feeds (bool): Flag to download daily feed files since `start_date` or for `last_n_days`. Default: False
        use_curl_to_download (bool, None): Flag to use a cURL subprocess instead of the `requests` library.
            If None, check for cURL and use it if found. Default: None
    """
    if use_curl_to_download is None:
        use_curl_to_download = edgarweb.has_curl()

    if start_date is None:
        start_date = dt.date.fromordinal(dt.date.today().toordinal() - last_n_days)
    else:
        start_date = utilities.parse_date_input(start_date)

    if get_feeds:
        cacher = edgarcache.EDGARCacher(
            keep_form_type_regex=re.compile(config.KEEP_REGEX, re.I) if not config.KEEP_ALL else None,
            check_cik="cik" in config.FILING_PATH_FORMAT,
            use_requests=not use_curl_to_download,
        )
        _logger.info("Downloading since {:%Y-%m-%d}...".format(start_date))

        num_dates = sum(1 for _ in utilities.iterate_dates(start_date))
        for i_date in tqdm(utilities.iterate_dates(start_date), total=num_dates, desc="Downloading Feeds"):
            # Download/extract one date at a time so tqdm can track progress.
            cacher.extract_daily_feeds(i_date, to_date=i_date, download_first=True, overwrite=False)
            _logger.info("Done downloading feeds on {:%Y-%m-%d}...".format(i_date))

    if get_indices:
        _logger.info("Downloading and extracting indices")

        # The most recent cached index file is probably incomplete, since it may
        # have been downloaded mid-quarter. Remove it so it is re-downloaded in full.
        max_date, last_index = dt.date(1995, 1, 1), None
        for i_date in utilities.iterate_dates(1995, period="quarterly"):
            _idx = config.get_index_cache_path(i_date)
            if os.path.exists(_idx) and i_date > max_date:
                max_date, last_index = i_date, _idx

        if last_index is not None:
            _logger.info("Removing last of the old index caches: %s", last_index)
            os.remove(last_index)

        index_maker = indices.IndexMaker(use_tqdm=True, use_requests=not use_curl_to_download)
        index_maker.extract_indexes()

    _logger.info("Done downloading and extracting indices")
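
# Hypothetical direct invocations mirroring the docstring's CLI examples
# (these flags are normally set by the module's command-line interface):
main(last_n_days=7, get_feeds=True)                      # feeds only, last week
main(last_n_days=30, get_feeds=True, get_indices=True)   # feeds plus all indices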