Example #1
# Imports assumed from pyedgar's module layout (not shown in the snippet).
import os
import datetime as dt
import logging

from pyedgar.utilities import edgarcache, indices

_logger = logging.getLogger(__name__)


def main(start_date=None,
         get_indices=True,
         get_feeds=True,
         extract_feeds=True):
    cacher = edgarcache.EDGARCacher()

    if start_date is None:
        start_date = dt.date.fromordinal(dt.date.today().toordinal() - 30)

    if get_feeds:
        print("Downloading and extracting since {:%Y-%m-%d}...".format(
            start_date))
        if extract_feeds:
            cacher.extract_daily_feeds(start_date)
        else:
            for i_date, feed_path in cacher.download_many_feeds(start_date):
                if not feed_path:
                    # This day doesn't exist on EDGAR.
                    # Not sure why servers can't work on weekends.
                    continue

                if not os.path.exists(feed_path):
                    _logger.error("Failed to download %r file to %r.", i_date,
                                  feed_path)
                    continue
                print("Done downloading {}".format(feed_path))

        print(" Done!")

    if get_indices:
        print("Downloading and extracting indices")
        index_maker = indices.IndexMaker()
        index_maker.extract_indexes()
        print("Done")
Example #2
    def __init__(self, use_tqdm=True):
        """
        Initialize the index making object.

        use_tqdm (bool): whether to wrap downloads in tqdm for progress monitoring. Default: True
        """

        self._downloader = edgarcache.EDGARCacher(use_tqdm=use_tqdm)

        self._get_filing_path = localstore.get_filing_path
        self._get_feed_cache_path = config.get_feed_cache_path
        self._get_index_cache_path = config.get_index_cache_path

        self._tq = _tqdm if use_tqdm else _faketqdm
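
The class owning this `__init__` is not shown in the snippet; assuming it is the `indices.IndexMaker` used in the other examples, construction would look roughly like:

# Sketch, assuming this __init__ belongs to indices.IndexMaker.
from pyedgar.utilities import indices  # module path assumed

index_maker = indices.IndexMaker(use_tqdm=False)  # disable progress bars, e.g. in cron jobs
index_maker.extract_indexes()
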
Example #3
# Imports assumed from pyedgar's module layout (not shown in the snippet).
import re
import datetime as dt
import logging

from pyedgar import config
from pyedgar.utilities import edgarcache, indices

_logger = logging.getLogger(__name__)


def main(start_date=None,
         last_n_days=None,
         get_indices=True,
         download_feeds=True,
         extract_feeds=True):
    """
    Download feeds and indices.

    Examples:
        This will download/extract the last 30 days of forms and all indices:

            ```python -m pyedgar.downloader -i -d -e --last-n-days 30```

        This will only extract the last 30 days of already-downloaded forms, ignoring indices:

            ```python -m pyedgar.downloader -e --last-n-days 30```

    Args:
        start_date (date): Date to start extracting feeds from. When empty, defaults to today() - last_n_days
        last_n_days (int): If start_date is missing, extract this number of days before today. Default: 30
        get_indices (bool): Flag to download and extract index files. Default: True
        download_feeds (bool): Flag to download daily feed files since `start_date` or for `last_n_days`. Default: True
        extract_feeds (bool): Flag to extract daily feed files since `start_date` or for `last_n_days`. Default: True
    """
    rgx = re.compile(config.KEEP_REGEX, re.I) if not config.KEEP_ALL else None
    _logger.info("From Config: keep regex: %r", rgx)
    cacher = edgarcache.EDGARCacher(
        keep_form_type_regex=rgx,
        check_cik='cik' in config.FILING_PATH_FORMAT)

    if start_date is None:
        start_date = dt.date.fromordinal(dt.date.today().toordinal() -
                                         (last_n_days or 30))

    if download_feeds:
        _logger.info("Downloading since {:%Y-%m-%d}...".format(start_date))
        for _ in cacher.download_many_feeds(start_date):
            pass

    if extract_feeds:
        _logger.info("Extracting since {:%Y-%m-%d}...".format(start_date))
        for _ in cacher.extract_daily_feeds(start_date, download_first=False):
            pass

    if get_indices:
        _logger.info("Downloading and extracting indices")
        index_maker = indices.IndexMaker()
        index_maker.extract_indexes()

    _logger.info("Done")
Example #4
    def __init__(self, use_tqdm=True):
        """
        Initialize the index making object.

        use_tqdm (bool): whether to wrap downloads in tqdm for progress monitoring. Default: True
        """
        # Use the following to default to 10-K/Qs, 20-Fs, 8-Ks, 13F/D/Gs, and DEF 14As:
        # if keep_form_type_regex is None:
        #     keep_form_type_regex = re.compile(r'10-[KQ]|10[KQ]SB|20-F|8-K|13[FDG]|(?:14A$)')

        self._downloader = edgarcache.EDGARCacher(use_tqdm=use_tqdm)

        self._get_filing_path = localstore.get_filing_path
        self._get_feed_cache_path = config.get_feed_cache_path
        self._get_index_cache_path = config.get_index_cache_path

        self._tq = _tqdm if use_tqdm else _faketqdm
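
To see which form types the commented-out default regex would keep, a quick standalone check (illustrative only):

# Sketch: test the suggested default keep_form_type_regex against a few form types.
import re

keep = re.compile(r'10-[KQ]|10[KQ]SB|20-F|8-K|13[FDG]|(?:14A$)')
for form in ("10-K", "10-Q", "8-K", "DEF 14A", "S-1"):
    print(form, "->", "keep" if keep.search(form) else "drop")
# 10-K, 10-Q, 8-K, and DEF 14A match; S-1 does not.
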
Example #5
# Imports assumed from pyedgar's module layout (not shown in the snippet).
import os
import re
import datetime as dt
import logging

from tqdm import tqdm

from pyedgar import config, utilities
from pyedgar.utilities import edgarcache, edgarweb, indices

_logger = logging.getLogger(__name__)


def main(start_date=None,
         last_n_days=30,
         get_indices=False,
         get_feeds=False,
         use_curl_to_download=None):
    """
    Download feeds and indices. Feeds will be downloaded for `start_date` through yesterday,
    or for the past `last_n_days` days.

    Examples:
        This will download/extract the last 30 days of forms and all indices:

            ```python -m pyedgar.downloader -i -d --last-n-days 30```

        This will download and extract the last 7 days of forms:

            ```python -m pyedgar.downloader -d --last-n-days 7```

    Args:
        start_date (date): Date to start extracting feeds from. When empty, defaults to today() - last_n_days
        last_n_days (int): If start_date is missing, extract this number of days before today. Default: 30
        get_indices (bool): Flag to download and extract index files. Default: False
        get_feeds (bool): Flag to download daily feed files since `start_date` or for `last_n_days`. Default: False
        use_curl_to_download (bool, None): Flag to use cURL subprocess instead of `requests` library. If None,
            will check for and use cURL if it exists. Default: None
    """
    if use_curl_to_download is None:
        use_curl_to_download = edgarweb.has_curl()

    if start_date is None:
        start_date = dt.date.fromordinal(dt.date.today().toordinal() -
                                         last_n_days)
    else:
        start_date = utilities.parse_date_input(start_date)

    if get_feeds:
        cacher = edgarcache.EDGARCacher(
            keep_form_type_regex=re.compile(config.KEEP_REGEX, re.I)
            if not config.KEEP_ALL else None,
            check_cik="cik" in config.FILING_PATH_FORMAT,
            use_requests=not use_curl_to_download,
        )
        _logger.info("Downloading since {:%Y-%m-%d}...".format(start_date))

        num_dates = sum(1 for _ in utilities.iterate_dates(start_date))
        for i_date in tqdm(utilities.iterate_dates(start_date),
                           total=num_dates,
                           desc="Downloading Feeds"):
            # download one date, so we can track progress with TQDM
            cacher.extract_daily_feeds(i_date,
                                       to_date=i_date,
                                       download_first=True,
                                       overwrite=False)

        _logger.info("Done downloading feeds on {:%Y-%m-%d}...".format(i_date))

    if get_indices:
        _logger.info("Downloading and extracting indices")
        # The most recent cached index file is likely incomplete, since it may have
        # been downloaded mid-quarter. Remove it so it is re-downloaded in full.
        max_date, last_index = dt.date(1995, 1, 1), None
        for i_date in utilities.iterate_dates(1995, period="quarterly"):
            _idx = config.get_index_cache_path(i_date)
            if os.path.exists(_idx) and i_date > max_date:
                max_date, last_index = i_date, _idx
        if last_index is not None:
            _logger.info("Removing last of the old index caches: %s",
                         last_index)
            os.remove(last_index)

        index_maker = indices.IndexMaker(use_tqdm=True,
                                         use_requests=not use_curl_to_download)
        index_maker.extract_indexes()
        _logger.info("Done downloading and extracting indices")