Exemplo n.º 1
0
    def _write_to_single_csv(self, series_financials_dict: dict):
        """Write every collected period row into a single CSV file.

        Rows are emitted period by period (each year/quarter combination of
        ``self.year_range`` x ``self.quarters_names``) and, inside a period,
        in the order returned by ``load_symbol_list``. A row is written only
        when the symbol has an entry for that period and the entry contains
        ``Tags.current_date``.

        :param series_financials_dict: mapping symbol -> period key
            (``'<year><quarter>'``) -> row dict.
        :return: path of the written file (``self.csv_filename``).
        :raises KeyError: if a listed symbol is missing from
            ``series_financials_dict``.
        :raises ValueError: raised by ``csv.DictWriter`` when a row holds a
            key that is not in ``self.ordered_symbols``.
        """
        attr2id = self._get_attr2id(
            series_financials_dict=series_financials_dict)
        symbols = load_symbol_list(self.symbols_list_name)

        # Sanity check: there must be at least as many CSV columns as
        # attributes reported by _get_attr2id. Sorting is not needed just to
        # compare sizes.
        assert len(self.ordered_symbols) >= len(attr2id.keys())

        # newline='' hands line-ending control to the csv module; without it
        # an extra blank line shows up after every row on platforms that
        # translate '\n' on write.
        with open(self.csv_filename, mode='w', newline='') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=self.ordered_symbols)
            writer.writeheader()

            period_keys = ['{}{}'.format(year, quarter)
                           for year, quarter in
                           itertools.product(self.year_range,
                                             self.quarters_names)]
            for period_key in period_keys:
                for symbol in symbols:
                    symbol_dict = series_financials_dict[symbol]
                    # Skip periods that were never enriched with the
                    # reporting date.
                    if period_key in symbol_dict and \
                            Tags.current_date in symbol_dict[period_key]:
                        writer.writerow(symbol_dict[period_key])

        return self.csv_filename
Exemplo n.º 2
0
def get_data(thresholds,
             resample_period='1W',
             symbols_list_name='sp500',
             start_date='2006-01-01',
             target_shift=4):
    """Load prices and fundamentals and build the processed datasets.

    The list of usable symbols is cached in the file
    ``<DATA_PATH>/available_<symbols_list_name>`` (one symbol per line). When
    that file exists it is read; otherwise the list is derived from the index
    of the fundamentals frame, the hard-coded removals are applied and the
    result is written to the cache file.

    :param thresholds: iterable forwarded to ``process_symbols`` and
        ``get_datasets_name``.
    :param resample_period: resampling period forwarded to ``get_prices`` and
        ``get_fundamentals``.
    :param symbols_list_name: name of the symbol list to load.
    :param start_date: first date requested from both data sources.
    :param target_shift: forwarded to ``process_symbols`` and
        ``get_datasets_name``.
    :return: whatever ``post_process`` returns for the processed frame.
    """
    print("Getting data for: %s - %s from %s with thresholds %s" %
          (symbols_list_name, resample_period, start_date, list(thresholds)))

    # while not decided imputation/removals
    symbols = load_symbol_list(symbols_list_name)
    end_date = '2019-12-31'

    df_prices = get_prices(symbols_list_name=symbols_list_name,
                           start_date=start_date,
                           resample_period=resample_period)

    df_fund = get_fundamentals(symbols_list_name=symbols_list_name,
                               start_date=start_date,
                               end_date=end_date,
                               resample_period=resample_period)

    sic_code, sic_industry = load_sic()

    alist_path = os.path.join(DATA_PATH, 'available_%s' % symbols_list_name)

    if os.path.isfile(alist_path):
        # Context manager so the handle is closed deterministically; blank
        # lines are dropped so they never turn into empty symbols.
        with open(alist_path) as alist_file:
            stripped_lines = (line.strip() for line in alist_file)
            available_symbols = [line for line in stripped_lines if line]
    else:
        # The index must be concrete before it is read.
        # NOTE(review): presumably df_fund is a PyCOMPSs future here - confirm.
        df_fund = compss_wait_on(df_fund)
        available_symbols = {symbol for symbol, _ in df_fund.index.values}
        unavailable = [s for s in symbols if s not in available_symbols]
        removed_symbols = ['ULTA']
        print("Not available symbols: %s\nRemoved symbols: %s" %
              (unavailable, removed_symbols))

        for s in removed_symbols:
            try:
                available_symbols.remove(s)
            except KeyError:
                print("Couldn't remove symbol %s" % s)

        # Reuse the path computed above instead of rebuilding it.
        with open(alist_path, 'w') as f:
            f.write('\n'.join(available_symbols))

    df = process_symbols(available_symbols, df_fund, df_prices, sic_code,
                         sic_industry, thresholds, target_shift)

    normal_name, z_name = get_datasets_name(resample_period, symbols_list_name,
                                            thresholds, target_shift)

    normal_file = os.path.join(DATA_PATH, normal_name)
    z_file = os.path.join(DATA_PATH, z_name)

    return post_process(df, (normal_file, z_file))
Exemplo n.º 3
0
    def collect(self) -> pd.DataFrame:
        """Fetch the prices of every listed symbol and stack them.

        :return: ``pd.concat`` of the per-symbol results of
            ``_get_symbol_prices``, in symbol-list order.
        """
        per_symbol_frames = [
            self._get_symbol_prices(symbol=ticker)
            for ticker in load_symbol_list(self.symbols_list_name)
        ]
        return pd.concat(per_symbol_frames)
Exemplo n.º 4
0
    def _add_periods_info(self, series_financials_dict: dict,
                          save: bool = True):
        """Attach reporting-period metadata to the collected rows, in place.

        For every symbol the reporting periods are fetched from
        ``self.report_periods_url`` and, for each of them, the matching
        ``series_financials_dict[symbol]['<year><quarter>']`` entry is updated
        with the period dates and labels. Periods without a matching entry
        are reported on stdout and skipped.

        :param series_financials_dict: mapping symbol -> period key -> row
            dict; mutated in place.
        :param save: not used by this method.
        :return: ``None``.
        """
        for symbol in load_symbol_list(self.symbols_list_name):
            periods_url = self.report_periods_url.substitute(symbol=symbol)
            response = call_and_cache(periods_url, cache=self.cache)
            periods = response['data']

            for pos, info in enumerate(periods):
                period_start = info['start_date']
                period_end = info['end_date']
                fiscal_quarter = info['fiscal_period']
                fiscal_year = info['fiscal_year']

                # Start of the following period: three months after this
                # period's end, unless a neighbouring entry supplies the
                # real date.
                # NOTE(review): taking index - 1 as the *next* period implies
                # the service lists periods newest-first - confirm.
                estimated_next = (datetime.strptime(period_end, DATE_FORMAT)
                                  + relativedelta(months=3))
                following_start = estimated_next.strftime(DATE_FORMAT)
                if pos > 0:
                    following_start = periods[pos - 1]['start_date']

                # Start of the preceding period: three months before this
                # period's start, unless the list has an older entry.
                estimated_prev = (datetime.strptime(period_start, DATE_FORMAT)
                                  - relativedelta(months=3))
                preceding_start = estimated_prev.strftime(DATE_FORMAT)
                if pos < len(periods) - 1:
                    preceding_start = periods[pos + 1]['start_date']

                period_fields = {
                    Tags.symbol: symbol,
                    Tags.quarter: fiscal_quarter,
                    Tags.year: fiscal_year,
                    Tags.current_date: period_end,
                    Tags.prev_date: preceding_start,
                    Tags.next_date: following_start,
                    Tags.rep_period: '{}:{}'.format(period_start, period_end),
                    Tags.next_rep_period: '{}:{}'.format(period_end,
                                                         following_start),
                }

                period_key = '{}{}'.format(fiscal_year, fiscal_quarter)
                try:
                    series_financials_dict[symbol][period_key].update(
                        period_fields)
                except KeyError:
                    print("No fundamental info for symbol %s in period %s" % (
                        symbol, period_key))
Exemplo n.º 5
0
    def _collect_fundamentals(self, save=True):
        """Download the fundamentals of every symbol, year and quarter.

        For each symbol and each ``'<year><quarter>'`` period, one row dict is
        built holding the symbol, year and quarter plus every ``tag -> value``
        pair returned for each statement in ``self.statements``.

        :param save: when true, persist the result with ``save_obj`` under
            ``<DATA_PATH>/obj/<list>_<start_year>-<end_year>_financials``.
        :return: mapping symbol -> period key -> row dict.
        """
        # Parameter dependent variables
        year_range = list(range(self.start_year, self.end_year))
        symbols = load_symbol_list(self.symbols_list_name)
        max_attempts = 3

        series_financials_dict = {}
        for symbol in symbols:
            period_dict = {}
            for year in year_range:
                for quarter in self.quarters_names:

                    symbol_dict = {Tags.symbol: symbol,
                                   Tags.year: year,
                                   Tags.quarter: quarter}

                    for statement in self.statements:

                        url = self.fundamentals_url \
                            .substitute(symbol=symbol,
                                        statement=statement,
                                        year=year,
                                        period=quarter)

                        # Bounded retry: the previous counter was never
                        # decremented, so a response without 'data' looped
                        # forever and the failure message was unreachable.
                        for _ in range(max_attempts):
                            data_json = call_and_cache(url, cache=self.cache)
                            if 'data' in data_json:
                                symbol_dict.update(
                                    {element['tag']: element['value']
                                     for element in data_json['data']})
                                break
                        else:
                            print(
                                "Couldn't get data after 3 retries for url: %s" % url)

                    period_dict['{}{}'.format(year, quarter)] = symbol_dict

            # Register the symbol once, after all of its periods are built
            # (also covers an empty year range).
            series_financials_dict[symbol] = period_dict

        if save:
            save_obj(series_financials_dict,
                     '%s/obj/%s_%s-%s_financials' % (
                         DATA_PATH,
                         self.symbols_list_name,
                         self.start_year,
                         self.end_year))
        return series_financials_dict
Exemplo n.º 6
0
def main():
    """Download historical quotes for every symbol listed in a file.

    Expects exactly two command-line arguments: the symbol file and the
    destination directory. Each symbol is fetched for the fixed range
    2000-01-01 .. 2014-09-21 and cached as ``<destination-dir>/<symbol>.csv``.
    A failed download is reported and skipped. The loop pauses 30 seconds
    after every symbol.

    :raises SystemExit: with code 1 when the argument count is wrong.
    """
    if len(sys.argv) != 3:
        print('Usage {}: <symbol-file> <destination-dir>'.format(sys.argv[0]))
        # sys.exit instead of the site-provided exit(), which is meant for
        # the interactive shell and is absent when site is not loaded.
        sys.exit(1)

    symbols = utils.load_symbol_list(sys.argv[1])
    base_dir = sys.argv[2]

    for symbol in symbols:
        dest_file = os.path.join(base_dir, symbol + '.csv')
        try:
            finance.fetch_historical_yahoo(symbol, (2000, 1, 1), (2014, 9, 21),
                                           cachename=dest_file)
        except urllib2.URLError:
            print('Download failed for symbol: {}'.format(symbol))
        time.sleep(30)
Exemplo n.º 7
0
    def _collect_attr_names(self, save=True):
        """Collect the attribute tags reported for every statement type.

        Every symbol / year / quarter / statement combination is requested
        and the ``tag`` of each returned element is added to the set kept for
        that statement.

        :param save: when true, persist the result with ``save_obj`` under
            ``<DATA_PATH>/<list>_<start_year>-<end_year>_attr_names``.
        :return: ``defaultdict(set)`` mapping statement -> set of tags.
        """
        # Parameter dependent variables
        symbols = load_symbol_list(self.symbols_list_name)
        max_attempts = 3

        attr_names = defaultdict(set)

        for symbol in symbols:
            for year in self.year_range:
                for quarter_name in self.quarters_names:

                    for statement in self.statements:

                        url = self.fundamentals_url.substitute(symbol=symbol,
                                                               statement=statement,
                                                               year=year,
                                                               period=quarter_name)

                        # Bounded retry: the previous counter was never
                        # decremented and 'data' was read unguarded, so a
                        # response without it raised KeyError instead of
                        # being retried.
                        for _ in range(max_attempts):
                            data_json = call_and_cache(url, cache=self.cache)
                            if 'data' in data_json:
                                tags = {element['tag']
                                        for element in data_json['data']}
                                attr_names[statement].update(tags)
                                break
                        else:
                            print(
                                "Couldn't get data after 3 retries for url: %s" % url)

        if save:
            save_obj(attr_names,
                     '%s/%s_%s-%s_attr_names' % (DATA_PATH,
                                                 self.symbols_list_name,
                                                 self.start_year,
                                                 self.end_year))
        return attr_names
Exemplo n.º 8
0
            dfs.append(
                self._get_symbol_prices(symbol=symbol))

        return pd.concat(dfs)


if __name__ == "__main__":
    symbols_list_name = 'sp500'
    start_date = '2006-01-01'

    # df = PriceExtractor(symbols_list_name=symbols_list_name,
    #                     start_date=start_date).collect()

    import fix_yahoo_finance as yf

    symbols = load_symbol_list(symbols_list_name)
    end_date = '2018-12-31'

    dfs = []
    for s in symbols:
        try:
            data = yf.download(s, start_date, end_date)
            df = (data
                  .assign(symbol=s)[['Adj Close', 'symbol']]
                  .rename(index=str, columns={'Adj Close': 'price'}))
            dfs.append(df)
        except ValueError as e:
            print(e)
            print("Exception downloading: %s" % s)

    pzs = (pd.concat(dfs)