def __init__(self, url, from_date, to_date, datetime_format="%Y-%m-%d", browser=None): # TODO: check the url is for a economic calendar if "economic-calendar" not in url: raise ValueError, "The given url does not look like an economic calendar." # TODO: parse url self.url = url _ = self.url.split("/") _ = _[-1].split("-") self.calendar, self.id = string.join(_[:-1], "-"), _[-1] self.timezone = "America/New_York" self.datetime_format = datetime_format self.from_date = parse_tz(from_date, in_tz=None) self.to_date = parse_tz(to_date, in_tz=None) self.browser = webdriver.Chrome() if not browser else browser self.html_table = None self.table = None
def __init__(self, url, resolution, from_date, to_date, datetime_format="%Y-%m-%d", browser=None):
    """Record the instrument, resolution and date range, then load its page.

    Parameters
    ----------
    url : str
        Address of the instrument. One of its "/"-separated components must
        be "www.investing.com". The last component is stored as
        ``self.instrument``; the components between the host and the last
        one, joined with "/", are stored as ``self.category``.
    resolution
        Stored unchanged in ``self.resolution``.
    from_date, to_date
        Range limits; each is passed through ``parse_tz(..., in_tz=None)``
        before being stored.
    datetime_format : str
        Stored unchanged in ``self.datetime_format``.
    browser
        Browser driver to reuse. When falsy, a new ``webdriver.Chrome()`` is
        started.

    Raises
    ------
    ValueError
        If "www.investing.com" is not among the components left after the
        last one has been removed (raised by ``list.index``).
    """
    self.timezone = "America/New_York"
    self.datetime_format = datetime_format
    self.url = url
    self.resolution = resolution
    self.from_date = parse_tz(from_date, in_tz=None)
    self.to_date = parse_tz(to_date, in_tz=None)

    # Last component is the instrument; whatever sits between the host and
    # the instrument is the category path.
    parts = self.url.split("/")
    self.instrument = parts.pop()
    host_at = parts.index("www.investing.com")
    # ``str.join`` replaces ``string.join``, which no longer exists on Python 3.
    self.category = "/".join(parts[host_at + 1:])

    self.browser = browser if browser else webdriver.Chrome()
    # ``self.URL`` is defined outside this method; it is used here as a
    # format string taking ``category`` and ``instrument`` fields.
    self.browser.get(
        self.URL.format(category=self.category, instrument=self.instrument))

    self._html_table = None
    self.table = None
def _parse_dates(self, cell):
    """Strip a parenthesised tag from ``cell`` and parse what is left.

    Every substring matching ``\\(\\w+\\)`` is located; if there is at least
    one, all occurrences of the last match are removed from ``cell``. The
    remaining text is handed to ``parse_tz`` with ``in_tz=None`` and that
    result is returned as-is (no formatting is applied here).
    """
    tags = re.findall(r"\(\w+\)", cell)
    if tags:
        cell = cell.replace(tags[-1], "")
    return parse_tz(datetime_str=cell, in_tz=None)
def set_html_table(self):
    """Locate the "curr_table" element and store it in ``self._html_table``.

    Steps:

    1. When ``self.resolution`` is not "Daily", click the option of the
       "data_interval" element whose value equals ``self.resolution`` and
       wait (up to 10 seconds) for "curr_table" to be present. Otherwise
       the table is looked up directly.
    2. Parse the first cell of the table's last row with ``parse_tz``. If
       that date is later than ``self.from_date``, fill the "startDate" and
       "endDate" fields with ``self.from_date`` / ``self.to_date`` formatted
       as ``%m/%d/%Y``, apply them, and wait again for "curr_table".

    Returns
    -------
    None
    """
    wait = WebDriverWait(self.browser, 10)
    table_locator = (By.ID, "curr_table")

    if self.resolution != "Daily":
        time_frame = self.browser.find_element(By.ID, "data_interval")
        for option in time_frame.find_elements(By.TAG_NAME, "option"):
            if option.get_attribute("value") == self.resolution:
                option.click()
                break
        html_table = wait.until(EC.presence_of_element_located(table_locator))
    else:
        html_table = self.browser.find_element(*table_locator)

    # ``find_element(By.CSS_SELECTOR, ...)`` is the locator form used by the
    # rest of this method; the ``find_element_by_*`` helpers are gone from
    # Selenium 4.
    last_row_cell = html_table.find_element(
        By.CSS_SELECTOR, "tbody tr:last-child td")
    last_record_date = parse_tz(datetime_str=last_row_cell.text, in_tz=None)

    if last_record_date > self.from_date:
        # The clicks go through JavaScript, not ``element.click()``.
        date_range_button = self.browser.find_element(
            By.ID, "widgetFieldDateRange")
        self.browser.execute_script("arguments[0].click();", date_range_button)

        start_date_field = self.browser.find_element(By.ID, "startDate")
        start_date_field.clear()
        start_date_field.send_keys(self.from_date.strftime("%m/%d/%Y"))

        end_date_field = self.browser.find_element(By.ID, "endDate")
        end_date_field.clear()
        end_date_field.send_keys(self.to_date.strftime("%m/%d/%Y"))

        apply_date_btn = self.browser.find_element(By.ID, "applyBtn")
        self.browser.execute_script("arguments[0].click();", apply_date_btn)

        html_table = wait.until(EC.presence_of_element_located(table_locator))

    self._html_table = html_table
def request_data(*args, **kwargs):
    """Scrape the EIA crude oil inventories history table from investing.com.

    A Chrome session opens the event page, clicks "show more" until the
    last row of the history table is no later than ``from_date``, and the
    table is then converted to a pandas DataFrame.

    Keyword arguments read
    ----------------------
    from_date, to_date
        Range limits, parsed with ``parse_tz(..., in_tz=timezone)``.
    timezone
        Time zone of the limits and target zone for the "Datetime" values.
    datetime_format
        Format passed to ``timezone_shift`` for the "Datetime" values.

    Positional arguments are accepted but not used.

    Returns
    -------
    pandas.DataFrame
        Indexed by "Datetime", restricted to rows inside the requested
        range, without the "Release Date", "Time" and "Unnamed: 5" columns
        and with an extra "Better" column. String cells have a surrounding
        "M" stripped and are evaluated to Python values.

    Notes
    -----
    The browser is always quit and the ``LC_TIME`` locale is always restored
    to the value it had on entry, including when scraping fails.
    """
    import locale

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    from pyCBT.common.timezone import parse_tz, timezone_shift

    source_tz = "America/New_York"
    target_tz = kwargs.get("timezone")
    tag_pattern = r"\(\w+\)"
    table_locator = (By.ID, "eventHistoryTable75")

    def read_last_record_date(table_element):
        # Date shown in the first cell of the table's last row.
        last_date_str = table_element.find_element(
            By.CSS_SELECTOR, "tbody tr:last-child td").text
        return parse_tz(remove_pattern(last_date_str, tag_pattern),
                        in_tz=source_tz)

    # Remember the current setting so it can be put back afterwards;
    # ``locale.resetlocale`` is not available on recent Python versions.
    previous_locale = locale.setlocale(locale.LC_TIME)
    locale.setlocale(locale.LC_TIME, "en_US")
    browser = None
    try:
        from_date = parse_tz(kwargs.get("from_date"), in_tz=target_tz)
        to_date = parse_tz(kwargs.get("to_date"), in_tz=target_tz)

        browser = webdriver.Chrome()
        browser.get(
            "https://www.investing.com/economic-calendar/eia-crude-oil-inventories-75"
        )
        inv_table = browser.find_element(*table_locator)
        last_record_date = read_last_record_date(inv_table)

        # Keep loading older rows until the table reaches back to from_date.
        wait = WebDriverWait(browser, 10)
        while last_record_date > from_date:
            show_more = wait.until(
                EC.element_to_be_clickable((By.ID, "showMoreHistory75")))
            browser.execute_script("arguments[0].click();", show_more)
            inv_table = wait.until(
                inventory_table_has_changed_from(table_locator, inv_table))
            last_record_date = read_last_record_date(inv_table)

        table = pd.read_html(u"<table>" + inv_table.get_attribute("innerHTML")
                             + u"</table>")[0]
        table.insert(0, "Datetime",
                     value=table["Release Date"] + " " + table["Time"])

        # One flag per <span>: True/False when it has a non-blank title,
        # None otherwise. Built as a list because ``map`` returns an
        # iterator on Python 3.
        better = []
        for span in inv_table.find_elements(By.TAG_NAME, "span"):
            title = span.get_attribute("title") or ""
            better.append("better" in title.lower() if title.strip() else None)
        table.insert(table.columns.size, "Better", value=better)

        table["Datetime"] = table["Datetime"].apply(remove_pattern,
                                                    args=(tag_pattern, ))
        table["Datetime"] = table["Datetime"].apply(
            timezone_shift,
            args=(source_tz, target_tz, kwargs.get("datetime_format")))

        # NOTE(review): "Datetime" appears to have just been shifted to
        # kwargs["timezone"], yet it is re-parsed here as America/New_York.
        # Confirm this is intended.
        mask = [
            not (from_date <= parse_tz(release_date, in_tz=source_tz) <= to_date)
            for release_date in table["Datetime"]
        ]
        table.drop(table.index[mask], axis="index", inplace=True)
        table.drop(["Release Date", "Time", "Unnamed: 5"], axis="columns",
                   inplace=True)
        table.set_index("Datetime", inplace=True)

        # SECURITY NOTE: ``eval`` runs on text scraped from a web page. It is
        # kept to preserve the current results, but should be replaced by an
        # explicit numeric parser.
        table = table.applymap(
            lambda cell: eval(cell.strip("M")) if isinstance(cell, str) else cell)
    finally:
        if browser is not None:
            browser.quit()
        locale.setlocale(locale.LC_TIME, previous_locale)

    return table