def scrape_latest(soup, url, trading_date, formatted_output):
    if soup is None:
        print('Insider ERR: no result for <' + url + '>')
        return None
    table = soup.find('table', {'class': 'nc'})
    if table is None:
        if S.DBG_ALL:
            print('INFO: No insider data is available for <' + url + '>')
        return None
    insiders = {}
    director = "director" in url
    # for each row, there are many rows including no table
    for tr in table.findAll('tr'):
        td = tr.findAll('td')
        if S.DBG_INSIDER:
            print("DBG:")
            for x in td:
                print repr(x)
        # u'\u2019' is the last char in DATO' which can't be encoded to ascii
        # insider = [x.text.replace(u'\u2019', '').strip().encode("ascii") for x in td]
        insider = [printable(x.text.replace(u'\u2019', '').encode("ascii")).strip() for x in td]
        if len(insider) >= 10:
            name, chg_date, price, view = "", "", "", ""
            from_date, to_date, min_price, max_price = "", "", "", ""
            if len(insider) == 11:
                stock, announce_date, name, chg_date, chg_type, shares, price, direct, indirect, total = \
                    unpack_latest_td(*insider)
                view = S.I3_KLSE_URL + td[10].find('a').get('href').encode("ascii")
                if S.DBG_ALL or S.DBG_INSIDER:
                    print("%s, %s, %s, %s, %s, %s, %s, %s, %s, %s" %
                          (stock, announce_date, chg_date, chg_type, shares,
                           price, direct, indirect, total, view))
            else:
                stock, announce_date, from_date, to_date, chg_type, shares, min_price, max_price, total = \
                    unpack_company_td(*insider)
                view = S.I3_KLSE_URL + td[9].find('a').get('href').encode("ascii")
                if S.DBG_ALL or S.DBG_INSIDER:
                    print("%s, %s, %s, %s, %s, %s, %s, %s, %s" %
                          (stock, announce_date, from_date, to_date, chg_type,
                           shares, min_price, max_price, total))
            ann_date = change2KlseDateFmt(announce_date, "%d-%b-%Y")
            trd_date = change2KlseDateFmt(trading_date, "%d-%b-%Y")
            if S.DBG_QR:
                print("DBG:dates:{0}:{1}".format(ann_date, trd_date))
            if ann_date >= trd_date:
                if len(insider) == 11:
                    if stock not in insiders:
                        insiders[stock] = []
                    insiders[stock].append(format_insider(
                        formatted_output, director, stock, announce_date, name,
                        chg_date, chg_type, shares, price, view))
                else:
                    if stock not in insiders:
                        insiders[stock] = []
                    insiders[stock].append(format_company(
                        formatted_output, stock, announce_date, from_date, to_date,
                        chg_type, shares, min_price, max_price, total, view))
            else:
                break
    return insiders


def scrape_entitlement(soup, url, trading_date, formatted_output):
    if soup is None:
        print('Insider ERR: no result for <' + url + '>')
        return None
    table = soup.find('table', {'class': 'nc'})
    if table is None:
        if S.DBG_ALL:
            print('INFO: No insider data is available for <' + url + '>')
        return None
    entitlements = {}
    others = "others" in url
    # for each row, there are many rows including no table
    for tr in table.findAll('tr'):
        td = tr.findAll('td')
        if S.DBG_INSIDER:
            print("DBG:")
            for x in td:
                print repr(x)
        # u'\u2019' is the last char in DATO' which can't be encoded to ascii
        # insider = [x.text.replace(u'\u2019', '').strip().encode("ascii") for x in td]
        insider = [printable(x.text.replace(u'\u2019', '').encode("ascii")).strip() for x in td]
        if len(insider) >= 7:
            if len(insider) == 7:
                announce_date, stock, open_price, current_price, dividend, ex_date = \
                    unpack_dividend_td(*insider)
                view = S.I3_KLSE_URL + td[6].find('a').get('href').encode("ascii")
            else:
                announce_date, stock, subject, open_price, current_price, ratio, ex_date = \
                    unpack_others_td(*insider)
                view = S.I3_KLSE_URL + td[7].find('a').get('href').encode("ascii")
            if S.DBG_ALL or S.DBG_INSIDER:
                print "view: {}".format(view)
            ann_date = change2KlseDateFmt(announce_date, "%d-%b-%Y")
            trd_date = change2KlseDateFmt(trading_date, "%d-%b-%Y")
            if S.DBG_QR:
                print("DBG:dates:{0}:{1}".format(ann_date, trd_date))
            if ann_date >= trd_date:
                if len(insider) == 7:
                    entitlements[stock] = format_dividend(
                        formatted_output, others, announce_date, stock, "",
                        open_price, current_price, dividend, ex_date, view)
                else:
                    entitlements[stock] = format_dividend(
                        formatted_output, others, announce_date, stock, subject,
                        open_price, current_price, ratio, ex_date, view)
            else:
                break
    return entitlements


def scrape_latest_ar(soup, trading_date):
    if soup is None or len(soup) <= 0:
        print('LatestAR ERR: no result')
        return None
    table = soup.find('table', {'class': 'nc'})
    if table is None:
        if S.DBG_ALL:
            print('INFO: No Latest AR data is available')
        return None
    ar_list = {}
    for tr in table.findAll('tr'):
        td = tr.findAll('td')
        latest_ar = [x.text.strip().replace(' ', '').encode("ascii") for x in td]
        # latest_ar = [printable(x.text.encode("ascii").replace(' ', '')).strip() for x in td]
        if S.DBG_QR:
            print("DBG:")
            for x in latest_ar:
                print repr(x)
        if len(latest_ar) > 0:
            [stock, fy, ann_date, announce_date, latest_ann] = unpack_latest_ar(*latest_ar)
            if announce_date == trading_date:
                if stock not in ar_list:
                    links = tr.findAll('a')
                    jsp_link = ""
                    for link in links:
                        jsp_link = link.get('href')
                        if "annual" in jsp_link:
                            jsp_link = get_yoy_links(jsp_link)
                            if len(jsp_link) > 0:
                                break
                    ar_list[stock] = [fy, ann_date, announce_date, latest_ann, jsp_link]
                else:
                    print("INFO: Duplicated announcement: " +
                          stock + ":" + latest_ann + ":" + announce_date)
            else:
                ann_dt = change2KlseDateFmt(announce_date, "%d-%b-%Y")
                trd_dt = change2KlseDateFmt(trading_date, "%d-%b-%Y")
                if S.DBG_QR:
                    print("DBG:dates:{0}:{1}".format(ann_dt, trd_dt))
                if ann_dt < trd_dt:
                    break
    return ar_list


def scrape_latest_qr(soup, trading_date):
    if soup is None or len(soup) <= 0:
        print('LatestQR ERR: no result')
        return None
    table = soup.find('table', {'class': 'nc'})
    if table is None:
        if S.DBG_ALL:
            print('INFO: No Latest QR data is available')
        return None
    qr_list = {}
    pdf_list = {}
    for tr in table.findAll('tr'):
        td = tr.findAll('td')
        latestQR = [x.text.strip().replace(' ', '').encode("ascii") for x in td]
        # latestQR = [printable(x.text.encode("ascii").replace(' ', '')).strip() for x in td]
        if S.DBG_QR:
            print("DBG:")
            for x in latestQR:
                print repr(x)
        if len(latestQR) > 0:
            [stock, announcementDate, qd, qn, rev, pbt, np, div, roe,
             eps, dps, qoq, yoy] = unpack_latest_qr(*latestQR)
            if announcementDate == trading_date:
                if stock not in qr_list:
                    links = tr.findAll('a')
                    jsp_link = ""
                    for link in links:
                        jsp_link = link.get('href')
                        if "QoQ" in jsp_link:
                            jsp_link = get_qoq_links(jsp_link)
                            if len(jsp_link) > 0:
                                break
                    qr_list[stock] = [announcementDate, qd, qn, rev, pbt, np,
                                      div, roe, eps, dps, qoq, yoy, jsp_link]
                    # pdf_list[stock] = review_pdf(jsp_link.keys())
                else:
                    print("INFO: Duplicated announcement: " + stock + ":" + qd + ":Q" + qn)
            else:
                ann_dt = change2KlseDateFmt(announcementDate, "%d-%b-%Y")
                trd_dt = change2KlseDateFmt(trading_date, "%d-%b-%Y")
                if S.DBG_QR:
                    print("DBG:dates:{0}:{1}".format(ann_dt, trd_dt))
                if ann_dt < trd_dt:
                    break
    return qr_list


def scrape_listing(soup, trading_date, formatted_output):
    if soup is None:
        print('Insider ERR: no result for <' + I3_LISTING_URL + '>')
        return None
    table = soup.find('table', {'class': 'nc'})
    if table is None:
        if S.DBG_ALL:
            print('INFO: No insider data is available for <' + I3_LISTING_URL + '>')
        return None
    listings = {}
    for tr in table.findAll('tr'):
        td = tr.findAll('td')
        if S.DBG_INSIDER:
            print("DBG:")
            for x in td:
                print repr(x)
        # u'\u2019' is the last char in DATO' which can't be encoded to ascii
        # insider = [x.text.replace(u'\u2019', '').strip().encode("ascii") for x in td]
        insider = [printable(x.text.replace(u'\u2019', '').encode("ascii")).strip() for x in td]
        if len(insider) >= 7:
            stock, announce_date, listing_date, type, units, price = \
                unpack_listing_td(*insider)
            view = S.I3_KLSE_URL + td[6].find('a').get('href').encode("ascii")
            if S.DBG_ALL or S.DBG_INSIDER:
                print "view: {}".format(view)
            ann_date = change2KlseDateFmt(announce_date, "%d-%b-%Y")
            trd_date = change2KlseDateFmt(trading_date, "%d-%b-%Y")
            if S.DBG_QR:
                print("DBG:dates:{0}:{1}".format(ann_date, trd_date))
            if ann_date >= trd_date:
                listings[stock] = format_listing(formatted_output, stock, announce_date,
                                                 listing_date, type, units, price, view)
            else:
                break
    return listings


def scrape_target(soup, trading_date, formatted_output):
    if soup is None:
        print('Insider ERR: no result for <' + I3_TARGET_URL + '>')
        return None
    table = soup.find('table', {'class': 'nc'})
    if table is None:
        if S.DBG_ALL:
            print('INFO: No insider data is available for <' + I3_TARGET_URL + '>')
        return None
    targets = {}
    for tr in table.findAll('tr'):
        td = tr.findAll('td')
        if S.DBG_INSIDER:
            print("DBG:")
            for x in td:
                print repr(x)
        # u'\u2019' is the last char in DATO' which can't be encoded to ascii
        # insider = [x.text.replace(u'\u2019', '').strip().encode("ascii") for x in td]
        insider = [printable(x.text.replace(u'\u2019', '').encode("ascii")).strip() for x in td]
        if len(insider) >= 7:
            announce_date, stock, last_price, target, upside_down, call, source = \
                unpack_listing_td(*insider)
            ann_date = change2KlseDateFmt(announce_date, "%d/%m/%Y")
            trd_date = change2KlseDateFmt(trading_date, "%d-%b-%Y")
            if S.DBG_QR:
                print("DBG:dates:{0}:{1}".format(ann_date, trd_date))
            if ann_date >= trd_date:
                targets[stock] = format_target(formatted_output, announce_date, stock,
                                               last_price, target, upside_down, call, source)
            else:
                break
    return targets


def unpackTD(dt, price_open, price_range, price_close, change, volume):
    '''
    Sample table:
        <tr>
            <td class="left">13/04/2018</td>
            <td class="right">2.92</td>
            <td class="right">2.92 - 2.98</td>
            <td class="right">2.98</td>
            <td class="right" nowrap="nowrap"><span class="up">0.00 (0.00%)</span></td>
            <td class="right">10,500</td>
        </tr>
    '''
    # dt = datetime.datetime.strptime(dt, "%d/%m/%Y").strftime('%Y-%m-%d')
    dt = change2KlseDateFmt(dt, "%d/%m/%Y")
    prange = [x.strip() for x in price_range.split('-')]
    return dt, price_open, prange[1], prange[0], price_close, volume

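# Usage sketch for unpackTD (illustrative only): assuming change2KlseDateFmt
# simply normalises "13/04/2018" into the KLSE date format used elsewhere in
# this module, the sample row in the docstring above would unpack as:
#
#   unpackTD("13/04/2018", "2.92", "2.92 - 2.98", "2.98",
#            "0.00 (0.00%)", "10,500")
#   # -> (<normalised date>, "2.92", "2.98", "2.92", "2.98", "10,500")
#   #    i.e. date, open, range high, range low, close, volume;
#   #    the "change" column is dropped by the function.

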
def scrapeFinancials(soup, counter, term, report):
    if soup is None or len(soup) <= 0:
        print 'ERR: no result'
        return None
    # klsecol = WSJCOL[report] + term
    table = soup.find('table', {'class': 'cr_dataTable'})
    if table is None:
        print "ERR:", counter, term, report
        return None
    qrs = {counter: {}}
    # for each row, there are many rows including no table
    for tr in table.findAll('tr'):
        td = tr.findAll('td')
        if len(td) == 0:
            # Header row: collect the period column headings
            for th in table.findAll('th'):
                tht = th.text
                if len(tht) <= 0:
                    continue
                if tht[0].isalpha() or tht[-1].isalpha():
                    # Skip "fiscalYr" class and 5-year trend
                    continue
                if term == 'Q':
                    qrs[counter][change2KlseDateFmt(tht, '%d-%b-%Y')] = True
                else:
                    qrs[counter][tht] = True
            '''
            for qr in qrs.iterkeys():
                if db[klsecol].find({counter: {term: qr}}).count() <= 0:
                    qrs[qr] = False
            print qrs
            '''
            continue
        # qrs[counter][]  # incomplete statement in the source; data rows are not stored
    return qrs


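# The scrapers in this section all take a pre-parsed BeautifulSoup document
# ("soup"). A minimal sketch of how such a soup could be built is given below;
# this is an assumption for illustration only (the project's own fetch or
# connector helper is not shown in this section), and it presumes `requests`
# and `bs4` are available.
def fetch_soup_example(url):
    import requests
    from bs4 import BeautifulSoup
    resp = requests.get(url)  # plain GET; the real code may retry or throttle
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'html.parser')
# e.g. (hypothetical call):
#   soup = fetch_soup_example(url)
#   insiders = scrape_latest(soup, url, trading_date, formatted_output)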