def bloomberg_test():
    """Smoke-test the bloomberg scraper: news article, press release, and quote page."""
    passed, failed = Test('bloomberg_test', 1), Test('bloomberg_test', 0)
    source = 'bloomberg'

    # Regular news article: expect parsed text, a plausible pubdate, and url-term overlap.
    article_url = 'https://www.bloomberg.com/news/articles/2017-08-04/blue-apron-plans-to-cut-24-of-staff-barely-a-month-since-ipo'
    article = scrape(article_url, source)
    if not isinstance(article.article, str):
        return failed
    expected_date = datetime(2017, 8, 4, 11, 58, tzinfo=timezone('US/Eastern'))
    if not similar_dates(article.pubdate, expected_date):
        return failed
    if not jaccard_coeff(article.article, 'bloomberg+news'):
        return failed

    # Press release: same three checks against the press-release template.
    pr_url = 'https://www.bloomberg.com/press-releases/2017-08-01/marathon-kids-run-all-50-states'
    press = scrape(pr_url, source)
    if not isinstance(press.article, str):
        return failed
    pr_date = datetime(2017, 8, 1, 13, 0, tzinfo=timezone('US/Eastern'))
    if not similar_dates(press.pubdate, pr_date):
        return failed
    if not jaccard_coeff(press.article, 'bloomberg+press-releases'):
        return failed

    # Quote ("homepage") url: scraping should yield a non-empty list of links.
    quote_url = 'https://www.bloomberg.com/quote/APRN:US'
    links = scrape(quote_url, source)
    if not isinstance(links, list):
        return failed
    if not links:
        return failed
    # for link in links: print(link)
    return passed
def wsj_test():
    """Smoke-test the wsj scraper: one article page and one quote page."""
    passed, failed = Test('wsj_test', 1), Test('wsj_test', 0)
    source = 'wsj'

    # Article page: parsed text, pubdate proximity, and url-term overlap.
    article_url = 'https://www.wsj.com/articles/BT-CO-20170811-709167'
    article = scrape(article_url, source)
    if not isinstance(article.article, str):
        return failed
    expected_date = datetime(2017, 8, 11, 4, 49, tzinfo=timezone('US/Eastern'))
    if not similar_dates(article.pubdate, expected_date):
        return failed
    if not jaccard_coeff(article.article, ''):
        return failed

    # Blog-post variant left disabled (expected pubdate was never filled in):
    # url = 'https://blogs.wsj.com/moneybeat/2017/08/11/daniel-loebs-third-point-exits-snap-returns-to-alibaba/'
    # result = scrape(url, source)
    # if not isinstance(result.article, str): return failed
    # if not similar_dates(result.pubdate, datetime(2017, , , , , tzinfo=timezone('US/Eastern'))): return failed
    # if not jaccard_coeff(result.article, ''): return failed

    # Quote page should yield a non-empty list of crawled links.
    quote_url = 'http://quotes.wsj.com/SNAP'
    links = scrape(quote_url, source)
    if not isinstance(links, list):
        return failed
    if not links:
        return failed
    # for link in links: print(link)
    return passed
def bloomberg_pr_error_test():
    """Debug harness reproducing a bloomberg press-release scraping error case.

    Always returns ``passed``; the value of the run is the printed output.
    """
    passed, failed = Test('bloomberg_pr_error_test', 1), Test('bloomberg_pr_error_test', 0)
    url1 = 'https://www.bloomberg.com/press-releases/2017-11-01/gopro-announces-third-quarter-2017-results'
    url2 = 'https://www.bloomberg.com/press-releases/2017-11-06/proassurance-reports-results-for-third-quarter-2017'
    source = 'bloomberg'
    # Scraper flags: check dates, crawl depth 1, skip url validation, require >= 100 chars.
    flags = {
        'date_checker': True,
        'depth': 1,
        'validate_url': False,
        'length_check': True,
        'min_length': 100,
    }
    node1 = scrape(url1, source, 'GPRO', flags)
    # node2 = scrape(url2, source, 'AAPL', flags)
    print(node1)
    node_dict = dict(node1)
    print(node_dict.keys())
    print(node_dict['article'].split(' '))
    # with open('../data/test.csv', 'w') as f:
    #     fieldnames = node_dict.keys()  # sort to ensure they are the same order every time
    #     writer = csv.DictWriter(f, fieldnames=fieldnames)
    #     writer.writeheader()
    #     writer.writerows(node_dict)
    # print(node2)
    return passed
def msn_test():
    """Smoke-test the msn scraper: article page and stock-details page.

    Fixes: the minute was written as the literal ``01``, which is a
    SyntaxError in Python 3 (and octal 1 in Python 2, so ``1`` is
    value-preserving), and the body was truncated after the pubdate check;
    the missing checks are restored from the complete duplicate definition
    later in this file.
    """
    passed, failed = Test('msn_test', 1), Test('msn_test', 0)
    source = 'msn'
    # normal url result test
    url = 'https://www.msn.com/en-us/news/technology/paying-professors-inside-googles-academic-influence-campaign/ar-BBEfaQ3'
    result = scrape(url, source)
    if not isinstance(result.article, str):
        return failed
    # NOTE(review): passing a pytz zone via tzinfo yields the zone's LMT
    # offset rather than EST/EDT; presumably similar_dates tolerates the
    # few-minute skew -- confirm against pytz's localize() recommendation.
    if not similar_dates(
            result.pubdate,
            datetime(2017, 7, 15, 10, 1, tzinfo=timezone('US/Eastern'))):
        return failed
    if not jaccard_coeff(result.article, 'msn+en-us+news'):
        return failed
    # homepage result test
    homeurl = 'https://www.msn.com/en-us/money/stockdetails/fi-126.1.GOOG.NAS'
    home_result = scrape(homeurl, source)
    if not isinstance(home_result, list):
        return failed
    if len(home_result) < 1:
        return failed
    # for link in home_result: print(link)
    return passed
def yahoo_test():
    """Smoke-test the yahoofinance scraper: article page and quote page."""
    passed, failed = Test('yahoo_test', 1), Test('yahoo_test', 0)
    source = 'yahoofinance'

    # Article page: parsed text, pubdate proximity, and url-term overlap.
    article_url = 'https://finance.yahoo.com/news/blue-apron-aprn-investors-apos-174305777.html'
    article = scrape(article_url, source)
    if not isinstance(article.article, str):
        return failed
    expected_date = datetime(2017, 7, 11, 13, 43, tzinfo=timezone('US/Eastern'))
    if not similar_dates(article.pubdate, expected_date):
        return failed
    if not jaccard_coeff(article.article, 'yahoofinance+news'):
        return failed

    # Quote page should yield a non-empty list of crawled links.
    quote_url = 'https://finance.yahoo.com/quote/APRN?p=APRN'
    links = scrape(quote_url, source)
    if not isinstance(links, list):
        return failed
    if not links:
        return failed
    # for link in links: print(link)
    return passed
def marketwatch_test():
    """Smoke-test the marketwatch scraper: story page and investing page."""
    passed, failed = Test('marketwatch_test', 1), Test('marketwatch_test', 0)
    source = 'marketwatch'

    # Story page: parsed text, pubdate proximity, and url-term overlap.
    story_url = 'http://www.marketwatch.com/story/verizon-earnings-can-you-hear-me-now-verizons-search-for-success-outside-wireless-2017-07-25'
    story = scrape(story_url, source)
    if not isinstance(story.article, str):
        return failed
    expected_date = datetime(2017, 7, 26, 6, 57, tzinfo=timezone('US/Eastern'))
    if not similar_dates(story.pubdate, expected_date):
        return failed
    if not jaccard_coeff(story.article, 'marketwatch+story'):
        return failed

    # Stock page should yield a non-empty list of crawled links.
    stock_url = 'http://www.marketwatch.com/investing/stock/vz'
    links = scrape(stock_url, source)
    if not isinstance(links, list):
        return failed
    if not links:
        return failed
    # for link in links: print(link)
    return passed
def investopedia_test():
    """Smoke-test the investopedia scraper: news page and markets page."""
    passed, failed = Test('investopedia_test', 1), Test('investopedia_test', 0)
    source = 'investopedia'

    # News article: parsed text, pubdate proximity, and url-term overlap.
    article_url = 'http://www.investopedia.com/news/food-distributors-outperform-despite-amazon/'
    article = scrape(article_url, source)
    if not isinstance(article.article, str):
        return failed
    expected_date = datetime(2017, 8, 9, 17, 9, 0, tzinfo=timezone('US/Eastern'))
    if not similar_dates(article.pubdate, expected_date):
        return failed
    if not jaccard_coeff(article.article, 'investopedia+news'):
        return failed

    # Markets page should yield a non-empty list of crawled links.
    markets_url = 'http://www.investopedia.com/markets/stocks/amzn/'
    links = scrape(markets_url, source)
    if not isinstance(links, list):
        return failed
    if not links:
        return failed
    # for link in links: print(link)
    return passed
def reuters_test():
    """Smoke-test the reuters scraper: article page and stock-overview page."""
    passed, failed = Test('reuters_test', 1), Test('reuters_test', 0)
    source = 'reuters'

    # Article page: parsed text, pubdate proximity, and url-term overlap.
    article_url = 'https://www.reuters.com/article/us-under-armour-strategy-analysis-idUSKBN1AK2H6'
    article = scrape(article_url, source)
    if not isinstance(article.article, str):
        return failed
    expected_date = datetime(2017, 8, 4, 18, 4, 0, tzinfo=timezone('US/Eastern'))
    if not similar_dates(article.pubdate, expected_date):
        return failed
    if not jaccard_coeff(article.article, 'reuters+article'):
        return failed

    # Stock-overview page should yield a non-empty list of crawled links.
    overview_url = 'https://www.reuters.com/finance/stocks/overview?symbol=NKE.N'
    links = scrape(overview_url, source)
    if not isinstance(links, list):
        return failed
    if not links:
        return failed
    # for link in links: print(link)
    return passed
def seekingalpha_test():
    """Smoke-test the seekingalpha scraper: article page and symbol page."""
    passed, failed = Test('seekingalpha_test', 1), Test('seekingalpha_test', 0)
    source = 'seekingalpha'

    # Article page: parsed text, pubdate proximity, and url-term overlap.
    article_url = 'https://seekingalpha.com/article/4093388-general-electric-breaking-drink-kool-aid'
    article = scrape(article_url, source)
    if not isinstance(article.article, str):
        return failed
    expected_date = datetime(2017, 8, 2, 8, 1, 0, tzinfo=timezone('US/Eastern'))
    if not similar_dates(article.pubdate, expected_date):
        return failed
    if not jaccard_coeff(article.article, 'seekingalpha+article'):
        return failed

    # Symbol page should yield a non-empty list of crawled links.
    symbol_url = 'https://seekingalpha.com/symbol/GE'
    links = scrape(symbol_url, source)
    if not isinstance(links, list):
        return failed
    if not links:
        return failed
    # for link in links: print(link)
    return passed
def thestreet_test():
    """Smoke-test the thestreet scraper: story page and quote page."""
    passed, failed = Test('thestreet_test', 1), Test('thestreet_test', 0)
    source = 'thestreet'

    # Story page: parsed text, pubdate proximity, and url-term overlap.
    story_url = 'https://www.thestreet.com/story/14219936/1/gopro-shares-are-cheap-for-a-reason.html'
    story = scrape(story_url, source)
    if not isinstance(story.article, str):
        return failed
    expected_date = datetime(2017, 8, 3, 16, 15, 0, tzinfo=timezone('US/Eastern'))
    if not similar_dates(story.pubdate, expected_date):
        return failed
    if not jaccard_coeff(story.article, 'thestreet+story'):
        return failed

    # Quote page should yield a non-empty list of crawled links.
    quote_url = 'https://www.thestreet.com/quote/GPRO.html'
    links = scrape(quote_url, source)
    if not isinstance(links, list):
        return failed
    if not links:
        return failed
    # for link in links: print(link)
    return passed
def benzinga_test():
    """Smoke-test the benzinga scraper: article page and stock page.

    Consistency fix: the link-printing debug loop at the end was left
    enabled here while every sibling test keeps it commented out; it is now
    disabled to match and to keep test output quiet.
    """
    passed, failed = Test('benzinga_test', 1), Test('benzinga_test', 0)
    source = 'benzinga'
    # normal url result test
    url = 'https://www.benzinga.com/general/education/17/08/9903307/an-easy-to-use-cheat-sheet-for-apple-suppliers'
    result = scrape(url, source)
    if not isinstance(result.article, str):
        return failed
    if not similar_dates(
            result.pubdate,
            datetime(2017, 8, 10, 8, 42, tzinfo=timezone('US/Eastern'))):
        return failed
    if not jaccard_coeff(result.article, ''):
        return failed
    # homepage result test
    homeurl = 'https://www.benzinga.com/stock/qcom'
    home_result = scrape(homeurl, source)
    if not isinstance(home_result, list):
        return failed
    if len(home_result) < 1:
        return failed
    # for link in home_result: print(link)
    return passed
def barrons_test():
    """Smoke-test the barrons scraper: article page and quote page.

    Consistency fix: ``print(result.article)`` (which dumps the whole
    article body) and the link-printing loop were left enabled here while
    every sibling test keeps such debug output commented out; both are now
    disabled to match.
    """
    passed, failed = Test('barrons_test', 1), Test('barrons_test', 0)
    source = 'barrons'
    # normal url result test
    url = 'http://www.barrons.com/articles/facebook-can-climb-more-than-20-1471670309'
    result = scrape(url, source)
    if not isinstance(result.article, str):
        return failed
    if not similar_dates(
            result.pubdate,
            datetime(2016, 8, 20, 0, 1, tzinfo=timezone('US/Eastern'))):
        return failed
    # print(result.article)
    if not jaccard_coeff(result.article, 'barrons+articles'):
        return failed
    # homepage result test
    homeurl = 'http://www.barrons.com/quote/stock/us/xnas/fb'
    home_result = scrape(homeurl, source)
    if not isinstance(home_result, list):
        return failed
    if len(home_result) < 1:
        return failed
    # for link in home_result: print(link)
    return passed
def investorplace_test():
    """Smoke-test the investorplace scraper against a single known article."""
    passed, failed = Test('investorplace_test', 1), Test('investorplace_test', 0)
    source = 'investorplace'

    # Article page: parsed text, pubdate proximity, and url-term overlap.
    article_url = 'http://investorplace.com/2017/07/ok-isnt-good-enough-under-armour-inc-uaa-stock/#.WZPQjMbMwy4'
    result = scrape(article_url, source)
    if not isinstance(result.article, str):
        return failed
    expected_date = datetime(2017, 7, 5, 10, 0, tzinfo=timezone('US/Eastern'))
    if not similar_dates(result.pubdate, expected_date):
        return failed
    print(result.article)
    if not jaccard_coeff(result.article, 'investorplace'):
        return failed
    return passed
def zacks_test():
    """Placeholder test for the zacks scraper -- article checks are still disabled."""
    passed, failed = Test('zacks_test', 1), Test('zacks_test', 0)
    source = 'zacks'
    # Article check disabled until an expected pubdate is filled in:
    # url = 'https://www.zacks.com/stock/news/264792/should-you-buy-facebook-fb-stock'
    # result = scrape(url, source)
    # if not isinstance(result.article, str): return failed
    # if not similar_dates(result.pubdate, datetime(2017, , , , , tzinfo=timezone('US/Eastern'))): return failed
    # if not jaccard_coeff(result.article, ''): return failed
    homeurl = ''  # NOTE(review): empty homepage url -- confirm this test is meant to run
    links = scrape(homeurl, source)
    if not isinstance(links, list):
        return failed
    if not links:
        return failed
    # for link in links: print(link)
    return passed
def motelyfool_test():
    """Placeholder test for the motelyfool scraper -- no sample urls chosen yet."""
    passed, failed = Test('motelyfool_test', 1), Test('motelyfool_test', 0)
    source = 'motelyfool'
    # Article check disabled until a sample url and expected pubdate are chosen:
    # url = ''
    # result = scrape(url, source)
    # if not isinstance(result.article, str): return failed
    # if not similar_dates(result.pubdate, datetime(2017, , , , , tzinfo=timezone('US/Eastern'))): return failed
    # if not jaccard_coeff(result.article, ''): return failed
    homeurl = ''  # NOTE(review): empty homepage url -- confirm this test is meant to run
    links = scrape(homeurl, source)
    if not isinstance(links, list):
        return failed
    if not links:
        return failed
    # for link in links: print(link)
    return passed
def build_nodes(self, query, urls, flags):
    """Scrape each url into a WebNode to be written to the csv output.

    Landing pages (``scrape`` returning a list) have their links appended to
    ``urls`` so they are crawled in the same pass; dead urls (``scrape``
    returning None) are dropped from ``urls``.

    :param query: object with ``string``, ``source`` and ``ticker`` attributes
    :param urls: list of candidate urls; mutated in place
    :param flags: keyword flags forwarded to ``scrape``
    :returns: (nodes, urls, extra) -- built nodes, the (mutated) url list,
        and the count of extra links discovered while crawling.

    Fixes vs. the previous version: the empty-input path returned a 2-tuple
    while every other path returns 3 values (breaking 3-way unpacking at the
    call site); the inner landing-page loop shadowed the outer ``url``; and
    ``urls.remove(url)`` during ``enumerate`` iteration silently skipped the
    element following each removal.
    """
    if len(urls) == 0:
        return None, [], 0
    nodes = []
    extra = 0
    dots = '.'
    # Manual index instead of enumerate: ``urls`` both grows (landing-page
    # links) and shrinks (dead links) while we walk it.
    i = 0
    while i < len(urls):
        url = urls[i]
        if verbose:
            sysprint('parsing urls for query: {}'.format(query.string) + dots * (i % 3))
        node = scrape(url, query.source, ticker=query.ticker, **flags)
        if isinstance(node, list):
            # Hit a landing page: queue any links we have not seen yet.
            for link in node:
                if link not in urls:
                    urls.append(link)
                    extra += 1
            logger.debug('Hit landing page -- crawling for more links')
            i += 1
        elif node is not None:
            nodes.append(node)
            i += 1
        else:
            # Dead url: drop it and do NOT advance, so the next url is processed.
            urls.remove(url)
    if verbose:
        sysprint('built {} nodes to write to disk'.format(len(nodes)))
    logger.debug('built {} nodes to write to disk'.format(len(nodes)))
    return nodes, urls, extra
# NOTE(review): duplicate definition -- being later in the file, this version
# of msn_test overrides the earlier one at import time; the two should be merged.
def msn_test():
    """Smoke-test the msn scraper: article page and stock-details page."""
    passed, failed = Test('msn_test', 1), Test('msn_test', 0)
    source = 'msn'
    # normal url result test
    url = 'https://www.msn.com/en-us/news/technology/paying-professors-inside-googles-academic-influence-campaign/ar-BBEfaQ3'
    result = scrape(url, source)
    if not isinstance(result.article, str):
        return failed
    # Fix: the minute was written as the literal 01, which is a SyntaxError
    # in Python 3 (octal 1 in Python 2, so 1 is value-preserving).
    if not similar_dates(
            result.pubdate,
            datetime(2017, 7, 15, 10, 1, tzinfo=timezone('US/Eastern'))):
        return failed
    if not jaccard_coeff(result.article, 'msn+en-us+news'):
        return failed
    homeurl = 'https://www.msn.com/en-us/money/stockdetails/fi-126.1.GOOG.NAS'
    home_result = scrape(homeurl, source)
    if not isinstance(home_result, list):
        return failed
    if len(home_result) < 1:
        return failed
    # for link in home_result: print(link)
    return passed


# STATUS: need subscription
# NOTE(review): this duplicate wsj_test appears truncated (it ends after the
# first isinstance check) and overrides the complete wsj_test defined earlier
# in the file -- confirm which version is intended to survive.
def wsj_test():
    passed, failed = Test('wsj_test', 1), Test('wsj_test', 0)
    source = 'wsj'
    # normal url result test
    url = 'https://www.wsj.com/articles/BT-CO-20170811-709167'
    result = scrape(url, source)
    if not isinstance(result.article, str):
        return failed
def main():
    """Interactive tagging loop.

    Walks each record in the module-level ``db``, sentence-tokenizes its text
    (plus scraped article text when a url is present), and prompts the
    operator to tag every sentence per stock symbol, storing results in the
    module-level ``tagged`` dict.

    Relies on module globals: ``db``, ``names``, ``tagged``, ``save()``,
    ``scrape()`` and the tokenizers ``sent_tokenize`` / ``word_tokenize``.
    """
    # iterate over json objects in the polling subsection
    for key in db.keys():
        text = db[key]['text']
        # 'n', 'p', 'x' fall through to the else branch below and are stored as tags.
        valid_responses = ['exit', 'save', 'skip', 'split', 'g', 'n', 'p', 'x']
        if not ('analyzed' in db[key].keys()):
            print('beginning to analyze new object (', key, ')')
            if db[key]['url']:
                # Append the scraped article body to the record's own text.
                text += (scrape(db[key]['url'], '', '').article)
            sentences = sent_tokenize(text)  # NOTE(review): unused -- ``stack`` below re-tokenizes
            db[key]['analyzed'] = True
            skip = False
            split = False
            stack = sent_tokenize(text)
            # add tagged sentence to the vocabulary
            while len(stack):
                sentence = stack[0]
                print(sentence)
                tmp = []  # [stock, tag] pairs collected for the current sentence
                for stock in db[key]['symbols'].split('-'):
                    # Re-prompt until a recognised response is entered.
                    while True:
                        tag = input(stock + ' (' + names[stock] + ')' + ' > ')
                        if not tag in valid_responses:
                            print('invalid entry, try again.')
                            continue
                        if tag == 'exit':
                            exit()
                        elif tag == 'save':
                            save()
                        elif tag == 'g':
                            # Pop the current sentence without tagging it.
                            stack = stack[1:]
                            # NOTE(review): clearing when exactly one sentence
                            # remains drops the final sentence -- confirm intended.
                            if len(stack) == 1:
                                stack = []
                            break
                        elif tag == 'skip':
                            # Abandon the rest of this record.
                            skip = True
                            break
                        elif tag == 'split':
                            # Show word indices, then split the sentence in two at
                            # the chosen index and push both halves onto the stack.
                            split_str = ''
                            words = word_tokenize(sentence)
                            for i, val in enumerate(words):
                                split_str += str(i) + ' : ' + val + '\n'
                            idx = int(
                                input('select index for splitting\n' + split_str +
                                      '\n\ninput index --> '))
                            # index bounds checking
                            if idx < 1 or idx > len(words) - 1:
                                print(
                                    'invalid index --> restarting sentence parsing'
                                )
                                continue
                            s = [' '.join(words[:idx]), ' '.join(words[idx:])
                                 ] + stack[1:]
                            stack = s
                            split = True
                            break
                        else:
                            # Any other valid response is recorded as this stock's tag.
                            tmp.append([stock, tag])
                            stack = stack[1:]
                            if len(stack) == 1:
                                stack = []
                            break
                    if skip:
                        break
                    if split:
                        # Restart sentence processing on the freshly split halves.
                        split = False
                        break
                # Persist the collected tags for this sentence (urls stripped out).
                obj = {}
                for s in tmp:
                    obj[s[0]] = s[1]
                if obj:
                    sentence = re.sub(r'http\S+', '', str(sentence))
                    tagged[sentence] = obj
                tmp = []
                if skip:
                    skip = False
                    break