Python scrape 예제들, webparser.scrape Python 예제들

예제 #1

0

파일 보기

def bloomberg_test():
    """Check the 'bloomberg' scraper on an article, a press release and a quote page.

    Returns:
        ``Test('bloomberg_test', 1)`` when every check holds, otherwise
        ``Test('bloomberg_test', 0)`` at the first check that fails.
    """
    passed = Test('bloomberg_test', 1)
    failed = Test('bloomberg_test', 0)
    source = 'bloomberg'

    # NOTE(review): if `timezone` is pytz's, tzinfo= can yield an LMT
    # offset; pytz documents tz.localize(...) instead -- confirm.

    # News article: text must be a str, the pubdate must be similar to the
    # expected timestamp, and the jaccard_coeff check must be truthy.
    article_url = 'https://www.bloomberg.com/news/articles/2017-08-04/blue-apron-plans-to-cut-24-of-staff-barely-a-month-since-ipo'
    article = scrape(article_url, source)
    if not isinstance(article.article, str):
        return failed
    article_date = datetime(2017, 8, 4, 11, 58, tzinfo=timezone('US/Eastern'))
    if not similar_dates(article.pubdate, article_date):
        return failed
    if not jaccard_coeff(article.article, 'bloomberg+news'):
        return failed

    # Press release: same three checks against the press-release reference.
    release_url = 'https://www.bloomberg.com/press-releases/2017-08-01/marathon-kids-run-all-50-states'
    release = scrape(release_url, source)
    if not isinstance(release.article, str):
        return failed
    release_date = datetime(2017, 8, 1, 13, 0, tzinfo=timezone('US/Eastern'))
    if not similar_dates(release.pubdate, release_date):
        return failed
    if not jaccard_coeff(release.article, 'bloomberg+press-releases'):
        return failed

    # Quote page: scrape is expected to hand back a non-empty list.
    links = scrape('https://www.bloomberg.com/quote/APRN:US', source)
    if not (isinstance(links, list) and len(links) >= 1):
        return failed
    return passed

예제 #2

0

파일 보기

def wsj_test():
    """Check the 'wsj' scraper on one article and one quote page.

    Returns:
        ``Test('wsj_test', 1)`` when every check holds, otherwise
        ``Test('wsj_test', 0)`` at the first check that fails.
    """
    passed = Test('wsj_test', 1)
    failed = Test('wsj_test', 0)
    source = 'wsj'

    # NOTE(review): if `timezone` is pytz's, tzinfo= can yield an LMT
    # offset; pytz documents tz.localize(...) instead -- confirm.

    # Article: text must be a str, the pubdate must be similar to the
    # expected timestamp, and jaccard_coeff (called with an empty second
    # argument here) must be truthy.
    article = scrape('https://www.wsj.com/articles/BT-CO-20170811-709167', source)
    if not isinstance(article.article, str):
        return failed
    article_date = datetime(2017, 8, 11, 4, 49, tzinfo=timezone('US/Eastern'))
    if not similar_dates(article.pubdate, article_date):
        return failed
    if not jaccard_coeff(article.article, ''):
        return failed

    # TODO: an equivalent check for the blog post
    # https://blogs.wsj.com/moneybeat/2017/08/11/daniel-loebs-third-point-exits-snap-returns-to-alibaba/
    # was left disabled; its expected pubdate was never filled in.

    # Quote page: scrape is expected to hand back a non-empty list.
    links = scrape('http://quotes.wsj.com/SNAP', source)
    if not (isinstance(links, list) and len(links) >= 1):
        return failed
    return passed

예제 #3

0

파일 보기

def bloomberg_pr_error_test():
    """Scrape one Bloomberg press release with explicit flags and print it.

    Nothing is asserted: the scraped node, the keys of its dict form and
    the space-split article text are printed for manual inspection.

    Returns:
        ``Test('bloomberg_pr_error_test', 1)`` unconditionally.
    """
    name = 'bloomberg_pr_error_test'
    passed, failed = Test(name, 1), Test(name, 0)

    gopro_url = 'https://www.bloomberg.com/press-releases/2017-11-01/gopro-announces-third-quarter-2017-results'
    # Only referenced by the disabled second scrape (ticker 'AAPL').
    proassurance_url = 'https://www.bloomberg.com/press-releases/2017-11-06/proassurance-reports-results-for-third-quarter-2017'
    source = 'bloomberg'
    flags = {
        'date_checker': True,
        'depth': 1,
        'validate_url': False,
        'length_check': True,
        'min_length': 100,
    }

    # NOTE(review): flags is handed over as a single positional argument,
    # not unpacked with ** -- confirm this matches scrape's signature.
    node = scrape(gopro_url, source, 'GPRO', flags)

    print(node)
    node_fields = dict(node)
    print(node_fields.keys())

    print(node_fields['article'].split(' '))

    # (disabled) dumping node_fields to ../data/test.csv with
    # csv.DictWriter, and printing the second node.

    return passed

예제 #4

0

파일 보기

def msn_test():
    passed, failed = Test('msn_test', 1), Test('msn_test', 0)
    source = 'msn'

    # normal url result test
    url = 'https://www.msn.com/en-us/news/technology/paying-professors-inside-googles-academic-influence-campaign/ar-BBEfaQ3'
    result = scrape(url, source)
    if not isinstance(result.article, str): return failed
    if not similar_dates(
            result.pubdate,
            datetime(2017, 7, 15, 10, 01, tzinfo=timezone('US/Eastern'))):

예제 #5

0

파일 보기

def yahoo_test():
    """Check the 'yahoofinance' scraper on one article and one quote page.

    Returns:
        ``Test('yahoo_test', 1)`` when every check holds, otherwise
        ``Test('yahoo_test', 0)`` at the first check that fails.
    """
    passed = Test('yahoo_test', 1)
    failed = Test('yahoo_test', 0)
    source = 'yahoofinance'

    # NOTE(review): if `timezone` is pytz's, tzinfo= can yield an LMT
    # offset; pytz documents tz.localize(...) instead -- confirm.

    # Article: text must be a str, the pubdate must be similar to the
    # expected timestamp, and the jaccard_coeff check must be truthy.
    article = scrape(
        'https://finance.yahoo.com/news/blue-apron-aprn-investors-apos-174305777.html',
        source)
    if not isinstance(article.article, str):
        return failed
    article_date = datetime(2017, 7, 11, 13, 43, tzinfo=timezone('US/Eastern'))
    if not similar_dates(article.pubdate, article_date):
        return failed
    if not jaccard_coeff(article.article, 'yahoofinance+news'):
        return failed

    # Quote page: scrape is expected to hand back a non-empty list.
    links = scrape('https://finance.yahoo.com/quote/APRN?p=APRN', source)
    if not (isinstance(links, list) and len(links) >= 1):
        return failed
    return passed

예제 #6

0

파일 보기

def marketwatch_test():
    """Check the 'marketwatch' scraper on one story and one stock page.

    Returns:
        ``Test('marketwatch_test', 1)`` when every check holds, otherwise
        ``Test('marketwatch_test', 0)`` at the first check that fails.
    """
    passed = Test('marketwatch_test', 1)
    failed = Test('marketwatch_test', 0)
    source = 'marketwatch'

    # NOTE(review): if `timezone` is pytz's, tzinfo= can yield an LMT
    # offset; pytz documents tz.localize(...) instead -- confirm.

    # Story: text must be a str, the pubdate must be similar to the
    # expected timestamp, and the jaccard_coeff check must be truthy.
    story = scrape(
        'http://www.marketwatch.com/story/verizon-earnings-can-you-hear-me-now-verizons-search-for-success-outside-wireless-2017-07-25',
        source)
    if not isinstance(story.article, str):
        return failed
    story_date = datetime(2017, 7, 26, 6, 57, tzinfo=timezone('US/Eastern'))
    if not similar_dates(story.pubdate, story_date):
        return failed
    if not jaccard_coeff(story.article, 'marketwatch+story'):
        return failed

    # Stock page: scrape is expected to hand back a non-empty list.
    links = scrape('http://www.marketwatch.com/investing/stock/vz', source)
    if not (isinstance(links, list) and len(links) >= 1):
        return failed
    return passed

예제 #7

0

파일 보기

def investopedia_test():
    """Check the 'investopedia' scraper on one news item and one stock page.

    Returns:
        ``Test('investopedia_test', 1)`` when every check holds, otherwise
        ``Test('investopedia_test', 0)`` at the first check that fails.
    """
    passed = Test('investopedia_test', 1)
    failed = Test('investopedia_test', 0)
    source = 'investopedia'

    # NOTE(review): if `timezone` is pytz's, tzinfo= can yield an LMT
    # offset; pytz documents tz.localize(...) instead -- confirm.

    # News item: text must be a str, the pubdate must be similar to the
    # expected timestamp, and the jaccard_coeff check must be truthy.
    article = scrape(
        'http://www.investopedia.com/news/food-distributors-outperform-despite-amazon/',
        source)
    if not isinstance(article.article, str):
        return failed
    article_date = datetime(2017, 8, 9, 17, 9, 0, tzinfo=timezone('US/Eastern'))
    if not similar_dates(article.pubdate, article_date):
        return failed
    if not jaccard_coeff(article.article, 'investopedia+news'):
        return failed

    # Stock page: scrape is expected to hand back a non-empty list.
    links = scrape('http://www.investopedia.com/markets/stocks/amzn/', source)
    if not (isinstance(links, list) and len(links) >= 1):
        return failed
    return passed

예제 #8

0

파일 보기

def reuters_test():
    """Check the 'reuters' scraper on one article and one stock overview page.

    Returns:
        ``Test('reuters_test', 1)`` when every check holds, otherwise
        ``Test('reuters_test', 0)`` at the first check that fails.
    """
    passed = Test('reuters_test', 1)
    failed = Test('reuters_test', 0)
    source = 'reuters'

    # NOTE(review): if `timezone` is pytz's, tzinfo= can yield an LMT
    # offset; pytz documents tz.localize(...) instead -- confirm.

    # Article: text must be a str, the pubdate must be similar to the
    # expected timestamp, and the jaccard_coeff check must be truthy.
    article = scrape(
        'https://www.reuters.com/article/us-under-armour-strategy-analysis-idUSKBN1AK2H6',
        source)
    if not isinstance(article.article, str):
        return failed
    article_date = datetime(2017, 8, 4, 18, 4, 0, tzinfo=timezone('US/Eastern'))
    if not similar_dates(article.pubdate, article_date):
        return failed
    if not jaccard_coeff(article.article, 'reuters+article'):
        return failed

    # Overview page: scrape is expected to hand back a non-empty list.
    links = scrape(
        'https://www.reuters.com/finance/stocks/overview?symbol=NKE.N', source)
    if not (isinstance(links, list) and len(links) >= 1):
        return failed
    return passed

예제 #9

0

파일 보기

def seekingalpha_test():
    """Check the 'seekingalpha' scraper on one article and one symbol page.

    Returns:
        ``Test('seekingalpha_test', 1)`` when every check holds, otherwise
        ``Test('seekingalpha_test', 0)`` at the first check that fails.
    """
    passed = Test('seekingalpha_test', 1)
    failed = Test('seekingalpha_test', 0)
    source = 'seekingalpha'

    # NOTE(review): if `timezone` is pytz's, tzinfo= can yield an LMT
    # offset; pytz documents tz.localize(...) instead -- confirm.

    # Article: text must be a str, the pubdate must be similar to the
    # expected timestamp, and the jaccard_coeff check must be truthy.
    article = scrape(
        'https://seekingalpha.com/article/4093388-general-electric-breaking-drink-kool-aid',
        source)
    if not isinstance(article.article, str):
        return failed
    article_date = datetime(2017, 8, 2, 8, 1, 0, tzinfo=timezone('US/Eastern'))
    if not similar_dates(article.pubdate, article_date):
        return failed
    if not jaccard_coeff(article.article, 'seekingalpha+article'):
        return failed

    # Symbol page: scrape is expected to hand back a non-empty list.
    links = scrape('https://seekingalpha.com/symbol/GE', source)
    if not (isinstance(links, list) and len(links) >= 1):
        return failed
    return passed

예제 #10

0

파일 보기

def thestreet_test():
    """Check the 'thestreet' scraper on one story and one quote page.

    Returns:
        ``Test('thestreet_test', 1)`` when every check holds, otherwise
        ``Test('thestreet_test', 0)`` at the first check that fails.
    """
    passed = Test('thestreet_test', 1)
    failed = Test('thestreet_test', 0)
    source = 'thestreet'

    # NOTE(review): if `timezone` is pytz's, tzinfo= can yield an LMT
    # offset; pytz documents tz.localize(...) instead -- confirm.

    # Story: text must be a str, the pubdate must be similar to the
    # expected timestamp, and the jaccard_coeff check must be truthy.
    story = scrape(
        'https://www.thestreet.com/story/14219936/1/gopro-shares-are-cheap-for-a-reason.html',
        source)
    if not isinstance(story.article, str):
        return failed
    story_date = datetime(2017, 8, 3, 16, 15, 0, tzinfo=timezone('US/Eastern'))
    if not similar_dates(story.pubdate, story_date):
        return failed
    if not jaccard_coeff(story.article, 'thestreet+story'):
        return failed

    # Quote page: scrape is expected to hand back a non-empty list.
    links = scrape('https://www.thestreet.com/quote/GPRO.html', source)
    if not (isinstance(links, list) and len(links) >= 1):
        return failed
    return passed

예제 #11

0

파일 보기

def benzinga_test():
    """Check the 'benzinga' scraper on one article and one stock page.

    The links returned for the stock page are printed one per line.

    Returns:
        ``Test('benzinga_test', 1)`` when every check holds, otherwise
        ``Test('benzinga_test', 0)`` at the first check that fails.
    """
    passed = Test('benzinga_test', 1)
    failed = Test('benzinga_test', 0)
    source = 'benzinga'

    # NOTE(review): if `timezone` is pytz's, tzinfo= can yield an LMT
    # offset; pytz documents tz.localize(...) instead -- confirm.

    # Article: text must be a str, the pubdate must be similar to the
    # expected timestamp, and jaccard_coeff (called with an empty second
    # argument here) must be truthy.
    article = scrape(
        'https://www.benzinga.com/general/education/17/08/9903307/an-easy-to-use-cheat-sheet-for-apple-suppliers',
        source)
    if not isinstance(article.article, str):
        return failed
    article_date = datetime(2017, 8, 10, 8, 42, tzinfo=timezone('US/Eastern'))
    if not similar_dates(article.pubdate, article_date):
        return failed
    if not jaccard_coeff(article.article, ''):
        return failed

    # Stock page: scrape is expected to hand back a non-empty list.
    links = scrape('https://www.benzinga.com/stock/qcom', source)
    if not (isinstance(links, list) and len(links) >= 1):
        return failed
    for found in links:
        print(found)
    return passed

예제 #12

0

파일 보기

def barrons_test():
    """Check the 'barrons' scraper on one article and one quote page.

    The article text is printed once its type and date checks pass, and
    the links returned for the quote page are printed one per line.

    Returns:
        ``Test('barrons_test', 1)`` when every check holds, otherwise
        ``Test('barrons_test', 0)`` at the first check that fails.
    """
    passed = Test('barrons_test', 1)
    failed = Test('barrons_test', 0)
    source = 'barrons'

    # NOTE(review): if `timezone` is pytz's, tzinfo= can yield an LMT
    # offset; pytz documents tz.localize(...) instead -- confirm.

    # Article: text must be a str, the pubdate must be similar to the
    # expected timestamp, and the jaccard_coeff check must be truthy.
    article = scrape(
        'http://www.barrons.com/articles/facebook-can-climb-more-than-20-1471670309',
        source)
    if not isinstance(article.article, str):
        return failed
    article_date = datetime(2016, 8, 20, 0, 1, tzinfo=timezone('US/Eastern'))
    if not similar_dates(article.pubdate, article_date):
        return failed
    print(article.article)
    if not jaccard_coeff(article.article, 'barrons+articles'):
        return failed

    # Quote page: scrape is expected to hand back a non-empty list.
    links = scrape('http://www.barrons.com/quote/stock/us/xnas/fb', source)
    if not (isinstance(links, list) and len(links) >= 1):
        return failed
    for found in links:
        print(found)
    return passed

예제 #13

0

파일 보기

def investorplace_test():
    """Check the 'investorplace' scraper on a single article.

    The article text is printed once its type and date checks pass.

    Returns:
        ``Test('investorplace_test', 1)`` when every check holds, otherwise
        ``Test('investorplace_test', 0)`` at the first check that fails.
    """
    passed = Test('investorplace_test', 1)
    failed = Test('investorplace_test', 0)
    source = 'investorplace'

    # NOTE(review): if `timezone` is pytz's, tzinfo= can yield an LMT
    # offset; pytz documents tz.localize(...) instead -- confirm.

    # Article: text must be a str, the pubdate must be similar to the
    # expected timestamp, and the jaccard_coeff check must be truthy.
    article = scrape(
        'http://investorplace.com/2017/07/ok-isnt-good-enough-under-armour-inc-uaa-stock/#.WZPQjMbMwy4',
        source)
    if not isinstance(article.article, str):
        return failed
    article_date = datetime(2017, 7, 5, 10, 0, tzinfo=timezone('US/Eastern'))
    if not similar_dates(article.pubdate, article_date):
        return failed
    print(article.article)
    if not jaccard_coeff(article.article, 'investorplace'):
        return failed
    return passed

예제 #14

0

파일 보기

def zacks_test():
    """Check that the 'zacks' scraper returns a non-empty list for a landing page.

    Returns:
        ``Test('zacks_test', 1)`` when scrape returns a non-empty list,
        otherwise ``Test('zacks_test', 0)``.
    """
    passed = Test('zacks_test', 1)
    failed = Test('zacks_test', 0)
    source = 'zacks'

    # TODO: the article check for
    # https://www.zacks.com/stock/news/264792/should-you-buy-facebook-fb-stock
    # was left disabled; its expected pubdate and jaccard reference were
    # never filled in.

    # NOTE(review): the landing-page URL is an empty string, so this looks
    # like an unfinished placeholder -- confirm the intended quote page.
    landing_url = ''
    links = scrape(landing_url, source)
    if not (isinstance(links, list) and len(links) >= 1):
        return failed
    return passed

예제 #15

0

파일 보기

def motelyfool_test():
    """Check that the 'motelyfool' scraper returns a non-empty list for a landing page.

    Returns:
        ``Test('motelyfool_test', 1)`` when scrape returns a non-empty
        list, otherwise ``Test('motelyfool_test', 0)``.
    """
    passed = Test('motelyfool_test', 1)
    failed = Test('motelyfool_test', 0)
    source = 'motelyfool'

    # TODO: the article check was left disabled; its URL, expected pubdate
    # and jaccard reference were never filled in.

    # NOTE(review): the landing-page URL is an empty string, so this looks
    # like an unfinished placeholder -- confirm the intended quote page.
    landing_url = ''
    links = scrape(landing_url, source)
    if not (isinstance(links, list) and len(links) >= 1):
        return failed
    return passed

예제 #16

0

파일 보기

파일: stocker.py 프로젝트: douskaki/Stocker

 def build_nodes(self, query, urls, flags):
     """Use the urls to build WebNodes to be written to the csv output.

     Every url is passed to ``scrape``. A list result is treated as a
     landing page: links not already in ``urls`` are appended so they get
     scraped later in the same pass. A ``None`` result drops the url from
     ``urls``. Anything else is collected as a node.

     Args:
         query: object providing ``string``, ``source`` and ``ticker``.
         urls: list of urls to scrape; it is MUTATED in place (links are
             appended, failed urls are removed).
         flags: mapping unpacked into ``scrape`` as keyword arguments.

     Returns:
         ``(None, [])`` when ``urls`` is empty, otherwise
         ``(nodes, urls, extra)`` where ``extra`` counts the links that
         were appended from landing pages.
     """
     # NOTE(review): this early return is a 2-tuple while the normal
     # return is a 3-tuple; kept as-is for callers -- confirm intent.
     if not urls: return None, []
     nodes = []
     extra = 0
     idx = 0   # position in urls of the url being processed
     step = 0  # iterations so far; only drives the progress dots
     # An index-driven loop is used because urls changes size while it is
     # walked: removing an entry inside a plain for/enumerate loop would
     # silently skip the url that follows it.
     while idx < len(urls):
         url = urls[idx]
         if verbose: sysprint('parsing urls for query: {}'.format(query.string) + '.' * (step % 3))
         step += 1
         node = scrape(url, query.source, ticker=query.ticker, **flags)
         if isinstance(node, list):
             for link in node:
                 if link not in urls:
                     urls.append(link)
                     extra += 1
             logger.debug('Hit landing page -- crawling for more links')
             idx += 1
         elif node is not None:
             nodes.append(node)
             idx += 1
         else:
             # Drop the failed url; the next url slides into idx, so the
             # index must not advance here.
             del urls[idx]
     if verbose: sysprint ('built {} nodes to write to disk'.format(len(nodes)))
     logger.debug('built {} nodes to write to disk'.format(len(nodes)))
     return nodes, urls, extra

예제 #17

0

파일 보기

def msn_test():
    """Check the 'msn' scraper on one article and one stock-details page.

    Returns:
        ``Test('msn_test', 1)`` when every check holds, otherwise
        ``Test('msn_test', 0)`` at the first check that fails.
    """
    passed, failed = Test('msn_test', 1), Test('msn_test', 0)
    source = 'msn'

    # NOTE(review): if `timezone` is pytz's, tzinfo= can yield an LMT
    # offset; pytz documents tz.localize(...) instead -- confirm.

    # Article: text must be a str, the pubdate must be similar to the
    # expected timestamp, and the jaccard_coeff check must be truthy.
    url = 'https://www.msn.com/en-us/news/technology/paying-professors-inside-googles-academic-influence-campaign/ar-BBEfaQ3'
    result = scrape(url, source)
    if not isinstance(result.article, str):
        return failed
    # The minute is written as 1, not 01: a decimal literal with a leading
    # zero is a SyntaxError on Python 3 (and means octal 1 on Python 2).
    expected_pubdate = datetime(2017, 7, 15, 10, 1, tzinfo=timezone('US/Eastern'))
    if not similar_dates(result.pubdate, expected_pubdate):
        return failed
    if not jaccard_coeff(result.article, 'msn+en-us+news'):
        return failed

    # Stock-details page: scrape is expected to hand back a non-empty list.
    homeurl = 'https://www.msn.com/en-us/money/stockdetails/fi-126.1.GOOG.NAS'
    home_result = scrape(homeurl, source)
    if not isinstance(home_result, list):
        return failed
    if len(home_result) < 1:
        return failed
    return passed


# STATUS: need subscription
def wsj_test():
    """Begin the 'wsj' scraper check: scrape one article and type-check its text.

    NOTE(review): as written here the function ends after the first
    check, so it returns ``failed`` when the article text is not a str
    and otherwise falls off the end and returns None; ``passed`` is never
    returned. This looks like a shortened excerpt -- confirm against the
    complete test before relying on it.
    """
    passed, failed = Test('wsj_test', 1), Test('wsj_test', 0)
    source = 'wsj'

    # normal url result test
    url = 'https://www.wsj.com/articles/BT-CO-20170811-709167'
    result = scrape(url, source)
    # The scraped article text must be a str.
    if not isinstance(result.article, str): return failed

예제 #18

0

파일 보기

파일: classify.py 프로젝트: VVYing/KaggleDataSet

def main():
    # iterate over json objects in the polling subsection
    for key in db.keys():
        text = db[key]['text']
        valid_responses = ['exit', 'save', 'skip', 'split', 'g', 'n', 'p', 'x']

        if not ('analyzed' in db[key].keys()):
            print('beginning to analyze new object (', key, ')')
            if db[key]['url']:
                text += (scrape(db[key]['url'], '', '').article)

            sentences = sent_tokenize(text)
            db[key]['analyzed'] = True

            skip = False
            split = False
            stack = sent_tokenize(text)

            # add tagged sentence to the vocabulary
            while len(stack):
                sentence = stack[0]
                print(sentence)
                tmp = []
                for stock in db[key]['symbols'].split('-'):
                    while True:
                        tag = input(stock + ' (' + names[stock] + ')' + ' > ')
                        if not tag in valid_responses:
                            print('invalid entry, try again.')
                            continue
                        if tag == 'exit': exit()
                        elif tag == 'save': save()
                        elif tag == 'g':
                            stack = stack[1:]
                            if len(stack) == 1: stack = []
                            break
                        elif tag == 'skip':
                            skip = True
                            break
                        elif tag == 'split':
                            split_str = ''
                            words = word_tokenize(sentence)
                            for i, val in enumerate(words):
                                split_str += str(i) + ' : ' + val + '\n'
                            idx = int(
                                input('select index for splitting\n' +
                                      split_str + '\n\ninput index --> '))

                            # index bounds checking
                            if idx < 1 or idx > len(words) - 1:
                                print(
                                    'invalid index --> restarting sentence parsing'
                                )
                                continue

                            s = [' '.join(words[:idx]), ' '.join(words[idx:])
                                 ] + stack[1:]
                            stack = s
                            split = True
                            break

                        else:
                            tmp.append([stock, tag])
                            stack = stack[1:]
                            if len(stack) == 1: stack = []
                            break
                    if skip:
                        break

                    if split:
                        split = False
                        break

                obj = {}
                for s in tmp:
                    obj[s[0]] = s[1]
                if obj:
                    sentence = re.sub(r'http\S+', '', str(sentence))
                    tagged[sentence] = obj
                tmp = []

                if skip:
                    skip = False
                    break