def search_diffbot_cache(term):
    """Search the Diffbot Global Index for pages mentioning *term*.

    Args:
        term (str): Search term; it is quoted so Diffbot matches the
            exact phrase.

    Returns:
        list: One dict per result page that carried text, with title,
        url, search_provider, author, ISO-formatted date, the page text
        (``doc``), and the sentences/variants extracted by PageRequest.
    """
    response = requests.get('http://api.diffbot.com/v3/search', params={
        'token': config.credentials.diffbot,
        'query': requests.utils.quote('"{}"'.format(term)),
        'col': 'GLOBAL-INDEX'
    }).json()
    if not response.get('objects'):
        if response.get('error'):
            print("Response Error '{}' (code: {})".format(
                response['error'], response['errorCode']))
        else:
            print("NO RESULTS")
    results = []
    # Renamed loop variable from 'object' -- the original shadowed the builtin.
    for obj in response.get('objects', []):
        if not obj.get('text'):
            continue
        # run=False: reuse PageRequest's sentence extraction without fetching.
        pr = PageRequest(obj.get('pageUrl'), term, run=False)
        pr.extract_sentences(obj.get('text'))
        results.append({
            "title": obj.get('title'),
            "url": obj.get('pageUrl'),
            'search_provider': 'diffbot',
            "author": obj.get('author'),
            "date": parse_date(obj.get('date', '')).isoformat(),
            "doc": obj.get('text'),
            "sentences": pr.sentences,
            "variants": list(pr.variants)
        })
    return results
# NOTE(review): this is a byte-for-byte duplicate of another
# search_diffbot_cache definition in this file; the later definition wins
# at import time. Consider removing one copy.
def search_diffbot_cache(term):
    """Search the Diffbot Global Index for pages mentioning *term*.

    Args:
        term (str): Search term; quoted so Diffbot matches the exact phrase.

    Returns:
        list: One dict per result page that carried text (title, url,
        search_provider, author, ISO date, doc, sentences, variants).
    """
    response = requests.get('http://api.diffbot.com/v3/search', params={
        'token': config.credentials.diffbot,
        'query': requests.utils.quote('"{}"'.format(term)),
        'col': 'GLOBAL-INDEX'
    }).json()
    if not response.get('objects'):
        if response.get('error'):
            print("Response Error '{}' (code: {})".format(
                response['error'], response['errorCode']))
        else:
            print("NO RESULTS")
    results = []
    # 'entry' instead of 'object' -- the original name shadowed the builtin.
    for entry in response.get('objects', []):
        if not entry.get('text'):
            continue
        # run=False: only use PageRequest's sentence extraction, no fetch.
        pr = PageRequest(entry.get('pageUrl'), term, run=False)
        pr.extract_sentences(entry.get('text'))
        results.append({
            "title": entry.get('title'),
            "url": entry.get('pageUrl'),
            'search_provider': 'diffbot',
            "author": entry.get('author'),
            "date": parse_date(entry.get('date', '')).isoformat(),
            "doc": entry.get('text'),
            "sentences": pr.sentences,
            "variants": list(pr.variants)
        })
    return results
def test_extract_html_features():
    """Emphasis markup around the term should set the 'highlighted' feature."""
    from serapis.extract import PageRequest
    request = PageRequest("http://thescene.whro.org/hear-cool-stuff", 'defenestration', run=False)
    # Fixture deliberately ends with an unclosed <div> to exercise robustness.
    markup = "<div><p><em><strong>de-fen-es-tra-tion</strong></em> (dee-fen-uh-STRAY-shun) | n. the act of throwing someone or something out of a window</p></div><div>"
    request.get_html_features(markup)
    assert request.features['highlighted']
def extract_wrapper(url_object, term):
    """Fetch structured page data for url_object['url'] and merge it in.

    Best-effort: any failure is logged with a traceback and the original
    url_object is returned unchanged.
    """
    url = url_object['url']
    try:
        structured = PageRequest(url, term).structured
    except Exception:
        import traceback
        log.error("Failed to get page {} -- {}".format(url, traceback.format_exc()))
        return url_object
    return merge_dict(url_object, structured)
def test_page_structure():
    """Structured output for a known article should match the fixture."""
    from serapis.extract import PageRequest
    url = ('http://nytimes.com/2015/10/04/technology/'
           'scouring-the-web-to-make-new-words-lookupable.html')
    structured = PageRequest(url, "lookupable").structured
    for key in ('title', 'url', 'author'):
        assert structured[key] == test_response[key]
    assert len(structured['doc']) > 0
def search_duckduckgo(term):
    """Query the DuckDuckGo Instant Answer API for *term*.

    Results are kept only when the abstract source is whitelisted in
    ``config.duckduckgo_sources``.

    Args:
        term (str): Search term.

    Returns:
        list: Up to two result dicts (Abstract and Definition), each with
        title, url, source, doc, and the sentences/variants extracted by
        PageRequest. Empty on request failure or non-whitelisted source.
    """
    result = []
    try:
        # Let requests build the query string so `term` is URL-encoded
        # (the original interpolated it raw, breaking on spaces/&/#).
        req = requests.get('http://api.duckduckgo.com/',
                           params={'q': term, 'format': 'json'}).json()
    except Exception:
        # Keep the original best-effort behavior, but no longer trap
        # SystemExit/KeyboardInterrupt the way the bare `except:` did.
        return result
    # .get() so a response without AbstractSource returns [] instead of
    # raising KeyError.
    if req.get('AbstractSource') not in config.duckduckgo_sources:
        return result
    if req.get('Abstract'):
        # run=False: reuse PageRequest's sentence extraction without fetching.
        pr = PageRequest(req['AbstractURL'], term, run=False)
        pr.extract_sentences(req['Abstract'])
        result.append({
            'title': req['Heading'],
            'url': req['AbstractURL'],
            'search_provider': 'duckduckgo',
            'author': None,
            'date': None,
            'source': req['AbstractSource'],
            'doc': req['Abstract'],
            "sentences": pr.sentences,
            "variants": list(pr.variants)
        })
    if req.get('Definition'):
        pr = PageRequest(req['DefinitionURL'], term, run=False)
        pr.extract_sentences(req['Definition'])
        result.append({
            'title': req['Heading'],
            'url': req['DefinitionURL'],
            'source': req['DefinitionSource'],
            'search_provider': 'duckduckgo',
            'author': None,
            'date': None,
            'doc': req['Definition'],
            "sentences": pr.sentences,
            "variants": list(pr.variants)
        })
    log.info("Searching DuckDuckGo for '{}' returned {} results".format(
        term, len(result)))
    return result
# NOTE(review): this is a duplicate of another search_duckduckgo definition
# in this file; the later definition wins at import time. Consider removing
# one copy.
def search_duckduckgo(term):
    """Query the DuckDuckGo Instant Answer API for *term*.

    Results are kept only when the abstract source is whitelisted in
    ``config.duckduckgo_sources``.

    Returns:
        list: Up to two result dicts (Abstract and Definition); empty on
        request failure or non-whitelisted source.
    """
    result = []
    try:
        # params= lets requests URL-encode `term` (the original raw
        # interpolation broke on spaces/&/#).
        req = requests.get('http://api.duckduckgo.com/',
                           params={'q': term, 'format': 'json'}).json()
    except Exception:
        # Best-effort as before, but no longer a bare `except:` that
        # would trap SystemExit/KeyboardInterrupt.
        return result
    # .get() avoids a KeyError on responses lacking AbstractSource.
    if req.get('AbstractSource') not in config.duckduckgo_sources:
        return result
    if req.get('Abstract'):
        pr = PageRequest(req['AbstractURL'], term, run=False)
        pr.extract_sentences(req['Abstract'])
        result.append({
            'title': req['Heading'],
            'url': req['AbstractURL'],
            'search_provider': 'duckduckgo',
            'author': None,
            'date': None,
            'source': req['AbstractSource'],
            'doc': req['Abstract'],
            "sentences": pr.sentences,
            "variants": list(pr.variants)
        })
    if req.get('Definition'):
        pr = PageRequest(req['DefinitionURL'], term, run=False)
        pr.extract_sentences(req['Definition'])
        result.append({
            'title': req['Heading'],
            'url': req['DefinitionURL'],
            'source': req['DefinitionSource'],
            'search_provider': 'duckduckgo',
            'author': None,
            'date': None,
            'doc': req['Definition'],
            "sentences": pr.sentences,
            "variants": list(pr.variants)
        })
    log.info("Searching DuckDuckGo for '{}' returned {} results".format(
        term, len(result)))
    return result
def test_page_request():
    """Constructing a PageRequest for a live article should yield a response."""
    from serapis.extract import PageRequest
    article_url = ('http://nytimes.com/2015/10/04/technology/'
                   'scouring-the-web-to-make-new-words-lookupable.html')
    page = PageRequest(article_url, "lookupable")
    assert page.response