def test_edgar_download_html():
    ingestor = Ingestor()
    edgar = Edgar("html", "2013-01-01")
    ingestor.file_downloader(edgar.ingest_stock("AAPL"), docs_directory)
    assert os.path.exists(docs_directory + "/a2124888z10-k.htm") is True
def test_edgar_download_xbrl():
    ingestor = Ingestor()
    edgar = Edgar("xbrl", "2014-01-01")
    ingestor.file_downloader(edgar.ingest_stock("AAPL"), docs_directory)
    assert os.path.exists(docs_directory + "/aapl-20130928.xml") is True
def test_sedar_ingest_xbrl():
    ingestor = Ingestor()
    sedar = Sedar("xbrl", "2010-01-01", "2014-01-01")
    headers = {'User-Agent': 'DIY-FilingsResearch 0.1'}
    initial_params = {
        'lang': 'EN',
        'page_no': 1,
        'company_search': 'blackberry',
        'document_selection': 26,
        'industry_group': 'A',
        'FromMonth': 1,
        'FromDate': 1,
        'FromYear': 2010,
        'ToMonth': 1,
        'ToDate': 1,
        'ToYear': 2014,
        'Variable': 'Issuer',
        'Search': 'Search'
    }
    session = requests.session()
    initial_request = session.post(sedar.org_root + "/FindCompanyDocuments.do",
                                   params=initial_params, headers=headers)
    processed = initial_request.text.encode('utf-8')
    cookies = requests.utils.dict_from_cookiejar(initial_request.cookies)
    assert cookies['BIGipServerP_R1_sedar_80'] is not None
#! /usr/bin/env python
# encoding: utf-8

import os

from ingestor import Ingestor, IngestorException, Sedar

ingestor = Ingestor()

# xbrl or html?
sedar = Sedar("xbrl")

docs_directory = "test"

# if the directory we will download files into does not exist, create it
if not os.path.exists(docs_directory):
    os.mkdir(docs_directory)

# for every ticker in our input file, download all the relevant documents
with open('sedar_tickers.txt', 'r') as reader:
    for line in reader:
        ingestor.file_downloader(sedar.ingest_stock(line.rstrip()), docs_directory)
from time import sleep
from threading import Thread

# Memory, Queue, and Ingestor are assumed to come from project-local modules

EXPIRATION_FREQUENCY = 0.01   # results will be accurate to within 0.01 seconds
RESULTS_FREQUENCY = 10        # results will be printed every 10 seconds
CURRENT_TIME = 1489926234422  # a simulated epoch timestamp, chosen to be reasonable for the data within the file
EXPIRATION_TIMEOUT = 100000   # epoch length of the moving window; we're interested in the top trends of the last 100000 seconds


def results(memory_store, queue_buffer):
    while True:
        print(memory_store.all_country_trends())
        sleep(RESULTS_FREQUENCY)


def housekeeping_daemon(now_time, memory_store, queue_buffer):
    while True:
        queue_buffer.housekeeping(now_time, memory_store)
        sleep(EXPIRATION_FREQUENCY)


if __name__ == "__main__":
    memory_store = Memory()
    queue_buffer = Queue(EXPIRATION_TIMEOUT)
    Ingestor(memory_store, queue_buffer, 'file')

    now_time = CURRENT_TIME

    housekeeper = Thread(target=housekeeping_daemon, args=(now_time, memory_store, queue_buffer))
    trend_printer = Thread(target=results, args=(memory_store, queue_buffer))

    housekeeper.start()
    trend_printer.start()
def quit_gracefully(*args):
    exit(0)

# always declare the signal handler first
signal.signal(signal.SIGINT, quit_gracefully)

env = lucene.initVM()

queryer = Queryer("index", "hits")
print('Using Directory: ', queryer.store_dir)

# directory for storing downloaded docs
directoryToWalk = "docs"

# and start the indexer
# note the indexer thread is set to daemon, causing it to terminate on a SIGINT
indexer = Indexer(queryer.store_dir, queryer.writer, directoryToWalk)

ingestor = Ingestor()
edgar = Edgar()

with open('edgar_tickers.txt', 'r') as reader:
    for line in reader:
        ingestor.file_downloader(edgar.ingest_stock(line.rstrip()), directoryToWalk)

indexer.indexDocs()

# start up the terminal query interface
queryer.run(queryer.writer, queryer.analyzer)

# if we return from querying, call the signal handler
# to clean up the writer cleanly
quit_gracefully()
def gethtml():
    try:
        jsonForm = json.loads(str(request.get_data()))
        url = jsonForm['url']
        username = jsonForm['username']
        if username == '':
            username = '******'
        r = requests.get(url)
        if r.status_code == requests.codes.ok:
            soup = BeautifulSoup(r.content, 'html.parser')
            # get the title from the web page
            webpagetitle = ''
            if soup.title is not None:
                webpagetitle = soup.title.text
            # strip out script tags before extracting the body text
            for s in soup.findAll('script'):
                s.extract()
            bodyText = soup.get_text()
            text = os.linesep.join([s for s in bodyText.splitlines() if s])
            i = Ingestor(jsonForm)
            if not i.checkIfUrlExists(url):
                jsonDocument = i.extractFeatures(text)
                jsonDocument['images'] = i.extractImages(soup)
                jsonDocument['title'] = webpagetitle
                jsonDocument['bodytext'] = text
                jsonDocument['url'] = url
                jsonDocument['username'] = username
                jsonDocument['screenshot'] = i.getwebpagescreenshot(url)
                jsonld = i.generateJSON(jsonDocument)
                esresponse = ''
                objId = ''
                if jsonld:
                    esresponse = i.publishtoes(jsonld)
                    responseJson = json.loads(json.dumps(esresponse))
                    objId = responseJson['_id']
                return "Page ingested, ElasticSearch Id:" + objId
            else:
                logi("Url: " + url + " already exists")
                return "Url: " + url + " already exists"
        else:
            return r
    except Exception as e:
        print(e, file=sys.stderr)
        loge(str(e))
from ingestor import Ingestor

if __name__ == '__main__':
    # WIPE AND RELOAD OFF OF kanye
    ing = Ingestor()
    ing.clear_database()
    ing.ingest()
statisticsOnly = "-s" in optional_args or "-statistics" in optional_args

if len(required_args) == 0:
    print("Missing argument 'data file':" + usage)
    exit(1)

# technically this is only required if we are actually loading the DB
if not statisticsOnly and len(required_args) == 1:
    print("Missing argument 'sql connection':" + usage)
    exit(1)

dataFile = required_args[0]

# Begin our ingestion
ingestor = Ingestor(dataFile)

if not statisticsOnly:
    init(required_args[1])

count = 0
inserts = []
for review in ingestor.ingest_reviews():
    if review is None:
        break
    count += 1
    if count % 10000 == 0:
        print("Processing Record " + str(count) + "...")
    if not statisticsOnly:
        # Add a new column for continuation tokens
        continuation_token = review['date'] + review['review_id']
def test_sedar_exception():
    ingestor = Ingestor()
    with pytest.raises(IngestorException):
        sedar = Sedar("xbrl", "1995-01-01")
def test_sedar_create_xbrl():
    ingestor = Ingestor()
    assert Sedar("xbrl", "2010-01-01", "2014-01-01")
def test_sedar_create_html():
    ingestor = Ingestor()
    assert Sedar("html", "2010-01-01", "2014-01-01")
#! /usr/bin/env python
# encoding: utf-8

import os

from ingestor import Ingestor, IngestorException, Edgar

ingestor = Ingestor()

# xbrl or html?
edgar = Edgar("xbrl")

docs_directory = "test"

# if the directory we will download files into does not exist, create it
if not os.path.exists(docs_directory):
    os.mkdir(docs_directory)

# for every ticker in our input file, download all the relevant documents
with open('edgar_tickers.txt', 'r') as reader:
    for line in reader:
        ingestor.file_downloader(edgar.ingest_stock(line.rstrip()), docs_directory)
def quit_gracefully(*args):
    exit(0)

# always declare the signal handler first
signal.signal(signal.SIGINT, quit_gracefully)

env = lucene.initVM()

queryer = Queryer("index", "hits")
print('Using Directory: ', queryer.store_dir)

# directory for storing downloaded docs
directoryToWalk = "docs"

# and start the indexer
# note the indexer thread is set to daemon, causing it to terminate on a SIGINT
indexer = Indexer(queryer.store_dir, queryer.writer, directoryToWalk)

ingestor = Ingestor()
edgar = Edgar()

with open('data.txt', 'r') as reader:
    for line in reader:
        ingestor.file_downloader(edgar.ingest_stock(line.rstrip()), directoryToWalk)

indexer.indexDocs()

# start up the terminal query interface
queryer.run(queryer.writer, queryer.analyzer)

# if we return from querying, call the signal handler
# to clean up the writer cleanly
quit_gracefully()
from ingestor import Ingestor

if __name__ == '__main__':
    # WIPE AND RELOAD OFF OF kanye
    ing = Ingestor()
    kanye = '5K4W6rqBFWDnAN6FQUkS6x'
    ing.clear_database()
    ing.ingest_by_id(kanye)
    ing.ingest_related_artist(kanye, _depth=2)