Example #1
def test_edgar_download_html():

    ingestor = Ingestor()
    edgar = Edgar("html", "2013-01-01")
    ingestor.file_downloader(edgar.ingest_stock("AAPL"), docs_directory)

    assert os.path.exists(docs_directory + "/a2124888z10-k.htm") is True
Example #2
def test_edgar_download_xbrl():

    ingestor = Ingestor()
    edgar = Edgar("xbrl", "2014-01-01")
    ingestor.file_downloader(edgar.ingest_stock("AAPL"), docs_directory)

    assert os.path.exists(docs_directory + "/aapl-20130928.xml") is True
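Both EDGAR tests above lean on module-level setup that the snippets omit: the imports and a shared docs_directory target for the downloaded filings. A minimal sketch of that assumed preamble, modelled on the downloader scripts further below (the directory name is illustrative):

import os
from ingestor import Ingestor, Edgar

docs_directory = "test"

# create the download target once, before any test runs
if not os.path.exists(docs_directory):
    os.mkdir(docs_directory)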
Example #5
def test_sedar_ingest_xbrl():
    ingestor = Ingestor()
    sedar = Sedar("xbrl", "2010-01-01", "2014-01-01")

    headers = {'User-Agent': 'DIY-FilingsResearch 0.1'}

    initial_params = {
        'lang': 'EN',
        'page_no': 1,
        'company_search': 'blackberry',
        'document_selection': 26,
        'industry_group': 'A',
        'FromMonth': 1,
        'FromDate': 1,
        'FromYear': 2010,
        'ToMonth': 1,
        'ToDate': 1,
        'ToYear': 2014,
        'Variable': 'Issuer',
        'Search': 'Search'
    }

    session = requests.session()
    initial_request = session.post(Sedar().org_root + "/FindCompanyDocuments.do",
                                   params=initial_params)
    processed = initial_request.text.encode('utf-8')
    cookies = requests.utils.dict_from_cookiejar(initial_request.cookies)
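    # cookies prefixed with "BIGipServer" are F5 BIG-IP load-balancer
    # session-persistence cookies, so the assertion below presumably just
    # confirms that the SEDAR search endpoint answered and set up a session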

    assert cookies['BIGipServerP_R1_sedar_80'] is not None
Example #6
#! /usr/bin/env python
# encoding: utf-8

import os
from ingestor import Ingestor, IngestorException, Sedar

ingestor = Ingestor()

# xbrl or html?
sedar = Sedar("xbrl")

docs_directory = "test"

# if the directory we will download files does not exist, create it
if not os.path.exists(docs_directory):
    os.mkdir(docs_directory)

# for every ticker in our input file, download all the relevant documents
with open('sedar_tickers.txt', 'r') as reader:
    for line in reader:
        ingestor.file_downloader(sedar.ingest_stock(line.rstrip()),
                                 docs_directory)
Example #7

from threading import Thread
from time import sleep

# Memory, Queue and Ingestor come from the project's own modules;
# their imports are omitted in the original snippet.

EXPIRATION_FREQUENCY = 0.01  # Results will be accurate to within 0.01 seconds
RESULTS_FREQUENCY = 10  # Results will be printed every 10 seconds
CURRENT_TIME = 1489926234422  # A fixed epoch timestamp, simulated so that it is a reasonable value for the data within the file
EXPIRATION_TIMEOUT = 100000  # Epoch length of the moving window: we are interested in the top trends of the last 100000 seconds


def results(memory_store, queue_buffer):
    while True:
        print(memory_store.all_country_trends())
        sleep(RESULTS_FREQUENCY)


def housekeeping_daemon(now_time, memory_store, queue_buffer):
    while True:
        queue_buffer.housekeeping(now_time, memory_store)
        sleep(EXPIRATION_FREQUENCY)


if __name__ == "__main__":
    memory_store = Memory()
    queue_buffer = Queue(EXPIRATION_TIMEOUT)
    Ingestor(memory_store, queue_buffer, 'file')
    now_time = CURRENT_TIME
    housekeeper = Thread(target=housekeeping_daemon,
                         args=(now_time, memory_store, queue_buffer))
    trend_printer = Thread(target=results, args=(memory_store, queue_buffer))
    housekeeper.start()
    trend_printer.start()
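The main block above wires together Memory, Queue and Ingestor classes whose definitions are not shown; only the calls visible in the snippet are known. A minimal stand-in sketch of the assumed interfaces, purely to make them explicit (names and behaviour are guesses, not the project's implementation):

class Memory:
    """Assumed in-memory store of per-country trend counts."""

    def __init__(self):
        self.trends = {}

    def all_country_trends(self):
        # return whatever aggregate the store currently holds
        return self.trends


class Queue:
    """Assumed buffer whose entries expire after a moving time window."""

    def __init__(self, expiration_timeout):
        self.expiration_timeout = expiration_timeout
        self.entries = []

    def housekeeping(self, now_time, memory_store):
        # drop entries that have fallen out of the window ending at now_time
        self.entries = [e for e in self.entries
                        if e["time"] >= now_time - self.expiration_timeout]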
Example #8
    exit(0)

# always declare the signal handler first
signal.signal(signal.SIGINT, quit_gracefully)

env = lucene.initVM()
queryer = Queryer("index", "hits")
print 'Using Directory: ', queryer.store_dir

# directory for storing downloaded docs
directoryToWalk = "docs"

# and start the indexer
# note the indexer thread is set to daemon causing it to terminate on a SIGINT
indexer = Indexer(queryer.store_dir, queryer.writer, directoryToWalk)
ingestor = Ingestor()
edgar = Edgar()

with open('edgar_tickers.txt', 'r') as reader:
    for line in reader:
        ingestor.file_downloader(edgar.ingest_stock(line.rstrip()),
                                 directoryToWalk)
        indexer.indexDocs()

# start up the terminal query interface
queryer.run(queryer.writer, queryer.analyzer)

# if return from Querying then call the signal handler
# to clean up the writer cleanly
quit_gracefully()
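This example is truncated: it opens inside the elided quit_gracefully handler, of which only the final exit(0) survives, and the imports (signal, lucene, and the Queryer, Indexer, Ingestor and Edgar classes) are cut off as well. Judging from the closing comments, the handler's job is to shut the Lucene writer down cleanly before exiting; a plausible sketch, with the writer-closing call as an assumption rather than something taken from the source:

def quit_gracefully(signum=None, frame=None):
    # assumed: flush and close the Lucene IndexWriter before exiting
    if queryer.writer is not None:
        queryer.writer.close()
    exit(0)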
Example #9
def gethtml():
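    # expects a JSON body with "url" and "username" fields; the request
    # object is assumed to come from the surrounding Flask app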

    try:

        jsonForm = json.loads(str(request.get_data()))

        url = jsonForm['url']

        username = jsonForm['username']

        if username == '':
            username = '******'

        r = requests.get(url)

        if r.status_code == requests.codes.ok:
            soup = BeautifulSoup(r.content)

            # get the title from the web page
            webpagetitle = ''
            if soup.title is not None:
                webpagetitle = soup.title.text

            for s in soup.findAll('script'):
                s.extract()

            bodyText = soup.get_text()
            text = os.linesep.join([s for s in bodyText.splitlines() if s])

            i = Ingestor(jsonForm)

            if not (i.checkIfUrlExists(url)):
                jsonDocument = i.extractFeatures(text)
                #print jsonDocument
                jsonDocument['images'] = i.extractImages(soup)

                jsonDocument['title'] = webpagetitle
                jsonDocument['bodytext'] = text
                jsonDocument['url'] = url
                jsonDocument['username'] = username

                jsonDocument['screenshot'] = i.getwebpagescreenshot(url)
                #jsonDocument['screenshot']=''

                #print json.dumps(jsonDocument)

                jsonld = i.generateJSON(jsonDocument)

                esresponse = ''
                objId = ''
                if jsonld:
                    esresponse = i.publishtoes(jsonld)

                    responseJson = json.loads(json.dumps(esresponse))

                    objId = responseJson['_id']

                #return json.dumps(esresponse)
                return "Page ingested, ElasticSearch Id:" + objId

            else:
                logi("Url: " + url + " already exists")
                return "Url: " + url + " already exists"
        else:
            return r

    except Exception as e:
        print >> sys.stderr, e
        loge(str(e))
Example #10
from ingestor import Ingestor

if __name__ == '__main__':

    # WIPE AND RELOAD OFF OF kanye

    ing = Ingestor()

    ing.clear_database()

    ing.ingest()
Example #11
statisticsOnly = "-s" in optional_args or "-statistics" in optional_args

if len(required_args) == 0:
    print("Missing argument 'data file':" + usage)
    exit(1)
# technically, this is only required if we are actually loading the DB
if not statisticsOnly and len(required_args) == 1:
    print("Missing argument 'sql connection':" + usage)
    exit(1)

dataFile = required_args[0]

# Begin our ingestion

ingestor = Ingestor(dataFile)

if not statisticsOnly:
    init(required_args[1])

count = 0
inserts = []
for review in ingestor.ingest_reviews():
    if review is None:
        break
    count += 1
    if count % 10000 == 0:
        print("Processing Record " + str(count) + "...")
    if not statisticsOnly:
        # Add a new column for continuation tokens
        continuation_token = review['date'] + review['review_id']
Example #12
def test_sedar_exception():
    ingestor = Ingestor()
    with pytest.raises(IngestorException):
        sedar = Sedar("xbrl", "1995-01-01")
Example #13
def test_sedar_create_xbrl():
    ingestor = Ingestor()
    assert Sedar("xbrl", "2010-01-01", "2014-01-01")
Example #14
def test_sedar_create_html():
    ingestor = Ingestor()
    assert Sedar("html", "2010-01-01", "2014-01-01")
Example #16
#! /usr/bin/env python
# encoding: utf-8

import os
from ingestor import Ingestor, IngestorException, Edgar

ingestor = Ingestor()

# xbrl or html?
edgar = Edgar("xbrl")

docs_directory = "test"

# if the directory we will download files does not exist, create it
if not os.path.exists(docs_directory):
    os.mkdir(docs_directory)

# for every ticker in our input file, download all the relevant documents
with open('edgar_tickers.txt', 'r') as reader:
    for line in reader:
        ingestor.file_downloader(edgar.ingest_stock(line.rstrip()),
                                 docs_directory)
Example #17
    exit(0)

# always declare the signal handler first
signal.signal(signal.SIGINT, quit_gracefully)

env = lucene.initVM()
queryer = Queryer("index", "hits")
print('Using Directory: ', queryer.store_dir)

# directory for storing downloaded docs
directoryToWalk = "docs"

# and start the indexer
# note the indexer thread is set to daemon causing it to terminate on a SIGINT
indexer = Indexer(queryer.store_dir, queryer.writer, directoryToWalk)
ingestor = Ingestor()
edgar = Edgar()

with open('data.txt', 'r') as reader:
    for line in reader:
        ingestor.file_downloader(edgar.ingest_stock(line.rstrip()),
                                 directoryToWalk)
        indexer.indexDocs()

# start up the terminal query interface
queryer.run(queryer.writer, queryer.analyzer)

# if return from Querying then call the signal handler
# to clean up the writer cleanly
quit_gracefully()
Example #18

from ingestor import Ingestor

if __name__ == '__main__':

    # WIPE AND RELOAD OFF OF kanye

    ing = Ingestor()

    kanye = '5K4W6rqBFWDnAN6FQUkS6x'
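    # assumed: a Spotify artist ID (the 22-character base-62 form used by
    # the Spotify Web API)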

    ing.clear_database()

    ing.ingest_by_id(kanye)

    ing.ingest_related_artist(kanye, _depth=2)