def test_edgar_download_html():
    ingestor = Ingestor()
    edgar = Edgar("html", "2013-01-01")
    ingestor.file_downloader(edgar.ingest_stock("AAPL"), docs_directory)
    assert os.path.exists(docs_directory + "/a2124888z10-k.htm") is True
def test_edgar_download_xbrl():
    ingestor = Ingestor()
    edgar = Edgar("xbrl", "2014-01-01")
    ingestor.file_downloader(edgar.ingest_stock("AAPL"), docs_directory)
    assert os.path.exists(docs_directory + "/aapl-20130928.xml") is True
def test_sedar_ingest_xbrl():
    ingestor = Ingestor()
    sedar = Sedar("xbrl", "2010-01-01", "2014-01-01")
    headers = {'User-Agent': 'DIY-FilingsResearch 0.1'}
    initial_params = {
        'lang': 'EN',
        'page_no': 1,
        'company_search': 'blackberry',
        'document_selection': 26,
        'industry_group': 'A',
        'FromMonth': 1,
        'FromDate': 1,
        'FromYear': 2010,
        'ToMonth': 1,
        'ToDate': 1,
        'ToYear': 2014,
        'Variable': 'Issuer',
        'Search': 'Search'
    }
    session = requests.session()
    initial_request = session.post(sedar.org_root + "/FindCompanyDocuments.do",
                                   params=initial_params, headers=headers)
    processed = initial_request.text.encode('utf-8')
    cookies = requests.utils.dict_from_cookiejar(initial_request.cookies)
    assert cookies['BIGipServerP_R1_sedar_80'] is not None
#! /usr/bin/env python
# encoding: utf-8

import os

from ingestor import Ingestor, IngestorException, Sedar

ingestor = Ingestor()

# xbrl or html?
sedar = Sedar("xbrl")

docs_directory = "test"

# if the directory we will download files into does not exist, create it
if not os.path.exists(docs_directory):
    os.mkdir(docs_directory)

# for every ticker in our input file, download all the relevant documents
with open('sedar_tickers.txt', 'r') as reader:
    for line in reader:
        ingestor.file_downloader(sedar.ingest_stock(line.rstrip()), docs_directory)
from time import sleep
from threading import Thread

# Memory, Queue, and Ingestor are assumed to come from project-local modules

EXPIRATION_FREQUENCY = 0.01   # results will be accurate to within 0.01 seconds
RESULTS_FREQUENCY = 10        # results will be printed every 10 seconds
CURRENT_TIME = 1489926234422  # a simulated epoch timestamp, chosen to be reasonable for the data within the file
EXPIRATION_TIMEOUT = 100000   # epoch length of the moving window; we're interested in the top trends of the last 100000 seconds


def results(memory_store, queue_buffer):
    while True:
        print(memory_store.all_country_trends())
        sleep(RESULTS_FREQUENCY)


def housekeeping_daemon(now_time, memory_store, queue_buffer):
    while True:
        queue_buffer.housekeeping(now_time, memory_store)
        sleep(EXPIRATION_FREQUENCY)


if __name__ == "__main__":
    memory_store = Memory()
    queue_buffer = Queue(EXPIRATION_TIMEOUT)
    Ingestor(memory_store, queue_buffer, 'file')

    now_time = CURRENT_TIME

    housekeeper = Thread(target=housekeeping_daemon, args=(now_time, memory_store, queue_buffer))
    trend_printer = Thread(target=results, args=(memory_store, queue_buffer))

    housekeeper.start()
    trend_printer.start()
def quit_gracefully(*args):
    exit(0)

# always declare the signal handler first
signal.signal(signal.SIGINT, quit_gracefully)

env = lucene.initVM()

queryer = Queryer("index", "hits")
print('Using Directory: ', queryer.store_dir)

# directory for storing downloaded docs
directoryToWalk = "docs"

# and start the indexer
# note the indexer thread is set to daemon, causing it to terminate on a SIGINT
indexer = Indexer(queryer.store_dir, queryer.writer, directoryToWalk)

ingestor = Ingestor()
edgar = Edgar()

with open('edgar_tickers.txt', 'r') as reader:
    for line in reader:
        ingestor.file_downloader(edgar.ingest_stock(line.rstrip()), directoryToWalk)

indexer.indexDocs()

# start up the terminal query interface
queryer.run(queryer.writer, queryer.analyzer)

# if we return from querying, call the signal handler
# to clean up the writer cleanly
quit_gracefully()
def gethtml():
    try:
        jsonForm = json.loads(str(request.get_data()))
        url = jsonForm['url']
        username = jsonForm['username']
        if username == '':
            username = '******'
        r = requests.get(url)
        if r.status_code == requests.codes.ok:
            soup = BeautifulSoup(r.content, 'html.parser')
            # get the title from the web page
            webpagetitle = ''
            if soup.title is not None:
                webpagetitle = soup.title.text
            # strip out script tags before extracting the body text
            for s in soup.findAll('script'):
                s.extract()
            bodyText = soup.get_text()
            text = os.linesep.join([s for s in bodyText.splitlines() if s])
            i = Ingestor(jsonForm)
            if not i.checkIfUrlExists(url):
                jsonDocument = i.extractFeatures(text)
                jsonDocument['images'] = i.extractImages(soup)
                jsonDocument['title'] = webpagetitle
                jsonDocument['bodytext'] = text
                jsonDocument['url'] = url
                jsonDocument['username'] = username
                jsonDocument['screenshot'] = i.getwebpagescreenshot(url)
                jsonld = i.generateJSON(jsonDocument)
                esresponse = ''
                objId = ''
                if jsonld:
                    esresponse = i.publishtoes(jsonld)
                    responseJson = json.loads(json.dumps(esresponse))
                    objId = responseJson['_id']
                return "Page ingested, ElasticSearch Id:" + objId
            else:
                logi("Url: " + url + " already exists")
                return "Url: " + url + " already exists"
        else:
            return r
    except Exception as e:
        print(e, file=sys.stderr)
        loge(str(e))
from ingestor import Ingestor

if __name__ == '__main__':
    # WIPE AND RELOAD OFF OF kanye
    ing = Ingestor()
    ing.clear_database()
    ing.ingest()
statisticsOnly = "-s" in optional_args or "-statistics" in optional_args

if len(required_args) == 0:
    print("Missing argument 'data file':" + usage)
    exit(1)

# technically this is only required if we are actually loading the DB
if not statisticsOnly and len(required_args) == 1:
    print("Missing argument 'sql connection':" + usage)
    exit(1)

dataFile = required_args[0]

# Begin our ingestion
ingestor = Ingestor(dataFile)

if not statisticsOnly:
    init(required_args[1])

count = 0
inserts = []
for review in ingestor.ingest_reviews():
    if review is None:
        break
    count += 1
    if count % 10000 == 0:
        print("Processing Record " + str(count) + "...")
    if not statisticsOnly:
        # Add a new column for continuation tokens
        continuation_token = review['date'] + review['review_id']
def test_sedar_exception():
    ingestor = Ingestor()
    with pytest.raises(IngestorException):
        sedar = Sedar("xbrl", "1995-01-01")
def test_sedar_create_xbrl():
    ingestor = Ingestor()
    assert Sedar("xbrl", "2010-01-01", "2014-01-01")
def test_sedar_create_html():
    ingestor = Ingestor()
    assert Sedar("html", "2010-01-01", "2014-01-01")
#! /usr/bin/env python
# encoding: utf-8

import os

from ingestor import Ingestor, IngestorException, Edgar

ingestor = Ingestor()

# xbrl or html?
edgar = Edgar("xbrl")

docs_directory = "test"

# if the directory we will download files into does not exist, create it
if not os.path.exists(docs_directory):
    os.mkdir(docs_directory)

# for every ticker in our input file, download all the relevant documents
with open('edgar_tickers.txt', 'r') as reader:
    for line in reader:
        ingestor.file_downloader(edgar.ingest_stock(line.rstrip()), docs_directory)
def quit_gracefully(*args):
    exit(0)

# always declare the signal handler first
signal.signal(signal.SIGINT, quit_gracefully)

env = lucene.initVM()

queryer = Queryer("index", "hits")
print('Using Directory: ', queryer.store_dir)

# directory for storing downloaded docs
directoryToWalk = "docs"

# and start the indexer
# note the indexer thread is set to daemon, causing it to terminate on a SIGINT
indexer = Indexer(queryer.store_dir, queryer.writer, directoryToWalk)

ingestor = Ingestor()
edgar = Edgar()

with open('data.txt', 'r') as reader:
    for line in reader:
        ingestor.file_downloader(edgar.ingest_stock(line.rstrip()), directoryToWalk)

indexer.indexDocs()

# start up the terminal query interface
queryer.run(queryer.writer, queryer.analyzer)

# if we return from querying, call the signal handler
# to clean up the writer cleanly
quit_gracefully()
from ingestor import Ingestor

if __name__ == '__main__':
    # WIPE AND RELOAD OFF OF kanye
    ing = Ingestor()
    kanye = '5K4W6rqBFWDnAN6FQUkS6x'
    ing.clear_database()
    ing.ingest_by_id(kanye)
    ing.ingest_related_artist(kanye, _depth=2)