示例#1
0
def test_threaded_reader():
    """Load 'tests/data/tweets' with a two-thread Reader and expect 50 rows."""
    threaded = Reader(
        thread_count=2,
        reader=FileReader,
        from_path='tests/data/tweets',
    )

    assert len(threaded.to_pandas()) == 50
示例#2
0
def create_data_reader(context: dict):
    """
    Build a Reader from the ``source_*`` settings in ``context['config']``.

    ``context['date']`` (``None`` when absent) is used as both ends of the
    date range passed to the Reader.
    """
    config = context['config']
    run_date = context.get('date')
    return Reader(
        project=config.get('source_project'),
        from_path=config.get('source_path'),
        extention=config.get('source_extention'),
        data_format=config.get('source_format'),
        date_range=(run_date, run_date),
    )
示例#3
0
def test_reader_writer():
    """
    Round-trip check: run do_writer(), then count what Reader yields.

    do_writer() is presumably what populates the ``_tests`` directory read
    below (it is defined outside this block). The directory is removed in a
    ``finally`` clause so it does not linger when writing or reading raises.
    """
    try:
        do_writer()

        reader = Reader(reader=FileReader, from_path='_tests/year_%Y/')
        record_count = len(list(reader))
    finally:
        # ignore_errors=True keeps cleanup quiet if the directory was never
        # created.
        shutil.rmtree("_tests", ignore_errors=True)

    assert record_count == 200000, record_count
示例#4
0
def test_reader_context():
    """Use Reader as a context manager and count read_line() results until one is falsy."""
    lines_seen = 0
    with Reader(reader=FileReader, from_path='tests/data/tweets') as source:
        # read_line() is only tested for truthiness, so no binding is needed.
        while source.read_line():
            lines_seen += 1

    assert lines_seen == 50
示例#5
0
def test_reader_writer_compressed():
    """
    Round-trip check for compressed output.

    Runs do_writer_compressed(), checks that at least one ``.lzma`` file
    matches the glob, then counts what Reader yields. The ``_tests``
    directory is removed in a ``finally`` clause so a failed assertion or an
    exception does not leave it behind.
    """
    try:
        do_writer_compressed()

        # NOTE(review): glob is called without recursive=True, so '**' here
        # matches a single directory level only - confirm that is intended.
        compressed_files = glob.glob('_tests/**/*.lzma')
        assert len(compressed_files) > 0, compressed_files

        reader = Reader(reader=FileReader, from_path='_tests/year_%Y/')
        record_count = len(list(reader))
    finally:
        # ignore_errors=True keeps cleanup quiet if the directory was never
        # created.
        shutil.rmtree("_tests", ignore_errors=True)

    assert record_count == 200000, record_count
def test_format_not_known():
    """Constructing a Reader with data_format='excel' is expected to raise TypeError."""
    # NOTE(review): date_range is a single datetime here, while the sibling
    # tests pass a (start, end) tuple - confirm the TypeError is really
    # triggered by data_format and not by date_range.
    try:
        Reader(
            project='',
            select=['a', 'b'],
            from_path='',
            date_range=datetime.datetime.now(),
            data_format='excel',
        )
    except TypeError:
        raised = True
    else:
        raised = False

    assert raised
示例#7
0
def test_unknown_format():
    """A FileReader-backed Reader given data_format='csv' is expected to raise TypeError."""
    try:
        Reader(
            reader=FileReader,
            from_path='tests/data/tweets',
            data_format='csv',
        )
    except TypeError:
        raised = True
    else:
        raised = False

    assert raised
def test_reader_select_not_list():
    """Passing a plain string as ``select`` is expected to raise TypeError."""
    try:
        Reader(
            project='',
            select='everything',
            from_path='',
            date_range=(datetime.datetime.now(), datetime.datetime.now()),
            data_format='json',
        )
    except TypeError:
        raised = True
    else:
        raised = False

    assert raised
def test_reader_where_not_callable():
    """Passing ``where=True`` (not a callable) is expected to raise TypeError."""
    try:
        Reader(
            project='',
            select=['a', 'b'],
            from_path='',
            where=True,
            date_range=(datetime.datetime.now(), datetime.datetime.now()),
            data_format='json',
        )
    except TypeError:
        raised = True
    else:
        raised = False

    assert raised
def test_reader_all_good():
    """A Reader built with a list ``select``, a date tuple and 'json' must not raise TypeError."""
    try:
        Reader(
            project='',
            select=['a', 'b'],
            from_path='',
            date_range=(datetime.datetime.now(), datetime.datetime.now()),
            data_format='json',
        )
    except TypeError:
        constructed = False
    else:
        constructed = True

    assert constructed
示例#11
0
def test_reader_to_pandas():
    """to_pandas() on a Reader over 'tests/data/tweets' should give 50 rows."""
    frame = Reader(reader=FileReader, from_path='tests/data/tweets').to_pandas()

    assert len(frame) == 50
示例#12
0
def test_reader_can_read():
    """Iterating a Reader over 'tests/data/tweets' should yield 50 items."""
    source = Reader(reader=FileReader, from_path='tests/data/tweets')
    records = list(source)
    assert len(records) == 50
示例#13
0
        if v == maximum:
            print(k, '*' * int(((v / maximum) * width) // 1), maximum)
        else:
            print(k, '*' * int(((v / maximum) * width) // 1))


if __name__ == "__main__":

    # Build a MinioReader-backed Reader over the year/month/day tweet folders
    # with start and end date both 2021-01-02. Connection details are read
    # from the MINIO_* environment variables. The commented-out
    # select/from_path/where lines appear to be alternative settings that can
    # be swapped in by hand.
    reader = Reader(
        thread_count=0,
        #select=['username'],
        #from_path='TWITTER/tweets/%datefolders/',
        from_path='TWITTER/tweets/year_%Y/month_%m/day_%d/',
        #where=lambda r: r['username'] in ['realDonaldTrump', 'BillGates', 'Twitter', 'Amazon', 'NBCNews', 'BBCNews', 'CNNNews'],
        #where=lambda r: ('coronavirus' in r['tweet'].lower()) or ('corona virus' in r['tweet'].lower()) or ('corona-virus' in r['tweet'].lower()),
        #where=lambda r: ('joyce' in r['text'].lower()),
        reader=MinioReader,
        end_point=os.getenv('MINIO_END_POINT'),
        access_key=os.getenv('MINIO_ACCESS_KEY'),
        secret_key=os.getenv('MINIO_SECRET_KEY'),
        start_date=datetime.date(2021, 1, 2),
        end_date=datetime.date(2021, 1, 2),
        secure=False)

    #    save = SaveToMinioOperator(
    #            end_point=os.getenv('MINIO_END_POINT'),
    #            access_key=os.getenv('MINIO_ACCESS_KEY'),
    #            secret_key=os.getenv('MINIO_SECRET_KEY'),
    #            to_path="TWITTER/tweets/%datefolders/reformatted_twitter_%date.jsonl",
    #            secure=False,
    #            compress=False)
示例#14
0
# Optionally load settings from a .env file in the current directory.
# python-dotenv is not required: if it is not installed the ImportError is
# ignored and the script carries on with the existing environment.
try:
    from dotenv import load_dotenv   # type:ignore
    from pathlib import Path
    env_path = Path('.') / '.env'
    load_dotenv(dotenv_path=env_path)
except ImportError:
    pass


# Four-thread MinioReader-backed Reader over the NVD CVE list snapshots.
# Connection details are read from the MINIO_* environment variables. The
# commented-out keyword arguments appear to be optional settings left for
# manual experimentation.
reader = Reader(
        thread_count=4,
        reader=MinioReader,
        secure=False,
        end_point=os.getenv('MINIO_END_POINT'),
        access_key=os.getenv('MINIO_ACCESS_KEY'),
        secret_key=os.getenv('MINIO_SECRET_KEY'),
        from_path='SNAPSHOTS/NVD/NVD_CVE_LIST/%datefolders/',
        #data_format='text',
        #start_date=datetime.date(2020, 1, 30),
        #end_date=datetime.date(2020, 2, 5),
        #select=['username'],
        #where=lambda r: b'smb' in r
)

#reader = dictset.limit(reader, 100)

# Time one full pass over the reader and print the item count and the elapsed
# seconds. enumerate() starts at 1 so that `count` ends as the number of
# items read rather than the index of the last item; it stays 0 when the
# reader yields nothing.
start = time.perf_counter_ns()
count = 0
for count, item in enumerate(reader, start=1):
    pass

print(count, (time.perf_counter_ns() - start)/1e9)
示例#15
0
def create_data_reader(project, from_path, date):
    """Return a Reader for ``project``/``from_path`` whose date range starts and ends on ``date``."""
    single_day = (date, date)
    return Reader(project=project, from_path=from_path, date_range=single_day)
示例#16
0
"""
Schema Guesser

Reads through a dataset to 'guess' the schema.

Current implementation only lists all the values in a set of fields to
work out the set of symbols for an enumerated type.
"""
from gva.data import Reader
from gva.data.formats import dictset
import json

# Reader over the NVD_CVE_SUMMARY view in the 'dcsgva-da-prd' project; the
# records it yields are scanned by the loop below.
reader = Reader(
    project='dcsgva-da-prd',
    from_path=
    'dcsgva-da-prd-ai-notebook/02_INTERMEDIATE/VIEWS/NVD_CVE_SUMMARY/%datefolders/'
)

values = {}

for record in reader:

    for k, v in record.items():

        if k not in [
                'CVE', 'CWE', 'publishedDate', 'Description',
                'v2.0:vectorString', 'v2.0:baseScore',
                'v2.0:exploitabilityScore', 'v2.0:impactScore',
                'v3.0:vectorString', 'v3.0:baseScore',
                'v3.0:exploitabilityScore', 'v3.0:impactScore'
        ]: