def extract_field(input, output, field):
    """Extracts field from given JSON lines file."""
    # TODO: Add support for CSV
    with smart_open(input, 'r') as input_file, smart_open(output, 'w') as output_file:
        for line in input_file:
            item = json.loads(line)
            output_file.write(item[field] + '\n')

def extract_csv_column(input, output, column):
    """Extracts column from given CSV file."""
    set_max_field_size_limit()
    with smart_open(input, 'r') as input_file, smart_open(output, 'w') as output_file:
        reader = csv.DictReader(input_file)
        for row in reader:
            output_file.write(row[column] + '\n')

def filter_items(input, output, predicate):
    """Filters given JSON lines file by predicate."""
    # TODO: Add support for CSV
    with smart_open(input, 'r') as input_file, smart_open(output, 'w') as output_file:
        for line in input_file:
            item = json.loads(line)
            if eval(predicate, globals(), {'item': item}):
                output_file.write(json.dumps(item) + '\n')

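# Usage sketch (illustrative; the file names are hypothetical): filter_items
# passes the predicate string to eval() with the current record bound to the
# name `item`, so the predicate must be a Python expression over `item`, e.g.:
#
#   filter_items('transactions.json', 'large_transactions.json',
#                "item['value'] > 10 ** 18")
#
# Since eval() executes arbitrary code, only pass predicates from trusted sources.
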
def extract_csv_column_unique(input, output, column):
    """Extracts unique values of column from given CSV file, keeping the first occurrence of each."""
    set_max_field_size_limit()
    with smart_open(input, 'r') as input_file, smart_open(output, 'w') as output_file:
        reader = csv.DictReader(input_file)
        seen = set()  # set for fast O(1) amortized lookup
        for row in reader:
            if row[column] in seen:
                continue
            seen.add(row[column])
            output_file.write(row[column] + '\n')

def get_block_range_for_date(provider_uri, date, output):
    """Outputs start and end blocks for given date."""
    provider = get_provider_from_uri(provider_uri)
    web3 = Web3(provider)
    eth_service = EthService(web3)
    start_block, end_block = eth_service.get_block_range_for_date(date)
    with smart_open(output, 'w') as output_file:
        output_file.write('{},{}\n'.format(start_block, end_block))

def export_tokens(token_addresses, output, max_workers, provider_uri):
    """Exports ERC20/ERC721 tokens."""
    with smart_open(token_addresses, 'r') as token_addresses_file:
        job = ExportTokensJob(
            token_addresses_iterable=(token_address.strip() for token_address in token_addresses_file),
            web3=ThreadLocalProxy(lambda: Web3(get_provider_from_uri(provider_uri))),
            item_exporter=tokens_item_exporter(output),
            max_workers=max_workers)
        job.run()

def export_contracts(batch_size, contract_addresses, output, max_workers, provider_uri):
    """Exports contracts bytecode and sighashes."""
    with smart_open(contract_addresses, 'r') as contract_addresses_file:
        contract_addresses = (contract_address.strip() for contract_address in contract_addresses_file
                              if contract_address.strip())
        job = ExportContractsJob(
            contract_addresses_iterable=contract_addresses,
            batch_size=batch_size,
            batch_web3_provider=ThreadLocalProxy(lambda: get_provider_from_uri(provider_uri, batch=True)),
            item_exporter=contracts_item_exporter(output),
            max_workers=max_workers)
        job.run()

def extract_geth_traces(input, batch_size, output, max_workers):
    """Extracts geth traces from JSON lines or CSV file."""
    with smart_open(input, 'r') as geth_traces_file:
        if input.endswith('.json'):
            traces_iterable = (json.loads(line) for line in geth_traces_file)
        else:
            traces_iterable = (trace for trace in csv.DictReader(geth_traces_file))
        job = ExtractGethTracesJob(
            traces_iterable=traces_iterable,
            batch_size=batch_size,
            max_workers=max_workers,
            item_exporter=traces_item_exporter(output))
        job.run()

def extract_token_transfers(logs, batch_size, output, max_workers):
    """Extracts ERC20/ERC721 transfers from logs file."""
    with smart_open(logs, 'r') as logs_file:
        if logs.endswith('.json'):
            logs_reader = (json.loads(line) for line in logs_file)
        else:
            logs_reader = csv.DictReader(logs_file)
        job = ExtractTokenTransfersJob(
            logs_iterable=logs_reader,
            batch_size=batch_size,
            max_workers=max_workers,
            item_exporter=token_transfers_item_exporter(output))
        job.run()

def export_receipts_and_logs(batch_size, transaction_hashes, provider_uri, max_workers,
                             receipts_output, logs_output, chain='ethereum'):
    """Exports receipts and logs."""
    provider_uri = check_classic_provider_uri(chain, provider_uri)
    with smart_open(transaction_hashes, 'r') as transaction_hashes_file:
        job = ExportReceiptsJob(
            transaction_hashes_iterable=(transaction_hash.strip() for transaction_hash in transaction_hashes_file),
            batch_size=batch_size,
            batch_web3_provider=ThreadLocalProxy(lambda: get_provider_from_uri(provider_uri, batch=True)),
            max_workers=max_workers,
            item_exporter=receipts_and_logs_item_exporter(receipts_output, logs_output),
            export_receipts=receipts_output is not None,
            export_logs=logs_output is not None)
        job.run()

def get_block_range_for_timestamps(provider_uri, start_timestamp, end_timestamp, output, chain='ethereum'):
    """Outputs start and end blocks for given timestamps."""
    provider_uri = check_classic_provider_uri(chain, provider_uri)
    provider = get_provider_from_uri(provider_uri)
    web3 = Web3(provider)
    eth_service = EthService(web3)
    start_block, end_block = eth_service.get_block_range_for_timestamps(start_timestamp, end_timestamp)
    with smart_open(output, 'w') as output_file:
        output_file.write('{},{}\n'.format(start_block, end_block))

def extract_field(input_file, output_file, field):
    """Extracts field from given file."""
    with get_item_iterable(input_file) as item_iterable, smart_open(output_file, 'w') as output:
        for item in item_iterable:
            output.write(item[field] + '\n')

# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import argparse
import csv

from ethereumetl.csv_utils import set_max_field_size_limit
from ethereumetl.file_utils import smart_open

parser = argparse.ArgumentParser(description='Extracts a single column from a given csv file.')
parser.add_argument('-i', '--input', default='-', type=str,
                    help='The input file. If not specified stdin is used.')
parser.add_argument('-o', '--output', default='-', type=str,
                    help='The output file. If not specified stdout is used.')
parser.add_argument('-c', '--column', required=True, type=str,
                    help='The csv column name to extract.')

args = parser.parse_args()

set_max_field_size_limit()

with smart_open(args.input, 'r') as input_file, smart_open(args.output, 'w') as output_file:
    reader = csv.DictReader(input_file)
    for row in reader:
        output_file.write(row[args.column] + '\n')

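# Example invocation of the script above (the script and data file names are
# hypothetical, chosen only for illustration):
#
#   python extract_csv_column.py --input transactions.csv --column hash --output transaction_hashes.csv
#
# Since both --input and --output default to '-' (stdin/stdout per the help
# text), the script also composes in a shell pipeline.
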
                    type=str, help='The output file. If not specified stdout is used.')
parser.add_argument('-w', '--max-workers', default=5, type=int,
                    help='The maximum number of workers.')
parser.add_argument('-p', '--provider-uri', default='https://mainnet.infura.io/', type=str,
                    help='The URI of the web3 provider e.g. '
                         'file://$HOME/Library/Ethereum/geth.ipc or https://mainnet.infura.io/')

args = parser.parse_args()

with smart_open(args.contract_addresses, 'r') as contract_addresses_file:
    contract_addresses = (contract_address.strip() for contract_address in contract_addresses_file
                          if contract_address.strip())
    job = ExportContractsJob(
        contract_addresses_iterable=contract_addresses,
        batch_size=args.batch_size,
        batch_web3_provider=ThreadLocalProxy(lambda: get_provider_from_uri(args.provider_uri, batch=True)),
        item_exporter=contracts_item_exporter(args.output),
        max_workers=args.max_workers)
    job.run()

def export_all(partitions, output_dir, provider_uri, max_workers, batch_size):
    for batch_start_block, batch_end_block, partition_dir in partitions:
        # # # start # # #
        start_time = time()

        padded_batch_start_block = str(batch_start_block).zfill(8)
        padded_batch_end_block = str(batch_end_block).zfill(8)
        block_range = f'{padded_batch_start_block}-{padded_batch_end_block}'
        file_name_suffix = f'{padded_batch_start_block}_{padded_batch_end_block}'

        # # # blocks_and_transactions # # #
        blocks_output_dir = f'{output_dir}/blocks{partition_dir}'
        os.makedirs(os.path.dirname(blocks_output_dir), exist_ok=True)
        transactions_output_dir = f'{output_dir}/transactions{partition_dir}'
        os.makedirs(os.path.dirname(transactions_output_dir), exist_ok=True)

        blocks_file = f'{blocks_output_dir}/blocks_{file_name_suffix}.csv'
        transactions_file = f'{transactions_output_dir}/transactions_{file_name_suffix}.csv'
        logger.info(f'Exporting blocks {block_range} to {blocks_file}')
        logger.info(f'Exporting transactions from blocks {block_range} to {transactions_file}')

        job = ExportBlocksJob(
            start_block=batch_start_block,
            end_block=batch_end_block,
            batch_size=batch_size,
            batch_web3_provider=ThreadLocalProxy(lambda: get_provider_from_uri(provider_uri, batch=True)),
            max_workers=max_workers,
            item_exporter=blocks_and_transactions_item_exporter(blocks_file, transactions_file),
            export_blocks=blocks_file is not None,
            export_transactions=transactions_file is not None)
        job.run()

        # # # token_transfers # # #
        token_transfers_file = None
        if is_log_filter_supported(provider_uri):
            token_transfers_output_dir = f'{output_dir}/token_transfers{partition_dir}'
            os.makedirs(os.path.dirname(token_transfers_output_dir), exist_ok=True)

            token_transfers_file = f'{token_transfers_output_dir}/token_transfers_{file_name_suffix}.csv'
            logger.info(f'Exporting ERC20 transfers from blocks {block_range} to {token_transfers_file}')

            job = ExportTokenTransfersJob(
                start_block=batch_start_block,
                end_block=batch_end_block,
                batch_size=batch_size,
                web3=ThreadLocalProxy(lambda: Web3(get_provider_from_uri(provider_uri))),
                item_exporter=token_transfers_item_exporter(token_transfers_file),
                max_workers=max_workers)
            job.run()

        # # # receipts_and_logs # # #
        transaction_hashes_output_dir = f'{output_dir}/transaction_hashes{partition_dir}'
        os.makedirs(os.path.dirname(transaction_hashes_output_dir), exist_ok=True)

        transaction_hashes_file = f'{transaction_hashes_output_dir}/transaction_hashes_{file_name_suffix}.csv'
        logger.info(f'Extracting hash column from transaction file {transactions_file}')
        extract_csv_column_unique(transactions_file, transaction_hashes_file, 'hash')

        receipts_output_dir = f'{output_dir}/receipts{partition_dir}'
        os.makedirs(os.path.dirname(receipts_output_dir), exist_ok=True)
        logs_output_dir = f'{output_dir}/logs{partition_dir}'
        os.makedirs(os.path.dirname(logs_output_dir), exist_ok=True)

        receipts_file = f'{receipts_output_dir}/receipts_{file_name_suffix}.csv'
        logs_file = f'{logs_output_dir}/logs_{file_name_suffix}.csv'
        logger.info(f'Exporting receipts and logs from blocks {block_range} to {receipts_file} and {logs_file}')

        with smart_open(transaction_hashes_file, 'r') as transaction_hashes:
            job = ExportReceiptsJob(
                transaction_hashes_iterable=(transaction_hash.strip() for transaction_hash in transaction_hashes),
                batch_size=batch_size,
                batch_web3_provider=ThreadLocalProxy(lambda: get_provider_from_uri(provider_uri, batch=True)),
                max_workers=max_workers,
                item_exporter=receipts_and_logs_item_exporter(receipts_file, logs_file),
                export_receipts=receipts_file is not None,
                export_logs=logs_file is not None)
            job.run()

        # # # contracts # # #
        contract_addresses_output_dir = f'{output_dir}/contract_addresses{partition_dir}'
        os.makedirs(os.path.dirname(contract_addresses_output_dir), exist_ok=True)

        contract_addresses_file = f'{contract_addresses_output_dir}/contract_addresses_{file_name_suffix}.csv'
        logger.info(f'Extracting contract_address from receipt file {receipts_file}')
        extract_csv_column_unique(receipts_file, contract_addresses_file, 'contract_address')

        contracts_output_dir = f'{output_dir}/contracts{partition_dir}'
        os.makedirs(os.path.dirname(contracts_output_dir), exist_ok=True)

        contracts_file = f'{contracts_output_dir}/contracts_{file_name_suffix}.csv'
        logger.info(f'Exporting contracts from blocks {block_range} to {contracts_file}')

        with smart_open(contract_addresses_file, 'r') as contract_addresses_file:
            contract_addresses = (contract_address.strip() for contract_address in contract_addresses_file
                                  if contract_address.strip())
            job = ExportContractsJob(
                contract_addresses_iterable=contract_addresses,
                batch_size=batch_size,
                batch_web3_provider=ThreadLocalProxy(lambda: get_provider_from_uri(provider_uri, batch=True)),
                item_exporter=contracts_item_exporter(contracts_file),
                max_workers=max_workers)
            job.run()

        # # # tokens # # #
        if token_transfers_file is not None:
            token_addresses_output_dir = f'{output_dir}/token_addresses{partition_dir}'
            os.makedirs(os.path.dirname(token_addresses_output_dir), exist_ok=True)

            token_addresses_file = f'{token_addresses_output_dir}/token_addresses_{file_name_suffix}'
            logger.info(f'Extracting token_address from token_transfers file {token_transfers_file}')
            extract_csv_column_unique(token_transfers_file, token_addresses_file, 'token_address')

            tokens_output_dir = f'{output_dir}/tokens{partition_dir}'
            os.makedirs(os.path.dirname(tokens_output_dir), exist_ok=True)

            tokens_file = f'{tokens_output_dir}/tokens_{file_name_suffix}.csv'
            logger.info(f'Exporting tokens from blocks {block_range} to {tokens_file}')

            with smart_open(token_addresses_file, 'r') as token_addresses:
                job = ExportTokensJob(
                    token_addresses_iterable=(token_address.strip() for token_address in token_addresses),
                    web3=ThreadLocalProxy(lambda: Web3(get_provider_from_uri(provider_uri))),
                    item_exporter=tokens_item_exporter(tokens_file),
                    max_workers=max_workers)
                job.run()

        # # # finish # # #
        end_time = time()
        time_diff = round(end_time - start_time, 5)
        logger.info(f'Exporting blocks {block_range} took {time_diff} seconds')

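# Usage sketch for export_all (assumptions: the partition tuples follow the
# (start_block, end_block, partition_dir) shape consumed by the loop above,
# and the directory layout shown is illustrative, not taken from the source):
#
#   partitions = [
#       (0, 99999, '/start_block=00000000/end_block=00099999'),
#       (100000, 199999, '/start_block=00100000/end_block=00199999'),
#   ]
#   export_all(partitions, output_dir='output',
#              provider_uri='https://mainnet.infura.io',
#              max_workers=5, batch_size=100)
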
import argparse

from eth_utils import keccak

from ethereumetl.file_utils import smart_open
from ethereumetl.logging_utils import logging_basic_config

logging_basic_config()

parser = argparse.ArgumentParser(description='Outputs the 32-byte keccak hash of the given string.')
parser.add_argument('-i', '--input-string', default='Transfer(address,address,uint256)', type=str,
                    help='String to hash, e.g. Transfer(address,address,uint256)')
parser.add_argument('-o', '--output', default='-', type=str,
                    help='The output file. If not specified stdout is used.')

args = parser.parse_args()

hash = keccak(text=args.input_string)

with smart_open(args.output, 'w') as output_file:
    output_file.write('0x{}\n'.format(hash.hex()))

parser.add_argument('-o', '--output', default='-', type=str,
                    help='The output file. If not specified stdout is used.')
parser.add_argument('-w', '--max-workers', default=5, type=int,
                    help='The maximum number of workers.')
parser.add_argument('-p', '--provider-uri', default=None, type=str,
                    help='The URI of the web3 provider e.g. '
                         'file://$HOME/Library/Ethereum/geth.ipc or https://mainnet.infura.io/')

args = parser.parse_args()

with smart_open(args.token_addresses, 'r') as token_addresses_file:
    job = ExportErc20TokensJob(
        token_addresses_iterable=(token_address.strip() for token_address in token_addresses_file),
        web3=ThreadLocalProxy(lambda: Web3(get_provider_from_uri(args.provider_uri))),
        item_exporter=export_erc20_tokens_job_item_exporter(args.output),
        max_workers=args.max_workers)
    job.run()

parser.add_argument('-b', '--batch-size', default=100, type=int,
                    help='The number of blocks to filter at a time.')
parser.add_argument('-o', '--output', default='-', type=str,
                    help='The output file. If not specified stdout is used.')
parser.add_argument('-w', '--max-workers', default=5, type=int,
                    help='The maximum number of workers.')

args = parser.parse_args()

with smart_open(args.logs, 'r') as logs_file:
    if args.logs.endswith('.json'):
        logs_reader = (json.loads(line) for line in logs_file)
    else:
        logs_reader = csv.DictReader(logs_file)
    job = ExtractErc20TransfersJob(
        logs_iterable=logs_reader,
        batch_size=args.batch_size,
        max_workers=args.max_workers,
        item_exporter=erc20_transfers_item_exporter(args.output))
    job.run()

def get_keccak_hash(input_string, output):
    """Outputs 32-byte Keccak hash of given string."""
    hash = keccak(text=input_string)
    with smart_open(output, 'w') as output_file:
        output_file.write('0x{}\n'.format(hash.hex()))

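# Worked example: hashing the canonical ERC20 Transfer event signature yields
# the well-known topic0 value that appears in transfer logs, which is why the
# keccak scripts in this repo default to this input string:
#
#   >>> from eth_utils import keccak
#   >>> keccak(text='Transfer(address,address,uint256)').hex()
#   'ddf252ad1be2c89b69c2b068fc378daa952ba7f163c4a11628f55a4df523b3ef'
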
parser.add_argument('--receipts-output', default=None, type=str,
                    help='The output file for receipts. If not provided receipts will not be exported. '
                         'Use "-" for stdout')
parser.add_argument('--logs-output', default=None, type=str,
                    help='The output file for receipt logs. If not provided receipt logs will not be exported. '
                         'Use "-" for stdout')

args = parser.parse_args()

with smart_open(args.tx_hashes, 'r') as tx_hashes_file:
    job = ExportReceiptsJob(
        tx_hashes_iterable=(tx_hash.strip() for tx_hash in tx_hashes_file),
        batch_size=args.batch_size,
        batch_web3_provider=ThreadLocalProxy(lambda: get_provider_from_uri(args.provider_uri, batch=True)),
        max_workers=args.max_workers,
        item_exporter=receipts_and_logs_item_exporter(args.receipts_output, args.logs_output),
        export_receipts=args.receipts_output is not None,
        export_logs=args.logs_output is not None)
    job.run()

                    default=None, type=str,
                    help='The output file for receipts. If not provided receipts will not be exported. '
                         'Use "-" for stdout')
parser.add_argument('--logs-output', default=None, type=str,
                    help='The output file for receipt logs. If not provided receipt logs will not be exported. '
                         'Use "-" for stdout')

args = parser.parse_args()

with smart_open(args.transaction_hashes, 'r') as transaction_hashes_file:
    job = ExportReceiptsJob(
        transaction_hashes_iterable=(transaction_hash.strip() for transaction_hash in transaction_hashes_file),
        batch_size=args.batch_size,
        batch_web3_provider=ThreadLocalProxy(lambda: get_provider_from_uri(args.provider_uri, batch=True)),
        max_workers=args.max_workers,
        item_exporter=receipts_and_logs_item_exporter(args.receipts_output, args.logs_output),
        export_receipts=args.receipts_output is not None,
        export_logs=args.logs_output is not None)
    job.run()