def add_budget(source, budget_text, fixed_text='', hourly_text='', double=True):
    """Append the budget-parsing stages to `source` and return the result.

    The stages read `k:budget_raw` (and `summary` for the job type) and
    assign, in order: `k:job_type`, `k:cur_code`, a cleaned `k:budget_raw`,
    `k:budget`, `k:budget_raw_sym`, `k:budget_w_sym`, `k:rate`,
    `k:budget_converted`, `k:budget_converted_w_sym` and `k:budget_full`.

    Every stage receives a `skip_if` test. The local names below describe the
    presumed intent of each test; how a test is evaluated is decided by the
    pipe object -- confirm against the riko documentation.

    Args:
        source: Pipe-like object providing the chainable `strconcat`,
            `refind`, `strreplace`, `simplemath`, `currencyformat` and
            `exchangerate` methods.
        budget_text: Not used by this function.
        fixed_text: If truthy, add a stage assigning 'fixed' to `k:job_type`,
            with a skip test built from this text and the `summary` field.
        hourly_text: If truthy, add the equivalent 'hourly' stage and, as the
            final stage, append ' / hr' to `k:budget_full`.
        double: If true, extract two numbers from the cleaned budget (the
            second rule carries `'param': 'last'`) and combine them with the
            'mean' op into `k:budget`; otherwise the first match is assigned
            straight to `k:budget`.

    Returns:
        The object returned by the last chained stage.
    """
    symbols = '$£€₹'

    # Skip tests handed to the stages through `skip_if`.
    skip_no_raw = {'field': 'k:budget_raw'}
    skip_not_fixed = {'field': 'summary', 'text': fixed_text}
    skip_not_hourly = {'field': 'summary', 'text': hourly_text}
    skip_no_symbol = {
        'field': 'k:budget_raw', 'text': symbols, 'op': 'intersection'}
    skip_has_code = [{'field': 'k:cur_code', 'include': True}, skip_no_raw]
    skip_default_cur = [
        {'field': 'k:cur_code', 'text': DEF_CUR_CODE, 'include': True},
        skip_no_raw]
    skip_other_cur = [
        {'field': 'k:cur_code', 'text': DEF_CUR_CODE}, skip_no_raw]

    # Search rules for the `refind` stages.
    first_number = {'find': r'\d+', 'location': 'at'}
    last_number = {'find': r'\d+', 'location': 'at', 'param': 'last'}
    currency_code = {'find': r'\b[A-Z]{3}\b', 'location': 'at'}
    currency_symbol = {'find': '[%s]' % symbols, 'location': 'at'}

    # Literal replacements applied to the raw budget text, in this order.
    cleanup_pairs = (
        ('Less than', '0-'),
        ('Under', '0-'),
        ('Upto', '0-'),
        ('or less', '-0'),
        ('k', '000'),
        ('Not Sure', ''),
        ('Not sure', ''),
        ('(', ''),
        (')', ''),
        ('.', ''),
        (',', ''),
        (' ', ''),
    )
    cleanup_rules = [
        {'find': old, 'replace': new} for old, new in cleanup_pairs]

    # Currency symbol -> three-letter code.
    symbol_pairs = (('$', 'USD'), ('£', 'GBP'), ('€', 'EUR'), ('₹', 'INR'))
    symbol_rules = [
        {'find': old, 'replace': new} for old, new in symbol_pairs]

    # Pieces used by the `strconcat` stages that build `k:budget_full`.
    converted_parts = [
        {'subkey': 'k:budget_w_sym'},
        {'value': ' ('},
        {'subkey': 'k:budget_converted_w_sym'},
        {'value': ')'},
    ]
    plain_part = {'subkey': 'k:budget_w_sym'}
    hourly_parts = [{'subkey': 'k:budget_full'}, {'value': ' / hr'}]

    rate_conf = {'url': get_path('quote.json')}
    native_format_conf = {'currency': {'subkey': 'k:cur_code'}}
    default_format_conf = {'currency': DEF_CUR_CODE}
    mean_conf = make_simplemath('k:budget_raw2_num', 'mean')
    multiply_conf = make_simplemath('k:rate', 'multiply')

    # Job type tagging.
    if fixed_text:
        source = source.strconcat(
            conf={'part': {'value': 'fixed'}}, assign='k:job_type',
            skip_if=skip_not_fixed)

    if hourly_text:
        source = source.strconcat(
            conf={'part': {'value': 'hourly'}}, assign='k:job_type',
            skip_if=skip_not_hourly)

    # The currency code is searched for before the cleanup rewrites the text.
    source = source.refind(
        conf={'rule': currency_code}, field='k:budget_raw',
        assign='k:cur_code', skip_if=skip_no_raw)
    source = source.strreplace(
        conf={'rule': cleanup_rules}, field='k:budget_raw',
        assign='k:budget_raw', skip_if=skip_no_raw)

    # The first number goes to a scratch field when it still has to be
    # combined with the last one, and directly to `k:budget` otherwise.
    first_target = 'k:budget_raw_num' if double else 'k:budget'
    source = source.refind(
        conf={'rule': first_number}, field='k:budget_raw',
        assign=first_target, skip_if=skip_no_raw)

    if double:
        source = source.refind(
            conf={'rule': last_number}, field='k:budget_raw',
            assign='k:budget_raw2_num', skip_if=skip_no_raw)
        source = source.simplemath(
            conf=mean_conf, field='k:budget_raw_num', assign='k:budget',
            skip_if=skip_no_raw)

    # Symbol lookup, formatting, conversion and final assembly.
    source = source.refind(
        conf={'rule': currency_symbol}, field='k:budget_raw',
        assign='k:budget_raw_sym', skip_if=skip_no_symbol)
    source = source.strreplace(
        conf={'rule': symbol_rules}, field='k:budget_raw_sym',
        assign='k:cur_code', skip_if=skip_has_code)
    source = source.currencyformat(
        conf=native_format_conf, field='k:budget', assign='k:budget_w_sym',
        skip_if=skip_no_raw)
    source = source.exchangerate(
        conf=rate_conf, field='k:cur_code', assign='k:rate',
        skip_if=skip_default_cur)
    source = source.simplemath(
        conf=multiply_conf, field='k:budget', assign='k:budget_converted',
        skip_if=skip_default_cur)
    source = source.currencyformat(
        conf=default_format_conf, field='k:budget_converted',
        assign='k:budget_converted_w_sym', skip_if=skip_default_cur)
    source = source.strconcat(
        conf={'part': converted_parts}, assign='k:budget_full',
        skip_if=skip_default_cur)
    source = source.strconcat(
        conf={'part': plain_part}, assign='k:budget_full',
        skip_if=skip_other_cur)

    if hourly_text:
        source = source.strconcat(
            conf={'part': hourly_parts}, assign='k:budget_full',
            skip_if=skip_not_hourly)

    return source
from pprint import pprint
from riko import get_path
from riko.bado import coroutine
from riko.collections import SyncPipe, AsyncPipe

# fetchdata settings: the URL comes from get_path('gigs.json') and the items
# are read from the 'value.items' path.
p1_conf = {'url': get_path('gigs.json'), 'path': 'value.items'}
# uniq settings: keyed on the 'link' field.
p2_conf = {'uniq_key': 'link'}
# filter settings: a single rule (title contains 'php') in 'block' mode.
p3_conf = {
    'combine': 'or',
    'mode': 'block',
    'rule': [{'field': 'title', 'value': 'php', 'op': 'contains'}]}
# sort settings: by 'pubDate', descending.
p4_conf = {'rule': [{'sort_key': 'pubDate', 'sort_dir': 'desc'}]}


def pipe(test=False):
    """Run the synchronous fetchdata -> uniq -> filter -> sort pipeline.

    Each item of the pipeline's `.list` is pretty-printed, then the same
    object is returned.

    Args:
        test: Forwarded to `SyncPipe` as its `test` keyword.
    """
    stream = (SyncPipe('fetchdata', conf=p1_conf, test=test)
        .uniq(conf=p2_conf)
        .filter(conf=p3_conf)
        .sort(conf=p4_conf)
        .list)

    for i in stream:
        pprint(i)

    return stream


@coroutine
def async_pipe(reactor, test=False):
    stream = yield (AsyncPipe('fetchdata', conf=p1_conf, test=test)
True >>> item['title'][:24] == 'This Is What A Celebrity' True >>> item['link'][:23] == 'http://feeds.gawker.com' True """ from __future__ import ( absolute_import, division, print_function, unicode_literals) from riko import get_path from riko.bado import coroutine from riko.collections.sync import SyncPipe from riko.collections.async import AsyncPipe replace_conf = {'rule': {'find': '\n', 'replace': ' '}} health = get_path('health.xml') caltrain = get_path('caltrain.html') start = '<body id="thebody" class="Level2">' fetch_conf = {'url': caltrain, 'start': start, 'end': '</body>', 'detag': True} def pipe(test=False): s1 = SyncPipe('fetch', test=test, conf={'url': health}).output s2 = (SyncPipe('fetchpage', test=test, conf=fetch_conf) .strreplace(conf=replace_conf, assign='content') .stringtokenizer(conf={'delimiter': ' '}, emit=True) .count() .output) print(next(s1)['title'], next(s2)['count'])
# vim: sw=4:ts=4:expandtab from __future__ import ( absolute_import, division, print_function, unicode_literals) from pprint import pprint from functools import partial from riko import get_path from riko.bado import coroutine from riko.collections import SyncPipe, AsyncPipe BR = {'find': '<br>'} DEF_CUR_CODE = 'USD' odesk_conf = {'url': get_path('odesk.json'), 'path': 'items'} guru_conf = {'url': get_path('guru.json'), 'path': 'items'} elance_conf = {'url': get_path('elance.json'), 'path': 'items'} freelancer_conf = {'url': get_path('freelancer.json'), 'path': 'items'} def make_regex(field, match, replace, default=None): result = { 'field': field, 'match': match, 'replace': replace, 'default': default} return result def make_simplemath(other, op): return {'other': {'subkey': other, 'type': 'number'}, 'op': op}
def add_budget(source, budget_text, fixed_text='', hourly_text='', double=True):
    """Append the budget-parsing stages to `source` and return the result.

    The stages read `k:budget_raw` (and `summary` for the job type) and
    assign, in order: `k:job_type`, `k:cur_code`, a cleaned `k:budget_raw`,
    `k:budget`, `k:budget_raw_sym`, `k:budget_w_sym`, `k:rate`,
    `k:budget_converted`, `k:budget_converted_w_sym` and `k:budget_full`.

    Every stage receives a `skip_if` test. The local names below describe the
    presumed intent of each test; how a test is evaluated is decided by the
    pipe object -- confirm against the riko documentation.

    Args:
        source: Pipe-like object providing the chainable `strconcat`,
            `refind`, `strreplace`, `simplemath`, `currencyformat` and
            `exchangerate` methods.
        budget_text: Not used by this function.
        fixed_text: If truthy, add a stage assigning 'fixed' to `k:job_type`,
            with a skip test built from this text and the `summary` field.
        hourly_text: If truthy, add the equivalent 'hourly' stage and, as the
            final stage, append ' / hr' to `k:budget_full`.
        double: If true, extract two numbers from the cleaned budget (the
            second rule carries `'param': 'last'`) and combine them with the
            'mean' op into `k:budget`; otherwise the first match is assigned
            straight to `k:budget`.

    Returns:
        The object returned by the last chained stage.
    """
    symbols = '$£€₹'

    # Skip tests handed to the stages through `skip_if`.
    skip_no_raw = {'field': 'k:budget_raw'}
    skip_not_fixed = {'field': 'summary', 'text': fixed_text}
    skip_not_hourly = {'field': 'summary', 'text': hourly_text}
    skip_no_symbol = {
        'field': 'k:budget_raw', 'text': symbols, 'op': 'intersection'}
    skip_has_code = [{'field': 'k:cur_code', 'include': True}, skip_no_raw]
    skip_default_cur = [
        {'field': 'k:cur_code', 'text': DEF_CUR_CODE, 'include': True},
        skip_no_raw]
    skip_other_cur = [
        {'field': 'k:cur_code', 'text': DEF_CUR_CODE}, skip_no_raw]

    # Search rules for the `refind` stages.
    first_number = {'find': r'\d+', 'location': 'at'}
    last_number = {'find': r'\d+', 'location': 'at', 'param': 'last'}
    currency_code = {'find': r'\b[A-Z]{3}\b', 'location': 'at'}
    currency_symbol = {'find': '[%s]' % symbols, 'location': 'at'}

    # Literal replacements applied to the raw budget text, in this order.
    cleanup_pairs = (
        ('Less than', '0-'),
        ('Under', '0-'),
        ('Upto', '0-'),
        ('or less', '-0'),
        ('k', '000'),
        ('Not Sure', ''),
        ('Not sure', ''),
        ('(', ''),
        (')', ''),
        ('.', ''),
        (',', ''),
        (' ', ''),
    )
    cleanup_rules = [
        {'find': old, 'replace': new} for old, new in cleanup_pairs]

    # Currency symbol -> three-letter code.
    symbol_pairs = (('$', 'USD'), ('£', 'GBP'), ('€', 'EUR'), ('₹', 'INR'))
    symbol_rules = [
        {'find': old, 'replace': new} for old, new in symbol_pairs]

    # Pieces used by the `strconcat` stages that build `k:budget_full`.
    converted_parts = [
        {'subkey': 'k:budget_w_sym'},
        {'value': ' ('},
        {'subkey': 'k:budget_converted_w_sym'},
        {'value': ')'},
    ]
    plain_part = {'subkey': 'k:budget_w_sym'}
    hourly_parts = [{'subkey': 'k:budget_full'}, {'value': ' / hr'}]

    rate_conf = {'url': get_path('quote.json')}
    native_format_conf = {'currency': {'subkey': 'k:cur_code'}}
    default_format_conf = {'currency': DEF_CUR_CODE}
    mean_conf = make_simplemath('k:budget_raw2_num', 'mean')
    multiply_conf = make_simplemath('k:rate', 'multiply')

    # Job type tagging.
    if fixed_text:
        source = source.strconcat(
            conf={'part': {'value': 'fixed'}}, assign='k:job_type',
            skip_if=skip_not_fixed)

    if hourly_text:
        source = source.strconcat(
            conf={'part': {'value': 'hourly'}}, assign='k:job_type',
            skip_if=skip_not_hourly)

    # The currency code is searched for before the cleanup rewrites the text.
    source = source.refind(
        conf={'rule': currency_code}, field='k:budget_raw',
        assign='k:cur_code', skip_if=skip_no_raw)
    source = source.strreplace(
        conf={'rule': cleanup_rules}, field='k:budget_raw',
        assign='k:budget_raw', skip_if=skip_no_raw)

    # The first number goes to a scratch field when it still has to be
    # combined with the last one, and directly to `k:budget` otherwise.
    first_target = 'k:budget_raw_num' if double else 'k:budget'
    source = source.refind(
        conf={'rule': first_number}, field='k:budget_raw',
        assign=first_target, skip_if=skip_no_raw)

    if double:
        source = source.refind(
            conf={'rule': last_number}, field='k:budget_raw',
            assign='k:budget_raw2_num', skip_if=skip_no_raw)
        source = source.simplemath(
            conf=mean_conf, field='k:budget_raw_num', assign='k:budget',
            skip_if=skip_no_raw)

    # Symbol lookup, formatting, conversion and final assembly.
    source = source.refind(
        conf={'rule': currency_symbol}, field='k:budget_raw',
        assign='k:budget_raw_sym', skip_if=skip_no_symbol)
    source = source.strreplace(
        conf={'rule': symbol_rules}, field='k:budget_raw_sym',
        assign='k:cur_code', skip_if=skip_has_code)
    source = source.currencyformat(
        conf=native_format_conf, field='k:budget', assign='k:budget_w_sym',
        skip_if=skip_no_raw)
    source = source.exchangerate(
        conf=rate_conf, field='k:cur_code', assign='k:rate',
        skip_if=skip_default_cur)
    source = source.simplemath(
        conf=multiply_conf, field='k:budget', assign='k:budget_converted',
        skip_if=skip_default_cur)
    source = source.currencyformat(
        conf=default_format_conf, field='k:budget_converted',
        assign='k:budget_converted_w_sym', skip_if=skip_default_cur)
    source = source.strconcat(
        conf={'part': converted_parts}, assign='k:budget_full',
        skip_if=skip_default_cur)
    source = source.strconcat(
        conf={'part': plain_part}, assign='k:budget_full',
        skip_if=skip_other_cur)

    if hourly_text:
        source = source.strconcat(
            conf={'part': hourly_parts}, assign='k:budget_full',
            skip_if=skip_not_hourly)

    return source
# -*- coding: utf-8 -*- # vim: sw=4:ts=4:expandtab from pprint import pprint from functools import partial from riko import get_path from riko.bado import coroutine from riko.collections import SyncPipe, AsyncPipe BR = {'find': '<br>'} DEF_CUR_CODE = 'USD' odesk_conf = {'url': get_path('odesk.json'), 'path': 'items'} guru_conf = {'url': get_path('guru.json'), 'path': 'items'} elance_conf = {'url': get_path('elance.json'), 'path': 'items'} freelancer_conf = {'url': get_path('freelancer.json'), 'path': 'items'} def make_regex(field, match, replace, default=None): result = { 'field': field, 'match': match, 'replace': replace, 'default': default } return result def make_simplemath(other, op):
from __future__ import ( absolute_import, division, print_function, unicode_literals) from pprint import pprint from riko import get_path from riko.bado import coroutine from riko.collections.sync import SyncPipe from riko.collections.async import AsyncPipe p1_conf = {'url': get_path('gigs.json'), 'path': 'value.items'} p2_conf = {'uniq_key': 'link'} p3_conf = { 'combine': 'or', 'mode': 'block', 'rule': [{'field': 'title', 'value': 'php', 'op': 'contains'}]} p4_conf = {'rule': [{'sort_key': 'pubDate', 'sort_dir': 'desc'}]} def pipe(test=False): stream = (SyncPipe('fetchdata', conf=p1_conf, test=test) .uniq(conf=p2_conf) .filter(conf=p3_conf) .sort(conf=p4_conf) .list) for i in stream: pprint(i) return stream
# strconcat settings: each lists the parts that are joined together.
strconcat2_conf = {
    'part': [
        {'subkey': 'k:budget_raw1_sym'},
        {'subkey': 'k:budget_raw2_sym'},
    ],
}
strconcat3_conf = {
    'part': [
        {'subkey': 'k:budget_w_sym'},
        {'value': ' ('},
        {'subkey': 'k:budget_converted_w_sym'},
        {'value': ')'},
    ],
}
strconcat4_conf = {
    'part': [
        {'subkey': 'k:budget_full'},
        {'value': ' / hr'},
    ],
}

tokenizer_conf = {'dedupe': True, 'sort': True}
substring1_conf = {'from': 0, 'length': 3}
substring2_conf = {'from': 0, 'length': 1}

# Currency formatting and conversion settings.
currencyformat1_conf = {'currency': {'subkey': 'k:cur_code'}}
exchangerate_conf = {'url': get_path('quote.json')}
currencyformat2_conf = {'currency': DEF_CUR_CODE}
simplemath1_conf = make_simplemath('k:budget_raw2_num', 'mean')
simplemath2_conf = make_simplemath('k:rate', 'multiply')


def test1(item):
    """Return the item's `k:cur_code` value (None when the key is absent)."""
    return item.get('k:cur_code')


def test2(item):
    """Return True when the item's `k:cur_code` differs from DEF_CUR_CODE."""
    return item.get('k:cur_code') != DEF_CUR_CODE


def test3(item):
    """Return True when the item's `k:cur_code` equals DEF_CUR_CODE."""
    return item.get('k:cur_code') == DEF_CUR_CODE


def test4(item):
    """Return True when the item's `k:job_type` is not 'hourly'."""
    return item.get('k:job_type') != 'hourly'


# One fetchdata source per data file, all reading the 'items' path.
sources = [
    {'url': get_path(filename), 'type': 'fetchdata', 'path': 'items'}
    for filename in ('kazeeki_1.json', 'kazeeki_2.json', 'kazeeki_3.json')
]
>>> set(item).issuperset(intersection)
True
>>> item['title'][:24] == 'This Is What A Celebrity'
True
>>> item['link'][:23] == 'http://feeds.gawker.com'
True
"""
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from riko import get_path
from riko.bado import coroutine
from riko.collections import SyncPipe, AsyncPipe

# strreplace rule: turn newlines into single spaces.
replace_conf = {'rule': {'find': '\n', 'replace': ' '}}
health = get_path('health.xml')
caltrain = get_path('caltrain.html')
start = '<body id="thebody" class="Level2">'
# fetchpage settings: the `start`/`end` markers and `detag` flag are passed
# through as configuration alongside the URL.
fetch_conf = {'url': caltrain, 'start': start, 'end': '</body>', 'detag': True}


def pipe(test=False):
    # First pipeline: a `fetch` of `health`; only its `.output` is kept.
    s1 = SyncPipe('fetch', test=test, conf={'url': health}).output
    # Second pipeline: `fetchpage` of `caltrain`, then strreplace into
    # 'content', tokenizer with a space delimiter, and count.
    s2 = (SyncPipe('fetchpage', test=test,
                   conf=fetch_conf).strreplace(conf=replace_conf,
                                               assign='content').tokenizer(
                                                   conf={
                                                       'delimiter': ' '
                                                   },
                                                   emit=True).count().output)