示例#1
0
文件: kazeeki.py 项目: nerevu/riko
def add_budget(source, budget_text, fixed_text='', hourly_text='', double=True):
    """Attach the budget-parsing stages to ``source`` one call at a time.

    Every stage reads and/or assigns ``k:``-prefixed item fields. The
    ``skip_if`` descriptors built here are handed to the stages unchanged;
    how they are evaluated is decided by the pipe implementation, which is
    not visible from this function.

    Args:
        source: Object exposing the chainable ``strconcat``, ``refind``,
            ``strreplace``, ``simplemath``, ``currencyformat`` and
            ``exchangerate`` methods used below.
        budget_text: Accepted for the callers' sake; not read here.
        fixed_text: When truthy, adds a stage assigning ``'fixed'`` to
            ``k:job_type`` with a ``skip_if`` on ``summary`` / this text.
        hourly_text: When truthy, adds the equivalent ``'hourly'`` stage
            and, at the end, a stage appending ``' / hr'`` to
            ``k:budget_full``.
        double: When true, ``k:budget`` is produced by a ``'mean'``
            simplemath over the first and last numbers found in
            ``k:budget_raw``; otherwise the first number is assigned
            directly.

    Returns:
        Whatever the last stage call returns.
    """
    symbols = '$£€₹'

    # ``skip_if`` descriptors; the lists combine two descriptors each.
    raw_missing = {'field': 'k:budget_raw'}
    code_set = {'field': 'k:cur_code', 'include': True}
    code_is_default = {
        'field': 'k:cur_code', 'text': DEF_CUR_CODE, 'include': True}
    code_not_default = {'field': 'k:cur_code', 'text': DEF_CUR_CODE}
    not_fixed = {'field': 'summary', 'text': fixed_text}
    not_hourly = {'field': 'summary', 'text': hourly_text}
    symbol_missing = {
        'field': 'k:budget_raw', 'text': symbols, 'op': 'intersection'}
    skip_when_code_set = [code_set, raw_missing]
    skip_when_default_cur = [code_is_default, raw_missing]
    skip_when_other_cur = [code_not_default, raw_missing]

    # ``refind`` rules.
    digits = r'\d+'
    leading_number = {'find': digits, 'location': 'at'}
    trailing_number = {'find': digits, 'location': 'at', 'param': 'last'}
    code_rule = {'find': r'\b[A-Z]{3}\b', 'location': 'at'}
    symbol_rule = {'find': '[%s]' % symbols, 'location': 'at'}

    # Literal replacements applied to the raw budget text, in this order.
    cleanup_pairs = (
        ('Less than', '0-'),
        ('Under', '0-'),
        ('Upto', '0-'),
        ('or less', '-0'),
        ('k', '000'),
        ('Not Sure', ''),
        ('Not sure', ''),
        ('(', ''),
        (')', ''),
        ('.', ''),
        (',', ''),
        (' ', ''),
    )
    cleanup_rules = [
        {'find': needle, 'replace': substitute}
        for needle, substitute in cleanup_pairs]

    # Currency symbol -> three-letter code replacements.
    symbol_pairs = (('$', 'USD'), ('£', 'GBP'), ('€', 'EUR'), ('₹', 'INR'))
    symbol_rules = [
        {'find': sign, 'replace': iso} for sign, iso in symbol_pairs]

    # ``strconcat`` parts.
    with_conversion = [
        {'subkey': 'k:budget_w_sym'},
        {'value': ' ('},
        {'subkey': 'k:budget_converted_w_sym'},
        {'value': ')'},
    ]
    native_only = {'subkey': 'k:budget_w_sym'}
    per_hour = [{'subkey': 'k:budget_full'}, {'value': ' / hr'}]

    # Stage configurations.
    rate_conf = {'url': get_path('quote.json')}
    native_format = {'currency': {'subkey': 'k:cur_code'}}
    default_format = {'currency': DEF_CUR_CODE}
    mean_conf = make_simplemath('k:budget_raw2_num', 'mean')
    convert_conf = make_simplemath('k:rate', 'multiply')

    # Job type tagging (only for the texts the caller supplied).
    if fixed_text:
        source = source.strconcat(
            conf={'part': {'value': 'fixed'}}, assign='k:job_type',
            skip_if=not_fixed)

    if hourly_text:
        source = source.strconcat(
            conf={'part': {'value': 'hourly'}}, assign='k:job_type',
            skip_if=not_hourly)

    # The currency code is looked up before the raw text is cleaned up.
    source = source.refind(
        conf={'rule': code_rule}, field='k:budget_raw',
        assign='k:cur_code', skip_if=raw_missing)
    source = source.strreplace(
        conf={'rule': cleanup_rules}, field='k:budget_raw',
        assign='k:budget_raw', skip_if=raw_missing)

    # Numeric budget: a single number, or the 'mean' of first and last.
    if not double:
        source = source.refind(
            conf={'rule': leading_number}, field='k:budget_raw',
            assign='k:budget', skip_if=raw_missing)
    else:
        source = source.refind(
            conf={'rule': leading_number}, field='k:budget_raw',
            assign='k:budget_raw_num', skip_if=raw_missing)
        source = source.refind(
            conf={'rule': trailing_number}, field='k:budget_raw',
            assign='k:budget_raw2_num', skip_if=raw_missing)
        source = source.simplemath(
            conf=mean_conf, field='k:budget_raw_num',
            assign='k:budget', skip_if=raw_missing)

    # Currency symbol -> code, used when no code was assigned earlier.
    source = source.refind(
        conf={'rule': symbol_rule}, field='k:budget_raw',
        assign='k:budget_raw_sym', skip_if=symbol_missing)
    source = source.strreplace(
        conf={'rule': symbol_rules}, field='k:budget_raw_sym',
        assign='k:cur_code', skip_if=skip_when_code_set)

    # Native formatting, then conversion into DEF_CUR_CODE.
    source = source.currencyformat(
        conf=native_format, field='k:budget',
        assign='k:budget_w_sym', skip_if=raw_missing)
    source = source.exchangerate(
        conf=rate_conf, field='k:cur_code', assign='k:rate',
        skip_if=skip_when_default_cur)
    source = source.simplemath(
        conf=convert_conf, field='k:budget',
        assign='k:budget_converted', skip_if=skip_when_default_cur)
    source = source.currencyformat(
        conf=default_format, field='k:budget_converted',
        assign='k:budget_converted_w_sym', skip_if=skip_when_default_cur)

    # Two alternative ways of filling k:budget_full.
    source = source.strconcat(
        conf={'part': with_conversion}, assign='k:budget_full',
        skip_if=skip_when_default_cur)
    source = source.strconcat(
        conf={'part': native_only}, assign='k:budget_full',
        skip_if=skip_when_other_cur)

    if hourly_text:
        source = source.strconcat(
            conf={'part': per_hour}, assign='k:budget_full',
            skip_if=not_hourly)

    return source
示例#2
0
文件: gigs.py 项目: tianhm/riko
from pprint import pprint
from riko import get_path
from riko.bado import coroutine
from riko.collections import SyncPipe, AsyncPipe

# Configuration for the stages chained together in pipe() below.

# conf for the 'fetchdata' pipe: URL of 'gigs.json' as resolved by get_path,
# plus the 'value.items' path setting.
p1_conf = {'url': get_path('gigs.json'), 'path': 'value.items'}
# conf for the `uniq` stage.
p2_conf = {'uniq_key': 'link'}
# conf for the `filter` stage: mode 'block', one rule (title contains 'php').
# NOTE(review): presumably 'block' drops the matching items -- confirm against
# riko's filter documentation.
p3_conf = {
    'combine': 'or',
    'mode': 'block',
    'rule': [{'field': 'title', 'value': 'php', 'op': 'contains'}]}

# conf for the `sort` stage: key 'pubDate', direction 'desc'.
p4_conf = {'rule': [{'sort_key': 'pubDate', 'sort_dir': 'desc'}]}


def pipe(test=False):
    """Run the gigs flow synchronously, pretty-print each item, return all.

    Args:
        test: Forwarded as-is to ``SyncPipe``.

    Returns:
        The ``.list`` attribute of the final stage (the same object that
        was iterated for printing).
    """
    fetched = SyncPipe('fetchdata', conf=p1_conf, test=test)
    deduped = fetched.uniq(conf=p2_conf)
    filtered = deduped.filter(conf=p3_conf)
    ordered = filtered.sort(conf=p4_conf)
    items = ordered.list

    for entry in items:
        pprint(entry)

    return items


@coroutine
def async_pipe(reactor, test=False):
    stream = yield (AsyncPipe('fetchdata', conf=p1_conf, test=test)
示例#3
0
文件: demo.py 项目: Fuzzwah/riko
    True
    >>> item['title'][:24] == 'This Is What A Celebrity'
    True
    >>> item['link'][:23] == 'http://feeds.gawker.com'
    True
"""
from __future__ import (
    absolute_import, division, print_function, unicode_literals)

from riko import get_path
from riko.bado import coroutine
from riko.collections.sync import SyncPipe
from riko.collections.async import AsyncPipe

# Module-level configuration used by pipe() below.

# strreplace rule: newline -> single space.
replace_conf = {'rule': {'find': '\n', 'replace': ' '}}
# Locations of the two input files, as resolved by get_path.
health = get_path('health.xml')
caltrain = get_path('caltrain.html')
# Opening <body> tag passed as the 'start' setting of fetch_conf.
start = '<body id="thebody" class="Level2">'
# conf for the 'fetchpage' pipe.
# NOTE(review): 'start'/'end' presumably delimit the fetched slice and
# 'detag' presumably strips markup -- confirm against riko's fetchpage docs.
fetch_conf = {'url': caltrain, 'start': start, 'end': '</body>', 'detag': True}


def pipe(test=False):
    """Print the first feed title and the first count from the page flow.

    Two independent synchronous flows are built: a 'fetch' of ``health``
    and a 'fetchpage' of ``fetch_conf`` that goes through ``strreplace``,
    ``stringtokenizer`` (delimiter: one space) and ``count``. One element
    is taken from each ``.output`` and their 'title' / 'count' values are
    printed together.

    Args:
        test: Forwarded as-is to both ``SyncPipe`` constructors.
    """
    feed = SyncPipe('fetch', test=test, conf={'url': health}).output

    page = SyncPipe('fetchpage', test=test, conf=fetch_conf)
    flattened = page.strreplace(conf=replace_conf, assign='content')
    tokens = flattened.stringtokenizer(conf={'delimiter': ' '}, emit=True)
    tally = tokens.count().output

    # Same evaluation order as a single print call: feed first, then tally.
    title = next(feed)['title']
    count = next(tally)['count']
    print(title, count)
示例#4
0
文件: kazeeki.py 项目: nerevu/riko
# vim: sw=4:ts=4:expandtab

from __future__ import (
    absolute_import, division, print_function, unicode_literals)

from pprint import pprint
from functools import partial

from riko import get_path
from riko.bado import coroutine
from riko.collections import SyncPipe, AsyncPipe

# Rule fragment matching the literal '<br>' tag (not used in this excerpt).
BR = {'find': '<br>'}
# Three-letter code of the currency treated as the default.
DEF_CUR_CODE = 'USD'

# One conf per job source: URL of the named JSON file as resolved by
# get_path, plus the 'items' path setting.
odesk_conf = {'url': get_path('odesk.json'), 'path': 'items'}
guru_conf = {'url': get_path('guru.json'), 'path': 'items'}
elance_conf = {'url': get_path('elance.json'), 'path': 'items'}
freelancer_conf = {'url': get_path('freelancer.json'), 'path': 'items'}


def make_regex(field, match, replace, default=None):
    """Bundle the arguments into a rule dict.

    Returns:
        A new dict with the keys 'field', 'match', 'replace' and 'default'
        (in that order) mapped to the arguments of the same name.
    """
    keys = ('field', 'match', 'replace', 'default')
    values = (field, match, replace, default)
    return dict(zip(keys, values))


def make_simplemath(other, op):
    """Build a simplemath conf.

    Args:
        other: Stored under 'subkey' of the nested 'other' dict.
        op: Stored under 'op'.

    Returns:
        ``{'other': {'subkey': other, 'type': 'number'}, 'op': op}``.
    """
    operand = {'subkey': other, 'type': 'number'}
    return {'other': operand, 'op': op}
示例#5
0
文件: kazeeki.py 项目: sottom/riko
def add_budget(
        source, budget_text, fixed_text='', hourly_text='', double=True):
    """Extend ``source`` with the stages that end in ``k:budget_full``.

    The function only wires stages together: it builds the rule, part and
    conf structures, then chains ``strconcat`` / ``refind`` / ``strreplace``
    / ``simplemath`` / ``currencyformat`` / ``exchangerate`` calls on
    ``source``. The ``skip_if`` descriptors are passed through untouched;
    their evaluation is up to the pipe implementation and cannot be seen
    from here.

    Args:
        source: Object providing the chainable stage methods named above.
        budget_text: Not read by this function.
        fixed_text: If truthy, a stage assigning ``'fixed'`` to
            ``k:job_type`` is added.
        hourly_text: If truthy, a stage assigning ``'hourly'`` to
            ``k:job_type`` is added, and ``' / hr'`` is appended to
            ``k:budget_full`` by a final stage.
        double: If true, ``k:budget`` comes from a ``'mean'`` simplemath of
            the first and last numbers in ``k:budget_raw``; if false, the
            first number is assigned to ``k:budget`` directly.

    Returns:
        The result of the last chained stage call.
    """
    cur_symbols = '$£€₹'

    # skip_if descriptors; the three lists pair a currency check with the
    # raw-budget check.
    missing_raw = {'field': 'k:budget_raw'}
    coded = {'field': 'k:cur_code', 'include': True}
    default_coded = {
        'field': 'k:cur_code', 'text': DEF_CUR_CODE, 'include': True}
    other_coded = {'field': 'k:cur_code', 'text': DEF_CUR_CODE}
    fixed_check = {'field': 'summary', 'text': fixed_text}
    hourly_check = {'field': 'summary', 'text': hourly_text}
    symbol_check = {
        'field': 'k:budget_raw', 'text': cur_symbols, 'op': 'intersection'}
    coded_or_missing = [coded, missing_raw]
    default_or_missing = [default_coded, missing_raw]
    other_or_missing = [other_coded, missing_raw]

    # refind rules
    first_number = {'find': r'\d+', 'location': 'at'}
    last_number = {'find': r'\d+', 'location': 'at', 'param': 'last'}
    iso_code = {'find': r'\b[A-Z]{3}\b', 'location': 'at'}
    symbol = {'find': '[%s]' % cur_symbols, 'location': 'at'}

    # strreplace rules for the raw budget text, applied in list order
    scrub_rules = [
        {'find': 'Less than', 'replace': '0-'},
        {'find': 'Under', 'replace': '0-'},
        {'find': 'Upto', 'replace': '0-'},
        {'find': 'or less', 'replace': '-0'},
        {'find': 'k', 'replace': '000'},
        {'find': 'Not Sure', 'replace': ''},
        {'find': 'Not sure', 'replace': ''},
        {'find': '(', 'replace': ''},
        {'find': ')', 'replace': ''},
        {'find': '.', 'replace': ''},
        {'find': ',', 'replace': ''},
        {'find': ' ', 'replace': ''},
    ]

    # strreplace rules turning a currency symbol into its code
    symbol_rules = [
        {'find': '$', 'replace': 'USD'},
        {'find': '£', 'replace': 'GBP'},
        {'find': '€', 'replace': 'EUR'},
        {'find': '₹', 'replace': 'INR'},
    ]

    # strconcat parts
    converted_part = [
        {'subkey': 'k:budget_w_sym'},
        {'value': ' ('},
        {'subkey': 'k:budget_converted_w_sym'},
        {'value': ')'},
    ]
    plain_part = {'subkey': 'k:budget_w_sym'}
    hourly_part = [{'subkey': 'k:budget_full'}, {'value': ' / hr'}]

    # stage confs
    rate_conf = {'url': get_path('quote.json')}
    native_conf = {'currency': {'subkey': 'k:cur_code'}}
    default_conf = {'currency': DEF_CUR_CODE}
    mean_conf = make_simplemath('k:budget_raw2_num', 'mean')
    multiply_conf = make_simplemath('k:rate', 'multiply')

    # Job-type tagging: one strconcat stage per marker text supplied.
    job_types = (
        ('fixed', fixed_text, fixed_check),
        ('hourly', hourly_text, hourly_check),
    )

    for label, marker, check in job_types:
        if marker:
            source = source.strconcat(
                conf={'part': {'value': label}},
                assign='k:job_type',
                skip_if=check)

    # Currency code lookup happens before the raw text is scrubbed.
    source = (
        source
        .refind(
            conf={'rule': iso_code}, field='k:budget_raw',
            assign='k:cur_code', skip_if=missing_raw)
        .strreplace(
            conf={'rule': scrub_rules}, field='k:budget_raw',
            assign='k:budget_raw', skip_if=missing_raw))

    if double:
        source = (
            source
            .refind(
                conf={'rule': first_number}, field='k:budget_raw',
                assign='k:budget_raw_num', skip_if=missing_raw)
            .refind(
                conf={'rule': last_number}, field='k:budget_raw',
                assign='k:budget_raw2_num', skip_if=missing_raw)
            .simplemath(
                conf=mean_conf, field='k:budget_raw_num',
                assign='k:budget', skip_if=missing_raw))
    else:
        source = source.refind(
            conf={'rule': first_number}, field='k:budget_raw',
            assign='k:budget', skip_if=missing_raw)

    source = (
        source
        .refind(
            conf={'rule': symbol}, field='k:budget_raw',
            assign='k:budget_raw_sym', skip_if=symbol_check)
        .strreplace(
            conf={'rule': symbol_rules}, field='k:budget_raw_sym',
            assign='k:cur_code', skip_if=coded_or_missing)
        .currencyformat(
            conf=native_conf, field='k:budget',
            assign='k:budget_w_sym', skip_if=missing_raw)
        .exchangerate(
            conf=rate_conf, field='k:cur_code',
            assign='k:rate', skip_if=default_or_missing)
        .simplemath(
            conf=multiply_conf, field='k:budget',
            assign='k:budget_converted', skip_if=default_or_missing)
        .currencyformat(
            conf=default_conf, field='k:budget_converted',
            assign='k:budget_converted_w_sym', skip_if=default_or_missing)
        .strconcat(
            conf={'part': converted_part},
            assign='k:budget_full', skip_if=default_or_missing)
        .strconcat(
            conf={'part': plain_part},
            assign='k:budget_full', skip_if=other_or_missing))

    if hourly_text:
        source = source.strconcat(
            conf={'part': hourly_part},
            assign='k:budget_full',
            skip_if=hourly_check)

    return source
示例#6
0
文件: kazeeki.py 项目: sottom/riko
# -*- coding: utf-8 -*-
# vim: sw=4:ts=4:expandtab

from pprint import pprint
from functools import partial

from riko import get_path
from riko.bado import coroutine
from riko.collections import SyncPipe, AsyncPipe

# 'find' fragment for the literal '<br>' tag; no use of it is visible in
# this excerpt.
BR = {'find': '<br>'}
# Default three-letter currency code.
DEF_CUR_CODE = 'USD'

# Per-site confs: each points at a JSON file located through get_path and
# sets 'path' to 'items'.
odesk_conf = {'url': get_path('odesk.json'), 'path': 'items'}
guru_conf = {'url': get_path('guru.json'), 'path': 'items'}
elance_conf = {'url': get_path('elance.json'), 'path': 'items'}
freelancer_conf = {'url': get_path('freelancer.json'), 'path': 'items'}


def make_regex(field, match, replace, default=None):
    """Return a fresh dict holding the four arguments under their own names.

    The keys are 'field', 'match', 'replace' and 'default'; ``default``
    falls back to ``None`` when omitted.
    """
    return {
        'field': field, 'match': match, 'replace': replace, 'default': default}


def make_simplemath(other, op):
示例#7
0
文件: gigs.py 项目: Fuzzwah/riko
from __future__ import (
    absolute_import, division, print_function, unicode_literals)

from pprint import pprint
from riko import get_path
from riko.bado import coroutine
from riko.collections.sync import SyncPipe
from riko.collections.async import AsyncPipe

# Stage configurations; pipe() below passes them, in order, to the
# 'fetchdata' pipe and to the uniq, filter and sort stages.

# Source file ('gigs.json', located via get_path) and the 'value.items' path.
p1_conf = {'url': get_path('gigs.json'), 'path': 'value.items'}
# uniq stage: 'uniq_key' is 'link'.
p2_conf = {'uniq_key': 'link'}
# filter stage: a single "title contains 'php'" rule in 'block' mode,
# combined with 'or'.
# NOTE(review): exact meaning of 'block' and 'combine' is defined by riko's
# filter pipe -- confirm there.
p3_conf = {
    'combine': 'or',
    'mode': 'block',
    'rule': [{'field': 'title', 'value': 'php', 'op': 'contains'}]}

# sort stage: sort key 'pubDate', direction 'desc'.
p4_conf = {'rule': [{'sort_key': 'pubDate', 'sort_dir': 'desc'}]}


def pipe(test=False):
    """Build the synchronous gigs flow, dump its items and hand them back.

    The flow is 'fetchdata' -> uniq -> filter -> sort, configured by the
    module-level ``p1_conf`` .. ``p4_conf``.

    Args:
        test: Passed through to ``SyncPipe`` unchanged.

    Returns:
        The ``.list`` of the sorted flow, after each element has been
        pretty-printed.
    """
    flow = SyncPipe('fetchdata', conf=p1_conf, test=test)
    flow = flow.uniq(conf=p2_conf).filter(conf=p3_conf)
    results = flow.sort(conf=p4_conf).list

    for result in results:
        pprint(result)

    return results
示例#8
0
文件: kazeeki.py 项目: Fuzzwah/riko
# Module-level stage configurations. The stages consuming them are outside
# this excerpt; the names suggest which pipe each one is meant for.

# Concatenates the two 'k:budget_raw*_sym' subkeys.
strconcat2_conf = {
    'part': [{'subkey': 'k:budget_raw1_sym'}, {'subkey': 'k:budget_raw2_sym'}]}

# Builds "<k:budget_w_sym> (<k:budget_converted_w_sym>)".
strconcat3_conf = {
    'part': [
        {'subkey': 'k:budget_w_sym'},
        {'value': ' ('},
        {'subkey': 'k:budget_converted_w_sym'},
        {'value': ')'}]}

# Appends ' / hr' to 'k:budget_full'.
strconcat4_conf = {'part': [{'subkey': 'k:budget_full'}, {'value': ' / hr'}]}
tokenizer_conf = {'dedupe': True, 'sort': True}
# Substring settings: 3 characters and 1 character, both from offset 0.
substring1_conf = {'from': 0, 'length': 3}
substring2_conf = {'from': 0, 'length': 1}
# Currency taken from the item's 'k:cur_code' subkey.
currencyformat1_conf = {'currency': {'subkey': 'k:cur_code'}}
exchangerate_conf = {'url': get_path('quote.json')}
# Currency fixed to DEF_CUR_CODE.
currencyformat2_conf = {'currency': DEF_CUR_CODE}
# simplemath confs: 'mean' with 'k:budget_raw2_num', 'multiply' with 'k:rate'.
simplemath1_conf = make_simplemath('k:budget_raw2_num', 'mean')
simplemath2_conf = make_simplemath('k:rate', 'multiply')
# Item predicates. Each takes an item supporting .get().
# NOTE(review): PEP 8 prefers `def` over binding a lambda to a name; left
# unchanged here.
# test1: the item's 'k:cur_code' value (None when absent).
test1 = lambda item: item.get('k:cur_code')
# test2: True when 'k:cur_code' differs from DEF_CUR_CODE (including absent).
test2 = lambda item: item.get('k:cur_code') != DEF_CUR_CODE
# test3: True when 'k:cur_code' equals DEF_CUR_CODE.
test3 = lambda item: item.get('k:cur_code') == DEF_CUR_CODE
# test4: True when 'k:job_type' is anything other than 'hourly'.
test4 = lambda item: item.get('k:job_type') != 'hourly'

# Three 'fetchdata' sources, one per kazeeki_N.json file, each with path
# 'items'.
sources = [
    {'url': get_path('kazeeki_1.json'), 'type': 'fetchdata', 'path': 'items'},
    {'url': get_path('kazeeki_2.json'), 'type': 'fetchdata', 'path': 'items'},
    {'url': get_path('kazeeki_3.json'), 'type': 'fetchdata', 'path': 'items'},
]

示例#9
0
    >>> set(item).issuperset(intersection)
    True
    >>> item['title'][:24] == 'This Is What A Celebrity'
    True
    >>> item['link'][:23] == 'http://feeds.gawker.com'
    True
"""
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from riko import get_path
from riko.bado import coroutine
from riko.collections import SyncPipe, AsyncPipe

# Settings shared by the flows built in pipe() below.

# strreplace rule replacing each newline with one space.
replace_conf = {'rule': {'find': '\n', 'replace': ' '}}
# Input locations resolved through get_path.
health = get_path('health.xml')
caltrain = get_path('caltrain.html')
# Value of the 'start' setting in fetch_conf: the page's opening <body> tag.
start = '<body id="thebody" class="Level2">'
# conf handed to the 'fetchpage' pipe.
# NOTE(review): the effect of 'start', 'end' and 'detag' is defined by riko's
# fetchpage pipe -- confirm there.
fetch_conf = {'url': caltrain, 'start': start, 'end': '</body>', 'detag': True}


def pipe(test=False):
    s1 = SyncPipe('fetch', test=test, conf={'url': health}).output
    s2 = (SyncPipe('fetchpage', test=test,
                   conf=fetch_conf).strreplace(conf=replace_conf,
                                               assign='content').tokenizer(
                                                   conf={
                                                       'delimiter': ' '
                                                   },
                                                   emit=True).count().output)