Exemplo n.º 1
0
 def test_do_not_know_why_this_should_fail(self):
     """Round-trip tab-separated Pig lines: a bag of (int, chararray)
     tuples followed by a chararray field, including empty-field cases."""
     from midas.pig_schema import make_parser_from_schema
     schema = '(b: bag{tuple(i: int, s: chararray)}, s: chararray)'
     parse = make_parser_from_schema(schema)
     # (input line, expected parsed value) pairs
     cases = [
         ('{(1,braz),(2,iale)}\tfoo\n', ([(1, 'braz'), (2, 'iale')], 'foo')),
         ('{(3,froz),(4,ae)}\tbar\n', ([(3, 'froz'), (4, 'ae')], 'bar')),
         ('{(8,boz)}\t\n', ([(8, 'boz')], None)),
         ('\t\n', ([], None)),
     ]
     for line, expected in cases:
         self.assertEqual(parse(line), expected)
Exemplo n.º 2
0
# -*- coding: utf-8 -*-

import shelve
import random

import pandas

from midas.scripts import MDCommand
from midas.pig_schema import make_parser_from_schema


# Line parser built from a Pig schema: each input line yields a
# (site, ranking) pair where `ranking` is a bag of (tstamp, rank)
# tuples. Presumably consumes tab-separated Pig output lines, as in
# the make_parser_from_schema usage elsewhere — verify against input data.
PARSER = make_parser_from_schema(
    '(site: chararray, ranking: bag{(tstamp: chararray, rank: int)})'
    )

class GenerateNegativeSamples(MDCommand):
    """ 
    This script ideally creates one sample for each company with an
    associated sites that has restrictions stored in the
    shelve. Hence, you first have to merge/split the original files
    containing the data (this should be `sites_wo_company`) into the
    right amount of splits, e.g. if your data is located in files
    /data/sites_wo_company/part-*

    DATA=/data/sites_wo_company/part-*
    split_size=$(( $(cat ${DATA} | wc -l) / 10 + 1 ))
    cat ${DATA} | split -l ${split_size} - splitted_

    This will generate files with the naming scheme `splitted_aa`,
    `splitted_ab`, etc. in your current working directory. Now start
    this very script once per file: