def test_do_not_know_why_this_should_fail(self):
    """Parse tab-separated lines against a bag-of-tuples + chararray schema.

    Covers a multi-element bag, a single-element bag with an empty
    trailing field, and a completely empty line.
    """
    from midas.pig_schema import make_parser_from_schema
    schema = '(b: bag{tuple(i: int, s: chararray)}, s: chararray)'
    parse = make_parser_from_schema(schema)
    # (input line, expected parsed value) pairs
    cases = [
        ('{(1,braz),(2,iale)}\tfoo\n', ([(1, 'braz'), (2, 'iale')], 'foo')),
        ('{(3,froz),(4,ae)}\tbar\n', ([(3, 'froz'), (4, 'ae')], 'bar')),
        # empty second field should come back as None
        ('{(8,boz)}\t\n', ([(8, 'boz')], None)),
        # empty bag and empty field
        ('\t\n', ([], None)),
    ]
    for line, expected in cases:
        self.assertEqual(parse(line), expected)
# -*- coding: utf-8 -*- import shelve import random import pandas from midas.scripts import MDCommand from midas.pig_schema import make_parser_from_schema PARSER = make_parser_from_schema( '(site: chararray, ranking: bag{(tstamp: chararray, rank: int)})' ) class GenerateNegativeSamples(MDCommand): """ This script ideally creates one sample for each company with an associated sites that has restrictions stored in the shelve. Hence, you first have to merge/split the original files containing the data (this should be `sites_wo_company`) into the right amount of splits, e.g. if your data is located in files /data/sites_wo_company/part-* DATA=/data/sites_wo_company/part-* split_size=$(( $(cat ${DATA} | wc -l) / 10 + 1 )) cat ${DATA} | split -l ${split_size} - splitted_ This will generate files with the naming scheme `splitted_aa`, `splitted_ab`, etc. in your current working directory. Now start this very script once per file: