def extract_sopr(options):
    if not os.path.exists(ORIG_DIR):
        mkdir_p(ORIG_DIR)

    if options.get('loglevel', None):
        log.setLevel(options['loglevel'])

    cache_paths = glob(os.path.join(CACHE_DIR, 'sopr/*/*/*.zip'))
    log.debug("cache paths ({num}):".format(num=len(cache_paths)) +
              "\n\t".join(cache_paths))

    extracted = cache_paths >> filter(lambda x: check_ext(x, ext='.zip')) \
                            >> map(lambda p: translate_dir(p,
                                                           from_dir=CACHE_DIR,
                                                           to_dir=ORIG_DIR)) \
                            >> ThreadPool(extract_all_zips)

    for path, destination_dir, num_files in extracted:
        log.info("successfully extracted " +
                 "{path} to {dest_dir} ({num} files)".format(
                    path=path, dest_dir=destination_dir, num=num_files))

    for url, exception in extracted.failure:
        log.error("extracting from {url} failed: {exception}".format(
            url=url, exception=exception))
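
The example above chains the stream package's >> operator: filter and map lazily transform the cached archive paths, and ThreadPool fans the work out to extract_all_zips in worker threads. Below is a minimal, self-contained sketch of the same pattern, assuming the imports the snippets here rely on (filter, map and ThreadPool from stream); double_all is a made-up stand-in worker that, like extract_all_zips, takes an iterable of inputs and yields results.

from stream import filter, map, ThreadPool

def double_all(iterable):
    # stand-in worker: ThreadPool hands each thread a portion of the input stream
    for x in iterable:
        yield x * 2

doubled = ([1, 2, 3, 4, 5]
           >> filter(lambda x: x % 2 == 1)        # keep the odd values
           >> map(lambda x: x + 10)               # shift them
           >> ThreadPool(double_all, poolsize=2)  # run double_all in 2 threads
           >> set)                                # pool output order is not guaranteed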
Example #2
def confirm_download_schedule(schedule):
    """Reports the total number of bytes and total number of files
    to download. Also lists the inaccessible files (based on HEAD
    response). Then asks user to confirm downloading.
    """
    def content_length(tpl):
        return tpl[2][1]

    def status_code(tpl):
        return tpl[2][0]

    def href(tpl):
        return tpl[0]

    def is_OK(tpl):
        return status_code(tpl) == 200

    def not_OK(tpl):
        return status_code(tpl) != 200

    increment = lambda x, _: x + 1
    file_count = (
        schedule >> stream.filter(is_OK) >> stream.reduce(increment, 0))

    bytes_to_download = (
        schedule >> stream.filter(is_OK) >> stream.map(content_length) >> sum)

    inaccessible_files = (schedule >> stream.filter(not_OK) >> list)

    if len(inaccessible_files) > 0:
        print
        print "Some files are inaccessible:"
        for (idx, sched) in enumerate(inaccessible_files):
            print "%d: %d %s" % (idx, status_code(sched), href(sched))

    if bytes_to_download > 0:
        print
        print "Need to download %s in %d files." % (
            pretty_bytes(bytes_to_download), file_count)
        print
        print "Are you sure you want to continue? [Y/n]"
        user_input = raw_input("> ")
        return (user_input.upper() in ("", "Y", "YES"))
    else:
        print
        print "Nothing to download."
        return False
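
The small accessors above pin down the shape this function expects for each schedule entry: index 0 is the URL, index 2 is a (status_code, content_length) pair taken from the HEAD response, and index 1 is never read here. A hypothetical entry, purely to illustrate that layout:

# hypothetical schedule entry matching the accessors above
#   tpl[0]    -> href
#   tpl[1]    -> not read by confirm_download_schedule
#   tpl[2][0] -> HTTP status code of the HEAD response
#   tpl[2][1] -> Content-Length in bytes
entry = ("http://example.com/archive.zip", None, (200, 1048576))
# confirm_download_schedule([entry]) would report roughly 1 MB in 1 file
# and then prompt for confirmation.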
Example #4
    def run(self):
        # pick the sink: the first item for transient runs, otherwise min()
        if self.type == "TRANSIENT":
            sink = item[:1]
        else:
            sink = min

        dataFilter = MovAvg(10)
        nop = map(myPrint1)
        elements = [nop, nop, nop, nop, nop]

        # build the pipeline stages; elements[4] is left as the printing no-op
        elements[0] = map(myPrint1)
        elements[1] = map(lambda x: dataFilter(x))
        elements[2] = filter(lambda x: bigVals(x, 40))
        elements[3] = map(notify_ctrl)

        self.myGen() >> elements[0] >> elements[1] >> elements[2] \
                     >> elements[3] >> elements[4] >> sink
Example #5
            yield math.sqrt(i)
    # static computation, timed as one block
    ts = time()
    evens = instream[::2]
    odds  = instream[1::2]
    evens = map(math.sqrt, evens)
    odds  = map(math.sqrt, odds)
    even_ans = scan(ops.add, evens)
    odd_ans  = reduce(ops.add, odds )
    static_time = time() - ts

    # streaming computation

    # create our filters
    cong_2 = lambda x: x % 2 == 0
    evens = filter(cong_2)
    odds  = filter(lambda x: not cong_2(x))
    ts = time()
    # wire the split into the filters
    instream >> tee(evens)
    instream >> odds

    # wire up the map and fold (scan/accumulate)
    foldedevens = (evens >> stream.map(math.sqrt) >> fold(ops.add))
    print(time() - ts)
    sqrtodds = odds >> (stream.Processor(my_sqrt))
    print("established the sqrter %f" % (time() - ts))
    foldedodd = sqrtodds >> stream.fold(ops.add)
    print("made odd folder: %f" % (time() - ts))
    # force execution
    soans = foldedodd >> item[-1:]
Example #6
    return values


def randomized(n):
    values = []
    for _ in range(n):
        values.append(randint(-sys.maxint, sys.maxint))
    return values

for v in [10, 100, 1000] >> stream.map(alternating):
    dataset.append(v)

for v in [10, 100, 1000] >> stream.map(randomized):
    dataset.append(v)

func = stream.filter(lambda x: x & 1)

resultset = dataset >> stream.map(lambda s: s >> func >> set) >> list


## Test scenario

def threadpool(i):
    result = dataset[i] >> stream.ThreadPool(func, poolsize=2) >> set
    pprint(result)
    assert result == resultset[i]


def processpool(i):
    result = dataset[i] >> stream.ProcessPool(func, poolsize=2) >> set
    pprint(result)
Example #7
		values.append(-i)
	return values

def randomized(n):
	values = []
	for _ in range(n):
		values.append(randint(-sys.maxint, sys.maxint))
	return values

for v in [10, 100, 1000] >> map(alternating):
	dataset.append(v)

for v in [10, 100, 1000] >> map(randomized):
	dataset.append(v)

func = filter(lambda x: x&1)

resultset = dataset >> map(lambda s: s >> func >> set) >> list


## Test scenario

def threadpool(i):
	result = dataset[i] >> ThreadPool(func, poolsize=2) >> set
	pprint(result)
	assert result == resultset[i]

def processpool(i):
	result = dataset[i] >> ProcessPool(func, poolsize=2) >> set
	pprint(result)
	assert result == resultset[i]
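
Neither this variant nor the previous one shows how the two test functions are driven; presumably a caller simply walks the dataset indices. A minimal sketch under that assumption:

if __name__ == '__main__':
    for i in range(len(dataset)):
        threadpool(i)   # filter dataset[i] through a 2-thread pool and check against resultset
        processpool(i)  # same check through a 2-process pool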
Example #8
    # no argument given: exit
    if len(args) == 0:
        print('Nothing to do. Try -h or --help option.')
        sys.exit(0)

    # main action arguments
    if args[0] in ('-h', '--help'):
        # help
        usage()
        sys.exit(0)
    elif args[0] in ('-r', '--retrieve'):
        # -r / --retrieve: get tweets in json format to STDOUT
        args = args[1:]
        s = None
        if len(args) > 0 and args[0] == 'filter':
            s = stream.filter(track=args[1:], limit=RETRIEVAL_LIMIT)
        elif len(args) > 0 and args[0] == 'news':
            s = stream.filter(follow=list(english_sources_twitter.values()),
                              limit=RETRIEVAL_LIMIT)
        elif len(args) > 1 and args[0] == 'idsfile':
            s = stream.fromIDsFile(args[1])
        else:
            s = stream.filter()

        for tweet in s:
            print(json.dumps(tweet))
        sys.exit(0)
    elif args[0] in ('-a', '--analyze'):
        # -a / --analyze: generate in output directory topics.json, summary.txt and stats.csv
        # exit if too few arguments
        if len(args) < 3: