Example #1
def tokenize(outfile, paths, base_path, no_shuffle, tokenizer_type,
             tokenizer_pickle, doc_id_level, n_jobs, chunksize):
    """
    To be written later if a module interface is needed. See _cli for the documentation.
    """
    assert (paths == []) or (base_path is None)

    if base_path:
        paths = filefilter.get_paths(base_path, file_type='*', get_iter=True)
        if no_shuffle is False:
            paths = list(paths)
            shuffle(paths)

    if tokenizer_pickle is not None:
        tokenizer = SaveLoad.load(tokenizer_pickle)
    else:
        tokenizer_dict = {'basic': text_processors.TokenizerBasic}
        tokenizer = tokenizer_dict[tokenizer_type]()

    formatter = text_processors.VWFormatter()

    func = partial(_tokenize_one, tokenizer, formatter, doc_id_level)

    results_iterator = imap_easy(func, paths, n_jobs, chunksize)

    for result in results_iterator:
        outfile.write(result + '\n')
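
Example #1 freezes the fixed arguments with functools.partial and hands the resulting one-argument callable, an iterable of paths, n_jobs and chunksize to imap_easy. The sketch below isolates just that pattern; it assumes imap_easy is importable from rosetta.parallel.parallel_easy (the module the test examples further down use), and tag_path is a hypothetical stand-in for _tokenize_one.

from functools import partial

from rosetta.parallel.parallel_easy import imap_easy


def tag_path(prefix, path):
    # Hypothetical worker: imap_easy calls it with one item from the iterable,
    # passed after the arguments frozen by partial.
    return prefix + path.upper()


if __name__ == '__main__':
    func = partial(tag_path, 'doc: ')
    # Same positional call style as above: (func, iterable, n_jobs, chunksize).
    for result in imap_easy(func, ['a.txt', 'b.txt'], 2, 1):
        print(result)
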
Example #2
def countReadsPerFragment(fragmentCount, lookup_structure, options, args, triangular=True):
    '''
        slurps in all input files in parallel
        counts the reads per fragment and generates appropriate output files
    '''

    if options.verbose:
        print("- %s STARTED : reading input files : %s" % (timeStamp(), str(args)), file=sys.stdout)
        print("    FragmentCount: %d" % fragmentCount, file=sys.stdout)

    fragmentList = np.zeros((fragmentCount,), dtype=np.uint16)
    fragmentPairs = None
    func = partial(countReadsPerFragmentParallel, fragmentCount=fragmentCount, lookup_structure=lookup_structure, triangular=triangular, options=options)
    results_iterator = imap_easy(func, args, n_jobs=min(8, len(args)), chunksize=1)

    if options.verbose:
        print("- %s FINISHED: reading input files " % timeStamp(), file=sys.stdout)
        print("- %s STARTED : combining input files " % timeStamp(), file=sys.stdout)


    # combine
    for (fl,fp) in results_iterator:
        fragmentList += fl

        if fragmentPairs is not None:
            fragmentPairs += fp
        else:
            fragmentPairs = fp

    if options.verbose:
        print("- %s FINISHED: combining input files " % timeStamp(), file=sys.stdout)


    return [ fragmentList, fragmentPairs ]
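
The loop at the end is a plain reduce over the worker results: each worker returns a (counts, pairs) tuple, the count arrays are summed element-wise, and the pair matrix is initialised from the first result and accumulated afterwards. A minimal sketch of that combine step with simulated worker output (no parallelism involved):

import numpy as np

# Simulated per-worker results: (fragment counts, fragment-pair matrix).
worker_results = [
    (np.array([1, 0, 2], dtype=np.uint16), np.eye(3, dtype=np.uint16)),
    (np.array([0, 1, 1], dtype=np.uint16), np.eye(3, dtype=np.uint16)),
]

fragment_list = np.zeros((3,), dtype=np.uint16)
fragment_pairs = None
for fl, fp in worker_results:
    fragment_list += fl
    fragment_pairs = fp if fragment_pairs is None else fragment_pairs + fp

print(fragment_list)         # [1 1 3]
print(fragment_pairs.sum())  # 6
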
Example #3
def tokenize(
    outfile, paths, base_path, no_shuffle, tokenizer_type, tokenizer_pickle,
    doc_id_level, n_jobs, chunksize):
    """
    To be written later if a module interface is needed. See _cli for the documentation.
    """
    assert (paths == []) or (base_path is None)

    if base_path:
        paths = filefilter.get_paths(base_path, file_type='*', get_iter=True)
        if no_shuffle is False:
            paths = list(paths)
            shuffle(paths)

    if tokenizer_pickle is not None:
        tokenizer = SaveLoad.load(tokenizer_pickle)
    else:
        tokenizer_dict = {'basic': text_processors.TokenizerBasic}
        tokenizer = tokenizer_dict[tokenizer_type]()

    formatter = text_processors.VWFormatter()

    func = partial(_tokenize_one, tokenizer, formatter, doc_id_level)

    results_iterator = imap_easy(func, paths, n_jobs, chunksize)

    for result in results_iterator:
        outfile.write(result + '\n')
Example #4
    def to_vw(self, outfile, n_jobs=-1, chunksize=1000, raise_on_bad_id=True,
            cache_list=None, cache_list_file=None):
        """
        Write our filestream to a VW (Vowpal Wabbit) formatted file.

        Parameters
        ----------
        outfile : filepath or buffer
        n_jobs : Integer
            Use n_jobs different jobs to do the processing.  Set = 4 for 4
            jobs.  Set = -1 to use all available, -2 for all except 1,...
        chunksize : Integer
            Workers process this many jobs at once before pickling and sending
            results to master.  If this is too low, communication overhead
            will dominate.  If this is too high, jobs will not be distributed
            evenly.
        cache_list : List of strings
            Write these info_stream items to file on every iteration.
        cache_list_file : filepath or buffer
            File to which the cached info_stream items are written.
        """
        formatter = text_processors.VWFormatter()
        func = partial(_to_sstr, formatter=formatter,
                raise_on_bad_id=raise_on_bad_id, cache_list=cache_list)
        results_iterator = imap_easy(func, self.info_stream(), n_jobs, chunksize)
        if cache_list_file:
            with smart_open(outfile, 'w') as open_outfile, \
                    smart_open(cache_list_file, 'w') as open_cache_file:
                for result, cache_list in results_iterator:
                    open_outfile.write(result + '\n')
                    open_cache_file.write(str(cache_list) + '\n')
        else:
            with smart_open(outfile, 'w') as open_outfile:
                for result, cache_list in results_iterator:
                    open_outfile.write(result + '\n')
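
The if/else around the with-statements exists only to keep two output streams in lockstep: every (result, cached) pair from the iterator goes to the VW file and, when a cache file was requested, the cached items go to the second file. A compact sketch of that dual-write pattern, with in-memory buffers standing in for the smart_open handles and hard-coded pairs standing in for the worker results:

import io

pairs = [('vw line one', 'id_1'), ('vw line two', 'id_2')]

with io.StringIO() as open_outfile, io.StringIO() as open_cache_file:
    for result, cached in pairs:
        open_outfile.write(result + '\n')
        open_cache_file.write(str(cached) + '\n')
    print(open_outfile.getvalue())
    print(open_cache_file.getvalue())
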
Example #5
def countReadsPerFragment(fragmentCount,
                          lookup_structure,
                          options,
                          args,
                          triangular=True):
    '''
        slurps in all input files in parallel
        counts the reads per fragment and generates appropriate output files
    '''

    if options.verbose:
        print("- %s STARTED : reading input files : %s" %
              (timeStamp(), str(args)), file=sys.stdout)
        print("    FragmentCount: %d" % fragmentCount, file=sys.stdout)

    fragmentList = np.zeros((fragmentCount, ), dtype=np.uint16)
    fragmentPairs = None
    func = partial(countReadsPerFragmentParallel,
                   fragmentCount=fragmentCount,
                   lookup_structure=lookup_structure,
                   triangular=triangular,
                   options=options)
    results_iterator = imap_easy(func,
                                 args,
                                 n_jobs=min(8, len(args)),
                                 chunksize=1)

    if options.verbose:
        print("- %s FINISHED: reading input files " % timeStamp(),
              file=sys.stdout)
        print("- %s STARTED : combining input files " % timeStamp(),
              file=sys.stdout)

    # combine
    for (fl, fp) in results_iterator:
        fragmentList += fl

        if fragmentPairs is not None:
            fragmentPairs += fp
        else:
            fragmentPairs = fp

    if options.verbose:
        print("- %s FINISHED: combining input files " % timeStamp(),
              file=sys.stdout)

    return [fragmentList, fragmentPairs]
Example #6
    def to_vw(self, outfile, n_jobs=1, chunksize=1000, raise_on_bad_id=True):
        """
        Write our filestream to a VW (Vowpal Wabbit) formatted file.

        Parameters
        ----------
        outfile : filepath or buffer
        n_jobs : Integer
            Use n_jobs different jobs to do the processing.  Set = 4 for 4
            jobs.  Set = -1 to use all available, -2 for all except 1,...
        chunksize : Integer
            Workers process this many jobs at once before pickling and sending
            results to master.  If this is too low, communication overhead
            will dominate.  If this is too high, jobs will not be distributed
            evenly.
        raise_on_bad_id : Boolean
            If True, raise DocIDError when the doc_id (formed by self) is not
            a valid VW "Tag".  I.e. contains :, |, ', or whitespace.
            If False, print warning.
        """
        # Note:  This is similar to rosetta/cmd/files_to_vw.py
        # This implementation is more complicated, due to the fact that a
        # streamer specifies the method to extract doc_id from a stream.
        # To be faithful to the streamer, we must therefore use the streamer
        # to stream the files.  This requires a combination of imap_easy and
        # a chunker.
        #
        # Create an iterator over chunks of paths
        path_group_iter = common.grouper(self.paths, chunksize)

        formatter = text_processors.VWFormatter()

        func = partial(_group_to_sstr, self, formatter, raise_on_bad_id)
        # Process one group at a time...set imap_easy chunksize arg to 1
        # since each group contains many paths.
        results_iterator = imap_easy(func, path_group_iter, n_jobs, 1)

        with smart_open(outfile, 'w') as open_outfile:
            for group_results in results_iterator:
                for sstr in group_results:
                    open_outfile.write(sstr + '\n')
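
The design point flagged in the comments is the chunking: common.grouper batches chunksize paths into one group, each group becomes a single imap_easy task, and imap_easy's own chunksize is left at 1. A rough itertools-based stand-in for such a grouper is sketched below; the real common.grouper may differ in details such as how it handles the final, shorter group.

from itertools import islice


def grouper(iterable, group_size):
    # Yield lists of up to group_size items until the iterable is exhausted.
    iterator = iter(iterable)
    while True:
        group = list(islice(iterator, group_size))
        if not group:
            return
        yield group


print(list(grouper(['a', 'b', 'c', 'd', 'e'], 2)))
# [['a', 'b'], ['c', 'd'], ['e']]
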
Example #7
    def test_imap_easy_3job(self):
        result_iterator = parallel_easy.imap_easy(abfunc, self.numbers, 3, 1)
        result = []
        for number in result_iterator:
            result.append(number)
        self.assertEqual(result, self.benchmark)
Example #8
    def test_imap_easy_3job_lambda(self):
        result_iterator = parallel_easy.imap_easy(sqlambda, self.numbers, 3, 1, use_pathos=True)
        result = []
        for number in result_iterator:
            result.append(number)
        self.assertEqual(result, self.sqbenchmark)
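
The only difference from the previous test is use_pathos=True, which is what allows a lambda to be shipped to the workers: the standard library pickler refuses lambdas, whereas pathos serialises callables with dill. The snippet below demonstrates that difference directly; it assumes dill is installed (it comes in as a pathos dependency).

import pickle

import dill  # installed alongside pathos

square = lambda x: x * x

try:
    pickle.dumps(square)
except pickle.PicklingError as exc:
    print('pickle refused the lambda:', exc)

# dill round-trips the same lambda without complaint.
restored = dill.loads(dill.dumps(square))
print(restored(4))  # 16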