Example #1
    def mapper(self, empty, public_url):
        '''
        Takes as input a public URL to a TREC KBA 2012 chunk file,
        which it then loads, decrypts, uncompresses, and deserializes,
        so that it can count the number of NER tokens.

        This emits keys equal to the subcorpus name ('news',
        'linking', or 'social') and values that are two-tuples of
        integers: the first is the number of NER tokens, and the
        second is the number of sentences as tokenized by Stanford
        NER.
        '''

        subcorpus_name = None
        num_ner_tokens = 0
        num_ner_sentences = 0

        try:
            ## fetch the file to a local tempfile
            kba_corpus.log('fetching %r' % public_url)
            data = urllib.urlopen(public_url.strip()).read()

            ## shell out to gpg and xz to get the thrift
            thrift_data = kba_corpus.decrypt_and_uncompress(
                data, 'kba_corpus.tar.gz/trec-kba-rsa.secret-key')

            ## iterate over all the docs in this chunk
            for stream_item in kba_corpus.stream_items(thrift_data):
                ## this should be the same every time, could assert
                subcorpus_name = stream_item.source

                ## for fun, keep counters on how many docs have NER or not
                if not (stream_item.body.ner or stream_item.anchor.ner
                        or stream_item.title.ner):
                    self.increment_counter('SubcorpusCounter', 'no-NER', 1)
                else:
                    self.increment_counter('SubcorpusCounter', 'hasNER', 1)

                ## tell hadoop we are still alive
                self.increment_counter('SubcorpusCounter',
                                       'StreamItemsProcessed', 1)

                ## iterate over sentences to generate the two counts
                for content in ['body', 'anchor', 'title']:
                    for sentence in kba_corpus.sentences(stream_item,
                                                         content=content):
                        num_ner_tokens += len(sentence)
                        num_ner_sentences += 1

        except Exception as exc:
            ## oops, log verbosely, including with counters (maybe too clever)
            kba_corpus.log(traceback.format_exc())
            key = 'FAILED-%s' % re.sub(r'\s+', '-', str(exc))
            ## could emit this, but that would pollute the output
            # yield key, public_url
            self.increment_counter('SubcorpusCounter', key, 1)

        ## emit the per-subcorpus totals promised in the docstring; skip
        ## chunks that failed before any stream item was read
        if subcorpus_name is not None:
            yield subcorpus_name, (num_ner_tokens, num_ner_sentences)
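
The comment in the mapper says the chunk is decrypted and uncompressed by shelling out to gpg and xz. The listing does not include that helper, so the following is only a rough sketch of the idea; the function name, the exact gpg options, and the key handling are assumptions, not the actual kba_corpus implementation.

import subprocess
import tempfile

def decrypt_and_uncompress_sketch(data, secret_key_path):
    '''
    Hypothetical stand-in for kba_corpus.decrypt_and_uncompress: pipe the
    encrypted chunk bytes through gpg and xz and return raw thrift bytes.
    '''
    ## import the TREC KBA secret key into a throwaway GnuPG home directory
    gpg_home = tempfile.mkdtemp()
    subprocess.check_call(
        ['gpg', '--homedir', gpg_home, '--quiet', '--batch', '--no-tty',
         '--import', secret_key_path])

    ## decrypt the chunk; the payload underneath is an xz stream
    gpg = subprocess.Popen(
        ['gpg', '--homedir', gpg_home, '--quiet', '--batch', '--no-tty',
         '--decrypt'],
        stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    xz_compressed, _ = gpg.communicate(data)

    ## uncompress to get the serialized thrift records
    xz = subprocess.Popen(
        ['xz', '--decompress', '--stdout'],
        stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    thrift_data, _ = xz.communicate(xz_compressed)
    return thrift_data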
Example #2
    def reducer(self, source, counts):
        '''
        Sums up all the counts for a given source
        '''
        num_ner_tokens = 0
        num_ner_sentences = 0
        kba_corpus.log('reading counts for %r' % source)
        self.increment_counter('SubcorpusCounter', 'ReducerLaunched', 1)
        for count_pair in counts:
            num_ner_tokens += count_pair[0]
            num_ner_sentences += count_pair[1]
            self.increment_counter('SubcorpusCounter', 'CountPairRead', 1)
        yield source, (num_ner_tokens, num_ner_sentences)
        self.increment_counter('SkippingTaskCounters',
                               'ReduceProcessedRecords', 1)
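
Both methods read as if they belong to an mrjob MRJob subclass: the increment_counter(group, counter, amount) calls and the generator-style mapper/reducer match that library's API, though the listing never shows the enclosing class. Below is a minimal sketch, under that assumption, of how the two examples could be wired into a runnable job; the class name, file name, and command line are invented for illustration, and the method bodies are the ones shown in Examples #1 and #2.

from mrjob.job import MRJob


class NERStatsJob(MRJob):
    '''
    Count Stanford NER tokens and sentences per TREC KBA 2012 subcorpus.
    '''

    def mapper(self, empty, public_url):
        ## body as in Example #1: fetch the chunk URL, decrypt and
        ## uncompress it, then yield (subcorpus_name, (tokens, sentences))
        pass

    def reducer(self, source, counts):
        ## body as in Example #2: sum the per-chunk count pairs
        pass


if __name__ == '__main__':
    ## the input file would hold one public chunk URL per line, e.g.
    ##     python ner_stats_job.py -r hadoop chunk-urls.txt
    NERStatsJob.run()

With mrjob's default JSON protocols, each output line would look roughly like "news"<TAB>[num_ner_tokens, num_ner_sentences], one line per subcorpus.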