def test_aligner(tmpdir, request, test_data_dir, third_dir):

    si = make_hyperlink_labeled_test_stream_item(test_data_dir)
    assert len(si.body.clean_visible) > 200
    #for x in si.body.labels['author']:
    #    print x.offsets[OffsetType.BYTES].first, x.offsets[OffsetType.BYTES].value, x.target.target_id
    c_path = str(tmpdir.join('chunk.sc'))
    chunk = streamcorpus.Chunk(c_path, mode='wb')
    chunk.add(si)
    chunk.close()

    lp = lingpipe(
        config={
            'tmp_dir_path': str(tmpdir),
            'exit_code_on_out_of_memory': 1,
            'third_dir_path': third_dir,
            'path_in_third': 'lingpipe-4.10',
            'offset_types': ['BYTES'],
            'offset_debugging': True,
            'cleanup_tmp_files': False,
            'align_labels_by': 'byte_offset_labels',
            'aligner_data': {
                'annotator_id': 'author',
                'tagger_id': 'lingpipe',
            },
        })
    lp.process_path(c_path)
    ## these are only present if cleanup_tmp_files is False
    assert tmpdir.join('chunk.sc-clean_visible.xml').read()
    assert tmpdir.join('chunk.sc-ner.xml').read()
    si = list(streamcorpus.Chunk(c_path))[0]
    assert len(si.body.clean_visible) > 200
    assert len(si.body.sentences['lingpipe']) == 41
Example #2
def worker(args):
    event, event_title, query, hours, rel_dir, c_dir, log_file = args
    msg = sc.StreamItem_v0_2_0
    
    with open(log_file, 'w') as lgf:
        for hour in hours:

            total_docs = 0
            total_rel = 0

            hdir = os.path.join(c_dir, hour)
            opath = str(os.path.join(rel_dir, '{}.sc.gz'.format(hour)))
            if not os.path.exists(hdir):
                continue
            if os.path.exists(opath):
                os.remove(opath)
            print hdir        

            ochunk = sc.Chunk(path=opath, message=msg, mode='wb')
            for cname in os.listdir(hdir):
                path = str(os.path.join(hdir, cname))
                for si in sc.Chunk(path=path, message=msg):
                    total_docs += 1            
                    if si.body.clean_visible is None:
                        continue
                    elif re.search(query, si.body.clean_visible, re.I):
                        total_rel += 1
                        ochunk.add(si)
            ochunk.close()
            lgf.write('{}\t{}\t{}\n'.format(hour, total_rel, total_docs))
            lgf.flush()
Example #3
def worker(args):
    rc_dir, out_dir, hours, event, ad_dir, log_file = args
    vct_pkl = os.path.join(ad_dir, 'article_vectorizer.pkl')
    clf_pkl = os.path.join(ad_dir, 'article_clf.pkl')
    artcl_detect = ArticleDetector(vct_pkl, clf_pkl, event)
    lgf = open(log_file, 'w')

    n_hours = len(hours)
    for h, hour in enumerate(hours, 1):

        n_docs = 0
        n_sents = 0
        n_rel_docs = 0
        n_rel_sents = 0

        #print u'({}/{}) hour: {}'.format(h, n_hours, hour)
        chunk = os.path.join(rc_dir, '{}.sc.gz'.format(hour))
        opath = str(os.path.join(out_dir, '{}.sc.gz'.format(hour)))
        ochunk = sc.Chunk(path=opath, mode='wb')
        try:
            for si_idx, si in enumerate(sc.Chunk(path=chunk)):

                n_docs += 1
                if u'serif' in si.body.sentences:
                    annotator = u'serif'
                elif u'lingpipe' in si.body.sentences:
                    annotator = u'lingpipe'
                else:
                    continue

                n_sents += len(si.body.sentences[annotator])
                sent_idxs = artcl_detect.find_articles(si, annotator)
                n_idxs = len(sent_idxs)
                if n_idxs > 0:
                    n_rel_docs += 1
                    n_rel_sents += n_idxs
                    rel_sents = []
                    for sent_idx in sent_idxs:
                        rel_sents.append(
                            si.body.sentences[annotator][sent_idx])
                    si.body.sentences['article-clf'] = rel_sents

                    ochunk.add(si)

            ochunk.close()
            lgf.write('{}\t{}\t{}\t{}\t{}\n'.format(hour, n_docs, n_sents,
                                                    n_rel_docs, n_rel_sents))
            lgf.flush()
        except IOError, e:
            print str(e)
Example #4
def ids_and_clean_visible_from_streamcorpus_chunk_path(corpus_path):
    '''converts a streamcorpus.Chunk file into the structure that is
    passed by the search engine to find_soft_selectors

    '''
    ch = clean_html(clean_html.default_config)
    cv = clean_visible(clean_visible.default_config)
    ids_and_clean_visible = []
    for si in streamcorpus.Chunk(path=corpus_path):
        if not si.body.clean_visible:
            ## attempt to make clean_visible
            if not si.body.raw:
                logger.critical('no raw content, so skipping: %r', si.abs_url)
                continue
            abs_url = si.abs_url
            si = ch(si, {})
            if not si:
                logger.critical(
                    'failed to make clean_html, so skipping: %r', abs_url)
                continue
            si = cv(si, {})
            if not si or not si.body.clean_visible:
                logger.critical(
                    'failed to make clean_visible, so skipping: %r', abs_url)
                continue
        rec = (si.stream_id, si.body.clean_visible.decode('utf8'), {})
        ids_and_clean_visible.append(rec)
    return ids_and_clean_visible
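A minimal usage sketch for the helper above, assuming a hypothetical chunk path; it only iterates over the (stream_id, clean_visible, metadata) triples that the function returns:

if __name__ == '__main__':
    # hypothetical path to a streamcorpus chunk file on local disk
    corpus_path = '/tmp/example-chunk.sc'
    for stream_id, clean_visible, metadata in \
            ids_and_clean_visible_from_streamcorpus_chunk_path(corpus_path):
        # clean_visible was already decoded from utf8 by the helper above
        print '%s: %d characters of clean_visible' % (stream_id, len(clean_visible))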
Example #5
 def __call__(self, i_str):
     backoff = 0.1
     start_time = time.time()
     tries = 0
     max_retries = int(self.config.get('max_retries', 1))
     last_exc = None
     while tries < max_retries:
         try:
             message = _message_versions[self.config['streamcorpus_version']]
             logger.debug('reading from %r' % i_str)
             chunk = streamcorpus.Chunk(path=i_str, mode='rb', message=message)
             return chunk
         except IOError, exc:
             if exc.errno == errno.ENOENT:
                 logger.critical('File is missing?  Assume is slow NFS, try %d more times',
                                 max_retries - tries)
                 backoff *= 2
                 tries += 1
                 elapsed = time.time() - start_time
                 if elapsed > self.config['max_backoff']:
                     ## give up after five minutes of retries
                     logger.critical('File %r not found after %d retries', i_str, tries)
                     raise
                 time.sleep(backoff)
                 last_exc = exc
             else:
                 logger.critical('failed loading %r', i_str, exc_info=True)
                 raise
 def read_stream_id(fn):
     ids = set()
     for si in streamcorpus.Chunk(path=fn,
                                  mode='rb',
                                  message=streamcorpus.StreamItem_v0_3_0):
         ids.add(si.stream_id)
     return ids
def worker(args):
    msg = sc.StreamItem_v0_2_0
    chunk_dir, hours, counts_dir, pid = args
    nhours = len(hours)
    for i, hour in enumerate(hours, 1):
        hdir = os.path.join(chunk_dir, hour)
        chunks = [os.path.join(hdir, fname) for fname in os.listdir(hdir)]
        ofile = os.path.join(counts_dir, '{}.txt.gz'.format(hour))
        print '{}) {} -- {}/{}'.format(pid, hour, i, nhours)
        print '--> {}'.format(ofile)

        counts = defaultdict(int)
        doc_counts = defaultdict(int)
        for chunk in chunks:
            for si in sc.Chunk(path=chunk, message=msg):
                doc_words = set()
                for sentence in si.body.sentences['lingpipe']:
                    for token in sentence.tokens:
                        t = token.token.decode('utf-8')
                        counts[t] += 1
                        doc_words.add(t)
                for word in doc_words:
                    doc_counts[word] += 1
        with gzip.open(ofile, 'wb') as f:
            for token, count in counts.iteritems():
                doc_count = doc_counts[token]
                f.write(token.encode('utf-8'))
                f.write('\t')
                f.write(str(count))
                f.write('\t')
                f.write(str(doc_count))
                f.write('\n')
Example #8
def worker(args):
    chunk_dir, hours, wcounts_dir, dcounts_dir, log_file = args
    nhours = len(hours)

    with open(log_file, 'w') as lf:
        for i, hour in enumerate(hours, 1):
            hdir = os.path.join(chunk_dir, hour)
            chunks = [os.path.join(hdir, fname) for fname in os.listdir(hdir)]

            wcfile = os.path.join(wcounts_dir, '{}.txt.gz'.format(hour))
            dcfile = os.path.join(dcounts_dir, '{}.txt'.format(hour))

            lf.write('Counting hour {} ({}/{})\n'.format(hour, i, nhours))
            lf.flush()

            if os.path.exists(wcfile) and os.path.exists(dcfile):
                continue

            counts = defaultdict(int)
            doc_counts = defaultdict(int)
            num_docs = 0
            for chunk in chunks:
                for si in sc.Chunk(path=chunk):

                    if 'serif' not in si.body.sentences:
                        continue

                    num_docs += 1

                    doc_words = set()
                    for sentence in si.body.sentences['serif']:
                        for token in sentence.tokens:
                            t = token.token.decode('utf-8')
                            counts[t] += 1
                            doc_words.add(t)

                    for word in doc_words:
                        doc_counts[word] += 1

            if len(counts) == 0:
                lf.write(u'Warning: {} contained no words.\n'.format(chunk))
                lf.flush()
                continue

            # Write doc counts for this hour
            with open(dcfile, 'w') as f:
                f.write(str(num_docs))
                f.flush()

            # Write word counts for this hour
            with gzip.open(wcfile, 'wb') as f:
                for token, count in counts.iteritems():
                    doc_count = doc_counts[token]
                    f.write(token.encode('utf-8'))
                    f.write('\t')
                    f.write(str(count))
                    f.write('\t')
                    f.write(str(doc_count))
                    f.write('\n')
Example #9
 def streamitem_iter(self, event, corpus):
     for hour in event.list_event_hours():
         for chunk_path in self.get_chunks_for_hour(hour, corpus, event):
             with sc.Chunk(path=chunk_path,
                           mode="rb",
                           message=corpus.sc_msg()) as chunk:
                 for si in chunk:
                     yield hour, chunk_path, si
Example #10
def test_protection(test_data_dir):
    with pytest.raises(streamcorpus.VersionMismatchError):  # pylint: disable=E1101
        for si in streamcorpus.Chunk(os.path.join(
                test_data_dir,
                'test/MAINSTREAM_NEWS-15-9d6218f0aa7c9585cda12a10d642a8b3-41600ffca7703f7914102da5256233ce.sc.xz'
        ),
                                     message=streamcorpus.StreamItem):
            pass
Example #11
def worker(args):
    corpus, path = args

    locations = set()
    for si in sc.Chunk(path=path, message=corpus.sc_msg()):
        for sentence in si.body.sentences[u'article-clf']:
            for loc_seq in cuttsum.geo.get_loc_sequences(sentence):
                locations.add(loc_seq)
    return tuple(locations)
Example #12
    def _maybe_run_post_batch_incremental_transforms(self, t_path):
        ## Run post batch incremental (pbi) transform stages.
        ## These exist because certain batch transforms have
        ## to run before certain incremental stages.
        if self.pbi_stages:
            t_path2 = os.path.join(
                self.tmp_dir_path,
                'trec-kba-pipeline-tmp-%s' % str(uuid.uuid1()))
            # open destination for _run_incremental_transforms to write to
            self.t_chunk = streamcorpus.Chunk(path=t_path2, mode='wb')

            input_t_chunk = streamcorpus.Chunk(path=t_path, mode='rb')
            for si in input_t_chunk:
                self._run_incremental_transforms(si, self.pbi_stages)

            self.t_chunk.close()

            os.rename(t_path2, t_path)
Example #13
def test_upgrade_streamcorpus_v0_3_0(test_data_dir):
    up = upgrade_streamcorpus_v0_3_0(config={})
    count = 0

    for si in streamcorpus.Chunk(get_test_chunk_path(test_data_dir),
                                 message=streamcorpus.StreamItem_v0_2_0):
        count += 1
        si3 = up(si)
        assert si3.version == streamcorpus.Versions._NAMES_TO_VALUES['v0_3_0']
        if count > 10:
            break
Example #14
def sentencestring_worker_(job_queue, result_queue, **kwargs):
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    corpus = kwargs.get(u'corpus')
    cnlp = corenlp.server.CoreNLPClient()

    while not job_queue.empty():
        try:

            chunk_path, tsv_path = job_queue.get(block=False)

            sent_string_data = []
            for si in sc.Chunk(path=chunk_path, message=corpus.sc_msg()):

                sentences = corpus.get_sentences(si)
                str2idx = {}
                for idx, sentence in enumerate(sentences):
                    key = stringify_streamcorpus_sentence(sentence)
                    str2idx[key] = idx

                for sentence in si.body.sentences[u'article-clf']:
                    sc_string = stringify_streamcorpus_sentence(sentence)
                    idx = str2idx[sc_string]

                    #print idx, ")", sc_string
                    doc = cnlp.annotate(sc_string)
                    locs = get_loc_sequences(doc)
                    if len(locs) > 0:
                        locs_string = (u','.join(locs)).encode(u'utf-8')
                    else:
                        locs_string = 'nan'
                    cnlp_string = stringify_corenlp_doc(doc)
                    #print cnlp_string
                    sent_string_data.append({
                        u'stream id': si.stream_id,
                        u'sentence id': idx,
                        u'streamcorpus': sc_string,
                        u'corenlp': cnlp_string,
                        u'locations': locs_string
                    })

            if len(sent_string_data) > 0:
                df = pd.DataFrame(sent_string_data,
                                  columns=[
                                      u'stream id', u'sentence id',
                                      u'streamcorpus', u'corenlp', u'locations'
                                  ])

                with gzip.open(tsv_path, u'w') as f:
                    df.to_csv(f, sep='\t', index=False, index_label=False)

            result_queue.put(None)
        except Queue.Empty:
            pass
Example #15
 def streamitem_iter(self, event, corpus, extractor):
     for hour in event.list_event_hours():
         path = self.get_chunk_path(event, extractor, hour, corpus)
         if os.path.exists(path):
             print path
             try: 
                 with sc.Chunk(path=path, mode="rb", 
                         message=corpus.sc_msg()) as chunk:
                     for si in chunk:
                         yield hour, path, si   
             except IOError, msg:
                 print msg
Example #16
    def get_chunk(self, key):
        tries = 0
        while 1:
            fh = StringIO()
            key.get_contents_to_file(fh)
            data = fh.getvalue()
            _errors, data = decrypt_and_uncompress(
                data,
                self.config.get('gpg_decryption_key_path'),
                ## how should this get into the config...?
                tmp_dir=self.config['tmp_dir_path'],
            )
            logger.info('\n'.join(_errors))
            if self.config['input_format'] == 'streamitem' and \
                    self.config['streamcorpus_version'] == 'v0_1_0':
                i_content_md5 = key.key.split('.')[-3]
            else:
                ## go past {sc,protostream}.xz.gpg
                parts = key.key.split('.')
                if parts[-1] == 'gpg':
                    parts.pop()
                i_content_md5 = parts[-3][-32:]

            ## verify the data matches expected md5
            f_content_md5 = hashlib.md5(data).hexdigest()  # pylint: disable=E1101
            if i_content_md5 != f_content_md5:
                msg = 'FAIL(%d): %s --> %s != %s' % (
                    tries, key.key, i_content_md5, f_content_md5)
                logger.critical(msg)
                tries += 1
                if tries > self.config['tries']:
                    ## indicate complete failure to pipeline so it
                    ## gets recorded in task_queue
                    raise FailedExtraction(msg)
                else:
                    continue

            if self.config['input_format'] == 'spinn3r':
                ## convert the data from spinn3r's protostream format
                return _generate_stream_items(data)

            elif self.config['input_format'] == 'streamitem':
                message = _message_versions[
                    self.config['streamcorpus_version']]

                return streamcorpus.Chunk(data=data, message=message)

            else:
                sys.exit('Invalid config: input_format = %r' %
                         self.config['input_format'])
Example #17
    def add(self, si):
        '''puts `si` into the currently open chunk, which it creates if
        necessary.  If this item causes the chunk to cross chunk_max,
        then the chunk is closed after adding.

        '''
        if self.o_chunk is None:
            if os.path.exists(self.t_path):
                os.remove(self.t_path)
            self.o_chunk = streamcorpus.Chunk(self.t_path, mode='wb')
        self.o_chunk.add(si)
        logger.debug('added %d-th item to chunk', len(self.o_chunk))
        if len(self.o_chunk) == self.chunk_max:
            self.close()
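The docstring above describes a rolling-chunk pattern: stream items accumulate in an open chunk until a size cap is reached, at which point the chunk is closed. A minimal standalone sketch of that pattern using streamcorpus.Chunk directly; write_rolling_chunks, path_fmt, and chunk_max are illustrative assumptions rather than part of the class shown here:

def write_rolling_chunks(stream_items, path_fmt, chunk_max=500):
    # path_fmt is a hypothetical template such as '/tmp/rolled-{}.sc'
    o_chunk = None
    chunk_num = 0
    for si in stream_items:
        if o_chunk is None:
            chunk_num += 1
            o_chunk = streamcorpus.Chunk(path_fmt.format(chunk_num), mode='wb')
        o_chunk.add(si)
        # close the current chunk once it reaches the size cap
        if len(o_chunk) >= chunk_max:
            o_chunk.close()
            o_chunk = None
    # close any partially filled final chunk
    if o_chunk is not None:
        o_chunk.close()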
Example #18
        def next_chunk_file(chunk_file_num):
            deduped_path_fmt = self.get_deduped_path_fmt(
                event, corpus, extractor, threshold=thresh)
            deduped_path = deduped_path_fmt.format(
                chunk_file_num)
            deduped_dir = os.path.dirname(deduped_path)
            if not os.path.exists(deduped_dir):
                os.makedirs(deduped_dir)
            
            if os.path.exists(deduped_path):
                os.remove(deduped_path)

            return sc.Chunk(path=deduped_path, mode="wb", 
                message=corpus.sc_msg())
Example #19
def _article_resource_worker(job_queue, result_queue, **kwargs):
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    event = kwargs.get(u'event')
    corpus = kwargs.get(u'corpus')
    while not job_queue.empty():
        try:
            opath, chunk_paths = job_queue.get(block=False)
            artcl_detect = ArticleDetector(event)
            patt = event.regex_pattern()
            with sc.Chunk(path=opath, mode='wb', message=corpus.sc_msg()) as ochunk:
                for path in chunk_paths:
                    for si in sc.Chunk(path=path, message=corpus.sc_msg()):
                        if si.body.clean_visible is None:
                            continue
                        
                        elif patt.search(si.body.clean_visible):
                            
                            #if corpus.annotator() not in si.body.sentences:
                            #    continue
                            sentences = corpus.get_sentences(si)
                            sent_idxs = artcl_detect.find_articles(
                                sentences)
                            if len(sent_idxs) > 0:
                                rel_sents = []
                                for sent_idx in sent_idxs:
                                    #for token in sentences[sent_idx].tokens:
                                    #    print token.token,
                                    #print
                                    rel_sents.append(sentences[sent_idx])
                                si.body.sentences[u'article-clf'] = rel_sents
                                ochunk.add(si)


            result_queue.put(None)
        except Queue.Empty:
            pass
Example #20
    def streamitem_iter(self, event, corpus, extractor, threshold=.8):
        df = self.get_stats_df(
            event, corpus, extractor, threshold)
        if df is None: return

        import math
        num_chunks = int(math.ceil(len(df) / 1000.))
        tmp = self.get_deduped_path_fmt(
            event, corpus, extractor, threshold)
        for i in xrange(1, num_chunks + 1):
            path = tmp.format(i)
            if os.path.exists(path): 
                with sc.Chunk(path=path, mode="rb", 
                        message=corpus.sc_msg()) as chunk:
                    for si in chunk:
                        yield si
Example #21
def _idf_resource_worker(job_queue, result_queue, **kwargs):

    signal.signal(signal.SIGINT, signal.SIG_IGN)
    corpus = kwargs.get(u'corpus')

    while not job_queue.empty():
        try:

            mpath, paths = job_queue.get(block=False)
            n_docs = 0
            counts = defaultdict(int)
            for path in paths:
                for si in sc.Chunk(path=path,
                                   mode='rb',
                                   message=corpus.sc_msg()):

                    sentences = corpus.get_sentences(si)
                    if len(sentences) == 0:
                        continue

                    n_docs += 1
                    unique_words = set()
                    for sentence in sentences:
                        for token in sentence.tokens:
                            unique_words.add(
                                token.token.decode(u'utf-8').lower())
                    for word in unique_words:
                        counts[word] += 1
            n_docs = float(n_docs)
            words = counts.keys()

            idfs = [
                tuple([np.log(n_docs / value) + 1, value])
                for value in counts.values()
            ]

            trie = marisa_trie.RecordTrie("<dd", zip(words, idfs))
            with gzip.open(mpath, u'wb') as f:
                trie.write(f)

            result_queue.put(None)
        except Queue.Empty:
            pass
Example #22
def test_kvlayer_index_with_source(configurator, test_data_dir):
    overlay = {
        'streamcorpus_pipeline': {
            'to_kvlayer': {
                'indexes': ['with_source'],
            },
        },
    }
    with chunks(configurator, test_data_dir, overlay) as (path, client):
        # We should not have written the doc_id_epoch_ticks index at all
        for k, v in client.scan('stream_items_doc_id_epoch_ticks'):
            assert False, 'epoch_ticks present! k={!r}'.format(k)
        # Every item in the ...with_source index should match a real item
        for k, v in client.scan('stream_items_with_source'):
            assert v == 'WEBLOG'  # by inspection
            for kk, sixz in client.get('stream_items', k):
                errs, sibytes = streamcorpus.decrypt_and_uncompress(sixz)
                assert errs == []
                for si in streamcorpus.Chunk(data=sibytes):
                    assert si.source == v
Example #23
def test_upgrade_streamcorpus_v0_3_0_check_mention_ids(test_data_dir):
    up = upgrade_streamcorpus_v0_3_0(config={})
    all_mention_ids = set()
    for si in streamcorpus.Chunk(os.path.join(
            test_data_dir,
            'test/MAINSTREAM_NEWS-15-9d6218f0aa7c9585cda12a10d642a8b3-41600ffca7703f7914102da5256233ce.sc.xz'
    ),
                                 message=streamcorpus.StreamItem_v0_2_0):
        si3 = up(si)
        assert si3.version == streamcorpus.Versions._NAMES_TO_VALUES['v0_3_0']
        mention_ids = set()
        for sentence in si3.body.sentences['lingpipe']:
            sentence_mention_ids = set()
            for token in sentence.tokens:
                if token.mention_id not in [None, -1]:
                    sentence_mention_ids.add(token.mention_id)

            assert mention_ids.intersection(sentence_mention_ids) == set()
            mention_ids.update(sentence_mention_ids)
            all_mention_ids.update(sentence_mention_ids)
    assert len(all_mention_ids) > 0
Example #24
        def keys_and_values():
            for si in streamcorpus.Chunk(t_path):
                key1 = uuid.UUID(int=si.stream_time.epoch_ticks)
                key2 = uuid.UUID(hex=si.doc_id)
                data = streamcorpus.serialize(si)
                errors, data = streamcorpus.compress_and_encrypt(data)
                assert not errors, errors

                yield (key1, key2), data

                for ndx in indexes:
                    if ndx == 'doc_id_epoch_ticks':
                        kvp = ((key2, key1), r'')
                    elif ndx == 'with_source':
                        ## si.source can be None but we can't write None blobs to kvlayer
                        if si.source:
                            kvp = ((key1, key2), si.source)
                        else:
                            continue
                    else:
                        assert False, ('invalid index type ' + ndx)
                    indexes[ndx].append(kvp)
Example #25
	def __init__(self, filenames, **kwargs):
		super(StreamCorpusDataset, self).__init__(kwargs)

		filenames = getfiles(filenames)

		for filename in filenames:
			for si in sc.Chunk(path=filename):
				if si.body.clean_visible == None:
					continue

				did = si.stream_id

				try:
					sentences = si.body.sentences["serif"]
				except KeyError:
					sentences = si.body.sentences["lingpipe"]

				for sind, sentence in enumerate(sentences):
					sid = make_sid(did, sind)
					self.add_sentence(sid, sentence)

		self.build_dictionary()
Example #26
def test_kvlayer_reader_and_writer(configurator, test_data_dir):
    with chunks(configurator, test_data_dir) as (path, client):
        ## check that index table was created
        all_doc_ids = set()
        all_epoch_ticks = set()
        for (doc_id, epoch_ticks
             ), empty_data in client.scan('stream_items_doc_id_epoch_ticks'):
            all_doc_ids.add(doc_id)
            all_epoch_ticks.add(epoch_ticks)
        all_doc_ids = sorted(all_doc_ids)
        all_epoch_ticks = sorted(all_epoch_ticks)
        logger.info('%d doc_ids', len(all_doc_ids))

        ## make a reader
        config = yakonfig.get_global_config('streamcorpus_pipeline',
                                            'from_kvlayer')
        reader = from_kvlayer(config)

        ## test it with different i_str inputs:
        for i_str in [
                '',
                '0,,%d,' % 10**10,
                '%d,%s,%d,%s' % (all_epoch_ticks[0], all_doc_ids[0],
                                 all_epoch_ticks[-1], all_doc_ids[-1])
        ]:
            stream_ids = []
            for si in reader(i_str):
                stream_ids.append(si.stream_id)
            _input_chunk_ids = [
                si.stream_id for si in streamcorpus.Chunk(path)
            ]
            input_chunk_ids = list(set(_input_chunk_ids))
            logger.info('%d inserts, %d unique', len(_input_chunk_ids),
                        len(input_chunk_ids))
            input_chunk_ids.sort()
            stream_ids.sort()
            assert len(input_chunk_ids) == len(stream_ids)
            assert input_chunk_ids == stream_ids
Example #27
def main():
    event_file, rc_dir, event_title, ofile = parse_args()
    event = load_event(event_title, event_file)
    hours = [dth for dth in gen_dates(event.start, event.end)]
    num_hours = len(hours)

    meta_data = []
    bow_dicts = []

    for h, hour in enumerate(hours, 1):
        path = os.path.join(rc_dir, '{}.sc.gz'.format(hour))
        for si in sc.Chunk(path=path):
            uni2id = {}
            for sid, sentence in enumerate(si.body.sentences[u'serif'], 0):
                uni2id[sentence_uni(sentence)] = sid

            for sent in si.body.sentences[u'article-clf']:
                bow_dict = {}
                for token in sent.tokens:
                    t = token.token.decode(u'utf-8').lower()
                    bow_dict[t] = 1
                bow_dicts.append(bow_dict)
                uni = sentence_uni(sent)
                sent_id = uni2id[uni]
                meta_data.append((hour, si.stream_id, sent_id, uni))

    vctr = DictVectorizer()
    X = vctr.fit_transform(bow_dicts)

    with codecs.open(ofile, 'w', 'utf-8') as f:
        for i, (hour, stream_id, sent_id, uni) in enumerate(meta_data):
            uni = uni.replace(u'\n', u' ').replace(u'\t', u' ')
            f.write(u'{}\t{}\t{}\t{}\t'.format(hour, stream_id, sent_id, uni))
            x = u' '.join([unicode(col) for col in X[i, :].indices])
            f.write(x)
            f.write(u'\n')
            f.flush()
def worker(args):
    rc_dir, nuggets, hours, event, doc_freqs, word_freqs = args
    msg = sc.StreamItem_v0_2_0

    for hour in hours:
        active_nuggets = get_active_nuggets(hour, nuggets)
        if len(active_nuggets) == 0:
            continue
        hour_m5 = get_previous_hour(hour, 5)
        hour_m10 = get_previous_hour(hour, 10)
        num_docs = doc_count(doc_freqs, hour)
        num_m5_docs = doc_count(doc_freqs, hour_m5)
        num_m10_docs = doc_count(doc_freqs, hour_m10)
        hour_wc = read_tfdf(word_freqs, hour)
        hour_m5_wc = read_tfdf(word_freqs, hour_m5)
        hour_m10_wc = read_tfdf(word_freqs, hour_m10)
        chunk = os.path.join(rc_dir, '{}.sc.gz'.format(hour))

        for si in sc.Chunk(path=chunk, message=msg):
            doc_wc = make_doc_wordcounts(si)

            for sentence in si.body.sentences['lingpipe']:
                avg_tfidf = compute_avg_tfidf(sentence, doc_wc, hour_wc,
                                              num_docs)
                avg_m5_tfidf = compute_avg_tfidf(sentence, None, hour_m5_wc,
                                                 num_m5_docs)
                avg_m10_tfidf = compute_avg_tfidf(sentence, None, hour_m10_wc,
                                                  num_m10_docs)
                delta_m5_tfidf = avg_tfidf - avg_m5_tfidf
                delta_m10_tfidf = avg_tfidf - avg_m10_tfidf

                tokens = [token.token for token in sentence.tokens]
                print avg_tfidf, avg_m5_tfidf, avg_m10_tfidf, ' '.join(tokens)

        print hour, hour_m5, num_m5_docs, hour_m10
        sys.exit()
Example #29
 def _get_streamitem(self):
     for si in streamcorpus.Chunk(EXPORT_SC_FILENAME):
         return si
     return None
def main():

    event_file, rc_dir, event_title, ofile, ports, cnts_dirs = parse_args()
    wc_dir, dc_dir = cnts_dirs
    event = load_event(event_title, event_file)
    hours = [dth for dth in gen_dates(event.start, event.end)]
    print "Connecting lm clients..."
    dm_lm_score = lm_client_init(ports[0])
    bg_lm3_score = lm_client_init(ports[1][0])
    bg_lm4_score = lm_client_init(ports[1][1])
    bg_lm5_score = lm_client_init(ports[1][2])
    print "Query words:", event.query
    query_matcher = query_term_match_init(event.query)

    wn_terms = wn_synset_terms(event.type)
    print "WordNet synset terms:", wn_terms
    synset_matcher = query_term_match_init(wn_terms)

    tfidfers = []
    preroll = [get_previous_hour(hours[0], i) for i in range(1, 6)]
    for hour in preroll:
        tfidfers.append(init_tfidfers(wc_dir, dc_dir, hour, lower=True))
    tfidfers.append(None)

    of = open(ofile, 'w')

    header = "hour\tstream-id\tsent-id\t" \
             + "avg-tfidf\tavg-tfidf-m1\tavg-tfidf-m5\t" \
             + "dm-logprob\tdm-avg-logprob\tbg3-logprob\tbg3-avg-logprob\t" \
             + "bg4-logprob\tbg4-avg-logprob\tbg5-logprob\tbg5-avg-logprob\t" \
             + "query-matches\tsynset-matches\tnum-tokens\tarticle-position\t" \
             + "article-position-rel\tcapsrate\n"
    of.write(header)
    of.flush()

    num_hours = len(hours)
    for h, hour in enumerate(hours, 1):
        tfidfers = [init_tfidfers(wc_dir, dc_dir, hour, lower=True)] \
            + tfidfers[0:-1]
        print "({}/{}) {}".format(h, num_hours, hour)

        path = os.path.join(rc_dir, '{}.sc.gz'.format(hour))
        for si in sc.Chunk(path=path):

            ticks = float(si.stream_time.epoch_ticks)
            si_datetime = datetime.utcfromtimestamp(ticks)
            tdelta = si_datetime - event.start

            uni2id = {}
            doc_word_counts = defaultdict(int)
            for sid, sentence in enumerate(si.body.sentences[u'serif'], 0):
                uni2id[sentence_uni(sentence)] = sid
                for token in sentence.tokens:
                    t = token.token.decode(u'utf-8').lower()
                    doc_word_counts[t] += 1

            nsents = len(si.body.sentences[u'article-clf'])
            for apos, sent in enumerate(si.body.sentences[u'article-clf'], 1):

                tf_dict = {}
                for token in sent.tokens:
                    t = token.token.decode(u'utf-8').lower()
                    tf_dict[t] = doc_word_counts[t]
                tfidfs_now = tfidfers[0](tf_dict)
                tfidfs_m1 = tfidfers[1](tf_dict)
                tfidfs_m5 = tfidfers[5](tf_dict)

                scores = compute_tfidfs(tfidfs_now, tfidfs_m1, tfidfs_m5)
                avg_tfidf, avg_tfidf_m1, avg_tfidf_m5 = scores

                uni = sentence_uni(sent)
                sent_id = uni2id[uni]
                apos_rel = apos / float(nsents)
                num_tokens = len(sent.tokens)
                caps_rate = get_caps_rate(sent)
                dm_lp, dm_alp = dm_lm_score(uni)
                bg3_lp, bg3_alp = bg_lm3_score(uni)
                bg4_lp, bg4_alp = bg_lm4_score(uni)
                bg5_lp, bg5_alp = bg_lm5_score(uni)
                query_matches = query_matcher(uni)
                synset_matches = synset_matcher(uni)
                #                print dm_lp, dm_alp, bg3_lp, bg3_alp, bg4_lp, bg4_alp, bg5_lp, bg5_alp

                dstr = ('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}' \
                        +'\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n').format(
                    hour, si.stream_id, sent_id,
                    avg_tfidf, avg_tfidf_m1, avg_tfidf_m5,
                    dm_lp, dm_alp, bg3_lp, bg3_alp, bg4_lp, bg4_alp,
                    bg5_lp, bg5_alp, query_matches, synset_matches,
                    num_tokens, apos, apos_rel, caps_rate)
                of.write(dstr)
                of.flush()
    of.close()