def make_hyperlink_labeled_test_chunk(tmpdir):
    '''
    returns a path to a temporary chunk that has been hyperlink labeled
    '''
    tpath = tmpdir.join(str(uuid.uuid1()) + '.sc')
    o_chunk = Chunk(tpath, mode='wb')
    ipath = get_test_chunk_path()
    hl = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': True,
        'offset_types': [BYTES],
    })
    cv = make_clean_visible(config={})
    for si in Chunk(path=ipath, message=streamcorpus.StreamItem_v0_2_0):
        ## clear out existing labels and tokens
        si.body.labels = {}
        si.body.sentences = {}
        context = {}
        hl(si, context)
        cv(si, context)
        o_chunk.add(si)
    o_chunk.close()
    return tpath
def make_hyperlink_labeled_test_chunk():
    '''
    returns a path to a temporary chunk that has been hyperlink labeled
    '''
    tpath = os.path.join('/tmp', str(uuid.uuid1()) + '.sc')
    o_chunk = Chunk(tpath, mode='wb')
    dpath = os.path.dirname(__file__)
    ipath = os.path.join(
        dpath, _TEST_DATA_ROOT,
        'test/WEBLOG-100-fd5f05c8a680faa2bf8c55413e949bbf.sc')
    cv = _init_stage('clean_visible', {})
    hl = hyperlink_labels(
        {'require_abs_url': True,
         'all_domains': True,
         'offset_types': ['BYTES']})
    for si in Chunk(path=ipath):
        ## clear out existing labels and tokens
        si.body.labels = {}
        si.body.sentences = {}
        context = {}
        hl(si, context)
        cv(si, context)
        o_chunk.add(si)
    o_chunk.close()
    return tpath
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('out_path')
    args = parser.parse_args()

    si = make_stream_item(1, 'http://crazydog.com')
    si.body.raw = '''
Flying dogs are amazing.  The flight of the super dog Sam Vroomvroom
is often cited as the first such flying dog.
'''
    topic_name = 'The flight of the super dog Sam Vroomvroom'
    sel = Selector(
        selector_type=SelectorType.TOPIC.value,
        raw_selector=topic_name,
        # this is the key for making it appear for a profile of this title
        canonical_selector=topic_name.lower(),
        offsets={
            OffsetType.CHARS: Offset(
                type=OffsetType.CHARS,
                first=si.body.raw.find('The'),
                length=len(topic_name),
            )
        },
    )
    si.body.selectors['other'] = [sel]

    chunk = Chunk(args.out_path, mode='wb')
    chunk.add(si)
    chunk.close()
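## a minimal sketch (not part of the original example) of reading back a
## chunk written by main() and printing its selectors; `dump_selectors`
## is a hypothetical helper and `path` is whatever was passed as
## out_path above
def dump_selectors(path):
    for si in Chunk(path=path, mode='rb'):
        for source, selectors in si.body.selectors.items():
            for sel in selectors:
                off = sel.offsets[OffsetType.CHARS]
                print('%s: %r at chars [%d, %d)' % (
                    source, sel.canonical_selector,
                    off.first, off.first + off.length))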
def test_matcher():
    config = dict(
        ## command to run
        fpat_path="cat"
        )
    fm = fpat_matcher(config)

    si1 = make_stream_item(None, "http://example.com")
    si1.body = ContentItem(clean_visible="hello! This is a test of matching Bob.")

    si2 = make_stream_item(None, "http://example.com")
    si2.body = ContentItem(clean_visible="hello! This is a test of matching Sally.")

    chunk_path = "/tmp/%s" % uuid.uuid1()
    ch = Chunk(chunk_path, mode="wb")
    ch.add(si1)
    ch.add(si1)
    ch.add(si2)
    ch.close()

    fm(chunk_path)

    ch = Chunk(chunk_path, mode="rb")
    SIs = list(ch)

    ## verify each si has the expected labels
    for si in SIs:
        assert len(si.body.labels) == 1

    for i in range(2):
        print SIs[i].ratings
class factorie(FactorieBase):
    '''
    incremental transform
    '''
    def __init__(self, config):
        super(factorie, self).__init__(config)

        self.toFactoriePipeName = None
        self.fromFactoriePipeName = None
        self.pipeToFactorie = None
        self.pipeFromFactorie = None
        self.taggedChunkIter = None

    def start(self):
        self.toFactoriePipeName = os.tmpnam()
        self.fromFactoriePipeName = os.tmpnam()
        os.mkfifo(self.toFactoriePipeName)
        os.mkfifo(self.fromFactoriePipeName)
        logger.debug('made fifos %r %r',
                     self.toFactoriePipeName, self.fromFactoriePipeName)

        self.call_factorie(self.toFactoriePipeName, self.fromFactoriePipeName)

        self.pipeToFactorie = Chunk(path=self.toFactoriePipeName, mode='ab')
        self.pipeFromFactorie = Chunk(path=self.fromFactoriePipeName, mode='rb')
        self.taggedChunkIter = iter(self.pipeFromFactorie)

    def close(self):
        self.pipeToFactorie.close()
        self.taggedChunkIter = None
        self.pipeFromFactorie = None
        os.unlink(self.toFactoriePipeName)
        os.unlink(self.fromFactoriePipeName)
        if self.process:
            self.process.terminate()
            self.process = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        return False

    def __call__(self, stream_item, context):
        if not self.process:
            self.start()
        logger.debug('pushing stream item to factorie')
        self.pipeToFactorie.add(stream_item)
        self.pipeToFactorie.flush()
        nc = self.taggedChunkIter.next()
        logger.debug('got item from factorie')
        return nc
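## a minimal usage sketch (not from the original source): it assumes a
## `config` dict accepted by FactorieBase, an existing input chunk at
## `in_path`, and that FactorieBase/call_factorie manage self.process;
## only the __enter__/__exit__/__call__ methods defined above are used
def tag_chunk_with_factorie(config, in_path, out_path):
    o_chunk = Chunk(out_path, mode='wb')
    with factorie(config) as tagger:
        for si in Chunk(path=in_path):
            o_chunk.add(tagger(si, context={}))
    o_chunk.close()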
def _aligner_core(t_path1, aligner, aligner_data):
    t_chunk1 = Chunk(t_path1, mode='rb')
    t_path2 = t_path1 + '-tmp-aligning'
    t_chunk2 = Chunk(t_path2, mode='wb')
    for si in t_chunk1:
        aligner(si, aligner_data)
        t_chunk2.add(si)
    t_chunk1.close()
    t_chunk2.close()
    if aligner_data.get('cleanup_tmp_files', True):
        logger.info('atomic rename: %r --> %r', t_path2, t_path1)
        os.rename(t_path2, t_path1)
        logger.debug('done renaming')
    else:
        # for development, leave intermediate tmp file
        shutil.copy(t_path2, t_path1)
        logger.info('copied %r -> %r', t_path2, t_path1)
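## a minimal sketch of the calling convention assumed above, using a
## hypothetical no-op aligner; the chunk path is illustrative only
def _noop_aligner(si, aligner_data):
    pass

#_aligner_core('/tmp/example.sc', _noop_aligner, {'cleanup_tmp_files': True})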
def __call__(self, chunk_path):
    '''
    batch-type transform stage: reads a chunk from chunk_path, and
    replaces it with a new chunk at the same path
    '''
    ## make a new output chunk at a temporary path
    tmp_chunk_path = chunk_path + '_'
    t_chunk = Chunk(path=tmp_chunk_path, mode='wb')

    for num, si in enumerate(Chunk(path=chunk_path)):
        if num < self.config['max_items']:
            t_chunk.add(si)
        else:
            break

    ## flush to disk
    t_chunk.close()

    ## atomic rename new chunk file into place
    os.rename(tmp_chunk_path, chunk_path)
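## a minimal usage sketch with hypothetical names: assuming the class
## that owns the __call__ above takes a config dict and keeps it on
## self.config, running an instance on a chunk path rewrites that chunk
## in place with at most config['max_items'] StreamItems
#stage = some_truncating_stage({'max_items': 500})
#stage('/tmp/example.sc')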
# )

## To properly represent a tagger's output in a StreamItem, you
## should populate these fields:
#si.body.sentences[tagger_id] = [Sentence(....) for ... in sentence_builder]

## Serif can convert serifxml into this structure.

## now that you have populated this StreamItem, add it to the
## chunk file, and go to the next StreamItem
ch.add(si)

print 'added StreamItem.stream_id = %s from date_hour = %s' % (
    si.stream_id, get_date_hour(si))

## after adding all the StreamItems, close the chunk:
ch.close()

## Typically, chunk files should be limited to about 500 documents or
## smaller.  There are several nice pythonic techniques for making
## many chunk files, ask us for examples to suit your circumstances;
## one such sketch appears below.

## Typically, all of the StreamItems in a chunk file have stream_times
## from the same hour in history.  That is, if you call
## get_date_hour(si) you should get the same string for every
## StreamItem in the chunk file.

## Organizing a large number of documents to meet these requirements
## can take some work.  Post an issue ticket if you want to discuss
## your needs.

print 'saved a file to %s with md5 sum: %s' % (output_path, ch.md5_hexdigest)
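## a minimal sketch (not part of the original example) of one technique
## for making many chunk files: roll StreamItems into one chunk per
## date_hour, starting a new file whenever a chunk reaches 500 items.
## `write_rolling_chunks`, `stream_items`, and `out_dir` are
## illustrative names, not part of the library API.
def write_rolling_chunks(stream_items, out_dir, max_per_chunk=500):
    writers = {}   ## date_hour -> (open Chunk, item count, file index)
    for si in stream_items:
        date_hour = get_date_hour(si)
        ch, count, idx = writers.get(date_hour, (None, 0, 0))
        if ch is None or count >= max_per_chunk:
            if ch is not None:
                ch.close()
                idx += 1
            path = os.path.join(out_dir, '%s-%d.sc' % (date_hour, idx))
            ch = Chunk(path, mode='wb')
            count = 0
        ch.add(si)
        writers[date_hour] = (ch, count + 1, idx)
    ## close whatever chunks are still open
    for ch, count, idx in writers.values():
        ch.close()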
        )
    si.source_metadata['lang'] = pe.lang[0].code
    si.source_metadata['author'] = json.dumps(
        dict(
            name = pe.author[0].name,
            email = pe.author[0].email,
            link = pe.author[0].link[0].href,
            )
        )
    si.source = entry.source.publisher_type

    yield si


if __name__ == '__main__':
    #import sys
    #from _handle_unconvertible_spinn3r import handle_unconvertible_spinn3r as hus
    #map(hus, _generate_stream_items(sys.stdin.read()))

    o_chunk = Chunk('/tmp/foo.sc', mode='wb')
    for si in _generate_stream_items(sys.stdin.read()):
        print '---post smoosh raw: %s --' % si.stream_id
        print si.body.raw
        print si.stream_id
        if si.stream_id == '1345928297-da71cfa833ce8218684b6dab152dd69b':
            o_chunk.add( si )
    o_chunk.close()