def make_hyperlink_labeled_test_chunk():
    '''
    returns a path to a temporary chunk that has been hyperlink labeled
    '''
    tpath = os.path.join('/tmp', str(uuid.uuid1()) + '.sc')
    o_chunk = Chunk(tpath, mode='wb')

    dpath = os.path.dirname(__file__)
    ipath = os.path.join(
        dpath, _TEST_DATA_ROOT,
        'test/WEBLOG-100-fd5f05c8a680faa2bf8c55413e949bbf.sc')

    cv = _init_stage('clean_visible', {})
    hl = hyperlink_labels({
        'require_abs_url': True,
        'all_domains': True,
        'offset_types': ['BYTES'],
    })
    for si in Chunk(path=ipath):
        ## clear out existing labels and tokens
        si.body.labels = {}
        si.body.sentences = {}
        context = {}
        hl(si, context)
        cv(si, context)
        o_chunk.add(si)
    o_chunk.close()
    return tpath
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('out_path')
    args = parser.parse_args()

    si = make_stream_item(1, 'http://crazydog.com')
    si.body.raw = '''
Flying dogs are amazing.  The flight of the super dog Sam Vroomvroom
is often cited as the first such flying dog.
'''
    topic_name = 'The flight of the super dog Sam Vroomvroom'
    sel = Selector(
        selector_type=SelectorType.TOPIC.value,
        raw_selector=topic_name,
        # this is the key for making it appear for a profile of this title
        canonical_selector=topic_name.lower(),
        offsets={
            OffsetType.CHARS: Offset(
                type=OffsetType.CHARS,
                first=si.body.raw.find('The'),
                length=len(topic_name),
            )
        },
    )
    si.body.selectors['other'] = [sel]

    chunk = Chunk(args.out_path, mode='wb')
    chunk.add(si)
    chunk.close()
def make_hyperlink_labeled_test_chunk(tmpdir):
    '''
    returns a path to a temporary chunk that has been hyperlink labeled
    '''
    tpath = tmpdir.join(str(uuid.uuid1()) + '.sc')
    o_chunk = Chunk(tpath, mode='wb')

    ipath = get_test_chunk_path()

    hl = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': True,
        'offset_types': [BYTES],
    })
    cv = make_clean_visible(config={})
    for si in Chunk(path=ipath, message=streamcorpus.StreamItem_v0_2_0):
        ## clear out existing labels and tokens
        si.body.labels = {}
        si.body.sentences = {}
        context = {}
        hl(si, context)
        cv(si, context)
        o_chunk.add(si)
    o_chunk.close()
    return tpath
def test_matcher():
    config = dict(
        ## command to run
        fpat_path="cat"
    )
    fm = fpat_matcher(config)

    si1 = make_stream_item(None, "http://example.com")
    si1.body = ContentItem(clean_visible="hello! This is a test of matching Bob.")
    si2 = make_stream_item(None, "http://example.com")
    si2.body = ContentItem(clean_visible="hello! This is a test of matching Sally.")

    chunk_path = "/tmp/%s" % uuid.uuid1()
    ch = Chunk(chunk_path, mode="wb")
    ch.add(si1)
    ch.add(si1)
    ch.add(si2)
    ch.close()

    fm(chunk_path)

    ch = Chunk(chunk_path, mode="rb")
    SIs = list(ch)
    ## verify each si has the expected labels
    for si in SIs:
        assert len(si.body.labels) == 1
    for i in range(2):
        print SIs[i].ratings
def test_get_name_info(tmpdir):
    path = str(tmpdir.join('test_path'))
    c = Chunk(path, mode='wb')
    c.add(make_stream_item(28491, 'abs_url'))

    name_info = get_name_info(path, i_str='foo')

    assert name_info['date_now'] == name_info['date_time_now'][:10]
    assert (name_info['date_now'] + '-' + name_info['time_now']
            == name_info['date_time_now'])
class factorie(FactorieBase):
    '''
    incremental transform
    '''
    def __init__(self, config):
        super(factorie, self).__init__(config)
        self.toFactoriePipeName = None
        self.fromFactoriePipeName = None
        self.pipeToFactorie = None
        self.pipeFromFactorie = None
        self.taggedChunkIter = None

    def start(self):
        self.toFactoriePipeName = os.tmpnam()
        self.fromFactoriePipeName = os.tmpnam()
        os.mkfifo(self.toFactoriePipeName)
        os.mkfifo(self.fromFactoriePipeName)
        logger.debug('made fifos %r %r',
                     self.toFactoriePipeName, self.fromFactoriePipeName)

        self.call_factorie(self.toFactoriePipeName, self.fromFactoriePipeName)

        self.pipeToFactorie = Chunk(path=self.toFactoriePipeName, mode='ab')
        self.pipeFromFactorie = Chunk(path=self.fromFactoriePipeName, mode='rb')
        self.taggedChunkIter = iter(self.pipeFromFactorie)

    def close(self):
        self.pipeToFactorie.close()
        self.taggedChunkIter = None
        self.pipeFromFactorie = None
        os.unlink(self.toFactoriePipeName)
        os.unlink(self.fromFactoriePipeName)
        if self.process:
            self.process.terminate()
            self.process = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        return False

    def __call__(self, stream_item, context):
        if not self.process:
            self.start()
        logger.debug('pushing stream item to factorie')
        self.pipeToFactorie.add(stream_item)
        self.pipeToFactorie.flush()
        nc = self.taggedChunkIter.next()
        logger.debug('got item from factorie')
        return nc
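## For reference, a minimal usage sketch of the incremental transform
## above, driving it as the context manager its __enter__/__exit__
## methods provide.  The config contents and the 'input.sc' /
## 'output.sc' paths are hypothetical, not part of the class itself.
def run_factorie_over_chunk(config):
    o_chunk = Chunk('output.sc', mode='wb')
    with factorie(config) as tagger:
        for si in Chunk(path='input.sc'):
            tagged_si = tagger(si, {})
            o_chunk.add(tagged_si)
    o_chunk.close()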
def _aligner_core(t_path1, aligner, aligner_data):
    t_chunk1 = Chunk(t_path1, mode='rb')
    t_path2 = t_path1 + '-tmp-aligning'
    t_chunk2 = Chunk(t_path2, mode='wb')
    for si in t_chunk1:
        aligner(si, aligner_data)
        t_chunk2.add(si)
    t_chunk1.close()
    t_chunk2.close()

    if aligner_data.get('cleanup_tmp_files', True):
        logger.info('atomic rename: %r --> %r', t_path2, t_path1)
        os.rename(t_path2, t_path1)
        logger.debug('done renaming')
    else:
        # for development, leave intermediate tmp file
        shutil.copy(t_path2, t_path1)
        logger.info('copied %r -> %r', t_path2, t_path1)
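## A minimal usage sketch of _aligner_core with a no-op aligner; any
## callable matching the (si, aligner_data) signature used in the loop
## above will do.  The chunk path here is hypothetical.
def noop_aligner(si, aligner_data):
    ## leave the StreamItem unchanged
    pass

_aligner_core('/tmp/example-chunk.sc', noop_aligner,
              {'cleanup_tmp_files': True})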
def get_john_smith_tagged_by_lingpipe_without_labels_data():
    fh = StringIO()
    o_chunk = Chunk(file_obj=fh, mode='wb')
    path = get_john_smith_tagged_by_lingpipe_path()
    for si in Chunk(path):
        for sentence in si.body.sentences['lingpipe']:
            for token in sentence.tokens:
                for labels in token.labels.values():
                    for label in labels:
                        label.offsets.update(token.offsets)
                        for offset in label.offsets.values():
                            offset.value = token.token
                        add_annotation(si.body, label)
                token.labels = dict()
        o_chunk.add(si)
    o_chunk.flush()
    return fh.getvalue()
def get_john_smith_tagged_by_lingpipe_without_labels_data(test_data_dir):
    fh = StringIO()
    o_chunk = Chunk(file_obj=fh, mode='wb')
    path = get_john_smith_tagged_by_lingpipe_path(test_data_dir)
    for si in Chunk(path):
        for sentence in si.body.sentences['lingpipe']:
            for token in sentence.tokens:
                for labels in token.labels.values():
                    for label in labels:
                        label.offsets.update(token.offsets)
                        for offset in label.offsets.values():
                            offset.value = token.token
                        add_annotation(si.body, label)
                token.labels = dict()
        o_chunk.add(si)
    o_chunk.flush()
    return fh.getvalue()
def __call__(self, chunk_path):
    '''
    batch-type transform stage: reads a chunk from chunk_path, and
    replaces it with a new chunk at the same path
    '''
    ## make a new output chunk at a temporary path
    tmp_chunk_path = chunk_path + '_'
    t_chunk = Chunk(path=tmp_chunk_path, mode='wb')

    for num, si in enumerate(Chunk(path=chunk_path)):
        if num < self.config['max_items']:
            t_chunk.add(si)
        else:
            break

    ## flush to disk
    t_chunk.close()

    ## atomic rename new chunk file into place
    os.rename(tmp_chunk_path, chunk_path)
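## A minimal usage sketch, assuming the __call__ above belongs to a
## batch stage class (hypothetically named `truncate`) whose
## constructor stores its config dict on self.config:
stage = truncate(config={'max_items': 500})
stage('/tmp/example-chunk.sc')  ## rewrites the chunk file in place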
#si.body.taggings['serif'] = Tagging(
#    tagger_id = 'serif',
#    raw_tagging = serifxml,
#    tagger_config = 'streamcorpus-all.par',
#    tagger_version = '6.0.1',
#    generation_time = make_stream_time('2013-04-18T18:18:20.000000Z'),
#    )

## To properly represent a tagger's output in a StreamItem, you
## should populate these fields:
#si.body.sentences[tagger_id] = [Sentence(....) for ... in sentence_builder]
## Serif can convert serifxml into this structure.

## now that you have populated this StreamItem, add it to the
## chunk file, and go to the next StreamItem
ch.add(si)
print 'added StreamItem.stream_id = %s from date_hour = %s' % (
    si.stream_id, get_date_hour(si))

## after adding all the StreamItems, close the chunk:
ch.close()

## Typically, chunk files should be limited to about 500 documents or
## smaller.  There are several nice pythonic techniques for making
## many chunk files; one sketch follows below.

## Typically, all of the StreamItems in a chunk file have stream_times
## from the same hour in history.  That is, if you call
## get_date_hour(si) you should get the same string for every
## StreamItem in the chunk file.
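## One such technique, as a minimal sketch: roll over to a new chunk
## file whenever the current one reaches max_items, keeping one series
## of files per date_hour so every chunk holds StreamItems from a
## single hour.  The helper name, the output naming scheme, and the
## `stream_items` iterable are hypothetical, not part of the
## streamcorpus API.
import os
from collections import defaultdict
from streamcorpus import Chunk, get_date_hour

def write_rolling_chunks(stream_items, out_dir, max_items=500):
    counts = defaultdict(int)  ## StreamItems seen so far per date_hour
    chunks = {}                ## currently open Chunk per date_hour
    for si in stream_items:
        date_hour = get_date_hour(si)
        if counts[date_hour] % max_items == 0:
            ## the chunk for this hour is full (or absent); start a new part
            if date_hour in chunks:
                chunks[date_hour].close()
            part = counts[date_hour] // max_items
            path = os.path.join(out_dir, '%s-%03d.sc' % (date_hour, part))
            chunks[date_hour] = Chunk(path, mode='wb')
        chunks[date_hour].add(si)
        counts[date_hour] += 1
    for chunk in chunks.values():
        chunk.close()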
    )
    si.source_metadata['lang'] = pe.lang[0].code
    si.source_metadata['author'] = json.dumps(dict(
        name=pe.author[0].name,
        email=pe.author[0].email,
        link=pe.author[0].link[0].href,
    ))
    si.source = entry.source.publisher_type
    yield si


if __name__ == '__main__':
    #import sys
    #from _handle_unconvertible_spinn3r import handle_unconvertible_spinn3r as hus
    #map(hus, _generate_stream_items(sys.stdin.read()))
    o_chunk = Chunk('/tmp/foo.sc', mode='wb')
    for si in _generate_stream_items(sys.stdin.read()):
        print '---post smoosh raw: %s --' % si.stream_id
        print si.body.raw
        print si.stream_id
        if si.stream_id == '1345928297-da71cfa833ce8218684b6dab152dd69b':
            o_chunk.add(si)
    o_chunk.close()
def align_chunk_with_ner(tmp_ner_path, i_chunk, tmp_done_path):
    '''
    iterate through the i_chunk and tmp_ner_path to generate a new
    Chunk with body.ner
    '''
    o_chunk = Chunk()
    input_iter = i_chunk.__iter__()
    ner = ''
    stream_id = None

    all_ner = xml.dom.minidom.parse(open(tmp_ner_path))

    for raw_ner in all_ner.getElementsByTagName('FILENAME'):
        stream_item = input_iter.next()
        ## get stream_id out of the XML
        stream_id = raw_ner.attributes.get('docid').value
        assert stream_id and stream_id == stream_item.stream_id, \
            '%s != %s\nner=%r' % (stream_id, stream_item.stream_id, ner)

        tagger_id = 'lingpipe'
        tagging = Tagging()
        tagging.tagger_id = tagger_id
        ## get this one file out of its FILENAME tags
        tagged_doc = list(lingpipe.files(raw_ner.toxml()))[0][1]
        tagging.raw_tagging = tagged_doc
        tagging.generation_time = streamcorpus.make_stream_time()
        stream_item.body.taggings[tagger_id] = tagging

        sentences = list(lingpipe.sentences(tagged_doc))

        ## make JS labels on individual tokens
        assert stream_item.ratings[0].mentions, stream_item.stream_id
        john_smith_label = Label()
        john_smith_label.annotator = stream_item.ratings[0].annotator
        john_smith_label.target_id = stream_item.ratings[0].target_id

        # first map all corefchains to their words
        equiv_ids = collections.defaultdict(lambda: set())
        for sent in sentences:
            for tok in sent.tokens:
                if tok.entity_type is not None:
                    equiv_ids[tok.equiv_id].add(cleanse(tok.token))

        ## find all the chains that are John Smith
        johnsmiths = set()
        for equiv_id, names in equiv_ids.items():
            ## detect 'smith' in 'smithye'
            _names = cleanse(' '.join(names))
            if 'john' in _names and 'smith' in _names:
                johnsmiths.add(equiv_id)

        print len(johnsmiths)

        ## now apply the label
        for sent in sentences:
            for tok in sent.tokens:
                if tok.equiv_id in johnsmiths:
                    tok.labels = [john_smith_label]

        stream_item.body.sentences[tagger_id] = sentences
        o_chunk.add(stream_item)

    ## put the o_chunk bytes into the specified file
    open(tmp_done_path, 'wb').write(str(o_chunk))
    ## replace this with log.info()
    print 'created %s' % tmp_done_path