def store_analysis(results, iri):
    """Store results of the analysis in redis."""
    red = store_analysis.redis
    # results is a list of JSON strings (json.dumps() output), one per analyzer
    if len(results) > 0:
        store = json.dumps({
            'analysis': [json.loads(x) for x in results if x],
            'iri': iri
        })
    else:
        red.delete(data_key(iri))
        return
    key_result = analysis_dataset(iri)
    with red.pipeline() as pipe:
        pipe.set(key_result, store)
        pipe.sadd('purgeable', key_result)
        pipe.expire(key_result, expiration[KeyRoot.ANALYSIS])
        pipe.delete(data_key(iri))  # trash the original content (the indexer should not need it)
        pipe.execute()
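
# The stored payload can be read back in one call. A minimal sketch, assuming the
# same analysis_dataset key helper; get_analysis is a hypothetical convenience
# wrapper, not part of this module.
def get_analysis(iri, red):
    """Fetch and deserialize a stored analysis, or None if it expired."""
    raw = red.get(analysis_dataset(iri))
    if raw is None:
        return None
    return json.loads(raw)  # {'analysis': [...], 'iri': iri}
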
def gen_iri_guess(iri, r):
    """Decompress an archive and guess the format of every file extracted from it."""
    log = logging.getLogger(__name__)
    red = gen_iri_guess.redis  # assumption: a redis client is bound to this task like on the other tasks
    deco = {
        'zip': decompress_7z,
        'gzip': decompress_gzip
    }
    # assumption: the outer archive format picks the decompressor
    archive_type, _ = guess_format(iri, r, log, red)
    key = 'distributions'  # assumption: set of recently analyzed distribution IRIs
    for sub_iri in deco[archive_type](iri, r, red):
        if red.sadd(key, sub_iri) == 0:
            log.debug(f'Skipping distribution as it was recently analyzed: {sub_iri!s}')
            continue
        sub_key = data_key(sub_iri)
        red.expire(sub_key, expiration[KeyRoot.DATA])
        red.sadd('purgeable', sub_key)
        if sub_iri.endswith('/data'):
            # extracted a file without a filename; text/plain still allows the analysis to happen
            yield sub_iri, 'text/plain'
            continue
        try:
            guess, _ = guess_format(sub_iri, r, log, red)
        except Skip:
            continue
        if guess is None:
            log.warning(f'Unknown format after decompression: {sub_iri}')
            red.expire(data_key(sub_iri), 1)
        else:
            yield sub_iri, guess
def analyze(iri, format_guess):
    """Actually run the analyzer."""
    key = data_key(iri)
    tokens = [it.token for it in AbstractAnalyzer.__subclasses__()]
    # fan out one run_one_analyzer task per registered analyzer,
    # then collect all results in store_analysis
    chord(run_one_analyzer.si(token, key, format_guess) for token in tokens)(store_analysis.s(iri))
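
# run_one_analyzer is referenced above but defined elsewhere. A minimal sketch of
# what each chord header task is assumed to do, mirroring run_one_indexer below:
# parse the stored graph, run the analyzer selected by token, and return a JSON
# string for store_analysis to collect. The name, the analyze(graph) method and
# the redis binding are assumptions, not this module's actual API.
def run_one_analyzer_sketch(token, key, format_guess):
    """Hypothetical chord header task returning json.dumps() of one analysis."""
    log = logging.getLogger(__name__)
    red = run_one_analyzer_sketch.redis  # assumption: bound like run_one_indexer.redis
    try:
        g = rdflib.ConjunctiveGraph()
        g.parse(data=red.get(key), format=format_guess)
    except (rdflib.plugin.PluginException, ValueError):
        log.debug('Failed to parse graph')
        return None  # falsy results are filtered out in store_analysis
    analyzer = next(a for a in AbstractAnalyzer.__subclasses__() if a.token == token)
    return json.dumps(analyzer().analyze(g))
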
def gen_tasks(iri, r):
    """Decompress iri and yield an index and an analyze task signature per extracted file."""
    log = logging.getLogger(__name__)
    red = gen_tasks.redis  # assumption: a redis client is bound to this task like on the other tasks
    lst = []
    try:
        for x in gen_iri_guess(iri, r):  # this does the decompression
            lst.append(x)
    except SizeException as e:
        log.warning(f'One of the files in archive {iri} is too large ({e.name})')
        # expire whatever was extracted so far, so it does not linger in redis
        for sub_iri, _ in lst:
            log.debug(f'Expire {sub_iri}')
            red.expire(data_key(sub_iri), 1)
    except TypeError:
        log.exception(f'iri: {iri!s}')
    else:
        for sub_iri, guess in lst:
            yield index.si(sub_iri, guess)
            yield analyze.si(sub_iri, guess)
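
# gen_tasks only builds immutable signatures; a caller still has to dispatch
# them. A minimal usage sketch, assuming a configured celery app; process_archive
# is a hypothetical driver and r is a streaming requests.Response for the archive.
from celery import group

def process_archive(iri, r):
    """Fan out all generated index/analyze tasks in one shot."""
    group(gen_tasks(iri, r)).apply_async()
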
def run_one_indexer(token, iri, format_guess):
    """Extract graph from redis and run indexer identified by token on it."""
    log = logging.getLogger(__name__)
    red = run_one_indexer.redis
    key = data_key(iri)
    log.debug('Parsing graph')
    data = red.get(key)
    if data is None:
        log.debug('Data expired before indexing')
        return 0
    try:
        g = rdflib.ConjunctiveGraph()
        g.parse(data=data, format=format_guess)
    except (rdflib.plugin.PluginException, ValueError):
        log.debug('Failed to parse graph')
        return 0
    return run_indexer(token, iri, g, red)
def store_content(iri, r, red):
    """Store contents into redis, streaming in chunks and enforcing the size cap."""
    key = data_key(iri)
    if not red.exists(key):
        chsize = 1024
        conlen = 0
        with red.pipeline() as pipe:
            for chunk in r.iter_content(chunk_size=chsize):
                if chunk:
                    if len(chunk) + conlen > MAX_CONTENT_LENGTH:
                        pipe.delete(key)
                        pipe.execute()
                        raise SizeException(iri)
                    pipe.append(key, chunk)
                    conlen += len(chunk)
            pipe.expire(key, expiration[KeyRoot.DATA])
            pipe.sadd('purgeable', key)
            pipe.execute()
            monitor.log_size(conlen)
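
# store_content expects a streaming response; without stream=True requests would
# buffer the whole body in memory before iter_content ever runs. A minimal usage
# sketch; fetch_and_store is a hypothetical caller, not part of this module.
import requests

def fetch_and_store(iri, red):
    """Stream a distribution into redis, honoring MAX_CONTENT_LENGTH."""
    with requests.get(iri, stream=True) as r:
        r.raise_for_status()
        store_content(iri, r, red)
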
def decompress_7z(iri, r, red):
    """Download a 7z file, decompress it and store contents in redis."""
    data = load_data(iri, r)
    log = logging.getLogger(__name__)
    exp = expiration[KeyRoot.DATA]  # assumption: the same expiration table used elsewhere in this module
    deco_size_total = 0
    with libarchive.memory_reader(data) as archive:
        for entry in archive:
            try:
                name = str(entry)
            except Exception:
                name = str(uuid.uuid4())
            if len(name) == 0:
                log.error(f'Empty name, iri: {iri!s}')
                if iri.endswith('.zip'):
                    sub_iri = iri[:-4]
                else:
                    # no filename: the '/data' suffix marks it for the text/plain fallback
                    sub_iri = f'{iri}/data'
            else:
                sub_iri = f'{iri}/{name}'
            sub_key = data_key(sub_iri)
            log.debug(f'Store {name} into {sub_key}')
            conlen = 0
            if not red.exists(sub_key):
                red.sadd('purgeable', sub_key)
                for block in entry.get_blocks():
                    if len(block) + conlen > MAX_CONTENT_LENGTH:
                        # appending further would eventually hit redis' 512 MB value limit
                        red.expire(sub_key, 0)
                        raise SizeException(name)
                    red.append(sub_key, block)
                    conlen += len(block)
                red.expire(sub_key, exp)
                monitor.log_size(conlen)
                log.debug(f'Subfile has size {conlen}')
                deco_size_total += conlen
            else:
                log.warning(f'Data already exists for {sub_iri}')
            if conlen > 0:
                yield sub_iri
    log.debug(f'Done decompression, total decompressed size {deco_size_total}')
def decompress_gzip(iri, r, red):
    """Download a gzip file, decompress it and store contents in redis."""
    data = load_data(iri, r)
    exp = expiration[KeyRoot.DATA]  # assumption: the same expiration table used elsewhere in this module
    if iri.endswith('.gz'):
        iri = iri[:-3]
    else:
        iri = iri + '/data'
    key = data_key(iri)
    decompressed = gzip.decompress(data)
    if len(decompressed) > MAX_CONTENT_LENGTH:
        raise SizeException(iri)
    deco_size_total = len(decompressed)  # red.set returns a status flag, not a size
    red.set(key, decompressed)
    red.expire(key, exp)
    monitor.log_size(deco_size_total)
    log = logging.getLogger(__name__)
    log.debug(f'Done decompression, total decompressed size {deco_size_total}')
    yield iri  # gen_iri_guess iterates over the decompressor's output
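
# A quick check of the IRI rewriting above (illustrative IRIs only): a '.gz'
# suffix is stripped, anything else gets the '/data' marker that gen_iri_guess
# maps to the text/plain fallback. _sub_iri_for_gzip is a hypothetical helper
# mirroring the rewrite in decompress_gzip.
def _sub_iri_for_gzip(iri):
    return iri[:-3] if iri.endswith('.gz') else iri + '/data'

assert _sub_iri_for_gzip('http://example.com/dump.ttl.gz') == 'http://example.com/dump.ttl'
assert _sub_iri_for_gzip('http://example.com/dump') == 'http://example.com/dump/data'
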