def fetch_data_math(self):
    """
    Run this source's Mathematica code on the remote SSH host and download
    the files it produced into the local static data directory.

    The code returned by ``self.replace_exports`` is written to
    ``package.m`` in a fresh remote temporary directory and executed with
    ``math -script`` under ``xvfb-run``. ``package.m`` is then removed and
    the remaining directory contents are copied to
    ``STATIC_ROOT/data/<client_directory>/<client_subdirectory()>``.

    The SSH connection is closed even if a remote command or the download
    raises. The remote temporary directory itself is left in place.
    """
    log_name = 'atmospherics.data.models.MathematicaSource.fetch_data'

    ssh = pysftp.Connection(settings.SSH_HOST, username=settings.SSH_USER, password=settings.SSH_PASSWORD)
    try:
        temp = ssh.execute('mktemp -d')[0].rstrip('\n')
        ssh.chdir(temp)

        code, _matches = self.replace_exports(temp)
        Log.debug(log_name, code)

        # The code is passed to the shell inside single quotes, so every
        # embedded quote becomes: close quote, literal quote, reopen quote.
        code = code.replace("'", '\'"\'"\'')
        command = "echo '{}' > {}/package.m".format(code, temp)
        ssh.execute(command)

        ret = ssh.execute('xvfb-run -s "-screen 0 640x480x24" math -script {}/package.m &\n\n\n\n'.format(temp))
        if ret:
            message = 'A message was returned by mathematica script {}.m:\n{}'.format(self.name, ret[-100:])
            Log.info(log_name, message)

        # Drop the script so that only generated output is downloaded
        ssh.execute('rm {}'.format(os.path.join(temp, 'package.m')))

        # NOTE(review): eval() executes whatever is stored in
        # self.client_subdirectory; this is only safe while that field can
        # be set by trusted users alone -- confirm.
        client_subdirectory = eval(self.client_subdirectory)
        target = os.path.join(settings.STATIC_ROOT, 'data', self.client_directory, client_subdirectory())
        if not os.path.exists(target):
            os.makedirs(target)

        ssh.get_d(temp, target)
        ssh.execute('disown')
    finally:
        # Release the connection on failure as well as on success
        ssh.close()

    message = 'MathematicaSource {} run.\nOutput saved to:\nhttp://atmospherics.lossofgenerality.com/{}'.format(self.name, target)
    Log.info(log_name, message)
def pull(self):
    """
    Fetch all available mail at the target address and store it locally in
    the consumption directory so that the file consumer can pick it up and
    do its thing.

    Each message's attachment is written to ``Consumer.CONSUME`` under the
    message's file name and the file's access/modification times are set to
    the message time. Nothing is fetched when the fetcher is disabled, but
    ``self.last_checked`` is updated on every call.
    """

    if self._enabled:

        Log.info("Checking mail", Log.COMPONENT_MAIL)

        for message in self._get_messages():

            Log.debug(
                'Storing email: "{}"'.format(message.subject),
                Log.COMPONENT_MAIL
            )

            t = int(time.mktime(message.time.timetuple()))
            file_name = os.path.join(Consumer.CONSUME, message.file_name)
            with open(file_name, "wb") as f:
                f.write(message.attachment.data)

            # Stamp the file only once it has been closed: data still
            # buffered in the handle is flushed on close, which would
            # otherwise reset the modification time we just applied.
            os.utime(file_name, times=(t, t))

    self.last_checked = datetime.datetime.now()
def _cleanup(self, tempdir, doc):
    """
    Remove the working directory ``tempdir`` (recursively) and then the
    document file ``doc``, logging each deletion beforehand.
    """
    component = Log.COMPONENT_CONSUMER

    # The scratch directory goes first, contents and all
    directory_notice = "Deleting directory {}".format(tempdir)
    Log.debug(directory_notice, component)
    shutil.rmtree(tempdir)

    # ...followed by the document itself
    document_notice = "Deleting document {}".format(doc)
    Log.debug(document_notice, component)
    os.remove(doc)
def _cleanup(self, pngs, doc):
    """
    Delete the scratch images belonging to a document, then the document.

    The numeric prefix of the first image name (``<prefix>-<page>.png``) is
    turned into a ``<prefix>*`` glob inside ``self.SCRATCH`` so that every
    page generated for the document is removed. When ``pngs`` is empty only
    ``doc`` is deleted.
    """
    doomed = []

    if pngs:
        # The dot before "png" is escaped so that only a real ".png"
        # suffix is rewritten into the prefix glob.
        png_glob = os.path.join(
            self.SCRATCH,
            re.sub(r"^.*/(\d+)-\d+\.png$", r"\1*", pngs[0])
        )
        doomed.extend(glob.glob(png_glob))

    # The document always goes last
    doomed.append(doc)

    for f in doomed:
        Log.debug("Deleting {}".format(f), Log.COMPONENT_CONSUMER)
        os.unlink(f)
def _guess_language(text):
    """
    Return the language ``langdetect`` detects for ``text``, or ``None``
    when detection raises.

    Detection is best-effort: any error is logged as a warning instead of
    being propagated, so callers must be prepared for ``None``.
    """
    try:
        guess = langdetect.detect(text)
    except Exception as e:
        # Logged under the consumer component, like the success path below
        # (it was previously filed under the mail component).
        Log.warning(
            "Language detection error: {}".format(e),
            Log.COMPONENT_CONSUMER
        )
        return None

    Log.debug("Language detected: {}".format(guess), Log.COMPONENT_CONSUMER)
    return guess
def mathematica_session(math_session, extra_args, user):
    """
    Runs the given Math session on the Thorek01 server.

    If a ``MathKernel`` or ``Mathematica`` process is already running on the
    remote host the task asks to be retried in five minutes (for up to a
    day). Otherwise the session code, with ``{extra_args}`` and ``{data}``
    substituted from ``extra_args``, is written to ``package.m`` in a fresh
    remote temporary directory and run with ``math -script``. The directory
    is then downloaded to
    ``MEDIA_ROOT/<username>/output/<session name>_<timestamp>``; any output
    returned by the script is logged and saved there as ``response.txt``.
    Finally a completion message is logged and, when the user has an email
    address, mailed to them.

    The SSH connection is closed even if a step raises. The remote
    temporary directory is left in place.
    """
    log_name = 'atmospherics.analysis.tasks.mathematica_session'

    ssh = pysftp.Connection(settings.SSH_HOST, username=settings.SSH_USER, password=settings.SSH_PASSWORD)
    try:
        if ssh.execute('pidof MathKernel') or ssh.execute('pidof Mathematica'):
            # NOTE(review): presumably Celery's Task.retry, which raises to
            # abort this run -- confirm. The connection is released by the
            # ``finally`` clause below. Integer division keeps max_retries
            # an int on Python 3 as well.
            mathematica_session.retry(countdown=5*60, max_retries=(60//5)*24)

        temp = ssh.execute('mktemp -d')[0].rstrip('\n')
        ssh.chdir(temp)

        code, _matches = math_session.replace_exports(temp)
        code = code.replace('{extra_args}', extra_args['extra_args'])
        code = code.replace('{data}', extra_args['data'])
        Log.debug(log_name, code)

        # The code is passed to the shell inside single quotes, so every
        # embedded quote becomes: close quote, literal quote, reopen quote.
        code = code.replace("'", '\'"\'"\'')
        command = "echo '{}' > {}/package.m".format(code, temp)
        ssh.execute(command)

        ret = ssh.execute('xvfb-run -s "-screen 0 640x480x24" math -script {}/package.m &\n\n\n\n'.format(temp))

        target = os.path.join(settings.MEDIA_ROOT, user.username, 'output', math_session.name.replace(' ', '_')+datetime.now().strftime('_%m%d%y_%H%M'))
        os.makedirs(target)
        ssh.get_d(temp, target)

        if ret:
            message = ''' A message was returned by mathematica script {}.m:\n (trimmed to contain only the last 100 lines) \n\n {} '''.format(math_session.name, '\n'.join(ret[-100:]))
            Log.info(log_name, message)
            # Keep the script output next to the downloaded results
            with open(os.path.join(target, 'response.txt'), 'w') as logfile:
                logfile.write(message)

        ssh.execute('disown')
    finally:
        # Release the connection on failure as well as on success
        ssh.close()

    message = 'Mathematica session {} run.\nOutput saved to:\nhttp://atmospherics.lossofgenerality.com/{}'.format(math_session.name, os.path.join('output', user.username, os.path.split(target)[1]))
    Log.info(log_name, message)

    # Skip the notification for users with a missing *or empty* address
    if getattr(user, 'email', None):
        subject = 'Atmospherics Mathematica {} Complete'.format(math_session.name)
        from_email = 'Atmospherics<*****@*****.**>'
        email = EmailMultiAlternatives(subject, message, from_email, [user.email])
        email.send()
def _guess_language(text):
    """
    Best-effort language detection for ``text``.

    Returns whatever ``langdetect.detect`` reports, or ``None`` if the
    detector raises; the error is logged as a warning rather than
    propagated.
    """
    try:
        guess = langdetect.detect(text)
    except Exception as e:
        # Report the failure under the consumer component so it shows up
        # alongside the success message below (it used to be filed under
        # the mail component).
        Log.warning(
            "Language detection error: {}".format(e),
            Log.COMPONENT_CONSUMER
        )
        return None

    Log.debug(
        "Language detected: {}".format(guess),
        Log.COMPONENT_CONSUMER
    )
    return guess
def _get_ocr(self, pngs):
    """
    Attempts to do the best job possible OCR'ing the document based on
    simple language detection trial & error.

    The middle page is OCR'd with the default language and the result is
    used to guess the document language. Depending on the guess the text
    is either completed with the default language or the whole document is
    OCR'd again with the guessed one. Raises ``OCRError`` when there are no
    pages, or when detection/OCR fails and ``settings.FORGIVING_OCR`` is
    off.
    """
    if not pngs:
        raise OCRError

    Log.debug("OCRing the document", Log.COMPONENT_CONSUMER)

    # Floor division always yields a valid index for a non-empty list,
    # including the single-page case (index 0).
    middle = len(pngs) // 2
    raw_text = self._ocr([pngs[middle]], self.DEFAULT_OCR_LANGUAGE)

    guessed_language = self._guess_language(raw_text)

    if not (guessed_language and guessed_language in ISO639):
        Log.warning("Language detection failed!", Log.COMPONENT_CONSUMER)
        if not settings.FORGIVING_OCR:
            raise OCRError
        Log.warning(
            "As FORGIVING_OCR is enabled, we're going to make the best "
            "with what we have.",
            Log.COMPONENT_CONSUMER
        )
        return self._assemble_ocr_sections(pngs, middle, raw_text)

    ocr_language = ISO639[guessed_language]

    # The sample page was already read with the right language
    if ocr_language == self.DEFAULT_OCR_LANGUAGE:
        return self._assemble_ocr_sections(pngs, middle, raw_text)

    try:
        return self._ocr(pngs, ocr_language)
    except pyocr.pyocr.tesseract.TesseractError:
        if not settings.FORGIVING_OCR:
            raise OCRError
        Log.warning(
            "OCR for {} failed, but we're going to stick with what "
            "we've got since FORGIVING_OCR is enabled.".format(
                guessed_language
            ),
            Log.COMPONENT_CONSUMER
        )
        return self._assemble_ocr_sections(pngs, middle, raw_text)
def _get_ocr(self, pngs):
    """
    Attempts to do the best job possible OCR'ing the document based on
    simple language detection trial & error.

    Raises ``OCRError`` for an empty page list, and also when language
    detection or the language-specific OCR run fails while
    ``settings.FORGIVING_OCR`` is disabled.
    """
    if not pngs:
        raise OCRError

    Log.debug("OCRing the document", Log.COMPONENT_CONSUMER)

    # Integer division rounds down, so the index is valid for every
    # non-empty list (a single page gives 0).
    middle = len(pngs) // 2
    sample_page = [pngs[middle]]
    raw_text = self._ocr(sample_page, self.DEFAULT_OCR_LANGUAGE)

    guessed_language = self._guess_language(raw_text)
    language_known = bool(guessed_language) and guessed_language in ISO639

    if not language_known:
        Log.warning("Language detection failed!", Log.COMPONENT_CONSUMER)
        if settings.FORGIVING_OCR:
            Log.warning(
                "As FORGIVING_OCR is enabled, we're going to make the "
                "best with what we have.",
                Log.COMPONENT_CONSUMER
            )
            return self._assemble_ocr_sections(pngs, middle, raw_text)
        raise OCRError

    target_language = ISO639[guessed_language]

    if target_language != self.DEFAULT_OCR_LANGUAGE:
        # A different language was detected: redo every page with it
        try:
            return self._ocr(pngs, target_language)
        except pyocr.pyocr.tesseract.TesseractError:
            if not settings.FORGIVING_OCR:
                raise OCRError
            Log.warning(
                "OCR for {} failed, but we're going to stick with what "
                "we've got since FORGIVING_OCR is enabled.".format(
                    guessed_language
                ),
                Log.COMPONENT_CONSUMER
            )

    # Either the default language was correct, or we are forgiving a
    # failed retry: complete the text around the sample page.
    return self._assemble_ocr_sections(pngs, middle, raw_text)
def _get_greyscale(self, tempdir, doc):
    """
    Run ``self.CONVERT`` on ``doc`` to produce greyscale images inside
    ``tempdir`` and return the sorted paths of the generated files.
    """
    Log.debug(
        "Generating greyscale image from {}".format(doc),
        Log.COMPONENT_CONSUMER
    )

    # presumably the converter expands %04d into a per-page number
    output_pattern = os.path.join(tempdir, "convert-%04d.jpg")
    command = (
        self.CONVERT,
        "-density", "300",
        "-depth", "8",
        "-type", "grayscale",
        doc, output_pattern,
    )
    process = subprocess.Popen(command)
    process.wait()

    # Collect the regular files whose names start with the output prefix
    candidates = [
        os.path.join(tempdir, name)
        for name in os.listdir(tempdir)
        if name.startswith("convert")
    ]
    return sorted(path for path in candidates if os.path.isfile(path))
def _ocr(self, pngs, lang): """ Performs a single OCR attempt. """ if not pngs: return "" Log.debug("Parsing for {}".format(lang), Log.COMPONENT_CONSUMER) with Pool(processes=self.THREADS) as pool: r = pool.map(self.image_to_string, itertools.product(pngs, [lang])) r = " ".join(r) # Strip out excess white space to allow matching to go smoother return re.sub(r"\s+", " ", r)
def _get_greyscale(self, tempdir, doc):
    """
    Convert ``doc`` into greyscale images written to ``tempdir`` using
    ``self.CONVERT`` and return the resulting file paths in sorted order.
    """
    Log.debug(
        "Generating greyscale image from {}".format(doc),
        Log.COMPONENT_CONSUMER
    )

    # presumably %04d is filled in by the converter, one file per page
    destination = os.path.join(tempdir, "convert-%04d.jpg")

    arguments = (self.CONVERT, "-density", "300", "-depth", "8", "-type", "grayscale", doc, destination)
    subprocess.Popen(arguments).wait()

    # Keep only regular files carrying the "convert" prefix
    generated = []
    for entry in os.listdir(tempdir):
        if not entry.startswith("convert"):
            continue
        full_path = os.path.join(tempdir, entry)
        if os.path.isfile(full_path):
            generated.append(full_path)

    generated.sort()
    return generated
def _get_greyscale(self, doc):
    """
    Convert ``doc`` into greyscale PNG output in ``self.SCRATCH`` using
    ``self.CONVERT``.

    The output is named after a random seven-digit number; every scratch
    file starting with that number is returned, sorted.
    """
    Log.debug(
        "Generating greyscale image from {}".format(doc),
        Log.COMPONENT_CONSUMER
    )

    # Random prefix used both for the output name and to find it afterwards
    prefix = random.randint(1000000, 9999999)
    output_path = os.path.join(self.SCRATCH, "{}.png".format(prefix))

    command = (
        self.CONVERT,
        "-density", "300",
        "-depth", "8",
        "-type", "grayscale",
        doc, output_path,
    )
    process = subprocess.Popen(command)
    process.wait()

    produced = glob.glob(os.path.join(self.SCRATCH, "{}*".format(prefix)))
    produced.sort()
    return produced
def _ocr(self, pngs, lang): """ Performs a single OCR attempt. """ if not pngs: return "" Log.debug("Parsing for {}".format(lang), Log.COMPONENT_CONSUMER) with Pool(processes=self.THREADS) as pool: r = pool.map( self.image_to_string, itertools.product(pngs, [lang])) r = " ".join(r) # Strip out excess white space to allow matching to go smoother return re.sub(r"\s+", " ", r)
def mathematica_package(math_package, extra_args, user):
    """
    Runs the given Math package on the Thorek01 server.

    The package code, with ``{extra_args}`` and ``{data}`` substituted from
    ``extra_args``, is written to ``package.m`` in a fresh remote temporary
    directory and run with ``math -script`` under ``xvfb-run``. Any output
    returned by the script is logged. The directory (including
    ``package.m``) is then downloaded to
    ``MEDIA_ROOT/<username>/output/<package name>_<timestamp>``, a
    completion message is logged and, when the user has an email address,
    mailed to them.

    The SSH connection is closed even if a step raises. The remote
    temporary directory is left in place.
    """
    log_name = 'atmospherics.analysis.tasks.mathematica_package'

    ssh = pysftp.Connection(settings.SSH_HOST, username=settings.SSH_USER, password=settings.SSH_PASSWORD)
    try:
        temp = ssh.execute('mktemp -d')[0].rstrip('\n')
        ssh.chdir(temp)

        code, _matches = math_package.replace_exports(temp)
        code = code.replace('{extra_args}', extra_args['extra_args'])
        code = code.replace('{data}', extra_args['data'])
        Log.debug(log_name, code)

        # The code is passed to the shell inside single quotes, so every
        # embedded quote becomes: close quote, literal quote, reopen quote.
        code = code.replace("'", '\'"\'"\'')
        command = "echo '{}' > {}/package.m".format(code, temp)
        ssh.execute(command)

        ret = ssh.execute('xvfb-run -s "-screen 0 640x480x24" math -script {}/package.m &\n\n\n\n'.format(temp))
        if ret:
            message = 'A message was returned by mathematica script {}.m:\n{}'.format(math_package.name, ret[-100:])
            Log.info(log_name, message)

        target = os.path.join(settings.MEDIA_ROOT, user.username, 'output', math_package.name.replace(' ', '_')+datetime.now().strftime('_%m%d%y_%H%M'))
        os.makedirs(target)
        ssh.get_d(temp, target)

        ssh.execute('disown')
    finally:
        # Release the connection on failure as well as on success
        ssh.close()

    message = 'Mathematica package {} run.\nOutput saved to:\nhttp://atmospherics.lossofgenerality.com/{}'.format(math_package.name, os.path.join('output', user.username, os.path.split(target)[1]))
    Log.info(log_name, message)

    # Tolerate user objects without an email attribute as well as empty
    # addresses; in both cases no notification is sent.
    if getattr(user, 'email', None):
        subject = 'Atmospherics Mathematica {} Complete'.format(math_package.name)
        from_email = 'Atmospherics<*****@*****.**>'
        email = EmailMultiAlternatives(subject, message, from_email, [user.email])
        email.send()
def _store(self, text, doc):
    """
    Create the ``Document`` record for ``doc`` and write an encrypted copy
    of the file to the record's ``source_path``.

    Sender, title, tags and file type come from
    ``self._guess_attributes_from_name``; tags matching ``text`` are added
    to those. Both ``created`` and ``modified`` are taken from the file's
    modification time.
    """
    sender, title, tags, file_type = self._guess_attributes_from_name(doc)

    # Tags matched against the content, plus the ones from the file name
    relevant_tags = set(Tag.match_all(text))
    relevant_tags.update(tags)

    stats = os.stat(doc)

    Log.debug("Saving record to database", Log.COMPONENT_CONSUMER)

    file_time = timezone.make_aware(
        datetime.datetime.fromtimestamp(stats.st_mtime))
    document = Document.objects.create(
        sender=sender,
        title=title,
        content=text,
        file_type=file_type,
        created=file_time,
        modified=file_time,
    )

    if relevant_tags:
        tag_names = ", ".join(tag.slug for tag in relevant_tags)
        Log.debug("Tagging with {}".format(tag_names), Log.COMPONENT_CONSUMER)
        document.tags.add(*relevant_tags)

    with open(doc, "rb") as unencrypted, \
            open(document.source_path, "wb") as encrypted:
        Log.debug("Encrypting", Log.COMPONENT_CONSUMER)
        encrypted.write(GnuPG.encrypted(unencrypted))
def _store(self, text, doc):
    """
    Save ``doc`` as a ``Document`` with ``text`` as its content, tag it,
    and store the file encrypted at the document's ``source_path``.

    The record's attributes are guessed from the file name and its
    ``created``/``modified`` timestamps both come from the file's
    modification time.
    """
    sender, title, tags, file_type = self._guess_attributes_from_name(doc)
    content_tags = list(Tag.match_all(text))
    relevant_tags = set(content_tags + list(tags))

    mtime = os.stat(doc).st_mtime

    Log.debug("Saving record to database", Log.COMPONENT_CONSUMER)

    attributes = {
        "sender": sender,
        "title": title,
        "content": text,
        "file_type": file_type,
        "created": timezone.make_aware(
            datetime.datetime.fromtimestamp(mtime)),
        "modified": timezone.make_aware(
            datetime.datetime.fromtimestamp(mtime)),
    }
    document = Document.objects.create(**attributes)

    if relevant_tags:
        slugs = [t.slug for t in relevant_tags]
        Log.debug(
            "Tagging with {}".format(", ".join(slugs)),
            Log.COMPONENT_CONSUMER
        )
        document.tags.add(*relevant_tags)

    # Read the plain file and write its encrypted form to the store
    with open(doc, "rb") as unencrypted:
        with open(document.source_path, "wb") as encrypted:
            Log.debug("Encrypting", Log.COMPONENT_CONSUMER)
            ciphertext = GnuPG.encrypted(unencrypted)
            encrypted.write(ciphertext)
def pull(self):
    """
    Fetch all available mail at the target address and store it locally in
    the consumption directory so that the file consumer can pick it up and
    do its thing.

    When the fetcher is enabled, each message's attachment is saved under
    ``Consumer.CONSUME`` and the saved file's timestamps are set to the
    message time. ``self.last_checked`` is refreshed on every call, enabled
    or not.
    """
    if self._enabled:
        Log.info("Checking mail", Log.COMPONENT_MAIL)

        for message in self._get_messages():
            Log.debug('Storing email: "{}"'.format(message.subject),
                      Log.COMPONENT_MAIL)

            t = int(time.mktime(message.time.timetuple()))
            file_name = os.path.join(Consumer.CONSUME, message.file_name)

            with open(file_name, "wb") as f:
                f.write(message.attachment.data)

            # The timestamps are applied after the handle is closed;
            # otherwise the flush performed on close would overwrite the
            # modification time set here.
            os.utime(file_name, times=(t, t))

    self.last_checked = datetime.datetime.now()
def _cleanup_doc(doc):
    """Log the deletion of the document ``doc`` and remove the file."""
    notice = "Deleting document {}".format(doc)
    Log.debug(notice, Log.COMPONENT_CONSUMER)
    os.remove(doc)
def _cleanup_tempdir(d):
    """Log the deletion of directory ``d`` and remove it recursively."""
    notice = "Deleting directory {}".format(d)
    Log.debug(notice, Log.COMPONENT_CONSUMER)
    shutil.rmtree(d)