def consume(self):
    """
    Scan the consumption directory and OCR every eligible document.

    An entry is skipped when it is not a regular file, when its full path
    does not match ``self.REGEX_TITLE``, when it is already listed in
    ``self._ignore``, or when ``self._is_ready`` returns a truthy value.

    Every remaining document gets its own scratch directory below
    ``self.SCRATCH``.  That directory is handed to
    ``self._cleanup_tempdir`` on every exit path -- including unexpected
    exceptions -- so aborted runs do not leave temporary files behind.

    On ``OCRError`` the document is appended to ``self._ignore`` and the
    failure is logged; otherwise the document is passed to
    ``self._cleanup_doc`` once its text has been stored.
    """
    for entry in os.listdir(self.CONSUME):
        doc = os.path.join(self.CONSUME, entry)

        if not os.path.isfile(doc):
            continue
        if not re.match(self.REGEX_TITLE, doc):
            continue
        if doc in self._ignore:
            continue
        # NOTE(review): a truthy _is_ready() result skips the document --
        # confirm that this matches the helper's return convention.
        if self._is_ready(doc):
            continue

        Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER)

        tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
        try:
            # Kept outside the OCRError handler on purpose: only failures
            # from the OCR/store steps put a document on the ignore list.
            pngs = self._get_greyscale(tempdir, doc)
            try:
                text = self._get_ocr(pngs)
                self._store(text, doc)
            except OCRError:
                self._ignore.append(doc)
                Log.error(
                    "OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
                continue
        finally:
            # Runs before the ``continue`` above takes effect and before
            # any exception leaves the loop.
            self._cleanup_tempdir(tempdir)

        self._cleanup_doc(doc)
def consume(self):
    """
    Walk ``self.CONSUME`` and run OCR on each document that qualifies.

    Entries are ignored if they are not regular files, if their joined path
    fails ``re.match(self.REGEX_TITLE, ...)``, if they are already in
    ``self._ignore``, or if ``self._is_ready`` reports a truthy value.

    A scratch directory is created under ``self.SCRATCH`` per document and
    is always released through ``self._cleanup_tempdir``, even when an
    exception other than ``OCRError`` propagates out of this method.

    ``OCRError`` marks the document as ignored and logs the failure; a
    successful run ends with ``self._cleanup_doc(doc)``.
    """
    for name in os.listdir(self.CONSUME):
        doc = os.path.join(self.CONSUME, name)

        if not os.path.isfile(doc):
            continue

        if not re.match(self.REGEX_TITLE, doc):
            continue

        if doc in self._ignore:
            continue

        # NOTE(review): documents for which _is_ready() is truthy are
        # skipped -- verify against the helper's definition.
        if self._is_ready(doc):
            continue

        Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER)

        tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
        ocr_failed = False
        try:
            # Greyscale conversion stays outside the OCRError handler so
            # that only OCR/store failures blacklist the document.
            pngs = self._get_greyscale(tempdir, doc)
            try:
                text = self._get_ocr(pngs)
                self._store(text, doc)
            except OCRError:
                ocr_failed = True
                self._ignore.append(doc)
                Log.error(
                    "OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
        finally:
            # The scratch directory must not outlive this iteration,
            # whatever happened above.
            self._cleanup_tempdir(tempdir)

        if not ocr_failed:
            self._cleanup_doc(doc)
def fetch_data_math(self):
    """
    Run this source's Mathematica code on the remote host and download
    whatever it produced.

    The method connects with the SSH settings from ``settings``, creates a
    remote temporary directory, writes the code returned by
    ``self.replace_exports(temp)`` to ``package.m`` in that directory and
    runs it with ``math -script`` under ``xvfb-run``.  Any output returned
    by the run is logged (last 100 entries).  ``package.m`` is then removed
    and the remote directory is fetched with ``ssh.get_d`` into
    ``STATIC_ROOT/data/<client_directory>/<client_subdirectory()>``.

    The SSH connection is closed on every exit path, including when one of
    the remote steps raises.
    """
    ssh = pysftp.Connection(settings.SSH_HOST,
                            username=settings.SSH_USER,
                            password=settings.SSH_PASSWORD)
    try:
        temp = ssh.execute('mktemp -d')[0].rstrip('\n')
        ssh.chdir(temp)

        code, _matches = self.replace_exports(temp)
        Log.debug('atmospherics.data.models.MathematicaSource.fetch_data', code)

        # The code is embedded in a single-quoted shell string below, so
        # each ' is rewritten as '"'"' (close quote, quoted ', reopen).
        code = code.replace("'", '\'"\'"\'')
        command = "echo '{}' > {}/package.m".format(code, temp)
        ssh.execute(command)

        ret = ssh.execute('xvfb-run -s "-screen 0 640x480x24" math -script {}/package.m &\n\n\n\n'.format(temp))
        if ret:
            message = 'A message was returned by mathematica script {}.m:\n{}'.format(self.name, ret[-100:])
            Log.info('atmospherics.data.models.MathematicaSource.fetch_data', message)

        # Drop the script itself so only generated output gets downloaded.
        ssh.execute('rm {}'.format(os.path.join(temp, 'package.m')))

        # NOTE(security): eval() executes whatever expression is stored in
        # self.client_subdirectory; the result must be a callable.  Only
        # safe while that attribute can be set by trusted users alone.
        client_subdirectory = eval(self.client_subdirectory)
        target = os.path.join(settings.STATIC_ROOT, 'data',
                              self.client_directory, client_subdirectory())
        if not os.path.exists(target):
            os.makedirs(target)
        ssh.get_d(temp, target)

        # The remote temporary directory is left in place (its removal was
        # already disabled in the original code).
        ssh.execute('disown')
    finally:
        ssh.close()

    message = 'MathematicaSource {} run.\nOutput saved to:\nhttp://atmospherics.lossofgenerality.com/{}'.format(self.name, target)
    Log.info('atmospherics.data.models.MathematicaSource.fetch_data', message)
def pull(self):
    """
    Fetch all available mail at the target address and store it locally in
    the consumption directory so that the file consumer can pick it up and
    do its thing.

    Each message's attachment is written to
    ``Consumer.CONSUME/<message.file_name>`` and the file's access and
    modification times are set from ``message.time``.  Nothing is fetched
    unless ``self._enabled`` is truthy.
    """
    if self._enabled:

        Log.info("Checking mail", Log.COMPONENT_MAIL)

        for message in self._get_messages():

            Log.debug(
                'Storing email: "{}"'.format(message.subject),
                Log.COMPONENT_MAIL
            )

            t = int(time.mktime(message.time.timetuple()))
            file_name = os.path.join(Consumer.CONSUME, message.file_name)

            with open(file_name, "wb") as f:
                f.write(message.attachment.data)

            # Set the timestamps only after the file has been closed:
            # buffered data flushed on close would otherwise bump the
            # modification time back to "now".
            os.utime(file_name, times=(t, t))

    # NOTE(review): recorded on every call, even when fetching is
    # disabled -- confirm this matches the intended polling logic.
    self.last_checked = datetime.datetime.now()
def consume(self):
    """
    Process each eligible file found in ``self.CONSUME``.

    A directory entry is passed over when it is not a regular file, when
    its joined path does not match ``self.REGEX_TITLE``, when it is already
    in ``self._ignore``, or when ``self._is_ready`` returns a truthy value.

    For the rest, greyscale images are produced with
    ``self._get_greyscale`` and fed to ``self._get_ocr``.  An ``OCRError``
    adds the document to ``self._ignore`` and is logged; otherwise the text
    is stored and ``self._cleanup(pngs, doc)`` is called.
    """
    for name in os.listdir(self.CONSUME):
        doc = os.path.join(self.CONSUME, name)

        # Checks run in this order and stop at the first one that applies.
        skip = (
            not os.path.isfile(doc)
            or not re.match(self.REGEX_TITLE, doc)
            or doc in self._ignore
            or self._is_ready(doc)
        )
        if skip:
            continue

        Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER)

        pngs = self._get_greyscale(doc)

        try:
            text = self._get_ocr(pngs)
        except OCRError:
            self._ignore.append(doc)
            Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
        else:
            self._store(text, doc)
            self._cleanup(pngs, doc)
def mathematica_session(math_session, extra_args, user):
    """
    Runs the given Math session on the Thorek01 server.

    If a ``MathKernel`` or ``Mathematica`` process is already running on
    the remote host, the task is retried every five minutes (up to 288
    times, i.e. for 24 hours).  Otherwise the session code returned by
    ``math_session.replace_exports`` has its ``{extra_args}`` and
    ``{data}`` placeholders filled from ``extra_args``, is written to
    ``package.m`` in a remote temporary directory and is executed with
    ``math -script`` under ``xvfb-run``.

    The remote directory is fetched into a new time-stamped folder below
    ``MEDIA_ROOT/<username>/output``.  Output returned by the run is logged
    and saved there as ``response.txt``.  A completion mail is sent when
    the user has a non-empty ``email``.

    The SSH connection is closed on every exit path, including the retry.
    """
    ssh = pysftp.Connection(settings.SSH_HOST,
                            username=settings.SSH_USER,
                            password=settings.SSH_PASSWORD)
    try:
        if ssh.execute('pidof MathKernel') or ssh.execute('pidof Mathematica'):
            # retry() raises by itself; the explicit ``raise`` makes it
            # obvious that nothing below runs.  ``//`` keeps max_retries
            # an integer on Python 3 as well.
            raise mathematica_session.retry(countdown=5 * 60,
                                            max_retries=(60 // 5) * 24)

        temp = ssh.execute('mktemp -d')[0].rstrip('\n')
        ssh.chdir(temp)

        code, _matches = math_session.replace_exports(temp)
        code = code.replace('{extra_args}', extra_args['extra_args'])
        code = code.replace('{data}', extra_args['data'])
        Log.debug('atmospherics.analysis.tasks.mathematica_session', code)

        # The code is embedded in a single-quoted shell string below, so
        # each ' is rewritten as '"'"' (close quote, quoted ', reopen).
        code = code.replace("'", '\'"\'"\'')
        command = "echo '{}' > {}/package.m".format(code, temp)
        ssh.execute(command)

        ret = ssh.execute('xvfb-run -s "-screen 0 640x480x24" math -script {}/package.m &\n\n\n\n'.format(temp))

        target = os.path.join(
            settings.MEDIA_ROOT, user.username, 'output',
            math_session.name.replace(' ', '_') + datetime.now().strftime('_%m%d%y_%H%M'))
        os.makedirs(target)
        ssh.get_d(temp, target)

        if ret:
            message = ''' A message was returned by mathematica script {}.m:\n (trimmed to contain only the last 100 lines) \n\n {} '''.format(math_session.name, '\n'.join(ret[-100:]))
            Log.info('atmospherics.analysis.tasks.mathematica_session', message)
            with open(os.path.join(target, 'response.txt'), 'w') as logfile:
                logfile.write(message)

        # The remote temporary directory is left in place (its removal was
        # already disabled in the original code).
        ssh.execute('disown')
    finally:
        ssh.close()

    message = 'Mathematica session {} run.\nOutput saved to:\nhttp://atmospherics.lossofgenerality.com/{}'.format(math_session.name, os.path.join('output', user.username, os.path.split(target)[1]))
    Log.info('atmospherics.analysis.tasks.mathematica_session', message)

    # Require an actual address, not just the attribute: an empty string
    # would produce a mail without a usable recipient.
    if getattr(user, 'email', None):
        subject = 'Atmospherics Mathematica {} Complete'.format(math_session.name)
        from_email = 'Atmospherics<*****@*****.**>'
        email = EmailMultiAlternatives(subject, message, from_email, [user.email])
        email.send()
def __init__(self, data, verbosity=1):
    """
    Parse the raw bytes of an email and keep its single attachment.

    Cribbed heavily from
    https://www.ianlewis.org/en/parsing-email-attachments-python

    ``data`` is parsed with ``BytesParser`` using ``policy.default``.  The
    subject and body are stored, ``check_subject``/``check_body`` are run
    and the time is taken from the message by ``_set_time``.  Every part
    whose Content-Disposition starts with ``attachment`` is base64-decoded
    and wrapped in an ``Attachment``.

    Raises ``InvalidMessageError`` unless exactly one attachment is found.
    """
    self.verbosity = verbosity

    self.subject = None
    self.time = None
    self.attachment = None

    parsed = BytesParser(policy=policy.default).parsebytes(data)
    self.subject = str(parsed["Subject"]).replace("\r\n", "")
    self.body = str(parsed.get_body())

    self.check_subject()
    self.check_body()

    self._set_time(parsed)

    Log.info(
        'Importing email: "{}"'.format(self.subject), Log.COMPONENT_MAIL)

    found = []
    for part in parsed.walk():

        disposition = part.get("Content-Disposition")
        if not disposition:
            continue

        # Only the first token of the header decides whether this part
        # counts as an attachment.
        leading_token = disposition.strip().split(";")[0]
        if leading_token.lower() != "attachment":
            continue

        payload = b64decode(part.get_payload())
        found.append(
            Attachment(payload, content_type=part.get_content_type()))

    if not found:
        raise InvalidMessageError(
            "There don't appear to be any attachments to this message")

    if len(found) > 1:
        raise InvalidMessageError(
            "There's more than one attachment to this message. It cannot "
            "be indexed automatically."
        )

    self.attachment = found[0]
def __init__(self, data, verbosity=1):
    """
    Build the message from raw email bytes, extracting one attachment.

    Cribbed heavily from
    https://www.ianlewis.org/en/parsing-email-attachments-python

    The bytes in ``data`` are parsed with ``BytesParser`` and
    ``policy.default``.  Subject and body are stored as strings, then
    ``check_subject``, ``check_body`` and ``_set_time`` are called.  Parts
    whose Content-Disposition header begins with ``attachment`` have their
    payload base64-decoded and are collected as ``Attachment`` objects.

    Raises ``InvalidMessageError`` when the message carries no attachment
    or more than one.
    """
    self.verbosity = verbosity
    self.subject = None
    self.time = None
    self.attachment = None

    message = BytesParser(policy=policy.default).parsebytes(data)

    raw_subject = str(message["Subject"])
    self.subject = raw_subject.replace("\r\n", "")
    self.body = str(message.get_body())

    self.check_subject()
    self.check_body()
    self._set_time(message)

    Log.info('Importing email: "{}"'.format(self.subject),
             Log.COMPONENT_MAIL)

    attachments = []
    for part in message.walk():
        header = part.get("Content-Disposition")
        if header:
            # The disposition type is whatever precedes the first ";".
            disposition_type = header.strip().split(";")[0].lower()
            if disposition_type == "attachment":
                decoded = b64decode(part.get_payload())
                attachments.append(
                    Attachment(decoded,
                               content_type=part.get_content_type()))

    count = len(attachments)
    if count == 0:
        raise InvalidMessageError(
            "There don't appear to be any attachments to this message")
    if count > 1:
        raise InvalidMessageError(
            "There's more than one attachment to this message. It cannot "
            "be indexed automatically.")

    self.attachment = attachments[0]
def mathematica_package(math_package, extra_args, user):
    """
    Runs the given Math package on the Thorek01 server.

    The package code returned by ``math_package.replace_exports`` has its
    ``{extra_args}`` and ``{data}`` placeholders filled from
    ``extra_args``, is written to ``package.m`` in a remote temporary
    directory and is executed with ``math -script`` under ``xvfb-run``.
    Output returned by the run is logged (last 100 entries).

    The remote directory is then fetched into a new time-stamped folder
    below ``MEDIA_ROOT/<username>/output`` and a completion mail is sent
    when the user has a non-empty ``email``.

    The SSH connection is closed on every exit path, including when one of
    the remote steps raises.
    """
    ssh = pysftp.Connection(settings.SSH_HOST,
                            username=settings.SSH_USER,
                            password=settings.SSH_PASSWORD)
    try:
        temp = ssh.execute('mktemp -d')[0].rstrip('\n')
        ssh.chdir(temp)

        code, _matches = math_package.replace_exports(temp)
        code = code.replace('{extra_args}', extra_args['extra_args'])
        code = code.replace('{data}', extra_args['data'])
        Log.debug('atmospherics.analysis.tasks.mathematica_package', code)

        # The code is embedded in a single-quoted shell string below, so
        # each ' is rewritten as '"'"' (close quote, quoted ', reopen).
        code = code.replace("'", '\'"\'"\'')
        command = "echo '{}' > {}/package.m".format(code, temp)
        ssh.execute(command)

        ret = ssh.execute('xvfb-run -s "-screen 0 640x480x24" math -script {}/package.m &\n\n\n\n'.format(temp))
        if ret:
            message = 'A message was returned by mathematica script {}.m:\n{}'.format(math_package.name, ret[-100:])
            Log.info('atmospherics.analysis.tasks.mathematica_package', message)

        # package.m is not removed here, so it is downloaded along with
        # the generated output (its removal was already disabled in the
        # original code).
        target = os.path.join(
            settings.MEDIA_ROOT, user.username, 'output',
            math_package.name.replace(' ', '_') + datetime.now().strftime('_%m%d%y_%H%M'))
        os.makedirs(target)
        ssh.get_d(temp, target)

        # The remote temporary directory is likewise left in place.
        ssh.execute('disown')
    finally:
        ssh.close()

    message = 'Mathematica package {} run.\nOutput saved to:\nhttp://atmospherics.lossofgenerality.com/{}'.format(math_package.name, os.path.join('output', user.username, os.path.split(target)[1]))
    Log.info('atmospherics.analysis.tasks.mathematica_package', message)

    # getattr keeps this working for user objects without an ``email``
    # attribute; an empty address is skipped exactly as before.
    if getattr(user, 'email', None):
        subject = 'Atmospherics Mathematica {} Complete'.format(math_package.name)
        from_email = 'Atmospherics<*****@*****.**>'
        email = EmailMultiAlternatives(subject, message, from_email, [user.email])
        email.send()
def pull(self):
    """
    Fetch all available mail at the target address and store it locally in
    the consumption directory so that the file consumer can pick it up and
    do its thing.

    For every message returned by ``self._get_messages()`` the attachment
    data is written to ``Consumer.CONSUME/<message.file_name>``; the file's
    access and modification times are then set from ``message.time``.
    Mail is only fetched while ``self._enabled`` is truthy.
    """
    if self._enabled:
        Log.info("Checking mail", Log.COMPONENT_MAIL)

        for message in self._get_messages():
            Log.debug('Storing email: "{}"'.format(message.subject),
                      Log.COMPONENT_MAIL)

            t = int(time.mktime(message.time.timetuple()))
            file_name = os.path.join(Consumer.CONSUME, message.file_name)

            with open(file_name, "wb") as f:
                f.write(message.attachment.data)

            # Applied once the file is closed; doing it while the handle
            # is still open lets the final flush overwrite the
            # modification time with the current time.
            os.utime(file_name, times=(t, t))

    # NOTE(review): updated on every call, including when fetching is
    # disabled -- confirm this matches the intended polling logic.
    self.last_checked = datetime.datetime.now()