예제 #1
0
    def fetch_data_math(self):
        """
        Run this source's Mathematica code on the remote host and download
        the files it leaves behind.

        The code returned by ``self.replace_exports`` is written to
        ``package.m`` in a fresh remote temporary directory and executed with
        ``math -script`` under ``xvfb-run``.  The directory is then fetched
        with ``ssh.get_d`` into
        ``STATIC_ROOT/data/<client_directory>/<client_subdirectory()>``.

        The SSH connection is closed even if one of the remote steps raises.
        """
        log_source = 'atmospherics.data.models.MathematicaSource.fetch_data'
        ssh = pysftp.Connection(settings.SSH_HOST, username=settings.SSH_USER, password=settings.SSH_PASSWORD)

        # Everything that talks to the server lives in the try block so the
        # connection is released on every exit path.
        try:
            temp = ssh.execute('mktemp -d')[0].rstrip('\n')
            ssh.chdir(temp)

            code, _ = self.replace_exports(temp)

            Log.debug(log_source, code)
            # Escape single quotes for the single-quoted shell string below:
            # close the quote, emit a double-quoted ', reopen the quote.
            code = code.replace("'", '\'"\'"\'')
            command = "echo '{}' > {}/package.m".format(code, temp)
            ssh.execute(command)

            ret = ssh.execute('xvfb-run -s "-screen 0 640x480x24" math -script {}/package.m &\n\n\n\n'.format(temp))

            if ret:
                message = 'A message was returned by mathematica script  {}.m:\n{}'.format(self.name, ret[-100:])
                Log.info(log_source, message)

            # Remove the script itself so it is not downloaded with the output.
            ssh.execute('rm {}'.format(os.path.join(temp, 'package.m')))

            # NOTE(review): eval() executes whatever is stored in
            # ``client_subdirectory``; make sure that value can never come
            # from untrusted input.
            client_subdirectory = eval(self.client_subdirectory)
            target = os.path.join(settings.STATIC_ROOT, 'data', self.client_directory, client_subdirectory())
            if not os.path.exists(target):
                os.makedirs(target)
            ssh.get_d(temp, target)

            # NOTE: the remote temporary directory is not removed here.
            #ssh.execute('rm -rf {}'.format(temp))
            ssh.execute('disown')
        finally:
            ssh.close()

        message = 'MathematicaSource {} run.\nOutput saved to:\nhttp://atmospherics.lossofgenerality.com/{}'.format(self.name, target)
        Log.info(log_source, message)
예제 #2
0
    def pull(self):
        """
        Fetch all available mail at the target address and store it locally in
        the consumption directory so that the file consumer can pick it up and
        do its thing.

        Each attachment is written to ``Consumer.CONSUME`` and the file's
        access and modification times are set to the message's time.
        ``last_checked`` is updated whether or not fetching is enabled.
        """

        if self._enabled:

            Log.info("Checking mail", Log.COMPONENT_MAIL)

            for message in self._get_messages():

                Log.debug(
                    'Storing email: "{}"'.format(message.subject),
                    Log.COMPONENT_MAIL
                )

                t = int(time.mktime(message.time.timetuple()))
                file_name = os.path.join(Consumer.CONSUME, message.file_name)
                with open(file_name, "wb") as f:
                    f.write(message.attachment.data)

                # Stamp the file only after it has been closed: data that is
                # still buffered gets flushed on close, which would otherwise
                # reset the modification time we just set.
                os.utime(file_name, times=(t, t))

        self.last_checked = datetime.datetime.now()
예제 #3
0
    def _cleanup(self, tempdir, doc):
        """
        Remove the temporary directory (recursively) and then the document,
        logging each removal at debug level before it happens.
        """
        steps = (
            ("Deleting directory {}", tempdir, shutil.rmtree),
            ("Deleting document {}", doc, os.unlink),
        )

        for template, path, remove in steps:
            Log.debug(template.format(path), Log.COMPONENT_CONSUMER)
            remove(path)
예제 #4
0
    def _cleanup(self, pngs, doc):
        """
        Delete the scratch images belonging to a document, then the document
        itself.

        The numeric prefix of the first entry in ``pngs`` is turned into a
        glob and every match below ``self.SCRATCH`` is removed.  When ``pngs``
        is empty only ``doc`` is deleted.
        """

        doomed = []

        if pngs:
            # ".../<prefix>-<page>.png" becomes "<prefix>*".  The dot is
            # escaped so that only a literal ".png" suffix is recognised.
            png_glob = os.path.join(
                self.SCRATCH,
                re.sub(r"^.*/(\d+)-\d+\.png$", "\\1*", pngs[0]))
            doomed.extend(glob.glob(png_glob))

        doomed.append(doc)

        for f in doomed:
            Log.debug("Deleting {}".format(f), Log.COMPONENT_CONSUMER)
            os.unlink(f)
예제 #5
0
 def _guess_language(text):
     """
     Return the language that ``langdetect.detect`` reports for ``text``.

     Detection is best-effort: any exception is logged as a warning and
     ``None`` is returned instead.
     """
     try:
         guess = langdetect.detect(text)
         Log.debug("Language detected: {}".format(guess),
                   Log.COMPONENT_CONSUMER)
         return guess
     except Exception as e:
         # Logged under the consumer component, like the debug message above.
         Log.warning("Language detection error: {}".format(e),
                     Log.COMPONENT_CONSUMER)
         return None
예제 #6
0
def mathematica_session(math_session, extra_args, user):
    """
    Runs the given Math session on the Thorek01 server.

    The session code, with its ``{extra_args}`` and ``{data}`` placeholders
    filled in from ``extra_args``, is written to ``package.m`` in a fresh
    remote temporary directory and executed with ``math -script`` under
    ``xvfb-run``.  The directory is fetched into a timestamped folder below
    ``MEDIA_ROOT/<username>/output``; any output returned by the remote
    command is logged and saved there as ``response.txt``.  A completion
    notice is emailed when the user has an email address.

    If ``pidof`` reports a running MathKernel or Mathematica process, the
    task asks to be retried instead.

    The SSH connection is closed even if one of the remote steps raises.
    """
    log_source = 'atmospherics.analysis.tasks.mathematica_session'
    ssh = pysftp.Connection(settings.SSH_HOST, username=settings.SSH_USER, password=settings.SSH_PASSWORD)

    # Everything that talks to the server lives in the try block so the
    # connection is released on every exit path.
    try:
        if ssh.execute('pidof MathKernel') or ssh.execute('pidof Mathematica'):
            # NOTE(review): relies on retry() raising to end this run --
            # confirm against the task framework in use.  Floor division
            # keeps max_retries an integer on Python 3 as well.
            mathematica_session.retry(countdown=5*60, max_retries=(60//5)*24)

        temp = ssh.execute('mktemp -d')[0].rstrip('\n')
        ssh.chdir(temp)

        code, _ = math_session.replace_exports(temp)

        code = code.replace('{extra_args}', extra_args['extra_args'])
        code = code.replace('{data}', extra_args['data'])

        Log.debug(log_source, code)
        # Escape single quotes for the single-quoted shell string below:
        # close the quote, emit a double-quoted ', reopen the quote.
        code = code.replace("'", '\'"\'"\'')
        command = "echo '{}' > {}/package.m".format(code, temp)
        ssh.execute(command)

        ret = ssh.execute('xvfb-run -s "-screen 0 640x480x24" math -script {}/package.m &\n\n\n\n'.format(temp))

        target = os.path.join(settings.MEDIA_ROOT, user.username, 'output', math_session.name.replace(' ', '_')+datetime.now().strftime('_%m%d%y_%H%M'))

        # The folder name only has minute resolution, so a second run within
        # the same minute must not fail on the existing directory.
        if not os.path.exists(target):
            os.makedirs(target)
        ssh.get_d(temp, target)

        # NOTE: the remote temporary directory is not removed here.
        #ssh.execute('rm -rf {}'.format(temp))
        ssh.execute('disown')
    finally:
        ssh.close()

    if ret:
        message = '''
            A message was returned by mathematica script  {}.m:\n
            (trimmed to contain only the last 100 lines) \n\n
            {}
        '''.format(math_session.name, '\n'.join(ret[-100:]))
        Log.info(log_source, message)

        with open(os.path.join(target, 'response.txt'), 'w') as logfile:
            logfile.write(message)

    message = 'Mathematica session {} run.\nOutput saved to:\nhttp://atmospherics.lossofgenerality.com/{}'.format(math_session.name, os.path.join('output', user.username, os.path.split(target)[1]))
    Log.info(log_source, message)

    # Only send mail when there is an actual address to send it to.
    if getattr(user, 'email', None):
        subject = 'Atmospherics Mathematica {} Complete'.format(math_session.name)
        from_email = 'Atmospherics<*****@*****.**>'
        email = EmailMultiAlternatives(subject,
                                       message,
                                       from_email,
                                       [user.email])
        email.send()
예제 #7
0
 def _guess_language(text):
     """
     Return the language that ``langdetect.detect`` reports for ``text``.

     Detection is best-effort: any exception is logged as a warning and
     ``None`` is returned instead.
     """
     try:
         guess = langdetect.detect(text)
         Log.debug(
             "Language detected: {}".format(guess),
             Log.COMPONENT_CONSUMER
         )
         return guess
     except Exception as e:
         # Logged under the consumer component, like the debug message above.
         Log.warning(
             "Language detection error: {}".format(e),
             Log.COMPONENT_CONSUMER
         )
         return None
예제 #8
0
    def _get_ocr(self, pngs):
        """
        OCR the document, choosing the OCR language by trial and error.

        The middle page is OCR'd with the default language and the result is
        used to guess the document's language.  Depending on the guess, the
        remaining pages are assembled around that sample or every page is
        OCR'd again in the guessed language.  Raises ``OCRError`` when there
        are no pages, or when detection or OCR fails and
        ``settings.FORGIVING_OCR`` is off.
        """

        if not pngs:
            raise OCRError

        Log.debug("OCRing the document", Log.COMPONENT_CONSUMER)

        # int() rounds the division down, so a single page yields index 0
        middle = int(len(pngs) / 2)
        raw_text = self._ocr([pngs[middle]], self.DEFAULT_OCR_LANGUAGE)

        guessed_language = self._guess_language(raw_text)

        if not (guessed_language and guessed_language in ISO639):
            Log.warning("Language detection failed!", Log.COMPONENT_CONSUMER)
            if not settings.FORGIVING_OCR:
                raise OCRError
            Log.warning(
                "As FORGIVING_OCR is enabled, we're going to make the best "
                "with what we have.",
                Log.COMPONENT_CONSUMER
            )
            return self._assemble_ocr_sections(pngs, middle, raw_text)

        if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
            # The sample page was already read in the right language
            return self._assemble_ocr_sections(pngs, middle, raw_text)

        try:
            return self._ocr(pngs, ISO639[guessed_language])
        except pyocr.pyocr.tesseract.TesseractError:
            if not settings.FORGIVING_OCR:
                raise OCRError
            Log.warning(
                "OCR for {} failed, but we're going to stick with what "
                "we've got since FORGIVING_OCR is enabled.".format(
                    guessed_language
                ),
                Log.COMPONENT_CONSUMER
            )
            return self._assemble_ocr_sections(pngs, middle, raw_text)
예제 #9
0
    def _get_ocr(self, pngs):
        """
        Produce the best OCR text we can for the document, using simple
        language detection to decide which OCR language to use.

        Raises ``OCRError`` when ``pngs`` is empty, or when language detection
        or the language-specific OCR run fails while
        ``settings.FORGIVING_OCR`` is disabled.
        """

        if not pngs:
            raise OCRError

        Log.debug("OCRing the document", Log.COMPONENT_CONSUMER)

        # Rounded down by int(), so one page gives index 0
        middle = int(len(pngs) / 2)
        sample_page = [pngs[middle]]
        raw_text = self._ocr(sample_page, self.DEFAULT_OCR_LANGUAGE)

        guessed_language = self._guess_language(raw_text)
        forgiving = "FORGIVING_OCR is enabled"

        if not guessed_language or guessed_language not in ISO639:
            Log.warning("Language detection failed!", Log.COMPONENT_CONSUMER)
            if settings.FORGIVING_OCR:
                Log.warning(
                    "As {}, we're going to make the best with what "
                    "we have.".format(forgiving),
                    Log.COMPONENT_CONSUMER
                )
                return self._assemble_ocr_sections(pngs, middle, raw_text)
            raise OCRError

        if ISO639[guessed_language] != self.DEFAULT_OCR_LANGUAGE:
            try:
                return self._ocr(pngs, ISO639[guessed_language])
            except pyocr.pyocr.tesseract.TesseractError:
                if settings.FORGIVING_OCR:
                    Log.warning(
                        "OCR for {} failed, but we're going to stick with "
                        "what we've got since {}.".format(
                            guessed_language, forgiving
                        ),
                        Log.COMPONENT_CONSUMER
                    )
                    return self._assemble_ocr_sections(
                        pngs, middle, raw_text)
                raise OCRError

        # The default language was the right one: reuse the sample text
        return self._assemble_ocr_sections(pngs, middle, raw_text)
예제 #10
0
    def _get_greyscale(self, tempdir, doc):
        """
        Run ``self.CONVERT`` on ``doc`` (``-density 300 -depth 8
        -type grayscale``), writing the numbered output files into
        ``tempdir``, and return the sorted paths of the regular files there
        whose names start with "convert".
        """

        Log.debug("Generating greyscale image from {}".format(doc),
                  Log.COMPONENT_CONSUMER)

        output_pattern = os.path.join(tempdir, "convert-%04d.jpg")
        command = (
            self.CONVERT,
            "-density", "300",
            "-depth", "8",
            "-type", "grayscale",
            doc, output_pattern,
        )
        subprocess.Popen(command).wait()

        pages = [
            os.path.join(tempdir, name)
            for name in os.listdir(tempdir)
            if name.startswith("convert")
        ]
        pages = [page for page in pages if os.path.isfile(page)]
        pages.sort()
        return pages
예제 #11
0
    def _ocr(self, pngs, lang):
        """
        Performs a single OCR attempt.
        """

        if not pngs:
            return ""

        Log.debug("Parsing for {}".format(lang), Log.COMPONENT_CONSUMER)

        with Pool(processes=self.THREADS) as pool:
            r = pool.map(self.image_to_string, itertools.product(pngs, [lang]))
            r = " ".join(r)

        # Strip out excess white space to allow matching to go smoother
        return re.sub(r"\s+", " ", r)
예제 #12
0
    def _get_greyscale(self, tempdir, doc):
        """
        Run ``self.CONVERT`` on ``doc`` (``-density 300 -depth 8
        -type grayscale``), writing the numbered output files into
        ``tempdir``, and return the sorted paths of the regular files there
        whose names start with "convert".
        """

        Log.debug(
            "Generating greyscale image from {}".format(doc),
            Log.COMPONENT_CONSUMER
        )

        output_pattern = os.path.join(tempdir, "convert-%04d.jpg")
        convert = subprocess.Popen((
            self.CONVERT, "-density", "300", "-depth", "8",
            "-type", "grayscale", doc, output_pattern
        ))
        convert.wait()

        generated = []
        for name in os.listdir(tempdir):
            if not name.startswith("convert"):
                continue
            generated.append(os.path.join(tempdir, name))

        return sorted(path for path in generated if os.path.isfile(path))
예제 #13
0
    def _get_greyscale(self, doc):
        """
        Run ``self.CONVERT`` on ``doc`` (``-density 300 -depth 8
        -type grayscale``), writing to a randomly numbered ``.png`` name in
        ``self.SCRATCH``, and return the sorted list of scratch files whose
        names start with that number.
        """

        Log.debug(
            "Generating greyscale image from {}".format(doc),
            Log.COMPONENT_CONSUMER
        )

        # Seven-digit random number shared by all files of this run
        token = random.randint(1000000, 9999999)
        destination = os.path.join(self.SCRATCH, "{}.png".format(token))

        convert = subprocess.Popen((
            self.CONVERT, "-density", "300", "-depth", "8",
            "-type", "grayscale", doc, destination
        ))
        convert.wait()

        produced = glob.glob(os.path.join(self.SCRATCH, "{}*".format(token)))
        produced.sort()
        return produced
예제 #14
0
    def _ocr(self, pngs, lang):
        """
        Performs a single OCR attempt.
        """

        if not pngs:
            return ""

        Log.debug("Parsing for {}".format(lang), Log.COMPONENT_CONSUMER)

        with Pool(processes=self.THREADS) as pool:
            r = pool.map(
                self.image_to_string, itertools.product(pngs, [lang]))
            r = " ".join(r)

        # Strip out excess white space to allow matching to go smoother
        return re.sub(r"\s+", " ", r)
예제 #15
0
def mathematica_package(math_package, extra_args, user):
    """
    Runs the given Math package on the Thorek01 server.

    The package code, with its ``{extra_args}`` and ``{data}`` placeholders
    filled in from ``extra_args``, is written to ``package.m`` in a fresh
    remote temporary directory and executed with ``math -script`` under
    ``xvfb-run``.  The directory is fetched into a timestamped folder below
    ``MEDIA_ROOT/<username>/output`` and a completion notice is emailed when
    the user has an email address.

    The SSH connection is closed even if one of the remote steps raises.
    """
    log_source = 'atmospherics.analysis.tasks.mathematica_package'
    ssh = pysftp.Connection(settings.SSH_HOST, username=settings.SSH_USER, password=settings.SSH_PASSWORD)

    # Everything that talks to the server lives in the try block so the
    # connection is released on every exit path.
    try:
        temp = ssh.execute('mktemp -d')[0].rstrip('\n')
        ssh.chdir(temp)

        code, _ = math_package.replace_exports(temp)

        code = code.replace('{extra_args}', extra_args['extra_args'])
        code = code.replace('{data}', extra_args['data'])

        Log.debug(log_source, code)
        # Escape single quotes for the single-quoted shell string below:
        # close the quote, emit a double-quoted ', reopen the quote.
        code = code.replace("'", '\'"\'"\'')
        command = "echo '{}' > {}/package.m".format(code, temp)
        ssh.execute(command)

        ret = ssh.execute('xvfb-run -s "-screen 0 640x480x24" math -script {}/package.m &\n\n\n\n'.format(temp))

        if ret:
            message = 'A message was returned by mathematica script  {}.m:\n{}'.format(math_package.name, ret[-100:])
            Log.info(log_source, message)

        #ssh.execute('rm {}'.format(os.path.join(temp, 'package.m')))

        target = os.path.join(settings.MEDIA_ROOT, user.username, 'output', math_package.name.replace(' ', '_')+datetime.now().strftime('_%m%d%y_%H%M'))

        # The folder name only has minute resolution, so a second run within
        # the same minute must not fail on the existing directory.
        if not os.path.exists(target):
            os.makedirs(target)
        ssh.get_d(temp, target)

        # NOTE: the remote temporary directory is not removed here.
        #ssh.execute('rm -rf {}'.format(temp))
        ssh.execute('disown')
    finally:
        ssh.close()

    message = 'Mathematica package {} run.\nOutput saved to:\nhttp://atmospherics.lossofgenerality.com/{}'.format(math_package.name, os.path.join('output', user.username, os.path.split(target)[1]))
    Log.info(log_source, message)

    # Tolerates user objects without an ``email`` attribute as well as empty
    # addresses.
    if getattr(user, 'email', None):
        subject = 'Atmospherics Mathematica {} Complete'.format(math_package.name)
        from_email = 'Atmospherics<*****@*****.**>'
        email = EmailMultiAlternatives(subject,
                                       message,
                                       from_email,
                                       [user.email])
        email.send()
예제 #16
0
    def _store(self, text, doc):
        """
        Create the ``Document`` record for ``doc``, tag it, and write the
        output of ``GnuPG.encrypted`` for the file to the record's
        ``source_path``.

        Sender, title, tags and file type come from
        ``_guess_attributes_from_name``; the tags matching ``text`` are
        combined with those.  Both timestamps are derived from the file's
        modification time.
        """

        sender, title, tags, file_type = self._guess_attributes_from_name(doc)

        relevant_tags = set(Tag.match_all(text))
        relevant_tags.update(tags)

        mtime = os.stat(doc).st_mtime

        Log.debug("Saving record to database", Log.COMPONENT_CONSUMER)

        file_time = timezone.make_aware(
            datetime.datetime.fromtimestamp(mtime))
        document = Document.objects.create(
            sender=sender,
            title=title,
            content=text,
            file_type=file_type,
            created=file_time,
            modified=file_time)

        if relevant_tags:
            tag_names = ", ".join(tag.slug for tag in relevant_tags)
            Log.debug("Tagging with {}".format(tag_names),
                      Log.COMPONENT_CONSUMER)
            document.tags.add(*relevant_tags)

        with open(doc, "rb") as unencrypted, \
                open(document.source_path, "wb") as encrypted:
            Log.debug("Encrypting", Log.COMPONENT_CONSUMER)
            encrypted.write(GnuPG.encrypted(unencrypted))
예제 #17
0
    def _store(self, text, doc):
        """
        Save ``doc`` as a ``Document`` with ``text`` as its content, attach
        the relevant tags, and write the output of ``GnuPG.encrypted`` for
        the file to the record's ``source_path``.

        The record's attributes are guessed from the file name and both of
        its timestamps are taken from the file's modification time.
        """

        sender, title, tags, file_type = self._guess_attributes_from_name(doc)

        relevant_tags = set(Tag.match_all(text))
        relevant_tags.update(tags)

        stats = os.stat(doc)

        Log.debug("Saving record to database", Log.COMPONENT_CONSUMER)

        def file_time():
            # Timezone-aware version of the file's modification time
            return timezone.make_aware(
                datetime.datetime.fromtimestamp(stats.st_mtime))

        document = Document.objects.create(
            sender=sender,
            title=title,
            content=text,
            file_type=file_type,
            created=file_time(),
            modified=file_time()
        )

        if relevant_tags:
            slugs = [tag.slug for tag in relevant_tags]
            Log.debug(
                "Tagging with {}".format(", ".join(slugs)),
                Log.COMPONENT_CONSUMER
            )
            document.tags.add(*relevant_tags)

        with open(doc, "rb") as unencrypted, \
                open(document.source_path, "wb") as encrypted:
            Log.debug("Encrypting", Log.COMPONENT_CONSUMER)
            encrypted.write(GnuPG.encrypted(unencrypted))
예제 #18
0
파일: mail.py 프로젝트: zedster/paperless
    def pull(self):
        """
        Fetch all available mail at the target address and store it locally in
        the consumption directory so that the file consumer can pick it up and
        do its thing.

        Each attachment is written to ``Consumer.CONSUME`` and the file's
        access and modification times are set to the message's time.
        ``last_checked`` is updated whether or not fetching is enabled.
        """

        if self._enabled:

            Log.info("Checking mail", Log.COMPONENT_MAIL)

            for message in self._get_messages():

                Log.debug('Storing email: "{}"'.format(message.subject),
                          Log.COMPONENT_MAIL)

                t = int(time.mktime(message.time.timetuple()))
                file_name = os.path.join(Consumer.CONSUME, message.file_name)
                with open(file_name, "wb") as f:
                    f.write(message.attachment.data)

                # Stamp the file only after it has been closed: data that is
                # still buffered gets flushed on close, which would otherwise
                # reset the modification time we just set.
                os.utime(file_name, times=(t, t))

        self.last_checked = datetime.datetime.now()
예제 #19
0
 def _cleanup_doc(doc):
     """Log the removal of ``doc`` at debug level, then delete the file."""
     notice = "Deleting document {}".format(doc)
     Log.debug(notice, Log.COMPONENT_CONSUMER)
     os.unlink(doc)
예제 #20
0
 def _cleanup_tempdir(d):
     """Log the removal of directory ``d``, then delete it recursively."""
     notice = "Deleting directory {}".format(d)
     Log.debug(notice, Log.COMPONENT_CONSUMER)
     shutil.rmtree(d)
예제 #21
0
 def _cleanup_tempdir(d):
     """
     Remove the directory ``d`` and everything below it, after logging the
     removal at debug level.
     """
     Log.debug(
         "Deleting directory {}".format(d),
         Log.COMPONENT_CONSUMER
     )
     shutil.rmtree(d)
예제 #22
0
 def _cleanup_doc(doc):
     """
     Delete the file ``doc`` after logging the removal at debug level.
     """
     Log.debug(
         "Deleting document {}".format(doc),
         Log.COMPONENT_CONSUMER
     )
     os.unlink(doc)