Пример #1
0
    def _ParseImagePage(self, html, page_url):
        """Extract the full-size image URL and the tags from a work page.

        Args:
            html: markup of the work page.
            page_url: URL the markup came from; only used in error messages.

        Returns:
            ``(image_url, tags)``. ``image_url`` is the ``data-src``
            attribute of the element with class ``original-image``. ``tags``
            holds the text of each ``a.text`` link inside
            ``section.work-tags``, plus a ``creator:`` and a ``title:``
            entry when those elements are present and have text.

        Raises:
            HydrusExceptions.MimeException: the page links to the manga or
                ugoira viewer, i.e. it is not a single image.
        """

        if 'member_illust.php?mode=manga' in html:

            raise HydrusExceptions.MimeException(
                page_url +
                ' was manga, not a single image, so could not be downloaded.')

        if 'member_illust.php?mode=ugoira_view' in html:

            raise HydrusExceptions.MimeException(
                page_url +
                ' was ugoira, not a single image, so could not be downloaded.')

        soup = ClientParsing.GetSoup(html)

        #

        original_image = soup.find(class_='original-image')

        # e.g. http://i3.pixiv.net/img-original/img/2014/01/25/19/21/56/41171994_p0.jpg
        image_url = original_image['data-src']

        #

        tags_parent = soup.find('section', class_='work-tags')

        tags = []

        if tags_parent is not None:

            # <a href="/search.php?s_mode=s_tag_full&amp;word=%E3%83%8F..." class="text">[unicode tag here]</a>
            # NOTE(review): assumes soup is a BeautifulSoup tree, where
            # .string is None for a tag with several children -- skip those
            # rather than emit a None tag.
            tags = [
                link.string
                for link in tags_parent.find_all('a', class_='text')
                if link.string is not None
            ]

        user = soup.find('h1', class_='user')

        if user is not None and user.string is not None:

            tags.append('creator:' + user.string)

        title_parent = soup.find('section', class_=re.compile('work-info'))

        if title_parent is not None:

            title = title_parent.find('h1', class_='title')

            if title is not None and title.string is not None:

                tags.append('title:' + title.string)

        return (image_url, tags)
Пример #2
0
def CheckFFMPEGError( lines ):
    """Raise if the captured FFMPEG output signals a fatal problem.
    
    lines -- the lines of text FFMPEG produced, in order.
    
    Raises HydrusExceptions.MimeException when there is no output at all or
    when the final line contains 'Invalid data', and IOError when the final
    line contains 'No such file or directory'. Returns None otherwise.
    """
    
    if not len( lines ):
        
        raise HydrusExceptions.MimeException( 'Could not parse that file--no FFMPEG output given.' )
        
    
    # only the final line is inspected
    final_line = lines[-1]
    
    if "No such file or directory" in final_line:
        
        raise IOError( "File not found!" )
        
    elif 'Invalid data' in final_line:
        
        raise HydrusExceptions.MimeException( 'FFMPEG could not parse.' )
Пример #3
0
def GetFileInfo(path, mime=None):
    """Gather size, mime and media metadata for the file at ``path``.

    Args:
        path: filesystem path of the file to inspect.
        mime: already-known mime constant; when None it is looked up with
            ``GetMime(path)``.

    Returns:
        ``(size, mime, width, height, duration, num_frames, num_words)``.
        Fields the mime-specific handler does not supply stay None.

    Raises:
        HydrusExceptions.SizeException: the file is empty.
        HydrusExceptions.MimeException: the mime is not in
            ``HC.ALLOWED_MIMES``.
    """
    size = os.path.getsize(path)
    if size == 0:
        raise HydrusExceptions.SizeException('File is of zero length!')

    mime = GetMime(path) if mime is None else mime
    if mime not in HC.ALLOWED_MIMES:
        raise HydrusExceptions.MimeException('Filetype is not permitted!')

    width = height = duration = num_frames = num_words = None

    # Each branch hands the path to the handler for that family of mimes.
    if mime in (HC.IMAGE_JPEG, HC.IMAGE_PNG, HC.IMAGE_GIF):
        properties = HydrusImageHandling.GetImageProperties(path)
        ((width, height), duration, num_frames) = properties
    elif mime == HC.APPLICATION_FLASH:
        properties = HydrusFlashHandling.GetFlashProperties(path)
        ((width, height), duration, num_frames) = properties
    elif mime in (HC.IMAGE_APNG, HC.VIDEO_AVI, HC.VIDEO_FLV, HC.VIDEO_WMV,
                  HC.VIDEO_MOV, HC.VIDEO_MP4, HC.VIDEO_MKV, HC.VIDEO_WEBM,
                  HC.VIDEO_MPEG):
        properties = HydrusVideoHandling.GetFFMPEGVideoProperties(path)
        ((width, height), duration, num_frames) = properties
    elif mime == HC.APPLICATION_PDF:
        num_words = HydrusDocumentHandling.GetPDFNumWords(path)
    elif mime == HC.AUDIO_MP3:
        duration = HydrusAudioHandling.GetMP3Duration(path)
    elif mime == HC.AUDIO_OGG:
        duration = HydrusAudioHandling.GetOGGVorbisDuration(path)
    elif mime == HC.AUDIO_FLAC:
        duration = HydrusAudioHandling.GetFLACDuration(path)
    elif mime == HC.AUDIO_WMA:
        duration = HydrusAudioHandling.GetWMADuration(path)

    return (size, mime, width, height, duration, num_frames, num_words)
Пример #4
0
def CheckFFMPEGError(lines):
    """Raise if the captured FFMPEG output signals a fatal problem.

    Args:
        lines: the lines of text FFMPEG produced, in order.

    Raises:
        HydrusExceptions.MimeException: there is no output at all, or the
            final line contains 'Invalid data'.
        IOError: the final line contains 'No such file or directory'.
    """

    # Without this guard lines[-1] surfaces as a bare IndexError.
    if len(lines) == 0:

        raise HydrusExceptions.MimeException(
            'Could not parse that file--no FFMPEG output given.')

    last_line = lines[-1]

    if "No such file or directory" in last_line:

        raise IOError("File not found!")

    if 'Invalid data' in last_line:

        raise HydrusExceptions.MimeException('FFMPEG could not parse.')
Пример #5
0
def ParseFFMPEGFPS( lines ):
    """Estimate the frame rate from FFMPEG's video stream line.
    
    Both the 'tbr' and the 'fps' figure on the line are considered. A figure
    is thrown away when it ends in 'k' or is greater than 60.
    
    lines -- the lines of text FFMPEG produced.
    
    Returns the smallest surviving figure as a float, or None when neither
    figure is usable.
    
    Raises HydrusExceptions.MimeException when the video line cannot be
    found or a figure cannot be converted to a number.
    """
    
    try:
        
        line = ParseFFMPEGVideoLine( lines )
        
        possible_results = []
        
        for label in ( 'tbr', 'fps' ):
            
            match = re.search( "( [0-9]*.| )[0-9]* " + label, line )
            
            if match is None:
                
                continue
                
            
            # the match looks like ' 29.97 tbr', so the number is the second space-separated piece
            value = match.group( 0 ).split( ' ' )[1]
            
            if value.endswith( 'k' ):
                
                continue
                
            
            value = float( value )
            
            if value > 60:
                
                continue
                
            
            possible_results.append( value )
            
        
        if len( possible_results ) == 0:
            
            return None
            
        
        # in some cases, fps is 0.77 and tbr is incorrectly 20. extreme values cause bad results. let's try erroring on the side of slow
        # tbh in these cases, the frame are prob going to get counted manually anyway due to no neat ints at the end, so nbd
        
        return min( possible_results )
        
    except Exception: # not a bare except, so KeyboardInterrupt and SystemExit still propagate
        
        raise HydrusExceptions.MimeException( 'Error estimating framerate!' )
Пример #6
0
def ParseFFMPEGDuration( lines ):
    """Read the duration, in seconds, from FFMPEG's 'Duration:' line.
    
    The line looks like:
      Duration: 00:00:02.46, start: 0.033000, bitrate: 1069 kb/s
    
    lines -- the lines of text FFMPEG produced.
    
    Returns the duration as a float with the start offset subtracted, or
    None when FFMPEG reports 'Duration: N/A'. A start offset whose magnitude
    is above one second is ignored.
    
    Raises HydrusExceptions.MimeException when no 'Duration:' line exists or
    it cannot be parsed.
    """
    
    try:
        
        line = [ l for l in lines if 'Duration:' in l ][0]
        
        if 'Duration: N/A' in line:
            
            return None
            
        
        start_offset = 0
        
        if 'start:' in line:
            
            m = re.search( '(start\\: )' + '-?[0-9]+\\.[0-9]*', line )
            
            # skip the seven characters of 'start: ' to reach the number
            start_offset = float( line[ m.start() + 7 : m.end() ] )
            
            if abs( start_offset ) > 1.0: # once had a file with start offset of 957499 seconds jej
                
                start_offset = 0
                
            
        
        match = re.search("[0-9][0-9]:[0-9][0-9]:[0-9][0-9].[0-9][0-9]", line)
        
        # use the whole match: the pattern has no leading space, so skipping a character would drop the tens digit of the hours
        # a real list (not a lazy map object) keeps this working on python 3
        ( hours, minutes, seconds ) = [ float( part ) for part in match.group( 0 ).split( ':' ) ]
        
        duration = 3600 * hours + 60 * minutes + seconds
        
        duration -= start_offset
        
        return duration
        
    except Exception: # not a bare except, so KeyboardInterrupt and SystemExit still propagate
        
        raise HydrusExceptions.MimeException( 'Error reading duration!' )
Пример #7
0
def ParseFFMPEGVideoLine( lines ):
    """Return the first FFMPEG output line that describes a real video stream.
    
    A line qualifies when it contains 'Video: ' but neither 'Video: png' nor
    'Video: jpg'.
    
    Raises HydrusExceptions.MimeException when no line qualifies.
    """
    
    for candidate in lines:
        
        if 'Video: ' not in candidate:
            
            continue
            
        
        # mp3 says it has a 'png' video stream
        if 'Video: png' in candidate or 'Video: jpg' in candidate:
            
            continue
            
        
        return candidate
        
    
    raise HydrusExceptions.MimeException( 'Could not find video information!' )
Пример #8
0
def ParseFFMPEGMimeText(lines):
    """Pull the container format text out of FFMPEG's 'Input #0' line.

    For a line such as ``Input #0, matroska,webm, from 'm.mkv':`` the result
    is the part between ``Input #0, `` and ``, from``.

    Args:
        lines: the lines of text FFMPEG produced.

    Returns:
        The format text as a string.

    Raises:
        HydrusExceptions.MimeException: there is not exactly one line that
            starts with 'Input #0', or it cannot be processed.
    """

    try:

        # Unpacking into a one-element tuple insists on exactly one match.
        (input_line, ) = [l for l in lines if l.startswith('Input #0')]

        # Input #0, matroska, webm, from 'm.mkv':

        # Drop the ten characters of 'Input #0, '.
        text = input_line[10:]

        mime_text = text.split(', from')[0]

        return mime_text

    except Exception:
        # Not a bare except, so KeyboardInterrupt/SystemExit still propagate.

        raise HydrusExceptions.MimeException('Error reading mime!')
Пример #9
0
def ParseFFMPEGNumFramesManually( lines ):
    """Read the final frame count from FFMPEG's 'frame=' progress lines.
    
    lines -- the lines of text FFMPEG produced while rendering the file.
    
    Returns the integer that follows 'frame=' on the last progress line.
    
    Raises HydrusExceptions.MimeException when there is no progress line or
    the count cannot be read.
    """
    
    try:
        
        # the count is padded with spaces, so a wide enough number sits directly against the '='
        # matching on 'frame=' alone keeps those lines from being skipped
        frame_lines = [ l for l in lines if l.startswith( 'frame=' ) ]
        
        final_line = frame_lines[-1] # there will be several of these, counting up as the file renders. we hence want the final one
        
        # split() with no argument ignores the run of padding, whatever its length
        num_frames = int( final_line[ len( 'frame=' ) : ].split()[0] )
        
        return num_frames
        
    except Exception: # not a bare except, so KeyboardInterrupt and SystemExit still propagate
        
        raise HydrusExceptions.MimeException( 'Error counting number of frames!' )
Пример #10
0
def ParseFFMPEGFPS( lines ):
    """Estimate the frame rate from FFMPEG's video stream line.
    
    The 'tbr' figure is preferred; the 'fps' figure is only consulted when
    'tbr' is missing, ends in 'k' or is greater than 60. The same checks are
    then applied to 'fps'.
    
    lines -- the lines of text FFMPEG produced.
    
    Returns the chosen figure as a float, or None when neither is usable.
    
    Raises HydrusExceptions.MimeException when the video line cannot be
    found or a figure cannot be converted to a number.
    """
    
    try:
        
        line = ParseFFMPEGVideoLine( lines )
        
        # get the frame rate
        
        for label in ( 'tbr', 'fps' ):
            
            match = re.search( "( [0-9]*.| )[0-9]* " + label, line )
            
            if match is None:
                
                continue
                
            
            # the match looks like ' 29.97 tbr', so the number is the second space-separated piece
            value = match.group( 0 ).split( ' ' )[1]
            
            if value.endswith( 'k' ):
                
                continue
                
            
            value = float( value )
            
            if value > 60:
                
                continue
                
            
            return value
            
        
        return None
        
    except Exception: # not a bare except, so KeyboardInterrupt and SystemExit still propagate
        
        raise HydrusExceptions.MimeException( 'Error estimating framerate!' )
Пример #11
0
def ParseFFMPEGVideoResolution(lines):
    """Read the video resolution from FFMPEG's video stream line.

    The size appears as e.g. ``460x320``. When the line also carries a
    ``SAR w:h`` entry with both numbers positive, the width is scaled by
    ``w / h`` (integer division).

    Args:
        lines: the lines of text FFMPEG produced.

    Returns:
        ``[width, height]`` as a list when no SAR scaling was applied, or
        ``(width, height)`` as a tuple when it was.

    Raises:
        HydrusExceptions.MimeException: the video line or the size on it
            could not be found or parsed.
    """

    try:

        line = ParseFFMPEGVideoLine(lines)

        # get the size, of the form 460x320 (w x h)
        match = re.search(" [0-9]*x[0-9]*(,| )", line)

        # Drop the trailing ',' or ' ' that the pattern includes.
        resolution = list(
            map(int, line[match.start():match.end() - 1].split('x')))

        sar_match = re.search(" SAR [0-9]*:[0-9]* ", line)

        if sar_match is not None:

            # ' SAR 2:3 ' -> '2:3'
            sar_string = sar_match.group(0)[5:-1]

            (sar_w, sar_h) = [int(part) for part in sar_string.split(':')]

            # A zero on either side would zero the width or divide by zero.
            # NOTE(review): FFmpeg appears to use 0 for an unknown aspect
            # ratio -- confirm; either way such a SAR is ignored here.
            if sar_w > 0 and sar_h > 0:

                (x, y) = resolution

                x = (x * sar_w) // sar_h

                resolution = (x, y)

        return resolution

    except Exception:
        # Not a bare except, so KeyboardInterrupt/SystemExit still propagate.

        raise HydrusExceptions.MimeException('Error parsing resolution!')
Пример #12
0
def Hydrusffmpeg_parse_infos(filename, print_infos=False):
    """Get file infos using ffmpeg.

    Runs ``ffmpeg -i filename`` (for names ending in '.gif', additionally
    rendering to the null muxer) and parses the text it writes to stderr.

    Args:
        filename: path of the media file.
        print_infos: when true, the raw FFMPEG text is passed to
            ``HydrusData.Print``.

    Returns:
        A dictionary with the fields:
        "duration" (seconds, start offset subtracted),
        "mime_text" (only when exactly one 'Input #0' line was found),
        "video_found", "audio_found",
        and when a video stream was found: "video_size", "video_fps",
        "video_nframes", "video_duration" (same value as "duration"),
        and when an audio stream was found: "audio_fps" (an int, or the
        string 'unknown').

    Raises:
        Exception: the process could not be started and FFMPEG_PATH does
            not exist.
        HydrusExceptions.MimeException: FFMPEG produced no output, or its
            final line contains 'Invalid data'.
        IOError: FFMPEG reported the file as missing, or the duration could
            not be read.
    """

    # open the file in a pipe, provoke an error, read output

    cmd = [FFMPEG_PATH, "-i", filename]

    is_GIF = filename.endswith('.gif')

    if is_GIF:

        if HC.PLATFORM_WINDOWS:

            cmd += ["-f", "null", "NUL"]

        else:

            cmd += ["-f", "null", "/dev/null"]

    try:

        proc = subprocess.Popen(
            cmd,
            bufsize=10**5,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            startupinfo=HydrusData.GetSubprocessStartupInfo())

    except Exception:

        if not os.path.exists(FFMPEG_PATH):

            raise Exception('FFMPEG was not found!')

        else:

            raise

    # communicate() drains both pipes and waits for the process, so a full
    # stdout pipe cannot stall it and the child is always reaped.
    (stdout_data, stderr_data) = proc.communicate()

    infos = stderr_data.decode('utf8')

    if print_infos:
        # print the whole info text returned by FFMPEG
        HydrusData.Print(infos)

    lines = infos.splitlines()

    if len(lines) == 0:

        raise HydrusExceptions.MimeException(
            'Could not parse that file--no FFMPEG output given.')

    if "No such file or directory" in lines[-1]:

        raise IOError("%s not found ! Wrong path ?" % filename)

    if 'Invalid data' in lines[-1]:

        raise HydrusExceptions.MimeException('FFMPEG could not parse.')

    result = dict()

    # get duration (in seconds)
    #   Duration: 00:00:02.46, start: 0.033000, bitrate: 1069 kb/s
    try:

        # for gifs the timestamp is taken from the first 'frame=' line
        keyword = ('frame=' if is_GIF else 'Duration: ')
        line = [l for l in lines if keyword in l][0]

        start_offset = 0

        if 'start:' in line:

            m = re.search('(start\\: )' + '-?[0-9]+\\.[0-9]*', line)

            # skip the seven characters of 'start: ' to reach the number
            start_offset = float(line[m.start() + 7:m.end()])

            # once had a file with start offset of 957499 seconds jej
            if abs(start_offset) > 1.0:

                start_offset = 0

        match = re.search("[0-9][0-9]:[0-9][0-9]:[0-9][0-9].[0-9][0-9]", line)

        # Use the whole match: the pattern has no leading space, so skipping
        # a character would drop the tens digit of the hours. A real list
        # (not a lazy map object) keeps this working on Python 3.
        (hours, minutes, seconds) = [
            float(part) for part in match.group(0).split(':')
        ]

        result['duration'] = (
            3600 * hours + 60 * minutes + seconds - start_offset)

    except Exception:

        raise IOError("Error reading duration in file %s," % (filename) +
                      "Text parsed: %s" % infos)

    # Input #0, matroska, webm, from 'm.mkv':
    input_lines = [l for l in lines if l.startswith('Input #0')]

    # best effort: the field is simply left out unless there is exactly one
    # such line
    if len(input_lines) == 1:

        # drop the ten characters of 'Input #0, '
        text = input_lines[0][10:]

        result['mime_text'] = text.split(', from')[0]

    # get the output line that speaks about video
    lines_video = [
        l for l in lines
        if ' Video: ' in l and not (' Video: png' in l or ' Video: jpg' in l)
    ]  # mp3 says it has a 'png' video stream

    result['video_found'] = (lines_video != [])

    if result['video_found']:

        line = lines_video[0]

        # get the size, of the form 460x320 (w x h)
        match = re.search(" [0-9]*x[0-9]*(,| )", line)
        s = list(map(int, line[match.start():match.end() - 1].split('x')))
        result['video_size'] = s

        # get the frame rate: prefer 'tbr', fall back to 'fps' when 'tbr'
        # is absent or not a number
        try:

            match = re.search("( [0-9]*.| )[0-9]* tbr", line)
            result['video_fps'] = float(
                line[match.start():match.end()].split(' ')[1])

        except Exception:

            match = re.search("( [0-9]*.| )[0-9]* fps", line)
            result['video_fps'] = float(
                line[match.start():match.end()].split(' ')[1])

        num_frames = result['duration'] * result['video_fps']

        if num_frames != int(num_frames):

            num_frames += 1  # rounding up

        result['video_nframes'] = int(num_frames)

        result['video_duration'] = result['duration']
        # We could have also recomputed the duration from the number
        # of frames, as follows:
        # >>> result['video_duration'] = result['video_nframes'] / result['video_fps']

    lines_audio = [l for l in lines if ' Audio: ' in l]

    result['audio_found'] = lines_audio != []

    if result['audio_found']:

        line = lines_audio[0]

        try:

            match = re.search(" [0-9]* Hz", line)
            result['audio_fps'] = int(line[match.start() + 1:match.end()])

        except Exception:

            result['audio_fps'] = 'unknown'

    return result
Пример #13
0
def GetFFMPEGVideoProperties(path, count_frames_manually=False):
    """Work out resolution, duration and frame count of a video via FFMPEG.

    Args:
        path: filesystem path of the video file.
        count_frames_manually: passed to ``GetFFMPEGInfoLines``; when true
            the frame count is read with ``ParseFFMPEGNumFramesManually``
            instead of being estimated as duration * fps.

    Returns:
        ``(resolution, duration_in_ms, num_frames)``. ``num_frames`` can be
        None when ``ParseFFMPEGFPS`` gave nothing and no manual count was
        requested up front.
        NOTE(review): when it comes from ``duration * fps`` it is not
        converted to int here -- confirm callers accept that.

    Raises:
        HydrusExceptions.MimeException: ``ParseFFMPEGHasVideo`` reports no
            video stream, or a manual frame count fails with no estimate to
            fall back on.
    """

    lines = GetFFMPEGInfoLines(path, count_frames_manually)

    if not ParseFFMPEGHasVideo(lines):

        raise HydrusExceptions.MimeException(
            'File did not appear to have a video stream!')

    resolution = ParseFFMPEGVideoResolution(lines)

    duration = ParseFFMPEGDuration(lines)

    if duration is None:

        # No duration reported: derive it from a manual frame count and the
        # frame rate instead.

        fps = ParseFFMPEGFPS(lines)

        if fps is None:

            fps = 24  # screw it, let's just put one in there

        if not count_frames_manually:

            # Fetch the output again, this time asking for a manual count.

            count_frames_manually = True

            lines = GetFFMPEGInfoLines(path, count_frames_manually)

        num_frames = ParseFFMPEGNumFramesManually(lines)

        duration = num_frames / float(fps)

    else:

        num_frames = None

        if not count_frames_manually:

            # Try the cheap estimate first: duration * fps.

            fps = ParseFFMPEGFPS(lines)

            it_was_accurate = fps is not None

            if it_was_accurate:

                num_frames = duration * fps

                if num_frames != int(
                        num_frames
                ):  # we want whole numbers--anything else suggests start_offset is off or whatever

                    if os.path.getsize(
                            path
                    ) < 30 * 1048576:  # but only defer to a super precise +/- 1-frame manual count in this case when the file is small

                        it_was_accurate = False

            if not it_was_accurate:

                # Fetch the output again, this time asking for a manual
                # count.

                count_frames_manually = True

                lines = GetFFMPEGInfoLines(path, count_frames_manually)

        if count_frames_manually:

            try:

                num_frames = ParseFFMPEGNumFramesManually(lines)

            except HydrusExceptions.MimeException:

                # Keep the estimate when there is one; only fail when there
                # is nothing to fall back on.
                if num_frames is None:

                    raise

    duration_in_ms = int(duration * 1000)

    return (resolution, duration_in_ms, num_frames)
Пример #14
0
def GetFileInfo(path, mime=None):
    """Gather size, mime and media metadata for the file at ``path``.

    Args:
        path: filesystem path of the file to inspect.
        mime: already-known mime constant; when None it is looked up with
            ``GetMime(path)``.

    Returns:
        ``(size, mime, width, height, duration, num_frames, num_words)``.
        Fields the mime-specific handler does not supply stay None; any
        negative value is returned with its sign flipped.

    Raises:
        HydrusExceptions.SizeException: the file is empty.
        HydrusExceptions.MimeException: the mime is not in
            ``HC.ALLOWED_MIMES``.
    """

    size = os.path.getsize(path)

    if size == 0:

        raise HydrusExceptions.SizeException('File is of zero length!')

    if mime is None:

        mime = GetMime(path)

    if mime not in HC.ALLOWED_MIMES:

        raise HydrusExceptions.MimeException('Filetype is not permitted!')

    width = None
    height = None
    duration = None
    num_frames = None
    num_words = None

    if mime in (HC.IMAGE_JPEG, HC.IMAGE_PNG, HC.IMAGE_GIF):

        ((width, height), duration,
         num_frames) = HydrusImageHandling.GetImageProperties(path)

    elif mime == HC.APPLICATION_FLASH:

        ((width, height), duration,
         num_frames) = HydrusFlashHandling.GetFlashProperties(path)

    elif mime in (HC.IMAGE_APNG, HC.VIDEO_AVI, HC.VIDEO_FLV, HC.VIDEO_WMV,
                  HC.VIDEO_MOV, HC.VIDEO_MP4, HC.VIDEO_MKV, HC.VIDEO_WEBM,
                  HC.VIDEO_MPEG):

        ((width, height), duration,
         num_frames) = HydrusVideoHandling.GetFFMPEGVideoProperties(path)

    elif mime == HC.APPLICATION_PDF:

        num_words = HydrusDocumentHandling.GetPDFNumWords(path)

    elif mime in HC.AUDIO:

        ffmpeg_lines = HydrusVideoHandling.GetFFMPEGInfoLines(path)

        duration_in_s = HydrusVideoHandling.ParseFFMPEGDuration(ffmpeg_lines)

        # NOTE(review): ParseFFMPEGDuration looks able to return None when
        # FFMPEG reports no duration -- leave duration as None in that case
        # rather than fail on the multiplication.
        if duration_in_s is not None:

            duration = int(duration_in_s * 1000)

    # Flip the sign of any negative value.

    if width is not None and width < 0:

        width *= -1

    if height is not None and height < 0:

        # this branch previously flipped width instead of height
        height *= -1

    if duration is not None and duration < 0:

        duration *= -1

    if num_frames is not None and num_frames < 0:

        num_frames *= -1

    if num_words is not None and num_words < 0:

        num_words *= -1

    return (size, mime, width, height, duration, num_frames, num_words)
Пример #15
0
def GetFileInfo(path, mime=None):
    """Gather size, mime and media metadata for the file at ``path``.

    Args:
        path: filesystem path of the file to inspect.
        mime: already-known mime constant; when None it is looked up with
            ``GetMime(path)``.

    Returns:
        ``(size, mime, width, height, duration, num_frames, num_words)``.
        Fields the mime-specific handler does not supply stay None; any
        negative value is returned with its sign flipped.

    Raises:
        HydrusExceptions.SizeException: the file is empty.
        HydrusExceptions.MimeException: the mime is not in
            ``HC.ALLOWED_MIMES``; the message distinguishes HTML, unknown
            and otherwise unpermitted filetypes.
    """

    size = os.path.getsize(path)

    if size == 0:

        raise HydrusExceptions.SizeException('File is of zero length!')

    if mime is None:

        mime = GetMime(path)

    if mime not in HC.ALLOWED_MIMES:

        if mime == HC.TEXT_HTML:

            raise HydrusExceptions.MimeException(
                'Looks like HTML -- maybe the client needs to be taught how to parse this?'
            )

        elif mime == HC.APPLICATION_UNKNOWN:

            raise HydrusExceptions.MimeException('Unknown filetype!')

        else:

            raise HydrusExceptions.MimeException('Filetype is not permitted!')

    width = None
    height = None
    duration = None
    num_frames = None
    num_words = None

    if mime in (HC.IMAGE_JPEG, HC.IMAGE_PNG, HC.IMAGE_GIF):

        ((width, height), duration,
         num_frames) = HydrusImageHandling.GetImageProperties(path, mime)

    elif mime == HC.APPLICATION_FLASH:

        ((width, height), duration,
         num_frames) = HydrusFlashHandling.GetFlashProperties(path)

    elif mime in (HC.IMAGE_APNG, HC.VIDEO_AVI, HC.VIDEO_FLV, HC.VIDEO_WMV,
                  HC.VIDEO_MOV, HC.VIDEO_MP4, HC.VIDEO_MKV, HC.VIDEO_WEBM,
                  HC.VIDEO_MPEG):

        ((width, height), duration,
         num_frames) = HydrusVideoHandling.GetFFMPEGVideoProperties(path)

    elif mime == HC.APPLICATION_PDF:

        num_words = HydrusDocumentHandling.GetPDFNumWords(
            path)  # this now give None until a better solution can be found

    elif mime in HC.AUDIO:

        ffmpeg_lines = HydrusVideoHandling.GetFFMPEGInfoLines(path)

        duration_in_s = HydrusVideoHandling.ParseFFMPEGDuration(ffmpeg_lines)

        # NOTE(review): ParseFFMPEGDuration looks able to return None when
        # FFMPEG reports no duration -- leave duration as None in that case
        # rather than fail on the multiplication.
        if duration_in_s is not None:

            duration = int(duration_in_s * 1000)

    # Flip the sign of any negative value.

    if width is not None and width < 0:

        width *= -1

    if height is not None and height < 0:

        # this branch previously flipped width instead of height
        height *= -1

    if duration is not None and duration < 0:

        duration *= -1

    if num_frames is not None and num_frames < 0:

        num_frames *= -1

    if num_words is not None and num_words < 0:

        num_words *= -1

    return (size, mime, width, height, duration, num_frames, num_words)