class Extractor:
    def __init__(self, directory, fields):
        self.directory = directory
        self.fields = fields
        self.charset = "UTF-8"
        self.total = 0
        self.invalid = 0

    def main(self):
        output = codecs.open(OUTPUT_FILENAME, "w", self.charset)
        for filename in self.findFiles(self.directory, '*.doc'):
            self.total += 1
            line = self.processFile(filename)
            if line:
                print >> output, line
            else:
                self.invalid += 1
        output.close()
        self.summary()

    def summary(self):
        print >> stderr
        print >> stderr, "Valid files: %s" % (self.total - self.invalid)
        print >> stderr, "Invalid files: %s" % self.invalid
        print >> stderr, "Total files: %s" % self.total
        print >> stderr
        print >> stderr, "Result written into %s" % OUTPUT_FILENAME

    def findFiles(self, directory, pattern):
        for dirpath, dirnames, filenames in walk(directory):
            for filename in filenames:
                if not fnmatch(filename.lower(), pattern):
                    continue
                yield path_join(dirpath, filename)

    def processFile(self, filename):
        filename, realname = unicodeFilename(filename), filename
        print u"[%s] Process file %s..." % (self.total, filename)
        parser = createParser(filename, realname)
        if not parser:
            print >> stderr, "Unable to parse file"
            return None
        try:
            metadata = extractMetadata(parser)
        except HachoirError, err:
            print >> stderr, "Metadata extraction error: %s" % unicode(err)
            return None
        if not metadata:
            print >> stderr, "Unable to extract metadata"
            return None

        filename = makePrintable(filename, self.charset, to_unicode=True)
        line = [filename]
        for field in self.fields:
            value = metadata.getText(field, u'')
            value = makePrintable(value, self.charset, to_unicode=True)
            line.append(value)
        return '; '.join(line)
Пример #2
0
def unicodeFilename(filename, charset=None):
    """Return *filename* as a unicode string.

    The name is decoded with *charset*; when *charset* is missing or falsy
    the result of getTerminalCharset() is used instead. If decoding raises
    UnicodeDecodeError, the name is passed to makePrintable() with
    ``to_unicode=True`` as a fallback.
    """
    charset = charset or getTerminalCharset()
    try:
        decoded = unicode(filename, charset)
    except UnicodeDecodeError:
        decoded = makePrintable(filename, charset, to_unicode=True)
    return decoded
Пример #3
0
def unicodeFilename(filename, charset=None):
    """Decode a file name to unicode, never failing on undecodable bytes.

    *charset* defaults to getTerminalCharset() when not given (or falsy).
    A name that cannot be decoded with that charset is converted with
    makePrintable(..., to_unicode=True) instead.
    """
    if charset:
        encoding = charset
    else:
        encoding = getTerminalCharset()
    try:
        return unicode(filename, encoding)
    except UnicodeDecodeError:
        return makePrintable(filename, encoding, to_unicode=True)
Пример #4
0
 def createFields(self):
     """Yield the fields of a "<length>:<value>" string entry.

     Yields the decimal length text, the one-character separator, then the
     value: a String (ISO-8859-1) when shorter than 512, RawBytes
     otherwise. Nothing is yielded for the value when the length is zero.

     Raises ParserError when the ':' separator is not found within the
     searched range, when there is no length text before it, or when the
     length text is not a non-negative integer.
     """
     start = self.absolute_address
     # The search range ends (MAX_STRING_LENGTH + 1) * 8 past the start address.
     digits = self.stream.searchBytesLength(
         ':', False, start, start + (MAX_STRING_LENGTH + 1) * 8)
     if digits is None:
         raise ParserError("Torrent: unable to find string separator (':')")
     if not digits:
         raise ParserError("Torrent: error: no string length!")

     length_field = String(self, "length", digits, "String length")
     yield length_field

     # A non-numeric length is folded into the "negative length" error path.
     try:
         size = int(length_field.value)
     except ValueError:
         size = -1
     if size < 0:
         raise ParserError(
             "Invalid string length (%s)" %
             makePrintable(length_field.value, "ASCII", to_unicode=True))

     yield String(self, "separator", 1, "String length/value separator")

     if not size:
         self.info("Empty string: len=%i" % size)
         return
     if size >= 512:
         # Probably raw data
         yield RawBytes(self, "value", size, "Raw data")
     else:
         yield String(self,
                      "value",
                      size,
                      "String value",
                      charset="ISO-8859-1")
Пример #5
0
    def __init__(self,
                 parent,
                 name,
                 length,
                 description=None,
                 parser=None,
                 filename=None,
                 mime_type=None,
                 parser_class=None):
        """Create a byte field whose content can be opened as a sub-stream.

        When *filename* is given it is made printable (unless it is already
        unicode) and, if no *description* was supplied, used to build a
        default 'File "name" (size)' description. The optional *parser*,
        *parser_class*, *mime_type* and *filename* are attached as tags to
        the sub input stream registered through setSubIStream().
        """
        if filename:
            if not isinstance(filename, unicode):
                filename = makePrintable(filename, "ISO-8859-1")
            if not description:
                size_text = humanFilesize(length)
                description = 'File "%s" (%s)' % (filename, size_text)
        Bytes.__init__(self, parent, name, length, description)

        def createInputStream(cis, **args):
            # Append our hints to any "tags" list already present in args,
            # then delegate the stream construction to cis.
            stream_tags = args.setdefault("tags", [])
            if parser_class:
                stream_tags.append(("class", parser_class))
            if parser is not None:
                stream_tags.append(("id", parser.PARSER_TAGS["id"]))
            if mime_type:
                stream_tags.append(("mime", mime_type))
            if filename:
                stream_tags.append(("filename", filename))
            return cis(**args)

        self.setSubIStream(createInputStream)
Пример #6
0
 def getFieldType(self):
     """Return the field type as "<Bytes type><charset[,strip=...]>".

     The strip part is only present when ``self._strip`` is set: the
     printable, single-quoted strip string when it is a string, or
     "strip=True" for any other truthy value.
     """
     details = self.charset
     strip = self._strip
     if strip:
         if not isinstance(strip, (str, unicode)):
             details += ",strip=True"
         else:
             details += ",strip=%s" % makePrintable(strip, "ASCII", quote="'")
     return "%s<%s>" % (Bytes.getFieldType(self), details)
Пример #7
0
 def getFieldType(self):
     """Describe the field type, including its charset and strip setting.

     The result is the Bytes field type followed by "<charset>" or
     "<charset,strip=...>" when ``self._strip`` is truthy.
     """
     info = self.charset
     if self._strip:
         is_text = isinstance(self._strip, (str, unicode))
         if is_text:
             quoted = makePrintable(self._strip, "ASCII", quote="'")
             info += ",strip=%s" % quoted
         else:
             info += ",strip=True"
     base_type = Bytes.getFieldType(self)
     return "%s<%s>" % (base_type, info)
Пример #8
0
 def createDisplay(self, human=True):
     """Build the quoted text displayed for this string field.

     With *human* false, the raw value (computed once through
     GenericString.createValue(self, False) and cached in ``_raw_value``)
     is rendered as ASCII. Otherwise ``self.value`` is used, passed through
     makePrintable() first when the field has a charset. Text longer than
     ``config.max_string_length`` is cut and suffixed with "(...)".
     """
     if human:
         if self._charset:
             text = makePrintable(self.value, "ISO-8859-1", to_unicode=True)
         else:
             text = self.value
     else:
         if self._raw_value is None:
             self._raw_value = GenericString.createValue(self, False)
         text = makePrintable(self._raw_value, "ASCII", to_unicode=True)

     # Truncate string if needed
     limit = config.max_string_length
     if limit < len(text):
         text = "%s(...)" % text[:limit]

     if human and self._charset:
         # Decoded text: quote it by hand, escaping embedded double quotes.
         if not text:
             return _("(empty)")
         return '"%s"' % text.replace('"', '\\"')
     return makePrintable(text, "ASCII", quote='"', to_unicode=True)
Пример #9
0
 def createDisplay(self, human=True):
     """Return the double-quoted display form of the string value.

     *human* selects between the decoded value (true) and the raw value
     rendered as ASCII (false). The raw value is created on first use with
     GenericString.createValue(self, False) and kept in ``_raw_value``.
     Values longer than ``config.max_string_length`` are truncated and
     marked with "(...)".
     """
     if not human:
         if self._raw_value is None:
             self._raw_value = GenericString.createValue(self, False)
         shown = makePrintable(self._raw_value, "ASCII", to_unicode=True)
     elif self._charset:
         shown = makePrintable(self.value, "ISO-8859-1", to_unicode=True)
     else:
         shown = self.value

     max_length = config.max_string_length
     if max_length < len(shown):
         # Truncate string if needed
         shown = "%s(...)" % shown[:max_length]

     ascii_only = not self._charset or not human
     if ascii_only:
         return makePrintable(shown, "ASCII", quote='"', to_unicode=True)
     if shown:
         return '"%s"' % shown.replace('"', '\\"')
     return _("(empty)")
Пример #10
0
 def _getDescription(self):
     """Compute the description on first use and cache it in ``_description``.

     The value comes from createDescription(); a byte string result is
     converted with makePrintable(..., "ISO-8859-1", to_unicode=True).
     If any of HACHOIR_ERRORS is raised, the error is reported through
     self.error() and the description is set to the empty string.

     NOTE(review): no ``return`` statement is visible in this block, so as
     written the method returns None; presumably the cached description is
     meant to be returned -- confirm against the full source.
     """
     if self._description is None:
         try:
             self._description = self.createDescription()
             # Only plain ``str`` results are converted; other types are kept.
             if isinstance(self._description, str):
                 self._description = makePrintable(
                     self._description, "ISO-8859-1", to_unicode=True)
         except HACHOIR_ERRORS, err:
             # Fall back to "" so the computation is not retried on next call.
             self.error("Error getting description: " + unicode(err))
             self._description = ""
Пример #11
0
 def _getDescription(self):
     """Lazily fill ``self._description`` from createDescription().

     A ``str`` result is converted with makePrintable() using ISO-8859-1
     and ``to_unicode=True``. When one of HACHOIR_ERRORS is raised, the
     problem is logged with self.error() and an empty description is stored.

     NOTE(review): this block contains no ``return``; the method therefore
     returns None as shown. It presumably should return the cached
     description -- verify against the complete file.
     """
     if self._description is None:
         try:
             self._description = self.createDescription()
             # Convert byte strings only; any other type is stored unchanged.
             if isinstance(self._description, str):
                 self._description = makePrintable(self._description,
                                                   "ISO-8859-1",
                                                   to_unicode=True)
         except HACHOIR_ERRORS, err:
             # Store "" so that the failing computation is not attempted again.
             self.error("Error getting description: " + unicode(err))
             self._description = ""
Пример #12
0
 def getFilename(self):
     """Build the entry's file name from its "name" and "ext" fields.

     Returns the name, followed by "." and the extension when there is
     one, and by "/" when the entry is a directory and ``self.LFN`` is
     false.

     A leading 0x05 character in the stored name is replaced by 0xE5.
     The previous code compared ``name[0]`` with the integer 5, which is
     never equal to a one-character string, so the substitution never
     happened; ``name[0]`` also raised IndexError on an empty name.
     """
     name = self["name"].value
     # Done on the stored value, before makePrintable() may rewrite control
     # characters. NOTE(review): relies on the FAT convention that a first
     # byte of 0x05 stands for 0xE5 -- confirm this field is a FAT entry.
     if name[:1] == "\x05":
         lead = u"\xE5" if isinstance(name, unicode) else "\xE5"
         name = lead + name[1:]
     if isinstance(name, str):
         name = makePrintable(name, "ASCII", to_unicode=True)
     ext = self["ext"].value
     if ext:
         name += "." + ext
     if not self.LFN and self["directory"].value:
         name += "/"
     return name
Пример #13
0
 def run(self):
     for filename in os.listdir(self.adir):
         filename = os.path.join(self.adir, filename)
         if os.path.isdir(filename):
             while 1:
                 try:
                     Walk(filename).start()
                     break
                 except:
                     continue
                     
         elif os.path.isfile(filename) and is_song(filename):
             filename, realname = unicodeFilename(filename), filename
             try:
                 song = Song.objects.get(filename = filename)
             except:
                 song = Song(filename = filename, name = os.path.splitext(os.path.basename(filename))[0])  
             if not has_changed(song):
                 continue
             song.stat = stat(filename)
             try:
                 parser = createParser(filename, realname)
             except:
                 parser = None
             if not parser:
                 print >>stderr, "Unable to parse file %s"%filename
                 continue 
             try:
                 metadata = extractMetadata(parser)
             except HachoirError, err:
                 print >>stderr, "Metadata extraction error: %s" % unicode(err)
                 continue
             if not metadata:
                 print >>stderr, "Unable to extract metadata"
                 continue
             else:
                 text = metadata.exportPlaintext()
                 charset = getTerminalCharset()
                 for line in text[1:]:
                     line = makePrintable(line, charset)
                     key = line[2:].split(': ')[0].replace(' ','_').replace('/','_').lower()
                     if key in COLS:
                         setattr(song,key,line[len(key)+4:])
             while 1:
                 try:
                     song.save()
                     break
                 except:
                     continue
Пример #14
0
    def __str__(self):
        r"""
        Return the metadata as multi-line ASCII text, lines joined by "\n".

        Each line produced by exportPlaintext() is converted with
        makePrintable(line, "ASCII").

        >>> a = RootMetadata()
        >>> a.author = "haypo"
        >>> a.copyright = unicode("© Hachoir", "UTF-8")
        >>> print a
        Metadata:
        - Author: haypo
        - Copyright: \xa9 Hachoir

        @see __unicode__() and exportPlaintext()
        """
        ascii_lines = [makePrintable(line, "ASCII")
                       for line in self.exportPlaintext()]
        return "\n".join(ascii_lines)
Пример #15
0
    def __str__(self):
        r"""
        Build an ASCII representation of all the data, one item per line
        (end of line is "\n").

        The lines come from exportPlaintext() and are each made printable
        in ASCII.

        >>> a = RootMetadata()
        >>> a.author = "haypo"
        >>> a.copyright = unicode("© Hachoir", "UTF-8")
        >>> print a
        Metadata:
        - Author: haypo
        - Copyright: \xa9 Hachoir

        @see __unicode__() and exportPlaintext()
        """
        printable = []
        for line in self.exportPlaintext():
            printable.append(makePrintable(line, "ASCII"))
        return "\n".join(printable)
Пример #16
0
 def __init__(self, parent, name, length, description=None,
              parser=None, filename=None, mime_type=None,
              parser_class=None):
     """Create a byte field that can be re-opened as its own input stream.

     A given *filename* is made printable unless it is already unicode and,
     when no *description* is supplied, provides the default description
     'File "name" (size)'. *parser_class*, *parser*, *mime_type* and
     *filename* are passed on as tags of the sub input stream.
     """
     if filename:
         if not isinstance(filename, unicode):
             filename = makePrintable(filename, "ISO-8859-1")
         if not description:
             size_text = humanFilesize(length)
             description = 'File "%s" (%s)' % (filename, size_text)
     Bytes.__init__(self, parent, name, length, description)

     def createInputStream(cis, **args):
         # Extend the caller's "tags" list (created when absent) with the
         # hints known by this field, then build the stream with cis.
         hints = args.setdefault("tags", [])
         if parser_class:
             hints.append(("class", parser_class))
         if parser is not None:
             hints.append(("id", parser.PARSER_TAGS["id"]))
         if mime_type:
             hints.append(("mime", mime_type))
         if filename:
             hints.append(("filename", filename))
         return cis(**args)

     self.setSubIStream(createInputStream)
Пример #17
0
 def _createDisplay(self, human):
     """Return the quoted display of at most ``config.max_byte_length`` bytes.

     When ``self._getValue`` is a plain function the bytes are taken from
     ``self.value``; otherwise they are read from the parent stream once
     and cached in ``self._display``. With *human* true the bytes are made
     printable as latin-1, otherwise they are shown as \\xNN hexadecimal
     escapes. "(...)" marks a display shorter than the field.
     """
     limit = config.max_byte_length
     if type(self._getValue) is type(lambda: None):
         data = self.value[:limit]
     else:
         if self._display is None:
             start = self.absolute_address
             count = min(self._size / 8, limit)
             self._display = self._parent.stream.readBytes(start, count)
         data = self._display

     # self._size is compared against 8 * number of displayed bytes.
     is_cut = 8 * len(data) < self._size

     if not human:
         hexa = str2hex(data, format=r"\x%02x")
         if is_cut:
             return '"%s(...)"' % hexa
         return '"%s"' % hexa

     if is_cut:
         data += "(...)"
     return makePrintable(data, "latin-1", quote='"', to_unicode=True)
Пример #18
0
    def processID3v2(self, field):
        """Store the text of one ID3v2 frame in the matching attribute.

        Frames without "content" or without "text" in their content are
        ignored. When the content has a non-empty "title", the stored value
        is "title: text". A tag missing from TAG_TO_KEY is skipped, with a
        warning when the tag is not empty.
        """
        # Read value
        if "content" not in field:
            return
        body = field["content"]
        if "text" not in body:
            return
        title = body["title"].value if "title" in body else None
        if title:
            value = "%s: %s" % (title, body["text"].value)
        else:
            value = body["text"].value

        # Known tag?
        tag = field["tag"].value
        if tag in self.TAG_TO_KEY:
            setattr(self, self.TAG_TO_KEY[tag], value)
            return
        if tag:
            if isinstance(tag, str):
                tag = makePrintable(tag, "ISO-8859-1", to_unicode=True)
            self.warning("Skip ID3v2 tag %s: %s" % (tag, value))
Пример #19
0
    def processID3v2(self, field):
        """Copy the text of an ID3v2 frame to the attribute named by its tag.

        Nothing happens when the frame has no "content" or the content has
        no "text". The value is prefixed with "title: " when the content
        carries a non-empty title. Tags absent from TAG_TO_KEY are skipped;
        a warning is emitted for them unless the tag is empty.
        """
        # Read value
        if "content" not in field:
            return
        frame = field["content"]
        if "text" not in frame:
            return
        has_title = "title" in frame and frame["title"].value
        if has_title:
            text = "%s: %s" % (frame["title"].value, frame["text"].value)
        else:
            text = frame["text"].value

        # Known tag?
        tag = field["tag"].value
        known = tag in self.TAG_TO_KEY
        if known:
            attr = self.TAG_TO_KEY[tag]
            setattr(self, attr, text)
        elif tag:
            if isinstance(tag, str):
                tag = makePrintable(tag, "ISO-8859-1", to_unicode=True)
            self.warning("Skip ID3v2 tag %s: %s" % (tag, text))
Пример #20
0
 def _createDisplay(self, human):
     """Build the display string for the first bytes of the field.

     At most ``config.max_byte_length`` bytes are shown. They come from
     ``self.value`` when ``self._getValue`` is a plain function, otherwise
     from the parent stream (read once, cached in ``self._display``).
     *human* selects a printable latin-1 rendering; otherwise the bytes
     are written as \\xNN escapes. A truncated display ends with "(...)".
     """
     max_bytes = config.max_byte_length
     uses_function = type(self._getValue) is type(lambda: None)
     if uses_function:
         shown = self.value[:max_bytes]
     else:
         if self._display is None:
             self._display = self._parent.stream.readBytes(
                 self.absolute_address, min(self._size / 8, max_bytes))
         shown = self._display
     truncated = (8 * len(shown) < self._size)
     if human:
         if truncated:
             shown += "(...)"
         return makePrintable(shown,
                              "latin-1",
                              quote='"',
                              to_unicode=True)
     shown = str2hex(shown, format=r"\x%02x")
     pattern = '"%s(...)"' if truncated else '"%s"'
     return pattern % shown
Пример #21
0
 def createFields(self):
     """Parse one "<length>:<value>" string and yield its fields.

     The fields are the length text, the separator character and, unless
     the length is zero, the value (String in ISO-8859-1 below 512,
     RawBytes from 512 on).

     Raises ParserError if the ':' separator cannot be found in the
     searched range, if no length text precedes it, or if the length text
     is not a non-negative integer.
     """
     addr = self.absolute_address
     search_end = addr + (MAX_STRING_LENGTH + 1) * 8
     prefix_size = self.stream.searchBytesLength(':', False, addr, search_end)
     if prefix_size is None:
         raise ParserError("Torrent: unable to find string separator (':')")
     if not prefix_size:
         raise ParserError("Torrent: error: no string length!")
     val = String(self, "length", prefix_size, "String length")
     yield val

     # Text that is not an integer is reported like a negative length.
     try:
         value_size = int(val.value)
     except ValueError:
         value_size = -1
     if value_size < 0:
         shown = makePrintable(val.value, "ASCII", to_unicode=True)
         raise ParserError("Invalid string length (%s)" % shown)

     yield String(self, "separator", 1, "String length/value separator")
     if not value_size:
         self.info("Empty string: len=%i" % value_size)
         return
     if value_size < 512:
         yield String(self, "value", value_size, "String value",
                      charset="ISO-8859-1")
     else:
         # Probably raw data
         yield RawBytes(self, "value", value_size, "Raw data")
Пример #22
0
 def createDisplay(self):
     """Return the value made printable in UTF-8, inside double quotes."""
     text = self.value
     return makePrintable(text, "UTF-8", quote='"', to_unicode=True)
    def processHeader(self, header):
        """Fill the metadata from the objects found in *header*.

        Handles, when present: the extended descriptors ("ext_desc"), the
        file properties ("file_prop"), the codec list, every audio/video
        stream property (each added as its own metadata group) and the
        title/author/copyright of the "metadata" object.
        """
        compression = []
        is_vbr = None

        if "ext_desc/content" in header:
            # Extract all data from ext_desc
            data = {}
            for desc in header.array("ext_desc/content/descriptor"):
                self.useExtDescItem(desc, data)

            # Have ToolName and ToolVersion? If yes, group them to producer key
            if "ToolName" in data and "ToolVersion" in data:
                self.producer = "%s (version %s)" % (data["ToolName"], data["ToolVersion"])
                del data["ToolName"]
                del data["ToolVersion"]

            # "IsVBR" key
            if "IsVBR" in data:
                is_vbr = data.pop("IsVBR") == 1

            # Store data: known keys go to their own attribute, every
            # other key is kept as a "key=value" comment.
            for name, item in data.iteritems():
                if name in self.EXT_DESC_TO_ATTR:
                    setattr(self, self.EXT_DESC_TO_ATTR[name], item)
                    continue
                if isinstance(name, str):
                    name = makePrintable(name, "ISO-8859-1", to_unicode=True)
                self.comment = "%s=%s" % (name, item)

        if "file_prop/content" in header:
            self.useFileProp(header["file_prop/content"], is_vbr)

        if "codec_list/content" in header:
            # NOTE(review): the collected codec names are not used anywhere
            # in this block.
            for codec in header.array("codec_list/content/codec"):
                if "name" not in codec:
                    continue
                text = codec["name"].value
                if "desc" in codec and codec["desc"].value:
                    text = "%s (%s)" % (text, codec["desc"].value)
                compression.append(text)

        # Group numbers only advance when addGroup() accepts the group.
        audio_index = 1
        video_index = 1
        for index, stream_prop in enumerate(header.array("stream_prop")):
            if "content/audio_header" in stream_prop:
                meta = Metadata(self)
                self.streamProperty(header, index, meta)
                self.streamAudioHeader(stream_prop["content/audio_header"], meta)
                added = self.addGroup("audio[%u]" % audio_index, meta, "Audio stream #%u" % audio_index)
                if added:
                    audio_index += 1
            elif "content/video_header" in stream_prop:
                meta = Metadata(self)
                self.streamProperty(header, index, meta)
                self.streamVideoHeader(stream_prop["content/video_header"], meta)
                added = self.addGroup("video[%u]" % video_index, meta, "Video stream #%u" % video_index)
                if added:
                    video_index += 1

        if "metadata/content" in header:
            info = header["metadata/content"]
            # The first missing field stops the copy of the remaining ones.
            try:
                for attr in ("title", "author", "copyright"):
                    setattr(self, attr, info[attr].value)
            except MissingField:
                pass
Пример #24
0
    def update(self, node):
        """Rebuild the text of *node* according to the display flags.

        The line is made of: an indentation and child marker, the address
        (or "-> " for fields without size) and name, then optionally the
        value, the description and the type/size in parentheses. The
        result is made printable in ``self.charset`` and given to
        node.setText().
        """
        field = node.field
        flags = self.flags

        # Indentation and marker ('-' has childs, '+' is a field set)
        if node.depth:
            if node.childs:
                marker = '- '
            elif field.is_field_set:
                marker = '+ '
            else:
                marker = '  '
            text = ' ' * (3 * node.depth - 2) + marker
            name = field.name
        else:
            text = ''
            name = field.stream.source

        # Address, shown as "bytes.bits" when address or size is not a
        # whole number of bytes
        if field.size:
            if flags & self.use_absolute_address:
                address = field.absolute_address
            else:
                address = field.address
            display_bits = (address % 8) != 0 or (field.size % 8) != 0
            use_hex = flags & self.hex_address
            if display_bits:
                pattern = "%04x.%x" if use_hex else "%u.%u"
                text += pattern % (address/8, address%8)
            else:
                pattern = "%04x" if use_hex else "%u"
                text += pattern % (address/8)
            text += ") " + name
        else:
            text += "-> " + name

        human = flags & self.human_size

        # Value
        smart_display = True
        if flags & self.display_value and field.hasValue():
            if human:
                display = field.display
            else:
                display = field.raw_display
                smart_display = False
            text += "= %s" % display

        # Description
        if field.description and flags & self.display_description:
            description = field.description
            if not human:
                description = makePrintable(description, "ASCII")
            text += ": %s" % description

        # Type and size
        show_size = flags & self.display_size
        show_type = flags & self.display_type
        if show_size and field.size or show_type:
            details = []
            if show_type:
                details.append(field.getFieldType())
            if show_size:
                if field.size % 8:
                    details.append( humanBitSize(field.size) )
                else:
                    size = field.size / 8
                    if human:
                        details.append( humanFilesize(size) )
                    else:
                        details.append( ngettext("%u byte", "%u bytes", size) % size)
            text += " (%s)" % ", ".join(details)

        text = makePrintable(text, self.charset, to_unicode=True, smart=smart_display)
        node.setText(text, flags)
Пример #25
0
 def createRawDisplay(self):
     """Return the raw display of the value as unicode.

     A ``str`` value is made printable in ASCII; any other value is
     converted with unicode().
     """
     raw = self.value
     if not isinstance(raw, str):
         return unicode(raw)
     return makePrintable(raw, "ASCII", to_unicode=True)
Пример #26
0
    def update(self, node):
        """Recompute the text displayed for *node* from the display flags.

        The text contains the indentation and child marker, the address
        (or "-> " when the field has no size) and name, then optionally
        the value, the description and the type/size in parentheses. It is
        made printable in ``self.charset`` before being passed to
        node.setText().
        """
        field = node.field
        flags = self.flags

        # Indentation and marker ('-' has childs, '+' is a field set)
        if not node.depth:
            text = ''
            name = field.stream.source
        else:
            if node.childs:
                marker = '- '
            elif field.is_field_set:
                marker = '+ '
            else:
                marker = '  '
            text = ' ' * (3 * node.depth - 2) + marker
            name = field.name

        # Address, written "bytes.bits" unless both address and size are
        # byte aligned
        if not field.size:
            text += "-> " + name
        else:
            if flags & self.use_absolute_address:
                address = field.absolute_address
            else:
                address = field.address
            display_bits = (address % 8) != 0 or (field.size % 8) != 0
            use_hex = flags & self.hex_address
            if display_bits:
                pattern = "%04x.%x" if use_hex else "%u.%u"
                text += pattern % (address/8, address%8)
            else:
                pattern = "%04x" if use_hex else "%u"
                text += pattern % (address/8)
            text += ") " + name

        # Value
        smart_display = True
        if flags & self.display_value and field.hasValue():
            if flags & self.human_size:
                shown = field.display
            else:
                shown = field.raw_display
                smart_display = False
            text += "= %s" % shown

        # Description
        if field.description and flags & self.display_description:
            text += ": %s" % field.description

        # Type and size
        show_size = flags & self.display_size
        show_type = flags & self.display_type
        if show_size and field.size or show_type:
            parts = []
            if show_type:
                parts.append(field.getFieldType())
            if show_size:
                if field.size % 8:
                    parts.append( humanBitSize(field.size) )
                else:
                    size = field.size / 8
                    if flags & self.human_size:
                        parts.append( humanFilesize(size) )
                    else:
                        parts.append( ngettext("%u byte", "%u bytes", size) % size)
            text += " (%s)" % ", ".join(parts)

        text = makePrintable(text, self.charset, to_unicode=True, smart=smart_display)
        node.setText(text, flags)
Пример #27
0
 def createRawDisplay(self):
     """Give the unicode raw display of ``self.value``.

     Byte strings go through makePrintable() in ASCII, everything else
     through unicode().
     """
     current = self.value
     is_bytes = isinstance(current, str)
     if is_bytes:
         result = makePrintable(current, "ASCII", to_unicode=True)
     else:
         result = unicode(current)
     return result
Пример #28
0
 def __str__(self):
     """Return ``self.message`` made printable in ASCII."""
     printable = makePrintable(self.message, "ASCII")
     return printable
Пример #29
0
 def createDisplay(self):
     """Return the value made printable in ASCII, inside single quotes."""
     current = self.value
     return makePrintable(current, "ASCII", to_unicode=True, quote="'")
Пример #30
0
 def __init__(self, message):
     """Initialize the exception with an ASCII-printable copy of *message*.

     The original, unconverted message is kept in ``self.text``.
     """
     Exception.__init__(self, makePrintable(message, "ASCII"))
     self.text = message
Пример #31
0
 def createDisplay(self):
     """Return the double-quoted value, made printable in UTF-8."""
     shown = makePrintable(self.value, "UTF-8", quote='"', to_unicode=True)
     return shown
from hachoir_parser import createParser


def metadata_for(filename):
    filename, realname = unicodeFilename(filename), filename
    parser = createParser(filename, realname)
    if not parser:
        print "Unable to parse file"
        exit(1)
    try:
        metadata = extractMetadata(parser)
    except HachoirError, err:
        print "Metadata extraction error: %s" % unicode(err)
        metadata = None
    if not metadata:
        print "Unable to extract metadata"
        exit(1)

    text = metadata.exportPlaintext()
    charset = getTerminalCharset()
    for line in text:
        print makePrintable(line, charset)

    return metadata


def extract_data(metadata):
    for data in sorted(metadata):
        if len(data.values) > 0:
            print data.key, data.values[0].value
 def createDisplay(self):
     """Return "<padding pattern=...>" or the default Bytes display.

     The pattern form is used only when ``self._display_pattern`` is set.
     """
     if not self._display_pattern:
         return Bytes.createDisplay(self)
     pattern = makePrintable(self.pattern, "ASCII", quote="'")
     return u"<padding pattern=%s>" % pattern
Пример #34
0
    def processHeader(self, header):
        """Extract the metadata stored in the objects of *header*.

        The extended descriptors, the file properties, the codec list, the
        audio/video stream properties (one metadata group per stream) and
        the title/author/copyright of the "metadata" object are processed
        when they are present.
        """
        compression = []
        is_vbr = None

        if "ext_desc/content" in header:
            # Extract all data from ext_desc
            data = {}
            for desc in header.array("ext_desc/content/descriptor"):
                self.useExtDescItem(desc, data)

            # Have ToolName and ToolVersion? If yes, group them to producer key
            if "ToolName" in data and "ToolVersion" in data:
                tool = (data["ToolName"], data["ToolVersion"])
                self.producer = "%s (version %s)" % tool
                del data["ToolName"]
                del data["ToolVersion"]

            # "IsVBR" key
            if "IsVBR" in data:
                is_vbr = (data["IsVBR"] == 1)
                del data["IsVBR"]

            # Store data: keys without a dedicated attribute are saved as
            # a "key=value" comment.
            for key, value in data.iteritems():
                if key not in self.EXT_DESC_TO_ATTR:
                    if isinstance(key, str):
                        key = makePrintable(key, "ISO-8859-1", to_unicode=True)
                    value = "%s=%s" % (key, value)
                    attr = "comment"
                else:
                    attr = self.EXT_DESC_TO_ATTR[key]
                setattr(self, attr, value)

        if "file_prop/content" in header:
            self.useFileProp(header["file_prop/content"], is_vbr)

        if "codec_list/content" in header:
            # NOTE(review): the codec names gathered here are not used
            # anywhere in this block.
            for codec in header.array("codec_list/content/codec"):
                if "name" not in codec:
                    continue
                label = codec["name"].value
                if "desc" in codec and codec["desc"].value:
                    label = "%s (%s)" % (label, codec["desc"].value)
                compression.append(label)

        # A group number is only consumed when addGroup() accepts the group.
        audio_index = 1
        video_index = 1
        for index, stream_prop in enumerate(header.array("stream_prop")):
            if "content/audio_header" in stream_prop:
                meta = Metadata(self)
                self.streamProperty(header, index, meta)
                self.streamAudioHeader(stream_prop["content/audio_header"], meta)
                if self.addGroup("audio[%u]" % audio_index, meta, "Audio stream #%u" % audio_index):
                    audio_index = audio_index + 1
            elif "content/video_header" in stream_prop:
                meta = Metadata(self)
                self.streamProperty(header, index, meta)
                self.streamVideoHeader(stream_prop["content/video_header"], meta)
                if self.addGroup("video[%u]" % video_index, meta, "Video stream #%u" % video_index):
                    video_index = video_index + 1

        if "metadata/content" not in header:
            return
        info = header["metadata/content"]
        # Copying stops at the first missing field.
        try:
            self.title = info["title"].value
            self.author = info["author"].value
            self.copyright = info["copyright"].value
        except MissingField:
            pass
Пример #35
0
 def __str__(self):
     """Return the message converted with makePrintable(..., "ASCII")."""
     text = self.message
     return makePrintable(text, "ASCII")
Пример #36
0
 def createDisplay(self):
     """Return the single-quoted value, made printable in ASCII."""
     shown = makePrintable(self.value, "ASCII", quote="'", to_unicode=True)
     return shown
Пример #37
0
 def createDisplay(self):
     """Display the padding pattern when enabled, else the Bytes display.

     ``self._display_pattern`` selects the "<padding pattern=...>" form,
     in which the pattern is made printable in ASCII and single-quoted.
     """
     if self._display_pattern:
         quoted = makePrintable(self.pattern, "ASCII", quote="'")
         return u"<padding pattern=%s>" % quoted
     return Bytes.createDisplay(self)