Exemplo n.º 1
0
    def __guess_metadata(self, path):
        """Guess metadata for *path* from the keywords the extractor reports.

        Every ``(keyword_type, keyword)`` pair yielded by
        ``self.xtract.extract(path)`` is mapped to a predicate through
        ``self.__extractor2nepomuk``.  Pairs that map to a predicate are
        collected as ``(predicate, Literal)`` tuples; pairs without one are
        only reported at debug level.

        For the ``NIE['generator']`` predicate only the leading alphabetic
        run of the keyword is kept (``"Writer 3.2"`` -> ``"Writer"``).  A
        keyword that is alphabetic throughout is kept whole.

        Failures are logged through ``self.log.error`` and never raised: an
        error on one keyword skips that keyword, an error in the extractor
        itself ends the scan.

        Returns the list of collected tuples (possibly empty).
        """
        guess = []

        try:
            keys = self.xtract.extract(path)
            for keyword_type, keyword in keys:
                try:
                    predicate = self.__extractor2nepomuk(str(keyword_type))
                    if not predicate:
                        self.log.debug("guess_metadata: %s, %s" % (keyword_type, keyword))
                        continue

                    if predicate == NIE['generator']:
                        # Cut at the first non-alphabetic character.  Doing
                        # the cut inside the loop leaves a purely alphabetic
                        # (or empty) keyword untouched instead of slicing it
                        # at a stale loop variable.
                        for sep, ch in enumerate(keyword):
                            if not ch.isalpha():
                                keyword = keyword[:sep]
                                break

                    guess.append((predicate, Literal(str2utf8(keyword))))
                except Exception as error:
                    self.log.error(error)
        except Exception as error:
            self.log.error(error)

        return guess
Exemplo n.º 2
0
    def extract_from_string(self, content):
        """Describe a piece of plain text as a clipboard resource.

        The title is taken from the first 100 characters of *content* with
        the surrounding whitespace stripped.  If *content* cannot be sliced
        or stripped, a placeholder title is used instead.

        Returns a list of ``(predicate, object)`` tuples: the resource type,
        the title and the ``text/plain`` MIME description.
        """
        try:
            heading = content[0:100].strip()
        except Exception:
            heading = "No title. Change it manually."

        #data.append((NFO['fileUrl'], Literal(str(id)[9:])))
        return [
            (RDF.type, NFO['Clipboard']),
            (NIE['title'], Literal(str2utf8(heading))),
            (NIE['mimeType'], Literal('text/plain')),
            (NIE['mimeMedia'], Literal('text')),
            (NIE['mimeSubtype'], Literal('plain')),
        ]
Exemplo n.º 3
0
    def __get_html_metadata(self, filename):
        """Translate the extractor keywords of *filename* into metadata.

        Each ``(keyword_type, keyword)`` pair yielded by
        ``self.xtract.extract(filename)`` is handled by type:

        * ``'keywords'`` -- split into tags, see ``__collect_html_tags``.
        * ``'title'`` -- stripped, newlines and tabs removed, stored under
          ``NIE['title']``.
        * ``'size'`` -- a ``<width>x<height>`` value is stored as
          ``NEXIF['width']`` and ``NEXIF['height']``; other values are
          ignored.
        * ``'format'`` -- ignored.
        * the types listed in ``__html_literal_predicates`` -- stored as a
          single literal under the mapped predicate.
        * anything else -- written to ``/tmp/vlog.txt`` for inspection.

        After the scan ``self.app.store.do_commit()`` is called.

        Returns the list of ``(predicate, object)`` tuples.  If anything
        raises, the error is logged and the tuples collected up to that
        point are returned (the commit is skipped in that case).
        """
        data = []
        self.log.info('Extracting metadata from: %s' % filename)
        # Unrecognised keyword types are dumped here; the file is truncated
        # on every call.
        log = open('/tmp/vlog.txt', 'w')
        try:
            predicates = self.__html_literal_predicates()
            # Tags created during this call, shared by all 'keywords' entries.
            tagdict = {}
            for keyword_type, keyword in self.xtract.extract(filename):
                if keyword_type == 'keywords':
                    self.__collect_html_tags(keyword, tagdict, data)
                elif keyword_type == 'title':
                    title = keyword.strip().replace('\n', '').replace('\t', '')
                    data.append((NIE['title'], Literal(str2utf8(title))))
                elif keyword_type == 'format':
                    # e.g. "PDF 1.4": deliberately not stored.
                    pass
                elif keyword_type == 'size':
                    # Best effort: a malformed size must not abort the scan.
                    try:
                        if 'x' in keyword:
                            width, _sep, height = keyword.partition('x')
                            data.append((NEXIF['width'], Literal(str2utf8(width))))
                            data.append((NEXIF['height'], Literal(str2utf8(height))))
                    except Exception as error:
                        self.log.debug("get_html_metadata: bad size %r: %s" % (keyword, error))
                elif keyword_type in predicates:
                    predicate, strip = predicates[keyword_type]
                    if strip:
                        keyword = keyword.strip()
                    data.append((predicate, Literal(str2utf8(keyword))))
                else:
                    log.write('KType: %s\t\t%s\t\t%s\n' % (keyword_type, keyword, filename))

            # add new resources before return metadata
            self.app.store.do_commit()
        except Exception as error:
            self.log.error("get_html_metadata: %s" % error)
        finally:
            # Close the dump file on the error path too.
            log.close()

        return data

    def __html_literal_predicates(self):
        """Return ``{keyword_type: (predicate, strip)}`` for simple keywords.

        ``strip`` says whether surrounding whitespace is removed from the
        keyword before it is stored as a literal.
        """
        stripped = (
            # Entity primarily responsible for creating the content.
            (('creator', 'author', 'last saved by', 'artist'), NCO['creator']),
            (('subject', 'description', 'comment'), NIE['subject']),
            # Language of the element as a whole; RFC 3066 two-letter codes
            # are encouraged.
            (('language',), NIE['language']),
            # Software used to "generate" the contents, e.g. a word processor.
            (('generator', 'producer', 'software', 'publisher'), NIE['generator']),
            (('character count',), NFO['characterCount']),
            (('line count',), NFO['lineCount']),
            (('word count',), NFO['wordCount']),
        )
        verbatim = (
            (('creation date',), NIE['contentCreated']),
            (('mimetype',), NIE['mimeType']),
            (('track number',), NID3['trackNumber']),
            (('album',), NID3['albumTitle']),
            (('genre', 'content type'), NID3['genre']),
            (('year',), NID3['recordingYear']),
            (('disc number',), NID3['discNumber']),
            (('camera model',), NEXIF['cameraModel']),
            (('camera make',), NEXIF['cameraMaker']),
            (('aperture',), NEXIF['apertureValue']),
            (('exposure',), NEXIF['exposureValue']),
            (('exposure bias',), NEXIF['exposureBiasValue']),
            (('exposure mode',), NEXIF['exposureMode']),
            (('iso speed',), NEXIF['isoSpeed']),
            (('focal length',), NEXIF['focalLength']),
            (('flash',), NEXIF['flash']),
            (('metering mode',), NEXIF['meteringMode']),
            (('orientation',), NEXIF['orientation']),
        )

        table = {}
        for strip, group in ((True, stripped), (False, verbatim)):
            for keyword_types, predicate in group:
                for keyword_type in keyword_types:
                    table[keyword_type] = (predicate, strip)
        return table

    def __collect_html_tags(self, keyword, tagdict, data):
        """Turn a 'keywords' value into ``NAO['hasTag']`` entries in *data*.

        *keyword* is split on spaces and commas; whitespace inside a label
        is removed and labels shorter than three characters are dropped.
        A tag known to ``self.__lookup_tag_id`` is reused.  Otherwise an id
        is taken from *tagdict*, or from a fresh factory resource that is
        then recorded in *tagdict*.

        Finally a ``PIMO['Tag']`` resource is built for every entry of
        *tagdict* and the batch is handed to ``factory.transport``.
        """
        factory = self.app.gui.factory

        for label in keyword.replace(' ', ',').split(','):
            tag = str2utf8(str(''.join(label.split())))
            if len(tag) < 3:
                continue

            tagid = self.__lookup_tag_id(tag)
            if tagid:
                self.log.info("Reusing tag: %s" % tag)
            elif tag in tagdict:
                tagid = tagdict[tag]
            else:
                tagid = factory.new_resource().get_id()
                tagdict[tag] = tagid

            data.append((NAO['hasTag'], URIRef(tagid)))

        lvres = []
        for tag, tagid in tagdict.items():
            self.log.debug("New tag: %s" % tag)
            # NOTE(review): meaning of the second argument is not visible
            # here -- confirm against factory.new_resource.
            vres = factory.new_resource(tagid, True)
            vres.set_label(tag)
            vres.set_type(PIMO['Tag'])
            vres.set_metadata([
                (RDF.type, PIMO['Tag']),
                (NIE['title'], Literal(tag)),
            ])
            lvres.append(vres)

        factory.transport(lvres)
Exemplo n.º 4
0
    def extract_from_network(self, thing, vuri=None):
        data = []
        if thing.startswith('www.'):
            thing = 'http://' + thing
        # 1. Get Url
        # RFC 1738: <scheme>://<user>:<password>@<host>:<port>/<url-path>;<params>?<query>#<fragment>
        parsed_url = urlparse(thing)
        url = parsed_url.geturl()
        self.log.debug("Got URL: %s" % url)

        # 2. Check if it's already in the kb
        link_id = self.app.gui.ask.get_link_id(url)
        #~ print url, link_id, type(link_id)
        if link_id:
            return []
        #~ if link_id is not None:
            #~ self.log.debug("%s exists" % link_id)
            #~ title = self.app.vstore.get_property(link_id)
            #~ created = self.app.vstore.get_property(link_id, NAO['created'])
            #~ #FIXME: no dialogs in metadata. return error codes?
            #~ #~ print res
            #~ return []
        #~ #~ print parsed_url.netloc, parsed_url.fragment
        # Uncomment the following lines to get more metadata about url
        status = self.__get_status_code(url)
        data.append((NFO['fileStatus'], Literal(str(status))))
        data.append((NFO['fileScheme'], Literal(parsed_url.scheme)))
        if (parsed_url.port):       data.append((NFO['filePort'],       Literal(parsed_url.port)))
        if (parsed_url.fragment):   data.append((NFO['fileFragment'],   Literal(parsed_url.fragment)))
        if (parsed_url.netloc):     data.append((NFO['fileNetloc'],     Literal(parsed_url.netloc)))
        if (parsed_url.params):     data.append((NFO['fileParams'],     Literal(parsed_url.params)))
        if (parsed_url.path):       data.append((NFO['filePath'],       Literal(parsed_url.path)))
        if (parsed_url.query):      data.append((NFO['fileQuery'],      Literal(parsed_url.query)))

        #TODO: add more metadata from tags if it is webpage or feed

        # 3. Download url content and write it to temporal file
        self.log.info('Downloading %s' % url)
        try:
            output_file = LPATH['ROOT'] + tempfile.mktemp()
            fp = open(output_file, 'wb')
            c = pycurl.Curl()
            c.setopt(c.URL, url)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.setopt(pycurl.CONNECTTIMEOUT, 30)
            c.setopt(pycurl.TIMEOUT, 300)
            c.setopt(pycurl.WRITEDATA, fp)
            c.perform()
            stsize = int(c.getinfo(c.SIZE_DOWNLOAD))
            self.log.info("Document size: %d bytes" % stsize)
            c.close()

            # 4. Get Mimetype
            try:
                fmt = self.magic.file(output_file)
                data.append((NIE['mimeType'], Literal(fmt)))
                self.log.info(fmt)
            except Exception, error:
                self.log.error(error)

            if fmt == 'text/html':
                data.append((RDF.type, NFO['Website']))
                data.append((NFO['fileUrl'], Literal(str2utf8(url))))
                data.append((NFO['fileHostname'], Literal(str2utf8(parsed_url.hostname))))
                print 'website', url, parsed_url.hostname
                try:
                    data += self.__get_html_metadata(output_file)
                except Exception, error:
                    self.log.error(error)

                self.log.info('Data: %s' % len(data))

                hasTags = False
                hasTitle = False
                for p, o in data:
                    if p == NAO['hasTag']:
                        hasTags = True
                    elif p == NIE['title']:
                        hasTitle = True
                if not hasTags:
                    data.append((NAO['hasTag'], SYSRES['no-tags']))
                if not hasTitle:
                    data.append((NIE['title'], Literal(str2utf8(url))))
Exemplo n.º 5
0
     hasTags = False
     hasTitle = False
     for p, o in data:
         if p == NAO['hasTag']:
             hasTags = True
         elif p == NIE['title']:
             hasTitle = True
     if not hasTags:
         data.append((NAO['hasTag'], SYSRES['no-tags']))
     if not hasTitle:
         data.append((NIE['title'], Literal(str2utf8(url))))
 elif fmt == 'application/xml':
     res = feedparser.parse(output_file)
     if res.bozo == 0:
         data.append((RDF.type, NFO['Feed']))
         data.append((NFO['fileUrl'], Literal(str2utf8(url))))
         data.append((NFO['fileHostname'], Literal(str2utf8(parsed_url.hostname))))
         data.append((NIE['title'], Literal(str2utf8(res.feed.title))))
     else:
         data.append((RDF.type, NFO['Website']))
         data.append((NFO['fileUrl'], Literal(str2utf8(url))))
         data.append((NFO['fileHostname'], Literal(str2utf8(parsed_url.hostname))))
         data += self.__get_html_metadata(output_file)
         hasTags = False
         hasTitle = False
         for p, o in data:
             if p == NAO['hasTag']:
                 hasTags = True
             elif p == NIE['title']:
                 hasTitle = True
         if not hasTags: