def get_article(self, title, version=-1):
        """Return the content of an article obtained through IPFS.

        The partial IPFS address is read from ``self.history[version]['ID']``
        when ``version`` is given, otherwise it is looked up by ``title`` in
        the blockchain database. The file returned by ``self.ipfs.get_article``
        is then reconciled with the local file named ``<title>_<version>``:

        * no local file: the fetched file is renamed to the local name;
        * local file with a different hash: it is replaced by the fetched one;
        * local file with the same hash: the fetched file is deleted.

        Args:
            title: Article title; also the prefix of the local file name.
            version: Index into ``self.history``. The default ``-1`` means
                "resolve the article by title instead of by history entry".

        Returns:
            The text read from the local article file.
        """
        LOG.debug('Get article')

        if version != -1:
            partial_ipfs_address = self.history[version]['ID']
        else:
            partial_ipfs_address = self.blockchain_db.get_article_ID(title)
        # The returned value is used below as the path of the fetched file.
        full_ipfs_address = self.ipfs.get_article(partial_ipfs_address, 20)

        title = title + "_" + str(version)

        # Todo - this can be checked before downloading ???
        # Keep the local copy only when it exists and matches the fetched file.
        if not os.path.exists(title):
            os.rename(full_ipfs_address, title)
        elif utils.file_hash(full_ipfs_address) != utils.file_hash(title):
            os.remove(title)
            os.rename(full_ipfs_address, title)
        else:
            os.remove(full_ipfs_address)

        # NOTE(review): the file is renamed relative to the current directory
        # above but read from under utils.get_prefix_path() here - confirm
        # both refer to the same location.
        path = os.path.join(utils.get_prefix_path(), title)
        # The context manager closes the file even if read() raises.
        with open(path) as article_content_file:
            return article_content_file.read()
示例#2
0
def check_uploaded_file(mime_type, fname):
    """Gather size, hash and extension of a file and pass them to check_file.

    Args:
        mime_type: MIME type of the upload; forwarded to ``check_file`` as is.
        fname: Path of the uploaded file on disk.

    Returns:
        The result of ``check_file`` for the gathered attributes.
    """
    file_size = os.stat(fname).st_size
    digest = file_hash(fname)
    # splitext gives '' or '.ext'; the slice drops the leading dot and leaves
    # an empty extension empty.
    suffix = os.path.splitext(fname)[1][1:]
    return check_file(mime_type, file_size, digest, suffix)
示例#3
0
def _clean_seed_corpus(seed_corpus_dir):
    """Flattens |seed_corpus_dir| and drops oversized seed files.

    Every file found anywhere under |seed_corpus_dir| is either deleted (when
    it is larger than CORPUS_ELEMENT_BYTES_LIMIT) or moved into the directory
    root under the name returned by utils.file_hash for it. Moves failing with
    OSError are collected and reported in one error log entry. Does nothing if
    |seed_corpus_dir| does not exist."""
    if not os.path.exists(seed_corpus_dir):
        return

    unmoved = []
    for dirpath, _, names in os.walk(seed_corpus_dir):
        for name in names:
            source = os.path.join(dirpath, name)

            too_big = os.path.getsize(source) > CORPUS_ELEMENT_BYTES_LIMIT
            if too_big:
                os.remove(source)
                logs.warning('Removed seed file %s as it exceeds 1 Mb limit.',
                             source)
            else:
                # Name the file after its hash in the corpus root.
                destination = os.path.join(seed_corpus_dir,
                                           utils.file_hash(source))
                try:
                    shutil.move(source, destination)
                except OSError:
                    unmoved.append((source, destination))

    if unmoved:
        logs.error('Failed to move seed corpus files: %s', unmoved)
示例#4
0
def check_uploaded_file(mime_type, fname):
    """Run check_file on the size, hash and extension of an uploaded file.

    Args:
        mime_type: MIME type of the upload; handed to ``check_file`` unchanged.
        fname: Path of the uploaded file on disk.

    Returns:
        Whatever ``check_file`` returns.
    """
    size_in_bytes = os.stat(fname).st_size
    content_hash = file_hash(fname)
    # The extension is passed without its leading dot ('' when there is none).
    dotted_ext = os.path.splitext(fname)[1]
    return check_file(mime_type, size_in_bytes, content_hash, dotted_ext[1:])
示例#5
0
def _clean_seed_corpus(seed_corpus_dir):
    """Prepares |seed_corpus_dir| for the trial.

    If the NO_SEEDS environment value is set, the directory is emptied by
    removing and recreating it. Otherwise every file under the directory is
    either deleted (when larger than CORPUS_ELEMENT_BYTES_LIMIT) or moved into
    the directory root under the name returned by utils.file_hash, so no seed
    stays in a sub-directory. Moves that fail with OSError are reported in a
    single error log entry. Does nothing if |seed_corpus_dir| does not exist."""
    if not os.path.exists(seed_corpus_dir):
        return

    if environment.get('NO_SEEDS'):
        logs.info('NO_SEEDS specified, deleting seed corpus files.')
        shutil.rmtree(seed_corpus_dir)
        os.mkdir(seed_corpus_dir)
        return

    move_failures = []
    for directory, _, basenames in os.walk(seed_corpus_dir):
        for basename in basenames:
            seed_path = os.path.join(directory, basename)

            if os.path.getsize(seed_path) <= CORPUS_ELEMENT_BYTES_LIMIT:
                # Small enough to keep: rename to its hash in the corpus root.
                target_path = os.path.join(seed_corpus_dir,
                                           utils.file_hash(seed_path))
                try:
                    shutil.move(seed_path, target_path)
                except OSError:
                    move_failures.append((seed_path, target_path))
            else:
                os.remove(seed_path)
                logs.warning('Removed seed file %s as it exceeds 1 Mb limit.',
                             seed_path)

    if move_failures:
        logs.error('Failed to move seed corpus files: %s', move_failures)
示例#6
0
    def do(self):
        """Upload the file named in the options and attach it to an ebook.

        Steps, in order:
          1. Check that the file is a readable regular file and send its
             size, hash, MIME type and extension to ``/api/upload/check``.
          2. Upload the file and request metadata for it through
             ``self.client.call('metadata', ...)``.
          3. Look for an existing ebook: first a search on authors, title and
             series, accepting a hit only when ``damlev`` of the titles is 0
             and at least one author last name matches; then, if nothing was
             accepted, a lookup by full title with the same author check.
          4. Add the upload to the matched ebook, or create a new ebook from
             the metadata and add the upload to that one.
          5. When ``self.opts.json`` is set, print the ebook as JSON.

        Raises:
            ActionError: if the file does not exist or is not readable, or if
                the metadata lacks a title or a language id.
        """
        fname = self.opts.file
        alt_name = self.opts.file_name or fname
        if not (os.access(fname, os.R_OK) and os.path.isfile(fname)):
            raise ActionError('File %s does not exists or is not readable' %
                              fname)
        file_info = {
            'size': os.stat(fname).st_size,
            'hash': file_hash(fname),
            'mime_type': guess_type(alt_name)[0] or '',
            'extension': os.path.splitext(alt_name)[1].lower()[1:] or ''
        }
        # The response is not inspected; presumably the HTTP client raises
        # when the check is rejected - TODO confirm.
        self.http.post('/api/upload/check', json=file_info)
        # The context manager closes the handle on every path, and a failing
        # open() surfaces its own error instead of a secondary one.
        with open(fname, 'rb') as f:
            res = self.http.post('/api/upload',
                                 files={
                                     'file': (os.path.basename(alt_name), f,
                                              file_info['mime_type'])
                                 })
        uploaded_file = res['file']
        log.debug('File uploaded as %s', uploaded_file)
        proposed_meta = self._get_meta()
        res = self.client.call('metadata', uploaded_file, proposed_meta)
        upload_meta_id = res['result']

        res = self.http.get('/api/uploads-meta/%d' % upload_meta_id)
        meta = res['meta']
        log.debug('Metadata #%d for ebook - %s', upload_meta_id, meta)
        if not ('title' in meta and meta['title'] and 'language' in meta
                and meta['language'].get('id')):
            raise ActionError('We need at least title and language')

        # Authors are optional: read them once with .get() so metadata without
        # an 'authors' key is handled the same way everywhere below.
        meta_authors = meta.get('authors')

        search = []
        if meta_authors:
            search.extend(
                x['first_name'] + ' ' + x['last_name']
                if 'first_name' in x else x['last_name']
                for x in meta_authors)
        search.append(meta['title'].replace('/', ''))
        if 'series' in meta:
            search.append(meta['series']['title'])

        search = ' '.join(search)

        def get_ignore_404(*args, **kwargs):
            """GET through self.http, mapping a 404 response to an empty dict."""
            try:
                return self.http.get(*args, **kwargs)
            except HTTPError as e:
                if hasattr(e, 'response') and e.response.status_code == 404:
                    return {}
                raise

        res = get_ignore_404('/api/search/' + quote_plus(search),
                             params={
                                 'page': 1,
                                 'page_size': 5
                             })

        log.debug('search results %s', res)
        book_id = None

        def same_authors(ebook, authors):
            """Count the ebook's authors whose last name is among `authors`.

            When either side has no authors the result is 1 if both sides
            have none and 0 otherwise.
            """
            ebook_authors = ebook.get('authors')
            if not authors or not ebook_authors:
                return 1 if not authors and not ebook_authors else 0
            last_names = {a['last_name'] for a in authors}
            return sum(1 for a in ebook_authors
                       if a['last_name'] in last_names)

        # Some basic sanity check that what we found is OK: the title must
        # match and at least one author last name must be the same.
        for ebook in res.get('items') or []:
            # Threshold is conservatively 0 so that books like Volume I and
            # Volume II are not mixed up.
            if (damlev(ebook['title'], meta['title']) <= 0
                    and same_authors(ebook, meta_authors) >= 1):
                book_id = ebook['id']
                break

        if not book_id:
            res = get_ignore_404('/api/ebooks/index/' +
                                 quote_plus(meta['title']))
            log.debug('Alternative search by full title -result %s', res)
            for ebook in res.get('items') or []:
                if same_authors(ebook, meta_authors) >= 1:
                    book_id = ebook['id']
                    break

        # No match: create the ebook first, then attach the upload either way.
        created = not book_id
        if created:
            res = self.http.post('/api/ebooks', json=meta)
            book_id = res['id']
        self.http.post('/api/ebooks/%d/add-upload' % (book_id, ),
                       json={
                           'upload_id': upload_meta_id,
                           'quality': self.opts.quality
                       })
        if created:
            log.info('Added file to new ebook #%d', book_id)
        else:
            log.info('Added file to existing ebook #%d', book_id)

        if self.opts.json:
            res = self.http.get('/api/ebooks/%d' % book_id)
            print(json.dumps(res))
示例#7
0
    def test_logic(self):
        """Exercise the ``logic`` helpers on records looked up by fixed ids.

        Covers author-string formatting, normalized file names, the check of
        an already known upload, new-location generation, the query for
        converted sources, ebook merging and the handling of '/' in names.
        """
        # --- authors_str formatting ---
        crichton_book = model.Ebook.query.get(33837)
        self.assertEqual(crichton_book.authors_str, 'Crichton Michael')

        strugackij_book = model.Ebook.query.get(37157)
        self.assertEqual(strugackij_book.authors_str,
                         'Strugackij A N, Strugackij B N')

        crowded_book = model.Ebook.query.get(62546)
        self.assertEqual(crowded_book.authors_str,
                         'Wilkins G, Dalton M, Young K')
        crowded_book.authors.append(crichton_book.authors[0])

        self.assertEqual(crowded_book.authors_str,
                         'Wilkins G, Dalton M, Young K and others')

        crichton_book.authors = []
        self.assertEqual(crichton_book.authors_str, 'No Authors')

        # --- normalized file names of stored sources ---
        source = model.Source.query.get(46519)
        self.assertEqual(
            logic.norm_file_name(source),
            'Strugackij A N, Strugackij B N/Noc na Marse(sk)/Strugackij A N, Strugackij B N - Noc na Marse.doc'
        )

        source = model.Source.query.get(63546)
        self.assertEqual(
            logic.norm_file_name(source),
            'Monroe Lucy/Nevesty od Stredozemniho more/Nevesty od Stredozemniho more 2 - Spanelova milenka(cs)/Monroe Lucy - Nevesty od Stredozemniho more 2 - Spanelova milenka.doc'
        )

        # --- upload check of a file that is already known ---
        check = logic.check_uploaded_file('application/epub+zip', ebook_file)
        self.assertEqual(check['error'], 'file already exists')

        # Computed but never asserted on; the calls still fail here if the
        # downloaded file is missing.
        downloaded_size = os.stat(downloaded_file).st_size
        downloaded_hash = utils.file_hash(downloaded_file)

        # --- new location, before and after copying the ebook file ---
        kissinger_source = model.Source.query.get(86060)

        location = logic.create_new_location(kissinger_source,
                                             downloaded_file)
        self.assertEqual(
            location,
            'Kissinger Henry/Roky v Bilem dome(cs)/Kissinger Henry - Roky v Bilem dome.epub'
        )

        shutil.copy(ebook_file, downloaded_file)
        location = logic.create_new_location(kissinger_source,
                                             downloaded_file)
        self.assertEqual(
            location,
            'Kissinger Henry/Roky v Bilem dome(cs)/Kissinger Henry - Roky v Bilem dome(1).epub'
        )

        # --- converted sources for the ebook of the last loaded source ---
        admin = model.User.query.get(1)
        conversion = model.Conversion(
            source=source,
            format=model.Format.query.filter_by(extension='epub').one(),
            location='bla.epub',
            created_by=admin,
            modified_by=admin)
        db.session.add(conversion)
        db.session.commit()

        converted = logic.query_converted_sources_for_ebook(
            source.ebook.id, admin).all()
        self.assertEqual(len(converted), 1)
        self.assertEqual(converted[0].location, 'bla.epub')

        # --- merging: the first ebook ends up with the sources of both ---
        merge_target = model.Ebook.query.get(33837)
        merge_other = model.Ebook.query.get(37157)
        expected_sources = len(merge_target.sources) + len(merge_other.sources)
        logic.merge_ebooks(merge_target, merge_other)
        self.assertEqual(len(merge_target.sources), expected_sources)

        # --- '/' in series, author and title of an unsaved ebook ---
        series = model.Series(title='Series/Neserie')
        author = model.Author(first_name='Jan', last_name='Kocian/Koci')
        book = model.Ebook(title='Neco / Nekde',
                           language=model.Language(code='cs', name='Czech'),
                           series_index=1)
        book.series = series
        book.authors.append(author)

        self.assertEqual(
            logic.norm_file_name(book, 'doc'),
            'Kocian-Koci Jan/Series-Neserie/Series-Neserie 1 - Neco - Nekde(cs)/Kocian-Koci Jan - Series-Neserie 1 - Neco - Nekde.doc'
        )
示例#8
0
 def do(self):
     """Upload the file named in the options and attach it to an ebook.

     Steps, in order:
       1. Check that the file is a readable regular file and send its size,
          hash, MIME type and extension to ``/api/upload/check``.
       2. Upload the file and request metadata for it through
          ``self.client.call('metadata', ...)``.
       3. Look for an existing ebook: first a search on authors, title and
          series, accepting a hit only when ``damlev`` of the titles is 0 and
          at least one author last name matches; then, if nothing was
          accepted, a lookup by full title with the same author check.
       4. Add the upload to the matched ebook, or create a new ebook from the
          metadata and add the upload to that one.
       5. When ``self.opts.json`` is set, print the ebook as JSON.

     Raises:
         ActionError: if the file does not exist or is not readable, or if
             the metadata lacks a title or a language id.
     """
     fname = self.opts.file
     alt_name = self.opts.file_name or fname
     if not (os.access(fname, os.R_OK) and os.path.isfile(fname)):
         raise ActionError('File %s does not exists or is not readable'%fname)
     file_info = {'size': os.stat(fname).st_size,
                  'hash': file_hash(fname),
                  'mime_type': guess_type(alt_name)[0] or '',
                  'extension': os.path.splitext(alt_name)[1].lower()[1:] or ''}
     # The response is not inspected; presumably the HTTP client raises when
     # the check is rejected - TODO confirm.
     self.http.post('/api/upload/check', json=file_info)
     # The context manager closes the handle on every path, and a failing
     # open() surfaces its own error instead of a secondary one.
     with open(fname, 'rb') as f:
         res = self.http.post('/api/upload', files={'file': (os.path.basename(alt_name), f, file_info['mime_type'])})
     uploaded_file = res['file']
     log.debug('File uploaded as %s', uploaded_file)
     proposed_meta = self._get_meta()
     res = self.client.call('metadata', uploaded_file, proposed_meta)
     upload_meta_id = res['result']

     res = self.http.get('/api/uploads-meta/%d'%upload_meta_id)
     meta = res['meta']
     log.debug('Metadata #%d for ebook - %s', upload_meta_id, meta)
     if not ('title' in meta and meta['title'] and 'language' in meta and meta['language'].get('id')):
         raise ActionError('We need at least title and language')

     # Authors are optional: read them once with .get() so metadata without
     # an 'authors' key is handled the same way everywhere below.
     meta_authors = meta.get('authors')

     search = []
     if meta_authors:
         search.extend(x['first_name'] + ' ' + x['last_name'] if 'first_name' in x else x['last_name']
                       for x in meta_authors)
     search.append(meta['title'].replace('/', ''))
     if 'series' in meta:
         search.append(meta['series']['title'])

     search = ' '.join(search)

     def get_ignore_404(*args, **kwargs):
         """GET through self.http, mapping a 404 response to an empty dict."""
         try:
             return self.http.get(*args, **kwargs)
         except HTTPError as e:
             if hasattr(e, 'response') and e.response.status_code == 404:
                 return {}
             raise

     res = get_ignore_404('/api/search/'+quote_plus(search), params={'page':1, 'page_size':5})

     log.debug('search results %s', res)
     book_id = None

     def same_authors(ebook, authors):
         """Count the ebook's authors whose last name is among `authors`.

         When either side has no authors the result is 1 if both sides have
         none and 0 otherwise.
         """
         ebook_authors = ebook.get('authors')
         if not authors or not ebook_authors:
             return 1 if not authors and not ebook_authors else 0
         last_names = {a['last_name'] for a in authors}
         return sum(1 for a in ebook_authors if a['last_name'] in last_names)

     # Some basic sanity check that what we found is OK: the title must match
     # and at least one author last name must be the same.
     for ebook in res.get('items') or []:
         # Threshold is conservatively 0 so that books like Volume I and
         # Volume II are not mixed up.
         if (damlev(ebook['title'], meta['title']) <= 0
                 and same_authors(ebook, meta_authors) >= 1):
             book_id = ebook['id']
             break

     if not book_id:
         res = get_ignore_404('/api/ebooks/index/'+quote_plus(meta['title']))
         log.debug('Alternative search by full title -result %s', res)
         for ebook in res.get('items') or []:
             if same_authors(ebook, meta_authors) >= 1:
                 book_id = ebook['id']
                 break

     # No match: create the ebook first, then attach the upload either way.
     created = not book_id
     if created:
         res = self.http.post('/api/ebooks', json = meta)
         book_id = res['id']
     self.http.post('/api/ebooks/%d/add-upload'%(book_id,), json={'upload_id':upload_meta_id, 'quality':self.opts.quality})
     if created:
         log.info('Added file to new ebook #%d', book_id)
     else:
         log.info('Added file to existing ebook #%d', book_id)

     if self.opts.json:
         res = self.http.get('/api/ebooks/%d'%book_id)
         print(json.dumps(res))
示例#9
0
    def test_logic(self):
        """Run the ``logic`` helpers against records fetched by fixed ids.

        Checks, in order: author-string formatting, normalized file names,
        the upload check of an already known file, new-location generation,
        the converted-sources query, ebook merging, and how '/' characters in
        series, author and title appear in a normalized name.
        """
        # Author strings: one author, two authors, three, then a fourth.
        single_author_book = model.Ebook.query.get(33837)
        self.assertEqual(single_author_book.authors_str, 'Crichton Michael')

        two_author_book = model.Ebook.query.get(37157)
        self.assertEqual(two_author_book.authors_str,
                         'Strugackij A N, Strugackij B N')

        three_author_book = model.Ebook.query.get(62546)
        self.assertEqual(three_author_book.authors_str,
                         'Wilkins G, Dalton M, Young K')
        three_author_book.authors.append(single_author_book.authors[0])

        self.assertEqual(three_author_book.authors_str,
                         'Wilkins G, Dalton M, Young K and others')

        single_author_book.authors = []
        self.assertEqual(single_author_book.authors_str, 'No Authors')

        # Normalized file names for two stored sources.
        source = model.Source.query.get(46519)
        normalized = logic.norm_file_name(source)
        self.assertEqual(
            normalized,
            'Strugackij A N, Strugackij B N/Noc na Marse(sk)/Strugackij A N, Strugackij B N - Noc na Marse.doc'
        )

        source = model.Source.query.get(63546)
        normalized = logic.norm_file_name(source)
        self.assertEqual(
            normalized,
            'Monroe Lucy/Nevesty od Stredozemniho more/Nevesty od Stredozemniho more 2 - Spanelova milenka(cs)/Monroe Lucy - Nevesty od Stredozemniho more 2 - Spanelova milenka.doc'
        )

        # Upload check reports the file as already present.
        outcome = logic.check_uploaded_file('application/epub+zip',
                                            ebook_file)
        self.assertEqual(outcome['error'], 'file already exists')

        # Not asserted on; both calls still fail here if the file is missing.
        file_size = os.stat(downloaded_file).st_size
        file_digest = utils.file_hash(downloaded_file)

        # New location for a source, before and after copying the ebook file.
        stored_source = model.Source.query.get(86060)

        target = logic.create_new_location(stored_source, downloaded_file)
        self.assertEqual(
            target,
            'Kissinger Henry/Roky v Bilem dome(cs)/Kissinger Henry - Roky v Bilem dome.epub'
        )

        shutil.copy(ebook_file, downloaded_file)
        target = logic.create_new_location(stored_source, downloaded_file)
        self.assertEqual(
            target,
            'Kissinger Henry/Roky v Bilem dome(cs)/Kissinger Henry - Roky v Bilem dome(1).epub'
        )

        # A stored conversion is returned by the converted-sources query.
        admin = model.User.query.get(1)
        epub_format = model.Format.query.filter_by(extension='epub').one()
        db.session.add(
            model.Conversion(source=source,
                             format=epub_format,
                             location='bla.epub',
                             created_by=admin,
                             modified_by=admin))
        db.session.commit()

        found = logic.query_converted_sources_for_ebook(source.ebook.id,
                                                        admin).all()
        self.assertEqual(len(found), 1)
        self.assertEqual(found[0].location, 'bla.epub')

        # After merging, the first ebook holds the sources of both.
        keeper = model.Ebook.query.get(33837)
        absorbed = model.Ebook.query.get(37157)
        total_sources = len(keeper.sources) + len(absorbed.sources)
        logic.merge_ebooks(keeper, absorbed)
        self.assertEqual(len(keeper.sources), total_sources)

        # '/' in series, author and title of an ebook built in memory.
        new_series = model.Series(title='Series/Neserie')
        new_author = model.Author(first_name='Jan', last_name='Kocian/Koci')
        new_book = model.Ebook(title='Neco / Nekde',
                               language=model.Language(code='cs',
                                                       name='Czech'),
                               series_index=1)
        new_book.series = new_series
        new_book.authors.append(new_author)

        self.assertEqual(
            logic.norm_file_name(new_book, 'doc'),
            'Kocian-Koci Jan/Series-Neserie/Series-Neserie 1 - Neco - Nekde(cs)/Kocian-Koci Jan - Series-Neserie 1 - Neco - Nekde.doc'
        )