Пример #1
0
    def compile_txt(self, collection=DEFAULT_COLLECTION):
        """
        Download the plain-text files of a collection from the Internet Archive.

        ``self.path_txt`` is created if missing and becomes the process's
        working directory.  For every identifier returned by
        ``self.get_collection_ids`` the ``*.txt`` files are downloaded, moved
        to ``<identifier>.txt`` and the per-item directory is removed.
        Identifiers whose ``<identifier>.txt`` already exists are skipped.
        """
        import shutil

        # make sure the output directory exists, then work from inside it
        if not os.path.exists(self.path_txt):
            os.makedirs(self.path_txt)
        os.chdir(self.path_txt)

        # getting ids
        print(
            f'>> [{self.name}] downloading txt files, using custom function...'
        )
        identifiers = self.get_collection_ids(collection=collection)

        # download txt
        for identifier in tqdm(identifiers, position=1):
            target = identifier + '.txt'
            if os.path.exists(target):
                continue
            ia.download(identifier,
                        silent=True,
                        glob_pattern='*.txt',
                        ignore_existing=True)

            # nothing was written for this item
            if not os.path.exists(identifier):
                continue

            # Every .txt file is renamed to the same target, so when an item
            # holds several of them each rename targets the same path.
            txt_names = [
                entry for entry in os.listdir(identifier)
                if entry.endswith('.txt')
            ]
            for txt_name in txt_names:
                os.rename(os.path.join(identifier, txt_name), target)
            if os.path.exists(identifier):
                shutil.rmtree(identifier)
Пример #2
0
 def __init__(self,ident):
     """
     Set up the directory layout for one film, downloading it if needed.

     When ``<Film.root_dir>/<ident>`` does not exist yet, the item's first
     ``.ogv`` file and first ``_meta.xml`` file are downloaded into it.
     ``self.failed`` is set to True when either file cannot be found, both
     remotely before the download and locally afterwards.
     """
     self.ident = ident
     self.dir = os.path.join(Film.root_dir, ident)
     self.clip_dir = os.path.join(self.dir, "clips")
     self.failed = False

     if not os.path.isdir(self.dir):
         remote_names = [entry['name'] for entry in get_item(ident).files]
         ogg = [name for name in remote_names if ".ogv" in name]
         meta = [name for name in remote_names if "_meta.xml" in name]
         if not (ogg and meta):
             self.failed = True
         else:
             # only the first video and the first metadata file are fetched
             wanted = ogg[:1] + meta[:1]
             os.makedirs(self.dir)
             os.makedirs(self.clip_dir)
             download(ident, files=wanted, destdir=Film.root_dir, verbose=True)

     if not self.failed:
         local_names = os.listdir(self.dir)
         self.ogv = [name for name in local_names if ".ogv" in name]
         self.meta = [name for name in local_names if "_meta.xml" in name]

         if self.ogv and self.meta:
             self.ogv = self.ogv[0]
             self.meta = self.meta[0]
             self.load_meta()
         else:
             # self.ogv / self.meta stay as (possibly empty) lists here
             self.failed = True
Пример #3
0
def stack_downloader(name):
    """
    Download, unpack and convert one Stack Exchange site dump.

    ``site_list.txt`` (in the current working directory) is read to map a
    short site name -- the text before the first ``.`` of an entry -- to the
    archive file name of that entry.  The matching archive is downloaded
    from the ``stackexchange`` item, extracted with ``7z``, converted with
    ``SE_converter`` and the ``.xml`` files in the current working directory
    are removed afterwards.

    :param name: short site name (case-insensitive), e.g. ``"3dprinting"``.
    :raises KeyError: if ``name`` is not listed in ``site_list.txt``.
    """
    name = name.lower()
    stack_exchange_list = []
    name_list = []
    # The context manager closes the file, which the original loop never did.
    with open("site_list.txt", "r") as site_file:
        while True:
            # drop the last character (the line terminator) of the entry
            entry = site_file.readline()[:-1]
            stack_exchange_list.append(entry)
            name_list.append(entry.split('.', 1)[0])
            # NOTE(review): this second readline() discards every other line
            # of the file; presumably the list is double-spaced -- confirm.
            if not site_file.readline():
                break
    # the final pass may have read past the end of the file
    del name_list[-1]
    site_name = dict(zip(name_list, stack_exchange_list))

    download('stackexchange', verbose=True, glob_pattern=site_name[name])
    call(["7z", "x", 'stackexchange/' + site_name[name]])

    # calling the KML converter for Stack_exchange
    SE_converter(name)

    # remove the extracted XML files from the working directory
    dir_name = os.getcwd()
    for file_name in os.listdir(dir_name):
        if file_name.endswith(".xml"):
            os.remove(os.path.join(dir_name, file_name))
Пример #4
0
 def _download_corpus(self):
     """
     Fetch the ``*.txt`` files of the Internet Archive item named by
     ``self.archive_name``, with verbose output enabled.
     """
     identifier = self.archive_name
     download(identifier, glob_pattern="*.txt", verbose=True)
Пример #5
0
 def download_from_dump(self, home, articles, key):
     """
     Extract the requested articles from the ``knolml_dataset`` dump.

     The ``phase_details`` and ``bz2t`` archives are downloaded into
     ``home`` and unpacked under ``<home>/knolml_dataset`` when their
     extracted directories are missing.  Every ``phase_details/*.txt`` index
     file is then scanned; for each line whose first ``#$*$#``-separated
     field is in ``articles``, ``self.extract_from_bzip`` is called.

     :param home: directory that holds (or will hold) ``knolml_dataset``.
     :param articles: container of article names to look for.
     :param key: passed through unchanged to ``extract_from_bzip``.
     """
     dataset_dir = home + '/knolml_dataset'

     # The archives are downloaded with destdir=home, so they must be opened
     # and extracted under ``home`` too (the original hard-coded '~').
     for archive_name in ('phase_details', 'bz2t'):
         if not os.path.isdir(dataset_dir + '/' + archive_name):
             download('knolml_dataset',
                      verbose=True,
                      glob_pattern=archive_name + '.7z',
                      destdir=home)
             Archive(dataset_dir + '/' + archive_name +
                     '.7z').extractall(dataset_dir)

     for index_path in glob.glob(dataset_dir + '/phase_details/*.txt'):
         if 'phase' not in index_path:
             continue
         with open(index_path, 'r') as index_file:
             for line in index_file:
                 # fields used here: article name, file name, index
                 fields = line.split('#$*$#')
                 if fields[0] not in articles:
                     continue
                 print("Found hit for article " + fields[0])
                 self.extract_from_bzip(file=fields[1],
                                        art=fields[0],
                                        index=int(fields[2]),
                                        home=home,
                                        key=key)
Пример #6
0
def internet_archive_download(destination_directory, collection='MBLWHOI', pdf_num=None):
    """Download the PDF files of every item of a collection.

    Uses the internetarchive Python package to fetch ``*.pdf`` files of the
    items in ``collection`` into ``destination_directory``.  An item is
    skipped when its directory already exists and is not empty.

    :param destination_directory: directory the item folders are written to.
    :param collection: Internet Archive collection identifier.
    :param pdf_num: maximum number of search results to process; ``None``
        (the default) processes the whole collection.
    """

    print('Beginning internet archive download...')

    results = internetarchive.search_items('collection:' + collection)
    # The original compared the search result itself with ``pdf_num``, which
    # never matched, so the limit was ignored; count the results instead.
    for processed, result in enumerate(results):
        if pdf_num is not None and processed >= pdf_num:
            break

        archive_id = result['identifier']
        item_dir = os.path.join(destination_directory, archive_id)
        try:
            if not os.path.exists(item_dir) or not os.listdir(item_dir):
                internetarchive.download(archive_id,
                                         verbose=True,
                                         glob_pattern='*.pdf',
                                         destdir=destination_directory)
        except KeyboardInterrupt:
            print('Cancelling download.')
            break
        except Exception:
            # best effort: report the failing item and carry on with the next
            print('ERROR downloading', archive_id)
    return
Пример #7
0
 def download_jp2(self, item, glob_pattern):
     """
     Download the files of ``item`` matching ``glob_pattern`` into
     ``self.top_dir``, retrying until the download call stops raising.

     After every failed attempt the method sleeps for 60 seconds; there is
     no upper bound on the number of attempts.
     """
     while True:
         try:
             download(item,
                      glob_pattern=glob_pattern,
                      destdir=self.top_dir,
                      ignore_existing=True,
                      retries=10)
         except Exception:
             # wait a minute before the next attempt
             time.sleep(60)
         else:
             break
Пример #8
0
def downloadAlphaWithPrefix(prefix):
    """
    Download the ``*.torrent`` files of the items named ``prefix + <char>``
    for every character ``A``..``Z`` followed by ``0``.
    """
    # code points 65..90 are 'A'..'Z'; 48 is '0'
    suffixes = [chr(code) for code in range(65, 91)] + [chr(48)]
    for suffix in suffixes:
        download(prefix + suffix,
                 glob_pattern='*.torrent',
                 no_directory=True,
                 destdir='C:/Users/adam/Desktop/IA')
Пример #9
0
def test_download(tmpdir):
    """Downloading one named file writes exactly that file under ``nasa/``."""
    tmpdir.chdir()
    file_url = '{0}//archive.org/download/nasa/nasa_meta.xml'.format(PROTOCOL)
    with IaRequestsMock() as rsps:
        rsps.add(responses.GET, file_url, body='test content')
        rsps.add_metadata_mock('nasa')
        download('nasa', 'nasa_meta.xml')
        item_dir = os.path.join(str(tmpdir), 'nasa')
        assert len(os.listdir(item_dir)) == 1
        assert load_file('nasa/nasa_meta.xml') == 'test content'
Пример #10
0
def test_download(tmpdir):
    """Check that ``download`` stores the mocked ``nasa_meta.xml`` body."""
    tmpdir.chdir()
    mocked_url = '{0}//archive.org/download/nasa/nasa_meta.xml'.format(
        PROTOCOL)
    with IaRequestsMock() as rsps:
        rsps.add(responses.GET, mocked_url, body='test content')
        rsps.add_metadata_mock('nasa')
        download('nasa', 'nasa_meta.xml')
        downloaded = os.listdir(os.path.join(str(tmpdir), 'nasa'))
        assert len(downloaded) == 1
        assert load_file('nasa/nasa_meta.xml') == 'test content'
def download_collection(username,
                        password,
                        collection,
                        destination,
                        glob="*",
                        dry_run=False):
    """
    Configure credentials, then download ``collection`` into ``destination``.

    ``glob`` is forwarded as the download's ``glob_pattern`` and ``dry_run``
    as its ``dry_run`` flag.
    """
    configure(username, password)
    options = {
        'destdir': destination,
        'glob_pattern': glob,
        'dry_run': dry_run,
    }
    download(collection, **options)
Пример #12
0
def downloadItem(IAid):
    """
    Clean the ``input`` folder, then download the item's ``*_djvu.xml`` and
    ``*_jp2.zip`` files straight into it (no per-item sub-directory).
    """
    cleanfolder("input")
    for pattern in ("*_djvu.xml", "*_jp2.zip"):
        download(IAid,
                 glob_pattern=pattern,
                 destdir="input",
                 verbose=True,
                 no_directory=True)
Пример #13
0
def download_files(query, folder, **kwargs):
    """
    Download every item found by ``search_files`` and list the local files.

    :param query: search query handed to ``search_files``.
    :param folder: destination directory for the downloads.
    :param kwargs: extra keyword arguments for ``search_files``.
    :return: paths of all files found under the downloaded item directories.
    """
    items, formats, _meta = search_files(query, **kwargs)

    item_dirs = []
    for item in items:
        download(item['ID'], destdir=folder, formats=formats)
        item_dirs.append(os.path.join(os.path.abspath(folder), item['ID']))

    file_paths = []
    for item_dir in item_dirs:
        for root, _subdirs, filenames in os.walk(item_dir):
            file_paths.extend(
                os.path.join(root, filename) for filename in filenames)
    return file_paths
Пример #14
0
def test_download(tmpdir):
    """Downloading one named file stores the mocked body under ``nasa/``."""
    tmpdir.chdir()
    file_url = '{0}//archive.org/download/nasa/nasa_meta.xml'.format(protocol)
    metadata_url = '{0}//archive.org/metadata/nasa'.format(protocol)
    with responses.RequestsMock() as rsps:
        rsps.add(responses.GET, file_url, body='test content', status=200)
        rsps.add(responses.GET, metadata_url, body=ITEM_METADATA, status=200)
        download('nasa', 'nasa_meta.xml')
        item_dir = os.path.join(str(tmpdir), 'nasa')
        assert len(os.listdir(item_dir)) == 1
        with open('nasa/nasa_meta.xml') as fh:
            assert fh.read() == 'test content'
Пример #15
0
def assert_title_page(identifier, titlepage, silent=False):
    """
    Make ``titlepage`` the only title page in an item's scandata file.

    The item's ``<identifier>_scandata.xml`` is downloaded and rewritten
    line by line: the ``pageType`` of the leaf numbered ``titlepage`` is
    changed from ``Normal`` to ``Title``, and any other ``Title`` page type
    is changed back to ``Normal``.  When something changed, the file is
    uploaded again.  The local download directory is removed afterwards.

    :param identifier: Internet Archive item identifier.
    :param titlepage: leaf number of the title page (compared as a string).
    :param silent: forwarded to ``ia.download``.
    :return: newline-joined status messages; empty when the item has no
        scandata file.
    :raises ValueError: if looking up the item fails.
    """
    tp = str(titlepage)
    result = list()
    # first download scandata.xml file from the item
    try:
        item = ia.get_item(identifier)
    except Exception:
        # The original raised a bare string, which is itself a TypeError.
        raise ValueError('IA identifier not found.')
    scandata = identifier + '_scandata.xml'
    local_path = os.path.join(identifier, scandata)
    for f in item.files:
        if f['name'] != scandata:
            continue
        ia.download(identifier, files=scandata, silent=silent)
        with open(local_path, 'r') as fh:
            xml = fh.read()
        nochange = True
        match = False  # True between the wanted leafNum and its pageType
        final = list()
        for line in xml.splitlines():
            newline = line
            if 'leafNum' in line:  # like: <page leafNum="0">
                leafnum = line.split('"')[1]
                if leafnum == tp:
                    match = True
            if 'pageType' in line:  # like: <pageType>Normal</pageType>
                if match:
                    if 'Title' in line:
                        result.append('Title page is already declared.')
                    else:
                        newline = line.replace('Normal', 'Title')
                        nochange = False
                    match = False  # don't match in the rest of this document
                elif 'Title' in line:  # erroneous title page from IA
                    newline = line.replace('Title', 'Normal')
                    nochange = False
            final.append(newline)
        if nochange:
            result.append('No changes detected.')
        else:
            with open(local_path, 'w') as fh:
                fh.write('\n'.join(final))
            result.append('Generated new scandata.xml file and uploading...')
            ia.upload(identifier, files=[local_path])
            result.append('Success!')
        rmtree(identifier)
    return '\n'.join(result)
def download_missing_data(rows, file_name_key):
    """
    Download one named file per row into the fixed GCS destination.

    For each row, the file named by ``row[file_name_key]`` of the item
    ``row.identifier`` is downloaded.  Errors are not ignored.
    """
    destination = "gs://the-peoples-speech-west-europe/archive_org/Mar_7_2021/CC_BY_SA_EXPANDED_LICENSES_FILTERED_ACCESS"
    for row in tqdm.tqdm(rows):
        wanted_files = [row[file_name_key]]
        ia.download(
            row.identifier,
            wanted_files,
            destdir=destination,
            # ignore_existing matters: per the original author, tf.io.gfile
            # reports mtime in nanoseconds while archive.org appears to use
            # seconds, and converting between them was judged too risky.
            ignore_existing=True,
            # tf.io.gfile does not expose any functionality like os.utime
            no_change_timestamp=True,
            ignore_errors=False,
        )
Пример #17
0
 def get_data(identifier):
     """
     Download the subtitle, caption and MP3 files of ``identifier`` into
     ``save_directory``, ignoring download errors.
     """
     wanted_formats = [
         "SubRip", "MP3", "Web Video Text Tracks", "Closed Caption Text"
     ]
     ia.download(
         identifier,
         formats=wanted_formats,
         destdir=save_directory,
         # ignore_existing matters: per the original author, tf.io.gfile
         # reports mtime in nanoseconds while archive.org appears to use
         # seconds, and converting between them was judged too risky.
         ignore_existing=True,
         # tf.io.gfile does not expose any functionality like os.utime
         no_change_timestamp=True,
         ignore_errors=True)
Пример #18
0
def test_download(tmpdir):
    """Check that ``download`` writes the mocked ``nasa_meta.xml`` to disk."""
    tmpdir.chdir()
    base = '{0}//archive.org'.format(protocol)
    with responses.RequestsMock() as rsps:
        rsps.add(responses.GET,
                 base + '/download/nasa/nasa_meta.xml',
                 body='test content',
                 status=200)
        rsps.add(responses.GET,
                 base + '/metadata/nasa',
                 body=ITEM_METADATA,
                 status=200)
        download('nasa', 'nasa_meta.xml')
        downloaded = os.listdir(os.path.join(str(tmpdir), 'nasa'))
        assert len(downloaded) == 1
        with open('nasa/nasa_meta.xml') as fh:
            assert fh.read() == 'test content'
Пример #19
0
    def compile_txt(self, collection=DEFAULT_COLLECTION):
        """
        Download the ``*.txt`` files of every item of a collection.

        ``self.path_txt`` is created if missing and becomes the process's
        working directory before the downloads start.

        :param collection: collection handed to ``self.get_collection_ids``.
        """
        # make sure exists
        if not os.path.exists(self.path_txt):
            os.makedirs(self.path_txt)
        os.chdir(self.path_txt)

        # getting ids
        print(
            f'>> [{self.name}] downloading txt files, using custom function...'
        )
        id_list = self.get_collection_ids(collection=collection)

        # download txt
        # Iterate the identifiers themselves: the original looped over
        # enumerate(...), handing an (index, identifier) tuple to ia.download.
        for idx in tqdm(id_list, position=1):
            ia.download(idx,
                        silent=True,
                        glob_pattern='*.txt',
                        ignore_existing=True)
Пример #20
0
def download_text_data(textID, outDir):
    """
    Download all ``.txt`` files of an item.

    :param textID: Internet Archive item identifier.
    :param outDir: destination directory for the download.
    :return: ``(False, [])`` when the item has no ``.txt`` file, otherwise
        ``(<result of download>, <list of the .txt file names>)``.
    """
    txt_names = [
        entry['name'] for entry in get_item(textID).files
        if os.path.splitext(entry['name'])[1] == ".txt"
    ]
    if not txt_names:
        return False, []

    outcome = download(textID, files=txt_names, destdir=outDir)
    return outcome, txt_names
Пример #21
0
def main(item: str):
    """
    Generate the printed-page JSON for one Internet Archive item.

    The item's ``_djvu.xml`` (and ``_scandata.xml``) are downloaded into
    ``<application dir>/iaitems`` when the djvu file is not there yet; the
    JSON is then written next to them as ``<item>_djvu.json``.
    """
    # determine if application is a script file or frozen exe
    if getattr(sys, 'frozen', False):
        application_path = os.path.dirname(sys.executable)
    elif __file__:
        application_path = os.path.dirname(__file__)
    else:
        application_path = ""

    ia_path = os.path.join(application_path, "iaitems")
    item_dir = os.path.join(application_path, "iaitems", item)
    xml_filename = os.path.join(item_dir, item + "_djvu" + ".xml")
    xml_filename_scandata = os.path.join(item_dir,
                                         item + "_scandata" + ".xml")
    json_filename = os.path.join(item_dir, item + "_djvu" + ".json")

    if not os.path.isfile(xml_filename):
        for suffix in ("_djvu.xml", "_scandata.xml"):
            print(f"Downloading {item}{suffix} from internet archive website...")
            download(item,
                     verbose=True,
                     destdir=ia_path,
                     glob_pattern='*' + suffix)

    # Do auto printed page generation
    if not os.path.isfile(xml_filename):
        print(f"Error: File not found [{xml_filename}]!")
        return

    print("Generating printed pages...")
    book = Book(xml_filename)
    scan_data = ScannedData(xml_filename_scandata)
    book.generate_json(item, json_filename, scan_data=scan_data)
Пример #22
0
def get_internet_archive_document(url) -> str:
    """Download a document (book, etc.) from Internet Archive as a string.

    The linked document must have a text version; PDF text extraction is
    not supported at this time.  Single line breaks are turned into spaces,
    while runs of two or more line breaks (paragraph boundaries) are kept.

    :param url: an ``archive.org`` URL whose path has the document id as its
        second segment.
    :return: the cleaned-up text of the first file matching ``*txt``.
    :raises Exception: if the URL has no document id or the download fails.
    """
    validate_url(url, expected_netloc='archive.org')
    url_parts = urlsplit(url).path.split("/")
    if len(url_parts) <= 2:
        raise Exception('Not a valid url')
    document_id = url_parts[2]

    try:
        response = download(document_id,
                            glob_pattern="*txt",
                            return_responses=True)[0]
        # Remove single newlines, preserve double newlines (because they
        # demarcate paragraphs).
        text = re.sub(r'(?<![\r\n])(\r?\n|\n?\r)(?![\r\n])', ' ',
                      response.text.strip())
        # The step above usually leaves double spaces, because most lines
        # already end with a space; collapse them in a second pass so lines
        # that end without a space are still joined by one.
        return re.sub(r'(?<=[\S])(\s\s)(?=[\S])', ' ', text)

    except Exception as err:
        # keep the original failure attached as the cause
        raise Exception(
            f'Archive.org download failed for url: {url}') from err
Пример #23
0
import internetarchive as ia

from internetarchive import search_items

# example url https://archive.org/details/ucberkeley-webcast-PL3E89002AA9B9879E
# id would be ucberkeley-webcast-PL3E89002AA9B9879E
collection_id = "ucberkeley-webcast-PL3E89002AA9B9879E"

search_results = search_items("collection:" + collection_id)

# dry_run=True makes this a dry run instead of a real download;
# set it to False to actually download the files.
download_options = {
    "verbose": True,
    "dry_run": True,
    "glob_pattern": "*mp4",
}
for result in search_results:
    ia.download(result["identifier"], **download_options)
Пример #24
0
# Interactive login; for automated scripting use
# configure('*****@*****.**', 'password')
configure()

s = ArchiveSession()

# Change this to download only selected file types, e.g. pattern='*mobi'
# will download only Kindle formatted e-books.
pattern = None

# fill this in -- searches for the ID of a collection in IA
coll = ia.Search(s, 'collection:xxxxxxxx')
# example of collection page: https://archive.org/details/johnjaycollegeofcriminaljustice
# the collection ID for that page is johnjaycollegeofcriminaljustice
# you can tell a page is a collection if it has a 'Spotlight Item' on the left

num = 0

for result in coll:  # for all items in a collection
    num = num + 1  # item count
    itemid = result['identifier']
    print('Downloading: #' + str(num) + '\t' + itemid)
    try:
        download(itemid, ignore_existing=True, glob_pattern=pattern)
        print('\t\t Download success.')
    except Exception as e:
        # Report both the item and the error; the original format string had
        # "()" instead of "{}", so the error itself was never printed.
        print("Error Occurred downloading {} = {}".format(itemid, e))
    print('Pausing for 40 minutes')
    # IA restricts the number of things you can download. Be nice to
    # their servers -- limit how much you download, too. For me, this
    # time restriction is still not polite enough, and my connection gets
    # cut off all the dang time.
    time.sleep(2400)
Пример #25
0
@author: descentis
"""
from internetarchive import download
from subprocess import call

name = "3dprinting"
name = name.lower()
stack_exchange_list = []
name_list = []
# Build the lists from site_list.txt; the context manager closes the file,
# which the original loop never did.
with open("site_list.txt", "r") as f:
    while True:
        # drop the last character (the line terminator) of the entry
        a = f.readline()[:-1]
        stack_exchange_list.append(a)
        # the short site name is the text before the first '.'
        name_list.append(a.split('.', 1)[0])
        # NOTE(review): this second readline() discards every other line of
        # the file; presumably the list is double-spaced -- confirm.
        if not f.readline():
            break
# the final pass may have read past the end of the file
del name_list[-1]
site_name = dict(zip(name_list, stack_exchange_list))

download('stackexchange', verbose=True, glob_pattern=site_name[name])
call(["7z", "x", site_name[name]])
Пример #26
0
 def download_item(self, metadata):
     """
     Download the single file described by ``metadata`` into the directory
     returned by ``self.get_file_directory()``.

     ``metadata`` must provide ``identifier`` and ``file_name``.  Returns
     whatever ``download`` returns.
     """
     identifier = metadata.get('identifier')
     wanted_files = [metadata.get('file_name')]
     return download(identifier,
                     files=wanted_files,
                     destdir=self.get_file_directory(),
                     silent=True,
                     no_directory=True,
                     retries=3,
                     ignore_existing=True,
                     ignore_errors=True)
Пример #27
0
from internetarchive.session import ArchiveSession
from internetarchive import get_item
from internetarchive import download

ident = 'podcasts'
destifolder = 'iapodcasts'
search = ia.search_items('collection:%s' % ident)
# entries already present in the destination folder are not downloaded again
current = os.listdir(destifolder)

num = 0

for result in search:  # for all items in a collection
    num = num + 1  # item count
    itemid = result['identifier']
    print('Downloading: #' + str(num) + '\t' + itemid)
    if itemid not in current:
        try:
            download(itemid,
                     destdir=destifolder,
                     retries=5,
                     glob_pattern=['*.ogg', '*.mp3', '*.wav', '*.flv'])
            print('\t\t Download success.')
        except Exception as e:
            # Report both the item and the error; the original format string
            # had "()" instead of "{}", so the error was never printed.
            print("Error Occurred downloading {} = {}".format(itemid, e))
            # NOTE(review): the message below is printed but the matching
            # sleep is disabled, so no pause actually happens.
            print('Pausing for 20 minutes')
            #time.sleep(1200)
        #time.sleep(0.5)

    # stop after 5000 search results
    if num == 5000:
        break
def download_collection(username,password,collection,destination,glob="*",dry_run=False):
    """
    Configure credentials, then download ``collection`` into ``destination``.

    ``glob`` is forwarded as the download's ``glob_pattern`` and ``dry_run``
    as its ``dry_run`` flag.
    """
    configure(username, password)
    download(
        collection,
        destdir=destination,
        glob_pattern=glob,
        dry_run=dry_run,
    )
Пример #29
0
def main(item: str, **kwargs):
    """
    Generate the printed-page JSON for one book, from an item id or from files.

    Two modes are supported:

    * ``item`` given: paths are derived from ``<application dir>/iaitems/<item>``
      and the ``_djvu.xml`` / ``_scandata.xml`` files are downloaded when the
      djvu file is missing.
    * ``item`` is None: ``ia_path``, ``xml_filename`` and ``json_filename``
      (optionally ``xml_filename_scandata``) are taken from ``kwargs`` and
      must point at existing files below ``ia_path``.

    Recognised keyword arguments: ``xml_filename``, ``xml_filename_scandata``,
    ``json_filename``, ``ia_path``.  Errors are reported with ``print`` and
    the function returns None in every case.
    """
    # determine if application is a script file or frozen exe
    application_path = ""
    # NOTE(review): these defaults are "" while the checks below test for
    # None, so the checks only behave as intended when the caller passes the
    # keys explicitly (with None for "not given") -- confirm against callers.
    xml_file_name = kwargs.get('xml_filename', "")
    xml_file_name_scan_data = kwargs.get('xml_filename_scandata', "")
    json_file_name = kwargs.get('json_filename', "")
    ia_path = kwargs.get('ia_path', "")

    if item is None and ia_path is None:
        print("Error: Unrecognized Arguments")
        return

    # An item id may not be combined with any of the explicit file arguments.
    if item is not None and \
            (xml_file_name is not None or xml_file_name_scan_data
             is not None or json_file_name is not None or ia_path is not None):
        print("Error: \"-item\" parameter should not be mixed with other parameters")
        return

    if item is not None:
        # Item mode: everything lives under <application dir>/iaitems/<item>.
        if getattr(sys, 'frozen', False):
            application_path = os.path.dirname(sys.executable)
        elif __file__:
            application_path = os.path.dirname(__file__)

        ia_path = os.path.join(application_path, "iaitems")

        if not os.path.exists(ia_path):
            os.mkdir(ia_path)

        xml_file_name = os.path.join(application_path, "iaitems", item, "".join([item, "_djvu", ".xml"]))
        xml_file_name_scan_data = os.path.join(application_path, "iaitems", item, "".join([item, "_scandata", ".xml"]))
        json_file_name = os.path.join(application_path, "iaitems", item, "".join([item, "_pages", ".json"]))
    else:
        # File mode: re-read the arguments, this time defaulting to None.
        xml_file_name = kwargs.get('xml_filename', None)
        xml_file_name_scan_data = kwargs.get('xml_filename_scandata', None)
        json_file_name = kwargs.get('json_filename', None)
        val_error = []
        # NOTE(review): item is None here, and the first check of the function
        # already returned when ia_path was None too, so this branch looks
        # unreachable; its message also lacks the parameter name.
        if ia_path is None:
            val_error.append("is not provided")

        if xml_file_name is None:
            val_error.append("xml_filename is not provided")

        if json_file_name is None:
            val_error.append("json_filename is not provided")

        if ','.join(val_error) != "":
            print("Error: " + '\r\n'.join(val_error))
            return

        if not os.path.isdir(ia_path):
            print("Error: ia_path \"" + ia_path + "\" does not exist")
            return

        # The item name is the lower-cased xml file name without "_djvu.xml".
        item = xml_file_name.lower().replace("_djvu.xml", "")
        # xml_file_name = os.path.join(ia_path, item, xml_file_name)
        xml_file_name = os.path.join(ia_path, xml_file_name)
        json_file_name = os.path.join(ia_path, json_file_name)
        # json_file_name = os.path.join(ia_path, item, json_file_name)
        if not os.path.isfile(xml_file_name):
            print("Error: xml_filename \"" + xml_file_name + "\" does not exist")
            return

        if xml_file_name_scan_data is not None and xml_file_name_scan_data != "":
            # item = xml_file_name.lower().replace("_scandata.xml", "")
            xml_file_name_scan_data = os.path.join(ia_path, xml_file_name_scan_data)
            # xml_filename_scan_data = os.path.join(ia_path, item, xml_filename_scan_data)
            if not os.path.isfile(xml_file_name_scan_data):
                print("Error: xml_filename_scandata \"" + xml_file_name_scan_data + "\" does not exist")
                return

    # Download the djvu and scandata files when the djvu file is not on disk.
    if not os.path.isfile(xml_file_name):
        from internetarchive import download
        print("Downloading " + item + "_djvu.xml from internet archive website...")
        download(item, verbose=True, destdir=ia_path, glob_pattern='*_djvu.xml')

        print("Downloading " + item + "_scandata.xml from internet archive website...")
        # NOTE(review): only NameError is swallowed here; it is unclear which
        # failure of the scandata download this is meant to cover.
        try:
            download(item, verbose=True, destdir=ia_path, glob_pattern='*_scandata.xml')
        except NameError:
            pass

    # Do auto printed page generation
    if os.path.isfile(xml_file_name):
        # start from a clean output file
        if os.path.exists(json_file_name):
            os.remove(json_file_name)

        print("Generating printed pages...")

        bk = Book()
        bk.load_xml(xml_file_name)

        if not bk.has_valid_leaf_no:
            print("djvu error: unable to extract leaf number.")
            return

        # The scandata file is currently not used: ScanData is built from an
        # empty string and the code loading the real file is commented out.
        scan_data = ScanData("")
        #if xml_file_name_scan_data is not None:
        #    if os.path.isfile(xml_file_name_scan_data):
        #        scan_data = ScanData(xml_file_name_scan_data)

        bk.generate_json(item, json_file_name, scan_data=scan_data)

    else:
        print("Error: File not found [" + xml_file_name + "]!")
Пример #30
0
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 26 13:13:57 2019

@author: singh
"""
"""
from selenium import webdriver

url = 'http://wayback.archive.org/web/20131018144323/http://www.infosonics.com:80/'
path = 'D:\\Summer_RA\\Code\\scrape.png'

options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument('window-size=1200x600') 

driver = webdriver.Chrome(executable_path = 'D:\\Summer_RA\\Code\\chromedriver.exe', chrome_options = options)
driver.get(url)
el = driver.find_element_by_tag_name('body')
el.screenshot(path)
driver.quit()
"""

from internetarchive import download
# Download the Internet Archive item 'gatewaycasinosincomefund.com' with
# verbose output enabled.
download('gatewaycasinosincomefund.com', verbose=True)
Пример #31
0
Файл: sample.py Проект: edsu/spn
            total_size += item['size']

# Summarise what is about to be downloaded.
total_items = len(item_ids)
size_in_gb = total_size / 1024 / 1024 / 1024.0
print("There are %s Internet Archive items to download" % total_items)
print("The total size will be %0.2f GB" % size_in_gb)
print("And here they are")
for item_id in item_ids:
    print(item_id)

# Now let's download them.

# In[ ]:

# Fetch the "*arc.gz" and "*cdx.gz" files of every item into ./data,
# skipping files that already exist.
count = 0
for count, item_id in enumerate(item_ids, start=1):
    print('[%s/%s] downloading %s' % (count, len(item_ids), item_id))
    ia.download(item_id,
                glob_pattern=["*arc.gz", "*cdx.gz"],
                destdir="data",
                ignore_existing=True)

# The reality is that it can take weeks (or months) to sample and download, so you probably want to export this notebook as a .py file and run it on a reliable server in a screen or tmux session:
#
# ```
# % jupyter nbconvert --to script Sample.ipynb
# % python Sample.py
# ```
#

# In[ ]:
Пример #32
0
## downloads all items in a given Internet Archive collection
## !! will probably crash after 10 or so items !! feel free to edit the script to make it better for bigger collections
## See http://programminghistorian.org/lessons/data-mining-the-internet-archive for more detailed info
import os
import time
import sys
import internetarchive as ia
from internetarchive.session import ArchiveSession
from internetarchive import get_item
from internetarchive import download

# The collection identifier is taken from the first command-line argument.
search = ia.search_items('collection:%s' % sys.argv[1])

num = 0

for result in search:  # for all items in a collection
    num = num + 1  # item count
    itemid = result['identifier']
    print('Downloading: #' + str(num) + '\t' + itemid)
    try:
        download(itemid)
        print('\t\t Download success.')
    except Exception as e:
        # Report both the item and the error; the original format string had
        # "()" instead of "{}", so the error itself was never printed.
        print("Error Occurred downloading {} = {}".format(itemid, e))
    print('Pausing for 40 minutes')
    # IA restricts the number of things you can download. Be nice to
    # their servers -- limit how much you download, too. For me, this
    # time restriction is still not polite enough, and my connection gets
    # cut off all the dang time.
    time.sleep(2400)
#-----------------------------------------------------------------------------------------------------------------------

# READ CSV FILE INTO LIST

# Read the identifiers once, one per line, without line terminators.
# (The original read the file twice and discarded the first result.)
with open(r"H:\14000.csv") as f:  # PATH TO CSV FILE
    x = f.read().splitlines()

#-----------------------------------------------------------------------------------------------------------------------

# DOWNLOAD ALL 14000 GRATEFUL DEAD SHOWS AT ONCE

for a in x:
    download(a,
             verbose=True,
             glob_pattern='*.mp3',
             destdir=r"C:\Users\username\Desktop\gd"
             )  # LOCAL DIRECTORY TO SAVE FILES

#-----------------------------------------------------------------------------------------------------------------------

# END TIME OF JOB

# NOTE(review): `t` and `start` are not defined in the visible part of this
# script; presumably `import time as t` and `start = t.time()` appear above.
end = t.time()
print('time to complete: ' + str((end - start) / 60) + ' minutes')

#-----------------------------------------------------------------------------------------------------------------------
## See http://programminghistorian.org/lessons/data-mining-the-internet-archive for more detailed info
import os
import time
import sys
import internetarchive as ia
from internetarchive.session import ArchiveSession
from internetarchive import get_item
from internetarchive import download

# The collection identifier is taken from the first command-line argument.
search = ia.search_items('collection:%s' % sys.argv[1])

num = 0

for result in search:  # for all items in a collection
    num = num + 1  # item count
    itemid = result['identifier']
    print('Downloading: #' + str(num) + '\t' + itemid)
    try:
        # This line was tab-indented in the original, mixing tabs and spaces.
        download(itemid)
        print('\t\t Download success.')
    except Exception as e:
        # Report both the item and the error; the original format string had
        # "()" instead of "{}", so the error itself was never printed.
        print("Error Occurred downloading {} = {}".format(itemid, e))
    print('Pausing for 40 minutes')
    # IA restricts the number of things you can download. Be nice to
    # their servers -- limit how much you download, too. For me, this
    # time restriction is still not polite enough, and my connection gets
    # cut off all the dang time.
    time.sleep(2400)
    
    
    
Пример #35
0
def parse_internet_archive(collection='', channel=''):
    """
    Download the movies of an Internet Archive collection and publish them.

    For every item of ``collection`` the MPEG4 files are downloaded into
    ``downloads/``, title, description, author, language and license are
    read from the item's ``_meta.xml``, and each downloaded file whose name
    contains ``.mp4`` or ``.ogv`` is handed to ``publish``; other files go
    to ``file_not_supported``.

    :param collection: collection identifier; the process exits when empty.
    :param channel: passed through unchanged to ``publish``.
    """
    if collection == '':
        print(
            "A collection name is required for importing Internet Archive content.\n"
        )
        print("  use -h for help with more command line arguments")
        sys.exit()

    movies = search_items('collection:%s' % collection)

    for item in movies.iter_as_items():
        print("--------------------------\nDownloading: " + item.identifier)
        # note - currently this will download all movie formats that match mpeg4
        download(item.identifier,
                 verbose=True,
                 destdir="downloads",
                 formats=['512Kb MPEG4', 'MPEG4'])
        # metadata
        meta = untangle.parse("downloads/" + item.identifier + "/" +
                              item.identifier + "_meta.xml")

        # NOTE(review): a missing title or description returns from the whole
        # function, so later items of the collection are not processed even
        # though the message says "Skipping" -- confirm this is intended.
        try:
            title = _first_cdata(meta.metadata, 'title')
        except AttributeError:
            print("Skipping import - Unable to find a title for : " +
                  item.identifier)
            return
        try:
            description = _first_cdata(meta.metadata, 'description')
        except AttributeError:
            print("Skipping import - Unable to find a description for : " +
                  item.identifier)
            return

        # author: the director if present, else the publisher, else empty
        try:
            author = meta.metadata.director.cdata
        except AttributeError:
            try:
                author = meta.metadata.publisher.cdata
            except AttributeError:
                author = ""

        # NOTE(review): every other field is read from `meta.metadata`; this
        # one reads `meta.item_metadata` -- confirm that is the right node.
        try:
            language = meta.item_metadata.language.cdata
            if language == 'english':
                language = 'en'
        except AttributeError:
            language = 'en'

        try:
            license = meta.metadata.licenseurl.cdata
        except AttributeError:
            license = 'public'

        if _debug:
            print('=' * 70)
            print("Metadata found is:")
            print("Title is = " + title)
            print("Description = " + description)
            print("Author = " + author)
            print("Language is = " + language)
            print("License url is = " + license)
            print('=' * 70)

        # now add the file to the blockchain
        path = os.path.dirname(
            os.path.abspath(__file__)) + "/downloads/" + item.identifier + "/"

        # Process the downloaded files to find the movie to upload
        try:
            onlyfiles = [f for f in listdir(path) if isfile(join(path, f))]
        except OSError:
            # Nothing to publish for this item; the original fell through to
            # a loop over a variable that was never assigned.
            print("No file exists here that was downloaded")
            continue

        mp4 = ".mp4"
        ogv = ".ogv"
        # Iterate the file names directly: the original ran them through
        # csv.reader, which splits names that contain commas.
        for f in onlyfiles:
            if f.find(mp4) > 0 or f.find(ogv) > 0:
                publish(channel, f, path + f, title, description, author,
                        language, license)
            else:
                file_not_supported(f)


def _first_cdata(parent, tag):
    """Return the text of child ``tag``, using the first one when duplicated.

    Raises AttributeError when ``parent`` has no such child.
    """
    node = getattr(parent, tag)
    try:
        return node.cdata
    except AttributeError:
        # there are duplicate entries in the xml, take the 1st
        return node[0].cdata