def compile_txt(self, collection=DEFAULT_COLLECTION):
    """
    This will download the txt files from IA.
    """
    import shutil

    # make sure the destination directory exists
    if not os.path.exists(self.path_txt):
        os.makedirs(self.path_txt)
    os.chdir(self.path_txt)

    # getting ids
    print(f'>> [{self.name}] downloading txt files, using custom function...')
    id_list = self.get_collection_ids(collection=collection)

    # download txt
    for i, idx in enumerate(tqdm(id_list, position=1)):
        if os.path.exists(idx + '.txt'):
            continue
        ia.download(idx, silent=True, glob_pattern='*.txt', ignore_existing=True)

        # get files and rename
        if not os.path.exists(idx):
            continue
        for fn in os.listdir(idx):
            if not fn.endswith('.txt'):
                continue
            fnfn = os.path.join(idx, fn)
            os.rename(fnfn, idx + '.txt')
        if os.path.exists(idx):
            shutil.rmtree(idx)
def __init__(self, ident):
    """Set up directory info - download from archive if necessary."""
    self.ident = ident
    self.dir = os.path.join(Film.root_dir, ident)
    self.clip_dir = os.path.join(self.dir, "clips")
    self.failed = False

    if not os.path.isdir(self.dir):
        item = get_item(ident)
        ogg = [x['name'] for x in item.files if ".ogv" in x['name']]
        meta = [x['name'] for x in item.files if "_meta.xml" in x['name']]
        if ogg and meta:
            ogg = ogg[:1]
            meta = meta[:1]
            os.makedirs(self.dir)
            os.makedirs(self.clip_dir)
            download(ident, files=ogg + meta, destdir=Film.root_dir, verbose=True)
        else:
            self.failed = True

    if not self.failed:
        self.ogv = [x for x in os.listdir(self.dir) if ".ogv" in x]
        self.meta = [x for x in os.listdir(self.dir) if "_meta.xml" in x]
        if self.ogv and self.meta:
            self.ogv = self.ogv[0]
            self.meta = self.meta[0]
            self.load_meta()
        else:
            self.failed = True
def stack_downloader(name):
    name = name.lower()
    stack_exchange_list = []
    name_list = []

    # build a mapping from site name (e.g. "3dprinting") to dump file name
    with open("site_list.txt", "r") as f:
        for line in f:
            line = line.rstrip('\n')
            if not line:
                continue
            stack_exchange_list.append(line)
            name_list.append(line.split('.')[0])

    site_name = {}
    for i in range(len(name_list)):
        site_name[name_list[i]] = stack_exchange_list[i]

    download('stackexchange', verbose=True, glob_pattern=site_name[name])
    call(["7z", "x", 'stackexchange/' + site_name[name]])

    # calling the KML converter for Stack_exchange
    SE_converter(name)

    # clean up the extracted xml files
    dir_name = os.getcwd()
    for file in os.listdir(dir_name):
        if file.endswith(".xml"):
            os.remove(os.path.join(dir_name, file))
def _download_corpus(self):
    """
    Downloads a corpus of text from internet archive to current working directory
    """
    download(self.archive_name, verbose=True, glob_pattern="*.txt")
def download_from_dump(self, home, articles, key):
    # fetch and extract the phase details and bz2t archives if not already present
    if not os.path.isdir(home + '/knolml_dataset/phase_details'):
        download('knolml_dataset', verbose=True, glob_pattern='phase_details.7z', destdir=home)
        Archive(home + '/knolml_dataset/phase_details.7z').extractall(home + '/knolml_dataset')
    if not os.path.isdir(home + '/knolml_dataset/bz2t'):
        download('knolml_dataset', verbose=True, glob_pattern='bz2t.7z', destdir=home)
        Archive(home + '/knolml_dataset/bz2t.7z').extractall(home + '/knolml_dataset')

    fileList = glob.glob(home + '/knolml_dataset/phase_details/*.txt')
    for files in fileList:
        if 'phase' in files:
            with open(files, 'r') as myFile:
                for line in myFile:
                    l = line.split('#$*$#')
                    if l[0] in articles:
                        print("Found hit for article " + l[0])
                        # file, art, index, home, key
                        self.extract_from_bzip(file=l[1], art=l[0], index=int(l[2]), home=home, key=key)
def internet_archive_download(destination_directory, collection='MBLWHOI', pdf_num=None):
    """
    Uses the internetarchive Python package to stream pdf pages from a given
    collection into a provided destination_directory.
    """
    print('Beginning internet archive download...')
    for count, result in enumerate(internetarchive.search_items('collection:' + collection)):
        # stop once pdf_num items have been processed (if a limit was given)
        if pdf_num is not None and count == pdf_num:
            break
        archive_id = result['identifier']
        try:
            item_dir = os.path.join(destination_directory, archive_id)
            if not os.path.exists(item_dir) or os.listdir(item_dir) == []:
                internetarchive.download(archive_id, verbose=True,
                                         glob_pattern='*.pdf',
                                         destdir=destination_directory)
        except KeyboardInterrupt:
            print('Cancelling download.')
            break
        except Exception:
            print('ERROR downloading', archive_id)
    return
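# Hedged usage sketch for internet_archive_download above: the destination path and
# the pdf_num cutoff below are illustrative assumptions, not values from the source.
if __name__ == '__main__':
    internet_archive_download('./mblwhoi_pdfs', collection='MBLWHOI', pdf_num=5)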
def download_jp2(self, item, glob_pattern):
    success = False
    while not success:
        try:
            download(item, glob_pattern=glob_pattern, destdir=self.top_dir,
                     ignore_existing=True, retries=10)
            success = True
        except Exception as e:
            success = False
            time.sleep(60)
def downloadAlphaWithPrefix(prefix):
    for charCode in range(65, 92):
        if charCode == 91:
            # Use for number
            charCode = 48
        char = chr(charCode)
        download(prefix + char, glob_pattern='*.torrent', no_directory=True,
                 destdir='C:/Users/adam/Desktop/IA')
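# Hypothetical call for the helper above; "newsletters_" is only an illustrative prefix.
# It would fetch torrents for items named newsletters_A ... newsletters_Z and newsletters_0
# into the hard-coded destination directory.
downloadAlphaWithPrefix("newsletters_")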
def test_download(tmpdir):
    tmpdir.chdir()
    with IaRequestsMock() as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/download/nasa/nasa_meta.xml'.format(PROTOCOL),
                 body='test content')
        rsps.add_metadata_mock('nasa')
        download('nasa', 'nasa_meta.xml')
        p = os.path.join(str(tmpdir), 'nasa')
        assert len(os.listdir(p)) == 1
        assert load_file('nasa/nasa_meta.xml') == 'test content'
def download_collection(username, password, collection, destination, glob="*", dry_run=False):
    configure(username, password)
    download(collection, destdir=destination, glob_pattern=glob, dry_run=dry_run)
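# Hedged usage sketch for download_collection above; the credentials, collection
# identifier, and destination directory are placeholders, not real values.
download_collection("user@example.com", "secret", "nasa", "./downloads",
                    glob="*.pdf", dry_run=True)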
def downloadItem(IAid):
    cleanfolder("input")
    download(IAid, glob_pattern="*_djvu.xml", destdir="input", verbose=True, no_directory=True)
    download(IAid, glob_pattern="*_jp2.zip", destdir="input", verbose=True, no_directory=True)
def download_files(query, folder, **kwargs):
    items, formats, meta = search_files(query, **kwargs)
    download_paths = []
    file_paths = []
    for item in items:
        download(item['ID'], destdir=folder, formats=formats)
        download_paths.append("{}".format(os.path.join(os.path.abspath(folder), item['ID'])))
    for path in download_paths:
        for dirname, dirnames, filenames in os.walk(path):
            for filename in filenames:
                file_paths.append(os.path.join(dirname, filename))
    return file_paths
def test_download(tmpdir):
    tmpdir.chdir()
    with responses.RequestsMock() as rsps:
        rsps.add(responses.GET,
                 '{0}//archive.org/download/nasa/nasa_meta.xml'.format(protocol),
                 body='test content', status=200)
        rsps.add(responses.GET,
                 '{0}//archive.org/metadata/nasa'.format(protocol),
                 body=ITEM_METADATA, status=200)
        download('nasa', 'nasa_meta.xml')
        p = os.path.join(str(tmpdir), 'nasa')
        assert len(os.listdir(p)) == 1
        with open('nasa/nasa_meta.xml') as fh:
            assert fh.read() == 'test content'
def assert_title_page(identifier, titlepage, silent=False):
    tp = str(titlepage)
    result = list()

    # first download scandata.xml file from the item
    try:
        item = ia.get_item(identifier)
    except Exception:
        raise ValueError('IA identifier not found.')
    scandata = identifier + '_scandata.xml'
    for f in item.files:
        if f['name'] == scandata:
            ia.download(identifier, files=scandata, silent=silent)

    with open(os.path.join(identifier, scandata), 'r') as fh:
        xml = fh.read()

    nochange = True
    match = False
    final = list()
    for line in xml.splitlines():
        newline = line
        if 'leafNum' in line:
            # like: <page leafNum="0">
            leafnum = line.split('"')[1]
            if leafnum == tp:
                match = True
        if 'pageType' in line:
            # like: <pageType>Normal</pageType>
            if match is True:
                if 'Title' in line:
                    result.append('Title page is already declared.')
                else:
                    newline = line.replace('Normal', 'Title')
                    nochange = False
                match = False  # don't match in the rest of this document
            elif 'Title' in line:
                # erroneous title page from IA
                newline = line.replace('Title', 'Normal')
                nochange = False
        final.append(newline)

    if nochange is True:
        result.append('No changes detected.')
    else:
        with open(os.path.join(identifier, scandata), 'w') as fh:
            fh.write('\n'.join(final))
        result.append('Generated new scandata.xml file and uploading...')
        ia.upload(identifier, files=[os.path.join(identifier, scandata)])
        result.append('Success!')
    rmtree(identifier)
    return '\n'.join(result)
def download_missing_data(rows, file_name_key):
    for row in tqdm.tqdm(rows):
        ia.download(
            row.identifier,
            [row[file_name_key]],
            destdir="gs://the-peoples-speech-west-europe/archive_org/Mar_7_2021/CC_BY_SA_EXPANDED_LICENSES_FILTERED_ACCESS",
            # Very important to set this. tf.io.gfile uses mtime in
            # nanoseconds, while archive.org uses mtime in seconds
            # (as far as I can tell). I could convert the
            # nanoseconds to seconds, of course, but don't want to
            # make an error.
            ignore_existing=True,
            # tf.io.gfile does not expose any functionality like os.utime
            no_change_timestamp=True,
            ignore_errors=False,
        )
def get_data(identifier):
    ia.download(
        identifier,
        formats=[
            "SubRip", "MP3", "Web Video Text Tracks", "Closed Caption Text"
        ],
        destdir=save_directory,
        # Very important to set this. tf.io.gfile uses mtime in
        # nanoseconds, while archive.org uses mtime in seconds
        # (as far as I can tell). I could convert the
        # nanoseconds to seconds, of course, but don't want to
        # make an error.
        ignore_existing=True,
        # tf.io.gfile does not expose any functionality like os.utime
        no_change_timestamp=True,
        ignore_errors=True)
def compile_txt(self, collection=DEFAULT_COLLECTION):
    """
    This will download the txt files from IA.
    """
    # make sure the destination directory exists
    if not os.path.exists(self.path_txt):
        os.makedirs(self.path_txt)
    os.chdir(self.path_txt)

    # getting ids
    print(f'>> [{self.name}] downloading txt files, using custom function...')
    id_list = self.get_collection_ids(collection=collection)

    # download txt
    for idx in tqdm(id_list, position=1):
        ia.download(idx, silent=True, glob_pattern='*.txt', ignore_existing=True)
def download_text_data(textID, outDir):
    item = get_item(textID)
    namesFile = []
    for data in item.files:
        name = data['name']
        if os.path.splitext(name)[1] == ".txt":
            namesFile.append(name)
    if len(namesFile) == 0:
        return False, []
    return download(textID, files=namesFile, destdir=outDir), namesFile
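# Possible usage of download_text_data above (the identifier and output directory are
# placeholders): the first return value is False when no .txt files exist, otherwise
# whatever internetarchive.download() returns for those files.
result, names = download_text_data("nasa", "./texts")
if names:
    print("downloaded:", names)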
def main(item: str):
    # determine if application is a script file or frozen exe
    application_path = ""
    if getattr(sys, 'frozen', False):
        application_path = os.path.dirname(sys.executable)
    elif __file__:
        application_path = os.path.dirname(__file__)

    ia_path = os.path.join(application_path, "iaitems")
    xml_filename = os.path.join(application_path, "iaitems", item, "".join([item, "_djvu", ".xml"]))
    xml_filename_scandata = os.path.join(application_path, "iaitems", item, "".join([item, "_scandata", ".xml"]))
    json_filename = os.path.join(application_path, "iaitems", item, "".join([item, "_djvu", ".json"]))

    if not os.path.isfile(xml_filename):
        print(f"Downloading {item}_djvu.xml from internet archive website...")
        download(item, verbose=True, destdir=ia_path, glob_pattern='*_djvu.xml')
        print(f"Downloading {item}_scandata.xml from internet archive website...")
        download(item, verbose=True, destdir=ia_path, glob_pattern='*_scandata.xml')

    # Do auto printed page generation
    if os.path.isfile(xml_filename):
        print("Generating printed pages...")
        book = Book(xml_filename)
        scan_data = ScannedData(xml_filename_scandata)
        book.generate_json(item, json_filename, scan_data=scan_data)
    else:
        print(f"Error: File not found [{xml_filename}]!")
def get_internet_archive_document(url) -> str:
    """Downloads a document (book, etc.) from Internet Archive and returns it as a string.
    The linked document must have a text version. PDF text extraction is not supported at this time.
    """
    validate_url(url, expected_netloc='archive.org')
    url_parts = urlsplit(url).path.split("/")
    if len(url_parts) > 2:
        document_id = url_parts[2]
    else:
        raise Exception(f'Not a valid url')
    try:
        response = download(document_id, glob_pattern="*txt", return_responses=True)[0]
        # Remove single newlines, preserve double newlines (because they demarcate paragraphs)
        text = re.sub(r'(?<![\r\n])(\r?\n|\n?\r)(?![\r\n])', ' ', response.text.strip())
        # This usually creates double spaces between lines because most lines end with single spaces,
        # but to account for cases in which lines end without spaces, we handle this in two steps
        return re.sub(r'(?<=[\S])(\s\s)(?=[\S])', ' ', text)
    except Exception:
        raise Exception(f'Archive.org download failed for url: {url}')
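# Minimal usage sketch for get_internet_archive_document above; the details URL is only
# an example and assumes the item has a plain-text version available.
text = get_internet_archive_document("https://archive.org/details/nasa")
print(text[:200])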
import internetarchive as ia
from internetarchive import search_items

# example url https://archive.org/details/ucberkeley-webcast-PL3E89002AA9B9879E
# id would be ucberkeley-webcast-PL3E89002AA9B9879E
collection_id = "ucberkeley-webcast-PL3E89002AA9B9879E"

file_id_arr = search_items(f"collection:{collection_id}")
for file_id in file_id_arr:
    # dry run only gets url of items in a collection, set to false to actually download
    ia.download(file_id["identifier"], verbose=True, dry_run=True, glob_pattern="*mp4")
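# For reference, a sketch of the non-dry-run variant of the loop above; destdir is an
# assumption added here for illustration, everything else mirrors the snippet.
for file_id in search_items(f"collection:{collection_id}"):
    ia.download(file_id["identifier"], verbose=True, dry_run=False,
                glob_pattern="*mp4", destdir="ucberkeley_webcast")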
configure()  # interactive login; for automated scripting use configure('*****@*****.**', 'password')
s = ArchiveSession()

# change this to download only selected filetypes, e.g.: pattern='*mobi' will download only Kindle formatted e-books
pattern = None

# fill this in -- searches for the ID of a collection in IA
# example of collection page: https://archive.org/details/johnjaycollegeofcriminaljustice
# the collection ID for that page is johnjaycollegeofcriminaljustice
# you can tell a page is a collection if it has a 'Spotlight Item' on the left
coll = ia.Search(s, 'collection:xxxxxxxx')

num = 0
for result in coll:  # for all items in a collection
    num = num + 1  # item count
    itemid = result['identifier']
    print 'Downloading: #' + str(num) + '\t' + itemid
    try:
        download(itemid, ignore_existing=True, glob_pattern=pattern)
        print '\t\t Download success.'
    except Exception, e:
        print "Error Occurred downloading {} = {}".format(itemid, e)
        print 'Pausing for 40 minutes'
        # IA restricts the number of things you can download. Be nice to
        # their servers -- limit how much you download, too. For me, this
        # time restriction is still not polite enough, and my connection gets
        # cut off all the dang time.
        time.sleep(2400)
@author: descentis
"""
from internetarchive import download
from subprocess import call

name = "3dprinting"
name = name.lower()
stack_exchange_list = []
name_list = []

# build a mapping from site name to dump file name listed in site_list.txt
with open("site_list.txt", "r") as f:
    for line in f:
        line = line.rstrip('\n')
        if not line:
            continue
        stack_exchange_list.append(line)
        name_list.append(line.split('.')[0])

site_name = {}
for i in range(len(name_list)):
    site_name[name_list[i]] = stack_exchange_list[i]

download('stackexchange', verbose=True, glob_pattern=site_name[name])
call(["7z", "x", site_name[name]])
def download_item(self, metadata):
    dest_dir = self.get_file_directory()
    d = download(metadata.get('identifier'),
                 files=[metadata.get('file_name')],
                 destdir=dest_dir,
                 silent=True,
                 no_directory=True,
                 retries=3,
                 ignore_existing=True,
                 ignore_errors=True)
    return d
from internetarchive.session import ArchiveSession
from internetarchive import get_item
from internetarchive import download

ident = 'podcasts'
destifolder = 'iapodcasts'

search = ia.search_items('collection:%s' % ident)
current = [f for f in os.listdir(destifolder)]
num = 0
for result in search:  # for all items in a collection
    num = num + 1  # item count
    itemid = result['identifier']
    print('Downloading: #' + str(num) + '\t' + itemid)
    if itemid not in current:
        try:
            download(itemid, destdir=destifolder, retries=5,
                     glob_pattern=['*.ogg', '*.mp3', '*.wav', '*.flv'])
            print('\t\t Download success.')
        except Exception as e:
            print("Error Occurred downloading {} = {}".format(itemid, e))
            print('Pausing for 20 minutes')
            # time.sleep(1200)
    # time.sleep(0.5)
    if num == 5000:
        break
def main(item: str, **kwargs):
    # determine if application is a script file or frozen exe
    application_path = ""
    xml_file_name = kwargs.get('xml_filename', None)
    xml_file_name_scan_data = kwargs.get('xml_filename_scandata', None)
    json_file_name = kwargs.get('json_filename', None)
    ia_path = kwargs.get('ia_path', None)

    if item is None and ia_path is None:
        print("Error: Unrecognized Arguments")
        return

    if item is not None and \
            (xml_file_name is not None or xml_file_name_scan_data is not None
             or json_file_name is not None or ia_path is not None):
        print("Error: \"-item\" parameter should not be mixed with other parameters")
        return

    if item is not None:
        if getattr(sys, 'frozen', False):
            application_path = os.path.dirname(sys.executable)
        elif __file__:
            application_path = os.path.dirname(__file__)

        ia_path = os.path.join(application_path, "iaitems")
        if not os.path.exists(ia_path):
            os.mkdir(ia_path)

        xml_file_name = os.path.join(application_path, "iaitems", item, "".join([item, "_djvu", ".xml"]))
        xml_file_name_scan_data = os.path.join(application_path, "iaitems", item, "".join([item, "_scandata", ".xml"]))
        json_file_name = os.path.join(application_path, "iaitems", item, "".join([item, "_pages", ".json"]))
    else:
        xml_file_name = kwargs.get('xml_filename', None)
        xml_file_name_scan_data = kwargs.get('xml_filename_scandata', None)
        json_file_name = kwargs.get('json_filename', None)

        val_error = []
        if ia_path is None:
            val_error.append("ia_path is not provided")
        if xml_file_name is None:
            val_error.append("xml_filename is not provided")
        if json_file_name is None:
            val_error.append("json_filename is not provided")
        if ','.join(val_error) != "":
            print("Error: " + '\r\n'.join(val_error))
            return

        if not os.path.isdir(ia_path):
            print("Error: ia_path \"" + ia_path + "\" does not exist")
            return

        item = xml_file_name.lower().replace("_djvu.xml", "")
        # xml_file_name = os.path.join(ia_path, item, xml_file_name)
        xml_file_name = os.path.join(ia_path, xml_file_name)
        json_file_name = os.path.join(ia_path, json_file_name)
        # json_file_name = os.path.join(ia_path, item, json_file_name)

        if not os.path.isfile(xml_file_name):
            print("Error: xml_filename \"" + xml_file_name + "\" does not exist")
            return

        if xml_file_name_scan_data is not None and xml_file_name_scan_data != "":
            # item = xml_file_name.lower().replace("_scandata.xml", "")
            xml_file_name_scan_data = os.path.join(ia_path, xml_file_name_scan_data)
            # xml_filename_scan_data = os.path.join(ia_path, item, xml_filename_scan_data)
            if not os.path.isfile(xml_file_name_scan_data):
                print("Error: xml_filename_scandata \"" + xml_file_name_scan_data + "\" does not exist")
                return

    if not os.path.isfile(xml_file_name):
        from internetarchive import download
        print("Downloading " + item + "_djvu.xml from internet archive website...")
        download(item, verbose=True, destdir=ia_path, glob_pattern='*_djvu.xml')
        print("Downloading " + item + "_scandata.xml from internet archive website...")
        try:
            download(item, verbose=True, destdir=ia_path, glob_pattern='*_scandata.xml')
        except NameError:
            pass

    # Do auto printed page generation
    if os.path.isfile(xml_file_name):
        if os.path.exists(json_file_name):
            os.remove(json_file_name)
        print("Generating printed pages...")
        bk = Book()
        bk.load_xml(xml_file_name)
        if not bk.has_valid_leaf_no:
            print("djvu error: unable to extract leaf number.")
            return
        scan_data = ScanData("")
        # if xml_file_name_scan_data is not None:
        #     if os.path.isfile(xml_file_name_scan_data):
        #         scan_data = ScanData(xml_file_name_scan_data)
        bk.generate_json(item, json_file_name, scan_data=scan_data)
    else:
        print("Error: File not found [" + xml_file_name + "]!")
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 26 13:13:57 2019

@author: singh
"""
"""
from selenium import webdriver

url = 'http://wayback.archive.org/web/20131018144323/http://www.infosonics.com:80/'
path = 'D:\\Summer_RA\\Code\\scrape.png'

options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument('window-size=1200x600')

driver = webdriver.Chrome(executable_path = 'D:\\Summer_RA\\Code\\chromedriver.exe', chrome_options = options)
driver.get(url)
el = driver.find_element_by_tag_name('body')
el.screenshot(path)
driver.quit()
"""

from internetarchive import download

download('gatewaycasinosincomefund.com', verbose=True)
    total_size += item['size']

print("There are %s Internet Archive items to download" % len(item_ids))
print("The total size will be %0.2f GB" % (total_size / 1024 / 1024 / 1024.0))
print("And here they are")
for item_id in item_ids:
    print(item_id)

# Now let's download them.

# In[ ]:

count = 0
for item_id in item_ids:
    count += 1
    print('[%s/%s] downloading %s' % (count, len(item_ids), item_id))
    ia.download(item_id, glob_pattern=["*arc.gz", "*cdx.gz"], destdir="data", ignore_existing=True)

# The reality is that it can take weeks (or months) to sample and download, so you probably
# want to export this notebook as a .py file and run it on a reliable server in a screen or
# tmux session:
#
# ```
# % jupyter nbconvert --to script Sample.ipynb
# % python Sample.py
# ```

# In[ ]:
## downloads all items in a given Internet Archive collection
## !! will probably crash after 10 or so items !! feel free to edit the script to make it better for bigger collections
## See http://programminghistorian.org/lessons/data-mining-the-internet-archive for more detailed info

import os
import time
import sys

import internetarchive as ia
from internetarchive.session import ArchiveSession
from internetarchive import get_item
from internetarchive import download

search = ia.search_items('collection:%s' % sys.argv[1])
num = 0
for result in search:  # for all items in a collection
    num = num + 1  # item count
    itemid = result['identifier']
    print 'Downloading: #' + str(num) + '\t' + itemid
    try:
        download(itemid)
        print '\t\t Download success.'
    except Exception, e:
        print "Error Occurred downloading {} = {}".format(itemid, e)
        print 'Pausing for 40 minutes'
        # IA restricts the number of things you can download. Be nice to
        # their servers -- limit how much you download, too. For me, this
        # time restriction is still not polite enough, and my connection gets
        # cut off all the dang time.
        time.sleep(2400)
#-----------------------------------------------------------------------------------------------------------------------
# READ CSV FILE INTO LIST
f = open(r"H:\14000.csv")  # PATH TO CSV FILE
x = f.readlines()
f.close()

with open(r"H:\14000.csv") as f:
    x = f.read().splitlines()

#-----------------------------------------------------------------------------------------------------------------------
# DOWNLOAD ALL 14000 GRATEFUL DEAD SHOWS AT ONCE
for a in x:
    download(a, verbose=True, glob_pattern='*.mp3',
             destdir=r"C:\Users\username\Desktop\gd")  # LOCAL DIRECTORY TO SAVE FILES

#-----------------------------------------------------------------------------------------------------------------------
# END TIME OF JOB
end = t.time()
print('time to complete: ' + str((end - start) / 60) + ' minutes')
#-----------------------------------------------------------------------------------------------------------------------
def parse_internet_archive(collection='', channel=''):
    if collection != '':
        movies = search_items('collection:%s' % collection)
    else:
        print("A collection name is required for importing Internet Archive content.\n")
        print " use -h for help with more command line arguments"
        sys.exit()

    for item in movies.iter_as_items():
        print("--------------------------\nDownloading: " + item.identifier)
        # note - currently this will download all movie formats that match mpeg4
        download(item.identifier, verbose=True, destdir="downloads",
                 formats=['512Kb MPEG4', 'MPEG4'])

        # metadata
        meta = untangle.parse("downloads/" + item.identifier + "/" + item.identifier + "_meta.xml")
        try:
            title = meta.metadata.title.cdata
        except AttributeError:
            try:
                title = meta.metadata.title[0].cdata  # there are duplicate entries in the xml, take the 1st
            except AttributeError:
                print "Skipping import - Unable to find a title for : " + item.identifier
                return
        try:
            description = meta.metadata.description.cdata
        except AttributeError:
            try:
                description = meta.metadata.description[0].cdata  # there are duplicate entries in the xml, take the 1st
            except AttributeError:
                print "Skipping import - Unable to find a description for : " + item.identifier
                return
        try:
            author = meta.metadata.director.cdata
        except AttributeError:
            try:
                author = meta.metadata.publisher.cdata
            except AttributeError:
                author = ""
        try:
            language = meta.item_metadata.language.cdata
            if language == 'english':
                language = 'en'
        except AttributeError:
            language = 'en'
        try:
            license = meta.metadata.licenseurl.cdata
        except AttributeError:
            license = 'public'

        if _debug:
            print '=' * 70
            print "Metadata found is:"
            print "Title is = " + title
            print "Description = " + description
            print "Author = " + author
            print "Language is = " + language
            print "License url is = " + license
            print '=' * 70

        # now add the file to the blockchain
        path = os.path.dirname(os.path.abspath(__file__)) + "/downloads/" + item.identifier + "/"

        # Process the downloaded files to find the movie to upload
        try:
            onlyfiles = [f for f in listdir(path) if isfile(join(path, f))]
            parser = csv.reader(onlyfiles)
        except OSError:
            print "No file exists here that was downloaded"

        mp4 = ".mp4"
        ogv = ".ogv"
        for fields in parser:
            for i, f in enumerate(fields):
                if f.find(mp4) > 0:
                    publish(channel, f, path + f, title, description, author, language, license)
                elif f.find(ogv) > 0:
                    publish(channel, f, path + f, title, description, author, language, license)
                else:
                    file_not_supported(f)