def extract_form_opml(request):
    BASE_DIR = os.path.dirname(os.path.dirname(__file__))
    print(BASE_DIR)
    bbc = listparser.parse(BASE_DIR + "/static/feeds.opml")
    sina = listparser.parse(BASE_DIR + "/static/sina_all_opml.xml")
    for feed in bbc.feeds:
        cat = RSSCategory.objects.filter(name=feed.title).first()
        if not cat:
            cat = RSSCategory(name=feed.title, publisher=bbc.meta.title)
            cat.save()
        if not RSSSourceList.objects.filter(url=feed.url):
            source = RSSSourceList(url=feed.url, category=cat, last_update=timezone.now())
            source.save()
    for feed in sina.feeds:
        cat = RSSCategory.objects.filter(name=feed.title).first()
        if not cat:
            # use the Sina OPML's own title as publisher
            cat = RSSCategory(name=feed.title, publisher=sina.meta.title)
            cat.save()
        if not RSSSourceList.objects.filter(url=feed.url):
            source = RSSSourceList(url=feed.url, category=cat, last_update=timezone.now())
            source.save()
    return JsonResponse({'status': True})
def import_opml(path, create_feed=True, create_entries=False):
    res = listparser.parse(path)
    ret = []
    for feed in res.feeds:
        flat_cats = []
        for cat_list in feed.categories:
            for cat in cat_list:
                flat_cats.append(cat)  # nested-nested categories? ew.
        # feeds can only have one category currently, use the last one found
        cat = flat_cats[-1]
        logger.debug("all known categories for this feed: %s" % flat_cats)
        logger.info("fetching or creating FeedGroup %r" % cat)
        feed_group, created = models.FeedGroup.objects.get_or_create(
            slug=slugify(cat))
        if created:
            logger.info("new feed group created %r" % feed_group)
        else:
            logger.info("found feed group %r" % feed_group)
        # pull the feed down
        success, data = logic.pull_feed(feed.url, create_feed, create_entries)
        ret.append((success, feed.url))
        if success:
            logger.info("successfully pulled feed, associating with group")
            # attach the feed group
            feed = data['feed']
            feed.group = feed_group
            feed.save()
        else:
            logger.warning("failed to pull feed, error was: %s", data)
    return ret
def run(self):
    f = StringIO(self.opml_data.encode('utf-8'))
    opml_obj = listparser.parse(f)
    for feed in opml_obj.feeds:
        if not feed.tags:
            self.add_uncategorized_feed(feed)
        for category in feed.tags:
            category_entry = self.get_or_set_category(category)
            try:
                feed_entry = self.get_or_set_feed(feed)
            except Exception:
                self.failed_feeds += 1
                continue
            if self.is_feed_in_category(feed_entry, category_entry):
                continue
            category_entry.feeds.append(feed_entry)
    Feed.uncategorize_feeds()
    db.session.commit()
    ret = self.get_return_status()
    return ret
def parse(self, response):
    d = listparser.parse(response.body)
    feeds = d.feeds
    for feed in feeds:
        item = PodsearchbotItem()
        item['link'] = feed.url
        yield item
def import_feeds(self, source):
    """Tries to parse and import an opml file exported from another RSS reader.

    Will try to keep names and categories.

    Args:
        source (string): Path of the opml file.
    """
    result = listparser.parse(source)
    name = result.meta.title
    size = len(result.feeds)
    self.output.write_info(
        f"Do you want to import {size} feeds from {name}? [y]es/[n]o/[v]iew"
    )
    answer = input()
    if answer.lower() == "v" or answer.lower() == "view":
        for i in result.feeds:
            print(f"{i.title} : {i.url}")
    elif answer.lower() == "y" or answer.lower() == "yes":
        try:
            for i in result.feeds:
                if self.verbose:
                    print(f"Trying to add {i.title}")
                if len(i.categories) > 0:
                    if self.verbose:
                        print("Grabbing categories")
                    categories = i.categories[0]
                else:
                    categories = []
                self.add_feed(i.title, i.url, categories)
        except Exception as e:
            self.output.write_error(
                f"Something went wrong when importing {i}!: {e}")
        else:
            self.output.write_ok("Feeds imported successfully.")
def OnImport(self, e):
    opml_result = listparser.parse(self.opml_file)
    for f in opml_result.feeds:
        print(f)
        print("Importing {} -> {}".format(f.title, f.url))
        db.feed.subscribe_feed(f.title, f.url, f.tags)
    self.Destroy()
def mass_Import_WX_ID_from_opml(self, opemlFile_or_Content_or_URL):
    '''
    listparser.parse(obj[, agent, etag, modified])

    Parameters:
        obj (file or string) – a file-like object or a string containing a URL,
            an absolute or relative filename, or an XML document
        agent (string) – User-Agent header to be sent when requesting a URL
        etag (string) – ETag header to be sent when requesting a URL
        modified (string or datetime) – Last-Modified header to be sent when requesting a URL
    '''
    opml = listparser.parse(opemlFile_or_Content_or_URL)
    for feed in opml.feeds:
        try:
            wx_id = re.findall(r"weixin\?id=(\S+)$", feed.url)[0]
        except IndexError:
            print("---- WX_ID Paste Error! %s" % feed.url)
            continue
        if not self.is_WX_ID_Exists(wx_id):
            WX_ID = Node("WX_ID")
            info = {
                "wx_id": wx_id,
                "name": feed.title,
                "group": feed.categories[0][0],
            }
            WX_ID.update(info)
            self.neo4j.create(WX_ID)
            print("++++ WX_ID Simple stored:\t%s" % wx_id)
    return True
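The docstring above quotes the parameter list from listparser's own documentation. As a quick reference alongside these project snippets, here is a minimal sketch of the three input styles that signature describes, assuming an older listparser release that still fetches URLs itself; the file name and URL are placeholders, not real resources.

import listparser

# 1. A filename (relative or absolute); placeholder path.
result = listparser.parse("subscriptions.opml")

# 2. A file-like object that yields the OPML document.
with open("subscriptions.opml") as fh:
    result = listparser.parse(fh)

# 3. A URL; older listparser releases download it themselves.
result = listparser.parse("https://example.com/subscriptions.opml")

for feed in result.feeds:
    print(feed.title, feed.url)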
def handle(self, *args, **options):
    if not len(args) == 2:
        self.stdout.write('args must be <username> <opml url>')
        return
    try:
        d = lp.parse(args[1])
        user = User.objects.get(username=args[0])
    except Exception as e:
        print(e.encode('utf8'))
        return
    for feed in d.feeds:
        # change categories to tags
        categories = feed['categories'][0]
        tags = []
        for category in categories:
            if category == 'My Feeds':
                continue
            tag, tag_created = Tag.objects.get_or_create(text=category, user=user)
            tags.append(tag)
        new_feed, created = Feed.objects.get_or_create(
            link=feed['url'],
            user=user
        )
        if created:
            new_feed.title = feed['title']
            new_feed.tags.add(*tags)
            new_feed.save()
        else:
            print('Already exists %d ' % new_feed.id)
            print(new_feed.title.encode('utf8'))
async def opml_import(event: Union[events.NewMessage.Event, Message],
                      *_,
                      lang: Optional[str] = None,
                      **__):
    reply_message: Message = await event.get_reply_message()
    if not (event.is_private or event.is_channel and not event.is_group) \
            and reply_message.sender_id != env.bot_id:
        return  # must reply to the bot in a group to import opml
    try:
        opml_file = await event.download_media(file=bytes)
    except Exception as e:
        await event.reply('ERROR: ' + i18n[lang]['fetch_file_failed'])
        logger.warning(f'Failed to get opml file from {event.chat_id}: ', exc_info=e)
        return
    reply: Message = await event.reply(i18n[lang]['processing'] + '\n'
                                       + i18n[lang]['opml_import_processing'])
    logger.info(f'Got an opml file from {event.chat_id}')
    opml_d = listparser.parse(opml_file.decode())
    if not opml_d.feeds:
        await reply.edit('ERROR: ' + i18n[lang]['opml_parse_error'])
        return
    import_result = await inner.sub.subs(event.chat_id,
                                         tuple(feed.url for feed in opml_d.feeds),
                                         lang=lang)
    logger.info(f'Imported feed(s) for {event.chat_id}')
    await reply.edit(import_result["msg"], parse_mode='html')
def import_opml(session, opml):
    feedlist = listparser.parse(opml)
    for f in feedlist.feeds:
        # skip entries without URLs
        if not hasattr(f, 'url'):
            continue
        # run a HEAD request against url to find out final URL, in case of any
        # redirects
        f.url = find_canonical_url(f.url)
        feed = session.query(model.Feed).filter_by(url=f.url).first()
        if feed:
            # feed url already present in database
            continue
        logger.debug(
            "Importing feed '{title}', URL '{url}', categories: {categories}".format(
                title=f.title,
                url=f.url,
                categories=f.categories,
            )
        )
        feed = model.Feed(title=f.title, url=f.url, has_subscribers=True)
        session.add(feed)
        session.commit()
def _process_url(self):
    url = str(self.url)
    result = listparser.parse(url)
    if result["bozo"] == 1:
        return False
    self._process_result(result)
    return True
def fn(self):
    doc = listparser._to_bytes("""<?xml version="1.0"?><rdf:RDF
        xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
        xmlns:foaf="http://xmlns.com/foaf/0.1/"
        xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
        xmlns:rss="http://purl.org/rss/1.0/">
        <foaf:Agent><foaf:name>á</foaf:name><foaf:weblog>
        <foaf:Document rdf:about="http://domain/"><rdfs:seeAlso>
        <rss:channel rdf:about="http://domain/feed" />
        </rdfs:seeAlso></foaf:Document></foaf:weblog></foaf:Agent>
        </rdf:RDF>""")
    idoc = listparser.Injector(listparser.BytesStrIO(doc))
    tmp = []
    while 1:
        i = idoc.read(size)
        if i:
            tmp.append(i)
        else:
            idoc.close()
            break
    xml = _to_unicode(listparser._to_bytes('').join(tmp))
    result = listparser.parse(xml)
    self.assertFalse(result.bozo)
    self.assertEqual(len(result.feeds), 1)
    self.assertEqual(ord(result.feeds[0].title), 225)  # \u00e1
def _parse_opml(text):
    result = {}
    result['items'] = items = []
    raw = listparser.parse(io.StringIO(text))
    bozo_exception = raw.get('bozo_exception')
    if bozo_exception:
        LOG.warning(f'Parse OPML {bozo_exception}')
    result['title'] = (raw['meta'] or {}).get('title')
    for feed in (raw['feeds'] or []):
        url = feed.get('url')
        title = feed.get('title')
        # ignore title if it's a url. eg: rssant before v1.8 exported the text(title) field as the feed link
        if title and RE_URL.match(title):
            title = None
        # eg: {'url': '...', 'title': '...', 'categories': [['设计']], 'tags': ['设计']}
        categories = feed.get('categories')
        group = categories[0] if categories else None
        if group and isinstance(group, list):
            group = group[0]
        group = str(group) if group is not None else None
        if not url:
            continue
        url = _normalize_url(url)
        items.append(dict(
            title=title,
            group=group,
            url=url,
        ))
    total = len(result['items'])
    if total > IMPORT_ITEMS_LIMIT:
        LOG.warning(f'import {total} OPML feeds exceed limit {IMPORT_ITEMS_LIMIT}, will discard!')
        result['items'] = result['items'][:IMPORT_ITEMS_LIMIT]
    result = validate_opml(result)
    result['items'] = [x for x in result['items'] if x['url']]
    return result
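Several of the snippets in this list (this one, the Ansible filter, and the river importer further down) flatten `feed.categories`, which listparser exposes as a list of category paths, each path itself a list of strings. A minimal standalone sketch of that shape, assuming the usual OPML outline-nesting behavior and using a made-up inline document:

import listparser

# Made-up OPML document for illustration only.
OPML = """<?xml version="1.0"?>
<opml version="2.0">
  <head><title>example</title></head>
  <body>
    <outline text="Tech">
      <outline text="Example feed" type="rss"
               xmlUrl="https://example.com/feed.xml"/>
    </outline>
  </body>
</opml>"""

result = listparser.parse(OPML)
for feed in result.feeds:
    # feed.categories is a list of paths, e.g. [['Tech']];
    # flattening it yields plain category names.
    flat = [name for path in feed.categories for name in path]
    print(feed.url, flat)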
def get_feeds(feeds_file, max_age, max_feeds):
    opml = lp.parse(feeds_file)
    feeds = opml.feeds
    feeds = feeds[:max_feeds]
    md = Markdown()
    filename = "rssdigest.html"
    with open(filename, "w") as text_file:
        text_file.write(md.convert("# Daily RSS Digest \n----"))
    digeststring = "# Daily RSS Digest \n----\n\n"
    number_of_feeds = len(feeds)
    for index, feed in enumerate(feeds):
        feed = feedparser.parse(feed.url)
        feedstring = ""
        addfeed = False
        print("[" + str(index) + "/" + str(number_of_feeds) + "]")
        if 'title' in feed.feed:
            feedstring += "## " + feed.feed.title + "\n"
        for entry in feed.entries:
            localtime = time.localtime()
            try:
                publishedtime = entry.published_parsed
                # age in days
                age = (time.mktime(localtime) - time.mktime(publishedtime)) / 60 / 60 / 24
                if age < max_age:
                    feedstring += "## [" + entry.title + "](" + entry.link + ")\n\n"
                    if 'description' in entry:
                        if len(entry.description) < 500:
                            feedstring += entry.description + "\n\n"
                    addfeed = True
            except:
                pass
        if not addfeed:
            print(feedstring + "No new posts\n")
        feedstring += "----\n"
        if addfeed:
            print(feedstring)
            # Append to string
            digeststring += feedstring
            # Append to file
            with open(filename, "a") as text_file:
                feedhtml = md.convert(feedstring)
                text_file.write(feedhtml)
    digesthtml = md.convert(digeststring)
    # print("Final: " + digesthtml)
    return digesthtml
def worker(self, evals, testfile, etag, modified):
    if 'http' in testfile:
        testfile = 'http://localhost:8091/tests/' + testfile
    else:
        testfile = join('tests', testfile)
    result = listparser.parse(testfile, etag=etag, modified=modified)
    for ev in evals:
        self.assert_(eval(ev))
def parse_opml(file):
    opml_feeds = [{
        "title": feed["title"],
        "url": feed["url"],
        "entries": [],
        "last_updated": None
    } for feed in listparser.parse(file)["feeds"]]
    return opml_feeds
def fill_feed_info(self, opml_file):
    """ Import the file """
    parsed = listparser.parse(opml_file)
    for feed in parsed.feeds:
        print("Adding %s" % feed.url)
        Feed.objects.get_or_create(feed_url=feed.url)
def parse(file_name):
    result = listparser.parse(file_name)
    rst = {}
    for feed in result.feeds:
        for tag in feed['tags']:
            i = [feed['title'], feed['url']]
            rst.setdefault(tag, []).append(i)
    return rst
def testUserAgentGlobalOverride(self):
    url = 'http://localhost:8091/tests/http/useragent.xml'
    tmp = listparser.USER_AGENT
    listparser.USER_AGENT = "NewGlobalAgent"
    result = listparser.parse(url)
    listparser.USER_AGENT = tmp
    self.assertFalse(result.bozo)
    self.assert_(result.headers.get('x-agent') == "NewGlobalAgent")
def __init__(self, opml_file):
    print("reading opml file...")
    self.info = listparser.parse(opml_file)
    self.feeds = self.info.feeds
    print(self.info.meta.title)
    print(self.info.meta.created)
def clean_xml_file(self):
    xml_file = self.cleaned_data['xml_file']
    result = listparser.parse(xml_file)
    if result['bozo'] == 1:
        raise ValidationError(result['bozo_exception'])
    self.result = result
    return xml_file
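As in this form-validation example, listparser follows feedparser's convention: instead of raising on malformed input, parse() sets bozo to 1 and stores the underlying error in bozo_exception. A minimal sketch of that check on its own, with a deliberately broken string made up for illustration:

import listparser

broken_opml = "<opml><body><outline"  # intentionally malformed

result = listparser.parse(broken_opml)
if result.bozo:
    print("could not parse OPML:", result.bozo_exception)
else:
    print("parsed", len(result.feeds), "feeds")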
def import_opml(self, opml_file):
    """Import feeds from an opml file.

    Parameters
    ----------
    opml_file : string
        The relative path to the opml file that we want to import.
    """
    print("importing " + opml_file)
    new_feeds = listparser.parse(opml_file)
    for feed in new_feeds.feeds:
        print(feed.url)
        self.add_podcast(feed.url)
def read_rss(self, url):
    utils.log('Read File: %s' % url, xbmc.LOGINFO)
    if url not in rss_cache:
        utils.log('File not in cache, requesting...', xbmc.LOGINFO)
        xml = httpget(url)
        progs = listparser.parse(xml, self.format)
        if not progs:
            return []
        d = []
        for entry in progs.entries:
            p = programme_simple(entry.id, entry)
            d.append(p)
        utils.log('Found %d entries' % len(d), xbmc.LOGINFO)
        rss_cache[url] = d
    else:
        utils.log('File found in cache', xbmc.LOGINFO)
    return rss_cache[url]
def handle(self, *args, **options):
    opml_file = open(args[0])
    opml = listparser.parse(opml_file)
    for feed in opml.feeds:
        print("%s: %s" % (feed.title, feed.url))
        feed_object = Feed.objects.create(name=feed.title, feed_url=feed.url)
        feed_object.save()
        for tag in feed.tags:
            # .get_or_create() with a name that begins with a number
            # (eg. '0-premium') causes .add() to break: "TypeError: int()
            # argument must be a string or a number, not 'Label'" so
            # we fetch the label again. Le sigh.
            label = Label.objects.get_or_create(name=tag)
            label = Label.objects.get(name=tag)
            feed_object.labels.add(label)
def upload(request):
    # TODO sort out multi (implement file drop?)
    params = []
    if request.method == 'GET':
        print("Importing OPML file")
        filename = request.GET.get('filename') or './opml2.xml'
        d = listparser.parse(filename)  # request.POST['filename']
        for f in d.feeds:
            print(f.title)
            feed = Feed(title=f.title)
            feed.url = f.url
            feed.user = request.user
            feed.save()
        params = {'Messages': ['Your import might have been a success!',]}
    return response(request, 'mainapp/index.html', params)
def start_parsing(self, dest_path, csv_checked, csv_first_line_header, plain_checked):
    try:
        td = self.unzip(self.path_dict[self.FIND_ZIP_DICT_KEY])
        files = self.search_path_files(td)
        flag_subscriptions_xml_found = False
        destination_filename = self.dest_filename + '_subscriptions.csv'
        for f in files:
            fileName, fileExtension = os.path.splitext(f)
            if fileName.find('subscriptions') >= 0 and fileExtension == '.xml':
                flag_subscriptions_xml_found = True
                if csv_checked.get() == 1:
                    flag_header_written = False
                    with open(os.path.join(dest_path.get(), destination_filename), 'wb') as csvfile:
                        csv_writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
                        parsedxml = listparser.parse(f)
                        keys = parsedxml.feeds[0].keys()
                        if csv_first_line_header.get() == 1 and not flag_header_written:
                            csv_writer.writerow(keys)
                            flag_header_written = True
                        for e in parsedxml.feeds:
                            row = []
                            for k in keys:
                                if type(e[k]) is list:
                                    row.append(self.list_to_string(e[k]))
                                else:
                                    row.append(e[k])
                            csv_writer.writerow(row)
                    self.gui_logger("File written: " + destination_filename)
        if flag_subscriptions_xml_found is False:
            self.gui_logger("Error: subscriptions.xml not found in zip file.")
        self.gui_logger("Done!")
    except BadZipfile:
        self.gui_logger("Error: File is not a zip file.")
def read_rss(self, url):
    #logging.info('Read RSS: %s', url)
    if url not in rss_cache:
        #logging.info('Feed URL not in cache, requesting...')
        xml = httpget(url)
        progs = listparser.parse(xml)
        if not progs:
            return []
        d = []
        for entry in progs.entries:
            pid = parse_entry_id(entry.id)
            p = programme(pid)
            d.append(p)
        #logging.info('Found %d entries', len(d))
        rss_cache[url] = d
    #else:
    #    logging.info('RSS found in cache')
    return rss_cache[url]
def read_rss(self, url):
    utils.log('Read RSS: %s' % url, xbmc.LOGINFO)
    if url not in rss_cache:
        utils.log('Feed URL not in cache, requesting...', xbmc.LOGINFO)
        xml = httpget(url)
        # utils.log("Received xml: %s" % xml, xbmc.LOGDEBUG)
        progs = listparser.parse(xml)
        if not progs:
            return []
        d = []
        for entry in progs.entries:
            pid = parse_entry_id(entry.id)
            p = programme_simple(pid, entry)
            d.append(p)
        utils.log('Found %d entries' % len(d), xbmc.LOGINFO)
        rss_cache[url] = d
    else:
        utils.log('RSS found in cache', xbmc.LOGINFO)
    return rss_cache[url]
def upload_opml_file(request):
    UploadFile = request.FILES['file']
    result = listparser.parse(UploadFile.read())
    title = result.meta.title
    if title != '':
        count = 0
        for i in result.feeds:
            c1 = Category(uid=1, name=i.tags)
            c2 = Category.objects.filter(name=i.tags)
            if c2.count() == 0:
                c1.save()
            else:
                c1.cid = c2[0].cid
            r1 = Rss(uid=1, cid=c1.cid, sitename=i.title, xmlurl=i.url,
                     htmlurl='', updatetime=datetime.datetime.now())
            r1.save()
            count += 1
    return render_to_response('resultopml.html', locals())
def __read_rss(cls, url):
    #logging.info ('Read RSS: %s', url)
    if url not in RSS_CACHE:
        #logging.info ('Feed URL not in cache, requesting...')
        xml = httpget(url)
        progs = listparser.parse(xml)
        if not progs:
            return []
        cached_programmes = []
        for entry in progs.entries:
            pid = parse_entry_id(entry.identifier)
            programme = Programme(pid)
            cached_programmes.append(programme)
        #logging.info ('Found %d entries', len (d))
        RSS_CACHE[url] = cached_programmes
    #else:
    #    logging.info ('RSS found in cache')
    return RSS_CACHE[url]
def parser_opml(handler):
    result = listparser.parse(handler)
    logger.debug(result)
    source = dict()
    for feed in result.feeds:
        group = feed.categories
        for g in group:
            if g[0].lower() == "must read":
                continue
            else:
                group = g[0]
        # print(group)
        if group not in source.keys():
            source[group] = list()
        source[group].append(dict(title=feed.title, url=feed.url))
    return source
def read_opml(path):
    try:
        import listparser
    except Exception:
        raise errors.AnsibleFilterError('the "opml" filter requires the \
"listparser" python module, install with `pip install \
listparser`')
    try:
        result = listparser.parse(path)
    except Exception as e:
        raise errors.AnsibleFilterError('error while parsing opml file: "%s"' % str(e))
    feeds = result['feeds']
    for index, feed in enumerate(feeds):
        feeds[index]['folder'] = [item for sublist in feed.pop('categories')
                                  for item in sublist]
    return feeds
def import_opml(self, feed_url):
    feed = self.parse_feed(feed_url)
    success = []
    errors = []
    if 'opml' in feed['feed']:
        opml = listparser.parse(feed_url)
        for item in opml['feeds']:
            try:
                feed = self.handle(item['url'])
                success.append(feed)
            except (exceptions.FeedCriticalError, exceptions.TimeoutError) as exc:
                errors.append((feed_url, exc))
    else:
        try:
            feed = self.handle(feed_url)
            success.append(feed)
        except (exceptions.FeedCriticalError, exceptions.TimeoutError) as exc:
            errors.append((feed_url, exc))
    return success, errors
def import_opml(subscriptions, opml):
    """Import a list of subscriptions from an OPML file."""
    subscribed_feeds = []
    imported_feeds = listparser.parse(opml)

    # Load the list of currently subscribed feeds
    with open(subscriptions, 'r') as f:
        for line in f:
            feed = line.strip()
            if feed.startswith("#") or len(feed) == 0:
                continue
            subscribed_feeds.append(feed)

    # Import any feeds we're not already subscribed to
    with open(subscriptions, 'a') as f:
        for feed in imported_feeds.feeds:
            if feed.url not in subscribed_feeds:
                print("Importing " + feed.title + "...")
                subscribed_feeds.append(feed.url)
                f.write(feed.url + "\n")

    sys.exit()
def upload_opml():
    if 'username' in session:
        try:
            urls = []
            f = request.files['file']
            # parse opml file
            outline = lp.parse(f)
            for site in outline.feeds:
                urls.append({"url": site.url, "category": site.categories[0][0]})
            # return a JSON list of feed URLs
            return {"status": "ok", "feeds": urls}
        except Exception as e:
            return {"status": "error", "error": str(e)}
    else:
        abort(401)
def get_dead_feeds(filename, interval):
    fin = open(filename, 'r')
    opml = listparser.parse(fin)
    now = datetime.datetime.now()
    for f in opml.feeds:
        d = feedparser.parse(f.url)
        if 'title' in d.feed:
            if d.entries:
                entry = d.entries[0]
                date = get_date_word(entry)
                if date:
                    time_updated = datetime.datetime.fromtimestamp(time.mktime(entry[date]))
                    if now - time_updated > datetime.timedelta(days=interval):
                        print('MAYBE: The feed "{}" has not been modified in at least {} days. Url tried is {}'.format(f.title, interval, f.url))
                else:
                    print('MAYBE: The feed "{}"\'s most recent item has no information on when it was published. Url tried is {}'.format(f.title, f.url))
            else:
                print('DEAD: The feed "{}" appears to have zero posts. Url tried is {}'.format(f.title, f.url))
        else:
            print('DEAD: The feed "{}" is likely either dead or moved. Url tried is {}'.format(f.title, f.url))
def import_opml_cmd(args):
    import listparser
    l = listparser.parse(args.file)
    forced_rivers = []
    if args.river:
        forced_rivers = args.river.split(',')
    for item in l.feeds:
        rivers = forced_rivers
        if len(rivers) == 0:
            rivers = ['Main']
            if len(item.categories) > 0:
                rivers = [('/'.join(c for c in cats)) for cats in item.categories]
        for river_name in rivers:
            print("Importing feed %s to river %s" % (
                item.url,
                river_name,
            ))
            add_river_and_feed(args.user, river_name, item.url)
def main(): global NUM_VIDEOS global DESTINATION_FOLDER global API_KEY global FORMAT global FILE_FORMAT global SCHEDULING_MODE global SCHEDULING_MODE_VALUE number_of_runs_completed = 0 did_i_just_complete_run = False minutes_to_wait = 0 while True: print("Starting on run number %s" % number_of_runs_completed) logging.info("Starting on run number %s" % number_of_runs_completed) if SCHEDULING_MODE == "TIME_OF_DAY": logging.info("Evaluating time of day run for %s schedule mode" % SCHEDULING_MODE_VALUE) if did_i_just_complete_run: minutes_to_wait = 24 * 60 logging.debug(" Just completed run, need to wait %s minutes" % minutes_to_wait) did_i_just_complete_run = False else: minutes_to_wait = (SCHEDULING_MODE_VALUE - datetime.now().hour) * 60 if minutes_to_wait < 0: minutes_to_wait += 24 * 60 minutes_to_wait -= datetime.now().minute print(" First scheduled run set for %s minutes from now" % minutes_to_wait) elif SCHEDULING_MODE == "RUN_ONCE": logging.info("Evaluating run once schedule mode") if did_i_just_complete_run: logging.info(" Just completed run, ending") break else: logging.info(" Starting run once") elif SCHEDULING_MODE == "DELAY": logging.info("Evaluating delay schedule mode") if did_i_just_complete_run: minutes_to_wait = SCHEDULING_MODE_VALUE logging.info(" Next run in %s minutes" % minutes_to_wait) else: logging.info(" First run, doing it now") else: logging.info("Unknown SCHEDULING_MODE found %s" % SCHEDULING_MODE) #todo this should throw an exception break logging.info("Sleeping for %s minutes..." % minutes_to_wait) time.sleep(minutes_to_wait * 60) data = lp.parse("data/youtubeData.xml") # init for usage outside of this for loop xmltitle = [None] * len(data.feeds) xmlurl = [None] * len(data.feeds) channelIDlist = [None] * len(data.feeds) valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits) for i in range(0, len(data.feeds)): xmltitle[i] = data.feeds[i].title # channel Title xmlurl[i] = data.feeds[ i].url # formatted like 'https://www.youtube.com/feeds/videos.xml?channel_id=CHANNELID' indexofid = xmlurl[i].find("id=") channelIDlist[i] = xmlurl[i][indexofid + 3:] get_icons(xmltitle, channelIDlist) for i in range(0, len(xmltitle)): # for every channel uploader = xmltitle[i] print(uploader) url_data = urlopen(xmlurl[i],) url_data = url_data.read() xml = bs(url_data.decode('utf-8'), 'html.parser') videoList = xml.find_all('entry') # print(xml.find_all('entry')) video_download_count = 0 for v in videoList: # for every video in channel # make sure we only download how many we want if video_download_count < NUM_VIDEOS: skip_download = False video_download_count += 1 title = str(v.title.string) #title = title.decode("utf-8") #temp = title.encode("ascii", errors="ignore").decode('utf-8', 'ignore') title = title.encode("utf-8", errors="ignore").decode('utf-8', 'ignore') escapes = '|'.join([chr(char) for char in range(1, 32)]) title = re.sub(escapes, "", title) # removes all escape characters title = title.replace("-", " ").replace("\\", "").replace("/", "") upload_time = v.published.string.split('T')[1].split('+')[0].replace(':', '-') upload_date = v.published.string.split('T')[0] upload_date = upload_date + "_" + upload_time url = v.link.get('href') id = v.id.string channelID = str(v.find('yt:channelid').contents[0]) # See if we already downloaded this logFile = open(logFileName, 'r') logFileContents = logFile.read() logFile.close() if id in logFileContents: logging.info("Video Already downloaded for id %s" % id) print("Video Already downloaded: " + id) else: filename_format = 
parseFormat(FILE_FORMAT, uploader, upload_date, title, channelID, id.replace("yt:video:", "")) logging.debug("filename_formatted parsed to %s" % filename_format) logging.info("Downloading - " + title + " | " + id) logging.info("Channel - " + str(xmltitle[i]) + " | " + channelID) if os.name == 'nt': # if windows use supplied ffmpeg ydl_opts = { 'outtmpl': 'Download/' + uploader + '/' + filename_format + '.%(ext)s', # need to put channelid in here because what youtube-dl gives may be incorrect #'simulate': 'true', 'writethumbnail': 'true', 'forcetitle': 'true', 'ffmpeg_location': './ffmpeg/bin/', 'format': FORMAT } else: # not sure here ydl_opts = { 'outtmpl': 'Download/' + uploader + '/' + filename_format + '.%(ext)s', 'writethumbnail': 'true', 'forcetitle': 'true', 'format': FORMAT } try: with youtube_dl.YoutubeDL(ydl_opts) as ydl: info_dict = ydl.extract_info(url, download=False) video_id = info_dict.get("id", None) video_title = info_dict.get("title", None) video_date = info_dict.get("upload_date", None) uploader = info_dict.get("uploader", None) is_live = info_dict.get("is_live", None) if 'entries' in info_dict: is_live = info_dict['entries'][0]["is_live"] if not is_live: ydl.download([url]) else: print("Warning! This video is streaming live, it will be skipped") logging.info("Warning! This video is streaming live, it will be skipped") skip_download = True except Exception as e: print("Failed to Download") skip_download = True logging.error(str(e)) logging.error(traceback.format_exc()) logVariables() if not skip_download: subscription_source_dir = 'Download/' + uploader + '/' subscription_destination_dir = os.path.join(DESTINATION_FOLDER, uploader) logging.debug("subscription_source_dir is %s" % subscription_source_dir) logging.debug("subscription_destination_dir is %s" % subscription_destination_dir) #destinationDir = parseFormat(DESTINATION_FORMAT, uploader, upload_date, title, channelID, id) #destinationDir = os.path.join(DESTINATION_FOLDER, destinationDir) if not os.path.exists(DESTINATION_FOLDER + uploader): logging.info("Creating uploader destination directory for %s" % subscription_destination_dir) os.makedirs(subscription_destination_dir) try: logging.info("Now moving content from %s to %s" % (subscription_source_dir, subscription_destination_dir)) for filename in os.listdir(subscription_source_dir): logging.info("Checking file %s" % filename) source_to_get = os.path.join(subscription_source_dir, filename) where_to_place = subscription_destination_dir logging.info("Moving file %s to %s" % (source_to_get, where_to_place)) safecopy(source_to_get, where_to_place) #shutil.move(os.path.join(subscription_source_dir, filename), subscription_destination_dir) shutil.rmtree(subscription_source_dir, ignore_errors=True) # shutil.move(videoName, destination + destVideoName) # shutil.move(thumbName, destination + destThumbName) # everything was successful so log that we downloaded and moved the video logFile = open(logFileName, 'a') logFile.write(id + ' \n') logFile.close() except Exception as e: print("An error occured moving file") logging.error(str(e)) logging.error(traceback.format_exc()) logVariables() print() print() number_of_runs_completed += 1 did_i_just_complete_run = True logging.info("Program main.py ended") logging.info("============================================================")
        print(Fore.YELLOW + Style.BRIGHT + "MOVED TO {}".format(r.url)
              + Fore.RESET + Style.RESET_ALL)
        return None
    return r.text


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: {} OPML...".format(sys.argv[0]))
        sys.exit(1)
    for path in sys.argv[1:]:
        print(Fore.CYAN + Style.BRIGHT + "Checking OPML {}".format(path)
              + Fore.RESET + Style.RESET_ALL)
        result = listparser.parse(path)
        for feed in result.feeds:
            feed_text = get("feed", feed.title, feed.url)
            if feed_text is None:
                continue
            feed = feedparser.parse(feed_text)
            if feed.bozo:
                print(Fore.RED + Style.BRIGHT + "NG (FEED)" + Fore.RESET + Style.RESET_ALL)
                continue
            #pprint(feed)
            latest = -1
            for entry in feed.entries:
def request_ompl(self):
    self.opml = listparser.parse(self.opml_url)
def read_opml(filename):
    print("OPML: Reading OPML file: '%s'." % (filename,))
    result = listparser.parse(filename)
    urls = [f.url for f in result.feeds]
    #print urls
    return urls
#!/usr/bin/env python
import listparser as lp

d = lp.parse("podcasts_opml.xml")

f = open('podcasts.org', 'w')
f.write("|-|-|\n")
f.write("|url|Title|\n")
f.write("|-|-|\n")
for podcast in d.feeds:
    f.write("|%s| %s|\n" % (podcast.url, podcast.title))
f.write("|-|-|\n")
f.close()
# Generate list of IDs
podcasts = driver.find_elements_by_xpath('//ol/li')
pod_ids = []
for pod in podcasts:
    pod_ids.append(pod.get_attribute("id").replace("draggable_", "", 1))

# Send them to /dev/null
for pod in pod_ids:
    # important time delay for server response
    time.sleep(0.5)
    driver.get("http://mysqueezebox.com/settings/podcasts/delete/" + pod)

# Load local OPML file
try:
    opml_file = open(opml_path)
    opml_cont = opml_file.read()
except IOError as e:
    print("I/O error({0}): {1}".format(e.errno, e.strerror))
    sys.exit(1)

pods = lp.parse(opml_cont)

# Create new subscription list, one entry at a time
print("Creating new subscription list from OPML file")
for feed in pods.feeds:
    element = driver.find_element_by_xpath('//input[@name="url"]')
    element.clear()
    element.send_keys(feed.url)
    driver.find_element_by_xpath('//*[@id="add_button"]').click()
    time.sleep(0.7)
def opml():
    if ('action' in request.values
            and request.values['action'] in ['import', 'export']):
        if request.values['action'] == 'import':
            import_file = request.files.get('file')
            if not import_file or import_file.filename == '':
                flash('Warning: No File Selected')
            else:
                feed_list = listparser.parse(import_file)
                for feed in feed_list['feeds']:
                    url = feed['url']
                    if not Feed.get(url=url):
                        if 'title' in feed and feed['title']:
                            title = feed['title']
                        else:
                            parse = feedparser.parse(url)
                            if 'title' in parse.feed:
                                title = parse.feed.title
                            else:
                                title = url
                        new_feed = Feed(title=title, url=url)
                        for category_list in feed['categories']:
                            for title in category_list:
                                if title:
                                    category = Category.get(title=title)
                                    if not category:
                                        category = Category(title=title)
                                    new_feed.categories.add(category)
                flash('Feeds Imported!')
        elif request.values['action'] == 'export':
            root = etree.Element('opml', version='2.0')
            head = etree.SubElement(root, 'head')
            head_elements = {
                'title': 'feedfin OPML export',
                'dateCreated': format_datetime(datetime.utcnow()),
                'docs': 'http://dev.opml.org/spec2.html'
            }
            for element, text in head_elements.items():
                new_element = etree.SubElement(head, element)
                new_element.text = text
            body = etree.SubElement(root, 'body')
            for feed in Feed.select():
                new_element = etree.SubElement(
                    body, 'outline',
                    type='rss',
                    text=feed.title,
                    xmlUrl=feed.url,
                    category=','.join(
                        [category.title for category in feed.categories]))
            opml_bytes = etree.tostring(root, encoding='UTF-8', xml_declaration=True)
            response = make_response(opml_bytes.decode('utf-8'))
            response.headers['Content-Disposition'] = (
                'attachment; filename=feedfin.opml')
            return response
    else:
        flash('Warning: Invalid Request')
    return redirect(get_redirect_target())
def upload_file(request):
    UploadFile = request.FILES['file']
    result = listparser.parse(UploadFile.read())
    title = result.meta.title
    return render_to_response('result.html', locals())
        log.debug('Link={0} len(text)={1}'.format(
            entry.link, len(a.text)))
    except newspaper.article.ArticleException as e:
        log.warning('{0} {1}'.format(entry.link, e))


if __name__ == "__main__":
    urls = ['http://planet.scipy.org/rss20.xml',
            'http://planetpython.org/rss20.xml',
            'http://dsguide.biz/reader/feeds/posts']

    df = pd.read_csv('feeds.csv')
    df = df[df['Flag'] == 'Use']
    urls.extend(df['URL'].values)

    for f in os.listdir('opml'):
        if f.endswith('opml'):
            fname = os.path.join('opml', f)
            parsed_opml = listparser.parse(fname)
            urls.extend([feed.url for feed in parsed_opml.feeds])

    log = dl.log_api.conf_logger(__name__)
    config = core.init_config()
    corpus = dl.nlp.WebCorpus('sonar_corpus')

    for url in set(urls):
        rss = fp.parse(url)
        for entry in set(rss.entries):
            process_url(entry)
def _process_file(self):
    result = listparser.parse(self.file, "feedshare.net")
    if result["bozo"] == 1 and not result["feeds"]:
        return False
    self._process_result(result)
    return True
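The second positional argument here is the User-Agent string, matching the agent parameter quoted in the docstring of the WeChat importer earlier in this list. A hedged sketch of the keyword form, assuming an older listparser release that performs its own HTTP requests; the URL and agent string below are placeholders:

import listparser

result = listparser.parse("https://example.com/subscriptions.opml",
                          agent="ExampleAggregator/1.0")
print(result.bozo, len(result.feeds))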
#!pip install --upgrade pip
!pip install listparser
import listparser as lp

url = 'https://raw.githubusercontent.com/rushter/data-science-blogs/master/data-science.opml'
d = lp.parse(url)
len(d.feeds)
d.feeds[24].url
d.feeds[24].title

!pip install feedparser
import feedparser
import time

feed = feedparser.parse(d.feeds[24].url)
feed['feed']['title']
len(feed['entries'])

feed = feedparser.parse('http://dsguide.biz/reader/feeds/posts')
feed['feed']['title']
feed['entries'][1]
feed['entries'][1].title
feed['entries'][1].link
feed['entries'][1].summary
feed['entries'][3].published
dt = time.strptime(feed['entries'][3].published, '%a, %d %b %Y %H:%M:%S +0000')
def channel_selection(dataFile, inputFile="data/subscription_manager.xml", titleList=None, idList=None): logging.debug("channel_selection function called") if titleList is not None: inputFile = None else: titleList = [] idList = [] import listparser as lp logging.debug("Channel_selection started") # This function parses OPML data and allows the user to select which channels to be included write("Parsing Youtube data...\n", BLUE) all_channels = False loop = True while loop: write( "Would you like to select which channels you want to include, or do you want to include all of them?\n" "If you include all channels you can remove them manually by editing " + dataFile + " and deleting the" " entire line of the channel you do not want (Choose this option if you have a lot of subscriptions)" ) selection = get_input( "Enter 'all' to keep all subscriptions or 'select' to select which channels (or 'a' or 's'):" ).lower() logging.debug("User selected %s for all or single channel selection" % selection) if selection == 'all' or selection == 'a': all_channels = True loop = False write("Including all channels\n") elif selection == 'select' or selection == 's': all_channels = False loop = False write( "You will now be asked to select which channels you would like to include in your download library. \n" "Any channels you do not include will be ignored.\n") else: write("Invalid Selection!!! Try again.") logging.warning("User selected invalid entry") logging.debug("Opening " + dataFile + " for writing") file = open(dataFile, 'w') # logging.debug("Parsing " + inputFile) file.write('<opml version="1.1">\n<body>\n') if inputFile is not None: d = lp.parse(inputFile) l = d.feeds for count, channel in enumerate(l): #titleList[count] = channel.title #idList[count] = channel.url titleList.append(channel.title) idList.append(channel.url) else: for count, channel in enumerate(idList): idList[ count] = "https://www.youtube.com/feeds/videos.xml?channel_id=" + idList[ count] num_channels = len(titleList) human_count = 1 logging.debug("Processing channels") for count in range(0, num_channels): include_this_subscription = True title = titleList[count].replace('&', 'and') title = title.encode("ascii", errors="ignore").decode('utf-8', 'ignore') url = bytes(idList[count], 'utf-8').decode('utf-8', 'ignore') logging.debug("Processing channel: %s" % title) logging.debug("Channel has url %s" % url) if all_channels: write("(%i/%i) Including subscription: %s\n" % (human_count, num_channels, title)) logging.info("Automatically including channel: %s" % title) if not all_channels: loop = True while loop: selection = get_input( "(%i/%i) Include %s, yes or no (y/n)?" % (human_count, num_channels, title)).lower() if selection == 'y' or selection == 'yes': include_this_subscription = True write(" Including %s\n" % title) logging.info("User opted to include channel: %s" % title) loop = False elif selection == 'n' or selection == 'no': include_this_subscription = False logging.info("User opted to not include channel: %s" % title) loop = False else: write(" Invalid response. Try again.", RED) human_count += 1 if include_this_subscription: file.write('<outline title="' + xml.sax.saxutils.escape(title) + '" xmlUrl="' + xml.sax.saxutils.escape(url) + '"/>\n') else: write(" Not including %s\n" % title) file.write('</body>\n</opml>') file.close() logging.debug("Channels saved to" + dataFile) write("\nComplete.")
def main(): global NUM_VIDEOS global DESTINATION_FOLDER global API_KEY global FORMAT global FILE_FORMAT global SCHEDULING_MODE global SCHEDULING_MODE_VALUE global YOUTUBE_XML_FILE data = lp.parse(YOUTUBE_XML_FILE) my_filters = filters() # init for usage outside of this for loop xmltitle = [None] * len(data.feeds) xmlurl = [None] * len(data.feeds) channelIDlist = [None] * len(data.feeds) valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits) for i in range(0, len(data.feeds)): xmltitle[i] = data.feeds[i].title # channel Title xmlurl[i] = data.feeds[ i].url # formatted like 'https://www.youtube.com/feeds/videos.xml?channel_id=CHANNELID' indexofid = xmlurl[i].find("id=") channelIDlist[i] = xmlurl[i][indexofid + 3:] get_icons(xmltitle, channelIDlist) for i in range(0, len(xmltitle)): # for every channel skip_download = False uploader = xmltitle[i] #print(uploader) try: url_data = urlopen(xmlurl[i], ) url_data = url_data.read() xml = bs(url_data.decode('utf-8'), 'html.parser') videoList = xml.find_all('entry') except Exception as e: print("Failed to Download Channel list due to html error, check logs") videoList = "" skip_download = True logging.error(str(e)) logging.error(traceback.format_exc()) logVariables() video_download_count = 0 for v in videoList: # for every video in channel # make sure we only download how many we want if (video_download_count < NUM_VIDEOS) and not skip_download: skip_download = False skip_move = False video_download_count += 1 title = str(v.title.string) #title = title.decode("utf-8") #temp = title.encode("ascii", errors="ignore").decode('utf-8', 'ignore') title = title.encode("utf-8", errors="ignore").decode('utf-8', 'ignore') escapes = '|'.join([chr(char) for char in range(1, 32)]) title = re.sub(escapes, "", title) # removes all escape characters title = title.replace("-", " ").replace("\\", "").replace("/", "").replace("%", "") upload_time = v.published.string.split('T')[1].split('+')[0].replace(':', '')[:-2] upload_date = v.published.string.split('T')[0] upload_date = upload_date + "_" + upload_time url = v.link.get('href') id = v.id.string channelID = str(v.find('yt:channelid').contents[0]) # See if we already downloaded this logFile = open(logFileName, 'r') logFileContents = logFile.read() logFile.close() if id in logFileContents: logging.info("Video Already downloaded for id %s" % id) #print("Video Already downloaded: " + id) else: if not my_filters.download_check(title, channelID): #print("Video Filtered: " + title) logging.info("Video Filtered: Title:" + title + "ChannelID:" + channelID) skip_download = True skip_move = True filename_format = parseFormat(FILE_FORMAT, uploader, upload_date, title, channelID, id.replace("yt:video:", "")) logging.debug("filename_formatted parsed to %s" % filename_format) if not skip_download: logging.info("Downloading - " + title + " | " + id) logging.info("Channel - " + str(xmltitle[i]) + " | " + channelID) if os.name == 'nt': # if windows use supplied ffmpeg ydl_opts = { 'outtmpl': 'Download/' + uploader + '/' + filename_format + '.%(ext)s', # need to put channelid in here because what youtube-dl gives may be incorrect #'simulate': 'true', 'writethumbnail': 'true', 'forcetitle': 'true', 'ffmpeg_location': './ffmpeg/bin/', 'ignoreerrors': 'true', 'format': FORMAT } else: # not sure here ydl_opts = { 'outtmpl': 'Download/' + uploader + '/' + filename_format + '.%(ext)s', 'writethumbnail': 'true', 'forcetitle': 'true', 'format': FORMAT } try: with youtube_dl.YoutubeDL(ydl_opts) as ydl: info_dict = 
ydl.extract_info(url, download=False) quality = info_dict.get("format", None) print("Video Quality: " + quality) video_id = info_dict.get("id", None) video_title = info_dict.get("title", None) video_date = info_dict.get("upload_date", None) uploader = info_dict.get("uploader", None) is_live = info_dict.get("is_live", None) if 'entries' in info_dict: is_live = info_dict['entries'][0]["is_live"] if not is_live: ydl.download([url]) else: print("Warning! This video is streaming live, it will be skipped") logging.info("Warning! This video is streaming live, it will be skipped") skip_move = True if os.path.exists('Download/' + uploader + '/'): for file in os.listdir('Download/' + uploader + '/'): if fnmatch.fnmatch(file, "*" + video_title + "*.part"): skip_move = True print("Failed to Download. Will Retry on next Run.") logging.error("Found .part file. Failed to Download. Will Retry next Run.") except Exception as e: print("Failed to Download") skip_move = True logging.error(str(e)) logging.error(traceback.format_exc()) logVariables() if not skip_move: subscription_source_dir = 'Download/' + uploader + '/' subscription_destination_dir = os.path.join(DESTINATION_FOLDER, uploader) logging.debug("subscription_source_dir is %s" % subscription_source_dir) logging.debug("subscription_destination_dir is %s" % subscription_destination_dir) #destinationDir = parseFormat(DESTINATION_FORMAT, uploader, upload_date, title, channelID, id) #destinationDir = os.path.join(DESTINATION_FOLDER, destinationDir) if not os.path.exists(DESTINATION_FOLDER + uploader): logging.info("Creating uploader destination directory for %s" % subscription_destination_dir) os.makedirs(subscription_destination_dir) try: logging.info("Now moving content from %s to %s" % (subscription_source_dir, subscription_destination_dir)) for filename in os.listdir(subscription_source_dir): logging.info("Checking file %s" % filename) source_to_get = os.path.join(subscription_source_dir, filename) where_to_place = subscription_destination_dir logging.info("Moving file %s to %s" % (source_to_get, where_to_place)) safecopy(source_to_get, where_to_place) #shutil.move(os.path.join(subscription_source_dir, filename), subscription_destination_dir) shutil.rmtree(subscription_source_dir, ignore_errors=True) # shutil.move(videoName, destination + destVideoName) # shutil.move(thumbName, destination + destThumbName) # everything was successful so log that we downloaded and moved the video logFile = open(logFileName, 'a') logFile.write(id + ' \n') logFile.close() except Exception as e: print("An error occured moving file") logging.error(str(e)) logging.error(traceback.format_exc()) logVariables() skip_download = False skip_move = False logging.info("Program main.py ended") logging.info("============================================================") return ""
import sys
import listparser, requests

if len(sys.argv) > 1:
    xml = sys.argv[1]
    opml = listparser.parse(xml)
    print('found %s feeds in %s' % (len(opml.feeds), xml))
    for feed in opml.feeds:
        req = requests.post("http://localhost:3000/channels", data={'url': feed.url})
        print('[%s]' % req.status_code, feed.url)
else:
    print('no opml file specified.')
def channel_selection(): import listparser as lp logging.debug("Channel_selection started") # This function parses OPML data and allows the user to select which channels to be included print("Parsing Youtube data\n") all_channels = False loop = True while loop: selection = get_input( "Would you like to select which channels you want to include, or do you want to include all of them?\n" "If you include all channels you can remove them manually by editing data/youtubeData.xml and deleting the" " entire line of the channel you do not want (Choose this option if you have a lot of subscriptions)\n" "Enter 'all' to keep all subscriptions or 'select' to select which channels (or 'a' or 's'):").lower() logging.debug("User selected %s for all or single channel selection" % selection) if selection == 'all' or selection == 'a': all_channels = True loop = False print("Including all channels\n") elif selection == 'select' or selection == 's': all_channels = False loop = False print( "You will now be asked to select which channels you would like to include in your download library. \nAny" " channels you do not include will be ignored. \nWarning: if you add a new subscription you must go through this" " process again (until I add a feature to import a channel)\n") else: print("Invalid Selection!!! Try again.") logging.warning("User selected invalid entry") logging.debug("Opening data/youtubeData.xml for writing") file = open("data/youtubeData.xml", 'w') logging.debug("Parsing data/subscription_manager.xml") d = lp.parse('data/subscription_manager.xml') l = d.feeds file.write('<opml version="1.1">\n<body>\n') num_channels = len(l) human_count = 1 logging.debug("Processing channels") for channel in l: include_this_subscription = True title = channel.title.replace('&', 'and') title = channel.title.encode("ascii", errors="ignore").decode('utf-8', 'ignore') url = bytes(channel.url, 'utf-8').decode('utf-8', 'ignore') logging.debug("Processing channel: %s" % title) logging.debug("Channel has url %s" % url) if all_channels: print("(%i/%i) Including subscription: %s\n" % (human_count, num_channels, title)) logging.info("Automatically including channel: %s" % title) if not all_channels: loop = True while loop: selection = get_input( "(%i/%i) Include %s, yes or no (y/n)?" % (human_count, num_channels, title)).lower() if selection == 'y' or selection == 'yes': include_this_subscription = True print(" Including %s\n" % title) logging.info("User opted to include channel: %s" % title) loop = False elif selection == 'n' or selection == 'no': include_this_subscription = False logging.info("User opted to not include channel: %s" % title) loop = False else: print(" Invalid response. Try again.") human_count += 1 if include_this_subscription: file.write('<outline title="' + xml.sax.saxutils.escape(title) + '" xmlUrl="' + xml.sax.saxutils.escape( url) + '"/>\n') else: print(" Not including %s\n" % title) file.write('</body>\n</opml>') file.close() logging.debug("Channels saved to youtubeData.xml") print("\nComplete.")
def main(my_sch): global NUM_VIDEOS global DESTINATION_FOLDER global API_KEY global FORMAT global FILE_FORMAT global SCHEDULING_MODE global SCHEDULING_MODE_VALUE global YOUTUBE_XML_FILE data = lp.parse(YOUTUBE_XML_FILE) logFileName = "data/log.txt" my_filters = filters() # init for usage outside of this for loop xmltitle = [None] * len(data.feeds) xmlurl = [None] * len(data.feeds) channelIDlist = [None] * len(data.feeds) valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits) for i in range(0, len(data.feeds)): xmltitle[i] = slugify(data.feeds[i].title) # channel Title xmlurl[i] = data.feeds[ i].url # formatted like 'https://www.youtube.com/feeds/videos.xml?channel_id=CHANNELID' indexofid = xmlurl[i].find("id=") channelIDlist[i] = xmlurl[i][indexofid + 3:] if my_sch.getNumRuns() == 1: get_icons(xmltitle, channelIDlist) for i in range(0, len(xmltitle)): # for every channel skip_download = False uploader = xmltitle[i] # print(uploader) try: url_data = urlopen(xmlurl[i], ) url_data = url_data.read() xml = bs(url_data.decode('utf-8'), 'html.parser') videoList = xml.find_all('entry') except Exception as e: print(Fore.RED + "Failed to Download Channel list due to html error, check logs" + Style.RESET_ALL) videoList = "" skip_download = True logging.error(str(e)) logging.error(traceback.format_exc()) logVariables() video_download_count = 0 for v in videoList: # for every video in channel # make sure we only download how many we want if (video_download_count < NUM_VIDEOS) and not skip_download: skip_download = False skip_move = False video_download_count += 1 title = str(v.title.string) # title = title.decode("utf-8") # temp = title.encode("ascii", errors="ignore").decode('utf-8', 'ignore') title = title.encode("utf-8", errors="ignore").decode('utf-8', 'ignore') escapes = '|'.join([chr(char) for char in range(1, 32)]) title = re.sub(escapes, "", title) # removes all escape characters title = title.replace("-", " ").replace("\\", "").replace("/", "").replace("%", "") title = slugify(title) upload_time = v.published.string.split('T')[1].split('+')[0].replace(':', '')[:-2] upload_date = v.published.string.split('T')[0] upload_date = upload_date + "_" + upload_time url = v.link.get('href') id = v.id.string channelID = str(v.find('yt:channelid').contents[0]) # See if we already downloaded this logFile = open(logFileName, 'r') logFileContents = logFile.read() logFile.close() if id in logFileContents: logging.info("Video Already downloaded for id %s" % id) # print("Video Already downloaded: " + id) else: if not my_filters.download_check(title, channelID): # print("Video Filtered: " + title) logging.info("Video Filtered: Title:" + title + "ChannelID:" + channelID) skip_download = True skip_move = True filename_format = parseFormat(FILE_FORMAT, uploader, upload_date, title, channelID, id.replace("yt:video:", "")) logging.debug("filename_formatted parsed to %s" % filename_format) if not skip_download: logging.info("Downloading - " + title + " | " + id) logging.info("Channel - " + str(xmltitle[i]) + " | " + channelID) # Get format codes to use usable_extension = 'webm' # usable_format_code_video = 'bestvideo[ext=webm]' # usable_format_code_audio = 'bestaudio' containsWebmContent = False usable_format_code_audio = '(bestaudio[ext=m4a]/bestaudio)' usable_format_code_video = '(bestvideo[vcodec^=av01][height>=2160][fps>30]/' \ 'bestvideo[vcodec=vp9.2][height>=2160][fps>30]/' \ 'bestvideo[vcodec=vp9][height>=2160][fps>30]/' \ 'bestvideo[vcodec^=av01][height>=2160]/' \ 
'bestvideo[vcodec=vp9.2][height>=2160]/' \ 'bestvideo[vcodec=vp9][height>=2160]/' \ 'bestvideo[height>=2160]/' \ 'bestvideo[vcodec^=av01][height>=1440][fps>30]/' \ 'bestvideo[vcodec=vp9.2][height>=1440][fps>30]/' \ 'bestvideo[vcodec=vp9][height>=1440][fps>30]/' \ 'bestvideo[vcodec^=av01][height>=1440]/' \ 'bestvideo[vcodec=vp9.2][height>=1440]/' \ 'bestvideo[vcodec=vp9][height>=1440]/' \ 'bestvideo[height>=1440]/' \ 'bestvideo[vcodec^=av01][height>=1080][fps>30]/' \ 'bestvideo[vcodec=vp9.2][height>=1080][fps>30]/' \ 'bestvideo[vcodec=vp9][height>=1080][fps>30]/' \ 'bestvideo[vcodec^=av01][height>=1080]/' \ 'bestvideo[vcodec=vp9.2][height>=1080]/' \ 'bestvideo[vcodec=vp9][height>=1080]/' \ 'bestvideo[height>=1080]/' \ 'bestvideo[vcodec^=av01][height>=720][fps>30]/' \ 'bestvideo[vcodec=vp9.2][height>=720][fps>30]/' \ 'bestvideo[vcodec=vp9][height>=720][fps>30]/' \ 'bestvideo[vcodec^=av01][height>=720]/' \ 'bestvideo[vcodec=vp9.2][height>=720]/' \ 'bestvideo[vcodec=vp9][height>=720]/' \ 'bestvideo[height>=720]/' \ 'bestvideo)' try: if FORMAT.split(" ")[0] == 'best': logging.info("Skipping getting format codes using granulated option") else: with youtube_dl.YoutubeDL() as ydl: info_dict = ydl.extract_info(url, download=False) formats = info_dict.get("formats", None) for f in formats: note = f.get('format_note') fID = f.get('format_id') extension = f.get('ext') if FORMAT.split(" ")[0] == note: usable_format_code_video = fID usable_extension = extension containsWebmContent = True break for f in formats: note = f.get('format_note') fID = f.get('format_id') extension = f.get('ext') if usable_extension == extension and note == 'audio only': usable_format_code_audio = fID if not containsWebmContent: usable_format_code_video = 'bestvideo' usable_format_code_audio = 'bestaudio' except Exception as e: logging.error(str(e)) if str(e) == "ERROR: This video is unavailable.": logging.error("This video is not available for download, " "maybe streaming or just an announcement post.") write("This video is not available for download, " "maybe streaming or just an announcement post.", RED) skip_download = True skip_move = True else: logging.error("An error occurred trying to find user requested format," " reverting to best") usable_format_code_video = 'bestvideo' usable_format_code_audio = 'bestaudio' write("Couldn't find request format for this video, defaulting to best", RED) if not skip_download: if os.name == 'nt': # if windows use supplied ffmpeg ydl_opts = { 'outtmpl': os.path.join('Download', uploader, filename_format + '.%(ext)s'), # need to put channelid in here because what youtube-dl gives may be incorrect # 'simulate': 'true', 'writethumbnail': 'true', 'forcetitle': 'true', 'ffmpeg_location': './ffmpeg/bin/', 'ignoreerrors': 'true', 'format': usable_format_code_video + "+" + usable_format_code_audio + '/best' } else: # Linux/Unix ydl_opts = { 'outtmpl': os.path.join('Download', uploader, filename_format + '.%(ext)s'), 'writethumbnail': 'true', 'forcetitle': 'true', 'format': usable_format_code_video + "+" + usable_format_code_audio + '/best' } try: with youtube_dl.YoutubeDL(ydl_opts) as ydl: info_dict = ydl.extract_info(url, download=False) quality = info_dict.get("format", None) write("Video Quality: " + quality, BLUE) video_id = info_dict.get("id", None) video_title = info_dict.get("title", None) video_date = info_dict.get("upload_date", None) is_live = info_dict.get("is_live", None) if 'entries' in info_dict: is_live = info_dict['entries'][0]["is_live"] if not is_live: ydl.download([url]) else: 
write("Warning! This video is streaming live, it will be skipped", RED) logging.info("Warning! This video is streaming live, it will be skipped") skip_move = True if os.path.exists('Download/' + uploader + '/'): for file in os.listdir('Download/' + uploader + '/'): if fnmatch.fnmatch(file, "*" + video_title + "*.part"): skip_move = True write("Failed to Download. Will Retry on next Run.", RED) logging.error("Found .part file. Failed to Download. Will Retry next Run.") except Exception as e: skip_move = True logging.error(str(e)) if str(e) == "ERROR: This video is unavailable.": logging.error("This video is not available for download, " "maybe streaming or just an announcement post.") write("This video is not available for download, " "maybe streaming or just an announcement post.", RED) else: logging.error("Failed to download video") write("Failed to Download", RED) logging.error(traceback.format_exc()) logVariables() if not skip_move: destinationDir = parseFormat(DESTINATION_FORMAT, uploader, upload_date, title, channelID, id) destinationDir = os.path.join(DESTINATION_FOLDER, destinationDir) subscription_source_dir = 'Download/' + uploader + '/' logging.debug("subscription_source_dir is %s" % subscription_source_dir) logging.debug("subscription_destination_dir is %s" % destinationDir) if not os.path.exists(destinationDir): logging.info( "Creating uploader destination directory for %s" % destinationDir) os.makedirs(destinationDir) try: logging.info("Now moving content from %s to %s" % ( subscription_source_dir, destinationDir)) for filename in os.listdir(subscription_source_dir): logging.info("Checking file %s" % filename) source_to_get = os.path.join(subscription_source_dir, filename) logging.info("Moving file %s to %s" % (source_to_get, destinationDir)) safecopy(source_to_get, destinationDir) # shutil.move(os.path.join(subscription_source_dir, filename), subscription_destination_dir) shutil.rmtree(subscription_source_dir, ignore_errors=True) # shutil.move(videoName, destination + destVideoName) # shutil.move(thumbName, destination + destThumbName) # everything was successful so log that we downloaded and moved the video logFile = open(logFileName, 'a') logFile.write(id + ' \n') logFile.close() logging.info("Successfully downloaded and moved file") write("Success!", GREEN) except Exception as e: print(str(e)) write("An error occured moving file", RED) logging.error(str(e)) logging.error(traceback.format_exc()) logVariables() skip_download = False skip_move = False logging.info("Program main.py ended") logging.info("============================================================") return ""
def create_from_file(cls, file):
    opml = listparser.parse(file)
    print('found %s feeds' % (len(opml.feeds)))
    for feed in opml.feeds:
        cls.create(url=feed.url, title=feed.title)