# Seed the work queue with pages whose links have not been crawled yet.
queue = Queue()
results_queue = Queue()
new_pages_count = 0
for page in childs:
    if page['link'] not in CACHE:
        CACHE.append(page['link'])
        queue.put(page, timeout=10)
        # workaround for queue.put: give the feeder thread time to flush
        time.sleep(0.01)
        new_pages_count += 1

done = 0
bar.max = len(CACHE)
# Drain the queue with worker processes; the pool size scales with the
# amount of outstanding work but is capped at max_threads_count.
while not queue.empty():
    # '//' makes the integer division explicit (this is Python 2 code —
    # see xrange — where plain '/' already floors for ints).
    threads_count = 1 + (new_pages_count - done) // 2
    if threads_count > max_threads_count:
        threads_count = max_threads_count
    workers = [
        Process(target=collect_childs, args=(queue, results_queue))
        for _ in xrange(threads_count)
    ]
    # Daemonize before starting so stray workers cannot block shutdown.
    for w in workers:
        w.daemon = True
    # Plain loop instead of a side-effect list comprehension.
    for w in workers:
        w.start()
    # Short join timeout so the queue size is re-evaluated promptly.
    # NOTE(review): workers still alive after the timeout are never
    # re-joined, and each loop iteration spawns a fresh batch — confirm
    # this unbounded spawning is intended.
    for w in workers:
        w.join(timeout=0.1)
# Enqueue every page that is not in the crawl cache yet.
queue = Queue()
results_queue = Queue()
new_pages_count = 0
for page in childs:
    if page['link'] not in CACHE:
        CACHE.append(page['link'])
        queue.put(page, timeout=10)
        # workaround for queue.put: let the feeder thread flush the item
        time.sleep(0.01)
        new_pages_count += 1

done = 0
bar.max = len(CACHE)
# Spawn worker processes while items remain; pool size grows with the
# backlog and is clamped to max_threads_count.
while not queue.empty():
    # '//' makes the integer division explicit (Python 2 semantics).
    threads_count = 1 + (new_pages_count - done) // 2
    if threads_count > max_threads_count:
        threads_count = max_threads_count
    workers = [
        Process(target=collect_childs, args=(queue, results_queue))
        for _ in xrange(threads_count)
    ]
    # Daemonize before starting so workers cannot block interpreter exit.
    for w in workers:
        w.daemon = True
    # Plain loop instead of a side-effect list comprehension.
    for w in workers:
        w.start()
def create_epg(config):
    """Build the <tv> XMLTV tree for every <channel> in *config*.

    config is an ElementTree element holding global options
    (timespan_index, timespan_full, caching, timespan_force) plus one
    <channel> child per channel to grab.  Returns the populated
    xml.etree Element; also reads/writes the "cached_epg.pkl" details
    cache in the working directory.
    """
    now = pytz.utc.localize(datetime.datetime.utcnow())

    # Global defaults; per-channel attributes may override them below.
    timespan_global = int(config.find("timespan_index").text)
    timespan_full_global = int(config.find("timespan_full").text)
    caching_global = config.find("caching").text
    caching_global = caching_global in ("on", "yes", "true", "True")
    try:
        timespan_force_global = int(config.find("timespan_force").text)
    except TypeError:
        # empty <timespan_force> element -> int(None) raises TypeError
        timespan_force_global = -1

    # Show details grabbed on earlier runs, keyed by details-url.
    try:
        with open("cached_epg.pkl", "rb") as fp:
            cache = pickle.load(fp)
    except IOError:
        cache = {}
    cache_new = {}

    # root element of epg
    tv = ET.Element("tv")
    tv.set("generator-info-name", "simplEPG v0.1")
    tv.set("generator-info-url", "https://github.com/eminga/simplEPG")

    c_pos = 0            # insertion index: keep <channel> elements first
    successful = set()   # xmltv ids that already received shows

    for channel in config.findall("channel"):
        try:
            site = importlib.import_module('sites.' + channel.get("site"))
        except ModuleNotFoundError:
            print("Error: could not find module sites." + channel.get("site"))
            continue
        channelid = channel.get("xmltv_id")
        print(channel.get("site") + ":" + channelid)
        if channelid in successful:
            print("channel already added, skipping...")
            continue

        # Per-channel overrides: a missing attribute yields None and
        # int(None) raises TypeError, falling back to the global value.
        try:
            timespan = int(channel.get("timespan_index"))
        except TypeError:
            timespan = timespan_global
        try:
            timespan_full = int(channel.get("timespan_full"))
        except TypeError:
            timespan_full = timespan_full_global

        caching = channel.get("caching")
        if caching is not None:
            caching = caching in ("on", "yes", "true", "True")
        else:
            caching = caching_global

        try:
            timespan_force = int(channel.get("timespan_force"))
        except TypeError:
            timespan_force = timespan_force_global
        if timespan_force == -1:
            # -1 disables forced refreshing: the "hours until start"
            # comparison below is then always true, so cached details
            # are always accepted.
            timespan_force = -10000

        try:
            shows = site.grab(channel.get("site_id"), timespan)
        except Exception:
            # A broken site module must not kill the whole EPG run
            # (KeyboardInterrupt/SystemExit still propagate).
            shows = []
            print("An error occured:")
            print(sys.exc_info())

        if not shows:
            print("0 shows found.")
            continue

        successful.add(channelid)
        c = ET.Element("channel", id=channel.get("xmltv_id"))
        ET.SubElement(c, "display-name").text = channel.text
        tv.insert(c_pos, c)
        c_pos += 1

        # create progress bar if module is available; otherwise fall back
        # to a minimal stub with the same interface
        try:
            from progress.bar import Bar
        except ImportError:
            class Bar:
                def __init__(self, label, max):
                    self.max = max
                    self.index = 0
                def next(self):
                    self.index += 1
                def update(self):
                    pass
                def finish(self):
                    if self.index > 0:
                        print("%s shows added." % self.index)
        bar = Bar("Processing", max=len(shows))

        # Normalize: grabbers may return ready-made XML elements; wrap
        # them in dicts so the loop below handles a single shape.
        for i in range(len(shows)):
            if isinstance(shows[i], type(ET.Element(None))):
                starttime = parse_xmltv_date(shows[i].get("start"))
                stoptime = shows[i].get("stop")
                # BUGFIX: was 'type(stoptime) != None', which is always
                # true, so elements without a stop attribute crashed in
                # parse_xmltv_date(None) instead of taking the else path.
                if stoptime is not None:
                    stoptime = parse_xmltv_date(stoptime)
                    shows[i] = {"xml": shows[i], "start": starttime, "stop": stoptime}
                else:
                    shows[i] = {"xml": shows[i], "start": starttime}

        shows.sort(key=lambda r: r["start"])

        for i in range(len(shows)):
            show = shows[i]
            if "stop" in show:
                stoptime = show["stop"]
            elif i < len(shows) - 1:
                # no stop time: assume the show ends when the next starts
                stoptime = shows[i + 1]["start"]
            else:
                # last show and no stop time -> cannot emit a programme
                break

            # don't store shows that are already finished
            if stoptime < now:
                bar.max -= 1
                bar.max = max(bar.max, 1)
                continue
            starttime = show["start"]
            # don't store shows that start more than "timespan" hours in the future
            if (starttime - now).total_seconds() / 3600 > timespan:
                break

            if "xml" in show:
                # ready-made element: just attach it to this channel
                show = show["xml"]
                show.set("channel", channelid)
                tv.append(show)
            else:
                url = show.pop("details-url", None)
                if url is not None and len(url) > 0:
                    # only fetch details for shows starting within
                    # timespan_full hours
                    if timespan_full > -1 and (starttime - now).total_seconds() / 3600 <= timespan_full:
                        if caching and (starttime - now).total_seconds() / 3600 > timespan_force:
                            force = False
                            try:
                                try:
                                    details = cache_new[url]
                                except KeyError:
                                    details = cache[url]
                                show.update(details)
                                cache_new[url] = details
                            except KeyError:
                                # not cached yet -> grab the details page
                                force = True
                        else:
                            force = True
                        if force:
                            try:
                                details = site.grabdetails(url)
                                show.update(details)
                                if caching:
                                    # don't store times in cache
                                    details.pop("start", None)
                                    details.pop("stop", None)
                                    cache_new[url] = details
                            except (AttributeError, TypeError):
                                # site lacks grabdetails() or returned None
                                pass
                programme = ET.SubElement(
                    tv, "programme",
                    start=starttime.strftime("%Y%m%d%H%M%S %z"),
                    stop=stoptime.strftime("%Y%m%d%H%M%S %z"),
                    channel=channelid)
                process_show(programme, show)
            bar.next()

        if bar.index > 0:
            bar.max = bar.index
        else:
            print("0 shows found.")
            bar.update()
        bar.finish()

    if len(cache_new) > 0:
        # Persist only freshly confirmed entries so stale urls age out.
        # (pickle.dump returns None; the old 'cache = pickle.dump(...)'
        # assignment was meaningless and has been dropped.)
        with open("cached_epg.pkl", "wb") as fp:
            pickle.dump(cache_new, fp)
    return tv
def load_simple_questions_dataset(config, force_reload=False):
    """Load the SimpleQuestions dataset as index arrays.

    When a preprocessed snapshot (data.npz + word2idx.txt) exists under
    config.data_dir and force_reload is false, it is loaded directly;
    otherwise the full preprocessing pipeline runs and its result is
    written back to those files.

    Returns (train, valid, embd_mat, word2idx) where train and valid are
    (questions, answers) pairs.
    """
    bar = Bar(suffix='%(index)d/%(max)d - %(elapsed)ds')
    data_npz = os.path.join(config.data_dir, 'data.npz')
    word2idx_txt = os.path.join(config.data_dir, 'word2idx.txt')

    have_cache = (os.path.exists(data_npz) and os.path.exists(word2idx_txt)
                  and not force_reload)
    if have_cache:
        # Fast path: everything was preprocessed on a previous run.
        bar.max = 2
        bar.message = 'Loading npz'
        bar.next()
        archive = np.load(data_npz)
        embd_mat = archive['embd_mat']
        train = (archive['train_ques'].astype(np.int32),
                 archive['train_ans'].astype(np.int32))
        valid = (archive['valid_ques'].astype(np.int32),
                 archive['valid_ans'].astype(np.int32))

        bar.message = 'Loading word2idx'
        bar.next()
        with open(word2idx_txt) as f:
            word2idx = {row[0]: int(row[1])
                        for row in csv.reader(f, delimiter='\t')}
        bar.finish()
        return train, valid, embd_mat, word2idx

    # Slow path: run the full preprocessing pipeline (8 stages).
    bar.max = 8
    bar.message = 'Loading GloVe vocab'
    bar.next()
    glove_vocab = load_glove_vocab(os.path.join(config.data_dir, 'glove'),
                                   '42B', 300)

    bar.message = 'Loading SimpleQuestions'
    bar.next()
    train, valid, dataset_vocab = load_simple_questions(config)

    bar.message = 'Removing unknown answers'
    bar.next()
    # Drop samples whose answers GloVe cannot embed; collect any vocab
    # the filtering step introduces.
    train, extra_vocab = remove_unknown_answers(train, glove_vocab)
    dataset_vocab.update(extra_vocab)
    valid, extra_vocab = remove_unknown_answers(valid, glove_vocab)
    dataset_vocab.update(extra_vocab)
    train_q, train_a = train[0], train[1]
    valid_q, valid_a = valid[0], valid[1]

    bar.message = 'Replacing unknown tokens'
    bar.next()
    unknowns = dataset_vocab - glove_vocab
    train_q = replace_unknowns(train_q, unknowns)
    train_a = replace_unknowns(train_a, unknowns)
    valid_q = replace_unknowns(valid_q, unknowns)
    valid_a = replace_unknowns(valid_a, unknowns)
    vocab = dataset_vocab - unknowns

    bar.message = 'Appending pads'
    bar.next()
    # Pad every question to the longest question across both splits.
    longest = max(len(sent) for sent in train_q + valid_q)
    train_q = append_pads(train_q, longest)
    valid_q = append_pads(valid_q, longest)
    vocab.update([TOK_UNK, TOK_PAD])

    bar.message = 'Loading GloVe embeddings'
    bar.next()
    embd_mat, word2idx = load_glove_embeddings(
        os.path.join(config.data_dir, 'glove'), '42B', 300, vocab)

    bar.message = 'Converting token to index'
    bar.next()
    train_q = convert_to_idx(train_q, word2idx)
    train_a = convert_to_idx(train_a, word2idx)
    valid_q = convert_to_idx(valid_q, word2idx)
    valid_a = convert_to_idx(valid_a, word2idx)

    bar.message = 'Saving processed data'
    bar.next()
    # Persist both artifacts so the fast path above can be used next time.
    with open(word2idx_txt, 'w') as f:
        csv.writer(f, delimiter='\t').writerows(word2idx.items())
    np.savez(data_npz,
             embd_mat=embd_mat,
             train_ques=train_q, train_ans=train_a,
             valid_ques=valid_q, valid_ans=valid_a)
    bar.finish()

    train = np.array(train_q), np.array(train_a)
    valid = np.array(valid_q), np.array(valid_a)
    return train, valid, embd_mat, word2idx