def __init__(self, cache_dir, categories, keywords, amount=None):
    self.categories = categories
    self.keywords = keywords
    cache_dir = os.path.expanduser(cache_dir)
    ensure_directory(cache_dir)
    filename = os.path.join(cache_dir, 'abstracts.txt')
    if not os.path.isfile(filename):
        # Fetch the abstracts once and cache them on disk for later runs.
        with open(filename, 'w') as file_:
            for abstract in self._fetch_all(amount):
                file_.write(abstract + '\n')
    with open(filename) as file_:
        self.data = file_.readlines()
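
Every example here calls a small `ensure_directory` helper whose definition is not shown. A minimal sketch of what it presumably does, assuming it simply creates the directory (and any missing parents) when it does not exist yet:

import os

def ensure_directory(directory):
    # Create the directory, including parents, if it is not there yet.
    directory = os.path.expanduser(directory)
    if not os.path.isdir(directory):
        os.makedirs(directory)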
Example #2
def _init_or_load_session(self):
    self.sess = tf.Session()
    self.saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(self.params.checkpoint_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        # Resume from the newest checkpoint; the epoch is encoded in its filename.
        path = checkpoint.model_checkpoint_path
        print('Load checkpoint', path)
        self.saver.restore(self.sess, path)
        self.epoch = int(re.search(r'-(\d+)$', path).group(1)) + 1
    else:
        ensure_directory(self.params.checkpoint_dir)
        print('Randomly initialize variables')
        # Pre-TF-1.0 initializer; newer code would use tf.global_variables_initializer().
        self.sess.run(tf.initialize_all_variables())
        self.epoch = 1
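
The epoch is recovered from the checkpoint filename with `re.search(r'-(\d+)$', path)`, which only works because `tf.train.Saver.save` appends the global step to the filename when one is passed. A minimal sketch of the matching save call (the method name `_save_session` and the `'model'` prefix are assumptions, not taken from the original project):

import os

def _save_session(self):
    # Writes files such as <checkpoint_dir>/model-<epoch>, so the regex
    # r'-(\d+)$' in _init_or_load_session can recover the epoch on restart.
    prefix = os.path.join(self.params.checkpoint_dir, 'model')
    self.saver.save(self.sess, prefix, global_step=self.epoch)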
Example #3
from functools import reduce
import logging
import multiprocessing

from progress.bar import Bar
from yaml import safe_load

# Converter, ensure_directory and store_index are project-local helpers.

def main():
    logging.basicConfig(filename='process.log', level=logging.INFO)
    with open('./output/index.yaml') as stream:
        index = safe_load(stream)
    # Each chapter contributes one page plus one page per section.
    total_pages = reduce(lambda acc, ch: acc + len(ch['sections']) + 1, index, 0)

    tasks = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()
    num_consumers = multiprocessing.cpu_count() * 2
    logging.info("Creating %s consumers" % num_consumers)
    consumers = [ Converter(tasks, results) for i in range(num_consumers) ]
    bar = Bar('Downloading using %i consumers' % num_consumers, max=total_pages)
    ensure_directory('./output/pdf')
    errors_counter = 0

    for w in consumers:
        w.start()

    for chapter_number, chapter in enumerate(index):
        chapter_directory = './output/pdf/%s - %s' % (chapter['number'], chapter['title'])
        ensure_directory(chapter_directory)

        # Chapters already handled on a previous run (recorded in the index) are skipped.
        if 'pdf' in chapter:
            bar.next()
            total_pages -= 1
        elif 'error' in chapter:
            bar.next()
            total_pages -= 1
            errors_counter += 1
        else:
            tasks.put({
                'title': chapter['title'],
                'path': '%s/%s.pdf' % (chapter_directory, chapter['title'].replace('/', '-')),
                'link': chapter['link'],
                'chapter': chapter_number
            })

        for section_number, section in enumerate(chapter['sections']):
            if 'pdf' in section:
                bar.next()
                total_pages -= 1
            elif 'error' in section:
                bar.next()
                total_pages -= 1
                errors_counter += 1
            else:
                tasks.put({
                    'title': section['title'],
                    'path': ('%s/%s.pdf' % (chapter_directory, section['title'].replace('/', '-'))),
                    'link': section['link'],
                    'chapter': chapter_number,
                    'section': section_number,
                })

    # Add a poison pill for each consumer
    for i in range(num_consumers):
        tasks.put(None)

    # Collect results as they come in and persist progress after each one.
    while total_pages > 0:
        result = results.get()
        bar.next()
        logging.info("Task is done. file: %s" % (result['path']))
        is_faulty = 'error' in result
        is_section = 'section' in result
        chapter_number = result['chapter']
        if is_section:
            section_number = result['section']
            if is_faulty:
                index[chapter_number]['sections'][section_number]['error'] = True
            else:
                index[chapter_number]['sections'][section_number]['pdf'] = result['path']
        else:
            if is_faulty:
                index[chapter_number]['error'] = True
            else:
                index[chapter_number]['pdf'] = result['path']
        store_index(index)
        total_pages -= 1

    # Wait for all of the tasks to finish
    tasks.join()
    if errors_counter > 0:
        print('Got %d errors. Check the logs for more info.' % errors_counter)
    return
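
The `main` example above relies on a `Converter` worker class and a `store_index` helper defined elsewhere in that project. A rough sketch of how such a consumer and index writer could look, assuming each task dict is downloaded to `task['path']` and pushed back on the results queue; the `download_pdf` stand-in is hypothetical and replaces whatever page-to-PDF conversion the real project performs:

from urllib.request import urlretrieve
import multiprocessing

from yaml import safe_dump


def download_pdf(link, path):
    # Hypothetical stand-in for the real page-to-PDF conversion step.
    urlretrieve(link, path)


class Converter(multiprocessing.Process):

    def __init__(self, tasks, results):
        super(Converter, self).__init__()
        self.tasks = tasks
        self.results = results

    def run(self):
        while True:
            task = self.tasks.get()
            if task is None:
                # Poison pill: acknowledge it so tasks.join() can return, then stop.
                self.tasks.task_done()
                break
            try:
                download_pdf(task['link'], task['path'])
            except Exception:
                task['error'] = True
            self.tasks.task_done()
            self.results.put(task)


def store_index(index):
    # Persist the index after every finished page so an interrupted run can resume.
    with open('./output/index.yaml', 'w') as stream:
        safe_dump(index, stream)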