def scrape():
    logger.info("starting session")

    # Create the missing DataSources.
    existing_domains = [ds[0] for ds in db.query(DataSource.domain).all()]
    for domain in sources.SOURCES.keys():
        if domain not in existing_domains:
            logger.info("creating new data source, domain = %s", domain)
            new_ds = DataSource(domain=domain)
            db.merge(new_ds)
    db.commit()

    active_ds = db.query(DataSource.id).filter(DataSource.active == True).all()
    data_sources = [ds[0] for ds in active_ds]

    loop = asyncio.get_event_loop()

    # Change the default executor so there are enough workers to handle the
    # request concurrency.
    executor = concurrent.futures.ThreadPoolExecutor(
        max_workers=settings.REQUEST_CONCURRENCY + 5
    )
    loop.set_default_executor(executor)
    loop.request_semaphore = asyncio.Semaphore(settings.REQUEST_CONCURRENCY)

    for data_source_id in data_sources:
        loop.create_task(scrape_entries(data_source_id))

    loop.run_forever()
    loop.close()

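# The scraping coroutines below fetch pages through a `get` helper that is
# not shown in this module. A minimal sketch of what it presumably looks
# like, given the semaphore and executor configured by `scrape()` above;
# this is an assumption, not the actual implementation:
import functools

import requests


@asyncio.coroutine
def get(url, headers=None):
    loop = asyncio.get_event_loop()
    # Bound the number of in-flight requests with the loop's semaphore.
    with (yield from loop.request_semaphore):
        # Run the blocking `requests` call on the default (thread pool)
        # executor set up by `scrape()`.
        response = yield from loop.run_in_executor(
            None, functools.partial(requests.get, url, headers=headers)
        )
    return response
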
def train(self, training_job_id):
    training_job = db.query(TrainingJob).get(training_job_id)
    if not training_job:
        raise Exception("TrainingJob doesn't exist")

    # Mark the job as owned by this task before starting.
    training_job.task_id = train.request.id
    embedding = training_job.embedding
    db.commit()

    def report(progress):
        self.update_state(state='PROGRESS', meta={'progress': progress})

    start_time = time.time()
    train_model(
        embedding.model,
        embedding.query,
        embedding.preprocessing,
        embedding.parameters,
        embedding.file_name,
        report
    )
    end_time = time.time()

    training_job.task_id = None
    training_job.elapsed_time = int(end_time - start_time)
    embedding.status = 'TRAINED'
    db.commit()

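# Note: `train` calls `self.update_state(...)` and reads `train.request.id`,
# which only works if it is registered as a *bound* Celery task, along these
# lines (the app object's name is an assumption):
#
#     @celery_app.task(bind=True)
#     def train(self, training_job_id):
#         ...
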
def test(self, testing_job_id):
    testing_job = db.query(TestingJob).get(testing_job_id)
    if not testing_job:
        raise Exception("TestingJob doesn't exist")

    # Update the testing job's task_id.
    testing_job.task_id = test.request.id
    embedding = testing_job.embedding
    testset = testing_job.testset
    db.commit()

    def report(progress):
        self.update_state(state='PROGRESS', meta={'progress': progress})

    # Initial progress report, so it happens before loading the model.
    report(0.0)

    start_time = time.time()
    result = evaluate(embedding, testset, report=report)
    end_time = time.time()

    result.testing_job = testing_job
    testing_job.task_id = None
    testing_job.elapsed_time = int(end_time - start_time)
    db.commit()

def fill_entries(data_source):
    """
    Creates the missing entries for a DataSource.

    Assumes the DataSource exists. Returns False if the missing ids could
    not be retrieved, True otherwise.
    """
    module = sources.SOURCES[data_source.domain]

    # TODO: Offload database queries to an executor.
    existing_ids = db.query(Entry.source_id)\
        .filter(Entry.data_source == data_source)\
        .yield_per(10000)
    existing_ids = list(map(lambda r: r[0], existing_ids))
    logger.info("%s existing ids found for '%s'",
                len(existing_ids), data_source.domain)

    try:
        # Use an executor, so we can keep the modules asyncio-agnostic.
        loop = asyncio.get_event_loop()
        future = loop.run_in_executor(
            None, module.get_missing_ids, existing_ids
        )
        missing_ids = yield from future
        missing_ids = list(missing_ids)
    except Exception:
        # The source failed to list its ids; signal the caller instead of
        # crashing the scraping loop.
        return False

    logger.info("%s entries for %s need to be created",
                len(missing_ids), data_source.domain)

    # Go around the SQLAlchemy ORM so we avoid loading over 1 million
    # entries in memory when first adding data sources. Also, add them in
    # batches of 100k.
    now = datetime.now()
    step = 100000
    for start in range(0, len(missing_ids), step):
        logger.info("adding batch #%s for %s",
                    int(start / step + 1), data_source.domain)
        end = start + step

        new_entries = []
        for missing_id in missing_ids[start:end]:
            new_entries.append({
                'outcome': 'pending',
                'source_id': missing_id,
                'added': now,
                'number_of_tries': 0,
                'data_source_id': data_source.id
            })

        db.execute(Entry.__table__.insert(), new_entries)
        db.commit()

    return True

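# Each module in `sources.SOURCES` is assumed to expose at least a
# `get_missing_ids(existing_ids)` callable returning the source ids not yet
# stored, plus the `DOCUMENT_URL`, `get_content` and `get_metadata` hooks
# used by `scrape_entry` below. A hypothetical module, for illustration only:
#
#     # sources/example_com.py
#     DOCUMENT_URL = 'http://example.com/notes/{}.html'
#
#     def get_missing_ids(existing_ids):
#         available = fetch_available_ids()  # hypothetical helper
#         return set(available) - set(existing_ids)
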
def delete_embedding(embedding_id):
    embedding = db.query(Embedding).get(embedding_id)
    if not embedding:
        abort(404)

    embedding.clean_up()
    db.delete(embedding)
    db.commit()

    return '', 204

def delete_testset(testset_id):
    testset = db.query(TestSet).get(testset_id)
    if not testset:
        abort(404)

    testset.clean_up()
    db.delete(testset)
    db.commit()

    return '', 204

def create_embedding():
    data = request.get_json(force=True)

    embedding, error = deserialize_embedding(data)
    if error:
        return jsonify(error='Bad Request', message=error), 400

    db.add(embedding)
    db.commit()

    return jsonify(data=serialize_embedding(embedding, summary=False)), 201

def update_embedding(embedding_id):
    embedding = db.query(Embedding).get(embedding_id)
    if not embedding:
        abort(404)

    data = request.get_json(force=True)
    embedding.description = data['description']
    embedding = db.merge(embedding)
    db.commit()

    return jsonify(data=serialize_embedding(embedding, summary=False))

def delete_result(embedding_id, testset_id):
    result = db.query(Result).get((embedding_id, testset_id))
    if not result:
        abort(404)

    # Delete its testing_job first.
    db.delete(result.testing_job)
    db.delete(result)
    db.commit()

    return '', 204

def update_testset(testset_id):
    testset = db.query(TestSet).get(testset_id)
    if not testset:
        abort(404)

    data = request.get_json(force=True)
    testset.name = data['name']
    testset.description = data['description']
    testset = db.merge(testset)
    db.commit()

    return jsonify(data=serialize_testset(testset, summary=False))

def create_testset():
    full_data = chain(request.form.items(), request.files.items())
    data = {k: v for k, v in full_data}

    testset, error = deserialize_testset(data)
    if error:
        return jsonify(error='Bad Request', message=error), 400

    db.add(testset)
    db.commit()

    return jsonify(data=serialize_testset(testset, summary=False)), 201

def save_entries(ids, data_source):
    now = datetime.now()

    new_entries = []
    for source_id in ids:
        new_entries.append({
            'outcome': 'pending',
            'source_id': source_id,
            'added': now,
            'number_of_tries': 0,
            'data_source_id': data_source
        })

    db.execute(Entry.__table__.insert(), new_entries)
    db.commit()

def main(first_date=FIRST_DATE):
    clarin = db.query(DataSource).filter_by(domain='clarin.com').first()
    if not clarin:
        clarin = DataSource(domain='clarin.com')
        db.merge(clarin)
        db.commit()

    day_count = (date.today() - first_date).days
    for current_day in range(day_count):
        day = first_date + timedelta(days=current_day)
        print("day: {}".format(day))

        page_number = 1
        day_ids = []
        while True:
            url = BASE_URL.format(str(day).replace('-', ''), page_number)
            response = requests.get(url)
            if not response.text:
                print("error; sleeping...")
                time.sleep(60)
                continue

            # Remove the beginning and ending parentheses wrapping the JSON
            # payload.
            page = json.loads(response.text[1:-1])
            if not page['news']:
                break

            # Get the IDs for each link on the history page.
            root = html.fromstring(page['news'])
            links = [
                el.get('href')
                for el in root.xpath('//li[@class="item"]/a[@href]')
            ]
            day_ids.extend(
                [re.sub(r'.*_(\d+)\.html', r'\1', link) for link in links]
            )

            if not page.get('moreContents'):
                break
            page_number += 1

        if day_ids:
            save_entries(day_ids, clarin.id)

def main():
    tusubtitulo = db.query(DataSource)\
        .filter_by(domain='tusubtitulo.com')\
        .first()
    if not tusubtitulo:
        tusubtitulo = DataSource(domain='tusubtitulo.com')
        db.add(tusubtitulo)
        db.commit()

    pool = mp.Pool(15)

    shows = get_show_list()
    all_seasons = pool.map(get_show_seasons, shows)

    season_tuples = []
    for show, show_seasons in zip(shows, all_seasons):
        for show_season in show_seasons:
            season_tuples.append((show, show_season))

    results = pool.map(get_season_subtitles, season_tuples)

    # Flatten the results.
    subtitle_ids = []
    for result in results:
        subtitle_ids.extend(result)

    # Filter out `None`s.
    subtitle_ids = list(filter(lambda s: s, subtitle_ids))

    existing = db.query(Entry.source_id).filter_by(data_source=tusubtitulo)
    existing = set(map(lambda r: r[0].split('@@')[1], existing))

    # We don't want repeated entries for the same episode.
    new_entries = []
    for subtitle_id in subtitle_ids:
        if subtitle_id[1] in existing:
            continue
        new_entries.append(subtitle_id)

    if new_entries:
        save_entries(new_entries, tusubtitulo.id)

def delete_testing_job(testing_job_id):
    testing_job = db.query(TestingJob).get(testing_job_id)
    if not testing_job:
        abort(404)

    # If it has any result associated, delete it.
    result = db.query(Result).get((
        testing_job.embedding_id,
        testing_job.testset_id
    ))
    if result:
        db.delete(result)

    if testing_job.task_id:
        celery_app.control.revoke(testing_job.task_id, terminate=True)

    db.delete(testing_job)
    db.commit()

    return '', 204

def create_training_job():
    embedding_id = request.get_json(force=True)['embedding_id']
    embedding = db.query(Embedding).get(embedding_id)
    if not embedding:
        abort(404)

    # Check if it has been trained already first.
    training_job = db.query(TrainingJob)\
        .filter_by(embedding_id=embedding_id).first()
    if training_job:
        message = "The embedding is already trained or being trained."
        return jsonify(error='Bad Request', message=message), 400

    embedding.status = 'TRAINING'
    training_job = TrainingJob(embedding_id=embedding_id)
    db.add(training_job)
    db.commit()

    train.delay(training_job.id)

    return jsonify(data={'training_job_id': training_job.id})

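# Example call for this endpoint (route and port are assumptions):
#
#     curl -X POST http://localhost:5000/training_jobs \
#          -H 'Content-Type: application/json' \
#          -d '{"embedding_id": 1}'
#
# On success it returns `{"data": {"training_job_id": <id>}}` and queues the
# `train` task on Celery.
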
def scrape_entry(entry_id):
    """
    Scrapes the Entry identified by `entry_id` and updates its info, also
    storing the document in Elasticsearch if successful.
    """
    entry = db.query(Entry).get(entry_id)
    module = sources.SOURCES[entry.data_source.domain]

    # Fetch the entry's content.
    # `source_id` may be composite, separating parts with `@@`.
    source_id = entry.source_id.split('@@')
    url = module.DOCUMENT_URL.format(*source_id)

    headers = settings.REQUEST_HEADERS
    source_headers = getattr(module, 'HEADERS', None)
    if source_headers:
        headers = headers.copy()
        headers.update(source_headers)

    try:
        response = yield from get(url, headers=headers)
    except Exception as e:
        # Capture all exceptions, as the `requests` library may raise
        # arbitrary exceptions; not all of them are wrapped.
        logger.info("entry_id = %s failed when requesting url; %s",
                    entry_id, repr(e))
        entry.outcome = 'failure'
        entry.last_tried = datetime.now()
        entry.number_of_tries += 1
        db.merge(entry)
        db.commit()
        return

    # TODO: Improve error handling; code may fail silently.
    try:
        content = module.get_content(response)
    except Exception as e:
        logger.info("entry_id = %s failed when getting content; %s",
                    entry_id, repr(e))
        entry.outcome = 'failure'
        entry.last_tried = datetime.now()
        entry.number_of_tries += 1
        db.merge(entry)
        db.commit()
        return

    if content['outcome'] == 'success':
        min_words = settings.MIN_WORDS_PER_DOCUMENT
        word_count = len(content['content'].split())
        if not content['content'] or word_count < min_words:
            # Parsing was marked as successful, but no (or too little)
            # content was returned; mark as unparseable instead.
            content['outcome'] = 'unparseable'

    outcome = content['outcome']
    entry.outcome = outcome
    entry.last_tried = datetime.now()
    entry.number_of_tries += 1
    db.merge(entry)

    if outcome not in ['multiple', 'success', 'more_entries']:
        # Finished already.
        logger.info("entry_id = %s finished with outcome = %s",
                    entry_id, outcome)
        db.commit()
        return

    # The `multiple` case returns a dict like this:
    # {'outcome': 'multiple', 'new_entries': [...], 'documents': [...]}
    if outcome in ['more_entries', 'multiple']:
        # Create the new entries, but only if they don't already exist.
        new_ids = content['new_entries']
        if new_ids:
            existing = db.query(Entry.source_id)\
                .filter(Entry.source_id.in_(new_ids))
            existing = set(map(lambda r: r[0], existing))
            missing = set(new_ids) - existing
            if missing:
                now = datetime.now()
                new_entries = []
                for new_id in missing:
                    new_entries.append({
                        'outcome': 'pending',
                        'source_id': new_id,
                        'added': now,
                        'number_of_tries': 0,
                        'data_source_id': entry.data_source.id,
                    })
                db.execute(Entry.__table__.insert(), new_entries)
        elif outcome == 'more_entries':
            logger.warning(
                "entry_id = %s (outcome = %s) returned no additional entries",
                entry_id, outcome
            )

    # If successful, fetch the metadata of the entry and store the documents
    # in Elasticsearch. Initialize `new_docs` up front so the final loop is
    # a no-op for the `more_entries` outcome.
    new_docs = []
    if outcome in ['multiple', 'success']:
        # `get_metadata` must return the same number of documents as
        # `get_content`.
        metadata = module.get_metadata(response)
        if isinstance(metadata, list):
            for md in metadata:
                md['url'] = response.url
        else:
            metadata['url'] = response.url

        if outcome == 'success':
            results = [content]
            metadatas = [metadata]
        else:
            results = content['documents']
            metadatas = metadata

        for content, metadata in zip(results, metadatas):
            min_words = settings.MIN_WORDS_PER_DOCUMENT
            word_count = len(content['content'].split())
            if not content['content'] or word_count < min_words:
                continue
            doc_id, doc = prepare_document(content, metadata, entry)
            new_docs.append((doc_id, doc))

    logger.info("entry_id = %s finished with outcome = %s",
                entry_id, outcome)
    db.commit()

    # Finally, store the documents on Elasticsearch too.
    for doc_id, doc in new_docs:
        es.index(
            index=settings.ES_INDEX,
            doc_type=settings.ES_DOCTYPE,
            id=doc_id,
            body=doc
        )

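# Two conventions worth noting here:
# * Composite `source_id`s join their parts with '@@'; a hypothetical
#   'show@@12345' expands into `DOCUMENT_URL.format('show', '12345')`.
# * `es.index(..., doc_type=...)` assumes an Elasticsearch version that
#   still supports mapping types (they were removed in Elasticsearch 7).
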
def create_testing_job():
    data = request.get_json(force=True)
    if 'embedding_id' not in data or 'testset_id' not in data:
        abort(400)

    embedding_id = data['embedding_id']
    testset_id = data['testset_id']
    if not (isinstance(embedding_id, int) or isinstance(testset_id, int)):
        return jsonify({
            'message': "At least one of the IDs must be an integer.",
            'error': 'Bad Request'
        }), 400

    # Build a list of embeddings and testsets to test.
    embeddings = []
    testsets = []
    if isinstance(embedding_id, int):
        embedding = db.query(Embedding).get(embedding_id)
        if not embedding:
            abort(404)
        embeddings.append(embedding)

        if isinstance(testset_id, int):
            testset = db.query(TestSet).get(testset_id)
            testsets.append(testset)
        elif testset_id == 'full':
            testsets.extend(db.query(TestSet).all())
        elif testset_id == 'missing':
            existing = db.query(TestSet.id).join(Result).join(Embedding)\
                .filter(Embedding.id == embedding_id)
            query = db.query(TestSet).filter(~TestSet.id.in_(existing))
            testsets.extend(query.all())
    elif isinstance(testset_id, int):
        testset = db.query(TestSet).get(testset_id)
        if not testset:
            abort(404)
        testsets.append(testset)

        if isinstance(embedding_id, int):
            embedding = db.query(Embedding).get(embedding_id)
            embeddings.append(embedding)
        elif embedding_id == 'full':
            embeddings.extend(db.query(Embedding).all())
        elif embedding_id == 'missing':
            existing = db.query(Embedding.id).join(Result).join(TestSet)\
                .filter(TestSet.id == testset_id)
            query = db.query(Embedding).filter(~Embedding.id.in_(existing))
            embeddings.extend(query.all())

    # Make sure there are no Nones (i.e. all the models exist).
    if any(emb is None for emb in embeddings):
        abort(404)
    if any(ts is None for ts in testsets):
        abort(404)

    # Make sure the embeddings are trained already.
    embeddings = [emb for emb in embeddings if emb.status == 'TRAINED']

    # For each pair <embedding, testset>, create the necessary TestingJob,
    # deleting it first if it already exists. Also delete associated results.
    jobs = []
    for embedding in embeddings:
        for testset in testsets:
            job = db.query(TestingJob)\
                .filter_by(embedding=embedding, testset=testset)\
                .first()
            if job and job.status in ['PENDING', 'PROGRESS']:
                # Only overwrite TestingJobs that have already run. If it's
                # still pending or running right now, we want to keep it.
                continue
            elif job:
                for result in job.results.all():
                    db.delete(result)
                db.delete(job)

            job = TestingJob(testset=testset, embedding=embedding)
            jobs.append(job)
            db.add(job)
    db.commit()

    for job in jobs:
        test.delay(job.id)

    return jsonify(data={'testing_job_id': [job.id for job in jobs]})

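# Payload shapes this endpoint accepts: at least one of the two values must
# be an integer id, and the other may be an id, 'full' (all of them), or
# 'missing' (only combinations without a Result yet). Example values are
# illustrative:
#
#     {"embedding_id": 3, "testset_id": 7}
#     {"embedding_id": 3, "testset_id": "full"}
#     {"embedding_id": "missing", "testset_id": 7}
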