def import_tasks(ctx: CLIContext, task_csv: str):
    """Import annotation tasks from a CSV.

    Rows whose ``hash`` already matches a stored task are skipped. Every
    remaining row becomes a ``Task`` and is written through
    ``TaskService.add_tasks`` in batches.

    Args:
        ctx: CLI context (not used by this function).
        task_csv: Path of the CSV file to import. The columns read are
            ``hash``, ``News Candidates``, ``Abstract Candidates``, ``URL``,
            ``doi``, ``Summary``, ``abstract`` and ``bert_similarity``.
    """
    df = pd.read_csv(task_csv)

    from cdcrapp.services import TaskService
    from cdcrapp.model import Task

    # Number of tasks handed to the service per add_tasks() call.
    batch_size = 1000

    print(f"Found {len(df)} tasks in {task_csv}...")
    print("Adding tasks to database")

    engine = create_engine(os.getenv("SQLALCHEMY_DB_URI"))
    taskmgr = TaskService(engine)

    # filter existing tasks
    existing = {t.hash for t in taskmgr.list(Task)}
    not_ingested = df[~df["hash"].isin(existing)]

    batch = []
    # iterrows() yields the DataFrame index label, which has gaps after the
    # filter above, so batching is driven by the batch length instead.
    for _, row in tqdm(not_ingested.iterrows(), total=len(not_ingested)):
        batch.append(
            Task(
                hash=row['hash'],
                news_ent=row['News Candidates'],
                sci_ent=row['Abstract Candidates'],
                # URL is cut to 255 characters before being stored.
                news_url=row['URL'][:255],
                sci_url=row['doi'],
                news_text=row['Summary'],
                sci_text=row['abstract'],
                similarity=row['bert_similarity'],
            )
        )

        if len(batch) >= batch_size:
            taskmgr.add_tasks(batch)
            batch = []

    # Flush the final partial batch; without this the trailing rows are lost.
    if batch:
        taskmgr.add_tasks(batch)

    print("Import complete")
from sqlalchemy import create_engine from sqlalchemy.orm import subqueryload from cdcrapp.services import UserService, TaskService from cdcrapp.model import Task, NewsArticle, SciPaper, UserTask # %% def get_sql_engine(): return create_engine(os.getenv("SQLALCHEMY_DB_URI")) load_dotenv() _engine = get_sql_engine() _usersvc: UserService = UserService(_engine) _tasksvc: TaskService = TaskService(_engine) # %% # collect sets of documents that our task should be limited to # the 'definitive' CD^2CR corpus is from 31/7/20 sci_docs = [] news_docs = [] for seg in ['dev', 'test', 'train']: with open(f"mentions/31_07_20_5pc/{seg}_entities.json") as f: data = json.load(f) for ent in data: _, doc_type, doc_id = ent['doc_id'].split("_") if doc_type == 'science':
def __init__(self):
    """Create the SQLAlchemy engine and the user/task services built on it."""
    # The connection string is read from the SQLALCHEMY_DB_URI environment
    # variable.
    db_uri = os.getenv("SQLALCHEMY_DB_URI")
    self.engine: Engine = create_engine(db_uri)

    # Both services are constructed with the same engine instance.
    self.usersvc: UserService = UserService(self.engine)
    self.tasksvc: TaskService = TaskService(self.engine)