示例#1
0
def import_tasks(ctx: CLIContext, task_csv: str):
    """Import annotation tasks from a CSV into the task database.

    Rows whose ``hash`` value already belongs to a stored Task are skipped.
    Every remaining row becomes a ``Task`` and is handed to
    ``TaskService.add_tasks`` in batches, including the final partial batch.

    Args:
        ctx: CLI context object; not used in the body of this function.
        task_csv: Path of the CSV file to import. The columns read are
            ``hash``, ``News Candidates``, ``Abstract Candidates``, ``URL``,
            ``doi``, ``Summary``, ``abstract`` and ``bert_similarity``.
    """
    df = pd.read_csv(task_csv)

    from cdcrapp.services import TaskService
    from cdcrapp.model import Task

    # Number of tasks handed to the service per add_tasks() call.
    batch_size = 1000

    print(f"Found {len(df)} tasks in {task_csv}...")
    print("Adding tasks to database")

    engine = create_engine(os.getenv("SQLALCHEMY_DB_URI"))
    taskmgr = TaskService(engine)

    # Filter out rows whose hash is already present in the database.
    existing = {t.hash for t in taskmgr.list(Task)}
    not_ingested = df[~df["hash"].isin(existing)]

    batch = []
    # iterrows() yields the DataFrame's index label, which is not contiguous
    # after filtering, so batching is driven by the batch length instead.
    for _, row in tqdm(not_ingested.iterrows(), total=len(not_ingested)):
        batch.append(
            Task(hash=row['hash'],
                 news_ent=row['News Candidates'],
                 sci_ent=row['Abstract Candidates'],
                 # Only the first 255 characters of the URL are kept.
                 news_url=row['URL'][:255],
                 sci_url=row['doi'],
                 news_text=row['Summary'],
                 sci_text=row['abstract'],
                 similarity=row['bert_similarity'])
        )

        if len(batch) >= batch_size:
            taskmgr.add_tasks(batch)
            batch = []

    # Flush whatever is left over; without this the last partial batch
    # would never reach the database.
    if batch:
        taskmgr.add_tasks(batch)

    print("Import complete")
from sqlalchemy import create_engine
from sqlalchemy.orm import subqueryload
from cdcrapp.services import UserService, TaskService
from cdcrapp.model import Task, NewsArticle, SciPaper, UserTask

# %%


def get_sql_engine():
    """Build a SQLAlchemy engine from the ``SQLALCHEMY_DB_URI`` environment variable."""
    db_uri = os.environ.get("SQLALCHEMY_DB_URI")
    return create_engine(db_uri)


# Must run before get_sql_engine(), which reads SQLALCHEMY_DB_URI from the
# environment. NOTE(review): load_dotenv is presumably python-dotenv's loader
# that fills the environment from a .env file -- its import is not visible
# here, confirm.
load_dotenv()
# Engine and service objects created once at module level; both services are
# constructed with the same engine.
_engine = get_sql_engine()
_usersvc: UserService = UserService(_engine)
_tasksvc: TaskService = TaskService(_engine)

# %%
# collect sets of documents that our task should be limited to
# the 'definitive' CD^2CR corpus is from 31/7/20
sci_docs = []
news_docs = []

for seg in ['dev', 'test', 'train']:
    with open(f"mentions/31_07_20_5pc/{seg}_entities.json") as f:
        data = json.load(f)

    for ent in data:
        _, doc_type, doc_id = ent['doc_id'].split("_")

        if doc_type == 'science':
示例#3
0
 def __init__(self):
     """Create the database engine and the user/task services built on it.

     The connection URI is read from the ``SQLALCHEMY_DB_URI`` environment
     variable; both services are constructed with the same engine.
     """
     db_uri = os.getenv("SQLALCHEMY_DB_URI")
     engine = create_engine(db_uri)

     self.engine: Engine = engine
     self.usersvc: UserService = UserService(engine)
     self.tasksvc: TaskService = TaskService(engine)