def fill_missing_values(session: Session, only_kegg: bool, batch_size: int, error_log: str) -> None:
    """
    Fill in missing mass and/or atom-bag data derived from InChI strings.

    Compounds that have an InChI but lack a mass or an atom bag are loaded
    in batches; each batch gets molecular masses computed and a per-compound
    atom bag parsed from its InChI, then the rows are bulk-updated.

    Parameters
    ----------
    session : sqlalchemy.orm.session.Session
        An active session in order to communicate with a SQL database.
    only_kegg : bool
        Calculate thermodynamic information for compounds contained in
        KEGG only.
    batch_size : int
        The size of batches of compounds considered at a time.
    error_log : str
        The base file path for error output.
    """
    query = session.query(Compound.id, Compound.mnx_id, Compound.inchi)
    if only_kegg:
        # Filter compounds in KEGG or COCO (additional compounds for
        # component-contribution).
        query = (
            query.join(CompoundIdentifier)
            .join(Registry)
            .filter(Registry.namespace.in_(("kegg", "coco")))
            .group_by(Compound.id)
        )
    # Only rows that have an InChI but are missing mass or atom bag.
    query = query.filter(
        Compound.inchi.isnot(None),
        or_(Compound.mass.is_(None), Compound.atom_bag.is_(None)),
    )
    logger.debug("calculating mass for compounds with missing values")
    input_df = pd.read_sql_query(query.statement, query.session.bind)
    with tqdm(total=len(input_df), desc="Analyzed") as pbar:
        for index in range(0, len(input_df), batch_size):
            batch = input_df.iloc[index:index + batch_size, :]
            try:
                # Annotate the batch with computed molecular masses; batch
                # errors are written to a per-batch error log file.
                batch = get_molecular_masses(batch, f"{error_log}_batch_{index}")
                updates = []
                for record in batch.itertuples(index=False):
                    try:
                        bag = get_atom_bag("inchi", record.inchi)
                    except OSError as error:
                        # A failed parse degrades gracefully to an empty bag.
                        logger.warning(str(error))
                        bag = {}
                    updates.append(
                        {"id": record.id, "mass": record.mass, "atom_bag": bag}
                    )
                session.bulk_update_mappings(Compound, updates)
                # Commit per batch so progress survives a later failure.
                session.commit()
            except ValueError as error:
                # Skip the whole batch on a mass-calculation failure.
                logger.warning(str(error))
            pbar.update(len(batch))
def update_headlines(session: Session, user_dict: Path, logger: Logger) -> None:
    """
    Process all not-yet-classified headlines and persist the results.

    Headlines whose ``is_used`` flag is still NULL are simplified and then
    either marked unused (not interesting, or not about the domestic index)
    or tokenized, price-tagged, and marked used.

    Parameters
    ----------
    session : Session
        An active SQLAlchemy session used to read and bulk-update headlines.
    user_dict : Path
        Path to the user dictionary passed to the tokenizer.
    logger : Logger
        Logger for progress messages.
    """
    # `.all()` already returns a list; no extra copy is needed.
    headlines = session \
        .query(Headline) \
        .filter(Headline.is_used.is_(None)) \
        .all()
    if not headlines:
        return
    tokenizer = Tokenizer(str(user_dict))
    mappings = []
    logger.info('start updating headlines')
    for headline in tqdm(headlines):
        h = simplify_headline(headline.headline)
        is_about_di = headline.categories is not None and \
            DOMESTIC_INDEX in headline.categories
        # We stopped using `is_template` because the size of the dataset
        # decreased and the result got worse.
        # if is_template(h) or not is_interesting(h) or not is_about_di:
        if not is_interesting(h) or not is_about_di:
            mappings.append({
                'article_id': headline.article_id,
                'is_used': False
            })
            continue
        # Normalize kanji numerals to digits before tagging prices.
        tokens = kansuuzi2number(
            [token.surface for token in tokenizer.tokenize(h)])
        tag_tokens = replace_prices_with_tags(tokens)
        mappings.append({
            'article_id': headline.article_id,
            'simple_headline': h,
            'tokens': tokens,
            'tag_tokens': tag_tokens,
            'is_used': True,
        })
    session.bulk_update_mappings(Headline, mappings)
    session.commit()
    logger.info('end updating headlines')