def get_tr_dicts_and_ids():
    """Gather CORD19 metadata for the text refs stored in the DB.

    The metadata file is downloaded first if it is not already in the data
    directory. The text ref objects corresponding to the CORD19 articles
    are then fetched from the DB and matched against the metadata entries
    by ``cord19_metadata_for_trs``.

    Returns
    -------
    tuple
        The two values produced by ``cord19_metadata_for_trs``, in the
        same order: ``(tr_dicts, multiple_tr_ids)``.
    """
    # Must happen before the metadata is read below.
    download_metadata()

    cord19_text_refs = get_unique_text_refs()
    metadata_entries = get_metadata_dict()

    # Unpack explicitly so an unexpected return shape fails right here.
    dicts_for_trs, ids_with_multiple_trs = cord19_metadata_for_trs(
        cord19_text_refs, metadata_entries
    )
    return (dicts_for_trs, ids_with_multiple_trs)
def get_cord_info():
    """Return the module-level DOI and PMID lookup tables for CORD19.

    The tables are filled from ``get_metadata_dict()`` whenever at least
    one of them is still empty; otherwise the existing tables are returned
    untouched. DOIs are upper-cased before being used as keys, PubMed IDs
    are used as-is, and entries lacking a given identifier are left out of
    the corresponding table.

    Returns
    -------
    tuple
        ``(_cord_by_doi, _cord_by_pmid)``, each mapping an identifier to
        its metadata entry.
    """
    global _cord_by_doi, _cord_by_pmid

    # Both tables already hold data: hand back the cached objects.
    if _cord_by_doi and _cord_by_pmid:
        return (_cord_by_doi, _cord_by_pmid)

    for entry in get_metadata_dict():
        doi = entry.get('doi')
        if doi:
            _cord_by_doi[doi.upper()] = entry
        pubmed_id = entry.get('pubmed_id')
        if pubmed_id:
            _cord_by_pmid[pubmed_id] = entry

    return (_cord_by_doi, _cord_by_pmid)
import re import csv from covid_19.preprocess import get_metadata_dict, get_zip_texts_for_entry, \ get_metadata_df, get_all_texts from indra_db.util import get_db covid_docs_file = '../covid_docs_ranked_corona.csv' covid_pmids = set() with open(covid_docs_file, 'rt') as f: csv_reader = csv.reader(f, delimiter=',') for row in csv_reader: pmid = row[4] covid_pmids.add(pmid) md = get_metadata_dict() aa_reg = '[ACDEFGHIKLMNPQRSTVWY]' mut_reg = '\s+' + aa_reg + '\d+' + aa_reg + '\s+' print(mut_reg) aa_short = [ 'ala', 'arg', 'asn', 'asp', 'cys', 'gln', 'glu', 'gly', 'his', 'ile', 'leu', 'lys', 'met', 'phe', 'pro', 'ser', 'thr', 'trp', 'tyr', 'val' ] aa_short_reg = '|'.join([aa for aa in aa_short]) aa_seq_reg = '(?:%s)\d{2,5}' % aa_short_reg print(aa_seq_reg) ignore_list = ( 'Y2H', # Yeast two-hybrid 'C3H', # Mouse strain