def generate_abstracts(self, list_of_pmids):
    """
    Generate a dataset of abstracts annotated with mutation mentions.

    Downloads tmVar mutation annotations for each PubMed id via the tmTools
    RESTful interface and converts them into a nalaf Dataset. Responses are
    cached in ``cache.json`` so repeated calls do not re-download.
    Source: "http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/"

    :param list_of_pmids: iterable of PubMed id strings
    :return nalaf.structures.Dataset: dataset with one 'abstract' part per pmid
    """
    url_tmvar = 'http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/RESTful/tmTool.cgi/Mutation/{0}/JSON/'

    # load previously cached tmVar responses if available
    if os.path.exists('cache.json'):
        with open('cache.json', 'r', encoding='utf-8') as f:
            tm_var = json.load(f)
    else:
        tm_var = {}

    for pmid in list_of_pmids:
        if pmid not in tm_var:  # only fetch pmids not already cached
            req = requests.get(url_tmvar.format(pmid))
            try:
                tm_var[pmid] = req.json()
            except ValueError:
                # tmTools occasionally returns a non-JSON body (e.g. an
                # error page); skip this pmid rather than abort the batch
                pass

    # cache the tmVar annotations so we don't pull them every time
    with open('cache.json', 'w', encoding='utf-8') as file:
        json.dump(tm_var, file, indent=4)

    dataset = Dataset()
    for doc_id in list_of_pmids:
        if doc_id in tm_var:
            doc = Document()
            text = tm_var[doc_id]['text']
            part = Part(text)
            annotations = []
            # .get(): a response may have no 'denotations' key at all when
            # tmVar found no mutation mentions in the abstract
            for deno in tm_var[doc_id].get('denotations', []):
                ann = Entity(
                    class_id=self.mut_class_id,
                    offset=int(deno['span']['begin']),
                    text=text[deno['span']['begin']:deno['span']['end']])
                annotations.append(ann)
            # note: should the annotations from tmVar go to
            # predicted_annotations or annotations?
            part.annotations = annotations
            doc.parts['abstract'] = part
            dataset.documents[doc_id] = doc
    return dataset
def setUpClass(cls):
    """Build sample dataset1: gold annotations vs. predicted annotations."""
    cls.dataset1 = Dataset()
    doc_1 = Document()
    text = '.... aaaa .... bbbb .... cccc .... dddd .... eeee .... ffff .... gggg .... hhhh .... jjjj'
    part_1 = Part(text)
    cls.dataset1.documents['doc_1'] = doc_1
    doc_1.parts['part_1'] = part_1

    def make_entity(class_id, offset, ann_text, subclass):
        # helper: construct an Entity and tag its subclass in one step
        entity = Entity(class_id, offset, ann_text)
        entity.subclass = subclass
        return entity

    exact_1 = make_entity(STUB_E_ID_1, 5, 'aaaa', 1)
    exact_2 = make_entity(STUB_E_ID_1, 55, 'ffff', 2)
    exact_3 = make_entity(STUB_E_ID_1, 75, 'hhhh', 2)
    overlap_1_1 = make_entity(STUB_E_ID_1, 25, 'cccc', 1)
    overlap_1_2 = make_entity(STUB_E_ID_1, 26, 'cc', 1)
    overlap_2_1 = make_entity(STUB_E_ID_1, 32, '.. ddd', 2)
    overlap_2_2 = make_entity(STUB_E_ID_1, 36, 'ddd ...', 2)
    overlap_3_1 = make_entity(STUB_E_ID_1, 65, 'gggg', 1)
    overlap_3_2 = make_entity(STUB_E_ID_1, 62, '.. gggg ..', 2)
    missing_1 = make_entity('e2', 45, 'eeee', 1)
    missing_2 = make_entity('e2', 84, 'jjjj', 1)
    spurious = make_entity('e2', 15, 'bbbb', 1)

    # gold-standard annotations
    part_1.annotations = [
        exact_1, exact_2, exact_3,
        overlap_1_1, overlap_2_1, overlap_3_1,
        missing_1, missing_2,
    ]
    # system predictions: exact matches, overlapping variants, one spurious hit
    part_1.predicted_annotations = [
        exact_1, exact_2, exact_3,
        overlap_1_2, overlap_2_2, overlap_3_2,
        spurious,
    ]