def biomarkerMediumRelations(filename):
    """Build biomarker/medium candidate relations for one document.

    The document is parsed with ``doc_parser.parseDoc`` on every call. An
    on-disk pickle cache of the parsed sentences (under
    ``cache/<filename>/sentences.pkl``) was sketched here at one point but
    is not active, so no cache path is computed any more.

    Args:
        filename: Passed unchanged to ``doc_parser.parseDoc``.

    Returns:
        The ``Relations`` object built from the parsed sentences, a
        biomarker extractor (``Ngrams(n_max=1)``) and a medium extractor
        (``Ngrams(n_max=3)``).
    """
    # Processing the input data and converting to sentences.
    sentences = doc_parser.parseDoc(filename)

    biomarker_ngrams = Ngrams(n_max=1)
    medium_ngrams = Ngrams(n_max=3)

    # The two matchers are obtained from the separate ``matchers`` helper.
    biomarker_matcher = matchers.getBiomarkerMatcher()
    medium_matcher = matchers.getMediumMatcher()

    biomarker_extractor = CandidateExtractor(biomarker_ngrams, biomarker_matcher)
    medium_extractor = CandidateExtractor(medium_ngrams, medium_matcher)

    # Create the relations using the two extractors.
    return Relations(sentences, biomarker_extractor, medium_extractor)
def biomarkerDrugRelations(filename):
    """Build biomarker/drug-association candidate relations for one document.

    The document is parsed with ``doc_parser.parseDoc`` on every call;
    serializing the parsed sentences to a cache directory was sketched here
    but is not enabled.

    Args:
        filename: Passed unchanged to ``doc_parser.parseDoc``.

    Returns:
        The ``Relations`` object built from the parsed sentences, a
        biomarker extractor (``Ngrams(n_max=1)``) and a drug-association
        extractor (``Ngrams(n_max=5)``).
    """
    # Processing the input data and converting to sentences.
    parsed_sentences = doc_parser.parseDoc(filename)

    bm_space = Ngrams(n_max=1)
    drug_space = Ngrams(n_max=5)

    bm_matcher = matchers.getBiomarkerMatcher()
    drug_matcher = matchers.getDrugAssociationMatcher()

    bm_extractor = CandidateExtractor(bm_space, bm_matcher)
    drug_extractor = CandidateExtractor(drug_space, drug_matcher)

    return Relations(parsed_sentences, bm_extractor, drug_extractor)
def getBiomarkerTestsetRelations(sentences, session):
    """Build biomarker/test-set candidate relations from parsed sentences.

    Args:
        sentences: Already-parsed sentences, handed straight to ``Relations``.
        session: Accepted for signature parity with the sibling helpers;
            this function does not use it.

    Returns:
        The ``Relations`` object built from ``sentences``, a biomarker
        extractor (``Ngrams(n_max=1)``) and a test-set extractor
        (``Ngrams(n_max=10)``).
    """
    bm_space = Ngrams(n_max=1)
    testset_space = Ngrams(n_max=10)

    # Retrieve matchers.
    bm_matcher = matchers.getBiomarkerMatcher()
    testset_matcher = matchers.getTestSetMatcher()

    # ``Disease`` is passed as the first argument to both extractors
    # (presumably the candidate class -- confirm against CandidateExtractor).
    bm_extractor = CandidateExtractor(Disease, bm_space, bm_matcher)
    testset_extractor = CandidateExtractor(Disease, testset_space, testset_matcher)

    # Create the Relations object.
    return Relations(sentences, bm_extractor, testset_extractor)
def getBiomarkerLevelsRelations(sentences, session):
    """Build biomarker/levels candidate relations from parsed sentences.

    Args:
        sentences: Already-parsed sentences, handed straight to ``Relations``.
        session: Accepted for signature parity with the sibling helpers;
            this function does not use it.

    Returns:
        The ``Relations`` object built from ``sentences``, a biomarker
        extractor (``Ngrams(n_max=1)``) and a levels extractor
        (``Ngrams(n_max=15)``).
    """
    bm_space = Ngrams(n_max=1)
    levels_space = Ngrams(n_max=15)

    # Retrieve matchers.
    bm_matcher = matchers.getBiomarkerMatcher()
    levels_matcher = matchers.getLevelsMatcher()

    # ``Disease`` is passed as the first argument to both extractors
    # (presumably the candidate class -- confirm against CandidateExtractor).
    bm_extractor = CandidateExtractor(Disease, bm_space, bm_matcher)
    levels_extractor = CandidateExtractor(Disease, levels_space, levels_matcher)

    # Create the Relations object.
    return Relations(sentences, bm_extractor, levels_extractor)
def getBiomarkerUnitsRelations(sentences, session):
    """Build biomarker/units candidate relations from parsed sentences.

    Args:
        sentences: Already-parsed sentences, handed straight to ``Relations``.
        session: Accepted for signature parity with the sibling helpers;
            this function does not use it.

    Returns:
        The ``Relations`` object built from ``sentences``, a biomarker
        extractor (``Ngrams(n_max=1)``) and a units extractor
        (``Ngrams(n_max=10)``).
    """
    biomarker_ngrams = Ngrams(n_max=1)
    unit_ngrams = Ngrams(n_max=10)

    # Retrieve matchers.
    biomarker_matcher = matchers.getBiomarkerMatcher()
    units_matcher = matchers.getUnitsMatcher()

    # ``Disease`` is passed as the first argument to both extractors
    # (presumably the candidate class -- confirm against CandidateExtractor).
    biomarker_extractor = CandidateExtractor(
        Disease, biomarker_ngrams, biomarker_matcher)
    units_extractor = CandidateExtractor(Disease, unit_ngrams, units_matcher)

    # Create the Relations object. This previously called ``Relation`` (no
    # trailing "s"), unlike every other helper in this file.
    return Relations(sentences, biomarker_extractor, units_extractor)
def biomarkerMeasurementRelations(sentences):
    """Build biomarker/measurement-type candidate relations.

    Args:
        sentences: Already-parsed sentences, handed straight to ``Relations``.

    Returns:
        The ``Relations`` object built from ``sentences``, a biomarker
        extractor (``Ngrams(n_max=1)``) and a measurement-type extractor
        (``Ngrams(n_max=5)``).
    """
    bm_space = Ngrams(n_max=1)
    measurement_space = Ngrams(n_max=5)

    # Retrieve matchers.
    bm_matcher = matchers.getBiomarkerMatcher()
    measurement_matcher = matchers.getMeasurementTypeMatcher()

    bm_extractor = CandidateExtractor(bm_space, bm_matcher)
    measurement_extractor = CandidateExtractor(measurement_space, measurement_matcher)

    # Create the Relations object.
    return Relations(sentences, bm_extractor, measurement_extractor)
def getBiomarkerDrugRelations(filename, session):
    """Extract biomarker/drug-association candidates from one document.

    Args:
        filename: Passed to ``doc_parser.parseDoc`` together with ``session``.
        session: Passed to ``doc_parser.parseDoc`` and to the extractor's
            ``extract`` call.

    Returns:
        A 3-tuple ``(candidates, sentences, Disease)``: the result of
        ``CandidateExtractor.extract``, the parsed sentences, and the
        ``Disease`` name that was given to the extractor.
    """
    # Processing the input data and converting to sentences.
    sentences = doc_parser.parseDoc(filename, session)

    biomarker_ngrams = Ngrams(n_max=1)
    drug_association_ngrams = Ngrams(n_max=5)

    biomarker_matcher = matchers.getBiomarkerMatcher()
    drug_matcher = matchers.getDrugAssociationMatcher()

    extractor = CandidateExtractor(
        Disease,
        [biomarker_ngrams, drug_association_ngrams],
        [biomarker_matcher, drug_matcher],
    )

    # Possible pairs/relations (candidates).
    candidates = extractor.extract(
        sentences, 'BDA Development Candidates', session)

    # Single-argument print call: same output as the old Python 2 print
    # statement, but also valid syntax on Python 3.
    print("Number of candidates: %d" % len(candidates))
    return candidates, sentences, Disease
def getBiomarkerMediumRelations(filename, session):
    """Build biomarker/medium candidate relations for one document.

    Args:
        filename: Passed to ``doc_parser.parseDoc`` together with ``session``.
        session: Passed to ``doc_parser.parseDoc``; not used otherwise.

    Returns:
        The ``Relations`` object built from the parsed sentences, a
        biomarker extractor (``Ngrams(n_max=1)``) and a medium extractor
        (``Ngrams(n_max=3)``).
    """
    # Processing the input data and converting to sentences.
    parsed_sentences = doc_parser.parseDoc(filename, session)

    bm_space = Ngrams(n_max=1)
    medium_space = Ngrams(n_max=3)

    # The two matchers are obtained from the separate ``matchers`` helper.
    bm_matcher = matchers.getBiomarkerMatcher()
    medium_matcher = matchers.getMediumMatcher()

    # ``Disease`` is passed as the first argument to both extractors
    # (presumably the candidate class -- confirm against CandidateExtractor).
    bm_extractor = CandidateExtractor(Disease, bm_space, bm_matcher)
    medium_extractor = CandidateExtractor(Disease, medium_space, medium_matcher)

    # Create the relations using the two extractors.
    return Relations(parsed_sentences, bm_extractor, medium_extractor)
def getBiomarkerTypeRelations(filename, session):
    """Build biomarker/biomarker-type candidate relations for one document.

    Args:
        filename: Passed to ``doc_parser.parseDoc`` together with ``session``.
        session: Passed to ``doc_parser.parseDoc``; not used otherwise.

    Returns:
        The ``Relations`` object built from the parsed sentences, a
        biomarker extractor (``Ngrams(n_max=1)``) and a biomarker-type
        extractor (``Ngrams(n_max=2)``).
    """
    # Processing the input data and converting to sentences.
    parsed_sentences = doc_parser.parseDoc(filename, session)

    bm_space = Ngrams(n_max=1)
    type_space = Ngrams(n_max=2)

    bm_matcher = matchers.getBiomarkerMatcher()
    type_matcher = matchers.getBiomarkerTypeMatcher()

    # ``Disease`` is passed as the first argument to both extractors
    # (presumably the candidate class -- confirm against CandidateExtractor).
    bm_extractor = CandidateExtractor(Disease, bm_space, bm_matcher)
    type_extractor = CandidateExtractor(Disease, type_space, type_matcher)

    # Combine the sentences and both extractors into the Relations object.
    return Relations(parsed_sentences, bm_extractor, type_extractor)
def getBiomarkerDiseaseRelations(session):
    """Extract biomarker/disease candidates from the 'Training' corpus.

    Looks up the ``Corpus`` named ``'Training'`` through ``session``, gathers
    every sentence of every document in it, runs a two-slot
    ``CandidateExtractor`` (biomarker, disease) over those sentences, and
    adds the result to ``session`` before committing.

    NOTE: only the 'Training' corpus is processed. Repeating the extraction
    for the 'BD Development' and 'BD Test' corpora was sketched here but is
    not enabled.

    Args:
        session: Object used to query ``Corpus``, passed to ``extract``, and
            used to ``add``/``commit`` the candidates (presumably a
            SQLAlchemy session -- confirm; if so, ``.one()`` raises when the
            corpus lookup does not match exactly one row).

    Returns:
        A 4-tuple ``(candidates, sentences, Disease, session)``.
    """
    corpus = session.query(Corpus).filter(Corpus.name == 'Training').one()

    # Flatten the corpus into the set of all its sentences.
    sentences = {
        sentence
        for document in corpus
        for sentence in document.sentences
    }

    biomarker_ngrams = Ngrams(n_max=1)
    disease_ngrams = Ngrams(n_max=5)

    # The two matchers are obtained from the separate ``matchers`` helper.
    biomarker_matcher = matchers.getBiomarkerMatcher()
    disease_matcher = matchers.getDiseaseMatcher()

    # Building the CandidateExtractor: combine candidate class, child
    # context spaces, and matchers into the extractor.
    extractor = CandidateExtractor(
        Disease,
        [biomarker_ngrams, disease_ngrams],
        [biomarker_matcher, disease_matcher],
    )

    # Running the CandidateExtractor: retrieve the candidate set / relations.
    candidates = extractor.extract(sentences, 'BD Training Candidates', session)

    # Single-argument print call: same output as the old Python 2 print
    # statement, but also valid syntax on Python 3.
    print("Number of candidates: %d" % len(candidates))

    session.add(candidates)
    session.commit()

    return candidates, sentences, Disease, session
def biomarkerTypeRelations(filename):
    """Build biomarker/biomarker-type candidate relations for one document.

    The document is parsed with ``doc_parser.parseDoc`` on every call; an
    on-disk pickle cache of the parsed sentences was sketched here but is
    not enabled.

    Args:
        filename: Passed unchanged to ``doc_parser.parseDoc``.

    Returns:
        The ``Relations`` object built from the parsed sentences, a
        biomarker extractor (``Ngrams(n_max=1)``) and a biomarker-type
        extractor (``Ngrams(n_max=2)``).
    """
    # Processing the input data and converting to sentences.
    parsed_sentences = doc_parser.parseDoc(filename)

    bm_space = Ngrams(n_max=1)
    type_space = Ngrams(n_max=2)

    bm_matcher = matchers.getBiomarkerMatcher()
    type_matcher = matchers.getBiomarkerTypeMatcher()

    bm_extractor = CandidateExtractor(bm_space, bm_matcher)
    type_extractor = CandidateExtractor(type_space, type_matcher)

    return Relations(parsed_sentences, bm_extractor, type_extractor)