def parse():
    """Flask view: parse posted text with the EDG rules from the form.

    On POST, reads the text and the three rule phases from the form, runs the
    Bllip/EDG parser, and renders brat visualization JSON into the template.
    On GET, renders the empty form.
    """
    if request.method != 'POST':
        return render_template('index_edg.html')

    text = request.form['text']
    doc_id = '99999999'
    # Debug output. NOTE: this was a Python-2 `print text` statement, which is
    # a SyntaxError under Python 3; the rest of the file uses print().
    print(text)

    # One rule list per phase; rules are newline-separated in the textareas.
    rules0 = request.form['rules0']
    rule0_lines = rules0.split("\n")
    rules1 = request.form['rules1']
    rule1_lines = rules1.split("\n")
    rules2 = request.form['rules2']
    rule2_lines = rules2.split("\n")

    param_helper = ParamHelper(text, doc_id, rule0_lines, rule1_lines, rule2_lines)
    raw_doc = document_pb2.Document()
    edg_rules = edgRules_pb2.EdgRules()
    param_helper.setDocProtoAttributes(raw_doc)
    param_helper.setRuleProtoAttributes(edg_rules)

    parse_bllip = parse_using_bllip(raw_doc, edg_rules)
    brat_bllip = json.dumps(get_brat_data(parse_bllip))
    brat_bllip_added = json.dumps(get_brat_data_added(parse_bllip))
    return render_template('index_edg.html',
                           text=text,
                           rules0=rules0, rules1=rules1, rules2=rules2,
                           brat_string_bllip=brat_bllip,
                           brat_string_bllip_added=brat_bllip_added)
def parse():
    """Flask view: full parses by default, or sentence splitting when the
    'split' option is checked in the form."""
    if request.method != 'POST':
        return render_template('index.html')

    text = request.form['text']
    split_options = request.form.getlist('split')

    doc = document_pb2.Document()
    doc.text = text

    bllip_brat = ''
    stanford_brat = ''
    split_brat = ''
    cst_json = ''

    if not split_options:
        # Full parses with both parsers, rendered as brat JSON.
        bllip_doc = parse_using_bllip(doc)
        stanford_doc = parse_using_stanford(doc)
        bllip_brat = json.dumps(get_brat_data(bllip_doc))
        stanford_brat = json.dumps(get_brat_data(stanford_doc))
        # Constituency parse string keyed by sentence index.
        cst_json = json.dumps({s.index: s.parse for s in bllip_doc.sentence})
    else:
        # Only split the text into sentences.
        split_doc = split_sentence_using_stanford(doc)
        split_brat = json.dumps(get_brat_data(split_doc))

    return render_template('index.html',
                           text=text,
                           parse_bllip=bllip_brat,
                           parse_stanford=stanford_brat,
                           split_stanford=split_brat,
                           bllip_cst_parses=cst_json)
def run(text):
    """Wrap *text* in a Document and return its Bllip parse result."""
    document = document_pb2.Document()
    document.doc_id = '26815768'
    document.text = text
    return parse_using_bllip(document)
def run():
    """Send 100 copies of a sample abstract to the parse service over gRPC
    and print progress as responses arrive."""
    text = (u'MicroRNAs (miRNAs) are small non-coding RNAs of ∼19-24 '
            'nucleotides (nt) in length and considered as potent '
            'regulators of gene expression at transcriptional and '
            'post-transcriptional levels. Here we report the identification '
            'and characterization of 15 conserved miRNAs belonging to 13 '
            'families from Rauvolfia serpentina through in silico analysis '
            'of available nucleotide dataset. The identified mature R. '
            'serpentina miRNAs (rse-miRNAs) ranged between 20 and 22nt in '
            'length, and the average minimal folding free energy index (MFEI) '
            'value of rse-miRNA precursor sequences was found to be '
            '-0.815kcal/mol. Using the identified rse-miRNAs as query, their '
            'potential targets were predicted in R. serpentina and other plant '
            'species. Gene Ontology (GO) annotation showed that predicted '
            'targets of rse-miRNAs include transcription factors as well as '
            'genes involved in diverse biological processes such as primary '
            'and secondary metabolism, stress response, disease resistance, '
            'growth, and development. Few rse-miRNAs were predicted to target '
            'genes of pharmaceutically important secondary metabolic pathways '
            'such as alkaloids and anthocyanin biosynthesis. Phylogenetic '
            'analysis showed the evolutionary relationship of rse-miRNAs and '
            'their precursor sequences to homologous pre-miRNA sequences from '
            'other plant species. The findings under present study besides giving '
            'first hand information about R. serpentina miRNAs and their targets, '
            'also contributes towards the better understanding of miRNA-mediated '
            'gene regulatory processes in plants.')

    sample_doc = document_pb2.Document()
    sample_doc.doc_id = '26815768'
    sample_doc.text = text
    docs = [sample_doc] * 100

    # Batch the documents into PARSE_BLLIP requests, 5 documents per request.
    parse_requests = request_iter_docs(docs, request_size=5,
                                       request_type=rpc_pb2.Request.PARSE_BLLIP)

    # Fan the requests out in parallel and consume responses as they arrive.
    responses = grpcapi.get_queue(server='128.4.20.169', port=8900,
                                  request_thread_num=10,
                                  iterable_request=parse_requests)

    count = 0
    for response in responses:
        for doc in response.document:
            count += 1
            print(count, doc.doc_id, len(doc.sentence))
def run():
    """Demo: parse a sample abstract with Bllip and Stanford, then exercise
    the Stanford sentence splitter 100 times."""
    text = (u'MicroRNAs (miRNAs) are small non-coding RNAs of ∼19-24 '
            'nucleotides (nt) in length and considered as potent '
            'regulators of gene expression at transcriptional and '
            'post-transcriptional levels. Here we report the identification '
            'and characterization of 15 conserved miRNAs belonging to 13 '
            'families from Rauvolfia serpentina through in silico analysis '
            'of available nucleotide dataset. The identified mature R. '
            'serpentina miRNAs (rse-miRNAs) ranged between 20 and 22nt in '
            'length, and the average minimal folding free energy index (MFEI) '
            'value of rse-miRNA precursor sequences was found to be '
            '-0.815kcal/mol. Using the identified rse-miRNAs as query, their '
            'potential targets were predicted in R. serpentina and other plant '
            'species. Gene Ontology (GO) annotation showed that predicted '
            'targets of rse-miRNAs include transcription factors as well as '
            'genes involved in diverse biological processes such as primary '
            'and secondary metabolism, stress response, disease resistance, '
            'growth, and development. Few rse-miRNAs were predicted to target '
            'genes of pharmaceutically important secondary metabolic pathways '
            'such as alkaloids and anthocyanin biosynthesis. Phylogenetic '
            'analysis showed the evolutionary relationship of rse-miRNAs and '
            'their precursor sequences to homologous pre-miRNA sequences from '
            'other plant species. The findings under present study besides giving '
            'first hand information about R. serpentina miRNAs and their targets, '
            'also contributes towards the better understanding of miRNA-mediated '
            'gene regulatory processes in plants.')

    document = document_pb2.Document()
    document.doc_id = '26815768'
    document.text = text

    # Parse using Bllip parser.
    print(parse_using_bllip(document))

    # Parse using Stanford CoreNLP parser.
    print(parse_using_stanford(document))

    # Only split sentences using Stanford CoreNLP.
    for i in range(100):
        split_using_stanford(document)
        print('Split {} documents'.format(i))
def run():
    """Parse a text file with the Bllip/EDG parser and print extra dependencies.

    Command-line arguments:
        sys.argv[1]: path to the input text file.
        sys.argv[2..4]: paths to the phase-0, phase-1 and phase-2 rule files.
    """
    # Context managers ensure files are closed even if reading raises.
    with open(sys.argv[1], "r") as text_fh:
        text = text_fh.read()

    doc_id = '99999999'

    # One list of rule lines per rule phase.
    with open(sys.argv[2], "r") as fh0:
        rule0_lines = fh0.readlines()
    with open(sys.argv[3], "r") as fh1:
        rule1_lines = fh1.readlines()
    with open(sys.argv[4], "r") as fh2:
        rule2_lines = fh2.readlines()

    param_helper = ParamHelper(text, doc_id, rule0_lines, rule1_lines,
                               rule2_lines)

    raw_doc = document_pb2.Document()
    edg_rules = edgRules_pb2.EdgRules()
    param_helper.setDocProtoAttributes(raw_doc)
    param_helper.setRuleProtoAttributes(edg_rules)

    # Parse using Bllip parser.
    result = parse_using_bllip(raw_doc, edg_rules)
    helper = DocHelper(result)
    for sentence in result.sentence:
        print(helper.text(sentence))
        for depExtra in sentence.dependency_extra:
            print(helper.printExtraDependency(sentence, depExtra))
        print("===============================")
def mask_entity(self, mask_duids=None):
    """Return a copy of self.doc with entity mentions replaced by placeholders.

    Each entity's text is replaced with 'BIOENTITY' (or 'BIOENTITIES' when the
    original mention ends in 's', to keep number agreement for parsing). If
    mask_duids is given, only entities whose duid is in that collection are
    masked; all others keep their original text. Entity char offsets in the
    returned document are rewritten to match the masked text.

    Raises:
        ValueError: if the document contains overlapping entities, since the
            slice-based rewrite below assumes non-overlapping spans.
    """
    if self.has_overlap_entity():
        raise ValueError('Overlapped entities: ' + self.doc.doc_id)
    masked = document_pb2.Document()
    masked.CopyFrom(self.doc)
    # Text is rebuilt as alternating slices: gap, (masked) entity, gap, ...
    slices = []
    start = 0        # cursor into the ORIGINAL text (end of last entity + 1)
    mask_start = 0   # cursor into the MASKED text being assembled
    # Sort by char start.
    entities = masked.entity.values()
    entities = sorted(entities, key=lambda a: a.char_start)
    for entity in entities:
        # Copy the gap between the previous entity and this one verbatim.
        slices.append(self.doc.text[start:entity.char_start])
        mask_start += len(slices[-1])
        if mask_duids is not None and entity.duid not in mask_duids:
            # Not selected for masking: keep the original mention text.
            slices.append(self.text(entity))
        else:
            # Not using entity type as replacement because it may change
            # the parsing, ENTITY seems to affect the parsing less.
            if self.text(entity).endswith('s'):
                slices.append('BIOENTITIES')
            else:
                slices.append('BIOENTITY')
        # Remember the original end before overwriting the offsets.
        entity_end = entity.char_end
        # Rewrite offsets to the entity's position in the masked text
        # (char_end is inclusive, hence the -1).
        entity.char_start = mask_start
        entity.char_end = mask_start + len(slices[-1]) - 1
        mask_start += len(slices[-1])
        start = entity_end + 1
    # Trailing text after the last entity.
    slices.append(self.doc.text[start:])
    masked.text = ''.join(slices)
    return masked
def run():
    """Load a JSON-encoded document, normalize its entity types to upper case,
    parse it with the Bllip/EDG parser, and run argument-constraint checks.

    Rule files and the input JSON path are hard-coded development paths.
    """
    doc_id = '99999999'
    rule_phase0_filename = '/home/leebird/Projects/nlputils/visual/uploads/rules_phase0.txt'
    rule_phase1_filename = '/home/leebird/Projects/nlputils/visual/uploads/rules_phase1.txt'
    rule_phase2_filename = '/home/leebird/Projects/nlputils/visual/uploads/rules_phase2.txt'

    # Context managers ensure the rule files are closed even on error.
    with open(rule_phase0_filename, "r") as fh0:
        rule0_lines = fh0.readlines()
    with open(rule_phase1_filename, "r") as fh1:
        rule1_lines = fh1.readlines()
    with open(rule_phase2_filename, "r") as fh2:
        rule2_lines = fh2.readlines()

    with open('/home/leebird/Projects/nlputils/utils/typing/test.json') as f:
        json_doc = json.load(f)

    # Entity types must be upper-cased to match the proto enum names before
    # json_format.Parse converts them.
    for t in json_doc['entity'].values():
        t['entityType'] = t['entityType'].upper()
    text = json.dumps(json_doc)

    raw_doc = json_format.Parse(text, document_pb2.Document(), True)
    param_helper = ParamHelper(text, doc_id, rule0_lines, rule1_lines,
                               rule2_lines)
    edg_rules = edgRules_pb2.EdgRules()
    param_helper.setRuleProtoAttributes(edg_rules)

    # Parse using Bllip parser.
    doc = parse_using_bllip(raw_doc, edg_rules)
    helper = DocHelper(doc)
    # Check that 'arg0' arguments are GENE entities; report violations.
    invalid_deps = constraint_args(helper,
                                   {'arg0': {document_pb2.Entity.GENE}})
    print(invalid_deps)
    propagate(helper, {'arg0': {document_pb2.Entity.GENE}}, invalid_deps)
def upload():
    """Flask view: accept rule-file uploads, parse the form text with the EDG
    rules, and render brat visualization JSON.

    Each uploaded rule file takes precedence; when an upload is empty the
    corresponding textarea content from the form is used instead.
    """
    if request.method != 'POST':
        return render_template('index_edg.html')

    # Read the three uploaded rule files (may be empty).
    rules0 = save_read_uploaded_file(request.files['ruleFile0'])
    rules1 = save_read_uploaded_file(request.files['ruleFile1'])
    rules2 = save_read_uploaded_file(request.files['ruleFile2'])
    text = request.form['text']

    # Fall back to the pasted textarea content for any missing upload.
    if rules0 == "":
        rules0 = request.form['rules0']
    if rules1 == "":
        rules1 = request.form['rules1']
    if rules2 == "":
        rules2 = request.form['rules2']

    doc_id = "9999999"
    param_helper = ParamHelper(text, doc_id,
                               rules0.split("\n"),
                               rules1.split("\n"),
                               rules2.split("\n"))
    raw_doc = document_pb2.Document()
    edg_rules = edgRules_pb2.EdgRules()
    param_helper.setDocProtoAttributes(raw_doc)
    param_helper.setRuleProtoAttributes(edg_rules)

    parsed = parse_using_bllip(raw_doc, edg_rules)
    brat_bllip = json.dumps(get_brat_data(parsed))
    brat_bllip_added = json.dumps(get_brat_data_added(parsed))
    return render_template('index_edg.html',
                           text=text,
                           rules0=rules0, rules1=rules1, rules2=rules2,
                           brat_string_bllip=brat_bllip,
                           brat_string_bllip_added=brat_bllip_added)
def run():
    """Parse every file in an input directory via the EDG gRPC service and
    print the extracted relations of selected types.

    Command-line arguments:
        sys.argv[1]: directory of input text files; doc_id is the file stem.
        sys.argv[2]: phase-0 rule file.
    """
    # Iterate through all files in the input directory and build the doc list.
    input_dir_path = sys.argv[1]
    input_files = glob.glob(input_dir_path + "/*")
    document_list = []
    for input_file in input_files:
        with open(input_file, "r") as text_fh:
            text = text_fh.read()
        # Original code constructed Document() twice; once is enough.
        raw_doc = document_pb2.Document()
        raw_doc.doc_id = os.path.splitext(os.path.basename(input_file))[0]
        raw_doc.text = text
        document_list.append(raw_doc)

    with open(sys.argv[2], "r") as fh0:
        rule0_lines = fh0.readlines()

    # ParamHelper is only used here for its rule-proto side effect; the
    # text/doc_id arguments are placeholders. TODO: update ParamHelper API.
    param_helper = ParamHelper("NA", "NA", rule0_lines, [], [])
    edg_rules = edgRules_pb2.EdgRules()
    param_helper.setRuleProtoAttributes(edg_rules)

    # Batch the documents into PARSE_BLLIP requests, 5 documents per request.
    requests = edg_request_iter_docs(
        document_list, edg_rules, request_size=5,
        request_type=rpc_pb2.EdgRequest.PARSE_BLLIP)

    # Send requests in parallel and consume the responses as they arrive.
    responses_queue = grpcapi.get_queue(server='128.4.20.169', port=8902,
                                        request_thread_num=10,
                                        iterable_request=requests,
                                        edg_request_processor=True)

    # Relation types worth printing.
    toPrintRel = ["inv", "reg", "ass", "exp", "cmp", "isa", "fnd"]
    count = 0
    for response in responses_queue:
        for doc in response.document:
            helper = DocHelper(doc)
            doc_id = doc.doc_id
            sentNum = 0
            for sentence in doc.sentence:
                sentText = helper.text(sentence)
                edgRelations = EdgRelations(doc_id, sentNum)
                edgRelations.setRelations(helper, sentence,
                                          sentence.dependency_extra)
                for edgRelation in edgRelations.relations:
                    relation_name = edgRelation.name
                    if relation_name in toPrintRel:
                        trigger_head = edgRelation.trigger_head
                        trigger_phrase = edgRelation.trigger_phrase
                        for numb_args in edgRelation.getEdgRelationNumArgs():
                            print("Sentence: " + doc_id + "\t" + str(sentNum) + "\t" + sentText)
                            print("Relation: " + relation_name + "\t" + trigger_head + "\t" + trigger_phrase)
                            print("Arg0: " + numb_args[0])
                            print("Arg1: " + numb_args[1])
                            print("Arg2: " + numb_args[2])
                            print("\n")
                sentNum = sentNum + 1
            count += 1
def load_from_brat_file(doc_id, text_file, annotation_file):
    """Build a Document proto from a brat-format text/annotation file pair.

    Args:
        doc_id: identifier assigned to the new document.
        text_file: path to the brat .txt file (newlines become spaces).
        annotation_file: path to the brat .ann file; lines starting with
            'T' are entities, 'E' events, 'R'/'*' relations. 'M'/'A'
            attribute lines are tolerated by the assertion but not parsed
            here (attributes arrive via parse_event/parse_relation).

    Returns:
        The populated document_pb2.Document.
    """
    doc = document_pb2.Document()
    doc.doc_id = doc_id
    helper = DocHelper(doc)
    with codecs.open(text_file, 'r', encoding='utf8') as f:
        # Replace newlines with spaces.
        text = f.read().replace('\n', ' ')
        doc.text = text
    with codecs.open(annotation_file, 'r', encoding='utf8') as f:
        entities, events, relations = [], [], []
        for line in f:
            line = line.strip('\r\n')
            assert len(line.strip()) > 0
            # Only the standard brat line types are expected.
            assert line[0] == 'T' or line[0] == 'E' or \
                   line[0] == 'R' or line[0] == '*' or \
                   line[0] == 'M' or line[0] == 'A'
            if line[0] == 'T':
                # Entities are added to the document immediately; events and
                # relations are buffered so their entity references resolve.
                entity_id, entity_text, entity_type, entity_start, entity_end \
                    = parser.parse_entity(line)
                entity = helper.add_entity(duid=entity_id)
                # brat uses an exclusive end offset; the proto's char_end is
                # inclusive, hence the -1.
                entity.char_start = entity_start
                entity.char_end = entity_end - 1
                entity.entity_type = entity_type
            elif line[0] == 'E':
                events.append(parser.parse_event(line))
            elif line[0] == 'R' or line[0] == '*':
                relations.append(parser.parse_relation(line))
        # Events become relations with the trigger as an extra argument.
        for eid, etype, trigger_id, arguments, attrs in events:
            event = helper.add_relation(relation_type=etype, duid=eid)
            trigger = event.argument.add()
            trigger.entity_duid = trigger_id
            trigger.role = 'Trigger'
            for role, arg_id in arguments:
                arg = event.argument.add()
                arg.role = role
                arg.entity_duid = arg_id
            if attrs is not None:
                for key, values in attrs.items():
                    for value in values:
                        attr = event.attribute.add()
                        attr.key = key
                        attr.value = value
        for rid, rtype, arguments, attrs in relations:
            # '*' (equivalence) lines have no real brat id; only keep the duid
            # for true 'R' relations.
            if rid.startswith('R'):
                relation = helper.add_relation(relation_type=rtype, duid=rid)
            else:
                relation = helper.add_relation(relation_type=rtype)
            for role, arg_id in arguments:
                arg = relation.argument.add()
                arg.role = role
                arg.entity_duid = arg_id
            if attrs is not None:
                for key, values in attrs.items():
                    for value in values:
                        attr = relation.attribute.add()
                        attr.key = key
                        attr.value = value
    return doc
def run():
    """Parse every file in an input directory via the EDG gRPC service and
    print per-sentence extra-dependency analyses.

    Command-line arguments:
        sys.argv[1]: directory of input text files; doc_id is the file stem.
        sys.argv[2]: phase-0 rule file.
    """
    # Iterate through all files in the input directory and build the doc list.
    input_dir_path = sys.argv[1]
    input_files = glob.glob(input_dir_path + "/*")
    document_list = []
    for input_file in input_files:
        with open(input_file, "r") as text_fh:
            text = text_fh.read()
        # Original code constructed Document() twice; once is enough.
        raw_doc = document_pb2.Document()
        raw_doc.doc_id = os.path.splitext(os.path.basename(input_file))[0]
        raw_doc.text = text
        document_list.append(raw_doc)

    with open(sys.argv[2], "r") as fh0:
        rule0_lines = fh0.readlines()

    # ParamHelper is only used here for its rule-proto side effect; the
    # text/doc_id arguments are placeholders. TODO: update ParamHelper API.
    param_helper = ParamHelper("NA", "NA", rule0_lines, [], [])
    edg_rules = edgRules_pb2.EdgRules()
    param_helper.setRuleProtoAttributes(edg_rules)

    # Batch the documents into PARSE_BLLIP requests, 5 documents per request.
    requests = edg_request_iter_docs(
        document_list, edg_rules, request_size=5,
        request_type=rpc_pb2.EdgRequest.PARSE_BLLIP)

    # Send requests in parallel and consume the responses as they arrive.
    responses_queue = grpcapi.get_queue(server='128.4.20.169', port=8902,
                                        request_thread_num=10,
                                        iterable_request=requests,
                                        edg_request_processor=True)

    count = 0
    for response in responses_queue:
        for doc in response.document:
            helper = DocHelper(doc)
            doc_id = doc.doc_id
            sentNum = 0
            for sentence in doc.sentence:
                sentText = helper.text(sentence)
                flag = 0
                for depExtra in sentence.dependency_extra:
                    flag = 1
                    print(doc_id+"\t"+str(sentNum)+"\t"+helper.printExtraDependencyAnalysis(sentence,depExtra)+"\t"+sentText)
                if flag == 0:
                    # No extra dependencies: emit an explicit empty-analysis row
                    # so every sentence appears in the output.
                    print(doc_id+"\t"+str(sentNum)+"\t"+helper.printEmptyExtraDependencyAnalysis(sentence)+"\t"+sentText)
                sentNum = sentNum + 1
            count += 1