def find_abbreviations(self):
    """Annotate abbreviations: one MWU for the full form, one for the
    abbreviation, and a relation linking the two."""
    abbreviations = self.data_parse.find_abbr()
    for item in abbreviations:
        # full form: span at (item[2], item[3])
        full_mwu = mwu(nm='mwu_' + str(self.wn), txtsps=[(item[2], item[3])])
        self.wn += 1
        full_mwu.typ = 'Full_form'
        self.doc.add_mwu(full_mwu)
        # abbreviation: span at (item[4], item[5])
        abbr_mwu = mwu(nm='mwu_' + str(self.wn), txtsps=[(item[4], item[5])])
        self.wn += 1
        abbr_mwu.typ = 'Abbreviation'
        self.doc.add_mwu(abbr_mwu)
        abbr_rel = relation(nm='Abbr_rel_' + str(self.wn),
                            src=full_mwu, trg=abbr_mwu,
                            annotations={}, typ='Abbreviation')
        self.wn += 1
        self.doc.add_rel(abbr_rel)
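# Usage sketch (hypothetical wiring: the class name `Annotator` and its
# constructor are assumptions -- the methods in this module only rely on the
# attributes .data_parse, .doc, .sentences and the counter .wn):
#
#   annot = Annotator(text)
#   annot.split_sentences()           # recover sentence spans first
#   annot.find_abbreviations()        # adds Full_form/Abbreviation MWUs
#   annot.python_export('annotated_doc.py')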
def signif_rels(self, preann_parser='spacy'):
    print('Start search for outcome-significance pairs')
    out_sig_rels = self.data_parse.find_out_signif_rel_bert()
    for item in out_sig_rels:
        # outcome
        out_mwu = mwu(nm='mwu_' + str(self.wn), txtsps=[(item[2], item[3])])
        self.wn += 1
        out_mwu.typ = 'rep_out'
        self.doc.add_mwu(out_mwu)
        # statistic (p-value)
        stat_mwu = mwu(nm='mwu_' + str(self.wn), txtsps=[(item[4], item[5])])
        self.wn += 1
        stat_mwu.typ = 'Pval'
        self.doc.add_mwu(stat_mwu)
        sig_rel = relation(nm='Sig_rel_' + str(self.wn),
                           src=out_mwu, trg=stat_mwu,
                           annotations={}, typ='out_signif')
        self.wn += 1
        self.doc.add_rel(sig_rel)
def compare_po_text_to_registry(self):
    print('Start search for matching outcomes')
    matching_outcomes, outcomes_not_matched, meta = self.data_parse.compare_po_text_registry()
    if len(matching_outcomes) > 0:
        for item in matching_outcomes:
            # PO
            po_mwu = mwu(nm='mwu_' + str(self.wn), txtsps=[(item[2], item[3])])
            self.wn += 1
            po_mwu.typ = 'PO'
            self.doc.add_mwu(po_mwu)
    else:
        self.find_po()
    if len(outcomes_not_matched) > 0:
        for item in outcomes_not_matched:
            # PO
            po_mwu = mwu(nm='mwu_' + str(self.wn), txtsps=[(item[1], item[2])])
            self.wn += 1
            po_mwu.typ = 'PO'
            self.doc.add_mwu(po_mwu)
    self.doc.meta = meta
def find_so(self, preann_parser='spacy'):
    """Create an 'Out' MWU for every entry in the parser's SO_list,
    converting sentence-relative token offsets to document offsets."""
    self.data_parse.find_po(parser=preann_parser)
    so_list = set(self.data_parse.SO_list)
    new_tagging_mwus = []
    for so in so_list:
        typ = 'Out'
        beg = so[2]
        end = so[3]
        sent_id = so[4]
        params = so[5]
        sent_beg = self.data_parse.sentences[sent_id][1]
        # start position of the first token, as returned by the tagger,
        # plus the start position of the sentence
        first_token = self.data_parse.sentences_tagged[(sent_id, preann_parser)][beg]
        cstart = first_token[3] + sent_beg
        # end position of the last token, as returned by the tagger,
        # plus the start position of the sentence
        last_token = self.data_parse.sentences_tagged[(sent_id, preann_parser)][end]
        cend = last_token[4] + sent_beg
        a_mwu = mwu(nm='mwu_' + str(self.wn), txtsps=[(cstart, cend)])
        a_mwu.typ = typ
        # a_mwu.set_glob_annot('Params', params)
        new_tagging_mwus.append(a_mwu)
        self.doc.add_mwu(a_mwu)
        self.wn += 1
    return new_tagging_mwus
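# Worked example of the offset arithmetic above (the tuple layout of
# sentences_tagged entries, with token start/end at indices 3 and 4, is
# inferred from the lookups, not from separate documentation): if sentence 3
# starts at document character 120 and the tagger reports a token spanning
# characters 5..9 inside that sentence, the document-level span becomes
# (120 + 5, 120 + 9) = (125, 129).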
def token_same_form_rels_text(self, in_data=None):
    """Annotate relations between all pairs of token instances having the
    same form, a token being any sequence of contiguous visible characters.
    Only one direction is stored per pair of spans (either source->target
    or target->source), so when testing a pair of spans one must test for
    the existence of both possible orderings."""
    assert isinstance(in_data, str) or in_data is None
    if in_data is not None:
        self.doc = document(ident='', content=in_data, metadata='')
        self.token_split()
    mwn = 0
    reln = 0
    # group the spans of every token form
    self.tokspans = {}
    for t in self.tokens:
        if t[SFRel.FORM] not in self.tokspans:
            self.tokspans[t[SFRel.FORM]] = [t[SFRel.SPAN]]
        else:
            self.tokspans[t[SFRel.FORM]].append(t[SFRel.SPAN])
    self.occ_chains = {}
    self.hapax = {}  # reset alongside tokspans/occ_chains; was referenced below without initialisation
    for tok in self.tokspans:
        self.occ_chains[tok] = []
        if len(self.tokspans[tok]) > 1:
            # chain consecutive occurrences: n spans yield n - 1 relations
            for i in range(0, len(self.tokspans[tok]) - 1):
                mw1 = mwu(nm=(SFRel.FAMERSUF + SFRel.MWUSUF + str(mwn)),
                          txtsps=[self.tokspans[tok][i]])
                mwn += 1
                self.doc.add_mwu(mw1)
                mw2 = mwu(nm=(SFRel.FAMERSUF + SFRel.MWUSUF + str(mwn)),
                          txtsps=[self.tokspans[tok][i + 1]])
                mwn += 1
                self.doc.add_mwu(mw2)
                new_r = relation(nm=(SFRel.FAMERSUF + SFRel.RELSUF + str(reln)),
                                 src=mw1, trg=mw2)
                new_r.set_glob_annot(SFRel.TYPENM, SFRel.TYPE)
                self.doc.add_rel(new_r)
                self.occ_chains[tok].append(new_r)
                reln += 1
            assert len(self.occ_chains[tok]) == len(self.tokspans[tok]) - 1
        else:
            self.hapax[tok] = self.tokspans[tok][0]
    return self.doc
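# Hand-worked illustration of the chaining above (not produced by the code):
# in the text "a b a c a" the form "a" occurs at spans (0, 1), (4, 5) and
# (8, 9), so two relations are created, (0, 1)->(4, 5) and (4, 5)->(8, 9),
# i.e. len(occ_chains["a"]) == 2 == occurrences - 1, while the hapaxes "b"
# and "c" end up in self.hapax with their single span.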
def detect_text_structure(self):
    # find title, abstract (results + conclusions), and body text when the
    # input is a full-text article
    self.data_parse.find_abstract()
    title = self.data_parse.title
    abstract = self.data_parse.abstract
    res = self.data_parse.results
    concl = self.data_parse.conclusions
    # mwu for title
    if title is not None:
        a_mwu = mwu(nm='mwu_' + str(self.wn), txtsps=[(title[1], title[2])])
        a_mwu.typ = 'Title'
        self.doc.add_mwu(a_mwu)
        self.wn += 1
    if abstract is not None:
        a_mwu = mwu(nm='mwu_' + str(self.wn), txtsps=[(abstract[1], abstract[2])])
        a_mwu.typ = 'Abstract'
        self.doc.add_mwu(a_mwu)
        self.wn += 1
    if res is not None:
        a_mwu = mwu(nm='mwu_' + str(self.wn), txtsps=[(res[1], res[2])])
        a_mwu.typ = 'Abstract_Results'
        self.doc.add_mwu(a_mwu)
        self.wn += 1
    if concl is not None:
        a_mwu = mwu(nm='mwu_' + str(self.wn), txtsps=[(concl[1], concl[2])])
        a_mwu.typ = 'Abstract_Conclusions'
        self.doc.add_mwu(a_mwu)
        self.wn += 1
def compare_po_to_reported(self):
    print('Start search for matching outcomes')
    matching_outcomes, outcomes_not_matched, meta = self.data_parse.compare_po_rep()
    if len(matching_outcomes) > 0:
        for item in matching_outcomes:
            # PO
            po_mwu = mwu(nm='mwu_' + str(self.wn), txtsps=[(item[2], item[3])])
            self.wn += 1
            po_mwu.typ = 'PO'
            self.doc.add_mwu(po_mwu)
            # reported outcome
            out_rep_mwu = mwu(nm='mwu_' + str(self.wn), txtsps=[(item[4], item[5])])
            self.wn += 1
            out_rep_mwu.typ = 'Reported_outcome'
            self.doc.add_mwu(out_rep_mwu)
            out_rel = relation(nm='Out_match_' + str(self.wn),
                               src=po_mwu, trg=out_rep_mwu,
                               annotations={}, typ='Out_match')
            self.wn += 1
            self.doc.add_rel(out_rel)
    else:
        self.find_po()
    if len(outcomes_not_matched) > 0:
        for item in outcomes_not_matched:
            # PO
            po_mwu = mwu(nm='mwu_' + str(self.wn), txtsps=[(item[1], item[2])])
            self.wn += 1
            po_mwu.typ = 'PO'
            self.doc.add_mwu(po_mwu)
    self.doc.meta = meta
def compare_outcomes_abstract_to_body(self):
    print('Start comparing outcomes')
    matching_outcomes, outcomes_not_matched, meta = self.data_parse.compare_po_abstr_bt()
    if len(matching_outcomes) > 0:
        for item in matching_outcomes:
            # PO in the abstract
            po_abstr_mwu = mwu(nm='mwu_' + str(self.wn), txtsps=[(item[2], item[3])])
            self.wn += 1
            po_abstr_mwu.typ = 'PO'
            self.doc.add_mwu(po_abstr_mwu)
            # PO in the body text
            po_bt_mwu = mwu(nm='mwu_' + str(self.wn), txtsps=[(item[4], item[5])])
            self.wn += 1
            po_bt_mwu.typ = 'Reported_outcome'
            self.doc.add_mwu(po_bt_mwu)
            out_rel = relation(nm='Out_match_' + str(self.wn),
                               src=po_abstr_mwu, trg=po_bt_mwu,
                               annotations={}, typ='Out_match')
            self.wn += 1
            self.doc.add_rel(out_rel)
    else:
        self.find_po()
    if len(outcomes_not_matched) > 0:
        for item in outcomes_not_matched:
            # PO
            po_mwu = mwu(nm='mwu_' + str(self.wn), txtsps=[(item[1], item[2])])
            self.wn += 1
            po_mwu.typ = 'PO'
            self.doc.add_mwu(po_mwu)
    self.doc.meta = meta
def split_sentences(self):
    sentences = self.data_parse.sent_split()
    cend = 0
    for sent in sentences:
        cstart = self.doc.ctnt.find(sent, cend)
        cend = cstart + len(sent)
        a_mwu = mwu(nm='mwu_' + str(self.wn), txtsps=[(cstart, cend)])
        a_mwu.typ = 'Sentence'
        self.doc.add_mwu(a_mwu)
        self.wn += 1
        # kept for convenience of use in other functions
        self.sentences.append((sent, cstart, cend))
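# Standalone sketch of the cursor-based span recovery used above (plain
# Python, independent of this class): searching from the previous end keeps
# str.find() from re-matching an earlier duplicate sentence.
#
#   text = "It works. It works."
#   cend = 0
#   for sent in ["It works.", "It works."]:
#       cstart = text.find(sent, cend)
#       cend = cstart + len(sent)      # yields the distinct spans (0, 9)
#                                      # and (10, 19)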
def find_registry_data(self):
    reg_ids = self.data_parse.find_reg_num()
    for reg_id in reg_ids:
        a_mwu = mwu(nm='mwu_' + str(self.wn), txtsps=[(reg_id[2], reg_id[3])])
        a_mwu.typ = 'Registration_number'
        self.doc.add_mwu(a_mwu)
        self.wn += 1
    self.data_parse.find_web()
    meta = self.data_parse.parse_all_registries()
    self.doc.meta = meta
def compare_rep_text_to_registry(self):
    print('Start search for matching outcomes')
    matching_outcomes, meta = self.data_parse.compare_registry_rep()
    if len(matching_outcomes) > 0:
        for item in matching_outcomes:
            # reported outcome
            rep_mwu = mwu(nm='mwu_' + str(self.wn), txtsps=[(item[2], item[3])])
            self.wn += 1
            rep_mwu.typ = 'rep'
            self.doc.add_mwu(rep_mwu)
    else:
        self.data_parse.find_out_rep_abstr()
    self.doc.meta = meta
def preannot_to_mwu(self, preannot_function='', preann_parser='spacy',
                    allow_intersection=False):
    """Run the named pre-annotation routine on data_parse and import its
    results as MWUs, converting sentence-relative token offsets to
    document offsets."""
    new_tagging_mwus = []
    # look the routine up by name instead of building a string for eval()
    preannotation = getattr(self.data_parse, preannot_function)(parser=preann_parser)
    if allow_intersection:
        preannotation_to_import = preannotation
    else:
        preannotation_to_import = advanced_filter_preannot_results(preannotation)
    for preannot_item in preannotation_to_import:
        typ = preannot_item[0]
        beg = preannot_item[1]
        end = preannot_item[2]
        sent_id = preannot_item[3]
        params = preannot_item[4]
        sent_beg = self.data_parse.sentences[sent_id][1]
        # start position of the first token, as returned by the tagger,
        # plus the start position of the sentence
        first_token = self.data_parse.sentences_tagged[(sent_id, preann_parser)][beg]
        cstart = first_token[3] + sent_beg
        # end position of the last token, as returned by the tagger,
        # plus the start position of the sentence
        last_token = self.data_parse.sentences_tagged[(sent_id, preann_parser)][end]
        cend = last_token[4] + sent_beg
        a_mwu = mwu(nm='mwu_' + str(self.wn), txtsps=[(cstart, cend)])
        a_mwu.typ = typ
        # a_mwu.set_glob_annot('Params', params)
        new_tagging_mwus.append(a_mwu)
        self.doc.add_mwu(a_mwu)
        self.wn += 1
    return new_tagging_mwus
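# The getattr() dispatch above is the named-method-lookup pattern shown
# standalone below (toy object only; nothing here is project API):
#
#   class Toy:
#       def tag(self, parser='spacy'):
#           return [('Out', 0, 1, 0, {})]
#
#   fn = getattr(Toy(), 'tag')     # resolve the method by name
#   result = fn(parser='spacy')    # then call it with keyword arguments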
def python_export(self, file):
    """Serialise the document as a single document(...) expression."""
    with open(file, mode='w+', encoding='utf-8') as f:
        f.write('document( ident=' + repr(file) +
                ', content=' + repr(self.doc.content()) +
                ', metadata=' + repr(self.doc.meta) +
                ', multi_word_units =')
        f.write('[')
        n = 0
        for m in self.doc.mwus:
            if n > 0:
                f.write(', ')
            f.write('mwu( nm=\'' + m.name + '\', txtsps=[ ')
            n += 1
            len_all_spans = len(m.txtspans)
            span_count = 0
            for sp in m.txtspans:
                f.write('(' + str(sp[0]) + ' , ' + str(sp[1]) + ')')
                span_count += 1
                if span_count < len_all_spans:
                    f.write(',')
            # e.g. mwu( nm='FAMER_mwu_0', txtsps=[(16, 19)])
            f.write('])')
        f.write('], ')
        f.write('relations= [')
        n = 0
        for r in self.doc.rels:
            if n > 0:
                f.write(', ')
            f.write('{0}'.format(repr(r)))
            n += 1
        f.write('], constructions= [')
        n = 0
        for k in self.doc.kstructs:
            if n > 0:
                f.write(', ')
            f.write('{0}'.format(repr(k)))
            n += 1
        f.write('])')
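# Round-trip sketch (an assumption, not a documented loader): since the
# export is written as one Python document(...) expression, it should be
# readable back with eval(), provided document, mwu and relation are in
# scope.
#
#   with open('annotated_doc.py', encoding='utf-8') as f:
#       doc = eval(f.read())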
def detect_abstract_structure(self):
    """Find the results and conclusions sections when the input is an
    abstract only."""
    structure = self.data_parse.find_abstr_sections()
    section_types = ('Abstract_beginning', 'Abstract_Results', 'Abstract_Conclusions')
    # mwus for beginning of abstract, results, conclusions; enumerate keeps
    # the right label even when two sections have identical text, where
    # list.index() would always return the first match
    cend = 0
    for idx, item in enumerate(structure):
        if item != '':
            cstart = self.doc.ctnt.find(item, cend)
            cend = cstart + len(item)
            a_mwu = mwu(nm='mwu_' + str(self.wn), txtsps=[(cstart, cend)])
            a_mwu.typ = section_types[idx]
            self.doc.add_mwu(a_mwu)
            self.wn += 1