def test_sentencizer_serialize_bytes(en_vocab):
    punct_chars = [".", "~", "+"]
    sentencizer = Sentencizer(punct_chars=punct_chars)
    assert sentencizer.punct_chars == punct_chars
    bytes_data = sentencizer.to_bytes()
    new_sentencizer = Sentencizer().from_bytes(bytes_data)
    assert new_sentencizer.punct_chars == punct_chars
def test_sentencizer_serialize_bytes(en_vocab):
    punct_chars = [".", "~", "+"]
    sentencizer = Sentencizer(punct_chars=punct_chars)
    assert sentencizer.punct_chars == set(punct_chars)
    bytes_data = sentencizer.to_bytes()
    new_sentencizer = Sentencizer().from_bytes(bytes_data)
    assert new_sentencizer.punct_chars == set(punct_chars)
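# Note: the two variants of this test above reflect an API change. Earlier
# spaCy releases kept Sentencizer.punct_chars as the list it was given, while
# later releases normalize it to a set, so the round-trip assertion compares
# against set(punct_chars) in the newer version.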
def process_text(text: str, debug: bool = False) -> Dictionary:
    """ Process given text through NLP pipes. """
    nlp = spacy.load("en_core_web_sm")
    sentencizer = Sentencizer()
    nlp.add_pipe(sentencizer, before="parser")
    doc = nlp(text)
    dictionary_path = f"{NLP_SERVICE_ROOT}/assets/dictionary.pickle"
    if os.path.isfile(dictionary_path) and not debug:
        with open(dictionary_path, "rb") as f:
            dictionary = pickle.load(f)
    else:
        dictionary = Dictionary(debug=False)
        with open(dictionary_path, "wb") as w:
            pickle.dump(dictionary, w)
    for sentence in doc.sents:
        for token in sentence:
            relic = Relic(token, sentence)
            dictionary.add(relic)
    return dictionary
def make_nlp():
    '''
    Generates spaCy nlp object and adds pipelines.

    Returns: an nlp object
    '''
    nlp = spacy.load("en_core_web_sm")
    sentencizer = Sentencizer(punct_chars=['.'])
    ruler = make_entity_ruler(nlp)
    Token.set_extension('is_solitarious', default=None, force=True)
    Span.set_extension('subject_decline', default=False, force=True)
    Span.set_extension('contains_adults', default=None, force=True)
    Span.set_extension('ent_solitarious', default=None, force=True)
    merge_ents = nlp.create_pipe("merge_entities")
    combine_ents_ruler = combine_entities_ruler(nlp)
    nlp.add_pipe(sentencizer, first=True)
    nlp.add_pipe(ruler, before='ner')
    nlp.add_pipe(refine_entities)
    nlp.add_pipe(subject_decline)
    nlp.add_pipe(merge_ents)
    nlp.add_pipe(combine_ents_ruler)
    nlp.add_pipe(is_solitarious)
    nlp.add_pipe(contains_adults)
    nlp.add_pipe(ent_solitarious)
    return nlp
def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, n_sents):
    doc = Doc(en_vocab, words=words)
    sentencizer = Sentencizer(punct_chars=punct_chars)
    doc = sentencizer(doc)
    assert doc.is_sentenced
    assert [t.is_sent_start for t in doc] == sent_starts
    assert len(list(doc.sents)) == n_sents
def test_sentencizer_complex(en_vocab, words, sent_starts, n_sents):
    doc = Doc(en_vocab, words=words)
    sentencizer = Sentencizer()
    doc = sentencizer(doc)
    assert doc.is_sentenced
    assert [t.is_sent_start for t in doc] == sent_starts
    assert len(list(doc.sents)) == n_sents
def test_sentencizer_complex(en_vocab, words, sent_starts, sent_ends, n_sents):
    doc = Doc(en_vocab, words=words)
    sentencizer = Sentencizer(punct_chars=None)
    doc = sentencizer(doc)
    assert doc.has_annotation("SENT_START")
    assert [t.is_sent_start for t in doc] == sent_starts
    assert [t.is_sent_end for t in doc] == sent_ends
    assert len(list(doc.sents)) == n_sents
def test_flatten_docs_to_sens(vocab):
    sentencizer = Sentencizer(".")
    nlp = spacy.blank("en")
    nlp.add_pipe(sentencizer)
    texts = ["Foo is bar. Bar is baz.", "It is a sentence."]
    docs = nlp.pipe(texts)
    all_sents = flatten_docs_to_sents(docs)
    assert len(all_sents) == 3
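# The helper under test, flatten_docs_to_sents, is not shown in this excerpt.
# A minimal sketch of what such a helper could look like (behavior inferred
# from the assertion above, not taken from the original project):
def flatten_docs_to_sents(docs):
    # Collect the sentence spans of every Doc into one flat list.
    return [sent for doc in docs for sent in doc.sents]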
def test_sentencizer(en_vocab):
    doc = Doc(en_vocab, words=["Hello", "!", "This", "is", "a", "test", "."])
    sentencizer = Sentencizer()
    doc = sentencizer(doc)
    assert doc.is_sentenced
    sent_starts = [t.is_sent_start for t in doc]
    assert sent_starts == [True, False, True, False, False, False, False]
    assert len(list(doc.sents)) == 2
def break_into_sentences(txt):
    nlp = English()
    # create_pipe returns the built-in sentencizer component
    # (the earlier bare Sentencizer() instantiation was dead code and is dropped)
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)
    doc = nlp(txt)
    span_list = list(doc.sents)
    sentence_list = [t.text for t in span_list]
    return sentence_list
def __init__(self, nlp):
    """"""
    self.nlp = nlp
    self.sentencizer = Sentencizer()
    # https://github.com/explosion/spaCy/issues/3569
    try:
        self.nlp.add_pipe(self.sentencizer, first=True)
    except:
        # already added
        pass
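# A more explicit variant of the guard above -- a sketch, assuming the
# component registers under its default name "sentencizer" -- checks the
# pipeline instead of relying on a bare except that would also swallow
# unrelated errors:
def __init__(self, nlp):
    self.nlp = nlp
    self.sentencizer = Sentencizer()
    if "sentencizer" not in self.nlp.pipe_names:
        self.nlp.add_pipe(self.sentencizer, first=True)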
def test_sentencizer(en_vocab):
    doc = Doc(en_vocab, words=["Hello", "!", "This", "is", "a", "test", "."])
    sentencizer = Sentencizer(punct_chars=None)
    doc = sentencizer(doc)
    assert doc.has_annotation("SENT_START")
    sent_starts = [t.is_sent_start for t in doc]
    sent_ends = [t.is_sent_end for t in doc]
    assert sent_starts == [True, False, True, False, False, False, False]
    assert sent_ends == [False, True, False, False, False, False, True]
    assert len(list(doc.sents)) == 2
def remove_duplicates(txt):
    # first check via "\n" to remove duplicates
    new_txt = unique(txt.split("\n"))
    nlp = English()
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)
    # nlp expects a single string, so rejoin the de-duplicated lines
    doc = nlp("\n".join(new_txt))
    span_list = list(doc.sents)
    sentence_list = [t.text for t in span_list]
    summary = unique(sentence_list)
    return summary
def __init__(self, archive_path: str, predictor_name: str, text_path: str,
             cuda_device: int, language: str = "en_core_web_sm"):
    # use the cuda_device argument instead of hard-coding device 0
    archive = load_archive(archive_path, cuda_device=cuda_device)
    self.predictor = TextPredictor.from_archive(archive, predictor_name=predictor_name)
    self._nlp = spacy.load(language)
    sentencizer = Sentencizer()
    self._nlp.add_pipe(sentencizer)
    self.text = self.read_lines(text_path)
def tokenize(full_input_path, corpus):
    with open(full_input_path, "r") as f:
        text = f.read()
    text = re.sub("\n", "", text)
    text = re.sub("\ufeff", "", text)
    if corpus == "OCD":
        nlp = spacy.load("en_core_web_sm", disable=["tagger", "ner", "parser"])
        nlseg = NewLineSegmenter()
        nlp.add_pipe(nlseg.set_sent_starts, name='sentence_segmenter')
    elif corpus == "OE":
        nlp = spacy.load("en_core_web_sm", disable=["tagger", "ner", "parser"])
        sentencizer = Sentencizer(punct_chars=[".", "\n"])
        nlp.add_pipe(sentencizer)
    doc = nlp(text)
    tokenized_sents = []
    for sent in doc.sents:
        tokens = []
        for token in sent:
            if token.text == "(":
                token_text = "-LRB-"
            elif token.text == ")":
                token_text = "-RRB-"
            elif token.text in [":)", ":-)", ":(", ":-("]:
                token_text = "-EMOJI-"
            else:
                token_text = token.text
            tokens.append(token_text)
        tokenized_sent = " ".join(tokens)
        tokenized_sents.append(tokenized_sent)
    if corpus == "OCD":
        tokenized_text = "".join(tokenized_sents)
    elif corpus == "OE":
        non_empty_sents = []
        for sent in tokenized_sents:
            non_empty_sents.append(sent.lstrip())
        tokenized_text = "\n".join(non_empty_sents)
    return tokenized_text
def generate_knowledge_graph(text):
    doc_title = str(time.time())
    sentencizer = Sentencizer()
    doc = nlp(text)
    clean_data = []
    n = 0
    for sents in doc.sents:
        if len(str(sents).replace("\n", "")) > 0:
            clean = str(sents).replace("\n", "")
            if clean.strip() != "" and validateString(clean):
                clean_data.append(clean)
                n = n + 1
    print(n)
    entity_pairs = []
    for data in tqdm(clean_data):
        entity_pairs.append(get_entities(data))
    print("\nEntity Extraction completed")
    relations = [get_relation(i) for i in clean_data]
    source = []
    target = []
    edge = []
    indexes = []
    for i in tqdm(range(len(entity_pairs))):
        if validateAlpha(entity_pairs[i][0]) and validateAlpha(entity_pairs[i][1]) and validateString(relations[i]):
            ent1 = removeStop(entity_pairs[i][0])
            ent2 = removeStop(entity_pairs[i][1])
            rel = relations[i]
            if validateAlpha(ent1.lower()) and validateAlpha(ent2.lower()):
                source.append(ent1.lower().strip())
                target.append(ent2.lower().strip())
                edge.append(rel)
                indexes.append(i)
    print("\nTotal number of extracted pairs:", len(edge))
    print("\nEdges: ", edge)
    print("\nEntities: ", entity_pairs)
    if len(edge) == 0 or len(entity_pairs) == 0:
        return False
    else:
        G = nx.DiGraph(directed=True)
        for i in tqdm(range(len(edge))):
            G.add_weighted_edges_from([(source[i], target[i], i)])
        print("\nGraph generated")
        size = 20
        if len(edge) / 2 > 20:
            size = len(edge) / 2
        plt.figure(figsize=(size, size))
        edge_labels = dict([((u, v), edge[d['weight']]) for u, v, d in G.edges(data=True)])
        pos = nx.spring_layout(G, k=0.8)
        nx.draw(G, with_labels=True, node_color='skyblue', node_size=5000,
                edge_color='r', edge_cmap=plt.cm.Blues, pos=pos, font_size=20)
        nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=15)
        plt.title("KNOWLEDGE GRAPH FOR DOCUMENT: " + doc_title, fontdict={'fontsize': 50})
        plt.savefig(os.path.join(IMAGE_DIR, doc_title + ".png"))
        return os.path.join(IMAGE_DIR, doc_title + ".png")
def test_sentencizer_across_scripts(lang, text):
    nlp = spacy.blank(lang)
    sentencizer = Sentencizer()
    nlp.add_pipe(sentencizer)
    doc = nlp(text)
    assert len(list(doc.sents)) > 1
def prepare_model(model="en_core_web_md"):
    nlp = spacy.load(model)
    sentencizer = Sentencizer()
    nlp.add_pipe(sentencizer, before="parser")
    return nlp
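# Most snippets in this collection pass a Sentencizer instance to add_pipe,
# which is the spaCy v2 API. In spaCy v3 the built-in factory is added by
# name instead. A minimal sketch, assuming spaCy v3 and the same model:
import spacy

nlp = spacy.load("en_core_web_md")
# Add the built-in sentencizer factory by name; punctuation characters can
# optionally be overridden via the component config.
nlp.add_pipe("sentencizer", before="parser",
             config={"punct_chars": [".", "?", "!"]})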
import spacy
import string
from spacy.pipeline import Sentencizer

nlp = spacy.load("de_core_news_md")
sentencizer = Sentencizer(punct_chars=[char for char in string.punctuation])
nlp.add_pipe(sentencizer, name="sentence_segmenter", before="parser")


def get_oie(corpus):
    # decision logic for extracting roots and terms - for better analysis sentences are passed as well
    roots = []
    terms = []
    sents = []
    doc = nlp(corpus.lower())
    for sent in doc.sents:
        t = set()
        # get sentences
        sents.append(sent.text)
        # get important tokens from sentence
        pd, oc, ng = "", "", ""
        for token in sent:
            if token.dep_ == "pd":
                pd = token.lemma_
            if token.dep_ == "oc":
                oc = token.lemma_
            if token.dep_ == "ng" and token.head.dep_ == "ROOT":
def sentencizer():
    return Sentencizer()
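# This bare helper reads like the body of a pytest fixture; if so (an
# assumption, the decorator is not shown in the excerpt), the registration
# would look roughly like this:
import pytest
from spacy.pipeline import Sentencizer

@pytest.fixture
def sentencizer():
    return Sentencizer()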
""" Tests for backend/nlp/src/services/dictionary.py """ import spacy from spacy.pipeline import Sentencizer from services import dictionary from shared.tests.base import TestsBaseClass nlp = spacy.load("en_core_web_sm") sentencizer = Sentencizer() nlp.add_pipe(sentencizer, before="parser") sample_text = "You are not prepared!" doc = nlp(sample_text) for sentence in doc.sents: for token in sentence: relic = dictionary.Relic(token, sentence) break break class DictionaryTests(TestsBaseClass): """ Tests for backend/nlp/src/services/dictionary.py """ def test_01_test_relic_class_init(self): """ Should contain required properties. """
def extract_segments(file_text, file_parse, corpus):
    """
    :param file_text: absolute path to the text file, where each line is a sentence
    :param file_parse: absolute path to the parse file, where each line is a parsed sentence as a phrase-structure tree
    :return segments_dict: dict, where each key is a segment id, and each value is a dictionary with the text, start index and end index of the segment
    """
    with open(file_text, "r", encoding="utf-8") as f:
        doc_text = f.read()
    if corpus == "OCD":
        nlp = spacy.load("en_core_web_sm", disable=["tagger", "ner", "parser"])
        nlseg = NewLineSegmenter()
        nlp.add_pipe(nlseg.set_sent_starts, name='sentence_segmenter')
    elif corpus == "OE":
        nlp = spacy.load("en_core_web_sm", disable=["tagger", "ner", "parser"])
        sentencizer = Sentencizer(punct_chars=["."])
        nlp.add_pipe(sentencizer)
    doc = nlp(doc_text)
    with open(file_parse, "r") as f:
        sent_parses = f.readlines()
    segments_list = []
    sent_index = 0
    for sent, parse in zip(doc.sents, sent_parses):
        tokens = {}
        for index, token in enumerate(sent):
            tokens[index] = {}
            start = token.idx
            end = token.idx + len(token.text)
            tokens[index]["start"] = start
            tokens[index]["end"] = end
            tokens[index]["text"] = doc_text[start:end]
        # get segments from the parse tree
        t = Tree.fromstring(parse)
        for index, treepos in enumerate(t.treepositions("leaves")):
            t[treepos] = index
        segments_ids = []
        for st in t.subtrees():
            # save segment if it is not already saved and if it does not contain only terminals (height=2)
            if corpus == "OCD":
                # exclude punctuation leaves
                st_leaves = [
                    leaf[0] for leaf in st.pos()
                    if leaf[1] not in ["#", "$", '"', "``", "(", ")", "-LRB-", "-RRB-", ",", ":", "."]
                ]
            elif corpus == "OE":
                st_leaves = st.leaves()
            if st_leaves not in segments_ids and st.height() > 2 and len(st_leaves) > 0:
                segments_ids.append(st_leaves)
        for index, segment in enumerate(segments_ids):
            # shift the indices to fit the bug in the eval script
            segment_start = tokens[segment[0]]["start"] + sent_index
            segment_end = tokens[segment[-1]]["end"] + sent_index
            segment_text = doc_text[segment_start:segment_end]
            segments_list.append({
                "start": segment_start,
                "end": segment_end,
                "text": segment_text
            })
        sent_index += 1
    segments_dict = {}
    for index, data in enumerate(segments_list):
        segments_dict[index] = data
    return segments_dict
import sys

import numpy as np
import librosa
import argparse
import torch
import soundfile as sf
import pyrubberband as pyrb
import re
import spacy
from spacy.pipeline import Sentencizer

# Prepare NLP pipeline
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
sentencizer = Sentencizer(punct_chars=[".", "?", "!", ":", "..."])
nlp.add_pipe(sentencizer)

"""#### Prepare the models"""

# Print some environment information (for debugging purposes)
print("Running a test of your configuration...\n")

if not torch.cuda.is_available():
    print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
          "for deep learning, ensure that the drivers are properly installed, and that your "
          "CUDA version matches your PyTorch installation. CPU-only inference is currently "
          "not supported.", file=sys.stderr)
    quit(-1)

device_id = torch.cuda.current_device()
def build_sentencizer(self, verbose=False):
    self.nlp_sentencizer = English()
    sentencizer = Sentencizer()
    self.nlp_sentencizer.add_pipe(component=sentencizer)
    if verbose:
        print("pipe names: {}".format(self.nlp_sentencizer.pipe_names))
def custom_sentencizer(texts):
    # disabling Named Entity Recognition for speed
    nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

    boundary_1 = re.compile(r'\b(fig)\b|\b(ex)\b|\b[a-z]\b|\b(prof)\b|'
                            r'\b(eg)\b|\b(etc)\b|\b(sp)\b|\b(an)\b|'
                            r'\b(pp)\b|\b(vol)\b|\b(col)\b')
    boundary_2_2 = re.compile(r'\b(et)\b')
    boundary_2_1 = re.compile(r'\b(al)\b')
    boundary_p = re.compile(r'[.?!:;’“”\"\'0-9]|(wAnM.)')
    boundary_a = re.compile(r'\b(th)\b|\b(st)\b|\b(nd)\b|\b(rd)\b|[-]')
    boundary_d = re.compile(r'[0-9]')

    # Utility functions
    def custom_seg_1(doc):
        prev = doc[0].text
        length = len(doc)
        for index, token in enumerate(doc):
            if (token.text == '.' and boundary_1.match(prev.lower())
                    and index != (length - 1)):
                doc[index + 1].sent_start = False
            prev = token.text
        return doc

    def custom_seg_2(doc):
        length = len(doc)
        # If single token, return
        if length < 2:
            return doc
        # If multiple tokens, apply rule
        else:
            prev_2 = doc[0].text
            prev_1 = doc[1].text
            for index, token in enumerate(doc):
                if index > 0:
                    if (((token.text == '.') | (token.text == '.,') | (token.text == '.('))
                            and boundary_2_2.match(prev_2.lower())
                            and boundary_2_1.match(prev_1.lower())
                            and index != (length - 1)):
                        if ((doc[index + 1].text == ',') or (doc[index + 1].text == '(')):
                            doc[index + 2].sent_start = False
                        else:
                            doc[index + 1].sent_start = False
                    prev_1 = token.text
                    prev_2 = doc[index - 1].text
            return doc

    def custom_seg_3(doc):
        prev = doc[0].text
        length = len(doc)
        for index, token in enumerate(doc):
            if ((token.text == '\n' or token.text == '\n ')
                    and not boundary_p.match(prev.lower())
                    and index != (length - 1)):
                doc[index + 1].sent_start = False
            prev = token.text
        return doc

    def custom_seg_4(doc):
        succ = doc[1].text
        length = len(doc)
        for index, token in enumerate(doc):
            if index < (length - 2):
                if ((token.text == '\n' or token.text == '\n ')
                        and boundary_a.match(succ.lower())):
                    doc[index + 1].sent_start = False
                succ = doc[index + 2].text
        return doc

    def custom_seg_5(doc):
        prev = doc[0].text
        succ = doc[1].text
        length = len(doc)
        for index, token in enumerate(doc):
            if index < (length - 2):
                if ((token.text == '\n' or token.text == '\n ')
                        and boundary_d.match(prev.lower())):
                    # `~succ.isupper()` was always truthy (bitwise not on a bool);
                    # use a boolean `not` instead
                    if not succ.isupper():
                        doc[index + 1].sent_start = False
                prev = token.text
                succ = doc[index + 2].text
        return doc

    def brief_cleaning_fun(text):
        text = BeautifulSoup(html.unescape(text), 'lxml').text  # remove HTML tags
        text = re.sub(r'https?://\S+', '', str(text))  # remove URLs
        # Remove question marks inside parentheses (they mess up sentence splitting)
        text = re.sub(r'\(\?', r'\(', str(text))
        text = re.sub(r'\?\)', r'\)', str(text))
        text = re.sub(r'(\? \()', r' \(', str(text))
        # Replace \xa0\n tags with space
        text = re.sub(r'(\xa0\n)', ' ', str(text))
        text = re.sub(r'(\xa0 \n)', ' ', str(text))
        # Replace -e.g. with e.g.
        text = re.sub(r'(-e.g.)', 'e.g.', str(text))
        # Replace \xa0 tags with space
        text = re.sub(r'(\xa0)', ' ', str(text))
        # Remove newline characters between brackets
        text = re.sub(r'(?s)(?<=[\(]).*?(?=[\)])',
                      lambda x: x.group().replace('\n', ' '), str(text))
        # Replace space between linebreak characters
        text = re.sub(r'(?<=(\n)) *(?=(\n))', '', str(text))
        # Replace multiple occurrences of whitespace characters with a single one
        text = re.sub(r'(\s)\1{1,}', r'\1', str(text))
        # Replace square brackets
        text = re.sub(r'[\[\]]', '', str(text))
        # Replace occurrences of newline character before a comma
        text = re.sub(r'(\n,)|(\n ,)', r',\n', str(text))
        # keep only certain characters
        text = re.sub(r"[^a-zA-Z0-9,'‘’“”\":;.?!\(\)\-(\n)]", ' ', str(text))
        return text

    def sentence_cleaning_fun(text):
        if not text:
            text = 'nan'
        return text

    def sentencization(doc):
        # Accept only documents with more than two words (one word and one punctuation.)
        if len(doc) > 2:
            text = [
                sentence_cleaning_fun(token.text).strip().split()
                for token in doc.sents
            ]
            # Remove short sentences
            text = [' '.join(x) for x in text if len(x) > 1]
            # Remove bracket characters if they are at the end of a sentence.
            # Remove space between digit and "st", "nd", "rd" and "th"
            for i, x in enumerate(text):
                if x[-1] in ['(', ')']:
                    text[i] = x[:-1]
                text[i] = re.sub(
                    r"(?i)\b[0-9][0-9]*\b (\bst\b|\bnd\b|\brd\b|\bth\b)",
                    lambda x: x.group().replace(' ', ''), str(text[i]))
            if text:
                return text
            else:
                return 'nan'
        else:
            return 'nan'

    sentencizer = Sentencizer(punct_chars=[
        ".", "?", "!", "\n", "\n ", "\n\n", "\n \n ", "\n\n\n", "\n \n \n ",
        "\n\n\n\n", "\n \n \n \n ", "\n\n\n\n\n", "\n \n \n \n \n "
    ])
    nlp.add_pipe(sentencizer)
    nlp.add_pipe(custom_seg_1, after='sentencizer')
    nlp.add_pipe(custom_seg_2, after='custom_seg_1')
    nlp.add_pipe(custom_seg_3, after='custom_seg_2')
    nlp.add_pipe(custom_seg_4, after='custom_seg_3')
    nlp.add_pipe(custom_seg_5, after='custom_seg_4')

    brief_cleaning = (brief_cleaning_fun(row) for row in texts)
    texts_processed = [
        sentencization(doc)
        for doc in tqdm(nlp.pipe(brief_cleaning, batch_size=25), total=len(texts))
    ]
    return texts_processed
import spacy
import string
import re
from collections import Counter
from spacy.pipeline import Sentencizer
from spacy.matcher import PhraseMatcher

nlp = spacy.load("de_core_news_md")
sentencizer = Sentencizer(punct_chars=[".", "?", "!", ",", ";", ":"])
nlp.add_pipe(sentencizer, name="sentence_segmenter", before="parser")


def get_abbr():
    # load list of German abbreviations for normalizing
    # taken from https://de.wiktionary.org/wiki/Kategorie:Abk%C3%BCrzung_(Deutsch)
    with open("resources/abbreviations_ger.txt", "r", encoding="utf-8") as f:
        x = f.readlines()
    abbreviations = [item.rstrip('\n') for item in x]
    return abbreviations


def sentences(corpus, no_questions):
    # potentially add root form support
    terms = [
        "wohin", "wie", "woher", "was", "wieso", "warum", "wer", "welche",
        "wen", "wem", "wo", "?"
    ]
    matcher = PhraseMatcher(nlp.vocab)
def craft_input_to_bolstm():
    """Convert the documents in the CRAFT corpus to the input structure of BO-LSTM."""

    # Sentence segmentation using Spacy
    nlp = English()
    sentencizer = Sentencizer()
    nlp.add_pipe(sentencizer)

    # Parse each document in corpus directory
    corpus_dir = "chebi_craft_corpus/"
    docs_list = os.listdir(corpus_dir)

    for idoc, file in enumerate(docs_list):
        if file[-3:] == "xmi":
            file_path = corpus_dir + file
            file_id = str(file[:-4])

            # Retrieve the entire document text
            tree = ET.parse(file_path)
            root = tree.getroot()
            for child in root:
                if child.tag == "{http:///uima/cas.ecore}Sofa":
                    document_text = child.attrib["sofaString"]

            # Import annotations from annotations file into annotation_list
            annotation_list = []
            annotation_file = open(file_path[:-3] + "ann", "r")
            for line in annotation_file.readlines():
                entity_text = line.split("\t")[2].strip("\n")
                ontology_id = line.split("\t")[1].split(" ")[0].replace("_", ":")
                offset_begin = int(line.split("\t")[1].split(" ")[1])
                offset_end = int(line.split("\t")[1].split(" ")[2].split(";")[0])
                annotation_list.append((entity_text, ontology_id, offset_begin, offset_end))
            annotation_file.close()

            # Create the xml tree for output file
            new_root = ET.Element("document")
            new_root.set("id", file_id)

            # Iterate over each sentence in document
            docSpacy = nlp(document_text)
            sentence_count, token_count = 0, 0
            for sentence in docSpacy.sents:
                sentence_count += 1
                begin_offset = token_count + 1
                token_count += len(sentence.text) + 1
                final_offset = token_count
                sentence_id = str(file_id) + ".s" + str(sentence_count)
                entity_count = 0
                entity_check = []

                # Create xml structure for sentence
                new_sentence = ET.SubElement(new_root, "sentence")
                new_sentence.set("id", sentence_id)
                new_sentence.set("text", sentence.text)

                # Check if there is any annotation present in the current sentence
                valid_entities_list = []
                for annotation in annotation_list:
                    if annotation[2] >= begin_offset and annotation[2] <= final_offset:
                        # There is an annotation in this sentence
                        entity_text = annotation[0]
                        if entity_text not in entity_check:
                            # The entity was not added to sentence;
                            # update the entity offset in sentence context
                            entity_begin_offset = sentence.text.find(entity_text)
                            if entity_begin_offset > -1:
                                entity_count += 1
                                entity_id = sentence_id + ".e" + str(entity_count)
                                entity_final_offset = entity_begin_offset + len(entity_text) - 1
                                entity_offset = str(entity_begin_offset) + "-" + str(entity_final_offset)
                                entity_check.append(entity_text)
                                valid_entities_list.append(entity_id)

                                # Create xml structure for annotation
                                new_entity = ET.SubElement(new_sentence, "entity")
                                new_entity.set("id", entity_id)
                                new_entity.set("charOffset", entity_offset)
                                new_entity.set("type", "chebi")
                                new_entity.set("text", entity_text)
                                new_entity.set("ontology_id", annotation[1])

                # Create xml structure for pairs of entities in sentence
                pair_count = 0
                pair_check = []
                for valid_entity in valid_entities_list:
                    for valid_entity_2 in valid_entities_list:
                        print(valid_entity)
                        if valid_entity != valid_entity_2:
                            # Create a pair between two different entities
                            pair_check_id1 = valid_entity + "_" + valid_entity_2
                            pair_check_id2 = valid_entity_2 + "_" + valid_entity
                            if pair_check_id1 not in pair_check and pair_check_id2 not in pair_check:
                                # Prevent duplicate pairs
                                pair_count += 1
                                pair_id = sentence_id + ".p" + str(pair_count)
                                pair_check.append(pair_check_id1)
                                pair_check.append(pair_check_id2)
                                new_pair = ET.SubElement(new_sentence, "pair")
                                new_pair.set("id", pair_id)
                                new_pair.set("e1", valid_entity)
                                new_pair.set("e2", valid_entity_2)
                                new_pair.set("ddi", "false")

            # Create an .xml output file
            ET.ElementTree(new_root).write("./bolstm/converted_chebi_craft/" + file_id + ".xml",
                                           xml_declaration=True)
import spacy
from spacy.pipeline import Sentencizer
import pandas as pd

from preprocess import preprocess, construct_spacy_obj
import ft
import train
from feature_extraction import feature_extraction
from classifiation import classify

nlp = spacy.load('en_core_web_sm')
sentencizer = Sentencizer(punct_chars=[".", "!", "?", "\n", "\r", ";"])
nlp.add_pipe(sentencizer)

ft_model = ft.get_model()
model = train.get_model(nlp, ft_model)


def get_features_and_classification(filename):
    df = pd.read_csv("csv_files/" + filename, header=None, names=['reviewText', 'rating'])
    df = preprocess(df, nlp)
    df = construct_spacy_obj(df, nlp)
    features = feature_extraction(df, ft_model, nlp)
    result, _, __ = classify(df, features, model)
    return features, result
def __init__(self, dataset=None, entity_labels=None, no_rel_label=None, no_rel_multiple=False,
             sentence_align=False, test=False, same_entity_relation=False, write_Entites=False,
             generalize=False, parallelize=False, no_of_cores=64, predictions_folder=None,
             de_sample=None):
    """
    Data files are read in and the sentence where the entity pair is located is segmented
    into 5 along with the labels and the track information (file number, entity1 and entity2)
    that helps to write predictions back to file.

    :param dataset: path to dataset
    :param predictions_folder: path to predictions (output) folder
    :param entity_labels: labels of the list of entities that create the relations
    :param no_rel_label: name of the label used when entities that do not have relations in a sentence are considered
    :param no_rel_multiple: flag whether multiple labels are possible for No-relation
    :param sentence_align: options to break sentences
    :param test: flag to run test-segmentation options
    :param same_entity_relation: flag when relation exists between same type of entities
    :param de_sample: flag to reduce the number of samples
    :param generalize: flag when relations are not dependent on the first given relation label
    :param parallelize: flag to parallelize the segmentation
    :param no_of_cores: number of cores to run the parallelized segmentation
    :param write_Entites: write entities and predictions to file
    :param with_labels: take the labels of the entities into consideration during segmentation
    """
    self.predictions_folder = predictions_folder
    self.dataset = dataset
    self.entity_labels = entity_labels
    self.test = test
    self.same_entity_relation = same_entity_relation
    self.generalize = generalize
    self.parallelize = parallelize
    self.write_Entites = write_Entites
    self.nlp_model = English()
    self.nlp_model.max_length = 2000000

    if no_rel_label:
        self.no_rel_label = no_rel_label
    else:
        self.no_rel_label = False
    self.no_rel_multiple = no_rel_multiple

    if de_sample:
        self.de_sample = de_sample
    else:
        self.de_sample = False

    if sentence_align:
        sentencizer = Sentencizer(punct_chars=["\n"])
    else:
        sentencizer = Sentencizer(punct_chars=["\n", ".", "?"])

    if self.write_Entites and self.predictions_folder is not None:
        ext = ".ann"
        file.delete_all_files(predictions_folder, ext)

    self.nlp_model.add_pipe(sentencizer)
    # self.nlp_model = spacy.load('en_core_web_sm')

    # global segmentation object that returns all segments and the label
    self.segments = {
        'seg_preceding': [],
        'seg_concept1': [],
        'seg_concept2': [],
        'seg_concept1_label': [],
        'seg_concept2_label': [],
        'seg_middle': [],
        'seg_succeeding': [],
        'sentence': [],
        'label': [],
        'track': []
    }

    # if parallelize flag is true
    if self.parallelize:
        # Pool object which offers a convenient means of parallelizing the execution of a function
        # across multiple input values, distributing the input data across processes
        pool = Pool(no_of_cores)
        all_args = []
        for datafile, txt_path, ann_path in self.dataset:
            all_args.append([datafile, txt_path, ann_path])
        segments_file = pool.map(self.process_file_parallel, all_args)
        pool.close()
        pool.join()

        # count = 0
        # for i in range(len(segments_file)):
        #     count = count + len(segments_file[i]['label'])
        # print(count)

        for segment in segments_file:
            # Add lists of segments to the segments object for the dataset
            self.segments['seg_preceding'].extend(segment['preceding'])
            self.segments['seg_concept1'].extend(segment['concept1'])
            self.segments['seg_middle'].extend(segment['middle'])
            self.segments['seg_concept2'].extend(segment['concept2'])
            self.segments['seg_succeeding'].extend(segment['succeeding'])
            self.segments['sentence'].extend(segment['sentence'])
            self.segments['track'].extend(segment['track'])
            # if not self.test:
            self.segments['label'].extend(segment['label'])
            # self.segments['seg_concept1_label'].extend(segment['concept1_label'])
            # self.segments['seg_concept2_label'].extend(segment['concept2_label'])
    else:
        segment = self.process_file_serial(dataset)
        # Add lists of segments to the segments object for the dataset
        self.segments['seg_preceding'].extend(segment['preceding'])
        self.segments['seg_concept1'].extend(segment['concept1'])
        self.segments['seg_middle'].extend(segment['middle'])
        self.segments['seg_concept2'].extend(segment['concept2'])
        self.segments['seg_succeeding'].extend(segment['succeeding'])
        self.segments['sentence'].extend(segment['sentence'])
        self.segments['track'].extend(segment['track'])
        # if not self.test:
        self.segments['label'].extend(segment['label'])
        self.segments['seg_concept1_label'].extend(segment['concept1_label'])
        self.segments['seg_concept2_label'].extend(segment['concept2_label'])

    if not self.test:
        # print(set(self.segments['label']))
        # print the number of instances of each relation class
        print([(i, self.segments['label'].count(i)) for i in set(self.segments['label'])])

    # write the segments to a file
    file.list_to_file('sentence_test', self.segments['sentence'])
    file.list_to_file('preceding_seg', self.segments['seg_preceding'])
    file.list_to_file('concept1_seg', self.segments['seg_concept1'])
    file.list_to_file('middle_seg', self.segments['seg_middle'])
    file.list_to_file('concept2_seg', self.segments['seg_concept2'])
    file.list_to_file('succeeding_seg', self.segments['seg_succeeding'])
    file.list_to_file('track_test', self.segments['track'])