def __init__(self, inputfile, status, language, pair_id):
    # Read the data from the file and save it in a list called 'alignsegments'
    self.inputfile = '/tmp/texthammerparsing/{}/parsed/{}/{}'.format(
        pair_id, language, inputfile)
    self.code = inputfile
    self.status = status
    self.language = language
    self.haserrors = False
    try:
        with open(self.inputfile, 'r') as f:
            raw = f.read()
        # Remove all irrelevant comment lines but keep the special
        # "# ...split" marker comments used for segmenting the text
        conllinput = "\n".join([
            line for line in raw.splitlines()
            if not re.search("^#", line) or re.search("^# [a-z]+split", line)
        ])
    except UnicodeDecodeError:
        msg = "Encoding error! Id of the text: {}\n".format(self.code)
        logging.info(msg)
        self.haserrors = True
    if not self.haserrors:
        # This is only needed for multilingual aligned files
        self.alignsegments = TrimList(
            re.split("# " + getConf("segmentsplit"), conllinput))
        self.conllinput = conllinput
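
# Illustration only (hypothetical helper, not used by the pipeline): a
# standalone sketch of the comment-filtering rule applied above. Plain
# CoNLL-U comments are dropped; the "# ...split" marker comments are kept
# so that the text can still be segmented later.
def _example_filter_comments(raw):
    import re
    # _example_filter_comments("# newdoc\n# paragraphsplit\n1\tSana\tsana")
    # -> "# paragraphsplit\n1\tSana\tsana"
    return "\n".join([
        line for line in raw.splitlines()
        if not re.search("^#", line) or re.search("^# [a-z]+split", line)
    ])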
def MarkParagraphs(self, justfixing):
    """
    Mark the place of paragraphs by using a predefined pattern
    """
    pattern = "\n" + "###C:" + getConf("paragraphsplit") + "\n"
    if not justfixing:
        # NOTE: empty lines are completely removed from the input
        # (see the list comprehension inside the join)
        self.output = pattern.join(
            [thisline for thisline in self.lines if thisline])
    else:
        self.output = "\n".join(
            [thisline for thisline in self.lines if thisline])
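
# Illustration only: a minimal standalone sketch of the joining step
# above. "PARAGRAPH" stands in for the configured marker returned by
# getConf("paragraphsplit"); empty lines are dropped before joining.
def _example_mark_paragraphs(lines):
    pattern = "\n###C:PARAGRAPH\n"
    # _example_mark_paragraphs(["First para.", "", "Second para."])
    # -> "First para.\n###C:PARAGRAPH\nSecond para."
    return pattern.join([line for line in lines if line])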
def FilterSentencesAndParagraphs(self, justfixing):
    """
    Run filters in order to strip out sentences that are too long to parse
    """
    pattern = "\n" + "###C:" + getConf("paragraphsplit") + "\n"
    if not justfixing:
        # Note: the sentences are filtered in order to detect sentences
        # too long to parse; see longsentencelog.txt and FilterLongSentences.py
        self.output = FilterLongSentences.FilterByCharCount(
            self.output, self.filename, True, pattern)
    else:
        self.output = FilterLongSentences.FilterByCharCount(
            self.output, self.filename, True, "\n")
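
# Illustration only: FilterLongSentences.FilterByCharCount is defined in
# FilterLongSentences.py; this hypothetical sketch shows only the general
# idea of a character-count filter. The 1500-character limit is an
# arbitrary placeholder, not the limit the real filter uses.
def _example_filter_by_charcount(sentences, maxchars=1500):
    return [s for s in sentences if len(s) <= maxchars]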
def parseFiles(pair_id, parserpath):
    """
    Sends all the language files in the document identified by pair_id
    to the parser and captures the output

    - pair_id: the unique id of a source file
    - parserpath: path to the Turku neural parser installation
    """
    if parserpath[-1] != "/":
        parserpath += "/"
    # TODO: select which model if multiple available (rcfile?)
    models = getConf("models")
    # Use the parser's virtual env
    python_bin = parserpath + "venv-parser-neural/bin/python3"
    script_file = parserpath + "full_pipeline_stream.py"
    parsed_dir = "/tmp/texthammerparsing/{}/parsed".format(pair_id)
    log_dir = "/tmp/texthammerparsing/parserlog/"
    os.makedirs(parsed_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    prepared_dir = "/tmp/texthammerparsing/" + pair_id + "/prepared/"
    for lang in os.listdir(prepared_dir):
        lang = lang.replace("/", "")
        langdir = parsed_dir + "/" + lang
        os.makedirs(langdir, exist_ok=True)
        logging.info(
            "Starting to parse the files in the following language: " + lang)
        for f in glob.glob(prepared_dir + lang + "/*"):
            code = os.path.basename(f)
            logging.info("Starting to parse the following file: " + code)
            with open(f, 'rb') as payload:
                headers = {'content-type': 'text/plain; charset=utf-8'}
                response = requests.post(
                    'http://localhost:{}'.format(getPortForLanguage(lang)),
                    data=payload, verify=False, headers=headers)
            # Use a separate name for the output handle so that the loop
            # variable f (the input path) is not shadowed
            with open(parsed_dir + "/" + lang + "/" + code, "w") as outfile:
                outfile.write(response.text)
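
# Usage sketch (the pair id and installation path are hypothetical):
# parseFiles assumes one parser server is already listening per language,
# on the ports given in the configuration (see getPortForLanguage below).
def _example_parse_pair():
    # Parses every prepared language file of the pair "a1b2c3"; the
    # output ends up in /tmp/texthammerparsing/a1b2c3/parsed/<lang>/
    parseFiles("a1b2c3", "/opt/turku-parser/")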
def LoopThroughSentences(self, nopara):
    """
    Runs through each sentence and forms an xml representation of it

    - nopara: if set, will not try to mark paragraphs
    """
    if nopara:
        # Strip the remaining comment lines and split into sentences at
        # the blank lines that separate them in CoNLL-U
        raw = "\n".join([
            s for s in self.sl_text.conllinput.splitlines()
            if not re.search("^#", s)
        ])
        sentences = re.split("\n\n", raw)
        for sentence in sentences:
            self.current_s = etree.SubElement(self.root, "s")
            processed = self.ProcessWordsOfSegment(sentence.splitlines(),
                                                   self.sl_text)
            if not processed:
                return False
    else:
        paragraphs = TrimList(
            re.split("# " + getConf("paragraphsplit"),
                     self.sl_text.conllinput))
        paragraphs = [p for p in paragraphs if not re.search("^#", p)]
        for idx, paragraph in enumerate(paragraphs):
            # Start a new paragraph and a new sentence
            linesofparagraph = paragraph.splitlines()
            processed = True
            if any(linesofparagraph):
                # If not an empty paragraph
                self.current_p = etree.SubElement(self.root, "p")
                self.current_s = etree.SubElement(self.current_p, "s")
                processed = self.ProcessWordsOfSegment(
                    paragraph.splitlines(), self.sl_text)
            if not processed:
                return False
    return True
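
# Illustration only: sentences in CoNLL-U are separated by blank lines,
# which is what the re.split("\n\n", ...) above relies on. A standalone
# sketch of the splitting logic in the nopara branch:
def _example_split_sentences(conllinput):
    import re
    # _example_split_sentences("1\tHe\n2\truns\n\n1\tShe\n2\tsits")
    # -> ["1\tHe\n2\truns", "1\tShe\n2\tsits"]
    stripped = "\n".join([s for s in conllinput.splitlines()
                          if not re.search("^#", s)])
    return re.split("\n\n", stripped)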
def getPortForLanguage(lang):
    """
    Returns the port of the parser server configured for the given language
    """
    ports = getConf('ports')
    return ports[lang]
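
# Usage sketch (hypothetical port numbers): assuming getConf('ports')
# returns {"fi": 7689, "ru": 7690}, the parser URL for Finnish is built as
def _example_parser_url(lang="fi"):
    # -> 'http://localhost:7689' with the configuration above
    return 'http://localhost:{}'.format(getPortForLanguage(lang))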