def __init__(self, inputfile, status, language, pair_id):
    #Read the data from the file and save it in a list called 'alignsegments'
    self.inputfile = '/tmp/texthammerparsing/{}/parsed/{}/{}'.format(
        pair_id, language, inputfile)
    self.code = inputfile
    self.status = status
    self.language = language
    self.haserrors = False
    try:
        with open(self.inputfile, 'r') as f:
            raw = f.read()
        #Remove all comment lines except the segment / paragraph split markers
        conllinput = "\n".join([
            line for line in raw.splitlines() if not re.search("^#", line)
            or re.search("^# [a-z]+split", line)
        ])
    except UnicodeDecodeError:
        msg = "Encoding error! Id of the text: {}\n".format(self.code)
        logging.error(msg)
        #Logger.loggederrors.append(msg)
        self.haserrors = True
    if not self.haserrors:
        #This is only needed for multilingual aligned files
        self.alignsegments = TrimList(
            re.split("# " + getConf("segmentsplit"), conllinput))
        self.conllinput = conllinput
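
A minimal usage sketch of this constructor; the class name ParsedText is hypothetical (the snippet does not show it), and the input file is assumed to already exist under /tmp/texthammerparsing/<pair_id>/parsed/<language>/:

# Hypothetical class name and ids, for illustration only
text = ParsedText("0001_fi.conllu", "source", "fi", "pair_0001")
if not text.haserrors:
    print("Read {} aligned segments".format(len(text.alignsegments)))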
Example #2
def MarkParagraphs(self, justfixing):
    """
    Mark the place of paragraphs by using a predefined pattern
    """
    pattern = "\n" + "###C:" + getConf("paragraphsplit") + "\n"
    if not justfixing:
        #NOTE: empty lines are completely removed from the input
        #(see the filter inside the list comprehension)
        self.output = pattern.join(
            [thisline for thisline in self.lines if thisline])
    else:
        self.output = "\n".join(
            [thisline for thisline in self.lines if thisline])
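
To make the non-justfixing branch concrete, a small self-contained sketch; the return value of getConf("paragraphsplit") is invented for illustration:

#Assume, for illustration, that getConf("paragraphsplit") returns "paragraphsplit"
pattern = "\n" + "###C:" + "paragraphsplit" + "\n"
lines = ["First paragraph.", "", "Second paragraph."]
#Empty lines are dropped, so the marker is placed between the kept lines:
print(pattern.join([thisline for thisline in lines if thisline]))
# First paragraph.
# ###C:paragraphsplit
# Second paragraph.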
Example #3
def FilterSentencesAndParagraphs(self, justfixing):
    """
    Run filters in order to strip out sentences that are too long to parse
    """
    pattern = "\n" + "###C:" + getConf("paragraphsplit") + "\n"
    if not justfixing:
        #Note: the sentences are filtered in order to detect sentences
        #too long to parse; see longsentencelog.txt and FilterLongSentences.py
        self.output = FilterLongSentences.FilterByCharCount(
            self.output, self.filename, True, pattern)
    else:
        self.output = FilterLongSentences.FilterByCharCount(
            self.output, self.filename, True, "\n")
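
The calls above are all this snippet shows of FilterLongSentences; a hedged sketch of the same call with the arguments spelled out:

#Signature as used above; FilterLongSentences comes from this package
filtered = FilterLongSentences.FilterByCharCount(
    "Some text to filter.",   # the text whose overlong sentences are handled
    "mytext.txt",             # filename, presumably used for longsentencelog.txt
    True,                     # passed as True in both branches above; exact meaning not shown here
    "\n")                     # the pattern joining the kept material back together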
Example #4
# Imports implied by this snippet (getConf and getPortForLanguage are
# defined elsewhere in this package)
import glob
import logging
import os

import requests


def parseFiles(pair_id, parserpath):
    """
    Sends all the language files in the document identified by pair_id to the parser
    and captures the output

    - pair_id: the unique id of a source file
    - parserpath: path to the Turku neural parser installation

    """

    if not parserpath.endswith("/"):
        parserpath += "/"

    # TODO: select which model if multiple available (rcfile?)
    models = getConf("models")

    # Use the parser's virtual env
    python_bin = parserpath + "venv-parser-neural/bin/python3"
    script_file = parserpath + "full_pipeline_stream.py"
    parsed_dir = "/tmp/texthammerparsing/{}/parsed".format(pair_id)
    log_dir = "/tmp/texthammerparsing/parserlog/"
    os.makedirs(parsed_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    prepared_dir = "/tmp/texthammerparsing/" + pair_id + "/prepared/"

    for lang in os.listdir(prepared_dir):
        lang = lang.replace(r"/", "")
        langdir = parsed_dir + "/" + lang
        os.makedirs(langdir, exist_ok=True)
        logging.info(
            "Starting to parse the files in the following language: " + lang)
        for f in glob.glob(prepared_dir + lang + "/*"):
            code = os.path.basename(f)
            logging.info("Starting to parse the following file: " + code)

            with open(f, 'rb') as payload:
                headers = {'content-type': 'text/plain; charset=utf-8'}
                response = requests.post('http://localhost:{}'.format(getPortForLanguage(lang)),
                                         data=payload, verify=False, headers=headers)

            with open(parsed_dir + "/" + lang + "/" + code, "w") as out:
                out.write(response.text)
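
A hedged end-to-end sketch of driving this step, assuming the parser servers are already running on the configured ports; the pair id and installation path are placeholders:

# Placeholder arguments; adjust to your installation
logging.basicConfig(level=logging.INFO)
parseFiles("pair_0001", "/opt/Turku-neural-parser-pipeline")
# The parsed CoNLL-U files appear under
# /tmp/texthammerparsing/pair_0001/parsed/<language>/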
Example #5

def LoopThroughSentences(self, nopara):
    """
    Runs through each sentence and forms an xml representation of it

    - nopara: if set, will not try to mark paragraphs

    """
    if nopara:
        raw = "\n".join([
            s for s in self.sl_text.conllinput.splitlines()
            if not re.search("^#", s)
        ])
        sentences = re.split("\n\n", raw)
        for sentence in sentences:
            self.current_s = etree.SubElement(self.root, "s")
            processed = self.ProcessWordsOfSegment(sentence.splitlines(),
                                                   self.sl_text)
            if not processed:
                return False
    else:
        paragraphs = TrimList(
            re.split("# " + getConf("paragraphsplit"),
                     self.sl_text.conllinput))
        paragraphs = [p for p in paragraphs if not re.search("^#", p)]
        for idx, paragraph in enumerate(paragraphs):
            #start a new paragraph and a new sentence
            linesofparagraph = paragraph.splitlines()
            processed = True
            if any(linesofparagraph):
                #If not an empty paragraph
                self.current_p = etree.SubElement(self.root, "p")
                self.current_s = etree.SubElement(self.current_p, "s")
                processed = self.ProcessWordsOfSegment(
                    linesofparagraph, self.sl_text)
            if not processed:
                return False
    return True
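
The nopara branch relies on the CoNLL-U convention that sentence blocks are separated by blank lines; a minimal self-contained sketch of that split, with invented input:

import re

conllinput = "# sent_id = 1\n1\tHello\n2\t!\n\n1\tBye\n2\t.\n"
#Drop comment lines, then split the sentence blocks on blank lines
raw = "\n".join([s for s in conllinput.splitlines() if not re.search("^#", s)])
sentences = re.split("\n\n", raw)
# -> ['1\tHello\n2\t!', '1\tBye\n2\t.']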
Example #6
def getPortForLanguage(lang):
    """
    Returns the port of the parser server configured for the given language
    """
    ports = getConf('ports')
    return ports[lang]
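
A sketch of what the 'ports' configuration value presumably looks like; the mapping below is invented for illustration:

# Hypothetical value of getConf('ports'): one parser server port per language
ports = {"fi": 9876, "ru": 9877, "en": 9878}
print(ports["fi"])   # -> 9876
# A language missing from the configuration raises a KeyError in the lookup above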