Python PdfParser.PdfParser 예제들, idiomatic.parsers.PdfParser.PdfParser Python 예제들

예제 #1

0

파일 보기

    def content(self):
        """Return the dictionary text for the UDC acronym annex.

        The PDF at languageUsageCriteria2012PdfUrl is fetched through
        contentCache and read line by line. Everything up to and including
        the annex title is ignored; reading stops at the first line starting
        with u"ÚLTIMAS PUBLICACIÓNS". Inside the annex, lines are consumed in
        pairs: first the key, then the text stored under that key.

        Returns:
            A three-line header followed by everything yielded by
            formatEntriesAndCommentsForDictionary(entries, u"sigla").
        """
        localPath = contentCache.downloadFileIfNeededAndGetLocalPath(
            languageUsageCriteria2012PdfUrl)
        parser = PdfParser(localPath)

        annexTitle = (u"ANEXO V. RELACIÓN DE SIGLAS E ACRÓNIMOS MÁIS "
                      u"HABITUAIS DA UDC")
        # Lines repeated inside the annex that carry no entry data.
        repeatedHeaders = [u"CRITERIOS PARA O USO DA LINGUA", annexTitle]
        # Keys whose text spans two lines instead of one.
        multiLineKeys = [u"ASISTA"]

        entries = {}
        insideAnnex = False
        pendingKey = None

        for rawLine in parser.lines():
            text = rawLine.strip()

            if not insideAnnex:
                # The title line itself is consumed, never parsed.
                insideAnnex = text == annexTitle
                continue

            if text.startswith(u"ÚLTIMAS PUBLICACIÓNS"):
                break
            if text in repeatedHeaders or text.isdigit():
                continue

            if not pendingKey:
                pendingKey = text
                continue

            if pendingKey not in multiLineKeys:
                entries[pendingKey] = text
                pendingKey = None
            elif pendingKey in entries:
                # Second line of a two-line entry: append and close it.
                entries[pendingKey] += u" " + text
                pendingKey = None
            else:
                # First line of a two-line entry: keep the key pending.
                entries[pendingKey] = text

        header = (u"# Relación de siglas e acrónimos "
                  u"máis frecuentes na UDC\n")
        header += u"# {}\n".format(languageUsageCriteria2012PdfUrl)
        header += u"\n"
        return header + u"".join(
            formatEntriesAndCommentsForDictionary(entries, u"sigla"))

예제 #2

0

파일 보기

    def content(self):
        """Return the dictionary text for the acronyms listed in doubtsPdfUrl.

        The PDF is fetched through uvigoContentCache and scanned in stages:

        0. skip lines until the section title is found;
        1. parse lines until a line equal to u"49";
        2. skip lines until a line equal to u"5";
        3. parse lines until a line equal to u"55", then stop.

        Parsed lines have the form "comment: entry". When nothing follows
        the colon, the entry is taken from the next line.

        Returns:
            A three-line header followed by everything yielded by
            formatEntriesAndCommentsForDictionary(entries, u"sigla").
        """
        filePath = uvigoContentCache.downloadFileIfNeededAndGetLocalPath(
            doubtsPdfUrl)
        pdfParser = PdfParser(filePath)

        entries = {}
        parsingStage = 0
        commentCache = None

        for line in pdfParser.lines():

            if parsingStage == 0:
                if line == u"Relación de siglas e acrónimos máis frecuentes":
                    parsingStage += 1
                else:
                    continue

            elif parsingStage == 1:
                if line == u"49":
                    parsingStage += 1
                    continue

            elif parsingStage == 2:
                if line == u"5":
                    parsingStage += 1
                else:
                    continue

            elif parsingStage == 3:
                if line == u"55":
                    break

            if commentCache:
                # The previous line ended with a colon; this line is its
                # entry.
                entries[line.strip()] = commentCache
                commentCache = None
            elif u":" in line:
                # Split on the last colon only, so that a comment that itself
                # contains a colon no longer raises ValueError on unpacking.
                comment, entry = line.rsplit(u":", 1)
                entry = entry.strip()
                if entry:
                    entries[entry] = comment.strip()
                else:
                    commentCache = comment.strip()

        dictionary = u"# Relación de acrónimos e siglas máis frecuentes\n"
        dictionary += u"# {}\n".format(doubtsPdfUrl)
        dictionary += u"\n"
        dictionary += u"".join(
            formatEntriesAndCommentsForDictionary(entries, u"sigla"))
        return dictionary

예제 #3

0

파일 보기

파일: usc.py 프로젝트: daliboris/hunspell

    def content(self):
        """Return the dictionary text for the abbreviations listed in pdfUrl.

        The PDF is fetched through contentCache. Lines are skipped until one
        equal to u"abril" and reading stops at u"Manuel Bermúdez". In
        between, lines alternate between a comment and the line holding its
        entries (split by self.parseEntry). While a comment is pending, a
        line that does not start with a space is treated as a continuation
        of that comment.

        Returns:
            A three-line header followed by everything yielded by
            formatEntriesAndCommentsForDictionary(entries, u"abreviatura").
        """
        filePath = contentCache.downloadFileIfNeededAndGetLocalPath(pdfUrl)
        pdfParser = PdfParser(filePath)

        entries = {}
        parsingStage = 0
        lineIsContinuation = False
        comment = None

        for line in pdfParser.lines():

            if not line:
                # An empty line has no first character to inspect; indexing
                # it below would raise IndexError, and it carries no data.
                continue

            # The indentation must be checked before stripping the line.
            if comment and line[0] != u" " and parsingStage == 1:
                lineIsContinuation = True
            line = line.strip()

            if parsingStage == 0:
                if line == u"abril":
                    # The marker line itself falls through to the parsing
                    # code below.
                    parsingStage += 1
                else:
                    continue

            elif parsingStage == 1:
                if line == u"Manuel Bermúdez":
                    break

            if line.startswith(u"Abreviaturas, siglas, símbolos e léxico"):
                continue
            if line.isdigit():
                continue

            if lineIsContinuation:
                lineIsContinuation = False
                comment += u" " + line
                continue

            if comment:
                for subentry in self.parseEntry(line):
                    entries[subentry] = comment
                comment = None
            else:
                comment = line

        dictionary = u"# Relación de abreviaturas máis frecuentes\n"
        dictionary += u"# {}\n".format(pdfUrl)
        dictionary += u"\n"
        dictionary += u"".join(
            formatEntriesAndCommentsForDictionary(entries, u"abreviatura"))
        return dictionary

예제 #4

0

파일 보기

    def content(self):
        """Return the dictionary text for the abbreviations in "Tabela 24".

        The PDF at abbreviationsPdfUrl is fetched through contentCache.
        Everything up to and including the table caption is ignored and
        reading stops at a line equal to u"54". Inside the table, lines are
        consumed in pairs: the abbreviation (with u".." collapsed to u".")
        and then the text stored under it.

        Returns:
            A three-line header followed by everything yielded by
            formatEntriesAndCommentsForDictionary(entries, u"abreviatura").
        """
        localPath = contentCache.downloadFileIfNeededAndGetLocalPath(
            abbreviationsPdfUrl)
        parser = PdfParser(localPath)

        tableCaption = (u"Tabela 24: Abreviaturas e sua expansão "
                        u"em galego.")
        columnTitles = (u"Abreviatura", u"Conversão ortográfica")

        entries = {}
        insideTable = False
        abbreviation = None

        for rawLine in parser.lines():
            text = rawLine.strip()

            if not insideTable:
                # The caption line itself is consumed, never parsed.
                insideTable = text == tableCaption
                continue

            if text == u"54":
                break
            if text in columnTitles:
                continue

            if not abbreviation:
                abbreviation = text.replace(u"..", u".")
            elif abbreviation != u"s.a.":
                entries[abbreviation] = text
                abbreviation = None
            # Special case that has to be handled as best we can: u"s.a."
            # stays pending until the line u"especificar" closes it.
            elif text == u"especificar":
                entries[abbreviation] += u" " + text
                abbreviation = None
            else:
                entries[abbreviation] = text

        header = u"# Relación de abreviaturas máis frecuentes\n"
        header += u"# {}\n".format(abbreviationsPdfUrl)
        header += u"\n"
        return header + u"".join(
            formatEntriesAndCommentsForDictionary(entries, u"abreviatura"))

예제 #5

0

파일 보기

파일: usc.py 프로젝트: daliboris/hunspell

    def content(self):
        """Return the dictionary text for the acronyms listed in pdfUrl.

        The PDF is fetched through contentCache. Lines are skipped until one
        equal to u"Asociación Española de Normalización e Certificación" and
        reading stops at u"A sigla caracterízase por:". In between, lines
        alternate between a comment and the entry it belongs to. While a
        comment is pending, a line that does not start with a space is
        treated as a continuation of that comment.

        Returns:
            A three-line header followed by everything yielded by
            formatEntriesAndCommentsForDictionary(entries, u"sigla").
        """
        filePath = contentCache.downloadFileIfNeededAndGetLocalPath(pdfUrl)
        pdfParser = PdfParser(filePath)

        entries = {}
        parsingStage = 0
        lineIsContinuation = False
        comment = None

        for line in pdfParser.lines():

            if not line:
                # An empty line has no first character to inspect; indexing
                # it below would raise IndexError, and it carries no data.
                continue

            # The indentation must be checked before stripping the line.
            if comment and line[0] != u" " and parsingStage == 1:
                lineIsContinuation = True
            line = line.strip()

            if parsingStage == 0:
                if line == u"Asociación Española de Normalización e " \
                        u"Certificación":
                    # The marker line itself falls through to the parsing
                    # code below.
                    parsingStage += 1
                else:
                    continue

            elif parsingStage == 1:
                if line == u"A sigla caracterízase por:":
                    break

            if line.isdigit():
                continue

            if lineIsContinuation:
                lineIsContinuation = False
                comment += u" " + line
                continue

            if comment:
                entries[line] = comment
                comment = None
            else:
                comment = line

        dictionary = u"# Relación de siglas e acrónimos máis frecuentes\n"
        dictionary += u"# {}\n".format(pdfUrl)
        dictionary += u"\n"
        dictionary += u"".join(
            formatEntriesAndCommentsForDictionary(entries, u"sigla"))
        return dictionary

예제 #6

0

파일 보기

파일: ceg.py 프로젝트: daliboris/hunspell

    def content(self):
        """Return the dictionary text for the symbols listed in pdfUrl.

        The PDF is fetched through contentCache. Blank lines are ignored,
        everything up to and including u"SÍMBOLOS:" is skipped, and reading
        stops at u"CASOS ESPECIAIS:". Each remaining non-numeric line is cut
        at its first colon into a comment (left) and a comma-separated list
        of entries (right).

        Returns:
            A three-line header followed by everything yielded by
            formatEntriesAndCommentsForDictionary(entries, u"símbolo").
        """
        localPath = contentCache.downloadFileIfNeededAndGetLocalPath(pdfUrl)
        parser = PdfParser(localPath)

        entries = {}
        insideSection = False

        for rawLine in parser.lines():
            text = rawLine.strip()
            if not text:
                continue

            if not insideSection:
                # The section title itself is consumed, never parsed.
                insideSection = text == u"SÍMBOLOS:"
                continue

            if text == u"CASOS ESPECIAIS:":
                break
            if text.isdigit():
                continue

            head, _, tail = text.partition(u":")
            meaning = head.strip()
            symbols = tail.strip()
            if meaning in [u"FM"]:  # Reversed entries.
                meaning, symbols = symbols, meaning

            # Splitting a value without commas yields that value alone, so
            # one loop covers both the single and the multiple entry cases.
            for symbol in symbols.split(u","):
                entries[symbol.strip()] = meaning

        header = u"# Relación de símbolos máis frecuentes\n"
        header += u"# {}\n".format(pdfUrl)
        header += u"\n"
        return header + u"".join(
            formatEntriesAndCommentsForDictionary(entries, u"símbolo"))

예제 #7

0

파일 보기

파일: microsoft.py 프로젝트: daliboris/hunspell

    def content(self):
        """Return the dictionary text for the style guide's abbreviations.

        The PDF at styleGuidePdfUrl is fetched through contentCache. Lines
        are skipped until u"List of common abbreviations:" and reading stops
        at u"Addtional guidelines:". A line starting with u"(+)" holds the
        entries (split by self.parseSubEntries) whose comment is the line
        right before it.

        Returns:
            A three-line header followed by everything yielded by
            formatEntriesAndCommentsForDictionary(entries, u"abreviatura").
        """
        localPath = contentCache.downloadFileIfNeededAndGetLocalPath(
            styleGuidePdfUrl)
        parser = PdfParser(localPath)

        entries = {}
        insideList = False
        lastLine = u""

        for rawLine in parser.lines():
            text = rawLine.strip()

            if not insideList:
                if text != u"List of common abbreviations:":
                    continue
                # The title line falls through to the code below.
                insideList = True
            elif text == u"Addtional guidelines:":
                break

            # Crude re-decoding: different parts of the PDF appear to use
            # different encodings. The order of the replacements matters,
            # because the second one produces the character the first one
            # consumes.
            text = text.replace(u"ñ", u"ó")
            text = text.replace(u"ð", u"ñ")
            text = text.replace(u"ö", u"ú")

            if text.startswith(u"(+)"):
                meaning = lastLine.strip()
                for subentry in self.parseSubEntries(text[3:].strip()):
                    entries[subentry.strip()] = meaning

            lastLine = text

        header = u"# Relación de abreviaturas máis frecuentes\n"
        header += u"# {}\n".format(styleGuidePdfUrl)
        header += u"\n"
        return header + u"".join(
            formatEntriesAndCommentsForDictionary(entries, u"abreviatura"))

예제 #8

0

파일 보기

파일: ceg.py 프로젝트: daliboris/hunspell

    def content(self):
        """Return the dictionary text for the acronyms listed in pdfUrl.

        The PDF is fetched through contentCache. Blank lines are ignored,
        everything up to and including u"SIGLAS e ACRÓNIMOS" is skipped, and
        reading stops at u"ABREVIATURAS:". A line whose text before the
        first colon is fully upper-case starts a new entry (left of the
        colon) with its comment (right of the colon). Any other line is
        appended to the comment of the current entry.

        Returns:
            A three-line header followed by everything yielded by
            formatEntriesAndCommentsForDictionary(entries, u"sigla").
        """
        filePath = contentCache.downloadFileIfNeededAndGetLocalPath(pdfUrl)
        pdfParser = PdfParser(filePath)

        entries = {}
        parsingStage = 0
        entry = None

        comment = u""
        for line in pdfParser.lines():

            line = line.strip()
            if not line:
                continue

            if parsingStage == 0:
                if line == u"SIGLAS e ACRÓNIMOS":
                    parsingStage += 1
                continue

            elif parsingStage == 1:
                if line == u"ABREVIATURAS:":
                    break

            if line.isdigit():
                continue

            parts = line.split(u":")
            if parts[0].upper() != parts[0]:
                if entry is None:
                    # Continuation text before any entry was seen: there is
                    # nothing to attach it to, and storing it would create a
                    # bogus None key in the result.
                    continue
                comment += u" " + parts[0].strip()
                entries[entry] = comment
            else:
                entry = parts[0].strip()
                comment = u":".join(parts[1:]).strip()
                entries[entry] = comment

        dictionary = u"# Relación de siglas e acrónimos máis frecuentes\n"
        dictionary += u"# {}\n".format(pdfUrl)
        dictionary += u"\n"
        dictionary += u"".join(
            formatEntriesAndCommentsForDictionary(entries, u"sigla"))
        return dictionary

예제 #9

0

파일 보기

    def content(self):
        """Return the dictionary text for the administrative abbreviations.

        The PDF at languageUsageCriteria2012PdfUrl is fetched through
        contentCache and scanned in stages:

        0. skip everything up to and including the "ANEXO I" title;
        1. parse lines until one starting with u"5";
        2. skip lines until the "ANEXO I" title appears again;
        3. parse lines until the "ANEXO II" title, then stop.

        Parsed lines are consumed in pairs: first the comment, then the
        entry it is stored under.

        Returns:
            A three-line header followed by everything yielded by
            formatEntriesAndCommentsForDictionary(entries, u"abreviatura").
        """
        import string

        filePath = contentCache.downloadFileIfNeededAndGetLocalPath(
            languageUsageCriteria2012PdfUrl)
        pdfParser = PdfParser(filePath)

        annexTitle = (u"ANEXO I. ABREVIATURAS MÁIS EMPREGADAS NA "
                      u"LINGUAXE ADMINISTRATIVA")
        nextAnnexTitle = (u"ANEXO II. RELACIÓN DOS TOPÓNIMOS MÁIS "
                          u"HABITUAIS DE FÓRA DO ESTADO ESPAÑOL")
        skippedHeaders = [u"CRITERIOS PARA O USO DA LINGUA", annexTitle]

        entries = {}
        parsingStage = 0
        comment = None

        for line in pdfParser.lines():

            line = line.strip()

            if parsingStage == 0:
                if line == annexTitle:
                    parsingStage += 1
                continue

            elif parsingStage == 1:
                if line.startswith(u"5"):
                    parsingStage += 1
                    continue

            elif parsingStage == 2:
                if line == annexTitle:
                    parsingStage += 1
                else:
                    continue

            elif parsingStage == 3:
                if line == nextAnnexTitle:
                    break

            # Substring test: skips empty lines, single capital letters and
            # runs of consecutive capitals. ascii_uppercase replaces the
            # Python-2-only, locale-dependent string.uppercase.
            if line in string.ascii_uppercase:
                continue
            if line in skippedHeaders:
                continue
            if line.isdigit():
                continue

            if not comment:
                comment = line
            else:
                entries[line] = comment
                comment = None

        dictionary = u"# Relación de abreviaturas máis frecuentes na " \
            u"linguaxe administrativa\n"
        dictionary += u"# {}\n".format(languageUsageCriteria2012PdfUrl)
        dictionary += u"\n"
        dictionary += u"".join(
            formatEntriesAndCommentsForDictionary(entries, u"abreviatura"))
        return dictionary

예제 #10

0

파일 보기

    def content(self):
        """Return the dictionary text for the abbreviations of treatment.

        The PDF at languageUsageCriteria2007PdfUrl is fetched through
        contentCache. Blank lines are ignored, everything up to and
        including u"ABREVIATURAS DE TRATAMENTO" is skipped, and reading stops
        at u"CRITERIOS PARA O USO DA LINGUA". Lines are normally consumed in
        pairs: the comment and then the entry it is stored under.

        When a comment ends with u"-", the following three lines are read as
        the first part of the entry, the rest of the comment (replacing the
        trailing hyphen) and the second part of the entry.

        Returns:
            A three-line header followed by everything yielded by
            formatEntriesAndCommentsForDictionary(entries, u"abreviatura").
        """

        filePath = contentCache.downloadFileIfNeededAndGetLocalPath(
            languageUsageCriteria2007PdfUrl)
        pdfParser = PdfParser(filePath)

        entries = {}
        parsingStage = 0
        comment = None
        entry = None
        # 0 outside a hyphenated pair; 1 to 3 count the lines that are still
        # expected after a comment that ended with a hyphen.
        twoLines = 0

        for line in pdfParser.lines():

            line = line.strip()
            if not line:
                continue

            if parsingStage == 0:
                if line == u"ABREVIATURAS DE TRATAMENTO":
                    parsingStage += 1
                # The title line itself is consumed, never parsed.
                continue

            elif parsingStage == 1:
                if line == u"CRITERIOS PARA O USO DA LINGUA":
                    break

            if line.isdigit():
                continue

            if twoLines != 0:
                if twoLines == 1:
                    # First part of the entry.
                    entry = line
                    twoLines += 1
                    continue
                elif twoLines == 2:
                    # Rest of the comment; the trailing hyphen is dropped.
                    comment = comment[:-1] + line
                    twoLines += 1
                    continue
                elif twoLines == 3:
                    # Second part of the entry. No "continue" here: control
                    # falls through so the pair is stored below.
                    entry += u" " + line
                    twoLines = 0

            if not comment:
                comment = line
                if comment.endswith(u"-"):
                    twoLines = 1
                continue
            else:
                # In the regular case the current line is the entry; after a
                # hyphenated pair the entry has already been assembled.
                if not entry:
                    entry = line
                entries[entry] = comment
                comment = None
                entry = None

        dictionary = u"# Relación de abreviaturas de tratamento\n"
        dictionary += u"# {}\n".format(languageUsageCriteria2007PdfUrl)
        dictionary += u"\n"
        for entry in formatEntriesAndCommentsForDictionary(
                entries, u"abreviatura"):
            dictionary += entry
        return dictionary

예제 #11

0

파일 보기

파일: ceg.py 프로젝트: daliboris/hunspell

    def content(self):
        """Return the dictionary text for the abbreviations listed in pdfUrl.

        The PDF is fetched through contentCache. Blank and numeric lines are
        ignored, everything up to and including u"ABREVIATURAS:" is skipped,
        and reading stops at u"SÍMBOLOS:". Each remaining line is cut at its
        first colon into a comment (left) and an entry expression (right).
        The entry expression is expanded into sub-entries as follows:

        * forms listed inside "(pl. ...)" are added;
        * all parenthesised text is then removed and the remainder is split
          by self.parseEntry;
        * a sub-entry ending in u"o/a." is replaced by its u"a." and u"o."
          variants.

        Returns:
            A three-line header followed by everything yielded by
            formatEntriesAndCommentsForDictionary(entries, u"abreviatura").
        """
        import re

        filePath = contentCache.downloadFileIfNeededAndGetLocalPath(pdfUrl)
        pdfParser = PdfParser(filePath)

        entries = {}
        parsingStage = 0

        # Backslashes are doubled so the literals contain no invalid escape
        # sequences; the resulting patterns are unchanged.
        plural = re.compile(u"\\(pl. ([^)]+)\\)")
        parenthesis = re.compile(u" *\\([^)]*\\)")

        for line in pdfParser.lines():

            line = line.strip()
            if not line:
                continue
            if line.isdigit():
                continue

            if parsingStage == 0:
                if line == u"ABREVIATURAS:":
                    parsingStage += 1
                continue

            elif parsingStage == 1:
                if line == u"SÍMBOLOS:":
                    break

            parts = line.split(u":")
            comment = parts[0].strip()
            entry = u":".join(parts[1:]).strip()

            subentries = set()

            for match in plural.finditer(entry):
                for subentry in self.parseEntry(match.group(1)):
                    subentries.add(subentry)

            # Remove content between parentheses.
            entry = parenthesis.sub(u"", entry)
            entry = entry.strip()

            for subentry in self.parseEntry(entry):
                if subentry.endswith(u"o/a."):
                    subentries.add(subentry[:-4] + u"a.")
                    subentries.add(subentry[:-4] + u"o.")
                else:
                    subentries.add(subentry)

            for subentry in subentries:
                entries[subentry] = comment

        dictionary = u"# Relación de abreviaturas máis frecuentes\n"
        dictionary += u"# {}\n".format(pdfUrl)
        dictionary += u"\n"
        dictionary += u"".join(
            formatEntriesAndCommentsForDictionary(entries, u"abreviatura"))
        return dictionary

예제 #12

0

파일 보기

    def content(self):
        """Return the dictionary text for the style guide's abbreviations.

        The PDF at styleGuidePdfUrl is fetched through contentCache.
        Everything up to and including u"7.1.2 Listaxe de abreviaturas" is
        skipped and reading stops at u"7.2 A sigla". A line ending with a
        space is joined (with a single space) to the line after it. Each
        resulting line is cut at its last colon into a comment (left) and an
        entry expression (right). The entry expression is expanded into
        sub-entries as follows:

        * forms listed inside "(plural ...)" and "(fem. ...)" are added;
        * all parenthesised text is then removed and the remainder is split
          by self.parseEntry.

        Returns:
            A three-line header followed by everything yielded by
            formatEntriesAndCommentsForDictionary(entries, u"abreviatura").
        """
        import re
        import string

        filePath = contentCache.downloadFileIfNeededAndGetLocalPath(
            styleGuidePdfUrl)
        pdfParser = PdfParser(filePath)

        entries = {}
        parsingStage = 0
        continuesInTheNextLine = False
        previousLine = None

        # Backslashes are doubled so the literals contain no invalid escape
        # sequences; the resulting patterns are unchanged.
        plural = re.compile(u"\\(plural ([^)]+)\\)")
        fem = re.compile(u"\\(fem. ([^)]+)\\)")
        parenthesis = re.compile(u" *\\([^)]*\\)")

        for line in pdfParser.lines():

            # The trailing space must be checked before stripping the line.
            if line[-1:] == u" " and parsingStage == 1:
                continuesInTheNextLine = True
            line = line.strip()

            if parsingStage == 0:
                if line == u"7.1.2 Listaxe de abreviaturas":
                    parsingStage += 1
                continue

            elif parsingStage == 1:
                if line == u"7.2 A sigla":
                    break

            # Substring test: skips empty lines, single capital letters and
            # runs of consecutive capitals. ascii_uppercase replaces the
            # Python-2-only, locale-dependent string.uppercase.
            if line in string.ascii_uppercase:
                continue
            if line.startswith(u"Ortografía e estilo"):
                continue
            if line.isdigit():
                continue

            if continuesInTheNextLine:
                continuesInTheNextLine = False
                previousLine = line
                continue

            if previousLine:
                line = previousLine + u" " + line
                previousLine = None

            # Everything before the last colon is the comment. Without any
            # colon the comment is empty and the whole line is the entry.
            comment, _, entry = line.rpartition(u":")

            subentries = set()

            for match in plural.finditer(entry):
                for subentry in self.parseEntry(match.group(1)):
                    subentries.add(subentry)

            for match in fem.finditer(entry):
                for subentry in self.parseEntry(match.group(1)):
                    subentries.add(subentry)

            # Remove content between parentheses.
            entry = parenthesis.sub(u"", entry)
            entry = entry.strip()

            for subentry in self.parseEntry(entry):
                subentries.add(subentry)

            for subentry in subentries:
                entries[subentry] = comment

        dictionary = u"# Relación de abreviaturas máis frecuentes na " \
            u"linguaxe administrativa\n"
        dictionary += u"# {}\n".format(styleGuidePdfUrl)
        dictionary += u"\n"
        dictionary += u"".join(
            formatEntriesAndCommentsForDictionary(entries, u"abreviatura"))
        return dictionary

예제 #13

0

파일 보기

    def content(self):
        """Return the dictionary text for the style guide's common symbols.

        The PDF at styleGuidePdfUrl is fetched through contentCache.
        Everything up to and including u"7.3.2 Listaxe de símbolos de uso
        común" is skipped and reading stops at u"7.4 O acrónimo". A line
        ending with a space or a hyphen is glued, as is, to the line after
        it. Each resulting line is cut at its first colon (or, when it has
        no colon, at its first space) into an entry expression (left) and a
        comment (right). The entry expression is split by self.parseEntry.

        Returns:
            A three-line header followed by everything yielded by
            formatEntriesAndCommentsForDictionary(entries, u"símbolo").
        """
        localPath = contentCache.downloadFileIfNeededAndGetLocalPath(
            styleGuidePdfUrl)
        parser = PdfParser(localPath)

        entries = {}
        insideList = False
        joinWithNext = False
        carried = None

        for text in parser.lines():

            if not insideList:
                # The title line itself is consumed, never parsed.
                insideList = text == u"7.3.2 Listaxe de símbolos de uso común"
                continue

            # Lines are not stripped here, so the last character tells
            # whether the text goes on in the next line.
            if text[-1:] in (u" ", u"-"):
                joinWithNext = True

            if text == u"7.4 O acrónimo":
                break
            if text.startswith(u"Ortografía e estilo") or text.isdigit():
                continue

            if carried:
                text = carried + text
                carried = None

            if joinWithNext:
                joinWithNext = False
                carried = text
                continue

            separator = u":" if u":" in text else u" "
            symbols, _, meaning = text.partition(separator)

            for subentry in self.parseEntry(symbols.strip()):
                entries[subentry] = meaning

        header = u"# Relación de símbolos máis frecuentes\n"
        header += u"# {}\n".format(styleGuidePdfUrl)
        header += u"\n"
        return header + u"".join(
            formatEntriesAndCommentsForDictionary(entries, u"símbolo"))