Exemplo n.º 1
0
def getFileLanguage(workspace, filegroup, stopwordspath, index=0):
    """Return the language detected on most pages of a file group.

    Every file directly inside ``workspace/filegroup`` whose name contains
    ``xml`` is parsed. For each such page the ``Unicode`` text of the
    ``TextEquiv`` at position ``index`` of every text region is concatenated
    (each followed by a space) and handed to ``detect_language`` together
    with ``stopwordspath``. Each page counts as one vote for the returned
    language.

    Args:
        workspace: Path of the workspace directory.
        filegroup: Name of the file group sub-directory inside the workspace.
        stopwordspath: Passed through unchanged to ``detect_language``.
        index: Position of the ``TextEquiv`` to read from each text region.

    Returns:
        The language that received the most votes. On a tie the language
        that was encountered first wins.

    Raises:
        ValueError: If the file group directory does not exist or contains
            no file whose name contains ``xml``.
    """
    fgrppath = workspace + '/' + filegroup

    # os.walk() yields nothing for a missing or unreadable directory; fall
    # back to "no files" instead of letting StopIteration escape to callers.
    _, _, files = next(os.walk(fgrppath), (fgrppath, [], []))

    inputfiles = [name for name in files if 'xml' in name]
    if not inputfiles:
        # Without this guard max() below fails with an unhelpful
        # "max() arg is an empty sequence".
        raise ValueError(
            'no XML files found in file group directory: ' + fgrppath)

    votes = defaultdict(int)
    for input_file in inputfiles:
        pcgts = parse(fgrppath + '/' + input_file, True)
        regions = pcgts.get_Page().get_TextRegion()

        pagetext = ''.join(
            region.get_TextEquiv()[index].Unicode + ' ' for region in regions
        )

        votes[detect_language(pagetext, stopwordspath)] += 1

    return max(votes, key=votes.get)
Exemplo n.º 2
0
    def process(self):
        """
        Copy one TextEquiv alternative to a new TextEquiv on every level.

        For each input file the PAGE document is parsed. For every word the
        text of the TextEquiv at position ``self.parameter['mainIndex']`` is
        added to the word as a further TextEquiv. The word texts are then
        aggregated upwards: each line, region and the page receive a new
        TextEquiv holding the concatenated text of their children. The
        modified document is serialized and added to the workspace under
        ``self.output_file_grp``.

        Words are joined with a leading ``' '`` and lines/regions with a
        leading ``'\\n'``, so every aggregated text starts with its separator.
        """

        mainIndex = self.parameter['mainIndex']

        for (n, input_file) in enumerate(self.input_files):

            pcgts = parse(input_file.url, True)
            page = pcgts.get_Page()

            pagecontent = ''
            for region in page.get_TextRegion():
                regioncontent = ''

                for line in region.get_TextLine():
                    linecontent = ''

                    for word in line.get_Word():
                        wordunicode = word.get_TextEquiv()[mainIndex].Unicode
                        word.add_TextEquiv(TextEquivType(Unicode=wordunicode))
                        linecontent += ' ' + wordunicode

                    # The line gets its OWN text. Previously the region text
                    # accumulated so far (i.e. the preceding lines) was stored
                    # here, which left the first line of every region empty.
                    line.add_TextEquiv(TextEquivType(Unicode=linecontent))
                    regioncontent += '\n' + linecontent

                region.add_TextEquiv(TextEquivType(Unicode=regioncontent))
                pagecontent += '\n' + regioncontent

            page.add_TextEquiv(TextEquivType(Unicode=pagecontent))

            ID = concat_padded(self.output_file_grp, n)
            self.log.info('creating file id: %s, name: %s, file_grp: %s', ID,
                          input_file.basename, self.output_file_grp)
            # Use the input file's basename for the new file
            # this way the files retain the same basenames.
            out = self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                basename=self.output_file_grp + '-' + input_file.basename,
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts),
            )
            self.log.info('created file %s', out)
Exemplo n.º 3
0
def negative2zero(inputfile, outputfile):
    """Run every region and text-line polygon of a PAGE file through update_points.

    The document at ``inputfile`` is parsed, and every getter on the page
    whose name contains both ``get_`` and ``Region`` is called to enumerate
    regions. The ``points`` of each region's Coords are replaced with the
    result of ``update_points``; for text regions the same is done for each
    contained text line first. The serialized result is written to
    ``outputfile``.

    NOTE(review): ``update_points`` is not visible here; going by the
    function name and the progress message it presumably clamps negative
    coordinates to zero -- confirm.
    NOTE(review): the output file is opened with the platform default
    encoding -- confirm this matches the encoding declared by ``to_xml``.
    """
    print("Setting negative coords to zero..")
    document = parse(inputfile, silence=True)
    page = document.get_Page()

    for getter_name in dir(page):
        # Only the region getters of the page are of interest.
        if "get_" not in getter_name or "Region" not in getter_name:
            continue

        has_text_lines = getter_name == "get_TextRegion"
        for region in getattr(page, getter_name)():
            if has_text_lines:
                for text_line in region.get_TextLine():
                    line_coords = text_line.get_Coords()
                    fixed_line_points = update_points(line_coords.get_points())
                    line_coords.set_points(fixed_line_points)

            region_coords = region.get_Coords()
            fixed_region_points = update_points(region_coords.get_points())
            region_coords.set_points(fixed_region_points)

    # Serialize before opening so a serialization error leaves any existing
    # output file untouched.
    serialized = to_xml(document)
    with open(outputfile, "w") as fout:
        fout.write(serialized)
Exemplo n.º 4
0
def get_imagefilename(inputfile):
    """Print the image filename stored on the Page element of ``inputfile``.

    The value is written to standard output; nothing is returned.
    """
    print(parse(inputfile, silence=True).get_Page().get_imageFilename())
Exemplo n.º 5
0
    def process(self):
        """
        Accumulate per-alternative distances to the ground-truth text lines.

        For each input file the PAGE document is parsed and
        ``self.get_gt_index`` is asked for the position of the ground-truth
        TextEquiv; files for which it returns ``-1`` are skipped. For every
        text line that has a TextEquiv at that position, the length of the
        ground-truth text is added to the ``'gt'`` total, and for every
        TextEquiv position ``i`` before it, ``distance(gt, text_i)`` is added
        to the total stored under key ``i``.

        The totals over all input files are printed as one JSON object
        (``json.dumps`` renders the integer keys as strings).
        """

        d = dict()

        for input_file in self.input_files:

            pcgts = parse(input_file.url, True)
            regions = pcgts.get_Page().get_TextRegion()

            # find index of GT
            gti = self.get_gt_index(regions)
            if gti == -1:
                # no ground truth found: give up for this file
                continue

            # Initialise the ground-truth total once. The previous check
            # (`gti not in d`) tested the wrong key and therefore reset the
            # total to zero for (almost) every file.
            d.setdefault('gt', 0)

            for region in regions:
                for line in region.get_TextLine():
                    textequivs = line.get_TextEquiv()
                    if len(textequivs) <= gti:
                        # this line carries no ground truth
                        continue

                    gtline = textequivs[gti].Unicode
                    d['gt'] += len(gtline)

                    # every alternative listed before the ground truth is
                    # compared against it
                    for i in range(gti):
                        d[i] = d.get(i, 0) + distance(gtline,
                                                      textequivs[i].Unicode)

        print(json.dumps(d))
Exemplo n.º 6
0
    def process(self):
        """
        Print the predominant language and font family of the input pages.

        All text lines and words of every input file are visited. The
        ``primaryLanguage`` and ``fontFamily`` of each line, the ``language``
        of each word and the ``fontFamily`` of each word's TextStyle are
        tallied; values that are ``None`` are not counted.

        The most frequent line-level value is printed; if no line carried a
        value the most frequent word-level value is used, and if there is
        none either the defaults ``'German'`` (language) and ``'Antiqua'``
        (font) are printed. The language is printed first, then the font.
        """

        linelang = defaultdict(int)
        wordlang = defaultdict(int)

        linefont = defaultdict(int)
        wordfont = defaultdict(int)

        def count(tally, value):
            # None is hashable, so it used to be counted like a real value and
            # could win the vote; treat it as "annotation absent" instead.
            if value is None:
                return
            try:
                tally[value] += 1
            except TypeError:
                # unhashable value: ignored, as the original code did
                pass

        def predominant(primary, fallback, default):
            # max() on an empty mapping raises ValueError (not TypeError), so
            # emptiness is tested explicitly to make the fallbacks reachable.
            for tally in (primary, fallback):
                if tally:
                    return max(tally, key=tally.get)
            return default

        for input_file in self.input_files:

            pcgts = parse(input_file.url, True)

            for region in pcgts.get_Page().get_TextRegion():
                for line in region.get_TextLine():
                    count(linelang, line.primaryLanguage)
                    count(linefont, line.fontFamily)

                    for word in line.get_Word():
                        count(wordlang, word.language)

                        # a word without TextStyle has no font information
                        style = word.get_TextStyle()
                        if style is not None:
                            count(wordfont, style.fontFamily)

        # predominant language
        lang = predominant(linelang, wordlang, 'German')

        # predominant font
        font = predominant(linefont, wordfont, 'Antiqua')

        print(lang)
        print(font)