示例#1
0
# page-specific option list
pageoptlist = "granularity=page"

# separator to emit after each chunk of text. This depends on the
# application's needs; for granularity=word a space character may be useful.
#
separator = "\n"

if len(argv) != 3:
    raise Exception("usage: extractor <infilename> <outfilename>\n")

try:
    try:

        tet = TET()

        if (version_info[0] < 3):
            fp = open(argv[2], 'w')
        else:
            fp = open(argv[2], 'w', 2, 'utf-8')

        tet.set_option(globaloptlist)

        doc = tet.open_document(argv[1], docoptlist)

        if (doc == -1):
            raise Exception("Error " + tet.get_errnum() + "in " \
                + tet.get_apiname() + "(): " + tet.get_errmsg())

        # get number of pages in the document */
示例#2
0
def _append_json_records(path, records):
    """Append ``records`` to the JSON array stored at ``path`` and rewrite it.

    A zero-byte file is treated as an empty array. The file must already
    exist; ``os.stat`` raises otherwise.
    """
    existing = []
    if os.stat(path).st_size != 0:
        with open(path, "r") as infile:
            existing = json.load(infile)
    existing.extend(records)
    with open(path, "w") as outfile:
        json.dump(existing, outfile, ensure_ascii=False, indent=4)


def main(argv1):
    """Extract the text of a PDF with TET and register its street and city.

    The input file's base name (without extension) is expected to look like
    ``<city>_<street>``; every "1" and "2" character is stripped from it
    before it is split at the first underscore.

    The filtered page text is written to
    ``data/texted_file/<city_id>_<street_id>.txt``. New records are appended
    to the JSON files named by the module-level ``shoppingstreet_path``,
    ``city_path`` and ``validated_streets_path`` (all defined outside this
    function). A summary is printed to stdout.

    Extraction errors are printed, not raised; the JSON bookkeeping still
    runs afterwards.

    Args:
        argv1: Path of the PDF to process (converted with ``str``).

    Returns:
        The file name ``<city_id>_<street_id>.txt`` (no directory part).

    Raises:
        IndexError: If the base name contains no underscore.
    """
    input_file_path = str(argv1)

    # Derive "<city>_<street>" from the file name itself, so the result does
    # not depend on how many directories precede it.
    name = os.path.splitext(os.path.basename(input_file_path))[0]
    name = name.replace("1", "").replace("2", "")

    name_parts = name.split("_", 1)
    city_name = name_parts[0].lower()
    street_name = name_parts[1].lower()
    street_id = uuid.uuid4()
    city_id = uuid.uuid4()

    is_new_city = True
    textedfile = ""

    # Reuse the id of an already registered city. The match is a substring
    # test and the last matching entry wins.
    if os.stat(city_path).st_size != 0:
        with open(city_path, "r") as city_file:
            cities = json.load(city_file)

        for known_city in cities:
            if known_city["name"].lower() in city_name:
                is_new_city = False
                city_id = known_city["city_id"]

    # texted-pdf output file
    output_file = "data/texted_file/" + str(city_id) + "_" + str(street_id) + ".txt"

    # global option list
    globaloptlist = "searchpath={{../data} {../../../resource/cmap}}"

    # document-specific option list
    docoptlist = ""

    # page-specific option list
    pageoptlist = "granularity=page"

    # separator written between the text of consecutive pages
    separator = "\n"

    # characters dropped from / replaced in the extracted text
    replacements = (
        ("(", ""), (")", ""), ("*", ""), ("!", ""), ("'", ""),
        ("/", " "), (".", ""), ("&", " "),
    )

    tet = None
    fp = None
    try:
        try:
            tet = TET()

            if version_info[0] < 3:
                fp = open(output_file, 'w')
            else:
                fp = open(output_file, 'w', 2, 'utf-8')

            tet.set_option(globaloptlist)

            doc = tet.open_document(input_file_path, docoptlist)

            if doc == -1:
                raise Exception("Error %d in %s(): %s" % (
                    tet.get_errnum(), tet.get_apiname(), tet.get_errmsg()))

            # get number of pages in the document
            n_pages = tet.pcos_get_number(doc, "length:pages")

            pages_written = 0
            for pageno in range(1, int(n_pages) + 1):
                page = tet.open_page(doc, pageno, pageoptlist)

                if page == -1:
                    print("Error %d in %s(): %s" % (
                        tet.get_errnum(), tet.get_apiname(), tet.get_errmsg()))
                    continue  # try next page

                # get_text() takes the page handle and returns None when the
                # page has no (more) text, so check before filtering.
                text = tet.get_text(page)
                if text is not None:
                    for old, new in replacements:
                        text = text.replace(old, new)

                    # drop repeated lines, keeping first occurrences in order
                    unique_lines = []
                    lines_seen = set()
                    for line in text.split("\n"):
                        if line not in lines_seen:
                            unique_lines.append(line)
                            lines_seen.add(line)

                    # NOTE: only the most recent page's lines are kept for
                    # the printed summary below.
                    textedfile = unique_lines

                    # keep the last line of one page apart from the first
                    # line of the next
                    if pages_written:
                        fp.write(separator)
                    fp.write("\n".join(unique_lines))
                    pages_written += 1

                if tet.get_errnum() != 0:
                    print("\nError %d in %s() on page %d: %s\n" % (
                        tet.get_errnum(), tet.get_apiname(), pageno,
                        tet.get_errmsg()))

                tet.close_page(page)

            tet.close_document(doc)

        except TETException:
            if tet is not None:
                print("TET exception occurred:\n[%d] %s: %s" %
                      ((tet.get_errnum()), tet.get_apiname(), tet.get_errmsg()))
                print_tb(exc_info()[2])
            else:
                # TET() itself failed; there is no object to query.
                print_exc()

        except Exception:
            print("Exception occurred: %s" % (exc_info()[0]))
            print_exc()

    finally:
        # Release both resources even when extraction failed part-way.
        if fp is not None:
            fp.close()
        if tet is not None:
            tet.delete()

    # shoppingstreet and city JSON objects
    shoppingstreet = [{"street_id": str(street_id), "name": str(street_name)}]
    city = [{"city_id": str(city_id), "name": str(city_name)}]

    # Tracks which streets were already validated (new entries start False).
    validated_street = [{"street_id": str(street_id), "Address_Validator": False}]

    ################ WRITING TO JSON FILES ###########################

    _append_json_records(shoppingstreet_path, shoppingstreet)
    # An already known city is not added again; the file is still rewritten.
    _append_json_records(city_path, city if is_new_city else [])
    _append_json_records(validated_streets_path, validated_street)

    print("\n ################################# DATA EXTRACTOR ######################################################")
    print("\n ----------------------------------Shopping Street------------------------------------------------------")
    print(json.dumps(shoppingstreet, ensure_ascii=False, indent=4))
    print("\n  ------------------------------------- City ------------------------------------------------------------")
    print(json.dumps(city, ensure_ascii=False, indent=4))
    print("\n ---------------------------------Text Extracted -------------------------------------------------------")
    print(json.dumps(textedfile, ensure_ascii=False, indent=4))
    print(
        "\n ---------------------------------Validated_street -------------------------------------------------------")
    print(json.dumps(validated_street, ensure_ascii=False, indent=4))

    return str(city_id) + "_" + str(street_id) + ".txt"
示例#3
0
文件: tetml.py 项目: jarusified/eve
basedocoptlist = ""

# page-specific option list */
# Remove the tetml= option if you don't need font and geometry information
pageoptlist = "granularity=word tetml={glyphdetails={all}}"

# set this to 1 to generate TETML output in memory */
inmemory = 0

if len(sys.argv) != 3:
    raise Exception("usage: tetml <pdffilename> <xmlfilename>\n")

try:
    try:
        pageno = 0
        tet = TET()

        tet.set_option(globaloptlist)

        if inmemory:
            docoptlist = "tetml={} %s" % basedocoptlist
        else:
            docoptlist = "tetml={filename={%s}} %s" % (sys.argv[2], basedocoptlist)

        doc = tet.open_document(sys.argv[1], docoptlist)

        if doc == -1:
            raise Exception("Error %d in %s(): %s" % (tet.get_errnum(), tet.get_apiname(), tet.get_errmsg()))
        n_pages = int(tet.pcos_get_number(doc, "length:pages"))

        # loop over pages in the document */
示例#4
0
# Search text with at least this size (use 0 to catch all sizes)
fontsizetrigger = 10

# Catch text where the font name contains this string
# (use empty string to catch all font names)
fontnametrigger = "Bold"

if len(sys.argv) != 2:
    raise Exception("usage: fontfilter <infilename>\n")

try:
    try:
        pageno = 0

        tet = TET()

        tet.set_option(globaloptlist)

        doc = tet.open_document(sys.argv[1], docoptlist)

        if (doc == -1):
            raise Exception("Error %d in %s(): %s\n" % \
                (tet.get_errnum(), tet.get_apiname(), \
                tet.get_errmsg()))

        # get number of pages in the document */
        n_pages = int(tet.pcos_get_number(doc, "length:pages"))

        # loop over pages in the document */
        for pageno in range(1, n_pages+1):
示例#5
0
# page-specific option list */
pageoptlist = "granularity=page"

# here you can insert basic image extract options (more below)
baseimageoptlist = ""


if len(argv) != 2:
    raise Exception("usage: image_extractor <infilename>\n")

outfilebase = argv[1]
try:
    try:

        tet = TET()

        tet.set_option(globaloptlist)


        doc = tet.open_document(argv[1], docoptlist)

        if (doc == -1):
            raise Exception("Error " + tet.get_errnum() + "in " \
                + tet.get_apiname() + "(): " + tet.get_errmsg())

        # get number of pages in the document */
        n_pages = tet.pcos_get_number(doc, "length:pages")

        # loop over pages in the document */
        for pageno in range(1, int(n_pages)+1):
示例#6
0
文件: dumper.py 项目: jarusified/eve
from sys import *
from traceback import print_tb, print_exc
from PDFlib.TET import *

def yesno(arg):
    """Return "yes" if ``arg`` compares unequal to 0, otherwise "no"."""
    return "yes" if arg != 0 else "no"


if len(argv) != 2:
    raise Exception("usage: dumper <filename>")

try:
    try:
        tet = TET()

        searchpath = "searchpath={{../data}}"
        docoptlist = "requiredmode=minimum"
        globaloptlist = ""

        tet.set_option(searchpath)
        tet.set_option(globaloptlist)

        doc = tet.open_document(argv[1], docoptlist)
        if doc  == -1:
            raise Exception("ERROR: %s\n" % tet.get_errmsg())

        # --------- general information (always available) */
        pcosmode = int(tet.pcos_get_number(doc, "pcosmode"))
示例#7
0
    excName = cla.__name__
    try:
        excArgs = exc.__dict__["args"]
    except KeyError:
        excArgs = "<no args>"
    excTb = traceback.format_tb(trbk, maxTBlevel)
    return (excName, excArgs, excTb)


if len(sys.argv) != 3:
    raise Exception("usage: glyphinfo <infilename> <outfilename>\n")

try:
    try:

        tet = TET()

        if (sys.version_info[0] < 3):
            fp = open(sys.argv[2], 'w')
	    from ctypes import *
	    PyFile_SetEncoding = pythonapi.PyFile_SetEncoding
	    PyFile_SetEncoding.argtypes = (py_object, c_char_p)
	    PyFile_SetEncoding(fp, 'utf-8')
        else:
            fp = open(sys.argv[2], 'w', 2, 'utf-8')

        tet.set_option(globaloptlist)

        doc = tet.open_document(sys.argv[1], docoptlist)

        if (doc == -1):
示例#8
0
# document-specific option list */
docoptlist = ""

# page-specific option list */
pageoptlist = ""

# here you can insert basic image extract options (more below) */
baseimageoptlist = ""

if len(argv) != 2:
    raise Exception("usage: image_resources <filename>\n")

try:
    try:
        tet = TET()

        outfilebase = argv[1]

        tet.set_option(globaloptlist)

        doc = tet.open_document(argv[1], docoptlist)

        if (doc == -1):
            raise Exception("Error " + tet.get_errnum() + "in " \
                + tet.get_apiname() + "(): " + tet.get_errmsg())

        # Images will only be merged upon opening a page.
        # In order to enumerate all merged image resources
        # we open all pages before extracting the images.
        #/