示例#1
0
                  " data is shown below\n%s" % (i, exc, elem), file=errors)
        p.set_file(results)
        if first:
            p.print_csv_titles()
            first = False
        if p.is_valid():
            p.print_csv()
            counter += 1
            if p.has_warnings():
                print("%s WARNINGS %s" % (i, p.get_warnings()), file=warnings)
        else:
            print("%s ERROR %s " % (i, p.errors))
            print("[%s] %s %s\n" % (i, elem, p.errors), file=errors)

    if first:
        Patent.print_empty_titles(results)
        Patent.print_zip_info(results)

    print("Total printed: %s\nTotal patents: %s" % (counter, len(patents)-1))
    return (counter == (len(patents) - 1))

if __name__ == "__main__":
    # Start from a clean output tree: remove whatever an earlier run left
    # under PATH, then recreate the directories the run writes into.
    if os.path.exists(PATH):
        shutil.rmtree(PATH)
    for directory in (PATH, ERROR_PATH, WARN_PATH):
        os.makedirs(directory)
    # os.makedirs(RES_PATH)
    pfm = PatentsFileManager().get_patent_generator()
示例#2
0
def process_zip(data):
    """Split a patent-assignment XML dump into records and print them as CSV.

    The ``<patent-assignments>`` container is cut out of ``data`` and split
    into one string per ``<patent-assignment>`` record.  Only the text around
    the container is parsed with BeautifulSoup, to read the dump-level
    values (DTD version, production date, action key code, transaction
    date) that are passed to ``Patent.set_zip_info``.

    Every record is wrapped in a ``Patent`` that is given the results file
    (``res_<date>.csv`` under ``RES_PATH``).  Valid records are printed,
    their warnings go to ``warnings_<date>.txt`` under ``WARN_PATH``, and
    records that fail validation or cannot be constructed are logged to
    ``errors_<date>.txt`` under ``ERROR_PATH``.

    :param data: full text of the XML document.
    """
    from itertools import islice
    from parser import Patent
    print("Processing zip...")

    usp = "<patent-assignments"
    usp_c = "</patent-assignments>"
    # The opening tag is only searched for in the first 1% of the document.
    q = len(data) // 100
    index_1 = data.find(usp, 0, q)
    # index_2 points just past the closing tag.
    index_2 = data.find(usp_c) + len(usp_c)

    head = data[:index_1]
    # Keep the whole trailer; slicing to len(data) - 1 dropped its last
    # character.
    tail = data[index_2:]

    t1 = time()
    s = BeautifulSoup(head + tail, "lxml")
    t2 = time()
    print("Soup is ready...Time: %s" % (t2 - t1))

    # index_2 already covers the closing tag, so it is not added again here
    # (doing so pulled the start of the trailer into the last record).
    patents = data[index_1:index_2].split("<patent-assignment>")
    patents = ["<patent-assignment>" + chunk for chunk in patents]

    root = s("us-patent-assignments")[0]
    dtd = root["dtd-version"]
    date_produced = root["date-produced"]
    ak = s("action-key-code")[0].string
    date_tag = s("transaction-date")[0]
    d = date_tag.string
    if not d:
        # Fall back to the nested <date> element when the tag itself has no
        # single string.
        d = date_tag("date")[0].string

    results_path = os.path.join(RES_PATH, 'res_%s.csv' % d)
    errors_path = os.path.join(ERROR_PATH, 'errors_%s.txt' % d)
    warnings_path = os.path.join(WARN_PATH, 'warnings_%s.txt' % d)

    # The context manager closes all three files even if a record raises.
    with open(results_path, "w+") as results, \
            open(errors_path, "w+") as errors, \
            open(warnings_path, "w+") as warn_log:
        Patent.set_zip_info(dtd, date_produced, ak, d)
        first = True
        # NOTE(review): counter tallies valid records but is not reported by
        # this function.
        counter = 0

        # patents[0] is the text before the first record (the container's
        # opening tag), so the records start at index 1.
        for i, elem in enumerate(islice(patents, 1, None), 1):
            p = None
            exc = None
            retries = 0
            while p is None and retries < 10:
                try:
                    p = Patent(elem)
                except Exception as e:
                    # Copy to another name: Python 3 unbinds "e" when the
                    # handler ends.
                    exc = e
                    retries += 1
                    print("Exception during init... retrying (%s)" % retries)

            if p is None:
                print("%s ERROR: could not init Patent object, [[%s]]\nString"
                      " data is shown below\n%s" % (i, exc, elem),
                      file=errors)
                # There is no Patent for this record; carrying on would use
                # an unbound name or the previous record's object.
                continue

            p.set_file(results)
            if first:
                p.print_csv_titles()
                first = False
            if p.is_valid():
                p.print_csv()
                counter += 1
                if p.has_warnings():
                    print("%s WARNINGS %s" % (i, p.get_warnings()),
                          file=warn_log)
            else:
                print("%s ERROR %s " % (i, p.errors))
                print("[%s] %s %s\n" % (i, elem, p.errors), file=errors)
示例#3
0
def process_zip(file):
    """Parse one zipped patent-assignment dump and print its records as CSV.

    The XML text is obtained with ``unzip_patent(file)``.  The
    ``<patent-assignments>`` container is cut out and split into one string
    per ``<patent-assignment>`` record; only the text around the container
    is parsed with BeautifulSoup, to read the dump-level values handed to
    ``Patent.set_zip_info``.

    Output files are named after the archive: ``<stem>.csv`` under
    ``RES_PATH``, ``errors_<stem>.txt`` under ``ERROR_PATH`` and
    ``warnings_<stem>.txt`` under ``WARN_PATH``.  Every record that could be
    constructed has ``print_csv`` called on it; validation only decides
    whether it is counted (and its warnings logged) or logged as an error.

    :param file: path to the ``.zip`` archive (the name shadows a Python 2
        builtin but is kept so existing callers keep working).
    """
    from itertools import islice

    data = unzip_patent(file)
    # Swap only the extension; str.replace("zip", ...) would also rewrite a
    # "zip" that appears elsewhere in the file name.
    stem = os.path.splitext(os.path.basename(file))[0]

    from parser import Patent
    print("Processing zip...")

    usp = "<patent-assignments"
    usp_c = "</patent-assignments>"
    index_1 = data.find(usp)
    # index_2 points just past the closing tag.
    index_2 = data.find(usp_c) + len(usp_c)

    head = data[:index_1]
    # Keep the whole trailer; slicing to len(data) - 1 dropped its last
    # character.
    tail = data[index_2:]

    t1 = time()
    s = BeautifulSoup(head + tail, "lxml")
    t2 = time()
    print("Soup Time: %s" % (t2 - t1))

    # index_2 already covers the closing tag, so it is not added again here
    # (doing so pulled the start of the trailer into the last record).
    patents = data[index_1:index_2].split("<patent-assignment>")
    patents = ["<patent-assignment>" + chunk for chunk in patents]

    root = s("us-patent-assignments")[0]
    dtd = root["dtd-version"]
    date_produced = root["date-produced"]
    ak = s("action-key-code")[0].string
    date_tag = s("transaction-date")[0]
    d = date_tag.string
    if not d:
        # Fall back to the nested <date> element when the tag itself has no
        # single string.
        d = date_tag("date")[0].string

    results_path = os.path.join(RES_PATH, stem + ".csv")
    errors_path = os.path.join(ERROR_PATH, 'errors_%s.txt' % stem)
    warnings_path = os.path.join(WARN_PATH, 'warnings_%s.txt' % stem)

    # The context manager closes all three files even if a record raises.
    with open(results_path, "w+") as results, \
            open(errors_path, "w+") as errors, \
            open(warnings_path, "w+") as warn_log:
        Patent.set_zip_info(dtd, date_produced, ak, d)
        first = True
        # NOTE(review): counter, p_time and pr_time are accumulated here but
        # not reported by this function.
        counter = 1
        p_time = 0   # seconds spent constructing Patent objects
        pr_time = 0  # seconds spent on the per-record output calls

        print("Patents to parse %s" % len(patents))
        # patents[0] is the text before the first record (the container's
        # opening tag), so the records start at index 1.
        for i, elem in enumerate(islice(patents, 1, None), 1):
            p = None
            exc = None
            retries = 0
            while p is None and retries < 10:
                tx1 = time()
                try:
                    p = Patent(elem)
                except Exception as e:
                    # Copy to another name: Python 3 unbinds "e" when the
                    # handler ends.
                    exc = e
                    retries += 1
                    tb = traceback.format_exc()
                    print("Exception during init... retrying (%s)\n - %s" %
                          (retries, tb))
                else:
                    # Only successful constructions are timed.
                    p_time += (time() - tx1)

            if p is None:
                print("%s ERROR: could not init Patent object, [[%s]]\nString"
                      " data is shown below\n%s" % (i, exc, elem),
                      file=errors)
                # There is no Patent for this record; carrying on would use
                # a non-Patent placeholder or the previous record's object.
                continue

            txp = time()
            p.set_file(results)
            if first:
                p.print_csv_titles()
                first = False
            if p.is_valid():
                counter += 1
                if p.has_warnings():
                    print("%s WARNINGS %s" % (i, p.get_warnings()),
                          file=warn_log)
            else:
                print("%s ERROR %s " % (i, p.errors))
                print("[%s] %s %s\n" % (i, elem, p.errors), file=errors)
            # The row is printed whether or not the record validated.
            p.print_csv()
            # Accumulate like p_time; plain assignment kept only the last
            # record's timing.
            pr_time += (time() - txp)
示例#4
0
        p.set_file(results)
        if first:
            p.print_csv_titles()
            first = False
        if p.is_valid():
            counter += 1
            if p.has_warnings():
                print("%s WARNINGS %s" % (i, p.get_warnings()), file=warnings)
        else:
            print("%s ERROR %s " % (i, p.errors))
            print("[%s] %s %s\n" % (i, elem, p.errors), file=errors)
        p.print_csv()
        pr_time = (time() - txp)

    if first:
        Patent.print_empty_titles(results)
        Patent.print_zip_info(results)

    print("TIMES:\n\tPatent %s\n\tPrint %s" % (p_time, pr_time))

    print("Total printed: %s\nTotal patents: %s" % (counter, len(patents)))
    if counter == len(patents):
        # os.remove(file)
        pass


def unzip_patent(patent_zip):
    """Return the contents of the XML member of a patent zip archive.

    The member read is the archive's own base name with its extension
    replaced by ``.xml`` (``foo.zip`` is expected to contain ``foo.xml``).

    :param patent_zip: path to the zip archive.
    :return: the member's contents, as returned by ``ZipFile.read``.
    :raises KeyError: if the archive holds no member with that name.
    """
    # Swap only the extension; str.replace("zip", "xml") would also rewrite
    # a "zip" that appears elsewhere in the file name.
    stem = os.path.splitext(os.path.basename(patent_zip))[0]
    # The context manager closes the archive even if the read fails.
    with zipfile.ZipFile(patent_zip) as archive:
        return archive.read(stem + ".xml")
def process_zip(data):
    """Time the parsing and printing of the first few records of a dump.

    Newlines are stripped from ``data``, the ``<patent-assignments>``
    container is cut out and split into one string per
    ``<patent-assignment>`` record, and the text around the container is
    parsed with BeautifulSoup to read the dump-level values handed to
    ``Patent.set_zip_info``.

    Records are processed until four valid ones have been printed
    (``counter`` starts at 1 and processing stops at 5).  Output goes to
    ``ad<date>.csv``, ``errors<date>.txt`` and ``warnings<date>.txt`` in the
    ``test_results`` directory, and timing figures are printed at the end.

    :param data: full text of the XML document.
    """
    from itertools import islice

    data = data.replace('\n', '')
    print("Processing zip...")
    total = 0.0
    process = 0.0
    printing = 0.0

    usp = "<patent-assignments"
    usp_c = "</patent-assignments>"
    # The opening tag is only searched for in the first 1% of the document.
    q = len(data) // 100
    index_1 = data.find(usp, 0, q)
    # index_2 points just past the closing tag.
    index_2 = data.find(usp_c) + len(usp_c)

    head = data[:index_1]
    # Keep the whole trailer; slicing to len(data) - 1 dropped its last
    # character.
    tail = data[index_2:]

    t1 = time()
    s = BeautifulSoup(head + tail, "lxml")
    t2 = time()
    print("Soup is ready...Time: %s" % (t2 - t1))

    # index_2 already covers the closing tag, so it is not added again here
    # (doing so pulled the start of the trailer into the last chunk).
    patents = data[index_1:index_2].split("<patent-assignment>")
    patents = ["<patent-assignment>" + chunk for chunk in patents]

    root = s("us-patent-assignments")[0]
    dtd = root["dtd-version"]
    date_produced = root["date-produced"]
    ak = s("action-key-code")[0].string
    # The date is read from the nested <date> element.  An earlier statement
    # that assigned a filtered ".contents" onto the throw-away result list
    # never touched the parsed tree and has been dropped.
    d = s("transaction-date")[0]("date")[0].string

    results_path = os.path.join("test_results", 'ad%s.csv' % d)
    errors_path = os.path.join("test_results", 'errors%s.txt' % d)
    warnings_path = os.path.join("test_results", 'warnings%s.txt' % d)

    # The context manager closes all three files even if a record raises.
    with open(results_path, "w+") as results, \
            open(errors_path, "w+") as errors, \
            open(warnings_path, "w+") as warn_log:
        t3 = time()

        print("DTD %s, DP %s, AK %s, D %s" % (dtd, date_produced, ak, d))
        print("Time gathering zip info: %s" % (t3 - t2))

        Patent.set_zip_info(dtd, date_produced, ak, d)
        first = True
        t4 = time()
        counter = 1
        # patents[0] is the text before the first record, and the final
        # chunk (which ends with the container's closing tag) is left out.
        records = islice(patents, 1, len(patents) - 1)
        for i, elem in enumerate(records, 1):
            if counter >= 5:
                # Limit reached; nothing further would be processed.
                break
            t_x_1 = time()
            p = Patent(elem)
            t_x_2 = time()
            process += (t_x_2 - t_x_1)
            p.set_file(results)
            if first:
                p.print_csv_titles()
                first = False
            if p.is_valid():
                tv1 = time()
                p.print_csv()
                counter += 1
                tv2 = time()
                # NOTE(review): this is true for every count that is not a
                # multiple of 10000, so it fires on each record here; it
                # looks inverted if periodic progress output was intended.
                if counter % 10000:
                    print("Printing %s" % (tv2-tv1), end="")
                    print("%s OK" % i)
                printing += (tv2-tv1)
                if p.has_warnings():
                    print(p.get_warnings(), file=warn_log)
            else:
                print("\n%s Not valid %s " % (i, p.errors))
                print("[%s] %s %s\n" % (i, elem, p.errors), file=errors)
            t5 = time()
            total += (t5-t4)
            t4 = t5

        if first:
            # No record was processed at all.
            Patent.print_empty_titles(results)
            Patent.print_zip_info(results)

    # counter starts at 1, so these divisions can never hit zero.
    print("Processing %s\nPrinting %s\nTotal %s, count: %s" %
          (process / counter, printing / counter, total / counter, counter))
示例#6
0
def process_zip(data):
    """Split a patent-assignment XML dump into records and print them as CSV.

    The ``<patent-assignments>`` container is cut out of ``data`` and split
    into one string per ``<patent-assignment>`` record.  Only the text around
    the container is parsed with BeautifulSoup, to read the dump-level
    values (DTD version, production date, action key code, transaction
    date) that are passed to ``Patent.set_zip_info``.

    Every record is wrapped in a ``Patent`` that is given the results file
    (``res_<date>.csv`` under ``RES_PATH``).  Valid records are printed,
    their warnings go to ``warnings_<date>.txt`` under ``WARN_PATH``, and
    records that fail validation or cannot be constructed are logged to
    ``errors_<date>.txt`` under ``ERROR_PATH``.

    :param data: full text of the XML document.
    """
    from itertools import islice
    from parser import Patent
    print("Processing zip...")

    usp = "<patent-assignments"
    usp_c = "</patent-assignments>"
    # The opening tag is only searched for in the first 1% of the document.
    q = len(data) // 100
    index_1 = data.find(usp, 0, q)
    # index_2 points just past the closing tag.
    index_2 = data.find(usp_c) + len(usp_c)

    head = data[:index_1]
    # Keep the whole trailer; slicing to len(data) - 1 dropped its last
    # character.
    tail = data[index_2:]

    t1 = time()
    s = BeautifulSoup(head + tail, "lxml")
    t2 = time()
    print("Soup is ready...Time: %s" % (t2 - t1))

    # index_2 already covers the closing tag, so it is not added again here
    # (doing so pulled the start of the trailer into the last record).
    patents = data[index_1:index_2].split("<patent-assignment>")
    patents = ["<patent-assignment>" + chunk for chunk in patents]

    root = s("us-patent-assignments")[0]
    dtd = root["dtd-version"]
    date_produced = root["date-produced"]
    ak = s("action-key-code")[0].string
    date_tag = s("transaction-date")[0]
    d = date_tag.string
    if not d:
        # Fall back to the nested <date> element when the tag itself has no
        # single string.
        d = date_tag("date")[0].string

    results_path = os.path.join(RES_PATH, 'res_%s.csv' % d)
    errors_path = os.path.join(ERROR_PATH, 'errors_%s.txt' % d)
    warnings_path = os.path.join(WARN_PATH, 'warnings_%s.txt' % d)

    # The context manager closes all three files even if a record raises.
    with open(results_path, "w+") as results, \
            open(errors_path, "w+") as errors, \
            open(warnings_path, "w+") as warn_log:
        Patent.set_zip_info(dtd, date_produced, ak, d)
        first = True
        # NOTE(review): counter tallies valid records but is not reported by
        # this function.
        counter = 0

        # patents[0] is the text before the first record (the container's
        # opening tag), so the records start at index 1.
        for i, elem in enumerate(islice(patents, 1, None), 1):
            p = None
            exc = None
            retries = 0
            while p is None and retries < 10:
                try:
                    p = Patent(elem)
                except Exception as e:
                    # Copy to another name: Python 3 unbinds "e" when the
                    # handler ends.
                    exc = e
                    retries += 1
                    print("Exception during init... retrying (%s)" % retries)

            if p is None:
                print("%s ERROR: could not init Patent object, [[%s]]\nString"
                      " data is shown below\n%s" % (i, exc, elem),
                      file=errors)
                # There is no Patent for this record; carrying on would use
                # an unbound name or the previous record's object.
                continue

            p.set_file(results)
            if first:
                p.print_csv_titles()
                first = False
            if p.is_valid():
                p.print_csv()
                counter += 1
                if p.has_warnings():
                    print("%s WARNINGS %s" % (i, p.get_warnings()),
                          file=warn_log)
            else:
                print("%s ERROR %s " % (i, p.errors))
                print("[%s] %s %s\n" % (i, elem, p.errors), file=errors)
示例#7
0
def process_zip(file):
    """Parse one zipped patent-assignment dump and print its records as CSV.

    The XML text is obtained with ``unzip_patent(file)``.  The
    ``<patent-assignments>`` container is cut out and split into one string
    per ``<patent-assignment>`` record; only the text around the container
    is parsed with BeautifulSoup, to read the dump-level values handed to
    ``Patent.set_zip_info``.

    Output files are named after the archive: ``<stem>.csv`` under
    ``RES_PATH``, ``errors_<stem>.txt`` under ``ERROR_PATH`` and
    ``warnings_<stem>.txt`` under ``WARN_PATH``.  Every record that could be
    constructed has ``print_csv`` called on it; validation only decides
    whether it is counted (and its warnings logged) or logged as an error.

    :param file: path to the ``.zip`` archive (the name shadows a Python 2
        builtin but is kept so existing callers keep working).
    """
    from itertools import islice

    data = unzip_patent(file)
    # Swap only the extension; str.replace("zip", ...) would also rewrite a
    # "zip" that appears elsewhere in the file name.
    stem = os.path.splitext(os.path.basename(file))[0]

    from parser import Patent
    print("Processing zip...")

    usp = "<patent-assignments"
    usp_c = "</patent-assignments>"
    index_1 = data.find(usp)
    # index_2 points just past the closing tag.
    index_2 = data.find(usp_c) + len(usp_c)

    head = data[:index_1]
    # Keep the whole trailer; slicing to len(data) - 1 dropped its last
    # character.
    tail = data[index_2:]

    t1 = time()
    s = BeautifulSoup(head + tail, "lxml")
    t2 = time()
    print("Soup Time: %s" % (t2 - t1))

    # index_2 already covers the closing tag, so it is not added again here
    # (doing so pulled the start of the trailer into the last record).
    patents = data[index_1:index_2].split("<patent-assignment>")
    patents = ["<patent-assignment>" + chunk for chunk in patents]

    root = s("us-patent-assignments")[0]
    dtd = root["dtd-version"]
    date_produced = root["date-produced"]
    ak = s("action-key-code")[0].string
    date_tag = s("transaction-date")[0]
    d = date_tag.string
    if not d:
        # Fall back to the nested <date> element when the tag itself has no
        # single string.
        d = date_tag("date")[0].string

    results_path = os.path.join(RES_PATH, stem + ".csv")
    errors_path = os.path.join(ERROR_PATH, 'errors_%s.txt' % stem)
    warnings_path = os.path.join(WARN_PATH, 'warnings_%s.txt' % stem)

    # The context manager closes all three files even if a record raises.
    with open(results_path, "w+") as results, \
            open(errors_path, "w+") as errors, \
            open(warnings_path, "w+") as warn_log:
        Patent.set_zip_info(dtd, date_produced, ak, d)
        first = True
        # NOTE(review): counter, p_time and pr_time are accumulated here but
        # not reported by this function.
        counter = 1
        p_time = 0   # seconds spent constructing Patent objects
        pr_time = 0  # seconds spent on the per-record output calls

        print("Patents to parse %s" % len(patents))
        # patents[0] is the text before the first record (the container's
        # opening tag), so the records start at index 1.
        for i, elem in enumerate(islice(patents, 1, None), 1):
            p = None
            exc = None
            retries = 0
            while p is None and retries < 10:
                tx1 = time()
                try:
                    p = Patent(elem)
                except Exception as e:
                    # Copy to another name: Python 3 unbinds "e" when the
                    # handler ends.
                    exc = e
                    retries += 1
                    tb = traceback.format_exc()
                    print("Exception during init... retrying (%s)\n - %s" %
                          (retries, tb))
                else:
                    # Only successful constructions are timed.
                    p_time += (time() - tx1)

            if p is None:
                print("%s ERROR: could not init Patent object, [[%s]]\nString"
                      " data is shown below\n%s" % (i, exc, elem),
                      file=errors)
                # There is no Patent for this record; carrying on would use
                # a non-Patent placeholder or the previous record's object.
                continue

            txp = time()
            p.set_file(results)
            if first:
                p.print_csv_titles()
                first = False
            if p.is_valid():
                counter += 1
                if p.has_warnings():
                    print("%s WARNINGS %s" % (i, p.get_warnings()),
                          file=warn_log)
            else:
                print("%s ERROR %s " % (i, p.errors))
                print("[%s] %s %s\n" % (i, elem, p.errors), file=errors)
            # The row is printed whether or not the record validated.
            p.print_csv()
            # Accumulate like p_time; plain assignment kept only the last
            # record's timing.
            pr_time += (time() - txp)