Python main 예제들, pdf2txt.main Python 예제들

예제 #1

0

파일 보기

파일: pdf2json.py 프로젝트: fcabaud/pyReq

def get_req_from_pdf(fileNameIn, regExp, fileNameOut):
  """ Transform pdf file fileNameIn into requirements
       stored in json file fileNameOut via reg exp regExp

     The pdf is first converted to the intermediate text file
     "../work/output.txt", whose whole content is then scanned with regExp.
     For every match, item[0] and item[1] are passed to pyReq.add together
     with fileNameIn, so regExp is expected to define at least two groups.

     :arg fileNameIn: pdf file containing requirements
     :type fileNameIn: string

     :arg regExp: regular expression
     :type regExp: string

     :arg fileNameOut: Json file
     :type fileNameOut: string
  """
  print("Extract : %s"%fileNameIn)
  # NOTE(review): whether "-A" is read as an option or as the program name
  # depends on the pdf2txt version in use -- confirm.
  pdf2txt.main(["-A", "-o", "../work/output.txt", fileNameIn])

  # 1) Read the converted text in one go; the with-block closes the file
  #    even if a later step raises.
  with open("../work/output.txt","r") as fp:
    concatenatedData = fp.read()
  #
  # 2) provide the concatenated string to regexp
  extract = re.findall(regExp, concatenatedData)
  #
  # 3) write json file
  requirements = pyReq(fileNameOut)
  for item in extract:
    requirements.add(item[0], fileNameIn, item[1])
  #
  # 4) Free resources
  # presumably dropping the last reference lets pyReq finalize fileNameOut --
  # TODO confirm against the pyReq class
  del(requirements)

예제 #2

0

파일 보기

파일: item.py 프로젝트: bendmorris/zot

 def get_full_text(self, storage_dir):
     """Run pdf2txt on every attachment of this item.

     Returns the string "No PDF attachments." when the item has no
     ``attachments`` attribute; otherwise returns None after pdf2txt.main
     has been called once per attachment.
     """
     if not hasattr(self, 'attachments'):
         return "No PDF attachments."

     for pdf_attachment in self.attachments:
         # TODO: read text from pdf
         pdf_path = self.format_filename(pdf_attachment, storage_dir)
         pdf2txt.main(['pdf2txt', pdf_path])

예제 #3

0

파일 보기

def pdftotextcovert():
    """Convert the PDF named by the global ``name`` to text, then call loadfiledata().

    pdf2txt writes its output to "pdftotextconvertedfile.txt" in the current
    directory; that file is read back and decoded with undecodable bytes
    dropped.
    """
    print("***File Path :" + name)
    arr = ['arguments', '-o', 'pdftotextconvertedfile.txt', name]
    pdf2txt.main(arr)
    # The with-block closes the file; the original left the handle open.
    with open("pdftotextconvertedfile.txt", "r") as f:
        data = f.read()
    # NOTE(review): the decoded text is not used afterwards in this function;
    # loadfiledata() takes no arguments -- confirm how it obtains the data.
    data = unicode(data, errors='ignore')
    loadfiledata()

예제 #4

0

파일 보기

파일: tExpress.py 프로젝트: Why-Not-Sky/TExpress

def exec_pdf2txt(pdf_file):
    """Convert pdf_file to plain text with pdf2txt and return the output path.

    The output path is pdf_file with its extension replaced by ".txt".
    Only the final extension is swapped, so a ".pdf" elsewhere in the path
    is left alone and a name without a lowercase ".pdf" suffix no longer
    yields an output path equal to the input (which would overwrite the PDF).
    """
    # Command-line equivalent:
    # pdf2txt.py -t text -o 2900510720726.txt 2900510720726.pdf
    import os
    import pdf2txt

    txt_file = os.path.splitext(pdf_file)[0] + '.txt'
    args = ['pdf2txt.py', '-t', 'text', '-o', txt_file, pdf_file]
    pdf2txt.main(args)
    return txt_file

예제 #5

0

파일 보기

파일: tExpress.py 프로젝트: Why-Not-Sky/TExpress

def exec_pdf2txt(pdf_file):
    """Convert pdf_file to plain text with pdf2txt and return the output path.

    The output path is pdf_file with its extension replaced by ".txt".
    Only the final extension is swapped, so a ".pdf" elsewhere in the path
    is left alone and a name without a lowercase ".pdf" suffix no longer
    yields an output path equal to the input (which would overwrite the PDF).
    """
    # Command-line equivalent:
    # pdf2txt.py -t text -o 2900510720726.txt 2900510720726.pdf
    import os
    import pdf2txt

    txt_file = os.path.splitext(pdf_file)[0] + '.txt'
    args = ['pdf2txt.py', '-t', 'text', '-o', txt_file, pdf_file]
    pdf2txt.main(args)
    return txt_file

예제 #6

0

파일 보기

def transforme_pdf_en_txt(fichier_PDF, REPERTOIRE_TXT):
    """Copy the contracts section of a PDF agenda into a text file.

    The PDF is converted one page at a time with pdf2txt into a temporary
    file in REPERTOIRE_TXT. Lines are copied to
    ``<REPERTOIRE_TXT>/<PDF base name>.txt`` starting with the first line
    that contains TITRE_SECTION_20, and processing stops at the first line
    that contains TITRE_SECTION_30.

    :arg fichier_PDF: path of the PDF file to process
    :arg REPERTOIRE_TXT: directory receiving the temporary and final text files
    """

    # Title of the section that holds the contracts.
    TITRE_SECTION_20 = " Affaires contractuelles"

    # Title of the section following the contracts section.
    TITRE_SECTION_30 = " Administration et finances"

    prefixe_txt = os.path.splitext(os.path.basename(fichier_PDF))[0]
    fichier_TXT_temp = os.path.join(REPERTOIRE_TXT, prefixe_txt + '_temp.txt')
    fichier_TXT = os.path.join(REPERTOIRE_TXT, prefixe_txt + '.txt')

    est_dans_section_affaires_contractuelles = False
    est_dans_section_suivante = False
    compteur_page = 0

    # The with-block closes the output file even when a page fails to convert.
    with open(fichier_TXT, "w") as odj_traites:
        try:
            # NOTE(review): the loop only ends once TITRE_SECTION_30 is seen;
            # confirm what pdf2txt does for page numbers past the last page.
            while not est_dans_section_suivante:
                compteur_page += 1

                print("Traitement de la page %s" % compteur_page)

                args = [
                    'pdf2txt',
                    '-p',
                    str(compteur_page),
                    '-o',
                    fichier_TXT_temp,
                    fichier_PDF,
                ]

                pdf2txt.main(args)

                with open(fichier_TXT_temp, 'r') as f:

                    for ligne in f:

                        if not est_dans_section_affaires_contractuelles:
                            if TITRE_SECTION_20 in ligne:
                                est_dans_section_affaires_contractuelles = True

                        if TITRE_SECTION_30 in ligne:
                            est_dans_section_suivante = True
                            break

                        elif est_dans_section_affaires_contractuelles:

                            # NOTE(review): this prefix looks like the repr of
                            # a list; confirm it can match a raw text line.
                            if ligne.startswith("['Page "):
                                # Do not write the footer's page number;
                                # the rest of this page is skipped.
                                break
                            else:
                                # Append the line to fichier_TXT
                                odj_traites.write(ligne)
        finally:
            # Remove the temporary page file on success and on failure alike.
            if os.path.exists(fichier_TXT_temp):
                os.remove(fichier_TXT_temp)

예제 #7

0

파일 보기

파일: odj2txt.py 프로젝트: PascalRobichaudDO101/odj2txt

def transforme_pdf_en_txt(fichier_PDF, REPERTOIRE_TXT):
    """Copy the contracts section of a PDF agenda into a text file.

    The PDF is converted one page at a time with pdf2txt into a temporary
    file in REPERTOIRE_TXT. Lines are copied to
    ``<REPERTOIRE_TXT>/<PDF base name>.txt`` starting with the first line
    that contains TITRE_SECTION_20, and processing stops at the first line
    that contains TITRE_SECTION_30.

    :arg fichier_PDF: path of the PDF file to process
    :arg REPERTOIRE_TXT: directory receiving the temporary and final text files
    """

    # Title of the section that holds the contracts.
    TITRE_SECTION_20 = " Affaires contractuelles"

    # Title of the section following the contracts section.
    TITRE_SECTION_30 = " Administration et finances"

    prefixe_txt = os.path.splitext(os.path.basename(fichier_PDF))[0]
    fichier_TXT_temp = os.path.join(REPERTOIRE_TXT, prefixe_txt + '_temp.txt')
    fichier_TXT = os.path.join(REPERTOIRE_TXT, prefixe_txt + '.txt')

    est_dans_section_affaires_contractuelles = False
    est_dans_section_suivante = False
    compteur_page = 0

    # The with-block closes the output file even when a page fails to convert.
    with open(fichier_TXT, "w") as odj_traites:
        try:
            # NOTE(review): the loop only ends once TITRE_SECTION_30 is seen;
            # confirm what pdf2txt does for page numbers past the last page.
            while not est_dans_section_suivante:
                compteur_page += 1

                print("Traitement de la page %s" % compteur_page)

                args = [
                    'pdf2txt',
                    '-p', str(compteur_page),
                    '-o', fichier_TXT_temp,
                    fichier_PDF,
                ]

                pdf2txt.main(args)

                with open(fichier_TXT_temp, 'r') as f:

                    for ligne in f:

                        if not est_dans_section_affaires_contractuelles:
                            if TITRE_SECTION_20 in ligne:
                                est_dans_section_affaires_contractuelles = True

                        if TITRE_SECTION_30 in ligne:
                            est_dans_section_suivante = True
                            break

                        elif est_dans_section_affaires_contractuelles:

                            # NOTE(review): this prefix looks like the repr of
                            # a list; confirm it can match a raw text line.
                            if ligne.startswith("['Page "):
                                # Do not write the footer's page number;
                                # the rest of this page is skipped.
                                break
                            else:
                                # Append the line to fichier_TXT
                                odj_traites.write(ligne)
        finally:
            # Remove the temporary page file on success and on failure alike.
            if os.path.exists(fichier_TXT_temp):
                os.remove(fichier_TXT_temp)

예제 #8

0

파일 보기

파일: DataManagement_new.py 프로젝트: Weiming-Hu/text-based-six-degree

def convertPDF2TXT_thread(fullpath, lock):
    """Convert one PDF to a cleaned-up text file in the txt_fmt directory.

    The PDF is converted with pdf2txt, then form feeds are removed, newlines
    become spaces, runs of spaces are collapsed and every character other
    than ASCII letters, '.' and ' ' is dropped. The result ends up in
    ``txt_fmt/pdf<name>.txt``; the intermediate ``txt_fmt/<name>.txt`` is
    deleted.

    :arg fullpath: path of the PDF; the output names are derived by slicing
        the last 3/4 characters off its base name, i.e. a three-letter
        extension is assumed
    :arg lock: context manager guarding the "process exits" message
    :returns: 0 conversion succeeded and a new txt file was created,
        1 conversion failed,
        2 the intermediate txt file already exists and nothing was changed
    """
    txt_dir = "D:/EclipseWorkspace/TextbasedSixDegree/txt_fmt/"
    base_name = os.path.basename(fullpath)
    raw_path = txt_dir + base_name[:-3] + "txt"
    fmt_path = txt_dir + base_name[:-4] + "_fmt.txt"
    final_path = txt_dir + "pdf" + base_name[:-3] + "txt"

    _argv = [
        "D:/EclipseWorkspace/TextbasedSixDegree/pdf2txt.py", "-o",
        raw_path, fullpath
    ]

    # NOTE(review): raw_path is removed after every successful run, so this
    # check does not detect an earlier successful conversion -- confirm
    # whether final_path was meant to be tested instead.
    if os.path.isfile(raw_path):
        with lock:
            print("process exits with id: %d " % os.getpid())
        return 2

    try:
        pdf2txt.main(_argv)
    except Exception as error:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are no longer reported as a failed conversion.
        print("PDF 2 TXT conversion failed. Info:")
        print(error)
        with lock:
            print("process exits with id: %d " % os.getpid())
        return 1

    # format txt
    with open(raw_path, 'r') as txt:
        data = txt.read()
    data = data.replace("\f", '')
    data = data.replace('\n', ' ')
    data = re.sub(" [ ]+", " ", data)
    data = re.sub("[^a-zA-Z. ]+", "", data)
    with open(fmt_path, 'w') as txt_fmt:
        txt_fmt.write(data)

    os.remove(raw_path)
    os.rename(fmt_path, final_path)

    with lock:
        print("process exits with id: %d " % os.getpid())
    return 0

예제 #9

0

파일 보기

파일: gui.py 프로젝트: nogajaakanksha/AQPG

def pdftotextcovert():
    """Convert the PDF named by the global ``name`` to text and show its lemmas.

    pdf2txt writes to "pdftotextconvertedfile.txt"; the text is read back,
    split into words on non-word characters, each word is lemmatized, and
    the space-joined result is shown in ``lbl1`` and passed to nounphrases().
    """
    print("***File Path :" + name)
    arr = ['arguments', '-o', 'pdftotextconvertedfile.txt', name]
    pdf2txt.main(arr)
    # The with-block closes the file; the original left the handle open.
    with open("pdftotextconvertedfile.txt", "r") as f:
        data = f.read()
    data = unicode(data, errors='ignore')
    wordList = re.sub(r"[^\w]", " ", data).split()
    # Every lemma is preceded by one space, as in the original output;
    # join avoids rebuilding the string on each iteration.
    testdata = "".join(" " + lemmatizer.lemmatize(word) for word in wordList)
    lbl1.configure(text="File Data :" + testdata)
    nounphrases(testdata)

예제 #10

0

파일 보기

파일: pdf_parser.py 프로젝트: dwheelerau/modules

 def dump_file(self,tag="text",outfile=None):
     '''Dump self.filename through pdf2txt into a file under self.location.

     tag = xml, tag, text,html (passed to pdf2txt as the -t value)
     outfile = output file name inside self.location; "tmp.txt" when omitted

     An IOError raised by pdf2txt is reported as "Missing file!" instead of
     being propagated.
     '''
     # Fixed: the original assigned to the misspelled name "outfil", so a
     # caller-supplied outfile was never placed under self.location.
     if outfile:
         outfile=self.location+"/"+outfile
     else:
         outfile=self.location+"/tmp.txt"
     #path = self.location+"/"+self.filename
     cmd =  ["spacer","-t",tag,"-o",outfile, self.filename]
     try:
         pdf2txt.main(cmd)
     except IOError:
         print("Missing file!")

예제 #11

0

파일 보기

파일: filehandler.py 프로젝트: jamesra/copycatcher

def pdf_to_text(file_path):
    """Return the text of file_path, converting it with pdf2txt when needed.

    The text file sits next to the PDF, with the extension replaced by
    '.txt'. When it already exists it is read as is; otherwise pdf2txt
    creates it first. Reading is delegated to text_from_txt_file.
    """
    import pdf2txt

    txt_path = os.path.splitext(file_path)[0] + '.txt'
    abs_txt_path = os.path.abspath(txt_path)

    print(abs_txt_path)
    if not os.path.exists(txt_path):
        abs_pdf_path = os.path.abspath(file_path)
        pdf2txt.main(argv=['pdf2txt', '-o', abs_txt_path, abs_pdf_path ])
        return text_from_txt_file(abs_txt_path)

    # Cached conversion found: reuse it, addressed by its original path.
    return text_from_txt_file(txt_path)

예제 #12

0

파일 보기

파일: TopicModeling.py 프로젝트: andreslechuga/arte_mexicano_antiguo

def convertirLibros(parametros):
	"""Run pdf2txt.main on the configured directories.

	The argument list is an empty first entry followed by
	parametros.ruta_general and parametros.ruta_base_txts. The two values
	returned by pdf2txt.main are handed back as a (flag, unconverted books)
	pair.
	"""
	argumentos = ["", parametros.ruta_general, parametros.ruta_base_txts]
	resultado = pdf2txt.main(argumentos)
	bandera, librosNoConvertidos = resultado
	return bandera, librosNoConvertidos

예제 #13

0

파일 보기

파일: parseFile.py 프로젝트: michaelgilhooly/parse-pdf-files

def read_pdfs_directory():
    """Append the text of every PDF in pdfs/ to datafile.txt.

    clean_up() runs first. Each ``pdfs/*.pdf`` file is converted by pdf2txt
    into the scratch file "individualfile.txt", whose content is then
    appended to "datafile.txt".
    """
    print("*****PROCESS STARTED*****\n")
    print("Cleaning directory")
    clean_up()
    list_of_pdf_files = glob.glob('pdfs/*.pdf')
    print("List of files that are going to be parsed:")
    print(list_of_pdf_files)
    with open("datafile.txt", "a") as myfile:
        for individual in list_of_pdf_files:
            print("Reading: {}".format(individual))
            pdf2txt.main(['', '-o', 'individualfile.txt', '-t', 'text', individual])
            # Close the scratch file after each PDF; the original leaked one
            # open handle per iteration.
            with open('individualfile.txt', 'r') as individual_file:
                myfile.write(individual_file.read())
            print("Finished reading: {}".format(individual))
    print("Completed reading PDF files")
    print("Created datafile.txt from PDFs")

예제 #14

0

파일 보기

def convertPDF(fullpath):
    """Convert one PDF to plain text in the txt_ori directory.

    :arg fullpath: path of the PDF; the output name is its base name with
        the last three characters replaced by "txt", i.e. a three-letter
        extension is assumed
    :returns: 0 conversion succeeded and a new txt file was created,
        1 conversion failed,
        2 the txt file already exists and nothing was changed
    """
    txt_path = ("D:/EclipseWorkspace/TextbasedSixDegree/txt_ori/"
                + os.path.basename(fullpath)[:-3] + "txt")

    _argv = ["D:/EclipseWorkspace/TextbasedSixDegree/pdf2txt.py",
             "-o", txt_path,
             fullpath]

    if os.path.isfile(txt_path):
        return 2

    try:
        pdf2txt.main(_argv)
    except Exception as error:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are no longer reported as a failed conversion.
        print("PDF 2 TXT conversion failed. Info:")
        print(error)
        return 1
    return 0

예제 #15

0

파일 보기

파일: DataManagement_new.py 프로젝트: Weiming-Hu/text-based-six-degree

def convertPDF2TXT_thread(fullpath, lock):
    """Convert one PDF to a cleaned-up text file in the txt_fmt directory.

    The PDF is converted with pdf2txt, then form feeds are removed, newlines
    become spaces, runs of spaces are collapsed and every character other
    than ASCII letters, '.' and ' ' is dropped. The result ends up in
    ``txt_fmt/pdf<name>.txt``; the intermediate ``txt_fmt/<name>.txt`` is
    deleted.

    :arg fullpath: path of the PDF; the output names are derived by slicing
        the last 3/4 characters off its base name, i.e. a three-letter
        extension is assumed
    :arg lock: context manager guarding the "process exits" message
    :returns: 0 conversion succeeded and a new txt file was created,
        1 conversion failed,
        2 the intermediate txt file already exists and nothing was changed
    """
    txt_dir = "D:/EclipseWorkspace/TextbasedSixDegree/txt_fmt/"
    base_name = os.path.basename(fullpath)
    raw_path = txt_dir + base_name[:-3] + "txt"
    fmt_path = txt_dir + base_name[:-4] + "_fmt.txt"
    final_path = txt_dir + "pdf" + base_name[:-3] + "txt"

    _argv = ["D:/EclipseWorkspace/TextbasedSixDegree/pdf2txt.py",
             "-o", raw_path,
             fullpath]

    # NOTE(review): raw_path is removed after every successful run, so this
    # check does not detect an earlier successful conversion -- confirm
    # whether final_path was meant to be tested instead.
    if os.path.isfile(raw_path):
        with lock:
            print("process exits with id: %d " % os.getpid())
        return 2

    try:
        pdf2txt.main(_argv)
    except Exception as error:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are no longer reported as a failed conversion.
        print("PDF 2 TXT conversion failed. Info:")
        print(error)
        with lock:
            print("process exits with id: %d " % os.getpid())
        return 1

    # format txt
    with open(raw_path, 'r') as txt:
        data = txt.read()
    data = data.replace("\f", '')
    data = data.replace('\n', ' ')
    data = re.sub(" [ ]+", " ", data)
    data = re.sub("[^a-zA-Z. ]+", "", data)
    with open(fmt_path, 'w') as txt_fmt:
        txt_fmt.write(data)

    os.remove(raw_path)
    os.rename(fmt_path, final_path)

    with lock:
        print("process exits with id: %d " % os.getpid())
    return 0

예제 #16

0

파일 보기

파일: indi2data.py 프로젝트: varunpillai/bpachackathon

import pdf2txt
from urllib import urlretrieve
import collections

# Column names assigned, in order, to the values found under a
# "Male Female" header line.
headers = [
    "Starting Serial Num", "Ending Serial Num", "Male", "Female", "Others",
    "Total"
]
finaldata = collections.OrderedDict()

# Download each roll PDF, convert it with pdf2txt into a.txt, and scan the
# resulting lines.
for item in range(1580001, 1580213):
    url = "http://ceokarnataka.kar.nic.in/FinalRoll2013_Final/English/WOIMG/AC158/AC" + str(
        item) + ".pdf"
    filename = str(item) + ".pdf"
    urlretrieve(url, filename)
    pdf2txt.main(["", "-M 40", "-p 1", "-oa.txt", filename])

    # NOTE(review): the file object is never closed explicitly.
    infile = open("a.txt").readlines()

    output = collections.OrderedDict()

    for line in infile:
        # Remember the last all-digit token of a line starting with "Part".
        if line.strip().startswith("Part"):
            for el in line.strip().split():
                if el.isdigit():
                    part = el

        # The line after the "Male Female" header supplies the values.
        # NOTE(review): infile.index(line) finds the first equal line, not
        # necessarily the current one. This snippet is cut off below.
        if line.startswith("Male Female"):
            for i in xrange(len(infile[infile.index(line) +
                                       1].strip().split())):
                output[headers[i]] = infile[infile.index(line) +

예제 #17

0

파일 보기

파일: indi2data.py 프로젝트: Vidip/bpachackathon

import json
import pdf2txt
from urllib import urlretrieve
import collections

headers = ["Starting Serial Num", "Ending Serial Num", "Male", "Female", "Others", "Total"]
finaldata = collections.OrderedDict()

for item in range(1580001, 1580213):
    url = "http://ceokarnataka.kar.nic.in/FinalRoll2013_Final/English/WOIMG/AC158/AC" + str(item) + ".pdf"
    filename = str(item) + ".pdf"
    urlretrieve(url, filename)
    pdf2txt.main(["", "-M 40", "-p 1", "-oa.txt", filename])

    infile = open("a.txt").readlines()

    output = collections.OrderedDict()

    for line in infile:
        if line.strip().startswith("Part"):
            for el in line.strip().split():
                if el.isdigit():
                    part = el

        if line.startswith("Male Female"):
            for i in xrange(len(infile[infile.index(line) + 1].strip().split())):
                output[headers[i]] = infile[infile.index(line) + 1].strip().split()[i]

    finaldata["158"+str(item)[-3:]] = output
    print str(item), "complete"

예제 #18

0

파일 보기

import requests
import sys
sys.path.append('/mnt/brick1/justin/nejm/pdfminer-20140328/tools')
import pdf2txt
import logging

# Downloads each linked PDF into temp.pdf, then converts temp.pdf to a text
# file named "<last path component of the link>_<second column>.txt".
logging.basicConfig(filename='save_pdfs.log', level=logging.INFO)
base = 'http://www.nejm.org'

# The first command-line argument names a whitespace-separated file whose
# first column is the link path and whose second column is used in the
# output file name. The with-block closes it when the loop ends or fails.
with open(sys.argv[1], 'r') as flinks:
    for line in flinks:
        logging.info(line)
        tokens = line.rstrip().split()
        if len(tokens) < 2:
            # Blank or one-column lines used to raise IndexError; skip them.
            logging.warning('skipping line without link and label: %r', line)
            continue
        linkname = tokens[0]

        # NOTE(review): the HTTP status is not checked, so an error page
        # would be written to temp.pdf as if it were a PDF.
        response = requests.get(base + linkname)

        with open('temp.pdf', 'wb') as f:
            f.write(response.content)

        linkpathsplit = linkname.split('/')
        fname = linkpathsplit[-1] + '_' + tokens[1] + '.txt'
        pdf2txt.main(['wer', '-o', fname, 'temp.pdf'])

예제 #19

0

파일 보기

파일: main.py 프로젝트: lixiao89/literature_network

alchemyapi = AlchemyAPI()

# ---- Convert pdf to txt (doesn't have to be run if no new pdfs are added) ---

pdf_name = []
# get pdf names
# Only the top level of pdf_path is listed: the break stops walk() after the
# first directory it yields.
for(dirpath, dirnames,filenames) in walk(pdf_path):
    pdf_name.extend(filenames)
    break

# convert pdf to txt
# Paths are built by plain concatenation, so pdf_path and out_txt_path are
# presumably expected to end with a path separator -- TODO confirm.
for f in pdf_name:
    if f[-3:] == "pdf":
        full_pdf_path = pdf_path + f
        full_text_path = out_txt_path + f[0:-3] + "txt"
        # NOTE(review): main is called with three positional arguments, not
        # an argv list; `k` is defined outside this snippet.
        pdf2txt.main(full_pdf_path, k , full_text_path)

# ---------------------------------------------------------------------------

# Same top-level-only listing, this time for the generated text files.
txt_name = []
for(dirpath, dirnames,filenames) in walk(out_txt_path):
    txt_name.extend(filenames)
    break

json_data = {}
entity_list = []
keywords_list = []
concept_list = []
# The body of this loop is cut off in the visible snippet.
for f in txt_name:
    if f[-3:] == "txt":
        full_text_path = out_txt_path + f

예제 #20

0

파일 보기

파일: pdfcount.py 프로젝트: potato3d/pdfcount

def pdf_to_text(pdf, text):
	"""Run pdf2txt on the file `pdf`, writing its output to the file `text`."""
	pdf2txt.main(["", "-o", text, pdf])

예제 #21

0

파일 보기

#
#

#lunghezza = 5 #len(file_list)
# Number of report PDFs to process.
lunghezza = len(file_list)

# One row of seven zero-initialized cells per report.
excel = [[0 for x in range(7)] for y in range(lunghezza)]

# Convert every report to text and load its lines into data_raw.
for f in range(0, lunghezza):
    #for f in range(0,len(file_list)):
    print f
    reportpdf = file_list[f]

    # The same scratch file is overwritten for every report.
    reporttxt = my_dir + '\\temp.txt'

    pdf2txt.main(['', '-o', reporttxt, reportpdf])

    data_raw = []

    testo = open(reporttxt, 'r')

    # Leaving the with-block already closes testo.
    with testo as myfile:
        for line in myfile:
            data_raw.append(line)

    #for i in range(0,len(data_raw)):
    #    print i, data_raw[i]

    # Redundant: the file was closed when the with-block ended.
    testo.close()

    ############## Data ##############

예제 #22

0

파일 보기

파일: sow_extract.py 프로젝트: glazor14/CaptechPDFProcessing

def scrape_and_parse(pdf_file_name, text_file_name):
    """Convert a PDF to text with pdf2txt, then parse the text file.

    pdf2txt.main receives [pdf_file_name, "-o", text_file_name]; afterwards
    sow_parsing_ff is called on text_file_name.
    """
    # NOTE(review): the PDF path is the first list element; whether pdf2txt
    # reads that slot as input or as the program name depends on its version.
    conversion_args = [pdf_file_name, "-o", text_file_name]
    pdf2txt.main(conversion_args)
    sow_parsing_ff(text_file_name)

예제 #23

0

파일 보기

파일: odj2txt.py 프로젝트: deuxpi/odj2txt

def main():
    """Extract the contracts section of every PDF agenda into a text file.

    Each file in REPERTOIRE_PDF is converted page by page with pdf2txt.
    Rows are written to a ';'-delimited file of the same name in
    REPERTOIRE_TXT (with "pdf" replaced by "txt"), starting at the row that
    contains TITRE_SECTION_20 and stopping at the first row that contains
    TITRE_SECTION_30.
    """
    print("")
    print("Debut du traitement")
    print("")

    # Directory where the PDF files are stored
    REPERTOIRE_PDF = "C:\\ContratsOuvertsMtl\\Ordres_du_jour\\PDF"
    # Directory where the resulting text files are saved
    REPERTOIRE_TXT = "C:\\ContratsOuvertsMtl\\Ordres_du_jour\\TXT"

    # Title of the section that holds the contracts
    TITRE_SECTION_20 = " Affaires contractuelles"
    # Title of the section following the contracts section
    TITRE_SECTION_30 = " Administration et finances"

    # Go through the PDF files
    for filename in os.listdir(REPERTOIRE_PDF):

        fichier_PDF = REPERTOIRE_PDF + "\\" + filename
        fichier_TXT = REPERTOIRE_TXT + "\\" + filename.replace("pdf","txt")

        # Per-file state. The original initialized these once before the
        # loop, so only the first PDF was processed and later ones produced
        # empty output files.
        # True once the contracts section has been reached, so the leading
        # pages are not saved needlessly
        est_dans_section_affaires_contractuelle = False
        # False once the contracts section is finished
        continuer = True
        # Page counter
        compteur_page = 0

        # Open fichier_TXT to save the result; the with-block closes it for
        # every PDF, not just the last one.
        with open(fichier_TXT, "w") as odj_traites:
            fodj_traites = csv.writer(odj_traites, delimiter = ';')

            # Go through the pages of the PDF file
            # NOTE(review): the loop only ends once TITRE_SECTION_30 is seen;
            # confirm what pdf2txt does for page numbers past the last page.
            while continuer:

                compteur_page = compteur_page + 1

                # Show the page number as a sign that processing is running
                print("Traitement de la page %s" % compteur_page)

                fichier_page = REPERTOIRE_TXT + "\\page_" + str(compteur_page) + ".txt"
                arg = ["", '-p', str(compteur_page), '-o', fichier_page, fichier_PDF]

                # Convert the PDF page to text
                pdf2txt.main(arg)

                with open(fichier_page, "r") as f:
                    # Read the generated text file
                    reader = csv.reader(f, delimiter = "|")

                    for ligne in reader:

                        # Detect the start of the contracts section
                        if not est_dans_section_affaires_contractuelle:
                            if TITRE_SECTION_20 in str(ligne).encode("utf-8"):
                                est_dans_section_affaires_contractuelle = True

                        # Detect the end of the contracts section
                        if TITRE_SECTION_30 in str(ligne).encode("utf-8"):
                            continuer = False
                            break
                        elif est_dans_section_affaires_contractuelle:
                            # `left` is a helper defined outside this function;
                            # presumably it returns the first 7 characters.
                            if left(str(ligne),7) == "['Page ":
                                # Do not write the footer's page number
                                break
                            else:
                                # Append the row to fichier_TXT
                                fodj_traites.writerow(ligne)

    print("")
    print("Fin du traitement")

예제 #24

0

파일 보기

파일: PDFer.py 프로젝트: lucadalbosco/PDFer

#
#

#lunghezza = 5 #len(file_list)
# Number of report PDFs to process.
lunghezza = len(file_list)

# One row of seven zero-initialized cells per report.
excel = [[0 for x in range(7)] for y in range(lunghezza)] 

# Convert every report to text and load its lines into data_raw.
for f in range(0,lunghezza):
#for f in range(0,len(file_list)):
    print f
    reportpdf = file_list[f]

    # The same scratch file is overwritten for every report.
    reporttxt = my_dir + '\\temp.txt'

    pdf2txt.main(['', '-o', reporttxt, reportpdf]) 

    data_raw = []


    testo = open(reporttxt, 'r')

    # Leaving the with-block already closes testo.
    with testo as myfile:
        for line in myfile:
            data_raw.append(line)

    #for i in range(0,len(data_raw)):
    #    print i, data_raw[i]

    # Redundant: the file was closed when the with-block ended.
    testo.close()

예제 #25

0

파일 보기

파일: indi2data.py 프로젝트: royarpan/bpachackathon

import json
import pdf2txt
from urllib import urlretrieve
import collections

headers = ["Starting Serial Num", "Ending Serial Num", "Male", "Female", "Others", "Total"]
finaldata = collections.OrderedDict()

for item in range(1580001, 1580213):
    url = "http://ceokarnataka.kar.nic.in/FinalRoll2013_Final/English/WOIMG/AC158/AC" + str(item) + ".pdf"
    filename = str(item) + ".pdf"
    urlretrieve(url, filename)
    pdf2txt.main(["", "-M 40", "-p 1", "-oa.txt", filename])

    infile = open("a.txt").readlines()

    output = collections.OrderedDict()

    for line in infile:
        if line.strip().startswith("Part"):
            for el in line.strip().split():
                if el.isdigit():
                    part = el

        if line.startswith("Male Female"):
            for i in xrange(len(infile[infile.index(line) + 1].strip().split())):
                output[headers[i]] = infile[infile.index(line) + 1].strip().split()[i]

    finaldata["158" + str(item)[-3:]] = output
    print str(item), "complete"

예제 #26

0

파일 보기

import json
import pdf2txt
import collections

datalist = []
pdf2txt.main(["", "-M 40", "-ooutput.txt", "voterdata.pdf"])
finaldata = {}
finaldata["assemblywise"] = []
finaldata["total"] = []

infile = open('output.txt').readlines()
sanitized = []

for line in infile:
    if line.strip() and line.strip().split()[0].isdigit() and len(line.strip().split()) > 4:
        sanitized.append(line.strip())

for i in xrange(len(sanitized)):
    if sanitized[i].strip() and sanitized[i].strip().split()[0].isdigit():
        fullstring = ','.join(w for w in sanitized[i].strip().split() if w)
        pointer = 0
        t = collections.OrderedDict()

        pc_no = fullstring.split(',')[pointer]
        pointer += 1
        t["pc_no"] = int(pc_no)

        pc_name = ""
        if not fullstring.split(',')[pointer].isdigit():
            pc_name += fullstring.split(',')[pointer]
            pointer += 1