def _csv_row(row):
    """Return the cells to write for ``row``, or ``None`` to skip the row.

    A row is kept when it is a "label" row (first cell non-empty, cells 1-3
    empty) or when at least two of its cells are exactly ``'-'``.  Label rows
    keep all their cells; other rows have their empty cells dropped.  Commas
    inside the kept cells are replaced by ``/``.
    """
    is_label = (row[0] != '' and row[1] == ''
                and row[2] == '' and row[3] == '')
    # The dash count is taken on the untouched row, before blanks are removed.
    has_range = sum(1 for cell in row if cell == '-') >= 2
    if not (is_label or has_range):
        return None
    kept = row if is_label else [cell for cell in row if cell != '']
    return [cell.replace(",", "/") for cell in kept]


def main(argv):
    """Convert the tables of a PDF into a filtered CSV file.

    ``argv[1]`` is the path of the PDF handed to ``get_tables`` and
    ``argv[2]`` is the path of the CSV file to write.  See ``_csv_row`` for
    the rule deciding which rows are written.

    Raises:
        IndexError: if ``argv`` has fewer than three entries, or if a row is
            too short for the four-cell label check.
    """
    # Both files are managed by ``with`` so the PDF handle is released too;
    # the original never closed it and rebound its name to the output file.
    with open(argv[1], "rb") as pdf_file:
        tables = get_tables(pdf_file)
        # NOTE(review): binary mode is what the Python 2 csv module expects;
        # on Python 3 this would have to be open(..., "w", newline="").
        with open(argv[2], "wb") as csv_file:
            # One writer for the whole file instead of one per row.
            writer = csv.writer(csv_file)
            for table in tables:
                for row in table:
                    cells = _csv_row(row)
                    if cells is not None:
                        writer.writerow(cells)
Exemplo n.º 2
0
def _find_patient_header(tables):
    """Scan every line of ``tables`` for patient ID, gender and birth date.

    Returns a ``(patient_id, gender, date_str)`` tuple.  ``patient_id`` and
    ``date_str`` stay ``None`` when nothing matches; ``gender`` is ``'M'``
    unless some line contains the word "female".
    """
    patient_id, gender, date_str = None, 'M', None
    for page in tables:
        for line in page:
            full_str = ''.join(line).lower()
            if ('id' in full_str and 'date' in full_str
                    and patient_id is None):
                ind1 = full_str.index('id')
                ind2 = full_str.index('date')
                print(ind2 - ind1)
                # The ID is only accepted when 'date' starts exactly 11
                # characters after 'id'.
                if ind2 - ind1 == 11:
                    patient_id = full_str[ind1 + 2:ind2].upper()
            if 'female' in full_str:
                gender = 'F'
            if 'birth' in full_str and '/' in full_str and date_str is None:
                # NOTE(review): ind2 and ind3 are offsets inside slices, not
                # positions in full_str, so this slice is probably not the
                # intended date.  The intended date format is not visible
                # here, so the original arithmetic is kept unchanged.
                ind1 = full_str.index('/')
                ind2 = full_str[ind1 + 1:].index('/')
                ind3 = full_str[ind2 + 1:].index('/')
                date_str = full_str[ind1 - 4:ind3 - 4]
    return patient_id, gender, date_str


def _parse_reading(cells):
    """Return ``(sbp, dbp, pulse)`` as ints, or three NaNs if any is unusable."""
    try:
        return int(cells[2]), int(cells[3]), int(cells[4])
    except (IndexError, ValueError):
        return np.nan, np.nan, np.nan


def extract_bp_records(filepath):
    """Extract blood-pressure readings from a PDF report into a DataFrame.

    The tables returned by ``get_tables`` are scanned twice: once for the
    patient header (ID, gender, date of birth) and once for measurement rows,
    i.e. rows whose first non-empty cell is five characters long and contains
    ``':'`` and whose second non-empty cell is ``'A'`` or ``'E'``.

    Args:
        filepath: path of the PDF file to read.

    Returns:
        A ``pandas.DataFrame`` indexed by ``('ID', 'time')`` with the columns
        ``gender``, ``date of birth``, ``T``, ``SBP``, ``DBP`` and ``pulse``.
        ``'E'`` rows and ``'A'`` rows with missing or non-integer readings
        hold NaN in the three measurement columns.  Duplicate rows are
        dropped.
    """
    columns = [
        'ID', 'gender', 'date of birth', 'time', 'T', 'SBP', 'DBP', 'pulse'
    ]
    bp_records = {col: [] for col in columns}

    # ``with`` guarantees the PDF handle is closed; the original leaked it.
    with open(filepath, 'rb') as pdffile:
        result = get_tables(pdffile)
        patient_id, gender, date_str = _find_patient_header(result)
        print(patient_id, gender, date_str)

        for page in result:
            for line in page:
                f_line = [txt for txt in line if len(txt) > 0]
                if len(f_line) < 2:
                    continue
                if (':' not in f_line[0] or len(f_line[0]) != 5
                        or f_line[1] not in ['A', 'E']):
                    continue

                # All three readings are parsed before any is stored.  The
                # original appended them one by one and then appended NaNs
                # on failure, which could leave the columns with different
                # lengths.
                if f_line[1] == 'A':
                    sbp, dbp, pulse = _parse_reading(f_line)
                else:
                    sbp = dbp = pulse = np.nan

                bp_records['time'].append(f_line[0])
                bp_records['T'].append(f_line[1])
                bp_records['SBP'].append(sbp)
                bp_records['DBP'].append(dbp)
                bp_records['pulse'].append(pulse)
                bp_records['ID'].append(patient_id)
                bp_records['gender'].append(gender)
                bp_records['date of birth'].append(date_str)

    df = pd.DataFrame.from_dict(bp_records)[columns]
    return df.drop_duplicates().set_index(['ID', 'time'])
def echo(wb, matrix, title, n, postmonth, postday):
    """Dump every cell extracted from ``matrix`` into a new sheet of ``wb``.

    A sheet named ``title`` is created, then each cell of each row returned
    by ``get_tables(matrix)`` is written at the spreadsheet coordinate that
    matches its row/column position inside its table.  Every table starts
    again at row 1, so later tables overwrite earlier ones.

    ``n``, ``postmonth`` and ``postday`` are accepted but not used here
    (presumably so the signature matches ``fill`` -- confirm with callers).
    """
    wb.create_sheet(title=title)
    # NOTE(review): newer openpyxl releases deprecate get_sheet_by_name in
    # favour of ``wb[title]``; confirm the pinned version before switching.
    sheet = wb.get_sheet_by_name(title)
    for table in get_tables(matrix):
        # enumerate() yields the real position.  The original looked the
        # position up with list.index(), which returns the first equal
        # cell/row and therefore misplaced any repeated value.
        for row_number, row in enumerate(table, 1):
            for col_number, value in enumerate(row, 1):
                sheet[get_column_letter(col_number) + str(row_number)] = value
def _ammonia_location(label):
    """Map a raw location label from the price table to its output name."""
    if "Yuzhnyy" in label:
        return "Yuzhnyy"
    if "Southeast Asia" in label:
        return "SE Asia"
    if "(duty paid" in label:
        words = label.split(" ")
        return words[0] + " " + words[1] + " (duty paid)"
    if "(duty unpaid" in label:
        words = label.split(" ")
        return words[0] + " " + words[1] + " (duty unpaid)"
    if "Gulf" in label and "Tampa" not in label:
        return "US Gulf (MS, LA, TX)"
    if "Tampa" in label and "Gulf" not in label:
        return "Tampa"
    return label


def fill(wb, matrix, title, n, postmonth, postday):
    """Write the ammonia price rows extracted from ``matrix`` to a new sheet.

    A sheet named ``title`` is created in ``wb`` with a header in row 1.
    Rows returned by ``get_tables(matrix)`` are scanned in order.  Output
    starts after a row containing the cell ``'Fob prices'`` (prefix "FOB ")
    and switches to the prefix "CFR " after a row containing the cell
    ``'Delivered prices'``.  It stops for good once a row's first cell
    contains ``'Prices'``.

    Args:
        wb: workbook providing ``create_sheet`` / ``get_sheet_by_name``.
        matrix: object handed unchanged to ``get_tables``.
        title: name of the sheet to create.
        n: selects the price column counted from the end of each row:
            14 -> last, 7 -> second to last, anything else -> third to last.
        postmonth, postday: used to build the "Date" cell as
            ``day/month/2016`` (the year is hard-coded).

    Raises:
        ValueError: if a selected price cell cannot be converted to float.
    """
    wb.create_sheet(title=title)
    # NOTE(review): newer openpyxl releases deprecate get_sheet_by_name in
    # favour of ``wb[title]``; confirm the pinned version before switching.
    sheet = wb.get_sheet_by_name(title)
    in_prices = False   # a 'Fob prices' row has been seen
    finished = False    # a row whose first cell contains 'Prices' was seen
    res = get_tables(matrix)

    sheet['A1'] = "Label Index"
    sheet['B1'] = "Date"
    sheet['C1'] = "Low Price"
    sheet['D1'] = "High Price"
    sheet['E1'] = "Average"
    sheet['F1'] = "Unit"
    sheet['G1'] = "Frequency"

    # Data starts on row 2.  The original started at row 1 and so overwrote
    # the header that was written just above.
    i = 2

    if n == 14:
        n = -1
    elif n == 7:
        n = -2
    else:
        n = -3

    skipped_labels = ["", "Of which:", "Fob prices", "Delivered prices"]
    for subres in res:
        for subsubres in subres:
            if 'Fob prices' in subsubres:
                in_prices = True
                aux = "FOB "
            if 'Delivered prices' in subsubres:
                aux = "CFR "
            if 'Prices' in subsubres[0]:
                finished = True
            if not in_prices or finished:
                continue

            label = subsubres[0]
            if label in skipped_labels:
                continue
            # Labels containing '-' are skipped unless they name Tampa or
            # the US Gulf.
            if "-" in label and "Tampa" not in label and "Gulf" not in label:
                continue

            row = str(i)
            sheet['A' + row] = "Ammonia " + aux + _ammonia_location(label)
            sheet['B' + row] = (str(postday) + "/" + str(postmonth)
                                + "/2016").strip("'")
            price = subsubres[n]
            if "-" in price:
                # A "low-high" range: split it into its two bounds.
                bounds = price.strip('*').split('-')
                sheet['C' + row] = float(bounds[0].strip("'"))
                sheet['D' + row] = float(bounds[1].strip("'"))
            else:
                sheet['C' + row] = sheet['D' + row] = float(
                    price.strip('*').strip("'"))
            sheet['E' + row] = "=AVERAGE(C" + row + ":D" + row + ")"
            sheet['F' + row] = "$/T"
            sheet['G' + row] = "Weekly"
            i += 1
Exemplo n.º 5
0
 def __init__(self, fileobj=None, filename=None):
     """Load the tables of a PDF given either a path or an open file object.

     ``filename`` wins when both arguments are given.  The handle is stored
     on ``self.fh`` and the result of ``get_tables`` on ``self.raw_tables``.

     Raises:
         ImportError: if ``get_tables`` is ``None``.
         TypeError: if neither ``filename`` nor ``fileobj`` is provided.
     """
     if get_tables is None:
         raise ImportError("pdftables is not installed")
     if filename is not None:
         # PDF files are binary: text mode ('r') may translate newlines or
         # try to decode the bytes, so the file is opened with 'rb'.
         self.fh = open(filename, 'rb')
     elif fileobj is not None:
         self.fh = fileobj
     else:
         raise TypeError('You must provide one of filename or fileobj')
     self.raw_tables = get_tables(self.fh)
Exemplo n.º 6
0
 def __init__(self, fileobj=None, filename=None):
     """Read a PDF's tables from ``filename`` or from an open ``fileobj``.

     When both are supplied ``filename`` is used.  ``self.fh`` keeps the
     handle and ``self.raw_tables`` the value returned by ``get_tables``.

     Raises:
         ImportError: if ``get_tables`` is ``None``.
         TypeError: if both ``filename`` and ``fileobj`` are ``None``.
     """
     if get_tables is None:
         raise ImportError("pdftables is not installed")
     if filename is not None:
         # Opened in binary mode because a PDF is not text; the original
         # 'r' mode is unsafe wherever text mode differs from binary.
         self.fh = open(filename, 'rb')
     elif fileobj is not None:
         self.fh = fileobj
     else:
         raise TypeError('You must provide one of filename or fileobj')
     self.raw_tables = get_tables(self.fh)
Exemplo n.º 7
0
def test_it_includes_page_numbers():
    """
    page_number is 1-indexed, as defined in the PDF format
    table_number is 1-indexed
    """
    # The context manager closes the fixture even when an assertion fails;
    # the original never closed it.
    with open('fixtures/sample_data/AnimalExampleTables.pdf', 'rb') as fh:
        result = get_tables(fh)
        assert_equals(result[0].total_pages, 4)
        assert_equals(result[0].page_number, 2)
        assert_equals(result[1].total_pages, 4)
        assert_equals(result[1].page_number, 3)
        assert_equals(result[2].total_pages, 4)
        assert_equals(result[2].page_number, 4)
Exemplo n.º 8
0
def _test_sample_pdf(short_filename):
    """Extract the tables of one sample PDF and diff them against fixtures.

    The number of extracted tables must equal
    ``get_expected_number_of_tables(short_filename)``.  Each table is then
    rendered with ``to_string``, written to
    ``ACTUAL_DIR/<short_filename>_<index>.txt`` and compared with the file of
    the same name in ``EXPECTED_DIR`` through ``diff_table_files``.
    """
    with open(join(SAMPLE_DIR, short_filename), 'rb') as pdf_file:
        tables = get_tables(pdf_file)

    assert_equal(get_expected_number_of_tables(short_filename), len(tables))
    for table_num, table in enumerate(tables):
        table_filename = "{}_{}.txt".format(short_filename, table_num)
        expected_filename = join(EXPECTED_DIR, table_filename)
        actual_filename = join(ACTUAL_DIR, table_filename)

        # The payload is UTF-8 encoded bytes, so the file is opened in
        # binary mode; writing bytes to a text-mode handle fails on Python 3.
        with open(actual_filename, 'wb') as out_file:
            out_file.write(to_string(table).encode('utf-8'))

        diff_table_files(expected_filename, actual_filename)
Exemplo n.º 9
0
def _test_sample_pdf(short_filename):
    """Check the tables extracted from one sample PDF against stored output.

    First the table count is compared with
    ``get_expected_number_of_tables(short_filename)``.  Then every table is
    serialised with ``to_string`` into
    ``ACTUAL_DIR/<short_filename>_<index>.txt`` and diffed against the
    same-named file under ``EXPECTED_DIR`` using ``diff_table_files``.
    """
    with open(join(SAMPLE_DIR, short_filename), "rb") as pdf_file:
        tables = get_tables(pdf_file)

    assert_equal(get_expected_number_of_tables(short_filename), len(tables))
    for table_num, table in enumerate(tables):
        table_filename = "{}_{}.txt".format(short_filename, table_num)
        expected_filename = join(EXPECTED_DIR, table_filename)
        actual_filename = join(ACTUAL_DIR, table_filename)

        # Binary mode matches the encoded bytes being written; a text-mode
        # handle rejects bytes on Python 3.
        with open(actual_filename, "wb") as out_file:
            out_file.write(to_string(table).encode("utf-8"))

        diff_table_files(expected_filename, actual_filename)
Exemplo n.º 10
0
    def parse_pdf_files(self, cleaner=None):
        """
        Parse every PDF listed in ``self.pdfs`` and keep the cleaned rows.

        ``cleaner`` is called with no arguments to build the row cleaner, so
        it should be the RowCleaner class (or a compatible class/factory);
        RowCleaner is used when it is omitted or falsy.  Each row of each
        table is passed to the cleaner's ``get_row`` and the truthy results
        are stored, in order, on ``self.parsed_data``.
        """
        row_cleaner = (cleaner or RowCleaner)()

        collected = []
        for pdf_path in self.pdfs:
            with open(pdf_path, 'rb') as handle:
                for table in get_tables(handle):
                    for raw_row in table:
                        cleaned = row_cleaner.get_row(raw_row)
                        # Rows the cleaner rejects come back falsy.
                        if cleaned:
                            collected.append(cleaned)
        self.parsed_data = collected
Exemplo n.º 11
0
def parse_laptimes(filepath):
    """
    Turn an FIA qualifying/practice lap-time PDF into a flat table of laps.

    An example report is the `Japan Qualifying Report <http://www.fia.com/sites/default/files/championship/event_report/documents/2014_15_JPN_F1_Q0_Timing_QualifyingSessionLapTimes_V01.pdf>`_.

    :param filepath: a string pathname to the pdf on your local computer
    :return: - a pandas dataframe with one row per lap and the columns
        ``driver_no``, ``name`` and ``time``
    """
    driver_numbers = []
    driver_names = []
    lap_times = []

    with open(filepath, 'rb') as pdf:
        # Every parsed page/table contributes its drivers; each driver's
        # name and number are repeated once per recorded lap so the three
        # lists stay aligned.
        for table in get_tables(pdf):
            for driver in get_drivers(table):
                lap_count = len(driver['times'])
                repeated_names = [driver['name']] * lap_count
                repeated_numbers = [int(driver['num'])] * lap_count

                driver_names += repeated_names
                driver_numbers += repeated_numbers
                lap_times += driver['times']

        number_column = np.asarray(driver_numbers)
        name_column = np.asarray(driver_names)
        time_column = np.asarray(lap_times)
        return pd.DataFrame({
            'driver_no': number_column,
            'name': name_column,
            'time': time_column
        })
def fill(wb, matrix, title, n, postmonth, postday):
    """Write the sulphur price rows extracted from ``matrix`` to a new sheet.

    A sheet named ``title`` is created in ``wb`` with a header in row 1.
    Output starts at the first row whose first cell contains ``'Med cfr'``.
    It stops for good at the first later row whose first cell contains
    ``'*'``, or whose first cell is empty while its third is not.  In
    between, every row with a non-empty second cell is written, and labels
    are taken positionally from a fixed product list.  The output row counter
    restarts at 2 for every table.

    Args:
        wb: workbook providing ``create_sheet`` / ``get_sheet_by_name``.
        matrix: object handed unchanged to ``get_tables``.
        title: name of the sheet to create.
        n: selects the price column counted from the end of each row:
            14 -> -2, 7 -> -3, anything else -> -4.
        postmonth, postday: used to build the "Date" cell as
            ``day/month/2016`` (the year is hard-coded).

    Raises:
        ValueError: if a selected price cell cannot be converted to float.
    """
    # Built once here; the original rebuilt this list for every row.
    prod_list = [
        "Sulphur CFR Med (incl N. Africa)",
        "Sulphur CFR Med (small lots N africa)",
        "Sulphur CFR Med (small lots Others Markets)",
        "Sulphur CFR North Africa (contract)",
        "Sulphur FOB Med (small lots other markets)",
        "Sulphur CFR China (Contract)",
        "Sulphur CFR China (Spot)",
        "Sulphur CFR India (Spot)",
        "Sulphur (Liquid) CFR Brazil",
        "Sulphur FOB Vancouver (Contract)",
        "Sulphur FOB Vancouver (Spot)",
        "Sulphur FOB California (Spot)",
        "Sulphur FOB Middle East",
        "Sulphur FOB Middle East (Contract)",
        "Sulphur FOB Middle East (Spot)",
        "Sulphur FOB Qatar (Tasweeq QSP)",
        "Sulphur FOB Saudi Arabia (Armaco)",
        "Sulphur FOB Middle East (Adnoc)",
        "Sulphur CPT NW Europe",
        "Sulphur DEL Benelux",
        "Sulphur CFR Tampa/Central Florida Deliv.",
        "Sulphur CFR Houston (Spot)",
        "Sulphur Ex-tank Galveston",
    ]

    wb.create_sheet(title=title)
    # NOTE(review): newer openpyxl releases deprecate get_sheet_by_name in
    # favour of ``wb[title]``; confirm the pinned version before switching.
    sheet = wb.get_sheet_by_name(title)
    flag1 = flag2 = 0   # flag1: start marker seen, flag2: end marker seen
    res = get_tables(matrix)

    sheet['A1'] = "Label Index"
    sheet['B1'] = "Date"
    sheet['C1'] = "Low Price"
    sheet['D1'] = "High Price"
    sheet['E1'] = "Average"
    sheet['F1'] = "Unit"
    sheet['G1'] = "Frequency"

    if n == 14:
        n = -2
    elif n == 7:
        n = -3
    else:
        n = -4

    for subres in res:
        i = 2
        for subsubres in subres:
            if 'Med cfr' in subsubres[0]:
                flag1 = 1
            if ('*' in subsubres[0]
                    or (subsubres[0] == "" and subsubres[2] != "")) and flag1 == 1:
                flag2 = 1
            if not (flag1 == 1 and subsubres[1] != "" and flag2 == 0):
                continue

            if i == len(prod_list) + 2:
                # Every product label has been used; further rows are ignored.
                # The call form prints the same text on Python 2 and 3.
                print("Done")
                continue

            row = str(i)
            sheet['A' + row] = prod_list[i - 2]
            sheet['B' + row] = (str(postday) + "/" + str(postmonth)
                                + "/2016").strip("'")
            price = subsubres[n]
            if "-" in price:
                # A "low-high" range: split it into its two bounds.
                bounds = price.split('-')
                sheet['C' + row] = float(bounds[0].strip("'").strip('*'))
                sheet['D' + row] = float(bounds[1].strip("'").strip('*'))
            else:
                sheet['C' + row] = sheet['D' + row] = float(
                    price.strip('*').strip("'"))
            sheet['E' + row] = "=AVERAGE(C" + row + ":D" + row + ")"
            sheet['F' + row] = "$/T"
            sheet['G' + row] = "Weekly"
            i += 1
Exemplo n.º 13
0
def parse_laptimes(filepath):
    """
    Read an FIA lap-time report (qualifying or practice) and return its laps
    as a pandas dataframe.  A sample document is the
    `Japan Qualifying Report <http://www.fia.com/sites/default/files/championship/event_report/documents/2014_15_JPN_F1_Q0_Timing_QualifyingSessionLapTimes_V01.pdf>`_.

    :param filepath: a string pathname to the pdf on your local computer
    :return: - a pandas dataframe holding, for every lap, the driver number
        (``driver_no``), the driver ``name`` and the lap ``time``
    """
    with open(filepath, 'rb') as report:
        all_names, all_numbers, all_times = [], [], []

        # Walk the drivers of every extracted table and repeat the name and
        # number of each driver once per lap time.
        for page_table in get_tables(report):
            for entry in get_drivers(page_table):
                laps = len(entry['times'])
                name_block = [entry['name']] * laps
                number_block = [int(entry['num'])] * laps

                all_names.extend(name_block)
                all_numbers.extend(number_block)
                all_times.extend(entry['times'])

        # Convert to arrays in the same order as the original, then build
        # the frame.
        all_numbers = np.asarray(all_numbers)
        all_names = np.asarray(all_names)
        all_times = np.asarray(all_times)
        frame = pd.DataFrame(
            {'driver_no': all_numbers, 'name': all_names, 'time': all_times}
        )
        return frame
Exemplo n.º 14
0
def test_it_includes_table_numbers():
    """The first extracted table reports its position and count on its page."""
    # The context manager closes the fixture even when an assertion fails;
    # the original never closed it.
    with open('fixtures/sample_data/AnimalExampleTables.pdf', 'rb') as fh:
        result = get_tables(fh)
        assert_equals(result[0].table_number_on_page, 1)
        assert_equals(result[0].total_tables_on_page, 1)
def _forward_fill(row):
    """Return a copy of ``row`` where blanks take the value to their left.

    The first cell is never altered.
    """
    filled = []
    for position, cell in enumerate(row):
        if cell == "" and position != 0:
            filled.append(filled[-1])
        else:
            filled.append(cell)
    return filled


def fill(wb, matrix, title, n, postmonth, postday):
    """Write the phosphate price rows extracted from ``matrix`` to a new sheet.

    A sheet named ``title`` is created in ``wb`` with a header in row 1.
    Blank cells of every extracted row are first filled from the left.
    Output starts at the first row whose first cell contains
    ``'US Gulf fob bulk'`` and stops for good at the first row whose first
    cell contains ``'Casablanca'``.  Section-heading rows and rows with an
    empty first cell are skipped.  Labels are taken positionally from a fixed
    product list and the output row counter restarts at 2 for every table.

    Args:
        wb: workbook providing ``create_sheet`` / ``get_sheet_by_name``.
        matrix: object handed unchanged to ``get_tables``.
        title: name of the sheet to create; its second space-separated word
            selects which column mapping is used for ``n``.
        n: selects the price column counted from the end of each row.
            14 / 7 / other map to -2 / -3 / -4, or to -1 / -2 / -3 when the
            second word of ``title`` is Mai, Juin, Juillet or Aout and the
            title is not "2 Juin".
        postmonth, postday: used to build the "Date" cell as
            ``day/month/2016`` (the year is hard-coded).

    Raises:
        IndexError: if ``title`` contains no space.
        ValueError: if a "low-high" price cell cannot be converted to float.
    """
    # Built once here; the original rebuilt these lists for every row.
    prod_list = [
        "DAP FOB US Gulf",
        "DAP FOB Morocco",
        "DAP FOB Tunisia",
        "DAP FOB Jordan",
        "DAP FOB Saudi Arabia",
        "DAP FOB Baltic",
        "DAP FOB Australia",
        "DAP FOB China",
        "DAP FCA Benelux",
        "DAP bulk CFR India",
        "DAP FOB Nola",
        "DAP FOB C.Florida",
        "TSP FOB Bulgaria",
        "TSP FOB Morocco",
        "TSP FOB Tunisia",
        "TSP FOB Lebanon",
        "TSP FOB Mexico",
        "TSP Bagged FOB China",
        "MAP FOB Black Sea",
        "MAP FOB Baltic",
        "MAP CFR Brazil",
        "Phosphoric Acid FOB US Golf",
        "Phosphoric Acid CFR India",
        "Phosphate Rock FOB Casablanca",
    ]
    skipped_labels = ["PHOSPHATE ROCK", "PHOSPHORIC ACID", "GTSP", "MAP",
                      "Black Sea fob bulk", ""]

    wb.create_sheet(title=title)
    # NOTE(review): newer openpyxl releases deprecate get_sheet_by_name in
    # favour of ``wb[title]``; confirm the pinned version before switching.
    sheet = wb.get_sheet_by_name(title)
    flag1 = flag2 = 0   # flag1: start marker seen, flag2: end marker seen
    res = get_tables(matrix)

    sheet['A1'] = "Label Index"
    sheet['B1'] = "Date"
    sheet['C1'] = "Low Price"
    sheet['D1'] = "High Price"
    sheet['E1'] = "Average"
    sheet['F1'] = "Unit"
    sheet['G1'] = "Frequency"

    if (title.split(" ")[1] not in ["Mai", "Juin", "Juillet", "Aout"]
            or title == "2 Juin"):
        if n == 14:
            n = -2
        elif n == 7:
            n = -3
        else:
            n = -4
    else:
        if n == 14:
            n = -1
        elif n == 7:
            n = -2
        else:
            n = -3
    # The call form prints the same text on Python 2 and 3.
    print(n)

    # Each blank takes the value to its left.  The original located the
    # neighbour with list.index(""), which always returns the FIRST blank,
    # so any blank after a gap was filled from the wrong cell.
    filled_tables = [[_forward_fill(row) for row in table] for table in res]

    for subres in filled_tables:
        i = 2
        for subsubres in subres:
            if 'US Gulf fob bulk' in subsubres[0]:
                flag1 = 1
            if 'Casablanca' in subsubres[0]:
                flag2 = 1
            if not (flag1 == 1 and subsubres[0] not in skipped_labels
                    and flag2 == 0):
                continue

            if i == len(prod_list) + 2:
                # Every product label has been used; further rows are ignored.
                print("Done")
                continue

            row = str(i)
            not_meaningful = False
            sheet['A' + row] = prod_list[i - 2]
            sheet['B' + row] = (str(postday) + "/" + str(postmonth)
                                + "/2016").strip("'")
            price = subsubres[n]
            if "-" in price:
                # A "low-high" range: split it into its two bounds.
                bounds = price.split('-')
                sheet['C' + row] = float(bounds[0].strip("'").strip('*'))
                sheet['D' + row] = float(bounds[1].strip("'").strip('*'))
            elif "n.m" not in price:
                # NOTE(review): single values are stored as text, unlike the
                # range bounds above which become floats; kept as in the
                # original.
                sheet['C' + row] = sheet['D' + row] = (
                    price.strip('*').strip("'"))
            else:
                sheet['C' + row] = sheet['D' + row] = "n.m."
                not_meaningful = True

            if not_meaningful:
                sheet['E' + row] = "n.m."
            else:
                sheet['E' + row] = "=AVERAGE(C" + row + ":D" + row + ")"
            sheet['F' + row] = "$/T"
            sheet['G' + row] = "Weekly"
            i += 1
Exemplo n.º 16
0
    'Country', 'Child Labor 2005-2012 (%) total',
    'Child Labor 2005-2012 (%) male', 'Child Labor 2005-2012 (%) female',
    'Child Marriage 2005-2012 (%) married by 15',
    'Child Marriage 2005-2012 (%) married by 18',
    'Birth registration 2005-2012 (%)',
    'Female Genital mutilation 2002-2012 (prevalence), women',
    'Female Genital mutilation 2002-2012 (prevalence), girls',
    'Female Genital mutilation 2002-2012 (support)',
    'Justification of wife beating 2005-2012 (%) male',
    'Justification of wife beating 2005-2012 (%) female',
    'Violent discipline 2005-2012 (%) total',
    'Violent discipline 2005-2012 (%) male',
    'Violent discipline 2005-2012 (%) female'
]

# Extract the tables inside a context manager so the PDF handle is closed;
# the original passed an anonymous open() call that was never closed.
with open('data/EN-FINAL Table 9.pdf', 'rb') as pdf_file:
    all_tables = get_tables(pdf_file)

# Holds the first cell of a row whose third cell is empty until the next
# kept row, where it is prepended to that row's first cell.
first_name = False
final_data = []

for table in all_tables:
    # The first five rows of every table are skipped.
    for row in table[5:]:
        # Skip rows whose first cell is empty or starts with a digit.
        if row[0] == '' or row[0][0].isdigit():
            continue
        elif row[2] == '':
            first_name = row[0]
            continue
        if first_name:
            row[0] = u'{} {}'.format(first_name, row[0])
            first_name = False
Exemplo n.º 17
0
from pdftables import get_tables
import pprint

headers = [
    'Site', 'Parameter', 'Date (LST)', 'Year', 'Month', 'Day', 'Hour', 'Value',
    'Unit', 'Duration', 'QC Name'
]

# Extract the tables inside a context manager so the PDF handle is closed;
# the original passed an anonymous open() call that was never closed.
with open('AQI_2016_11.pdf', 'rb') as pdf_file:
    all_tables = get_tables(pdf_file)

first_name = False  # not used by this script
data = []

# Pair every row with the column headers; the first five rows of each table
# are skipped.
for table in all_tables:
    for row in table[5:]:
        data.append(dict(zip(headers, row)))

# Parse each record and regroup it under its site name.
final_data = []
for ele in data:
    # Skip records flagged as missing.
    if ele.get('QC Name') == 'Missing':
        continue
    # pop() reads and removes the key in one step; like the original ``del``
    # it raises KeyError when the record has no 'Site' entry.
    site = ele.pop('Site')

    final_data.append({site: ele})

pprint.pprint(final_data)
Exemplo n.º 18
0
headers = ['Country', 'Child Labor 2005-2012 (%) total',
           'Child Labor 2005-2012 (%) male',
           'Child Labor 2005-2012 (%) female',
           'Child Marriage 2005-2012 (%) married by 15',
           'Child Marriage 2005-2012 (%) married by 18',
           'Birth registration 2005-2012 (%)',
           'Female Genital mutilation 2002-2012 (prevalence), women',
           'Female Genital mutilation 2002-2012 (prevalence), girls',
           'Female Genital mutilation 2002-2012 (support)',
           'Justification of wife beating 2005-2012 (%) male',
           'Justification of wife beating 2005-2012 (%) female',
           'Violent discipline 2005-2012 (%) total',
           'Violent discipline 2005-2012 (%) male',
           'Violent discipline 2005-2012 (%) female']

# Extract the tables inside a context manager so the PDF handle is closed;
# the original passed an anonymous open() call that was never closed.
with open('EN-FINAL Table 9.pdf', 'rb') as pdf_file:
    all_tables = get_tables(pdf_file)

# Holds the first cell of a row whose third cell is empty until the next
# kept row, where it is prepended to that row's first cell.
first_name = False
final_data = []

for table in all_tables:
    # The first five rows of every table are skipped.
    for row in table[5:]:
        # Skip rows whose first cell is empty or starts with a digit.
        if row[0] == '' or row[0][0].isdigit():
            continue
        elif row[2] == '':
            first_name = row[0]
            continue
        if first_name:
            row[0] = u'{} {}'.format(first_name, row[0])
            first_name = False
from pdftables import get_tables
import pprint

# Extract the tables inside a context manager so the PDF handle is closed;
# the original passed an anonymous open() call that was never closed.
with open('E:/xhy_python/data-wrangling-master/data/chp5/EN-FINAL Table 9.pdf', 'rb') as pdf_file:
    all_tables = get_tables(pdf_file)
# print all_tables[0][:6]

# for table in all_tables:
# 	for row in table[5:]:
# 		if row[2]=='':
# 			print row

headers = ['Country', 'Child Labor 2005-2012 (%) total',
           'Child Labor 2005-2012 (%) male',
           'Child Labor 2005-2012 (%) female',
           'Child Marriage 2005-2012 (%) married by 15',
           'Child Marriage 2005-2012 (%) married by 18',
           'Birth registration 2005-2012 (%)',
           'Female Genital mutilation 2002-2012 (prevalence), women',
           'Female Genital mutilation 2002-2012 (prevalence), girls',
           'Female Genital mutilation 2002-2012 (support)',
           'Justification of wife beating 2005-2012 (%) male',
           'Justification of wife beating 2005-2012 (%) female',
           'Violent discipline 2005-2012 (%) total',
           'Violent discipline 2005-2012 (%) male',
           'Violent discipline 2005-2012 (%) female']

# first_name=''
first_name = False
final_data = []

for table in all_tables:
Exemplo n.º 20
0
headers = ['Country', 'Child Labor 2005-2012 (%) total',
           'Child Labor 2005-2012 (%) male',
           'Child Labor 2005-2012 (%) female',
           'Child Marriage 2005-2012 (%) married by 15',
           'Child Marriage 2005-2012 (%) married by 18',
           'Birth registration 2005-2012 (%)',
           'Female Genital mutilation 2002-2012 (prevalence), women',
           'Female Genital mutilation 2002-2012 (prevalence), girls',
           'Female Genital mutilation 2002-2012 (support)',
           'Justification of wife beating 2005-2012 (%) male',
           'Justification of wife beating 2005-2012 (%) female',
           'Violent discipline 2005-2012 (%) total',
           'Violent discipline 2005-2012 (%) male',
           'Violent discipline 2005-2012 (%) female']

# Extract the tables inside a context manager so the PDF handle is closed;
# the original passed an anonymous open() call that was never closed.
with open('../../data/chp5/EN-FINAL Table 9.pdf', 'rb') as pdf_file:
    all_tables = get_tables(pdf_file)

first_name = False
final_data = []

for table in all_tables:
    for row in table[5:]:
        if row[0] == '' or row[0][0].isdigit():
            continue
        elif row[2] == '':
            first_name = row[0]
            continue
        if first_name:
            row[0] = u'{} {}'.format(first_name, row[0])
            first_name = False