def pdf2refs(opts, args):
    """Print the reference list of a PDF, then cited publications and URLs.

    The document is parsed with ``pdf2etree(args)`` and its BLOCK elements
    are scanned in order.  A block whose text looks like a "Reference(s)"
    heading switches the scan into the reference section; every non-empty
    block after that is collected.  The collected text is split into single
    references with ``split_refs`` and each one is written to stdout
    (wrapped by ``tag_ref`` unless ``--noxml`` was given).

    Recognised options in ``opts`` (getopt-style ``(flag, value)`` pairs):
        --noxml      write the references without tagging them
        --highlight  passed through to ``tag_ref``

    Returns whatever ``pdf2etree`` returned when that object has no
    ``xpath`` attribute; otherwise returns None.
    """
    xmltag = True
    highlight = False
    for o, a in opts:
        if o == '--noxml':
            xmltag = False
        elif o == '--highlight':
            highlight = True

    tree = pdf2etree(args)
    try:
        blocks = tree.xpath('//BLOCK')
    except AttributeError:
        # Not an element tree: hand the value back to the caller untouched.
        return tree

    pubs = []
    urls = []
    refs = []
    in_refs = False
    for el in blocks:
        pieces = []
        for node in el.iter():
            if node.text is not None:
                pieces.append(node.text.strip())
            # The tail of the block itself belongs to whatever follows it.
            if node != el and node.tail is not None:
                pieces.append(node.tail.strip())
        origtxt = ' '.join(pieces)
        if not origtxt:
            continue

        is_heading = (
            origtxt.strip().startswith(('Reference', 'REFERENCE'))
            or origtxt.find('Reference') > 0
            or origtxt[:20].find('REFERENCE') > 0
        )
        if is_heading:
            in_refs = True
            continue
        if not in_refs:
            continue

        refs.append(origtxt)
        # Italic tokens are taken from the block being collected.  The
        # previous code looked them up on the loop variable left over after
        # the scan, i.e. always on the last BLOCK of the document.
        italics = [
            etree.tostring(token, method='text', encoding="UTF-8")
            for token in el.xpath(".//TOKEN[@italic='yes']")
        ]
        if italics:
            pubs.append(' '.join(italics))

    for ref in split_refs('\n'.join(refs)):
        # findall yields tuples here; the first group is the URL itself.
        for url in url_re.findall(ref):
            urls.append(url[0])
        if xmltag:
            ref = tag_ref(ref, highlight)
        sys.stdout.write(ref + '\n')
        sys.stdout.flush()

    if pubs:
        sys.stdout.write('-'*10 + "\nCited Publications\n" + '-'*10 + '\n')
        for pub in pubs:
            sys.stdout.write(pub + '\n')
        sys.stdout.flush()

    if urls:
        sys.stdout.write('-'*10 + "\nCited URLs\n" + '-'*10 + '\n')
        for url in urls:
            sys.stdout.write(url + '\n')
        sys.stdout.flush()
def opencalaistags(opts, args):
    """Send the full text of a PDF to Open Calais and print the tags.

    The text of every TOKEN element of the parsed document is joined with
    single spaces and submitted as one string.  Each (tag name, URI) pair
    read back from the returned RDF graph is written to stdout as one line,
    the name left-justified in a 35-character column.

    ``opts`` is accepted but not used.  Always returns 0.
    """
    global api_key
    document = pdf2etree(args)

    # Nothing cleverer than the whole text for now.
    token_texts = []
    for token in document.xpath('//TOKEN'):
        token_texts.append(
            etree.tostring(token, method="text", encoding="UTF-8"))
    full_text = ' '.join(token_texts)

    service = OpenCalaisService(
        "http://api1.opencalais.com/enlighten/rest/",
        api_key,
        "PDF SSA4MET Open Calais Tagger")
    graph = service.rdfFromText(full_text)

    for tag_name, tag_uri in service.tagsFromRdf(graph):
        columns = [str(tag_name).ljust(35), str(tag_uri), '\n']
        sys.stdout.writelines(columns)
    sys.stdout.flush()
    return 0
def opencalaistags(opts, args):
    """Send the full text of a PDF to Open Calais and print the tags.

    The text of every TOKEN element of the parsed document is joined with
    single spaces and submitted as one string.  Each (tag name, URI) pair
    read back from the returned RDF graph is written to stdout as one line,
    the name left-justified in a 35-character column.

    ``opts`` is accepted but not used.  Always returns 0.
    """
    global api_key
    document = pdf2etree(args)

    # Nothing cleverer than the whole text for now.
    token_texts = []
    for token in document.xpath('//TOKEN'):
        token_texts.append(
            etree.tostring(token, method="text", encoding="UTF-8"))
    full_text = ' '.join(token_texts)

    service = OpenCalaisService(
        "http://api1.opencalais.com/enlighten/rest/",
        api_key,
        "PDF SSA4MET Open Calais Tagger")
    graph = service.rdfFromText(full_text)

    for tag_name, tag_uri in service.tagsFromRdf(graph):
        columns = [str(tag_name).ljust(35), str(tag_uri), '\n']
        sys.stdout.writelines(columns)
    sys.stdout.flush()
    return 0
def pdf2heads(opts, args):
    """Print the heading-like text of a PDF, one BLOCK per line.

    The document is parsed with ``pdf2etree(args)``.  A token counts as
    heading text when its font size exceeds 1.05 times the mean token font
    size, when it is bold, or when its colour differs from the most common
    token colour.  For every BLOCK that contains such tokens their joined
    text is written to stdout.

    Recognised options in ``opts`` (getopt-style ``(flag, value)`` pairs):
        --noxml      do not wrap the text in <title>/<author>/<heading> tags
        --highlight  wrap the tags in ANSI green escape sequences
        --title      stop after the title block
        --author     skip the title block and stop after the author block

    Returns None.
    """
    xmltag = True
    highlight = False
    titleonly = False
    authonly = False
    for o, a in opts:
        if o == '--noxml':
            xmltag = False
        elif o == '--highlight':
            highlight = True
        elif o == '--title':
            titleonly = True
        elif o == '--author':
            authonly = True

    tree = pdf2etree(args)

    def block_at(page, block):
        """Return the first node matching that PAGE/BLOCK position, or None."""
        found = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(page, block))
        return found[0] if found else None

    # Title: first block of page 1, falling back to page 2.
    title_node = None
    for page in (1, 2):
        title_node = block_at(page, 1)
        if title_node is not None:
            break

    # Author: block 2 of page 1, falling back to blocks 3 and 4.
    # (The handler here used to name a misspelt exception, so a missing
    # block raised NameError instead of moving on to the next block.)
    auth_node = None
    for block in (2, 3, 4):
        auth_node = block_at(1, block)
        if auth_node is not None:
            break

    font_sizes = tree.xpath('//TOKEN/@font-size')
    mean_font_size = mean(font_sizes)

    color_counts = {}
    for fc in tree.xpath('//TOKEN/@font-color'):
        color_counts[fc] = color_counts.get(fc, 0) + 1
    if color_counts:
        # Highest count wins; ties go to the greater colour string, exactly
        # as a descending sort of (count, colour) pairs would order them.
        main_font_color = max((n, c) for c, n in color_counts.items())[1]
    else:
        # No token carries a colour: compare against the empty string
        # instead of failing on an empty list.
        main_font_color = ''

    heading_query = (
        ".//TOKEN[@font-size > {0} or @bold = 'yes' or @font-color != '{1}']"
        .format(mean_font_size * 1.05, main_font_color))

    head_txts = []
    stop = False
    for page_node in tree.xpath('//PAGE'):
        for block_node in page_node.xpath('.//BLOCK'):
            if xmltag:
                if block_node == title_node:
                    st, et = "<title>", "</title>"
                elif block_node == auth_node:
                    st, et = "<author>", "</author>"
                else:
                    st, et = "<heading>", "</heading>"
                if highlight:
                    st = "\033[0;32m{0}\033[0m".format(st)
                    et = "\033[0;32m{0}\033[0m".format(et)
            else:
                st = et = ""

            if block_node == title_node and authonly:
                continue

            headers = block_node.xpath(heading_query)
            head_txt = ' '.join([
                etree.tostring(el, method='text', encoding="UTF-8")
                for el in headers
            ])
            if head_txt:
                head_txts.append("{0}{1}{2}".format(st, head_txt, et))

            if block_node == title_node and titleonly:
                stop = True
                break
            elif block_node == auth_node and authonly:
                stop = True
                break
        if stop:
            break

    for txt in head_txts:
        sys.stdout.writelines([txt, '\n'])
def pdf2heads(opts, args):
    """Print the heading-like text of a PDF, one BLOCK per line.

    The document is parsed with ``pdf2etree(args)``.  A token counts as
    heading text when its font size exceeds 1.05 times the mean token font
    size, when it is bold, or when its colour differs from the most common
    token colour.  For every BLOCK that contains such tokens their joined
    text is written to stdout.

    Recognised options in ``opts`` (getopt-style ``(flag, value)`` pairs):
        --noxml      do not wrap the text in <title>/<author>/<heading> tags
        --highlight  wrap the tags in ANSI green escape sequences
        --title      stop after the title block
        --author     skip the title block and stop after the author block

    Returns None.
    """
    xmltag = True
    highlight = False
    titleonly = False
    authonly = False
    for o, a in opts:
        if o == '--noxml':
            xmltag = False
        elif o == '--highlight':
            highlight = True
        elif o == '--title':
            titleonly = True
        elif o == '--author':
            authonly = True

    tree = pdf2etree(args)

    def block_at(page, block):
        """Return the first node matching that PAGE/BLOCK position, or None."""
        found = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(page, block))
        return found[0] if found else None

    # Title: first block of page 1, falling back to page 2.
    title_node = None
    for page in (1, 2):
        title_node = block_at(page, 1)
        if title_node is not None:
            break

    # Author: block 2 of page 1, falling back to blocks 3 and 4.
    # (The handler here used to name a misspelt exception, so a missing
    # block raised NameError instead of moving on to the next block.)
    auth_node = None
    for block in (2, 3, 4):
        auth_node = block_at(1, block)
        if auth_node is not None:
            break

    font_sizes = tree.xpath('//TOKEN/@font-size')
    mean_font_size = mean(font_sizes)

    color_counts = {}
    for fc in tree.xpath('//TOKEN/@font-color'):
        color_counts[fc] = color_counts.get(fc, 0) + 1
    if color_counts:
        # Highest count wins; ties go to the greater colour string, exactly
        # as a descending sort of (count, colour) pairs would order them.
        main_font_color = max((n, c) for c, n in color_counts.items())[1]
    else:
        # No token carries a colour: compare against the empty string
        # instead of failing on an empty list.
        main_font_color = ''

    heading_query = (
        ".//TOKEN[@font-size > {0} or @bold = 'yes' or @font-color != '{1}']"
        .format(mean_font_size * 1.05, main_font_color))

    head_txts = []
    stop = False
    for page_node in tree.xpath('//PAGE'):
        for block_node in page_node.xpath('.//BLOCK'):
            if xmltag:
                if block_node == title_node:
                    st, et = "<title>", "</title>"
                elif block_node == auth_node:
                    st, et = "<author>", "</author>"
                else:
                    st, et = "<heading>", "</heading>"
                if highlight:
                    st = "\033[0;32m{0}\033[0m".format(st)
                    et = "\033[0;32m{0}\033[0m".format(et)
            else:
                st = et = ""

            if block_node == title_node and authonly:
                continue

            headers = block_node.xpath(heading_query)
            head_txt = ' '.join([
                etree.tostring(el, method='text', encoding="UTF-8")
                for el in headers
            ])
            if head_txt:
                head_txts.append("{0}{1}{2}".format(st, head_txt, et))

            if block_node == title_node and titleonly:
                stop = True
                break
            elif block_node == auth_node and authonly:
                stop = True
                break
        if stop:
            break

    for txt in head_txts:
        sys.stdout.writelines([txt, '\n'])
def pdf2heads(opts, args):
    """Print the heading-like text of a PDF, one BLOCK per line.

    The document is parsed with ``pdf2etree(args)``.  A token counts as
    heading text when its font size exceeds 1.05 times the mean token font
    size, when it is bold, or when its colour differs from the most common
    token colour.  For every BLOCK that contains such tokens their joined
    text is written to stdout.

    Recognised options in ``opts`` (getopt-style ``(flag, value)`` pairs):
        --noxml      do not wrap the text in <title>/<author>/<heading> tags
        --highlight  wrap the tags in ANSI green escape sequences
        --title      stop after the title block
        --author     skip the title block and stop after the author block

    Returns None.
    """
    xmltag = True
    highlight = False
    titleonly = False
    authonly = False
    for o, a in opts:
        if o == '--noxml':
            xmltag = False
        elif o == '--highlight':
            highlight = True
        elif o == '--title':
            titleonly = True
        elif o == '--author':
            authonly = True

    tree = pdf2etree(args)

    def block_at(page, block):
        """Return the first node matching that PAGE/BLOCK position, or None."""
        found = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(page, block))
        return found[0] if found else None

    # Title: first block of page 1, falling back to page 2.
    title_node = None
    for page in (1, 2):
        title_node = block_at(page, 1)
        if title_node is not None:
            break

    # Author: block 2 of page 1, falling back to blocks 3 and 4.
    # (The handler here used to name a misspelt exception, so a missing
    # block raised NameError instead of moving on to the next block.)
    auth_node = None
    for block in (2, 3, 4):
        auth_node = block_at(1, block)
        if auth_node is not None:
            break

    font_sizes = tree.xpath('//TOKEN/@font-size')
    mean_font_size = mean(font_sizes)

    color_counts = {}
    for fc in tree.xpath('//TOKEN/@font-color'):
        color_counts[fc] = color_counts.get(fc, 0) + 1
    if color_counts:
        # Highest count wins; ties go to the greater colour string, exactly
        # as a descending sort of (count, colour) pairs would order them.
        main_font_color = max((n, c) for c, n in color_counts.items())[1]
    else:
        # No token carries a colour: compare against the empty string
        # instead of failing on an empty list.
        main_font_color = ''

    heading_query = (
        ".//TOKEN[@font-size > {0} or @bold = 'yes' or @font-color != '{1}']"
        .format(mean_font_size * 1.05, main_font_color))

    head_txts = []
    stop = False
    for page_node in tree.xpath('//PAGE'):
        for block_node in page_node.xpath('.//BLOCK'):
            if xmltag:
                if block_node == title_node:
                    st, et = "<title>", "</title>"
                elif block_node == auth_node:
                    st, et = "<author>", "</author>"
                else:
                    st, et = "<heading>", "</heading>"
                if highlight:
                    st = "\033[0;32m{0}\033[0m".format(st)
                    et = "\033[0;32m{0}\033[0m".format(et)
            else:
                st = et = ""

            if block_node == title_node and authonly:
                continue

            headers = block_node.xpath(heading_query)
            head_txt = ' '.join([
                etree.tostring(el, method='text', encoding="UTF-8")
                for el in headers
            ])
            if head_txt:
                head_txts.append("{0}{1}{2}".format(st, head_txt, et))

            if block_node == title_node and titleonly:
                stop = True
                break
            elif block_node == auth_node and authonly:
                stop = True
                break
        if stop:
            break

    for txt in head_txts:
        sys.stdout.writelines([txt, '\n'])


def main(argv=None):
    """Command-line entry point: parse options and run ``pdf2heads``.

    ``argv`` defaults to ``sys.argv[1:]``.  Returns 0 after printing the
    module docstring for ``-h``/``--help``, 2 on a usage error, 1 on a
    ConfigError, and None after a normal run.
    """
    if argv is None:
        argv = sys.argv[1:]
    try:
        try:
            opts, args = getopt.getopt(
                argv, "ht",
                ["help", "test", "noxml", "highlight", "title", "author"])
        except getopt.error as msg:
            raise UsageError(msg)
        for o, a in opts:
            if o in ('-h', '--help'):
                # print help and exit
                sys.stdout.write(__doc__)
                sys.stdout.flush()
                return 0
        pdf2heads(opts, args)
    except UsageError as err:
        sys.stderr.write(str(err.msg) + '\n')
        sys.stderr.write("for help use --help\n")
        return 2
    except ConfigError as err:
        # "except X, err" only parses on Python 2; "as" matches the
        # handler above and works on 2.6+ and 3.
        sys.stderr.writelines([str(err.msg), '\n'])
        sys.stderr.flush()
        return 1
def pdf2heads(opts, args):
    """Print title, abstract sections and heading-like text of a PDF.

    The document is parsed with ``pdf2etree(args)``.  The function then

    1. looks for a title in block 1 of page 1 (then page 2), using tokens
       with a font size above 20, and prints it as ``<Title>…</Title>``;
    2. looks for an optional subtitle on page 2 (blocks 2-4, font size
       above 19) and prints it as ``<Subtitle>…</Subtitle>``;
    3. looks for the author block on page 2 (font size above 15);
    4. walks every BLOCK of every PAGE, collects heading-like text, and
       calls ``output_blocks_on_page`` the first time a heading containing
       Abstract / Sammanfattning / Abstrakt / Referat is seen;
    5. writes the collected headings to stdout, one per line.

    Recognised options in ``opts`` (getopt-style ``(flag, value)`` pairs):
    --noxml, --highlight, --title, --author, --verbose, --caps.

    Module globals written: Verbose_flag, Found_abstract,
    Found_Sammanfattning.  Module global read: automatic_rerunning.
    Returns None.
    """
    global Verbose_flag
    global automatic_rerunning
    global Found_abstract
    global Found_Sammanfattning

    xmltag = True
    highlight = False
    titleonly = False
    authonly = False
    Verbose_flag = False
    look_for_all_caps_headings = False
    start_to_exclude = False

    for o, a in opts:
        if o == '--noxml':
            xmltag = False
        elif o == '--highlight':
            highlight = True
        elif o == '--title':
            titleonly = True
        elif o == '--author':
            authonly = True
        elif o == '--verbose':
            Verbose_flag = True
            print("Verbose_flag is on")
        elif o == '--caps':
            print("looking for ABSTRACT and other headers in all caps")
            look_for_all_caps_headings = True

    if automatic_rerunning:
        print("looking for ABSTRACT and other headers in all caps")
        look_for_all_caps_headings = True

    tree = pdf2etree(args)

    def block_at(page, block):
        """Return the first node matching that PAGE/BLOCK position, or None."""
        found = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(page, block))
        return found[0] if found else None

    def text_of(tokens):
        """Join the text content of the given token nodes with spaces."""
        return ' '.join([
            etree.tostring(el, method='text', encoding="UTF-8")
            for el in tokens
        ])

    # --- title: first block of page 1, else first block of page 2 ---------
    title_node = None
    for page in (1, 2):
        trial_title_node = block_at(page, 1)
        if trial_title_node is None:
            continue
        if Verbose_flag:
            print("trial_title_node:")
            print(trial_title_node)
        # The comparison is strict: only tokens larger than 20 count.
        title_headers = trial_title_node.xpath(
            ".//TOKEN[@font-size > {0}]".format(20))
        if Verbose_flag:
            print("title_headers:")
            print(title_headers)
        title_head_txt = text_of(title_headers)
        if title_head_txt:
            print("<Title>" + title_head_txt + "</Title>")
            title_node = trial_title_node
        # Only the first block that exists is ever examined.
        break

    # --- subtitle (optional): page 2, first existing block of 2..4 --------
    next_block = 2
    subtitle_node = None
    for block in (2, 3, 4):
        trial_subtitle_node = block_at(2, block)
        if trial_subtitle_node is None:
            continue
        if Verbose_flag:
            print("trial_subtitle_node:")
            print(trial_subtitle_node)
        subtitle_headers = trial_subtitle_node.xpath(
            ".//TOKEN[@font-size > {0}]".format(19))
        if Verbose_flag:
            print("subtitle_headers:")
            print(subtitle_headers)
        if subtitle_headers:
            subtitle_head_txt = text_of(subtitle_headers)
            if subtitle_head_txt:
                subtitle_node = trial_subtitle_node
                # Used to print the *title* text between the subtitle tags.
                print("<Subtitle>" + subtitle_head_txt + "</Subtitle>")
                next_block = block + 1
        break

    # --- author: page 2, starting after the subtitle when there is one ----
    auth_node = None
    for block in range(next_block, 5):
        trial_auth_node = block_at(2, block)
        if trial_auth_node is None:
            continue
        if Verbose_flag:
            print("trial_auth_node:")
            print(trial_auth_node)
        auth_headers = trial_auth_node.xpath(
            ".//TOKEN[@font-size > {0}]".format(15))
        if Verbose_flag:
            print("auth_headers:")
            print(auth_headers)
        auth_head_txt = text_of(auth_headers)
        # Used to test the title text here, which accepted any block as the
        # author block whenever a title existed.
        if auth_head_txt:
            auth_node = trial_auth_node
        break

    font_sizes = tree.xpath('//TOKEN/@font-size')
    mean_font_size = mean(font_sizes)

    color_counts = {}
    for fc in tree.xpath('//TOKEN/@font-color'):
        color_counts[fc] = color_counts.get(fc, 0) + 1
    if color_counts:
        # Highest count wins; ties go to the greater colour string.
        main_font_color = max((n, c) for c, n in color_counts.items())[1]
    else:
        main_font_color = ''

    # With --caps the size threshold is the plain mean, so headings set only
    # marginally larger than body text are still picked up.
    if look_for_all_caps_headings:
        size_threshold = mean_font_size
    else:
        size_threshold = mean_font_size * 1.05
    heading_query = (
        ".//TOKEN[@font-size > {0} or @bold = 'yes' or @font-color != '{1}']"
        .format(size_threshold, main_font_color))

    head_txts = []
    stop = False
    Found_abstract = False
    Found_Sammanfattning = False
    for page, page_node in enumerate(tree.xpath('//PAGE'), 1):
        for block_number, block_node in enumerate(
                page_node.xpath('.//BLOCK'), 1):
            if xmltag:
                # One chain: previously the title branch was a separate
                # "if", so its tags were overwritten by "<heading>".
                if block_node == title_node:
                    st, et = "<title>", "</title>"
                elif block_node == subtitle_node:
                    st, et = "<subtitle>", "</subtitle>"
                elif block_node == auth_node:
                    st, et = "<author>", "</author>"
                else:
                    st, et = "<heading>", "</heading>"
                if highlight:
                    st = "\033[0;32m{0}\033[0m".format(st)
                    et = "\033[0;32m{0}\033[0m".format(et)
            else:
                st = et = ""

            if block_node == title_node and authonly:
                continue

            head_txt = text_of(block_node.xpath(heading_query))
            if head_txt in text_start_to_exclude:
                # From here on headings are no longer collected.
                start_to_exclude = True
            head_txt = filter_headings(head_txt)
            if head_txt and not start_to_exclude:
                head_txts.append("{0}{1}{2}".format(st, head_txt, et))

            # The first hit for each section dumps the page via
            # output_blocks_on_page and abandons the rest of that page.
            if "Abstract" in head_txt or "ABSTRACT" in head_txt:
                if not Found_abstract:
                    print("Abstract (en):")
                    output_blocks_on_page(page_node, block_number, page)
                    Found_abstract = True
                    break

            if "Sammanfattning" in head_txt or "SAMMANFATTNING" in head_txt:
                if not Found_Sammanfattning:
                    print("Sammanfattning (sv):")
                    output_blocks_on_page(page_node, block_number, page)
                    Found_Sammanfattning = True
                    break

            if "Abstrakt" in head_txt or "ABSTRAKT" in head_txt:
                if not Found_Sammanfattning:
                    print("Abstrakt (sv):")
                    output_blocks_on_page(page_node, block_number, page)
                    Found_Sammanfattning = True
                    break

            if "Referat" in head_txt or "REFERAT" in head_txt:
                if not Found_Sammanfattning:
                    print("Referat (sv):")
                    output_blocks_on_page(page_node, block_number, page)
                    Found_Sammanfattning = True
                    break

            if block_node == title_node and titleonly:
                stop = True
                break
            elif block_node == auth_node and authonly:
                stop = True
                break
        if stop:
            break

    for txt in head_txts:
        sys.stdout.writelines([txt, '\n'])
def pdf2heads(opts, args, document):
    """Extract title, subtitle, authors and named sections from a PDF.

    The document is parsed with ``pdf2etree(args)``.  Title, subtitle and
    author names are searched for on page 1 and recorded with
    ``json_append``.  Every BLOCK of every PAGE is then scanned for
    heading-like text; when a heading names a known section (abstract,
    table of contents, introduction, methods, ...) the page is handed to
    ``output_blocks_on_page`` together with the matching output path.

    Parameters:
        opts      getopt-style ``(flag, value)`` pairs; recognised flags are
                  --noxml, --highlight, --title, --author, --unittest,
                  --verbose and --caps.
        args      passed unchanged to ``pdf2etree``.
        document  document type; ``int(document) == 1`` additionally
                  enables the proposal checks (authors, level,
                  organization/supervisor, keywords).

    Results are delivered through the helper calls and through the module
    globals declared below; the function itself returns None.
    """
    global Verbose_flag
    global test_flag
    global look_for_all_caps_headings
    global automatic_rerunning
    global Found_Heading
    global Found_abstract
    global Found_org
    global Found_key
    global Found_Author
    global Found_Level
    global Found_Sammanfattning
    global Found_Method
    global Found_Introduction
    global Found_TOC
    global abstractOut_path
    global OrgandSup_path
    global referat_path
    global methodOut_path
    global introductionOut_path
    global toc_path
    global heading_path
    global title_path
    global author_path
    global subtitle_path
    global end_tag
    global tree
    global mean_font_size
    global main_font_color
    global document_type
    global author
    global title_head_txt

    xmltag = True
    highlight = False
    titleonly = False
    authonly = False
    Verbose_flag = False
    test_flag = False
    look_for_all_caps_headings = False
    author = ""
    document_type = document
    start_to_exclude = False

    for o, a in opts:
        if o == '--noxml':
            xmltag = False
        elif o == '--highlight':
            highlight = True
        elif o == '--title':
            titleonly = True
        elif o == '--author':
            authonly = True
        elif o == '--unittest':
            test_flag = True
        elif o == '--verbose':
            Verbose_flag = True
            print("Verbose_flag is on")
        elif o == '--caps':
            print("looking for ABSTRACT and other headers in all caps")
            look_for_all_caps_headings = True

    if automatic_rerunning:
        print("looking for ABSTRACT and other headers in all caps")
        look_for_all_caps_headings = True

    tree = pdf2etree(args)

    # Every output file lives in this directory.
    out_dir = '../../../../output/parse_result/' + directiory

    def first_page_block(block):
        """Return the first node matching that BLOCK position on page 1, or None."""
        found = tree.xpath("//PAGE[{0}]//BLOCK[{1}]".format(1, block))
        return found[0] if found else None

    def text_of(tokens):
        """Join the text content of the given token nodes with spaces."""
        return ' '.join([
            etree.tostring(el, method='text', encoding="UTF-8")
            for el in tokens
        ])

    # --- title: first block on page 1 with large bold text ----------------
    title_node = None
    # Fallback for the later searches; previously this name was unbound
    # (NameError) whenever no title was found.
    next_block = 1
    block = 1
    while True:
        trial_title_node = first_page_block(block)
        if trial_title_node is None:
            break
        if Verbose_flag:
            print("trial_title_node:")
            print(trial_title_node)
        # The first clause is subsumed by the second (both require bold),
        # so effectively: bold and larger than 15.
        title_headers = trial_title_node.xpath(
            ".//TOKEN[(@font-size > {0} and @bold = 'yes') "
            "or (@font-size > {1} and @bold = 'yes')]".format(20, 15))
        if Verbose_flag:
            print("title_headers:")
            print(title_headers)
        title_head_txt = text_of(title_headers)
        if title_head_txt:
            print("Title: found")
            title_path = out_dir + '/title.txt'
            json_append('title', title_head_txt)
            title_node = trial_title_node
            next_block = block + 1
            break
        block += 1

    # --- subtitle: continue on page 1 after the title ---------------------
    block = next_block
    print_log("next block is: " + str(block))
    subtitle_node = None
    while True:
        trial_subtitle_node = first_page_block(block)
        if trial_subtitle_node is None:
            break
        if Verbose_flag:
            print("trial_subtitle_node:")
            print(trial_subtitle_node)
        subtitle_headers = trial_subtitle_node.xpath(
            ".//TOKEN[(@font-size < {0} and @bold = 'no' and @italic= 'no') "
            "or (@font-size > {1} and @bold = 'no' and @italic= 'yes')]"
            .format(20, 13))
        if Verbose_flag:
            print("subtitle_headers:")
            print(subtitle_headers)
        subtitle_path = out_dir + '/subtitle.txt'
        title_path = out_dir + '/title.txt'
        subtitle_head_txt = text_of(subtitle_headers)
        # A bare number (e.g. a page number) is not a subtitle.
        if subtitle_head_txt and not subtitle_head_txt.isdigit():
            if title_head_txt == "Project proposal":
                subtitle_path = title_path
                print("Subtitle: not found")
                print("Title: found since title is project proporsal, replace subtitle as title")
            json_append('subtitle', subtitle_head_txt)
            subtitle_node = trial_subtitle_node
            next_block = block + 1
            print("Subtitle: found")
            break
        block += 1

    # --- authors: remaining blocks of page 1 ------------------------------
    Found_Author = False
    Found_Level = False
    author_path = out_dir + '/author_detail.txt'
    auth_node = None
    auth_count = 0
    block = next_block
    while True:
        trial_auth_node = first_page_block(block)
        if trial_auth_node is None:
            break
        if Verbose_flag:
            print("trial_auth_node:")
            print(trial_auth_node)
        # Author names: smaller than 20 but larger than 11.
        auth_headers = trial_auth_node.xpath(
            ".//TOKEN[@font-size < {0} and @font-size > {1}]".format(20, 11))
        if Verbose_flag:
            print("auth_headers:")
            print(auth_headers)
        print_log(document_type)
        auth_head_txt = text_of(auth_headers)
        auth_list = auth_head_txt.split(";")
        # At most two authors are recorded, taken from a ';'-separated list.
        while (len(auth_head_txt) > 0 and auth_count < 2
               and len(auth_list) > auth_count):
            # Index by the number of authors already recorded.  This used
            # to be auth_count - 1, i.e. -1 (the LAST entry) on the first
            # pass, which recorded the authors in the wrong order.
            candidate = auth_list[auth_count]
            name_split = candidate.split()
            if not name_split:
                # Blank entry (e.g. a trailing ';'): nothing to record.
                break
            print("Author: found")
            auth_head_txt = candidate
            auth_count += 1
            author = author + "_" + auth_head_txt
            author_path = out_dir + '/author_' + str(auth_count) + '.txt'
            json_append('author_' + str(auth_count), auth_head_txt)
            json_append('author_' + str(auth_count) + '_frontname',
                        name_split[0])
            # A one-word name has no second part.  Indexing it anyway
            # raised IndexError, which the enclosing handler mistook for
            # "no more blocks" and silently ended the author search.
            if len(name_split) > 1:
                json_append('author_' + str(auth_count) + '_aftername',
                            name_split[1])
            auth_node = trial_auth_node
        block += 1

    font_sizes = tree.xpath('//TOKEN/@font-size')
    mean_font_size = mean(font_sizes)

    color_counts = {}
    for fc in tree.xpath('//TOKEN/@font-color'):
        color_counts[fc] = color_counts.get(fc, 0) + 1
    if color_counts:
        # Highest count wins; ties go to the greater colour string.
        main_font_color = max((n, c) for c, n in color_counts.items())[1]
    else:
        main_font_color = ''

    head_txts = []
    stop = False
    Found_abstract = False
    Found_org = False
    Found_key = False
    Found_Sammanfattning = False
    Found_Method = False
    Found_Introduction = False
    Found_TOC = False
    OrgandSup_path = out_dir + '/Orignization_supervisor(en).txt'
    key_path = out_dir + '/Keyword(en).txt'
    abstractOut_path = out_dir + '/abstract(en).txt'
    abstractsvOut_path = out_dir + '/abstract(sv).txt'
    referat_path = out_dir + '/referat(sv).txt'
    methodOut_path = out_dir + '/method(en).txt'
    toc_path = out_dir + '/toc(en).txt'
    introductionOut_path = out_dir + '/introduction(en).txt'
    heading_path = out_dir + '/heading.txt'
    title_path = out_dir + '/title.txt'

    # With --caps the size threshold is the plain mean, so headings set only
    # marginally larger than body text are still picked up.
    if look_for_all_caps_headings:
        size_threshold = mean_font_size
    else:
        size_threshold = mean_font_size * 1.05
    heading_query = (
        ".//TOKEN[(@font-size > {0} and @bold = 'yes') or @font-color != '{1}']"
        .format(size_threshold, main_font_color))
    # Matches every token with a positive font size.
    level_query = ".//TOKEN[@font-size > {0}]".format(0)

    for page, page_node in enumerate(tree.xpath('//PAGE'), 1):
        for block_number, block_node in enumerate(
                page_node.xpath('.//BLOCK'), 1):
            if xmltag:
                # One chain: previously the title branch was a separate
                # "if", so the title was always relabelled "heading".
                if block_node == title_node:
                    st = et = "title"
                elif block_node == subtitle_node:
                    st = et = "subtitle"
                elif block_node == auth_node:
                    st = et = "author"
                else:
                    st = et = "heading"
                if highlight:
                    st = "\033[0;32m{0}\033[0m".format(st)
                    et = "\033[0;32m{0}\033[0m".format(et)
            else:
                st = et = ""

            if block_node == title_node and authonly:
                continue

            head_txt = text_of(block_node.xpath(heading_query))
            level_head_txt = text_of(block_node.xpath(level_query))

            if head_txt in text_start_to_exclude:
                # From here on headings are no longer collected.
                start_to_exclude = True
            head_txt = filter_headings(head_txt)
            if head_txt and not start_to_exclude:
                head_txts.append("{0}{1}{2}".format(st, head_txt, et))

            # ---- checks that only apply to proposals ----
            if int(document_type) == 1:
                print_log("first content check: " + head_txt)

                # "Author" also matches "Authors".
                if "Author" in head_txt:
                    if not Found_Author:
                        print("Authors(en): OVERIDE ")
                        print("Authors and detail information (en): found ")
                        author = ""
                        output_text_on_block_on_page(
                            page_node, block_number, page, author_path)
                        # NOTE(review): `auth` is not assigned anywhere in
                        # this function; presumably a module global filled
                        # in by the call above - confirm.
                        author = auth
                        Found_Author = True

                if ("Bachelor" in level_head_txt
                        or "Master" in level_head_txt
                        or "Degree Project" in level_head_txt):
                    if not Found_Level:
                        print_log("Level: found")
                        json_append('level', level_head_txt)
                        Found_Level = True

                if ("Organization and Supervisor" in head_txt
                        or ("Organization" in head_txt
                            and "Supervisor" in head_txt)):
                    if not Found_org:
                        print("Organization and Supervisor (en): found")
                        output_blocks_on_page(page_node, block_number, page,
                                              OrgandSup_path, 0)
                        Found_org = True

                # "Keyword" also matches "Keywords".
                if "Keyword" in head_txt:
                    print_log("I should be herer!!!!!")
                    if not Found_key:
                        print("Keywords(en): found")
                        output_blocks_on_page(page_node, block_number, page,
                                              key_path, 0)
                        Found_key = True

            # ---- checks that apply to every document type ----
            # The first hit for each section dumps the page via
            # output_blocks_on_page and abandons the rest of that page.
            if "Abstract" in head_txt or "ABSTRACT" in head_txt:
                if not Found_abstract:
                    print("Abstract (en): found")
                    output_blocks_on_page(page_node, block_number, page,
                                          abstractOut_path, 0)
                    Found_abstract = True
                    break

            if "Sammanfattning" in head_txt or "SAMMANFATTNING" in head_txt:
                if not Found_Sammanfattning:
                    print("Sammanfattning (sv): found")
                    output_blocks_on_page(page_node, block_number, page,
                                          abstractsvOut_path, 0)
                    Found_Sammanfattning = True
                    break

            if "Abstrakt" in head_txt or "ABSTRAKT" in head_txt:
                if not Found_Sammanfattning:
                    print("Abstrakt (sv): found")
                    # Swedish abstract: goes to the (sv) file.  It used to
                    # be written to the English abstract path.
                    output_blocks_on_page(page_node, block_number, page,
                                          abstractsvOut_path, 0)
                    Found_Sammanfattning = True
                    break

            if "Referat" in head_txt or "REFERAT" in head_txt:
                if not Found_Sammanfattning:
                    print("Referat (sv): found")
                    output_blocks_on_page(page_node, block_number, page,
                                          referat_path, 0)
                    Found_Sammanfattning = True
                    break

            # "Contents" also matches "Table of Contents".
            if "Contents" in head_txt:
                if not Found_TOC:
                    print("TOC (en): found")
                    output_blocks_on_page(page_node, block_number, page,
                                          toc_path, 0)
                    Found_TOC = True
                    break

            if "Introduction" in head_txt or "INTRODUCTION" in head_txt:
                if not Found_Introduction:
                    print("Introduction (en): found")
                    output_blocks_on_page(page_node, block_number, page,
                                          introductionOut_path, 1)
                    Found_Introduction = True
                    break

            if ("Methods" in head_txt or "METHODS" in head_txt
                    or "Methodology" in head_txt
                    or "METHODOLOGY" in head_txt):
                if not Found_Method:
                    print("Methods (en): found")
                    output_blocks_on_page(page_node, block_number, page,
                                          methodOut_path, 0)
                    Found_Method = True
                    break

            if block_node == title_node and titleonly:
                stop = True
                break
            elif block_node == auth_node and authonly:
                stop = True
                break
        if stop:
            break