def extract_metadata():
    """Extract text and metadata from every PDF under <opendata>/temp/.

    Reads config.json from the working directory, then for each PDF writes:
      * <home>/output/paper-pdf.txt  - first-page text dump (debug aid)
      * <home>/output/paper-pdf.csv  - one row per paper (title/id/author/...)
      * <home>/output/paper-pdf.json - full metadata list

    NOTE(review): relies on project helpers lib_pdf, UnicodeWriter,
    UtilString and lib_data, which are defined elsewhere — behavior of
    those calls is assumed, not verified here.
    """
    # Load global configuration (expects keys "opendata" and "home").
    with open("config.json") as f:
        global_config = json.load(f)
    print global_config
    # Collect every PDF in the temp folder of the open-data area.
    filepath = os.path.join(global_config["opendata"],"temp/*.pdf")
    filenames = glob.glob(filepath)
    print len(filenames)
    list_field = []  # NOTE(review): never used afterwards in this function
    filename_output_text = os.path.join(global_config["home"],"output/paper-pdf.txt")
    filename_output_csv = os.path.join(global_config["home"],"output/paper-pdf.csv")
    list_paper = []
    # Text dump is written UTF-8 via codecs; CSV goes through UnicodeWriter.
    with codecs.open(filename_output_text, "wb","utf-8") as f:
        with open(filename_output_csv, "wb") as fcsv:
            writer = UnicodeWriter(fcsv)
            for filename in filenames:
                #if '87970177' not in filename:
                #    continue
                with open(filename,'r') as fpdf:
                    # Section header per paper in the debug text file.
                    f.write(u"=================================\n\r")
                    f.write(filename)
                    f.write(u'\n\r')
                    f.write(u'\n\r')
                    # First pass: only the first page, for the text dump.
                    ret = lib_pdf.pdf2text(fpdf, maxpages=1)
                    for p in ["title","number_of_pages", "text"]:
                        f.write("\n")
                        f.write("\n")
                        f.write(p)
                        f.write("\n")
                        print
                        if p == "number_of_pages":
                            # Page count is an int; stringify before writing.
                            content = str(ret[p])
                        else:
                            content = ret[p]
                        # Ignore undecodable bytes so one bad PDF can't abort the run.
                        f.write(content.decode("utf-8",errors="ignore"))
                    # Second pass: structured ISWC metadata from the same handle.
                    ret = lib_pdf.pdf2metadata_iswc(fpdf)
                    # Paper id is the numeric file name (strip ".pdf" suffix).
                    ret["paper_id"]= int(filename.split("/")[-1][:-4])
                    # Fail fast if the extractor found no author block.
                    assert ret["author"]
                    list_paper.append(ret)
                    print json.dumps(ret,indent=4)
                    row = UtilString.json2list(ret, ["title","paper_id","author", "keyword","abstract"])
                    writer.writerow(row)
                #break
    # Finally, serialize the full metadata list as JSON.
    filename_output_json = os.path.join(global_config["home"],"output/paper-pdf.json")
    content = lib_data.json2text(list_paper)
    with codecs.open(filename_output_json, "w","utf-8") as f:
        f.write(content)
def main(): filename = "config.json" filename = os.path.join(os.path.dirname(__file__), filename) with open(filename) as f: global_config = json.load(f) print global_config list_input = [ {"filename": "8796CorrespondingAuthors.csv", #TODO #"link_publisher":"tba", "proceedings_uri": "http://data.semanticweb.org/conference/iswc/2014/proceedings-1", }, {"filename": "8797CorrespondingAuthors.csv", #"link_publisher":"tba", "proceedings_uri": "http://data.semanticweb.org/conference/iswc/2014/proceedings-2", }, ] list_field=[ "author", "title", "pages", "year", "link_open_access", "link_publisher", "proceedings_uri", "paper_uri", "source_uri", "keywords", "abstract", "uri_me", "category", "source", "start_page", "paper_id", "EOL", ] map_key = { "Title":"title", "Authors":"author", "Start Page":"start_page", "Folder Index":"paper_id", "Paper no.":"paper_no", } list_key = { "link_publisher", "proceedings_uri", } list_item = [] counter = collections.Counter() for input in list_input: filename = os.path.join( global_config["home"],"data", input["filename"]) print filename with open(filename,'r') as f: csvreader = UnicodeReader(f) headers = csvreader.next() prev_item = None for row in csvreader: entry = dict(zip(headers, row)) print entry item = { "year":2014, "uri_me":"http://data.semanticweb.org/conference/iswc/2014", #"EOL":"EOL", } for k,v in map_key.items(): item[v] = entry[k].strip() for k in list_key: if k in input: item[k] = input[k] temp = entry["Paper no."] if temp.startswith("DC"): counter["DC"] += 1 category = "Doctoral Consortium Paper" else: counter[temp[0]] += 1 map_category = { "R": "Research Track Paper", "D": "Replication, Benchmark, Data and Software Track Paper", "I": "Semantic Web In Use Track Paper", } category = map_category[temp[0]] item["category"]= category list_item.append(item) if prev_item: prev_item["pages"]= "{}-{}".format(prev_item["start_page"], int(item["start_page"]) - 1) prev_item = item prev_item["pages"]= 
"{}-".format(prev_item["start_page"]) #update: paper uri for item in list_item: #paper_name = re.sub("\W+", "-", item[u"title"]).lower() paper_name = slugify.slugify(item[u"title"]) print item[u"title"] print paper_name item["link_open_access"] = "https://github.com/lidingpku/iswc2014/raw/master/paper/{}-{}.pdf".format(item['paper_id'],paper_name) print item["link_open_access"] print counter.most_common() print len(list_item) #create file filename = "paper-excel.csv" filename = os.path.join(global_config["home"],"output", filename) print filename with open(filename, "w") as f: csvwriter = UnicodeWriter(f) csvwriter.writerow(list_field) for item in list_item: row = UtilString.json2list(item, list_field) csvwriter.writerow(row) filename = "paper-excel.json" filename = os.path.join(global_config["home"],"output", filename) print filename with codecs.open(filename, "w","utf-8") as f: f.write(lib_data.json2text(list_item))
def main():
    ###################################################################
    # load config file
    """Join ISWC2013 payments, attendees and paper CSVs per person/paper.

    Produces two reports under <home>/data/work/iswc2013/raw/:
      * stat_paper.csv  - per paper, which authors paid / will attend
      * stat_person.csv - per person, paid/attend flags and their papers

    NOTE(review): depends on project helpers UtilCsv, UnicodeWriter and
    UtilString defined elsewhere; their exact semantics are assumed.
    People are matched across files by lower-cased name only.
    """
    with open("config.json") as f:
        global_config = json.load( f)
    """ load three csv files, aggregate them to form a join """
    # Accumulator: lower-cased person name -> merged person record.
    json_person = {}

    # Pass 1: payment records. Mark each payer as paid=True.
    filename = os.path.join(global_config["home"], "data/work/iswc2013/raw/payments.csv")
    json_payment = UtilCsv.csv2json(filename)
    for entry in json_payment:
        key = entry["name"].lower()
        if key in json_person:
            data = json_person[key]
        else:
            # First time we see this person: start a fresh record.
            data = {"name":entry["name"], "paid":False, "attend":False, "paper":[]}
            json_person[key]=data
        data["email_payment"]= entry["email"]
        data["id_payment"]= entry["id"]
        data["paid"]= True

    # Pass 2: attendee records. Mark each attendee as attend=True.
    filename = os.path.join(global_config["home"], "data/work/iswc2013/raw/attendees.csv")
    json_attendees = UtilCsv.csv2json(filename)
    for entry in json_attendees:
        key = entry["name"].lower()
        if key in json_person:
            data = json_person[key]
        else:
            data = {"name":entry["name"], "paid":False, "attend":False, "paper":[]}
            json_person[key]=data
        data["email_attendees"]= entry["email"]
        data["id_attendees"]= entry["id"]
        data["attend"]= True

    # Pass 3: papers. Cross-link authors with the person records above.
    json_output=[]
    filename = os.path.join(global_config["home"], "data/source/iswc-2013-paper.csv")
    json_paper = UtilCsv.csv2json(filename)
    #split authors
    for entry in json_paper:
        title = entry["title"]
        # Author field is a comma-separated list of names.
        entry["author_list"]= [x.strip() for x in entry["author"].split(',')]
        #print len(json_paper), entry
        data_paper = { "paid":[], "attend":[]}
        for key in ["title","category","author"]:
            data_paper[key] =entry[key]
        json_output.append(data_paper)
        for name in entry["author_list"]:
            key =name.lower()
            # Authors with no payment/attendee record are silently skipped.
            if key in json_person:
                json_person[key]["paper"].append(title)
                if json_person[key]["paid"]:
                    data_paper["paid"].append(name)
                if json_person[key]["attend"]:
                    data_paper["attend"].append(name)

    # Report 1: per-paper paid/attend author lists.
    filename_output = os.path.join(global_config["home"], "data/work/iswc2013/raw/stat_paper.csv")
    with open(filename_output,"w") as f:
        csvwriter = UnicodeWriter(f)
        headers = ["category","author","title","paid","attend"]
        csvwriter.writerow(headers)
        for entry in json_output:
            #print entry
            row = UtilString.json2list(entry, headers)
            csvwriter.writerow(row)

    # Report 2: per-person summary, sorted by display name.
    filename_output = os.path.join(global_config["home"], "data/work/iswc2013/raw/stat_person.csv")
    with open(filename_output,"w") as f:
        csvwriter = UnicodeWriter(f)
        headers = ["name","paid","attend","paper"]
        csvwriter.writerow(headers)
        for entry in sorted(json_person.values(), key=lambda x:x["name"]):
            #print entry
            row = UtilString.json2list(entry, headers)
            csvwriter.writerow(row)
def easychair_paper_author( filename_input_paper, filename_input_author, filename_output_paper, filename_output_author):
    """Scrape EasyChair HTML exports into paper and author CSV files.

    :param filename_input_paper:  HTML page listing accepted papers
                                  (div.paper / span.authors / span.title
                                  / div.abstract structure).
    :param filename_input_author: HTML page with the author table
                                  (table with class "ct_table").
    :param filename_output_paper:  CSV out: authors, title, abstract.
    :param filename_output_author: CSV out: Author, Affiliation, Country,
                                   Email, Homepage.

    NOTE(review): relies on project helpers HtmlUtil, UnicodeWriter and
    UtilString defined elsewhere; their behavior is assumed.
    """
    #process author
    with open(filename_input_author) as f:
        html_doc = f.read()
    soup = BeautifulSoup(html_doc)
    # name -> author row dict; papers page may add "Homepage" later.
    map_author_all ={}
    attr_pattern = { "class" : "ct_table"}
    list_author = HtmlUtil.extract_table(soup, attr_pattern)
    for author in list_author["rows"]:
        #print author
        name = author[u"Author"]
        map_author_all[name] = author
        # Default empty homepage; may be overwritten from paper links below.
        author["Homepage"] = ""
    print "{} authors found".format(len(map_author_all))

    #process paper
    with open(filename_input_paper) as f:
        html_doc = f.read()
    soup = BeautifulSoup(html_doc)
    list_paper =[]
    list_author = []  # reused: now collects author NAMES seen on papers
    list_div_paper = soup.find_all('div', attrs= { "class" : "paper"})
    for div_paper in list_div_paper:
        paper ={}
        # Append up-front; filled in as the spans are parsed.
        list_paper.append(paper)
        for cls in ["authors", "title"]:
            div_cls = div_paper.find('span', attrs= { "class" : cls})
            #print div_cls.text
            if cls =="authors":
                # Normalize the author string: "A, B and C." -> "A, B, C".
                text = div_cls.text.encode('utf8')
                text = text.replace(" and ", ", ")
                text = re.sub("\.\s*$","", text)
                text = text.strip()
                paper[cls] = text
                # Linked author names carry a homepage URL.
                for el in HtmlUtil.extract_links(div_cls):
                    name = el["text"]
                    list_author.append( name)
                    if name not in map_author_all:
                        print "ERROR: name [{}] not in author, with homepage".format(name)
                        map_author_all[name]={"Author": name}
                    # Fix a recurring malformed-scheme artifact in links.
                    el["link"] = el["link"].replace("http://http:/","http://")
                    map_author_all[name]["Homepage"]=el["link"]
                # Plain-text names (may duplicate linked ones above).
                for x in text.split(","):
                    name = x.strip()
                    list_author.append( name )
                    if name not in map_author_all:
                        print "ERROR: name [{}] not in author, without homepage".format(name)
                        map_author_all[name]={"Author": name}
            else:
                paper[cls] = div_cls.text.encode('utf8')
    # Dedupe and order names for the author CSV.
    list_author = sorted(set(list_author))

    # Abstracts appear in document order matching the papers, so pair by index.
    list_div_abstract = soup.find_all('div', attrs= { "class" : "abstract"})
    for index, div_abstract in enumerate(list_div_abstract):
        abstract = div_abstract.text.encode('utf8').replace("Abstract:", "")
        abstract = abstract.strip()
        list_paper[index]["abstract"] =abstract

    print "{} papers write".format(len(list_paper))
    with open(filename_output_paper, "w") as f:
        csvwriter = UnicodeWriter(f)
        headers = ["authors", "title","abstract"]
        csvwriter.writerow(headers)
        for paper in list_paper:
            row = UtilString.json2list(paper, headers)
            csvwriter.writerow(row)

    print "{} authors write".format(len(list_author))
    with open(filename_output_author, "w") as f:
        csvwriter = UnicodeWriter(f)
        headers = ["Author", "Affiliation","Country","Email","Homepage"]
        csvwriter.writerow(headers)
        for name in list_author:
            author = map_author_all[name]
            row = UtilString.json2list(author, headers)
            csvwriter.writerow(row)