def start(self):
    """Convert raw crawled article metadata into normalized records.

    Reads self.origin_meta_file line by line (one JSON object per line),
    joins each record with the journal metadata loaded by
    self._load_journal_meta() (keyed by the record's "from_url"), and
    prints one normalized JSON record per line to stdout.

    Raises:
        Exception: if a record's "from_url" is not present in
            self.journal_meta.
    """
    self._load_journal_meta()
    with open(self.origin_meta_file) as f:
        for line in f:
            # Best effort: silently skip lines that are not valid JSON.
            try:
                json_data = json.loads(line.strip())
            except Exception:
                continue
            from_url = json_data["from_url"]
            if from_url not in self.journal_meta:
                raise Exception("conference_url %s not expected" % from_url)
            journal_meta = self.journal_meta[from_url]
            start_page, end_page, total_page = self._parse_page_range(
                json_data.get("page", ""))
            new_data = {
                "id": journal_meta["id"],
                "conference": journal_meta["conference"],
                "issn": json_data.get("issn"),
                "title": json_data.get("title"),
                "abstract": json_data.get("abstract"),
                "author": json_data.get("author"),
                "keywords": json_data.get("keywords"),
                "release_year": json_data.get("release_year"),
                "start_page": start_page,
                "end_page": end_page,
                "total_page": total_page,
                # PDFs live in a per-journal directory named by journal id.
                "pdf_path": os.path.join(journal_meta["id"],
                                         Utils.get_pdf_filename(json_data)),
                "doi": json_data.get("doi"),
                "conference_url": from_url,
                "access_url": json_data["access_url"],
                "pdf_url": json_data["pdf_url"],
            }
            # print-as-function behaves identically on Python 2 and 3
            # for a single argument.
            print(json.dumps(new_data))

def _parse_page_range(self, page):
    """Parse a raw page string into (start_page, end_page, total_page).

    Accepts values such as "Pages 12-15", "Page 7" or "12-15"; the
    "Page"/"Pages" prefix is stripped before parsing.

    @param page: raw page string from the crawled record (may be "").
    @return: tuple (start_page, end_page, total_page).
    """
    page = page.replace("Pages", "").replace("Page", "").strip()
    parts = page.split("-")
    start_page = Utils.str_to_num(parts[0].strip())
    if len(parts) != 1:
        end_page = Utils.str_to_num(parts[1].strip())
        total_page = end_page - start_page + 1
    else:
        # A single page number: the range collapses to one page.
        end_page = start_page
        total_page = 1
    return start_page, end_page, total_page
def _mark_success_record(self, json_data):
    """Normalize a successfully crawled record and write it out.

    1. Normalizes keys (via self.key_map / self._get_value) and may add
       some required fields.
    2. If a pdf save dir is configured (self.args.pdf_dir), resolves the
       record's pdf/txt filename against it.
    3. Drops some useless keys (e.g. portia crawling adds fields such as
       "_template").

    @param json_data: one crawled record (dict); the normalized result
        is written as a JSON line to self.pass_meta_writer.
    """
    publish_data = self._get_value(json_data, "release_date")
    if "release_year" not in json_data or json_data["release_year"] == "":
        # If publish date is also empty, there is no way to get the
        # publish year.
        if "release_date" in json_data:
            # Assumes release_date ends with the year (e.g. "12 May 2018")
            # -- TODO confirm upstream date format.
            json_data["release_year"] = publish_data.split()[-1]
    if "keywords" in json_data:
        if type(json_data["keywords"]) is list \
                and len(json_data["keywords"]) == 1:
            # Some journals crawled with portia put every keyword into a
            # single list element; split them apart (";" first, then ",").
            keywords = json_data["keywords"][0].replace("Keywords",
                                                        "").strip()
            json_data["keywords"] = keywords.split(";")
            if len(json_data["keywords"]) == 1:
                json_data["keywords"] = keywords.split(",")
        elif self.for_oa and type(json_data["keywords"]) is not list:
            # For OA records, a non-list keywords value must be converted
            # into a list.
            keywords = json_data["keywords"].replace(
                "Index terms:", "").replace("Keywords", "").split(";")
            json_data["keywords"] = keywords
    convert_data = {}
    for key, value in json_data.iteritems():
        format_key = key.strip(":").strip().lower()
        if format_key in self.key_map:
            format_key = self.key_map[format_key]
        # Keys listed in reserved_non_converted_keys never have their
        # list value converted to a string.
        if format_key in self.reserved_non_converted_keys:
            convert = False
        else:
            convert = not self.for_oa
        # _get_value also applies some normalization to the value.
        value = self._get_value(
            json_data, format_key, convert=convert)
        convert_data[format_key] = value
    # Normalize authors and author affiliations.
    # NOTE(review): is_scielo is hard-coded to False, so the scielo
    # branch below is currently dead code.
    is_scielo = False
    if is_scielo:
        # 2018.04.18: scielo authors need separate handling.
        if convert_data["author"][0].find("<") != -1:
            author_raw_text = " ".join(convert_data["author"])
            authors = self._format_scielo_authors(author_raw_text)
            convert_data['author'] = authors
        convert_data.pop("author_affiliation", None)
    elif 'author' in convert_data and len(
            convert_data["author"]
    ) == 1:
        # A single-element author may be one blob of HTML containing all
        # authors, each with its affiliation marked in <sup> tags (this
        # happens when affiliations are hard to crawl and the whole HTML
        # fragment lands in the author field).
        authors, author_sups = self._format_authors(convert_data)
        if 'author_sup' not in convert_data:
            convert_data['author_sup'] = author_sups
        else:
            convert_data['author_sup'] = [
                self._format_author_sup(sup)
                for sup in convert_data['author_sup']
            ]
        if len(authors) == 1:
            # If author still has only one element, it may be a single
            # comma-separated string.
            authors = authors[0].split(",")
        convert_data['author'] = authors
    # NOTE(review): reconstructed from collapsed source -- this runs on
    # every path, so author_sup set in the branch above gets formatted a
    # second time; presumably _format_author_sup is idempotent. Confirm.
    if 'author_sup' in convert_data:
        convert_data['author_sup'] = [
            self._format_author_sup(sup)
            for sup in convert_data['author_sup']
        ]
    if "author_affiliation" in convert_data and len(
            convert_data['author_affiliation']) == 1:
        # A single-element author_affiliation may be one blob of HTML.
        author_affiliation = convert_data['author_affiliation'][0]
        try:
            authors = convert_data['author']
            if author_affiliation.startswith(authors[0]):
                # Affiliations delimited by the author names themselves,
                # e.g. https://koedoe.co.za/index.php/koedoe/article/view/188
                author_affiliation = self._format_author_affiliations_by_author(
                    author_affiliation, authors)
            else:
                author_affiliation = self._format_author_affiliations(
                    convert_data)
            convert_data['author_affiliation'] = author_affiliation
        except Exception as e:
            # Affiliations were crawled but authors were not; ignore this
            # case for now and blank out all author-related fields.
            authors = []
            convert_data['author_affiliation'] = []
            convert_data['author'] = []
            convert_data['author_sup'] = []
    # author_sup may contain empty strings (e.g. from scielo); drop them.
    if "author_sup" in convert_data and type(
            convert_data["author_sup"]) is list:
        convert_data["author_sup"] = [
            i for i in convert_data["author_sup"] if i != ''
        ]
    if self.args.pdf_dir is not None:
        filename = Utils.get_pdf_filename(json_data)
        pdf_path = os.path.join(self.args.pdf_dir, filename + ".pdf")
        txt_path = os.path.join(self.args.pdf_dir, filename + ".txt")
        if os.path.exists(pdf_path):
            convert_data["pdf_path"] = filename + ".pdf"
            self.pdf_exist = self.pdf_exist + 1
        elif os.path.exists(txt_path):
            convert_data["pdf_path"] = filename + ".txt"
            self.pdf_exist = self.pdf_exist + 1
        else:
            #print "pdf path(%s) or txt path(%s) not exist" % (pdf_path, txt_path)
            # Neither pdf nor txt exists: mark the path as bad and record
            # the missing pdf link for a later re-crawl.
            convert_data["pdf_path"] = "wrong"
            self.pdf_non_exist = self.pdf_non_exist + 1
            pdf_link = self._get_value(json_data, "pdf_url")
            if pdf_link == "":
                raise Exception(
                    "cannot get pdf_url from json %s" % json_data)
            self.miss_pdf_writer.write(pdf_link)
            self.miss_pdf_writer.write("\n")
    # Normalize author / author_affiliation (currently disabled).
    if not self.for_oa:
        pass
        #convert_data = Utils.format_authors_from_json(convert_data)
    # Drop some keys.
    if not self.for_oa:
        convert_data.pop("author_sup", None)
    else:
        # OA records need extra handling: strip the doi.org prefix from
        # the DOI.
        convert_data["doi"] = self._get_value(convert_data, "doi").replace(
            "https://doi.org/", "").replace("http://doi.org/", "")
    convert_data.pop('_template', None)
    convert_data_str = json.dumps(convert_data)
    self.pass_meta_writer.write(convert_data_str)
    self.pass_meta_writer.write("\n")