def _get_value(self, json_data, key, default="", convert=True):
    """Look up `key` in json_data; `key` should be a canonical (rule) key."""
    value = default
    if key in json_data:
        value = json_data[key]
    elif key in self.reverse_key_map:
        # Fall back to the source-specific alias keys for this canonical key.
        for alias_key in self.reverse_key_map[key]:
            if alias_key in json_data:
                value = json_data[alias_key]
                break
    ret = Utils.format_value(value, convert)

    # Below: normalize the value for specific keys.
    if key == "release_date":
        if not isinstance(ret, list):
            ret = ret.replace("00:00:00", "")
            ret = Utils.format_datetime(ret)
        else:
            if len(ret) == 0:
                return ""
            ret[0] = ret[0].replace("00:00:00", "")
            ret[0] = Utils.format_datetime(ret[0])
    return ret
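# Illustrative sketch (assumptions, not part of the original file): the shape
# of self.reverse_key_map assumed by _get_value is canonical key -> list of
# alias keys as they appear in raw crawler output, e.g.
#
#     reverse_key_map = {
#         "release_date": ["pub_date", "date_published"],
#         "title": ["article_title"],
#     }
#
# With that map, _get_value({"pub_date": "2019-05-01 00:00:00"}, "release_date")
# would resolve via the "pub_date" alias, strip the "00:00:00" time part, and
# normalize to "2019-05-01" through Utils.format_datetime.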
def start(self):
    with open(self.meta_file) as f:
        for line in f:
            try:
                self.total += 1
                line = line.strip()
                json_data = json.loads(line)
            except Exception:
                self.json_fail += 1
                continue

            # Check 1: access_url is the most basic field; if even that was
            # not crawled, there is no way to locate the problem record.
            access_url = self._get_value(json_data, "access_url")
            if access_url == "":
                self.no_access_url += 1
                continue

            # Apply platform-specific transformations to the metadata.
            json_data = self._transform(json_data)

            # Check 2: verify that every required field was collected.
            miss_required_field = False
            for key in self.required_keys:
                value = self._get_value(json_data, key)
                if value == "":
                    # An empty value means the crawler did not collect this field.
                    bad_record = {}
                    bad_record['reason'] = "%s empty" % key
                    bad_record['access_url'] = access_url
                    self._mark_bad_record(bad_record)
                    self.incomplete += 1
                    miss_required_field = True
                    break
            if miss_required_field:
                continue

            # Check 3: make sure no required field is empty after normalization.
            fail = False
            for key, value in json_data.iteritems():
                key = key.strip(":").strip()
                value = Utils.format_value(value)
                if value == "" and key in self.required_keys:
                    if key == "release_year":
                        publish_date = self._get_value(json_data, "release_date")
                        if publish_date != "":
                            # If release_date is also empty, there is no way
                            # to derive release_year.
                            json_data["release_year"] = publish_date.split("-")[0]
                            print "publish year is %s" % json_data["release_year"]
                            continue
                    bad_record = {}
                    bad_record['reason'] = "%s empty" % key
                    bad_record['access_url'] = access_url
                    self._mark_bad_record(bad_record)
                    self.incomplete += 1
                    fail = True
                    break
            if fail:
                continue

            # Check 4: fill in a few required bookkeeping fields.
            json_data['acquisition_time'] = Utils.current_time()
            publish_year = self._get_value(json_data, "release_year")
            if publish_year == "":
                publish_date = self._get_value(json_data, "release_date")
                if publish_date != "":
                    json_data["release_year"] = publish_date.split("-")[0]

            # Handle author, author_sub, author_affiliation and similar fields,
            # then deduplicate on access_url.
            if access_url in self.pass_meta_map:
                title = self._get_value(json_data, "title")
                if title != self.pass_meta_map[access_url]:
                    # Same URL with a different title; tolerated for now.
                    # raise Exception("same url with different title: %s" % access_url)
                    pass
                self.dup += 1
                continue

            self.pass_count += 1
            self._mark_success_record(json_data)
            self.pass_meta_map[access_url] = json_data["title"]

    if self.args.pdf_dir is not None:
        print "total: %d, no_access_url: %d, json_fail: %d, incomplete: %d, " \
              "dup_count: %d, pass_count: %d, pdf_non_exist: %d, pdf_exist_count: %d, " \
              "pass meta saved to: %s, fail meta saved to: %s, missing pdf urls saved to: %s" \
              % (self.total, self.no_access_url, self.json_fail, self.incomplete,
                 self.dup, self.pass_count, self.pdf_non_exist, self.pdf_exist,
                 self.pass_meta_file, self.bad_meta_file, self.miss_pdf_file)
    else:
        print "total: %d, no_access_url: %d, json_fail: %d, incomplete: %d, " \
              "dup_count: %d, pass_count: %d, pass meta saved to: %s, fail meta saved to: %s" \
              % (self.total, self.no_access_url, self.json_fail, self.incomplete,
                 self.dup, self.pass_count, self.pass_meta_file, self.bad_meta_file)
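# Expected input (an assumption based on the keys read above): self.meta_file
# is JSON lines, one crawled record per line, e.g.
#
#     {"access_url": "http://example.org/article/1", "title": "...",
#      "release_date": "2019-05-01 00:00:00", "author": ["..."]}
#
# Records that fail JSON parsing, lack access_url, miss a required field, or
# duplicate an already-passed access_url are counted and skipped.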
import json
import re
import sys

from spiders.utils import Utils

filename = sys.argv[1]
columnname = sys.argv[2]
split = '|'
columns = columnname.split(",")

with open(filename) as fp:
    for line in fp:
        try:
            json_data = json.loads(line)
        except Exception:
            # Skip lines that are not valid JSON.
            continue
        line = ""
        for column in columns:
            try:
                data = Utils.format_value(json_data[column], join_char='|')
            except Exception:
                data = ""
            if column == 'url':
                # Strip the journalCode query string from URLs.
                data = re.sub(r"\?journalCode=.*", "", data)
            if isinstance(data, int):
                line += str(data) + split
            else:
                line += data.replace('\n', '').replace('\t', '').strip() + split
        print line.strip().strip(split)
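# Example invocation (script and file names here are hypothetical):
#
#     python dump_columns.py meta.jsonl title,url,release_year > columns.txt
#
# Each output row joins the requested columns with '|'; unparsable lines are
# skipped, and columns absent from a record are emitted as empty fields.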