Exemplo n.º 1
0
    def _get_value(self, json_data, key, default="", convert=True):
        """
        Look up the value stored for a rule key in one metadata record.

        ``key`` should be the rule key. When the record does not contain it,
        the aliases listed for that key in ``self.reverse_key_map`` are tried
        in order and the first alias present in the record is used.

        Args:
            json_data: dict holding one parsed metadata record.
            key: rule key to look up.
            default: value used when neither the key nor any alias is present.
            convert: forwarded unchanged to ``Utils.format_value``.

        Returns:
            The result of ``Utils.format_value`` for the value found. For
            ``"release_date"`` the value (or the first element when the result
            is a list) additionally has ``"00:00:00"`` removed and is passed
            through ``Utils.format_datetime``; an empty list yields ``""``.
        """
        value = default
        if key in json_data:
            value = json_data[key]
        elif key in self.reverse_key_map:
            for alias_key in self.reverse_key_map[key]:
                if alias_key in json_data:
                    value = json_data[alias_key]
                    break

        ret = Utils.format_value(value, convert)

        # Per-key normalization of the formatted value goes below.
        if key == "release_date":
            if not isinstance(ret, list):
                ret = ret.replace("00:00:00", "")
                ret = Utils.format_datetime(ret)
            else:
                if len(ret) == 0:
                    # Nothing to normalize: report the date as missing.
                    return ""
                # Only the first element of a list result is normalized.
                ret[0] = ret[0].replace("00:00:00", "")
                ret[0] = Utils.format_datetime(ret[0])
        return ret
Exemplo n.º 2
0
    def start(self):
        """
        Validate every record of ``self.meta_file`` and print a summary.

        The file is read line by line and each line is parsed as one JSON
        document. A record is dropped at the first step it fails:

        1. The line must parse as JSON (otherwise ``self.json_fail`` grows).
        2. It must resolve a non-empty ``access_url`` (otherwise
           ``self.no_access_url`` grows).
        3. After ``self._transform``, every key in ``self.required_keys``
           must resolve to a non-empty value; a failure is handed to
           ``self._mark_bad_record`` and counted in ``self.incomplete``.
        4. No stored key that normalizes to a required key may be empty,
           except an empty ``release_year`` that can be derived from
           ``release_date``.
        5. ``acquisition_time`` is stamped and ``release_year`` is filled in
           from ``release_date`` when it is still empty.
        6. A record whose ``access_url`` was already accepted is counted in
           ``self.dup`` and dropped.

        Surviving records are passed to ``self._mark_success_record`` and
        counted in ``self.pass_count``. The counters are printed at the end.
        """
        with open(self.meta_file) as f:
            for line in f:
                self.total = self.total + 1
                try:
                    json_data = json.loads(line.strip())
                except Exception:
                    # Best effort: an unparsable line is counted and skipped.
                    self.json_fail = self.json_fail + 1
                    continue

                # Check 1: access_url is the most basic field; without it a
                # problem with the record cannot even be located.
                access_url = self._get_value(json_data, "access_url")
                if access_url == "":
                    self.no_access_url = self.no_access_url + 1
                    continue

                # Apply platform-specific adjustments to the metadata.
                json_data = self._transform(json_data)

                # Check 2: every required field must have been collected.
                miss_required_filed = False
                for key in self.required_keys:
                    value = self._get_value(json_data, key)
                    if value == "":
                        # An empty value means the field was not collected.
                        bad_record = {}
                        bad_record['reason'] = "%s empty" % key
                        bad_record['access_url'] = access_url
                        self._mark_bad_record(bad_record)
                        self.incomplete = self.incomplete + 1
                        miss_required_filed = True
                        break

                if miss_required_filed:
                    continue

                # Check 3: no stored key that maps to a required key may be
                # empty. Iterate over a snapshot, because the loop may add
                # "release_year" to json_data and growing a dict while
                # iterating over it raises RuntimeError.
                fail = False
                for key, value in list(json_data.items()):
                    key = key.strip(":").strip()
                    value = Utils.format_value(value)
                    if value == "" and key in self.required_keys:
                        if key == "release_year":
                            publish_data = self._get_value(
                                json_data, "release_date")
                            if publish_data != "":
                                # The year can be derived from the date; if
                                # the date is empty too there is no way to
                                # obtain it and the record is rejected below.
                                json_data["release_year"] = publish_data.split(
                                    "-")[0]
                                print("publish year is %s" % json_data[
                                    "release_year"])
                                continue

                        bad_record = {}
                        bad_record['reason'] = "%s empty" % key
                        bad_record['access_url'] = access_url
                        self._mark_bad_record(bad_record)
                        self.incomplete = self.incomplete + 1
                        fail = True
                        break

                if fail:
                    continue

                # Check 4: fill in some mandatory fields.
                json_data['acquisition_time'] = Utils.current_time()
                publish_year = self._get_value(json_data, "release_year")
                if publish_year == "":
                    publish_data = self._get_value(json_data, "release_date")
                    if publish_data != "":
                        json_data["release_year"] = publish_data.split("-")[0]

                # A URL that was already accepted is a duplicate. A differing
                # title for the same URL is deliberately not treated as an
                # error.
                if access_url in self.pass_meta_map:
                    self.dup = self.dup + 1
                    continue

                self.pass_count = self.pass_count + 1
                self._mark_success_record(json_data)
                # Resolve the title through _get_value so that a record which
                # stores it under an alias key does not raise KeyError.
                self.pass_meta_map[access_url] = self._get_value(
                    json_data, "title")

        if self.args.pdf_dir is not None:
            print("total: %d, no_access_url: %d, json_fail: %d, incomplete: %d, dup_count: %d, pass_count: %d. pdf_non_exist: %d, pdf_exist_count: %d, pass meta save to: %s, fail meta save to :%s, miss pdf url save to :%s"
                  % (self.total, self.no_access_url, self.json_fail, self.incomplete, self.dup, self.pass_count, self.pdf_non_exist, self.pdf_exist, self.pass_meta_file, self.bad_meta_file, self.miss_pdf_file))
        else:
            print("total: %d, no_access_url: %d, json_fail: %d, incomplete: %d, dup_count: %d, pass_count: %d. pass meta save to: %s, fail meta save to :%s"
                  % (self.total, self.no_access_url, self.json_fail, self.incomplete, self.dup, self.pass_count, self.pass_meta_file, self.bad_meta_file))
Exemplo n.º 3
0
from spiders.utils import Utils

# Print selected columns of a JSON-lines file as '|'-separated rows.
#
# Usage: <script> <json-lines file> <column[,column...]>
# Each input line is parsed as JSON; lines that fail to parse are skipped.
# For every requested column the value is run through Utils.format_value,
# stripped of newlines and tabs, and the cells are written on one line.
if len(sys.argv) < 3:
    sys.stderr.write(
        "usage: %s <json-lines file> <column[,column...]>\n" % sys.argv[0])
    sys.exit(1)

filename = sys.argv[1]
columnname = sys.argv[2]
split = '|'

# Both are invariant across input lines, so compute them once.
columns = columnname.split(",")
journal_code_re = re.compile(r"\?journalCode=.*")

with open(filename) as fp:
    for raw_line in fp:
        try:
            json_data = json.loads(raw_line)
        except Exception:
            # Best effort: a line that is not valid JSON is skipped.
            continue

        cells = []
        for column in columns:
            try:
                data = Utils.format_value(json_data[column], join_char='|')
            except Exception:
                # A missing column (or a value that cannot be formatted)
                # becomes an empty cell.
                data = ""

            if isinstance(data, int):
                # Integers have no text to clean up; running the regex on
                # them would raise TypeError.
                cells.append(str(data))
                continue

            if column == 'url':
                # Drop the "?journalCode=..." query suffix from URLs.
                data = journal_code_re.sub("", data)

            cells.append(data.replace('\n', '').replace('\t', '').strip())

        # Every cell is followed by the separator; separators left over at
        # either end of the row are then trimmed.
        row = "".join(cell + split for cell in cells)
        print(row.strip().strip(split))