def _get_value(self, json_data, key, default="", convert=True):
    """Look up `key` in json_data; `key` should be a canonical (rule) key."""
    value = default
    if key in json_data:
        value = json_data[key]
    elif key in self.reverse_key_map:
        # Fall back to the source-specific alias keys for this canonical key.
        for alias_key in self.reverse_key_map[key]:
            if alias_key in json_data:
                value = json_data[alias_key]
                break
    ret = Utils.format_value(value, convert)

    # Below: normalize the value for specific keys.
    if key == "release_date":
        if not isinstance(ret, list):
            ret = ret.replace("00:00:00", "")
            ret = Utils.format_datetime(ret)
        else:
            if len(ret) == 0:
                return ""
            ret[0] = ret[0].replace("00:00:00", "")
            ret[0] = Utils.format_datetime(ret[0])
    return ret
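# Illustrative sketch (assumptions, not part of the original file): the shape
# of self.reverse_key_map assumed by _get_value is canonical key -> list of
# alias keys as they appear in raw crawler output, e.g.
#
#     reverse_key_map = {
#         "release_date": ["pub_date", "date_published"],
#         "title": ["article_title"],
#     }
#
# With that map, _get_value({"pub_date": "2019-05-01 00:00:00"}, "release_date")
# would resolve via the "pub_date" alias, strip the "00:00:00" time part, and
# normalize to "2019-05-01" through Utils.format_datetime.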
def start(self):
    with open(self.meta_file) as f:
        for line in f:
            try:
                self.total += 1
                line = line.strip()
                json_data = json.loads(line)
            except Exception:
                self.json_fail += 1
                continue

            # Check 1: access_url is the most basic field; if even that was
            # not crawled, there is no way to locate the problem record.
            access_url = self._get_value(json_data, "access_url")
            if access_url == "":
                self.no_access_url += 1
                continue

            # Apply platform-specific transformations to the metadata.
            json_data = self._transform(json_data)

            # Check 2: verify that every required field was collected.
            miss_required_field = False
            for key in self.required_keys:
                value = self._get_value(json_data, key)
                if value == "":
                    # An empty value means the crawler did not collect this field.
                    bad_record = {}
                    bad_record['reason'] = "%s empty" % key
                    bad_record['access_url'] = access_url
                    self._mark_bad_record(bad_record)
                    self.incomplete += 1
                    miss_required_field = True
                    break
            if miss_required_field:
                continue

            # Check 3: make sure no required field is empty after normalization.
            fail = False
            for key, value in json_data.iteritems():
                key = key.strip(":").strip()
                value = Utils.format_value(value)
                if value == "" and key in self.required_keys:
                    if key == "release_year":
                        publish_date = self._get_value(json_data, "release_date")
                        if publish_date != "":
                            # If release_date is also empty, there is no way
                            # to derive release_year.
                            json_data["release_year"] = publish_date.split("-")[0]
                            print "publish year is %s" % json_data["release_year"]
                            continue
                    bad_record = {}
                    bad_record['reason'] = "%s empty" % key
                    bad_record['access_url'] = access_url
                    self._mark_bad_record(bad_record)
                    self.incomplete += 1
                    fail = True
                    break
            if fail:
                continue

            # Check 4: fill in a few required bookkeeping fields.
            json_data['acquisition_time'] = Utils.current_time()
            publish_year = self._get_value(json_data, "release_year")
            if publish_year == "":
                publish_date = self._get_value(json_data, "release_date")
                if publish_date != "":
                    json_data["release_year"] = publish_date.split("-")[0]

            # Handle author, author_sub, author_affiliation and similar fields,
            # then deduplicate on access_url.
            if access_url in self.pass_meta_map:
                title = self._get_value(json_data, "title")
                if title != self.pass_meta_map[access_url]:
                    # Same URL with a different title; tolerated for now.
                    # raise Exception("same url with different title: %s" % access_url)
                    pass
                self.dup += 1
                continue

            self.pass_count += 1
            self._mark_success_record(json_data)
            self.pass_meta_map[access_url] = json_data["title"]

    if self.args.pdf_dir is not None:
        print "total: %d, no_access_url: %d, json_fail: %d, incomplete: %d, " \
              "dup_count: %d, pass_count: %d, pdf_non_exist: %d, pdf_exist_count: %d, " \
              "pass meta saved to: %s, fail meta saved to: %s, missing pdf urls saved to: %s" \
              % (self.total, self.no_access_url, self.json_fail, self.incomplete,
                 self.dup, self.pass_count, self.pdf_non_exist, self.pdf_exist,
                 self.pass_meta_file, self.bad_meta_file, self.miss_pdf_file)
    else:
        print "total: %d, no_access_url: %d, json_fail: %d, incomplete: %d, " \
              "dup_count: %d, pass_count: %d, pass meta saved to: %s, fail meta saved to: %s" \
              % (self.total, self.no_access_url, self.json_fail, self.incomplete,
                 self.dup, self.pass_count, self.pass_meta_file, self.bad_meta_file)
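# Expected input (an assumption based on the keys read above): self.meta_file
# is JSON lines, one crawled record per line, e.g.
#
#     {"access_url": "http://example.org/article/1", "title": "...",
#      "release_date": "2019-05-01 00:00:00", "author": ["..."]}
#
# Records that fail JSON parsing, lack access_url, miss a required field, or
# duplicate an already-passed access_url are counted and skipped.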
import json
import re
import sys

from spiders.utils import Utils

filename = sys.argv[1]
columnname = sys.argv[2]
split = '|'
columns = columnname.split(",")

with open(filename) as fp:
    for line in fp:
        try:
            json_data = json.loads(line)
        except Exception:
            # Skip lines that are not valid JSON.
            continue
        line = ""
        for column in columns:
            try:
                data = Utils.format_value(json_data[column], join_char='|')
            except Exception:
                data = ""
            if column == 'url':
                # Strip the journalCode query string from URLs.
                data = re.sub(r"\?journalCode=.*", "", data)
            if isinstance(data, int):
                line += str(data) + split
            else:
                line += data.replace('\n', '').replace('\t', '').strip() + split
        print line.strip().strip(split)
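# Example invocation (script and file names here are hypothetical):
#
#     python dump_columns.py meta.jsonl title,url,release_year > columns.txt
#
# Each output row joins the requested columns with '|'; unparsable lines are
# skipped, and columns absent from a record are emitted as empty fields.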