Пример #1
0
    def start(self):
        """Convert the crawled records of the origin meta file to JSON lines.

        Calls ``self._load_journal_meta()`` first, then reads
        ``self.origin_meta_file`` line by line. Every line that parses as
        JSON is matched against ``self.journal_meta`` by its ``from_url``,
        reshaped into a flat record and printed to stdout as one JSON
        document per line. Lines that are not valid JSON are skipped.

        The ``page`` field is stripped of the words "Pages"/"Page" and split
        on "-": two or more parts are read as "<start>-<end>", a single part
        as a one-page record.

        Raises:
            Exception: if a record's ``from_url`` is not a key of
                ``self.journal_meta``.
            KeyError: if a record lacks ``from_url``, ``access_url`` or
                ``pdf_url``.
        """
        self._load_journal_meta()
        with open(self.origin_meta_file) as f:
            for line in f:
                try:
                    json_data = json.loads(line.strip())
                except ValueError:
                    # Best effort: malformed (or blank) lines are skipped.
                    # json.loads reports decode errors as ValueError.
                    continue

                from_url = json_data["from_url"]
                if from_url not in self.journal_meta:
                    raise Exception("conference_url %s not excepted" % from_url)

                journal_meta = self.journal_meta[from_url]

                new_data = {}
                new_data["id"] = journal_meta["id"]
                new_data["conference"] = journal_meta["conference"]
                # Optional fields are copied as-is (None when absent).
                for field in ("issn", "title", "abstract", "author",
                              "keywords", "release_year"):
                    new_data[field] = json_data.get(field)

                # "page" may be absent or an explicit JSON null; both are
                # treated as an empty string instead of crashing on None.
                page = (json_data.get("page") or "")
                page = page.replace("Pages", "").replace("Page", "").strip()
                page_infos = page.split("-")
                start_page = Utils.str_to_num(page_infos[0].strip())
                if len(page_infos) != 1:
                    end_page = Utils.str_to_num(page_infos[1].strip())
                    total_page = end_page - start_page + 1
                else:
                    # No range separator: the record covers a single page.
                    end_page = start_page
                    total_page = 1

                new_data["start_page"] = start_page
                new_data["end_page"] = end_page
                new_data["total_page"] = total_page
                new_data["pdf_path"] = os.path.join(
                    journal_meta["id"], Utils.get_pdf_filename(json_data))
                new_data["doi"] = json_data.get("doi")
                new_data["conference_url"] = from_url
                new_data["access_url"] = json_data["access_url"]
                new_data["pdf_url"] = json_data["pdf_url"]
                # Single-argument parenthesized form prints the same text on
                # Python 2 (print statement) and Python 3 (print function).
                print(json.dumps(new_data))
Пример #2
0
    def _mark_success_record(self, json_data):
        """
        Normalize one crawled record and append it to the pass-meta output.

        1. Normalizes keys (strip, lower-case, remap through ``self.key_map``);
           some required fields may also be added along the way.
        2. If a pdf save dir (``self.args.pdf_dir``) is given, checks it for
           the record's pdf/txt file and stores the file name in ``pdf_path``.
        3. Drops useless keys (e.g. portia crawls add fields such as
           ``_template``).

        The result is serialized with ``json.dumps`` and written, followed by
        a newline, to ``self.pass_meta_writer``. ``json_data`` itself may be
        modified in place (``release_year`` and ``keywords``).

        @param json_data: dict holding the raw crawled record
        @raise Exception: when a pdf dir is configured, neither the pdf nor
            the txt file exists there, and ``_get_value`` returns an empty
            ``pdf_url`` for the record
        """
        publish_data = self._get_value(json_data, "release_date")
        if "release_year" not in json_data or json_data["release_year"] == "":
            #if publish data is also empty, there is no way to get publish_year
            if "release_date" in json_data:
                # The last whitespace-separated token of the date is used as
                # the year.
                json_data["release_year"] = publish_data.split()[-1]

        if "keywords" in json_data:
            if type(json_data["keywords"]) is list \
            and len(json_data["keywords"]) == 1:
                # For some journals crawled with portia, all keywords end up
                # in a single element and should be split apart.
                keywords = json_data["keywords"][0].replace("Keywords",
                                                            "").strip()
                # Try ";" first; fall back to "," when ";" did not split.
                json_data["keywords"] = keywords.split(";")
                if len(json_data["keywords"]) == 1:
                    json_data["keywords"] = keywords.split(",")
            elif self.for_oa and type(json_data["keywords"]) is not list:
                keywords = json_data["keywords"].replace(
                    "Index terms:", "").replace("Keywords", "").split(";")
                json_data["keywords"] = keywords

                # For OA, keywords that are not a list must be converted to
                # a list.

        convert_data = {}
        for key, value in json_data.iteritems():
            format_key = key.strip(":").strip().lower()

            if format_key in self.key_map:
                format_key = self.key_map[format_key]

            # For these keys, a list is not converted to a string.
            if format_key in self.reserved_non_converted_keys:
                convert = False
            else:
                convert = not self.for_oa

            # NOTE(review): the lookup uses the normalized/remapped key, not
            # the original ``key`` -- presumably _get_value resolves it
            # against the raw record; confirm.
            value = self._get_value(
                json_data, format_key,
                convert=convert)  # _get_value is used here; it applies some normalization to the value
            convert_data[format_key] = value

        # Normalize authors and author affiliations.
        # NOTE(review): is_scielo is hard-coded to False, so the first branch
        # below never runs as written.
        is_scielo = False
        if is_scielo:
            # 2018.04.18: this crashed; scielo authors are handled separately.
            if convert_data["author"][0].find("<") != -1:
                author_raw_text = " ".join(convert_data["author"])
                authors = self._format_scielo_authors(author_raw_text)
                convert_data['author'] = authors

            convert_data.pop("author_affiliation", None)
        elif 'author' in convert_data and len(
                convert_data["author"]
        ) == 1:  # such an author may be a blob of HTML containing multiple authors, with each author's sup marked inside <sup>
            # This happens when the affiliations are hard to crawl, so the
            # whole HTML document was crawled into the author field.
            authors, author_sups = self._format_authors(convert_data)
            if 'author_sup' not in convert_data:
                convert_data['author_sup'] = author_sups
            else:
                convert_data['author_sup'] = [
                    self._format_author_sup(sup)
                    for sup in convert_data['author_sup']
                ]
            if len(authors) == 1:
                # If author still has a single element at this point, it may
                # be comma-separated.
                authors = authors[0].split(",")

            convert_data['author'] = authors

        # NOTE(review): when the branch above ran, author_sup passes through
        # _format_author_sup a second time here -- confirm that is intended.
        if 'author_sup' in convert_data:
            convert_data['author_sup'] = [
                self._format_author_sup(sup)
                for sup in convert_data['author_sup']
            ]

        if "author_affiliation" in convert_data and len(
                convert_data['author_affiliation']) == 1:
            # Such an author_affiliation may be a blob of HTML.
            author_affiliation = convert_data['author_affiliation'][0]
            try:
                authors = convert_data['author']
                if author_affiliation.startswith(authors[0]):
                    # These affiliations are delimited by author, e.g.
                    # https://koedoe.co.za/index.php/koedoe/article/view/188
                    author_affiliation = self._format_author_affiliations_by_author(
                        author_affiliation, authors)
                else:
                    author_affiliation = self._format_author_affiliations(
                        convert_data)
                convert_data['author_affiliation'] = author_affiliation
            except Exception as e:
                # Authors were not crawled but affiliations were; ignore this
                # case for now. Any failure above resets all three
                # author-related fields to empty lists.
                authors = []
                convert_data['author_affiliation'] = []
                convert_data['author'] = []
                convert_data['author_sup'] = []

        # Some author_sup lists contain empty strings, e.g. scielo.
        if "author_sup" in convert_data and type(
                convert_data["author_sup"]) is list:
            convert_data["author_sup"] = [
                i for i in convert_data["author_sup"] if i != ''
            ]

        if self.args.pdf_dir is not None:
            filename = Utils.get_pdf_filename(json_data)
            pdf_path = os.path.join(self.args.pdf_dir, filename + ".pdf")
            txt_path = os.path.join(self.args.pdf_dir, filename + ".txt")
            # pdf_path stores the bare file name (without the directory);
            # the .pdf file is preferred over the .txt file.
            if os.path.exists(pdf_path):
                convert_data["pdf_path"] = filename + ".pdf"
                self.pdf_exist = self.pdf_exist + 1
            elif os.path.exists(txt_path):
                convert_data["pdf_path"] = filename + ".txt"
                self.pdf_exist = self.pdf_exist + 1
            else:
                #print "pdf path(%s) or txt path(%s) not exist" % (pdf_path, txt_path)
                # Neither file exists: flag the record with "wrong" and log
                # its pdf_url to the missing-pdf writer.
                convert_data["pdf_path"] = "wrong"
                self.pdf_non_exist = self.pdf_non_exist + 1
                pdf_link = self._get_value(json_data, "pdf_url")
                if pdf_link == "":
                    raise Exception("cannot get pdf_url from json %s" %
                                    json_data)
                self.miss_pdf_writer.write(pdf_link)
                self.miss_pdf_writer.write("\n")

        # Normalize author, author_affiliation (currently disabled: the call
        # below is commented out, so this branch does nothing).
        if not self.for_oa:
            pass
            #convert_data = Utils.format_authors_from_json(convert_data)

        # Drop some keys.
        if not self.for_oa:
            convert_data.pop("author_sup", None)
        else:
            # OA records need special handling: strip the doi.org URL prefix
            # from the doi.
            convert_data["doi"] = self._get_value(convert_data, "doi").replace(
                "https://doi.org/", "").replace("http://doi.org/", "")
        convert_data.pop('_template', None)

        convert_data_str = json.dumps(convert_data)
        self.pass_meta_writer.write(convert_data_str)
        self.pass_meta_writer.write("\n")