Example #1
    def get_prid(self, sid):
        root_url = self.base_url + '/' + self.general_url
        self.form_prid_headers = {
            'User-Agent': random.choice(browser_useragents_utils.USER_AGENTS)
        }
        # All years
        self.form_prid_data = {
            "action": "search",
            "product": self.product,
            "search_mode": "GeneralSearch",
            "SID": sid,
            # "sa_params": self.product + "||" + sid + "|http://apps.webofknowledge.com|'",
            "value(input1)": self.inputvalue,
            "value(select1)": "OG",
            "startYear": config.startYear,
            "endYear": config.endYear
        }
        s = requests.Session()
        response = s.post(root_url,
                          data=self.form_prid_data,
                          headers=self.form_prid_headers)
        prId = file_utils.url_parse(response.url, "prID")
        # Get the total page count
        total_num = int(
            BeautifulSoup(response.text,
                          "lxml").find(id="trueFinalResultCount").text)
        response.close()
        return prId, total_num
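Both examples rely on file_utils.url_parse to pull a single query parameter out of a response URL. That helper is not shown here; the following is a minimal sketch of what it presumably does (the name and signature come from the calls above, the body is an assumption):

from urllib.parse import urlparse, parse_qs

def url_parse(url, key):
    # Hypothetical sketch: return one query-string parameter (e.g. "SID",
    # "prID") from a URL, or None if it is absent. Body assumed, not taken
    # from the source.
    values = parse_qs(urlparse(url).query).get(key)
    return values[0] if values else None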
Example #2
    def get_sid(self):
        root = 'http://www.webofknowledge.com/'
        response = requests.get(root,
                                headers={
                                    'User-Agent':
                                    random.choice(
                                        browser_useragents_utils.USER_AGENTS)
                                })
        sid = file_utils.url_parse(response.url, "SID")
        response.close()
        return sid
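    # Usage sketch (hypothetical driver code; the owning class and its
    # constructor are not shown in these examples):
    #   sid = crawler.get_sid()
    #   pr_id, total_num = crawler.get_prid(sid)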
    def craw_detail(self, url, headers, cookies, it):
        time.sleep(random.uniform(1, 3000) / 1000)
        searchid = file_utils.url_parse(url, "SEARCHID")
        #print(url)
        session = requests.session()
        response_list = session.get(url, headers=headers, cookies=cookies)
        cookies = requests.utils.dict_from_cookiejar(session.cookies)
        headers = response_list.headers
        #print(response_list.text)
        response_list.close()
        if response_list.status_code == 200 and 'System error happened' not in response_list.text:
            title = ""
            accession_number = ""
            source_title = ""
            language = ""
            document_type = ""
            abstract = ""
            number_of_references = ""
            main_heading = ""
            controlled_terms = ""
            uncontrolled_terms = ""
            classification_code = ""
            doi = ""
            data_base = ""
            conference_name = ""
            conference_date = ""
            conference_location = ""
            conference_code = ""
            mumerical_data_indexing = ""
            affiliation_no = ""
            author_affiliation = ""
            affiliation_organization = ""
            country = ""
            authors = ""
            e_mail = ""
            funding_number = ""
            funding_acronym = ""
            funding_sponsor = ""
            abbreviated_source_title = ""
            issn = ""
            e_issn = ""
            coden = ""
            isbn_13 = ""
            article_number = ""
            issue = ""
            volume = ""
            part_number = ""
            issue_title = ""
            issue_date = ""
            publication_year = ""
            page_begin = ""
            page_end = ""
            publisher = ""
            referance_no = ""
            referance_title = ""
            referance_authors = ""
            referance_source = ""

            list_json = response_list.json()
            results = list_json["results"]
            docindex = results[0].get("doc").get("hitindex")
            docid = results[0].get("doc").get("docid")

            # abstracthref = results[0]["abstracthref"].replace("\n","").replace(" ","")
            time.sleep(random.uniform(1, 3000) / 1000)
            abstracthref = "https://www.engineeringvillage.com/search/doc/abstract.url?content=true&&pageType=quickSearch&usageZone=resultslist&usageOrigin=searchresults&searchtype=Quick&SEARCHID=" + searchid + "&DOCINDEX=" + str(
                docindex
            ) + "&ignore_docid=" + docid + "&database=1&format=quickSearchAbstractFormat&tagscope=&displayPagination=yes"
            #session = requests.session()
            # response = session.get(self.basd_url+abstracthref,headers=headers,cookies=cookies)
            headers["Content-Type"] = "application/json"
            # headers["Connection"] = "keep-alive"
            # headers["Referer"] = "https://www.engineeringvillage.com/search/doc/abstract.url?content=true&&pageType=quickSearch&usageZone=resultslist&usageOrigin=searchresults&searchtype=Quick&SEARCHID="+searchid+"&DOCINDEX="+str(docindex)+"&ignore_docid="+docid+"&database=1&format=quickSearchAbstractFormat&tagscope=&displayPagination=yes"

            # abstract_response = session.get(abstracthref, headers=headers, cookies=cookies)
            # print(abstract_response.text)
            # abstract_json = abstract_response.json()
            # title = BeautifulSoup(abstract_json.get("abstractDetail_highlight_terms_map").get("title"),"lxml").text

            # ------------------------------------------------------detailed----------------------------------------------------------
            time.sleep(random.uniform(1, 3000) / 1000)
            detailedhref = "https://www.engineeringvillage.com/search/doc/detailed.url?content=true&SEARCHID=" + searchid + "&DOCINDEX=" + str(
                docindex
            ) + "&database=1&pageType=expertSearch&searchtype=Expert&dedupResultCount=null&format=expertSearchDetailedFormat&usageOrigin=recordpage&usageZone=abstracttab"
            session = requests.session()
            detailed_response = session.get(detailedhref,
                                            headers=headers,
                                            cookies=cookies)
            #print(detailed_response.text)
            detailed_response.close()
            if detailed_response.status_code == 200:
                detailed_json = detailed_response.json()
                #print(detailed_json)
                detailed_result = detailed_json.get("result")
                title = BeautifulSoup(
                    detailed_result.get("title"),
                    "lxml").text.replace("'", "\\'").replace('"', '\\"')
                accession_number = detailed_result.get("accnum")
                author_affiliations = detailed_result.get("affils")
                source_title = detailed_result.get("ril")
                language = detailed_result.get("la")
                document_type = detailed_result.get("doctype")
                abstract = BeautifulSoup(
                    detailed_json.get("abstractDetail_highlight_terms_map").
                    get("abstractRecord"), "lxml").text if detailed_json.get(
                        "abstractDetail_highlight_terms_map").get(
                            "abstractRecord") is not None else ''
                number_of_references = detailed_result.get(
                    "abstractrecord").get("refcount")
                main_heading = ''
                if detailed_result.get("abstractrecord") is not None:
                    if detailed_result.get("abstractrecord").get(
                            "termmap") is not None:
                        if detailed_result.get("abstractrecord").get(
                                "termmap").get("MH") is not None:
                            main_heading = detailed_result.get(
                                "abstractrecord").get("termmap").get(
                                    "MH")[0].get("value")
                controlled_terms = BeautifulSoup(
                    detailed_json.get("abstractDetail_highlight_terms_map").
                    get("CVS"), "lxml").text if detailed_json.get(
                        "abstractDetail_highlight_terms_map").get(
                            "CVS") is not None else ''
                uncontrolled_terms = BeautifulSoup(
                    detailed_json.get("abstractDetail_highlight_terms_map").
                    get("FLS"), "lxml").text if detailed_json.get(
                        "abstractDetail_highlight_terms_map").get(
                            "FLS") is not None else ''
                # Detailed parsing of the classification codes
                classification_code_tmp = detailed_result.get(
                    "abstractrecord").get("classificationcodes").get(
                        "Classification code")
                if classification_code_tmp is not None and len(
                        classification_code_tmp) > 0:
                    for cc in classification_code_tmp:
                        classification_code = classification_code + cc.get(
                            "id") + cc.get("title") + " - "
                    classification_code = classification_code.rstrip(' - ')
                doi = detailed_result.get("doi")
                data_base = detailed_result.get("doc").get("dbname")
                conference_name = BeautifulSoup(
                    detailed_result.get("cf"), "lxml"
                ).text if detailed_result.get("cf") is not None else ''
                conference_date = detailed_result.get(
                    "md") if detailed_result.get("md") is not None else ''
                conference_location = detailed_result.get(
                    "ml") if detailed_result.get("ml") is not None else ''
                conference_code = BeautifulSoup(
                    detailed_result.get("cc"),
                    "lxml").text.replace("\n", "").replace(
                        "\t",
                        "") if detailed_result.get("cc") is not None else ""
                mumerical_data_indexing = detailed_result.get(
                    "ndi") if detailed_result.get("ndi") is not None else ''

                sqls = []
                # ei_thesis_thesis
                tt_cauthors = detailed_result.get("cauthors")
                corresponding_author = ""
                corresponding_author_email = ""
                if tt_cauthors is not None and len(tt_cauthors) > 0:
                    for cauthor in tt_cauthors:
                        corresponding_author = corresponding_author + cauthor.get(
                            "name") + ";"
                        # Append the e-mail only when it is present and non-empty;
                        # the original chained comparison (!= '' is not None) did
                        # not guard against a missing value and could raise on None.
                        if cauthor.get("email"):
                            corresponding_author_email = (
                                corresponding_author_email +
                                cauthor.get("email") + ";")
                id = str(
                    uuid.uuid3(uuid.NAMESPACE_DNS, title + accession_number))
                sql = "REPLACE INTO ei_thesis_thesis(id,title,accession_number,source_title,language,document_type,abstract,number_of_references,main_heading,controlled_terms,uncontrolled_terms,classification_code,doi,data_base,conference_name,conference_date,conference_location,conference_code,mumerical_data_indexing,corresponding_author,corresponding_author_email) " \
                      "VALUES ('" + id + "','" + title + "','" + accession_number + "','" + source_title.replace("'", "\\'").replace('"', '\\"') + "','" + language + "','" + document_type + "','" + abstract.replace("'", "\\'").replace('"', '\\"') + "','" + str(number_of_references) + "','" + main_heading + "','" + controlled_terms + "','" + uncontrolled_terms.replace("'", "\\'").replace('"', '\\"') + "','" + classification_code + "','" + doi + "','" + data_base + "','" + conference_name.replace("'", "\\'").replace('"', '\\"') + "','" + conference_date + "','" + conference_location.replace("'", "\\'").replace('"', '\\"') + "','" + conference_code + "','" + mumerical_data_indexing + "','" + corresponding_author.replace("'", "\\'").replace('"', '\\"') + "','" + corresponding_author_email + "')"
                sqls.append(sql)
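                # NOTE: these REPLACE INTO statements are assembled by string
                # concatenation with hand-escaped quotes. A parameterized
                # query (hypothetical sketch, assuming a DB-API style cursor;
                # the mysqlclient wrapper used here is not shown) avoids the
                # escaping entirely:
                #   cursor.execute(
                #       "REPLACE INTO ei_thesis_thesis(id, title, ...) "
                #       "VALUES (%s, %s, ...)",
                #       (id, title, ...))
                # The same applies to the other REPLACE INTO statements below.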

                # ei_thesis_affiliation
                if author_affiliations is not None and len(
                        author_affiliations) > 0:
                    for af in author_affiliations:
                        author_affiliation = BeautifulSoup(
                            af.get("name"),
                            "lxml").text if af.get("name") is not None else ''
                        aocs = author_affiliation.split(",")
                        affiliation_organization = ''
                        country = ''
                        if len(aocs) in (4, 5):
                            affiliation_organization = aocs[-3]
                            country = aocs[-1]
                        elif len(aocs) == 3:
                            affiliation_organization = aocs[-2]
                            country = aocs[-1]
                        id = str(
                            uuid.uuid3(
                                uuid.NAMESPACE_DNS,
                                title + accession_number + str(af.get("id"))))
                        sql = 'REPLACE INTO ei_thesis_affiliation(id,title,accession_number,affiliation_no,author_affiliation,affiliation_organization,country)  ' \
                              'VALUES ("' + id + '","' + title + '","' + accession_number + '","' + str(af.get("id")) + '","' + author_affiliation + '","' + affiliation_organization + '","' + country + '")'
                        sqls.append(sql)

                    # ei_thesis_author
                    authors = detailed_result.get("authors")
                    cauthors = detailed_result.get("cauthors")
                    if authors is not None and len(authors) > 0:
                        for au in authors:
                            affiliation_no = au.get("id")
                            author = au.get("name")
                            e_mail = au.get("email")
                            corresponding_author = '0'
                            if cauthors is not None and len(cauthors) > 0:
                                for cauthor in cauthors:
                                    if author == cauthor.get("name"):
                                        corresponding_author = "1"
                            id = str(
                                uuid.uuid3(uuid.NAMESPACE_DNS,
                                           title + accession_number + author))
                            sql = "REPLACE INTO ei_thesis_author(id,title,accession_number,author,affiliation_no,e_mail) " \
                                  "VALUES ('"+id+"','"+title+"','"+accession_number+"','"+author.replace("'", "\\'").replace('"', '\\"')+"','"+str(affiliation_no)+"','"+e_mail+"')"
                            sqls.append(sql)

                    # ei_thesis_funding
                    funding_details = detailed_result.get(
                        "abstractrecord").get("fundingDetails")
                    if funding_details is not None and len(
                            funding_details) > 0:
                        for fd in funding_details:
                            id = str(
                                uuid.uuid3(
                                    uuid.NAMESPACE_DNS,
                                    title + accession_number +
                                    str(fd.get("fundingId"))))
                            sql = "REPLACE INTO ei_thesis_funding(id,title,accession_number,funding_number,funding_acronym,funding_sponsor) " \
                                  "VALUES ('" + id + "','" + title + "','" + accession_number + "','" + str(fd.get("fundingId")) + "','" + fd.get("fundingAcronym") + "','" + fd.get("fundingAgency").replace("'", "\\'").replace('"', '\\"') + "')"
                            sqls.append(sql)

                    # ei_thesis_publication
                    abbreviated_source_title = detailed_result.get(
                        "sourceabbrev")
                    issn = detailed_result.get("citedby").get(
                        "issn") if detailed_result.get("citedby").get(
                            "issn") is not None else ''
                    e_issn = detailed_result.get("abstractrecord").get(
                        "eissn") if detailed_result.get("abstractrecord").get(
                            "eissn") is not None else ''
                    if e_issn:
                        e_issn = e_issn[0:4] + "-" + e_issn[4:]
                    coden = detailed_result.get("abstractrecord").get(
                        "coden") if detailed_result.get("abstractrecord").get(
                            "coden") is not None else ''
                    isbn_13 = detailed_result.get(
                        "isbn13") if detailed_result.get(
                            "isbn13") is not None else ''
                    article_number = detailed_result.get(
                        "articlenumber") if detailed_result.get(
                            "articlenumber") is not None else ''
                    issue = detailed_result.get("citedby").get("firstissue")
                    volume = detailed_result.get("vo")
                    part_number = detailed_result.get(
                        "cfpnum") if detailed_result.get(
                            "cfpnum") is not None else ''
                    issue_title = detailed_result.get("mt").replace(
                        "::H:", ":H::")
                    issue_date = detailed_result.get("sd")
                    publication_year = detailed_result.get("yr")
                    pages = detailed_result.get("pages")
                    page_begin = ""
                    page_end = ""
                    # "pages" is typically "begin-end"; guard against a missing value.
                    if pages is not None:
                        pages_split = pages.split("-")
                        if len(pages_split) == 2:
                            page_begin = pages_split[0]
                            page_end = pages_split[1]
                    publisher = detailed_result.get("pn").replace(
                        "::H:", ":H::")
                    id = str(
                        uuid.uuid3(uuid.NAMESPACE_DNS,
                                   title + accession_number))
                    sql = "REPLACE INTO ei_thesis_publication(id,title,accession_number,source_title,abbreviated_source_title,issn,e_issn,coden,isbn_13,article_number,issue,volume,part_number,issue_title,issue_date,publication_year,page_begin,page_end,publisher) " \
                          "VALUES ('" + id + "','" + title + "','" + accession_number + "','" + source_title.replace("'", "\\'").replace('"', '\\"') + "','" + abbreviated_source_title.replace("'", "\\'").replace('"', '\\"') + "','" + str(issn) + "','" + str(e_issn) + "','" + str(coden) + "','" + str(isbn_13) + "','" + str(article_number) + "','" + str(issue) + "','" + volume + "','" + str(part_number) + "','" + issue_title + "','" + issue_date + "','" + publication_year + "','" + page_begin + "','" + page_end + "','" + publisher.replace("'", "\\'").replace('"', '\\"') + "')"
                    sqls.append(sql)

                # ------------------------------------------------------Compendex Refs------------------------------------------------------
                # refs page 1: if a record has no reference information,
                # detailed_result.get("abstractrecord").get("refcount") is -1;
                # otherwise it holds the actual number of references.
                if number_of_references != -1:
                    time.sleep(random.uniform(1, 3000) / 1000)
                    refshref = "https://www.engineeringvillage.com/search/doc/refs.url?content=true&refType=compendex&searchtype=Expert&usageOrigin=recordpage&usageZone=detailedtab&pageType=expertSearch&SEARCHID=" + searchid + "&DOCINDEX=" + str(
                        docindex
                    ) + "&database=1&docid=" + docid + "&totalResultsCount=67010&displayPagination=yes&dbid=cpx"
                    session = requests.session()
                    refs_response = session.get(refshref,
                                                headers=headers,
                                                cookies=cookies)
                    #print(refs_response.text)
                    refs_response.close()
                    if refs_response.status_code == 200:
                        refs_json = refs_response.json()
                        #print(refs_json)
                        referenceBean = refs_json.get("referenceBean")
                        title_authors = referenceBean.get("results")
                        sources = referenceBean.get(
                            "resultformat_abssourcelines")
                        if title_authors is not None and len(
                                title_authors) > 0:
                            for index in range(0, len(title_authors)):
                                referance_no = index + 1
                                referance_authors = ""
                                t_authors = title_authors[index].get("authors")
                                if t_authors is not None and len(
                                        t_authors) > 0:
                                    for tau in t_authors:
                                        referance_authors = referance_authors + tau.get(
                                            "name") + ";"
                                referance_title = title_authors[index].get(
                                    "title").replace("'", "\\'").replace(
                                        '"', '\\"')
                                referance_authors = referance_authors.replace(
                                    "'", "\\'").replace('"', '\\"')
                                referance_source = BeautifulSoup(
                                    sources[index],
                                    "lxml").text.replace("'", "\\'").replace(
                                        '"', '\\"').replace('Source:  ', '')
                                id = str(
                                    uuid.uuid3(
                                        uuid.NAMESPACE_DNS, title +
                                        accession_number + referance_title))
                                sql = "REPLACE INTO ei_thesis_reference(id,title,accession_number,referance_no,referance_title,referance_authors,referance_source) " \
                                      "VALUES ('" + id + "','" + title + "','" + accession_number + "','" + str(referance_no) + "','" + referance_title + "','" + referance_authors + "','" + referance_source + "')"
                                sqls.append(sql)

                    # refs page 2: only runs when there are more than 25
                    # references; otherwise there is a single page and no next page.
                    if number_of_references > 25:
                        time.sleep(random.uniform(1, 3000) / 1000)
                        refshref = "https://www.engineeringvillage.com/search/doc/refs.url?content=true&compendexajax=t&docid=" + docid + "&SEARCHID=" + searchid + "&database=1&DOCINDEX=&currPageNumber=2&searchtype=Expert&pageSize=25"
                        session = requests.session()
                        refs_response = session.get(refshref,
                                                    headers=headers,
                                                    cookies=cookies)
                        #print(refs_response.text)
                        refs_response.close()
                        refs_json = refs_response.json()
                        #print(refs_json)
                        referenceBean = refs_json.get("referenceBean")
                        title_authors = referenceBean.get("results")
                        sources = referenceBean.get(
                            "resultformat_abssourcelines")
                        if title_authors is not None and len(
                                title_authors) > 0:
                            for index in range(0, len(title_authors)):
                                referance_no = index + 1
                                referance_authors = ""
                                t_authors = title_authors[index].get("authors")
                                if t_authors is not None and len(
                                        t_authors) > 0:
                                    for tau in t_authors:
                                        referance_authors = referance_authors + tau.get(
                                            "name") + ";"
                                referance_title = title_authors[index].get(
                                    "title").replace("'", "\\'").replace(
                                        '"', '\\"')
                                referance_authors = referance_authors.replace(
                                    "'", "\\'").replace('"', '\\"')
                                referance_source = BeautifulSoup(
                                    sources[index],
                                    "lxml").text.replace("'", "\\'").replace(
                                        '"', '\\"').replace('Source:  ', '')
                                id = str(
                                    uuid.uuid3(
                                        uuid.NAMESPACE_DNS, title +
                                        accession_number + referance_title))
                                sql = "REPLACE INTO ei_thesis_reference(id,title,accession_number,referance_no,referance_title,referance_authors,referance_source) " \
                                      "VALUES ('" + id + "','" + title + "','" + accession_number + "','" + str(referance_no) + "','" + referance_title + "','" + referance_authors + "','" + referance_source + "')"
                                sqls.append(sql)
                print(sqls)
                self.mysqlclient.insert_thesis_afoprt(sqls)
            else:
                self.redis_client.lpush(self.consumer_list_success_fail,
                                        json.dumps(it))
        else:
            self.redis_client.lpush(self.consumer_list_success_fail,
                                    json.dumps(it))
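The snippet ends by handing the accumulated SQL strings to self.mysqlclient.insert_thesis_afoprt, which is not shown in these examples. A minimal sketch of what such a batch helper might look like, assuming a plain pymysql connection (only the helper's name is taken from the source; the connection handling and error policy are assumptions):

import pymysql

def insert_thesis_afoprt(sqls, host, user, password, db):
    # Hypothetical batch executor: run every statement in one transaction.
    conn = pymysql.connect(host=host, user=user, password=password,
                           database=db, charset="utf8mb4")
    try:
        with conn.cursor() as cursor:
            for sql in sqls:
                cursor.execute(sql)
        conn.commit()  # commit the whole batch atomically
    except Exception:
        conn.rollback()  # keep the tables consistent if any statement fails
        raise
    finally:
        conn.close()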