예제 #1
0
def search(author, years, results, affiliation=None):
    """Query Web of Science for an author and print the result as a table.

    A session is opened with the module-level ``user_id`` / ``password``,
    the response tree is built by ``query``, flattened by ``tree_extractor``
    and rendered by ``_draw_table``.

    Args:
        author: Forwarded unchanged to ``query``.
        years: Forwarded unchanged to ``query``.
        results: Forwarded unchanged to ``query``.
        affiliation: Optional; forwarded unchanged to ``query``.

    Returns:
        The value produced by ``tree_extractor`` for the response tree.

    Raises:
        SystemExit: With status 1 when opening the session raises
            ``suds.WebFault``.
    """
    try:
        client = WosClient(user_id, password)
        client.connect()
    except suds.WebFault as e:
        print_('Username and/or password not valid, or requests limit exceeded')
        print_(e)
        # Equivalent to exit(1) but does not rely on the helper that the
        # ``site`` module only injects in some environments.
        raise SystemExit(1)

    try:
        # Build the tree
        tree = query(client, author, years, results, affiliation)
    finally:
        # Always release the session, even when the query fails, so it does
        # not stay open on the server.
        client.close()

    # Extract information from the tree
    extracted = tree_extractor(tree)
    # Draw the table
    _draw_table(extracted)
    return extracted
예제 #2
0
def main():
    """Enrich every row of ``paper2`` with publication metadata and store it.

    Reads the source titles from ``paper2.db`` and the Scopus data from
    ``pickles/scopusData.pkl``, then for each row tries
    ``searchPublicationInfoWos``, ``searchPublicationInfoDblp`` and
    ``searchPublicationInfoScopus`` in that order, moving to the next source
    only while the global ``sResultFlag`` is still ``SearchResult.NOTFOUND``.
    Each result is inserted into a fresh ``publications<timestamp>.db`` and
    committed immediately. Titles with multiple matches or no match are
    appended to ``logMulti<timestamp>.txt`` / ``logNotFound<timestamp>.txt``.

    The user is prompted for a new Web of Science SID whenever the current
    session has been used for ``requestsPerSession`` rows.
    """
    global sResultFlag
    # NOTE(review): 2400 is presumably the per-session request quota -- confirm.
    requestsPerSession = 2400
    timestring = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

    connIn = sqlite3.connect('paper2.db')
    try:
        dfSourceTitles = pandas.read_sql('select * from paper2', connIn)
    finally:
        # The frame is fully loaded; the input connection is no longer needed.
        connIn.close()
    # NOTE(review): unpickling executes code from the file -- only load a
    # pickle that was produced locally / by a trusted source.
    dfScopus = pandas.read_pickle('pickles/scopusData.pkl')

    connOut = sqlite3.connect('publications' + timestring + '.db')
    try:
        cursorOut = connOut.cursor()
        createTables(cursorOut)

        loopIdx = 0
        publicationCount = len(dfSourceTitles.index)

        while loopIdx < publicationCount:
            sid = input('Please put in SID of Web of Science session: ')
            with WosClient(SID=sid, lite=True, close_on_exit=False) as client:
                for _ in range(requestsPerSession):
                    if loopIdx >= publicationCount:
                        break
                    # loopIdx is a positional counter, so address the row by
                    # position once and reuse it.
                    row = dfSourceTitles.iloc[loopIdx]
                    sResultFlag = SearchResult.NOTFOUND
                    publication = Publication(title=row['title'],
                                              journal=row['journal'],
                                              abstract=row['abstract'])

                    publication = searchPublicationInfoWos(publication, client)
                    if sResultFlag == SearchResult.NOTFOUND:
                        publication = searchPublicationInfoDblp(publication)
                    if sResultFlag == SearchResult.NOTFOUND:
                        publication = searchPublicationInfoScopus(
                            publication, dfScopus, row['index'])

                    if sResultFlag == SearchResult.MULTI:
                        with open('logMulti' + timestring + '.txt',
                                  'a',
                                  encoding='utf-8') as f:
                            f.write('mulitple records found: "' +
                                    publication.title + '"\n')
                        # Assignment (the original compared with ``==``,
                        # which had no effect).
                        sResultFlag = None
                    elif sResultFlag == SearchResult.NOTFOUND:
                        with open('logNotFound' + timestring + '.txt',
                                  'a',
                                  encoding='utf-8') as f:
                            f.write('Not found: "' + publication.title + '"\n')
                        sResultFlag = None

                    cursorOut.execute(
                        "INSERT INTO publication VALUES(?,?,?,?,?,?)",
                        (int(row['id']), publication.title,
                         publication.journal, publication.abstract,
                         str(publication.authors), publication.year))
                    # Commit per row so an interrupted run keeps its progress.
                    connOut.commit()

                    loopIdx += 1
                    print('iteration: ' + str(loopIdx))
    finally:
        connOut.close()
예제 #3
0
 def fetchJson(self, user):
     """Query Web of Science for an author and return the result as parsed JSON.

     Args:
         user: Author name substituted into the ``AU=`` search field.

     Returns:
         The object produced by ``json.loads`` on the output of
         ``self.convertXMLToJson``.
     """
     with WosClient(user='******',
                    password='******') as client:
         # Apply the author name to the query template; the response text is
         # used directly instead of being treated as a path to open.
         # NOTE(review): assumes wos.utils.query returns the XML as a string
         # -- confirm against the installed wos version.
         contents = wos.utils.query(client, 'AU=%s' % user)
     Json = self.convertXMLToJson(contents)
     return json.loads(Json)
예제 #4
0
def queryWos(WOS, start_time):
    """Fetch one Web of Science record per id and collect the results.

    Args:
        WOS: Iterable of record ids passed to ``wos.utils.query_by_id``.
        start_time: ``time.time()`` value used as the origin for the
            elapsed-minutes progress message.

    Returns:
        List of the values returned by ``wos.utils.query_by_id``; ids whose
        lookup raised are skipped.
    """
    retry_wait = 15  # seconds to back off after a failed lookup
    root_list = []
    with WosClient(c.getUserName(), c.getPassword()) as client:
        print("Starting to look for title and publisher info")
        for i, wos_id in enumerate(WOS):
            try:
                root_list.append(wos.utils.query_by_id(client, wos_id))
            except Exception as err:
                # ``Exception`` rather than a bare except so Ctrl-C can still
                # stop the run; the message reports the real wait time.
                print("Some error while adding roots ({}). Waiting {} sec".format(
                    err, retry_wait))
                time.sleep(retry_wait)
            if (i + 1) % 50 == 0:
                print("Queries so far is {}. Time used is {:0.1f}".format(
                    (i + 1), ((time.time() - start_time) / 60.0)))
            # Short pause between consecutive requests.
            time.sleep(0.5)
    return root_list
예제 #5
0
def wos_main(querylist, d_path):
    """Run every query against Web of Science and merge the downloads.

    Args:
        querylist: Iterable of query strings whose terms are joined by
            ``' AND '``; braces around a term are turned into double quotes.
        d_path: Base directory handed to ``new_wos_query`` and ``merge_dfs``;
            the merged CSV is written to ``<d_path>/wos/wos_search.csv``.

    Returns:
        The merged frame returned by ``merge_dfs``.
    """
    # @TODO incorporate: article types, subject areas, countries.
    with WosClient() as client:
        # Iterate the argument itself (the body previously referenced an
        # undefined ``query_list`` name).
        for item in querylist:
            # Prefix each term with the TS= field tag and swap {..} for "..".
            terms = ['TS=' + term.replace('{', '"').replace('}', '"')
                     for term in item.split(' AND ')]
            new_item = " AND ".join(terms)
            dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
            print(dt_string + ': searching WOS: ' + new_item)
            new_wos_query(new_item, client, d_path)
            # Pause for a minute between queries.
            time.sleep(60)
    df = merge_dfs(d_path)
    df.to_csv(os.path.join(d_path, 'wos', 'wos_search.csv'), index=False)
    return df
예제 #6
0
def queryRelatedWos(WOS, start_time):
    """Collect the ``REC`` elements of the related records for each id.

    Args:
        WOS: Iterable of record ids passed to
            ``wos.utils.get_related_records_v2``.
        start_time: ``time.time()`` value used as the origin for the
            elapsed-minutes progress message.

    Returns:
        Flat list of every ``REC`` element found (the lookup asks for
        ``count=2`` per id); ids whose lookup raised are skipped.
    """
    retry_wait = 5  # seconds to back off after a failed lookup
    root_list = []
    with WosClient(c.getUserName(), c.getPassword()) as client:
        print("Starting to look for related records")
        for i, wos_id in enumerate(WOS):
            try:
                records = wos.utils.get_related_records_v2(client, wos_id, count=2)
                root_list.extend(records.findall("REC"))
            except Exception as err:
                # ``Exception`` rather than a bare except so Ctrl-C can still
                # stop the run; the message reports the real wait time.
                print("Some error while adding roots ({}). Waiting {} sec".format(
                    err, retry_wait))
                time.sleep(retry_wait)
            if (i + 1) % 50 == 0:
                print("Queries so far is {}. Time used is {:0.1f}".format(
                    (i + 1), ((time.time() - start_time) / 60.0)))
            # Short pause between consecutive requests.
            time.sleep(0.5)
    return root_list
예제 #7
0
def queryWoS(titles, years):
    """Search Web of Science by title and publication year.

    For each title a ``TI=<title> AND PY=(y-1 OR y OR y+1)`` query is sent
    with ``count=1``; every non-empty response is parsed and converted with
    ``XmlDictConfig``.

    Args:
        titles: Sequence of title strings.
        years: Sequence of numeric years, indexed in step with ``titles``.

    Returns:
        List of ``XmlDictConfig`` objects, one per query that returned a
        non-empty response.
    """
    info = []
    # Connect to Web of Science
    with WosClient(c.getUserName(), c.getPassword()) as client:
        for i, title in enumerate(titles):
            # Drop '|' characters from the title
            title = title.replace("|", "")
            year = years[i]
            # Year clause covering the year itself +/- 1
            query_string_year = ('PY=(' + str(year - 1) + ' OR ' + str(year) +
                                 ' OR ' + str(year + 1) + ')')
            # Titles containing search operators (AND, OR, NOT) are cleaned
            if isOperator(title):
                title = removeOperator(title)
            query_string = 'TI=' + title + ' AND ' + query_string_year
            print(query_string)
            # Perform the query on the WoS engine
            xmlString = ""
            try:
                xmlString = wos.utils.query(client, query_string, count=1)
            except Exception as err:
                # Best effort: report the failure and carry on with the next
                # title, but do not swallow KeyboardInterrupt/SystemExit.
                print("Some error occurred while querying the WoS database: {}".format(err))
            print(xmlString)
            # Truthiness check instead of identity comparison with a literal,
            # which is not a reliable way to detect an empty string.
            if xmlString:
                root = ElementTree.XML(xmlString)
                info.append(XmlDictConfig(root))
            # Be 'nice' to WoS and do not flood the server
            time.sleep(1)
    return info
"""
Garbage and convoluted code attempting to parse the formless REF data using web of science.
"""

import glob
import difflib
import json
from wos import WosClient
import wos.utils
from xmljson import badgerfish as bf

# Module-level accumulators for the run.
refs = {}
errors = 0
total_cit = 0

# One Web of Science session is kept open while all stage-1 files are handled.
with WosClient() as client:
    for infile in glob.glob("stage1/*.json"):
        # Incremented once per stage-1 file, before any validity check.
        total_cit += 1
        with open(infile, "r") as infile_data:
            data = json.load(infile_data)

        # Files explicitly flagged as invalid are skipped.
        if data["valid"] is False:
            continue

        all_Z = set()
        # Cleared as soon as one citation of this file is flagged invalid.
        all_found = True
        for cit in data["citations"]:
            if cit["valid"] is False:
                all_found = False
                break
예제 #9
0
        'pubyear':
        summary.find('pub_info')['pubyear']
    }


# In[ ]:

# total_batch, nth_batch = 8, 0
# Command line: <total number of batches> <zero-based index of this batch>.
total_batch, nth_batch = int(sys.argv[1]), int(sys.argv[2])
print(f'Now batch querying {nth_batch} of total {total_batch} batches')
batch_size = len(all_uids) // total_batch
batch_start = batch_size * nth_batch
# Floor division leaves up to total_batch - 1 trailing uids outside every
# batch; give them to the last batch so nothing is silently skipped.
if nth_batch == total_batch - 1:
    batch_end = len(all_uids)
else:
    batch_end = batch_start + batch_size
batch_uids = all_uids[batch_start:batch_end]

# In[ ]:

# Three independent Web of Science sessions are opened up front.
# NOTE(review): judging by the names, one each for summaries, references and
# inverse references -- confirm against the loop body.
summary_client = WosClient()
refs_client = WosClient()
inv_refs_client = WosClient()
summary_client.connect()
refs_client.connect()
inv_refs_client.connect()

start_time = time.time()
# idx is 1-based so the progress message fires after every 150th article.
for idx, wos_code in enumerate(batch_uids, 1):

    if idx % 150 == 0:
        print(
            f'Has downloaded info of {idx} articles, Time elapsed: {time.time() - start_time: .4f}'
        )
        # The timer restarts here, so the elapsed time printed above covers
        # only the most recent 150 articles.
        start_time = time.time()
예제 #10
0
def queryWos(tcp_data, start, start_sample, end_sample):
    """Look up a slice of ``tcp_data`` in Web of Science by title and year.

    For every index label in ``tcp_data.index[start_sample:end_sample]`` a
    ``TI=<cleaned title> AND PY=(y-1 OR y OR y+1)`` query is sent with
    ``count=1``. A hit is kept only when the normalised titles are more than
    95% similar (``difflib.SequenceMatcher``) and ``getAbstract`` returns a
    value.

    Args:
        tcp_data: Frame-like object with ``Title`` and ``Year`` columns,
            addressed through ``.index`` and ``.loc``.
        start: ``time.time()`` value used as the origin for the progress
            message.
        start_sample: Start position of the slice of index labels.
        end_sample: End position (exclusive) of the slice of index labels.

    Returns:
        Tuple ``(info, not_found)``: ``info`` is a list of ``(root, id)``
        pairs for accepted hits, ``not_found`` a list of ids whose query
        failed or whose title did not match. Ids whose hit matched but had
        no abstract appear in neither list.
    """
    info = []
    not_found = []
    wait = 2  # seconds to back off after a failed query
    sample_ids = tcp_data.index.values.tolist()[start_sample:end_sample]
    # Connect to Web of Science
    with WosClient(c.getUserName(), c.getPassword()) as client:
        print("Starting the queries")
        for i, record_id in enumerate(sample_ids):
            record = tcp_data.loc[record_id]
            # Clean the title for use in a query: '|' becomes ',', other
            # punctuation is dropped or replaced by a space, and
            # parenthesised text is removed.
            title = (record.Title.replace("|", ",").replace("?", "")
                     .replace('"', '').replace("/", " ")
                     .replace("-", " ").replace(":", " "))
            title = re.sub(r'\([^)]*\)', '', title)
            title = title.replace("(", "").replace(")", "")
            year = record.Year
            # Year clause covering the year itself +/- 1
            query_string_year = ('PY=(' + str(year - 1) + ' OR ' + str(year) +
                                 ' OR ' + str(year + 1) + ')')
            # Titles containing search operators (AND, OR, NOT) are cleaned
            if isOperator(title):
                title = removeOperator(title)
            query_string = 'TI=' + title + ' AND ' + query_string_year
            # Perform the query on the WoS engine
            root = None
            try:
                root = wos.utils.query_v2(client, query_string, count=1)
            except suds.WebFault as fault:
                print("Suds.WebFault: Waiting {} sec".format(wait))
                # Report the caught fault's own details, not the class attribute.
                print(fault.args)
                time.sleep(wait)
            except Exception as err:
                # ``Exception`` rather than a bare except so Ctrl-C can still
                # stop the run.
                print("Some other error occured, sleep {} sec".format(wait))
                print(err)
                time.sleep(wait)
            if root is None:
                # Remember the id of the record that could not be retrieved
                not_found.append(record_id)
                print("Did not find record with title {}".format(title))
                print("Not found length is {}".format(len(not_found)))
            else:
                # Normalise both titles the same way before comparing them
                tcp_data_title = record.Title.replace("|", ",")
                tcp_data_title = re.sub(r'\([^)]*\)', '', tcp_data_title)
                tcp_data_title = removePunct(tcp_data_title)
                wos_title = getTitle(root)
                wos_title = re.sub(r'\([^)]*\)', '', wos_title)
                wos_title = removePunct(wos_title)
                print("tcp title: {}".format(tcp_data_title))
                print("wos title: {}".format(wos_title))
                print("")
                similarity = difflib.SequenceMatcher(
                    None, tcp_data_title, wos_title).ratio()
                if similarity > 0.95:
                    if getAbstract(root) is not None:
                        info.append((root, record_id))
                        print("Successfully retrieved is now {}".format(len(info)))
                    else:
                        print("")
                        print("Abstract was none")
                        print("")
                else:
                    print("titles not alike...")
                    not_found.append(record_id)
            print("Number of queries so far is {}. Time used is {:0.1f} minutes".format(
                (i + 1), ((time.time() - start) / 60.0)))
            # Short pause between consecutive requests.
            time.sleep(0.5)
    return info, not_found
from wos import WosClient
import wos.utils

# Demo: open a session with empty user/password strings, run one
# author + publication-year query and print the response.
with WosClient("", '') as wos_session:
    author_year_response = wos.utils.query(
        wos_session, 'AU=Goncalves, Renata AND PY=2017')
    print(author_year_response)
    # Other example queries, kept for reference:
    # print((wos.utils.query(client, 'TI=(GRONS: a comprehensive genetic resource of nicotine and smoking)')))
    # print((wos.utils.query(client, 'DOI=BUSH KAREN AND PY=(2015 AND 2016)')))
예제 #12
0
def wos_fetch(output_file, username, password):
    """Run a fixed author query against Web of Science and print the response.

    Args:
        output_file: Accepted but not used by this function.
        username: First argument passed to ``WosClient``.
        password: Second argument passed to ``WosClient``.
    """
    with WosClient(username, password) as session:
        response = wos.utils.query(session, 'AU=Beckner Wesley')
        print(response)
예제 #13
0
# In[11]:

import re

# Names already crawled: the part of each JSON file's base name before the
# first dot.
crawled_authors = {
    os.path.basename(json_path).split('.')[0]
    for json_path in glob.glob('data/author_to_articles/*.json')
}
print(
    f'There Are {len(crawled_authors)} Collected Authors. Remain {len(authors) - len(crawled_authors)} Authors.'
)
# Authors whose second field is not among the crawled names, in reverse of
# their order in ``authors``.
remained_authors = [
    entry for entry in authors if entry[1] not in crawled_authors
]
remained_authors.reverse()

wc = WosClient()
wc.connect()
for author_order, (author_name, author_id) in enumerate(remained_authors, 1):

    print(
        f'Finding Articles of Author {author_order}: {author_name} ({author_id})'
    )
    if author_order % 100 == 0:
        wc.close()
        wc = WosClient()
        wc.connect()

    query_url = "https://apps.webofknowledge.com/" + f"InboundService.do?product=WOS&daisIds={author_id}" + "&Func=Frame&DestFail=http%3A%2F%2Fwww.webofknowledge.com" + f"&SrcApp=RRC&locale=zh_TW&SrcAuth=RRCPubList&SID={wc._SID}" + "&customersID=RRCPubList&mode=SourceByDais&IsProductCode=Yes&Init=Yes&viewType=summary&action=search" + "&action=changePageSize&pageSize=50"
    wos_codes = []

    while True: