Пример #1
0
def queryRelatedWos(WOS, start_time):
    """Collect the "REC" elements of the related records of every id in WOS.

    One Web of Science session is opened and, for each id, up to 8 related
    records are requested. A failing query is reported, followed by a pause,
    and that id is skipped; it is not retried.

    Args:
        WOS: iterable of record ids to look up.
        start_time: ``time.time()`` value taken when the run started; only
            used for the progress message.

    Returns:
        A flat list with every "REC" element found, in query order.
    """
    root_list = []
    with WosClient(c.getUserName(), c.getPassword()) as client:
        print("Starting to look for related records")
        for i, wos_id in enumerate(WOS):
            try:
                records = wos.utils.get_related_records(client, wos_id, count=8)
                root_list.extend(records.findall("REC"))
            except Exception:
                # Exception (not a bare except) so that Ctrl-C / SystemExit
                # still stop the run instead of being treated as a failed query.
                wait = 5
                print("Some error while adding roots. Waiting {} sec".format(wait))
                time.sleep(wait)
            if (i + 1) % 50 == 0:
                # Elapsed time is reported in minutes.
                print("Queries so far is {}. Time used is {:0.1f}".format((i + 1), ((time.time() - start_time) / 60.0)))
            # Short pause between consecutive queries.
            time.sleep(0.5)
    return root_list
Пример #2
0
def queryRelatedWos(WOS, start_time):
    """Fetch related records for each id in WOS and gather their "REC" nodes.

    Opens a single Web of Science client session and asks for at most 8
    related records per id. When a query raises, a message is printed, the
    function sleeps for a few seconds and moves on to the next id (the
    failed id is not retried).

    Args:
        WOS: iterable of record ids.
        start_time: start of the run as returned by ``time.time()``; used
            only to report elapsed time.

    Returns:
        List of all "REC" elements found, in the order they were retrieved.
    """
    error_wait = 5  # seconds to back off after a failed query
    query_pause = 0.5  # seconds between consecutive queries
    root_list = []
    with WosClient(c.getUserName(), c.getPassword()) as client:
        print("Starting to look for related records")
        for i, record_id in enumerate(WOS):
            try:
                records = wos.utils.get_related_records(
                    client, record_id, count=8)
                root_list.extend(records.findall("REC"))
            except Exception:
                # Catch Exception rather than everything, so KeyboardInterrupt
                # and SystemExit propagate and the loop can be aborted.
                print("Some error while adding roots. Waiting {} sec".format(
                    error_wait))
                time.sleep(error_wait)
            done = i + 1
            if done % 50 == 0:
                # Progress report every 50 queries; elapsed time is in minutes.
                print("Queries so far is {}. Time used is {:0.1f}".format(
                    done, ((time.time() - start_time) / 60.0)))
            time.sleep(query_pause)
    return root_list
Пример #3
0
def queryWos(tcp_data, start, start_sample, end_sample):
    """Look up a slice of tcp_data records in Web of Science by title and year.

    For every index label in ``tcp_data.index[start_sample:end_sample]`` a
    query ``TI=<cleaned title> AND PY=(<year-1> OR <year> OR <year+1>)`` is
    sent. A hit is kept only when its title is more than 95% similar to the
    tcp_data title and it has an abstract.

    Args:
        tcp_data: table indexed by record id with ``Title`` and ``Year``
            columns (accessed via ``.index`` and ``.loc``; presumably a
            pandas DataFrame -- confirm against the caller).
        start: ``time.time()`` value taken when the run started; only used
            for the progress message.
        start_sample: first position (inclusive) of the index slice to query.
        end_sample: last position (exclusive) of the index slice to query.

    Returns:
        Tuple ``(info, not_found)``. ``info`` is a list of
        ``(root, record_id)`` tuples for accepted hits. ``not_found`` is a
        list of record ids whose query failed or whose title did not match.
        Records with a matching title but no abstract end up in neither list.
    """
    info = []
    not_found = []
    # Connect to Web of Science
    with WosClient(c.getUserName(), c.getPassword()) as client:
        print("Starting the queries")
        sample_ids = tcp_data.index.values.tolist()[start_sample:end_sample]
        for i, record_id in enumerate(sample_ids):
            # Look the row up once; it is needed for title, year and matching.
            row = tcp_data.loc[record_id]
            # Replace or strip characters that are not wanted in the query.
            title = (
                row.Title.replace("|", ",")
                .replace("?", "")
                .replace('"', "")
                .replace("/", " ")
                .replace("-", " ")
                .replace(":", " ")
            )
            # Drop parenthesised fragments, then any unbalanced parentheses.
            title = re.sub(r"\([^)]*\)", "", title)
            title = title.replace("(", "").replace(")", "")
            year = row.Year
            # Year query with +/- 1 year
            query_string_year = "PY=(" + str(year - 1) + " OR " + str(year) + " OR " + str(year + 1) + ")"
            # Check if the title contains any operators (like AND, OR, NOT)
            if isOperator(title):
                title = removeOperator(title)
            query_string = "TI=" + title + " AND " + query_string_year
            # Perform the query on the wos engine
            root = None
            wait = 2
            try:
                root = wos.utils.query_v2(client, query_string, count=1)
            except suds.WebFault as fault:
                print("Suds.WebFault: Waiting {} sec".format(wait))
                # Print the arguments of the caught fault (not the class
                # attribute, which carries no information about the error).
                print(fault.args)
                time.sleep(wait)
            except Exception:
                # Exception rather than a bare except so Ctrl-C still aborts.
                print("Some other error occured, sleep {} sec".format(wait))
                time.sleep(wait)
            if root is None:
                # The query failed: remember the id
                not_found.append(record_id)
                print("Did not find record with title {}".format(title))
                print("Not found length is {}".format(len(not_found)))
            else:
                # Normalise both titles the same way before comparing them
                tcp_data_title = row.Title.replace("|", ",")
                tcp_data_title = re.sub(r"\([^)]*\)", "", tcp_data_title)
                tcp_data_title = removePunct(tcp_data_title)
                wos_title = getTitle(root)
                wos_title = re.sub(r"\([^)]*\)", "", wos_title)
                wos_title = removePunct(wos_title)
                print("tcp title: {}".format(tcp_data_title))
                print("wos title: {}".format(wos_title))
                print("")
                if difflib.SequenceMatcher(None, tcp_data_title, wos_title).ratio() > 0.95:
                    if getAbstract(root) is not None:
                        # Accepted hit: keep the record together with its id
                        info.append((root, record_id))
                        print("Successfully retrieved is now {}".format(len(info)))
                    else:
                        print("")
                        print("Abstract was none")
                        print("")
                else:
                    print("titles not alike...")
                    not_found.append(record_id)
            print(
                "Number of queries so far is {}. Time used is {:0.1f} minutes".format(
                    (i + 1), ((time.time() - start) / 60.0)
                )
            )
            # Short pause between consecutive queries.
            time.sleep(0.5)
    return info, not_found
Пример #4
0
def queryWos(tcp_data, start, start_sample, end_sample):
    """Query Web of Science for the tcp_data records in a given index slice.

    Each selected record is searched by its cleaned title combined with its
    publication year +/- 1. The first hit is accepted when its title has a
    similarity ratio above 0.95 with the tcp_data title and an abstract is
    available.

    Args:
        tcp_data: table with ``Title`` and ``Year`` columns, indexed by record
            id (used through ``.index`` and ``.loc``; presumably a pandas
            DataFrame -- confirm against the caller).
        start: ``time.time()`` timestamp of the start of the run, used for
            the progress message only.
        start_sample: start position (inclusive) in the list of index labels.
        end_sample: end position (exclusive) in the list of index labels.

    Returns:
        ``(info, not_found)``: ``info`` holds ``(root, record_id)`` tuples of
        accepted hits, ``not_found`` holds the ids whose query failed or
        whose titles were not alike. A record with a matching title but no
        abstract is added to neither list.
    """
    paren_group = r'\([^)]*\)'  # a parenthesised fragment, e.g. "(2nd ed.)"
    wait = 2  # seconds to back off after a failed query
    info = []
    not_found = []
    # Connect to Web of Science
    with WosClient(c.getUserName(), c.getPassword()) as client:
        print("Starting the queries")
        # Looping through the titles (search parameter)
        for i, record_id in enumerate(tcp_data.index.values.tolist()[start_sample:end_sample]):
            # Fetch the row once instead of once per field
            row = tcp_data.loc[record_id]
            raw_title = row.Title
            # Replace '|' with ',' and strip characters unwanted in the query
            title = raw_title.replace("|", ",").replace("?", "").replace('"', '')
            title = title.replace("/", " ").replace("-", " ").replace(":", " ")
            title = re.sub(paren_group, '', title)
            # Remove parentheses left over from unbalanced pairs
            title = title.replace("(", "").replace(")", "")
            # Get year published
            year = row.Year
            # Create year query with +/- 1 year
            query_string_year = 'PY=(' + str(year - 1) + ' OR ' + str(year) + ' OR ' + str(year + 1) + ')'
            # Check if the title contains any operators (like AND, OR, NOT)
            if isOperator(title):
                title = removeOperator(title)
            # Create the query string: title AND year
            query_string = 'TI=' + title + ' AND ' + query_string_year
            # Perform the query on wos engine
            root = None
            try:
                root = wos.utils.query_v2(client, query_string, count=1)
            except suds.WebFault as fault:
                print("Suds.WebFault: Waiting {} sec".format(wait))
                # Report the caught instance's arguments; the class attribute
                # suds.WebFault.args says nothing about this failure.
                print(fault.args)
                time.sleep(wait)
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must pass.
                print("Some other error occured, sleep {} sec".format(wait))
                time.sleep(wait)
            if root is None:
                # Query failed: record the id as not found
                not_found.append(record_id)
                print("Did not find record with title {}".format(title))
                print("Not found length is {}".format(len(not_found)))
            else:
                # Bring both titles to the same normal form before comparing
                tcp_data_title = raw_title.replace("|", ",")
                tcp_data_title = re.sub(paren_group, '', tcp_data_title)
                tcp_data_title = removePunct(tcp_data_title)
                wos_title = getTitle(root)
                wos_title = re.sub(paren_group, '', wos_title)
                wos_title = removePunct(wos_title)
                print("tcp title: {}".format(tcp_data_title))
                print("wos title: {}".format(wos_title))
                print("")
                if difflib.SequenceMatcher(None, tcp_data_title, wos_title).ratio() > 0.95:
                    if getAbstract(root) is not None:
                        # Accepted: store the hit together with its id
                        info.append((root, record_id))
                        print("Successfully retrieved is now {}".format(len(info)))
                    else:
                        print("")
                        print("Abstract was none")
                        print("")
                else:
                    print("titles not alike...")
                    not_found.append(record_id)
            print("Number of queries so far is {}. Time used is {:0.1f} minutes".format((i + 1), ((time.time() - start) / 60.0)))
            # Short pause between consecutive queries
            time.sleep(0.5)
    return info, not_found