Exemplo n.º 1
0
 def __init__(self, args, source_name, dict_file):
     """Store search arguments and load field synonyms for one source.

     args: parsed argument structure (presumably contains "fields" and
         "concepts" keys — confirm against caller).
     source_name: data-source identifier, e.g. "springer".
     dict_file: path to the JSON dictionary with synonyms and API keys.
     """
     self.args = args
     # template dict built from the configured fields (values filled later)
     self.perm_dict_abstract = self.create_dict()
     self.file_handle = FileClass()
     self.dict_file_path = dict_file
     self.source_name = source_name
     # source-specific field-name synonyms read from the dictionary file
     self.synonyms = self.file_handle.get_synonyms(self.source_name,
                                                   dict_file)
Exemplo n.º 2
0
 def __init__(self, search_args, dict_file):
     """Initialize the Springer request helper.

     search_args: parsed search arguments forwarded to Permutation.
     dict_file: path to the JSON dictionary (synonyms + API keys).
     """
     self.search_args = search_args
     self.source_name = "springer"
     self.perm = Permutation(search_args, self.source_name, dict_file)
     self.file_handle = FileClass()
     self.dict_file_path = dict_file
     # read this source's API key from the dictionary file right away
     self.api_key = self.get_api_key()
     self.req_res = []      # raw API records accumulated across requests
     self.cleaned_res = []  # cleaned, bibtex-ready entries
Exemplo n.º 3
0
 def __init__(self, args, operands, passed_name, failed_name):
     """Set up the request controller.

     args: argument structure driving the per-source requests.
     operands: operand checks forwarded to the filter stage.
     passed_name / failed_name: output file names for the results.
     """
     self.args = args
     self.operands = operands
     self.passed_name = passed_name
     self.failed_name = failed_name
     # JSON dictionary with synonyms, accepted fields and API keys
     self.dict_file_path = "dict.json"
     self.file_handle = FileClass()
     self.data_responses = []
     self.response = Response()
     self.results = []
Exemplo n.º 4
0
 def __init__(self, args):
     """Store CLI arguments and initialize pipeline defaults.

     args: raw argv-style list; the first element (the script name) is
         removed in place.
     """
     # removes first argument which is not needed (the script name)
     args.pop(0)
     self.args = args
     self.argument_file = "args.json"
     self.concept = None
     self.field_sets = None
     self.operands = None
     # NOTE(review): "passend_entries.bib" looks like a typo for
     # "passed_entries.bib" — confirm before renaming the output file
     self.passed_name = "passend_entries.bib"
     self.failed_name = "failed_entries.json"
     self.fileOps = FileClass()
Exemplo n.º 5
0
 def __init__(self, results, concepts, fields, operands, passed_name,
              failed_name):
     """Store filter inputs and prepare the result buckets.

     results: collected bib entries to be checked.
     concepts: concept synonym groups that must all match an entry.
     fields: field sets that are searched for the concept terms.
     operands: comparison checks applied to entry fields.
     passed_name / failed_name: output file names.
     """
     self.fileOps = FileClass()
     self.results = results
     self.concepts = concepts
     self.field_sets = fields
     self.operands = operands
     self.passed_entries = []  # entries that passed every check
     self.failed_entries = []  # rejection explanations
     self.passed_name = passed_name
     self.failed_name = failed_name
Exemplo n.º 6
0
class RequestScieneDirect:
    """Client for the Elsevier ScienceDirect search API.

    Requests every permutation of the configured search fields and
    accumulates the raw JSON result entries in ``req_res``.
    """

    def __init__(self, search_args, dict_file):
        """Set up request state for the "science_direct" source."""
        self.search_args = search_args
        self.source_name = "science_direct"
        self.perm = Permutation(search_args, self.source_name, dict_file)
        self.file_handle = FileClass()
        self.dict_file_path = dict_file
        self.api_key = self.get_api_key()
        self.req_res = []
        self.cleaned_res = []

    def get_api_key(self):
        """Read this source's API key from the JSON dictionary file."""
        config = self.file_handle.open_json(self.dict_file_path)
        return config["api_keys"][self.source_name]

    def get_permutations(self):
        """Build and cache the list of field/term query permutations."""
        self.permutations = self.perm.create_query_object_templates()

    def iterate_permutation(self):
        """Fire one API request per permutation, then dump what came back."""
        for query_fields in self.permutations:
            self.make_request(self.build_request_url(query_fields))
        pprint(self.req_res)

    def build_request_url(self, query_fields):
        """Assemble the ScienceDirect search URL for one permutation."""
        req_url = "https://api.elsevier.com/content/search/sciencedirect?httpAccept=application/json&count=100&query="
        last = len(query_fields) - 1
        for idx, (field, term) in enumerate(query_fields.items()):
            req_url += field + "(" + term + ")"
            # all but the final clause are joined with "+and+"; the final
            # clause carries the API key
            req_url += "+and+" if idx < last else "&apiKey=" + self.api_key
        return req_url

    def make_request(self, url, offset=""):
        """GET one result page and recurse while more pages remain."""
        time.sleep(1)  # crude rate limiting between API calls
        print(url + offset)
        payload = requests.get(url + offset).json()
        results = payload['search-results']
        self.req_res += results['entry']
        start = int(results['opensearch:startIndex'])
        per_page = int(results['opensearch:itemsPerPage'])
        total = int(results['opensearch:totalResults'])
        # page forward until start + per_page reaches the reported total
        if start + per_page < total:
            self.make_request(url, "&start=" + str(start + per_page))
Exemplo n.º 7
0
class RequestSpringer:
    """Client for the Springer Nature metadata API.

    Requests every permutation of the configured search fields, pages
    through the JSON results, scrapes keywords from article pages when
    missing, and converts records into bibtex-ready dicts.
    """
    def __init__(self, search_args, dict_file):
        """Set up request state for the "springer" source."""
        self.search_args = search_args
        self.source_name = "springer"
        self.perm = Permutation(search_args, self.source_name, dict_file)
        self.file_handle = FileClass()
        self.dict_file_path = dict_file
        # API key is read from the JSON dictionary file right away
        self.api_key = self.get_api_key()
        self.req_res = []      # raw API records accumulated across requests
        self.cleaned_res = []  # bibtex-ready dicts built by clean_entries()

    def get_api_key(self):
        """Read this source's API key from the JSON dictionary file."""
        return self.file_handle.open_json(
            self.dict_file_path)["api_keys"][self.source_name]

    def get_permutations(self):
        """Build the query permutations and print a summary banner."""
        self.permutations = self.perm.create_query_object_templates()
        print("________________________")
        print("##    Springer:       ##")
        print("________________________\n")
        print("# Permutations #")

        # prints list sorted by abstract if abstract is included!
        if "abstract" in self.permutations[0]:
            pprint(sorted(self.permutations, key=lambda i: i['abstract']))
        else:
            pprint(self.permutations)

        print("\n  Found " + str(len(self.permutations)) + " permutations!")

    # iterates all permutations, builds and makes requests
    def iterate_permutation(self):
        """Fire one API request per field/term permutation."""
        print("\n# Requesting Data #")
        for perm in self.permutations:
            url = self.build_request_url(perm)
            self.make_request(url)

    # make request and handles a query that results in more then 50 entries (api limit) recursivly
    def make_request(self, url, offset=""):
        """GET one result page; recurse with "&s=<start>" while more remain."""
        time.sleep(1)  # crude rate limiting between API calls
        print("  Calling: " + url + offset)
        r = requests.get(url + offset).json()
        self.req_res += r['records']
        # recurse while start + recordsDisplayed < total (more pages left)
        if int(r['result'][0]['start']) + int(
                r['result'][0]['recordsDisplayed']) < int(
                    r['result'][0]['total']):
            offs = str(
                int(r['result'][0]['start']) +
                int(r['result'][0]['recordsDisplayed']))
            self.make_request(url, "&s=" + offs)

    # build request url with passed field arguments in accepted format
    def build_request_url(self, query_fields):
        """Assemble the Springer query URL for one permutation.

        "abstract" is dropped from the query string — it is not sent to
        the API as a queryable field here.
        """
        req_url = "http://api.springernature.com/metadata/v1/json?p=50&q=("
        counter = 0
        if "abstract" in query_fields:
            query_fields.pop("abstract")  # NOTE(review): mutates the caller's dict
        for key, value in query_fields.items():
            if counter < len(query_fields) - 1:
                req_url += key + ":'" + value + "' AND "
            else:
                # final clause closes the query and appends the API key
                req_url += key + ":'" + value + "')&api_key=" + self.api_key
            counter += 1
        return req_url

    # cleans and converts data for writing to bibtex file
    def clean_entries(self):
        """Convert raw Springer records into bibtex-compatible dicts."""
        print("\n# Getting keywords and formating data #")
        for i in range(len(self.req_res)):
            # simple in-place progress indicator
            sys.stdout.write("\r  {0}".format(
                str(i) + "/" + str(len(self.req_res)) + " complete"))
            obj = {}
            # creates id based on name, year and 8 char hash value generated from the title
            obj['ID'] = self.req_res[i]['creators'][0]["creator"].replace(
                ', ', '_') + "_" + self.req_res[i]["publicationDate"].split(
                    "-")[0] + str(abs(hash(self.req_res[i]['title'])))[:8]
            obj['ENTRYTYPE'] = self.req_res[i]["contentType"]
            obj["year"] = self.req_res[i]["publicationDate"].split("-")[0]
            obj['url'] = self.req_res[i]["url"][0]["value"]
            obj['author'] = self.build_creators(self.req_res[i]["creators"])
            # url/creators were already folded into obj above
            del self.req_res[i]['url']
            del self.req_res[i]['creators']
            # copy every remaining non-empty field verbatim
            for key, value in self.req_res[i].items():
                if value:
                    obj[key] = self.req_res[i][key]

            if "keyword" not in self.req_res[i]:
                # keywords missing from the API record: scrape the page
                obj["keywords"] = self.get_missing_field(
                    obj['url'], "keywords")
            # strips the first 8 characters (presumably a literal "Abstract"
            # prefix) from every abstract
            # NOTE(review): raises KeyError if a record has no abstract — confirm
            obj['abstract'] = obj['abstract'][8:]
            self.cleaned_res.append(obj)

    # brings creators in format needed for bibtex file
    def build_creators(self, creators):
        """Join creator names with " and " as bibtex expects."""
        creators_length = len(creators)
        res = ""
        for idx, val in enumerate(creators):
            if idx < creators_length - 1:
                res += val["creator"] + " and "
            else:
                res += val["creator"]
        return res

    # build url for given data and calls request and parsing function
    def get_missing_field(self, url, field):
        """Scrape *field* (currently only keywords) from the article page."""
        # map the DOI resolver link onto the link.springer.com page
        url = "https://link.springer.com/" + url.split("http://dx.doi.org/")[1]
        time.sleep(0.5)
        res = self.make_field_request(url)
        return self.parse_response(res, field)

    # make http request to given link and returns data as xpath compatible object
    def make_field_request(self, url):
        """Fetch *url* and return an lxml tree; exit the program on failure."""
        try:
            page = requests.get(url)
            return html.fromstring(page.content)
        except requests.exceptions.RequestException as e:
            print(e)
            sys.exit(1)

    # parse keywords from passed response object
    def parse_response(self, response, field):
        """Extract keywords from the scraped page, trying two HTML layouts.

        NOTE(review): ``xpath(...)[0]`` raises IndexError when no node
        matches; the "Keywords not-found" string checks below look like
        leftovers from a Scrapy ``extract_first(default=...)`` version and
        may never trigger under lxml — confirm against real pages.
        """
        # simple test if abstract, keywords and doi exist on the crawled HTML side ?
        keywordsExists = response.xpath(
            '//div[@class="KeywordGroup"]/span[@class="Keyword"]/text()')[0]
        # .extract_first(default='Keywords not-found')

        # ==================================================
        # XPATH for abstract #1
        # ==================================================
        # because of other HTML structure, some entries need other xpaths
        if field == "keywords":
            finalkeywords = ""
            if ("Keywords not-found" in keywordsExists):
                # fall back to the newer "c-keywords" page layout
                keywordgroupexists = response.xpath(
                    '//div[@id="Keywords"]/ul/li[@class="c-keywords__item"]/text()'
                )[0]
                if ("Keywords not-found" not in keywordgroupexists):
                    keywordgroup = response.xpath(
                        '//div[@id="Keywords"]/ul/li[@class="c-keywords__item"]/text()'
                    )
                    for keyword in keywordgroup:
                        finalkeywords = finalkeywords.rstrip(
                        ) + ", " + keyword.rstrip()
                else:
                    finalkeywords = keywordgroupexists
            else:
                # original "KeywordGroup" page layout
                keywordgroup = response.xpath(
                    '//div[@class="KeywordGroup"]/span[@class="Keyword"]/text()'
                )
                for keyword in keywordgroup:
                    finalkeywords = finalkeywords.rstrip(
                    ) + ", " + keyword.rstrip()
            return finalkeywords
Exemplo n.º 8
0
class Main:
    """Entry-point controller: reads the argument file, validates it and
    drives the request/filter pipeline."""

    def __init__(self, args):
        """Store CLI arguments and initialize pipeline defaults.

        args: raw argv-style list; the first element (script name) is
            removed in place.
        """
        args.pop(0)
        self.args = args
        self.argument_file = "args.json"
        # fix: was `self.concept` — check_file_arguments assigns `concepts`
        self.concepts = None
        self.field_sets = None
        self.operands = None
        # fix: initialize so set_settings() does not crash when
        # check_file_arguments() rejected the argument file early
        self.settings = {}
        self.passed_name = "passend_entries.bib"
        self.failed_name = "failed_entries.json"
        self.fileOps = FileClass()

    # parse argument json file (replaces the path with the parsed dict)
    def parse_argument_file(self):
        self.argument_file = self.fileOps.open_json(self.argument_file)

    def perform_argument_checks(self):
        """Validate the argument file contents and apply user settings."""
        self.check_file_arguments()
        self.set_settings()
        self.check_operands()

    # checks if arguments are valid
    def check_file_arguments(self):
        """Require concepts and fields; pull out optional sections."""
        if len(self.argument_file["concepts"]) == 0:
            print("No concepts specified! Please check arguments file!")
        elif len(self.argument_file["fields"]) == 0:
            print("No fields specified! Please check arguments file!")
        else:
            self.concepts = self.argument_file["concepts"]
            self.field_sets = self.argument_file["fields"]
            self.operands = self.check_for_dict_entry("operands",
                                                      self.argument_file)
            self.settings = self.check_for_dict_entry("settings",
                                                      self.argument_file)

    # checks if passed operands are valid
    def check_operands(self):
        """Warn about unsupported comparison operators on numeric operands."""
        supported = (">", "<", "!=", "<=", ">=")
        for op in self.operands:
            # only numeric operands have a choice of operators; the single
            # supported string operation needs no check
            if isinstance(op["value"], int) and op["operator"] not in supported:
                print(
                    "Operator is not supported! Check documentation for more information"
                )
                print("Operator passed: " + op["operator"])

    def set_settings(self):
        """Apply optional output file names from the settings section.

        Bug fix: the original compared passed_name against
        "passed_entries.bib" while the default set in __init__ is spelled
        "passend_entries.bib", so a user-supplied passed_name was never
        applied. The comparison now matches the actual default.
        """
        if "failed_name" in self.settings and self.failed_name == "failed_entries.json":
            self.failed_name = self.settings["failed_name"]
        if "passed_name" in self.settings and self.passed_name == "passend_entries.bib":
            self.passed_name = self.settings["passed_name"]

    # safe access method for dict, returns empty list if key not found
    def check_for_dict_entry(self, key, obj):
        """Return obj[key], or an empty list when the key is missing."""
        if key in obj:
            return obj[key]
        return []

    def perform_query(self):
        """Run the full pipeline: request, deduplicate, filter."""
        self.request = Request(self.argument_file, self.operands,
                               self.passed_name, self.failed_name)
        self.request.initialize_request_classes()
        self.request.peform_checks_and_execute()
        self.request.remove_duplicates()
        self.request.perform_filter()
Exemplo n.º 9
0
class Filter:
    """Filters collected bib entries against concept and operand checks,
    splitting them into passed and failed sets."""

    def __init__(self, results, concepts, fields, operands, passed_name,
                 failed_name):
        """Store filter inputs and prepare the result buckets."""
        self.fileOps = FileClass()
        self.results = results
        self.concepts = concepts
        self.field_sets = fields
        self.operands = operands
        self.passed_entries = []  # entries that passed every check
        self.failed_entries = []  # rejection explanations
        self.passed_name = passed_name
        self.failed_name = failed_name

    def iterate_entries(self):
        """Run concept and operand checks on every entry, print a summary
        and write the results."""
        print("\n # Performing filters and argument checks #")
        for entry in self.results:
            res_concepts = self.iterate_arguments(entry)
            res_operands = self.execute_operands(entry)
            if self.eval_res(res_concepts) and self.eval_res(res_operands):
                self.passed_entries.append(entry)
            else:
                self.explain_rejection(res_concepts, res_operands, entry)
        print("  -Total entries: " +
              str(len(self.passed_entries) + len(self.failed_entries)))
        print("  -Final passed entries: " + str(len(self.passed_entries)))
        print("  -Failed entries: " + str(len(self.failed_entries)))
        self.fileOps.write_res(self.passed_entries, self.passed_name,
                               self.failed_entries, self.failed_name)

    def iterate_arguments(self, bib_entry):
        """Return one bool per concept: True when any synonym of the
        concept appears in any of the configured fields of the entry.

        any() short-circuits, so this keeps the original early-exit
        behavior without the hand-rolled break/flag bookkeeping.
        """
        return [
            any(
                self.check_argument(field, concept_entry, bib_entry)
                for field_set in self.field_sets
                for field in field_set
                for concept_entry in concept)
            for concept in self.concepts
        ]

    def check_argument(self, field, concept_entry, bib_entry):
        """Case-insensitive substring match of one concept term in one field."""
        return (field in bib_entry
                and concept_entry.lower() in bib_entry[field].lower())

    def execute_operands(self, bib_entry):
        """Evaluate each configured operand check against the entry.

        Returns one bool per operand; a missing field counts as failed.
        Numeric fields arrive as strings and are converted with int().
        """
        # dispatch table for the supported numeric comparison operators
        ops = {
            ">": (lambda x, y: x > y),
            "<": (lambda x, y: x < y),
            ">=": (lambda x, y: x >= y),
            "<=": (lambda x, y: x <= y),
            "!=": (lambda x, y: x != y)
        }
        res = [False] * len(self.operands)
        for i, operand in enumerate(self.operands):
            field = operand['field']
            if field not in bib_entry:
                continue  # stays False
            if operand['operator'] == "==":
                # the only supported string operation: plain equality
                res[i] = operand['value'] == bib_entry[field]
            else:
                res[i] = ops[operand['operator']](int(bib_entry[field]),
                                                  operand['value'])
        return res

    def eval_res(self, res):
        """True when every single check passed (vacuously true if empty)."""
        return False not in res

    def explain_rejection(self, res_concepts, res_operands, bib_entry):
        """Record which concepts/operands failed for a rejected entry."""
        obj = {
            # robustness: don't crash when an entry lacks a doi
            'name': bib_entry.get("doi", "unknown"),
            'failed_concepts': [
                concept for concept, ok in zip(self.concepts, res_concepts)
                if not ok
            ],
            'failed_operands': [
                operand for operand, ok in zip(self.operands, res_operands)
                if not ok
            ],
        }
        self.failed_entries.append(obj)

    def write_res(self):
        """Persist passed and failed entries via the file helper."""
        self.fileOps.write_res(self.passed_entries, self.passed_name,
                               self.failed_entries, self.failed_name)
Exemplo n.º 10
0
class RequestIeee:
    """Client for the IEEE Xplore search API.

    Requests every field permutation, pages through the results and
    normalises the responses into bibtex-ready dictionaries.
    """

    def __init__(self, search_args, dict_file):
        """Set up request state for the "ieee" source."""
        self.search_args = search_args
        self.source_name = "ieee"
        self.perm = Permutation(search_args, self.source_name, dict_file)
        self.file_handle = FileClass()
        self.dict_file_path = dict_file
        self.api_key = self.get_api_key()
        self.req_res = []
        self.cleaned_res = []

    def get_api_key(self):
        """Read this source's API key from the JSON dictionary file."""
        config = self.file_handle.open_json(self.dict_file_path)
        return config["api_keys"][self.source_name]

    def get_permutations(self):
        """Build the query permutations and print a summary banner."""
        self.permutations = self.perm.create_query_object_templates()
        print("\n________________________")
        print("##       IEEE:       ##")
        print("________________________\n")
        print("# Permutations #")
        pprint(self.permutations)
        print("\n  Found " + str(len(self.permutations)) + " permutations!")

    def iterate_permutation(self):
        """Fire one API request per field/term permutation."""
        print("\n# Requesting Data #")
        for query_fields in self.permutations:
            self.make_request(self.build_request_url(query_fields))

    def build_request_url(self, query_fields):
        """Assemble the IEEE Xplore query URL for one permutation."""
        req_url = "http://ieeexploreapi.ieee.org/api/v1/search/articles?max_records=200&querytext=("
        last = len(query_fields) - 1
        for idx, (field, term) in enumerate(query_fields.items()):
            req_url += "(\"" + field + "\":" + term + ")"
            # clauses are AND-ed; the final one closes the query and
            # carries the API key
            req_url += " AND " if idx < last else ")&apikey=" + self.api_key
        return req_url

    def make_request(self, url, offset=0):
        """GET one page of up to 200 records; recurse while more remain."""
        time.sleep(0.5)  # crude rate limiting between API calls
        print("  Calling: " + url + "&start_record=" + str(offset))
        payload = requests.get(url + "&start_record=" + str(offset)).json()
        articles = payload['articles']
        self.req_res += articles
        if payload['total_records'] > (len(articles) + offset):
            self.make_request(url, offset + 200)

    def clean_entries(self):
        """Convert raw IEEE records into bibtex-compatible dicts."""
        print("# Formating Data #")
        for entry in self.req_res:
            obj = {}
            # id is built from first author, year and an 8-char title hash
            first_author = entry['authors']['authors'][0]["full_name"]
            obj['ID'] = (first_author.replace(' ', '_') + "_" +
                         str(entry["publication_year"]) + "_" +
                         str(abs(hash(entry['title'])))[:8])
            obj['ENTRYTYPE'] = entry["content_type"].lower().replace(
                " & ", "").replace(" ", "")
            obj["year"] = str(entry["publication_year"])
            obj['url'] = entry["pdf_url"]
            obj['author'] = self.build_creators(entry["authors"]["authors"])
            obj['keywords'] = ", ".join(
                entry["index_terms"]["author_terms"]["terms"])
            # these were already folded into obj above
            del entry['pdf_url']
            del entry['authors']
            del entry["index_terms"]
            del entry["publication_year"]
            # copy every remaining non-empty field, stringified
            for key, value in entry.items():
                if value:
                    obj[key] = str(value)
            self.cleaned_res.append(obj)

    def build_creators(self, creators):
        """Format the author list for bibtex ("Last, First and ...").

        NOTE(review): per the original author's own comment this is off
        for names with more than two words, and the final author keeps
        the original word order — behavior preserved as-is.
        """
        last = len(creators) - 1
        formatted = []
        for i, author in enumerate(creators):
            words = author["full_name"].split()
            if i < last:
                formatted.append(", ".join(reversed(words)) + " and ")
            else:
                formatted.append(", ".join(words))
        return "".join(formatted)
Exemplo n.º 11
0
class Permutation:
    """Builds every query permutation from concepts, fields and their
    source-specific synonyms."""

    def __init__(self, args, source_name, dict_file):
        """Store search arguments and load field synonyms for the source.

        args: parsed argument structure with "fields" and "concepts" keys.
        source_name: data-source identifier, e.g. "springer".
        dict_file: path to the JSON dictionary with synonyms and API keys.
        """
        self.args = args
        self.perm_dict_abstract = self.create_dict()
        self.file_handle = FileClass()
        self.dict_file_path = dict_file
        self.source_name = source_name
        self.synonyms = self.file_handle.get_synonyms(self.source_name,
                                                      dict_file)

    def create_dict(self):
        """Return a template dict with one None-valued entry per field."""
        return {key: None for keys in self.args["fields"] for key in keys}

    def create_query_object_templates(self):
        """Combine field permutations with concept groupings into the final
        list of query dicts."""
        query_lists = self.build_fields_lists()
        concept_perms = self.create_concept_groupings()
        return self.build_queries_from_template(query_lists, concept_perms)

    # shared cartesian-product helper: pick one element from each inner list
    # (previously duplicated inline in two methods)
    def _cartesian(self, lists):
        result = [[]]
        for options in lists:
            result = [combo + [option] for option in options for combo in result]
        return result

    # creates concept groupings, e.g. returns [['smart contract', 'energy',
    # 'trading'], ['blockchain', 'energy', 'trading']] if blockchain and
    # smart contract are concept synonyms
    def create_concept_groupings(self):
        """Cartesian product over the concept synonym groups."""
        return self._cartesian(self.args["concepts"])

    # builds array of possible fields combinations, e.g. returns
    # [[abstract, title, keywords], [abstract, booktitle, keywords]]
    # if title and booktitle are synonyms
    def build_fields_lists(self):
        """Expand fields to synonym combinations, then permute each one.

        Fix: removed a leftover debug print of the intermediate lists.
        """
        fields = self.collect_fields_from_dict()
        return self.build_fields_permutations(self._cartesian(fields))

    # builds permutations of all submitted fields and returns a list of tuples
    def build_fields_permutations(self, query_objects):
        """Return every ordering of every field combination, flattened."""
        result = []
        for element in query_objects:
            result.append(list(permutations(element)))
        # flatten nested list of tuples
        return [item for sublist in result for item in sublist]

    def collect_fields_from_dict(self):
        """Expand each configured field to its synonym list for this source."""
        res = []
        for field in self.args["fields"]:
            if field[0] in self.synonyms:
                res.append(self.synonyms[field[0]])
            else:
                res.append([field[0]])
        return res

    def build_queries_from_template(self, query_lists, concept_perms):
        """Pair every field ordering with every concept grouping."""
        final_res = []
        for concept in concept_perms:
            for perm in query_lists:
                final_res.append(
                    {field: concept[idx] for idx, field in enumerate(perm)})
        return final_res
Exemplo n.º 12
0
class Request:
    """Controller that fans the search arguments out to the per-source
    request classes, then deduplicates and filters the combined results."""

    def __init__(self, args, operands, passed_name, failed_name):
        """Store pipeline inputs and shared helpers."""
        self.args = args
        self.operands = operands
        self.passed_name = passed_name
        self.failed_name = failed_name
        # JSON dictionary with synonyms, accepted fields and API keys
        self.dict_file_path = "dict.json"
        self.file_handle = FileClass()
        self.data_responses = []  # combined responses from all sources
        self.response = Response()
        self.results = []         # deduplicated entries

    # initialize instances of the request classes
    def initialize_request_classes(self):
        """Instantiate one request helper per supported source."""
        self.springer = RequestSpringer(self.args, self.dict_file_path)
        self.science_direct = RequestScieneDirect(self.args,
                                                  self.dict_file_path)
        self.ieee = RequestIeee(self.args, self.dict_file_path)

    # calls check methods and run_requests if the checks pass
    # (method name spelling kept as-is — external callers use "peform_...")
    def peform_checks_and_execute(self):
        """Validate fields and run the requests for every enabled source."""
        if self.args["sources"]["springer"] and self.check_fields('springer'):
            self.run_requests("springer")
        if self.args["sources"]["science_direct"] and self.check_fields(
                'science_direct'):
            self.run_requests("science_direct")
        if self.args["sources"]["ieee"] and self.check_fields('ieee'):
            self.run_requests("ieee")

    # remove-duplicates controller method
    def remove_duplicates(self):
        """Deduplicate the combined responses via the Response helper."""
        self.response.set_responses(self.data_responses)
        self.response.remove_duplicates()
        self.results = self.response.get_reponses()

    # filter controller function
    def perform_filter(self):
        """Run the Filter stage over the deduplicated results."""
        self.filter = Filter(self.results, self.args["concepts"],
                             self.args["fields"], self.operands,
                             self.passed_name, self.failed_name)
        self.filter.iterate_entries()
        self.filter.write_res()

    # controller function for field checks
    def check_fields(self, source_name):
        """Return True when every (synonym-expanded) field is accepted by
        the source; otherwise report the offenders and abort the program.

        Fix: dropped the unreachable `return False` that followed sys.exit().
        """
        fields = self.merge_synonyms(source_name)
        accepted, rejected = self.check_field_validity(fields, source_name)
        if accepted:
            return True
        print(rejected)
        print("unaccepted fields found in concepts!")
        sys.exit()

    # controller function for the different sources
    def run_requests(self, source_name):
        """Execute the request/clean cycle for one source, collecting data."""
        if source_name == "springer":
            self.springer.get_permutations()
            self.springer.iterate_permutation()
            self.springer.clean_entries()
            self.data_responses += self.springer.cleaned_res
        if source_name == "science_direct":
            self.science_direct.get_permutations()
            self.science_direct.iterate_permutation()
            # science_direct results are collected raw (no clean_entries yet)
            self.data_responses += self.science_direct.req_res
        if source_name == "ieee":
            self.ieee.get_permutations()
            self.ieee.iterate_permutation()
            self.ieee.clean_entries()
            self.data_responses += self.ieee.cleaned_res

    # merges fields with synonyms of the data source, e.g. title => booktitle, title
    def merge_synonyms(self, source_name):
        """Expand the configured fields with the source's synonym lists."""
        synonyms = self.file_handle.get_synonyms(source_name,
                                                 self.dict_file_path)
        included_keys = []
        for key in self.args["fields"]:
            if key[0] in synonyms:
                included_keys += synonyms[key[0]]
            else:
                included_keys.append(key[0])
        return included_keys

    # check if a field is accepted for the given source,
    # e.g. haha123 is not a field that can be queried
    def check_field_validity(self, fields, source_name):
        """Return (all_accepted, rejected_fields) for the given source."""
        accepted_fields = self.file_handle.open_json(
            self.dict_file_path)["accepted_fields"][source_name]
        rejected = [field for field in fields if field not in accepted_fields]
        return (not rejected, rejected)