def __init__(self, args, source_name, dict_file):
    """Store the search arguments and load this source's synonym table."""
    self.args = args
    self.source_name = source_name
    self.dict_file_path = dict_file
    # template dict keyed by every requested field
    self.perm_dict_abstract = self.create_dict()
    # the file helper also resolves the synonyms for this source
    self.file_handle = FileClass()
    self.synonyms = self.file_handle.get_synonyms(self.source_name, dict_file)
def __init__(self, search_args, dict_file):
    """Wire up the Springer helper: permutation builder, file access, API key."""
    self.source_name = "springer"
    self.search_args = search_args
    self.dict_file_path = dict_file
    # builds all query permutations for this source
    self.perm = Permutation(search_args, self.source_name, dict_file)
    # file helper must exist before the API key can be read
    self.file_handle = FileClass()
    self.api_key = self.get_api_key()
    # raw API records and their cleaned, bibtex-ready counterparts
    self.req_res = []
    self.cleaned_res = []
def __init__(self, args, operands, passed_name, failed_name):
    """Hold pipeline configuration plus accumulators for responses and results."""
    self.args = args
    self.operands = operands
    self.passed_name = passed_name
    self.failed_name = failed_name
    # dictionary file holds API keys, synonyms and accepted fields
    self.dict_file_path = "dict.json"
    self.file_handle = FileClass()
    self.response = Response()
    # raw per-source responses and the deduplicated result set
    self.data_responses = []
    self.results = []
def __init__(self, args):
    """Store CLI arguments (minus the script name) and set output defaults."""
    # removes first argument (the script name) which is not needed
    args.pop(0)
    self.args = args
    self.argument_file = "args.json"
    # fixed: was `self.concept`, but every consumer reads `self.concepts`
    self.concepts = None
    self.field_sets = None
    self.operands = None
    # fixed typo: was "passend_entries.bib", which made the passed_name
    # override in set_settings (comparing against "passed_entries.bib") dead
    self.passed_name = "passed_entries.bib"
    self.failed_name = "failed_entries.json"
    self.fileOps = FileClass()
def __init__(self, results, concepts, fields, operands, passed_name, failed_name):
    """Keep the filter inputs and prepare the pass/fail accumulators."""
    self.results = results
    self.concepts = concepts
    self.field_sets = fields
    self.operands = operands
    self.passed_name = passed_name
    self.failed_name = failed_name
    # entries are sorted into these two buckets while iterating
    self.passed_entries = []
    self.failed_entries = []
    self.fileOps = FileClass()
class RequestScieneDirect:
    """Queries the Elsevier ScienceDirect search API for every permutation."""

    def __init__(self, search_args, dict_file):
        """Set up the permutation builder, file helper and API key."""
        self.search_args = search_args
        self.source_name = "science_direct"
        self.perm = Permutation(search_args, self.source_name, dict_file)
        self.file_handle = FileClass()
        self.dict_file_path = dict_file
        self.api_key = self.get_api_key()
        # raw API records and their cleaned counterparts
        self.req_res = []
        self.cleaned_res = []

    def get_api_key(self):
        """Read this source's API key from the dict file."""
        return self.file_handle.open_json(
            self.dict_file_path)["api_keys"][self.source_name]

    def get_permutations(self):
        """Build all query permutations for this source."""
        self.permutations = self.perm.create_query_object_templates()

    def iterate_permutation(self):
        """Build and fire one request per permutation, then dump the results."""
        for perm in self.permutations:
            url = self.build_request_url(perm)
            self.make_request(url)
        pprint(self.req_res)

    def build_request_url(self, query_fields):
        """Build the search URL from a field -> term mapping.

        Terms are joined with '+and+'; the API key is always appended
        (previously an empty mapping produced a URL without any apiKey).
        """
        base = ("https://api.elsevier.com/content/search/sciencedirect"
                "?httpAccept=application/json&count=100&query=")
        terms = "+and+".join(key + "(" + value + ")"
                             for key, value in query_fields.items())
        return base + terms + "&apiKey=" + self.api_key

    def make_request(self, url, offset=""):
        """Fetch one page and keep paging until all results are collected.

        Rewritten iteratively: the previous recursive paging could hit the
        recursion limit on very large result sets.
        """
        while True:
            time.sleep(1)  # stay polite towards the API
            print(url + offset)
            r = requests.get(url + offset).json()
            results = r['search-results']
            self.req_res += results['entry']
            start = int(results['opensearch:startIndex'])
            per_page = int(results['opensearch:itemsPerPage'])
            if start + per_page >= int(results['opensearch:totalResults']):
                break
            offset = "&start=" + str(start + per_page)
class RequestSpringer:
    """Queries the Springer metadata API and converts results for bibtex output."""

    def __init__(self, search_args, dict_file):
        """Set up the permutation builder, file helper and API key."""
        self.search_args = search_args
        self.source_name = "springer"
        self.perm = Permutation(search_args, self.source_name, dict_file)
        self.file_handle = FileClass()
        self.dict_file_path = dict_file
        self.api_key = self.get_api_key()
        # raw API records and their cleaned, bibtex-ready counterparts
        self.req_res = []
        self.cleaned_res = []

    def get_api_key(self):
        """Read this source's API key from the dict file."""
        return self.file_handle.open_json(
            self.dict_file_path)["api_keys"][self.source_name]

    def get_permutations(self):
        """Build all query permutations and print a summary."""
        self.permutations = self.perm.create_query_object_templates()
        print("________________________")
        print("## Springer: ##")
        print("________________________\n")
        print("# Permutations #")
        # prints list sorted by abstract if abstract is included!
        if "abstract" in self.permutations[0]:
            pprint(sorted(self.permutations, key=lambda i: i['abstract']))
        else:
            pprint(self.permutations)
        print("\n Found " + str(len(self.permutations)) + " permutations!")

    def iterate_permutation(self):
        """Build and fire one request per permutation."""
        print("\n# Requesting Data #")
        for perm in self.permutations:
            url = self.build_request_url(perm)
            self.make_request(url)

    def make_request(self, url, offset=""):
        """Fetch one page and keep paging (API caps a page at 50 records).

        Rewritten iteratively: the previous recursive paging could hit the
        recursion limit on large result sets.
        """
        while True:
            time.sleep(1)  # stay polite towards the API
            print(" Calling: " + url + offset)
            r = requests.get(url + offset).json()
            self.req_res += r['records']
            meta = r['result'][0]
            next_start = int(meta['start']) + int(meta['recordsDisplayed'])
            if next_start >= int(meta['total']):
                break
            offset = "&s=" + str(next_start)

    def build_request_url(self, query_fields):
        """Build the metadata query URL from a field -> term mapping.

        'abstract' cannot be queried through this API, so it is skipped.
        Fixed: the caller's dict is no longer mutated (previously 'abstract'
        was pop()ed from the shared permutation object).
        """
        terms = " AND ".join(
            key + ":'" + value + "'"
            for key, value in query_fields.items() if key != "abstract")
        return ("http://api.springernature.com/metadata/v1/json?p=50&q=("
                + terms + ")&api_key=" + self.api_key)

    def clean_entries(self):
        """Convert raw records in self.req_res into bibtex-ready dicts."""
        print("\n# Getting keywords and formating data #")
        total = len(self.req_res)
        for i, record in enumerate(self.req_res):
            sys.stdout.write("\r {0}".format(
                str(i) + "/" + str(total) + " complete"))
            obj = {}
            # ID = FirstAuthor_Year<8-char hash of the title>.
            # NOTE(review): hash() is salted per interpreter run, so IDs are
            # not stable across runs — confirm that is acceptable.
            obj['ID'] = record['creators'][0]["creator"].replace(
                ', ', '_') + "_" + record["publicationDate"].split(
                    "-")[0] + str(abs(hash(record['title'])))[:8]
            obj['ENTRYTYPE'] = record["contentType"]
            obj["year"] = record["publicationDate"].split("-")[0]
            obj['url'] = record["url"][0]["value"]
            obj['author'] = self.build_creators(record["creators"])
            del record['url']
            del record['creators']
            # copy every remaining non-empty field verbatim
            for key, value in record.items():
                if value:
                    obj[key] = value
            if "keyword" not in record:
                obj["keywords"] = self.get_missing_field(obj['url'], "keywords")
            # the API prefixes every abstract with the literal word "Abstract";
            # fixed: records without an abstract no longer raise KeyError
            if 'abstract' in obj:
                obj['abstract'] = obj['abstract'][8:]
            self.cleaned_res.append(obj)

    def build_creators(self, creators):
        """Join creator names with ' and ' as bibtex expects."""
        return " and ".join(c["creator"] for c in creators)

    def get_missing_field(self, url, field):
        """Crawl the article landing page to recover a field the API omitted."""
        url = "https://link.springer.com/" + url.split("http://dx.doi.org/")[1]
        time.sleep(0.5)
        res = self.make_field_request(url)
        return self.parse_response(res, field)

    def make_field_request(self, url):
        """GET *url* and return the page as an lxml tree for xpath queries."""
        try:
            page = requests.get(url)
            return html.fromstring(page.content)
        except requests.exceptions.RequestException as e:
            print(e)
            sys.exit(1)

    def parse_response(self, response, field):
        """Extract *field* (only 'keywords' is supported) from a crawled page.

        Fixed: the old code indexed xpath(...)[0], which raised IndexError on
        pages without keywords — the 'Keywords not-found' sentinel checks were
        dead code left over from a scrapy-style extract_first — and the joined
        keyword string carried a leading ', '.
        """
        if field != "keywords":
            return None
        # old article layout
        keywords = response.xpath(
            '//div[@class="KeywordGroup"]/span[@class="Keyword"]/text()')
        if not keywords:
            # some entries use a newer HTML structure with other xpaths
            keywords = response.xpath(
                '//div[@id="Keywords"]/ul/li[@class="c-keywords__item"]/text()')
        if not keywords:
            return "Keywords not-found"
        return ", ".join(k.rstrip() for k in keywords)
class Main:
    """Entry point: parses the argument file, validates it and runs the query."""

    def __init__(self, args):
        """Store CLI arguments (minus the script name) and set output defaults."""
        # removes first argument (the script name) which is not needed
        args.pop(0)
        self.args = args
        self.argument_file = "args.json"
        # fixed: was `self.concept`, but every consumer reads `self.concepts`
        self.concepts = None
        self.field_sets = None
        self.operands = None
        # fixed typo: was "passend_entries.bib", which made the passed_name
        # override in set_settings (comparing "passed_entries.bib") unreachable
        self.passed_name = "passed_entries.bib"
        self.failed_name = "failed_entries.json"
        self.fileOps = FileClass()

    def parse_argument_file(self):
        """Replace the argument-file path with its parsed JSON content."""
        self.argument_file = self.fileOps.open_json(self.argument_file)

    def perform_argument_checks(self):
        """Validate the argument file and apply optional settings."""
        self.check_file_arguments()
        self.set_settings()
        self.check_operands()

    def check_file_arguments(self):
        """Ensure concepts and fields exist, then pull the optional sections."""
        if len(self.argument_file["concepts"]) == 0:
            print("No concepts specified! Please check arguments file!")
        elif len(self.argument_file["fields"]) == 0:
            print("No fields specified! Please check arguments file!")
        else:
            self.concepts = self.argument_file["concepts"]
            self.field_sets = self.argument_file["fields"]
            self.operands = self.check_for_dict_entry("operands",
                                                      self.argument_file)
            self.settings = self.check_for_dict_entry("settings",
                                                      self.argument_file)

    def check_operands(self):
        """Warn about unsupported comparison operators on integer operands."""
        supported = (">", "<", "!=", "<=", ">=")
        for op in self.operands:
            # only integer operands are checked; string operands support a
            # single operation, so the operator does not matter there
            if isinstance(op["value"], int) and op["operator"] not in supported:
                print("Operator is not supported! Check documentation for more information")
                print("Operator passed: " + op["operator"])

    def set_settings(self):
        """Apply output-file overrides unless the defaults were already changed."""
        if "failed_name" in self.settings and self.failed_name == "failed_entries.json":
            self.failed_name = self.settings["failed_name"]
        if "passed_name" in self.settings and self.passed_name == "passed_entries.bib":
            self.passed_name = self.settings["passed_name"]

    def check_for_dict_entry(self, key, obj):
        """Safe dict access: return obj[key], or an empty list if missing."""
        return obj[key] if key in obj else []

    def perform_query(self):
        """Run the full request -> dedupe -> filter pipeline."""
        self.request = Request(self.argument_file, self.operands,
                               self.passed_name, self.failed_name)
        self.request.initialize_request_classes()
        self.request.peform_checks_and_execute()  # (sic) Request's method name
        self.request.remove_duplicates()
        self.request.perform_filter()
class Filter:
    """Filters merged search results by concept matches and operand constraints."""

    # numeric comparison operators, built once instead of per entry
    _NUM_OPS = {
        ">": (lambda x, y: x > y),
        "<": (lambda x, y: x < y),
        ">=": (lambda x, y: x >= y),
        "<=": (lambda x, y: x <= y),
        "!=": (lambda x, y: x != y),
    }

    def __init__(self, results, concepts, fields, operands, passed_name,
                 failed_name):
        """Keep the filter inputs and prepare the pass/fail accumulators."""
        self.fileOps = FileClass()
        self.results = results
        self.concepts = concepts
        self.field_sets = fields
        self.operands = operands
        self.passed_entries = []
        self.failed_entries = []
        self.passed_name = passed_name
        self.failed_name = failed_name

    def iterate_entries(self):
        """Run every entry through concept and operand checks, then write results."""
        print("\n # Performing filters and argument checks #")
        for entry in self.results:
            res_concepts = self.iterate_arguments(entry)
            res_operands = self.execute_operands(entry)
            if not (self.eval_res(res_concepts) and self.eval_res(res_operands)):
                self.explain_rejection(res_concepts, res_operands, entry)
            else:
                self.passed_entries.append(entry)
        print(" -Total entries: " +
              str(len(self.passed_entries) + len(self.failed_entries)))
        print(" -Final passed entries: " + str(len(self.passed_entries)))
        print(" -Failed entries: " + str(len(self.failed_entries)))
        self.fileOps.write_res(self.passed_entries, self.passed_name,
                               self.failed_entries, self.failed_name)

    def iterate_arguments(self, bib_entry):
        """Return one bool per concept: True if any synonym appears in any field.

        Deeply nested, but still roughly O(n) over concepts x fields; the
        early breaks stop searching a concept once it matched.
        """
        concepts_checked = [False] * len(self.concepts)
        for i, concept_group in enumerate(self.concepts):
            for field_set in self.field_sets:
                if concepts_checked[i]:
                    break
                for field in field_set:
                    if concepts_checked[i]:
                        break
                    for concept_entry in concept_group:
                        if self.check_argument(field, concept_entry, bib_entry):
                            concepts_checked[i] = True
                            break
        return concepts_checked

    def check_argument(self, field, concept_entry, bib_entry):
        """Case-insensitive substring match of a concept term inside a field."""
        return (field in bib_entry
                and concept_entry.lower() in bib_entry[field].lower())

    def execute_operands(self, bib_entry):
        """Evaluate every operand constraint against one entry."""
        res = [False] * len(self.operands)
        for i, op in enumerate(self.operands):
            field = op['field']
            if field not in bib_entry:
                continue  # missing field counts as a failed check
            if op['operator'] == "==":
                # the single supported string operation: exact equality
                res[i] = op['value'] == bib_entry[field]
            else:
                # numeric comparison; NOTE(review): assumes the entry value
                # parses as int — confirm upstream cleaning guarantees this
                res[i] = self._NUM_OPS[op['operator']](int(bib_entry[field]),
                                                      op['value'])
        return res

    def eval_res(self, res):
        """True when every individual check passed (i.e. we have a match)."""
        return False not in res

    def explain_rejection(self, res_concepts, res_operands, bib_entry):
        """Record which concepts/operands failed for a rejected entry."""
        obj = {
            # fixed: entries without a DOI no longer raise KeyError
            'name': bib_entry.get("doi", bib_entry.get("ID", "unknown")),
            'failed_concepts': [c for ok, c in zip(res_concepts, self.concepts)
                                if ok is not True],
            'failed_operands': [o for ok, o in zip(res_operands, self.operands)
                                if ok is not True],
        }
        self.failed_entries.append(obj)

    def write_res(self):
        """Persist passed and failed entries via the file helper."""
        self.fileOps.write_res(self.passed_entries, self.passed_name,
                               self.failed_entries, self.failed_name)
class RequestIeee:
    """Queries the IEEE Xplore API and converts results for bibtex output."""

    def __init__(self, search_args, dict_file):
        """Set up the permutation builder, file helper and API key."""
        self.search_args = search_args
        self.source_name = "ieee"
        self.perm = Permutation(search_args, self.source_name, dict_file)
        self.file_handle = FileClass()
        self.dict_file_path = dict_file
        self.api_key = self.get_api_key()
        # raw API records and their cleaned, bibtex-ready counterparts
        self.req_res = []
        self.cleaned_res = []

    def get_api_key(self):
        """Read this source's API key from the dict file."""
        return self.file_handle.open_json(
            self.dict_file_path)["api_keys"][self.source_name]

    def get_permutations(self):
        """Build all query permutations and print a summary."""
        self.permutations = self.perm.create_query_object_templates()
        print("\n________________________")
        print("## IEEE: ##")
        print("________________________\n")
        print("# Permutations #")
        pprint(self.permutations)
        print("\n Found " + str(len(self.permutations)) + " permutations!")

    def iterate_permutation(self):
        """Build and fire one request per permutation."""
        print("\n# Requesting Data #")
        for perm in self.permutations:
            url = self.build_request_url(perm)
            self.make_request(url)

    def build_request_url(self, query_fields):
        """Build the IEEE querytext URL from a field -> term mapping."""
        terms = " AND ".join('("' + key + '":' + value + ')'
                             for key, value in query_fields.items())
        return ("http://ieeexploreapi.ieee.org/api/v1/search/articles"
                "?max_records=200&querytext=(" + terms + ")&apikey="
                + self.api_key)

    def make_request(self, url, offset=0):
        """Fetch pages of 200 records until total_records is covered.

        Rewritten iteratively: the previous recursive paging could hit the
        recursion limit on very large result sets.
        """
        while True:
            time.sleep(0.5)  # stay polite towards the API
            print(" Calling: " + url + "&start_record=" + str(offset))
            r = requests.get(url + "&start_record=" + str(offset)).json()
            self.req_res += r['articles']
            if r['total_records'] <= len(r['articles']) + offset:
                break
            offset += 200

    def clean_entries(self):
        """Convert raw articles in self.req_res into bibtex-ready dicts."""
        print("# Formating Data #")
        for record in self.req_res:
            obj = {}
            # ID = FirstAuthor_Year_<8-char hash of the title>.
            # NOTE(review): hash() is salted per interpreter run, so IDs are
            # not stable across runs — confirm that is acceptable.
            obj['ID'] = record['authors']['authors'][0]["full_name"].replace(
                ' ', '_') + "_" + str(record["publication_year"]) + "_" + str(
                    abs(hash(record['title'])))[:8]
            obj['ENTRYTYPE'] = record["content_type"].lower().replace(
                " & ", "").replace(" ", "")
            obj["year"] = str(record["publication_year"])
            obj['url'] = record["pdf_url"]
            obj['author'] = self.build_creators(record["authors"]["authors"])
            # NOTE(review): assumes index_terms.author_terms.terms is always
            # present in the response — verify against the API
            obj['keywords'] = ", ".join(
                record["index_terms"]["author_terms"]["terms"])
            del record['pdf_url']
            del record['authors']
            del record["index_terms"]
            del record["publication_year"]
            # copy every remaining non-empty field, stringified
            for key, value in record.items():
                if value:
                    obj[key] = str(value)
            self.cleaned_res.append(obj)

    def build_creators(self, creators):
        """Format IEEE author names as 'Last, First and Last, First' for bibtex.

        Fixed: previously the LAST author's name parts were not reversed,
        producing 'First, Last' while every other author got 'Last, First'.
        NOTE(review): the naive split still mishandles multi-word surnames
        such as "François de Chezelles".
        """
        formatted = []
        for author in creators:
            parts = list(reversed(author["full_name"].split()))
            formatted.append(", ".join(parts))
        return " and ".join(formatted)
class Permutation:
    """Builds every query permutation from concepts, fields and source synonyms."""

    def __init__(self, args, source_name, dict_file):
        """Store the search arguments and load this source's synonym table."""
        self.args = args
        self.perm_dict_abstract = self.create_dict()
        self.file_handle = FileClass()
        self.dict_file_path = dict_file
        self.source_name = source_name
        self.synonyms = self.file_handle.get_synonyms(self.source_name,
                                                      dict_file)

    def create_dict(self):
        """Template dict with one None entry per requested field."""
        return {key: None for keys in self.args["fields"] for key in keys}

    def create_query_object_templates(self):
        """Return one query dict per (field permutation x concept grouping)."""
        query_lists = self.build_fields_lists()
        concept_perms = self.create_concept_groupings()
        return self.build_queries_from_template(query_lists, concept_perms)

    @staticmethod
    def _cartesian(groups):
        """Cartesian product: every way of picking one item from each group.

        Shared by concept groupings and field lists (previously duplicated).
        Result order matches the original hand-rolled fold.
        """
        result = [[]]
        for group in groups:
            result = [prefix + [item] for item in group for prefix in result]
        return result

    def create_concept_groupings(self):
        """One concept list per synonym pick, e.g.
        [['smart contract','energy','trading'], ['blockchain','energy','trading']]
        when 'blockchain' and 'smart contract' are concept synonyms."""
        return self._cartesian(self.args["concepts"])

    def build_fields_lists(self):
        """All field combinations (synonyms expanded), each in every ordering,
        e.g. [[abstract,title,keywords],[abstract,booktitle,keywords]] when
        title and booktitle are synonyms."""
        fields = self.collect_fields_from_dict()
        # fixed: removed stray debug print of the intermediate product
        return self.build_fields_permutations(self._cartesian(fields))

    def build_fields_permutations(self, query_objects):
        """Flattened list of every ordering (tuple) of every field combination."""
        return [perm for element in query_objects
                for perm in permutations(element)]

    def collect_fields_from_dict(self):
        """Replace each requested field with its synonym list when one exists."""
        return [self.synonyms[field[0]] if field[0] in self.synonyms
                else [field[0]]
                for field in self.args["fields"]]

    def build_queries_from_template(self, query_lists, concept_perms):
        """Zip each field ordering with each concept grouping into query dicts."""
        return [dict(zip(perm, concept))
                for concept in concept_perms
                for perm in query_lists]
class Request:
    """Coordinates the per-source request classes and the result pipeline."""

    def __init__(self, args, operands, passed_name, failed_name):
        """Hold pipeline configuration plus accumulators for responses/results."""
        self.args = args
        self.operands = operands
        self.passed_name = passed_name
        self.failed_name = failed_name
        self.dict_file_path = "dict.json"
        self.file_handle = FileClass()
        self.data_responses = []
        self.response = Response()
        self.results = []

    def initialize_request_classes(self):
        """Instantiate one request helper per supported source."""
        self.springer = RequestSpringer(self.args, self.dict_file_path)
        self.science_direct = RequestScieneDirect(self.args,
                                                  self.dict_file_path)
        self.ieee = RequestIeee(self.args, self.dict_file_path)

    def peform_checks_and_execute(self):  # (sic) name kept — Main calls it
        """Validate fields per enabled source, then run its requests."""
        for source in ("springer", "science_direct", "ieee"):
            if self.args["sources"][source] and self.check_fields(source):
                self.run_requests(source)

    def remove_duplicates(self):
        """Deduplicate the merged responses via the Response helper."""
        self.response.set_responses(self.data_responses)
        self.response.remove_duplicates()
        self.results = self.response.get_reponses()  # (sic) Response API name

    def perform_filter(self):
        """Filter the deduplicated results and write the output files."""
        self.filter = Filter(self.results, self.args["concepts"],
                             self.args["fields"], self.operands,
                             self.passed_name, self.failed_name)
        self.filter.iterate_entries()
        self.filter.write_res()

    def check_fields(self, source_name):
        """Abort the program when a requested field is invalid for *source_name*."""
        fields = self.merge_synonyms(source_name)
        accepted, rejected = self.check_field_validity(fields, source_name)
        if accepted:
            return True
        print(rejected)
        print("unaccepted fields found in concepts!")
        sys.exit()  # removed unreachable `return False` that followed

    def run_requests(self, source_name):
        """Run the permutation/request/clean cycle for one source."""
        if source_name == "springer":
            self.springer.get_permutations()
            self.springer.iterate_permutation()
            self.springer.clean_entries()
            self.data_responses += self.springer.cleaned_res
        if source_name == "science_direct":
            self.science_direct.get_permutations()
            self.science_direct.iterate_permutation()
            # NOTE(review): ScienceDirect results are appended raw —
            # clean_entries is not implemented/called for this source yet
            self.data_responses += self.science_direct.req_res
        if source_name == "ieee":
            self.ieee.get_permutations()
            self.ieee.iterate_permutation()
            self.ieee.clean_entries()
            self.data_responses += self.ieee.cleaned_res

    def merge_synonyms(self, source_name):
        """Expand requested fields with source synonyms, e.g. title -> booktitle, title."""
        synonyms = self.file_handle.get_synonyms(source_name,
                                                 self.dict_file_path)
        included_keys = []
        for key in self.args["fields"]:
            if key[0] in synonyms:
                included_keys += synonyms[key[0]]
            else:
                included_keys.append(key[0])
        return included_keys

    def check_field_validity(self, fields, source_name):
        """Return (all_accepted, rejected_fields) against the source's whitelist."""
        accepted_fields = self.file_handle.open_json(
            self.dict_file_path)["accepted_fields"][source_name]
        rejected = [field for field in fields if field not in accepted_fields]
        return (not rejected, rejected)