def add_oa_links_in_references(text):
    """
    Add open-access links to the citation templates found in `text`.

    Every template for which `parse_citation_template` returns a truthy
    reference is looked up with `get_oa_link`. When a link is found, the
    first entry of `template_arg_mappings` whose `extract` accepts the
    link decides what happens: the template is left untouched if that
    parameter is already present, otherwise the parameter is added.

    :param text: wikitext to process
    :returns: a 3-tuple `(new_text, changed_templates, stats)` where
        `changed_templates` is a list of `(original_template, change)`
        pairs (`change` is None when no link was found) and `stats`
        is a dict of counters
    """
    parsed_page = mwparserfromhell.parse(text)
    stats = {
        'nb_templates': 0,     # total number of templates processed
        'oa_found': 0,         # hits from the API
        'changed': 0,          # actual changes on the templates
        'already_present': 0,  # no change because already present
    }
    changed_templates = []

    for template in parsed_page.filter_templates():
        # Snapshot taken before anything else touches the template, so
        # the caller gets the unmodified version back.
        snapshot = deepcopy(template)
        reference = parse_citation_template(template)
        if not reference:
            continue

        stats['nb_templates'] += 1
        link = get_oa_link(reference)
        if not link:
            changed_templates.append((snapshot, None))
            continue

        # We found an OA link!
        stats['oa_found'] += 1
        change = {}
        # Set once a mapping accepts the link; not read again in this
        # function.
        link_matched = False
        for argmap in template_arg_mappings:
            # Did the link we have got match that argument place?
            match = argmap.extract(link)
            if not match:
                continue
            link_matched = True

            if argmap.present(template):
                # Parameter already present: record the candidate value
                # but do not modify the template.
                change['new_' + argmap.name] = (match, link)
                stats['already_present'] += 1
            elif argmap.is_id:
                # Identifier-style parameters are wrapped in a template
                # and stored under `id`.
                stats['changed'] += 1
                val = '{{%s|%s}}' % (argmap.name, match)
                template.add('id', val)
                change['id'] = (val, link)
            else:
                stats['changed'] += 1
                template.add(argmap.name, match)
                change[argmap.name] = (match, link)
            # Only the first matching mapping is considered.
            break

        changed_templates.append((snapshot, change))

    return unicode(parsed_page), changed_templates, stats
def get_parsed_citations(content):
    """
    Collect the citation templates found in `content`.

    :param content: wikitext to scan
    :returns: list of `(citation, type_of_citation)` tuples, one per
        template for which `parse_citation_template` returned a truthy
        value, in the order the templates were found
    """
    results = []
    for tpl in mwparserfromhell.parse(content).filter_templates():
        citation = parse_citation_template(tpl)
        if not citation:
            continue
        # Lowercased text before the first '|' with its first two
        # characters dropped (presumably the opening '{{').
        leading_part = tpl.split('|')[0]
        results.append((citation, leading_part.lower()[2:]))
    return results
def get_generic_template(citation):
    """
    Get generic template of a citation using the wikiciteparser library.

    :param: citation according to a particular format as described in const.py
    :returns: the result of `parse_citation_template` for the first
        template found in `citation`, or a placeholder dict with a single
        'Title' key when no template is found or the parser returns None
    """
    fallback = {'Title': 'Citation generic template not possible'}

    # `check_if_balanced` is defined elsewhere; when it reports a problem
    # a closing '}}' is appended before parsing.
    if check_if_balanced(citation):
        source = citation
    else:
        source = citation + '}}'

    # Convert the str into mwparser object and keep only its templates
    found = mwparserfromhell.parse(source).filter_templates()
    if not found:
        return fallback

    parsed = parse_citation_template(found[0])
    # In case the parser is not able to handle the citation template
    if parsed is None:
        return fallback
    return parsed
def propose_change(self):
    """
    Fetches open urls for that template and proposes a change.

    Nothing is returned; the outcome is written to attributes of `self`:
    `classification` ('ignored', 'already_open',
    'registration_subscription', 'not_found', 'already_present' or
    'link_added'), and depending on the path `conflicting_value`,
    `proposed_link`, `proposed_link_policy`, `issn` and
    `proposed_change`.
    """
    reference = parse_citation_template(self.template)
    tpl_name = unicode(self.template.name).lower().strip()
    if not reference or tpl_name in excluded_templates:
        self.classification = 'ignored'
        return

    # Progress marker on stdout
    sys.stdout.write('.')
    sys.stdout.flush()

    # First check if there is already a link to a full text in the
    # citation. There is no break: the last mapping that is present and
    # free is the one remembered.
    open_param = None
    open_value = None
    for argmap in template_arg_mappings:
        if not argmap.present_and_free(self.template):
            continue
        open_param = argmap.name
        open_value = argmap.get(self.template)

    # Filled in below; not read again inside this method.
    change = {}

    # If so, we just skip it - no need for more free links
    if open_param:
        self.classification = 'already_open'
        self.conflicting_value = open_value
        return

    # --- Disabled for now ----
    # If the template is marked with |registration= or |subscription= ,
    # the classification is set, but processing deliberately continues
    # (the early return is switched off).
    access_flag = get_value(self.template, 'subscription')
    if not access_flag:
        access_flag = get_value(self.template, 'registration')
    if access_flag in ('yes', 'y', 'true'):
        self.classification = 'registration_subscription'

    dissemin_paper_object = get_dissemin_paper(reference)

    # Otherwise, try to get a free link
    link = get_oa_link(dissemin_paper_object)
    if not link:
        self.classification = 'not_found'
        return

    # We found an OA link!
    self.proposed_link = link
    self.proposed_link_policy = get_paper_values(dissemin_paper_object, 'policy')
    self.issn = get_paper_values(dissemin_paper_object, 'issn')

    # Try to match it with an argument; only the first mapping that
    # accepts the link is used.
    link_matched = False  # not read again inside this method
    for argmap in template_arg_mappings:
        match = argmap.extract(link)
        if match:
            link_matched = True
            if argmap.get(self.template):
                # Parameter already has a value: don't change anything
                change['new_'+argmap.name] = (match,link)
                self.classification = 'already_present'
            else:
                # The parameter is not present yet: propose adding it
                self.classification = 'link_added'
                if argmap.is_id:
                    self.proposed_change = 'id={{%s|%s}}' % (argmap.name,match)
                else:
                    self.proposed_change = '%s=%s' % (argmap.name,match)
            break
def propose_change(self):
    """
    Fetches open urls for that template and proposes a change.

    The method returns None in every case. Its result is stored on the
    instance: `classification` is always assigned on the paths that
    return early, and `conflicting_value`, `proposed_link`,
    `proposed_link_policy`, `issn` and `proposed_change` are assigned
    on the paths that reach them.
    """
    reference = parse_citation_template(self.template)
    tpl_name = unicode(self.template.name).lower().strip()
    skip = not reference or tpl_name in excluded_templates
    if skip:
        self.classification = 'ignored'
        return

    # Emit one dot per processed template
    sys.stdout.write('.')
    sys.stdout.flush()

    # Look for a parameter that already holds a free full text. Every
    # mapping is inspected, so a later hit overrides an earlier one.
    free_param, free_value = None, None
    for argmap in template_arg_mappings:
        if argmap.present_and_free(self.template):
            free_param = argmap.name
            free_value = argmap.get(self.template)

    # Bookkeeping dict; this method only writes to it.
    change = {}

    # An open link is already there - no need for more free links
    if free_param:
        self.classification = 'already_open'
        self.conflicting_value = free_value
        return

    # --- Disabled for now ----
    # Templates marked with |registration= or |subscription= only get
    # their classification set here; the early return is commented out
    # in the original, so the lookup below still runs.
    marker = get_value(self.template, 'subscription')
    if not marker:
        marker = get_value(self.template, 'registration')
    if marker in ['yes', 'y', 'true']:
        self.classification = 'registration_subscription'

    dissemin_paper_object = get_dissemin_paper(reference)

    # Otherwise, try to get a free link
    link = get_oa_link(dissemin_paper_object)
    if not link:
        self.classification = 'not_found'
        return

    # We found an OA link!
    self.proposed_link = link
    self.proposed_link_policy = get_paper_values(dissemin_paper_object,
                                                 'policy')
    self.issn = get_paper_values(dissemin_paper_object, 'issn')

    # Try to match the link with a template argument
    found_slot = False  # write-only flag within this method
    for argmap in template_arg_mappings:
        # Did the link we have got match that argument place?
        match = argmap.extract(link)
        if not match:
            continue
        found_slot = True

        if argmap.get(self.template):
            # The parameter already has a value: don't change anything
            change['new_' + argmap.name] = (match, link)
            self.classification = 'already_present'
        elif argmap.is_id:
            self.classification = 'link_added'
            self.proposed_change = 'id={{%s|%s}}' % (argmap.name, match)
        else:
            self.classification = 'link_added'
            self.proposed_change = '%s=%s' % (argmap.name, match)
        # Stop at the first mapping that accepted the link
        break
def propose_change(self, only_doi=False):
    """
    Fetches open urls for that template and proposes a change.

    :param only_doi: when true, `get_dissemin_paper` is not called (an
        empty dict is used instead) and the flag is forwarded to
        `get_oa_link` as `only_unpaywall`

    Returns None; the outcome is written to `self.classification` and,
    depending on the path, to `conflicting_value`, `proposed_change`,
    `proposed_link`, `proposed_link_policy` and `issn`.

    NOTE(review): when a link is found but no mapping accepts it,
    `self.proposed_change` is read without having been assigned here,
    so it is presumably initialised to a string elsewhere - confirm.
    """
    reference = parse_citation_template(self.template)
    tpl_name = unicode(self.template.name).lower().strip()
    if not reference or tpl_name in excluded_templates:
        self.classification = 'ignored'
        return

    # Progress marker on stdout
    sys.stdout.write('.')
    sys.stdout.flush()

    # First check if there is already a link to a full text in the
    # citation. All mappings are visited; the last hit is kept.
    open_param = None
    open_value = None
    for argmap in template_arg_mappings:
        if not argmap.present_and_free(self.template):
            continue
        open_param = argmap.name
        open_value = argmap.get(self.template)

    # Filled in below; not read again inside this method.
    change = {}

    # If so, we just skip it - no need for more free links
    if open_param:
        self.classification = 'already_open'
        self.conflicting_value = open_value
        return

    # --- Disabled for now ----
    # If the template is marked with |registration= or |subscription= ,
    # only the classification is set; the early return is switched off.
    access_flag = get_value(self.template, 'subscription')
    if not access_flag:
        access_flag = get_value(self.template, 'registration')
    if access_flag in ('yes', 'y', 'true'):
        self.classification = 'registration_subscription'

    dissemin_paper_object = {} if only_doi else get_dissemin_paper(reference)

    # Otherwise, try to get a free link
    doi = reference.get('ID_list', {}).get('DOI')
    link = get_oa_link(paper=dissemin_paper_object, doi=doi,
                       only_unpaywall=only_doi)

    if link is False:
        # A literal False (as opposed to another falsy value) is treated
        # as "already open"; with a DOI the DOI itself is proposed.
        self.classification = 'already_open'
        if doi:
            self.proposed_change = "doi-access=free"
            self.proposed_link = "https://doi.org/{}".format(doi)
        # TODO add the DOI suggested by Dissemin if missing. Needs some
        # checks.
        return

    if not link:
        self.classification = 'not_found'
        return

    # We found an OA link!
    self.proposed_link = link
    # Default outcome; overridden below if the parameter already exists
    self.classification = 'link_added'

    if dissemin_paper_object:
        self.proposed_link_policy = get_paper_values(
            dissemin_paper_object, 'policy')
        # TODO: fetch from Unpaywall?
        self.issn = get_paper_values(dissemin_paper_object, 'issn')

    # Try to match the link with an argument; only the first mapping
    # that accepts the link is used.
    link_matched = False  # not read again inside this method
    for argmap in template_arg_mappings:
        match = argmap.extract(link)
        if not match:
            continue
        link_matched = True

        if argmap.get(self.template):
            # This parameter is already present in the template
            change['new_' + argmap.name] = (match, link)
            self.classification = 'already_present'
            if argmap.name == 'hdl':
                self.proposed_change = "hdl-access=free"
            # don't change anything else
            return

        if argmap.is_id:
            self.proposed_change = 'id={{%s|%s}}' % (argmap.name, match)
        else:
            self.proposed_change = '%s=%s' % (argmap.name, match)
            if argmap.name == 'hdl':
                self.proposed_change += "|hdl-access=free"
        break

    # If we are going to add an URL, check it's not probably redundant
    if not self.proposed_change.startswith('url='):
        return
    hdl = get_value(self.template, 'hdl')
    if hdl and hdl in self.proposed_change:
        # Don't actually add the URL but mark the hdl as seemingly OA
        # and hope that the templates will later linkify it
        self.proposed_change = "hdl-access=free"