def perform_candidate_record_search(requestType, data):
    """Handle search requests."""
    max_results = 999
    too_many = False
    result = {'resultCode': 0,
              'resultText': ''}
    if requestType == "searchCandidates":
        recids = perform_request_search(p=data['query'])
        if len(recids) > max_results:
            too_many = True
        else:
            captions = [search_result_info(x) for x in recids]
            alternative_titles = [remove_html_markup(print_record(x, "hs"))
                                  for x in recids]
            search_results = [recids, captions, alternative_titles]
    elif requestType == "searchRevisions":
        revisions = get_record_revision_ids(data['recID1'])
        captions = [split_revid(x, 'datetext')[1] for x in revisions]
        search_results = [revisions, captions]
    if too_many:
        result['resultCode'] = 1
        result['resultText'] = 'Too many results'
    else:
        result['results'] = search_results
        result['resultText'] = '%s results' % len(search_results[0])
    return result
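# A minimal standalone sketch (not part of the handler above) of the same
# response envelope: resultCode 0 with a hit list on success, resultCode 1 and
# 'Too many results' once the candidate list exceeds max_results. fake_search
# below is a hypothetical stand-in for perform_request_search.
def fake_search(query):
    return [1, 2, 3, 4, 5]

def candidate_search_sketch(query, max_results=999):
    result = {'resultCode': 0, 'resultText': ''}
    recids = fake_search(query)
    if len(recids) > max_results:
        result['resultCode'] = 1
        result['resultText'] = 'Too many results'
    else:
        result['results'] = [recids]
        result['resultText'] = '%s results' % len(recids)
    return result

# candidate_search_sketch('title:higgs')['resultText'] -> '5 results'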
def get_as_text(record_id=0, xml_record=None, ln=CFG_SITE_LANG):
    """Return the record in a textual format"""
    _ = gettext_set_language(ln)
    out = ""
    if record_id != 0:
        rec_in_hb = format_record(record_id, of="hb")
    elif xml_record:
        rec_in_hb = format_record(0, of="hb", xml_record=xml_record)
    rec_in_hb = rec_in_hb.replace("\n", " ")
    htparser = RecordHTMLParser()
    try:
        htparser.feed(rec_in_hb)
        htparser.close()
        out = htparser.result
    except:
        out = remove_html_markup(rec_in_hb)
    # Remove trailing whitespace and linefeeds
    out = out.strip("\n").strip()
    # Merge consecutive whitespaces. Must be done here, once all HTML
    # tags have been removed
    out = whitespaces_pattern.sub(" ", out)
    # Now consider non-breakable spaces
    out = out.replace("&nbsp;", " ")
    out = re.sub(r"[\-:]?\s*%s\s*[\-:]?" % _("Detailed record"), "", out)
    out = re.sub(r"[\-:]?\s*%s\s*[\-:]?" % _("Similar records"), "", out)
    out = re.sub(r"[\-:]?\s*%s\s*[\-:]?" % _("Cited by"), "", out)
    return out.strip()
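# A self-contained sketch (standard library only) of the whitespace cleanup
# applied above once the markup is gone; whitespaces_pattern is assumed to be
# a precompiled r"\s+"-style regex, which is what this stands in for.
import re

def sketch_collapse_whitespace(text):
    text = re.sub(r"\s+", " ", text)  # merge consecutive whitespace
    return text.strip()

# sketch_collapse_whitespace("  A   record \n title  ") -> 'A record title'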
def tokenize_for_words(self, phrase):
    """Return list of words found in PHRASE.  Note that the phrase is
       split into groups depending on the alphanumeric characters and
       punctuation characters definition present in the config file.
    """
    words = {}
    formulas = []
    if self.remove_html_markup and phrase.find("</") > -1:
        phrase = remove_html_markup(phrase)
    if self.remove_latex_markup:
        formulas = latex_formula_re.findall(phrase)
        phrase = remove_latex_markup(phrase)
        phrase = latex_formula_re.sub(' ', phrase)
    phrase = wash_for_utf8(phrase)
    phrase = lower_index_term(phrase)
    # 1st split phrase into blocks according to whitespace
    for block in strip_accents(phrase).split():
        # 2nd remove leading/trailing punctuation and add block:
        block = re_block_punctuation_begin.sub("", block)
        block = re_block_punctuation_end.sub("", block)
        if block:
            stemmed_block = remove_stopwords(block, self.remove_stopwords)
            stemmed_block = length_check(stemmed_block)
            stemmed_block = apply_stemming(stemmed_block, self.stemming_language)
            if stemmed_block:
                words[stemmed_block] = 1
            if re_arxiv.match(block):
                # special case for blocks like `arXiv:1007.5048' where
                # we would like to index the part after the colon
                # regardless of dot or other punctuation characters:
                words[block.split(':', 1)[1]] = 1
            # 3rd break each block into subblocks according to punctuation and add subblocks:
            for subblock in re_punctuation.split(block):
                stemmed_subblock = remove_stopwords(subblock, self.remove_stopwords)
                stemmed_subblock = length_check(stemmed_subblock)
                stemmed_subblock = apply_stemming(stemmed_subblock, self.stemming_language)
                if stemmed_subblock:
                    words[stemmed_subblock] = 1
                # 4th break each subblock into alphanumeric groups and add groups:
                for alphanumeric_group in re_separators.split(subblock):
                    stemmed_alphanumeric_group = remove_stopwords(alphanumeric_group, self.remove_stopwords)
                    stemmed_alphanumeric_group = length_check(stemmed_alphanumeric_group)
                    stemmed_alphanumeric_group = apply_stemming(stemmed_alphanumeric_group, self.stemming_language)
                    if stemmed_alphanumeric_group:
                        words[stemmed_alphanumeric_group] = 1
    for block in formulas:
        words[block] = 1
    return words.keys()
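# A simplified, self-contained sketch of the same multi-granularity indexing
# idea (whitespace blocks, then punctuation subblocks, then alphanumeric
# groups). The regexes here are plain stand-ins for the config-driven
# re_punctuation / re_separators used above, not the actual Invenio
# definitions, and stopword removal and stemming are omitted.
import re

def sketch_tokenize_for_words(phrase):
    words = {}
    for block in phrase.lower().split():
        block = block.strip('.,;:!?')
        if not block:
            continue
        words[block] = 1
        for subblock in re.split(r"[.,;:!?'\"-]+", block):
            if subblock:
                words[subblock] = 1
                for group in re.split(r"[^0-9a-z]+", subblock):
                    if group:
                        words[group] = 1
    return sorted(words.keys())

# sketch_tokenize_for_words("anti-de Sitter space") ->
#   ['anti', 'anti-de', 'de', 'sitter', 'space']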
def tokenize_for_words(self, phrase, recid):
    """Return list of words found in PHRASE.  Note that the phrase is
       split into groups depending on the alphanumeric characters and
       punctuation characters definition present in the config file.
    """
    if not self.isAuthority(recid):
        return []
    words = {}
    formulas = []
    if self.remove_html_markup and phrase.find("</") > -1:
        phrase = remove_html_markup(phrase)
    if self.remove_latex_markup:
        formulas = latex_formula_re.findall(phrase)
        phrase = remove_latex_markup(phrase)
        phrase = latex_formula_re.sub(" ", phrase)
    phrase = wash_for_utf8(phrase)
    phrase = lower_index_term(phrase)
    # 1st split phrase into blocks according to whitespace
    for block in strip_accents(phrase).split():
        # 2nd remove leading/trailing punctuation and add block:
        block = re_block_punctuation_begin.sub("", block)
        block = re_block_punctuation_end.sub("", block)
        if block:
            stemmed_block = remove_stopwords(block, self.remove_stopwords)
            stemmed_block = length_check(stemmed_block)
            stemmed_block = apply_stemming(stemmed_block, self.stemming_language)
            if stemmed_block:
                words[stemmed_block] = 1
            if re_arxiv.match(block):
                # special case for blocks like `arXiv:1007.5048' where
                # we would like to index the part after the colon
                # regardless of dot or other punctuation characters:
                words[block.split(":", 1)[1]] = 1
            # 3rd break each block into subblocks according to punctuation and add subblocks:
            for subblock in re_punctuation.split(block):
                stemmed_subblock = remove_stopwords(subblock, self.remove_stopwords)
                stemmed_subblock = length_check(stemmed_subblock)
                stemmed_subblock = apply_stemming(stemmed_subblock, self.stemming_language)
                if stemmed_subblock:
                    words[stemmed_subblock] = 1
                # 4th break each subblock into alphanumeric groups and add groups:
                for alphanumeric_group in re_separators.split(subblock):
                    stemmed_alphanumeric_group = remove_stopwords(alphanumeric_group, self.remove_stopwords)
                    stemmed_alphanumeric_group = length_check(stemmed_alphanumeric_group)
                    stemmed_alphanumeric_group = apply_stemming(stemmed_alphanumeric_group, self.stemming_language)
                    if stemmed_alphanumeric_group:
                        words[stemmed_alphanumeric_group] = 1
    for block in formulas:
        words[block] = 1
    return words.keys()
def main():
    for journal in CFG_JOURNALS:
        name = get_coll_i18nname(journal)
        reclist = get_collection_reclist(journal)
        print "<h2>%s</h2>" % escape(name)
        if not reclist:
            print "<p>None yet.</p>"
            continue
        print "<p><ul>"
        for recid in reclist:
            record = get_record(recid)
            title = remove_html_markup(record_get_field_value(record, '245', code='a'),
                                       remove_escaped_chars_p=False).strip()
            doi = record_get_field_value(record, '024', '7', code='a')
            print '<li><a href="http://dx.doi.org/%s" target="_blank">%s</a>: %s</li>' % (
                escape(doi, True), escape(doi), title)
        print "</ul></p>"
def tokenize_for_pairs(self, phrase):
    """Return list of words found in PHRASE.  Note that the phrase is
       split into groups depending on the alphanumeric characters and
       punctuation characters definition present in the config file.
    """
    words = {}
    if self.remove_html_markup and phrase.find("</") > -1:
        phrase = remove_html_markup(phrase)
    if self.remove_latex_markup:
        phrase = remove_latex_markup(phrase)
        phrase = latex_formula_re.sub(' ', phrase)
    phrase = wash_for_utf8(phrase)
    phrase = lower_index_term(phrase)
    # 1st split phrase into blocks according to whitespace
    last_word = ''
    for block in strip_accents(phrase).split():
        # 2nd remove leading/trailing punctuation and add block:
        block = re_block_punctuation_begin.sub("", block)
        block = re_block_punctuation_end.sub("", block)
        if block:
            block = remove_stopwords(block, self.remove_stopwords)
            block = length_check(block)
            block = apply_stemming(block, self.stemming_language)
            # 3rd break each block into subblocks according to punctuation and add subblocks:
            for subblock in re_punctuation.split(block):
                subblock = remove_stopwords(subblock, self.remove_stopwords)
                subblock = length_check(subblock)
                subblock = apply_stemming(subblock, self.stemming_language)
                if subblock:
                    # 4th break each subblock into alphanumeric groups and add groups:
                    for alphanumeric_group in re_separators.split(subblock):
                        alphanumeric_group = remove_stopwords(alphanumeric_group, self.remove_stopwords)
                        alphanumeric_group = length_check(alphanumeric_group)
                        alphanumeric_group = apply_stemming(alphanumeric_group, self.stemming_language)
                        if alphanumeric_group:
                            if last_word:
                                words['%s %s' % (last_word, alphanumeric_group)] = 1
                            last_word = alphanumeric_group
    return words.keys()
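# A simplified standalone sketch of the pair-indexing idea used above: after
# normalization, consecutive final-level tokens are joined into "w1 w2" keys.
# Tokenization here is plain whitespace splitting, not the config-driven
# regexes of the real tokenizer.
def sketch_tokenize_for_pairs(phrase):
    pairs = {}
    last_word = ''
    for word in phrase.lower().split():
        if last_word:
            pairs['%s %s' % (last_word, word)] = 1
        last_word = word
    return sorted(pairs.keys())

# sketch_tokenize_for_pairs("higgs boson decay") ->
#   ['boson decay', 'higgs boson']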
def tokenize_for_pairs(self, phrase, recid):
    """Return list of words found in PHRASE.  Note that the phrase is
       split into groups depending on the alphanumeric characters and
       punctuation characters definition present in the config file.
    """
    if not self.isAuthority(recid):
        return []
    words = {}
    if self.remove_html_markup and phrase.find("</") > -1:
        phrase = remove_html_markup(phrase)
    if self.remove_latex_markup:
        phrase = remove_latex_markup(phrase)
        phrase = latex_formula_re.sub(" ", phrase)
    phrase = wash_for_utf8(phrase)
    phrase = lower_index_term(phrase)
    # 1st split phrase into blocks according to whitespace
    last_word = ""
    for block in strip_accents(phrase).split():
        # 2nd remove leading/trailing punctuation and add block:
        block = re_block_punctuation_begin.sub("", block)
        block = re_block_punctuation_end.sub("", block)
        if block:
            block = remove_stopwords(block, self.remove_stopwords)
            block = length_check(block)
            block = apply_stemming(block, self.stemming_language)
            # 3rd break each block into subblocks according to punctuation and add subblocks:
            for subblock in re_punctuation.split(block):
                subblock = remove_stopwords(subblock, self.remove_stopwords)
                subblock = length_check(subblock)
                subblock = apply_stemming(subblock, self.stemming_language)
                if subblock:
                    # 4th break each subblock into alphanumeric groups and add groups:
                    for alphanumeric_group in re_separators.split(subblock):
                        alphanumeric_group = remove_stopwords(alphanumeric_group, self.remove_stopwords)
                        alphanumeric_group = length_check(alphanumeric_group)
                        alphanumeric_group = apply_stemming(alphanumeric_group, self.stemming_language)
                        if alphanumeric_group:
                            if last_word:
                                words["%s %s" % (last_word, alphanumeric_group)] = 1
                            last_word = alphanumeric_group
    return words.keys()
def test_remove_html_markup_replacement(self):
    """htmlutils - remove HTML markup, some replacement"""
    test_input = 'This is <a href="test">test</a>.'
    test_expected = 'This is XtestX.'
    self.assertEqual(remove_html_markup(test_input, 'X'), test_expected)
def test_remove_html_markup_empty(self):
    """htmlutils - remove HTML markup, empty replacement"""
    test_input = 'This is <a href="test">test</a>.'
    test_expected = 'This is test.'
    self.assertEqual(remove_html_markup(test_input, ''), test_expected)
def _get_feature_text(record, language):
    """
    Looks for a text (header) that can be featured on the article overview
    page.
    """
    washer = HTMLWasher()
    header_text = ""
    # Check if there is a header
    if language == "fr":
        header = record.field('590__a')
        if header.strip() in \
               ['', '<br/>', '<!--HTML--><br />', '<!--HTML-->']:
            header = record.field('520__a')
    else:
        header = record.field('520__a')
        if header.strip() in \
               ['', '<br/>', '<!--HTML--><br />', '<!--HTML-->']:
            header = record.field('590__a')
    header = washer.wash(html_buffer=header,
                         allowed_tag_whitelist=[],
                         allowed_attribute_whitelist=[])
    if header != "":
        header_text = header
    else:
        if language == "fr":
            article = record.fields('590__b')
            if not article or \
                   (len(article) == 1 and article[0].strip() in \
                        ['', '<br />', '<!--HTML--><br />', '<!--HTML-->']):
                article = record.fields('520__b')
        else:
            article = record.fields('520__b')
            if not article or \
                   (len(article) == 1 and article[0].strip() in \
                        ['', '<br />', '<!--HTML--><br />', '<!--HTML-->']):
                article = record.fields('590__b')
        try:
            article = article[0]
        except:
            return ''
        match_obj = re.search(header_pattern, article)
        if not match_obj:
            match_obj = re.search(header_pattern2, article)
        try:
            header_text = match_obj.group("header")
            header_text = washer.wash(html_buffer=header_text,
                                      allowed_tag_whitelist=['a'],
                                      allowed_attribute_whitelist=['href',
                                                                   'target',
                                                                   'class'])
            if header_text == "":
                raise Exception
        except:
            article = article.replace(header_text, '')
            article = article.replace('<p/>', '')
            article = article.replace('<p>&nbsp;</p>', '')
            match_obj = re.search(para_pattern, article)
            try:
                # get the first paragraph
                header_text = match_obj.group("paragraph")
                try:
                    header_text = washer.wash(html_buffer=header_text,
                                              allowed_tag_whitelist=[],
                                              allowed_attribute_whitelist=[])
                except:
                    # was not able to parse correctly the HTML. Use
                    # this safer function, but producing less good
                    # results
                    header_text = remove_html_markup(header_text)
                if header_text.strip() == "":
                    raise Exception
                else:
                    if len(header_text) > 250:
                        header_text = _get_first_sentence_or_part(header_text)
            except:
                # in a last instance get the first sentence
                try:
                    article = washer.wash(article,
                                          allowed_tag_whitelist=[],
                                          allowed_attribute_whitelist=[])
                except:
                    # was not able to parse correctly the HTML. Use
                    # this safer function, but producing less good
                    # results
                    article = remove_html_markup(article)
                header_text = _get_first_sentence_or_part(article)
    return header_text
def add_basic_fields(rec, form, email):
    """
    Adds the basic fields from the form. Note that these fields are mapped
    to specific MARC fields. For information on the fields see the
    www.loc.gov website. For example
    http://www.loc.gov/marc/bibliographic/bd260.html contains information
    on field 260 for publication data.
    """
    # why aren't subfields a dictionary?!
    try:
        if form['title']:
            record_add_field(rec, '245',
                             subfields=[('a', remove_html_markup(form['title']))])
        if form['creator']:
            record_add_field(rec, '100',
                             subfields=[('a', remove_html_markup(form['creator']))])
        if form['domain']:
            record_add_field(rec, '980',
                             subfields=[('a', remove_html_markup(form['domain']))])
        pubfields = []
        if form['publisher']:
            pubfields.append(('b', remove_html_markup(form['publisher'])))
        if form.get('publication_date'):
            pubfields.append(('c', remove_html_markup(form['publication_date'])))
        if pubfields:
            record_add_field(rec, '260', subfields=pubfields)
        record_add_field(rec, '856', ind1='0', subfields=[('f', email)])
        if 'open_access' in form:
            record_add_field(rec, '542', subfields=[('l', 'open')])
        else:
            record_add_field(rec, '542', subfields=[('l', 'restricted')])
        if form['licence']:
            record_add_field(rec, '540',
                             subfields=[('a', remove_html_markup(form['licence']))])
        record_add_field(rec, '520',
                         subfields=[('a', remove_html_markup(form['description']))])
        if form['tags']:
            for kw in form['tags'].split(','):
                record_add_field(rec, '653', ind1='1',
                                 subfields=[('a', remove_html_markup(kw.strip()))])
        if form['contributors']:
            for kw in form['contributors'].split(';'):
                record_add_field(rec, '700',
                                 subfields=[('a', remove_html_markup(kw.strip()))])
        if form['language']:
            record_add_field(rec, '546',
                             subfields=[('a', remove_html_markup(form['language']))])
        # copying zenodo here, but I don't think 980 is the right MARC field
        if form['resource_type']:
            record_add_field(rec, '980',
                             subfields=[('a', remove_html_markup(form['resource_type']))])
        if form['alternate_identifier']:
            record_add_field(rec, '024',
                             subfields=[('a', remove_html_markup(form['alternate_identifier']))])
        if form['version']:
            record_add_field(rec, '250',
                             subfields=[('a', remove_html_markup(form['version']))])
        record_add_field(rec, '264',
                         subfields=[('b', CFG_SITE_NAME),
                                    ('c', str(datetime.utcnow()) + " UTC")])
    except Exception as e:
        current_app.logger.error(e)
        raise
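# Hedged illustration of the form-to-MARC mapping applied above, shown on a
# plain list of (tag, subfields) tuples so it runs without the Invenio record
# helpers. sketch_basic_fields and its inputs are hypothetical; only a few of
# the form keys are included, with tags mirroring the function above.
from datetime import datetime

def sketch_basic_fields(form, email):
    rec = []
    if form.get('title'):
        rec.append(('245', [('a', form['title'])]))
    if form.get('creator'):
        rec.append(('100', [('a', form['creator'])]))
    rec.append(('856', [('f', email)]))
    rec.append(('542', [('l', 'open' if 'open_access' in form else 'restricted')]))
    rec.append(('264', [('c', str(datetime.utcnow()) + " UTC")]))
    return rec

# sketch_basic_fields({'title': 'My dataset', 'creator': 'Doe, J.',
#                      'open_access': True}, 'user@example.org')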