def _get_doc_by_title(self, aliases_dict):
    """Look up a Mendeley catalog document matching the first biblio entry.

    Searches the Mendeley catalog by (punctuation-stripped, lowercased)
    title, restricted to the biblio's year; on UnicodeEncodeError or an
    empty result it retries once with non-ASCII characters stripped from
    the title.  The hit is discarded unless its normalized title equals
    the biblio title.  Returns the matched document or None.
    """
    doc = None
    try:
        biblio = aliases_dict["biblio"][0]
        biblio_title = remove_punctuation(biblio["title"]).lower()
        biblio_year = str(biblio["year"])
        if biblio_title and biblio_year:
            try:
                doc = self.session.catalog.advanced_search(
                    title=biblio_title,
                    min_year=biblio_year,
                    max_year=biblio_year,
                    view='stats').list(page_size=1).items[0]
            except (UnicodeEncodeError, IndexError):
                # Retry once with the title squashed to ASCII.
                # NOTE(review): this retry title is not lowercased like the
                # first attempt was — confirm whether that is intentional.
                biblio_title = remove_punctuation(biblio["title"].encode('ascii','ignore'))
                try:
                    doc = self.session.catalog.advanced_search(
                        title=biblio_title,
                        min_year=biblio_year,
                        max_year=biblio_year,
                        view='stats').list(page_size=1).items[0]
                except (IndexError):
                    return None
            # Reject the hit unless the normalized titles match exactly.
            mendeley_title = remove_punctuation(doc.title).lower()
            if biblio_title != mendeley_title:
                logger.debug(u"Mendeley: titles don't match so not using this match /biblio_print %s and %s" %(
                    biblio_title, mendeley_title))
                doc = None
    except (KeyError, MendeleyException):
        # No "biblio" key present, or the Mendeley API errored:
        # best-effort lookup, so fall through and return None.
        # logger.info(u"No biblio found in _get_doc_by_title")
        pass
    return doc
def _find_product_page(self, use_organic=True):
    '''Locate the Product Page on the Company's website.

    Builds a search query from the SESE variety name (optionally with
    "organic" and the category appended) and runs it through the Site's
    search.  Sites that redirect a single-result search straight to the
    result's detail page can declare :data:`SEARCH_REDIRECTED_TEXT`, in
    which case that page is returned as-is.  If an organic variety finds
    no match, the search is retried once without the "organic" qualifier.

    :param use_organic: Whether or not to check for a non-organic
                        version of the product.
    :type use_organic: bool
    :returns: The Product Page's HTML or :obj:`None`
    :rtype: :obj:`str`
    '''
    query = remove_punctuation(self.sese_name)
    if use_organic and self.sese_organic:
        query += " organic"
    if self.INCLUDE_CATEGORY_IN_SEARCH:
        query += ' ' + remove_punctuation(self.sese_category)
    results_html = self._search_site(query)
    redirect_marker = self.SEARCH_REDIRECTED_TEXT
    if redirect_marker is not None and redirect_marker in results_html:
        # The Site skipped the results page and served the product page.
        return results_html
    best = self._get_best_match_or_none(results_html)
    retry_without_organic = (self.sese_organic and best is None and use_organic)
    if retry_without_organic:
        return self._find_product_page(use_organic=False)
    return best
def _prepend_name_match_amounts(self, search_results):
    '''Prepend the % of SESE Name matched to the ``search_results`` list.

    ``search_results`` should be a list of (URL, Name) tuples.  Each
    result's name is compared word-by-word against the SESE name and
    category words, and a match percentage is prepended to each tuple,
    producing ``[(Match Percentage, (URL, Name)), ...]`` sorted with the
    best match first.

    Results whose name yields no words (or an empty SESE word list) are
    scored 0 instead of raising :exc:`ZeroDivisionError`.

    :param search_results: A list of tuples containing the
                           ``(URL, Name)`` of each matching Product
    :type search_results: list
    :returns: A list of tuples containing ``(Match%, (URL, Name))`` of
              each Product, best first
    :rtype: :obj:`list`
    '''
    sese_words = [
        remove_punctuation(x) for x in
        self.sese_name.lower().split() + self.sese_category.lower().split()
    ]
    number_of_sese_words = len(sese_words)
    output = []
    for result in search_results:
        site_words = [
            remove_punctuation(x) for x in result[1].lower().split()
        ]
        number_of_site_words = len(site_words)
        # Guard: an empty word list on either side makes the ratios
        # below undefined — score such results 0 rather than crash.
        if number_of_site_words == 0 or number_of_sese_words == 0:
            output.append((0.0, result))
            continue
        number_of_matches = sum(1 for word in site_words if word in sese_words)
        percent_site_words_matched = (float(number_of_matches) /
                                      number_of_site_words * 100)
        site_to_sese_word_ratio = (float(number_of_site_words) /
                                   number_of_sese_words)
        percent_sese_words_matched = min(
            float(number_of_matches) / number_of_sese_words * 100, 100)
        sese_to_site_word_ratio = (float(number_of_sese_words) /
                                   number_of_site_words)
        # Average the two directional scores, each weighted by the
        # word-count ratio in its direction.
        match_percentage = (
            percent_site_words_matched * site_to_sese_word_ratio +
            percent_sese_words_matched * sese_to_site_word_ratio) / 2
        output.append((match_percentage, result))
    output.sort(key=lambda x: x[0], reverse=True)
    return output
def is_open_via_doaj_issn(issns, pub_year=None):
    """Check the DOAJ table for an open-access match on any of ``issns``.

    Each ISSN is normalized with remove_punctuation and compared against
    the rows of ``doaj_issns``.  A row only counts when the journal was
    already listed by ``pub_year`` (when both years are known).  Returns
    the normalized license of the first match, or False.
    """
    if not issns:
        return False
    for raw_issn in issns:
        cleaned = remove_punctuation(raw_issn)
        for (row_issn, row_license, doaj_start_year) in doaj_issns:
            if cleaned != remove_punctuation(row_issn):
                continue
            # Skip rows where the journal wasn't open yet at publication.
            not_yet_open = (doaj_start_year and pub_year and
                            doaj_start_year > pub_year)
            if not not_yet_open:
                # logger.info(u"open: doaj issn match!")
                return find_normalized_license(row_license)
    return False
def set_mendeley_data(product):
    """Fetch Mendeley readership stats for a product.

    NOTE: an identical ``set_mendeley_data`` is defined again later in
    this file; at import time the later definition shadows this one.

    Looks the product up by DOI when available, otherwise by title +
    year (discarding the hit when the normalized titles differ).
    Returns a dict of reader counts, url, abstract and the lookup
    ``method`` used, or None when no usable match is found.
    """
    doc = None
    resp = None
    try:
        session = get_mendeley_session()
        if product.doi:
            method = "doi"
            try:
                doc = session.catalog.by_identifier(doi=product.doi, view='stats')
            except (UnicodeEncodeError, IndexError):
                return None
        elif product.title and product.year:
            wanted_title = remove_punctuation(product.title).lower()
            try:
                method = "title"
                doc = session.catalog.advanced_search(
                    title=wanted_title,
                    min_year=product.year,
                    max_year=product.year,
                    view='stats').list(page_size=1).items[0]
                # A title search can return a near-miss; require equality.
                if wanted_title != remove_punctuation(doc.title).lower():
                    return None
            except (UnicodeEncodeError, IndexError):
                return None
        if not doc:
            return None
        resp = {
            "reader_count": doc.reader_count,
            "reader_count_by_academic_status": doc.reader_count_by_academic_status,
            "reader_count_by_subdiscipline": doc.reader_count_by_subdiscipline,
            "reader_count_by_country": doc.reader_count_by_country,
            "mendeley_url": doc.link,
            "abstract": doc.abstract,
            "method": method,
        }
    except (KeyError, MendeleyException):
        pass
    return resp
def set_mendeley_data(product):
    """Return Mendeley readership data for ``product`` or None.

    (This is a byte-for-byte duplicate of the ``set_mendeley_data``
    defined earlier in this file; being later, this definition is the
    one in effect after import.)

    Prefers a DOI lookup; falls back to a title+year catalog search and
    rejects the result when the normalized titles do not match.
    """
    resp = None
    doc = None
    method = None
    try:
        mendeley_session = get_mendeley_session()
        if product.doi:
            method = "doi"
            try:
                doc = mendeley_session.catalog.by_identifier(
                    doi=product.doi, view='stats')
            except (UnicodeEncodeError, IndexError):
                return None
        elif product.title and product.year:
            biblio_title = remove_punctuation(product.title).lower()
            try:
                method = "title"
                hits = mendeley_session.catalog.advanced_search(
                    title=biblio_title,
                    min_year=product.year,
                    max_year=product.year,
                    view='stats').list(page_size=1)
                doc = hits.items[0]
                mendeley_title = remove_punctuation(doc.title).lower()
                if biblio_title != mendeley_title:
                    return None
            except (UnicodeEncodeError, IndexError):
                return None
        if not doc:
            return None
        resp = dict(method=method, mendeley_url=doc.link, abstract=doc.abstract)
        for stat in ("reader_count",
                     "reader_count_by_academic_status",
                     "reader_count_by_subdiscipline",
                     "reader_count_by_country"):
            resp[stat] = getattr(doc, stat)
    except (KeyError, MendeleyException):
        pass
    return resp
def format_text_udf(text):
    """Apply the text-normalization pipeline to a Spark column.

    Per value: lowercase, flatten tabs/newlines to spaces, strip
    accents, remove URLs, then remove punctuation.  Returns the result
    of applying the generated string UDF to ``text``.
    """
    def _clean(t):
        flattened = t.lower().replace('\t', ' ').replace('\n', ' ')
        without_urls = REMOVE_URL_EXPR.sub("", strip_accents(flattened))
        return remove_punctuation(without_urls)

    return functions.udf(_clean, types.StringType())(text)
def fake_spaces_etc(s, text):
    """Strip punctuation from ``text`` and hex-escape every character of ``s``.

    - Spaces and tabs are replaced by the FAKE_SPACE character.
    - Carriage returns are removed.
    - Punctuation characters are removed, with FAKE_SPACE protected in
      case it is itself a punctuation character.
    - Members of ``s`` found in the text are converted to ``<xNN>`` hex
      escapes so they cannot appear literally in the result.

    :param s: characters that must not survive in the output
    :param text: the text to rewrite
    :returns: the rewritten text
    """
    # Use FAKE_SPACE instead of space and tab; drop carriage returns.
    text = text.replace(' ', FAKE_SPACE)
    text = text.replace('\t', FAKE_SPACE)
    text = text.replace('\r', '')
    # Remove all punctuation but preserve FAKE_SPACE by temporarily
    # swapping it for a control character remove_punctuation won't touch.
    non_punctuation_character = chr(3)
    text = text.replace(FAKE_SPACE, non_punctuation_character)
    text = remove_punctuation(text)
    text = text.replace(non_punctuation_character, FAKE_SPACE)
    assert '-' not in s  # ASCII dash
    assert '—' not in s  # Unicode em dash
    # Convert to hex any members of s found in text.  str.replace is a
    # no-op when ch is absent, so no membership pre-check is needed.
    for ch in s:
        text = text.replace(ch, f"<x{ord(ch):02x}>")
    return text
def anagram_hash(word, ignore_punc=True):
    """
    Return a canonical hash of ``word`` for anagram-equality checks:

    >>> anagram_hash("fiber")
    'befir'
    >>> anagram_hash("brief")
    'befir'

    Character case is ignored, and punctuation is ignored by default:

    >>> anagram_hash("It's")
    'ist'
    >>> anagram_hash("sit")
    'ist'

    To consider punctuation as part of the anagram, pass ignore_punc=False

    >>> anagram_hash("it's", ignore_punc=False)
    "'ist"
    """
    cleaned = remove_punctuation(word) if ignore_punc else word
    # Sorted lowercase characters are identical exactly for anagrams.
    return "".join(sorted(cleaned.lower()))
def add_to_index(p, name, id):
    """Add ``id`` to the search-index set of every word in ``name``.

    The name is lowercased, accent- and punctuation-stripped, and each
    unique word becomes a set key prefixed with 'si-' via ``p.sadd``.
    """
    cleaned = util.remove_punctuation(util.remove_accents(name.lower()))
    for word in set(cleaned.split()):
        p.sadd('si-' + word, id)
def artist_search(text): lwords = set() text = text.lower() text = util.remove_accents(text) text = util.remove_punctuation(text) words = text.split() swords = set(words) for word in swords: w = 'si-' + word lwords.append(w) aids = r.sinter(lwords) print 'as', lwords, aids return list(aids)
def _get_best_match_or_none(self, search_page_html):
    '''Attempt to find the best match on the Search Results HTML.

    The method will first attempt to find a Product whose name contains
    the SESE variety name.  Otherwise it will use the Product with the
    most words in common with the SESE variety name, provided a minimum
    percentage of the words match (specified by
    :data:`settings.MINIMUM_NAME_MATCHING_PERCENTAGE`).

    If no results are found, the method will return :obj:`None`.

    :param search_page_html: The Search Results Page's HTML
    :type search_page_html: str
    :returns: Product Page HTML of the best match or :obj:`None` if no
              good match is found
    :rtype: :obj:`str`
    '''
    products = self._get_results_from_search_page(search_page_html)
    has_no_results = len(products) == 0 or (
        self.NO_RESULT_TEXT is not None and
        self.NO_RESULT_TEXT in search_page_html)
    if has_no_results:
        return None
    # Hoisted out of the loop: the SESE name is loop-invariant.
    clean_sese_name = remove_punctuation(self.sese_name).lower()
    for relative_url, product_name in products:
        clean_product_name = remove_punctuation(product_name).lower()
        if clean_sese_name in clean_product_name:
            # Exact containment wins outright.
            return get_page_html(self.ROOT_URL + relative_url)
    # Fall back to the highest word-match score, if it clears the bar.
    product_ranks = self._prepend_name_match_amounts(products)
    match_amount, best_product = product_ranks[0]
    if match_amount >= settings.MINIMUM_NAME_MATCHING_PERCENTAGE:
        return get_page_html(self.ROOT_URL + best_product[0])
    return None
def process_tweet(self, tweet):
    '''Classify a single tweet and persist it to the relevant outputs.

    Every tweet is saved to the "all tweets" output.  Tweets the crime
    classifier flags get a crime-type label and are saved again; those
    with an extractable street address are saved a third time with the
    address attached.  A tab-separated count summary (all / crime
    related / with location) is printed every 20 tweets.
    '''
    # Save all tweets collected.
    util.save_tweet(tweet, self.output_all_tweets)
    # Normalize: lowercase, then strip punctuation.
    text = util.remove_punctuation(tweet['text'].lower())
    self.count_all += 1
    # Check whether the tweet is crime related.
    if self.clf_crime.predict([text]) == 1:
        # Label the type of crime and persist the labelled tweet.
        tweet['type_crime'] = self.description[self.clf_typecrime.predict([text])]
        util.save_tweet(tweet, self.output_crime_related)
        self.count_crime_related += 1
        # Resolve aliases, then try to extract a full street address.
        aliased = self.apply_alias(text)
        street_address = self.extract_full_address(aliased)
        if street_address != "":
            self.count_with_location += 1
            tweet['street_address'] = street_address
            util.save_tweet(tweet, self.output_with_location)
    # Columns: All  CrimeRelated  Location
    if self.count_all % 20 == 0:
        print("%d\t%d\t\t%d" % (self.count_all,
                                self.count_crime_related,
                                self.count_with_location))
def synset_review(review):
    """Expand a review into the synset words of each of its words.

    The review is ASCII-squashed (NFKD + ignore-errors encode),
    lowercased, stripped of punctuation and stopwords, then every
    remaining word is replaced by its space-joined synset words.
    """
    ascii_review = unicodedata.normalize('NFKD', review).encode('ascii', 'ignore')
    filtered = remove_stopwords(remove_punctuation(ascii_review.lower()))
    expanded = [' '.join(synset_word(w)) for w in filtered.split()]
    return ' '.join(expanded)
def filter_name(text):
    """Normalize a name: lowercase, strip accents, strip punctuation."""
    return util.remove_punctuation(util.remove_accents(text.lower()))
def synset_review(review):
    """Replace each usable word of ``review`` with its synset words.

    (Duplicate of the ``synset_review`` defined earlier in this file;
    being later, this definition is the one in effect after import.)

    Pipeline: NFKD-normalize and ASCII-encode (dropping what won't fit),
    lowercase, remove punctuation, remove stopwords, then join the
    synset words of every remaining word.
    """
    normalized = unicodedata.normalize('NFKD', review)
    normalized = normalized.encode('ascii', 'ignore')
    normalized = remove_punctuation(normalized.lower())
    normalized = remove_stopwords(normalized)
    return ' '.join(' '.join(synset_word(word)) for word in normalized.split())
def extract_words_fsize_line_from_page_vertical_region(page, next_page, region, min_fsize=-1, min_len=2):
    """
    Extract words, their font size and the line in which they sit from
    the top portion of the page whose font size is big enough

    :type page: bs4.BeautifulSoup
    :type next_page: bs4.BeautifulSoup or None
    :param region: portion of the page to extract (between 0 and 1=all)
    :type region: float
    :param min_fsize: minimum font size of a word to be extracted
    :type min_fsize: float
    :param min_len: minimum length of a word to be extracted
    :type min_len: int
    :rtype: list[(str,int,int)]
    """
    # Find start of current page and next page; their difference gives
    # this page's height for the region cut-off below.
    page_top = util.get_coordinate_from_style(page.contents[0]['style'], 'top')
    default_page_size = 1000
    if next_page is None:
        # Last pages are badly formatted anyway, just take an approximation
        next_page_top = page_top + default_page_size
    else:
        next_page_top = util.get_coordinate_from_style(
            next_page.contents[0]['style'], 'top')
    current_line = ""   # text accumulated for the line currently being built
    words_fsize = []    # result accumulator: (word, font size, line number)
    last_top = 0        # 'top' of the previous tag; equal tops mean same line
    tag_fsize = min_fsize
    line_number = 0
    for tag in page.children:
        if hasattr(tag, "style"):
            if tag.name == "div" or tag.name == "span":
                # Check font size
                tag_fsize = util.get_coordinate_from_style(
                    tag['style'], "font-size")
                if tag_fsize is None or tag_fsize == 0:
                    tag_fsize = min_fsize
                if tag_fsize >= min_fsize:
                    # Check position in page
                    tag_top = util.get_coordinate_from_style(
                        tag['style'], 'top')
                    if tag_top is not None:
                        if tag_top < region * (next_page_top - page_top) + page_top:
                            if tag_top == last_top:
                                # Same vertical offset: still on the same line.
                                current_line += util.remove_punctuation(
                                    "".join(tag.strings))
                            else:
                                # New vertical offset: flush the finished line.
                                # Remove multiple newlines and spaces
                                current_line = re.sub(r'\n+', ' ', current_line)
                                current_line = re.sub(r' +', ' ', current_line)
                                single_words = current_line.split(" ")
                                single_words = [
                                    word for word in single_words
                                    if not len(word) < min_len
                                ]
                                if len(single_words) > 0:
                                    words_fsize += [(word, tag_fsize, line_number)
                                                    for word in single_words]
                                line_number += 1
                                current_line = util.remove_punctuation("".join(
                                    tag.strings))
                            last_top = tag_top
    # Flush the last pending line, if any.
    if len(current_line) > 0:
        current_line = re.sub(r'\n+', ' ', current_line)
        current_line = re.sub(r' +', ' ', current_line)
        single_words = current_line.split(" ")
        single_words = [
            word for word in single_words if not len(word) < min_len
        ]
        if len(single_words) > 0:
            words_fsize += [(word, tag_fsize, line_number)
                            for word in single_words]
    return words_fsize