# Assumed module-level imports for the snippets below; project-internal helpers
# (StringUtils, Log, BasicPreprocessor, Scrape, ffld) come from their own modules.
import re
import urllib.parse
import numpy as np
import requests
from bs4 import BeautifulSoup
from inspect import currentframe, getframeinfo

def scrape_url(
        self,
        url,
        parser='html.parser',
        tag_to_find='p',
):
    try:
        sents = []
        resp = requests.get(url=url)
        soup = BeautifulSoup(resp.content, parser)
        contents_tag = soup.find_all(tag_to_find)
        for cont in contents_tag:
            txt = StringUtils.trim(cont.get_text())
            # Split sentences on the CJK full stop '。'
            sent_list = txt.split('。')
            sent_list = [StringUtils.trim(s) for s in sent_list if s]
            if len(sent_list):
                sents += sent_list
                Log.debug('Split "' + str(txt) + '" into: ' + str(sent_list))
        return sents
    except Exception as ex:
        Log.error(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Error scraping url "' + str(url) + '", exception: ' + str(ex)
        )
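# A hedged usage sketch: the module path and URL below are assumptions for
# illustration; Scrape is taken to be the class enclosing scrape_url, since it
# is instantiated the same way in get_training_data_by_scraping further down.
# Note the sentence delimiter is '。', so pages without it yield one long
# string per tag.
from scrape import Scrape   # hypothetical module path

sentences = Scrape().scrape_url(
    url='https://ja.wikipedia.org/wiki/Python',
    parser='html.parser',
    tag_to_find='p',
)
# scrape_url returns None if an exception was caught and logged
if sentences is not None:
    for s in sentences[:5]:
        print(s)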
def __segment_words(self, text):
    sent = StringUtils.trim(text)
    sent = sent.lower()
    sent = sent.split(' ')
    # Split out punctuation into separate tokens
    sent = BasicPreprocessor.clean_punctuations(sentence=sent)
    return sent
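# A standalone sketch of the same segmentation steps using only the standard
# library; string.punctuation stands in for the project's
# BasicPreprocessor.clean_punctuations, whose exact behavior is not shown in
# this section, so results may differ.
import string

def segment_words_sketch(text):
    # trim -> lowercase -> split on single spaces, as in __segment_words
    tokens = text.strip().lower().split(' ')
    # approximate clean_punctuations by stripping edge punctuation
    tokens = [t.strip(string.punctuation) for t in tokens]
    return [t for t in tokens if t]

print(segment_words_sketch('  Hello, World!  '))   # ['hello', 'world']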
def process_common_words(self, word_split_token=' '):
    try:
        self.raw_words = StringUtils.trim(self.raw_words)
        # Normalize non-breaking spaces, tabs and newlines to the split token
        self.raw_words = re.sub(pattern='[\xa0\t\n\r]', repl=word_split_token, string=self.raw_words)
        self.raw_words = self.raw_words.lower()
    except Exception as ex:
        errmsg = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': Error processing raw words. Exception: ' + str(ex)
        Log.error(errmsg)
        raise Exception(errmsg)

    try:
        self.common_words = self.raw_words.split(word_split_token)
        # Remove None, '', {}, etc.
        self.common_words = [w for w in self.common_words if w]
        word_stems = self.add_word_stems()
        if word_stems:
            self.common_words = word_stems + self.common_words
        # Deduplicate and sort
        self.common_words = sorted(set(self.common_words))
        Log.info(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Loaded ' + str(len(self.common_words))
            + ' common words of lang "' + str(self.lang) + '".'
        )
    except Exception as ex:
        errmsg = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': Error processing common words. Exception: ' + str(ex)
        Log.error(errmsg)
        raise Exception(errmsg)
    return
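# To make the normalization pipeline concrete, a standalone run with a stub in
# place of add_word_stems() (the real stemming logic lives elsewhere in the
# class); the input string and stems are made up for illustration.
raw = 'Apple\tbanana\xa0apple\ncherry  banana'
words = re.sub(pattern='[\xa0\t\n\r]', repl=' ', string=raw.strip().lower()).split(' ')
words = [w for w in words if w]     # drop '' produced by doubled separators
stems = ['appl', 'banan']           # stand-in for add_word_stems()
common_words = sorted(set(stems + words))
print(common_words)                 # ['appl', 'apple', 'banan', 'banana', 'cherry']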
def import_form_fields(
        list_json,
        mex_form_model,
):
    if len(list_json) != len(mex_form_model):
        raise Exception(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': List of fields must be the same length as the mex expr list.'
            + ' Fields: ' + str(list_json)
            + ', Mex Expr List: ' + str(mex_form_model)
        )
    form_fields = []
    for i in range(len(list_json)):
        json_field = list_json[i]
        json_field[ffld.FormField.KEY_MEX_EXPR] = StringUtils.trim(mex_form_model[i])
        try:
            form_fields.append(
                ffld.FormField.import_form_field(json_obj=json_field)
            )
        except Exception as ex_field:
            errmsg = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                     + ': Error importing field: ' + str(json_field) \
                     + '. Exception: ' + str(ex_field)
            Log.error(errmsg)
            raise Exception(errmsg)
    return form_fields
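# The pairing step can be illustrated in isolation; the dict key 'mex_expr'
# and the field contents below are hypothetical stand-ins for
# ffld.FormField.KEY_MEX_EXPR and real form definitions.
list_json = [{'name': 'amount'}, {'name': 'date'}]
mex_form_model = [' amount,float,amount ', ' date,date,on ']
assert len(list_json) == len(mex_form_model), 'Lists must be the same length'
for json_field, expr in zip(list_json, mex_form_model):
    json_field['mex_expr'] = expr.strip()   # same trim-and-assign as above
print(list_json)
# [{'name': 'amount', 'mex_expr': 'amount,float,amount'},
#  {'name': 'date', 'mex_expr': 'date,date,on'}]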
def confirm_answer(self, answer):
    answer = StringUtils.trim(answer)
    if answer.lower() in self.text_list_confirm_words:
        self.confirm_current_field()
        return True
    else:
        # No form confirmation
        return False
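# A minimal sketch of the confirm-word check; the word list here is an
# assumption, the real one comes from self.text_list_confirm_words.
text_list_confirm_words = ['yes', 'ok', 'confirm']

def is_confirmation(answer):
    return answer.strip().lower() in text_list_confirm_words

print(is_confirmation('  Yes '))      # True
print(is_confirmation('change it'))   # False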
def get_training_data_by_scraping(
        self,
        url,
        tag_to_find='p',
        min_char_per_sent=0,
        max_char_per_sent=np.inf,
        rm_html_markup=False,
        unquote_html=False,
):
    # Sample data from Wikipedia
    sentences_list_from_wiki_scraping = Scrape().scrape_url(
        url=url, tag_to_find=tag_to_find
    )
    # scrape_url returns None when scraping failed
    if sentences_list_from_wiki_scraping is None:
        sentences_list_from_wiki_scraping = []
    Log.info(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Scraped ' + str(len(sentences_list_from_wiki_scraping))
        + ' sentences from url "' + str(url) + '"'
    )
    sentences_list = []
    for s in sentences_list_from_wiki_scraping:
        s = StringUtils.trim(s)
        s = BeautifulSoup(s, 'html.parser').text
        s_clean = s
        if rm_html_markup:
            # Remove all patterns '<...>'
            html_tags_re = re.compile(r'<[^>]+>')
            s_clean = re.sub(html_tags_re, '', string=s)
        if unquote_html:
            # Convert strings like '%3Fmode%3DLSD%26mid%3Dshm%26sid1%3D102%26oid%3D421%26aid%3D0005537039'
            # into '?mode=LSD&mid=shm&sid1=102&oid=421&aid=0005537039'
            # (decode s_clean, not s, so the markup removal above is not discarded)
            s_clean = urllib.parse.unquote(string=s_clean)
        len_s = len(s_clean)
        if (len_s >= min_char_per_sent) and (len_s <= max_char_per_sent):
            sentences_list.append(s_clean)
            Log.debug('From\n\r\t"' + str(s) + '" to\n\r\t"' + str(s_clean) + '"')
    Log.info(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Filtered to ' + str(len(sentences_list))
        + ' sentences from url "' + str(url) + '"'
    )
    return sentences_list
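# The two optional cleaning steps can be demonstrated in isolation; this
# mirrors the regex and urllib.parse.unquote calls above on a made-up string.
import re
import urllib.parse

s = '<a href="x">mode%3DLSD%26mid%3Dshm</a>'
no_markup = re.sub(re.compile(r'<[^>]+>'), '', string=s)
print(no_markup)                          # mode%3DLSD%26mid%3Dshm
print(urllib.parse.unquote(no_markup))    # mode=LSD&mid=shm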
def confirm_form(self, answer):
    answer = StringUtils.trim(answer)
    if answer.lower() in self.text_list_confirm_words:
        self.set_state_form_completed_and_confirmed()
        self.reset_continuous_error_count()
        return True
    else:
        # Try to update all fields strictly, maybe user wants to change something
        result = self.set_all_field_value_from_answer(answer=answer)
        if result.is_updated:
            self.reset_continuous_error_count()
        else:
            self.increment_continuous_error_count()
            if self.is_error_threshold_hit():
                Log.warning(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Reset form after ' + str(self.fill_form_continuous_err_count)
                    + ' error counts.'
                )
                self.reset()
        # No form confirmation
        return False
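# A sketch of the continuous-error counter pattern used here; the threshold
# value and this minimal class are assumptions for illustration, not the
# project's actual implementation.
class ErrorCounter:
    def __init__(self, threshold=3):
        self.threshold = threshold
        self.fill_form_continuous_err_count = 0

    def increment_continuous_error_count(self):
        self.fill_form_continuous_err_count += 1

    def reset_continuous_error_count(self):
        self.fill_form_continuous_err_count = 0

    def is_error_threshold_hit(self):
        return self.fill_form_continuous_err_count >= self.threshold

ec = ErrorCounter(threshold=2)
ec.increment_continuous_error_count()
ec.increment_continuous_error_count()
print(ec.is_error_threshold_hit())   # True -> confirm_form would reset the form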
def trim_lower(x):
    x = StringUtils.trim(str(x))
    return x.lower()