def header_removal(fulltext):
    # Convert the list of pages into a single string.
    prepared_text = create_str_from_list(fulltext)

    # Regex patterns for the headers to strip: page numbers (Devanagari digits),
    # the "Shrivalmiki Ramayana" running title, the "Sundar Kand" heading and
    # the "Sarg" (chapter) markers.
    regex_page_number = r"\d[०-९][०-९][०-९]*\s"
    regex_shrivalmiki_ramayana = r"\w.\w.\w.ल्म.\w.\s\w.\w.\w."
    regex_sundar_kand = r"\wन्दर\sका.|\w.न्दर\sकाण.\w+|\w.न्दर\sकाण्ड."
    regex_sarg = r"\wर्ग\s[०-९]*\W.[०-९]+|\wर्ग\s[०-९]*\W.[०-९]*"
    pattern_list = [
        regex_page_number,
        regex_sarg,
        regex_sundar_kand,
        regex_shrivalmiki_ramayana,
    ]

    # Collect and log the matches for each pattern; regex_list is assumed to be
    # a module-level list that accumulates the match sets.
    for pattern in pattern_list:
        tmp_match_set = create_match_list_n_log(pattern, prepared_text)
        regex_list.append(tmp_match_set)

    # Remove every matched header pattern from the text.
    for pattern in pattern_list:
        prepared_text = clean_text(pattern, prepared_text)

    return prepared_text
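# Note: in header_removal above, clean_text(pattern, prepared_text) takes a regex
# pattern plus the text, unlike the single-argument helper.clean_text used in the
# other snippets. A minimal sketch of such a helper, assuming it simply strips
# every match of the pattern (not the original implementation), could be:
import re

def clean_text(pattern, text):
    # Assumed behavior: replace every occurrence of the header pattern with a
    # single space so the surrounding words stay separated.
    return re.sub(pattern, " ", text)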
def predict():
    # Get data from the submitted form
    if request.method == 'POST':
        message = request.form['message']
        # Clean and preprocess
        message = helper.fn_translate(message)
        message = helper.clean_text(message)
        message = helper.pre_process(message)
        data = [message]
        # Top-level prediction
        result = model.predict(data)
        output = key[val.index(result)]
        # Second-stage prediction depending on the top-level group
        if output == 'L1/L2':
            group = model_l1_l2.predict(data)
            output_final = key_list2[val_list2.index(group)]
        else:
            group = model_l3.predict(data)
            output_final = key_list[val_list.index(group)]
    # Send the result back to the browser
    return render_template('result.html', prediction=output_final, team=output)
def main():
    # assert("stunt nuts")
    userInput = input("Enter one or more sentences: ")
    text = helper.clean_text(userInput)
    if helper.isPalindrome(text):
        print("is a palindrome")
    else:
        print("is not a palindrome")
def extract_search_list(url):
    anime_entries = get_anime_entries(url)
    extracted_anime = {}
    if anime_entries:
        for anime_entry in anime_entries:
            anime_title = helper.clean_text(
                anime_entry
                .find('p', class_='name')
                .find('a')['title']
            ).capitalize()
            release_date = anime_entry.find('p', class_='released').text
            extracted_anime[anime_title] = release_date.strip()
    return extracted_anime
def extract_anime_list(url):
    anime_entries = get_anime_entries(url)
    extracted_anime = {}
    if anime_entries:
        for anime_entry in anime_entries:
            anime_title = helper.clean_text(
                anime_entry
                .find('p', class_='name')
                .find('a')['title']
            ).capitalize()
            episode_no = anime_entry.find('p', class_='episode').text.lower().strip('episode ')
            extracted_anime[anime_title] = int(episode_no)
    return extracted_anime
def get_word_window(self, excerpt, anchor):
    import helper

    cleaned_text = helper.clean_text(excerpt.raw_text, self.delimiters)
    anchor = excerpt.anchor
    words = cleaned_text.split()
    idx = words.index(anchor)

    # Initial window of word_window words on either side of the anchor.
    low_idx = max(idx - self.word_window, 0)
    high_idx = min(len(words), idx + self.word_window)

    raw_text = excerpt.raw_text[:]

    # Positions of the 'gsw' marker token; the window is clipped so it does not
    # cross the nearest marker on either side of the anchor.
    sentence_positions = [i for i in range(len(words)) if words[i] == 'gsw']
    highest_low = None
    lowest_high = None
    for pos in sentence_positions:
        if pos < idx:
            if highest_low is None or pos > highest_low:
                highest_low = pos
        if pos > idx:
            if lowest_high is None or pos < lowest_high:
                lowest_high = pos
    if highest_low is not None:
        low_idx = max(low_idx, highest_low)
    if lowest_high is not None:
        high_idx = min(high_idx, lowest_high)

    return ' '.join(words[low_idx:high_idx])
def _load_event_descriptions(event_num=None):
    """Load up to event_num event descriptions from the gzipped event file,
    clean them, deduplicate near-identical descriptions per group, and pair
    each description with its group's topic."""
    descriptions, topics = [], []
    group_ids, dscr_tracking = set(), set()
    current_id = None
    mapping = get_group_topic_mapping()
    file = os.path.join(DATA_DIR, EVENT_FILE)
    with gzip.open(file) as f:
        for i, event in enumerate(
                (json.loads(x) for x in islice(f, event_num))):
            if i % 1000 == 0:
                print(i)
            dscr = event.get("description")
            if dscr:
                dscr = h.clean_text(dscr)
            grpId = event.get("group").get('id')
            if not (dscr and grpId and (grpId in mapping)):
                continue
            # When the group id changes, flush the descriptions tracked for the
            # previous group.
            if current_id and dscr_tracking and (grpId != current_id):
                unique_descriptions = get_unique(list(dscr_tracking),
                                                 threshold=0.5)
                topic = [mapping[current_id]] * len(unique_descriptions)
                descriptions.extend(unique_descriptions)
                topics.extend(topic)
                group_ids.add(grpId)
                current_id = grpId
                dscr_tracking = set()
            if not current_id:
                current_id = grpId
            dscr_tracking.add(dscr)
    return descriptions, topics
def amaq_queries(self) -> None:
    logging.info("Running Amaq " + time.asctime(time.localtime(time.time())))
    rich_print("[bold blue]Hello, I'm Amaq[/bold blue]")
    rich_print("[bold blue]Let's get to the business.[/bold blue]")
    while True:
        text = self.console.input("[red]>>>[/red] ")
        if text.strip() == "":
            continue
        logging.info("User input: " + text)
        user_in = Word(clean_text(text))
        if user_in in STOP_WORDS:
            self.quit()
        if FAQS.get(user_in):
            logging.info("User asked about question")
            rich_print(FAQS.get(user_in))
            continue
        meanings = self.meaning_check(user_in)
        req = meanings[0]
        logging.info(f"Writing the result onto the output screen for {req}")
        del meanings[0]
        if not meanings:
            rich_print("[yellow]Sorry, I didn't find any match.[/yellow]")
            logging.info(f"Amaq failed to find any match for {req}")
            continue
        rich_print(f"[blue]{req.capitalize()}[/blue] stands for:\n")
        for meaning in meanings:
            rich_print(f"[green]*** {meaning}[/green]")
def main():
    helper.prep()
    clean_links = helper.get_pdf_links()
    latest_link, latest_file, local_file = helper.get_latest_file(clean_links)
    print("Latest Link: {0}\nLatest File Basename: {1}\nLocal File Path: {2}".format(
        latest_link, latest_file, local_file))

    # Download the latest situation report PDF
    r = requests.get(latest_link, allow_redirects=True)
    with open(local_file, 'wb') as fl:
        fl.write(r.content)

    pdf_file_obj = open(local_file, 'rb')
    pdf_reader = ppd.PdfFileReader(pdf_file_obj)
    print("Total pages:", pdf_reader.numPages)

    # Locate the pages containing the country table
    start_page = int(min(helper.get_page_range(pdf_reader, 'Western Pacific Region')))
    end_page = int(max(helper.get_page_range(pdf_reader, 'Subtotal for all'))) + 1
    print("Table Start Page: {}, Table End Page: {}".format(start_page, end_page))

    # Read PDF tables
    df = helper.get_pdf_data(local_file, start_page, end_page)

    # Drop bad columns
    df = helper.drop_bad_columns(df)

    # Rename columns
    df.columns = helper.get_columns_labels()

    # Drop bad rows
    df = helper.drop_bad_rows(df)

    # Cleaning
    df['report_country'] = df['report_country'].astype('str')
    df.loc[df['report_country'].str.contains('Lao'), 'report_country'] = \
        "Lao People's Democratic Republic"
    df = helper.clean_text(df)

    # Repopulate report_country with values that spilled into the confirmed column
    df.loc[(df['report_country'].isnull()) | (df['report_country'] == 'nan'), 'report_country'] = \
        df.loc[(df['report_country'].isnull()) | (df['report_country'] == 'nan'), 'confirmed']

    # Amend dtypes
    df['confirmed'] = df['confirmed'].apply(lambda x: pd.to_numeric(x, errors='coerce'))
    df['confirmed_new'] = df['confirmed_new'].apply(lambda x: pd.to_numeric(x, errors='coerce'))
    df['deaths'] = df['deaths'].apply(lambda x: pd.to_numeric(x, errors='coerce'))
    df['deaths_new'] = df['deaths_new'].apply(lambda x: pd.to_numeric(x, errors='coerce'))
    df['days_since_last_report'] = df['days_since_last_report'].apply(
        lambda x: pd.to_numeric(x, errors='coerce'))

    # More cleaning
    df.drop(df[(~df['report_country'].isin(helper.get_region_labels())) &
               (df['confirmed'].isnull())].index, inplace=True)

    # Generate Region
    df['region'] = df.loc[(df['confirmed'].isnull()) &
                          (~df['report_country'].isin(['Territories', 'erritories'])),
                          'report_country']
    df['region'] = df['region'].fillna(method='ffill')

    # Extract Territories
    df['location_type'] = np.nan
    df.loc[df['report_country'].isin(['Territories', 'erritories']), 'location_type'] = 'Territory'
    df.loc[0, 'location_type'] = df.loc[0, 'report_country']
    df.loc[df['region'] != df['region'].shift(1), 'location_type'] = df['region']
    df['location_type'].fillna(method='ffill', inplace=True)
    df.loc[~df['location_type'].isin(['Territory', 'erritories']), 'location_type'] = 'Nation'
    df.dropna(subset=['report_country', 'confirmed'], inplace=True)
    df.drop(df[df['report_country'] == 'nan'].index, inplace=True)

    # Final Cleaning
    replacements = helper.get_replacements()
    df.replace(replacements, inplace=True)
    df.drop(df[df['report_country'].str.isnumeric()].index, inplace=True)
    df['report_date'] = helper.get_file_date(file_name=latest_file)
    df = df.loc[:, helper.get_final_column_order()]
    df.fillna(value=0, inplace=True)

    # Output
    output_file = helper.get_output_file(latest_file)
    print("Output saved at:", output_file)
    df.to_csv(output_file, header=True, encoding='utf-8', index=None)
# Loading Data
tqdm.pandas()
train_set = pd.read_csv('Data/train.csv', encoding='latin1')
print('The data has {} rows and {} columns'.format(train_set.shape[0], train_set.shape[1]))

# 2. Text Cleaning, preprocessing, and Meta Feature extraction ----------------
# 2.1 Applying lowercasing + punctuation removal + special character removal +
#     stopwords removal and lemmatization
print('Applying clean text processing step: '
      'lowercasing + punctuation removal + special character removal + '
      'lemmatization and stopword removal (separate column)')
stop_words = h.get_clean_stopwords(stopwords.words('english'))
train_set['qt_clean'] = train_set.question_text.progress_apply(
    lambda t: h.clean_text(t))
train_set['qt_clean_stop'] = train_set.question_text.progress_apply(
    lambda t: h.clean_text(t, stop_words=stop_words))

# 2.2 Document Meta features
# These features may aid text feature modelling
# Basic features
train_set['char_count'] = train_set.qt_clean_stop.progress_apply(
    len)  # clean character count
train_set['word_count'] = train_set.qt_clean_stop.progress_apply(
    lambda x: len(x.split()))  # clean word count
train_set['word_density'] = train_set.char_count / (train_set.word_count + 1)  # word density
train_set['n_stopwords'] = train_set.qt_clean.progress_apply(
    lambda x: len([x for x in x.split() if x in stop_words]))
train_set['n_numbers'] = train_set.qt_clean_stop.progress_apply(
    lambda x: len([x for x in x.split() if x.isdigit()]))  # assumed completion: the source is cut off here; counts numeric tokens
def test_clean_text_digits() -> None:
    text = clean_text("hello world123")
    assert text == "hello world"
def test_clean_text_no_special_char() -> None:
    text = clean_text("hello world")
    assert text == "hello world"
def test_clean_text_multiple_spaces() -> None:
    text = clean_text("hello world ")
    assert text == "hello world"
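# The three tests above pin down the behavior expected of clean_text: digits and
# special characters are removed and surrounding whitespace is stripped. The
# implementation itself is not shown here; a minimal regex-based sketch that
# satisfies exactly these tests (an assumption, not the real helper) could be:
import re

def clean_text(text: str) -> str:
    # Keep only letters and spaces, dropping digits and other special characters.
    text = re.sub(r"[^A-Za-z ]+", "", text)
    # Collapse runs of whitespace and strip leading/trailing spaces.
    return re.sub(r"\s+", " ", text).strip()

# Note that the other snippets in this collection call clean_text with different
# signatures (a regex pattern, a DataFrame, extra stop_words or delimiters), so
# this sketch only covers the behavior these particular tests assert.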