Example #1
def header_removal(fulltext):
    # Convert the list of lines into a single searchable string.
    prepared_text = create_str_from_list(fulltext)

    # Regex patterns for the header elements to strip (०-९ are Devanagari digits).
    regex_page_number = r"\d[०-९][०-९][०-९]*\s"
    regex_shrivalmiki_ramayana = r"\w.\w.\w.ल्म.\w.\s\w.\w.\w."
    regex_sundar_kand = r"\wन्दर\sका.|\w.न्दर\sकाण.\w+|\w.न्दर\sकाण्ड."
    regex_sarg = r"\wर्ग\s[०-९]*\W.[०-९]+|\wर्ग\s[०-९]*\W.[०-९]*"

    pattern_list = [
        regex_page_number, regex_sarg, regex_sundar_kand,
        regex_shrivalmiki_ramayana
    ]

    # Log the matches for each pattern so the removals can be audited.
    regex_list = []
    for pattern in pattern_list:
        tmp_match_set = create_match_list_n_log(pattern, prepared_text)
        regex_list.append(tmp_match_set)

    # Strip every header pattern from the text.
    for pattern in pattern_list:
        prepared_text = clean_text(pattern, prepared_text)
    return prepared_text
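
# A minimal sketch of the helpers this snippet assumes (the names come from
# the snippet; the bodies are assumptions, not the original implementation):
import re

def create_str_from_list(lines):
    # Join the page lines into one searchable string.
    return " ".join(lines)

def create_match_list_n_log(pattern, text):
    # Collect every match so the removed headers can be inspected later.
    return re.findall(pattern, text)

def clean_text(pattern, text):
    # Delete every occurrence of the header pattern.
    return re.sub(pattern, "", text)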
Example #2
def predict():
    # Get the submitted message from the form.
    if request.method == 'POST':
        message = request.form['message']

        # Clean and preprocess.
        message = helper.fn_translate(message)
        message = helper.clean_text(message)
        message = helper.pre_process(message)

        data = [message]

        # Top-level prediction.
        result = model.predict(data)
        output = key[val.index(result)]

        # Second-stage prediction, routed by the top-level group.
        if output == 'L1/L2':
            group = model_l1_l2.predict(data)
            output_final = key_list2[val_list2.index(group)]
        else:
            group = model_l3.predict(data)
            output_final = key_list[val_list.index(group)]

    # Send the result back to the browser.
    return render_template('result.html', prediction=output_final, team=output)
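
# Hypothetical shapes of the label lookup tables this route assumes
# (the key/val names come from the snippet; the contents are illustrative):
key = ['L1/L2', 'L3']   # human-readable team labels
val = [0, 1]            # model output codes, index-aligned with key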
Example #3
def main():
    # Example palindrome input: "stunt nuts"

    userInput = input("Enter one or more sentences: ")

    text = helper.clean_text(userInput)

    if helper.isPalindrome(text):
        print("is a palindrome")
    else:
        print("is not a palindrome")
Example #4
def extract_search_list(url):
    anime_entries = get_anime_entries(url)

    extracted_anime = {}
    if anime_entries:
        for anime_entry in anime_entries:
            anime_title = helper.clean_text(
                            anime_entry
                            .find('p', class_='name')
                            .find('a')['title']
                            ).capitalize()
            release_date = anime_entry.find('p', class_='released').text
            extracted_anime[anime_title] = release_date.strip()

    return extracted_anime
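
# A minimal sketch of the shared get_anime_entries helper that both this
# extractor and the next one assume (requests + BeautifulSoup and the
# container selector are assumptions; only the name comes from the snippets):
import requests
from bs4 import BeautifulSoup

def get_anime_entries(url):
    # Fetch the listing page and return the per-anime containers, or None.
    response = requests.get(url)
    if response.status_code != 200:
        return None
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.find_all('li')  # hypothetical per-entry container tag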
Example #5
def extract_anime_list(url):
    anime_entries = get_anime_entries(url)

    extracted_anime = {}
    if anime_entries:
        for anime_entry in anime_entries:
            anime_title = helper.clean_text(
                            anime_entry
                            .find('p', class_='name')
                            .find('a')['title']
                            ).capitalize()
            episode_no = anime_entry.find('p', class_='episode').text.lower().strip('episode ')
            extracted_anime[anime_title] = int(episode_no)

    return extracted_anime
Example #6
    def get_word_window(self, excerpt, anchor):
        import helper

        # Clean the raw text and locate the anchor word.
        cleaned_text = helper.clean_text(excerpt.raw_text, self.delimiters)
        anchor = excerpt.anchor
        words = cleaned_text.split()
        idx = words.index(anchor)

        # Initial window: word_window words on each side of the anchor.
        low_idx = max(idx - self.word_window, 0)
        high_idx = min(len(words), idx + self.word_window)

        # Clamp the window at the nearest sentence boundary on each side
        # ('gsw' is the sentence-boundary sentinel token).
        sentence_positions = [i for i in range(len(words)) if words[i] == 'gsw']
        highest_low = None
        lowest_high = None
        for pos in sentence_positions:
            if pos < idx and (highest_low is None or pos > highest_low):
                highest_low = pos
            if pos > idx and (lowest_high is None or pos < lowest_high):
                lowest_high = pos
        if highest_low is not None:
            low_idx = max(low_idx, highest_low)
        if lowest_high is not None:
            high_idx = min(high_idx, lowest_high)

        return ' '.join(words[low_idx:high_idx])
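
# A hypothetical standalone demo of the clamping logic above: with
# word_window=3 and 'gsw' as the boundary token, the window around the anchor
# stops at the nearest boundary on each side (the low boundary token stays
# inside the slice, mirroring the method):
def demo_word_window(words, anchor, word_window=3):
    idx = words.index(anchor)
    low_idx = max(idx - word_window, 0)
    high_idx = min(len(words), idx + word_window)
    lows = [i for i, w in enumerate(words) if w == 'gsw' and i < idx]
    highs = [i for i, w in enumerate(words) if w == 'gsw' and i > idx]
    if lows:
        low_idx = max(low_idx, max(lows))
    if highs:
        high_idx = min(high_idx, min(highs))
    return ' '.join(words[low_idx:high_idx])

print(demo_word_window(['a', 'gsw', 'b', 'anchor', 'c', 'gsw', 'd'], 'anchor'))
# -> 'gsw b anchor c'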
Example #7
def _load_event_descriptions(event_num=None):
    """

    """
    descriptions, topics = [], []
    group_ids, dscr_tracking = set(), set()
    current_id = None
    mapping = get_group_topic_mapping()

    file = os.path.join(DATA_DIR, EVENT_FILE)
    with gzip.open(file) as f:
        for i, event in enumerate(
            (json.loads(x) for x in islice(f, event_num))):
            if i % 1000 == 0:
                print(i)
            dscr = event.get("description")
            if dscr:
                dscr = h.clean_text(dscr)
            grpId = event.get("group").get('id')

            if not (dscr and grpId and (grpId in mapping)):
                continue

            # The group id changed: flush the previous group's descriptions.
            if current_id and dscr_tracking and (grpId != current_id):
                unique_descriptions = get_unique(list(dscr_tracking),
                                                 threshold=0.5)
                topic = [mapping[current_id]] * len(unique_descriptions)

                descriptions.extend(unique_descriptions)
                topics.extend(topic)

                group_ids.add(grpId)
                current_id = grpId
                dscr_tracking = set()

            if not current_id:
                current_id = grpId
            dscr_tracking.add(dscr)

    return descriptions, topics
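
# A minimal sketch of the get_unique de-duplication helper this loader
# assumes (hypothetical body: keep a description only if it is sufficiently
# dissimilar from every description already kept):
from difflib import SequenceMatcher

def get_unique(descriptions, threshold=0.5):
    unique = []
    for dscr in descriptions:
        if all(SequenceMatcher(None, dscr, kept).ratio() < threshold
               for kept in unique):
            unique.append(dscr)
    return unique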
Example #8
    def amaq_queries(self) -> None:
        logging.info("Running Amaq " + time.asctime(time.localtime(time.time())))

        rich_print("[bold blue]Hello, I'm Amaq[/bold blue]")
        rich_print("[bold blue]Let's get to the business.[\bold blue]")

        while True:
            text = self.console.input("[red]>>>[/red] ")
            if text.strip() == "":
                continue

            logging.info("User input: " + text)
            user_in = Word(clean_text(text))

            if user_in in STOP_WORDS:
                self.quit()

            if FAQS.get(user_in):
                logging.info("User asked about question")
                rich_print(FAQS.get(user_in))
                continue

            meanings = self.meaning_check(user_in)

            req = meanings[0]
            logging.info(f"Writing the result onto the output screen for {req}")
            del meanings[0]

            if not meanings:
                rich_print("[yellow]Sorry, I didn't found any match.[/yellow]")
                logging.info(f"Amaq failed to find any match for {req}")
                continue

            rich_print(f"[blue]{req.capitalize()}[/blue] stands for : \n")
            for meaning in meanings:
                rich_print(f"[green]*** {meaning}[/green]")
Example #9
def main():

    helper.prep()

    clean_links = helper.get_pdf_links()
    latest_link, latest_file, local_file = helper.get_latest_file(clean_links)
    print("Latest Link: {0}\nLatest File Basename: {1}\nLocal File Path:{2}".format(latest_link, latest_file, local_file))

    r = requests.get(latest_link, allow_redirects=True)

    with open(local_file, 'wb') as fl:
        fl.write(r.content)

    pdf_file_obj = open(local_file, 'rb')
    pdf_reader = ppd.PdfFileReader(pdf_file_obj)

    print("Total pages:", pdf_reader.numPages)

    # latest_pdf = r".\data\20200406-sitrep-77-covid-19.pdf"

    start_page = int(min(helper.get_page_range(pdf_reader, 'Western Pacific Region')))
    end_page = int(max(helper.get_page_range(pdf_reader, 'Subtotal for all'))) + 1
    print("Table Start Page: {}, Table End Page: {}".format(start_page, end_page))


    # Read PDF Tables
    df = helper.get_pdf_data(local_file, start_page, end_page)


    # Drop bad columns
    df = helper.drop_bad_columns(df)


    # Rename columns
    df.columns = helper.get_columns_labels()
    # df.to_csv(r".\data\before dropping rows.csv", header=True, encoding='utf-8', index=None)


    # Drop bad rows
    df = helper.drop_bad_rows(df)


    # Cleaning
    df['report_country'] = df['report_country'].astype('str')
    df.loc[df['report_country'].str.contains('Lao'), 'report_country'] = "Lao People's Democratic Republic"
    df = helper.clean_text(df)


    # Repopulate report_country with values that slid into the confirmed column
    df.loc[(df['report_country'].isnull()) | (df['report_country'] == 'nan'), 'report_country'] = \
        df.loc[(df['report_country'].isnull()) | (df['report_country'] == 'nan'), 'confirmed']


    # Amend dtypes
    df['confirmed'] = df['confirmed'].apply(lambda x: pd.to_numeric(x, errors='coerce'))
    df['confirmed_new'] = df['confirmed_new'].apply(lambda x: pd.to_numeric(x, errors='coerce'))
    df['deaths'] = df['deaths'].apply(lambda x: pd.to_numeric(x, errors='coerce'))
    df['deaths_new'] = df['deaths_new'].apply(lambda x: pd.to_numeric(x, errors='coerce'))
    df['days_since_last_report'] = df['days_since_last_report'].apply(lambda x: pd.to_numeric(x, errors='coerce'))


    # More cleaning
    df.drop(df[(~df['report_country'].isin(helper.get_region_labels())) & \
               (df['confirmed'].isnull())].index, inplace=True)


    # Generate Region
    df['region'] = df.loc[(df['confirmed'].isnull()) & (~df['report_country'].isin(['Territories', 'erritories'])), 'report_country']
    df['region'] = df['region'].fillna(method='ffill')


    # Extract Territories
    df['location_type'] = np.nan
    df.loc[df['report_country'].isin(['Territories', 'erritories']), 'location_type'] = 'Territory'

    df.loc[0, 'location_type'] = df.loc[0, 'report_country']
    df.loc[df['region'] != df['region'].shift(1), 'location_type'] = df['region']
    df['location_type'].fillna(method='ffill', inplace=True)

    df.loc[~df['location_type'].isin(['Territory', 'erritories']), 'location_type'] = 'Nation'
    df.dropna(subset=['report_country', 'confirmed'], inplace=True)
    df.drop(df[df['report_country'] == 'nan'].index, inplace=True)


    # Final Cleaning
    replacements = helper.get_replacements()

    df.replace(replacements, inplace=True)
    df.drop(df[df['report_country'].str.isnumeric()].index, inplace=True)

    df['report_date'] = helper.get_file_date(file_name=latest_file)

    df = df.loc[:, helper.get_final_column_order()]
    df.fillna(value=0, inplace=True)

    # Output
    output_file = helper.get_output_file(latest_file)
    print("Output saved at:", output_file)

    df.to_csv(output_file, header=True, encoding='utf-8', index=None)
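
# A minimal sketch of helper.get_page_range as this script uses it
# (a hypothetical body, using the same legacy PyPDF2 reader API as above:
# return the page numbers whose extracted text contains the marker phrase):
def get_page_range(pdf_reader, marker):
    return [page_no for page_no in range(pdf_reader.numPages)
            if marker in pdf_reader.getPage(page_no).extractText()]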
Example #10
# Loading Data
tqdm.pandas()
train_set = pd.read_csv('Data/train.csv', encoding='latin1')
print('The data has {} rows and {} columns'.format(train_set.shape[0],
                                                   train_set.shape[1]))

# 2. Text Cleaning, preprocessing, and Meta Feature extraction ----------------
# 2.1 Applying lowercasing + punctuation removal + special character removal +
# stopwords removal and lemmatization
print('Applying clean text processing step: '
      'lowercasing + punctuation removal + special character removal + '
      'lemmatization and stopword removal (separate column)')
stop_words = h.get_clean_stopwords(stopwords.words('english'))
train_set['qt_clean'] = train_set.question_text.progress_apply(
    lambda t: h.clean_text(t))
train_set['qt_clean_stop'] = train_set.question_text.progress_apply(
    lambda t: h.clean_text(t, stop_words=stop_words))

# 2.2 Document Meta features
# These features may aid text feature modelling
# Basic features
train_set['char_count'] = train_set.qt_clean_stop.progress_apply(
    len)  # char clean count
train_set['word_count'] = train_set.qt_clean_stop.progress_apply(
    lambda x: len(x.split()))  # word clean count
train_set['word_density'] = train_set.char_count / (train_set.word_count + 1
                                                    )  # word density count
train_set['n_stopwords'] = train_set.qt_clean.progress_apply(
    lambda x: len([x for x in x.split() if x in stop_words]))
train_set['n_numbers'] = train_set.qt_clean_stop.progress_apply(
    lambda x: len([x for x in x.split() if x.isdigit()]))  # completion assumed: numeric token count
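
# A minimal sketch of h.get_clean_stopwords as used above (hypothetical body:
# run the stopword list through the same cleaning as the questions so that
# membership tests compare like with like):
def get_clean_stopwords(words):
    return {clean_text(w) for w in words}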
Example #11
def test_clean_text_digits() -> None:
    text = clean_text("hello world123")
    assert text == "hello world"
Example #12
def test_clean_text_no_special_char() -> None:
    text = clean_text("hello world")
    assert text == "hello world"
Example #13
def test_clean_text_multiple_spaces() -> None:
    text = clean_text("hello world   ")
    assert text == "hello world"
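
# A minimal clean_text sketch consistent with the three tests above
# (an assumption, not the project's actual implementation): drop digits and
# special characters, then trim whitespace at the ends.
import re

def clean_text(text: str) -> str:
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove digits/special chars
    return text.strip()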