Example #1
from flashtext import KeywordProcessor
from nltk.corpus import stopwords  # requires nltk.download('stopwords')


def create_keywordProcessor(list_of_terms,
                            remove_stopwords=True,
                            custom_stopword_list=None):
    """ Creates a new flashtext KeywordProcessor and optionally
    does some lightweight text cleaning to remove stopwords, including
    any provided by the user.
    """
    # create a KeywordProcessor
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(list_of_terms)

    # remove English stopwords if requested
    if remove_stopwords:
        keyword_processor.remove_keywords_from_list(stopwords.words('english'))

    # remove any custom stopwords provided by the caller
    if custom_stopword_list:
        keyword_processor.remove_keywords_from_list(custom_stopword_list)

    return keyword_processor
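
# A minimal usage sketch (the toy term list and the custom stopword are
# illustrative assumptions, not part of the original example):
processor = create_keywordProcessor(["numpy", "pandas", "the", "kaggle"],
                                    custom_stopword_list=["kaggle"])
print(processor.extract_keywords("the kaggle kernel imports numpy and pandas"))
# -> ['numpy', 'pandas'] ("the" and "kaggle" were removed as stopwords)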
Example #2
import requests
import pandas as pd
from flashtext.keyword import KeywordProcessor
from nltk.corpus import stopwords

# let's read in a couple of forum posts
forum_posts = pd.read_csv("../input/ForumMessages.csv")

# get a smaller sub-set for playing around with
sample_posts = forum_posts.Message[0:3]

# get data from list of top 5000 pypi packages (last 30 days)
url = 'https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.json'
data = requests.get(url).json()

# get just the list of package names
list_of_packages = [data_item['project'] for data_item in data['rows']]

# create a KeywordProcessor
keyword_processor = KeywordProcessor()
keyword_processor.add_keywords_from_list(list_of_packages)

# remove English stopwords
keyword_processor.remove_keywords_from_list(stopwords.words('english'))

# remove custom stopwords
keyword_processor.remove_keywords_from_list(['http','kaggle'])

# test our keyword processor
for post in sample_posts:
    keywords_found = keyword_processor.extract_keywords(post, span_info=True)
    print(keywords_found)
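
# With span_info=True, extract_keywords returns (keyword, start, end) tuples
# instead of bare strings. An illustrative sketch (the post text below is
# made up, not taken from ForumMessages.csv):
demo = keyword_processor.extract_keywords("I installed requests today",
                                          span_info=True)
print(demo)  # e.g. [('requests', 12, 20)], assuming 'requests' is a keyword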

Example #3

import string

from flashtext import KeywordProcessor
from nltk.corpus import stopwords
from sacremoses import MosesTokenizer
from tqdm import tqdm

# NOTE: this snippet is a fragment of a larger script; `lang`, `domains`,
# `workdrive`, `outdir`, `suffix1`, `suffix2` and `word_boundary_list` are
# assumed to be defined earlier in that script.
kwp = KeywordProcessor()
mt = MosesTokenizer(lang=lang)

for domain in domains:
    # read the term annotations for this language/domain pair
    sourcedir = f"{workdrive}/coding/Text2TCS/git/ACTER/{lang}/{domain}/annotations/"
    list_of_terms = []
    with open(f"{sourcedir}/{domain}_{lang}_terms_nes.ann",
              "r", encoding="utf-8") as f:
        for line in f:
            # strip the annotation labels, keeping only the term itself
            s = (line.replace("OOD_Term", "")
                     .replace("Common_Term", "")
                     .replace("Specific_Term", "")
                     .replace("Named_Entity", "")
                     .strip("\n").strip("\t"))
            list_of_terms.append(s)

    kwp.add_keywords_from_list(list_of_terms)

    # remove unwanted terms from the list (single letters, prepositions,
    # stopwords etc.)
    abc_list = list(string.ascii_uppercase + string.ascii_lowercase)
    kwp.remove_keywords_from_list(abc_list)
    kwp.remove_keywords_from_list(word_boundary_list)

    # assumes NLTK stopwords with full language names, e.g. 'english'
    for i in stopwords.words(lang):
        kwp.remove_keyword(i)
        kwp.remove_keyword(i.capitalize())

    # extract the terms from the tokenized corpus
    with open(f"{outdir}/{lang}{suffix1}/{domain}{suffix2}full_tok.txt",
              "r", encoding="utf-8") as f:
        sentences = f.readlines()
    # print(sentences[-10:])

    results = []
    for line in tqdm(sentences):
        s = kwp.extract_keywords(line.rstrip())
        results.append(s)
    print(results[-10:])
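
# Note that extract_keywords runs per line, so `results` is a list of lists:
# one list of matched terms per sentence. An illustrative sketch with toy
# data (not the ACTER corpus):
toy_kwp = KeywordProcessor()
toy_kwp.add_keywords_from_list(["neural network", "corpus"])
print([toy_kwp.extract_keywords(s)
       for s in ["a neural network trained on a corpus", "no terms here"]])
# -> [['neural network', 'corpus'], []]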