Example #1
def preprocess(valid_text):
    """
    Calls all specific cleaning functions and returns a list of cleaned sentences
    
    valid_text -- list of text strings (list)
    """

    # strip URLs, handles and hashtags from every string
    cleaned_text = [
        remove_url(remove_handles(remove_hashtags(remove_url(t))))
        for t in valid_text
    ]

    # drop exact duplicates
    cleaned_text = list(set(cleaned_text))

    # split each string into sentences, then flatten into a single list of sentences
    cleaned_text = [nltk.tokenize.sent_tokenize(t) for t in cleaned_text]
    cleaned_text = [item for sublist in cleaned_text for item in sublist]

    # word-tokenize each sentence
    clean_sentences = [nltk.tokenize.word_tokenize(t) for t in cleaned_text]

    # re-join the '<', 'hndl', '>' pieces produced by word_tokenize into a single '<hndl>' token
    mwetokenizer = nltk.MWETokenizer(separator='')
    mwetokenizer.add_mwe(('<', 'hndl', '>'))
    clean_sentences = [mwetokenizer.tokenize(t) for t in clean_sentences]

    # lower-case the tokens and drop stray punctuation and over-long tokens
    new_sentences = []
    for sentence in clean_sentences:
        sentence = [
            token.lower() for token in sentence
            if 0 < len(token) < 17 and token not in '@.?\n\t'
        ]
        new_sentences.append(sentence)

    return new_sentences
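
A minimal usage sketch (not part of the original example), assuming the remove_url, remove_handles and remove_hashtags helpers from the same module are available and the NLTK 'punkt' data is installed:

sample = [
    "Check this out https://example.com @user #news",
    "Hello world. Another sentence here.",
]
for tokens in preprocess(sample):
    print(tokens)  # one list of lower-cased tokens per sentence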
Example #2
def clean_body_title(text):

    # parse html
    soup = BeautifulSoup(text, features="html.parser")

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()  # rip it out

    # get text
    text = soup.get_text()

    # remove url
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)

    # convert to lowercase
    text = text.lower()

    # remove contractions
    rep_word = REReplacer()
    text = rep_word.replace(text)

    # split into words
    mwtokenizer = nltk.MWETokenizer(separator='')
    mwtokenizer.add_mwe(('c', '#'))
    tokens = mwtokenizer.tokenize(word_tokenize(text))

    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = []
    for w in tokens:
        if w in set(single_word_top_400_tags):
            stripped.append(w)
        else:
            stripped.append(w.translate(table))

    # remove remaining tokens that contain digits
    words = [
        word for word in stripped
        if not any(ch.isdigit() for ch in word)
    ]

    # filter out stop words
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if w not in stop_words]
    words = [w for w in words if w != '']

    # lemmatization except top 400 tags
    wordnet_lemmatizer = WordNetLemmatizer()
    words_lemma = []
    for w in words:
        if w in set(single_word_top_400_tags):
            words_lemma.append(w)
        else:
            words_lemma.append(
                wordnet_lemmatizer.lemmatize(w, get_wordnet_pos(w)))

    return words_lemma
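
A hedged usage sketch, assuming the module-level names the function relies on (REReplacer, single_word_top_400_tags, get_wordnet_pos, stopwords, WordNetLemmatizer) are defined as in the surrounding project:

html = "<p>How do I read a <b>file</b> in c#?</p><script>var x = 1;</script>"
print(clean_body_title(html))
# script/style content, URLs, digits and stop words are dropped; remaining tokens are
# lower-cased and lemmatized, while tokens listed in single_word_top_400_tags
# (presumably tags such as 'c#') skip punctuation stripping and lemmatization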
Example #3
def extract_NNP(sentence):
    tokenized = sent_tokenize(sentence)

    result = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            # re-join 'C'/'c' and '#' into a single 'C#'/'c#' token
            mwtokenizer = nltk.MWETokenizer(separator='')
            mwtokenizer.add_mwe(('C', '#'))
            mwtokenizer.add_mwe(('c', '#'))
            words = mwtokenizer.tokenize(words)
            # tag and chunk adjective/noun sequences as candidate 'skill' phrases
            tagged = nltk.pos_tag(words)
            chunkGram = r"""skill: {(<JJ>*<NN>*<NNP>*)*<LS>?}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)

            for subtree in chunked.subtrees(filter=lambda t: t.label() == 'skill'):
                result.append(' '.join(leaf[0] for leaf in subtree.leaves()))
                for leaf in subtree.leaves():
                    result.append(leaf[0])

    except Exception as e:
        print(str(e))
    return result
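
A short usage sketch (illustrative only); the exact chunks depend on NLTK's POS tagger, and any parser error is printed rather than raised:

print(extract_NNP("Looking for experience with Java and C# development."))
# returns each matched 'skill' chunk as a space-joined phrase, followed by its individual words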
Example #4
         ('Azure', 'Guest', 'OS', 'Machine', 'Key', 'Generation', 'Algorithm'),
         ('SLE', '12', 'SP2)'), ('SLE', '12', 'SP1)'), ('SLE', '12', 'SP3)'),
         ('oracleasm', 'kmp'), ('HTTP', 'Server'), ('Apache', 'Struts'),
         ('Apache', 'Commons'),
         ('WebSphere', 'Application', 'Server', 'Admin', 'Console'),
         ('WebSphere', 'Application', 'Server', 'Liberty'),
         ('Apache', 'Commons', 'HttpClient'),
         ('WebSphere', 'Application', 'Server', 'Edge', 'Caching', 'Proxy'),
         ('Java', 'Server', 'Faces'), ('Performance', 'Management', 'product'),
         ('Apache', 'CXF'), ('Rational', 'ClearQuest'),
         ('Rational', 'ClearCase'), ('Client', 'Management', 'Service'),
         ('MQ.NET', 'Managed', 'Client'), ('MQ', 'Appliance'), ('MQ', 'PAM'),
         ('MQ', 'Clients'), ('Queue', 'Manager'), ('GNU', 'C', 'library'),
         ('Process', 'Designer'), ('Business', 'Automation', 'Workflow'),
         ('Business', 'Process', 'Manager'), ('BigFix', 'Remote', 'Control'),
         ('DataPower', 'Gateways'),
         ('WebSphere', 'DataPower', 'XC10', 'Appliance'), ('Adobe', 'Flash'),
         ('Remote', 'Desktop', 'Protocol', '(RDP)', 'Denial', 'of', 'Service'),
         ('bug', 'fix,'), ('Red', 'Hat')]
tokenizer = nltk.MWETokenizer(list_, separator=' ')
Abstract = df1['Abstract'].tolist()
keywords = []
keywords_update = []
for ab in Abstract:
    keywords.append(tokenizer.tokenize(ab.split()))
for key in keywords:
    keywords_ = [re.split(r',|\)|\(', k) for k in key]
    flat_list = [item for sublist in keywords_ for item in sublist]
    keywords_update.append(flat_list)
print(keywords_update)
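
A standalone check (not from the original) of what the space-separated MWETokenizer does to a plain split(), using two of the tuples above:

demo = "Security update for Apache Struts and WebSphere Application Server Liberty"
print(nltk.MWETokenizer([('Apache', 'Struts'),
                         ('WebSphere', 'Application', 'Server', 'Liberty')],
                        separator=' ').tokenize(demo.split()))
# ['Security', 'update', 'for', 'Apache Struts', 'and', 'WebSphere Application Server Liberty']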
Example #5
def tokenize(text, drug=None, pos_filter=False, lemma=True):
	"""Simple (or not) tokenizer for given text block.

	ARGS:
		text: string.
			Single comment block.

	KWARGS:
		drug: string or None.
			drug name (added to stoplist to prevent self-mentions)
		pos_filter: boolean.
			set True to use part-of-speech filtering.
		lemma: boolean.
			set True to use lemmatization.

	RETURNS:
		words: list.
			List of lower-case word tokens (individual strings)
	"""
	tokens = nltk.RegexpTokenizer(r'\w+').tokenize(text.lower())
	merger = nltk.MWETokenizer([('side','effect'),('side','effects')])
	tokens = merger.tokenize(tokens)
	
	# filter on stop words
	stops = sw.stop_words()
	if drug is not None:
		if drug.upper() != 'ANTIDEPRESSANT':
			stops.append(drug.lower())
			if _drug_dict[drug.upper()] != drug.upper():
				stops.append(_drug_dict[drug.upper()].lower())
			if drug.upper() in _gen_dict.keys():
				for bd in _gen_dict[drug.upper()]:
					stops.append(bd.lower())
		else:
			stops = stops+['antidepressant','antidepressants']
	stops = set(stops)
	tokens = [word for word in tokens if word not in stops]

	if pos_filter:
		tagged_tokens = nltk.pos_tag(tokens)
		tags = ['CD',
			'DT',
			'JJ',
			'JJR',
			'JJS',
			'NN',
			'NNP',
			'NNPS',
			'NNS',
			'RB',
			'RBR',
			'RBS',
			'VB',
			'VBD',
			'VBG',
			'VBN',
			'VBP',
			'VBZ']
		tokens = [word for (word,tag) in tagged_tokens if tag in tags]

	if lemma:
		tokens = [_lemmatizer.lemmatize(word,pos='v') for word in tokens]
		tokens = [_lemmatizer.lemmatize(word,pos='n') for word in tokens]

	# one more pass through stopword filter
	tokens = [word for word in tokens if word not in stops]

	return tokens
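
A standalone sketch (not part of the original) of what the side-effect merger above does; MWETokenizer's default separator is '_':

merger = nltk.MWETokenizer([('side', 'effect'), ('side', 'effects')])
print(merger.tokenize("no side effects so far".split()))
# ['no', 'side_effects', 'so', 'far']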

Example #6
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV
    }

    return tag_dict.get(tag, wordnet.NOUN)
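
A quick illustration (outputs depend on NLTK's default POS tagger, so treat them as typical rather than guaranteed):

print(get_wordnet_pos("running"))  # typically wordnet.VERB ('v')
print(get_wordnet_pos("quickly"))  # typically wordnet.ADV ('r')
print(get_wordnet_pos("table"))    # typically wordnet.NOUN ('n')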


# module-level tokenizer that re-joins 'c' + '#' into a single 'c#' token
mwtokenizer = nltk.MWETokenizer(separator='')
mwtokenizer.add_mwe(('c', '#'))


def clean_body_title(text):

    # parse html
    soup = BeautifulSoup(text, features="html.parser")

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()  # rip it out

    # get text
    text = soup.get_text()
Example #7
# %%
import nltk
nltk.download("punkt")

# NLTK's word tokenizer splits symbols and digits off, so we have to keep an explicit list
# of topics that people will actually type with symbols or digits (see the quick check below).
# This list covers tags with more than 10,000 questions as of January 2020.
topics_with_symbols = ["c#", "c++", ".net", "asp.net", "node.js", "objective-c", "unity3d", "html5", "css3",
                       "d3.js", "utf-8", "neo4j", "scikit-learn", "f#", "3d", "x86"]

df["body_tokenized"] = df["body"].progress_apply(lambda text: [word for word in nltk.word_tokenize(text) \
                                                               if word.isalpha() or word in list("+#") + topics_with_symbols])

# %%
# retokenize topics including meaningful symbols such as C#, C++
mwe_tokenizer = nltk.MWETokenizer(separator="")
mwe_tokenizer.add_mwe(("c", "#"))
mwe_tokenizer.add_mwe(("c", "+", "+"))
mwe_tokenizer.add_mwe(("f", "#"))

df["body_tokenized"] = df["body_tokenized"].progress_apply(
    lambda tokens: [token for token in mwe_tokenizer.tokenize(tokens)])

# %% [markdown]
# ### Remove stop words

# %%
from nltk.corpus import stopwords

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
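
# a sketch of the likely next step (not shown in the excerpt above): drop the
# stop words from the tokenized bodies
df["body_tokenized"] = df["body_tokenized"].progress_apply(
    lambda tokens: [token for token in tokens if token not in stop_words])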