def run(input, client, output_path, index_name):
    INPUT = str(input)
    CLIENT = lib.create_connection(client)
    OUTPUT_PATH = output_path
    INDEX_NAME = index_name
    MODE = "url"
    if not lib.check_index(client=CLIENT, index=INDEX_NAME):
        lib.logger.debug(f"{INDEX_NAME} not found.")
        return Exception(f"{INDEX_NAME} not found.")
    website = newspaper(INPUT)
    fulltext = website.article
    try:
        nltk_download('punkt')
    except:
        pass
    sentences = tokenize.sent_tokenize(fulltext.strip())
    scores_sentences = lib.get_scores(CLIENT, INDEX_NAME, sentences)
    format_scores_sentences = lib.format_scores(sentences, scores_sentences)
    result = lib.save_result(fulltext, INDEX_NAME, INPUT, format_scores_sentences,
                             OUTPUT_PATH, MODE)
    return result
def run(input, client, output_path, index_name):
    INPUT = str(input)
    CLIENT = lib.create_connection(client)
    OUTPUT_PATH = output_path
    INDEX_NAME = index_name
    MODE = "text"
    if not lib.check_index(client=CLIENT, index=INDEX_NAME):
        lib.logger.debug(f"{INDEX_NAME} not found.")
        return Exception(f"{INDEX_NAME} not found.")
    fulltext = INPUT.replace('\r', ' ').replace('\n', ' ').replace("”", '"').replace("’", "'")
    try:
        nltk_download('punkt')
    except:
        pass
    sentences = tokenize.sent_tokenize(fulltext.strip())
    scores_sentences = lib.get_scores(CLIENT, INDEX_NAME, sentences)
    format_scores_sentences = lib.format_scores(sentences, scores_sentences)
    result = lib.save_result(fulltext, INDEX_NAME, INPUT, format_scores_sentences,
                             OUTPUT_PATH, MODE)
    return result
def get_nltk_data():
    data = [
        "twitter_samples",
        "punkt",
        "wordnet",
        "averaged_perceptron_tagger",
        "stopwords",
    ]
    for each in data:
        nltk_download(each)
def __init__(self, *args, **kwargs) -> None:
    """Initialise SentimentAnalysis instance, ensure we have downloaded required data."""
    # Ensure we have the various corpora that we need for the analysis to work.
    text_blob_download()
    nltk_download('vader_lexicon')
    # Initialise the Vader sentiment analysis tool
    self.analyser = SentimentIntensityAnalyzer()
    super().__init__(*args, **kwargs)  # type: ignore
def eval_pipeline(self, pkl_filename, title, content):
    """Evaluate a sample via the pipeline model."""
    nltk_download('wordnet')
    nltk_download('stopwords')
    pipeline = load(pkl_filename)
    self.bow_transformer = pipeline['vectorizer']
    self.model = pipeline['nbclassifier']
    self.labelencoder = pipeline['labelenc']
    return self.eval(title, content)
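A minimal sketch of how a compatible pipeline pickle might be produced; the key names ('vectorizer', 'nbclassifier', 'labelenc') mirror what eval_pipeline reads, while save_pipeline and the joblib dependency are illustrative assumptions rather than the project's actual code.

from joblib import dump

def save_pipeline(pkl_filename, bow_transformer, model, labelencoder):
    # Persist the fitted components under the key names eval_pipeline expects.
    pipeline = {
        'vectorizer': bow_transformer,   # fitted CountVectorizer
        'nbclassifier': model,           # trained MultinomialNB
        'labelenc': labelencoder,        # fitted LabelEncoder
    }
    dump(pipeline, pkl_filename)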
def train(self):
    """
    Create the LDA model from procurements and their tender descriptions.
    Group the procurements by the generated topics.
    Dump the result to a cache json file.
    :return: None
    """
    nltk_download('wordnet')
    # remove the unwanted words from every tender description
    processed_tender_descriptions = list(map(
        lambda p: self.preprocess(p.tender_description),
        data_holder.procurements))
    dictionary = gensim.corpora.Dictionary(processed_tender_descriptions)
    # filter away words that appear in more than 25% of the documents
    dictionary.filter_extremes(no_above=0.25)
    # create a list of tuples containing the word index and the number of times it appeared
    bow_corpus = [
        dictionary.doc2bow(doc) for doc in processed_tender_descriptions
    ]
    # create the LDA model
    lda_model = gensim.models.LdaMulticore(bow_corpus,
                                           num_topics=10,
                                           id2word=dictionary,
                                           passes=2,
                                           workers=2)
    topic_to_procurements = []
    # create a list of 10 empty lists
    for i in range(10):
        topic_to_procurements.append([])
    # group similar tender numbers together
    for p in data_holder.procurements:
        unseen_document = p.tender_description
        bow_vector = dictionary.doc2bow(self.preprocess(unseen_document))
        # sort by descending probability, highest first
        sorted_probabilities = sorted(lda_model[bow_vector],
                                      key=lambda tup: -1 * tup[1])
        topic_with_highest_prob = sorted_probabilities[0]
        topic_index = topic_with_highest_prob[0]
        # put the tender number into the topic with the highest probability
        topic_to_procurements[topic_index].append(p.tender_no)
    self.group_topic_procurements(topic_to_procurements)
    # save the cached result as json
    with open(self.CACHE_JSON_FILE_NAME, 'w') as outfile:
        json.dump(topic_to_procurements, outfile)
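The preprocess helper used above is not shown; the following is a minimal sketch of what it might do, assuming gensim's simple_preprocess plus NLTK stopwords and the WordNet lemmatizer (which would explain the wordnet download). The real implementation may differ.

from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

_lemmatizer = WordNetLemmatizer()
_stop_words = set(stopwords.words('english'))  # assumes the stopwords corpus is installed

def preprocess(text):
    # Tokenize and lowercase, drop stopwords and very short tokens, then lemmatize.
    return [
        _lemmatizer.lemmatize(token)
        for token in simple_preprocess(text, min_len=3)
        if token not in _stop_words
    ]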
def get_stop_words():
    try:
        return stopwords.words('english')
    except LookupError:
        from nltk import download as nltk_download
        import warnings
        # christ this is verbose...
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            nltk_download('stopwords')
        return stopwords.words('english')
def setup_directories(processed_path, nltk_data_path):
    """Just in case, delete and re-create the directory for processed files.

    It's okay to keep file data in memory if it is small. However, I would
    save/upload it somewhere if the data is large."""
    if os.path.exists(processed_path):
        shutil.rmtree(processed_path)
    os.makedirs(processed_path)
    if not os.path.exists(nltk_data_path):
        os.makedirs(nltk_data_path)
    nltk_download(['punkt', 'stopwords'], download_dir=nltk_data_path)
    nltk_path.append(nltk_data_path)
def __init__(self, crawler):
    self.stats = crawler.stats
    s = crawler.settings
    # Defaults to wherever NLTK normally keeps its data
    self.dir = s.get('NLTKDATA_DIR', None)
    # Ensure necessary NLTK data is present. This can be persisted across
    # runs using DotScrapy Persistence if on ScrapingHub.
    # TODO: Should check if they're present by trying to load them, then
    # download on exception. nltk.download() checks if up to date
    # as well, so this generates latency and network traffic.
    for pkg in [
            'punkt',
            'words',
    ]:
        nltk_download(pkg, download_dir=self.dir)
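A minimal sketch of the check-then-download approach described in the TODO above: probe each resource with nltk.data.find and only hit the network on LookupError. The helper name and the resource list are assumptions for illustration.

from nltk.data import find as nltk_find

def ensure_nltk_data(download_dir=None):
    # Probe locally first; download only the resources that are missing.
    for resource_path, package in [('tokenizers/punkt', 'punkt'),
                                   ('corpora/words', 'words')]:
        try:
            nltk_find(resource_path)
        except LookupError:
            nltk_download(package, download_dir=download_dir)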
def spellcheck(self, books, mode='simple'):
    if mode not in ['simple', 'complex']:
        raise ValueError('Mode must be one of "simple", "complex"')
    nltk_download('punkt')
    sym_spell = SymSpell(max_dictionary_edit_distance=0, prefix_length=7)
    dictionary_path = resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    if not sym_spell.load_dictionary(
            dictionary_path, term_index=0, count_index=1):
        return books
    books = [
        book for book in books
        if self.hasNoSpellingErrors(book, sym_spell, mode)
    ]
    return books, self.set_unknown, self.set_known
def check_nltk_packages(dl_path='.nltk_data', quiet=True):
    from nltk import download as nltk_download
    reqs = []  # NLTK resources to verify; populate as needed
    for req in reqs:
        try:
            find_nltk_package(req, paths=[dl_path, '~/nltk_data'])
        except LookupError:
            logger.error(f'NLTK: Unable to find package `{req}`.')
            logger.info(f'Downloading NLTK package `{req}`')
            try:
                nltk_download(req, download_dir=dl_path, quiet=quiet)
            except Exception as e:
                logger.error(
                    f'NLTK: An error occurred while downloading package `{req}`.'
                )
                logger.error(e)
    return True
def train(self, train_csv, test_size=0.2):
    self.processed_count = 0
    nltk_download('wordnet')
    nltk_download('stopwords')
    self.df = pd.read_csv(train_csv)
    self.df['title_content'] = self.df['title'] + ' ' + self.df['content']
    self.x = self.df['title_content']
    self.y = self.df['publication']
    self.lemmatizer = WordNetLemmatizer()
    # encode publications
    self.labelencoder = LabelEncoder()
    self.y = self.labelencoder.fit_transform(self.y)
    x_train, x_test, y_train, y_test = train_test_split(self.x,
                                                        self.y,
                                                        test_size=test_size,
                                                        random_state=12345)
    self.total_files_to_process = x_train.shape[0] + x_test.shape[0]
    print(f'x_train shape: {x_train.shape}\nx_test_shape: {x_test.shape}\n'
          f'y_train_shape: {y_train.shape}\ny_test_shape: {y_test.shape}\n')
    self.bow_transformer = CountVectorizer(analyzer=self.process_text)
    text_bow_train = self.bow_transformer.fit_transform(x_train)
    text_bow_test = self.bow_transformer.transform(x_test)
    print(f'bow_train_shape: {text_bow_train.shape}\n'
          f'bow_test_shape: {text_bow_test.shape}\n')
    # multinomial naive bayes model
    self.model = MultinomialNB()
    # train the model
    self.model = self.model.fit(text_bow_train, y_train)
    print(self.model.score(text_bow_train, y_train))
    # validation
    print(self.model.score(text_bow_test, y_test))
    test_predictions = self.model.predict(text_bow_test)
    print(classification_report(y_test, test_predictions))
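CountVectorizer above delegates tokenization to self.process_text, which is not shown; below is a minimal sketch of such an analyzer, assuming the wordnet/stopwords downloads at the top of train are used for lemmatization and stop-word removal. Names and steps here are assumptions, not the project's actual method.

import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

_stop_words = set(stopwords.words('english'))
_lemmatizer = WordNetLemmatizer()

def process_text(text):
    # Keep alphabetic tokens, lowercase them, drop stopwords, then lemmatize.
    tokens = re.findall(r'[a-z]+', str(text).lower())
    return [_lemmatizer.lemmatize(t) for t in tokens if t not in _stop_words]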
from atexit import register
from bs4 import BeautifulSoup as soup
from collections import Counter, OrderedDict
from copy import deepcopy
from datetime import datetime as dt, timedelta
from date_extractor import extract_date
from math import sqrt, exp
from numpy import array as ndarray, errstate
from nltk import download as nltk_download, pos_tag, word_tokenize
from nltk.corpus import stopwords

try:
    stopwords.words('english')
    pos_tag(word_tokenize('check'))
except LookupError:
    nltk_download('stopwords')
    nltk_download('punkt')
    nltk_download('averaged_perceptron_tagger')

from os import makedirs, remove, chmod
from os.path import dirname, abspath, exists, join
from pickle import load as pload, dump as pdump
from regex import findall, sub, compile, DOTALL, match
from requests import get
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from sklearn.cluster import KMeans, AgglomerativeClustering, FeatureAgglomeration
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sys import stdout, maxsize, platform
from time import strftime, sleep, time
from traceback import format_exc
import numpy as np
import os
import xml.etree.ElementTree as ET
import pickle
import copy
from tqdm import tqdm
from collections import Counter
from nltk.stem import WordNetLemmatizer
from nltk import download as nltk_download

nltk_download('wordnet')
"""
Here, we process the ICSI files into data structures usable for our analysis.

The following file types are in the ICSI corpus:

Dialogue acts files, e.g. Bdb001.A.dialogue-acts.xml
- Contain a list of dialogue acts for each (part of) a meeting
- Meeting number is indicated by Bdb001
- Part of meeting is defined by A
- File contains the nite:root structure
- This contains many <dialogueact> structures
- Each dialogue act has
    - a nite:id - id of dialogue act
    - start time
    - end time
    - type (this is a tag corresponding to the function of the dialogue act)
    - adjacency (this is a tag corresponding to the adjacency pair this
      dialogue act belongs to - save as string now, and process later)
    - original type (what's this? not clear, but we will save it anyway)
    - participant
    - A child structure, which refers to the words, as in
      "Bdb001.A.words.xml#id(Bdb001.w.701)..id(Bdb001.w.702)"
        - We should decompose this to
            - File does not need to be listed, since word indices are unique
            - file: Bdb001.A.words.xml (or file_id: Bdb001.A.words)
            - and word_index_start
#!/usr/bin/python3
from os import path, getenv
from dotenv import load_dotenv
from sqlalchemy import create_engine
from googletrans.constants import LANGCODES
from nltk.data import find as nltk_find
from nltk import download as nltk_download

try:
    nltk_find('stopwords')
except LookupError:
    nltk_download('stopwords')
finally:
    from nltk.corpus import stopwords

##################
# File Locations #
##################
DEFAULT_DIR = path.dirname(path.abspath(__file__))
PATH_DB = path.join(path.dirname(__file__), './log/quotes.db')
PATH_WOTD = path.join(path.dirname(__file__), './docs/wordoftheday.txt')
ENGINE = create_engine('sqlite:///./log/quotes.db', echo=False)

##################
# Bot & API Info #
##################
load_dotenv()
TOKEN = getenv('DISCORD_TOKEN')
POC_TOKEN = getenv('POC_TOKEN')
GUILD = getenv('DISCORD_GUILD')
def download_requirements():
    # Verify Python version ----------------------------------------------
    from sys import version_info
    from sys import exit
    if version_info <= (3, 0):
        print(
            "\nUse python version 3.0 or higher. (Code was tested with 3.5.2)"
        )
        print("--> EXITING")
        exit(1)
    try:
        # Verify that we have the required nltk packages
        from nltk.data import find as nltk_find
        nltk_find('tokenizers/punkt')
        nltk_find('corpora/wordnet')
        nltk_find('corpora/wordnet_ic')
        nltk_find('taggers/averaged_perceptron_tagger')
        nltk_find('corpora/brown')
        nltk_find('corpora/stopwords')
    except LookupError:
        print("Downloading packages since some packages were missing")
        from nltk import download as nltk_download
        nltk_download('wordnet')
        nltk_download('wordnet_ic')
        nltk_download('punkt')
        nltk_download('averaged_perceptron_tagger')
        nltk_download('stopwords')
        nltk_download('brown')
import csv
from nltk import download as nltk_download
from os import path, listdir, mkdir
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
import cPickle as pickle
from make_classifier import get_classifier, get_ticker, filter_stock, best_bigram_word_feats

###################################################################
# VARIABLE INITIALIZATION
###################################################################
files = listdir('data_to_be_classified')
files = [file for file in files if file != '.keep']
classifier = get_classifier()
nltk_download('stopwords')
nltk_download('punkt')

###################################################################
# MAIN PROGRAM
###################################################################
while True:
    view_classifier_or_classify = raw_input(
        'Do you want to view information about the classifier or classify new data? (view/new/break) \n'
    )
    if view_classifier_or_classify == 'break':
        print('Thanks for using the program. Goodbye!')
        break
    elif view_classifier_or_classify == 'new':
nltk_path.append(config.NLTK_DATA_PATH)

nltk_to_download = []
try:
    stopwords.words('english')
except LookupError:
    nltk_to_download.append('stopwords')
try:
    word_tokenize('token test')
except LookupError:
    nltk_to_download.append('punkt')
if nltk_to_download:
    print 'Performing first-time setup'
    from nltk import download as nltk_download
    for package in nltk_to_download:
        print '\tDownloading:', package
        nltk_download(package)

STOPWORDS = frozenset(stopwords.words('english')) | frozenset('.,:()&[]?%;')
STEMMER = PorterStemmer()
BASE_URL_DOMAIN = urlparse.urlparse(config.BASE_URL).netloc
DISALLOWED_ARTICLE_PATHS = frozenset((
    'Category_Articles_with_hCards',
    'Category_Biography_with_signature',
))

TEMPLATE = '''
<!doctype html>
<html>
<head>
<meta charset="UTF-8">
<title>
from nltk import download as nltk_download
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import unidecode
import re
from pickle import load
import os

base_path = os.path.abspath(os.path.dirname(__file__))

nltk_download("stopwords", quiet=True)
nltk_download("rslp", quiet=True)

stop_words = stopwords.words("portuguese")
stemmer = RSLPStemmer()
tokenizer = TweetTokenizer(preserve_case=False,
                           strip_handles=True,
                           reduce_len=True)
min_length = 3


def get_data_path():
    data_dir = os.path.join(base_path)
    return data_dir


def valid(word):
def sentiment_analysis(self, text):
    if not op.exists(
            op.join(self.DATA_DIR, f'./sentiment/{self.LEXICON}.zip')):
        nltk_download('vader_lexicon', download_dir=self.DATA_DIR)
    return SentimentIntensityAnalyzer().polarity_scores(text)
def generate_word_list(api,
                       since=None,
                       until=None,
                       username=None,
                       user_location=False,
                       lists=False,
                       subscriptions=False,
                       mentions=False,
                       tweets_to=False,
                       tweets_from=False,
                       count=20,
                       location=None,
                       currentlocation=False,
                       trends=False,
                       expand_location=False,
                       loc_popular=False,
                       loc_recent=False,
                       radius=5,
                       globaltrends=False,
                       minwordlen=3,
                       outputdir=None,
                       all=False,
                       alternate_stoplist=False):
    global tokenizer
    global exclusions
    if alternate_stoplist:
        from custom_stoplist import stoplist as exclusions
    else:
        nltk_download('stopwords')
        nltk_download('punkt')
        exclusions = stopwords.words('english')
    tokenizer = TweetTokenizer()
    # date_regex = r"\d{4}(-\d{2}){2}"
    if count > 200:
        print(
            "Count lowest common denominator max is 200 but specified {0}. Setting to 200."
            .format(count))
        count = 200
    if all:
        user_location = lists = subscriptions = mentions = tweets_to = tweets_from = currentlocation = True
        trends = expand_location = loc_popular = loc_recent = globaltrends = True

    # TODO: evaluate if date/time boxing is practical to implement
    # if (since is not None and not re.match(date_regex, since)) or (until is not None and not re.match(date_regex, until)):
    #     ValueError("Dates must be specified in yyyy-mm-dd format")

    allWords = list()

    # TODO: I think black hills mentioned a more precise way to get geolocation... look into that
    # TODO: also, seems like twitter has a built in API to handle lookup by IP and lat/long to woeid
    geolookup_url = "http://ipinfo.io"

    # **** USER INFO ****
    if username is not None:
        user_info = list()

        # **** PROFILE INFO ****
        # get basic information about the user
        print("Pulling profile info for {0}".format(username))
        user = api.GetUser(screen_name=username)
        user_info.extend(
            [user.location, user.name, user.description, user.status.text])

        # get information for user's location (if available)
        if user.location is not None and user_location:
            print("Found associated location for {0}.".format(username))
            if trends:
                user_location_trends = get_geo_trends(api,
                                                      user.location,
                                                      expand=expand_location)
                user_info.extend([t.name for t in user_location_trends])
            print(
                "Getting mix of popular and recent tweets for {0} in {1} mile radius"
                .format(user.location, radius))
            loc = get_location(user.location)
            search_geocode = [
                loc.latitude, loc.longitude, "{0}mi".format(radius)
            ]
            user_location_tweets = api.GetSearch(geocode=search_geocode,
                                                 count=count)
            user_info.extend([t.text for t in user_location_tweets])

        # get timeline information for the user
        print("Pulling timeline info for {0}".format(username))
        user_timeline = api.GetUserTimeline(user_id=user.id, count=count)
        user_info.extend([s.text for s in user_timeline])

        # get favorites
        print("Pulling favorites info for {0}".format(username))
        faves = api.GetFavorites(user_id=user.id, count=count)
        user_info.extend([s.text for s in faves])

        # **** SEARCHES ****
        # these searches will all be a mix of popular and current, seems good enough for me
        if mentions:
            print("Pulling mentions for {0}".format(username))
            mentions_query = "@{0}".format(username)
            mentions_search = api.GetSearch(term=mentions_query, count=count)
            user_info.extend([s.text for s in mentions_search])
        if tweets_to:
            print("Pulling tweets to {0}".format(username))
            to_query = "to:{0}".format(username)
            to_search = api.GetSearch(term=to_query, count=count)
            user_info.extend([s.text for s in to_search])
        if tweets_from:
            print("Pulling tweets from {0}".format(username))
            from_query = "from:{0}".format(username)
            from_search = api.GetSearch(term=from_query, count=count)
            user_info.extend([s.text for s in from_search])

        # **** LISTS ****
        if lists:
            print("Pulling list timelines for {0}".format(username))
            # TODO: add count back in for list when fixed
            # https://github.com/bear/python-twitter/pull/646
            lists = api.GetLists(user_id=user.id)
            listTimelines = list()
            for entry in lists:
                listTimelines.extend(
                    api.GetListTimeline(list_id=entry.id, count=count))
            user_info.extend([l.text for l in listTimelines])
        if subscriptions:
            print("Pulling subscribed list timelines for {0}".format(username))
            subs = api.GetSubscriptions(user_id=user.id, count=count)
            subTimelines = list()
            for entry in subs:
                subTimelines.extend(
                    api.GetListTimeline(list_id=entry.id, count=count))
            user_info.extend([l.text for l in subTimelines])

        allWords.extend(clean_tweets(user_info, minwordlen))

    # **** LOCATION/TREND DATA ****
    # if specified, get geo data, if not, attempt to get current location
    if location is None and currentlocation:
        try:
            iplookup = requests.get(geolookup_url).json()
            location = "{0}, {1}, {2}, {3}".format(iplookup['city'],
                                                   iplookup['region'],
                                                   iplookup['postal'],
                                                   iplookup['country'])
            print(
                "No location specified, pulling from IP: {0}".format(location))
        except:
            print(
                "Unable to get geo IP information; skipping location based trend lookup"
            )
            location = None

    # location can be full address, city, county, state, zip, or country
    # this will attempt to expand out from location specified to country in reverse order
    if location is not None:
        loc = get_location(location)
        search_geocode = [loc.latitude, loc.longitude, "{0}mi".format(radius)]
        if trends:
            location_trends = get_geo_trends(api,
                                             location,
                                             expand=expand_location)
            allWords.extend(
                clean_tweets([t.name for t in location_trends], minwordlen))
        if loc_popular:
            print("Pulling popular tweets for {0} in {1} mile radius".format(
                location, radius))
            popular_tweets = api.GetSearch(geocode=search_geocode,
                                           result_type="popular",
                                           count=count)
            allWords.extend(
                clean_tweets([t.text for t in popular_tweets], minwordlen))
        if loc_recent:
            print("Pulling recent tweets for {0} in {1} mile radius".format(
                location, radius))
            recent_tweets = api.GetSearch(geocode=search_geocode,
                                          result_type="recent",
                                          count=count)
            allWords.extend(
                clean_tweets([t.text for t in recent_tweets], minwordlen))
        if not loc_popular and not loc_recent:
            print(
                "Pulling mixture of popular and recent tweets for {0} in {1} mile radius"
                .format(location, radius))
            mixed_tweets = api.GetSearch(geocode=search_geocode, count=count)
            allWords.extend(
                clean_tweets([t.text for t in mixed_tweets], minwordlen))

    # get worldwide trends
    if globaltrends == True:
        print("Pulling global trends")
        trends = api.GetTrendsCurrent()
        allWords.extend(clean_tweets([t.name for t in trends], minwordlen))

    # this will effectively handle deduplication and frequency of occurrence ordering
    if outputdir is None:
        return dict(Counter(allWords).most_common())
    else:
        datestr = datetime.now().isoformat()[:-7]
        filename = "{0}{1}{2}.csv".format(
            username + "_" if username is not None else "",
            location + "_" if location is not None else "", datestr)
        # strip out potentially bad chars and spaces b/c nobody wants that
        filename = re.sub(r'[^\w\-\_\.]', '_', filename)
        outfile = path.join(outputdir, filename)
        fieldnames = ['Word', 'Occurrences']
        outdict = convert_tuple_to_dict(
            Counter(allWords).most_common(), fieldnames)
        with open(outfile, mode="w", encoding="UTF-8", newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(outdict)
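A hypothetical invocation of generate_word_list; the python-twitter credentials and the handle below are placeholders, not values from the original project.

import twitter

api = twitter.Api(consumer_key='...', consumer_secret='...',
                  access_token_key='...', access_token_secret='...')
# Returns a {word: occurrence_count} dict when no outputdir is given.
word_counts = generate_word_list(api, username='example_handle',
                                 mentions=True, tweets_from=True, count=100)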