Example #1
def run(input, client, output_path, index_name):
    INPUT = str(input)
    CLIENT = lib.create_connection(client)
    OUTPUT_PATH = output_path
    INDEX_NAME = index_name
    MODE = "url"

    if not lib.check_index(client=CLIENT, index=INDEX_NAME):
        lib.logger.debug(f"{INDEX_NAME} not found.")
        return Exception(f"{INDEX_NAME} not found.")

    website = newspaper(INPUT)
    fulltext = website.article

    try:
        nltk_download('punkt')
    except Exception:
        # ignore download errors: 'punkt' may already be installed or the host may be offline
        pass
    sentences = tokenize.sent_tokenize(fulltext.strip())

    scores_sentences = lib.get_scores(CLIENT, INDEX_NAME, sentences)
    format_scores_sentences = lib.format_scores(sentences, scores_sentences)
    result = lib.save_result(fulltext, INDEX_NAME, INPUT,
                             format_scores_sentences, OUTPUT_PATH, MODE)
    return result
Example #2
def run(input, client, output_path, index_name):
    INPUT = str(input)
    CLIENT = lib.create_connection(client)
    OUTPUT_PATH = output_path
    INDEX_NAME = index_name
    MODE = "text"

    if not lib.check_index(client=CLIENT, index=INDEX_NAME):
        lib.logger.debug(f"{INDEX_NAME} not found.")
        return Exception(f"{INDEX_NAME} not found.")

    fulltext = INPUT.replace('\r',
                             ' ').replace('\n',
                                          ' ').replace("”",
                                                       '"').replace("’", "'")

    try:
        nltk_download('punkt')
    except Exception:
        # ignore download errors: 'punkt' may already be installed or the host may be offline
        pass
    sentences = tokenize.sent_tokenize(fulltext.strip())

    scores_sentences = lib.get_scores(CLIENT, INDEX_NAME, sentences)
    format_scores_sentences = lib.format_scores(sentences, scores_sentences)
    result = lib.save_result(fulltext, INDEX_NAME, INPUT,
                             format_scores_sentences, OUTPUT_PATH, MODE)
    return result
Example #3
def get_nltk_data():
    data = [
        "twitter_samples",
        "punkt",
        "wordnet",
        "averaged_perceptron_tagger",
        "stopwords",
    ]
    for each in data:
        nltk_download(each)
Example #4
    def __init__(self, *args, **kwargs) -> None:
        """Initialise SentimentAnalysis instance, ensure we have downloaded required data."""
        # Ensure we have the various corpora that we need for analysis works.
        text_blob_download()
        nltk_download('vader_lexicon')

        # Initialise the Vader sentiment analysis tool
        self.analyser = SentimentIntensityAnalyzer()

        super().__init__(*args, **kwargs)  # type: ignore
Example #5
 def eval_pipeline(self, pkl_filename, title, content):
     """ Evaluate a sample via the pipeline model """
     nltk_download('wordnet')
     nltk_download('stopwords')
     pipeline = load(pkl_filename)
     self.bow_transformer = pipeline['vectorizer']
     self.model = pipeline['nbclassifier']
     self.labelencoder = pipeline['labelenc']
     return self.eval(title, content)
Example #6
    def train(self):
        """
        Create the LDA model with procurements and their tender descriptions.
        Grouped the procurements by the generated topics
        Dump it to a cache json file
        :return: None
        """
        nltk_download('wordnet')
        # remove the unwanted words for all tender description
        # materialise the preprocessed descriptions in a list so they can be
        # iterated more than once (for the dictionary and for the BoW corpus)
        processed_tender_descriptions = [
            self.preprocess(p.tender_description)
            for p in data_holder.procurements
        ]
        dictionary = gensim.corpora.Dictionary(processed_tender_descriptions)

        # filter out words that appear in more than 25% of the documents
        dictionary.filter_extremes(no_above=0.25)

        # for each document, create a list of (word id, count) tuples
        bow_corpus = [
            dictionary.doc2bow(doc) for doc in processed_tender_descriptions
        ]

        # create the LDA model
        lda_model = gensim.models.LdaMulticore(bow_corpus,
                                               num_topics=10,
                                               id2word=dictionary,
                                               passes=2,
                                               workers=2)

        # create a list of 10 empty lists, one per topic
        topic_to_procurements = [[] for _ in range(10)]

        # group similar tender numbers together
        for p in data_holder.procurements:
            unseen_document = p.tender_description
            bow_vector = dictionary.doc2bow(self.preprocess(unseen_document))

            # sort by descending probability, highest first
            sorted_probabilities = sorted(lda_model[bow_vector],
                                          key=lambda tup: -1 * tup[1])
            topic_with_highest_prob = sorted_probabilities[0]
            topic_index = topic_with_highest_prob[0]
            # put the tender number into the topic with the highest probability
            topic_to_procurements[topic_index].append(p.tender_no)

        self.group_topic_procurements(topic_to_procurements)
        # save the cached result as json
        with open(self.CACHE_JSON_FILE_NAME, 'w') as outfile:
            json.dump(topic_to_procurements, outfile)
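The bag-of-words step above produces the (word id, count) tuples mentioned in the comment; here is a minimal, self-contained sketch of that transformation with gensim, using toy documents rather than the procurement data:

from gensim.corpora import Dictionary

docs = [["supply", "of", "office", "chairs"],
        ["supply", "of", "laptops", "and", "laptops"]]
dictionary = Dictionary(docs)
# doc2bow returns one (word id, count) pair per distinct token in the document
print(dictionary.doc2bow(docs[1]))  # e.g. [(1, 1), (3, 1), (4, 1), (5, 2)]; the ids depend on how the dictionary assigned them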
Example #7
def get_stop_words():
    try:
        return stopwords.words('english')

    except LookupError:
        from nltk import download as nltk_download
        import warnings

        # christ this is verbose...
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            nltk_download('stopwords')

    return stopwords.words('english')
Example #9
def setup_directories(processed_path, nltk_data_path):
    """Delete and recreate the directory for processed files, just in case.
    It's okay to keep file data in memory if it is small; however,
    I would save/upload it somewhere if the data is large."""

    if os.path.exists(processed_path):
        shutil.rmtree(processed_path)

    os.makedirs(processed_path)

    if not os.path.exists(nltk_data_path):
        os.makedirs(nltk_data_path)
        nltk_download(['punkt', 'stopwords'], download_dir=nltk_data_path)

    nltk_path.append(nltk_data_path)
Example #10
 def __init__(self, crawler):
     self.stats = crawler.stats
     s = crawler.settings
     # Defaults to wherever NLTK normally keeps its data
     self.dir = s.get('NLTKDATA_DIR', None)
     # Ensure necessary NLTK data is present. This can be persisted across
     # runs using DotScrapy Persistence if on ScrapingHub.
     # TODO: Should check if they're present by trying to load them, then
     #       download on exception. nltk.download() checks if up to date
     #       as well, so this generates latency and network traffic.
     for pkg in [
             'punkt',
             'words',
     ]:
         nltk_download(pkg, download_dir=self.dir)
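A minimal sketch of the check-then-download approach described in the TODO above, assuming the same 'punkt' and 'words' packages (the helper name is hypothetical):

from nltk import download as nltk_download
from nltk.data import find as nltk_find

def ensure_nltk_data(download_dir=None):
    # look each package up locally first and only download on a LookupError,
    # avoiding nltk.download()'s per-run network round trip
    for pkg, resource in [('punkt', 'tokenizers/punkt'), ('words', 'corpora/words')]:
        try:
            nltk_find(resource)
        except LookupError:
            nltk_download(pkg, download_dir=download_dir)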
Example #11
    def spellcheck(self, books, mode='simple'):
        if mode not in ['simple', 'complex']:
            raise ValueError('Mode must be one of "simple", "complex"')
        nltk_download('punkt')
        sym_spell = SymSpell(max_dictionary_edit_distance=0, prefix_length=7)
        dictionary_path = resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")
        if not sym_spell.load_dictionary(
                dictionary_path, term_index=0, count_index=1):
            return books

        books = [
            book for book in books
            if self.hasNoSpellingErrors(book, sym_spell, mode)
        ]
        return books, self.set_unknown, self.set_known
Example #12
def check_nltk_packages(dl_path='.nltk_data', quiet=True):
    from nltk import download as nltk_download
    reqs = []
    for req in reqs:
        try:
            find_nltk_package(req, paths=[dl_path, '~/nltk_data'])
        except LookupError as e:
            logger.error(f'NLTK: Unable to find package `{req}`. ')
            logger.info(f'Downloading NLTK package `{req}`')
            try:
                nltk_download(req, download_dir=dl_path, quiet=quiet)
            except Exception as e:
                logger.error(
                    f'NLTK: An error occurred while downloading package `{req}`.'
                )
                logger.error(e)
    return True
Example #13
    def train(self, train_csv, test_size=0.2):
        self.processed_count = 0
        nltk_download('wordnet')
        nltk_download('stopwords')

        self.df = pd.read_csv(train_csv)
        self.df['title_content'] = self.df['title'] + ' ' + self.df['content']
        self.x = self.df['title_content']
        self.y = self.df['publication']

        self.lemmatizer = WordNetLemmatizer()
        # encode publications
        self.labelencoder = LabelEncoder()
        self.y = self.labelencoder.fit_transform(self.y)

        x_train, x_test, y_train, y_test = train_test_split(self.x,
                                                            self.y,
                                                            test_size=test_size,
                                                            random_state=12345)
        self.total_files_to_process = x_train.shape[0] + x_test.shape[0]

        print(f'x_train shape: {x_train.shape}\nx_test_shape: {x_test.shape}\n'
              f'y_train_shape: {y_train.shape}\ny_test_shape: {y_test.shape}\n')

        self.bow_transformer = CountVectorizer(analyzer=self.process_text)
        text_bow_train = self.bow_transformer.fit_transform(x_train)
        text_bow_test = self.bow_transformer.transform(x_test)
        print(f'bow_train_shape: {text_bow_train.shape}\n'
              f'bow_test_shape: {text_bow_test.shape}\n')
        # multinomial naive bayes model
        self.model = MultinomialNB()
        # train the model
        self.model = self.model.fit(text_bow_train, y_train)
        print(self.model.score(text_bow_train, y_train))
        # validation
        print(self.model.score(text_bow_test, y_test))
        test_predictions = self.model.predict(text_bow_test)
        print(classification_report(y_test, test_predictions))
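For symmetry with the earlier eval_pipeline() snippet, which loads 'vectorizer', 'nbclassifier' and 'labelenc' from a single pickle, here is a hedged sketch of how the objects fitted above could be persisted in that layout; joblib and the function name are assumptions, not part of the original:

from joblib import dump

def save_pipeline(trainer, pkl_filename):
    # bundle the fitted pieces under the keys the loader expects
    dump({'vectorizer': trainer.bow_transformer,
          'nbclassifier': trainer.model,
          'labelenc': trainer.labelencoder}, pkl_filename)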
Example #14
from atexit import register
from bs4 import BeautifulSoup as soup
from collections import Counter, OrderedDict
from copy import deepcopy
from datetime import datetime as dt, timedelta
from date_extractor import extract_date
from math import sqrt, exp
from numpy import array as ndarray, errstate
from nltk import download as nltk_download, pos_tag, word_tokenize
from nltk.corpus import stopwords
try:
    stopwords.words('english')
    pos_tag(word_tokenize('check'))
except LookupError:
    nltk_download('stopwords')
    nltk_download('punkt')
    nltk_download('averaged_perceptron_tagger')
from os import makedirs, remove, chmod
from os.path import dirname, abspath, exists, join
from pickle import load as pload, dump as pdump
from regex import findall, sub, compile, DOTALL, match
from requests import get
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from sklearn.cluster import KMeans, AgglomerativeClustering, FeatureAgglomeration
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sys import stdout, maxsize, platform
from time import strftime, sleep, time
from traceback import format_exc
Example #15
import numpy as np
import os
import xml.etree.ElementTree as ET
import pickle
import copy
from tqdm import tqdm
from collections import Counter
from nltk.stem import WordNetLemmatizer
from nltk import download as nltk_download
nltk_download('wordnet')
""" Here, we process the ICSI files into data structures usable for our analysis.
The following file types are in the ICSI corpus:

Dialogue acts files, e.g. Bdb001.A.dialogue-acts.xml
    - Contain a list of dialogue acts for each (part of) a meeting
    - Meeting number is indicated by Bdb001
    - Part of meeting is defined by A

    - File contains the nite:root structure
        - This contains many <dialogueact> structures
            - Each dialogue act has a nite:id - id of dialogue act
            - start time
            - end time
            - type (this is a tag corresponding to the function of the dialogue act)
            - adjacency (this is a tag corresponding to the adjacency pair this dialogue act belongs to - save as string now, and process later)
    - original type (what's this? not clear, but we will save it anyway)
            - participant
            - A child structure, which refers to the words, as in "Bdb001.A.words.xml#id(Bdb001.w.701)..id(Bdb001.w.702)"
                - We should decompose this to
                - File does not need to be listed, since word indices are unique - file: Bdb001.A.words.xml (or file_id: Bdb001.A.words)
                - and word_index_start
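A hedged sketch (not part of the original snippet) of the decomposition described above: splitting a word reference such as "Bdb001.A.words.xml#id(Bdb001.w.701)..id(Bdb001.w.702)" into the words file id and the start/end word ids. The helper name and regex are assumptions.

import re

def split_word_reference(href):
    # "<file>.xml#id(<start>)..id(<end>)"; single-word references omit the "..id(...)" part
    match = re.match(r'(?P<file>.+)\.xml#id\((?P<start>[^)]+)\)(?:\.\.id\((?P<end>[^)]+)\))?', href)
    file_id = match.group('file')   # e.g. "Bdb001.A.words"
    start = match.group('start')    # e.g. "Bdb001.w.701"
    end = match.group('end') or start
    return file_id, start, end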
Example #16
#!/usr/bin/python3

from os import path, getenv
from dotenv import load_dotenv
from sqlalchemy import create_engine
from googletrans.constants import LANGCODES
from nltk.data import find as nltk_find
from nltk import download as nltk_download

try:
    nltk_find('corpora/stopwords')
except LookupError:
    nltk_download('stopwords')
finally:
    from nltk.corpus import stopwords

##################
# File Locations #
##################
DEFAULT_DIR = path.dirname(path.abspath(__file__))
PATH_DB = path.join(path.dirname(__file__), './log/quotes.db')
PATH_WOTD = path.join(path.dirname(__file__), './docs/wordoftheday.txt')
ENGINE = create_engine('sqlite:///./log/quotes.db', echo=False)

##################
# Bot & API Info #
##################
load_dotenv()
TOKEN = getenv('DISCORD_TOKEN')
POC_TOKEN = getenv('POC_TOKEN')
GUILD = getenv('DISCORD_GUILD')
Example #17
def download_requirements():
    # Verify Python version
    from sys import version_info
    from sys import exit
    if version_info <= (3, 0):
        print(
            "\nUse python version 3.0 or higher. ( Code was tested with 3.5.2)"
        )
        print("--> EXITING")
        exit(1)
    try:
        # Verify that we have the required NLTK packages
        from nltk.data import find as nltk_find
        nltk_find('tokenizers/punkt')
        nltk_find('corpora/wordnet')
        nltk_find('corpora/wordnet_ic')
        nltk_find('taggers/averaged_perceptron_tagger')
        nltk_find('corpora/brown')
        nltk_find('corpora/stopwords')
    except LookupError:
        print("Download packages since some packages were missing")
        from nltk import download as nltk_download
        nltk_download('wordnet')
        nltk_download('wordnet_ic')
        nltk_download('punkt')
        nltk_download('averaged_perceptron_tagger')
        nltk_download('stopwords')
        nltk_download('brown')
Example #18
import csv
from nltk import download as nltk_download
from os import path, listdir, mkdir
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
import cPickle as pickle
from make_classifier import get_classifier, get_ticker, filter_stock, best_bigram_word_feats

###################################################################
# VARIABLE INITIALIZATION
###################################################################

files = listdir('data_to_be_classified')
files = [file for file in files if file != '.keep']
classifier = get_classifier()
nltk_download('stopwords')
nltk_download('punkt')

###################################################################
# MAIN PROGRAM
###################################################################

while True:
    view_classifier_or_classify = raw_input(
        'Do you want to view information about the classifier or classify new data?(view/new/break) \n'
    )

    if view_classifier_or_classify == 'break':
        print('Thanks for using the program. Goodbye!')
        break
    elif view_classifier_or_classify == 'new':
Example #19
nltk_path.append(config.NLTK_DATA_PATH)
nltk_to_download = []
try:
    stopwords.words('english')
except LookupError:
    nltk_to_download.append('stopwords')
try:
    word_tokenize('token test')
except LookupError:
    nltk_to_download.append('punkt')
if nltk_to_download:
    print 'Performing first-time setup'
    from nltk import download as nltk_download
    for package in nltk_to_download:
        print '\tDownloading:', package
        nltk_download(package)
STOPWORDS = frozenset(stopwords.words('english')) | frozenset('.,:()&[]?%;')
STEMMER = PorterStemmer()

BASE_URL_DOMAIN = urlparse.urlparse(config.BASE_URL).netloc
DISALLOWED_ARTICLE_PATHS = frozenset((
    'Category_Articles_with_hCards', 'Category_Biography_with_signature',
))


TEMPLATE = '''
<!doctype html>
<html>
	<head>
		<meta charset="UTF-8">
		<title>
Example #20
from nltk import download as nltk_download
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import unidecode
import re
from pickle import load
import os

base_path = os.path.abspath(os.path.dirname(__file__))

nltk_download("stopwords", quiet=True)
nltk_download("rslp", quiet=True)

stop_words = stopwords.words("portuguese")
stemmer = RSLPStemmer()

tokenizer = TweetTokenizer(preserve_case=False,
                           strip_handles=True,
                           reduce_len=True)
min_length = 3


def get_data_path():
    return base_path


def valid(word):
Example #21
    def sentiment_analysis(self, text):
        if not op.exists(
                op.join(self.DATA_DIR, f'./sentiment/{self.LEXICON}.zip')):
            nltk_download('vader_lexicon', download_dir=self.DATA_DIR)

        return SentimentIntensityAnalyzer().polarity_scores(text)
Example #22
def generate_word_list(api,
                       since=None,
                       until=None,
                       username=None,
                       user_location=False,
                       lists=False,
                       subscriptions=False,
                       mentions=False,
                       tweets_to=False,
                       tweets_from=False,
                       count=20,
                       location=None,
                       currentlocation=False,
                       trends=False,
                       expand_location=False,
                       loc_popular=False,
                       loc_recent=False,
                       radius=5,
                       globaltrends=False,
                       minwordlen=3,
                       outputdir=None,
                       all=False,
                       alternate_stoplist=False):
    global tokenizer
    global exclusions
    if alternate_stoplist:
        from custom_stoplist import stoplist as exclusions
    else:
        nltk_download('stopwords')
        nltk_download('punkt')
        exclusions = stopwords.words('english')
    tokenizer = TweetTokenizer()
    #date_regex = r"\d{4}(-\d{2}){2}"
    if count > 200:
        print(
            "Count lowest common denominator max is 200 but specified {0}. Setting to 200."
            .format(count))
        count = 200
    if all:
        user_location = lists = subscriptions = mentions = tweets_to = tweets_from = currentlocation = True
        trends = expand_location = loc_popular = loc_recent = globaltrends = True
    # TODO: evaluate if date/time boxing is practical to implement
    #if (since is not None and not re.match(date_regex,since)) or (until is not None and not re.match(date_regex,until)):
    #    ValueError("Dates must be specified in yyyy-mm-dd format")
    allWords = list()
    # TODO: I think black hills mentioned a more precise way to get geolocation... look into that
    # TODO: also, seems like twitter has a built in API to handle lookup by IP and lat/long to woeid
    geolookup_url = "http://ipinfo.io"
    # **** USER INFO ****
    if username is not None:
        user_info = list()
        # **** PROFILE INFO ****
        # get basic information about the user
        print("Pulling profile info for {0}".format(username))
        user = api.GetUser(screen_name=username)
        user_info.extend(
            [user.location, user.name, user.description, user.status.text])
        # get information for user's location (if available)
        if user.location is not None and user_location:
            print("Found associated location for {0}.".format(username))
            if trends:
                user_location_trends = get_geo_trends(api,
                                                      user.location,
                                                      expand=expand_location)
                user_info.extend([t.name for t in user_location_trends])
            print(
                "Getting mix of popular and recent tweets for {0} in {1} mile radius"
                .format(user.location, radius))
            loc = get_location(user.location)
            search_geocode = [
                loc.latitude, loc.longitude, "{0}mi".format(radius)
            ]
            user_location_tweets = api.GetSearch(geocode=search_geocode,
                                                 count=count)
            user_info.extend([t.text for t in user_location_tweets])
        # get timeline information for the user
        print("Pulling timeline info for {0}".format(username))
        user_timeline = api.GetUserTimeline(user_id=user.id, count=count)
        user_info.extend([s.text for s in user_timeline])
        # get favorites
        print("Pulling favorites info for {0}".format(username))
        faves = api.GetFavorites(user_id=user.id, count=count)
        user_info.extend([s.text for s in faves])
        # **** SEARCHES ****
        # these searches will all be a mix of popular and current, seems good enough for me
        if mentions:
            print("Pulling mentions for {0}".format(username))
            mentions_query = "@{0}".format(username)
            mentions_search = api.GetSearch(term=mentions_query, count=count)
            user_info.extend([s.text for s in mentions_search])
        if tweets_to:
            print("Pulling tweets to {0}".format(username))
            to_query = "to:{0}".format(username)
            to_search = api.GetSearch(term=to_query, count=count)
            user_info.extend([s.text for s in to_search])
        if tweets_from:
            print("Pulling tweets from {0}".format(username))
            from_query = "from:{0}".format(username)
            from_search = api.GetSearch(term=from_query, count=count)
            user_info.extend([s.text for s in from_search])
        # **** LISTS ****
        if lists:
            print("Pulling list timelines for {0}".format(username))
            # TODO: add count back in for list when fixed
            # https://github.com/bear/python-twitter/pull/646
            lists = api.GetLists(user_id=user.id)
            listTimelines = list()
            for entry in lists:
                listTimelines.extend(
                    api.GetListTimeline(list_id=entry.id, count=count))
            user_info.extend([l.text for l in listTimelines])
        if subscriptions:
            print("Pulling subscribed list timelines for {0}".format(username))
            subs = api.GetSubscriptions(user_id=user.id, count=count)
            subTimelines = list()
            for entry in subs:
                subTimelines.extend(
                    api.GetListTimeline(list_id=entry.id, count=count))
            user_info.extend([l.text for l in subTimelines])
        allWords.extend(clean_tweets(user_info, minwordlen))
    # **** LOCATION/TREND DATA ****
    # if specified, get geo data, if not, attempt to get current location
    if location is None and currentlocation:
        try:
            iplookup = requests.get(geolookup_url).json()
            location = "{0}, {1}, {2}, {3}".format(iplookup['city'],
                                                   iplookup['region'],
                                                   iplookup['postal'],
                                                   iplookup['country'])
            print(
                "No location specified, pulling from IP: {0}".format(location))
        except Exception:
            print(
                "Unable to get geo IP information; skipping location based trend lookup"
            )
            location = None
    # location can be full address, city, county, state, zip, or country
    # this will attempt to expand out from location specified to country in reverse order
    if location is not None:
        loc = get_location(location)
        search_geocode = [loc.latitude, loc.longitude, "{0}mi".format(radius)]
        if trends:
            location_trends = get_geo_trends(api,
                                             location,
                                             expand=expand_location)
            allWords.extend(
                clean_tweets([t.name for t in location_trends], minwordlen))
        if loc_popular:
            print("Pulling popular tweets for {0} in {1} mile radius".format(
                location, radius))
            popular_tweets = api.GetSearch(geocode=search_geocode,
                                           result_type="popular",
                                           count=count)
            allWords.extend(
                clean_tweets([t.text for t in popular_tweets], minwordlen))
        if loc_recent:
            print("Pulling recent tweets for {0} in {1} mile radius".format(
                location, radius))
            recent_tweets = api.GetSearch(geocode=search_geocode,
                                          result_type="recent",
                                          count=count)
            allWords.extend(
                clean_tweets([t.text for t in recent_tweets], minwordlen))
        if not loc_popular and not loc_recent:
            print(
                "Pulling mixture of popular and recent tweets for {0} in {1} mile radius"
                .format(location, radius))
            mixed_tweets = api.GetSearch(geocode=search_geocode, count=count)
            allWords.extend(
                clean_tweets([t.text for t in mixed_tweets], minwordlen))
    # get worldwide trends
    if globaltrends:
        print("Pulling global trends")
        global_trends = api.GetTrendsCurrent()
        allWords.extend(clean_tweets([t.name for t in global_trends], minwordlen))
    # this will effectively handle deduplication and frequency of occurrence ordering
    if outputdir is None:
        return dict(Counter(allWords).most_common())
    else:
        datestr = datetime.now().isoformat()[:-7]
        filename = "{0}{1}{2}.csv".format(
            username + "_" if username is not None else "",
            location + "_" if location is not None else "", datestr)
        # strip out potentially bad chars and spaces b/c nobody wants that
        filename = re.sub(r'[^\w\-\_\.]', '_', filename)
        outfile = path.join(outputdir, filename)
        fieldnames = ['Word', 'Occurrences']
        outdict = convert_tuple_to_dict(
            Counter(allWords).most_common(), fieldnames)
        with open(outfile, mode="w", encoding="UTF-8", newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(outdict)
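As the comment near the end notes, the final Counter call handles deduplication and orders words by frequency of occurrence; a tiny illustration of that behaviour:

from collections import Counter

words = ["alpha", "beta", "alpha", "gamma", "alpha", "beta"]
# most_common() yields each unique word once, ordered by descending count
print(dict(Counter(words).most_common()))  # {'alpha': 3, 'beta': 2, 'gamma': 1}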