def create_feature_sets():
    # get the training set
    if "NLTK_PROXY" in os.environ:
        logger.debug("Using proxy %s" % os.environ["NLTK_PROXY"])
        nltk.set_proxy(os.environ["NLTK_PROXY"])
    nltk.download('movie_reviews')
    from nltk.corpus import movie_reviews

    logger.debug("Building data set...")
    # build list of words (1 list per doc) and pos/neg category
    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)

    # extract list of features from most common words
    all_words = []
    for w in movie_reviews.words():
        all_words.append(w.lower())
    all_words = nltk.FreqDist(all_words)
    logger.debug("Frequency dist of 15 most common words:%s" % all_words.most_common(15))
    logger.debug("Frequency of 'stupid':%d" % all_words["stupid"])
    word_features = list(all_words.keys())[:3000]

    # save feature list
    feature_fn = "features.pickle"
    logger.debug("Saving feature list to %s..." % feature_fn)
    features_f = open(feature_fn, "wb")
    pickle.dump(word_features, features_f)
    features_f.close()

    # build list of feature sets from data
    return [(find_features(word_features, rev), category)
            for (rev, category) in documents]
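# The function above relies on a find_features() helper that is not shown
# here. A hedged sketch of what it presumably looks like (an assumption, not
# the project's own definition): the classic NLTK presence-feature extractor
# that maps every candidate word to whether it occurs in the document.
def find_features(word_features, document):
    words = set(document)
    return {w: (w in words) for w in word_features}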
def _download_nltk_data():
    global is_done_loading
    proxy_address = _get_proxy_address()
    if proxy_address:
        nltk.set_proxy(proxy_address)
    nltk.download(NLTK_DATA, download_dir=nltk_data_dir(), quiet=True)
    is_done_loading = True
    sys.stdout.flush()
def init():
    isProxyNeeded = input("proxy required?(yes/no):")
    print(isProxyNeeded)
    if isProxyNeeded == 'yes':
        proxyDetails = input("provide proxy details:")
        nltk.set_proxy(proxyDetails)
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
def back_translate_data(text):
    # augment Vietnamese text by translating to English and back
    try:
        nltk.set_proxy('http://s5.cyberspace.vn:3128')
        blob = TextBlob(text)
        str_en_translate = blob.translate(from_lang='vi', to='en')
        time.sleep(1)
        n_blob = TextBlob(str(str_en_translate))
        str_vn_back_translate = n_blob.translate(from_lang='en', to='vi')
        time.sleep(1)
    except Exception:
        return None
    return str_vn_back_translate
def conectarServidor():
    print("\n ==============================================================\n")
    print(" Connecting to the university server ")
    print("\n ==============================================================\n")
    try:
        nltk.set_proxy(url, (nickname, clave))
    except Exception:
        print(" Communication error with the server ")
        return
    print(" Server connected OK \n\n")
    conexionSSH()
    return
def gerarNuvemTagss(text, background):
    if background == 1:
        dia_noite = 'white'
    else:
        dia_noite = 'black'
    if webscrapin01.proxyy.lower() == 'y':
        nltk.set_proxy('http://' + webscrapin01.user_proxy + ':' +
                       webscrapin01.pass_proxy + '@10.251.250.250:3128')
    nltk.download('punkt')
    nltk.download('stopwords')
    # remove special characters and strip accents
    review = re.sub('[^a-zA-Z]', ' ', str(unidecode(text)))
    # convert the text to lower case
    review = review.lower()
    # build a list of words
    review = review.split()
    # prepare common words and punctuation to be removed from the list
    stopw = (stopwords.words('portuguese') + list(punctuation))
    # add common words not covered by the standard stopword list
    addExcecoes = [
        'ha', 'ver', 'alguns', 'ate', 'aqui', 'la', 'sendo', 'estar', 'novo', 'vez',
        'desses', 'quero', 'deixou', 'geral', 'cada', 'boa', 'outras', 'certo',
        'dentro', 'deixe', 'tudo', 'pronto', 'toda', 'manter', 'locais', 'deste',
        'sob', 'agora', 'diz', 'assim', 'daquele', 'tanto', 'busca', 'estao', 'deve',
        'todo', 'ser', 'tambem', 'nao', 'quase', 'forma', 'qualquer', 'sem', 'nessa',
        'ja', 'outro', 'outros', 'sao', 'ainda', 'parte', 'onde', 'tantos', 'todas',
        'existem', 'apresento', 'adiante', 'antes', 'depois', 'durante', 'mesmos',
        'todos', 'causa', 'pode', 'prontos', 'nessas', 'nesses', 'proximo', 'ter',
        'alguma', 'sido', 'ido', 'subiu', 'pediu', 'chegou', 'existe', 'veio', 'vai',
        'duas', 'entao', 'por que'
    ]
    # extend the standard stopword list with the extra exceptions
    stopw.extend(addExcecoes)
    # remove common words from the word list built from the input text
    palavras_sem_stopwords = [w for w in review if w not in stopw]
    # join the remaining words back into a single text
    review = ' '.join(palavras_sem_stopwords)
    # generate the word cloud
    wordcloud = WordCloud(width=1366, height=768,
                          background_color=dia_noite).generate(str(review))
    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation='lanczos')
    plt.axis("off")
    plt.show()
def configure(self, config):
    self._config = config
    self._log = logging.getLogger(self.__class__.__name__)
    # self._log.setLevel(logging.INFO)
    # i = logging.StreamHandler(sys.stdout)
    # e = logging.StreamHandler(sys.stderr)
    # i.setLevel(logging.INFO)
    # e.setLevel(logging.ERROR)
    # self._log.addHandler(i)
    # self._log.addHandler(e)
    self._issue_pattern_jira = re.compile('(?P<ID>[A-Z][A-Z0-9_]+-[0-9]+)', re.M)
    self._issue_pattern_bugzilla = re.compile(
        '((bug|issue|bugzilla)[s]*[#\s]*(?P<ID>[0-9]+))|(bugzilla\/show_bug\.cgi\?id=(?P<ID2>[0-9]+))',
        re.I | re.M)
    self._issue_pattern_github = re.compile(
        '((bug|issue)[s]*[#\s]*(?P<ID>[0-9]+))|(issues\/(?P<ID2>[0-9]+))',
        re.I | re.M)

    # direct classifiers
    self._keyword_classifier = Keyword_Classifier()
    self._test_classifier = Test_Classifier(self._log)
    self._documentation_classifier = Documentation_Classifier(self._log)
    self._refactoring_classifier = Refactoring_Classifier(self._log)

    # ml-classifiers
    if config['args'].proxy_host:
        nltk.set_proxy('http://{}:{}'.format(config['args'].proxy_host,
                                             config['args'].proxy_port))
    nltk.download('stopwords')
    nltk.download('punkt')
    nltk.download('wordnet')

    cur_dir = os.path.dirname(__file__)
    csv_file = os.path.join(cur_dir, 'vibscc_util/files', 'CCDataSet.csv')
    try:
        train_df = read_csv_df(csv_file)
        self._ml_classifiers_exec = ML_Classifiers_Exec(train_df)
        self._log.info("Starting ml-classifiers training")
        self._ml_classifiers_exec.train_classifiers()
    except Exception as exception:
        self._log.error(exception)
        sys.exit()
def applyConfig(self):
    """ apply configuration """
    os.environ['HTTP_PROXY'] = ''
    os.environ['HTTPS_PROXY'] = ''
    try:
        newspaper_config = newspaper.Config()
        newspaper_config.memoize_articles = True
        newspaper_config.http_success_only = True
        newspaper_config.fetch_images = False
        newspaper_config.number_threads = 2
        newspaper_config.browser_user_agent = self.configData['user_agent']
        newspaper_config.request_timeout = self.configData['fetch_timeout']
        # add this to config data
        self.configData['newspaper_config'] = newspaper_config
        # set OS environment variables for proxy server:
        if len(self.configData['proxy_url_http']) > 3 and len(
                self.configData['proxy_url_https']) > 3:
            os.environ['HTTP_PROXY'] = self.configData['proxy_url_http']
            os.environ['HTTPS_PROXY'] = self.configData['proxy_url_https']
            self.configData['proxies'] = {
                "http": self.configData['proxy_url_http'],
                "https": self.configData['proxy_url_https']
            }
        # else:
        #     print("INFO: Not using any proxy servers: ",
        #           self.configData['proxy_url_http'], " or ",
        #           self.configData['proxy_url_https'])

        # note: nltk.set_proxy() normally takes a single proxy URL string rather than a dict
        nltk.set_proxy(self.configData['proxies'])
        self.configData['newspaper_config'].proxies = self.configData['proxies']
        # print("INFO: For NLTK, using Proxy configuration: ", nltk.getproxies())
    except Exception as e:
        print("ERROR: Unable to set proxy parameters: %s" % e)
def check_nltk():
    try:
        from nltk.tokenize import word_tokenize
        word_tokenize('It\'s.')
    except Exception:
        import nltk
        if not sslVerify:
            from ssl import _create_unverified_context
            from six.moves.urllib.request import install_opener, HTTPSHandler, build_opener
            # TODO: This still needs proxy support!
            ctx = _create_unverified_context()
            opener = build_opener(HTTPSHandler(context=ctx))
            install_opener(opener)
        if 'HTTP_PROXY' in os.environ:
            nltk.set_proxy(os.environ.get('HTTP_PROXY'))
        nltk.download('punkt')
    return
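# The TODO above notes that the unverified-SSL opener ignores the proxy. One
# possible way to cover both in a single opener (a sketch under assumptions,
# not the project's implementation) is to add a ProxyHandler alongside the
# HTTPSHandler:
import os
from ssl import _create_unverified_context
from six.moves.urllib.request import (HTTPSHandler, ProxyHandler,
                                      build_opener, install_opener)

handlers = [HTTPSHandler(context=_create_unverified_context())]
if 'HTTP_PROXY' in os.environ:
    proxy = os.environ['HTTP_PROXY']
    handlers.append(ProxyHandler({'http': proxy, 'https': proxy}))
install_opener(build_opener(*handlers))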
def translate(comments, language, proxy_url):
    nltk.set_proxy(proxy_url)

    def fetch_translate_with_delay(comment, delay=0.5):
        if hasattr(comment, "decode"):
            comment = comment.decode("utf-8")
        text = TextBlob(comment)
        try:
            text = text.translate(to=language)
            print("intermediate translation: ", text)
            text = text.translate(to="en")
            time.sleep(delay)  # pause between translation calls; requires `time` to be imported
            return text
        except Exception as e:
            print(e)
            if isinstance(e, KeyboardInterrupt):
                return None

    res = []
    for comment in comments:
        res.append(fetch_translate_with_delay(comment))
    if len(res) == 1:
        # single comment
        return res[0]
    else:
        # multiple comments
        return res
def applyNetworkConfig(self):
    """ Apply configuration for networking """
    os.environ['HTTP_PROXY'] = ''
    os.environ['HTTPS_PROXY'] = ''
    try:
        newspaper_config = newspaper.Config()
        newspaper_config.memoize_articles = False
        newspaper_config.http_success_only = True
        newspaper_config.fetch_images = False
        newspaper_config.number_threads = 2
        newspaper_config.browser_user_agent = self.user_agent
        newspaper_config.request_timeout = self.fetch_timeout
        newspaper_config.use_cached_categories = False
        # add this to config data
        self.newspaper_config = newspaper_config
        # set OS environment variables for proxy server:
        if len(self.proxy_url_http) > 3 and len(self.proxy_url_https) > 3:
            os.environ['HTTP_PROXY'] = self.proxy_url_http
            os.environ['HTTPS_PROXY'] = self.proxy_url_https
            self.proxies = {
                "http": self.proxy_url_http,
                "https": self.proxy_url_https
            }
        else:
            os.environ['HTTP_PROXY'] = ''
            os.environ['HTTPS_PROXY'] = ''
            self.proxy_url_http = None
            self.proxy_url_https = None
            self.proxies = {}

        # note: nltk.set_proxy() normally takes a single proxy URL string rather than a dict
        nltk.set_proxy(self.proxies)
        self.newspaper_config.proxies = self.proxies
        # print("INFO: For NLTK, using Proxy configuration: ", nltk.getproxies())
    except Exception as e:
        print("ERROR: Unable to set proxy parameters: %s" % e)
def drop_stopwords(rdd, stats):
    """Remove commonly occurring 'stop' words from the flattened RDD and return the new RDD."""
    stats.append(("num words with stop words", rdd.count()))

    # drop empty and single letter words
    words = rdd.filter(lambda x: len(x) > 1)
    stats.append(("num words with short words removed", words.count()))

    logger.debug('Dropping stopwords with NLTK')
    # drop stop words (downloads the word list if needed)
    # If you are behind a proxy, set NLTK_PROXY in your environment
    import nltk, os
    if 'NLTK_PROXY' in os.environ:
        logger.debug('Using proxy %s' % os.environ['NLTK_PROXY'])
        nltk.set_proxy(os.environ['NLTK_PROXY'])
    nltk.download("stopwords")
    from nltk.corpus import stopwords
    stopwords = stopwords.words('english')
    words = words.filter(lambda x: x not in stopwords)
    stats.append(("num words without stop words", words.count()))
    return words
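# Minimal way to exercise drop_stopwords() (an illustrative sketch: it assumes
# PySpark is installed and that the surrounding module defines `logger`).
from pyspark import SparkContext

sc = SparkContext("local[*]", "stopword-demo")
stats = []
words = sc.parallelize(["the", "quick", "brown", "fox", "a", "is", "clever"])
kept = drop_stopwords(words, stats)
print(kept.collect())   # e.g. ['quick', 'brown', 'fox', 'clever']
print(stats)            # counts recorded before and after each filter
sc.stop()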
import nltk

nltk.set_proxy('http://localhost:5000')
nltk.download()
users = db.Users
categories = db.Categories
comments = db.Comments

userNameInsta = "InstagramUsername"
passwordInsta = "InstagramPassword"
until_date = '2017-03-31'
count = 100

API = InstagramAPI(userNameInsta, passwordInsta)
API.login()
# API.getUsernameInfo()

defaultHost = '36.67.119.129:65301'
nltk.set_proxy(defaultHost)

file_handler = logging.FileHandler('app.log')
app.logger.addHandler(file_handler)
app.logger.setLevel(logging.INFO)


@app.errorhandler(401)
def unauthorized(error=None):
    message = {
        'status': 401,
        'message': 'Not authorized : ' + request.url,
    }
    resp = jsonify(message)
    resp.status_code = 401
import nltk

nltk.set_proxy('http://127.0.0.1:7890')
nltk.download('stopwords')
nltk.download('punkt')
import configparser
import couchdb
import os
import tweepy
from googletrans import Translator
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Read configuration file
config = configparser.ConfigParser()
config.read('config.ini')

hostname = os.getenv('HOST', None)
address = os.getenv('IP', None)

proxy = config.get('server', 'proxy')
nltk.set_proxy(proxy)
nltk.download('vader_lexicon')

server_id = config.get('id', hostname)
task = 'task' + server_id
consumer_key = config.get(task, 'consumer_key')
consumer_secret = config.get(task, 'consumer_secret')
access_token_key = config.get(task, 'access_token_key')
access_token_secret = config.get(task, 'access_token_secret')

username = config.get('couchdb', 'username')
password = config.get('couchdb', 'password')
database_raw = config.get('couchdb', 'database_raw')
database_processed = config.get('couchdb', 'database_processed')

# Connect to CouchDB
from sklearn.datasets import fetch_20newsgroups
import ssl
from scipy.special import digamma, polygamma
from decimal import Decimal

ssl._create_default_https_context = ssl._create_unverified_context
# from .lda_real import *
import heapq
import toolz as tz
import nltk

nltk.set_proxy(None)  # with None, NLTK falls back to the system proxy settings
nltk.download('stopwords')
nltk.download('wordnet')

import pandas as pd
import gensim
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import numpy as np


def E_step_Realdata(alpha, BETA, doc, Phi0, gamma0, max_iter=100, tol=1e-3):
    """
    Latent Dirichlet Allocation: E-step.
    Applied to a single, specific document.
    ------------------------------------
    Input:
    alpha as a k*1 vector;
    BETA as a k*V matrix;
    doc as a Nd*V matrix;
    Phi0 as a Nd*k matrix;
import os
import os.path as op
import sys  # needed for the sys.path.insert() call below

from gensim import models, corpora
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
from dateutil.relativedelta import relativedelta
import datetime
# import matplotlib.pyplot as plt

# set proxy to be able to download nltk files
nltk.set_proxy('http://proxy.admin2.oxa.tld:3128')

basename = op.split(op.dirname(op.realpath(__file__)))[0]
path_utils = op.join(basename, "utils")
sys.path.insert(0, path_utils)
from sys_utils import load_library, strInput2bool
# from mongoDB_utils import connect_to_database

# add path to utils module to python path
basename = op.split(op.dirname(op.realpath(__file__)))[0]
load_library(op.join(basename, 'preprocess'))
from defines import ColumnNames as cn
from defines import Patterns
load_library(op.join(basename, 'readWrite'))
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

plt.style.use('ggplot')

import nltk
nltk.set_proxy('http://dfwproxy.ent.covance.com:80/')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

data_path = "CADEC.v2/cadec/train"


def load_data_split(data_path):
    data_train = []
    data_val = []
    data_test = []
    sen_length_arr = []
    sen_max_len = 0
    max_sen = ''
    for i, file in enumerate(sorted(os.listdir(data_path))):
        data = open(os.path.join(data_path, file))
        sent_list_parse = data.read().rstrip('\n').split('\n\n')
def fxn():
    warnings.warn("deprecated", DeprecationWarning)


with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=DeprecationWarning)
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    fxn()

with redirect_stdout(open(os.devnull, "w")):
    nltk.set_proxy(os.environ['proxy'])
    nltk.download('punkt')
    nltk.download('wordnet')
    nltk.download('stopwords')

warnings.filterwarnings('ignore', category=DeprecationWarning)

url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'


def load_data(database_filepath):
    """
    INPUT:
        database_filepath (string) : database location
    OUTPUT:
        X (np.array) : messages to process
import nltk
nltk.set_proxy('127.0.0.1:7890')

import turing_bot_implemented

new_bot = turing_bot_implemented.EnglishTuringBot('Artemis', 'programming', {
    'funny': ['lmao', 'lmaoo', 'hahaha', 'lol', 'LOL', 'XD', 'xD', 'hhhhhhh',
              'hahahaha', 'hhhh', 'hhh', 'LMAO', 'loll'],
    'positive': ['sounds great!', 'wow', 'wow!', 'that\'s p nice', 'perfect!', 'cool',
                 'niiiceee', 'that\'s so coool!!', 'fabulous!', 'actually that\'s rly nice',
                 'great!', 'pog', 'pog!', 'POG', 'pogchamp!', 'pro gamers move',
                 'really??', 'wtf that\'s so cool', 'poggers', 'nicee :D'],
    'negative': ['oop', 'sorry to hear that', 'that\'s horrible', 'that\'s terrible',
                 'welp', 'not poggers', 'what in the world', 'oop.', 'copium', 'sadge',
                 'not so cool', 'not pog', 'D:', ':\')', 'bruh moment'],
    'confused': ['excuse me???', 'what??', '?', 'seriously?', 'wtf?', '??', '???', '????'],
    'neutral': ['k', 'ok', 'I see', 'cool', 'that makes sense', 'makes sense', 'sure',
                'hmm', 'uh huh', 'yep', 'yup'],
    'greeting': ['hey', 'sup', 'hi', 'hello', 'mornin lol', 'gudday'],
})
new_bot.run()
def nltk_setup(init=False):
    if init:
        nltk.set_proxy(proxies['http'])
        for p in nltk_packages:
            nltk.download(p)
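# nltk_setup() reads two module-level names that are not shown above. The
# values below are illustrative assumptions (not from the original project),
# together with an example call.
proxies = {'http': 'http://proxy.example.com:3128'}
nltk_packages = ['punkt', 'stopwords', 'wordnet']

nltk_setup(init=True)  # set the proxy once, then fetch each listed package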
path="C:/Users/anandrathi/Documents/DataScieince/Coursera/NLP/natural-language-processing-master/week1" path="C:/temp/DataScience/TextParseNLP/natural-language-processing-master/week1/" os.chdir(path) import sys sys.path.append("..") from common.download_utils import download_week1_resources download_week1_resources() from grader import Grader grader = Grader() import nltk nltk.set_proxy('http://*****:*****@proxyserver.health.wa.gov.au:8181',) nltk.download('stopwords') from nltk.corpus import stopwords from ast import literal_eval import pandas as pd import numpy as np def read_data(filename): data = pd.read_csv(filename, sep="\t") data['tags'] = data['tags'].apply(literal_eval) return data train = read_data('data/train.tsv') validation = read_data('data/validation.tsv') test = pd.read_csv('data/test.tsv', sep='\t')
import nltk

nltk.set_proxy('http://192.168.36.22:3128', 'cs11m04', '123')
nltk.download()
# -*- coding: utf-8 -*-
import nltk
nltk.set_proxy('http://web-proxy.ind.hp.com:8080')
# nltk.download('stopwords')

import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
# nltk.download('wordnet')

stop_words = stopwords.words('english')


def removePuncts(sentences):
    return pd.Series(sentences).str.replace("[^a-zA-Z]", " ")


def makeLowerCase(punctsRemoved):
    lowerCase = [s.lower() for s in punctsRemoved]
    # Remove additional white spaces
    return [s.strip() for s in lowerCase]


def removeStopWords(sen):
    noStopWords = " ".join([i for i in sen if i not in stop_words])
    return noStopWords


def removeDuplicate(sentences):
    lines_seen = list()
import nltk

HTTP_PROXY = ''
HTTPS_PROXY = ''

if HTTP_PROXY != '':
    nltk.set_proxy(HTTP_PROXY)
if HTTPS_PROXY != '':
    # this overrides any proxy set above: nltk.set_proxy() keeps a single proxy URL
    nltk.set_proxy(HTTPS_PROXY)

nltk.download('punkt')
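# As written, both constants default to empty strings, so no proxy is ever
# set. A slightly more flexible variant (a sketch, not part of the original)
# picks the proxy up from the environment instead:
import os
import nltk

proxy = os.environ.get('HTTPS_PROXY') or os.environ.get('HTTP_PROXY')
if proxy:
    nltk.set_proxy(proxy)
nltk.download('punkt')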
import xlrd
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.set_proxy('https://genproxy:8080')
nltk.download('punkt')

import bs4 as bs
import urllib.request
nltk.download('stopwords')
import heapq
from openpyxl import load_workbook

'''Read the data from excel'''
FileName = 'Generic4.xlsx'
df = pd.read_excel(FileName, sheet_name='Calendar Business Coverage')
ReadList = df.columns.values.tolist()
# print(ReadList)
# for i in range(0,9):
# print(df['Last updated on: '][6:])
# print(df['Unnamed: 2'])
# print(df['08 Oct 2018 16:26'][5])
DataColumn = ReadList[0]
ParameterNameCol = df[ReadList[1]][6:].values.tolist()
# print(ParameterNameCol)
ReadListLength = len(ReadList)
# print(ReadListLength)
def translate_api(comment, language, proxy_url):
    nltk.set_proxy(proxy_url)
    translation = translator.translate(comment, dest=language)
    return translation.text
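# translate_api() relies on a module-level `translator` that is not shown.
# A hedged sketch of the assumed setup (a googletrans client; the values
# below are illustrative, not from the original source) and an example call:
from googletrans import Translator

translator = Translator()

text = translate_api("bonjour tout le monde", "en", "http://proxy.example.com:3128")
print(text)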
import nltk

print(nltk.__version__)

# nltk.set_proxy('http://192.168.0.134:8118', ('USERNAME', 'PASSWORD'))
nltk.set_proxy('http://192.168.0.134:8118')
nltk.download('cmudict')
#!/usr/bin/python3
# coding: utf-8
import nltk
##################################################################
## download()
nltk.set_proxy('http://127.0.0.1:1080')  # route the download through a local Shadowsocks proxy; it also works without one, just much more slowly
nltk.download()  # if this still fails, you can fetch the data packages directly from http://nltk.googlecode.com/svn/trunk/nltk_data/index.xml and simply copy them into your Download Directory
# about 3.3 GB of data in total; the nltk 2.0 and nltk 3.0 data sets differ
##################################################################
## downloading a single package
nltk.download('punkt')
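# If the packages are fetched by hand as the comment above suggests, NLTK can
# be pointed at the directory that holds them via nltk.data.path (the path
# below is illustrative):
import nltk

nltk.data.path.append('/home/user/nltk_data')
from nltk.corpus import stopwords  # now resolved from the extra search path
print(stopwords.words('english')[:5])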
import urllib.request
import string
import random
import sys
from unicodedata import category

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.set_proxy('http://127.0.0.1:2802')
nltk.download('punkt')
nltk.download('stopwords')

# def download_book(url):
#     """
#     Download the book from Gutenberg by using the url of the book
#     return: a string
#     """
#     response = urllib.request.urlopen(url)
#     data = response.read()  # a `bytes` object
#     text = data.decode('utf-8')
#     return text
#
# # import the url of the two analyzed books and other 4 books to python
# url_PP = 'http://www.gutenberg.org/files/1342/1342-0.txt'
# text1 = download_book(url_PP)
# url_Gatsby = 'https://dev.gutenberg.org/files/64317/64317-0.txt'
# text2 = download_book(url_Gatsby)
# url_Northanger = 'http://www.gutenberg.org/files/121/121-0.txt'
# text3 = download_book(url_Northanger)