Example #1
    def run(self):
        # Set the NLTK path (http://www.nltk.org/_modules/nltk/data.html)
        nltk_path.append(join(config.paths["rawdata"], "nltk"))

        try:
            # Check which classes are valid depending on min_docs_per_class
            nbprint('Loading classes')
            self.load_valid_classes()

            # Load the documents
            with data.document_writer(self.info) as document_writer:
                # Initialize info classes
                self.classinfo = ClassInfo()
                self.docinfo = DocumentInfo(document_writer)

                # Load documents and store class information in classinfo
                self.load_documents()

            # Print Meta Information
            self.docinfo.save_meta(self.info)
            self.classinfo.save_meta(self.info)

        except (LookupError, FileNotFoundError):
            raise ImporterError(
                self.info,
                'Directory "{}" does not contain the required corpus.'.format(
                    nltk_path))

        # Save the classes
        classes = self.classinfo.make_class_list()
        data.save_classes(classes, self.info)
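A brief, hedged sketch of the mechanism the importer above relies on: every directory appended to nltk.data.path is searched by NLTK, and nltk.data.find raises the same LookupError the importer catches when a corpus is missing. The directory name below is hypothetical.

from os.path import join
from nltk.data import path as nltk_path, find

nltk_path.append(join("/data/rawdata", "nltk"))  # hypothetical raw-data root

try:
    # Resolved against every entry in nltk.data.path, including the one above.
    find("corpora/stopwords")
except LookupError:
    print("Corpus not found in any nltk.data.path entry")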
Example #2
import os
import shutil

from nltk import download as nltk_download
from nltk.data import path as nltk_path


def setup_directories(processed_path, nltk_data_path):
    """Just in case, delete and recreate the directory for processed files.
    It's okay to keep the file data in memory if it is small.
    However, I would save/upload it somewhere if the data is large."""

    if os.path.exists(processed_path):
        shutil.rmtree(processed_path)

    os.makedirs(processed_path)

    if not os.path.exists(nltk_data_path):
        os.makedirs(nltk_data_path)
        nltk_download(['punkt', 'stopwords'], download_dir=nltk_data_path)

    nltk_path.append(nltk_data_path)
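A hedged usage sketch with hypothetical directory names: setup_directories wipes and recreates the output folder on every call, but only downloads 'punkt' and 'stopwords' the first time the NLTK data directory is created.

setup_directories("./processed", "./nltk_data")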
Example #3
import os
import logging, logging.handlers

from string import punctuation, digits, maketrans

from splunk.appserver.mrsparkle.lib.util import make_splunkhome_path
from splunk import setupSplunkLogger
from nltk import word_tokenize, pos_tag
from nltk.data import path as nltk_data_path
from nltk.corpus import wordnet, stopwords as stop_words
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.util import ngrams
from splunklib.searchcommands import dispatch, StreamingCommand, Configuration, Option, validators

BASE_DIR = make_splunkhome_path(["etc", "apps", "nlp-text-analytics"])
CORPORA_DIR = os.path.join(BASE_DIR, 'bin', 'nltk_data')
nltk_data_path.append(CORPORA_DIR)


@Configuration(local=True)
class CleanText(StreamingCommand):
    """ Counts the number of non-overlapping matches to a regular expression in a set of fields.

    ##Syntax

    .. code-block::
        cleantext textfield=<field> [default_clean=<bool>] [remove_urls=<bool>] [remove_stopwords=<bool>] 
            [base_word=<bool>] [base_type=<string>] [mv=<bool>] [force_nltk_tokenize=<bool>] 
            [pos_tagset=<string>] [custom_stopwords=<comma_separated_string_list>] [term_min_len=<int>] 
            [ngram_range=<int>-<int>] [ngram_mix=<bool>]

    ##Description
Example #4
# Import the libraries

import tensorflow as tf
import numpy as np
import nltk
import csv

from nltk.data import path
# append your path for nltk data
path.append("C:\\Users\\andri\\AppData\\Roaming\\nltk_data")

# Load the data

file_path = '.\\Data\\train.csv' # path for the data set
X, y2 = [], []

with open(file_path, 'rt') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    next(reader, None) # Skip header
    
    for row in reader:      
        y2.append(row[1])
        X.append(row[2])

y_real = []        
for i in y2:
    y_real.append(int(i))

# Making vector y one_hot
y = [] # one hot y
for i in range(len(y_real)):
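The listing cuts off inside the one-hot loop. A minimal sketch of how it might continue, assuming the labels in y_real are 0-based class indices:

num_classes = max(y_real) + 1  # assumes labels run from 0 to num_classes - 1
y = []
for i in range(len(y_real)):
    one_hot = [0] * num_classes
    one_hot[y_real[i]] = 1
    y.append(one_hot)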
Example #5
        # encapsulated
        return [field_block]

    def _word(self, s):
        return [fields[1] for fields in s]

    def _tag(self, s, _):
        return [(fields[1], fields[3]) for fields in s]

    def _parse(self, s):
        # dependencygraph wants it all back together...
        block = '\n'.join('\t'.join(line) for line in s)
        return DependencyGraph(block, top_relation_label='root')


path.append(abspath(dirname(__file__)))

ud_english = LazyCorpusLoader(
    'ud_english', UniversalDependencyCorpusReader, r'.*\.conll')

mystery = LazyCorpusLoader(
    'mystery', UniversalDependencyCorpusReader, r'.*\.conll')
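A hedged aside on what _parse above builds: DependencyGraph accepts a block of tab-separated token lines. The 4-column block below (word, tag, head index, relation) is a simplified stand-in for the corpus's 10-column CoNLL rows.

from nltk.parse import DependencyGraph

# Hypothetical 4-column block: word \t tag \t head index \t relation.
block = "The\tDT\t2\tdet\ndog\tNN\t3\tnsubj\nbarks\tVBZ\t0\troot"
dg = DependencyGraph(block, top_relation_label='root')
print(dg.tree())  # (barks (dog The))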


class Transducer(object):
    """Provides generator methods for converting between data types

    Args:
        word_list : an ordered list of words in the corpus. word_list[i]
            will be assigned the id = i + 1. root is assigned id 0 and
            any words not in the list are assigned id = len(word_list) + 2.
Example #6
import os
import sys
import operator
from sklearn.decomposition import TruncatedSVD
from sklearn import pipeline
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import joblib
import gensim
from gensim import corpora
import numpy as np
import re
from nltk.data import path
from pprint import pprint

path.append("/home/analytics/data_partition/nltk_data")
from nltk.corpus import stopwords

stops = set(stopwords.words('english'))
import warnings

warnings.filterwarnings('ignore')

home = os.path.abspath(os.path.dirname(__file__))
sys.path.append(home + '/../../')
from src.mysql_utils import MySqlUtils
from src.NLP.preprocessing import clean_tweet

Lda = gensim.models.ldamodel.LdaModel

threshold = 0.95
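A hedged sketch of how the Lda alias above is typically used with a gensim dictionary and bag-of-words corpus; the token lists are hypothetical stand-ins for cleaned tweets.

texts = [["economy", "markets", "rally"], ["match", "goal", "league"]]
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]
lda_model = Lda(bow_corpus, num_topics=2, id2word=dictionary, passes=10)
pprint(lda_model.print_topics())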
Example #7
import os, sys, re
import json

from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from nltk.data import path as nltk_path

import gensim
from gensim import utils, corpora, models
from gensim.corpora.wikicorpus import remove_markup

from preprocess_text import preprocess
import logging

nltk_path.append('./nltk_data/')
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

NUM_TOPICS = 20
db_dir = '/mnt/lascar/qqiscen/src/TextTopicNet/data/VOC2007/VOCdevkit/VOC2007/'
train_dict_path = 'train_dict_ImageCLEF_Wikipedia.json'

print '  ' + sys.argv[0]
print '  Learns LDA topic model with ' + str(
    NUM_TOPICS) + ' topics from corpora on ' + train_dict_path
print '  (...)'

img_dir = db_dir + 'JPEGImages/'
xml_dir = db_dir + 'Annotations/'

if not os.path.isdir(db_dir):
Example #8
import os
import numpy as np
import matplotlib.pyplot as plt
import random
try:
    import ConfigParser
except ImportError:
    import configparser as ConfigParser
import pandas as pd

from nltk.data import path as nltk_data_path
nltk_data_location = os.getenv('NLTK_DATA_PATH')
if nltk_data_location is not None:
    nltk_data_path.append(nltk_data_location)
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

import utils
import transferlearning as tl
from stratified_split import writefile


def run_experiment(transfer_exp_name):
    """ Run an experiment given in the experiments directory. Some
    configuration details (which seeds to use, which datasets to use,
    which transfer methods to use, which classifiers to use, number of
    sentences in the target training set, etc.) are given
    in the .cfg file within the directory transfer_exp_name.
Example #9
#Ramon Ruiz Dolz
#Salvador Marti Roman
import nltk
from nltk.corpus import *
from nltk.corpus import PlaintextCorpusReader
from nltk.probability import *
from nltk.tokenize import *
from nltk.stem import SnowballStemmer
import io
import os
import re
from nltk.data import path

dir_path = os.path.dirname(os.path.realpath(__file__))
corpus_root = dir_path.replace(".idea", "")
path.append(dir_path + "\\NLTK")
#Act1
originalContent = io.open("./library/quijote.txt", encoding="utf8").read()
#Act2
originalFreq = FreqDist(
    w for w in RegexpTokenizer(".").tokenize(originalContent))
print("Act2")
print(sorted(originalFreq.keys()))

#Act3
filterContent = re.sub('[.|,"¡!()\-:;¿?«»\'\]\[\\n]', '', originalContent)

#Act4
print("Act4")
filterFreq = FreqDist(filterContent)
print(sorted(filterFreq.keys()))
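For comparison, a hedged sketch of word-level frequencies over the filtered text, assuming the Spanish punkt model is available on the appended NLTK path:

words = word_tokenize(filterContent, language="spanish")
wordFreq = FreqDist(w.lower() for w in words)
print(wordFreq.most_common(10))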
Example #10
from nltk.corpus import wordnet
from nltk.data import path
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker

from pathlib import Path

#
# Startup
#

# BAD. I know, vendor lock-in. But right now we're
# doing it on replit, so it's fine.
path.append(str(Path("~/B5Chatbot1MalcolmMaxim/nltk_data").expanduser()))

# Class instances
lemmatizer = WordNetLemmatizer()
spellcheck = SpellChecker()

#
# Functions
#


def tokenize(user_input):
    """Peforms tokenization on user input as well as multiple
    other parsing steps including spellchecking, synonym generation
    and lemmatization.
    """
Example #11
import operator
import md5
import urllib2
import sys

from selenium import webdriver
from nltk import word_tokenize
from nltk.data import path as nltk_path
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import config


# NLTK resource initialization
nltk_path.append(config.NLTK_DATA_PATH)
nltk_to_download = []
try:
    stopwords.words('english')
except LookupError:
    nltk_to_download.append('stopwords')
try:
    word_tokenize('token test')
except LookupError:
    nltk_to_download.append('punkt')
if nltk_to_download:
    print 'Performing first-time setup'
    from nltk import download as nltk_download
    for package in nltk_to_download:
        print '\tDownloading:', package
        nltk_download(package)
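An alternative, hedged sketch of the same first-run check that probes the resources with nltk.data.find instead of exercising the functions; 'corpora/stopwords' and 'tokenizers/punkt' are the standard NLTK resource locations.

from nltk.data import find as nltk_find

nltk_to_download = []
for package, resource in [('stopwords', 'corpora/stopwords'),
                          ('punkt', 'tokenizers/punkt')]:
    try:
        nltk_find(resource)
    except LookupError:
        nltk_to_download.append(package)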
Example #12
# STOP WORDS :: used for removing words that carry little meaning, like 'of', 'the', etc.

from nltk.data import path
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
path.append("F:\\nltk_data")
example_sentence = "This is an example showing off stop word filtration"
stop_words = set(stopwords.words("english"))

words = word_tokenize(example_sentence)

#filtered_sentence=[]
#for w in words:
#    if w not in stop_words:
#        filtered_sentence.append(w)
filtered_sentence = [w for w in words
                     if w not in stop_words]  # one-liner equivalent of the commented-out loop above

print(filtered_sentence)
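One caveat worth noting: the membership test is case-sensitive, so capitalized tokens such as 'This' survive even though 'this' is a stopword. A minimal sketch that lowercases each token before the check:

filtered_sentence = [w for w in words if w.lower() not in stop_words]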
Example #13
import re
import os
import multiprocessing
import time
import sqlite3

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.data import path as nltk_path

from database_manager import DatabaseManager

from search_config import DIR_FILES, DIR_DUMP, DIR_DATABASE

stemmer = SnowballStemmer('english')
nltk_path.append("../resources/nltk_data")


def get_files_to_clean(dir):
    return filter(lambda x: x[0] != '.', sorted(os.listdir(dir)))

def write_to_file(content, new_file_name):
    # Use a context manager so the file is always closed, even on error.
    with open(new_file_name, 'w') as new_file:
        new_file.write(content)

def chunks(l, n):
    newn = int(len(l) / n)
    for i in xrange(0, n-1):
        yield l[i*newn:i*newn+newn]
    yield l[n*newn-newn:]
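A short usage sketch of chunks: it yields n pieces, with any remainder folded into the last one (Python 2, matching the snippet).

print list(chunks(range(10), 3))
# [[0, 1, 2], [3, 4, 5], [6, 7, 8, 9]]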
Example #14
from .multi_process import create_process_pool, multiprocess_search_mdx, pre_pool_search

prpool = None
thpool = None

if check_system() == 0:
    prpool = create_process_pool()
    pre_pool_search(prpool)
# else:
#     thpool = create_thread_pool()

try:
    from nltk.data import path as nltk_path
    from nltk.stem import WordNetLemmatizer
    nltk_path.append(os.path.join(ROOT_DIR, 'media', 'nltk_data'))
    lemmatizer = WordNetLemmatizer()
    lemmatizer.lemmatize('a')
    # The first lemmatize() call on a WordNetLemmatizer() is slow: it has to load the local corpus into memory (over a second), so it is preloaded here.
except Exception as e:
    lemmatizer = None
    print(e)

spell = SpellChecker(distance=1)
# The default distance is 2, which is slow (about 1.6 s); with distance=1 it takes roughly 0.001 s.

builtin_dic_name = '内置词典'


def search(query_list, group):
    record_list = []
Example #15
from base.sys_utils import check_system
from mdict.models import MyMdictEntry, MyMdictItem
from mdict.serializers import mdxentry
from mysite.settings import BASE_DIR
from .init_utils import init_mdict_list
from .loop_search import loop_search_sug
from .mdict_config import get_config_con

if check_system() == 0:
    from .multiprocess_search import pool, multiprocess_search_mdx, multiprocess_search_sug, check_pool_recreate, \
        loop_create_model
else:
    from .multithread_search import thpool, multithread_search_mdx, multithread_search_sug, \
        check_threadpool_recreate, loop_create_thread_model

nltk_path.append(BASE_DIR + os.sep + 'media' + os.sep + 'nltk_data')
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('a')
# The first lemmatize() call on a WordNetLemmatizer() is slow: it has to load the local corpus into memory (over a second), so it is preloaded here.

spell = SpellChecker(distance=1)
# The default distance is 2, which is slow (about 1.6 s); with distance=1 it takes roughly 0.001 s.

builtin_dic_name = '内置词典'


def search(query, is_en, group):
    record_list = []
    query = query.strip()
    t2 = time.perf_counter()
    try: