Example #1
# Assumed imports (not shown in the original snippet): `Corpus` is taken here to be
# textacy's Corpus class; `Language` is spaCy's pipeline type.
from nltk.corpus.reader import PlaintextCorpusReader
from spacy.language import Language
from textacy import Corpus


def create_spacy_corpus(text_corpus: PlaintextCorpusReader,
                        lang: Language) -> Corpus:
    # One (text, metadata) record per file in the NLTK corpus.
    data = ((text_corpus.raw(fid), {'fileid': fid})
            for fid in text_corpus.fileids())
    return Corpus(lang, data)
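A minimal usage sketch (not part of the original snippet; the model name and corpus directory below are placeholder assumptions):

# Hypothetical usage of create_spacy_corpus with an English spaCy pipeline.
import spacy

nlp = spacy.load("en_core_web_sm")
speeches = PlaintextCorpusReader("my_corpus_dir", r".*\.txt")
corpus = create_spacy_corpus(speeches, nlp)
print(len(corpus))  # number of documents in the resulting corpus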
Example #2
DEMO_DATA_ROOT = "../../../RepositoryData/data"

## Loading Corpus Raw Texts

import nltk
from nltk.corpus.reader import PlaintextCorpusReader
import numpy as np
import jieba, re

jieba.set_dictionary(DEMO_DATA_ROOT + "/jiaba/dict.txt.big.txt")

corpus_dir = DEMO_DATA_ROOT+"/TaiwanPresidentialInaugarationSpeech_en"

twp = PlaintextCorpusReader(corpus_dir, r".*\.txt")

len(twp.raw())


## Word Segmentation

- Try two methods: `ckiptagger` vs. `jieba` (a quick comparison sketch follows the segmenter setup below)

from ckiptagger import WS

````{margin}
```{note}
Please remember to download the CKIP model files and change the path accordingly.
```
````

ws = WS("/Users/Alvin/Dropbox/Corpus/CKIP_WordSeg/data")
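A quick comparison sketch (not in the original notebook), assuming the corpus texts are Chinese and reusing the `twp` reader and `ws` segmenter defined above:

# Hypothetical comparison of the two segmenters on the first document.
sample = twp.raw(twp.fileids()[0])
jieba_tokens = list(jieba.cut(sample))  # jieba: returns a generator of tokens
ckip_tokens = ws([sample])[0]           # ckiptagger: list of texts in, list of token lists out
print(jieba_tokens[:20])
print(ckip_tokens[:20])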
Example #3

import re

from nltk import FreqDist, word_tokenize
from nltk.corpus import names
from nltk.corpus.reader import PlaintextCorpusReader


def training_data(paths=None, file_count=0):

    """
        Use the general pattern of a tag <ENAMEX\sTYPE=".*?">.*?</ENAMEX>
        in order to extract the bits of text containing the relevant information and
        group them into a list
        Chunk the elements of the list leaving only a tuple reprezented by the type of the entity
        and its name

        :param paths          the paths towards the file containing the training data
        :param file_count     the number of files to read
        :return               a list of lists where each element is a list formed from the type of the entity and its ful name
    """

    # extract training data from WSJ
    # pattern : the general pattern of a tag
    # snd_pattern : the approximate pattern of the desired information from the tag
    pattern = re.compile(r'<.*?TYPE=".*?">.*?</.*?>', re.ASCII)
    snd_pattern = re.compile(r'[>"].*?[<"]', re.ASCII)

    # the strings representing the tags extracted from the files

    text = PlaintextCorpusReader(paths[0], r'.*\.txt')

    data = []
    for fid in text.fileids():
        data = data + pattern.findall(text.raw(fileids=fid))

    # from every extracted tag, find the two sub-strings matching snd_pattern
    # (the TYPE value and the tagged text), strip the surrounding quote/bracket
    # characters, lowercase the entity name, and use a set to eliminate redundancy
    raw_entities = []
    for tag in data:
        matches = [m[1:-1] for m in re.findall(snd_pattern, tag)]
        raw_entities.append((matches[0], matches[1].lower()))
    raw_entities = list(set(raw_entities))

    # extract data from names folders
    del data
    data = PlaintextCorpusReader(paths[1], '.*')

    name_data = data.words('names.male') + data.words('names.female') + data.words('names.family')

    # extract the most common 350 organization tokens

    organization_words = list(map(lambda o: word_tokenize(o[1]), list(filter(lambda x: x[0] == 'ORGANIZATION', raw_entities))))

    organization_specific_tokens = []
    for wl in organization_words:
        organization_specific_tokens += wl

    organization_specific_tokens = list(map(lambda f: f[0], FreqDist(organization_specific_tokens).most_common(350)))

    location_words = list(map(lambda o: word_tokenize(o[1]), list(filter(lambda x: x[0] == 'LOCATION', raw_entities))))
    location_specific_tokens = []
    for wl in location_words:
        location_specific_tokens += wl

    location_specific_tokens = list(map(lambda f: f[0], FreqDist(location_specific_tokens).most_common(350)))

    # put the names in a dictionary for quicker access
    name_dict = {}
    for n in list(set(name_data + names.words())):
        if n.lower()[0] in name_dict:
            name_dict[n.lower()[0]] += [n.lower()]
        else:
            name_dict[n.lower()[0]] = [n.lower()]

    # put the location data in a dictionary for quicker access
    loc_dict = {}
    for l in location_specific_tokens[1:]:
        if l[0] in loc_dict:
            loc_dict[l[0]] += [l]
        else:
            loc_dict[l[0]] = [l]

    # put the organization data in a dictionary for quicker access
    org_dict = {}
    for o in organization_specific_tokens:
        if o[0] in org_dict:
            org_dict[o[0]] += [o]
        else:
            org_dict[o[0]] = [o]

    entity_dict1 = {
        'PERSON': list(map(lambda p: p[1], list(filter(lambda e: e[0] == 'PERSON', raw_entities)))),
        'LOCATION': list(map(lambda l: l[1], list(filter(lambda e: e[0] == 'LOCATION', raw_entities)))),
        'ORGANIZATION': list(
            map(lambda o: o[1], list(filter(lambda e: e[0] == 'ORGANIZATION', raw_entities))))
    }

    entity_dict2 = {}
    for l in ['PERSON', 'ORGANIZATION', 'LOCATION']:
        entity_dict2[l] = {}
        for e in entity_dict1[l]:
            if e[0] in entity_dict2[l]:
                entity_dict2[l][e[0]] += [e]
            else:
                entity_dict2[l][e[0]] = [e]

    return entity_dict2, org_dict, name_dict, loc_dict
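A hedged usage sketch (the paths below are hypothetical placeholders, not from the original):

# Hypothetical call: the first path holds ENAMEX-tagged WSJ .txt files,
# the second holds the names.male / names.female / names.family word lists.
entity_dict, org_dict, name_dict, loc_dict = training_data(
    paths=["wsj_tagged/", "name_lists/"])
print(len(entity_dict['PERSON']), len(org_dict), len(loc_dict))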
Example #4
            file.close()

import re

from nltk.corpus.reader import (CategorizedPlaintextCorpusReader,
                                PlaintextCorpusReader)

reader = CategorizedPlaintextCorpusReader(
    "C:/Users/olgur/natural_language_toolkit_data/twitter_corpus",
    r'tweets_.*\.txt',
    cat_pattern=r'tweets_(\w+)\.txt')

# setting up stopwords
stopword_reader = PlaintextCorpusReader(
    "C:/Users/olgur/natural_language_toolkit_data/twitter_corpus/twitterstopwords/",
    r'.*\.txt',
    encoding='latin-1')
stop_words = set(['“', '”', '’', ",", "#", "—", "__", "_", "___"])

for file in stopword_reader.fileids():
    stops = stopword_reader.raw(file).replace("\n", ",").split(",")
    for word in stops:
        stop_words.add(word)
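A minimal sketch (not in the original snippet) of how such a stop word set is typically applied to one category of the categorized reader:

# Hypothetical illustration: drop stop words from the tokens of the first tweet category.
first_category = reader.categories()[0]
category_words = [w.lower() for w in reader.words(categories=first_category)]
content_words = [w for w in category_words if w not in stop_words]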

# text wrangling functions:


# adapted from https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(string):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
Example #5
import numpy as np
import nltk
import pandas as pd
from nltk.corpus.reader import PlaintextCorpusReader
from sklearn.feature_extraction.text import CountVectorizer

mycorpus = PlaintextCorpusReader(r"CSI58100TextFiles", r".*\.txt")
vec = CountVectorizer()
indx = 0
lst = []
for i in mycorpus.fileids():
    nlst = mycorpus.raw(i)
    indx = indx + 1
    lst.append(nlst)
corpus = np.array(lst)

#-----------Stop Words---------
vec = CountVectorizer(stop_words="english")
vec.fit(corpus)
#Sparse matrix
X = vec.transform(corpus)
bM = pd.DataFrame(X.toarray(),
                  columns=vec.get_feature_names(),
                  index=mycorpus.fileids()).T
print(type(corpus))
print(corpus)
print(bM)
# bM.to_csv('booleanMatrix.csv')
# # Jaccards similarity
# from sklearn.metrics import jaccard_score
# similarity = []
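The commented-out lines above hint at a Jaccard similarity step; a minimal sketch (an assumption, not the original code) of one way to compute it from the boolean matrix bM:

# Hypothetical sketch: Jaccard similarity between the first two documents,
# using presence/absence vectors taken from the columns of bM.
from sklearn.metrics import jaccard_score

doc_a = (bM.iloc[:, 0] > 0).astype(int)
doc_b = (bM.iloc[:, 1] > 0).astype(int)
print(jaccard_score(doc_a, doc_b))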
Example #6

import os
directory = "D:/Eigene Dateien_rklein/z_Forschung/_Konferenzen/_79_ICFCA - Dresden - Concept Analysis/Data/"
input_directory = directory + "Input/_Product_Management/"
output_directory = directory + "1_POS/"
if not os.path.exists(output_directory): os.mkdir(output_directory)

# reading stuff
file_list = os.listdir(input_directory)
print(file_list)

# just for testing create a corpus reader
from nltk.corpus.reader import PlaintextCorpusReader
reader = PlaintextCorpusReader(input_directory, r'.*\.txt')

reader.fileids()
reader.raw()
reader.sents()
reader.words()

## default POS tagger from NLTK ##
import nltk
# import pprint
# sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
pos = "nltk"
path = output_directory + pos
if not os.path.exists(path): os.mkdir(path)
for i in range(len(file_list)):
#    posting = []
    output = path + "/" + str(file_list[i])
    jfile = open(output, "w")
    reader = PlaintextCorpusReader(input_directory,str(file_list[i]))
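The snippet is cut off inside the loop; a hedged sketch (an assumption, not the original continuation) of the kind of step the "default POS tagger" heading sets up:

    # Hypothetical continuation: tag the file's words with NLTK's default
    # POS tagger and write one "word/TAG" pair per line.
    tagged = nltk.pos_tag(reader.words())
    for word, tag in tagged:
        jfile.write(word + "/" + tag + "\n")
    jfile.close()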
Example #7
from nltk.corpus import stopwords
from nltk.corpus.reader import WordListCorpusReader
from nltk.tokenize import word_tokenize

english_stops = set(stopwords.words('english'))
english_stops_nopunct = {
    stopword.translate(table)
    for stopword in english_stops
}

# Load the insect wordlist of stems
insect_words = WordListCorpusReader('.', ['wordlists/insect-wordstems.txt'])

# A list to hold the frequency data
freq_data = []

count = 1
# Read each file in turn
for file in files:
    text = reader.raw(file)

    print(f'{count}: TOKENISING {file}')

    # Tokenise and normalise to lowercase
    tokens = word_tokenize(text.lower())

    # Remove all punctuation marks
    tokens_nopunct = [token.translate(table) for token in tokens]

    # Remove all tokens that are only numbers (or punctuation marks if there were any left)
    words = [word for word in tokens_nopunct if word.isalpha()]

    # Remove stopwords from the tokens
    words_nostops = [
        word for word in words if word not in english_stops_nopunct