def training_data(paths=None, file_count=0):

    """
        Use the general pattern of a tag <ENAMEX\sTYPE=".*?">.*?</ENAMEX>
        in order to extract the bits of text containing the relevant information and
        group them into a list
        Chunk the elements of the list leaving only a tuple reprezented by the type of the entity
        and its name

        :param paths          the paths towards the file containing the training data
        :param file_count     the number of files to read
        :return               a list of lists where each element is a list formed from the type of the entity and its ful name
    """

    # extract training data from WSJ
    # pattern : the general pattern of a tag
    # snd_pattern : the approximate pattern of the desired information from the tag
    pattern = re.compile(r'<.*?TYPE=".*?">.*?</.*?>', re.ASCII)
    snd_pattern = re.compile(r'[>"].*?[<"]', re.ASCII)
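
    # Illustration (not from the original code): for a tag such as
    #   <ENAMEX TYPE="PERSON">John Smith</ENAMEX>
    # pattern matches the whole tag, while snd_pattern finds the two
    # sub-strings '"PERSON"' and '>John Smith<'; their leading and trailing
    # delimiter characters are stripped off further below.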

    # the strings representing the tags extracted from the files

    text = PlaintextCorpusReader(paths[0], r'.*\.txt')

    data = []
    for fid in text.fileids():
        data = data + pattern.findall(text.raw(fileids=fid))  # re.ASCII is already set at compile time

    # from every tag in the list, find the two sub-strings that match snd_pattern
    # (the entity type and its name), strip their delimiter characters,
    # lower-case the name and use a set to eliminate redundancy
    raw_entities = set()
    for tag in data:
        matches = [m[1:-1] for m in snd_pattern.findall(tag)]
        raw_entities.add((matches[0], matches[1].lower()))
    raw_entities = list(raw_entities)

    # extract data from names folders
    del data
    data = PlaintextCorpusReader(paths[1], '.*')

    name_data = data.words('names.male') + data.words('names.female') + data.words('names.family')

    # extract the most common 350 organization tokens

    organization_words = list(map(lambda o: word_tokenize(o[1]), list(filter(lambda x: x[0] == 'ORGANIZATION', raw_entities))))

    organization_specific_tokens = []
    for wl in organization_words:
        organization_specific_tokens += wl

    organization_specific_tokens = list(map(lambda f: f[0], FreqDist(organization_specific_tokens).most_common(350)))

    location_words = list(map(lambda o: word_tokenize(o[1]), list(filter(lambda x: x[0] == 'LOCATION', raw_entities))))
    location_specific_tokens = []
    for wl in location_words:
        location_specific_tokens += wl

    location_specific_tokens = list(map(lambda f: f[0], FreqDist(location_specific_tokens).most_common(350)))

    # put the names in a dictionary for quicker access
    name_dict = {}
    for n in list(set(name_data + names.words())):
        if n.lower()[0] in name_dict:
            name_dict[n.lower()[0]] += [n.lower()]
        else:
            name_dict[n.lower()[0]] = [n.lower()]

    # put the location data in a dictionary for quicker access
    loc_dict = {}
    for l in location_specific_tokens[1:]:
        if l[0] in loc_dict:
            loc_dict[l[0]] += [l]
        else:
            loc_dict[l[0]] = [l]

    # put the organization data in a dictionary for quicker access
    org_dict = {}
    for o in organization_specific_tokens:
        if o[0] in org_dict:
            org_dict[o[0]] += [o]
        else:
            org_dict[o[0]] = [o]
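
    # Illustration (hypothetical values): after this step the lookup tables are
    # bucketed by first character, e.g. org_dict might look like
    #   {'c': ['corp.', 'co.'], 'i': ['inc.', 'international'], ...}
    # so a token only has to be compared against the bucket that matches its
    # first character.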

    entity_dict1 = {
        'PERSON': list(map(lambda p: p[1], list(filter(lambda e: e[0] == 'PERSON', raw_entities)))),
        'LOCATION': list(map(lambda l: l[1], list(filter(lambda e: e[0] == 'LOCATION', raw_entities)))),
        'ORGANIZATION': list(
            map(lambda o: o[1], list(filter(lambda e: e[0] == 'ORGANIZATION', raw_entities))))
    }

    entity_dict2 = {}
    for l in ['PERSON', 'ORGANIZATION', 'LOCATION']:
        entity_dict2[l] = {}
        for e in entity_dict1[l]:
            if e[0] in entity_dict2[l]:
                entity_dict2[l][e[0]] += [e]
            else:
                entity_dict2[l][e[0]] = [e]

    return entity_dict2, org_dict, name_dict, loc_dict
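
# Usage sketch (the paths below are hypothetical placeholders, not taken from
# the original code):
# entity_dict, org_dict, name_dict, loc_dict = training_data(
#     paths=['corpora/wsj_enamex', 'corpora/names'])
# name_dict['j'] then holds every known name starting with 'j'.
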
Example #2
import nltk
import pyodbc
from random import randint
from nltk.corpus.reader import PlaintextCorpusReader

cnxn = pyodbc.connect(r'DRIVER={SQL Server};SERVER=R0224576\RYANSQLSERVER;DATABASE=FAQ;UID=m097654;Trusted_Connection=yes')
cursor = cnxn.cursor()

data = cursor.execute('select msg from FACT').fetchall()
tokens = nltk.word_tokenize(str(data))
text = nltk.Text(tokens)
nwords = [w.lower() for w in text if w.isalpha()]
text = nltk.Text(nwords)

corpus_root = r'C:\Python_workspace\FAQ Scripts\corpus'

newcorpus = PlaintextCorpusReader(corpus_root,'.*')
postxt = newcorpus.words('positive-words.txt')
negtxt = newcorpus.words('negative-words.txt')

neglist = []
poslist = []

for i in range(len(negtxt)):
    neglist.append('negative')

for i in range(len(postxt)):
    poslist.append('positive')

# zip() returns an iterator in Python 3, so materialise the pairs before
# concatenating the two lists
postagged = list(zip(postxt, poslist))
negtagged = list(zip(negtxt, neglist))

tagged = postagged + negtagged
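
# A minimal sketch (assumed continuation, not in the original snippet): the
# (word, label) pairs can feed an NLTK Naive Bayes classifier by wrapping each
# word in a one-entry feature dict.
train_set = [({'word': word}, label) for word, label in tagged]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(classifier.classify({'word': 'great'}))  # should lean towards 'positive'
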
import os

input_directory = directory + "Input/_Product_Management/"
output_directory = directory + "1_POS/"
if not os.path.exists(output_directory): os.mkdir(output_directory)

# reading stuff
file_list = os.listdir(input_directory)
print(file_list)

# just for testing create a corpus reader
from nltk.corpus.reader import PlaintextCorpusReader
reader = PlaintextCorpusReader(input_directory, r'.*\.txt')

reader.fileids()
reader.raw()
reader.sents()
reader.words()

## default POS tagger from NLTK ##
import nltk
# import pprint
# sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
pos = "nltk"
path = output_directory + pos
if not os.path.exists(path): os.mkdir(path)
for i in range(len(file_list)):
#    posting = []
    output = path + "/" + str(file_list[i])
    jfile=open (output,"w")
    reader = PlaintextCorpusReader(input_directory,str(file_list[i]))
    text = str(reader.raw())
    sents = nltk.sent_tokenize(text)
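    # Assumed continuation (the original loop is truncated here): POS-tag each
    # sentence with NLTK's default tagger and write the result to the output file.
    for sent in sents:
        tagged_sent = nltk.pos_tag(nltk.word_tokenize(sent))
        jfile.write(str(tagged_sent) + "\n")
    jfile.close()
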
## Corpus example ############################
import nltk
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize

sample = gutenberg.raw("bible-kjv.txt")
sent = sent_tokenize(sample)

for x in range(5):
    print("Sentence - %s\n" % (sent[x]))
    print("Words - %s\n" % (nltk.word_tokenize(sent[x])))

## Reading corpora from a text files ##########
## No POS tags, chunks or categories ##########
reader = PlaintextCorpusReader("/Users/atul/nltk_data/corpora/gutenberg",
                               r'^.*\.txt')
files = reader.fileids()
print("File IDs:", files)
print("Number of files:", len(files))
print(reader.words(files[0]))
print(reader.sents(files[0]))

## Reading tagged corpora #####################
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer

reader = TaggedCorpusReader('/Users/atul/nltk_data',
                            r'brown.pos',
                            tagset='en-brown')
reader1 = TaggedCorpusReader('/Users/atul/nltk_data',
                             r'brown.pos',
                             word_tokenizer=SpaceTokenizer())

print(reader.words())
print(reader.sents())
print(reader.tagged_words())
print(reader.tagged_sents())
print(reader1.words())  # assumed completion: reader1 splits tokens with SpaceTokenizer
Example #5
            index = index + 1
        yield x, y

if __name__ == "__main__":
    directory = 'F:/Minhaz/GitHubRepo/News_Gen/Minhaz_Shahadat/Code/Bengali_Word2Vec_LSTM/'
    corpus_dir = directory + 'corpus/'
    examples = directory + 'examples.txt'
    vocabulary = directory + 'vocab.txt'
    
    w_t = RegexpTokenizer("[\u0980-\u09FF']+")
    corpus = PlaintextCorpusReader(corpus_dir, r'.*\.txt', word_tokenizer=w_t)
    
    text_in_words = []
    files = corpus.fileids()
    for f in files:    
        words_in_doc = corpus.words(f)
        text_in_words.append(words_in_doc)
    text_in_words = [[re.sub(r'\d+', '<number>', word) for word in document] for document in text_in_words]
    
    words = []
    for doc in text_in_words:
        for word in doc:
            words.append(word)
    words = sorted(set(words))
    print_vocabulary(vocabulary, words)
    
    if not os.path.isdir(directory + 'checkpoints/'):
        os.makedirs(directory + 'checkpoints/')
    
    # Try different window sizes
    vector_model = word2vec.Word2Vec(text_in_words, size = 500, min_count = 1, window = 7)
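
    # Usage sketch (assumed follow-up, not in the original snippet): inspect the
    # trained vectors and persist the model for later use.
    print(vector_model.wv.most_similar(words[0]))  # nearest neighbours of one vocabulary word
    vector_model.save(directory + 'checkpoints/bengali_word2vec.model')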