Example #1
def create_bnc_data():
    if not os.path.exists('domain/'):
        os.mkdir('domain/')
    print(
        "Please go to https://ota.bodleian.ox.ac.uk/repository/xmlui/handle/20.500.12024/2554"
    )
    print("download, and save the BNC corpus file as {}".format(
        os.getcwd() + "/domain/bnc.zip"))
    #wget.download(bnc_url, 'domain/bnc.zip')
    input("Press ENTER when this is done to continue...")
    zip_file = zipfile.ZipFile('domain/bnc.zip', 'r')
    zip_file.extractall('domain/')
    bnc_reader = BNCCorpusReader(root="domain/download/Texts",
                                 fileids=r'[A-K]/\w*/\w*\.xml')
    sents = bnc_reader.sents()
    fout = open('domain/bnc.raw', 'w+')
    for sent in sents:
        fout.write(' '.join(sent) + '\n')
    fout.close()
    path = 'domain/'
    file_name = 'bnc.raw'
    num_lines = sum(1 for line in open(path + file_name))
    filter_word_level(path, file_name, 'domain_dev/', 'bnc.dev', 0,
                      num_lines // 2)
    filter_word_level(path, file_name, 'domain_test/', 'bnc.test',
                      num_lines // 2 + 1, num_lines)
Example #2
        def it():
            reader = BNCCorpusReader(fileids=path, root=self.root)
            words_tags = reader.tagged_words(stem=False)
            stems = (s for s, _ in reader.tagged_words(stem=True))

            for (word, tag), stem in zip(words_tags, stems):
                yield Token(word, stem, tag)
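Token is not defined in this snippet; a minimal stand-in, assuming it is just a (word, stem, tag) record, could be a namedtuple:

# Hypothetical stand-in for the Token type used above (not from the original project).
from collections import namedtuple

Token = namedtuple('Token', ['word', 'stem', 'tag'])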
Example #3
def bnc_sentence_dump(root_path):
    """This process randomly dumps sentences in xmls into txt files under train,
    dev, test split (roughly 7:1:2)
    """
    all_xmls = glob(os.path.join(root_path, r'*/*/*.xml'))
    random.shuffle(all_xmls)
    train_dir = '../../../data/BNC/train/'
    test_dir = '../../../data/BNC/test/'
    dev_dir = '../../../data/BNC/dev/'

    for directory in [train_dir, test_dir, dev_dir]:
        if os.path.isdir(directory):
            shutil.rmtree(directory)
        print(f'Creating directory {directory}')
        os.mkdir(directory)

    for i, full_path in tqdm(enumerate(all_xmls)):
        root, fileid = os.path.split(full_path)
        bnc_reader = BNCCorpusReader(root=root, fileids=fileid)
        filename, ext = os.path.splitext(fileid)

        if i % 10 == 9 or i % 10 == 3:
            save_dir = test_dir
        elif i % 10 == 6:
            save_dir = dev_dir
        else:
            save_dir = train_dir

        save_path = os.path.join(save_dir, filename + '.txt')
        with open(save_path, 'w') as f:
            f.write('\n'.join([' '.join(s) for s in bnc_reader.sents()]))
Example #4
def preprocess(input_folder_path, output_folder_path, combined, mode,
               lowercase):
    if input_folder_path[-1] != '/':
        input_folder_path = input_folder_path + '/'
    if output_folder_path[-1] != '/':
        output_folder_path = output_folder_path + '/'

    if not (os.path.exists(output_folder_path)
            and os.path.isdir(output_folder_path)):
        os.mkdir(output_folder_path)

    file_list = []

    if combined:
        output_file_path = output_folder_path + mode + '-corpus_preprocessed.txt'
        with open(output_file_path, 'w') as f:
            pass
        file_list = dirWalk(input_folder_path, output_file_path, file_list,
                            combined)
    else:
        output_folder_path = output_folder_path + \
            mode.capitalize() + '-corpus_preprocessed/'
        if not (os.path.exists(output_folder_path)
                and os.path.isdir(output_folder_path)):
            os.mkdir(output_folder_path)
        file_list = dirWalk(input_folder_path, output_folder_path, file_list,
                            combined)

    for file_entry in file_list:
        root_path = file_entry[0]
        file_name = file_entry[1]
        output_file_path = file_entry[2]

        bncreader = BNCCorpusReader(root=root_path, fileids=file_name)
        words = bncreader.tagged_words(c5=True)

        if lowercase:
            data = "".join((str(word[0]).lower() + "_" + str(word[1]) + "\n")
                           for word in words)
        else:
            data = "".join(
                (str(word[0]) + "_" + str(word[1]) + "\n") for word in words)

        # mwdata = mwpreprocess(root_path+file_name, lowercase)
        # data = data+mwdata

        if combined:
            with open(output_file_path, 'a') as f:
                f.write(data)
        else:
            with open(output_file_path, 'w') as f:
                f.write(data)
Example #5
def read_bnc_subcorpus(name, regexp):
    bnc_reader = BNCCorpusReader(root='./BNC/texts/', fileids=regexp)

    words = [word.lower() for word in bnc_reader.words()]
    tokenizer = RegexpTokenizer(r'\w+')

    tokens = tokenizer.tokenize(" ".join(words))

    csvfile = f'./words/{name}.csv'
    with open(csvfile, "w", encoding='utf-8') as output:
        writer = csv.writer(output, lineterminator='\n')
        for row in tokens:
            writer.writerow([row])
Example #6
def bnc_words(args):
    root, fileids, c5, stem, omit_tags = args
    logger.debug('Processing %s', fileids)
    bnc = BNCCorpusReader(root=root, fileids=fileids)

    try:
        if not omit_tags:
            return Counter(bnc.tagged_words(stem=stem, c5=c5))
        else:
            return Counter(bnc.words(stem=stem))
    except:
        logger.error('Could not process %s', fileids)
        raise
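The single args tuple suggests bnc_words is meant to be mapped over per-file argument tuples by a worker pool. A minimal driver sketch under that assumption (count_all and its parameters are hypothetical, not from the original project):

from collections import Counter
from multiprocessing import Pool

def count_all(root, fileids_list, c5=True, stem=False, omit_tags=False):
    # One argument tuple per BNC file, matching the unpacking in bnc_words.
    args = [(root, fid, c5, stem, omit_tags) for fid in fileids_list]
    totals = Counter()
    with Pool() as pool:
        for partial in pool.imap_unordered(bnc_words, args):
            totals.update(partial)
    return totals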
Example #7
File: bnc.py  Project: ednussi/loss2vec
def BNC2TXT():
    bnc_reader = BNCCorpusReader(root=PATH_TO_BNC_TEXTS,
                                 fileids=r'[A-K]/\w*/\w*\.xml')
    tokenizer = RegexpTokenizer(r'\w+')
    # txt = bnc_reader.sents()  # all the BNC corpus by sentences

    with open(NEW_BNX_TXT, 'w') as nf:
        i = 0
        for s in bnc_reader.sents():
            # sents() yields lists of tokens; join them before re-tokenizing,
            # and write one sentence per line
            nf.write(' '.join(tokenizer.tokenize(' '.join(s))) + '\n')
            i = i + 1
            if i % 100000 == 0:
                print('Joined {} Sentences, {}% Done'.format(i, 100 * i / 6026276))
Example #8
def bnc_cooccurrence(args):
    """Count word couccurrence in a BNC file."""
    root, fileids, window_size, stem, targets, context = args

    logger.debug('Processing %s', fileids)

    cooccurences = count_cooccurrence(
        BNCCorpusReader(root=root, fileids=fileids).tagged_words(stem=stem),
        window_size=window_size,
    )

    # It might be the case that targets are just words, not (word, POS) pairs.
    # If that is the case, disregard the POS tags for the targets.
    if not isinstance(targets.index[0], tuple):
        cooccurences = ((t[0], c, n) for t, c, n in cooccurences)

    counts = [(targets.loc[t].id, context.loc[c].id, n)
              for t, c, n in cooccurences
              if (t in targets.index) and (c in context.index)]

    if not counts:
        return Counter()

    counts = pd.DataFrame(
        counts,
        columns=('target', 'context', 'count'),
    ).groupby(['target', 'context']).sum()

    # TODO: it would be nice to return a DataFrame.
    #
    # Later, do_sum_counters could sum up data frames, instead of dicts.
    # Probably, it's not even needed to sum up counters across multiple processes.
    # Though, this needs benchmarking, for example on the SWDA targets.
    return Counter(dict(zip(counts.index, counts['count'])))
Example #9
def bnc_vocabulary(root_path):
    """This process prepares the vocabulary file for BNC. Words occur less than
    or equal to 5 times are eliminated.

    There are 4049 xml's in BNC. Total processing time ~ 1 h 30 min
    """
    word_counter = Counter()
    for full_path in tqdm(glob(os.path.join(root_path, r'*/*/*.xml'))):
        root, fileids = os.path.split(full_path)
        bnc_reader = BNCCorpusReader(root=root, fileids=fileids)
        words = bnc_reader.words()
        word_counter.update(words)

    common_words = word_counter.most_common()
    common_words_5 = list(filter(lambda x: x[1] > 5, common_words))
    with open('../../../data/vocabulary/vocab_bnc_5.txt', 'w') as f:
        f.write('\n'.join([w[0] for w in common_words_5]))
Example #10
def create_wordlist_from_subcorpus(name, regexp):
    bnc_reader = BNCCorpusReader(root='./BNC/texts/', fileids=regexp)

    words = [word.lower() for word in bnc_reader.words()]

    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(" ".join(words))

    fdist1 = FreqDist(tokens).items()

    sorted_fdist = sorted(fdist1, key=lambda item: item[1], reverse=True)

    csvfile = f'./exported/{name}.csv'

    with open(csvfile, "w", encoding='utf-8') as output:
        writer = csv.writer(output, lineterminator='\n')
        writer.writerows(sorted_fdist)
Example #11
    def init_kwargs(cls, root=None, fileids=r'[A-K]/\w*/\w*\.xml'):
        if root is None:
            root = os.path.join(getcwd(), 'BNC', 'Texts')

        return dict(
            root=root,
            paths=BNCCorpusReader(root=root, fileids=fileids).fileids(),
        )
Example #12
# -*- coding: utf-8 -*-
from os import listdir
from os.path import isfile, join
import pdb
import pickle
from nltk.corpus.reader.bnc import BNCCorpusReader

strPath = 'C:/mydesktop/BNC/download/Texts/'
bnc_reader = BNCCorpusReader(root=strPath,
                             fileids=r'[A-K]/\w*/\w*\.xml',
                             lazy=False)

listDir = listdir(strPath)
#list_of_fileids=[]
all_words = []
all_tagged_words = []
try:
    for strDir in listDir:  # DIR [A-K]
        listSubDir = listdir(strPath + strDir)
        for strSubDir in listSubDir:  # SUBDIR [A0-AY]
            #pdb.set_trace()
            listFile = listdir(strPath + strDir + '/' + strSubDir)
            for strFile in listFile:  # FILES [A00.xml-A0Y.xml]
                print(strFile)
                strFileID = strDir + '/' + strSubDir + '/' + strFile
                words = bnc_reader.words(stem=True, fileids=strFileID)
                all_words.append(words)
                tagged_words = bnc_reader.tagged_words(
                    stem=True, c5=True, fileids=strFileID)  # C5 Tag
                all_tagged_words.append(tagged_words)
                #pdb.set_trace()
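The nested directory walk above re-derives what the reader's fileids regexp already matches; a shorter alternative sketch (not in the original script) iterates the reader's own file list:

# Hypothetical alternative: let the fileids regexp enumerate the files.
for strFileID in bnc_reader.fileids():
    print(strFileID)
    all_words.append(bnc_reader.words(stem=True, fileids=strFileID))
    all_tagged_words.append(
        bnc_reader.tagged_words(stem=True, c5=True, fileids=strFileID))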
Example #13
from nltk.corpus.reader.bnc import BNCCorpusReader
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from collections import Counter
import os
import csv

# check against the control list
# cluster words with w2v - matrix and dependent ones

stopw = set(stopwords.words('english'))
with open('control_verbs.txt', 'r', encoding='utf-8') as f:
    control = {x.strip() for x in f.readlines()}
lmzr = WordNetLemmatizer()

r = BNCCorpusReader(root='../corpus/BNC/', fileids=r'B/\w*/\w*\.xml')
tagged_sents = r.tagged_sents(c5=True)
sents = r.sents()
matrix = []
data = []
for tsent, sent in zip(tagged_sents[:50000], sents[:50000]):
    for i in range(1, len(tsent)):
        existence = tsent[i][1] is not None and tsent[i - 1][1] is not None
        now_bareinf = existence and tsent[i][1][0] == 'V' and tsent[i][1][
            2] == 'I'
        now_inf = existence and tsent[i][1] == 'TO0'
        now_ger = existence and tsent[i][1][0] == 'V' and tsent[i][1][2] == 'G'
        prev_matrix = existence and tsent[i - 1][1].startswith('VV')
        prev_lex = existence and tsent[i - 1][0].lower() not in stopw
        if prev_matrix and prev_lex:
            if now_ger:
Example #14
Make sure that you set the bnc_reader (line 21) to the correct path where
the BNC is stored on your system.
"""

# We'll use the NLTK BNC reader.
# Beware -- it's very slow!
import nltk

# We'll save the results using json
import json

# We're using the BNC, which is what Payne et al. (2013) use.
from nltk.corpus.reader.bnc import BNCCorpusReader
print('Loading BNC corpus')
bnc_reader = BNCCorpusReader(root="/home/nick/nltk_data/corpora/bnc/Texts", \
                             fileids=r'[A-K]/\w*/\w*\.xml')

# Write to this file:
output_file = './sample_data.json'

# Get some tagged sentences
# The c5 tags provide more relevant information
# than the default tags
print('Preparing tagged sentences.')
tagged_sentences = bnc_reader.tagged_sents(c5=True)

# Count the sentences (since we'll need this number)
print('Counting tagged sentences.')
#tagged_sentences_count = 0
#for sentence in tagged_sentences:
#tagged_sentences_count += 1
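The commented-out loop above hints that the sentence total is needed later. A minimal way to obtain it (a sketch, not part of the original script, and slow because it reads the whole corpus) is:

# Count the tagged sentences by iterating them once (slow on the full BNC).
tagged_sentences_count = sum(1 for _ in tagged_sentences)
print('Found', tagged_sentences_count, 'tagged sentences.')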
Example #15
    def bnc(self):
        """BNC corpus reader."""
        root = self.kwargs['bnc']
        return BNCCorpusReader(root=root, fileids=self.fileids)
Example #16
File: bnc.py  Project: gw769/python-notes
# BNC-XML

# - XML
# - CHILDES
# - JSON

## BNC XML

## Read BNC XML
import nltk
from nltk.corpus.reader.bnc import BNCCorpusReader
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

# Instantiate the reader like this
bnc_reader = BNCCorpusReader(root="../../../Corpus/BNC-XML/Texts/", fileids=r'[A-K]/\w*/\w*\.xml')
list_of_fileids = ['A/A0/A00.xml', 'A/A0/A01.xml']
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(bnc_reader.words(fileids=list_of_fileids))
scored = finder.score_ngrams(bigram_measures.raw_freq)

print(scored)
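raw_freq only ranks bigrams by relative frequency; a short follow-up sketch (assumed, not part of the original notes) scores the same finder by PMI and keeps the top collocations:

# Rank the same bigrams by pointwise mutual information, ignoring rare pairs.
finder.apply_freq_filter(5)
top_collocations = finder.nbest(bigram_measures.pmi, 20)
print(top_collocations)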
Example #17
def getContexts(lemmatization, path, wordWindow=1000):
    """ Returns a list of contexts (subdivided documents based on word window, default 1,000) of the British National Corpus.  

    Parameters
    ----------
    lemmatization : Boolean
        If True, stemmed (lemmatized) word forms are read from the corpus.
    path : str
        The directory of the folder where the BNC is saved.
    wordWindow : int
        The number of words per context into which each BNC document is
        subdivided. If not given, 1,000 is used as the default.

    Returns
    -------
    contextsList : list
        List of all the contexts in the corpus.
    contextInfo : str
        Id of the context.
    docInfo : list
        Id of the document.
    
    Notes
    -----

    References
    ----------

    """
    # Set time
    start_time = time.perf_counter()
    # Set BNC reader & parameters
    bnc_reader = BNCCorpusReader(root="Resources/Corpora/BNC/Texts",
                                 fileids=r'[A-K]/\w*/\w*\.xml')
    # Check if text is from written source
    tags = [
        elem.tag for event, elem in ET.iterparse(path, events=("start", "end"))
    ]
    if "wtext" in tags:
        docID = path[33:-4]
        # Set time
        start_time = time.perf_counter()
        # Read in a document as list of words
        docWordListRaw = bnc_reader.words(fileids=path[28:],
                                          strip_space=True,
                                          stem=lemmatization)
        # Preprocessing of raw text
        docWordList = TextPreProcessing(docWordListRaw)
        # Split document into contexts
        contextsList, contextInfo, docInfo = SplitDocuments(
            docWordList, docID, wordWindow)
    elif "stext" in tags:
        contextsList = "SPOKEN"
        contextInfo = "SPOKEN"
        docInfo = "SPOKEN"
    else:
        contextsList = "NEITHER"
        contextInfo = "NEITHER"
        docInfo = "NEITHER"
    # Print out status
    t = time.perf_counter()
    print('t: ', t / 60, end='\t')
    print(t - start_time, "multiprocessor seconds")
    return (contextsList, contextInfo, docInfo)