Example #1
def load_corpus(race_code=None,
                gender_code=None
                ):  #loads corpora into an array based on race and gender

    if race_code is None:  # if none is specified, search all
        race_code = ".."
    if gender_code is None:
        gender_code = ".."

    reader = PlaintextCorpusReader(
        corpus_root, ".*_" + race_code + "_" + gender_code +
        "\.txt")  # uses filename encoding to load specified texts
    corpora = []

    for fileid in reader.fileids(
    ):  #creates ComedyCorpus object, populates with fileid and name
        new_corpus = ComedyCorpus()
        new_corpus.set_fileid(fileid)
        try:
            new_corpus.set_text(
                reader.raw(fileid))  #gets word content based on fileid
        except UnicodeDecodeError:
            continue
        fileid = re.sub("_" + race_code + "_" + gender_code + "\.txt", "",
                        fileid)
        #name is fileid without encoding
        fileid = fileid.replace("%20", " ")
        fileid = fileid.replace("_", "; ")
        print(fileid)
        new_corpus.set_name(fileid)
        corpora.append(new_corpus)

    return corpora
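
A minimal usage sketch (not part of the original example; it assumes corpus_root is defined and that filenames follow the <name>_<race>_<gender>.txt convention described in the comments, with "W" and "F" as purely hypothetical codes):

white_female = load_corpus(race_code="W", gender_code="F")  # hypothetical codes
everything = load_corpus()  # with no codes given, ".." matches any race/gender in the filename
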
def generateNgramModel(corpusPath, corpusName):
    corpusdir = 'corpora/'  # Directory of corpus.
    generatedCorpus = PlaintextCorpusReader(corpusPath, corpusName)
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    ngrammodel = NgramModel(2, generatedCorpus.sents(), True, False,
                            estimator)  # uses a bigram model
    return ngrammodel
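
Note: NgramModel (and this LidstoneProbDist-based setup) only runs on old NLTK releases; the nltk.model module was removed in NLTK 3. A rough equivalent with the newer nltk.lm package might look like the following sketch, which is not code from the original project:

from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.lm import Lidstone
from nltk.lm.preprocessing import padded_everygram_pipeline

def generate_bigram_model(corpus_path, corpus_name):
    corpus = PlaintextCorpusReader(corpus_path, corpus_name)
    # Build padded bigram training data plus a vocabulary stream from the corpus sentences.
    train, vocab = padded_everygram_pipeline(2, corpus.sents())
    lm = Lidstone(0.2, 2)  # gamma=0.2 mirrors the LidstoneProbDist estimator above
    lm.fit(train, vocab)
    return lm
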
Example #3
def get_phrase():
    root_dir = r'E:\github_repo\python_basic\pythonbasictest\self_nltk\files'
    wordlists = PlaintextCorpusReader(root_dir,".*")
    x = nltk.Text(wordlists.words("test.txt"))
    print(x)
    
    x.collocations()  # prints the collocations itself; it returns None, so no print() is needed
def align(filename):

	files = filename.split('(')
	ripe_file = os.path.abspath(files[1])
	raw_file = os.path.abspath(files[0])
	raw_for_nltk = os.path.abspath('data/newcorpus/source.txt')
	with open(files[0]) as f:
		with open(raw_for_nltk,"w") as f1:
			for line in f:
				f1.write(line)

	corpusdir = 'data/newcorpus/'
	newcorpus = PlaintextCorpusReader(corpusdir, '.*',sent_tokenizer=nltk.data.LazyLoader('tokenizers/punkt/german.pickle'))
	out = open(ripe_file, "w")
	i = 0
	temp =[]
	temp.append(newcorpus.sents(raw_for_nltk))
	tempVal = str(temp[i])
	tempVal = tempVal.replace(",", "")
	tempVal = tempVal.replace("u'", "")
	tempVal = tempVal.replace("'", "")
	tempVal = tempVal.replace("[", "")
	tempVal = tempVal.replace("]", "")
	out.write(tempVal+os.linesep)
	out.close()
	return
Example #5
def read_corpus(corpus_path):
    from nltk.corpus.reader.plaintext import PlaintextCorpusReader
    corpus = PlaintextCorpusReader(corpus_path, ".*\.txt")
    ctext = corpus.raw()
    #    with open('corpus.txt', 'w') as cf:
    #        cf.write(ctext.encode('utf-8'))
    return ctext
Example #6
def generate_words_grammar():
    """
    Use sentence grammar to find words that could be Rent lyrics
    :return:
    """
    # Load corpuses to look in
    gentrification = PlaintextCorpusReader(
        'corpus', '.*')  # Gentrification articles are in this directory
    gentrify_sents = gentrification.sents()
    wine_sents = nltk.corpus.webtext.sents('wine.txt')
    corpus_sents = gentrify_sents + wine_sents
    syls_1 = []
    syls_2 = []
    syls_4 = []
    syls_2_sing = []
    for sent in corpus_sents:
        parsed_sent = nltk.pos_tag(sent)
        for word in parsed_sent:
            no_syls = count_syllables(word[0])
            if word[1] == 'NNS' and len(word[0]) > 3:
                if no_syls == 1:
                    syls_1 = syls_1 + [word[0].lower()]
                elif no_syls == 2:
                    syls_2 = syls_2 + [word[0].lower()]
                elif no_syls == 4:
                    syls_4 = syls_4 + [word[0].lower()]
            if word[1] == 'NN' and len(word[0]) > 2:
                if no_syls == 2:
                    syls_2_sing = syls_2_sing + [word[0].lower()]
    return list(set(syls_1)), list(set(syls_2)), list(set(syls_4)), list(
        set(syls_2_sing))
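
A hedged usage sketch (it assumes the 'corpus' directory of gentrification articles and the count_syllables helper used above are available):

one_syl, two_syl, four_syl, two_syl_sing = generate_words_grammar()
print(two_syl[:10])  # a sample of the two-syllable plural nouns harvested from the corpora
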
Example #7
def load_feat_data(dir_array):

    data_list = []

    for direct in dir_array:

        data = []

        corpus_dir = 'dataset/' + direct
        corpus = PlaintextCorpusReader(corpus_dir, '.*\.*')
        file_ids = corpus.fileids()

        for file in file_ids:
            text = corpus.raw(file)
            e = email.message_from_string(text)

            if e.is_multipart():
                for payload in e.get_payload():  # get_payload() must be called, not referenced
                    text = payload.get_payload()

            else:
                text = e.get_payload()

            data.append(extract_features(text, corpus, file))

        data_list.extend(data)

    return data_list
Example #8
    def __init__(self, master):
        '''Constructor. master is a string that names a directory in the same
        repository that contains all the work from inspiration.
        '''
        self.master = 'masters/' + master
        self.reader = PlaintextCorpusReader(self.master,
                                            r'.*',
                                            encoding='utf-8')
        self.text = self.reader.words()
Example #9
def get_corpus_words():
    '''
        Returns all the words from corpus.
    '''
    reader = PlaintextCorpusReader(settings.CORPUS_ROOT,
                                   settings.CORPUS_FILES_GLOBB)
    if reader:
        return reader.words()
    return []
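
For illustration only, hypothetical values for the settings module this helper reads (the names come from the call above; the values are invented):

# settings.py (hypothetical)
CORPUS_ROOT = '/path/to/corpus'     # directory holding the plain-text files
CORPUS_FILES_GLOBB = r'.*\.txt'     # fileid pattern handed to PlaintextCorpusReader
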
Example #10
 def cv_to_matrix(self):
     corpusdir = 'data/cv_corpus'
     corpa = PlaintextCorpusReader(corpusdir,'.*',encoding='windows-1252')
     print("Preprocessing words....")
     sents = [[token.lemma_ for token in nlp(" ".join(self.clean(sent)).lower()) if token.lemma_ not in stopset] for sent in corpa.sents()]
     print("training word vectors....")
     model = Word2Vec(sents,window=5, size=self.ncol,min_count=1, workers=4)
     fname = get_tmpfile("vectors.kv")
     model.wv.save(fname)
     print("cv_to_matrix model saved")
     return model.wv
Example #11
 def __init__(self, data_root):
     self.data_root = data_root
     self.data = PlaintextCorpusReader(data_root, '.*')
     self.words = [i for i in self.data.words() if i.isalpha()]
     self.text = Text(self.words)
     self.stop = set(stopwords.words('english')).union({
         'cid', 'et', 'al', 'also', 'and', 'editingboston', 'arxiv',
         'pages', 'trackboston', 'preprint', 'page', 'vol', 'volume',
         'march', 'boston', 'table'
     })
     with open('bib.json') as fi:
         self.bib = json.load(fi)
Example #12
 def build_d2v_model(self):
     print("Début de la construction du modèle Doc2Vec")
     corpusdir = 'data/cv_corpus'
     corpa = PlaintextCorpusReader(corpusdir, '.*',encoding='windows-1252')
     print("tokenizing...")
     resumes = [[token.lemma_  for sent in paras for token in nlp(" ".join(self.clean(sent)).lower()) if token.lemma_ not in stopset] for paras in  corpa.paras()]
     #print(resumes[0:3])
     print("tokenization completed")
     documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(resumes)]
     model = Doc2Vec(documents, vector_size=self.cv_length, window=5, min_count=1, workers=4)
     print("Fin de la construction du modèle Doc2Vec")
     return model
Example #13
 def token_in_coverage(self):
     corpusdir = 'data/cv_corpus'
     corpa = PlaintextCorpusReader(corpusdir, '.*',encoding='windows-1252')
     resumes = [[item for sent in paras for item in sent] for paras in corpa.paras()]
     cpt = 0
     for resume in resumes:
         resume_text = " ".join(resume)
         resume_sents = nltk.sent_tokenize(resume_text)
         # each sent is already a string, so pass it to nlp() directly instead of joining its characters
         resume_words = set(token.lemma_ for sent in resume_sents for token in nlp(sent.lower()))
         if not resume_words.isdisjoint(self.tokens_in):
             cpt += 1
     coverage = cpt*1.0/len(resumes)
     print("token_in coverage : {}".format(coverage))
Example #14
def get_fileid_lst(source_dir):
    '''
    Use NLTK to pull in the list of file ids in the given source directory

    :param {str} source_dir:
        The relative path to the source directory that contains all the data (book) files
    :return {str} fileid_lst:
        List of all file id's ending in '.txt' in the source_dir
    '''
    temp_corp = PlaintextCorpusReader(source_dir, '.*\.txt')
    fileid_lst = temp_corp.fileids()

    return fileid_lst
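
A hedged call sketch (the directory name is invented):

fileid_lst = get_fileid_lst('books/')  # hypothetical source_dir of .txt files
print(len(fileid_lst), 'text files found')
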
Example #15
def main():
    """
    Main function of the program
    """

    corpus_dir = 'NLP_dataset/training_set'  # Directory of corpus.
    new_corpus = PlaintextCorpusReader(corpus_dir, '.*')
    for file_id in new_corpus.fileids():
        with open(corpus_dir + "/" + file_id, "r") as file_to_read:
            # reading each file to get matched sentences
            matched_sen = match_regular_expressions(file_to_read)

        # writing the matched sentences to files
        write_to_files(matched_sen, file_id)
Example #16
File: reasoner.py Project: Nurtal/BIBOT
def create_corpus():
    ## Create corpus from abstract
    ## fetched by BIBOT
    ## return a corpus object

    ## Read the abstract result file
    abstract_to_content = {}
    abstract_file = open("fetched/pubmed_abstract.txt", "r")
    for line in abstract_file:
        line = line.replace("\n", "")
        if line.startswith(">"):  # startswith() also copes with empty lines
            abstract = line[1:]
            abstract_to_content[abstract] = ""
        else:
            content = line
            abstract_to_content[abstract] = content
    abstract_file.close()

    ## create files
    for key in abstract_to_content.keys():
        text_file = open("fetched/corpus/" + str(key) + ".txt", "w")
        text_file.write(abstract_to_content[key])
        text_file.close()

    ## nltk magical lines
    corpusdir = 'fetched/corpus/'
    newcorpus = PlaintextCorpusReader(corpusdir, '.*')

    return newcorpus
def getCorupsFromCorpusFile(CorpusFile):

    CorpusDir, CorpusFile = os.path.split(CorpusFile)

    corpus = PlaintextCorpusReader(CorpusDir, CorpusFile)

    return corpus
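
A hedged call sketch (the path is invented); os.path.split derives the corpus directory and the fileid from the full path:

corpus = getCorupsFromCorpusFile('/data/corpora/news.txt')  # reads 'news.txt' from '/data/corpora'
print(corpus.fileids())
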
Example #18
    def load_corpus(self):

        if len(self.corpus) == 0:
            raise Exception('No corpus defined.')

        if os.path.isdir(self.corpusdir) is False:
            self.generate_corpus_files()

        newcorpus = PlaintextCorpusReader(self.corpusdir, '.*')

        # bard.sents = newcorpus.sents
        bard.tokens = newcorpus.words()
        print len(bard.tokens)

        # print 'init markov NLG text generator'
        self.generator = bard.generators.markov.IntelligentMarkovGenerator(bard.tokens)
Example #19
class App:

    def makeTrainingData(reader):
        for category in reader.categories():
            for file in reader.fileids(category):
                yield FreqDist(reader.words(fileids=[file])), category



    corpusDirectory = "../../resources/input/"
    #Was using PlaintextCorpusReader, switched to Categorized to provide categories
    wattsCorpus = PlaintextCorpusReader(corpusDirectory, '.*')

    print wattsCorpus.raw().strip()
    print wattsCorpus.words()
    for sentence in wattsCorpus.sents():
        print sentence
    print len(wattsCorpus.sents())
    text = nltk.tokenize.word_tokenize(wattsCorpus.raw())
    print "tokenized text: ", text

    #example of finding similar word
    text = nltk.Text(word.lower() for word in wattsCorpus.words())
    print "similar to god: ", text.similar('god')

    words = nltk.pos_tag(text)
    fdist = nltk.FreqDist(words)
    print "frequencey distribution: ", fdist

    sentence = "So there are two ways of playing the game. The first way, which is the usual way, is that a guru or teacher who wants "
    sentenceWords = nltk.word_tokenize(sentence)
    fdistForSentence = nltk.FreqDist(sentenceWords)
    fdistForSentence.plot()
Example #20
def construct_models():
    """ Builds the classification models. """
    sources = [
        'Conservative',  # Scalia + Rehnquist
        'Progressive'
    ]  # Ginsburg + Stevens
    corpus = [(PlaintextCorpusReader('data/' + path + '/', '.*'), path)
              for path in sources]
    documents = []
    for (c, cat) in corpus:
        for fileid in c.fileids():
            documents.append((c.words(fileid), cat))

    random.shuffle(documents)

    all_words = []

    for (c, cat) in corpus:
        all_words.extend(c.words())

    all_words = nltk.FreqDist(all_words)
    word_features = list(all_words.keys())[:3000]
    featuresets = [(find_features(opinion, word_features), cat)
                   for (opinion, cat) in documents]

    training_subset = int(len(featuresets) * 0.9)
    training_set = featuresets[:training_subset]
    testing_set = featuresets[training_subset:]

    ensemble = EnsembleClassifer(training_set, testing_set)
    ensemble.show_most_useful_features()
    ensemble.accuracy()
    print(ensemble.classify(testing_set[0][0]))
Example #21
File: wikiterm.py Project: hmetaxa/LiTe
def main():
    corpus_root = sys.argv[1]
    num_text_files = int(sys.argv[2])
    algorithm_type = sys.argv[3]
    pmi_freq_filter = int(sys.argv[4])
    file_list = []
    for i in range(0, num_text_files):
        file_list.append(sys.argv[5 + i])
    corpus = PlaintextCorpusReader(corpus_root, '.*')
    if 'bigram' in algorithm_type:
        measures = nltk.collocations.BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(corpus.words())
        finder.apply_freq_filter(pmi_freq_filter)
        scored = finder.score_ngrams((f(algorithm_type)))
    else:
        measures = nltk.collocations.TrigramAssocMeasures()
        finder = TrigramCollocationFinder.from_words(corpus.words())
        finder.apply_freq_filter(pmi_freq_filter)
        scored = finder.score_ngrams((f(algorithm_type)))

    sort = (sorted(scored, key=lambda tu: tu[1]))
    for key in sort:
        ngrams = len(key[0])
        if (ngrams == 2):
            print key[0][0] + "\t" + key[0][1] + "\t" + str(key[1])
        else:
            print key[0][0] + "\t" + key[0][1] + "\t" + key[0][2] + "\t" + str(
                key[1])
def create_corpus(directory):
    corpus = PlaintextCorpusReader(directory,
                                   '.*',
                                   encoding="iso-8859-1",
                                   word_tokenizer=word_tokenize,
                                   sent_tokenizer=sent_tokenize)
    return corpus
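
Note that words() and sents() on a PlaintextCorpusReader call .tokenize() on the tokenizers it was given, so they expect tokenizer objects rather than the bare word_tokenize/sent_tokenize functions passed above. A hedged variant using tokenizer objects (the Punkt model choice is illustrative):

from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import TreebankWordTokenizer
import nltk.data

def create_corpus_with_tokenizer_objects(directory):
    return PlaintextCorpusReader(
        directory,
        '.*',
        encoding="iso-8859-1",
        word_tokenizer=TreebankWordTokenizer(),  # object exposing .tokenize()
        sent_tokenizer=nltk.data.LazyLoader('tokenizers/punkt/english.pickle'))
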
Example #23
File: nlp.py Project: MCodeLab/udn-nlp
def corpus_reader(corpus_name):
    ''' Open a PlaintextCorpusReader for the given UDN corpus.
    '''
    # If the user requested an unfiltered corpus version, we need to know the root corpus name
    root_corpus = corpus_name.replace('-unfiltered', '')

    # Ensure the desired corpus's submodule is checked out
    if not os.path.exists('./corpora/{}/README.md'.format(root_corpus)):
        retcode = subprocess.call(
            "git submodule update --init -- corpora/{}".format(
                root_corpus).split(" "))
        if retcode != 0:
            print(
                "Failed to check out the submodule for corpus '{}'. Try running 'git submodule update --init' manually."
                .format(root_corpus))
            exit()

    percentage = ''
    with open('./corpora/{0}/{0}.txt'.format(root_corpus), 'r') as f:
        manifest = f.readlines()
        query = manifest[0].split(" ")[3]
        num_found = util.dry_make_request(query, 0, 1)[0]['numFound']
        num_in_corpus, last_one = util.files_in_dir('./corpora/{}/{}'.format(
            root_corpus, corpus_name))
        percentage = '{0:.0%}'.format(num_in_corpus / num_found)
        if percentage != '100%':
            print('NOTE: This corpus is only {} complete. Last file: {}\n'.
                  format(percentage, last_one))

    corpus = PlaintextCorpusReader(
        './corpora/{}/{}'.format(root_corpus, corpus_name), r'.*\.txt')
    return corpus
Example #24
    def __init__(self,
                 input_folder_name,
                 doc_pattern,
                 categ_pattern,
                 encoding='utf-8'):
        CategorizedPlaintextCorpusReader.__init__(self,
                                                  input_folder_name,
                                                  doc_pattern,
                                                  cat_pattern=categ_pattern)
        self.input_folder_name = input_folder_name
        self.encoding = encoding
        self.root_reader = PlaintextCorpusReader(input_folder_name,
                                                 fileids=r'[^\/]*.' +
                                                 doc_pattern[-3:])
        #self.root_ids =[ os.path.join(input_folder_name,item) for item in self.root_reader.fileids()]

        self.root_ids = list(self.root_reader.fileids())
Example #25
    def setup_corpus(self, corpus_dir, paths='.*'):
        """Setting up a corpus.

        Args:
            corpus_dir(str): Path to corpus directory.
            paths(str): Fileid pattern to include (defaults to '.*').
        """
        self.corpus = PlaintextCorpusReader(corpus_dir, paths)
        return self.corpus
Example #26
def load_data(dir_label):

    data_list = []
    labels = []

    for dl in dir_label:

        data = []

        directory = dl[0]
        label = dl[1]

        corpus_dir = 'dataset/' + directory
        corpus = PlaintextCorpusReader(corpus_dir, '.*\.*')
        file_ids = corpus.fileids()

        for file in file_ids:

            d = []

            text = corpus.raw(file)
            e = email.message_from_string(text)

            if e.is_multipart():
                for payload in e.get_payload():  # get_payload() must be called, not referenced
                    text = payload.get_payload()
            else:
                text = e.get_payload()

            feats = [
                cf.charac_feats_extractor(text),
                wf.word_feats_extractor(text),
                syf.syntac_feats_extractor(text),
                stf.struct_feats_extractor(corpus, file, text),
                fwf.funct_word_feats_extractor(text)
            ]

            for f in feats:
                d.extend(list(f.values()))

            data.append(d)
            labels.append(label)

        data_list.extend(data)

    return [data_list, labels]
Example #27
def pdf_to_corpus():
    path = 'D://Eclipse Workspace//NLP//Assignment//res//'

    for filename in glob.glob(os.path.join(path, '*.pdf')):
        print(filename)
        pdfFileObj = open(filename, 'rb')

        # creating a pdf reader object
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

        # printing number of pages in pdf file
        print(pdfReader.numPages)

        # creating a page object
        pageObj = pdfReader.getPage(0)

        # extracting text from page
        text = pageObj.extractText()

        strings_list = text.split("\n")
        # Make new dir for the corpus.
        corpusdir = 'customcorpus/'
        if not os.path.isdir(corpusdir):
            os.mkdir(corpusdir)

        # Output the files into the directory.
        file_name = filename.split("\\")[-1]

        print(file_name)
        pbar = ProgressBar(widgets=[
            'Creating Corpus',
            Bar('#', '[', ']'), ' ',
            Percentage(), ' ',
            ETA()
        ],
                           maxval=100)
        for text in pbar(strings_list):
            with open(corpusdir + '[PDF] ' + file_name + '.txt', 'ab') as fout:
                fout.write(text.encode('utf-8'))
        pbar.finish()

        #create_corpus(text)
        corpus = PlaintextCorpusReader('customcorpus/', '.*')

        print(corpus.raw())
Example #28
	def _strip_tags(self, title):

		new_title = ''

		custom_corpus = PlaintextCorpusReader('../custom_corpora/', '.*')

		#For each word in the title
		for word in title.split():

			#Remove all punctuation
			noPunc = ''.join(c for c in word if c not in string.punctuation)

			#If this word isn't in stopwords and isn't just a single letter
			if noPunc.lower() not in (stopwords.words('english')) and len(noPunc) > 1:

				stripped_word = self._strip_word(word)

				if stripped_word not in (custom_corpus.words('media')) and len(stripped_word) > 1:
					new_title = ' '.join([new_title, stripped_word])

		return new_title[1:]
Example #29
def corpus_reader(filepath):
    """
    takes a filepath including filename
    formats in case file is csv
    loads file into PlainTextCorpusReader
    """
    print "TEST: corpus_reader call"

    csv_file = open(filepath, 'rb') # use test_1.csv as test case
    csv_data = csv.reader(csv_file)
    global csv_read
    csv_read = open('uploads/tmp/read.tmp', 'w')
    for line in csv_data:
        line_to_write = re.sub('[\s\t]+', ' ', str(line))
        line_to_write = line_to_write.lstrip('[\'')
        line_to_write = line_to_write.rstrip('\']')
        csv_read.write(str(line_to_write) + "\n\n")
    csv_read.flush()  # make sure the temp file is written out before the corpus reader opens it
    root = 'uploads/'
    corpus = PlaintextCorpusReader(root, 'tmp/read.tmp')
    #response = corpus.paras()
    words = corpus.words()
    return words
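
A hedged call sketch (the path is hypothetical, following the "use test_1.csv as test case" comment above):

words = corpus_reader('uploads/test_1.csv')
print words[:20]  # the snippet is Python 2, so print is a statement here
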
Example #30
 def __init__(self, config):
     print('Filepath for texts = ', config.textpath)
     self.corpus = PCR(config.textpath,
                       '.*\.txt',
                       encoding='utf-16',
                       para_block_reader=read_line_block)
     if config.clean_paragraphs == 'yes':
         self.clean(config, mode='para')
     if config.clean_sentences == 'yes':
         self.clean(config, mode='sent')
     #Corpus summaries
     self.corpus_info()
     self.LDA(config.num_topics, config.num_words)
     self.plot(config.num_words)
def Read_corpus(path_c, fname_c, fo1):
    import nltk
    import re
    import spacy
    import en_core_web_sm
    import fileinput
    nlp = spacy.load('en_core_web_sm')
    from nltk.corpus.reader.plaintext import PlaintextCorpusReader

    pcorpus = PlaintextCorpusReader(path_c, fname_c, encoding="utf")

    #HTML Tags to file
    fappend(fo1, P_htmltag.writehtmltag1(fname_c), fname_c)

    # Iterate through each paragraph
    for para in pcorpus.paras():
        L0 = rep_tags(para)
        L1 = L0.split("\n")
        for i, w in enumerate(L1):
            if (w != ""):
                ApplyNLP(nlp(str(w[1:])), fo1)

    fappend(fo1, P_htmltag.writehtmltag3(fname_c), fname_c)
Example #32
def token_assamese():
    # Modifiy these to change the location of the coupus file  and the name of  the courpus  file
    corpus_path = "/Users/partha/All/Python/ProjectMaterials/Learned material/Arts"
    corpus_filename = 'Psychology.txt'

    newcorpus = PlaintextCorpusReader(corpus_path,
                                      corpus_filename,
                                      encoding='utf16')
    text = newcorpus.raw().strip().replace('।', '.')
    words = nltk.word_tokenize(text)

    for index, item in enumerate(words):
        if str(item) == '.':
            words[index] = '।'

    output_file_path = "C:/Users/HEMANT/Documents/1.Project/"
    output_filename = 'Result.txt'

    with open(output_file_path + output_filename, 'w', encoding='utf8') as f:
        for i in words:
            f.write(str(i) + '\n')  # the with-block closes the file, so no explicit close() is needed
Example #33
 def __init__(self, my_input_file):
     self.config = configparser.ConfigParser()
     self.config.read("text_analysis.cfg")
     self.input_file = my_input_file
     self.nlp_model = self.config["DEFAULT"]["nlp_model"]
     #The output file name
     self.output_file = self.config["DEFAULT"]["output_file"]
     self.nlp = load_nlp(self.nlp_model)
     self.corpus = CorpusReader(".", self.input_file)
     self.raw_text = self.corpus.raw()
     self.nlp_text = self.nlp(self.raw_text)
     # Here, lets put together the infos for text analysis with spacy.
     self.analysis_dictionary = Counter()
     self.word_count = 0
     self.get_word_count_nltk()
Example #34
def read_article(file_path):
    #file = open(file_path, "r")
    ##INSERT FILE NAME IN FUNCTION CALL BELOW######
    bcr = PlaintextCorpusReader(file_path, 'bernie.txt')
    #filedata = file.read()
    filedata = bcr.raw()
    #for word in filedata.split():
    #    if word == 'Mr.':
    #        filedata[word] = 'Mr'
    article = filedata.replace("\n\n", '. ').replace('Mr.', 'Mr').replace(
        "\r", ' ').replace('\n', ' ').split('. ')
    articlez = []
    for line in article:
        if line == '':
            continue
        if line[0] == '\n':
            line = line[1:]
        articlez.append(line)
    sentences = []
    for sentence in articlez:
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
    sentences.pop()

    return sentences
Example #35
def parseFolder( dirPath ):
    assignments = []
    draftReader = PlaintextCorpusReader(dirPath, '\d+draft\d*.*')
    finalReader = PlaintextCorpusReader(dirPath, '\d+final\d*.*')

    numFiles = len( os.listdir( dirPath ))
    assert numFiles % 2 == 0

    finalIdsSortedList = finalReader.fileids()
    draftIdsSortedList = draftReader.fileids()

    for i in range(len(finalReader.fileids())):
        final = finalReader.paras( finalIdsSortedList[i] )
        draft = draftReader.paras( draftIdsSortedList[i] )
        assn = assignment( draft, final )
        assignments.append( assn )

    return assignments
Example #36
File: Parse.py Project: bflick/FYE-NLP
def parseFolder( dirPath ):
    assignments = {}
    draftReader = PlaintextCorpusReader(dirPath, '\d+draft\d*.*')
    finalReader = PlaintextCorpusReader(dirPath, '\d+final\d*.*')

    numFiles = len( os.listdir( dirPath ))
    assert numFiles % 2 == 0

    finalIdsSortedList = finalReader.fileids()
    draftIdsSortedList = draftReader.fileids()

    for pid in finalReader.fileids():
        final = finalReader.paras( pid ) #finalIdsSortedList[i] )
        draft = draftReader.paras( pid ) #draftIdsSortedList[i] )
        assn = assignment( draft, final )
        assignments[pid] = assn

    return assignments
Example #37
def processFile(newCorpusDir):
    if not os.path.isdir(newCorpusDir):
        os.mkdir(newCorpusDir)
    txt1 = getText('sample_feed.txt')
    txt2 = pdf.getTextPDF('VirtualBoxTroubleshooting.pdf')
    txt3 = word.getTextWord('my_doc.docx')

    files = [txt1, txt2, txt3]
    for idx, f in enumerate(files):
        with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
            fout.write(f)

    newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')

    print(newCorpus.words())
    print(newCorpus.sents(newCorpus.fileids()[1]))
    print(newCorpus.paras(newCorpus.fileids()[0]))
Example #38
import nltk.data
import re
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import treetaggerwrapper

article_corpus = PlaintextCorpusReader('text_plain/', '.*\.txt', 
	sent_tokenizer=nltk.data.LazyLoader('tokenizers/punkt/spanish.pickle'))

stop_words = nltk.corpus.stopwords.words('spanish') 
non_alphabetic = re.compile("\W|\d")
words = []
tags = []

# Using TreeTagger 
# 1) pip install treetaggerwrapper
# 2) put treetragger in %PYHOME%\Lib\site-packages\TreeTagger
# 3) put spanish-utf8.par and spanish-chunker.par in \TreeTagger\lib
# See http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/spanish-tagset.txt for tag meanings
tagger = treetaggerwrapper.TreeTagger(TAGLANG='es')
for sentence in article_corpus.sents():
	tagged_sentence = tagger.tag_text(sentence) 
	tags.extend(treetaggerwrapper.make_tags(tagged_sentence))

#TODO: create a tagger script, save the tagged files
#TODO: look at alternate taggers, compare

#TODO: profile this and see which part is taking so long
for tag in tags:
	lemma = tag[2].lower()
Example #39
import os
import nltk
import pickle
import zlib
import base64
from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.classify import PositiveNaiveBayesClassifier
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

corpusdir = "./text"
newcorpus = PlaintextCorpusReader(corpusdir, ".*")
labeled_names = (
    [(name, "comp") for name in newcorpus.words("comp.txt")]
    + [(name, "animal") for name in newcorpus.words("animal.txt")]
    + [(word, "ignore") for word in newcorpus.words("ignorethese.txt")]
)
features = [({n: n}, thing) for (n, thing) in labeled_names]
training = features[:]
testing = "What color is the mouse?".lower().split(" ")
classifier = NaiveBayesClassifier.train(training)
pickleclf = pickle.dumps(classifier)
compressed = base64.b64encode(zlib.compress(pickleclf, 9))
with open("PickledClassifier.txt", "wb") as outobj:
    outobj.write(compressed)
compScore = 0
animalScore = 0
for word in testing:
    if (
        word[len(word) - 1] == "."
        or word[len(word) - 1] == ","
        or word[len(word) - 1] == "?"
Example #40
#!/usr/bin/python
import sys
import wsd
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.text import *

if len(sys.argv) != 4:
  print "Usage:", sys.argv[0], "word sense1 sense2"
  exit(-1)

focal_word = sys.argv[1]
senses = [sys.argv[2], sys.argv[3]]
#focal_word = "plant"
#senses = ["manufacturing","life"]
corpus = PlaintextCorpusReader('outcorpus/', '.*')
collocations = [ wsd.BigramLeft(senses, 0), wsd.BigramRight(senses, 1), wsd.BigramScope(senses, 2, [2, 10]) ]
decision_list = wsd.DecisionList()
decision_list.load("senses_bootstrap_" + focal_word + ".csv")    

i = 0
for infile in sorted(corpus.fileids()):
  print i, "/", len(corpus.fileids())
  i += 1
  
  words = corpus.words(infile)
  text = Text(words)
  c = nltk.ConcordanceIndex(text.tokens)
  offsets = c.offsets(focal_word)
  
  for offset in offsets:
Example #41
        for j in range(len(gwords[i])):
            file.write(str(gwords[i][j]) + ':' +str(gwords[i].count(gwords[i][j])) + ' ')
            # writing the words in a file with a proper format
        file.write('#label#:'+ str(category) +'\n')
        # adding label at the end of each file
    file.close()

def create_content (gdocs,graphicsdir,gcontent):
    for file in gdocs:
        gcontent.append(open(graphicsdir+'/'+str(file),'r').read())


# defining the directory path for each category
graphicsdir,autosdir,gunsdir = '20news-bydate/train/comp.graphics','20news-bydate/train/rec.autos','20news-bydate/train/talk.politics.guns'
graphicstest,autostest,gunstest = '20news-bydate/test/comp.graphics','20news-bydate/test/rec.autos','20news-bydate/test/talk.politics.guns'
graphicscorpus,autoscorpus,gunscorpus = PlaintextCorpusReader(graphicsdir, '.*'),PlaintextCorpusReader(autosdir, '.*'),PlaintextCorpusReader(gunsdir, '.*')
graphicscorpustest,autoscorpustest,gunscorpustest = PlaintextCorpusReader(graphicstest, '.*'),PlaintextCorpusReader(autostest, '.*'),PlaintextCorpusReader(gunstest, '.*')

# initializing the lists
gdocs,adocs,ndocs,gcontent,acontent,ncontent,gwords,awords,nwords,vocab = [],[],[],[],[],[],[],[],[],[]
gtdocs,atdocs,ntdocs,gtcontent,atcontent,ntcontent,gtwords,atwords,ntwords,vtocab = [],[],[],[],[],[],[],[],[],[]
# for train dataset
gdocs.extend(graphicscorpus.fileids()) # for graphics category
adocs.extend(autoscorpus.fileids()) # for autos category
ndocs.extend(gunscorpus.fileids()) # for guns category
# for test dataset
gtdocs.extend(graphicscorpustest.fileids()) # for graphics category
atdocs.extend(autoscorpustest.fileids()) # for autos category
ntdocs.extend(gunscorpustest.fileids()) # for guns category
# retrieving the words for each category
# for train dataset
Example #42
class DumbClusterer():
    """A rather dumb clusterer. 
    """
    def __init__(self, corpus_dir=None, mwes=[], setup_mwes=True, **kwargs):
        self.mwes = mwes
        if corpus_dir is not None:
            self.setup_corpus(corpus_dir, '.*')
            if setup_mwes:
                self.setup_mwes(**kwargs)

    def setup_corpus(self, corpus_dir, paths='.*'):
        """Setting up a corpus.

        Args:
            corpus_dir(str): Path to corpus directory.
            paths(str): Fileid pattern to include (defaults to '.*').
        """
        self.corpus = PlaintextCorpusReader(corpus_dir, paths)
        return self.corpus

    def extract_expressions(self, document, features=None):
        """Returns expressions from given features and multi-word expressions.
        
        In addition to passing a document into this method, MWEs or Multi-Word Expressions
        can be given to treat some multi words as one expression.

        >>> from document import ArthurDocument
        >>> pdf_path = base_path + '/test/test.pdf'
        >>> with open(pdf_path, 'rb') as f:
        ...     document = ArthurDocument(f.read())
        >>> features = document.get_features()[730:816,:]
        >>> print(document.get_text(features)) # doctest:+ELLIPSIS
        VICTORIA'S CROWN JEWEL OF WATERFRONT ESTATES. Nestled on a quiet cove in the exclusive

        Multi-word expression should be detected:
        >>> clusterer = DumbClusterer(mwes=['crown jewel', 'waterfront estates'])
        >>> expressions = clusterer.extract_expressions(document, features)
        >>> print(expressions[2]['text'])
        CROWN JEWEL

        x position should equal x of "C" from "CROWN JEWEL" :
        >>> expressions[2]['x'] == features[11, ArthurDocument.get_feature_id('x')]
        True

        and width should equal to width of "CROWN JEWEL":
        >>> expr_width = expressions[2]['x1']-expressions[2]['x']
        >>> ftr_width = features[21, ArthurDocument.get_feature_id('x1')] - features[11, ArthurDocument.get_feature_id('x')]
        >>> expr_width == ftr_width
        True

        Args:
            document(ArthurDocument): Document to extract data fields from.
            features(list): List of features containing data fields to extract. If not given, use
                            all document features.
            mwes(list): List of Multi-Word Expressions. Example value:
                        `['property type', 'single family']`. With that list, both "property type"
                        and "single family" will each be treated as single expressions.        
        Returns:
            np.array: An array of data_fields.
        """
        mwes = self.mwes
        if features is None:
            features = document.get_features()
        text = document.get_text(features)
        for idx, mwe in enumerate(mwes):
            if isinstance(mwe, str):
                mwes[idx] = word_tokenize(mwe.lower())
            elif hasattr(mwe, '__iter__'):
                mwes[idx] = [x.lower() for x in mwe]
        tokenizer = MWETokenizer(mwes, separator=' ')
        tokenized = tokenizer.tokenize(word_tokenize(text.lower()))

        expressions = []
        pos = 0
        for token in tokenized:
            # token could be "deez nutz" but text contains multiple spaces e.g. "deez  nutz",
            # so we need to split the token and find position of first and last characters.
            words = token.split()
            start_pos = text.lower().index(words[0], pos)
            for word in words:
                ipos = text.lower().index(word, pos)
                end_pos = ipos + len(word)
            pos = end_pos
            min_x = 0
            max_x = 0
            min_y = 0
            max_y = 0
            page = 0
            if len(features[start_pos:end_pos, :]) > 0:
                min_x =  np.amin(features[start_pos:end_pos,:], axis=0)[ArthurDocument.get_feature_id('x')]
                max_x =  np.amax(features[start_pos:end_pos,:], axis=0)[ArthurDocument.get_feature_id('x1')]
                min_y =  np.amin(features[start_pos:end_pos,:], axis=0)[ArthurDocument.get_feature_id('y')]
                max_y =  np.amax(features[start_pos:end_pos,:], axis=0)[ArthurDocument.get_feature_id('y1')]
                page = features[start_pos, ArthurDocument.get_feature_id('page')]

            expressions.append({
                'text': text[start_pos:end_pos],
                'x': min_x,
                'x1': max_x,
                'y': min_y,
                'y1': max_y,
                'page': page
            })
        return expressions

    def setup_mwes(self, trigram_nbest=100, bigram_nbest=2000):
        """Create multi-word expressions by learning a corpus located in a corpus directory.

        Testing setting up mwes with custom path and setting it up twice (correct when no exception):
        >>> corpus_dir = os.path.join(base_path, 'test', 'corpus')
        >>> clusterer = DumbClusterer(corpus_dir=corpus_dir, mwes=['custom mwe'])
        >>> mwes = clusterer.setup_mwes(trigram_nbest=1000, bigram_nbest=15000)
        >>> 'custom mwe' not in mwes
        True

        >>> 'custom mwe' in clusterer.mwes
        True

        Args:
            trigram_nbest(int): Number of highest ranked trigrams to acquire.
            bigram_nbest(int): Number of highest ranked bigrams to acquire.
        Returns:
            list: List of multi-word expressions.
        """
        if self.corpus is None:
            raise Exception("Corpus not found. Run method `setup_corpus` with given corpus directory first.")

        bigram_measures = BigramAssocMeasures()
        trigram_measures = TrigramAssocMeasures()

        # Following are not used since ne chunk takes too much time.
        # Text processing before bigrams and trigrams calculated
        # words = []
        # for sent in self.corpus.sents():
        #     for chunk in nltk.ne_chunk(nltk.pos_tag(sent)):
        #         if not isinstance(chunk, nltk.Tree):
        #             w = chunk[0]
        #             # - Removal of words containing numbers or punctuations
        #             if not any((ch.isdigit() or ch in string.punctuation) for ch in w):
        #                 # - Lowercasing all words
        #                 words.append(w.lower())
        #                 print(w.lower().encode("utf-8")),

        # Text processing before bigrams and trigrams calculated
        words = []
        for w in self.corpus.words():
            # - Removal of words containing numbers or punctuations
            if not any((ch.isdigit() or ch in string.punctuation) for ch in w):
                # - Lowercasing all words
                words.append(w.lower())

        bigram_finder = BigramCollocationFinder.from_words(words)
        trigram_finder = TrigramCollocationFinder.from_words(words)
        mwes = trigram_finder.nbest(trigram_measures.pmi, trigram_nbest) + bigram_finder.nbest(bigram_measures.pmi, bigram_nbest)
        # Merge the existing MWEs with the newly learned ones via a set union;
        # list-valued MWEs are turned into tuples first so they are hashable.
        set1 = {(tuple(mwe) if isinstance(mwe,list) else mwe) for mwe in self.mwes}
        set2 = set(mwes)
        self.mwes = list(set1 | set2)
        return mwes
Example #43
import nltk
import re
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import FreqDist

corpus_root = '/home/aman/entire-src/py/dir'
speeches = PlaintextCorpusReader(corpus_root, '.*\.txt')

print "Finished importing corpus"

raw = speeches.raw().lower()
tokens = nltk.word_tokenize(raw)
tgs = nltk.trigrams(tokens)
fdist = nltk.FreqDist(tgs)
for k,v in fdist.items():
    print k,v
Example #44
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# Create a new corpus by specifying the parameters
# (1) directory of the new corpus
# (2) the fileids of the corpus
# NOTE: in this case the fileids are simply the filenames.
newcorpus = PlaintextCorpusReader('nltkCorpusAll/', '.*')

# Access each file in the corpus.
for infile in sorted(newcorpus.fileids()):
    print infile # The fileids of each file.
    fin = newcorpus.open(infile)# Opens the file.
    print fin.read().strip() # Prints the content of the file
print

# Access the plaintext; outputs pure string/basestring.
print newcorpus.raw().strip()
print 

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and 
#       nltk.tokenize.word_tokenize.
#
# Each element in the outermost list is a paragraph, and
# Each paragraph contains sentence(s), and
# Each sentence contains token(s)
print newcorpus.paras()
print

# To access paragraphs of a specific fileid.
#Tried to find misspellings in a corpus of text files. See find_misspellings.py and grouping_docs.py for documentation.
#There are ~30,400 unique words in these 49 communication files
#Rebecca's laptop took too long to make the correlation matrix

import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer
import numpy as np
from numpy import linalg

#make a new corpus
corpusdir = 'communications/small_test_batch' #where the files are
newcorpus = PlaintextCorpusReader(corpusdir, '.*')

fileids = newcorpus.fileids() #list of fileids
j = len(fileids) #number of docs

words_list = [] #['doc', '1', 'words', 'doc', '2', 'words',...]
doc_breaks = [0] #ith entry = index of first word in doc i in words_list
keywords = set() #{'doc', '1', 'words', '2',...}

tokenizer = RegexpTokenizer('\w+') #pick out alphanumeric sequences; discard punctuation, white space

#create set of keywords and list of file texts
for id in fileids:
    raw = newcorpus.raw(id)
    raw2 = ''.join([i if ord(i)<128 else '' for i in raw]) #remove unicode characters
    raw3 = raw2.encode('ascii')
    file_words = map(str.lower,tokenizer.tokenize(raw3)) #list of cleaned words: lower-case, no punct, no whitespace
    words_list = words_list + file_words
    doc_breaks = doc_breaks + [len(file_words)+doc_breaks[len(doc_breaks)-1]]
Example #46
File: verify.py Project: statguy/wsd
import sys
import wsd
import random
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.text import *
import nltk

if len(sys.argv) != 4:
  print "Usage:", sys.argv[0], "word sense1 sense2"
  exit(-1)

focal_word = sys.argv[1]
senses = [sys.argv[2], sys.argv[3]]
#focal_word = "plant"
#senses = ["manufacturing","life"]
corpus = PlaintextCorpusReader('outcorpus/', '.*')
collocations = [ wsd.BigramLeft(senses, 0), wsd.BigramRight(senses, 1), wsd.BigramScope(senses, 2, [2, 10]) ]
decision_list = wsd.DecisionList()
decision_list.load("senses_bootstrap_" + focal_word + ".csv")
corpus_ids = corpus.fileids()
random.shuffle(corpus_ids)

num_words = 1
num_words_max = 100
tagged = 0
ambiguous = 0
unknown = 0

for infile in corpus_ids:
  if num_words > num_words_max: break
import datetime
import nltk
from nltk import word_tokenize
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus import floresta,mac_morpho
from parser_portuguese_risk import evaluateModel, splitTrainTestModel, simplify_tag
time1 =datetime.datetime.now()

###############################################################################
### ATTENTION: if there are tmp files like .DS_STORE in Mac OS X, they must be removed first ###

# Reading corpus
corpusdir = '/Users/marceloschiessl/RDF_text_project/corpus/WikiRisk/test/glossAnnotated/' # Directory of corpus.
#corpusdir = '/Users/marceloschiessl/RDF_text_project/corpus/WikiRisk/test/test1/' # Directory of corpus.   
risco = PlaintextCorpusReader(corpusdir, '.*')
risco.fileids()

raw_text = risco.raw('gloss533.txt')
#print raw_text[0:]

# Some statistics

print 'Number of terms: ', len(risco.words())
print 'Number of unique terms: ', len(set(risco.words()))

fd = nltk.FreqDist(risco.words())
print fd.freq('bem')
print fd['bem']

# presenting ngrams of the term
Example #48
#
# for line in p:
#     for sentence in line:
#         sentence.draw()




st=StanfordPOSTagger('english-bidirectional-distsim.tagger')
parser=StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

# setup corpus of texts
childStoryCorpusDir = '../resources/org_transcripts'
robotStoryCorpusDir = '../resources/robot_stories'

childStoryCorpus = PlaintextCorpusReader(childStoryCorpusDir, ".*\.txt")
robotStoryCorpus = PlaintextCorpusReader(robotStoryCorpusDir, ".*\.txt")


# average word length, average sentence length, and the number of times each vocabulary item appears in the text on average (our lexical diversity score)
# for fileid in childStoryCorpus.fileids():
#     num_chars = len(childStoryCorpus.raw(fileid))
#     num_words = len(childStoryCorpus.words(fileid))
#     num_sents = len(childStoryCorpus.sents(fileid))
#     num_vocab = len(set([w.lower() for w in childStoryCorpus.words(fileid)]))
#     print ((float(num_chars)/float(num_words)), float(num_words)/float(num_sents), float(num_words)/float(num_vocab), fileid)


for fileid in childStoryCorpus.fileids():

    print (fileid)
Example #49
File: reader.py Project: advaith2/Data
@author: Advaith GVK
'''

from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
import string
import csv
from fileinput import filename

corpusdir = 'C:/Users/Advaith GVK/workspace/Trial/src/Pack/New folder' # Directory of corpus.

newcorpus = PlaintextCorpusReader(corpusdir, '.*')

filenames = newcorpus.fileids()
# print newcorpus.sents()

def getWordNetType(tag):
        #print tag
        if tag in ['JJ', 'JJR', 'JJS']:
            return wn.ADJ
        elif tag in ['NN', 'NNS', 'NNP', 'NNPS','POS','FW']:
            return wn.NOUN
        elif tag in ['RB', 'RBR', 'RBS','WRB']:
            return wn.ADV
        elif tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
            return wn.VERB
        return wn.NOUN
Example #50
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.probability import FreqDist

#Create a corpus
corpusdir = "/home/erdinc/nltk/cs290f_proj/tos/"
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
corpusWords = nltk.Text(newcorpus.words())
posTags = nltk.pos_tag(corpusWords)


#Total number of words in corpus
def getTotalNumberOfWords(words):
	return len(words)

#Number of unique words in corpus
def getNumberOfUniqueWords(words):
	return len(set(words))

#Most frequently used 25 words
def getMostFreqWords(words):
	fdist = FreqDist(words)
	vocab = fdist.keys()
	return vocab[:25]


#Name List
def getNameList(tags):
	nameList = []
Example #51
File: to_matlab.py Project: Vivaq/scripts
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from decimal import Decimal
from math import pi

if __name__ == '__main__':
    ptcr = PlaintextCorpusReader('C:\Users\Jakub\Downloads\pr4\Trzeci plik', ['znormalizowane.txt', 'katy.txt'])
    data = []
    t = ptcr.raw(fileids=ptcr.fileids()[1]).replace(',', '.').replace('\r', '').split('\n')
    t.remove('')
    for x in t:
        data.append(float(Decimal(x)*360/315))
    print data
    data_ = []
    t = ptcr.raw(fileids=ptcr.fileids()[0]).replace(',', '.').replace('\r', '').split('\n')
    t.remove('')
    for x in t:
        data_.append(float(x)/100)
    print data_
Example #52
import os
import nltk
from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.classify import PositiveNaiveBayesClassifier
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

def features(sentence):
	words = sentence.lower().split()
	return dict(('contains(%s)' % w, True) for w in words)

corpusdir = './text'
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
positive_featuresets = list(map(features, newcorpus.raw('comp.txt')))
unlabeled_featuresets = list(map(features, newcorpus.raw('animal.txt')))
classifier = PositiveNaiveBayesClassifier.train(positive_featuresets, 
	unlabeled_featuresets, .3)
print classifier.classify(features('.'))
Example #53
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from wordcloud import STOPWORDS
_stop_words = set(STOPWORDS)

stop_words = set(stopwords.words('english'))
stop_words.update(_stop_words, ('thing', 'u', 'us', 'nt'))
lemmatizer = WordNetLemmatizer()

# Read .txt files from ./docs directory into a corpus
corpus = PlaintextCorpusReader('./docs/', ".*\.txt")

# filter list of words to remove uneeded ones and punctuation
# losing U.S. which is not ideal, tried splitting sentences on spaces and preserving dots just for it

from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
tokenized = tokenizer.tokenize(corpus.raw())

# drop punctuation
non_punct = list(
    filter(lambda token: nltk.tokenize.punkt.PunktToken(token).is_non_punct,
           tokenized))

# lowercase everything
lowercased = [word.lower() for word in non_punct]
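
The excerpt stops here; a hedged continuation showing how the stop_words set and lemmatizer defined above would typically be applied next (not part of the original script):

filtered = [word for word in lowercased if word not in stop_words]
lemmatized = [lemmatizer.lemmatize(word) for word in filtered]
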
Example #54
#!/usr/bin/python
import sys
import csv
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.text import *
import wsd

if len(sys.argv) != 4:
  print "Usage:", sys.argv[0], "word sense1 sense2"
  exit(-1)

corpus = PlaintextCorpusReader('outcorpus/', '.*')
focal_word = sys.argv[1]
senses = [sys.argv[2], sys.argv[3]]
#senses = ["manufacturing","life"]
collocations = [ wsd.BigramLeft(senses, 0), wsd.BigramRight(senses, 1), wsd.BigramScope(senses, 2, [2, 10]) ]

with open("senses_" + focal_word + ".csv") as senses_file:
  reader = csv.reader(senses_file)
  for row in reader:
    infile, offset, sense = row
    offset = int(offset)
    words = corpus.words(infile)
    text = Text(words)
    
    for collocation in collocations:
      collocation.add_collocation(text, offset, sense)


#print collocations[0].frequencies.items()[0][1].items()[0][1]
Example #55

# new file with weightings
new_file = open(new_file_name, "w+", encoding="utf-8")
more_stopwords = open("stopwords.txt", "r", encoding="utf-8")
stop_words = set(nltk.corpus.stopwords.words('english'))
for line in more_stopwords:
    stop_words.add(line[:-1])
    #words = line.split()
    #for word in words:
        #stop_words.add(word)
regex = re.compile(r'(?:^|)[a-zA-Z0-9\-]+')
not_regex = re.compile(r'\@[a-zA-Z0-9\-]+')
#print(stop_words)

texts = PlaintextCorpusReader(CORPUS_TEXT, '.*\.txt')

def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    #pdb.set_trace()
    #print(stop_words)
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    lambda_func = lambda w_p_c: w_p_c[2] != 'O'
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda_func) if key]
Example #57
CORPUS_ROOT = '/home/ksotala/Hiit/mallet-2.0.7/dataset/lemmatized/nostop2/'
CORPUS_EXTENSION =r'.*\.txt'

import nltk
import os
from os import listdir
from os.path import isfile, join
from nltk.collocations import *
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# read in corpus, find all the 3-grams above the min frequency
print "Reading in corpus from", CORPUS_ROOT
my_corpus = PlaintextCorpusReader(CORPUS_ROOT, CORPUS_EXTENSION)
print "Read in " + str(len(my_corpus.fileids())) + " files"
print "Finding 3-grams"
finder_3gram = TrigramCollocationFinder.from_words(my_corpus.words())
print "Filtering out 3-grams of frequency less than", MIN_FREQUENCY
finder_3gram.apply_freq_filter(MIN_FREQUENCY)

# combine all the 3-grams meeting the PMI threshold
print "Looking for 3-grams with a PMI of at least ", MIN_3GRAM_PMI
filelist = [ join(CORPUS_ROOT,f) for f in listdir(CORPUS_ROOT) if isfile(join(CORPUS_ROOT,f)) ]
gen = finder_3gram.above_score(trigram_measures.pmi, MIN_3GRAM_PMI)
processGrams(gen, filelist)

# now let's do the same for the 2-grams
# our previous step altered the corpus so let's read it in again
print "Reading in corpus from", CORPUS_ROOT
Example #58
import os
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer

## create the corpus of 1965 songs from html files
corpusdir = '../../data/billboard_data/1960/billboard_1965/'
bb_1965 = PlaintextCorpusReader(corpusdir, '.*')

## get the raw text from specific songs/files
help = bb_1965.raw('help.html')
desolation_row = bb_1965.raw('desolation_row.html')

## clean the raw text to remove the p tags
clean_help = nltk.clean_html(help)
clean_desolation = nltk.clean_html(desolation_row)

# word tokenize
tokens_help = nltk.word_tokenize(clean_help)
tokens_desolation = nltk.word_tokenize(clean_desolation)

# part of speech tagging
tags_help = nltk.pos_tag(tokens_help)
tags_desolation = nltk.pos_tag(tokens_desolation)

tokenizer = RegexpTokenizer(r'\w+')

## print the unique, sorted pos tags
for item in sorted(set(tags_help)):
	print 'help tags: ', item