Example #1
    def __init__(self):
        self.split_dash = True
        self.split_single_quote = False
        self.split_period = False
        self.split_comma = False

        # Unicode character classes to split on
        resplit = r"\p{Pd}\p{Po}\p{Pe}\p{S}\p{Pc}"

        # Optional exceptions: for these characters we trust nltk to split
        # correctly, so they are excluded from the split pattern
        dont_split = ""
        if not self.split_dash:
            dont_split += r"\-"
        if not self.split_single_quote:
            dont_split += "'"
        if not self.split_period:
            dont_split += r"\."
        if not self.split_comma:
            dont_split += ","

        resplit = "([" + resplit + "]|'')"
        if len(dont_split) > 0:
            split_regex = r"(?![" + dont_split + "])" + resplit
        else:
            split_regex = resplit

        self.split_regex = regex.compile(split_regex)
        try:
            self.sent_tokenizer = nltk.load('tokenizers/punkt/english.pickle')
        except LookupError:
            logging.info("Downloading NLTK punkt tokenizer")
            nltk.download('punkt')
            self.sent_tokenizer = nltk.load('tokenizers/punkt/english.pickle')
        self.word_tokenizer = nltk.TreebankWordTokenizer()
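The snippet above only constructs the tokenizers. A minimal usage sketch of how the two are typically combined (sentence split first, then word tokenization), assuming the punkt model is already installed:

import nltk

sent_tokenizer = nltk.load('tokenizers/punkt/english.pickle')
word_tokenizer = nltk.TreebankWordTokenizer()

text = "Dr. Smith arrived at 5 p.m. He was late."
for sent in sent_tokenizer.tokenize(text):
    print(word_tokenizer.tokenize(sent))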
Example #2
    def __init__(self, lang):
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            for i in range(3):
                signal.signal(signal.SIGALRM, timeout)
                signal.alarm(120)
                try:
                    result = nltk.download('punkt', quiet=True)
                    signal.alarm(0)
                    break
                except myTimeout:
                    pass
            else:
                raise Exception(
                    "Unable to download 'punkt' NLTK data after 3 retries: try to download it manually or check your internet connection."
                )

        langname = self.getLanguageName(lang.lower())

        try:
            self.segmenter = load(
                'tokenizers/punkt/{0}.pickle'.format(langname))
        except LookupError:
            self.segmenter = load('tokenizers/punkt/english.pickle')
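For comparison, a compact sketch of the same "download if missing, fall back to English" pattern without the signal-based retry loop (`lang_name` is a hypothetical pre-computed language name):

import nltk

def load_segmenter(lang_name):
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt', quiet=True)
    try:
        return nltk.load('tokenizers/punkt/{0}.pickle'.format(lang_name))
    except LookupError:
        # fall back to the English model if no language-specific one exists
        return nltk.load('tokenizers/punkt/english.pickle')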
Example #3
def Q1():
    text = []
    # load() returns the raw file contents; split into a set of stopword tokens
    stopwords = set(nltk.load('stopwords.txt').split())
    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#',
        '$', '%', '"', '\'s', '``', "''"
    ]
    i = 1
    while i <= 10:
        path = '/txt/' + str(i) + '.txt'
        temp = nltk.load(path, encoding='gbk')
        temp = nltk.word_tokenize(temp)
        temp = [words.lower() for words in temp]
        temp = [words for words in temp if words not in english_punctuations]
        temp = [words for words in temp if words not in stopwords]
        text.append(temp)
        i += 1

    #print(text)
    wordlist = []
    for sens in text:
        for words in sens:
            if words not in wordlist:
                wordlist.append(words)
            else:
                continue

    print(len(wordlist))
    wb = xlwt.Workbook()
    ws = wb.add_sheet('TF')
    i = 0
    while i != len(wordlist):
        ws.write(i + 1, 0, label=wordlist[i])
        i += 1

    i = 0
    while i != 10:
        ws.write(0, i + 1, label='text' + str(i + 1))
        i += 1

    i = 0
    while i != 10:
        j = 0
        for word in wordlist:
            ctr = 0
            for item in text[i]:
                if word == item:
                    ctr += 1
                else:
                    continue
            ws.write(j + 1, i + 1, label=ctr)
            j += 1
        i += 1
    wb.save('data.xls')
Example #4
    def tokenize_corpus(self, corpus):
        """Read the corpus a list sentences, each of which is a list of
        tokens and the spans in which they occur in the text."""
        if os.path.isdir(corpus):
            corpus_dir = corpus
            corpus = [
                os.path.join(corpus_dir, fn) for fn in os.listdir(corpus_dir)
            ]
        else:
            corpus = [corpus]

        tokenizer = nltk.load('tokenizers/punkt/{0}.pickle'.format('english'))

        for filename in corpus:
            with open(filename) as fin:
                print(filename)
                data = fin.read()

            segment_start = 0

            for span in ps.split_quoted_quotes(data):
                for sent_tokens in split_sentences(span, tokenizer,
                                                   segment_start):
                    yield sent_tokens
                segment_start += len(span)
Example #5
    def tokenize_corpus(self, corpus):
        """Read the corpus a list sentences, each of which is a list of
        tokens and the spans in which they occur in the text."""
        if os.path.isdir(corpus):
            corpus_dir = corpus
            corpus = [
                os.path.join(corpus_dir, fn) for fn in os.listdir(corpus_dir)
            ]
        else:
            corpus = [corpus]

        tokenizer = nltk.load('tokenizers/punkt/{0}.pickle'.format('english'))

        for filename in corpus:
            with open(filename) as fin:
                data = fin.read()

            for start, end in tokenizer.span_tokenize(data):
                sent = data[start:end]
                sent_tokens = []
                matches = re.finditer(
                    r'\w+|[\'\"\/^/\,\-\:\.\;\?\!\(0-9]', sent
                )
                for match in matches:
                    mstart, mend = match.span()
                    sent_tokens.append(
                        (match.group(0).lower().replace('_', ''),
                         (mstart+start, mend+start))
                    )
                yield sent_tokens
Example #6
def Q1():

    text = nltk.load('text_0.txt', encoding='gbk')  # code for Q1a

    # token_sentlist = nltk.sent_tokenize(text)
    #
    # token_list = []
    #
    # for sent in token_sentlist:
    #     token_list.append(nltk.word_tokenize(sent))

    token_list = nltk.word_tokenize(text)  # code for Q1a

    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#',
        '$', '%', '"', '\'s', '``', "''"
    ]

    token_list = [
        word for word in token_list if word not in english_punctuations
    ]

    token_list1 = nltk.pos_tag(token_list)

    print(len(token_list1))  # code for Q1a

    print(token_list1)  # code for Q1a

    token_list2 = [w.lower() for w in token_list]

    token_list2 = nltk.pos_tag(token_list2)

    print(token_list2)
Example #7
def run(fpath):
    posLex = loadLexicon('positive-words.txt')
    negLex = loadLexicon('negative-words.txt')

    # make a new tagger
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)

    # read the input
    f = open(fpath)
    text = f.read().strip()
    f.close()

    # split sentences
    sentences = sent_tokenize(text)

    # noun frequency: number of sentences that include each noun
    freq = {}

    # all 4-grams with the structure: not <any word> <pos/neg word> <noun>
    matched4gramsPerSent = []

    # for each sentence
    for sentence in sentences:
        matched4grams = processSentence(sentence, freq, posLex, negLex, tagger)
        matched4gramsPerSent.append(matched4grams)

    freqNouns = getTop3(freq)  # atts=None#getAtts() #['bike','size']
    final4grams = set()  # final result

    for fgrams in matched4gramsPerSent:  # for each sentence
        for fg in fgrams:  # for each matched 4gram in this sentence
            if fg[3] in freqNouns: final4grams.add(' '.join(fg))

    return final4grams
Example #8
 def __init__(self):
     self._word_tokenizer = nltk.TreebankWordTokenizer()
     if FLAGS.punkt_tokenizer_file is not None:
         self._sent_tokenizer = py_utils.load_pickle(
             FLAGS.punkt_tokenizer_file)
     else:
         self._sent_tokenizer = nltk.load("tokenizers/punkt/english.pickle")
Example #9
def run(fpath):
    freq = {
        'idea': 30,
        'thing': 70,
        'vitaan': 9,
        'good': 7,
        'might': 88,
        'end': 99
    }
    posLex = loadLexicon('positive-words.txt')
    negLex = loadLexicon('negative-words.txt')

    #make a new tagger
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)

    #read the input
    f = open(fpath)
    text = f.read().strip()
    f.close()

    #split sentences
    sentences = sent_tokenize(text)

    # for each sentence
    for sentence in sentences:
        print(processSentence(sentence, posLex, negLex, tagger))

    freqNouns = getTop3(freq)

    return freqNouns
Example #10
def Q2b():
    # nltk.download('wordnet')
    text = nltk.load('text.txt', encoding='gbk')  # code for Q2a
    token_list = nltk.sent_tokenize(text)
    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#',
        '$', '%', '"', '\'s', '``', "''", "-"
    ]
    token_list = [nltk.word_tokenize(sen) for sen in token_list]
    new_token = []
    for sens in token_list:
        sens = [word for word in sens if word not in english_punctuations]
        new_token.append(sens)

    new_token = [nltk.pos_tag(sen) for sen in new_token]
    print(new_token)
    lemmatized = []
    lemmatizer = WordNetLemmatizer()  # create once instead of per token

    for sen in new_token:
        for word in sen:
            if "V" in word[1]:
                w = lemmatizer.lemmatize(word[0].lower(), 'v')
            else:
                w = lemmatizer.lemmatize(word[0], 'n')
            lemmatized.append(w.lower())
    # test = [WordNetLemmatizer().lemmatize(new_token)]
    # print(new_token[1])
    print(lemmatized)
Example #11
def main():
    print "Loading word2vec"
    global word2vec
    word2vec = Word2Vec.load_word2vec_format(sys.argv[2], binary=True)
    tagger = load("taggers/maxent_treebank_pos_tagger/english.pickle")
    f_sentences = codecs.open(sys.argv[1], encoding="utf-8")
    invalid = list()
    valid = list()
    on = False
    for line in f_sentences:
        if line.startswith("#"):
            continue
        if line.startswith("VALID"):
            on = True
            continue
        sentence = Sentence(line.strip(), "ORG", "LOC", 6, 1, 2, tagger)
        for rel in sentence.relationships:
            t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before, rel.between, rel.after)
            if on is True:
                valid.append(t)
            elif on is False:
                invalid.append(t)
    f_sentences.close()

    for v in valid:
        for i in invalid:
            score = similarity_3_contexts(v, i)
            print "VALID", v.e1, v.e2, "\t", v.bet_words
            print "INVALID", i.e1, i.e2, "\t", i.bet_words
            print score
Example #12
def run(fpath):

    #make a new tagger
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)

    #read the input
    f = open(fpath)
    text = f.read().strip()
    f.close()

    #split sentences
    sentences = sent_tokenize(text)
    print('NUMBER OF SENTENCES: ', len(sentences))

    adjAfterAdv = []

    # for each sentence
    for sentence in sentences:

        #tokenize the sentence
        terms = nltk.word_tokenize(sentence)

        POStags = ['JJ', 'RB']  # POS tags of interest
        POSterms = getPOSterms(terms, POStags, tagger)

        adjectives = POSterms['JJ']
        adverbs = POSterms['RB']

        #get the results for this sentence
        adjAfterAdv += getAdvAdjTwograms(terms, adjectives, adverbs)

    return adjAfterAdv
Example #13
def main(grammar_filename, sentence_filename, output_filename):
    # Load CNF grammar
    grammar = load(grammar_filename)

    # Generate parser based on grammar
    parser = CKYParser(grammar=grammar)

    # Iterate over sentences in sentence_filename, produce parses and write to file with output_filename
    with open(sentence_filename, 'r') as infile:
        number_parses = []
        with open(output_filename, 'w') as outfile:
            for line in infile.readlines():
                # Strip any trailing whitespace from line (including newlines)
                line = line.rstrip()
                print(line)
                outfile.write(line + '\n')
                valid_parses = parser.parse_sentence(sentence=line)
                for tree in valid_parses:
                    print(tree)
                    outfile.write(str(tree) + '\n')
                print('Number of parses: %d' % len(valid_parses))
                print()
                number_parses.append(len(valid_parses))
                outfile.write('Number of parses: %d\n\n' % len(valid_parses))
            avg_number_parses = np.mean(number_parses)
            print('Average number of parses: %.3f' % avg_number_parses)
Example #14
def run(fpath):

    #load the positive and negative lexicons
    posLex = loadLexicon('positive-words.txt')
    negLex = loadLexicon('negative-words.txt')

    #make a new tagger
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)

    #read the input
    f = open(fpath)
    text = f.read().strip()
    f.close()

    #split sentences
    sentences = sent_tokenize(text)

    structList = []
    for sentence in sentences:  #for each sentence

        #get the results for this sentence
        structList += processSentence(sentence, posLex, negLex, tagger)

    return structList
Example #15
    def reset_model(self, model_path):
        """
        Reset the base model

        :param model_path:  Model path for sentence tokenization
        """
        self.__tokenizer = nltk.load(model_path)
Example #16
def ngrammer(text):
    # tag for every word based on sentence.
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)  # loading a pre-trained model

    # split sentences
    # convert the text into a list of sentences; a naive '.' split is unreliable (e.g. "6.78", abbreviations, or a question mark)
    sentences = sent_tokenize(text)
    print('NUMBER OF SENTENCES: ', len(sentences))

    # holds the adjective-noun pairs found in the text (list of 2-grams)
    nounAfterAdj = []

    # for each sentence
    for sentence in sentences:

        # tokenize the sentence into a list of words; splitting on spaces alone is not reliable
        terms = nltk.word_tokenize(sentence)

        # do POS tagging on the tokenized sentence
        tagged_terms = tagger.tag(terms)

        for i in range(len(tagged_terms) - 1):  # for every tagged term
            term1 = tagged_terms[i]  # current term
            term2 = tagged_terms[i + 1]  # following term

            # re.match anchors the pattern at the start of the tag string
            # current term is an adjective, next one is a noun
            if re.match('JJ', term1[1]) and re.match('NN', term2[1]):
                # add the adjective-noun pair to the list
                nounAfterAdj.append((term1[0].lower(), term2[0].lower()))

    return nounAfterAdj
Example #17
    def transformText(self, textData, pcaModelPath):
        self.loadNltk()
        num_texts = textData.size
        columns = []
        tagdict = nltk.load('help/tagsets/upenn_tagset.pickle')
        for key in tagdict.keys():
            columns.append(str(key))

        df = pd.DataFrame(columns=columns)

        for i in range(0, num_texts):
            if (i % 500 == 0):
                print("ShallowSyntax: Processed ", i, "/", num_texts)

            new_row = pd.DataFrame(index=[i], columns=columns)
            for key in tagdict.keys():
                new_row.at[i, key] = 0  # pandas .set_value() was removed; use .at

            # strip quoted spans before tokenizing; str.replace() treats the
            # pattern literally, so re.sub() is needed (assumes `re` is imported)
            text = nltk.tokenize.word_tokenize(
                re.sub(r'"(.*?)"', '', str(textData[i])))
            tagged_text = nltk.pos_tag(text)
            tag_fd = nltk.FreqDist(tag for (word, tag) in tagged_text)
            for (key, value) in tag_fd.items():
                if key in tagdict:
                    new_row.at[i, key] = 100 * (value / tag_fd.N()) ** 2
            df = pd.concat([df, new_row])

        self.pca = pickle.load(open(pcaModelPath, "rb"))

        transformed_features = self.pca.transform(df.values)

        self.shallow_syntax_features = pd.DataFrame(transformed_features)
Example #18
def run(fpath):

    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)

    #read the input
    f = open(fpath)
    text = f.read().strip()
    f.close()

    #split sentences
    sentences = sent_tokenize(text)
    output = []
    posLex = loadLexicon('positive-words.txt')
    negLex = loadLexicon('negative-words.txt')

    # for each sentence
    for sentence in sentences:

        # replace chars that are not letters or numbers with a space
        sentence = re.sub(r'[^a-zA-Z\d]', ' ', sentence)
        # collapse duplicate spaces
        sentence = re.sub(' +', ' ', sentence).strip()
        terms = nltk.word_tokenize(sentence.lower())
        c = 'e'
        output += processSentence(terms, posLex, negLex, tagger, c)

    return output
Example #19
    def initializer(self):
        # Use Encoder class as a container for global data
        Encoder.tokenizer = get_nmt_tokenizer(
            library=self.args.tokenizer_library,
            model_name=self.args.tokenizer_type,
            tokenizer_model=self.args.tokenizer_model,
            vocab_file=self.args.vocab_file,
            merges_file=self.args.merge_file,
            delimiter=self.args.delimiter,
        )
        if self.args.split_sentences:
            if not nltk_available:
                print("NLTK is not available to split sentences.")
                exit()
            splitter = nltk.load("tokenizers/punkt/english.pickle")
            if self.args.keep_newlines:
                # this prevents punkt from eating newlines after sentences
                Encoder.splitter = nltk.tokenize.punkt.PunktSentenceTokenizer(
                    train_text=splitter._params,
                    lang_vars=CustomLanguageVars())
            else:
                Encoder.splitter = splitter

        else:
            Encoder.splitter = IdentitySplitter()
Example #20
def run(fpath):

    #make a new tagger
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)
    posLex = loadLexicon('positive-words.txt')
    negLex = loadLexicon('negative-words.txt')

    #read the input
    f = open(fpath)
    text = f.read().strip()
    f.close()

    #split sentences
    sentences = sent_tokenize(text)
    #print ('NUMBER OF SENTENCES: ',len(sentences))

    reqdString = []

    # for each sentence
    for sentence in sentences:

        # replace chars that are not letters or numbers with a space
        sentence = re.sub(r'[^a-zA-Z\d]', ' ', sentence)
        # collapse duplicate spaces
        sentence = re.sub(' +', ' ', sentence).strip()

        reqdString += processSentence(sentence, posLex, negLex, tagger)

    return reqdString
Example #21
def run(fpath):

    posLex = loadLexicon("positive-words.txt")
    negLex = loadLexicon("negative-words.txt")

    #make a new tagger
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)

    #read the input
    f = open(fpath)
    text = f.read().strip()
    f.close()

    #split sentences
    sentences = sent_tokenize(text)

    fgram = []

    # for each sentence
    for sentence in sentences:

        fgram += processSentence(sentence, posLex, negLex, tagger)
        #adjAfterAdv+=getAdvAdjTwograms(terms, adjectives, adverbs)

    return fgram
Example #22
def run(fpath):

    # make a new tagger
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)

    # read the input
    f = open(fpath)
    text = f.read().strip()
    f.close()

    # split sentences
    sentences = sent_tokenize(text)
    print('NUMBER OF SENTENCES: ', len(sentences))

    adjAfterAdv = []  # holds the adverb-adjective pairs found in the text

    # for each sentence
    for sentence in sentences:

        terms = nltk.word_tokenize(sentence)  # tokenize the sentence
        # do POS tagging on the tokenized sentence
        tagged_terms = tagger.tag(terms)

        for i in range(len(tagged_terms) - 1):  # for every tagged term
            term1 = tagged_terms[i]  # current term
            term2 = tagged_terms[i + 1]  # following term

            # current term is an adverb, next one is an adjective
            if re.match('RB', term1[1]) and re.match('JJ', term2[1]):
                # add the adverb-adj pair to the list
                adjAfterAdv.append((term1[0], term2[0]))

    return adjAfterAdv
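On recent NLTK releases the `maxent_treebank_pos_tagger` pickle is no longer shipped, so a sketch of the same adverb-adjective extraction using the current default tagger (`nltk.pos_tag`, which needs the `punkt` and `averaged_perceptron_tagger` resources) might look like this:

import nltk

def adv_adj_pairs(text):
    pairs = []
    for sentence in nltk.sent_tokenize(text):
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        # walk adjacent (word, tag) pairs; RB* followed by JJ* is an adverb-adjective bigram
        for (w1, t1), (w2, t2) in zip(tagged, tagged[1:]):
            if t1.startswith('RB') and t2.startswith('JJ'):
                pairs.append((w1, w2))
    return pairs

print(adv_adj_pairs("The food was really good but the service was terribly slow."))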
Example #23
 def setup(self):
     self.tokenizer = tft.SentencepieceTokenizer(
         model=tf.io.gfile.GFile(self.vocab_model_file, "rb").read())
     self.sentence_tokenizer = nltk.load(SENTENCE_TOKENIZER_PATH)
     self.delimiter_range_pair = rendering_utils.get_default_delimiter_range_pair(
         task=self.task,
         delimiter_type=self.delimiter_type,
     )
Example #24
def run(path):
    # initialize list
    adjWithNoun = []

    # make a tagger
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)

    # load lexicon of stop words
    stopLex = set(stopwords.words('english'))

    # get raw review text from file
    with open(path, newline='') as f:
        reader = csv.reader(f)
        for row in reader:
            review = row[2]
            print(review)

            try:
                # split sentences
                sentences = sent_tokenize(review)
                print('NUMBER OF SENTENCES: ', len(sentences))
            except Exception:
                print("Oops!  That was not tokenizable. Try again...")
                continue

            # for each sentence
            for sentence in sentences:
                print(sentence)

                # replace chars that are not letters or numbers with a space
                sentence = re.sub(r'[^a-zA-Z\d]', ' ', sentence)

                # remove duplicate spaces
                sentence = re.sub(' +', ' ', sentence).strip()

                # tokenize the lowercase sentence
                terms = nltk.word_tokenize(sentence.lower())
                print(terms)

                # POS tags of interest
                POStags = ['JJ', 'NN']
                POSterms = getPOSterms(terms, POStags, tagger)

                # get the set of adjectives and nouns
                adjectives = POSterms['JJ']
                nouns = POSterms['NN']

                # get the adjective-noun 2-grams for this sentence
                n = 2
                adjWithNoun += getNounAdjNgrams(terms, nouns, adjectives, n)

    return adjWithNoun
Example #25
def initialize_hardcoded():
    global pos_tags
    global ner_tags

    pos_tags = nltk.load('help/tagsets/upenn_tagset.pickle')
    ner_tags = [
        'GSP', 'LOCATION', 'GPE', 'ORGANIZATION', 'PERSON', 'O', 'PERSON',
        'FACILITY'
    ]
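The `upenn_tagset.pickle` resource (part of the `tagsets` data package) loads as a dictionary keyed by POS tag; each value appears to be a (description, example words) pair, so a quick inspection sketch is:

import nltk

tagdict = nltk.load('help/tagsets/upenn_tagset.pickle')
print(sorted(tagdict.keys())[:5])   # first few tag names
print(tagdict['NN'])                # description and example words for 'NN'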
Example #26
 def __call__(self, text):
     if self._sent_tokenizer is None:
         self._tokenizer = nltk.load(
             'tokenizers/punkt/{0}.pickle'.format('english'))
         self._sent_tokenizer = self._tokenizer.tokenize
     sentences = self._sent_tokenizer(text)
     tokens = []
     for sent in sentences:
         tokens.extend(nltk.word_tokenize(sent, preserve_line=True))
     return tokens
Example #27
File: utility.py Project: LucaPrg/TLN
def get_lhs_terminal(grammar=load(grammar_url)):
    """
    Return a production list of lhs(left hand side) that are terminal
    :param grammar:
    :return:
    """
    lhs_list = []
    for p in grammar.productions():
        if p.lhs() not in lhs_list and is_terminal(p.rhs()[0]):
            lhs_list.append(p.lhs())
    return lhs_list
Example #28
def ent():
    random = []
    spam = []
    # load() returns the raw file contents; split into a set of stopword tokens
    stopwords = set(nltk.load('stopwords.txt').split())
    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#',
        '$', '%', '"', '\'s', '``', "''"
    ]
    i = 1
    while i <= 10:
        path = '/txt/random/tweet' + str(i) + '.txt'
        temp = nltk.load(path, encoding='gbk')
        temp = nltk.word_tokenize(temp)
        temp = [words.lower() for words in temp]
        temp = [words for words in temp if words not in english_punctuations]
        temp = [words for words in temp if words not in stopwords]
        random.append(temp)
        i += 1

    i = 1
    while i <= 10:
        path = '/txt/spam/tweet' + str(i) + '.txt'
        temp = nltk.load(path, encoding='gbk')
        temp = nltk.word_tokenize(temp)
        temp = [words.lower() for words in temp]
        temp = [words for words in temp if words not in english_punctuations]
        temp = [words for words in temp if words not in stopwords]
        spam.append(temp)
        i += 1

    # random = np.array(random)
    # spam = np.array(spam)

    # print(random)
    # print(spam)
    ent_random = 0.0
    random = [' '.join(tweet) for tweet in random]
    spam = [' '.join(tweet) for tweet in spam]

    print(calcShannonEnt(random))
    print(calcShannonEnt(spam))
Example #29
def run(fpath):
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)
    f = open(fpath)
    text = f.read().strip()
    f.close()
    sentences = sent_tokenize(text)
    print('NUMBER OF SENTENCES: ', len(sentences))
    posLex = loadLexicon('positive-words.txt')
    negLex = loadLexicon('negative-words.txt')
    fourword = processSentence(sentences, posLex, negLex, tagger)
    return fourword
Example #30
def PMI():
    text = []
    # load() returns the raw file contents; split into a set of stopword tokens
    stopwords = set(nltk.load('stopwords.txt').split())
    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#',
        '$', '%', '"', '\'s', '``', "''"
    ]
    i = 1
    while i <= 10:
        path = '/txt/' + str(i) + '.txt'
        temp = nltk.load(path, encoding='gbk')
        temp = nltk.word_tokenize(temp)
        temp = [words.lower() for words in temp]
        temp = [words for words in temp if words not in english_punctuations]
        temp = [words for words in temp if words not in stopwords]
        text.append(temp)
        i += 1

    pairs = []
    words = nltk.load('word1.txt')
    words = nltk.word_tokenize(words)
    wb = xlwt.Workbook()
    ws = wb.add_sheet('TF')
    # print(words)
    i = 0
    for word in words:
        for sens in text:
            if word in sens:
                for item in sens:
                    if item in words and item != word:
                        temp1 = item + ' ' + word
                        temp2 = word + ' ' + item
                        if temp1 not in pairs and temp2 not in pairs:
                            pairs.append(temp2)
                            ws.write(i + 1, 0, word)
                            ws.write(i + 1, 1, item)
                            i += 1

    print(pairs)
    wb.save('PMI.xls')
Example #31
def lemmatize(article):    
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)
    wnpos = lambda e: ('a' if e[0].lower() == 'j' else e[0].lower()) if e[0].lower() in ['n', 'r', 'v'] else 'n'
    lemmatizer = WordNetLemmatizer()

    words=article.split(' ')
    tagged=tagger.tag(words)
    words2 = [lemmatizer.lemmatize(t[0],wnpos(t[1])) for t in tagged]
    ret=""
    for i in words2:
        ret += i + ' '
    return ret
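A small usage sketch, assuming the names used above come from nltk (`from nltk.data import load; from nltk.stem import WordNetLemmatizer`) and that the `maxent_treebank_pos_tagger` pickle is available locally (newer NLTK installs would use `nltk.pos_tag` instead):

print(lemmatize("the cats were running quickly"))
# expected output is roughly: "the cat be run quickly "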
Example #32
    def __preloaded_nltk_tokenizer(self):
        # Code pulled out of nltk == 3.2.5
        tokenizer = nltk.load('tokenizers/punkt/{0}.pickle'.format('english'))
        sent_tokenizer = tokenizer.tokenize

        def word_tokenize(text):
            sentences = sent_tokenizer(text)
            tokens = []
            for sent in sentences:
                tokens.extend(nltk.word_tokenize(sent, preserve_line=True))
            return tokens

        return word_tokenize
Example #33
def process_sentece(review):
        review=re.sub(r'[^a-zA-Z\d]',' ',review)  #replace chars that are not letters or numbers with a space
        review=re.sub(' +',' ',review).strip()  #remove duplicate spaces
        #tokenize the sentence
        _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
        tagger = load(_POS_TAGGER)
        terms = nltk.word_tokenize(review.lower()) #removecase
        POStags=["JJ","JJR","JJS","RB","RP","NN","NNS","NNP","RB","RBR","RBS","VB","VBD","VBG","VBN","VBP","VBZ","CC","DT","TO","UH","WDT"] # POS tags of interest 		
        FinalPostTerms=get_attributes(terms,POStags,tagger)
               

        #Remove unnecessary words (each clean() call chains on the previous result)
        Particles=FinalPostTerms["RP"]
        cleantemrs=clean(terms,Particles)
        ProperNames=FinalPostTerms["NNP"]
        cleantemrs=clean(cleantemrs,ProperNames)
        Conjuctions=FinalPostTerms["CC"]
        cleantemrs=clean(cleantemrs,Conjuctions)
        Determiners=FinalPostTerms["DT"]
        cleantemrs=clean(cleantemrs,Determiners)
        TOERS=FinalPostTerms["TO"]
        cleantemrs=clean(cleantemrs,TOERS)
        Interjections=FinalPostTerms["UH"]
        cleantemrs=clean(cleantemrs,Interjections)
        WhichDeterminer=FinalPostTerms["WDT"]
        cleantemrs=clean(cleantemrs,WhichDeterminer)
        cleanterms=clean_uselesswords(cleantemrs)
        
        
        #Get all Nouns
        Nouns=FinalPostTerms["NN"]
    
        #Get all Adjectives
        AdjectivesSimp=FinalPostTerms["JJ"]
   
        #Get all Verbs 
        VerbsBasic=FinalPostTerms["VB"]
        
        #Get all Adverbs
        AdverbsSimp=FinalPostTerms["RB"]

        notanyword=[]
        #get the results for this sentence 
        notanyword+=getAdjNoun(cleanterms,Nouns,AdjectivesSimp)
        notanyword+=getAdjNounAnyNoun(cleanterms,Nouns,AdjectivesSimp)
        notanyword+=getNounNoun(cleanterms,Nouns)
        notanyword+=getNounVerbAdj(cleanterms,Nouns,AdjectivesSimp,VerbsBasic)
        notanyword+=getNounVerbAdvAdj(cleanterms,Nouns,AdjectivesSimp,VerbsBasic,AdverbsSimp)
        notanyword+=getNounAnyNounVerbAdvAdj(cleanterms,Nouns,AdjectivesSimp,VerbsBasic,AdverbsSimp)
        return notanyword
Example #34
def load_stopwords(bitextfn):
    """Determine source language from the input filename."""
    langs = bitextfn.split(".")[1]
    sl = langs.split("-")[0]
    assert sl in ["en", "es"], "wrong sl {0}".format(sl)
    sl = "english" if sl == "en" else "spanish"

    wordtext = nltk.load("corpora/stopwords/{0}".format(sl), format="text")
    wordlist = wordtext.split()

    out = set(wordlist)
    ## XXX: remove some common verbs from the set.
    out.difference_update({"estar"})
    out.difference_update({"have"})
    out.difference_update({"be"})
    out.difference_update({"do"})
    return out
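A usage sketch with a hypothetical bitext filename that follows the `name.<sl>-<tl>.ext` convention the function assumes (the `stopwords` corpus must be downloaded):

stops = load_stopwords("europarl.en-es.txt")
print(len(stops), "be" in stops)   # "be" was removed from the set above, so this prints False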
Example #35
def word_token_gen(text):
    """
    Parse the text into a series of WorkTokens.
    I don't use the default nltk work tokenizer here because it doesn't include offsets.
    Instead I am using a RegexpTokenizer which does not do a good of job
    on things like contractions.
    The sentence tokenizer is probably not necessairy at the moment,
    but if the work tokenizer is replaced with something more complex it might
    be needed.
    """
    sent_tokenizer = nltk.load('tokenizers/punkt/english.pickle')
    for sent_offsets in sent_tokenizer.span_tokenize(text):
        for word_offsets in word_tokenizer.span_tokenize(text[sent_offsets[0]:sent_offsets[1]]):
            yield WordToken(
                text,
                sent_offsets[0] + word_offsets[0],
                sent_offsets[0] + word_offsets[1]
            )
Example #36
def split_sentences(text, tokenizer=None, offset=0):
    """\
    Splits text into lists of lists. Each list contains a sentence, which is a
    list of normalized tokens, including the token's indexes in the original
    text.

    """

    if tokenizer is None:
        tokenizer = nltk.load('tokenizers/punkt/{0}.pickle'.format('english'))
    for start, end in tokenizer.span_tokenize(text):
        sent = text[start:end]
        sent_tokens = []
        matches = re.finditer(
            r'\w+|[\'\"\/^/\,\-\:\.\;\?\!\(0-9]', sent
        )
        for match in matches:
            mstart, mend = match.span()
            seg_start = start + offset
            sent_tokens.append(
                (match.group(0).lower().replace('_', ''),
                 (mstart+seg_start, mend+seg_start))
            )
        yield sent_tokens
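A usage sketch for the generator above (it assumes `nltk` and `re` are imported at module level and the punkt model is installed):

text = "The cat sat on the mat. It purred."
for sent_tokens in split_sentences(text):
    print(sent_tokens)
# each item is a list of (token, (start, end)) pairs indexing into `text`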
Example #37
import os
import csv
import time
import matplotlib
import numpy as np
import nltk
import enchant
import plotly.plotly as ply
import plotly.graph_objs as pgo

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tag.perceptron import PerceptronTagger
from nltk.util import ngrams
from enchant.checker import SpellChecker

TAGS = sorted(nltk.load('help/tagsets/upenn_tagset.pickle').keys())
SEP = '|'
MAXNG = 3
tagger = PerceptronTagger()

F_WD = os.path.dirname(__file__)

F_INP = os.path.join('.', 'inputs', 'chs')
F_OUT = os.path.join('.', 'outputs', 'chs')

F_SRC = '/chs/'

class Doc:
    raws = {}
    tokenlists = {}
    datasets = {}
Example #38
File: data.py Project: pblouw/semvec
 def __init__(self, min_count=0, min_len=4):
     self.stopwords = nltk.corpus.stopwords.words("english")
     self.tokenizer = nltk.load("tokenizers/punkt/english.pickle")
     self.min_count = min_count
     self.min_len = min_len
Example #39
def chunker():
    """Return an instance of ne_chunker by loading a stored model"""

    return nltk.load(MULTICLASS_NE_CHUNKER)
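A hedged usage sketch, assuming `MULTICLASS_NE_CHUNKER` points at NLTK's bundled multiclass NE chunker pickle and the usual resources (`punkt`, `averaged_perceptron_tagger`, `maxent_ne_chunker`, `words`) are installed; the loaded object exposes a `parse` method over POS-tagged tokens, which is what `nltk.ne_chunk` calls internally:

import nltk

tagged = nltk.pos_tag(nltk.word_tokenize("Barack Obama visited Paris."))
tree = chunker().parse(tagged)
print(tree)  # an nltk.Tree with PERSON / GPE style chunks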
Example #40
import codecs
import collections
import fileinput
import unicodedata
import pickle

import nltk
import regex
from nltk.stem.snowball import SpanishStemmer

def stopwords_from_file(stopwords_filepath = "data/stopwords/spa.txt"):
    stopwords = codecs.open(stopwords_filepath, "r", "utf-8")
    ret = set()
    for line in stopwords:
        word = line.rstrip("\n")
        word = regex.sub(" *\|.*$", "", word)
        if regex.search("[^\s]", word):
            word = unicodedata.normalize("NFD", word)
            ret.add(word)
    return ret

tokenizer = nltk.load("tokenizers/punkt/spanish.pickle")
stopwords = stopwords_from_file("../../src/qlc/data/stopwords/spa.txt")
stemmer = SpanishStemmer()

doc = ""
doc_id = 0
sentence_id = 0

sentences_for_stem = collections.defaultdict(set)
docs_for_stem = collections.defaultdict(set)

for l in fileinput.input("/Users/ramon/qlc-github/data/eswiki/AA/wiki00"):
    l = l.strip()
    l = l.decode("utf-8")
    l = unicodedata.normalize("NFD", l)
    
Example #41
"""
NOTE: This works, but it's a mess, so I'm planning on
cleaning it up and refactoring it shortly.
"""
import nltk
import string
from collections import namedtuple
from subprocess import call
import sys

tokenizer = nltk.load('tokenizers/punkt/english.pickle')

Story = namedtuple('Story', ['text', 'queries'])
Query = namedtuple('Query', ['text', 'choices', 'answer'])


def load_answers(filename):
    with open(filename, 'r') as f:
        answers = f.read().split('\n')[:-1]
        answers = [a.split('\t') for a in answers]
        answers = [[c.strip() for c in a] for a in answers]
        answers = {i:j for i, j in enumerate(answers)}
    return answers


def load_stories(filename, answerfile):
    stories = []

    with open(filename, 'r') as f:
        data = f.read()
Example #42
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys, time
    from nltk import tokenize
    from nltk.parse import ViterbiParser
    from nltk.grammar import toy_pcfg1, toy_pcfg2
    from nltk.draw.tree import draw_trees
    from nltk import Tree
    from nltk.draw.util import CanvasFrame
    from nltk.draw import TreeWidget

    # Define two demos.  Each demo has a sentence and a grammar.
    # demos = [('move the green sphere to the bottom left corner', learned_pcfg),
    #          ('move the green ball over the red block', learned_pcfg),
    #          ('take the green pyramid and put it in the top left corner', learned_pcfg),
    #           ('put the green pyramid on the red block', learned_pcfg),
    #           ('move the red cylinder and place it on top of the blue cylinder that is on top of a green cylinder', learned_pcfg),]

    # Ask the user which demo they want to use.
    # print()
    # for i in range(len(demos)):
    #     print('%3s: %s' % (i+1, demos[i][0]))
    #     print('     %r' % demos[i][1])
    #     print()
    # print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
    # try:
    #     snum = int(sys.stdin.readline().strip())-1
    #     sent, grammar = demos[snum]
    # except:
    #     print('Bad sentence number')
    #     return

    max_scene = 1

    if max_scene<10:            sc = '0000'+str(max_scene)
    elif max_scene<100:         sc = '000'+str(max_scene)
    elif max_scene<1000:        sc = '00'+str(max_scene)
    elif max_scene<10000:       sc = '0'+str(max_scene)

    g = 'grammar_'+sc+'.txt'
    learned_pcfg = load('/home/omari/Dropbox/robot_modified/AR/grammar/'+g)
    grammar = learned_pcfg

    file1 = open('/home/omari/Dropbox/robot_modified/AR/hypotheses/matched_commands.txt', 'r')
    g1 = [i for i in file1.readlines()]
    for line in g1:
        line = unicode(line,encoding='utf-8')
        sent = line.split('\n')[0].split('-')[-1]
        scene = line.split('\n')[0].split('-')[0]
        sent_num = line.split('\n')[0].split('-')[1]
        print(line)
        if scene == '239' and sent_num == '0':  continue


        # Tokenize the sentence.
        tokens = sent.split()

        parser = ViterbiParser(grammar)
        all_parses = {}

        # print('\nsent: %s\nparser: %s\ngrammar: %s' % (sent,parser,grammar))
        parser.trace(3)
        parses = parser.parse_all(tokens)
        average = (reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
                   if parses else 0)
        num_parses = len(parses)
        for p in parses:
            all_parses[p.freeze()] = 1

        # Print some summary statistics
        # print()
        # print('Time (secs)   # Parses   Average P(parse)')
        # print('-----------------------------------------')
        # print('%11.4f%11d%19.14f' % (time, num_parses, average))
        parses = all_parses.keys()
        if parses:
            p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
        else: p = 0
        # print('------------------------------------------')
        # print('%11s%11d%19.14f' % ('n/a', len(parses), p))

        # Ask the user if we should draw the parses.
        # print()
        # print('Draw parses (y/n)? ', end=' ')
        # if sys.stdin.readline().strip().lower().startswith('y'):

        #     print('  please wait...')
        # draw_trees(*parses)

        cf = CanvasFrame()
        # t = Tree(parses)
        t = Tree.fromstring('(S  (CH_POS_PREPOST move)  (PRE_POST    (PRE      (the the)      (_entity (F_HSV green) (F_SHAPE sphere)))    (PREPOST_connect (to to) (the the))    (POST      (_F_POS (F_POS (_bottom_left (bottom bottom) (left left)))) (corner corner))))')

        tc = TreeWidget(cf.canvas(), t, draggable=1,
                        node_font=('helvetica', -14),
                        leaf_font=('helvetica', -12),
                        roof_fill='white', roof_color='black',
                        leaf_color='green4', node_color='blue4')
        cf.add_widget(tc,10,10)

        # tc = TreeWidget(cf.canvas(),t)
        # cf.add_widget(tc,10,10) # (10,10) offsets
        cf.print_to_file('/home/omari/Dropbox/robot_modified/trees/scene-'+scene+'-'+sent_num+'.ps')
        cf.destroy()
Example #43
def tokenize_into_sentences(content):
    # Lazy loaded?
    # When creating a new post, tokenize the content with this function
    detector = nltk.load('tokenizers/punkt/english.pickle')        
    return detector.tokenize( content )
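Usage is then simply (a sketch, assuming the punkt model is installed):

sentences = tokenize_into_sentences("Hello there. How are you?")
# -> ['Hello there.', 'How are you?']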
Example #44
File: main.py Project: davidxc/wordtag
 def load_resource(self):
     """Loads the resource needed for part of speech tagging."""
     
     #Load resource using the NLTK protocol. nltk.load() searches for the resource URL in the directories specified by nltk.data.path
     nltk.load('taggers/maxent_treebank_pos_tagger/english.pickle')
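As the comment notes, nltk.load() resolves resource URLs against nltk.data.path. A sketch of pointing it at a custom data directory before loading (the directory name here is a hypothetical example):

import nltk

# hypothetical local data directory containing taggers/, tokenizers/, etc.
nltk.data.path.insert(0, '/opt/my_nltk_data')
nltk.load('taggers/maxent_treebank_pos_tagger/english.pickle')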