Example #1
def segment_en(texts, flag_keep_number=False):
    tk = StanfordTokenizer()
    results = {}
    for text in texts:
        if flag_keep_number:
            words = tk.tokenize(text)
        else:
            words = map(replace_number, tk.tokenize(text))
        segmented = ' '.join(words).lower()
        results[text] = segmented
    return results
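Example #1 calls StanfordTokenizer() with no arguments, which only works when NLTK can locate the Stanford jar and a Java runtime; a minimal setup sketch, with placeholder paths that are not part of the original example:

import os
from nltk.tokenize import StanfordTokenizer

# Option 1: point NLTK at the jar explicitly (placeholder path).
tk = StanfordTokenizer(path_to_jar="/path/to/stanford-postagger.jar")

# Option 2: rely on environment variables and the no-argument constructor,
# as Example #1 does (uncomment and adjust the placeholder paths).
# os.environ['CLASSPATH'] = "/path/to/stanford-postagger.jar"
# os.environ['JAVAHOME'] = "/path/to/jdk/bin/java"   # several later examples point this at the java executable
# tk = StanfordTokenizer()

print(tk.tokenize("Good muffins cost $3.88 in New York."))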
Example #2
def readwordarr(isTokenize=True):
    posWords = []
    negWords = []
    stopwords = getstopword()
    if isTokenize:
        tokenizer = StanfordTokenizer()
        with open(negfilepath, 'r', encoding='utf-8') as sentences:
            arr = tokenizer.tokenize(sentences.read())
            for line in arr:
                linearr = line.split()
                wordset = set()
                for word in linearr:
                    if word in stopwords:
                        continue
                    wordset.add(word)
                negWords.append(list(wordset))
        with open(posfilepath, 'r', encoding='utf-8') as sentences:
            arr = tokenizer.tokenize(sentences.read())
            for line in arr:
                linearr = line.split()
                wordset = set()
                for word in linearr:
                    if word in stopwords:
                        continue
                    wordset.add(word)
                posWords.append(list(wordset))
    else:
        with open(negfilepath, 'r', encoding='utf-8') as sentences:
            lines = sentences.readlines()
            for line in lines:
                linearr = line.split()
                wordset = set()
                for word in linearr:
                    if word in stopwords:
                        continue
                    wordset.add(word)
                negWords.append(list(wordset))
        with open(posfilepath, 'r', encoding='utf-8') as sentences:
            lines = sentences.readlines()
            for line in lines:
                linearr = line.split()
                wordset = set()
                for word in linearr:
                    if word in stopwords:
                        continue
                    wordset.add(word)
                posWords.append(list(wordset))
    return posWords, negWords
Example #3
def readwordarr(isTokenize = True):
    posWords = []
    negWords = []
    stopwords = getstopword()
    if isTokenize:
        tokenizer = StanfordTokenizer()
        with open(negfilepath, 'r', encoding = 'utf-8') as sentences:
            arr = tokenizer.tokenize(sentences.read())
            for line in arr:
                linearr = line.split()
                wordset = set()
                for word in linearr:
                    if word in stopwords:
                        continue
                    wordset.add(word) 
                negWords.append(list(wordset))
        with open(posfilepath, 'r', encoding = 'utf-8') as sentences:
            arr = tokenizer.tokenize(sentences.read())
            for line in arr:
                linearr = line.split()
                wordset = set()
                for word in linearr:
                    if word in stopwords:
                        continue
                    wordset.add(word)
                posWords.append(list(wordset))       
    else:
        with open(negfilepath, 'r', encoding = 'utf-8') as sentences:
            lines = sentences.readlines()
            for line in lines:
                linearr=line.split()
                wordset = set()
                for word in linearr:
                    if word in stopwords:
                        continue
                    wordset.add(word)
                negWords.append(list(wordset))
        with open(posfilepath, 'r', encoding = 'utf-8') as sentences:
            lines = sentences.readlines()
            for line in lines:
                linearr=line.split()
                wordset = set()
                for word in linearr:
                    if word in stopwords:
                        continue
                    wordset.add(word)
                posWords.append(list(wordset))
    return posWords,negWords
Example #4
class WordSegment(object):
    def __init__(self, user_dict=None):
        self.conf_io = conf.load("io")
        self.conf_corenlp = conf.load("stanford_corenlp")
        self.conf_embedding = conf.load("embedding")
        conf_tokenizer = self.conf_corenlp["tokenizer"]
        conf_postagger = self.conf_corenlp["postagger"]
        prefix = self.conf_corenlp["prefix"]

        self.enTokenizer = StanfordTokenizer(
            path_to_jar=prefix + conf_tokenizer["path_to_jar"]
        )
        self.zh_tagger = StanfordPOSTagger(
            prefix + conf_postagger["tagger_zh"],
            path_to_jar=prefix + conf_postagger["path_to_jar"]
        )
        self.en_tagger = StanfordPOSTagger(
            prefix + conf_postagger["tagger_en"],
            path_to_jar=prefix + conf_postagger["path_to_jar"]
        )

        # TODO:
        # Add the user-defined dictionary here

    def get_tokens(self, text):
        tokens = self.enTokenizer.tokenize(text)

        return self.en_tagger.tag(tokens)

    def get_new_words(self, text):
        pass
Example #5
def tokenize_stopwords_stemmer(texts):
    # texts: a list holding the string(s)
    # Use this block for the Stanford tokenizer; it is not needed for an ordinary tokenizer
    # tokenize
    Str_texts = texts[0]
    #tokenizer = StanfordTokenizer(path_to_jar=r"/Users/apple/Documents/tools/stanford-parser-full-2015-04-20/stanford-parser.jar")
    tokenizer = StanfordTokenizer(path_to_jar=r"stanford-parser.jar")
    texts_tokenized = tokenizer.tokenize(Str_texts)  # the input must be a string

    p1 = r'[-@<#$%^&*].+'
    pa1 = re.compile(p1)
    texts_filtered0 = [
        document for document in texts_tokenized
        if not document in pa1.findall(document)
    ]

    p2 = r'.+[-_\./].+'
    pa2 = re.compile(p2)
    texts_filtered = []
    for document in texts_filtered0:
        if document in pa2.findall(document):
            if document.find('_') > -1:
                texts_filtered = texts_filtered + document.split('_')
            elif document.find('-') > -1:
                texts_filtered = texts_filtered + document.split('-')
            elif document.find('.') > -1:
                texts_filtered = texts_filtered + document.split('.')
        else:
            texts_filtered.append(document)

    texts_filtered = [
        document for document in texts_filtered
        if document != '' and document != "''" and document != "``"
    ]

    #stopwords
    english_stopwords = stopwords.words('english')  # get the stop word list
    texts_filtered_stopwords = [
        document for document in texts_filtered
        if not document in english_stopwords
    ]  #

    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#',
        '$', '%', '\n', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '<',
        '>', '/', '\"', '\'', '{', '}', '!', '~', '`', '$', '^', '/*', '*/',
        '/**', '**/', '**', '-', '_', '+', '=', r'-?-', r'@?'
    ]  # punctuation list

    texts_filtered = [
        document for document in texts_filtered_stopwords
        if not document in english_punctuations
    ]  #

    porter = nltk.PorterStemmer()
    texts_Stemmered = [porter.stem(t) for t in texts_filtered]  # a list

    return texts_Stemmered  # returns a list
Example #6
def segment(texts):
    tk = StanfordTokenizer()
    results = {}
    for text in texts:
        words = tk.tokenize(text)
        segmented = ' '.join(words).lower()
        results[text] = segmented
    return results
Example #7
class POSTagger:
    """POSTagger creates a POS tagger for german language. Different tagger are available to use."""
    STAN = "stanford-hgc-tagger"
    SFT = "stanford-fast-tagger"
    TT = "tree-tagger"
    SPACY = "spacy-tagger"

    # paths to Stanford tagger modules
    __path_to_jar = "C:/Users/din_m/MA/Stanford Tagger/stanford-postagger.jar"
    __model_file_name = "C:/Users/din_m/MA/Stanford Tagger/models/"

    def __init__(self, tagger):
        """Initialize a new POS tagger. Takes tagger parameter as an argument to define the kind of tagger."""
        self.__tokenizer = StanfordTokenizer(path_to_jar=POSTagger.__path_to_jar)
        if tagger == POSTagger.STAN:
            self.tagger_name = POSTagger.STAN
            self.__tagger = StanfordPOSTagger(path_to_jar=POSTagger.__path_to_jar,
                                              model_filename=POSTagger.__model_file_name + "german-hgc.tagger")
        elif tagger == POSTagger.SFT:
            self.tagger_name = POSTagger.SFT
            self.__tagger = StanfordPOSTagger(path_to_jar=POSTagger.__path_to_jar,
                                              model_filename=POSTagger.__model_file_name + "german-fast.tagger")
        elif tagger == POSTagger.TT:
            self.tagger_name = POSTagger.TT
            self.__tagger = treetaggerwrapper.TreeTagger(TAGLANG='de')

        # SpaCy takes a long time to initialize (about 5-7 minutes), but performs well and fast afterwards
        elif tagger == POSTagger.SPACY:
            self.tagger_name = POSTagger.SPACY
            self.__tagger = spacy.load('de')
        else:
            raise Exception("Wrong tagger parameter.")

    def tag(self, text):
        """POS tag tokenized text."""
        if self.tagger_name == POSTagger.SFT or self.tagger_name == POSTagger.STAN:
            tokens = self.__tokenizer.tokenize(text)
            return self.__tagger.tag(tokens)
        elif self.tagger_name == POSTagger.TT:
            tags = self.__tagger.tag_text(text)
            tuple_list = []
            tag_list = treetaggerwrapper.make_tags(tags)
            for item in tag_list:
                tuple_list.append((item[0], item[1]))
            return tuple_list
        elif self.tagger_name == POSTagger.SPACY:
            tags = self.__tagger(text)
            tuple_list = []
            for word in tags:
                tuple_list.append((word.orth_, word.tag_))
            return tuple_list
        else:
            pass

#tagger = POSTagger("spacy-tagger")
#doc = tagger.tag(u"Bei mir zu Hause denken sie bestimmt, daß ich noch krank sei.")
#print(tagger.tag("Ich werde morgen in die Schule gehen."))
#print(tagger.tag("Hat Aglaja den Brief etwa der Alten gezeigt?«"))
Example #8
def tokenize_q(qa, phase):
    qas = len(qa)
    MyTokenizer = StanfordTokenizer()
    for i, row in enumerate(tqdm(qa)):
        row['question_toked'] = MyTokenizer.tokenize(row['question'].lower())[:14]
        if i % 50000 == 0:
            json.dump(qa, open('vqa_' + phase + '_toked_' + str(i) + '.json', 'w'))
        if i == qas - 1:
            json.dump(qa, open('vqa_' + phase + '_toked.json', 'w'))
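A usage sketch for tokenize_q above, assuming the function is in scope with json, tqdm and StanfordTokenizer imported and the Stanford jar reachable; the field names mirror the code, everything else is a placeholder:

qa = [{"question": "What color is the cat sitting on the mat?"}]
tokenize_q(qa, 'train')              # adds 'question_toked' in place and writes vqa_train_toked.json
print(qa[0]['question_toked'])       # first 14 lower-cased tokens of the question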
Example #9
def Tokenize_stopwords_stemmer(texts):
    #print time()
    # Use this block for the Stanford tokenizer; it is not needed for an ordinary tokenizer
    # tokenize
    Str_texts = texts[0]
    print os.getcwd()
    #tokenizer = StanfordTokenizer(path_to_jar=r"/Users/apple/Documents/tools/stanford-parser-full-2015-04-20/stanford-parser.jar")
    tokenizer = StanfordTokenizer(path_to_jar=r"stanford-parser.jar")
    texts_tokenized = tokenizer.tokenize(Str_texts)  # the input must be a string
    #print time()
    p2 = r'.+[-_\./"].+'
    pa2 = re.compile(p2)
    texts_filtered = []
    for document in texts_tokenized:
        if document in pa2.findall(document):
            if document.find('_') > -1:
                texts_filtered = texts_filtered + document.split('_')
            elif document.find('-') > -1:
                texts_filtered = texts_filtered + document.split('-')
            elif document.find('.') > -1:
                texts_filtered = texts_filtered + document.split('.')
        else:
            texts_filtered.append(document)
    #print time()
    p1 = r'[-@<#$%^&*].+'
    pa1 = re.compile(p1)
    p3 = r'.+">'
    pa3 = re.compile(p3)
    english_stopwords = stopwords.words('english')  # get the stop word list
    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#',
        '$', '%', '\n', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '<',
        '>', '/', '\"', '\'', '{', '}', '!', '~', '`', '$', '^', '/*', '*/',
        '/**', '**/', '**', '-', '_', '+', '=', r'-?-', r'@?'
    ]  # punctuation list
    texts_filtered0 = []
    for document in texts_filtered:
        if document in pa1.findall(document) or document in pa3.findall(
                document
        ) or document == '' or document == "''" or document == "``" or document in english_stopwords or document in english_punctuations:
            pass
        else:
            texts_filtered0.append(document)
    #print time()

    porter = nltk.PorterStemmer()
    texts_Stemmered = [porter.stem(t) for t in texts_filtered0]  # a list
    #print time()

    return texts_Stemmered  # returns a list
def tokenize_stopwords_stemmer(texts):
    Str_texts = texts[0]
    # tokenizer = StanfordTokenizer(path_to_jar=r"/Users/apple/Documents/tools/stanford-parser-full-2015-04-20/stanford-parser.jar")
    tokenizer = StanfordTokenizer(
        path_to_jar=r"C:\Users\zw\Desktop\stanford-parser.jar")  # path_to_jar: 用来定位jar包,r是防止字符转义的,如果路径中出现'\t'的话 不加r的话\t就会被转义 而加了'r'之后'\t'就能保留原有的样子
    java_path = 'E:soft/Java/jdk1.8.0_121/bin/java.exe'
    os.environ['JAVAHOME'] = java_path
    texts_tokenized = tokenizer.tokenize(Str_texts)  # the input must be a string; run tokenization
    # print(texts_tokenized)

    p1 = r'[-@<#$%^&*].+'
    pa1 = re.compile(p1)  # re.compile() compiles the regex string into a Pattern object, which is then used to match against the text
    texts_filtered0 = [document for document in texts_tokenized if not document in pa1.findall(document)]

    p2 = r'.+[-_\/].+'  # changed from r'.+[-_\./].+' to r'.+[-_\/].+' so periods between digits are kept, e.g. version strings like 3.1.2
    pa2 = re.compile(p2)
    texts_filtered = []
    for document in texts_filtered0:
        if document in pa2.findall(document):
            if document.find('_') > -1:  # split() slices the string on the given separator and returns the list of pieces
                texts_filtered = texts_filtered + document.split('_')
            elif document.find('-') > -1:
                texts_filtered = texts_filtered + document.split('-')
            elif document.find('.') > -1:
                texts_filtered = texts_filtered + document.split('.')
            elif document.find('/') > -1:
                texts_filtered = texts_filtered + document.split('/')
        else:
            texts_filtered.append(document)

    texts_filtered = [document for document in texts_filtered if
                      document != '' and document != "''" and document != "``"]  # filter out empty strings and the '' / `` quote tokens

    # # stopwords
    # english_stopwords = stopwords.words('english')  # get the stop word list
    # texts_filtered_stopwords = [document for document in texts_filtered if not document in english_stopwords]  # filter out stop words

    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '\n', '||',
                            '<', '>', '/', '\"', '\'', '{', '}', '!', '~', '`', '0', '$', '^', '/*', '*/', '/**', '**/',
                            '**', '-', '_', '__', '|', '+', '=', r'-?-', r'@?']  # punctuation list

    texts_filtered = [document for document in texts_filtered if
                      not document in english_punctuations]  # filter out punctuation
    return texts_filtered
def stanford_tokenizer(str):

    tokenizer = StanfordTokenizer(
        path_to_jar=
        'D:/software/stanford-parser-full-3.7/stanford-parser-3.7.0-models.jar'
    )

    # sent = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
    return tokenizer.tokenize(str)


# if __name__=='__main__':
#     sent = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
#     result = stanford_tokenizer(sent)
#     print(result)

# st = StanfordPOSTagger('english-bidirectional-distsim.tagger')

# from nltk.tokenize import StanfordTokenizer
# s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
# StanfordTokenizer().tokenize(s)
# s = "The colour of the wall is blue."
# StanfordTokenizer(options={"americanize": True}).tokenize(s)
Example #12
def tokenize(text_list,
             clean_html=False,
             tokenizer="twitter",
             remove_reps=True,
             spell_correct=True):
    if tokenizer == "stanford":
        tokenizer_obj = StanfordTokenizer()
    elif tokenizer == "twitter":
        tokenizer_obj = TweetTokenizer()
    else:
        tokenizer_obj = StringTokenizer()

    token_list = []
    for text in text_list:
        if clean_html:
            text = BeautifulSoup(text).get_text()
        if remove_reps:
            text = re.sub(r'(.)\1{2,}', r'\1\1', text)
        tokens = tokenizer_obj.tokenize(text)
        if spell_correct:
            tokens = [spell(t) for t in tokens]
        token_list.append(tokens)
    return token_list
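A usage sketch for the tokenize helper above, taking the Twitter branch so no Java setup is needed, and turning off spell_correct/clean_html so the external spell() and BeautifulSoup dependencies are not exercised:

texts = ["sooooo goooood!!!", "loving this tokenizer :)"]
print(tokenize(texts, tokenizer="twitter", remove_reps=True, spell_correct=False))
# remove_reps collapses runs of 3+ identical characters down to two, e.g. 'sooooo' -> 'soo'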
Example #13
File: test.py  Project: tianyaozhu/WSD
tags = semcor.tagged_sents(tag = 'sem')

n = 0

correct = 0
base = 0
total = 0

for sent in corp:

    sentence =  ' '.join(sent)

    print sentence

    parsed = list(parser.parse(tokenizer.tokenize(sentence)))

    for term in tags[n]:
        if len(term)==1 and isinstance(term[0], basestring) and isinstance(term, Tree) and len(wordnet.synsets(term[0])) > 1:
            if isinstance(term.label(), unicode):
                continue
            syn = term.label().synset()
            word = term[0]
            sense_standard = syn

            print word
        
            for pair in parsed[0].triples():
                if pair[0][0] == word:
                    pos = pair[0][1]
                if pair[2][0] == word:
Example #14
from nltk.tag.stanford import StanfordNERTagger, StanfordPOSTagger
from nltk.tokenize import StanfordTokenizer
from wordsegment import load, segment

CUR_DIRECTORY = '/home/wmq/Desktop/DeepText/StanfordNLP'
SEGMENT_PATH = CUR_DIRECTORY + '/stanford-segmenter-3.8.0.jar'
NER_MODEL_PATH = CUR_DIRECTORY + '/english.all.3class.distsim.crf.ser.gz'
NER_JAR_PATH = CUR_DIRECTORY + '/stanford-ner.jar'
POS_MODEL_PATH = CUR_DIRECTORY + '/english-left3words-distsim.tagger'
POS_JAR_PATH = CUR_DIRECTORY + '/stanford-postagger.jar'

ner_tagger = StanfordNERTagger(NER_MODEL_PATH, NER_JAR_PATH, java_options='')
pos_tagger = StanfordPOSTagger(POS_MODEL_PATH, POS_JAR_PATH, java_options='')
tokenizer = StanfordTokenizer(SEGMENT_PATH)
load()

s = "@user nah pretty sure it's jackson's great jokes"
ws = tokenizer.tokenize(s)
print(' '.join(ws))
# print (' '.join(segment('#happythankgiving')))
# s = 'i got to to go formal with my best friend @ phi mu at jsu'.split()
# ner_sent = ner_tagger.tag(s)
# pos_sent = pos_tagger.tag(s)
# print (ner_sent)
# print (pos_sent)
Example #15
from __future__ import absolute_import
Example #16

if __name__ == '__main__':  #very important
    # res = request([["excellent"],["poor"]])
    poshit = 1510000000032
    neghit = 771000000037
    print(poshit)
    print(neghit)
    stopword = ["-LSB-", "-RSB-", "-LRB-", "-RRB-"]
    tokenizer = StanfordTokenizer()
    filename = "F:/course/sentimentcode/rt-polarity.neg"
    file_object = codecs.open(filename, 'r', 'utf-8')
    allres = []
    try:
        all_the_text = file_object.read()
        arr = tokenizer.tokenize(all_the_text)
        la = len(arr)
        correct = 0
        for line in arr:
            ax = line.split()
            wordarr = []
            for word in ax:
                if word in stopword:
                    continue
                wordarr.append(word)
            list = nltk.pos_tag(wordarr)
            result = getPhraseByPos(list)
            if len(result) == 0:
                continue
            allres.append(result)
        posres = f(allres, 1)
Example #17
 t = line.split("<e1>")
 text.append(t[0])
 e1start = len(t[0])
 t = t[1].split("</e1>")
 e1 = t[0]
 text.append(t[0])
 e1end = len(t[0]) + e1start
 t = t[1].split("<e2>")
 text.append(t[0])
 e2start = len(t[0]) + e1end
 t = t[1].split("</e2>")
 text.append(t[0])
 e2 = t[0]
 e2end = len(t[0]) + e2start
 text.append(t[1])
 text = " ".join(tokenizer.tokenize("".join(text)))
 txtfile.write(text)
 txtfile.write("\n")
 offset = 0
 err = False
 while e1 != text[e1start + offset:e1end + offset]:
     offset += 1
     if e1end + offset > len(text):
         break
 if e1end + offset > len(text):
     offset = 0
     e1 = " ".join(tokenizer.tokenize(e1))
     e1end = e1start + len(e1)
     while e1 != text[e1start + offset:e1end + offset]:
         offset += 1
         if e1end + offset > len(text):
Example #18
    from nltk.tokenize.stanford_segmenter import StanfordSegmenter
    from nltk.tokenize import StanfordTokenizer
    workingdirectory = os.getcwd()
    segmenter = StanfordSegmenter(
        path_to_jar=os.path.join(workingdirectory, 'stanford-segmenter.jar'),
        path_to_slf4j=os.path.join(workingdirectory, 'slf4j-api.jar'),
        path_to_sihan_corpora_dict=os.path.join(workingdirectory, 'data'),
        path_to_model=os.path.join(workingdirectory, 'data', 'pku.gz'),
        path_to_dict=os.path.join(workingdirectory, 'data',
                                  'dict-chris6.ser.gz'))
    tokenizer = StanfordTokenizer(
        path_to_jar=os.path.join(workingdirectory, 'stanford-parser.jar'))
    n = 1
    for line in open(sourcefile):
        token = segmenter.segment(line)
        words = tokenizer.tokenize(token)
        with open('%s%s.txt' % (prefix, n), "w",
                  encoding='utf-8') as resultfile:
            resultwrite = csv.writer(resultfile)
            for word in words:
                resultwrite.writerow([word])
        n = n + 1
    print('Done')
elif (tokeniser == 'stanfordctb'):
    from nltk.tokenize.stanford_segmenter import StanfordSegmenter
    from nltk.tokenize import StanfordTokenizer
    workingdirectory = os.getcwd()
    segmenter = StanfordSegmenter(
        path_to_jar=os.path.join(workingdirectory, 'stanford-segmenter.jar'),
        path_to_slf4j=os.path.join(workingdirectory, 'slf4j-api.jar'),
        path_to_sihan_corpora_dict=os.path.join(workingdirectory, 'data'),
Example #19
#     model_path=u"edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
# )
#
# mylist = list(eng_parser.parse(sentence.split()))
# print(len(mylist))
# print("句法分析结果", mylist)

# Dependency parsing
# Dependency relation labels are documented at: http://universaldependencies.org/u/dep/all.html#al-u-dep/det
eng_dependency_parser = StanfordDependencyParser(
    path_to_jar=r"D:\stanford-parser-full-2016-10-31\stanford-parser.jar",
    path_to_models_jar=
    r"D:\stanford-parser-full-2016-10-31\stanford-parser-3.7.0-models.jar",
    model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')

outputs = ' '.join(tokenizer.tokenize("Dole was defeated by Clinton"))
print(outputs)

result = list(eng_dependency_parser.parse(outputs.split()))
for each in result[0].triples():
    print(each)
#     if each[1]=='dobj':
#         # print(each)
#         print(each[0][0])
#         print(each[2][0])
# print("依存句法分析结果:")
# for row in result[0].triples():
#     print(row)
# print(result[0])
# Chinese word segmentation
# Still needs investigation; the code below raises an error
Example #20
    path_to_slf4j="/home/hsiao/Develops/nlp/stanford-corenlp-full-2016-10-31/slf4j-api.jar",
    path_to_sihan_corpora_dict="/home/hsiao/Develops/nlp/stanford-segmenter-2015-04-20/data",
    path_to_model="/home/hsiao/Develops/nlp/stanford-segmenter-2015-04-20/data/pku.gz",
    path_to_dict="/home/hsiao/Develops/nlp/stanford-segmenter-2015-04-20/data/dict-chris6.ser.gz"

)
str="我在我在博客园开了一个博客。"
print (segmenter.segment(str))

# English tokenization


from nltk.tokenize import StanfordTokenizer
tokenizer=StanfordTokenizer(path_to_jar=r"/home/hsiao/Develops/nlp/stanford-parser-full-2016-10-31/stanford-parser.jar")
sent = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
print (tokenizer.tokenize(sent))

# Chinese named entity recognition
from nltk.tag import StanfordNERTagger
chi_tagger=StanfordNERTagger(model_filename=r'/home/hsiao/Develops/nlp/stanford-ner-2016-10-31/classifiers/chinese.misc.distsim.crf.ser.gz'
                             ,path_to_jar=r'/home/hsiao/Develops/nlp/stanford-ner-2016-10-31/stanford-ner.jar')
print (chi_tagger.tag('四川省 成都 信息 工程 大学 我 在 博客 园 开 了 一个 博客 , 我 的 博客 名叫 伏 草 惟 存 , 写 了 一些 自然语言 处理 的 文章 。\r\n'.split()))




# English named entity recognition
from nltk.tag import StanfordNERTagger
eng_tagger=StanfordNERTagger(model_filename=r'/home/hsiao/Develops/nlp/stanford-ner-2016-10-31/classifiers/english.all.3class.distsim.crf.ser.gz'
                             ,path_to_jar=r'/home/hsiao/Develops/nlp/stanford-ner-2016-10-31/stanford-ner.jar')
print (eng_tagger.tag('Rami Eid is studying at Stony Brook University in NY'.split()))
Example #21
    'interest_3': 2,
    'interest_4': 3,
    'interest_5': 4,
    'interest_6': 5
}

bayes = [[], [], [], [], [], []]

count = [0, 0, 0, 0, 0, 0]

n = 0

for instance in senseval.instances('interest.pos')[0:1599]:
    count[sense[instance.senses[0]]] += 1
    sentence = ' '.join(w for (w, p) in instance.context)
    parsed = list(parser.parse(tokenizer.tokenize(sentence)))
    for triple in parsed[0].triples():
        related = 0
        if triple[0][0] in interest:
            word = triple[2][0]
            related = 1
        if triple[2][0] in interest:
            word = triple[0][0]
            related = 1
        if related == 1:
            exist = 0
            for item in bayes[sense[instance.senses[0]]]:
                if item[0] == word:
                    item[1] += 1
                    exist = 1
            if exist == 0:
Example #22
aparser = argparse.ArgumentParser(
    description="Run CoreNLP tokenizer on a TSV definition file")
aparser.add_argument('input_filepath', type=str, help='input file path')
aparser.add_argument('output_filepath', type=str, help='output file path')
aparser.add_argument('corenlp_postagger_path',
                     type=str,
                     help="path to stanford-postagger.jar")

opt = aparser.parse_args()
tokenizer = StanfordTokenizer(path_to_jar=opt.corenlp_postagger_path,
                              options={
                                  "ptb3Escaping": "false",
                                  "tokenizePerLine": "true",
                                  "tokenizeNLs": "true"
                              })
entries = []
definitions = []
with open(opt.input_filepath) as ifp:
    for line in ifp:
        parts = line.strip().split('\t')
        entries.append(parts[:-1])
        definitions.append(parts[-1])
def_str = "\n".join(definitions)
tokens = tokenizer.tokenize(def_str)
def_str = " ".join(tokens)
definitions = def_str.split("*NL*")
with open(opt.output_filepath, 'w') as ofp:
    for entry, definition in zip(entries, definitions):
        ofp.write("{}\t{}\n".format('\t'.join(entry), definition.strip()))
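The options dict above passes Stanford PTBTokenizer flags through NLTK; in particular "tokenizeNLs": "true" keeps newlines as *NL* tokens, which is what lets the script tokenize all definitions in one batch and split them apart again. A small sketch of that behaviour, assuming a placeholder jar path:

from nltk.tokenize import StanfordTokenizer

tok = StanfordTokenizer(path_to_jar="stanford-postagger.jar",
                        options={"ptb3Escaping": "false", "tokenizeNLs": "true"})
print(tok.tokenize("first definition\nsecond definition"))
# expected shape: ['first', 'definition', '*NL*', 'second', 'definition']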
Example #23
File: test.py  Project: weiang/baike
def test_tokenizer():
    tokenizer = StanfordTokenizer()
    sent = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
    print tokenizer.tokenize(sent)
Example #24
train_question_2 = []
train_labels = []
map_index_vec = dict()
map_word_index = dict()

tokenizer = StanfordTokenizer(options={"ptb3Escaping": True})

words = set()
for col in ['question1', 'question2']:
    sentences = []
    print('Processing column: %s' % col)
    for i, sentence in enumerate(train_data[col]):
        if i % 10000 == 0:
            print('Sentence: %d' % i)

        split = tokenizer.tokenize(sentence)
        new_sentence = []
        for word in split:
            word = word.encode('utf-8').strip()
            word = word.lower()

            if word in glove_model:
                if word not in words:
                    words.add(word)
                new_sentence.append(word)
            else:
                if 'unk' not in words:
                    words.add('unk')
                new_sentence.append('unk')

        sentences.append(" ".join(new_sentence))
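Example #24 enables ptb3Escaping, under which the Stanford tokenizer rewrites brackets and quotes into Penn Treebank escapes such as -LRB-/-RRB- and ``/'' (the tokens Example #16 later filters out as stop words); a small illustration, assuming the jar is reachable via CLASSPATH or path_to_jar:

from nltk.tokenize import StanfordTokenizer

tok = StanfordTokenizer(options={"ptb3Escaping": True})
print(tok.tokenize('He said "hi" (quietly).'))
# expected shape: ['He', 'said', '``', 'hi', "''", '-LRB-', 'quietly', '-RRB-', '.']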
from __future__ import absolute_import
Example #26

aparser = argparse.ArgumentParser(
    description="Run CoreNLP tokenizer on a TSV definition file")
aparser.add_argument(
    'input_filepath', type=str, help='input file path')
aparser.add_argument(
    'output_filepath', type=str, help='output file path')
aparser.add_argument(
    'corenlp_postagger_path', type=str, help="path to stanford-postagger.jar")

opt = aparser.parse_args()
tokenizer = StanfordTokenizer(path_to_jar=opt.corenlp_postagger_path,
                              options={"ptb3Escaping": "false",
                                       "tokenizePerLine": "true",
                                       "tokenizeNLs": "true"})
entries = []
definitions = []
with open(opt.input_filepath) as ifp:
    for line in ifp:
        parts = line.strip().split('\t')
        entries.append(parts[:-1])
        definitions.append(parts[-1])
def_str = "\n".join(definitions)
tokens = tokenizer.tokenize(def_str)
def_str = " ".join(tokens)
definitions = def_str.split("*NL*")
with open(opt.output_filepath, 'w') as ofp:
    for entry, definition in zip(entries, definitions):
        ofp.write("{}\t{}\n".format('\t'.join(entry), definition.strip()))
import nltk
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import StanfordTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer


snowball_stemmer = SnowballStemmer('english')
wordnet_lemmatizer = WordNetLemmatizer()
tokenizer = StanfordTokenizer()
eng_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')


text="Dan's parents were overweight.,Dan was overweight as well.,The doctors told his parents it was unhealthy.,His parents understood and decided to make a change.,They got themselves and Dan on a diet.".split(',')


for sen in text:
    token_list=tokenizer.tokenize(sen[:-1])
    tagged_sen=eng_tagger.tag(token_list)
    new_sen=[]
    for (word,tag) in tagged_sen:
        # print word,tag
        if tag[0]=='V':
            
            lemma_word=wordnet_lemmatizer.lemmatize(word,pos='v')
        else:
            lemma_word=wordnet_lemmatizer.lemmatize(word)
        stem_word=snowball_stemmer.stem(lemma_word)
        new_sen.append(stem_word)
    print " ".join(new_sen)
Example #28
def en_standseg(sent):
    tokenizer = StanfordTokenizer(
        path_to_jar=r"E:\tools\stanfordNLTK\jar\stanford-parser.jar")
    print(tokenizer.tokenize(sent))
Example #29
class KeywordExtractor(object):
    def __init__(self, **kwargs):
        self.conf_io = conf.load("io")
        self.conf_corenlp = conf.load("stanford_corenlp")
        self.conf_embedding = conf.load("embedding")
        conf_segmenter = self.conf_corenlp["segmenter"]
        conf_tokenizer = self.conf_corenlp["tokenizer"]
        conf_postagger = self.conf_corenlp["postagger"]
        prefix = self.conf_corenlp["prefix"]

        self.segmenter = StanfordSegmenter(
            path_to_jar=prefix + conf_segmenter["path_to_jar"],
            path_to_sihan_corpora_dict=prefix +
            conf_segmenter["path_to_sihan_corpora_dict"],
            path_to_model=prefix + conf_segmenter["path_to_model"],
            path_to_dict=prefix + conf_segmenter["path_to_dict"],
            path_to_slf4j=prefix + conf_segmenter["path_to_slf4j"],
            encoding=conf_segmenter["encoding"])
        self.enTokenizer = StanfordTokenizer(path_to_jar=prefix +
                                             conf_tokenizer["path_to_jar"])
        self.zh_tagger = StanfordPOSTagger(
            prefix + conf_postagger["tagger_zh"],
            path_to_jar=prefix + conf_postagger["path_to_jar"])
        self.en_tagger = StanfordPOSTagger(
            prefix + conf_postagger["tagger_en"],
            path_to_jar=prefix + conf_postagger["path_to_jar"])
        self.frequency = defaultdict(int)
        pynlpir.open()
        pynlpir.nlpir.ImportUserDict(conf.load("pynlpir")["user_dict"],
                                     Overwrite=False)

        try:
            self.excluded_docs = kwargs["excluded_docs"]
        except:
            self.excluded_docs = [""]

        # experimental features
        self.f_token_indexes = prefix + conf.load("pynlpir")["user_dict"]

    def _getDocs(self, num_docs, **kwargs):
        # from pymongo import MongoClient
        # dbClient = MongoClient("mongodb://127.0.0.1:27017")
        # db = dbClient["nick"]
        # collection = db["tickets"]
        # cursor = collection.find({}).limit(num_docs)
        # return enumerate(cursor)
        separated = kwargs["separated"]
        i = 0
        if separated is True:
            samples_dir = conf.load('samples')['dir']
            docs = [
                os.path.join(samples_dir, x) for x in os.listdir(samples_dir)
            ]
            while i < num_docs:
                with open(docs[i]) as f:
                    try:
                        yield i, json.load(f)
                        i += 1
                    except:
                        i += 1  # TODO: may cause problem on the generator index
                        num_docs += 1
            return
        else:
            samples_loc = conf.load('samples')['single']
            docs = json.loads(open(samples_loc).read())
            while i < num_docs:
                try:
                    yield i, docs[i]
                    i += 1
                except:
                    i += 1  # TODO: may cause problem on the generator index
                    num_docs += 1
            return
        #try:
        #    from pymongo import MongoClient
        #    dbClient = MongoClient("mongodb://127.0.0.1:27017")
        #    db = dbClient["nick"]
        #    collection = db["tickets"]
        #    cursor = collection.find({}).limit(num_docs)
        #    return enumerate(cursor)
        #except ImportError:
        #    i = 0
        #    docs = [os.path.join(conf.load("samples"), x) for x in os.listdir(conf.load("samples"))]
        #    while i < num_docs:
        #        with open(docs[i]) as f:
        #            i += 1
        #            yield json.load(f)
        #    return

    def saveToDoclist(self, num_docs, **kwargs):
        file_docs = open(
            self.conf_io["prefix"] + self.conf_io["output_data_directory"] +
            'num_docs-' + str(num_docs) + '.doclist', 'wb')
        separated = kwargs["separated"]
        docs = self._getDocs(num_docs, separated=separated)
        for ind, i in docs:
            try:
                text = i["title"].replace("\n", " ").replace(
                    "\r", " ") + " " + i["body"].replace('\n', ' ').replace(
                        "\r", " ")
            except Exception as e:
                print e
                continue
            else:
                file_docs.write("%s\n" % (text.encode("utf-8")))
        file_docs.close()

    def calculateTokens(self, line, **kwargs):
        line = """{}""".format(line)
        doc = [
            excluded_doc for excluded_doc in self.excluded_docs
            if excluded_doc not in line.split('\n')[0]
        ]
        if doc == []:
            print "skipped:", line.split('\n')[0]
            return ''

        doc_len_lower_bound = int(kwargs["doc_len_lower_bound"])
        doc_len_upper_bound = int(kwargs["doc_len_upper_bound"])
        if len(line.decode('utf-8')) <= doc_len_lower_bound or len(
                line.decode('utf-8')) >= doc_len_upper_bound:
            return ''

        allowed_list = ["noun", "intransitive verb", "noun-verb", "adjective"]
        # if you want to try the stanford coreNLP tokenizer in other languages...
        _en_tokens = [token.lower() for token in pre.GetEnTokens()(line)]
        with Timer('stanford_seg') as t:
            _en_tokens_tokenized = self.enTokenizer.tokenize(
                ' '.join(_en_tokens))
        en_tokens = [
            token for token in _en_tokens_tokenized if token.lower() not in
            pre.FilterKeywords().getStopwordsAsJSON()["en"]
        ]
        en_tokens = [
            token for token in en_tokens if token.lower() not in
            pre.FilterKeywords().getCustomStopwordsAsList()
        ]
        en_tokens = list(set(en_tokens))
        # now we have English tokens...
        tokens_in_each_doc = []
        with Timer('stanford_tag') as t:
            tags = self.en_tagger.tag(en_tokens)
        for word, tag in tags:
            if tag in ["NN", "FW", "VBD", "NNS", "VBP"]:
                tokens_in_each_doc.append(word)

        # _token_list = [i[0] for i in pynlpir.get_key_words(line.decode("utf-8"), weighted=True)] + en_tokens
        if str(kwargs["method"]) == "keyword":
            _token_list = [
                i[0] for i in pynlpir.get_key_words(line.decode("utf-8"),
                                                    weighted=True)
            ]
        elif str(kwargs["method"]) == "normal":
            # for i in pynlpir.segment(line.decode("utf-8"), pos_names='child'):
            # print i[0], i[1]
            if "2G" in line.decode("utf-8"):  # hot fix for a bug
                line = line.replace("2G", "")
                _token_list = [
                    i[0] for i in pynlpir.segment(line.decode("utf-8"),
                                                  pos_names='child')
                    if i[1] in allowed_list
                ]
            else:
                _token_list = [
                    i[0] for i in pynlpir.segment(line.decode("utf-8"),
                                                  pos_names='child')
                    if i[1] in allowed_list
                ]
        __token_list = [
            token for token in _token_list
            if token not in pre.FilterKeywords().getStopwordsAsJSON()["zh"]
        ]

        token_list = [
            token for token in __token_list if token.lower() not in
            pre.FilterKeywords().getStopwordsAsJSON()["en"] and token.lower()
            not in pre.FilterKeywords().getCustomStopwordsAsList()
        ]
        zh_tokens = [token for token in token_list if token not in _en_tokens]
        token_list = zh_tokens + tokens_in_each_doc

        # count each token's frequency; tokens that appear only a few times are filtered out later
        for token in token_list:
            self.frequency[token.lower()] += 1
        tokens = ','.join(token_list)
        print "Done tokenizing text: ", tokens

        return tokens

    def getKeywordsAndSave(self, *args, **kwargs):
        import pickle
        freq_lower_bound = int(kwargs["freq_lower_bound"])
        token_len_lower_bound = int(kwargs["token_len_lower_bound"])
        doc_len_lower_bound = int(kwargs["doc_len_lower_bound"])
        doc_len_upper_bound = int(kwargs["doc_len_upper_bound"])

        if str(kwargs["method"]) == "keyword":
            file_keywords = open(
                self.conf_io["prefix"] +
                self.conf_io["output_data_directory"] +
                str(kwargs["target_name"]) + '.fine.keywords', 'w')
        elif str(kwargs["method"]) == "normal":
            file_keywords = open(
                self.conf_io["prefix"] +
                self.conf_io["output_data_directory"] +
                str(kwargs["target_name"]) + '.keywords', 'w')
        tokens = []
        token_indexes = {}
        if bool(kwargs["static_file"]) is True:
            source_name = self.conf_io["prefix"] + self.conf_io[
                "output_data_directory"] + str(kwargs["source_name"])
            with open(source_name, 'r') as f:
                _ind = 0
                for ind, line in enumerate(f):
                    try:
                        with Timer('calculateTokens') as t:
                            tokens.append(
                                self.calculateTokens(
                                    line,
                                    method=str(kwargs["method"]),
                                    doc_len_lower_bound=doc_len_lower_bound,
                                    doc_len_upper_bound=doc_len_upper_bound))
                        # [experimental feature]
                        # this is to be used with LDA
                        # to show what raw doc is associated with each topic
                        token_indexes[ind] = _ind
                        _ind += 1
                    except Exception as e:
                        if e is KeyboardInterrupt:
                            break
                        print e
                        print "error with ", line
                        continue
                    else:
                        pass
                for line in tokens:
                    if line is not None:
                        filtered_tokens = [
                            token for token in line.split(',')
                            if self.frequency[token.lower()] > freq_lower_bound
                            and len(token) > token_len_lower_bound
                        ]
                        filtered_tokens = ','.join(filtered_tokens)
                        file_keywords.write('%s\n' %
                                            (filtered_tokens.encode('utf-8')))
                        file_keywords.flush()
            f.close()
            # experimental
            json.dump(token_indexes,
                      open(self.f_token_indexes + "token_indexes.pickle", "w"),
                      ensure_ascii=True)
        else:
            doc_list = args[0]
            for ind, line in enumerate(list(doc_list)):
                try:
                    tokens.append(
                        self.calculateTokens(
                            line,
                            method=str(kwargs["method"]),
                            doc_len_lower_bound=doc_len_lower_bound,
                            doc_len_upper_bound=doc_len_upper_bound))
                except Exception as e:
                    if e is KeyboardInterrupt:
                        break
                    print e
                    print "error with ", line
                    continue
                else:
                    pass
            for line in tokens:
                if line is not None:
                    filtered_tokens = [
                        token for token in line.split(',')
                        if self.frequency[token.lower()] > freq_lower_bound
                        and len(token) > token_len_lower_bound
                    ]
                    filtered_tokens = ','.join(filtered_tokens)
                    file_keywords.write('%s\n' %
                                        (filtered_tokens.encode('utf-8')))
                    file_keywords.flush()
        file_keywords.close()
        pynlpir.close()
        return True

    def _loadTopicModel(self, **kwargs):
        try:
            str(kwargs["method"])
        except:
            print "You must specify a topic modeling method! Only tfidf is supported now."
        else:
            self.method = str(kwargs["method"])
            if self.method != 'tfidf':
                print "Error. We will use method=tfidf in the following analysis."
                self.method = 'tfidf'
            self.conf_tfidf = self.conf_embedding[self.method]

        _corpora = corpora.MmCorpus(self.conf_embedding["prefix"] +
                                    self.conf_tfidf["corpus_save_to"] + '.mm')
        self.dictionary = corpora.Dictionary.load(
            self.conf_embedding["prefix"] + self.conf_tfidf["dict_save_to"] +
            '.dict')

        _model = models.TfidfModel.load(self.conf_embedding["prefix"] +
                                        self.conf_tfidf["model_save_to"] +
                                        '.tfidf.model')

        return _model, _corpora

    def refineKeywords(self, **kwargs):
        # TODO: decide whether to apply a TF-IDF weight threshold as well
        top_k = int(kwargs["top_k"])
        file_keywords = open(
            self.conf_io["prefix"] + self.conf_io["output_data_directory"] +
            str(kwargs["target_name"]) + '.filtered.keywords', 'w')
        _model, _corpora = self._loadTopicModel(method='tfidf')

        for corpus in _corpora:
            # take the top_k tokens by TF-IDF weight within each document (an absolute weight threshold could also be applied; see the TODO above)
            corpus = _model[corpus]
            sorted_corpus_per_doc = [
                token for token in sorted(corpus, key=lambda x: -x[1])[:top_k]
            ]
            tokens = [
                self.dictionary.id2token[_token[0]]
                for _token in sorted_corpus_per_doc
            ]
            tokens = ','.join(tokens)
            file_keywords.write('%s\n' % (tokens.encode('utf-8')))

        file_keywords.close()
        return True
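A usage sketch for KeywordExtractor above; everything depends on the project-specific conf, pre and Timer helpers plus the Stanford and pynlpir resources loaded in __init__, and all argument values are placeholders:

# Hedged usage sketch (Python 2, like the class above); all values are placeholders.
ke = KeywordExtractor(excluded_docs=["some excluded title"])
ke.saveToDoclist(100, separated=True)               # dump raw docs into a .doclist file
ke.getKeywordsAndSave(method="normal", static_file=True,
                      source_name="num_docs-100.doclist", target_name="tickets",
                      freq_lower_bound=2, token_len_lower_bound=1,
                      doc_len_lower_bound=10, doc_len_upper_bound=5000)
ke.refineKeywords(top_k=10, target_name="tickets")  # needs the prebuilt tfidf model and corpus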
