Example #1
def clean_words(tokens, filterStopwords=False, filterPos=None):
	cleanTokens = []
	stopwordList = stopwords.words('spanish')
	
	if filterPos:
		tagger = StanfordPOSTagger('stanford/models/spanish.tagger', 'stanford/stanford-postagger.jar', encoding='utf8')

	for token in tokens:
		cleanToken = token
		for char in string.punctuation:
			cleanToken = cleanToken.replace(char, "")
		
		if filterPos and not filterStopwords:
			res = tagger.tag([cleanToken])
			if len(res)>0:
				word, pos = res[0]
				if pos[0] in filterPos:
					cleanTokens.append(cleanToken)
		
		elif filterStopwords and not filterPos:
			if cleanToken not in stopwordList:
				cleanTokens.append(cleanToken)
		
		elif filterStopwords and filterPos:
			res = tagger.tag([cleanToken])
			if len(res)>0:
				word, pos = res[0]
				if cleanToken not in stopwordList and pos[0] in filterPos:
					cleanTokens.append(cleanToken)

		elif not filterStopwords and not filterPos:
			cleanTokens.append(cleanToken)
	
	return cleanTokens
Example #2
 def tagWordsInSentences(self, studying, entry):
     '''Tags the part of speech for each word.'''
     jar_path = 'stanford-postagger-full/stanford-postagger.jar'
     if studying in self.english:
         words = parseWordsFromEntry(entry)
         tagged_words = tagWords(words)
         return tagged_words
     elif studying in self.japanese or studying in self.korean or studying in self.mandarin:
         #segmenter = TinySegmenter()
         #words = segmenter.tokenize(entry)
         rm = RakutenMA()
         tagged_words = rm.tokenize(entry)
         #mecab = Mecab()
         #tagged_words = mecab.pos(entry)
         return tagged_words
     else:
         if studying in self.spanish:
             model_path = 'stanford-postagger-full/models/spanish.tagger'
             words = parseWordsFromEntry(entry)
         elif studying in self.french:
             model_path = 'stanford-postagger-full/models/french.tagger'
             words = parseWordsFromEntry(entry)
         postagger = StanfordPOSTagger(model_path,
                                       jar_path,
                                       encoding='utf8')
         tagged_words = postagger.tag(words)
         return tagged_words
Example #3
def posTagging():
    myNounPhrases = []  # needed below when collecting NNP/NNPS tokens
    myCompletePOSStructure = []
    a = ['NNP', 'NNPS'] #Avoid NN,NNS. Only NNP , NNPS for purpose of NER.
    print '######## POS'
    english_postagger = StanfordPOSTagger(
        './Masters-Passau/stanford-postagger-full-2016-10-31/models/english-bidirectional-distsim.tagger',
        './Masters-Passau/stanford-postagger-full-2016-10-31/stanford-postagger.jar')
    #abc = english_postagger.tag('Steve Jobs was Founder of Apple. He was born in United States of America'.split())
    abc = english_postagger.tag('Who was the CEO of IBM'.split())
    print abc
    for number in abc:
        #print number[0],number[1]
        someTup = (number[0].encode('utf8'),number[1].encode('utf8'))
        #print someTup
        myCompletePOSStructure.append(someTup)

        #print split1[0] + ' ' + split1[1]
        #print unicodedata.normalize('NFKD', split1[0]).encode('ascii','ignore')
        #print unicodedata.normalize('NFKD', split1[1]).encode('ascii', 'ignore')

    print myCompletePOSStructure

    for number in abc:
        if any(x in number for x in a):
            #print number
            split1 = str(number).split(',')
            split2 = str(split1[0]).split('u')
            # print split2[1].replace("'", "")
            myNounPhrases.append(number)
Example #4
class POS_tagger_stanford(object):
    def __init__(self):
        """
        Initializes the tagger object
        """
        self.model = TAGGER_MODEL
        self.jar_file = POS_TAGGER_JAR_FILE
        self.tagger = StanfordPOSTagger(self.model, self.jar_file)
        self.tagger_type = STANFORD_TAGGER_NAME

    def get_tags(self, sentence):
        """
        Gets the tags for tokenized sentence
        The full list of tags is available online:
            https://nlp.stanford.edu/software/spanish-faq.shtml

        Args:
            sentence (list): the sentence used to obtain the POS tags, each word
                is an element in the list

        Returns:
            tags (list): List containing both the word and its corresponding tag
        """
        #tagger = self.get_tagger()
        tags = self.tagger.tag(sentence)
        return tags
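A minimal usage sketch for the class above, assuming the module-level constants it relies on point at a local Stanford tagger install; the paths and label below are placeholders, not from the original source.

# Assumed constants; adjust to the local Stanford POS tagger installation.
TAGGER_MODEL = 'stanford-postagger-full/models/spanish.tagger'
POS_TAGGER_JAR_FILE = 'stanford-postagger-full/stanford-postagger.jar'
STANFORD_TAGGER_NAME = 'stanford'

tagger = POS_tagger_stanford()
# get_tags expects an already tokenized sentence (one word per list element)
print(tagger.get_tags(['El', 'gato', 'duerme']))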
Example #5
def getUsername(message, *args):
    pos_tagger = StanfordPOSTagger(model, jar, encoding="utf-8")
    words = nltk.word_tokenize(message.lower())
    tagged_words = pos_tagger.tag(words)
    sug_usernames = []
    # Check if a previous username input was passed
    if len(args) > 0:
        previous_username = args[0]
        sug_usernames = [
            word for word, tag in tagged_words
            if tag in ['NN', 'NNP', 'FW', 'NNPS'] and word != previous_username
        ]
    else:
        sug_usernames = [
            word for word, tag in tagged_words
            if tag in ['NN', 'NNP', 'FW', 'NNPS']
        ]

    if len(sug_usernames) > 0:
        if getSentenceSentiment(message) == 'pos':
            return sug_usernames[-1]
        else:
            return sug_usernames[
                -1] + 'salt123'  # return last suggested username

    return 'randomuser567user'
Example #6
def extractor():
    st = StanfordPOSTagger(
        '../stanford-postagger-full-2015-12-09/models/english-bidirectional-distsim.tagger',
        '../stanford-postagger-full-2015-12-09/stanford-postagger-3.6.0.jar')
    nouns = []
    pnouns = []
    i = 0

    with open('../data/scraped_text_NYT.txt', 'r',
              encoding='utf-8') as inputFile:
        comment = inputFile.readline()
        while comment != "":
            sentences = sent_tokenize(comment, 'english')

            for sent in sentences:
                if (sent.strip() == ""):
                    continue
                pos_tags = st.tag(sent.split())
                for pos_tag in pos_tags:
                    if (pos_tag[1] == 'NN' or pos_tag[1] == 'NNS'):
                        nouns = nouns + [pos_tag[0]]
                    elif (pos_tag[1] == 'NNP' or pos_tag[1] == 'NNPS'):
                        pnouns = pnouns + [pos_tag[0]]
            i = i + 1
            print(i)
            print(comment)
            comment = inputFile.readline()

    outFile = open('../data/nouns_scraped_text_NYT.txt', 'a')
    outFile.write('NOUNS:\n')
    for noun in nouns:
        outFile.write(noun + "\n")
    outFile.write('\n\nPNOUNS:\n')
    for pnoun in pnouns:
        outFile.write(pnoun + '\n')
Example #7
def tagged_def():
    java_path = "C:/ProgramData/Oracle/Java/javapath"
    os.environ['JAVAHOME'] = java_path
    tagger = StanfordPOSTagger(
        'F:/eclipse_doctor/KnowledgeGraph/stanford-pos/english-bidirectional-distsim.tagger',
        'F:/eclipse_doctor/KnowledgeGraph/stanford-pos/stanford-postagger-3.7.0.jar'  # second argument is the tagger jar, not the model
    )

    path_data = "data" + os.sep + "items_tagged_modified.json"
    data = json.load(codecs.open(path_data, encoding='UTF-8'))
    for item in data:
        pos2definition = item["pos2definition"]
        for pos2def in pos2definition:
            definition = pos2def["definition"]
            #             print chardet.detect(definition)
            print definition.encode('gbk')
            definition_pure = re.sub(r'\([\s\S]*?\)', "", definition)
            tokens = nltk.word_tokenize(definition_pure)
            #             print tokens
            for token in tokens:
                print chardet.detect(token)
            tagged_tokens = tagger.tag(definition_pure.encode('utf-8').split())
            pos2def['tagged_def'] = tagged_tokens

    path_tagged_output = "items_tagged_auto.json"
    json.dump(data,
              codecs.open(path_tagged_output, 'w', 'utf-8'),
              ensure_ascii=False,
              indent=2)
Example #8
def test_POSSent():
    import nltk
    from nltk.tag.stanford import StanfordPOSTagger
    java_path = "C:/ProgramData/Oracle/Java/javapath"
    os.environ['JAVAHOME'] = java_path
    tagger = StanfordPOSTagger(
        'F:/eclipse_doctor/KnowledgeGraph/stanford-pos/english-bidirectional-distsim.tagger',
        'F:/eclipse_doctor/KnowledgeGraph/stanford-pos/stanford-postagger-3.7.0.jar'
    )
    sent = 'abutment is a tooth, root, or implant used for support and retention of a fixed or removable prosthesis.'
    sent = 'angulated abutment is an abutment whose body is not parallel to the long axis of the implant. It is utilized when the implant is at a different inclination in relation to the proposed prosthesis.'
    sent = u'substance abuse is the misuse of legal or illegal substances with the intent to alter some aspect of the user’s experience. May include medications, illicit drugs, legal substances with potential mood-altering effects, or substances whose primary use may not be for human consumption.'
    #     print chardet.detect(sent)
    tokens = nltk.word_tokenize(sent)
    #     print tagger.tag(sent.split())
    print tagger.tag(tokens)
Example #9
def test_StanfordAndNLTKPOS():
    import nltk
    from nltk.tag.stanford import StanfordPOSTagger
    sent = 'a low-calorie sweetener that reduces caries activity and the growth and transmission of S. mutans.'
    sent = 'a wire formed by drawing a cast structure through a die; used in dentistry for partial denture clasps and orthodontic appliances.'
    sent = 'readily stained with acid dyes.'
    print chardet.detect(sent)
    #     sent='technique metered spray refers to a topical anesthetic dispersal technique that controls the amount and rate at which a drug is administered.'
    #     sent='older term for a traumatic ulcer of the oral mucosa.'
    #     sent='one or more vertically parallel surfaces of abutment teeth shaped to direct the path of placement and removal of a remarkable partial denture. Also called guiding plane.'
    #     sent='agents that bond, seal, or cement particles or objects together.'
    #     sent='teeth that are at such an angle as to cause them to be out of centric contact with opposing teeth during occlusion.'
    start = datetime.now()
    text = nltk.word_tokenize(sent)
    nltk_pos = nltk.pos_tag(text)

    java_path = "C:/ProgramData/Oracle/Java/javapath"
    os.environ['JAVAHOME'] = java_path
    stanford_tagger = StanfordPOSTagger(
        'F:/eclipse_doctor/KnowledgeGraph/stanford-pos/english-bidirectional-distsim.tagger',
        'F:/eclipse_doctor/KnowledgeGraph/stanford-pos/stanford-postagger-3.7.0.jar'  # second argument is the tagger jar, not the model
    )
    stanford_pos = stanford_tagger.tag(text)
    print 'nltk_pos: ' + str(nltk_pos)
    print 'stanford_pos: ' + str(stanford_pos)
Example #10
class POS(object):
    """Part of Speech tagging using Stanford POSTagger"""

    STANFORD_POS = os.path.join(PACKAGE_ROOT, 'language', 'stanford-pos')
    STANFORD_POS_JAR = os.path.join(STANFORD_POS, 'stanford-postagger.jar')
    STANFORD_POS_TAGGER = os.path.join(
        STANFORD_POS, 'models/english-bidirectional-distsim.tagger')

    def __init__(self):
        self._tagger = StanfordPOSTagger(POS.STANFORD_POS_TAGGER,
                                         path_to_jar=POS.STANFORD_POS_JAR)

    def tag(self, tokens):
        """
        Tag Part of Speech using the Stanford POS tagger

        Parameters
        ----------
        tokens

        Returns
        -------
        POS: list of tuples of strings
        """
        return self._tagger.tag(tokens)
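A minimal usage sketch for the wrapper above, assuming PACKAGE_ROOT points at a package directory that contains the bundled language/stanford-pos resources (jar plus models/ directory).

# Assumed layout: <PACKAGE_ROOT>/language/stanford-pos/{stanford-postagger.jar, models/english-bidirectional-distsim.tagger}
pos = POS()
# tag() expects pre-tokenized input and returns a list of (word, tag) tuples
print(pos.tag(['The', 'quick', 'brown', 'fox', 'jumps']))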
Example #11
def pos_tagging(docs, stanford_path, pos_tagger):
    print("\nGenerating Part-of-Speech tags...")

    # Configuring Stanford NLP POS tagger
    path_to_model = "{}/models/{}.tagger".format(stanford_path, pos_tagger)
    path_to_jar = "{}/stanford-postagger.jar".format(stanford_path)

    tagger = StanfordPOSTagger(model_filename=path_to_model,
                               path_to_jar=path_to_jar)
    # Setting higher memory limit for long sentences
    tagger.java_options = '-mx8192m'

    data = []
    for doc in progressbar.progressbar(docs):
        # Obtain the list of tokens in the document
        tokens = [t for t, label in doc]

        try:
            # Perform POS tagging
            tagged = tagger.tag(tokens)
        except:
            continue

        # Take the word, POS tag, and its label
        data.append([(w, pos, label)
                     for (w, label), (word, pos) in zip(doc, tagged)])
    return data
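A minimal call sketch for pos_tagging above; the zip in the loop implies that each document is a list of (token, label) pairs, and the path and model name below are placeholders for an unpacked Stanford tagger distribution.

# One toy document: a list of (token, label) pairs
docs = [[('John', 'B-PER'), ('lives', 'O'), ('in', 'O'), ('Paris', 'B-LOC')]]
data = pos_tagging(docs,
                   stanford_path='stanford-postagger-full',       # assumed install dir
                   pos_tagger='english-left3words-distsim')       # assumed model name
# each item of data is a list of (word, POS tag, label) triples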
Example #12
    def part_of_speech_tagging(
            self, words: List[str],
            multi_word_name_entities: Set[str]) -> List[Tuple[str, str]]:
        """
        perform part-of-speech tagging using StanfordPOSTagger
        :param words: a list of words in a sentence
        :param multi_word_name_entities: a set of multi-word name entities
        :return: part-of-speech tag of the sentence
        """
        # define pos tagger
        path_to_model = 'stanford/pos/english-bidirectional-distsim.tagger'
        path_to_jar = 'stanford/pos/stanford-postagger.jar'
        pos_tagger = StanfordPOSTagger(path_to_model, path_to_jar)

        stan_pos_tag = pos_tagger.tag(words[:-1])  # omit the last period
        normal_pos_tag = nltk.pos_tag(words[:-1])  # omit the last period

        # print('Stanford POS tagging:', stan_pos_tag)        # for comparison
        # print('nltk.pos_tag tagging:', normal_pos_tag)      # for comparison

        def post_treatment(stan_pos_tag: List[Tuple[str, str]],
                           norm_pos_tag: List[Tuple[str, str]],
                           multi_word_name_entities: Set[str]) -> None:
            """
            combine the multi-word name entities
            nltk.pos_tag keeps multi-word name entities together, so stan_pos_tag is corrected using norm_pos_tag;
            the problem with norm_pos_tag is that it often mislabels words, which is why StanfordPOSTagger is preferred
            :param stan_pos_tag: a list of pos-tags of sentences using stanford pos tagger
            :param norm_pos_tag: a list of pos-tags of sentences using nltk.pos_tag
            """
            stan_len = len(stan_pos_tag)
            norm_len = len(norm_pos_tag)
            stan_i = 0
            norm_i = 0
            while stan_i < stan_len and norm_i < norm_len:
                stan_word, stan_pos = stan_pos_tag[stan_i]
                norm_word, norm_pos = norm_pos_tag[norm_i]
                # check if word exists in multi_word_name_entities
                if stan_word == norm_word.split(
                        ' ')[0] and norm_word in multi_word_name_entities:
                    # scan the following words in stan_pos_tag and combine if they can form a multi-word entity
                    temp_i = stan_i + 1
                    match_idx = 1
                    entities = norm_word.split(' ')
                    while temp_i < stan_len and match_idx < len(entities):
                        temp_word, temp_pos = stan_pos_tag[temp_i]
                        if temp_word == entities[match_idx]:
                            _ = stan_pos_tag.pop(temp_i)
                            match_idx += 1
                        else:
                            break
                    stan_pos_tag[stan_i] = (norm_word, stan_pos)
                stan_i += 1
                norm_i += 1

        post_treatment(stan_pos_tag, normal_pos_tag, multi_word_name_entities)

        return stan_pos_tag
Example #13
class POSTagger:
    """POSTagger creates a POS tagger for german language. Different tagger are available to use."""
    STAN = "stanford-hgc-tagger"
    SFT = "stanford-fast-tagger"
    TT = "tree-tagger"
    SPACY = "spacy-tagger"

    # paths to Stanford tagger modules
    __path_to_jar = "C:/Users/din_m/MA/Stanford Tagger/stanford-postagger.jar"
    __model_file_name = "C:/Users/din_m/MA/Stanford Tagger/models/"

    def __init__(self, tagger):
        """Initialize a new POS tagger. Takes tagger parameter as an argument to define the kind of tagger."""
        self.__tokenizer = StanfordTokenizer(path_to_jar=POSTagger.__path_to_jar)
        if tagger == POSTagger.STAN:
            self.tagger_name = POSTagger.STAN
            self.__tagger = StanfordPOSTagger(path_to_jar=POSTagger.__path_to_jar,
                                              model_filename=POSTagger.__model_file_name + "german-hgc.tagger")
        elif tagger == POSTagger.SFT:
            self.tagger_name = POSTagger.SFT
            self.__tagger = StanfordPOSTagger(path_to_jar=POSTagger.__path_to_jar,
                                              model_filename=POSTagger.__model_file_name + "german-fast.tagger")
        elif tagger == POSTagger.TT:
            self.tagger_name = POSTagger.TT
            self.__tagger = treetaggerwrapper.TreeTagger(TAGLANG='de')

        # SpaCy takes really long to initialize (about 5-7 minutes), but performs well and fast afterwards
        elif tagger == POSTagger.SPACY:
            self.tagger_name = POSTagger.SPACY
            self.__tagger = spacy.load('de')
        else:
            raise Exception("Wrong tagger parameter.")

    def tag(self, text):
        """POS tag tokenized text."""
        if self.tagger_name == POSTagger.SFT or self.tagger_name == POSTagger.STAN:
            tokens = self.__tokenizer.tokenize(text)
            return self.__tagger.tag(tokens)
        elif self.tagger_name == POSTagger.TT:
            tags = self.__tagger.tag_text(text)
            tuple_list = []
            tag_list = treetaggerwrapper.make_tags(tags)
            for item in tag_list:
                tuple_list.append((item[0], item[1]))
            return tuple_list
        elif self.tagger_name == POSTagger.SPACY:
            tags = self.__tagger(text)
            tuple_list = []
            for word in tags:
                tuple_list.append((word.orth_, word.tag_))
            return tuple_list
        else:
            pass

#tagger = POSTagger("spacy-tagger")
#doc = tagger.tag(u"Bei mir zu Hause denken sie bestimmt, daß ich noch krank sei.")
#print(tagger.tag("Ich werde morgen in die Schule gehen."))
#print(tagger.tag("Hat Aglaja den Brief etwa der Alten gezeigt?«"))
def getTags(sen_arr):
    tag_arr = []
    st = StanfordPOSTagger('english-left3words-distsim.tagger')
    res = st.tag(sen_arr)
    for i in res:
        tag = i[1].encode("utf-8")
        tag_arr.append(tag)

    return tag_arr
Example #15
    def determine_sentpos_by_nltk(self, sentence):
        '''
			get pos collection for sentence from nltk
		'''
        pos_model_file = "C://python34/ProjectDragonWolf/nlp_res/stanford_pos/models/english-bidirectional-distsim.tagger"
        pos_jar_file = "C://python34/ProjectDragonWolf/nlp_res/stanford_pos/stanford-postagger.jar"
        pos = StanfordPOSTagger(model_filename=pos_model_file,
                                path_to_jar=pos_jar_file)
        return pos.tag(sentence.split(" "))
def pos_tagger(text):
    from nltk.tag.stanford import StanfordPOSTagger
    english_postagger = StanfordPOSTagger(
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/models/english-left3words-distsim.tagger',
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/stanford-postagger.jar'
    )
    english_postagger.java_options = '-mx4096m'
    tags = english_postagger.tag(text)
    return tags
Example #17
def pos_tag(to_tag, stanford_postagger_path):
    '''Tag the tokens with part of speech; to_tag is the list of tokens to tag, and
    stanford_postagger_path is the directory containing the Stanford POS tagger
    model files and jar file.'''
    pos_tagger = StanfordPOSTagger(stanford_postagger_path +"\\models\\french.tagger",
                                   stanford_postagger_path +"\\stanford-postagger.jar",
                                   encoding='utf8') #create an object of class POSTagger that is encoded in UTF-8
    tags = pos_tagger.tag(to_tag) #run the tagging algorithm on the tokenized raw text
    return tags
Example #18
def posInput(text):
	print("POS")
	path_to_model = "./stanford-postagger/models/english-caseless-left3words-distsim.tagger"
	path_to_jar = "./stanford-postagger/stanford-postagger.jar"
	tagger=StanfordPOSTagger(path_to_model, path_to_jar)
	tagger.java_options='-mx4096m'          ### Setting higher memory limit for long sentences
	# sentence = 'THIS IS TESTING'
	result = tagger.tag(word_tokenize(text))
	# print result
	return result
Example #19
def transform_wnli(premise,hypothesis):
    cased_premise=premise
    premise=[w.lower() for w in nltk.word_tokenize(premise)]

    #transform WNLI examples back into WSC format
    hypothesis = [w.lower() for w in nltk.word_tokenize(hypothesis)]
    best_target=["","","","","",""]#should get overwritten
    best_masked_s=[]
    for l in range(len(hypothesis)):
        for r in range(l+1,l+6):
            left_part = hypothesis[:l]
            right_part = hypothesis[r:]
            pattern = left_part + ["_"]+ right_part
            for s in range(len(premise)):
                ok=True
                if s+len(pattern)>len(premise):
                    break
                for a,b in zip(pattern,premise[s:s+len(pattern)]):
                    if a=="_":
                        continue
                    if a==b:
                        continue
                    if a in [',','.','?','!'] and b in [',','.','?','!']:#punctuation is ignored
                        continue
                    ok=False
                    break
                if ok and len(hypothesis[l:r])<=len(best_target):
                    best_target = hypothesis[l:r]
                    best_masked_s = premise[:s]+pattern+premise[s+len(pattern):]
    if len(best_masked_s)==0:#We failed
        return None,None
    #We extracted the masked sentence from the premise.
    global POS_tagger
    if POS_tagger is None:
        os.environ['STANFORD_MODELS'] = "stanford-postagger-2018-10-16/models"
        os.environ['CLASSPATH'] = "stanford-postagger-2018-10-16"
        POS_tagger = StanfordPOSTagger("stanford-postagger-2018-10-16/models/english-left3words-distsim.tagger")
    tagged_premise = POS_tagger.tag(nltk.word_tokenize(cased_premise))
    candidates = []
    current=[]
    for word,tag in tagged_premise:
        if tag in ["NN","NNS","NNP","NNPS"]:
            current.append(word)
        else:
            if current!=[]:
                candidates.append(" ".join(current).lower())
                current=[]
    if current!=[]:
        candidates.append(" ".join(current).lower())
    best_target=" ".join(best_target)
    candidates=[c for c in candidates if c.find(best_target)==-1 and best_target.find(c)==-1]
    candidates = [best_target]+candidates
    found_sentence = " ".join(best_masked_s).replace(" n't","n't").replace(" 's","'s")#Sorry nltk
    return found_sentence,candidates
Example #20
    def create_pos(self, tweet):
        self.pos_tweet = None

        tweet = word_tokenize(tweet.lower())

        english_pos = StanfordPOSTagger(
            'postagger/models/english-bidirectional-distsim.tagger',
            'postagger/stanford-postagger.jar')

        self.pos_tweet = english_pos.tag(tweet)

        return self.pos_tweet
Example #21
 def tag(tokens):
     #java_path = "C:/Program Files/Java/jdk1.8.0_31/bin/java.exe"
     #os.environ['JAVAHOME'] = java_path
     special_symbols_array = ["the", "a", "an"]
     english_postagger = StanfordPOSTagger(
         'tagger/english-bidirectional-distsim.tagger',
         'tagger/stanford-postagger.jar')
     token_tag_array = english_postagger.tag(tokens)
     # filter instead of removing while iterating, which would skip elements
     token_tag_array = [element for element in token_tag_array
                        if element[0].lower() not in special_symbols_array]
     return token_tag_array
Example #22
class String2POSNGramsList(String2TokenList):

    def __init__(self, n=1, tagger_cls='english-left3words-distsim.tagger'):

        # Other Taggers:
        #   1. 'english-bidirectional-distsim.tagger'
        #   2. 'english-left3words-distsim.tagger'

        super(String2POSNGramsList, self).__init__()

        # N-Grams size
        self.n = n

        # Tagger Class Selection... See detail in Stanford Tagger documentation.
        self.tagger_cls = tagger_cls

        # Getting the Stanford tagger instance.
        self.spt = StanfordPOSTagger(self.tagger_cls)
        # self.spt = CoreNLPPOSTagger(url='http://localhost:9000')
        self.spt.java_options = '-mx10g'

    @property
    def N(self):
        return self.n

    @N.setter
    def N(self, value):
        self.n = value

    @property
    def Tagger_cls(self):
        return self.n

    @Tagger_cls.setter
    def Tagger_cls(self, value):
        self.tagger_cls = value

    def terms_lst(self, text):

        # Getting the Analysed list of tokens.
        analyzed_terms_lst = self.token_lst(text)

        # Tagging the Analyzed terms list and getting the tags list as terms.
        pos_tags = [pos for t, pos in self.spt.tag(analyzed_terms_lst)]

        # Constructing the Words N-Grams List
        analyzed_terms_lst = [
            " ".join(pos_tags[i: i+self.n])
            for i in range(len(pos_tags) - self.n + 1)
        ]

        return analyzed_terms_lst
Example #23
class StanfordPOS():
    def __init__(self, model_filename, jarfile):
        self.model_filename = model_filename
        self.path_to_jar = jarfile
        self.tager = StanfordPOSTagger(model_filename=self.model_filename,
                                       path_to_jar=self.path_to_jar)

    def tagger(self, X):
        transformed_X = []
        for doc in X:
            res = self.tager.tag(doc)
            transformed_X.append(np.array(res))
        return transformed_X
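A minimal usage sketch for the StanfordPOS wrapper above; the paths below are placeholders for a local install, and the class assumes numpy is imported as np in its module.

model = 'stanford-postagger-full/models/english-left3words-distsim.tagger'  # assumed path
jar = 'stanford-postagger-full/stanford-postagger.jar'                      # assumed path
sp = StanfordPOS(model, jar)
# tagger() takes a list of tokenized documents and returns one array of (word, tag) pairs per document
print(sp.tagger([['The', 'cat', 'sat'], ['Dogs', 'bark']]))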
Example #24
 def _POS(self, txt, id):
     self.df[['ID', 'pos']].to_csv('pos_ner.csv', sep='\t')
     path_pos = '/home/ise/NLP/stanfordNLP/stanford-postagger-full-2017-06-09/stanford-postagger.jar'
     model_path = '/home/ise/NLP/stanfordNLP/stanford-postagger-full-2017-06-09/models/english-bidirectional-distsim.tagger'
     from nltk.tag.stanford import StanfordPOSTagger
     tagger = StanfordPOSTagger(model_path, path_pos)
     tagger.java_options = '-mx8096m'  ### Setting higher memory limit for long sentences
     tokens = nltk.word_tokenize(txt)
     pos_res = tagger.tag(tokens)
     filepath = '/home/ise/NLP/NLP3/pos/pos_{}.txt'.format(id)
     with open(filepath, 'w') as file_handler:
         for item in pos_res:
             file_handler.write("{}\n".format(item))
     return pos_res
Example #25
def transform_to_pos(text):
    import os
    #os.environ['JAVAHOME'] = java_path
    from nltk.corpus import sentiwordnet as swn
    from nltk.tag.stanford import StanfordPOSTagger
    from nltk import word_tokenize

    path_to_model = "./postagging/english-bidirectional-distsim.tagger"
    path_to_jar = "./postagging/stanford-postagger.jar"
    tagger = StanfordPOSTagger(path_to_model, path_to_jar)
    tagger.java_options = '-mx4096m'  ### Setting higher memory limit for long sentences
    tokens = word_tokenize(text)
    size = len(tokens)
    from collections import Counter
    pos = tagger.tag(tokens)
    counts = Counter(tag for word, tag in pos)
    for key in counts:
        counts[key] /= size
    counts["totalWordsCount"] = size
    counts[";"] = tokens.count(";") / size
    counts["questionmarks"] = tokens.count("?") / size
    counts["exclamationmarks"] = tokens.count("!") / size
    counts["Quotes"] = tokens.count("\"") / size
    try:
        counts.pop(".")
    except:
        pass
    from collections import OrderedDict
    ot = [
        'NNP', 'VBD', 'VBN', 'IN', 'CD', 'VBP', ',', 'DT', 'NN', 'JJ', 'RB',
        'TO', 'SYM', 'PRP', 'NNS', 'CC', 'PRP$', 'POS', 'FW', 'VBG', ':',
        'WRB', 'EX', 'JJR', 'WDT', 'totalWordsCount', ';', 'questionmarks',
        'exclamationmarks', 'Quotes'
    ]
    counts = OrderedDict(counts)
    for key in ot:
        if key in counts:
            pass
        else:
            counts[key] = 0
    tmp = counts.copy()
    for key in tmp:
        if key not in ot:
            counts.pop(key, None)
    dab = {}
    for i in ot:
        dab[i] = counts[i]
    counts = dab.copy()
    return counts
Example #26
def extractPOS(inputFile_data, inputFile_tags, inputFile_version,
               outputFile_pos):
    f = open(inputFile_tags)
    allTags = set(f.read().split(","))  # Load all tags
    f.close()

    f = open(inputFile_version)
    lines = f.readlines()
    f.close()
    tag_version = []  # tags with version number
    for index, row in enumerate(lines):
        items = row.strip().split()
        if items[0] in allTags:
            for tag in items[1].split(","):
                tag_version.append(tag)

    print "The number of tag_version is: ", len(tag_version)
    tag_version = set(tag_version)

    fw_pos = open(outputFile_pos, "w")
    english_postagger = StanfordPOSTagger(
        '/Users/songshuaichen/Downloads/jars/models/english-bidirectional-distsim.tagger'
    )
    f = open(inputFile_data)
    lines = f.readlines()
    f.close()

    for index, row in enumerate(lines):
        if index % 300 == 0:
            print index, " Finish ", float(index) / len(lines)
        items = row.strip().split("		")

        # if index >=5000 and index < 6000 and items[0] in tag_version:
        if items[0] in tag_version:
            fw_pos.write(str(index) + "	" + items[0] + "	\n")
        if items[0] not in tag_version:

            fw_pos.write(str(index) + "	" + items[0] + "	")
            if len(items) > 1:
                text = items[1].split(". ")[0].decode('utf-8')
                pos = english_postagger.tag(text.split())

                for p in pos:
                    fw_pos.write(str(p))
                    fw_pos.write("	")

            fw_pos.write("\n")

    fw_pos.close()
Example #27
def build_question_set():
    sv_file = 'data/kprestval_pos_tags.json'
    st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
    meta = load_and_process_metadata('val')
    images = split_data_by_seed(meta, 'kprestval')
    num = len(images)
    pos_tags_dict = {}
    for i, info in enumerate(images):
        question_id = info.question_id
        question = info.question.lower()
        _pos_tags = st.tag(word_tokenize(question))
        pos_tags_dict[question_id] = _pos_tags
        print('\nPOS TAGGER: %d/%d' % (i, num))
        print(_pos_tags)
    save_json(sv_file, {'pos_tags': pos_tags_dict})
Example #28
class PosTaggerTest(object):

    def __init__(self):
        self.eng_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')

    def tag(self,sentence):
        tknzr = TweetTokenizer()
        res = self.eng_tagger.tag([' '.join(tknzr.tokenize(sentence))])
        return res

    def show(self,sentence):
        res = dict(self.tag(sentence))
        for key in res:
            # print sentence
            print key,"\t",res[key]
    def pos_clean(text_list,java_path,stanford_tagger_path,remove_verbs=True):
        
        import os
        #java_path = "C:/Program Files/Java/jdk1.8.0_261/bin/java.exe"
        os.environ['JAVAHOME'] = java_path
        #stan_path = "C:/Users/אילנה/Dropbox/jupyter_notebooks/data-science/idc-research/mine/stanford-tagger-4.0.0/"
        
        from nltk.tag.stanford import StanfordPOSTagger as POS_Tag
        
        arabic_postagger = POS_Tag(stanford_tagger_path+'models/arabic.tagger', stanford_tagger_path+'/stanford-postagger.jar')
        
        text_list_pos = [arabic_postagger.tag(inner_word_list) for inner_word_list in text_list]
        
        
        if remove_verbs==True:
            
            pos_to_remove = ["VB","VBD","VBG","VBN","VBP","VBZ"]

            text_list_final = []

            for inner_list in text_list_pos:

                final_inner_list = []

                for pos_tuple in inner_list:

                    # sometimes structure is unstable, so need to use find

                    if pos_tuple[0].find("/")>=0:

                        idx=0

                    else:

                        idx=1

                    if pos_tuple[idx].split("/")[1] not in pos_to_remove:           

                        final_inner_list.append(pos_tuple[idx].split("/")[0])

                text_list_final.append(final_inner_list)
                
        else:

            text_list_final = text_list
                
                
        return text_list_final
Example #30
def getPOSTags(sentence):
    """Generate POS tags with Stanford POS tagger. 
    Use Standford POS tagger as annotation model.
    https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

    Args:
      sentence: a sequence of sentence text
    
    Returns:
      tags: POS tags
  """
    pos_path = config['POS_MODEL_PATH']
    pos_jar = config['POS_JAR_PATH']
    postagger = StanfordPOSTagger(pos_path, pos_jar)
    word_tags = postagger.tag(sentence.split())
    tags = [list(t) for t in zip(*word_tags)][1]
    return tags
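A minimal usage sketch for getPOSTags above, assuming config is a plain dict defined in the same module; the values below are placeholders for a local Stanford tagger install.

config = {
    'POS_MODEL_PATH': 'stanford-postagger-full/models/english-left3words-distsim.tagger',  # assumed
    'POS_JAR_PATH': 'stanford-postagger-full/stanford-postagger.jar',                      # assumed
}
# returns only the tag sequence, one tag per whitespace-separated token
print(getPOSTags('The quick brown fox jumps over the lazy dog'))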
Example #31
def pos_tag(series):
    import nltk
    def rem_mentions_hasht(tweet):
        words = tweet.split()
        relevant_tokens = [w for w in words if '@' not in w and '#' not in w]
        return( " ".join(relevant_tokens))
    
    series = series.apply(lambda tweet: rem_mentions_hasht(tweet))

    from nltk.tag.stanford import StanfordPOSTagger
    import os
    java_path = "C:/Program Files/Java/jre1.8.0_111/bin/java.exe"
    os.environ['JAVAHOME'] = java_path
    
    english_postagger = StanfordPOSTagger(os.getcwd()+'\\stanford-postagger-full-2014-08-27\\models\\english-bidirectional-distsim.tagger'
    , os.getcwd()+'\\stanford-postagger-full-2014-08-27\\stanford-postagger.jar')
    
    return series.apply(lambda a: english_postagger.tag(nltk.word_tokenize(a)))
Example #32
def review_tager(tokenized_reviews):
    st_model_path = r'SPOST/models/english-bidirectional-distsim.tagger'
    st = StanfordPOSTagger(st_model_path, r'SPOST/stanford-postagger.jar')
    results = []
    errors = []
    count = 0

    for review in tokenized_reviews:
        try:
            results.append(st.tag(review))
            count += 1
        except:
            print(count)
            errors.append(count)
            results.append(review)
            count += 1
    print('errors for the following indexes\n', errors)
    return results
Example #33
def get_pos_sentence(sentences_spans,pos_vocab):
    """
    Get POS tags for each sentence. (needed to build end2end system)
    :param start:
    :param end:
    :return:
    """
    #raw_dir_simple = read.read_from_json('test/test_dir_simple')   #### in folder data/
    #raw_dir_simple = read.read_from_json('clinical_data/train_samples1_simples')
    #raw_dir_simple = read.read_from_json('agriculture_data/raw_dir_simple')

    #raw_dir_simple = ["NYT19980206.0466"]
    english_postagger = StanfordPOSTagger(
        StandforParser,    #### in folder data/
        StandforParser_jar) #### in folder data/
    english_postagger.java_options = '-mx8000m'
    pos_sentences = list()

    for sent_span in sentences_spans:
        print(sent_span[0])
        text = nltk.word_tokenize(sent_span[0])
        text_pos = english_postagger.tag(text)   ##### StanfordPOSTagger fails to tag the underscore, see https://github.com/nltk/nltk/issues/1632; if using nltk 3.2.2, change the code "word_tags = tagged_word.strip().split(self._SEPARATOR)" in the parse_output function of nltk/tag/stanford.py to "word_tags = tagged_word.strip().rsplit(self._SEPARATOR, 1)" to handle the underscore issue

        index = 0
        for token in text_pos:
            # if (text[index] != token[0]) and (token[0] == '``' or token[0] == "''"):  ######### deal with the double quotes, in nltk.tokenize treebank.py change the tokenizer for double quotes. Reasons: (double quotes (") are changed to doubled single forward- and backward- quotes (`` and ''))
            #     text_pos[index] = ["\"", "\'\'"]
            if text[index] == token[0] and token[0] == "``"  and text[index] not in sent_span[0]:
                text_pos[index] = ["\"", "``"]
            if text[index] ==token[0] and token[0] == "''"  and text[index] not in sent_span[0]:
                text_pos[index] = ["\"", "\'\'"]
            if text[index] == token[0] and token[0] in ['{','(','['] :
                text_pos[index] = [token[0],"("]
            if text[index] == token[0] and token[0] in ['}',')',']']:
                text_pos[index] = [token[0],")"]
            pos_vocab[token[1]]+=1
            index+=1
        pos_sentences.append(text_pos)
    return pos_sentences,pos_vocab
'''
Created on Mar 11, 2016

@author: zhongzhu
'''
import os

from nltk.parse.stanford import StanfordDependencyParser
from nltk.parse.stanford import StanfordParser
from nltk.tag import StanfordNERTagger
from nltk.tag.stanford import StanfordPOSTagger


st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
st.tag('What is the airspeed of an unladen swallow ?'.split())

st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') 
st.tag('Rami Eid is studying at Stony Brook University in NY'.split())

parser = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
list(parser.raw_parse("the quick brown fox jumps over the lazy dog"))

dep_parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
print [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")]
Example #35
os.environ['STANFORD_MODELS'] = \
'C:/stanford_data/stanford-parser-3.5.2-models.jar'

parser = stanford.StanfordParser(model_path= \
"C:/stanford_data/englishPCFG.ser.gz")


parsed_sentences = parser.raw_parse( \
(my_sentence))

for i in parsed_sentences:
    for k in i:
        print(k)

# GUI
for line in parsed_sentences:
    for sentence in line:
        sentence.draw()

sys.exit()

st = StanfordPOSTagger(r'C:/stanford_data/english-bidirectional-distsim.tagger',r'C:/stanford_data/stanford-postagger.jar')

bobo = st.tag(my_sentence.split())

print(bobo)

for i in bobo:
    print(i)
    
Example #36
class NltkHelper:

	def __init__(self, text):
		reload(sys)  
		sys.setdefaultencoding('utf8')

		self.text = text

		root = os.path.dirname(os.path.realpath(__file__))
		os.environ["STANFORD_PARSER"] = root+"/stanford-postagger/stanford-postagger.jar"
		os.environ["STANFORD_MODELS"] = root+"/stanford-postagger/models/"
		
		
		_path_to_model  = root + '/stanford-postagger/models/english-bidirectional-distsim.tagger'
		_path_to_jar    = root + '/stanford-postagger/stanford-postagger.jar'
		self.stanford   = StanfordPOSTagger(_path_to_model, _path_to_jar)

		self.sentences  = sent_tokenize(text.encode("utf-8"))
		self.words      = word_tokenize(text.encode("utf-8"))
		

		self.tags = self.stringifyTuples(self.stanford.tag( word_tokenize(text.lower()) ))
		#cleanWords = self.cleanWords()
		#self.tags = self.stringifyTuples(self.stanford.tag( cleanWords ))
		#print self.cleanWords()
		
		self.taggedBigrams = self.ngramsAndTags(2) 

		#print self.words
		#print self.cleanWords()
		
		#print "Bigrams --> ", self.taggedBigrams
		#print "Tags --> ", self.findTags()
		
		#print (nouns)
	
	def personal_names(self):
		output = []

		#(('reports', 'NNS'), ('claim', 'VBP'))
		for gram in self.taggedBigrams:
			tag1  = gram[0][1]
			tag2  = gram[1][1]
			word1 = gram[0][0]
			word2 = gram[1][0]

			if self.isPersonalName( tag1 ) and self.isPersonalName( tag2 ):
				output.append( "{0} {1}".format(word1, word2) )
		return output

	def isPersonalName(self, tag):
		return tag == "NNP" or tag == "FW"

	
	def preprocessTitle(self):
		
		output = ''
		for taggedWord in self.tags:
			
			word = taggedWord[0]
			tag  = taggedWord[1]

			if self.isPersonalName(tag):
				output = "{0} {1}".format(output, word.title())
			else:
				output = "{0} {1}".format(output, word.lower())


		return output
	
	def ngramsAndTags(self, n):
		output = []
		for i in range(len(self.tags)-n+1):
			gram = (self.tags[i],)
			for j in range(i+1, i+n):
				gram += ( self.tags[j], )
			output.append( gram )

		return output
	


	def sortFrequencies( self, ngram ):
		return sorted(ngram.items(), key = operator.itemgetter(1), reverse=True)		    
    

	
	def findTags(self):
		#pattern = [("AJ", NOUN/S/FWS), (FW, FW), NOUN, NOUN]
		output = []

		#(('reports', 'NNS'), ('claim', 'VBP'))
		for gram in self.taggedBigrams:
			tag1  = gram[0][1]
			tag2  = gram[1][1]
			word1 = gram[0][0]
			word2 = gram[1][0]

			if self.isAdj( tag1 ) and self.isNounOrForeignWord( tag2 ) or self.isNounOrForeignWord( tag1 ) and self.isNounOrForeignWord( tag2 ):
				output.append( "{0} {1}".format(word1, word2) )
		return output


	def isAdj(self, tag):
		return tag=='JJ'

	def isNounOrForeignWord(self, tag):
		nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
		return tag in nouns

	"""
	def bigramsList(self):		
		pass
	"""
	def stringifyList(self, list):
		output = []
		for tag in list:
			output.append( str(tag.encode('utf-8')) )
		
		return output

	def stringifyTuples(self, tuples):
		output = []
		for tag in tuples:
			output.append( (str(tag[0].encode('utf-8')), str(tag[1].encode('utf-8'))) )
		
		return output


	"""
	returns list of tuples of tagged words in text
	"""
	def analyze(self):
		output = []
		for sentence in self.sentences:
			taggedWords = self.stanford.tag( word_tokenize( sentence.lower() ) )
			output.append(taggedWords)

		return self.stringifyTuples(taggedWords)

	"""
	returns list of nouns and foreign words
	"""
	def filterNounsInText(self):
		output = set()
		nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']

		for sentence in self.sentences:
			taggedWords = self.stanford.tag( word_tokenize(sentence.lower() ) )
			for item in taggedWords:
				if item[1] in nouns:
					output.add( item[0] )
		
		return self.stringifyList( list(output) )

	



	def cleanWords(self):
		input = ''
		for item in self.words:
			input = "{0} {1}".format(input, item)

		input = re.sub('\n+', " ", input)
		input = re.sub('\[[0-9]*\]', "", input)
		input = re.sub(' +', " ", input)
		input = bytes(input)
		input.decode('ascii', 'ignore')

		input = input.split(" ")
		cleanInput = []

		for item in input:
			item = item.strip( string.punctuation )

			if len(item)>1 or (item.lower()=='a' or item.lower()=='i'):
			    cleanInput.append( item )

		return cleanInput



	def bigramNouns(self, text):
		nouns = self.filterNouns(text)		
		
		

	def isTagNounOrForeignWord(self, word):
		output = False
		nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
		taggedWords = self.stanford.tag( [word.lower()] )
		for item in taggedWords:
			if item[1] in nouns:
				output = True
				break
		return output

	@staticmethod
	def filterNouns(self, input):
		output = set()
		nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
		sentences = sent_tokenize(input)
		for sentence in sentences:
			taggedWords = self.stanford.tag( word_tokenize(sentence.lower() ) )
			for item in taggedWords:
				if item[1] in nouns:
					output.add( item[0] )
		nList = list(output)
		return self.stringifyTuples(nList)

	@staticmethod
	def define( self, word ):	

		definitions = []	
		try:
			synsets = wn.synsets(word)
			for synset in synsets:
				definitions.append (synset.definition())
		except ValueError:
			print "Cannot define '{0}'".format(word)

		return definitions

	def sentenceExamples( self, noun):
		output = []
		try:
			synsets = wn.synsets(noun)
			for synset in synsets:
				examples = synset.examples()
				for example in examples:
					output.append( example )
		except (ValueError, AttributeError):
			print "Cannot find any example for '{0}'".format(noun)

		return output
class NLTKHelper(object):
    """docstring for NLTKHelper"""
    def __init__(self, text):
        reload(sys)
        sys.setdefaultencoding('utf8')

        self.text = text

        root = os.path.dirname(os.path.realpath(__file__))
        os.environ["STANFORD_PARSER"] = root+
        os.environ["STANFORD_MODELS"] = root+
        _path_to_model = root + ''
        _path_to_jar = root + ''
        self.stanford = StanfordPOSTagger(_path_to_model, _path_to_jar)
        self.sentences = sent_tokenize(text.encode("utf-8"))
        self.words = word_tokenize(text.encode("utf-8"))

        self.tags = self.stringifyTuples(self.stanford.tag(word_tokenize(text.lower())))
        #cleanWords

        self.taggedBigrams = self.ngramsAndTags(2)
        #print self.words

    def  personal_names(self):
        output = []

        for  gram in self.taggedBigrams:
            tag1  = gram[0][1]
            tag2  = gram[1][1]
            word1 = gram[0][0]
            word2 = gram[1][0]

            if  self.isPersonalName(tag1) and self.isPersonalName(tag2):
                output.append("{0} {1}".format(word1, word2))
        
        return output

    def isPersonalName(self, tag):
        return tag == "NNP" or tag == "FW"

    def preprocessTitle(self):

        output = ''
        for taggedWord in self.tags:
            word = taggedWord[0]
            tag  = taggedWord[1]

            if self.isPersonalName(tag):
                output = "{0} {1}".format(output, word.title())
            else:
                output = "{0} {1}".format(output, word.lower())

        return output

    def ngramsAndTags(self, n):
        output = []
        for i in range(len(self.tags)-n+1):
            gram = (self.tags[i],)
            for j in range(i+1, i+n):
                gram +=(self.tags[j], )
            output.append(gram)
        return output

    def sortFrequencies(self, ngram):
        return sorted(ngram.items(), key = operator.itemgetter(1), reverse=True)

    def findTags(self):
        output = []

        for gram in self.taggedBigrams:
            tag1 = gram[0][1]
            tag2 = gram[1][1]
            word1 = gram[0][0]
            word2 = gram[1][0]

            if self.isAdj( tag1 ) and self.isNounOrForeignWord( tag2 ) or self.isNounOrForeignWord( tag1 ) and self.isNounOrForeignWord( tag2 ):
                output.append( "{0} {1}".format(word1, word2) )
        return output

    def isAdj(self, tag):
        return tag=='JJ'

    def isNounOrForeignWord(self, tag):
        nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
        return tag in nouns

    def stringifyList(self, list):
        output = []
        for tag in list:
            output.append( str(tag.encode('utf-8')) )
        
        return output

    def stringifyTuples(self, tuples):
        output = []
        for tag in tuples:
            output.append( (str(tag[0].encode('utf-8')), str(tag[1].encode('utf-8'))) )
        
        return output

    #returns list of tuples of tagged words in text
    def analyze(self):
        output = []
        for sentence in self.sentences:
            taggedWords = self.stanford.tag(word_tokenize(sentence.lower()))
            output.append(taggedWords)

        return self.stringifyTuples(taggedWords)

    def filterNounsInText(self):
        output = set()
        nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']

        for sentence in self.sentences:
            taggedWords = self.stanford.tag( word_tokenize(sentence.lower() ) )
            for item in taggedWords:
                if item[1] in nouns:
                    output.add( item[0] )
        
        return self.stringifyList( list(output) )

    def cleanWords(self):
        input = ''
        for item in self.words:
            input = "{0} {1}".format(input, item)

        input = re.sub('\n+', " ", input)
        input = re.sub('\[[0-9]*\]', "", input)
        input = re.sub(' +', " ", input)
        input = bytes(input)
        input.decode('ascii', 'ignore')

        input = input.split(" ")
        cleanInput = []

        for item in input:
            item = item.strip( string.punctuation )

            if len(item)>1 or (item.lower()=='a' or item.lower()=='i'):
                cleanInput.append( item )

        return cleanInput

    def bigramNouns(self, text):
        nouns = self.filterNouns(text)

    def isTagNounOrForeignWord(self, word):
        output = False
        nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
        taggedWords = self.stanford.tag( [word.lower()] )
        for item in taggedWords:
            if item[1] in nouns:
                output = True
                break
        return output

    @staticmethod
    def filterNouns(self, input):
        output = set()
        nouns = ['NN', 'NNS', 'NNPS', 'FW']
        sentences = sent_tokenize(input)
        for sentence in sentences:
            taggedWords = self.stanford.tag( word_tokenize(sentence.lower() ) )
            for item in taggedWords:
                if item[1] in nouns:
                    output.add( item[0] )
        nList = list(output)
        return self.stringifyTuples(nList)

    @staticmethod
    def define(self, word):

        definitions = []
        try:
            synsets = wn.synsets(word)
            for synset in synsets:
                definitions.append(synset.definition())
        except ValueError:
            print "Cannot define '{0}'".format(word)

        return definitions
Example #38
        wordsSplit = sent1.split(" ")

        ## Feature 1:  Sentence Length
        length=len(wordsSplit)
        class_arrays.append(length)

        for f in range(0,length):
                wordsClean=wordsSplit[f]
                if "&_" in wordsClean:
                        target1=wordsClean.translate(string.maketrans("",""), string.punctuation)
                        wordsSplit[f]=wordsClean
                        break

        ## choose tagger and tag sentence
        sentClean=str(wordsSplit)
        sentTagged=st.tag(sentClean)


        ## Feature 2: Completeness (capital word initial pos, punct. mark final)
        if wordsSplit[0][0].isupper() and (sent1.endswith(".") or sent1.endswith("!") or sent1.endswith("?")) :
                comp=1
        class_arrays.append(comp)        

        ## Feature 5: Complexity (Stanford): how deeply embedded is the sentence?
        ##parse sentence with the Stanford parser
        parse=list(parser.raw_parse(sentClean.decode("utf-8")))
        sentParse=str(parse).split(" ")
        for i in range(0, len(sentParse)):
                if "Tree('S'" in sentParse[i]:
                        complexity=complexity+1
Example #39
class SenticParser:
	def __init__(self):
		self.st = StanfordPOSTagger('stanford-postagger/models/english-bidirectional-distsim.tagger', 'stanford-postagger/stanford-postagger.jar')

	def TaggedSentenceSlashForm(self, sentence ):

		#print sentence.split()
		Tagged = self.st.tag(sentence.split())

		TaggedSentence = ""
		for i in Tagged:
			TaggedSentence = TaggedSentence+"/".join(i)+" "


		#print TaggedSentence
		return TaggedSentence


	def TaggedSentence(self, sentence ):
		Tagged = self.st.tag(sentence.split())
		return Tagged


	def FindStemmedVerb(self, word):
		st = LancasterStemmer()
		StemmedVerb = st.stem(word)
		
		dic = enchant.Dict("en_US")
		if( dic.check(StemmedVerb) ):
			return StemmedVerb
		else:
			return StemmedVerb+"e"			
	

	def FindSplit(self, sentence, TaggedSentence):
		TokenizedSentence = nltk.word_tokenize(sentence)

		SplitList = []
		SentAdded = ""
		split = 0 

		#print TaggedSentence

		for i in range(len(TaggedSentence)):
			if TaggedSentence[i][1].startswith("VB"):
				SplitList.append(SentAdded)
				try:
					if (TaggedSentence[i+1][1].startswith("VB")):
						SentAdded = ""
					else:
						SplitList.append(SentAdded)
						SentAdded = TaggedSentence[i][0]+" "
					#	print "split"
				except:
					SplitList.append(TaggedSentence[i][0]) 
				
			else:
				#print SentAdded
				SentAdded = SentAdded + TokenizedSentence[i] + " "
							
		SplitList.append(SentAdded)		
	

		Str_list = filter(None, SplitList)
		Str_list = list(set(Str_list))

		'''
		for i in range(len(Str_list)):
			Str_list[i] = Str_list[i][:-1].translate(string.maketrans("",""), string.punctuation)
		'''
		return Str_list
from nltk import pos_tag,word_tokenize
#from Utils import getQues
#txt=getQues()
#txt="benim adim yahya"
from nltk.tag.stanford import StanfordPOSTagger
txt="i am dentist"
tgr=StanfordPOSTagger('models/english-bidirectional-distsim.tagger','stanford-postagger.jar')
print  tgr.tag(word_tokenize(txt))
Example #41
class Parser:

    def __init__(self):
        self.MatchList = []
        self.ConceptMatches = []
        self.st = StanfordPOSTagger('stanford-postagger/models/english-bidirectional-distsim.tagger', 'stanford-postagger/stanford-postagger.jar')


    def SyntacticMatch(self, concept1, concept2 ):                      # Checks for syntactic similarity. Checks for matching words between two concepts. 
	TaggedConcept1 = self.st.tag(nltk.word_tokenize(concept1))
	TaggedConcept2 = self.st.tag(nltk.word_tokenize(concept2))     

	print TaggedConcept1
	print TaggedConcept2

	flag = 0 

	for i in TaggedConcept1:
		for j in TaggedConcept2:
			if (i == j):
				if i[1].startswith("NN"):
					flag = 1
	

	if ( flag == 1):
		return True
	else:
		return False






    def FindBigrams(self, concept):                                      # Finds All Bigrams associated with the concept
        #sentence = concept.split(" ")                     	         # Splits the Given concept into Bigrams     e.g) "a very special christmas gift" gets split as ["a very", "very special", "special 																	christmas", "christmas gift"]

	sentence = self.st.tag(nltk.word_tokenize(concept))        
 
	print sentence

	Bigrams = []										
  
	for i in range(len(sentence) - 1):
            if ( sentence[i][1] == "JJ"  and sentence[i+1][0] in stopwords.words('english') ):		# If the bigram is [ adj + stopword ] , ignore
                continue 									           # bigrams like "a very" are ignored
	    
	    elif ( sentence[i][0] in stopwords.words('english')  and sentence[i+1][0] in stopwords.words('english') ):		# If the bigram is [ adj + stopword ] , ignore
                continue              


            elif ( sentence[i+1][1] == "JJ"  and sentence[i][0] in stopwords.words('english') ):            # If the bigram is [ stopword + adj ] , ignore 
                continue									           # bigrams like "amazingly a" is ignored

            elif ( sentence[i][1] == "JJ" and sentence[i+1][1].startswith("NN") ):                       # If the bigram is [ adj + concept ] , then include [adj + concept] and [concept] to the list
                Bigrams.append(sentence[i+1][0])						 # e.g) "special christmas" --> concepts extracted will be "special christmas" and "christmas" are added
                Bigrams.append(sentence[i][0]+" "+ sentence[i+1][0])
                
            elif ( sentence[i][0] in stopwords.words("english") and sentence[i+1][1].startswith("NN") ):       # If the bigram is [ stopword + concept ], then inlcude only the concept w/ and w/o the concept 
                    Bigrams.append(sentence[i+1][0])                                                                 # e.g) "the christmas" --> concepts that will be extracted is "christmas" , "the christmas"
		    Bigrams.append(sentence[i][0]+" "+ sentence[i+1][0])						          
           
	    elif ( sentence[i][1].startswith("NN") and sentence[i+1][1] == "JJ" ):							       # If the bigram ends with adjective , then ignore the adjective. 
                Bigrams.append(sentence[i][0])    							              # e.g) "present amazing" --> concept that will be extracted is "present"
                
            elif ( sentence[i][1].startswith("NN") and sentence[i+1][0] in stopwords.words("english")):					# If the bigram ends with a stopword , then ignore the stopword
                    Bigrams.append(sentence[i][0])							              # e.g) "christmas the" --> concept that will be extracted is "christmas"
             
            else:	
                Bigrams.append(sentence[i][0]+ " "+ sentence[i+1][0])
                   
                     
        print Bigrams

        return Bigrams
Example #42
##############################
#  Tokenizing the text
##############################

tokens = nltk.word_tokenize(text, language='german')
#print(tokens)
sentence_tokens = nltk.sent_tokenize(text, language='german')
#print (sentence_tokens)

# Pick a random practice sentence
randSentence = sentence_tokens[randint(0,len(sentence_tokens))]
randSentenceTokens = nltk.word_tokenize(randSentence, language='german')

# Pick the word type to practice
pos_sentence = st.tag(randSentence.split())
#print(pos_sentence)
response = input("Welche Wortart wollen Sie trainieren? (Verb, Nomen, Adjektiv, Artikel)")
if response == "Verb":
 picked_wordtype= "VAFIN"
elif response == "Nomen":
 picked_wordtype = "NN"
elif response == "Adjektiv":
 picked_wordtype = "ADJA"
elif response == "Artikel":
 picked_wordtype = "ART"
else:
 print("Nicht zugelassen")

# Find the selected word type in the practice sentence
temp = list()
Example #43
# -*- coding: utf-8 -*-
import nltk
from nltk.tag.stanford import StanfordPOSTagger
from nltk.tokenize import word_tokenize

# The path where you downloaded and unzipped the full Stanford POS tagger.
sp_dir = '/home/sarah/postagger/'
english_model = sp_dir + 'models/english-bidirectional-distsim.tagger'
chinese_model = sp_dir + 'models/chinese-distsim.tagger'
jar_path = sp_dir + 'stanford-postagger.jar'

#testing the english POS tagger
print "For the English model"
st_eng = StanfordPOSTagger(model_filename = english_model, path_to_jar = jar_path)
eng_sent = 'This is Stanford postagger in nltk for Python users.'
print eng_sent
eng_tokens = word_tokenize(eng_sent)
eng_tagged = st_eng.tag(eng_tokens)
for i in eng_tagged:
	print i

#testing for the chinese POS tagger
print "\n\nFor the Chinese model"
st_chi = StanfordPOSTagger(model_filename = chinese_model, path_to_jar = jar_path,encoding = 'utf-8')
chi_sent = '这 是 在 Python 环境 中 使用 斯坦福 词性 标 器'
print chi_sent
chi_tokens = word_tokenize(chi_sent)
chi_tagged = st_chi.tag(chi_tokens)
for i in chi_tagged:
	print i
#print st_chi.tag('这 是 在 Python 环境 中 使用 斯坦福 词性 标 器'.split())
示例#44
0
class Parser(object):
    modeldir = os.path.abspath(BASE_DIR + "/weiss/planner/models/")
    stopword_path = modeldir + "/english.stp"

    def __init__(self):
        self._postagger = StanfordPOSTagger(self.modeldir + '/postagger/models/english-bidirectional-distsim.tagger',
                                           self.modeldir + '/postagger/stanford-postagger.jar')
        self._stemmer = nltk.SnowballStemmer("english")
        self._stopwords = stopword(self.stopword_path)
        self._type_words = self._set_type_words()
        self._sentiment = self._get_sentiment()


    def _get_sentiment(self):
        sentiment = {}
        for line in open(self.modeldir + "/AFINN.txt"):
            word, score = line.split('\t')
            sentiment[word] = int(score)
        return sentiment

    def calculate_sentiment(self, query):
        tokens = nltk.word_tokenize(query)
        score = 0
        for token in tokens:
            if token in self._sentiment:
                score += self._sentiment[token]
        return score

    def entity_recognition(self, query, arguments):
        """Parse query and extract keywords

        This function is called in planner

        Args:
            query: query needs to be parsed
            arguments: info needs to be updated
        """
        tokens = nltk.word_tokenize(query)
        tags = self._postagger.tag(tokens)

        tuples = []

        for tag in tags:
            if tag[0] in self._stopwords:
                continue
            stemmed = self._stemmer.stem(tag[0])
            if stemmed in self._type_words['movie']:
                continue
            if stemmed in self._type_words['article']:
                continue
            if stemmed in self._type_words['restaurant']:
                continue
            if tag[1][:2] == 'NN' or tag[1][:2] == 'JJ':
                tuples.append(tag[0])

        if len(tuples) > 0:
            arguments['keywords'] = tuples
            logger.info("Here are the keywords: %s" % arguments['keywords'])

    def _set_type_words(self):
        """Initialize synonymy words of movie, article and restaurant

        This function is called during initialization

        Return: A dictionary with keys 'movie', 'article', 'restaurant' and their stemmed synonyms as values
        """
        topic = {}
        movie = ['cinema', 'show', 'film', 'picture', 'cinematograph',
                 'videotape', 'flick', 'pic', 'cine', 'cinematics', 'photodrama',
                 'photoplay', 'talkie', 'flicker', 'DVD', 'movie']
        article = ['report', 'announcement', 'story', 'account',
                   'newscast', 'headlines', 'press', 'communication', 'talk', 'word',
                   'communique', 'bulletin', 'message', 'dispatch', 'broadcast',
                   'statement', 'intelligence', 'disclosure', 'revelation',
                   'gossip', 'dispatch', 'news', 'article']
        restaurant = ['bar', 'cafeteria', 'diner', 'dining', 'saloon', 'coffeehouse',
                      'canteen', 'chophouse', 'drive-in', 'eatery', 'grill', 'lunchroom', 'inn', 'food',
                      'pizzeria', 'hideaway', 'cafe', 'charcuterie', 'deli', 'restaurant']
        for m in movie:
            topic.setdefault('movie', set([]))
            topic['movie'].add(self._stemmer.stem(m))
        for a in article:
            topic.setdefault('article', set([]))
            topic['article'].add(self._stemmer.stem(a))
        for r in restaurant:
            topic.setdefault('restaurant', set([]))
            topic['restaurant'].add(self._stemmer.stem(r))
        return topic


    def type_recognition(self, query, arguments):
        """Identity the type of the topic: movie, article or restaurant

        This is called in planner

        Args:
            query: query needs to be parsed
            arguments: info needs to be updated

        """
        tokens = nltk.word_tokenize(query)
        first = self._stemmer.stem(tokens[0])
        last = self._stemmer.stem(tokens[-1])
        lastsecond = self._stemmer.stem(tokens[-2]) if len(tokens) > 1 else "toy"
        if (first in self._type_words['article'] or last in self._type_words['article']
            or lastsecond in self._type_words['article']):
            arguments['tid'] = Type.News
        elif (first in self._type_words['restaurant'] or last in self._type_words['restaurant']
              or lastsecond in self._type_words['restaurant']):
            arguments['tid'] = Type.Restaurant
        elif (first in self._type_words['movie'] or last in self._type_words['movie']
              or lastsecond in self._type_words['movie']):
            arguments['tid'] = Type.Movie
        else:
            arguments['tid'] = Type.Unknown


    @staticmethod
    def _string_to_idx(number):
        if number == 'first' or number == 'one':
            return 0
        if number == 'second' or number == 'two':
            return 1
        if number == 'third' or number == 'three':
            return 2
        if number == 'fourth' or number == 'four':
            return 3
        if number == 'fifth' or number == 'five':
            return 4


    @staticmethod
    def keyword_matching(arguments, entities):
        words = arguments['keywords']
        phonics = set([])
        overlap = []

        for w in words:
            phonics.add(fuzzy.nysiis(w))

        for i in xrange(0, len(entities)):
            entity_name = nltk.word_tokenize(entities[i].name)
            entity_phonics = set([])
            for word in entity_name:
                entity_phonics.add(fuzzy.nysiis(word))
            common = float(len(phonics & entity_phonics)) / len(entity_phonics)  # fractional overlap; float() guards against Python 2 integer division
            if common == 1:
                arguments['idx'] = i
                return
            overlap.append(common)
        arguments['idx'] = overlap.index(max(overlap))


    def find_number(self, query, arguments, entities):
        tokens = nltk.word_tokenize(query)
        tags = self._postagger.tag(tokens)
        last = query.find('last')

        # Edge case, "first" cannot be tagged correctly
        if len(query.split(" ")) <= 3 and query.find('first') != -1:
            arguments['idx'] = 0
            return 

        number = None
        for t in tags:
            if t[1] == 'JJ' and t[0][-2:] in set(['th', 'nd', 'st', 'rd']):
                number = t[0]
                break
            elif t[1] == 'CD' and t[0]:
                number = t[0]
                if number.isdigit() and int(number) < 6:
                    arguments['idx'] = int(number) - 1
                    return
                break

        if number is not None:
            if last == -1:
                arguments['idx'] = self._string_to_idx(number)
            else:
                arguments['idx'] = len(entities) - self._string_to_idx(number) - 1
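
# --- A brief usage sketch for the Parser class above (an illustration, not part of the
# --- original module): it assumes the tagger models, english.stp and AFINN.txt referenced
# --- in __init__ are available, and that Type and logger come from the surrounding code.
parser = Parser()
arguments = {}

parser.type_recognition("find me a cheap italian restaurant", arguments)
parser.entity_recognition("find me a cheap italian restaurant", arguments)

print(arguments.get('tid'))       # expected to be Type.Restaurant for this query
print(arguments.get('keywords'))  # nouns/adjectives left after stopword and type-word filtering
print(parser.calculate_sentiment("the food was terrible"))  # a negative AFINN score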
示例#45
0
from nltk.tag.stanford import StanfordPOSTagger
import nltk
import os

os.environ['CLASSPATH'] = "/home/vishesh/Downloads/stanford-postagger-full-2015-12-09/"

english_postagger = StanfordPOSTagger('models/english-bidirectional-distsim.tagger')

print english_postagger.tag(nltk.word_tokenize('this is stanford postagger in nltk for python users'))


fo = open('europarl-v7.de-en.de','r')
data = fo.read()
fo.close()

fw = open('europarl_tags_testing.txt','w')

data = data.decode('utf-8')
data = data.split('\n')

#tokens = data.split()
#print len(tokens)

#print 'Tagging...'

german_postagger = StanfordPOSTagger('/home/vishesh/Documents/NLP/postagger/models/german-fast-caseless.tagger')
for i in range(10000,11500):
	tokens = nltk.word_tokenize(data[i])
	
	tags = german_postagger.tag(tokens)
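	# --- Hypothetical continuation (not the original code): one plausible way to finish
	# --- the loop is to write each tagged sentence to the output file as word_TAG pairs.
	fw.write(' '.join(w + '_' + t for w, t in tags).encode('utf-8') + '\n')

fw.close()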