Example #1
def vectorizer(tokens, w2v_db):
    db_path = w2v_db
    # POS TAGGING
    tagger = POSTagger('tagger/english-left3words-distsim.tagger', 'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)
    unsorted_kw = OrderedDict()
    for (w,t) in tagged_tokens:
        if t in ['NNP', 'NNPS', 'FW']:
            label = 1.5
        elif t in ['NN', 'NNS']:
            label = 1
        else:
            continue
        w = w.lower()
        try:
            unsorted_kw[w] += label
        except KeyError:
            unsorted_kw[w] = label
    # Get the vectors of words. Maintain order as in document.
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    words = (word.lower() for word in unsorted_kw)
    for word in words:
        try:
            if token_vecs[word]: continue
        except KeyError:
            v = conn.read(word)
            if v is not None:
                token_vecs[word] = list(v)
    print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs))) #Output for debugging; total vs unique words.
    conn.close()
    return unsorted_kw, token_vecs
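
A minimal usage sketch for the interface all of these examples target (the model and jar paths are placeholders for your own Stanford POS Tagger download; note that in later NLTK releases, 3.1+, the class was renamed StanfordPOSTagger):

# placeholder paths: point these at your own Stanford tagger install
from nltk.tag.stanford import POSTagger  # StanfordPOSTagger in NLTK >= 3.1

st = POSTagger('models/english-left3words-distsim.tagger',
               'stanford-postagger.jar', encoding='utf8')
print(st.tag('What is the airspeed of an unladen swallow ?'.split()))
# [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ...]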
Example #2
def nltk_stanfordpos(inpath, outfolder):
    """POS-Tagging French text with Stanford POS-Tagger via NLTK."""
    print("\nLaunched nltk_stanfordpos.")

    import os
    import glob
    from nltk.tag.stanford import POSTagger

    st = POSTagger(
        '/home/christof/Programs/stanfordpos/models/french.tagger',
        '/home/christof/Programs/stanfordpos/stanford-postagger.jar',
        encoding="utf8")
    for file in glob.glob(inpath):
        with open(file, "r", encoding="utf-8") as infile:
            untagged = infile.read()
            tagged = st.tag(untagged.split())

            taggedstring = ""
            for item in tagged:
                item = "\t".join(item)
                taggedstring = taggedstring + str(item) + "\n"
            #print(taggedstring)

            basename = os.path.basename(file)
            cleanfilename = basename
            if not os.path.exists(outfolder):
                os.makedirs(outfolder)
            with open(os.path.join(outfolder, cleanfilename), "w", encoding="utf-8") as output:
                output.write(taggedstring)
    print("Done.")
Example #3
def main():

    st = POSTagger(
        "/home/shaun/stanford-postagger-full-2013-11-12/models/german-dewac.tagger",
        "/home/shaun/stanford-postagger-full-2013-11-12/stanford-postagger.jar",
    )

    # st = POSTagger("/home/shaun/stanford-postagger-full-2013-11-12/models/german-fast.tagger", \
    # "/home/shaun/stanford-postagger-full-2013-11-12/stanford-postagger.jar")

    # print st.tag("Die Kinder in Bayern haben lange Ferien".split())

    # return

    with open(sys.argv[1], "r") as f:
        content = f.read()

    sentences = re.split("\n|\.|\?", content)

    for s in sentences:
        if len(s) == 0:
            continue
        # print s
        pieces = st.tag(s.split())
        strippedPieces = stripPieces(pieces)

        print " ".join(strippedPieces)
Example #4
def stanford_corenlp_filter(sent):
    from nltk.tag.stanford import POSTagger
    posTagger = POSTagger(
        '/Users/gt/Downloads/'
        'stanford-postagger-2013-06-20/models/'
        'wsj-0-18-bidirectional-nodistsim.tagger',
        '/Users/gt/Downloads/stanford-postagger-2013-06-20'
        '/stanford-postagger-3.2.0.jar',
        encoding=encoding)

    b1, b2 = sent.split(blockSeparator)
    b2 = b2.rstrip()

    b1 = b1.lower()
    tokens = word_tokenize(b1)
    pos_tags = posTagger.tag(tokens)
    filtered_sent = ' '
    for pos_t in pos_tags:
        if pos_t[1] in filterList:
            # filtered_sent += stemmer.stem(pos_t[0]) + ' '
            filtered_sent += '1' + stemmer.stem(pos_t[0]) + ' '

            #note: 1 concat stemmer(word) == stemmer(1 concat word)

    b2 = b2.lower()
    tokens = word_tokenize(b2)
    pos_tags = posTagger.tag(tokens)
    # do not reset filtered_sent here; block 1's words would be lost
    for pos_t in pos_tags:
        if pos_t[1] in filterList:
            # filtered_sent += stemmer.stem(pos_t[0]) + ' '
            filtered_sent += '2' + stemmer.stem(pos_t[0]) + ' '

    return filtered_sent
Example #5
def cleanTokens(tokens):

    st = POSTagger('/models/german-fast.tagger')

    tags = st.tag(tokens)

    def cleanTags(x):
        y = x[1]
        return bool(re.match("NE|NN", y)) and len(x[0]) > 3

    clean_tags = list(filter(cleanTags, tags))

    def buildSentens(arr):
        words = []
        for i in arr:
            words.append(i[0])
        return words

    clean = buildSentens(clean_tags)

    return clean
Example #6
def postext_st(filename):
    # Opening of File
    path_to_raw = '/home/cyneo/Work/Scans/Text Version/'

    if not isinstance(filename, str):
        raise TypeError('Filename must be a string')

    # Preparing to Tokenize
    with open(osp.abspath(path_to_raw + filename + '.txt'),
              'r', encoding='utf8') as raw:
        # Initialize the punkt module
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        sents = []

        for line in raw:
            sents.extend(sent_detector.tokenize(line.strip()))
    
    tokenedsents = []
    # Tokenizing
    from nltk.tokenize.stanford import StanfordTokenizer
    tokenizer = StanfordTokenizer()
    for line in sents:
        tokenedsents.append(tokenizer.tokenize(line))

    # Parts of Speech Tagging
    posSents = []
    from nltk.tag.stanford import POSTagger
    st = POSTagger('/mnt/sda2/stanford-packages/stanford-postagger-2014-10-26/models/english-bidirectional-distsim.tagger',
                   encoding='utf8')

    for line in tokenedsents:
        # Returns a list of a list of tuples
        posSents.append(st.tag(line))

    return posSents
Example #9
def createModel():
    global classifierit
    global classifierloose
    global classifieryou
    global classifierto
    global classifiertheir
    trainingitSet = []
    traininglooseSet = []
    trainingyouSet = []
    trainingtoSet = []
    trainingtheirSet= []
    st = POSTagger('/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/models/english-bidirectional-distsim.tagger', '/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/stanford-postagger.jar')
    for line in brown.sents():
        print line
        tagSent = st.tag(line)
        print tagSent
        arrayOfitFeature = pos_itfeatures(tagSent)
        arrayOfyouFeature = pos_youfeatures(tagSent)
        arrayOftheirFeature = pos_theirfeatures(tagSent)
        arrayOflooseFeature = pos_loosefeatures(tagSent)
        arrayOftoFeature = pos_tofeatures(tagSent)
        if arrayOfitFeature:
            trainingitSet.extend(arrayOfitFeature)
        if arrayOftheirFeature:
            trainingtheirSet.extend(arrayOftheirFeature)
        if arrayOflooseFeature:
            traininglooseSet.extend(arrayOflooseFeature)
        if arrayOftoFeature:
            trainingtoSet.extend(arrayOftoFeature)
        if arrayOfyouFeature:
            trainingyouSet.extend(arrayOfyouFeature)

    algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[1]
    #encodingit = maxent.TypedMaxentFeatureEncoding.train(trainingitSet, count_cutoff=3, alwayson_features=True)
    classifierit = maxent.MaxentClassifier.train(trainingitSet, algorithm)
    f = open('classifierit.pickle', 'wb')
    pickle.dump(classifierit, f)
    f.close()
    #encodingloose = maxent.TypedMaxentFeatureEncoding.train(traininglooseSet, count_cutoff=3, alwayson_features=True)
    classifierloose = maxent.MaxentClassifier.train(traininglooseSet, algorithm)
    f = open('classifierloose.pickle', 'wb')
    pickle.dump(classifierloose, f)
    f.close()
    #encodingyou = maxent.TypedMaxentFeatureEncoding.train(trainingyouSet, count_cutoff=3, alwayson_features=True)
    classifieryou = maxent.MaxentClassifier.train(trainingyouSet, algorithm)
    f = open('classifieryou.pickle', 'wb')
    pickle.dump(classifieryou, f)
    f.close()
    #encodingto = maxent.TypedMaxentFeatureEncoding.train(trainingtoSet, count_cutoff=3, alwayson_features=True)
    classifierto = maxent.MaxentClassifier.train(trainingtoSet, algorithm)
    f = open('classifierto.pickle', 'wb')
    pickle.dump(classifierto, f)
    f.close()
    #encodingtheir = maxent.TypedMaxentFeatureEncoding.train(trainingtheirSet, count_cutoff=3, alwayson_features=True)
    classifiertheir = maxent.MaxentClassifier.train(trainingtheirSet, algorithm)
    f = open('classifiertheir.pickle', 'wb')
    pickle.dump(classifiertheir, f)
    f.close()      
Example #10
    def __init__(self):
        self.st = POSTagger(
            os.path.normpath(
                os.path.dirname(os.path.realpath(__file__)) +
                '/stanford-pos/models/english-bidirectional-distsim.tagger'),
            os.path.normpath(
                os.path.dirname(os.path.realpath(__file__)) +
                '/stanford-pos/stanford-postagger.jar'))
Example #11
def stanford_tag(sentence):
    '''Use the Stanford tagger to tag a single tokenized sentence.
    '''
    import src.experiment.path as path
    tagger = POSTagger(path.stanford_tagger_model_path(),
                       path.stanford_tagger_path(),
                       java_options='-Xmx16g -XX:MaxPermSize=256m')
    return tagger.tag(sentence)
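
The java_options argument above asks for a 16 GB heap; -XX:MaxPermSize is ignored (with a warning) on Java 8 and later. A lighter-weight sketch of the same call, assuming the same path helpers:

tagger = POSTagger(path.stanford_tagger_model_path(),
                   path.stanford_tagger_path(),
                   java_options='-Xmx2g')  # 2 GB heap; enough for most models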
Example #12
def tag(segments):
    #st = POSTagger('/home/dc65/Documents/tools/stanford-postagger-2014-01-04/models/english-left3words-distsim.tagger', '/home/dc65/Documents/tools/stanford-postagger-2014-01-04/stanford-postagger-3.3.1.jar')
    st = POSTagger(os.path.join(stanford_path, 'models/english-left3words-distsim.tagger'),
                   os.path.join(stanford_path, 'stanford-postagger-3.3.1.jar'))
    tagged = []
    for segment in segments:
        x = ' '.join(nltk.tag.tuple2str(w) for w in st.tag(word_tokenize(segment)))
        tagged.append(x.decode('utf-8'))
    return tagged
Example #13
def spanish_pos(text):
	""" Parts of speech tagger for Spanish """
	
	text = text.encode('utf8')

	st = POSTagger('/Users/Lena/src/context/stanford-postagger/models/spanish-distsim.tagger', 
				'/Users/Lena/src/context/stanford-postagger/stanford-postagger.jar', 'utf8')

	pos_tagged = st.tag(text.split())

	return pos_tagged  
Example #14
def processor(name, url, tokens, db_path, json_dir, USE_TITLE_WORDS=False):
    # POS TAGGING
    tagger = POSTagger('tagger/english-left3words-distsim.tagger',
                       'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)

    unsorted_kw = OrderedDict()
    for (w, t) in tagged_tokens:
        if t in ['NNP', 'NNPS', 'FW']:
            label = 1.5
        elif t in ['NN', 'NNS']:
            label = 1
        else:
            continue
        w = w.lower()
        try:
            unsorted_kw[w] += label
        except KeyError:
            unsorted_kw[w] = label

    # Get the vectors list
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    words = (word.lower() for word in unsorted_kw)
    for word in words:
        try:
            if token_vecs[word]: continue
        except KeyError:
            v = conn.read(word)
            if v is not None:
                token_vecs[word] = list(v)
    print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs)))
    conn.close()

    #Compute cluster centers:
    nk = round(len(token_vecs) / 4)
    data = numpy.array(list(token_vecs.values()))
    cent, _ = kmeans2(data, nk, iter=20, minit='points')
    centroids = cent.tolist()

    # Create the JSON object for this webpage.

    if not os.path.exists(json_dir):
        os.makedirs(json_dir)
    json_path = os.path.join(json_dir, name + '.json')
    with open(json_path, 'w') as file_dest:
        json.dump(
            {
                'url': url,
                'vectors': token_vecs,
                'keyword_frequency': unsorted_kw,
                'centroids': centroids
            }, file_dest)
Example #15
def german_pos(text):
	""" Parts of speech tagger for German """
	
	text = text.encode('utf8')

	st = POSTagger('/Users/Lena/src/context/stanford-postagger/models/german-fast.tagger', 
				'/Users/Lena/src/context/stanford-postagger/stanford-postagger.jar', 'utf8')

	pos_tagged = st.tag(text.split())

	return pos_tagged  
Example #16
def stanford_batch_tag(sentences):
    '''use stanford tagger to batch tag a list of tokenized
    sentences
    '''
    import src.experiment.path as path
    # replace the model path and tagger path of the Stanford tagger with
    # the paths on your machine (two helper functions are used here; you
    # can hard-code the paths if you like)
    tagger = POSTagger(path.stanford_tagger_model_path(),
                       path.stanford_tagger_path())
    return tagger.batch_tag(sentences)
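
In newer NLTK releases (3.0+) the batch_* methods were renamed, and batch_tag became tag_sents. A version-tolerant sketch of the same helper, under the same path-module assumption:

def stanford_batch_tag_compat(sentences):
    import src.experiment.path as path
    tagger = POSTagger(path.stanford_tagger_model_path(),
                       path.stanford_tagger_path())
    # tag_sents replaced batch_tag in NLTK 3.0; support both spellings
    if hasattr(tagger, 'tag_sents'):
        return tagger.tag_sents(sentences)
    return tagger.batch_tag(sentences)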
Example #17
def pos_tag(texts):

    from nltk.tag.stanford import POSTagger
    
    jar = config.mainpath + "analyze/SPOS/stanford-postagger.jar"
    if language == "german":
        model = config.mainpath + "analyze/SPOS/models/german-fast.tagger"
    elif language == "english":
        model = config.mainpath + "analyze/SPOS/models/english-bidirectional-distsim.tagger"
    else:
        raise ValueError("unsupported language: " + language)
    tagger = POSTagger(model, path_to_jar=jar, encoding="UTF-8")

    return tagger.tag_sents(texts)
Example #18
def pos_tag(to_tag,
            model_path=root_path +
            "\\stanford-postagger-full-2013-06-20\\models\\french.tagger",
            jar_path=root_path +
            "\\stanford-postagger-full-2013-06-20\\stanford-postagger.jar"):
    '''Tag the tokens with part of speech; to_tag is the tokenized text; model_path is the file path to the Stanford POS tagger model; and jar_path the path to the Stanford POS tagger jar file'''
    pos_tagger = POSTagger(
        model_path, jar_path, encoding='utf8'
    )  # create an object of class POSTagger that uses UTF-8 encoding
    tags = pos_tagger.tag(
        to_tag)  # run the tagging algorithm on the tokenized raw text
    return tags
Example #20
    def __init__(self, pathToParser=None, javaHeapOptions='-Xmx4g -XX:+UseParallelGC -XX:-UseGCOverheadLimit'):

        if pathToParser is None:
            taggerLibraryPath = normpath(os.path.join(os.getcwd(), "sp/jar/stanford-postagger.jar"))
            taggerModelPath = normpath(os.path.join(os.getcwd(), "sp/models/english-bidirectional-distsim.tagger"))
        else:
            taggerLibraryPath = normpath(os.path.join(pathToParser, "sp/jar/stanford-postagger.jar"))
            taggerModelPath = normpath(os.path.join(pathToParser, "sp/models/english-bidirectional-distsim.tagger"))

        self.stanfordTagger = POSTagger(taggerModelPath,
                                        taggerLibraryPath,
                                        java_options=javaHeapOptions)
Example #21
def main():

    print "Inicio..."
    with open("tweets_a_procesar_v2.csv", 'rb') as csvfile:
        lines = csv.reader(csvfile, delimiter=DELIMITER, quotechar="'")
        # all the tweets are collected in this variable
        tweets = []
        for line in lines:
            tweet = Tweet(line)
            #print tweet.spanish_text.split()
            tweets.append(tweet)
        
    # output file
    output = open("output_tagged_v2.csv", 'wb')
    filewriter = csv.writer(output, delimiter=DELIMITER, quotechar="'")

    # import the Stanford NLP Spanish tagger
    from nltk.tag.stanford import POSTagger
    st = POSTagger('/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/models/spanish-distsim.tagger','/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/stanford-postagger-3.4.1.jar',encoding='utf-8')
    #st = POSTagger('/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/models/spanish.tagger','/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/stanford-postagger-3.4.1.jar',encoding='utf-8')
    #st = POSTagger('C:\Data\stanford-postagger-full-2014-08-27\models\spanish.tagger', 'C:\Data\stanford-postagger-full-2014-08-27\stanford-postagger-3.4.1.jar', encoding='utf-8')

    n=0
    for tweet in tweets:
        n+=1
        print tweet.spanish_text
        # example: st.tag('What is the airspeed of an unladen swallow ?'.split())
        tweet_tagged = st.tag((tweet.spanish_text).split())
        # example output: [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
        #print tweet_tagged

        important_words = []
        n_adj = 0
        for tag in tweet_tagged:
            inicial = tag[1][:1]
            if inicial in ('a', 'r', 'n', 'v'):
                important_words.append(tag[0])

        #tweet.cant_adj = n_adj
        tweet.tweet_tagged = tweet_tagged
        tweet.important_words = important_words
        filewriter.writerow(tweet.to_CSV())
        if n % 100 == 0: print n
    print "Done"
    output.close()
Example #22
def pos_tag_stanford(toked_sentence):
	"""
	INPUT: list of strings
	OUTPUT: list of tuples

	Given a tokenized sentence, return 
	a list of tuples of form (token, POS)
	where POS is the part of speech of token
	"""

	from nltk.tag.stanford import POSTagger
	st = POSTagger('/Users/jeff/Zipfian/opinion-mining/references/resources/stanford-pos/stanford-postagger-2014-06-16/models/english-bidirectional-distsim.tagger', 
               '/Users/jeff/Zipfian/opinion-mining/references/resources/stanford-pos/stanford-postagger-2014-06-16/stanford-postagger.jar')

	return st.tag(toked_sentence)
Example #23
def pos_tag_stanford(toked_sentence):
    """
    INPUT: list of strings
    OUTPUT: list of tuples

    Given a tokenized sentence, return
    a list of tuples of form (token, POS)
    where POS is the part of speech of token
    """

    from nltk.tag.stanford import POSTagger
    st = POSTagger('english-bidirectional-distsim.tagger',
                   'stanford-postagger.jar')

    return st.tag(toked_sentence)
Example #24
def pos_tag(sent, tagger='stanford'):
    
    # cache pos_tagger as a global variable,
    # so that it is not recreated every time pos_tag is executed
    if 'pos_tagger' not in globals():
        global pos_tagger
        pos_tagger = POSTagger(conf.stanford_pos_model, path_to_jar=conf.stanford_postagger, encoding='UTF-8')

    if tagger == 'nltk':
        tokens = tokenize(sent, 's')
        return nltk.pos_tag(tokens)
    elif tagger == 'stanford':
        tokens = tokenize(sent, 'w')
        return pos_tagger.tag(tokens)
    else:
        raise ValueError('No such tagger: ' + tagger)
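
A hypothetical pair of calls, assuming conf.stanford_pos_model and conf.stanford_postagger point at a local tagger install and tokenize is the project's own helper:

# the first call constructs the Stanford tagger; later calls reuse the
# cached module-level global instead of rebuilding it
print(pos_tag('The quick brown fox jumps over the lazy dog .'))
print(pos_tag('The quick brown fox jumps over the lazy dog .', tagger='nltk'))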
Example #25
    def __init__(self, posTagModelPath, posTaggerPath, parserModelPath,
                 workingDir):

        try:
            self.posTagger = POSTagger(posTagModelPath, posTaggerPath, "UTF-8")
            print "pos tagger is loaded"
        except Exception:
            print "Error in loading POS tagger"

        try:
            self.parser = MaltParser(tagger=None,
                                     mco=parserModelPath,
                                     working_dir=workingDir)
            print "parser is loaded"
        except Exception:
            print "Error in loading the MALT Parser"
Example #26
    def add_POS(self, row_file, target):
        '''
        row_str = '';
        f = open(row_file,'rb');
        for row in f:
            row_str+=row;
        soup = BeautifulSoup(row_str);
        self.soup = soup;
        sentences = soup.find_all('sentence');
        all_token = list();
        for block in sentences:
            text = block.text.strip();
            text_token = self.tf.stanford_tokenize(text);
            all_token.append(text_token);
        '''
        all_token = self.get_token(target)
        stanford_tagger = POSTagger(
            '../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger',
            '../stanford-postagger-full-2015-01-30/stanford-postagger.jar')
        tag_list = list()
        for row in all_token:
            temp_list = list()
            for word in row:
                if len(word) > 1 and re.match(r'^[A-Z]+', word):
                    temp_list.append(word.lower())
                else:
                    temp_list.append(word)
            tag_list.append(temp_list)
        #end for
        tagged_result = stanford_tagger.tag_sents(tag_list)
        '''
        for row in tagged_result:
            index_list = list();
            for num,item in enumerate(row):
                if not re.match(r'.*[\w\d]+',item[0]):
                    index_list.append(num);
            for i in index_list:
                row[i]=(row[i][0],row[i][0]);
        #end for
        '''
        w = open('pos_%s' % target, 'wb')
        for num1, row in enumerate(tagged_result):
            for num2, item in enumerate(row):
                w.write(all_token[num1][num2] + ' ' + item[1] + '\n')
            w.write('\n')
        w.close()
Example #28
def stan_pos(input_sent):
    """
    Calls the Stanford POS tagger. The Stanford POS tagger directory must
    sit in the same directory as this script. This function uses the
    "wsj left 3 words" model as the default POS tagging model; to use a
    different model, change the first argument of POSTagger() below.
    """
    eval_sent = []

    st = POSTagger("./stanford-postagger-2012-11-11/models/wsj-0-18-left3words.tagger","./stanford-postagger-2012-11-11/stanford-postagger.jar")

    pos_result = st.tag(input_sent.split())
    for one_tuple in pos_result:
        pos_format = one_tuple[0] + "_" + one_tuple[1]
        
        eval_sent.append(pos_format)

    eval_sent = reg_form(eval_sent)
    return eval_sent
Example #29
def pos_tag_stanford(toked_sentence):
    """
	INPUT: list of strings
	OUTPUT: list of tuples8qfa

	Given a tokenized sentence, return 
	a list of tuples of form (token, POS)
	where POS is the part of speech of token
	"""

    from nltk.tag.stanford import POSTagger
    st = POSTagger(
        '/home/satyam/zip/opinionproject/opinion_mining/resources/english-bidirectional-distsim.tagger',
        '/home/satyam/zip/opinionproject/opinion_mining/resources/stanford-postagger.jar'
    )

    return st.tag(toked_sentence)
Example #32
def main():
    dict2 = readDict("dict2.txt")
    sentences2 = readSentences("sentences2.txt")
    translated2 = translate(sentences2, dict2)
    print "======================================BASE TRANSLATION=========================================="
    for sentence in translated2:
        print sentence

    print "================================================================================================"

    st = POSTagger('stanford-postagger/models/english-left3words-distsim.tagger',
        'stanford-postagger/stanford-postagger.jar')
    POS = []
    for sentence in translated2:
        tagged = st.tag(sentence.split())
        if len(tagged) > 0:
            POS.append(tagged)

    POS = stupidFixes(POS)
    print "==================================STUPID FIXES TRANSLATION======================================"
    for sentence in POS:
#        print sentence # '[%s]' % ', '.join(map(str, sentence))
        print ' '.join(map(getWord, sentence))


    POS = rulesOneThree(POS)
    print "=====================================RULE1+3 TRANSLATION========================================"
    for sentence in POS:
        print ' '.join(map(getWord, sentence))

    POS = rulesFourFiveSeven(POS)
    print "=====================================RULE4+5+7 TRANSLATION========================================"
    for sentence in POS:
        print ' '.join(map(getWord, sentence))

    POS = ruleTwoNine(POS)
    POS = ruleTwoNine(POS) # apply twice
    print "=====================================RULE2+9 TRANSLATION========================================"
    for sentence in POS:
        print ' '.join(map(getWord, sentence))

    POS = ruleSixEight(POS)
    print "=====================================RULE6+8 TRANSLATION========================================"
    for sentence in POS:
        print ' '.join(map(getWord, sentence))
Example #33
	def get_transactions(self, product_reviews):
		'''
			Generates a set of transactions ready for frequent itemset mining
			from the crawled product reviews
		'''
		pos_tagger = POSTagger(PATHS['POS_MODEL'], PATHS['POS_TAGGER'])

		pos_output = []
		transactions_output = []

		print 'Generating transactions...'
		product_count = 0
		sentence_count = 0
		for product in product_reviews:
			sentences = sent_tokenize(product)
			for sentence in sentences:
				try:
					sent_pos = pos_tagger.tag(word_tokenize(sentence))
				except UnicodeEncodeError:
					continue
				trans = []
				pos_tags = []
				for word, pos in sent_pos:
					pos_tags.append(':'.join([word, pos]))
					if ((pos == 'NN' or pos == 'NNS' or pos == 'NP') and
						re.match('^[A-Za-z0-9-]+$', word)):
						trans.append(word.lower())
				if trans:
					pos_output.append([sentence] + pos_tags)
					transactions_output.append([sentence] + trans)
					sentence_count += 1
			product_count += 1

			print '---%s Reviews and %s Transactions Parsed---' % (
				product_count,
				sentence_count
			)

		write_csv(PATHS['POS'], pos_output)
		write_csv(PATHS['TRANSACTIONS'], transactions_output)

		print 'Finished generating transactions...'
Example #34
    def __init__(self, override=False):
        tagger_path = os.path.join(DIRS.user_data_dir, stanford_postagger_name)
        if not os.path.exists(tagger_path):
            raise LookupError("Stanford POS tagger not found. Try running the "
                              "command download_third_party_data.py")

        postagger = POSTagger(
            os.path.join(tagger_path, 'models', 'english-bidirectional-distsim.tagger'),
            os.path.join(tagger_path, 'stanford-postagger.jar'),
            encoding='utf8')
        super(StanfordTaggerRunner, self).__init__(postagger.batch_tag, override)
Example #36
def main(word_transformation=None, result_path=None, save=SAVE, n=500):
    tagger = POSTagger(
        '/cs/fs/home/hxiao/code/CoreNLP/classes/edu/stanford/nlp/models/pos-tagger/english-left3words/english-bidirectional-distsim.tagger',
        '/cs/fs/home/hxiao/code/CoreNLP/javanlp-core.jar')

    tagged_corpus = nltk.corpus.treebank.tagged_sents()[-n:]

    print "extracting sentence words"
    if word_transformation and callable(word_transformation):
        tagged_corpus = [[(word_transformation(w), t) for w, t in sent]
                         for sent in tagged_corpus]

    print "extracting sents/tags"
    sents = ([w for w, t in sent] for sent in tagged_corpus)

    correct_tags = [[t for w, t in sent] for sent in tagged_corpus]

    print "predicting"
    predicted_tags = []
    really_correct_tags = []  # some sentence might be dropped
    sentences = []
    for i, (ctags, sent) in enumerate(zip(correct_tags, sents)):
        if (i + 1) % 5 == 0:
            print "%d finished" % (i + 1)
        try:
            ptags = [t for w, t in tagger.tag(sent)]
            if len(ctags) == len(ptags):
                predicted_tags.append(ptags)
                really_correct_tags.append(ctags)
                sentences.append(sent)
            else:
                print "tags length does not match for %r" % (sent)
        except UnicodeDecodeError:
            print "UnicodeDecodeError for ", sent
        except Exception:
            traceback.print_exc()

    if save:
        print "dumping to '%s'" % (result_path)
        with open(result_path, "w") as result_file:
            dump((really_correct_tags, predicted_tags, sentences), result_file)
Example #37
class Tagger():
    def __init__(self):
        self.st = POSTagger(
            os.path.normpath(
                os.path.dirname(os.path.realpath(__file__)) +
                '/stanford-pos/models/english-bidirectional-distsim.tagger'),
            os.path.normpath(
                os.path.dirname(os.path.realpath(__file__)) +
                '/stanford-pos/stanford-postagger.jar'))

    def tag(self, line):
        return self.st.tag(line.split())
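
Usage is then a one-liner; a sketch assuming the stanford-pos directory sits next to this file, as the constructor expects:

tagger = Tagger()
print(tagger.tag('This is a tokenized sentence .'))
# [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ...]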
Example #38
def pos_tag_stanford(toked_sentence):
    """
    INPUT: list of strings
    OUTPUT: list of tuples

    Given a tokenized sentence, return
    a list of tuples of form (token, POS)
    where POS is the part of speech of token
    """

    from nltk.tag.stanford import POSTagger
    import os

    basePath = os.getcwd()
    st = POSTagger(
        basePath +
        '/resources/stanford-postagger-2015-12-09/models/english-bidirectional-distsim.tagger',
        basePath +
        '/resources/stanford-postagger-2015-12-09/stanford-postagger.jar')

    return st.tag(toked_sentence)
Example #39
def stanfordTag(modelPath, stanfordJarPath, text, encoding):

    if not bool(re.search("java.exe", os.getenv("JAVA_HOME"))):
        java_path = os.getenv("JAVA_HOME") + "/bin/java.exe"
        os.environ['JAVA_HOME'] = java_path
        print(java_path)
        nltk.internals.config_java(java_path)
    entities = []
    stemmer = SnowballStemmer("french")
    st = POSTagger(modelPath, stanfordJarPath, encoding)
    print(text.split())
    tags = st.tag(text.split())
    print(tags)
    for tag in tags[0]:
        entity = {
            'token': tag[0],
            'pos': tag[1],
            'stemm': stemmer.stem(tag[0])
        }
        entities.append(entity)
    return entities
Example #40
	def __init__(self, pos_model, stanford_tagger, java_path):
		"""
		Creates a POSTagSelector instance.
	
		@param pos_model: Path to a POS tagging model for the Stanford POS Tagger.
		The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param stanford_tagger: Path to the "stanford-postagger.jar" file.
		The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
		@param java_path: Path to the system's "java" executable.
		Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems.
		"""
		os.environ['JAVAHOME'] = java_path
		self.tagger = POSTagger(pos_model, stanford_tagger)
Example #41
def tag_tokens(tokens):
    tagged_sents = []
    from nltk.tag.stanford import POSTagger
    st = POSTagger('/mnt/sda2/stanford-packages/stanford-postagger-2014-10-26/models/english-bidirectional-distsim.tagger',
                   encoding='utf8')

    print('Starting to tag sentences')
    """
    Progress Bar:
    """
    toolbar_width = 40

    # setup toolbar
    sys.stdout.write("[%s]" % (" " * toolbar_width))
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width + 1))
    # return to start of line, after '['

    no_of_sents = len(tokens)
    no_of_ticks = 0
    sent_counter = 0

    for line in tokens:
        # Returns a list of a list of tuples
        tagged_sents.append(st.tag(line))

        # Updating bar
        sent_counter += 1
        trigger = (sent_counter * toolbar_width - 1) / no_of_sents
        if trigger >= no_of_ticks:
            while no_of_ticks < math.floor(trigger):
                sys.stdout.write("-")
                sys.stdout.flush()
                no_of_ticks += 1

    sys.stdout.write(">]\n")
    print('Done tagging')

    return tagged_sents
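
Note that every st.tag() call spawns a fresh Java process, so tagging sentence by sentence as above is slow. A sketch of the batched alternative (NLTK 3.0+), which makes a single Java call for the whole list:

# tokens is the same list of token lists that tag_tokens() receives
tagged_sents = st.tag_sents(tokens)  # one JVM launch for all sentences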
Example #42
    def __init__(self, posTagModelPath, posTaggerPath, parserModelPath, workingDir):

        try:
            self.logger = logging.getLogger(__name__)
            self.posTagger = POSTagger(posTagModelPath, posTaggerPath, encoding="UTF-8", java_options='-Xmx16000m')
        except:
            self.logger.warning("Error in loading POS tagger!")
            e = sys.exc_info()[0]
            self.logger.warning("Error:" + str(e))

        try:
            self.parser = MaltParser(tagger=None, mco=parserModelPath, working_dir=workingDir, additional_java_args=['-Xmx16000m'])
        except:
            self.logger.warning("Error in loading the MALT Parser")
            e = sys.exc_info()[0]
            self.logger.warning("Error:" + str(e))
Example #43
    def generate_pos_set(self):
        print 'Building the positive-set dictionary....'
        pos_dict = dict()
        pos_set = set()
        sentences = list()
        for row in self.train_label:
            for key in row:
                if ' ' in key:
                    sentences.append(self.tk.word_tokenize(key))
                else:
                    pos_dict[key] = pos_dict.setdefault(key, 0) + 1
                    #pos_set.add(key)
        #end for
        st = POSTagger(
            '../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger',
            '../stanford-postagger-full-2015-01-30/stanford-postagger.jar')
        result = st.tag_sents(sentences)
        for row in result:
            for item in row:
                if item[1].startswith('NN'):
                    pos_dict[item[0]] = pos_dict.setdefault(item[0], 0) + 1
                    #pos_set.add(item[0])
        #end for
        neg_dict = dict()
        for num, row in enumerate(self.tagged_train_data):
            for item in row:
                if item[1].startswith(
                        'NN') and item[0] not in self.train_word_label[num]:
                    neg_dict[item[0]] = neg_dict.setdefault(item[0], 0) + 1
        for key in pos_dict.keys():
            if pos_dict[key] > 1:
                if neg_dict.has_key(key):
                    if neg_dict[key] / pos_dict[key] < 2:
                        pos_set.add(key)
                else:
                    pos_set.add(key)
        self.pos_set = pos_set
        print 'Done!'
        return
Example #44
class yagoScores:
    def __init__(self):
        self.en_postagger = POSTagger('parser/models/english-bidirectional-distsim.tagger', 'parser/stanford-postagger.jar')
    
    def parse(self,text):
        return self.en_postagger.tag(text.split())
        
    def get_underscoreWords(self,text):
        return re.findall("[a-z]+_[a-z]+", text)
    
    def findNounsSeq(self,tuples):
        self.noun = []    
        self.nouns = []
        prev = ""
        for each in tuples:
            if each[1] == "NN":
                self.noun.append(each[0])
            if each[1] == "NNS":
                self.nouns.append(prev + " " + each[0])
                prev = prev + " " + each[0]
            else:
                prev = each[0]
    
    def searchInWiki(self,guessess):
        #text = " ".join(self.noun)+" ".join(self.nouns)  
        text = " ".join(self.nouns) 
        print text  
        links = wikipedia.search(text)
        print ("LINKS")
        print links    
        for link in links:
            page = wikipedia.page(link)
            print page.title
            # check if guess appears in that page
            for eachg in guessess:
                print eachg.replace("_", " ").lower()
                if(eachg.replace("_", " ").lower() in page.content.lower()):
                    print "founddddddddddddddddddddd"
                    self.freq[eachg] += 1
    
    # call getScore(text, guessess) from outside; returns a dict of wiki-appearance scores
    def getScore(self,text,guessess):
        self.freq = defaultdict(int)
        tuples = self.parse(text)
        print tuples
        self.findNounsSeq(tuples)
        self.searchInWiki(guessess)
        print self.freq
        return self.freq
Example #46
    def get_whole(self, sentence):
        opinion_dict = dict()
        pos_f = open('../opinion-lexicon-English/positive-words.txt', 'rb')
        neg_f = open('../opinion-lexicon-English/negative-words.txt', 'rb')
        for _ in xrange(35):
            pos_f.readline()
            neg_f.readline()
        for word in pos_f:
            opinion_dict[word.strip()] = True
        for word in neg_f:
            opinion_dict[word.strip()] = False
        pos_f.close()
        neg_f.close()
        stemmer = PorterStemmer()
        stanford_parser = parser.Parser()
        stanford_tagger = POSTagger(
            '../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger',
            '../stanford-postagger-full-2015-01-30/stanford-postagger.jar')
        w = open('sentence_test', 'wb')
        text_token = self.tf.stanford_tokenize(sentence)
        text_pos = stanford_tagger.tag(text_token)
        print text_pos
        text_dependency = stanford_parser.parseToStanfordDependencies(sentence)
        temp_list = ['none'] * len(text_token)
        for dep in text_dependency:
            if dep[0] == 'amod':
                temp_list[int(dep[1])] = '%s_1' % dep[0]
                temp_list[int(dep[2])] = '%s_2' % dep[0]
        #end for
        for num, item in enumerate(text_pos[0]):
            temp_str = 'order'
            if opinion_dict.has_key(item[0]):
                temp_str = 'opinion'
            feature_list = [item[0], item[1], stemmer.stem(item[0]), item[0].lower(),
                            temp_str, temp_list[num], 'O']
            w.write(' '.join(feature_list) + '\n')
        w.close()
Example #48
class StanfordTagger(WorkflowNativePOSTagger):

    def __init__(self, xml):
        from nltk.tag.stanford import POSTagger
        import os
        super(StanfordTagger, self).__init__(xml)
        self.tagger = POSTagger(os.path.join(os.getcwd(),'External/english-bidirectional-distsim.tagger'), os.path.join(os.getcwd(),'External/stanford-postagger.jar'))

    def is_ascii(self, s):
        return all(ord(c) < 128 for c in s)

    def tokenize(self, document):
        # non-ASCII characters make the Stanford tagger go crazy and run out of heap space
        if self.is_ascii(document):
            for word, tag in self.tagger.tag(document):
                yield "%s/%s" % (word, tag)
Example #49
def main():
    data_file = open("../data/good_data.txt", "r")
    out_file = open("../data/good_lines_tags_1.txt", "w")
    lines = data_file.readlines()
    data_file.close()
    line_count = 0
    english_postagger = POSTagger(
        '../postagger/models/english-bidirectional-distsim.tagger',
        '../postagger/stanford-postagger.jar')
    for line in lines:
        tag_list = []
        for t in english_postagger.tag(line.split('\n')[0].split(' ')):
            tag_list.append(t[1])
        out_file.write(" ".join(tag_list))
        out_file.write("\n")
        print "completed line" + str(line_count)
        line_count += 1
    out_file.close()
Example #50
    def _parse(self, text):
        # clean up any leftover results
        while True:
            try:
                self.pos_tagger.read_nonblocking(4000, 0.25)
            except pexpect.TIMEOUT:
                break

        # send the actual text
        self.pos_tagger.sendline(text)

        max_expected_time = min(40, 3 + len(text) / 20.0)
        end_time = time.time() + max_expected_time

        incoming = ""
        while True:
            # Time left, read more data
            try:
                incoming += self.pos_tagger.read_nonblocking(
                    2000, 0.5).decode('utf-8')
                if "_" in incoming:
                    break
                time.sleep(0.0001)
            except pexpect.TIMEOUT:
                if end_time - time.time() < 0:
                    # logger.error("Error: Timeout with input '%s'" % (incoming))
                    return {
                        'error':
                        "timed out after %f seconds" % max_expected_time
                    }
                else:
                    continue
            except pexpect.EOF:
                break

        tagged_list = list(filter(None, incoming.split('\r\n')))
        tagged_string = [item for item in tagged_list if item not in [text]][0]
        result = POSTagger.parse_output(POSTagger, tagged_string)
        return result
Example #51
    def pos_data(self, method='stanford'):
        '''
        POS-tag the data with one of two methods: 'stanford' uses the
        POS tagger written by Stanford; 'nltk' uses the tagger built
        into NLTK
        '''
        print 'Tagging the corpus....'
        my_tag = int
        if method == 'stanford':
            st=POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger'\
                        ,'../stanford-postagger-full-2015-01-30/stanford-postagger.jar')
            my_tag = st.tag_sents
            #get tagged train_data
            sentences = list()
            for sentence in self.train_data:
                sentences.append(self.tk.word_tokenize(sentence))
            self.tagged_train_data = my_tag(sentences)
            #get tagged test_data
            sentences = list()
            for sentence in self.test_data:
                sentences.append(self.tk.word_tokenize(sentence))
            self.tagged_test_data = my_tag(sentences)
        elif method == 'nltk':
            my_tag = nltk.pos_tag
            #get tagged train_data
            tagged_train_data = list()
            for row in self.train_data:
                tagged_train_data.append(my_tag(row.split()))
            #get tagged test_data
            tagged_test_data = list()
            for row in self.test_data:
                tagged_test_data.append(my_tag(row.split()))

            self.tagged_train_data = tagged_train_data
            self.tagged_test_data = tagged_test_data
        pickle.dump(self.tagged_train_data, open('__tagged_train_data', 'wb'))
        pickle.dump(self.tagged_test_data, open('__tagged_test_data', 'wb'))
        #self.tagged_train_data=pickle.load(open('__tagged_train_data','rb'));
        #self.tagged_test_data=pickle.load(open('__tagged_test_data','rb'));
        print 'Done!'
        return
Example #52
    def extract_examples(self):
        training_tuples = set()
        db_fh = open(self.database_loc, 'rb')
        for line in db_fh: #going through PPDB
            elements = line.strip().split(' ||| ')
            if len(elements[1].split()) == 2 or len(elements[2].split()) == 2: #only look at 2-to-1 or 1-to-2 paraphrases
                many_phrase = elements[1] if len(elements[1].split()) == 2 else elements[2]
                one_phrase = elements[1] if len(elements[1].split()) == 1 else elements[2]
                if self.filter_number: #filter numbers, these are useless
                    isNumber = False
                    for token in many_phrase.split():
                        if self.pos_provided:
                            token = token.split('#')[0]
                        if self.is_number(token):
                            isNumber = True
                    if not isNumber:
                        training_tuples.add((one_phrase, many_phrase))
                else:
                    training_tuples.add((one_phrase, many_phrase))
        tagger = POSTagger(self.TAGGER_MODEL, self.TAGGER_LOC)
        self.training_examples = {} #reset training examples
        for element in training_tuples: #now, tag the resulting data
            words = element[1].split()
            words_only = ""
            if self.pos_provided: #if POS tags are provided externally, merge them here; otherwise call the tagger
                words_only = ' '.join([word_pos.split('#')[0] for word_pos in words])
            pos_tags = ([word_pos.split('#')[1] for word_pos in words]
                        if self.pos_provided
                        else [word_pos[1] for word_pos in tagger.tag(words)])
            collapsed_pos = []
            for pos in pos_tags: #cluster certain POS tags together
                new_pos = collapsePOS(pos)
                collapsed_pos.append(new_pos)
            key = ' '.join(collapsed_pos)
            examples = self.training_examples[key] if key in self.training_examples else []
            if self.pos_provided:
                examples.append(' '.join([element[0], words_only]))
            else:
                examples.append(' '.join([element[0], element[1]]))
            self.training_examples[key] = examples
        sys.stderr.write("PPDB training data tagged and sorted\n")
        db_fh.close()
Example #53
    def __init__(self, name, is_lazy, lazy_directory, debug, encoding,
                 tag_separator, stanford_jar_path, language_model_path):
        """
    Constructor of the component.

    @param  name:                 The name of the component.
    @type   name:                 C{string}
    @param  is_lazy:              True if the component must load previous data,
                                  False if data must be computed though they
                                  have already been computed.
    @type   is_lazy:              C{bool}
    @param  lazy_directory:       The directory used to store previously
                                  computed data.
    @type   lazy_directory:       C{string}
    @param  debug:                True if the component is in debug mode, else
                                  False. When the component is in debug mode, it
                                  will output each step of its processing.
    @type   debug:                C{bool}
    @param  encoding:             The encoding of the files to pre-process.
    @type   encoding:             C{string}
    @param  tag_separator:        The symbol to use as a separator between a
                                  word and its POS tag.
    @type   tag_separator:        C{string}
    @param  stanford_jar_path:    The path to the jar of the Java Stanford
                                  Tagger.
    @type   stanford_jar_path:    C{string}
    @param  language_model_path:  The path to the language-specific Stanford
                                  model.
    @type   language_model_path:  C{string}
    """

        super(StanfordPreProcessor,
              self).__init__(name, is_lazy, lazy_directory, debug, encoding,
                             tag_separator)

        self.set_sentence_tokenizer(PunktSentenceTokenizer())
        self.set_pos_tagger(
            POSTagger(language_model_path, stanford_jar_path, encoding))
Example #54
def pos_stanford(tokens):

    tagger = POSTagger('./english-bidirectional-distsim.tagger',
                       './stanford-postagger.jar')
    return tagger.tag(tokens)
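
A quick usage sketch for this variant, which assumes the .tagger model and the jar sit in the current working directory:

print(pos_stanford('This is a test .'.split()))
# [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('test', 'NN'), ('.', '.')]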
Example #55
#!/usr/bin/env python
# -*- coding: utf-8 -*

import numpy
import nltk
from nltk.tag.stanford import POSTagger
import sys

if len(sys.argv) != 2:
    print 'must have one argument'
    sys.exit()

chunk = sys.argv[1].decode('utf-8')
#chunk = u"妈我"

text = nltk.word_tokenize(chunk.encode('utf-8'))
st = POSTagger('chinese-distsim.tagger', 'stanford-postagger-3.1.4.jar')

tsentence = st.tag(text)
# print tsentence
for w in tsentence:
    # print w
    # print w[1].decode('utf-8'),
    print w[1].split('#')[1]
Example #56
from scipy.sparse import hstack
import os

__author__ = 'Jasneet Sabharwal'

_POS_TAGGER_MODEL_PATH = os.path.join(
    os.path.dirname(__file__), '..', '..',
    'lib/english-bidirectional-distsim.tagger')
_POS_TAGGER_JAR_PATH = os.path.join(os.path.dirname(__file__), '..', '..',
                                    'lib/stanford-postagger.jar')
_SENTI_WORDNET_FILE_PATH = os.path.join(os.path.dirname(__file__), '..', '..',
                                        'lib/SentiWordNet_3.0.0_20130122.txt')
_BOW_VOCAB_PATH = os.path.join(os.path.dirname(__file__), '..', '..',
                               'lib/bow_vocab')

POS_TAGGER = POSTagger(_POS_TAGGER_MODEL_PATH, _POS_TAGGER_JAR_PATH)
SENTI_WORDNET = SentiWordNetCorpusReader(_SENTI_WORDNET_FILE_PATH)
BOW_VECTORIZER = CountVectorizer(
    min_df=1,
    binary=True,
    dtype='float64',
    lowercase=True,
    ngram_range=(1, 1),
    stop_words=stopwords.words('english'),
    vocabulary=utils.get_bow_vocab(_BOW_VOCAB_PATH))


def _pos_features(pos_tags):
    pos_tags = [(word, tag) for (word, tag) in pos_tags
                if not word.lower() in stopwords.words('english')]
    features = defaultdict(int)
Example #57
from nltk.tag.stanford import POSTagger
import textprocess as tp
import os, time

# wraps the part-of-speech tagging functionality within this file

try:
    pwd = os.path.dirname(os.path.realpath(__file__))
    print pwd
except:
    print 'Something screwed up, using os.getcwd() instead'
    pwd = os.getcwd()
    
print "POSTagger Loaded"
post = POSTagger(pwd+'/stanford-postagger/models/english-bidirectional-distsim.tagger',
                 pwd+"/stanford-postagger/stanford-postagger.jar")

def tag(text):
    text = tp.preprocess(text)
    #print text
    t1 = time.time()
    outlist = post.tag(text.split())
    t2 = time.time()
    print "POS Tagging complete. Time taken: ", t2-t1, " seconds"
    return outlist
Example #58
def evaluate(granularity, text):

    preprocessor = Preprocessor()
    entry = TextEntry()
    entry.body = text
    preprocessor.entries = [entry]

    data = preprocessor.get_clean_data()
    ncharsAll = preprocessor.getNChars(items=data, freq=20)

    test_data_raw = preprocessor.get_clean_data()
    test_raw_text = preprocessor.get_raw_words()

    count_vect = joblib.load('../models/t1/vec_count.joblib')
    tfidf_transform = joblib.load('../models/t1/tfidf_transform.joblib')

    data_counts = count_vect.transform(test_data_raw)
    test_data = tfidf_transform.transform(data_counts)

    dense_test = test_data.toarray()

    vocab = count_vect.vocabulary_
    nchars = []
    for nchar in ncharsAll:
        if nchar not in vocab:
            nchars.append(nchar)

    numOfTags = len(tags)
    ncharVecSize = len(nchars)

    tag_vecs = []
    pos = POSTagger(model, jar, java_options='-mx2500m')
    for i, text in enumerate(test_raw_text):
        if i % 10 == 0:
            print(i)
        words = text.split()
        tag_vector = np.zeros(numOfTags)
        words_with_tags = pos.tag(words)
        only_tags = [tag for word, tag in words_with_tags[0]]
        tags_with_freq = Counter(only_tags)
        for tag, freq in tags_with_freq.items():
            tag_vector[tags.index(tag)] = freq / len(words)
        tag_vecs.append(tag_vector)

    for i, text in enumerate(test_raw_text):
        if i % 100 == 0:
            print(i)
        words = text.split()
        ncharVec = np.zeros(ncharVecSize)
        for word in words:
            for size in sizes:
                text_nchars = [
                    word[i:i + size] for i in range(len(word) - size + 1)
                ]
                text_nchars_with_freq = Counter(text_nchars)
                for nchar, freq in text_nchars_with_freq.items():
                    if nchar in nchars:
                        ncharVec[nchars.index(nchar)] = freq / len(words)

        test_data[i] = np.concatenate((dense_test[i], ncharVec, tag_vecs[i]))

    svm_l = joblib.load('../models/t1/svm_l_' + granularity + '/svm_l_' +
                        granularity + '.joblib')
    svm_u = joblib.load('../models/t1/svm_u_' + granularity + '/svm_u_' +
                        granularity + '.joblib')

    evaluator = ClfEval(svm_l, svm_u)
    return evaluator.eval_data(csr_matrix(test_data))