Example #1
 def recreate_db(self):
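     # Rebuild the media table from scratch: walk every configured library folder, queue audio/video files by mime type, then hand them to the tagger.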
     self.recreate_media_table()
     self.totag = []
     self.sb.config.config["dbupdatetime"] = loggy.currenttime()
     if not self.tagger:
         self.tagger = tagger.tagger()
     for folder in self.sb.config.config["libraryfolders"]:
         loggy.log("ELE " + folder)
         for path, dirs, files in os.walk(folder):
             for filename in [os.path.abspath(os.path.join(path, filename)) for filename in files]:
                 mime = mimetypes.guess_type(filename)[0]  # TODO: get rid of mimetype
                 if not mime:
                     loggy.log("Update database - no mime type for " + filename)
                 elif mime.startswith("audio"):
                     loggy.log("Database recreate_db Adding Audio :" + filename)
                     self.totag.append(filename)
                 elif mime.startswith("video"):
                     loggy.log("Database recreate_db Adding Video :" + filename)
                     self.totag.append(filename)
                 else:
                     pass  # unknown mime type; ignored
                     # loggy.log("Database recreate_db Unknown mime type:" + mime + ", ignoring:" + filename)
     self.totaltotag = len(self.totag)
     loggy.log("Database:" + str(self.totaltotag) + " files to scan")
     self.gettag()
Example #2
	def recreate_db(self):
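		# Rebuild the media table, collect audio/video files from the configured library folders, and kick off tagging.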
		self.recreate_table("media", (self.keys + self.addkeys)) #TODO: delete database and restart from scratch
		#self.recreate_table("videos", self.keys)
		#self.insert_row('music', ['fart.avi', 'farter', 'fart song'])
		self.totag = []
		self.tagger = tagger.tagger()
		self.tagger.init()

		for folder in self.config.get('Main', 'libraryfolders').split(" "):
			loggy.log('ELE '+folder)
			for path, dirs, files in os.walk(folder):
				for filename in [os.path.abspath(os.path.join(path, filename)) for filename in files ]:
					mime = mimetypes.guess_type(filename)[0] #TODO: get rid of mimetype
					if not mime:
						pass  # no mime type detected; skip
					elif mime.startswith("audio"):
						loggy.log("Database recreate_db Adding Audio :" + filename)
						self.totag.append(filename)
					elif mime.startswith("video"):
						loggy.log("Database recreate_db Adding Video :" + filename)
						self.totag.append(filename)
					else:
						pass  # unknown mime type; ignored
						# loggy.log("Database recreate_db Unknown mime type:" + mime + ", ignoring:" + filename)
		self.totaltotag = len(self.totag)
		loggy.log('Database:' + str(self.totaltotag) + ' files to scan')
		self.gettag()
Example #3
 def update_db(self):
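     # Incremental variant of recreate_db: only queue files that are new or whose mtime has changed since the last scan.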
     # self.recreate_media_table()
     self.sb.config.config["dbupdatetime"] = loggy.currenttime()
     self.totag = []
     if not self.tagger:
         self.tagger = tagger.tagger()
     for folder in self.sb.config.config["libraryfolders"]:
         loggy.log("ELE " + folder)
         for path, dirs, files in os.walk(folder):
             for filename in [os.path.abspath(os.path.join(path, filename)) for filename in files]:
                 row = self.get_uri_db_info("file://" + filename)
                 if row:
                     mtime = int(os.path.getmtime(filename))
                     # (mode, ino, dev, nlink, uid, gid, size, atime, mtime, ctime) = os.stat(filename)
                     # print ' old {0}, new {1} mtimes'.format(row['mtime'], mtime)
                     if mtime <= row["mtime"]:  # skip files unmodified since they were last indexed
                         continue
                 mime = mimetypes.guess_type(filename)[0]  # TODO: get rid of mimetype
                 if not mime:
                     loggy.log("Update database - no mime type for " + filename)
                 elif mime.startswith("audio"):
                     loggy.log("Database recreate_db Adding Audio :" + filename)
                     self.totag.append(filename)
                 elif mime.startswith("video"):
                     loggy.log("Database recreate_db Adding Video :" + filename)
                     self.totag.append(filename)
                 else:
                     pass  # unknown mime type; ignored
                     # loggy.log("Database update_db Unknown mime type:" + mime + ", ignoring:" + filename)
     self.totaltotag = len(self.totag)
     loggy.log("Database:" + str(self.totaltotag) + " files to scan")
     self.gettag()
Example #4
def prune_tag(tweet, year, award_list, mov_list):
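    # Keep a tweet only if it mentions one of the award names and tagger() extracts at least one tag;
    # returns a dict with 'text' and 'tags' keys, or an empty dict otherwise.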

    tweet_dic = {}

    for award in award_list:
        award = award.lower()
        # ratio = fuzz.partial_ratio(str(award), str(tweet))
        # nomin_contains = 'nomin' in tweet
        # if(nomin_contains):
        #   nomin_count += 1
        # contains = award in tweet
        contains = partial_award_check(award, tweet)
        if contains:
            # print(tweet)
            # if(ratio > 80):
            tags = tagger(tweet, mov_list)
            if len(tags[0]) == 0 and len(tags[1]) == 0:
                break
            tweet_dic['text'] = tweet
            tweet_dic['tags'] = tags
            break

    return tweet_dic
Example #5
def main():
    input_path = sys.argv[1]  # "../data/content/test_final_content.txt"
    classifier_path = sys.argv[2]  # '../classifiers/POS-tagger.pkl'
    brown_path = sys.argv[3]  # '../tools/TweeboParser/pretrained_models/twitter_brown_clustering_full'

    tagger = t.tagger(brown_cluster_path=brown_path)
    tagger.load_clf(classifier_path)

    infile = open(input_path, "r")
    train_sents = infile.readlines()
    infile.close()
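    # Tokenize the raw sentences, tag them with the 'tweet' domain model, and emit CoNLL-formatted output.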
    train_tokens, _ = tagger.preprocess(train_sents)
    tagged_sents = tagger.tag_sents(train_tokens, 'tweet')
    conll_sents = tagger.convert_conll(tagged_sents)
    tagger.output_tagged(conll_sents)
Example #6
File: index.py Project: shrsv/koko
    def POST(self):
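        # Handle the form POST: extract up to max_tags tags from the submitted paragraph and return them as an HTML ordered list.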
        form = myform()
        if not form.validates():
            return render.index(form)
        else:
            para = form["paragraph"].value
            tags = tagger.tagger(para)

            # Set maximum number of tags
            max_tags = 12
            if len(tags) > max_tags:
                tags = tags[:max_tags]

            # Return tags are unicode. Convert them
            # into web-friendly utf-8
            tags = [x.encode("utf-8") for x in tags]
            op = "<ol>\n"
            for tag in tags:
                op += "\t<li>" + tag + "</li>\n"
            op += "</ol>"
            return op
Example #8
def train_tagger():
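    # Train a POS tagger on tweet-domain and CMU gold sentences, passing per-sentence domain labels for domain adaptation.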
    # open training data
    infile = open("rjk2147/results/pos_tagged_4_fold_cv.txt", "r")
    train_sents = infile.readlines()
    infile.close()
    train_sents = train_sents[100:]
    # open CMU training data
    infile = open("rjk2147/data/gold/cmu_all_gold.txt")
    cmu_train_sents = infile.readlines()
    infile.close()

    tagger = t.tagger(
        brown_cluster_path='rjk2147/tools/TweeboParser/pretrained_models/twitter_brown_clustering_full',
        #word2vec_path='../tools/word2vec/word2vec_twitter_model.bin',
        #word2vec_path= '../tools/word2vec/glove.6B/glove.6B.300d.txt',
        #word2vec_path= '../tools/word2vec/GoogleNews-vectors-negative300.bin',
        wiktionary_path='rjk2147/data/wiktionary')
    window = tagger.window

    half_cmu_train_sents = cmu_train_sents[:len(train_sents) // 2]

    all_sents = list()
    all_sents.extend(train_sents)
    all_sents.extend(cmu_train_sents)

    # Standard implementation of domain adaptation
    domain_list = ['*tweet*'] * len(train_sents)
    domain_list.extend(['*cmu*'] * len(cmu_train_sents))

    tagger.train(all_sents, domain_list)

    return tagger
Example #9
File: ftes.py Project: kqdtran/FTES

# Noun phrase chunker
grammar = r"""
    # Nouns and Adjectives, terminated with Nouns
    NBAR:
        {<NN.*|JJ>*<NN.*>}
        
    # Above, connected with preposition or subordinating conjunction (in, of, etc...)
    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}"""
chunker = nltk.RegexpParser(grammar)

# POS tagger - see tagger.py
tagger = tag.tagger()

def leaves(tree):
    '''
    Finds NP (nounphrase) leaf nodes of a chunk tree
    '''
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):  # Tree.label() replaces the old .node attribute in NLTK 3+
        yield subtree.leaves()

def normalize(word):
    '''
    Normalizes words to lowercase and stems/lemmatizes it
    '''
    word = word.lower()
    #word = stem(word)
    word = strip(lemmatize(word), True)
    return word
Example #10
from tagger import tagger, vectorizeTagSeq
import numpy as np
import nltk


def convert2array(text):
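    # Split the text into lines, drop a trailing "." token from each line, and return the lines as a NumPy array.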
    text = text.split("\n")
    textinwords = []
    for i in text:
        temp = i.split(" ")
        if temp[-1] == ".":
            temp = temp[:-1]
        temp = " ".join(temp)
        textinwords.append(temp)
    return np.array(textinwords)


if __name__ == "__main__":
    text = open("incorrect_corpus.txt").read()
    x_words = convert2array(text)
    tagged_sequence = tagger(x_words)
    dimensions = np.load("dimensions.npy")
    dimension_array = np.load("dimension_array.npy")
    tagged_generated = vectorizeTagSeq(dimensions, tagged_sequence,
                                       dimension_array)
    np.save("tagged_vectors_gen_dropshuff.npy", np.array(tagged_generated))
    print(len(tagged_sequence))
    print(tagged_generated[1])
    #temp = np.load("tagged_vectors_gen_srilm_wiki.npy")
    #print temp[1]
Example #11
	def add_to_library(self, songfile):
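		# Read track metadata from the file and insert a library row, storing the file's mtime for change detection.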
		mytagger = tagger()
		tags = mytagger.get_track_info(songfile)
		CMP.warn("adding "+songfile)
		self.cursor.execute(self.library_insert, (
			songfile, tags['tracknum'], tags['title'], tags['artist'],
			tags['album'], tags['year'], str(os.path.getmtime(songfile))))
Example #12
 def __init__(self, parent):
     Frame.__init__(self, parent)   
      
     self.tagger = tagger()
     self.parent = parent        
     self.initUI()
Example #13
stem = filtering.stem

# Taken from Su Nam Kim Paper
grammar = r"""
    # Nouns and Adjectives, terminated with Nouns
    NBAR:
        {<NN.*|JJ>*<NN.*>}
        
    # Above, connected with preposition or subordinating conjunction (in, of, etc...)
    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}
"""
chunker = nltk.RegexpParser(grammar)
stopwords = stopwords.words('english')
tagger = tagger.tagger()

###############################################################################
## Helper function for normalizing words and extracting 
## noun phrases from the Syntax Tree
###############################################################################

def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree"""
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):  # Tree.label() replaces the old .node attribute in NLTK 3+
        yield subtree.leaves()


def normalize(word):
    """Normalizes words to lowercase and stems and lemmatizes it"""
    word = word.lower()
Example #14
def mytag():
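    # Replace the Tkinter Text widget's contents with the tagged version of its current text.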
    t = text.get("1.0", END)
    text.delete("1.0", END)
    text.insert("1.0", tagger(t))
Example #15
def main():
    # start timer
    start = time.perf_counter()


    # Training the Tagger:

    # open training data
    #infile = open("../data/gold/simple_gold_revised_emojis.txt", "r")
    infile = open("../results/pos_tagged_4_fold_cv.txt", "r")
    t_sents = infile.readlines()
    infile.close()
    #train_sents = []
    train_sents = list(t_sents[102:])
    #train_sents = list(t_sents)
    # open CMU training data
    infile = open("../data/gold/cmu_all_gold.txt")
    sents = infile.readlines()
    cmu_train_sents = sents
    #cmu_train_sents = sents[:1328]
    #cmu_train_sents.extend(sents[1428:])
    #cmu_train_sents = []
    infile.close()

    all_sents = list()
    all_sents.extend(train_sents)
    all_sents.extend(cmu_train_sents)

    # Standard implementation of domain adaptation
    domain_list = ['tweet']*len(train_sents)
    #domain_list.extend(['tweet']*len(cmu_train_sents))
    domain_list.extend(['cmu']*len(cmu_train_sents))
    #domain_list = None


    # Initializing the tagger
    tagger = t.tagger(brown_cluster_path='../tools/TweeboParser/pretrained_models/twitter_brown_clustering_full',
                      word2vec_path='../tools/word2vec/word2vec_twitter_model.bin'
                      #word2vec_path= '../tools/word2vec/glove.6B/glove.6B.300d.txt',
                      #word2vec_path= '../tools/word2vec/glove.840B.300d/glove.840B.300d.txt'
                      #word2vec_path= '../tools/word2vec/glove.twitter.27B/glove.twitter.27B.200d.txt',
                      #word2vec_path= '../tools/word2vec/GoogleNews-vectors-negative300.bin'
                      #wiktionary_path='../data/wiktionary'
                      )


    #tagged_sents = tagger.cross_validation(train_sents, domain_list, len(train_sents), folds=4)
    #tagger.output_tagged(tagged_sents, '../results/pos_tagged_4_fold_cv.txt',)

    tagger.train(all_sents, domain_list)

    tagger.save_clf(path='../classifiers/POS-tagger.pkl')

    # Using the tagger to tag dev set data

    # open Corpus development data

    #infile = open("../data/content/simple_content_emoji.txt", "r")
    infile = open("../data/gold/simple_gold_revised_emojis.txt", "r")
    #infile = open("../data/gold/test_final.txt", "r")
    print('Reading Dev')
    train_Dev = infile.readlines()[:200]
    infile.close()
    dev_tokens, _ = tagger.preprocess(train_Dev)

    print('Testing Dev')
    tagged_sents = tagger.tag_sents(dev_tokens, 'tweet')
    print('Writing Results')
    tagger.output_tagged(tagged_sents, '../results/pos_tagged_cv.txt')


    infile = open("../data/content/test_final_content.txt", "r")
    print('Reading Test')
    train_test = infile.readlines()[:200]
    infile.close()
    test_tokens, _ = tagger.preprocess(train_test)
    print('Testing Test')
    tagged_sents = tagger.tag_sents(test_tokens, 'tweet')
    print('Writing Results')
    tagger.output_tagged(tagged_sents, '../results/pos_tagged_test_cv.txt')

    '''
    infile = open("../data/gold/cmu_test_gold.txt", "r")
    train_cmu = infile.readlines()
    cmu_tokens, _ = tagger.preprocess(train_cmu)
    tagged_sents = tagger.tag_sents(cmu_tokens, 'cmu')
    tagger.output_tagged(tagged_sents, '../results/cmu_pos_tagged_cv.txt')
    '''

    print("Time: " + str(time.clock()) + ' sec')