def recreate_db(self):
    # Rebuild the media table, then walk every configured library folder and
    # queue all audio/video files for tagging.
    self.recreate_media_table()
    self.totag = []
    self.sb.config.config["dbupdatetime"] = loggy.currenttime()
    if not self.tagger:
        self.tagger = tagger.tagger()
    for folder in self.sb.config.config["libraryfolders"]:
        loggy.log("ELE " + folder)
        for path, dirs, files in os.walk(folder):
            for filename in [os.path.abspath(os.path.join(path, f)) for f in files]:
                mime = mimetypes.guess_type(filename)[0]  # TODO: get rid of mimetype
                if not mime:
                    loggy.log("Update database - no mime type for " + filename)
                elif mime.startswith("audio"):
                    loggy.log("Database recreate_db Adding Audio :" + filename)
                    self.totag.append(filename)
                elif mime.startswith("video"):
                    loggy.log("Database recreate_db Adding Video :" + filename)
                    self.totag.append(filename)
                # else: unknown mime type, ignore
    self.totaltotag = len(self.totag)
    loggy.log("Database:" + str(self.totaltotag) + " files to scan")
    self.gettag()

def recreate_db(self):
    # TODO: delete database and restart from scratch
    self.recreate_table("media", (self.keys + self.addkeys))
    # self.recreate_table("videos", self.keys)
    # self.insert_row('music', ['fart.avi', 'farter', 'fart song'])
    self.totag = []
    self.tagger = tagger.tagger()
    self.tagger.init()
    for folder in self.config.get('Main', 'libraryfolders').split(" "):
        loggy.log('ELE ' + folder)
        for path, dirs, files in os.walk(folder):
            for filename in [os.path.abspath(os.path.join(path, f)) for f in files]:
                mime = mimetypes.guess_type(filename)[0]  # TODO: get rid of mimetype
                if not mime:
                    pass  # no guessable type, skip silently
                elif mime.startswith("audio"):
                    loggy.log("Database recreate_db Adding Audio :" + filename)
                    self.totag.append(filename)
                elif mime.startswith("video"):
                    loggy.log("Database recreate_db Adding Video :" + filename)
                    self.totag.append(filename)
                # else: unknown mime type, ignore
    self.totaltotag = len(self.totag)
    loggy.log('Database:' + str(self.totaltotag) + ' files to scan')
    self.gettag()

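# Note on the scanning code above: mimetypes.guess_type keys off the file
# extension alone, so extensionless files come back as (None, None). A quick
# illustrative check (the exact strings depend on the platform's type map):
import mimetypes

print(mimetypes.guess_type("song.mp3")[0])  # typically 'audio/mpeg'
print(mimetypes.guess_type("clip.avi")[0])  # typically 'video/x-msvideo'
print(mimetypes.guess_type("README")[0])    # None -- no extension to key off
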
def update_db(self):
    # Incremental variant of recreate_db: re-queue only files that changed
    # since the last scan.
    # self.recreate_media_table()
    self.sb.config.config["dbupdatetime"] = loggy.currenttime()
    self.totag = []
    if not self.tagger:
        self.tagger = tagger.tagger()
    for folder in self.sb.config.config["libraryfolders"]:
        loggy.log("ELE " + folder)
        for path, dirs, files in os.walk(folder):
            for filename in [os.path.abspath(os.path.join(path, f)) for f in files]:
                row = self.get_uri_db_info("file://" + filename)
                if row:
                    mtime = int(os.path.getmtime(filename))
                    # File unchanged since the stored mtime: skip it.
                    if mtime <= row["mtime"]:
                        continue
                mime = mimetypes.guess_type(filename)[0]  # TODO: get rid of mimetype
                if not mime:
                    loggy.log("Update database - no mime type for " + filename)
                elif mime.startswith("audio"):
                    loggy.log("Database update_db Adding Audio :" + filename)
                    self.totag.append(filename)
                elif mime.startswith("video"):
                    loggy.log("Database update_db Adding Video :" + filename)
                    self.totag.append(filename)
                # else: unknown mime type, ignore
    self.totaltotag = len(self.totag)
    loggy.log("Database:" + str(self.totaltotag) + " files to scan")
    self.gettag()

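# A minimal standalone sketch of the change test above, assuming a caller-
# supplied lookup that returns the mtime stored in the media table (or None
# when the file has never been scanned):
import os

def needs_rescan(path, stored_mtime_for):
    """Return True when `path` must be re-tagged (hypothetical helper)."""
    stored = stored_mtime_for(path)
    if stored is None:
        return True  # never scanned before
    return int(os.path.getmtime(path)) > stored  # modified since last scan
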
def prune_tag(tweet, year, award_list, mov_list):
    """Return {'text': ..., 'tags': ...} when the tweet matches an award,
    otherwise an empty dict."""
    tweet_dic = {}
    for award in award_list:
        award = award.lower()
        # Earlier variants: fuzz.partial_ratio(award, tweet) > 80, and a plain
        # `award in tweet` substring test; partial_award_check superseded both.
        contains = partial_award_check(award, tweet)
        if contains:
            tags = tagger(tweet, mov_list)
            # Neither tag list came back non-empty: give up on this tweet.
            if len(tags[0]) == 0 and len(tags[1]) == 0:
                break
            tweet_dic['text'] = tweet
            tweet_dic['tags'] = tags
            break
    return tweet_dic

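# A usage sketch with hypothetical inputs; assumes partial_award_check and
# tagger from the surrounding module are in scope:
hit = prune_tag("so happy parasite won best picture", 2020,
                ["Best Picture"], ["Parasite", "1917"])
if hit:
    print(hit['text'], hit['tags'])
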
def main():
    input_path = sys.argv[1]       # e.g. "../data/content/test_final_content.txt"
    classifier_path = sys.argv[2]  # e.g. '../classifiers/POS-tagger.pkl'
    brown_path = sys.argv[3]       # e.g. '../tools/TweeboParser/pretrained_models/twitter_brown_clustering_full'

    tagger = t.tagger(brown_cluster_path=brown_path)
    tagger.load_clf(classifier_path)

    infile = open(input_path, "r")
    train_sents = infile.readlines()
    infile.close()

    train_tokens, _ = tagger.preprocess(train_sents)
    tagged_sents = tagger.tag_sents(train_tokens, 'tweet')
    conll_sents = tagger.convert_conll(tagged_sents)
    tagger.output_tagged(conll_sents)

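# Expected invocation, using the example paths from the inline comments above
# (the script name is an assumption):
#   python tag_content.py ../data/content/test_final_content.txt \
#       ../classifiers/POS-tagger.pkl \
#       ../tools/TweeboParser/pretrained_models/twitter_brown_clustering_full
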
def POST(self):
    form = myform()
    if not form.validates():
        return render.index(form)
    else:
        para = form["paragraph"].value
        tags = tagger.tagger(para)

        # Cap the number of tags returned.
        max_tags = 12
        if len(tags) > max_tags:
            tags = tags[:max_tags]

        # Returned tags are unicode; convert them into web-friendly UTF-8.
        tags = [x.encode("utf-8") for x in tags]

        op = "<ol>\n"
        for tag in tags:
            op += "\t<li>" + tag + "</li>\n"
        op += "</ol>"
        return op

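# A hardening sketch, not part of the original handler: tags derived from user
# input should be HTML-escaped before interpolation into the <li> markup,
# otherwise a crafted paragraph could inject markup into the response.
import cgi  # Python 2 stdlib; on Python 3 use html.escape instead

tags = ["music", "<script>alert(1)</script>"]  # demo values
op = "<ol>\n"
for tag in tags:
    op += "\t<li>" + cgi.escape(tag) + "</li>\n"
op += "</ol>"
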
def train_tagger():
    # open training data
    infile = open("rjk2147/results/pos_tagged_4_fold_cv.txt", "r")
    train_sents = infile.readlines()
    infile.close()
    train_sents = train_sents[100:]

    # open CMU training data
    infile = open("rjk2147/data/gold/cmu_all_gold.txt")
    cmu_train_sents = infile.readlines()
    infile.close()

    tagger = t.tagger(
        brown_cluster_path='rjk2147/tools/TweeboParser/pretrained_models/twitter_brown_clustering_full',
        # word2vec_path='../tools/word2vec/word2vec_twitter_model.bin',
        # word2vec_path='../tools/word2vec/glove.6B/glove.6B.300d.txt',
        # word2vec_path='../tools/word2vec/GoogleNews-vectors-negative300.bin',
        wiktionary_path='rjk2147/data/wiktionary')
    window = tagger.window
    # Floor division so the slice index stays an int on Python 3 as well.
    half_cmu_train_sents = cmu_train_sents[:len(train_sents) // 2]

    all_sents = list()
    all_sents.extend(train_sents)
    all_sents.extend(cmu_train_sents)

    # Standard implementation of domain adaptation: mark the source corpus of
    # every training sentence.
    domain_list = ['*tweet*'] * len(train_sents)
    domain_list.extend(['*cmu*'] * len(cmu_train_sents))

    tagger.train(all_sents, domain_list)
    return tagger

'''

# Noun phrase chunker
grammar = r"""
    # Nouns and Adjectives, terminated with Nouns
    NBAR:
        {<NN.*|JJ>*<NN.*>}

    # Above, connected with preposition or subordinating conjunction (in, of, etc...)
    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}
"""
chunker = nltk.RegexpParser(grammar)

# POS tagger - see tagger.py
tagger = tag.tagger()


def leaves(tree):
    '''Finds NP (nounphrase) leaf nodes of a chunk tree'''
    for subtree in tree.subtrees(filter=lambda t: t.node == 'NP'):
        yield subtree.leaves()


def normalize(word):
    '''Normalizes words to lowercase and stems/lemmatizes it'''
    word = word.lower()
    # word = stem(word)
    word = strip(lemmatize(word), True)
    return word

from tagger import tagger, vectorizeTagSeq
import numpy as np
import nltk


def convert2array(text):
    """Split text into lines, dropping a trailing '.' token from each."""
    lines = text.split("\n")
    textinwords = []
    for line in lines:
        temp = line.split(" ")
        if temp[-1] == ".":
            temp = temp[:len(temp) - 1]
        textinwords.append(" ".join(temp))
    return np.array(textinwords)


if __name__ == "__main__":
    text = open("incorrect_corpus.txt").read()
    x_words = convert2array(text)
    tagged_sequence = tagger(x_words)
    dimensions = np.load("dimensions.npy")
    dimension_array = np.load("dimension_array.npy")
    tagged_generated = vectorizeTagSeq(dimensions, tagged_sequence, dimension_array)
    np.save("tagged_vectors_gen_dropshuff.npy", np.array(tagged_generated))
    print(len(tagged_sequence))
    print(tagged_generated[1])
    # temp = np.load("tagged_vectors_gen_srilm_wiki.npy")
    # print(temp[1])

def add_to_library(self, songfile):
    mytagger = tagger()
    tags = mytagger.get_track_info(songfile)
    CMP.warn("adding " + songfile)
    self.cursor.execute(self.library_insert, (
        songfile,
        tags["tracknum"],
        tags["title"],
        tags["artist"],
        tags["album"],
        tags["year"],
        str(os.path.getmtime(songfile)),
    ))

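# self.library_insert is defined elsewhere in the class; a plausible shape,
# with one placeholder per value bound above (column names are assumptions,
# and ?-style placeholders assume an sqlite3 cursor):
library_insert = ("INSERT INTO library "
                  "(file, tracknum, title, artist, album, year, mtime) "
                  "VALUES (?, ?, ?, ?, ?, ?, ?)")
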
def __init__(self, parent):
    Frame.__init__(self, parent)
    self.tagger = tagger()
    self.parent = parent
    self.initUI()

stem = filtering.stem

# Taken from Su Nam Kim Paper
grammar = r"""
    # Nouns and Adjectives, terminated with Nouns
    NBAR:
        {<NN.*|JJ>*<NN.*>}

    # Above, connected with preposition or subordinating conjunction (in, of, etc...)
    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}
"""
chunker = nltk.RegexpParser(grammar)
stopwords = stopwords.words('english')
tagger = tagger.tagger()


###############################################################################
## Helper functions for normalizing words and extracting
## noun phrases from the syntax tree
###############################################################################
def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree"""
    for subtree in tree.subtrees(filter=lambda t: t.node == 'NP'):
        yield subtree.leaves()


def normalize(word):
    """Normalizes words to lowercase and stems and lemmatizes it"""
    word = word.lower()

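# A minimal sketch of applying the grammar above, assuming NLTK's standard
# RegexpParser API: parse() takes a list of (token, POS-tag) pairs.
tagged = [("deep", "JJ"), ("learning", "NN"), ("of", "IN"), ("graphs", "NNS")]
tree = chunker.parse(tagged)
for np in leaves(tree):
    print(np)  # first yield: [('deep', 'JJ'), ('learning', 'NN')]
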
def mytag():
    # Replace the Tk text widget's contents with the tagged version.
    t = text.get(0.0, END)
    text.delete(0.0, END)
    text.insert(0.0, tagger(t))

def main():
    # start timer
    time.clock()

    # Training the Tagger:
    # open training data
    # infile = open("../data/gold/simple_gold_revised_emojis.txt", "r")
    infile = open("../results/pos_tagged_4_fold_cv.txt", "r")
    t_sents = infile.readlines()
    infile.close()
    train_sents = list(t_sents[102:])
    # train_sents = list(t_sents)

    # open CMU training data
    infile = open("../data/gold/cmu_all_gold.txt")
    cmu_train_sents = infile.readlines()
    # cmu_train_sents = sents[:1328] + sents[1428:]
    infile.close()

    all_sents = list()
    all_sents.extend(train_sents)
    all_sents.extend(cmu_train_sents)

    # Standard implementation of domain adaptation: label every training
    # sentence with its source corpus.
    domain_list = ['tweet'] * len(train_sents)
    domain_list.extend(['cmu'] * len(cmu_train_sents))
    # domain_list = None

    # Initializing the tagger
    tagger = t.tagger(
        brown_cluster_path='../tools/TweeboParser/pretrained_models/twitter_brown_clustering_full',
        word2vec_path='../tools/word2vec/word2vec_twitter_model.bin')
    # Alternative resources tried:
    #   word2vec_path='../tools/word2vec/glove.6B/glove.6B.300d.txt'
    #   word2vec_path='../tools/word2vec/glove.840B.300d/glove.840B.300d.txt'
    #   word2vec_path='../tools/word2vec/glove.twitter.27B/glove.twitter.27B.200d.txt'
    #   word2vec_path='../tools/word2vec/GoogleNews-vectors-negative300.bin'
    #   wiktionary_path='../data/wiktionary'

    # tagged_sents = tagger.cross_validation(train_sents, domain_list, len(train_sents), folds=4)
    # tagger.output_tagged(tagged_sents, '../results/pos_tagged_4_fold_cv.txt')
    tagger.train(all_sents, domain_list)
    tagger.save_clf(path='../classifiers/POS-tagger.pkl')

    # Using the tagger to tag dev set data
    # infile = open("../data/content/simple_content_emoji.txt", "r")
    infile = open("../data/gold/simple_gold_revised_emojis.txt", "r")
    print('Reading Dev')
    train_Dev = infile.readlines()[:200]
    infile.close()
    dev_tokens, _ = tagger.preprocess(train_Dev)
    print('Testing Dev')
    tagged_sents = tagger.tag_sents(dev_tokens, 'tweet')
    print('Writing Results')
    tagger.output_tagged(tagged_sents, '../results/pos_tagged_cv.txt')

    # Using the tagger to tag test set data
    infile = open("../data/content/test_final_content.txt", "r")
    print('Reading Test')
    train_test = infile.readlines()[:200]
    infile.close()
    test_tokens, _ = tagger.preprocess(train_test)
    print('Testing Test')
    tagged_sents = tagger.tag_sents(test_tokens, 'tweet')
    print('Writing Results')
    tagger.output_tagged(tagged_sents, '../results/pos_tagged_test_cv.txt')

    '''
    infile = open("../data/gold/cmu_test_gold.txt", "r")
    train_cmu = infile.readlines()
    cmu_tokens, _ = tagger.preprocess(train_cmu)
    tagged_sents = tagger.tag_sents(cmu_tokens, 'cmu')
    tagger.output_tagged(tagged_sents, '../results/cmu_pos_tagged_cv.txt')
    '''

    print("Time: " + str(time.clock()) + ' sec')