def upload():
    """Handle a multi-file upload: tag each allowed file, save it, render results.

    Reads the ``file[]`` list from the current Flask request, extracts tags
    from each allowed file's contents, stores the file in the configured
    upload folder, and renders ``upload.html`` with the saved filenames and
    their tags.
    """
    # Build the tagger pipeline (reader -> stemmer -> rater) from the
    # pickled weight dictionary.  Use a context manager so the file handle
    # is closed promptly (the original leaked it).
    # NOTE(review): unpickling executes arbitrary code -- confirm
    # data/dict.pkl is shipped with the app and never user-supplied.
    with open('data/dict.pkl', 'rb') as fh:
        weights = pickle.load(fh)  # or your own dictionary
    myreader = tagger.Reader()       # or your own reader class
    mystemmer = tagger.Stemmer()     # or your own stemmer class
    myrater = tagger.Rater(weights)  # or your own... (you got the idea)
    mytagger = Tagger(myreader, mystemmer, myrater)

    # Collect the uploaded files from the request.
    uploaded_files = request.files.getlist("file[]")
    filenames = []
    tags = []
    # `upload_file` instead of `file`: avoid shadowing the builtin.
    for upload_file in uploaded_files:
        # Only process files with an allowed type/extension.
        if upload_file and allowed_file(upload_file.filename):
            # Make the filename safe, remove unsupported chars.
            filename = secure_filename(upload_file.filename)
            tags.append(mytagger(upload_file.read()))
            # Move the file from the temporary folder to the upload
            # folder we set up.
            upload_file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
            # Save the filename into a list, we'll use it later.
            filenames.append(filename)
    # Load an HTML page with a link to each uploaded file and its tags.
    return render_template('upload.html', filenames=filenames, tags=tags)
def Get_TweetTags(data, no_tags, multi_tag_len, dict_path=None):
    """Extract the best tags from *data* using the tagger pipeline.

    Args:
        data: Text to tag.
        no_tags: Number of tags to return.
        multi_tag_len: Maximum multi-word tag length (passed to Rater).
        dict_path: Optional path to a pickled weight dictionary; defaults
            to the bundled Summarizer_Tagger/data/dict.pkl.

    Returns:
        The best tags produced by the tagger for *data*.
    """
    # Resolve the dictionary path once, then load it with a context
    # manager so the file handle is closed (the original leaked it in
    # both branches).
    if dict_path is None:
        dict_path = BASE_DIR + '/Summarizer_Tagger/data/dict.pkl'
    with open(dict_path, 'rb') as fh:
        weights = pickle.load(fh)  # or your own dictionary

    myreader = tagger.Reader()       # or your own reader class
    mystemmer = tagger.Stemmer()     # or your own stemmer class
    myrater = tagger.Rater(weights, multi_tag_len)  # or your own rater
    mytagger = Tagger(myreader, mystemmer, myrater)
    return mytagger(data, no_tags)
# install Goose https://github.com/grangier/python-goose # # Done so far: basic keyword extraction using tagger works. # # Concerns about keyword extraction using Tagger library: # https://github.com/apresta/tagger # - dictionary should be built from relevant corpi to article to be more # effective at attracting attention in immersive interface # - TF-IDF is a function provided in the module build_dict... if articles # in collection ever accumulate enough around one subject, use TF-IDF # # immediate todos: # - implement multitag from goose import Goose import tagger import pickle url = "http://www.theverge.com/2014/9/11/6136443/the-largest-predatory-dinosaur-ever-was-half-duck-half-crocodile" g = Goose() article = g.extract(url=url).cleaned_text weights = pickle.load(open('data/dict.pkl', 'rb')) # or your own dictionary mytagger = tagger.Tagger(tagger.Reader(), tagger.Stemmer(), tagger.Rater(weights)) best_3_tags = mytagger(article, 6) print best_3_tags
def Summarizerr_old(file_name,no_tags,no_line_whole,no_line_para,keyword=None): import tagger from tagger import Tagger,Stemmer import pickle data = file_name import os from TextSummarization import settings path = os.path.join(settings.PROJECT_ROOT,'static/dict.pkl') print path weights = pickle.load(open(path, 'rb')) # or your own dictionary # file = open("data/dict.txt", "w") # file.write(pickle.dump(data1, file)) # file.close() print no_tags myreader = tagger.Reader() # or your own reader class mystemmer = tagger.Stemmer() # or your own stemmer class myrater = tagger.Rater(weights,3) # or your own... (you got the idea) mytagger = Tagger(myreader, mystemmer, myrater) best_tags = mytagger(data, no_tags) print best_tags print "\n" from Summarizer import SummaryTool # Create a SummaryTool object st = SummaryTool() summary = 'Tags :' for i in best_tags: summary += str(i).title().replace("'"," ") summary += '\n\n' summary += "Summary :\n\n" summary += st.get_summary(data,no_line_para,no_line_whole,keyword) # Print the summary ratio = 100*(float(len(summary)) / len(data)) summary += "\n\n" summary +="Original Length :" summary +=str(len(data)) summary +="\n" summary +="Summary Length :" summary +=str(len(summary)) summary +="\n" summary +="Summary Ratio :" summary +=str(ratio) summary +="%" print "" print "Original Length %s" % len(data) print "Summary Length %s" % len(summary) print "Summary Ratio: %s" % ratio ,'%' return summary
# NOTE(review): this chunk starts mid-function -- the lines below are the
# tail of a clean_str-style text normalizer whose `def` line is not in view.
# They space-separate common English contractions and punctuation tokens,
# then collapse runs of whitespace.
string = re.sub(r"\'ve", " \'ve", string)
string = re.sub(r"n\'t", " n\'t", string)
string = re.sub(r"\'re", " \'re", string)
string = re.sub(r"\'d", " \'d", string)
string = re.sub(r"\'ll", " \'ll", string)
string = re.sub(r",", " , ", string)
string = re.sub(r"!", " ! ", string)
string = re.sub(r"\(", " \( ", string)
string = re.sub(r"\)", " \) ", string)
string = re.sub(r"\?", " \? ", string)
string = re.sub(r"\s{2,}", " ", string)
return string.strip()

# --- module-level pipeline: tag and preprocess article abstracts ---
# Build the tagger from the pickled weight dictionary.
weights = pickle.load(open('data/dict.pkl', 'rb'))  # or your own dictionary
myreader = tagger.Reader()  # or your own reader class
mystemmer = tagger.Stemmer()  # or your own stemmer class
myrater = tagger.Rater(weights)  # or your own... (you got the idea)
mytagger = Tagger(myreader, mystemmer, myrater)
tags = []

# Load abstracts from a semicolon-delimited CSV and strip boilerplate
# "Abstract"/"Summary" markers from each row in place.
f = pd.read_csv('articlesabsctracts.csv', delimiter=';')
abstracts = f['abstract']
# NOTE(review): `len(...) - 1` skips the last row -- looks like an
# off-by-one; confirm whether the final abstract should be cleaned too.
for i in range(len(f['abstract']) - 1):
    f['abstract'][i] = re.sub('Abstract', '', f['abstract'][i])
for i in range(len(f['abstract']) - 1):
    f['abstract'][i] = re.sub('Summary', '', f['abstract'][i])

# Clean and tokenize every abstract, then set up the morphological
# analyzer (presumably for Russian lemmatization via pymorphy2 -- verify).
abstracts = [clean_str(a) for a in abstracts]
abstracts = [wordpunct_tokenize(a) for a in abstracts]
morph = pymorphy2.MorphAnalyzer()
# NOTE(review): chunk ends mid-statement -- the body of this loop
# continues past the visible source.
for i in range(len(abstracts)):
import os
import pickle

import tagger

# Locate the pickled weight dictionary relative to this module so the
# script works regardless of the current working directory.
datafile = os.path.join(os.path.dirname(__file__), '..', 'data/dict.pkl')
# Load the weights with a context manager so the file handle is closed
# promptly (the original left it dangling).
with open(datafile, 'rb') as fh:
    weights = pickle.load(fh)

# Assemble the tagger pipeline: reader -> stemmer -> rater.
rdr = tagger.Reader()
stmr = tagger.Stemmer()
rtr = tagger.Rater(weights)
extract_tags = tagger.Tagger(rdr, stmr, rtr)