def main():
    import string
    import csv
    import re
    import itertools
    from topia.termextract import tag

    tagger = tag.Tagger()
    tagger.initialize()

    fp = open('Mech.txt', 'r')
    text = fp.read()
    fp.close()
    # collapse runs of repeated characters (e.g. '----' -> '-')
    text = ''.join(ch for ch, _ in itertools.groupby(text))
    # drop anything that is not printable ASCII
    text = filter(lambda x: x in string.printable, text)
    #text = text.replace('\n', '.')
    # keep only alphanumerics and basic punctuation
    text = re.sub(r'[^a-zA-Z0-9.,;:\\/\'&()]', ' ', text)

    print tagger.tokenize(text)
    print tagger(text)

    from topia.termextract import extract
    extractor = extract.TermExtractor()
    #extractor.filter = extract.permissiveFilter
    keywords = extractor(text)
    print keywords
    #print type(keywords)

    with open('topia_keywords.csv', 'wb') as tcsv:
        tcsv_writer = csv.writer(tcsv)
        # sort by the second field: the occurrence count
        # (the original lambda argument shadowed the builtin xrange)
        for row in sorted(keywords, key=lambda kw: kw[1]):
            tcsv_writer.writerow(row)
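# Usage note (a sketch, separate from the script above): the extractor
# returns (term, occurrences, word_count) tuples, which is why the CSV
# rows above sort on index 1. The sample sentence is made up; the
# relaxed filter mirrors the DefaultFilter(singleStrengthMinOccur=1)
# lines used elsewhere in this file so single-word terms survive.
from topia.termextract import extract

demo = extract.TermExtractor()
demo.filter = extract.DefaultFilter(singleStrengthMinOccur=1)
for term, occurrences, words in demo('The fuel pump moves fuel to the engine.'):
    print term, occurrences, words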
def __init__(self, tagger=None, filter=None):
    if tagger is None:
        tagger = tag.Tagger()
        tagger.initialize()
    self.tagger = tagger
    if filter is None:
        filter = DefaultFilter()
    self.filter = filter
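# Both arguments of this constructor are optional; a sketch of the two
# common call patterns (permissiveFilter is the alternative filter that
# the commented-out lines elsewhere in this file refer to):
from topia.termextract import tag, extract

default_extractor = extract.TermExtractor()  # builds its own Tagger and DefaultFilter
tagger = tag.Tagger()
tagger.initialize()
loose_extractor = extract.TermExtractor(tagger=tagger, filter=extract.permissiveFilter)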
import re

import nltk
import requests
from bs4 import BeautifulSoup
from topia.termextract import tag, extract


def terms(url):
    terms = {}
    html = requests.get(url)
    content = html.content.decode("utf-8")
    soup = BeautifulSoup(content, "html.parser")
    #print soup.get_text()
    '''
    for script in soup(['script', 'style']):
        script.extract()
    text = soup.get_text().decode("utf-8")
    print(text)
    '''
    # strip non-visible elements before pulling the text
    [s.extract() for s in soup(
        ['style', 'script', '[document]', 'head', 'title', 'select'])]
    visible_text = soup.getText()
    print visible_text.encode('utf-8')

    f = open('haha4.txt', 'w')
    for i in visible_text:
        f.write(i.encode('utf-8'))
    f.close()

    tagger = tag.Tagger('english')
    tagger.initialize()
    # create the extractor with the tagger
    extractor = extract.TermExtractor(tagger=tagger)
    # matches runs of well-formed UTF-8 byte sequences; the spaces in
    # the original pattern only work under re.VERBOSE
    patt = re.compile("""((?:
        [\x00-\x7F] |
        [\xC0-\xDF][\x80-\xBF] |
        [\xE0-\xEF][\x80-\xBF]{2} |
        [\xF0-\xF7][\x80-\xBF]{3}){1,100})""", re.VERBOSE)
    s = nltk.data.load('haha4.txt', format='raw').lower()
    # keep only the well-formed runs, dropping malformed bytes
    # (the original called re.sub without using its return value)
    s = ''.join(patt.findall(s))
    # invoke tagging the text
    extractor.tagger(s)
    # extract all the terms, even the "weak" ones
    extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=1)
    # extract
    print extractor(s)
    result = []
    for ss in extractor(s):
        #print ss[0]
        for i in ss[0].split(" "):
            for j in i.split("-"):
                if j not in result:
                    result.append(j)
    print result
    with open("words.txt", "a") as myfile:
        for i in result:
            myfile.write(i + "\n")
    return result
from topia.termextract import tag, extract


def keyterms(text, language='english'):
    # initialize the tagger with the required language
    tagger = tag.Tagger(language)
    tagger.initialize()
    # create the extractor with the tagger
    extractor = extract.TermExtractor(tagger=tagger)
    # invoke tagging the text
    # s = nltk.data.load('corpora/operating/td1.txt', format='raw')
    extractor.tagger(text)
    # extract all the terms, even the "weak" ones
    extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=1)
    # extract
    return extractor(text)
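# A minimal usage sketch for keyterms; the sample sentences are made up:
if __name__ == '__main__':
    sample = ("The operating system schedules processes. "
              "Process scheduling is a core operating system service.")
    for term, occurrences, words in keyterms(sample):
        print term, occurrences, words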
import pickle

import nltk
import requests
from bs4 import BeautifulSoup
from topia.termextract import tag, extract


def terms(url):
    terms = {}
    url = "http://www." + url
    html = requests.get(url)
    content = html.content.decode("utf-8")
    soup = BeautifulSoup(content, "lxml")
    '''
    for script in soup(['script', 'style']):
        script.extract()
    text = soup.get_text().decode("utf-8")
    print(text)
    '''
    # strip non-visible elements before pulling the text
    [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
    visible_text = soup.getText()
    #print visible_text

    f = open('haha4.txt', 'w')
    f2 = open('keys', 'a')
    f.write(visible_text.encode('utf-8'))
    # count occurrences of each word in the visible text
    # (the original iterated characters, which counted letters, not terms)
    for i in visible_text.split():
        if i not in terms:
            terms[i] = 1
        else:
            terms[i] = terms[i] + 1
    pickle.dump(terms, f2)
    f2.close()
    f.close()

    tagger = tag.Tagger('english')
    tagger.initialize()
    # create the extractor with the tagger
    extractor = extract.TermExtractor(tagger=tagger)
    # invoke tagging the text
    s = nltk.data.load('haha4.txt', format='raw')
    extractor.tagger(s)
    # extract all the terms, even the "weak" ones
    extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=1)
    # extract
    #print extractor(s)
    return terms
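# Reading the pickled counts back (a sketch; since 'keys' is opened in
# append mode, it may hold several dumps, hence the loop):
import pickle

with open('keys', 'rb') as f2:
    try:
        while True:
            counts = pickle.load(f2)
            print sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:10]
    except EOFError:
        pass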
import codecs
import glob
import os

from docx import Document
from topia.termextract import tag, extract


def main():
    try:
        # list of index terms
        index_list = list()
        # init tagging
        tagger = tag.Tagger()
        tagger.initialize()
        extractor = extract.TermExtractor(tagger)
        #extractor.filter = extract.permissiveFilter
        #extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=2)
        # get file path; you may need to customize this
        p = os.path.join('*.docx')
        # go through files
        for infile in glob.glob(p):
            # open document
            doc = Document(os.path.join(os.getcwd(), infile))
            print os.path.join(os.getcwd(), infile)
            # get text from Word document
            text = getdocumenttext(doc)
            # tagging
            l = extractor(text)
            for item in l:
                if item[0] not in index_list:
                    index_list.append(item[0])
            # close Word document
            del doc
        # 'out' rather than 'file', which shadows a Python 2 builtin
        out = codecs.open(os.path.join(os.getcwd(), 'all_concordances.tsv'), 'w', 'utf8')
        for row in sorted(index_list):
            out.write(row + '\t\n')
        out.close()
    finally:
        print "Done!"
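# getdocumenttext is not defined in this snippet; a plausible helper for
# python-docx, joining paragraph text (an assumption, not the original
# implementation):
def getdocumenttext(doc):
    # concatenate the text of every paragraph in the document
    return '\n'.join(paragraph.text for paragraph in doc.paragraphs)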
import glob
import os

from docx import Document
from topia.termextract import tag, extract


def main():
    try:
        # list of index terms
        index_list = list()
        # init tagging
        tagger = tag.Tagger()
        tagger.initialize()
        extractor = extract.TermExtractor(tagger)
        #extractor.filter = extract.permissiveFilter
        #extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=2)
        # get file path
        p = os.path.join('final.ms', '*chapter*.docx')  # you may need to customize this
        # go through files
        for infile in glob.glob(p):
            # open document (os.path.join instead of a hard-coded '\\',
            # which only worked on Windows)
            doc = Document(os.path.join(os.getcwd(), infile))
            print os.path.join(os.getcwd(), infile)
            # get text from Word document
            text = getdocumenttext(doc)
            # tagging
            l = extractor(text)
            for item in l:
                if item[0] not in index_list:
                    index_list.append(item[0])
            # close Word document
            del doc
        write_concordance(sorted(index_list),
                          os.path.join(os.getcwd(), 'all_concordance.docx'))
    finally:
        print "Done!"
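# write_concordance is also external to this snippet; a minimal sketch
# of what it might do with python-docx (an assumption):
from docx import Document as DocxDocument


def write_concordance(terms, path):
    # write one index term per paragraph into a new Word document
    out = DocxDocument()
    for term in terms:
        out.add_paragraph(term)
    out.save(path)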
for row1 in cur.fetchall():
    words = row1[3] + " " + row1[1]
    corpus.append(words)
    titles.append(str(row1[0]))
cur.execute(
    "SELECT id,title,description,content FROM stories WHERE status_id != %s",
    (str(statusid), ))
for row2 in cur.fetchall():
    words = row2[3] + " " + row2[1]
    ct += 1
    corpus.append(words)
    titles.append(str(row2[0]))
# join the corpus (a sequence of strings) into one document;
# 'sep' rather than 'str', which shadows the builtin
sep = " "
newdoc = sep.join(corpus)

from topia.termextract import tag
from topia.termextract import extract

tagger = tag.Tagger()
tagger.initialize()
# pass the tagger in; the original built it after the extractor and
# left it unused
extractor = extract.TermExtractor(tagger)
keywords.append(sorted(extractor(newdoc)))
for keyword in keywords:
    print keyword
#coding=utf-8
from bs4 import BeautifulSoup
import html2text
import requests
from topia.termextract import extract
from topia.termextract import tag

tagger = tag.Tagger('english')
tagger.initialize()
# create the extractor with the tagger
extractor = extract.TermExtractor(tagger=tagger)
extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=3)

url = "http://habrahabr.ru"
doc = requests.get(url).content
doc = BeautifulSoup(doc, "html.parser", from_encoding="utf-8")
# drop script and style elements before extracting text
[s.extract() for s in doc(u'script')]
[s.extract() for s in doc(u'style')]
doc = doc.get_text()
#doc = html2text.html2text(doc)
kw = extractor(doc)

'''
test:
huify(u'эту неделю').encode('utf-8') == u'эту xyеделю'
'''


def huify(expr):
    # work on the last word of the expression
    word = expr
    if expr.rfind(' ') > -1:
        word = expr[expr.rindex(' ') + 1:]
    vowels = set(u'aeiouyаеиоуыяюё')
    mz = len(word)
    # the loop body and return below are reconstructed to satisfy the
    # docstring test above; the original snippet was truncated here
    for vowel in vowels:
        pos = word.find(vowel)
        if pos > -1:
            mz = min(mz, pos)
    # replace everything before the first vowel with u'xy'
    return expr[:len(expr) - len(word)] + u'xy' + word[mz:]
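# Running the docstring test (huify returns unicode, so compare without
# the .encode call the docstring shows):
assert huify(u'эту неделю') == u'эту xyеделю'
print huify(u'эту неделю').encode('utf-8')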