Example No. 1
def main():
    import string
    import csv
    import re
    import itertools
    from topia.termextract import tag
    tagger = tag.Tagger()
    tagger.initialize()
    fp = open('Mech.txt', 'r')
    text = fp.read()
    # collapse runs of repeated characters, e.g. "aaab" -> "ab"
    text = ''.join(ch for ch, _ in itertools.groupby(text))
    # drop any non-printable characters
    text = filter(lambda x: x in string.printable, text)
    #text = text.replace('\n', '.')
    # replace anything that is not alphanumeric or common punctuation with a space
    text = re.sub('[^a-zA-Z0-9.,;:\\/\'&()]', ' ', text)

    print tagger.tokenize(text)
    print tagger(text)
    from topia.termextract import extract
    extractor = extract.TermExtractor()
    #extractor.filter = extract.permissiveFilter
    keywords = extractor(text)
    print keywords
    #print type(keywords)
    with open('topia_keywords.csv', 'wb') as tcsv:
        tcsv_write = csv.writer(tcsv)
        # sort the extracted (term, occurrences, strength) tuples by occurrence count
        for row in sorted(keywords, key=lambda kw: kw[1]):
            tcsv_write.writerow(row)
Example No. 2
def __init__(self, tagger=None, filter=None):
    if tagger is None:
        tagger = tag.Tagger()
        tagger.initialize()
    self.tagger = tagger
    if filter is None:
        filter = DefaultFilter()
    self.filter = filter
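The __init__ above (evidently the TermExtractor constructor) makes both arguments optional: with no arguments it builds a default Tagger and a DefaultFilter. A minimal sketch of overriding the filter with the permissiveFilter mentioned in the commented-out lines of the other examples:

from topia.termextract import extract
from topia.termextract import tag

tagger = tag.Tagger()
tagger.initialize()
# permissiveFilter keeps every candidate term instead of applying the
# default occurrence/strength thresholds
extractor = extract.TermExtractor(tagger=tagger, filter=extract.permissiveFilter)
print extractor("The quick brown fox jumps over the lazy dog. The quick fox is back.")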
Example No. 3
import re

import nltk
import requests
from bs4 import BeautifulSoup
from topia.termextract import extract
from topia.termextract import tag


def terms(url):
    terms = {}
    html = requests.get(url)
    content = html.content.decode("utf-8")
    soup = BeautifulSoup(content, "html.parser")
    #print soup.get_text()
    '''
	for script in soup(['script','style']):
		script.extract

	text=soup.get_text().decode("utf-8")
	print(text)
	'''
    [
        s.extract() for s in soup(
            ['style', 'script', '[document]', 'head', 'title', 'select'])
    ]
    visible_text = soup.getText()
    #print soup.getText()

    print visible_text.encode('utf-8')

    f = open('haha4.txt', 'w')

    for i in visible_text:
        f.write(i.encode('utf-8'))
    f.close()

    tagger = tag.Tagger('english')
    tagger.initialize()

    # create the extractor with the tagger
    extractor = extract.TermExtractor(tagger=tagger)
    # invoke tagging the text
    # strip byte sequences matched by the UTF-8 pattern; note that the embedded
    # spaces are matched literally unless the pattern is compiled with re.VERBOSE
    patt = "((?: [\x00-\x7F] | [\xC0-\xDF][\x80-\xBF] | [\xE0-\xEF][\x80-\xBF]{2} | [\xF0-\xF7][\x80-\xBF]{3}){1,100})"
    s = nltk.data.load('haha4.txt', format='raw').lower()
    s = re.sub(patt, '', s)
    extractor.tagger(s)
    # extract all the terms, even the "weak" ones
    extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=1)
    # extract

    print extractor(s)
    result = []
    # split each extracted term into individual words (also splitting on hyphens)
    for ss in extractor(s):
        #print ss[0]
        for i in ss[0].split(" "):
            for j in i.split("-"):
                if j not in result:
                    result.append(j)

    print result

    with open("words.txt", "a") as myfile:
        for i in result:
            myfile.write(i + "\n")

    return result
Example No. 4
from topia.termextract import extract
from topia.termextract import tag


def keyterms(text, language='english'):
    # initialize the tagger with the required language
    tagger = tag.Tagger(language)
    tagger.initialize()

    # create the extractor with the tagger
    extractor = extract.TermExtractor(tagger=tagger)
    # invoke tagging the text
    #    s = nltk.data.load('corpora/operating/td1.txt',format = 'raw')
    extractor.tagger(text)
    # extract all the terms, even the "weak" ones
    extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=1)
    # extract
    return extractor(text)
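A quick usage sketch for the keyterms helper above; the (term, occurrences, strength) tuple layout assumed here matches topia.termextract's usual output, but treat the exact fields as an assumption:

if __name__ == '__main__':
    sample = ("Police described the raid as one of the largest drug raids "
              "in the city's history. The raid followed months of police work.")
    # each result is a (term, occurrences, strength) tuple
    for term, occurrences, strength in keyterms(sample):
        print term, occurrences, strength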
Example No. 5
import pickle

import nltk
import requests
from bs4 import BeautifulSoup
from topia.termextract import extract
from topia.termextract import tag


def terms(url):
    terms = {}
    url = "http://www." + url
    html = requests.get(url)
    content = html.content.decode("utf-8")
    soup = BeautifulSoup(content, "lxml")
    '''
	for script in soup(['script','style']):
		script.extract

	text=soup.get_text().decode("utf-8")
	print(text)
	'''
    [
        s.extract()
        for s in soup(['style', 'script', '[document]', 'head', 'title'])
    ]
    visible_text = soup.getText()
    #print visible_text.decode
    f = open('haha4.txt', 'w')
    f2 = open('keys', 'a')
    # note: this iterates over visible_text character by character, so
    # `terms` ends up counting individual characters rather than words
    for i in visible_text:
        f.write(i.encode('utf-8'))
        if i not in terms:
            terms[i] = 1
        else:
            terms[i] = terms[i] + 1
            #print "yees"
    pickle.dump(terms, f2)
    f2.close()
    f.close()

    tagger = tag.Tagger('english')
    tagger.initialize()

    # create the extractor with the tagger
    extractor = extract.TermExtractor(tagger=tagger)
    # invoke tagging the text
    s = nltk.data.load('haha4.txt', format='raw')
    extractor.tagger(s)
    # extract all the terms, even the "weak" ones
    extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=1)
    # extract
    #print extractor(s)
    return terms
Example No. 6
import codecs
import glob
import os

from docx import Document
from topia.termextract import extract
from topia.termextract import tag


def main():
    try:
        # list of index terms
        index_list = list()

        # init tagging
        tagger = tag.Tagger()
        tagger.initialize()
        extractor = extract.TermExtractor(tagger)
        #extractor.filter = extract.permissiveFilter
        #extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=2)

        # get file path; you may need to customize this
        p = os.path.join('*.docx')

        # go through files
        for infile in glob.glob(p):
            # open document
            doc = Document(os.getcwd() + os.sep + infile)
            print os.getcwd() + os.sep + infile

            # get text from Word document (getdocumenttext is a project helper
            # not shown here; see the sketch after this example)
            text = getdocumenttext(doc)

            # tagging
            l = extractor(text)
            for item in l:
                if item[0] not in index_list:
                    index_list.append(item[0])

            # close Word document
            del doc

        # write the sorted index terms once, after all documents have been processed
        outfile = codecs.open(os.getcwd() + os.sep + 'all_concordances.tsv',
                              'w', 'utf8')
        for row in sorted(index_list):
            outfile.write(row + '\t\n')
        outfile.close()
    finally:
        print "Done!"
Example No. 7
import glob
import os

from docx import Document
from topia.termextract import extract
from topia.termextract import tag


def main():
    try:
        # list of index terms
        index_list = list()

        # init tagging
        tagger = tag.Tagger()
        tagger.initialize()
        extractor = extract.TermExtractor(tagger)
        #extractor.filter = extract.permissiveFilter
        #extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=2)

        # get file path
        p = os.path.join('final.ms' + os.sep, '*chapter*.docx')  # you may need to customize this

        # go through files
        for infile in glob.glob(p):
            # open document
            doc = Document(os.getcwd() + os.sep + infile)
            print os.getcwd() + os.sep + infile

            # get text from Word document (getdocumenttext is a project helper, as in the previous example)
            text = getdocumenttext(doc)

            # tagging
            l = extractor(text)
            for item in l:
                if item[0] not in index_list:
                    index_list.append(item[0])

            # close Word document
            del doc

        # write_concordance is a project helper that writes the index terms to a Word document
        write_concordance(sorted(index_list), os.getcwd() + os.sep + 'all_concordance.docx')
    finally:
        print "Done!"
Example No. 8
    # fragment: cur, corpus, titles, ct, statusid and keywords are defined
    # earlier in the source script from which this excerpt is taken
    for row1 in cur.fetchall():
        words = row1[3] + " " + row1[1]
        corpus.append(words)
        titles.append(str(row1[0]))

cur.execute(
    "SELECT id,title,description,content FROM stories where status_id != %s",
    (str(statusid), ))
for row2 in cur.fetchall():
    words = row2[3] + " " + row2[1]
    ct += 1
    corpus.append(words)
    titles.append(str(row2[0]))

# join all story texts into a single document
newdoc = " ".join(corpus)

from topia.termextract import tag
from topia.termextract import extract
extractor = extract.TermExtractor()
# note: this tagger is never passed to the extractor, which already built
# its own default tagger internally
tagger = tag.Tagger()
tagger.initialize()

keywords.append(sorted(extractor(newdoc)))

for keyword in keywords:
    print keyword
Example No. 9
#coding=utf-8
from bs4 import BeautifulSoup
import html2text
import requests
from topia.termextract import extract
from topia.termextract import tag

tagger = tag.Tagger('english')
tagger.initialize()

# create the extractor with the tagger
extractor = extract.TermExtractor(tagger=tagger)
extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=3)
url = "http://habrahabr.ru"
doc = requests.get(url).content
doc = BeautifulSoup(doc, "html.parser", from_encoding="utf-8")
# drop script and style elements before extracting visible text
[s.extract() for s in doc(u'script')]
[s.extract() for s in doc(u'style')]
doc = doc.get_text()
#doc = html2text.html2text(doc)
kw = extractor(doc)
''' test: huify(u'эту неделю').encode('utf-8') == u'эту xyеделю' '''


def huify(expr):
    word = expr
    if expr.rfind(' ') > -1:
        word = expr[expr.rindex(' ') + 1:]
    vowels = set(u'aeiouyаеиоуыяюё')
    mz = len(word)
    for vowel in vowels: