Example #1
def extractKeywords(category_label, event_text):
    import operator
    import re
    from topia.termextract import extract

    extractor = extract.TermExtractor()
    keywords_count = {}
    # training data: use the first 202 entries in the dataset
    for i in range(202):
        for pair in extractor(event_text[i]):
            if pair[0] in keywords_count:
                keywords_count[pair[0]] += pair[1]
            else:
                keywords_count[pair[0]] = pair[1]

    # sort by frequency, most frequent first
    sorted_keywords_count = sorted(keywords_count.items(),
                                   key=operator.itemgetter(1),
                                   reverse=True)
    # keep only keywords made of letters and spaces
    sorted_keywords_count = [
        pair for pair in sorted_keywords_count
        if re.match('^[A-Za-z ]*$', pair[0]) is not None
    ]
    total_extracted_keywords_count = len(sorted_keywords_count)
    # choose the most frequent 500 keywords as features
    keywords = sorted_keywords_count[:500]
    with open('keywords', 'w') as f2:
        for keyword in keywords:
            f2.write(keyword[0] + "\n")
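For reference, the extractor used throughout this listing returns a list of (term, occurrences, word_count) tuples, which is why the snippets index pair[0], pair[1] and, elsewhere, t[2]. A minimal sketch, assuming topia.termextract is installed (the sample sentence is made up):

from topia.termextract import extract

extractor = extract.TermExtractor()
# each result is a (term, occurrences, word_count) tuple
for term, occurrences, word_count in extractor("Term extraction finds terms; term extraction counts terms."):
    print term, occurrences, word_count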
Example #2
def extractorTest():
    reviews = open("./crawled/concatReviews.txt").read()
    extractorOut = open("./taggers/extractorOut.txt", 'w+')
    extractor = extract.TermExtractor()
    # print(extractor(reviews))
    for extension in re.split('-{41,}', reviews):
        extractorOut.write(str(extractor(str(extension))) + '\n')
Example #3
	def extract_word_distribution(self):
		extractor = extract.TermExtractor()
		indices = []
		i = 0
		for label in self.raw_labels:
			if label in ["raw.abstract", "title", 'raw.title']:
				indices.append(i)
			i += 1
		if len(indices) > 2:
			indices = indices[1:]
		total = 0
		documents_to_words = []
		for paper_data in self.raw_data:
			paper_text = ''
			for index in indices:
				paper_text += paper_data[index]
				total += len(paper_data[index])
			document_to_words = []
			keywords = extractor(paper_text)
			for keyword in keywords:
				if keyword[2] > 3:
					break
				word_id = self.insert_word(keyword[0])
				word_count = keyword[1]
				self.words_inverted[word_id] = keyword[0]
				document_to_words.append((word_id, word_count))
			documents_to_words.append(document_to_words)
		print("EXtracted total {}".format(total))
		return documents_to_words
Example #4
def main():
    import string
    import csv
    import re
    import itertools
    from topia.termextract import tag
    tagger = tag.Tagger()
    tagger.initialize()
    fp = open('Mech.txt', 'r')
    text = fp.read()
    text = ''.join(ch for ch, _ in itertools.groupby(text))
    text = filter(lambda x: (x in string.printable), text)
    #text=text.replace('\n','.')
    text = re.sub('[^a-zA-Z0-9.,;:\\/\'&()]', ' ', text)

    print tagger.tokenize(text)
    print tagger(text)
    from topia.termextract import extract
    extractor = extract.TermExtractor()
    #extractor.filter = extract.permissiveFilter
    keywords = extractor(text)
    print keywords
    #print type(keywords)
    with open('topia_keywords.csv', 'wb') as tcsv:
        tcsv_write = csv.writer(tcsv)
        for row in sorted(keywords, key=lambda term: term[1]):
            tcsv_write.writerow(row)
Example #5
def return_food_sentences(eatery_id):
    from sklearn.externals import joblib
    sent_tokenizer = SentenceTokenizationOnRegexOnInterjections()
    reviews_list = list()
    for post in reviews.find({"eatery_id": eatery_id}):
        reviews_list.extend([[sent, post.get("review_time")]
                             for sent in sent_tokenizer.tokenize(post.get("review_text"))])

    # keep only the sentences classified as being about food
    tags = TAG_CLASSIFIER_LIB.predict([e[0] for e in reviews_list])
    food_sentences = list()
    for (sent, review_time), tag in zip(reviews_list, tags):
        if tag == "food":
            food_sentences.append([sent, review_time])

    # narrow down to sentences about dishes or drinks
    sub_tags = FOOD_SB_TAG_CLASSIFIER_LIB.predict([e[0] for e in food_sentences])
    dishes_n_drinks = list()
    for (sent, review_time), sub_tag in zip(food_sentences, sub_tags):
        if sub_tag == "dishes" or sub_tag == "drinks":
            dishes_n_drinks.append([sent, review_time])

    sentiments = SENTI_CLASSIFIER_LIB.predict([e[0] for e in dishes_n_drinks])

    # extract noun phrases from each dish/drink sentence with topia.termextract
    from topia.termextract import extract
    topia_extractor = extract.TermExtractor()
    noun_phrases = list()
    for (sent, review_time), tag in zip(dishes_n_drinks, sentiments):
        nouns = topia_extractor(sent)
        noun_phrases.append([tag, [e[0].lower() for e in nouns], review_time])

    return (filter(lambda x: x[1], noun_phrases), [e[0] for e in dishes_n_drinks])
Example #6
    def __init__(self):
        self.extractor = extract.TermExtractor()
        self.extractor.filter = extract.permissiveFilter
        self.usable_characters = set(
            '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ \''
        )
        secret = open(secrets).readlines()[0].strip()
        self.client = wolframalpha.Client(secret)
Example #7
def terms(url):
    terms = {}
    html = requests.get(url)
    content = html.content.decode("utf-8")
    soup = BeautifulSoup(content)
    #print soup.get_text()
    '''
	for script in soup(['script','style']):
		script.extract

	text=soup.get_text().decode("utf-8")
	print(text)
	'''
    [
        s.extract() for s in soup(
            ['style', 'script', '[document]', 'head', 'title', 'select'])
    ]
    visible_text = soup.getText()
    #print soup.getText()

    print visible_text.encode('utf-8')

    f = open('haha4.txt', 'w')

    for i in visible_text:
        f.write(i.encode('utf-8'))
    f.close()

    tagger = tag.Tagger('english')
    tagger.initialize()

    # create the extractor with the tagger
    extractor = extract.TermExtractor(tagger=tagger)
    # invoke tagging the text
    patt = "((?: [\x00-\x7F] | [\xC0-\xDF][\x80-\xBF] | [\xE0-\xEF][\x80-\xBF]{2} | [\xF0-\xF7][\x80-\xBF]{3}){1,100})"
    s = nltk.data.load('haha4.txt', format='raw').lower()
    s = re.sub(patt, '', s)
    extractor.tagger(s)
    # extract all the terms, even the "weak" ones
    extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=1)
    # extract

    print extractor(s)
    result = []
    for ss in extractor(s):
        #print ss[0]
        for i in ss[0].split(" "):
            for j in i.split("-"):
                if not j in result:
                    result.append(j)

    print result

    with open("words.txt", "a") as myfile:
        for i in result:
            myfile.write(i + "\n")

    return result
Example #8
def extract_terms(text):
    extractor = extract.TermExtractor()
    extractor.filter = extract.permissiveFilter
    terms = extractor(text)
    #return terms
    return [
        t[0].lower() for t in terms
        if t[2] == 1 and MIN_TERM_LENGTH <= len(t[0]) <= MAX_TERM_LENGTH
    ]
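MIN_TERM_LENGTH and MAX_TERM_LENGTH are assumed to be module-level constants; the values below are illustrative placeholders (Example #20 suggests an upper bound of 25 characters):

# assumed module-level length bounds for accepted terms (placeholder values)
MIN_TERM_LENGTH = 3
MAX_TERM_LENGTH = 25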
Example #9
def getTerms(text):
    terms = []
    extractor = extract.TermExtractor()
    extractor.filter = extract.permissiveFilter
    for t in text:
        ext = extractor(t)
        newterms = []
        for e in ext:
            newterms.append(e[0])
        terms.append(newterms)
    return terms
Example #10
    def __init__(self, review_id, review_text, review_time, eatery_id):
        """
        Lower-case the review text and convert it to unicode.
        """
        self.review_id, self.review_text, self.review_time, self.eatery_id = review_id, \
                SolveEncoding.to_unicode_or_bust(review_text.lower().replace("&nbsp;&nbsp;\n", "")), review_time, eatery_id

        print self.review_time, self.review_text, self.review_id, self.eatery_id
        self.cuisine_name = list()
        self.places_names = list()
        self.np_extractor = extract.TermExtractor()
Example #11
def extractKeywords(text):
    extractor = extract.TermExtractor()
    #inputFile = open("input.txt", 'r')
    #text = inputFile.read();
    keywords = sorted(extractor(text))

    keyPhrases = []

    for tuples in keywords:
        keyPhrases.append(tuples[0])
    return keyPhrases
Example #12
    def getImportaantFeatures(self):
        extractor = extract.TermExtractor()
        extractor.filter = extract.permissiveFilter
        key_word_for_desc = extractor(self.description)
        dict_important_features = {}
        for element in key_word_for_desc:
            word = stem_word(element[0])
            if len(word) != 0:
                dict_important_features[word] = element[1]

        #print str(dict_important_features)
        return dict_important_features
Example #13
def taggerTest():
    reviews = open("./crawled/concatReviews.txt").read()
    # tagger = tag.Tagger()
    # tagger.initialize()
    # tagger.tokenize(reviews)
    # extract.TermExtractor(tagger)
    # extractor.filter = extract.permissiveFilter
    # # extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=2)
    # extracted = extractor(reviews)
    # printTaggedTerms(extracted)
    extractor = extract.TermExtractor()
    print(extractor(reviews))
Example #14
def get_keywords(text):
    if not text:
        return []

    extractor = extract.TermExtractor()
    keywords = sorted(extractor(text))

    filtered_keywords = []
    for keyword in keywords:
        if keyword[1] > 2:
            filtered_keywords.append(keyword[0])

    return filtered_keywords
Example #15
def extract_keywords(doc, lower=False):
    extractor = extract.TermExtractor()
    extractor.filter = extract.DefaultFilter()

    keywords_list = []
    keywords = extractor(doc)

    for keyword in keywords:
        if lower:
            keywords_list.append(keyword[0].lower())
        else:
            keywords_list.append(keyword[0])

    return keywords_list
Example #16
def keyterms(text, language='english'):
    # initialize the tagger with the required language
    tagger = tag.Tagger(language)
    tagger.initialize()

    # create the extractor with the tagger
    extractor = extract.TermExtractor(tagger=tagger)
    # invoke tagging the text
    #    s = nltk.data.load('corpora/operating/td1.txt',format = 'raw')
    extractor.tagger(text)
    # extract all the terms, even the &amp;quot;weak&amp;quot; ones
    extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=1)
    # extract
    return extractor(text)
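A minimal usage sketch for the function above (the sample text is made up and the exact terms returned depend on topia's tagger):

if __name__ == '__main__':
    sample = "Operating systems schedule processes. The operating system also manages memory."
    for term, occurrences, strength in keyterms(sample):
        print term, occurrences, strength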
Example #17
def get_terms(url):
    text = get_text(url)
    extractor = extract.TermExtractor()
    extractor.filter = extract.permissiveFilter
    #extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=2)
    terms = extractor(text)
    for t in terms:
        if MIN_TERM_LENGTH <= len(t[0]) <= MAX_TERM_LENGTH:
            txt = t[0]
            txt = txt.translate(title_trans)
            txt = txt.replace('_', ' ')
            txt = txt.strip().lower()
            if len(txt) > 2:
                t2 = [txt, t[1], t[2]]
                yield t2
Example #18
        def __init__(self, text):
                self.text = text
                self.conll_extractor = ConllExtractor()
                self.topia_extractor = extract.TermExtractor()
                
                ##Our custom tokenizer
                self.custom_sent_tokenizer = SentenceTokenizationOnRegexOnInterjections()
                self.tokenized_sentences = self.custom_sent_tokenizer.tokenize(self.text)
                
                ##This method will apply the stemmers to the sentences
                self.stemming()

                print nltk.sent_tokenize(self.text)
                self.np_textblob()
                self.np_topia()
Example #19
    def POST(self):
        import sys
        import re
        import simplejson as json
        from topia.termextract import extract
        extractor = extract.TermExtractor()
        #extractor.filter = extract.permissiveFilter
        extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=1)

        def term_compare(x, y):
            if y[1] + y[2] * 2 > x[1] + x[2] * 2:
                return 1
            elif y[1] == x[1] and y[2] == x[2]:
                return 0
            else:  # x<y
                return -1

        input = web.input(callback=None)
        content = input.context.lower()
        content = content.replace(u"\u201c",
                                  '"').replace(u"\u201d", '"').replace(
                                      u"\u2018",
                                      "'").replace(u"\u2019",
                                                   "'").replace(u"\u2026", "")
        list = sorted(extractor(content), term_compare)
        list = list[:50]
        for i in range(len(list) - 1, -1, -1):
            if len(list[i][0]) == 1 or list[i][2] > 2 or (
                    list[i][0].find("http") >= 0) or not re.search(
                        '[a-z]', list[i][0]) or re.search('[0-9]', list[i][0]):
                list.remove(list[i])
            else:
                # keep just the stripped term text
                list[i] = list[i][0].strip()
        callback = input.callback
        pattern = r'[^a-zA-Z0-9 ]'
        for i in range(len(list) - 1, -1, -1):
            if re.search(pattern, list[i]):
                list.remove(list[i])
        if (len(sys.argv) > 2):
            length = int(sys.argv[2])
            if (len(list) > length):
                list = list[:length]
        list = json.dumps(list, indent=4)
        if callback and re.match('^[a-zA-Z0-9._\[\]]+$', callback):
            return callback + '(' + list + ')'
        else:
            return list
Example #20
def extract_terms(text):
    """
    Use topia.termextract to perform a simple tag extraction from
    user comments.
    """
    extractor = extract.TermExtractor()
    # Use permissive filter to find all possibly relevant terms in short texts.
    extractor.filter = extract.permissiveFilter
    terms = extractor(text)

    # Collect terms in lower case, but only the ones that consist of single
    # words (t[2] == 1), and are at most 25 chars long.
    return [
        t[0].lower() for t in terms if t[2] == 1
        and settings.MIN_TERM_LENGTH <= len(t[0]) <= settings.MAX_TERM_LENGTH
    ]
Example #21
def buildX(event_text, keywords):
    import pandas as pd
    from topia.termextract import extract
    extractor = extract.TermExtractor()
    X = []
    for i in range(len(event_text)):
        text = event_text[i]
        x = {}
        for keyword in keywords:
            x[keyword] = 0
        pairs = extractor(text)
        for pair in pairs:
            if pair[0] in keywords:
                x[pair[0]] += pair[1]
        X.append(x)
    return pd.DataFrame(X)
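A hypothetical way to wire this up with Example #1 (the 'keywords' file name comes from Example #1; event_text is assumed to be a list of raw event strings):

# load the keyword list written by extractKeywords() and build the feature matrix
keywords = [line.strip() for line in open('keywords')]
X = buildX(event_text, keywords)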
Example #22
def get_keywords(input_text):

    input_text = input_text.lower()
    stop_words = stopwords.words('english')
    remove = '|'.join(stop_words)
    regex = re.compile(r'\b(' + remove + r')\b', flags=re.IGNORECASE)
    input_text = regex.sub("", input_text)

    keyword_set = set()

    extractor = extract.TermExtractor()
    for x in sorted(extractor(input_text)):
        words = re.sub('[^0-9a-zA-Z@#]+', ' ', x[0]).split()
        for word in words:
            keyword_set.add(word)
    return keyword_set
Example #23
def terms(url):
    terms = {}
    url = "http://www." + url
    html = requests.get(url)
    content = html.content.decode("utf-8")
    soup = BeautifulSoup(content, "lxml")
    '''
	for script in soup(['script','style']):
		script.extract

	text=soup.get_text().decode("utf-8")
	print(text)
	'''
    [
        s.extract()
        for s in soup(['style', 'script', '[document]', 'head', 'title'])
    ]
    visible_text = soup.getText()
    #print visible_text.decode
    f = open('haha4.txt', 'w')
    f2 = open('keys', 'a')
    for i in visible_text:
        f.write(i.encode('utf-8'))
        if not i in terms:
            terms[i] = 1
        else:
            terms[i] = terms[i] + 1
            #print "yees"
    pickle.dump(terms, f2)
    f2.close()
    f.close()

    tagger = tag.Tagger('english')
    tagger.initialize()

    # create the extractor with the tagger
    extractor = extract.TermExtractor(tagger=tagger)
    # invoke tagging the text
    s = nltk.data.load('haha4.txt', format='raw')
    extractor.tagger(s)
    # extract all the terms, even the &amp;quot;weak&amp;quot; ones
    extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=1)
    # extract
    #print extractor(s)
    return terms
Example #24
	def topia(self):
		### extract terms using topia
		from topia.termextract import extract

		def filtered(term):
			# keep only alphabetic, non-time, non-verb terms
			if not re.search('[a-zA-Z]+', term):
				return False
			if re.search('(PM|AM)', term):
				return False
			if isVerb(term):
				return False
			return True

		extractor = extract.TermExtractor()
		sentences = eval(segmentor.segment(text, 1))['sentences'][:-5]
		terms = sorted(extractor(" ".join(sentences)))
		terms = [i for i in terms if filtered(i[0])]

		return terms
Example #25
def keywords(env, start_response):
    """Extracts key words and phrases from resume."""
    start_response('200 OK', [('Content-Type', 'text/xml')])
    try:
        with open('Darin_Plutchok_Resume_Taxonomist.txt') as f:
            text = f.read()
    except:
        raise restlite.Status, '400 Error Reading File'
    mycleaner = clean_text(text, [
        "strip_characters", "eliminate_stopwords", "eliminate_nonwords",
        "normalize_tokens"
    ])
    cleaned = mycleaner.clean()

    extractor = extract.TermExtractor()
    keywords_tuples = extractor(cleaned)
    doc = create_xml({'keywords': keywords_tuples})

    return [doc.toxml()]
Example #26
    def generate(self, size):

        try:
            from topia.termextract import extract
        except ImportError:
            raise CommandError("topia.termextract library required")

        extractor = extract.TermExtractor()
        extractor.filter = extract.permissiveFilter
        titles = Link.objects.values_list("title", flat=True)
        tags = extractor(" ".join(titles))
        tags.sort(key=lambda tag: tag[1], reverse=True)

        def valid_tag(tag):
            def valid_char(char):
                return not (char in punctuation or char.isdigit())
            return filter(valid_char, slugify(tag[0]))

        for tag in filter(valid_tag, tags)[:size]:
            print("Creating keyword %s" % tag[0])
            Keyword.objects.get_or_create(title=tag[0])
Example #27
def main():
    try:
        # list of index terms
        index_list = list()

        # init tagging
        tagger = tag.Tagger()
        tagger.initialize()
        extractor = extract.TermExtractor(tagger)
        #extractor.filter = extract.permissiveFilter
        #extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=2)

        # get file path; you may need to customize this
        p = os.path.join('*.docx')

        # go through files
        for infile in glob.glob(p):
            # open document
            doc = Document(os.getcwd() + os.sep + infile)
            print os.getcwd() + os.sep + infile

            # get text from Word document
            text = getdocumenttext(doc)

            # tagging
            l = extractor(text)
            for item in l:
                if item[0] not in index_list:
                    index_list.append(item[0])

            # close Word document
            del doc

        # write the sorted concordance once, after all files are processed
        file = codecs.open(os.getcwd() + os.sep + 'all_concordances.tsv',
                           'w', 'utf8')
        for row in sorted(index_list):
            file.write(row + '\t\n')
        file.close()
    finally:
        print "Done!"
Example #28
    def __init__(self,
                 list_of_sentences,
                 default_np_extractor=None,
                 regexp_grammer=None,
                 if_postagged=False):
        """
                Args:
                        list_of_sentences: A list of lists with each element is a list of sentences which is pos tagged
                        Example:
                                [[('I', 'PRP'), ('went', 'VBD'), ('there', 'RB'), ('for', 'IN'), ('phirni', 'NN')], [], [], ...]

                        default_np_extractor:
                                    if a list been passed then the noun phrases from various np_extractors will be appended
                                    if a string is passed, only the noun phrases from that np extractor will be appended
                                    Options
                                        regex_np_extractor
                                        regex_textblob_conll_np
                                        textblob_np_conll
                                        textblob_np_base

                """

        self.if_postagged = if_postagged
        self.noun_phrases = list()
        self.conll_extractor = ConllExtractor()
        self.topia_extractor = extract.TermExtractor()

        self.list_of_sentences = list_of_sentences
        self.np_extractor = (
            "textblob_np_conll",
            default_np_extractor)[default_np_extractor is not None]
        if not regexp_grammer:
            self.regexp_grammer = r"CustomNounP:{<JJ|VB|FW|VBN>?<NN.*>*<NN.*>}"

        eval("self.{0}()".format(self.np_extractor))

        self.noun_phrases = {self.np_extractor: self.noun_phrases}

        return
Example #29
def main():
    try:
        # list of index terms
        index_list = list()

        # init tagging
        tagger = tag.Tagger()
        tagger.initialize()
        extractor = extract.TermExtractor(tagger)
        #extractor.filter = extract.permissiveFilter
        #extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=2)

        # get file path; you may need to customize this
        p = os.path.join('final.ms' + os.sep, '*chapter*.docx')

        # go through files
        for infile in glob.glob(p):
            # open document
            doc = Document(os.getcwd() + '\\' + infile)
            print os.getcwd() + '\\' + infile

            # get text from Word document
            text = getdocumenttext(doc)

            # tagging
            l = extractor(text)
            for item in l:
                if item[0] not in index_list:
                    index_list.append(item[0])

            # close Word document
            del doc

        write_concordance(sorted(index_list),
                          os.getcwd() + os.sep + 'all_concordance.docx')
    finally:
        print "Done!"
Example #30
    def __get_category(self, document):
        extractor = extract.TermExtractor()
        extractor.filter = extract.permissiveFilter
        extracted_key_word = extractor(document.text)
        dict_category_value = {}
        # calculating likelihood for each category
        for word_object in extracted_key_word:
            word = word_object[0]
            count = word_object[1]
            if word in self.dict_refined_features:
                key_feature = self.dict_refined_features[word]
                for category_name in key_feature.dict_category_probability:
                    # accumulate count-weighted log-probability for this category
                    log_prob = math.log(
                        key_feature.dict_category_probability[category_name], 10)
                    if category_name in dict_category_value:
                        dict_category_value[category_name] += count * log_prob
                    else:
                        dict_category_value[category_name] = count * log_prob

        # calculating prior
        for category_name in dict_category_value:
            dict_category_value[category_name] += math.log(
                self.dict_category[category_name].prior)
        max_value = -10000000000
        document_category_name = ""
        # calculating max
        for category_name in dict_category_value:
            if dict_category_value[category_name] >= max_value:
                max_value = dict_category_value[category_name]
                document_category_name = category_name
        return document_category_name