Example #1
 def confusion_matrix(self, key=None, output_format=None, split=False):
     """Returns a confusion matrix for the model based on splitting the data set randomly into two pieces, training on one and testing on the other"""
     if split:
         list_of_dependent = self.dependent_in_use(key=key)
     else:
         list_of_dependent = [None]
     output = ''
     matrices = dict()
     for current_dep in list_of_dependent:
         testing_set = list()
         model = self._learner()
         for record in self.classified_entries(key=key):
             if split:
                 dep_result = str(record.dependent == current_dep)
             else:
                 dep_result = record.dependent
             if random.random() < 0.5:
                 model.train(Document(record.independent.lower(), stemmer=PORTER), dep_result)
             else:
                 testing_set.append((Document(record.independent.lower(), stemmer=PORTER), dep_result))
         matrix = model.confusion_matrix(documents=testing_set)
         matrices[current_dep] = matrix
         if output_format == 'html':
             if split:
                 output += '<h4>' + current_dep + "</h4>"
             vals = matrix.keys()
             output += '<table class="table table-bordered"><thead><tr><td></td><td></td><td style="text-align: center" colspan="' + str(len(vals)) + '">Actual</td></tr><tr><th></th><th></th>'
             first = True
             for val in vals:
                 output += '<th>' + val + '</th>'
             output += '</tr></thead><tbody>'
             for val_a in vals:
                 output += '<tr>'                
                 if first:
                     output += '<td style="text-align: right; vertical-align: middle;" rowspan="' + str(len(vals)) + '">Predicted</td>'
                     first = False
                 output += '<th>' + val_a + '</th>'
                 for val_b in vals:
                     output += '<td>' + str(matrix[val_b].get(val_a, 0)) + '</td>'
                 output += '</tr>'
             output += '</tbody></table>'
             #output += "\n\n`" + str(matrix) + "`"
             # output += '<ul>'
             # for document, actual in testing_set:
             #     predicted = model.classify(document)
             #     output += '<li>Predicted: ' + predicted + '; Actual: ' + actual + '</li>'
             # output += '</ul>'
     if output_format == 'html':
         return output
     if split:
         ret_val = matrices
     else:
         ret_val = matrices[None]
     if output_format == 'json':
         return json.dumps(ret_val, sort_keys=True, indent=4)
     if output_format == 'yaml':
         return yaml.safe_dump(ret_val, default_flow_style=False)
     if output_format is None:
         return ret_val
     return ret_val
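For comparison, here is a minimal standalone sketch of the same idea (random train/test split, then pattern.vector's built-in confusion_matrix); the toy texts and labels below are assumptions for illustration, not data from the original project:

import random
from pattern.vector import Document, NB, PORTER

labeled = [("quarterly revenue beat expectations this year", "finance"),
           ("the team won the championship final", "sports"),
           ("shares dropped sharply after the earnings call", "finance"),
           ("the striker scored twice in extra time", "sports")] * 10

model = NB()
testing_set = []
for text, label in labeled:
    doc = Document(text.lower(), stemmer=PORTER)
    if random.random() < 0.5:
        model.train(doc, label)           # roughly half the records train the learner
    else:
        testing_set.append((doc, label))  # the rest are held out for testing

matrix = model.confusion_matrix(documents=testing_set)
print(matrix)  # nested dict of class counts, as iterated by the HTML table above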
Example #2
def setup():
    global pages
    global urlalias
    global revurlalias
    global knn
    pages = dict()
    urlalias = dict()
    revurlalias = dict()
    knn = KNN()
    db = MySQLdb.connect(host="192.168.200.26",
                         user="******",
                         passwd="xxxsecretxxx",
                         db="pla")
    cur = db.cursor()
    cur.execute("select source, alias from url_alias")
    for row in cur.fetchall():
        urlalias[row[1]] = row[0]
        revurlalias[row[0]] = row[1]
    cur.execute("select tid, name, description, vid from taxonomy_term_data;")
    for row in cur.fetchall():
        url = 'taxonomy/term/' + str(row[0])
        pages[url] = row[1]
        if url in revurlalias:
            pages[revurlalias[url]] = row[1]
            url = revurlalias[url]
        if row[3] == 3:
            soup = bs4.BeautifulSoup(row[2])
            the_text = re.sub(r'[\n\r]+', r'  ', soup.get_text(' ')).lower()
            knn.train(Document(the_text, stemmer=PORTER), url)
            knn.train(Document(row[1].lower()), url)
    cur.execute(
        "select a.tid, c.body_value, d.title from taxonomy_term_data as a inner join field_data_field_practice_areas as b on (a.tid=b.field_practice_areas_tid and b.entity_type='node' and b.bundle != 'professionals' and b.deleted=0) inner join field_data_body as c on (b.entity_id=c.entity_id and b.entity_type=c.entity_type) inner join node as d on (c.entity_id=d.nid);"
    )
    for row in cur.fetchall():
        url = 'taxonomy/term/' + str(row[0])
        if url in revurlalias:
            url = revurlalias[url]
        soup = bs4.BeautifulSoup(row[1])
        the_text = re.sub(r'[\n\r]+', r'  ', soup.get_text(' ')).lower()
        knn.train(Document(the_text, stemmer=PORTER), url)
        knn.train(Document(row[2].lower()), url)
    cur.execute("select nid, title from node where status=1;")
    for row in cur.fetchall():
        url = 'node/' + str(row[0])
        pages[url] = row[1]
        if url in revurlalias:
            pages[revurlalias[url]] = row[1]
    db.close()
    pgcur = conn.cursor()
    pgcur.execute(
        "select query, target from website_queries where target is not null group by query, target"
    )
    for row in pgcur.fetchall():
        words = re.split(r'[\n\r,;]+ *', row[1])
        for word in words:
            print("training on " + row[0].lower() + " for " + word)
            knn.train(Document(row[0].lower()), word)
    conn.commit()
    pgcur.close()
Example #3
def resolve_certainty(certainty_info):
    '''Resolve certainty with Naive Bayes'''
    if certainty_info == '':
        return 'No certainty info.'
    else:
        nb = NB()
        for observation, certainty in csv(
                'library/templatetags/c_training_data.csv'):
            v = Document(observation, type=int(certainty), stopwords=True)
            nb.train(v)
        return nb.classify(Document(certainty_info))
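The same train/classify flow can be exercised without the project's c_training_data.csv; a minimal sketch with made-up observations and certainty labels (the strings and the 0-2 labels are illustrative assumptions):

from pattern.vector import Document, NB

nb = NB()
training = [("confirmed by several independent sources", 2),
            ("likely but not yet verified", 1),
            ("unsubstantiated rumour with no evidence", 0)]
for observation, certainty in training:
    # type= attaches the label to the document; stopwords=True keeps stopwords in the vector
    nb.train(Document(observation, type=certainty, stopwords=True))

print(nb.classify(Document("verified by independent sources")))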
Example #4
def evaluate_query(query):
    probs = dict()
    for key, value in knn.classify(Document(query),
                                   discrete=False).iteritems():
        probs[key] = value
    if not len(probs):
        probs[knn.classify(Document(query))] = 1.0
    seen = set()
    probs = map(lambda x: fixurl(x, seen),
                sorted(probs, key=probs.get, reverse=True))
    probs = [prob for prob in probs if prob is not None]
    return probs
Example #5
def nnps_and_keywords(text):
    s = parsetree(text, relations=True, lemmata=True)

    nnp_kw = {}
    for e in s:
        d = Document(e)
        kw = d.keywords()

        nnp = set()
        for w in kw:
            if w[1].type == 'NNP':
                wdstr = []
                for wd in w[1].phrase.words:
                    if wd.type == 'NNP':
                        wdstr.append(wd.string)
                nnp.add("-".join(wdstr))


        kw = d.keywords(top=5)
        words = set()
        for w in kw:
            if w[1].type != 'NNP':
                if w[1].lemma:
                    words.add(w[1].lemma)
                else:
                    words.add(w[1].string)

        if len(nnp)>1 and len(words)>1:
            if tuple(nnp) in nnp_kw:
                nnp_kw[tuple(nnp)].update(words)
            else:
                nnp_kw[tuple(nnp)]=words

    return nnp_kw
Example #6
 def crearDocumentoPattern(self, contenido, name=""):
     '''Document creation: removing stopwords, applying stemming and TF-IDF frequency weighting'''
     return Document(contenido,
                     name=name,
                     stemmer=PORTER,
                     stopwords=True,
                     weight=TFIDF)
Example #7
def summarize(text, n=1):
    """
    extract most relevant sentences from text according to TextRank algorithm
    - text: string consisting of a few sentences
    - n: number of sentences to extract
    """
    # tokenize text to sentences list
    sentences = tokenize(text)

    # create documents list
    # stop words and punctuation are removed by default
    docs = [Document(sentences[i], name=i) for i in range(len(sentences))]

    # model initialize
    m = Model(docs, weight=TFIDF)

    # dict of TextRank ranking of cosine similarity matrix
    ranking = utils.textrank(m.documents, m.distance)

    # indexes of top n sentences
    top_sents_idx, _ = list(zip(*ranking.most_common(n)))

    # reordering
    output = [sentences[i] for i in sorted(top_sents_idx)]

    return ''.join(output)
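A minimal call sketch, assuming the tokenize helper and the utils.textrank ranking used above are importable from the same project:

text = ("Pattern builds bag-of-words documents from raw sentences. "
        "A model of such documents can be weighted with TF-IDF. "
        "Cosine distance between the documents then drives the TextRank ranking.")
print(summarize(text, n=1))  # prints the single highest-ranked sentence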
Example #8
 def predict(self, indep, probabilities=False):
     """Returns a list of predicted dependent variables for a given independent variable."""
     indep = re.sub(r'[\n\r]+', r'  ', indep).lower()
     if not self._train_from_db():
         return list()
     probs = dict()
     for key, value in learners[self.group_id].classify(Document(indep.lower(), stemmer=PORTER), discrete=False).iteritems():
         probs[key] = value
     if not len(probs):
         single_result = learners[self.group_id].classify(Document(indep.lower(), stemmer=PORTER))
         if single_result is not None:
             probs[single_result] = 1.0
     if probabilities:
         return [(x, probs[x]) for x in sorted(probs.keys(), key=probs.get, reverse=True)]
     else:
         return sorted(probs.keys(), key=probs.get, reverse=True)
Example #9
 def _train(self, indep, depend):
     """Trains the machine learner given an independent variable and a corresponding dependent variable."""
     if indep is None:
         return
     the_text = re.sub(r'[\n\r]+', r'  ', indep).lower()
     learners[self.group_id].train(
         Document(the_text.lower(), stemmer=PORTER), depend)
Example #10
def word_ranking(text, n='L2'):
    """
    rank the words of the text according to the LSA (latent semantic analysis) algorithm
    steps:
    1. tokenize text by sentences
    2. compute the tfidf matrix
    3. apply SVD to the tfidf matrix (reduce to n dimensions)
    4. rank terms according to the cross method (source: http://www.aclweb.org/anthology/C10-1098.pdf)

    - text: string consisting of a few sentences
    - n: dimensions argument passed to Model.reduce (the default 'L2' derives the
      dimensionality from the singular values)
    """
    # tokenize text to sentences list
    sentences = tokenize(text)

    #==============================================================================
    #     # syntactic filter
    #     exclude_list = []
    #     for sent in sentences:
    #         for word, pos in tag(sent):
    #             if pos != "JJ" or pos != 'NN': # Retrieve all adjectives and nouns.
    #                 exclude_list.append(word.lower())
    #==============================================================================

    # create documents list
    # stop words and punctuation are removed by default
    docs = [Document(sentences[i], name=i) for i in range(len(sentences))]

    # model initialize
    m = Model(docs, weight=TFIDF)

    # dimensions number equal to euclidean norm of singular values
    # U, S, Vt = np.linalg.svd(m.vectors, full_matrices=False)
    # dimensions=int(round(np.linalg.norm(S, 2)))
    m.reduce(dimensions=n)

    # sentences selection according to cross-method
    # source: http://www.ceng.metu.edu.tr/~e1395383/papers/TextSummarizationUsingLSA(Journal).pdf
    # topic(rows) x tokens(cols) matrix(tfidf)
    V = np.array(m.lsa.vt)

    # average sentence score for each concept/topic by the rows of the Vt matrix
    avg_score = np.mean(V, axis=1).reshape((-1, 1))

    # cell values which are less than or equal to the average score are set to zero
    V[V <= avg_score] = 0.0

    # sigma matrix after performing SVD
    S = np.array(m.lsa.sigma).reshape((-1, 1))

    # total length of each sentence vector
    length = np.sum(V * S, axis=0)

    # ranking words by length score
    ranking = Counter(dict(zip(m.lsa.terms, length)))  #.most_common(n)

    #words, score =  list(zip(*ranking))

    return ranking
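word_ranking returns a Counter keyed by the model's LSA terms, so the top-scoring words can be read off with most_common; a small usage sketch under the same assumption that the tokenize helper is importable:

text = ("Latent semantic analysis factorizes the TF-IDF matrix. "
        "Singular value decomposition exposes the dominant topics. "
        "Each term is then scored against those topics.")
ranking = word_ranking(text)
print(ranking.most_common(5))  # top five (term, score) pairs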
Example #11
def articles_to_trends(articles):
    news = {}
    for story in articles:
        if story['added_at']:
            article_text = get_article_text(story['url'])
            d, s = timestamptext(story['added_at'], article_text)

            # Each key in the news dictionary is a date: news is grouped per day.
            # Each value is a dictionary of id => story items.
            # We use hash(story['summary']) as a unique id to avoid duplicate
            # content.
            news.setdefault(d, {})[hash(s)] = s

    m = Model()
    for date, stories in news.items():
        s = stories.values()
        s = ' '.join(s).lower()
        # Each day of news is a single document.
        # By adding all documents to a model we can calculate tf-idf.
        m.append(Document(s, stemmer=LEMMA, exclude=[
                 'news', 'day'], name=date))

    for document in m:
        print document.name
        print document.keywords(top=10)
Example #12
def summarize(text_to_summarize):
    stokens = tokenize(text_to_summarize)
 
    # STEP 1
    # pattern.vector's Document is a nifty bag-o-words structure,
    # with a TF weighting scheme
    docs = [Document(string=s, name=e, stemmer=LEMMA)
            for e, s in enumerate(stokens) if len(s.split(" ")) > 7]
    
    linkgraph = []
    # STEP 2 and 3 happen interwovenly
    for doc in docs:
        for doc_copy in docs:
            if doc.name != doc_copy.name:
                # STEP 2 happens here
                wordset_a = [x[1] for x in doc.keywords()]
                wordset_b = [y[1] for y in doc_copy.keywords()]
                jacc_dist = distance.jaccard(wordset_a, wordset_b)
                if jacc_dist < 1:
                    linkgraph.append((str(doc.name), #index to sentence
                                      str(doc_copy.name),1-jacc_dist)) #dist. score
    # By the time we reach here, we'd have completed STEP 3
    
    # STEP 4
    #I referenced this SO post for help with pagerank'ing
    #http://stackoverflow.com/questions/9136539/how-to-weighted-edges-affect-pagerank-in-networkx
    D=nx.DiGraph()
    D.add_weighted_edges_from(linkgraph)
    pagerank = nx.pagerank(D)
    sort_pagerank = sorted(pagerank.items(),key=operator.itemgetter(1))
    sort_pagerank.reverse()
    top2 = sort_pagerank[:2]
    orderedtop2 = [int(x[0]) for x in top2]
    orderedtop2 = sorted(orderedtop2)
    return " ".join([ stokens[i] for i in orderedtop2 ])
Example #13
def feeds_to_trends(feeds):
    for url in feeds:
        url = url['feed_url']
        news = {}
        try:
            for story in Newsfeed().search(url, cached=False):
                d, s = datetext(story.date, story.description)

                # Each key in the news dictionary is a date: news is grouped per day.
                # Each value is a dictionary of id => story items.
                # We use hash(story.description) as a unique id to avoid duplicate
                # content.
                news.setdefault(d, {})[hash(s)] = s

            m = Model()
            for date, stories in news.items():
                s = stories.values()
                s = ' '.join(s).lower()
                # Each day of news is a single document.
                # By adding all documents to a model we can calculate tf-idf.
                m.append(Document(s, stemmer=LEMMA, exclude=[
                         'news', 'day'], name=date))

            for document in m:
                print document.name
                print document.keywords(top=10)
        except HTTP404NotFound:
            print url
            pass
Example #14
 def get_keywords_article(article):
     tagged_content_words = ([
         i.Word for i in article.tagged_content if i.Tag.startswith('NN')
     ])
     d = Document(tagged_content_words)
     k = d.keywords(top=5)
     article.keywords = k
Example #15
def build_model(results=[]):
    documents = [
        Document(i.get('text'),
                 name=i.get('url'),
                 description=i.get('index'),
                 stemmer=LEMMA) for i in results
    ]
    m = Model(documents, weight=TFIDF)

    y, x = 1, len(m.features)
    model = np.zeros((y, x))

    sentence_dict = {}
    model_sentences = []
    for i_index, i in enumerate(documents):
        sentences = sent_tokenize(results[i_index].get('text').lower())

        dy, dx = len(sentences), x
        for s_index, s in enumerate(sentences):
            s_words = {
                w: 1
                for w in words(s, stemmer=LEMMA, stopwords=False)
                if not stopwords_hash.get(w)
            }
            if len(s_words) < 5:
                continue
            model_sentences.append(s)
            model = np.append(
                model, [[1 if s_words.get(w) else 0 for w in m.features]], 0)
            sentence_dict[model.shape[0] - 1] = i.name
            # model_sentences[model.shape[0]-1] = s

    model = np.delete(model, (0), 0)

    return model, m, model_sentences, sentence_dict
Example #16
def doclist_from_feeds(feeds):
    titles = gettitles(feeds)
    documents = []
    for key in titles:
        doc = Document(" ".join(titles[key]), stemmer=LEMMA, threshold=0)
        documents.append(doc)
    return documents
Example #17
 def get_labeled_feats(self, data):
     labeled_binary = []
     for (word, tag) in data:
         feat = FeatExtract(
             word,
             ArtOrDet=(self.error_tag == 'ArtOrDet')).binary_features()
         d = Document(feat, type=tag, stopwords=True)
         labeled_binary.append(d)
     return labeled_binary
Example #18
def getMod():
    essay_path = 'essays/original/'
    files = fio.recGetTextFiles(path.abspath(essay_path))
    docs = []
    for f in files:
        with io.open(f, 'r', encoding='utf-8') as w:
            text = TextBlob(PageParser.parse(w.read()))
            text = ' '.join([
                word for word in text.words if word not in cachedStopWords
            ]).lstrip()
            #ent_text = ' '.join(er.recognize_entities(text.sentences))
            #ent_text = PageParser.parse(w.read())
            docs.append(Document(text, name=f, top=40))
    m = Model(docs)
    lsa = m.reduce(5)
    return lsa
    # Clustering could be a useful technique, commenting out for now
    #with io.open(r'lsa.txt', 'w+', encoding='utf-8') as w:
    #	write_cluster(m.cluster(method=HIERARCHICAL, k=4), w, "")

    with io.open(r'lsa.txt', 'w+', encoding='utf-8') as w:
        for i, concept in enumerate(m.lsa.concepts):
            print("Concept {0}:".format(i)),
            w.write(unicode("Concept {0}:".format(i)))
            count = 0
            # Show top only first 5 features we come across
        for feature, weight in m.lsa.concepts[i].items():
            if abs(weight) > 0.2:
                print(feature),
                w.write(feature + " ")
                count += 1

            if count > 5:
                break
        w.write(unicode('\n'))
        #print

        cat_docs = []
        for d in m.documents:
            cat = (0, 0, {})
            #print d.name.split('\\')[-1]
            for idx, weight in m.lsa.vectors[d.id].items():
                print "\tCat {0}: {1}".format(idx, weight)
                if abs(weight) > abs(cat[1]) or cat[1] == 0:
                    cat = (idx, weight, d)

            if cat[0] == i:
                cat_docs.append(cat)
                #print "\t{0}".format(d.name.split('\\')[-1])

        cat_docs.sort(key=lambda tup: abs(tup[1]), reverse=True)
        for cat, weight, d in cat_docs:
            f = d.name.split('\\')[-1]
            w.write(
                unicode("\t{0} - {1}\n").format(
                    filter(lambda x: x in string.printable, f), weight))
Example #19
def asDocumentReview(data):
    '''
    a function that converts list of reviews to Documents to be used by Pattern
    '''
    data = [(r['review/text'], float(r['review/score'])) for r in data]
    data = [
        Document(review, type=rating, stopwords=True)
        for review, rating in data
    ]
    return data
Example #20
def asDocumentClass(data, classification):
    '''
    a function that converts list of reviews to Documents to be used by Pattern
    '''
    data = [(r['review/text'], str(classification)) for r in data]
    data = [
        Document(review, type=classification, stopwords=True)
        for review, classification in data
    ]
    return data
Example #21
 def insertarDocumento(self, url, contenido):
     """Creates a record in MongoDB and a Pattern Document file"""
     unDocumento = Document(contenido,
                            name=url,
                            stopwords=True,
                            stemmer=PORTER,
                            weight=TFIDF)
     result = self.mongodb.crearDocumento(unDocumento)
     if result:
         unDocumento.save("DocumentoPattern/" + str(result.inserted_id))
     return unDocumento
Example #22
 def run(self, minePackage):
     ac = 0.0  # key hits (term found in the query)
     ap = 0.0  # positive hits (term found in the dictionary)
     an = 0.0  # negative hits (term found in neither)
     alpha = 1.00
     beta = 0.75
     gamma = 0.25
     dictionary = open(os.path.dirname(__file__) + "/dictionary.txt",
                       'r').read()
     dictionary = Document(dictionary, stemmer=PORTER)
     clouds = minePackage['clouds']
     query = minePackage['searchKeyStemmer']
     for cloud in clouds:
         for n in cloud.graph.nodes():
             methodData = cloud.graph.node[n]['methodData']
             # document=methodData.getData()
             # for t in document:
             #     tf=document[t]
             #     if t in query:
             #         print "entered the query branch"
             #         ac+=tf
             #     else:
             #         if t in dictionary:  # I think I forgot to stem the dictionary words
             #             ap+=tf
             #         else:
             #             an+=tf
             content = Document(methodData.getContent(), stemmer=PORTER)
             for doc in content.keywords(top=200, normalized=True):
                 if doc[1] in query:
                     ac += doc[0]
                 else:
                     if doc[1] in dictionary.words:
                         ap += doc[0]
                     else:
                         an += doc[0]
             if ac + ap + an > 0:
                 cloud.graph.node[n]['weight_WA'] = (
                     (ac * alpha) + (ap * beta) +
                     (an * gamma)) / (ac + ap + an)
             else:
                 cloud.graph.node[n]['weight_WA'] = 0
Example #23
 def calculate(self, minePackage):
     webDocuments = []
     query = Document((minePackage['searchKey']))
     clouds = minePackage['clouds']
     count = UnPack()
     totalLinks = count.total(clouds)
     urlContent = UrlToPlainText()
     step = 0
     for cloud in clouds:
         for n in cloud.graph.nodes():
             doc = cloud.graph.node[n]['methodData']
             webDocuments.append(Document(doc.getData()))
             step += 1
     m = Model(documents=webDocuments, weight=TFIDF)
     for cloud in clouds:
         for n in cloud.graph.nodes():
             methodData = cloud.graph.node[n]['methodData']
             vector = Document(methodData.getData())
             cloud.graph.node[n]['weight_VSM'] = m.similarity(
                 vector,
                 query)  # sets the VSM value on the cloud node
Example #24
def summarize(raw_text):
    if len(raw_text) == 0:
        return ""

    sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    tokens = sentence_tokenizer.tokenize(raw_text.strip())

    documents = []
    for position, sentence in enumerate(tokens):
        if len(sentence.split(" ")) > 5:
            document = Document(string=sentence, name=position, stemmer=LEMMA)
            if len(document.features) > 0:
                documents.append(document)

    edges = []
    for document in documents:
        for other_document in documents:
            if document.name == other_document.name:
                continue
            doc_words = document.features
            other_doc_words = other_document.features
            similarity = jaccard_similarity(doc_words, other_doc_words)
            if similarity > 0:
                edges.append((document.name, other_document.name, similarity))

    graph = networkx.DiGraph()
    graph.add_weighted_edges_from(edges)
    page_rank = networkx.pagerank(graph)

    sorted_ranks = sorted(page_rank.items(),
                          key=operator.itemgetter(1),
                          reverse=True)

    summary = []
    sentence_numbers = []

    num_sentences = 3
    for i in range(num_sentences):
        if i < len(sorted_ranks):
            node = sorted_ranks[i]
            sentence_numbers.append(node[0])

    sentence_numbers = sorted(sentence_numbers)

    for sentence_number in sentence_numbers:
        sentence = tokens[sentence_number]
        summary.append(sentence)

    if len(summary) == 0:
        summary.append(tokens[0])

    return " ".join(summary)
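A brief call sketch, assuming NLTK's punkt data is installed and the jaccard_similarity helper used above is defined in the same module:

raw_text = ("Graph-based summarizers treat sentences as nodes in a graph. "
            "Edges between the nodes are weighted by their lexical overlap. "
            "PageRank then scores every sentence by its centrality in the graph. "
            "The top ranked sentences are returned in their original order.")
print(summarize(raw_text))  # up to three sentences, in document order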
Example #25
def extractSentiment(characterSentences):
    """
    Trains a Naive Bayes classifier object with the reviews.csv file, analyzes
    the sentence, and returns the tone.
    """
    nb = NB()
    characterTones = defaultdict(list)
    for review, rating in csv("reviews.csv"):
        nb.train(Document(review, type=int(rating), stopwords=True))
    for key, value in characterSentences.items():
        for x in value:
            characterTones[key].append(nb.classify(str(x)))
    return characterTones
Example #26
def create_doc_list(df):
    '''
    Given a dataframe containing an 'id' column and a 'review' column, create a
    list of documents in Pattern.Vector Document format. Because of how the data
    is formatted in the dataframe, each id carries an extra quote at the beginning
    and end, which is stripped away here.
    '''
    print "Creating a list of {} documents".format(len(df))
    doc_list = []
    for index, row in df.iterrows():
        d = Document(row['review'], threshold=1, name=row['id'][1:-1])
        doc_list.append(d)
    return doc_list
Example #27
    def classify(text):
        predicted_category = Classifications._category.classify(Document(text),
                                                                discrete=True)
        predicted_rate = Classifications._rating.classify(Document(text),
                                                          discrete=True)
        predicted_rate_nlp = Classifications._rating_nlp.classify(
            Classifications.selectWords(text), discrete=True)
        predicted_sentiment_dict = Classifications._sentiment.classify(
            Classifications.selectWords(text), discrete=False)
        predicted_sentiment = True if str(
            sorted(predicted_sentiment_dict.items(),
                   key=operator.itemgetter(1),
                   reverse=True)[1][0]) in ['True', '3.0', '4.0', '5.0'
                                            ] else False

        return {
            'text': text,
            'rate': predicted_rate,
            'category': predicted_category,
            'rate_nlp': predicted_rate_nlp,
            'positivity': predicted_sentiment
        }
Example #28
def extract():
    print 'Extracting features from app descriptions...\n'
    if os.path.exists(OUTPUT_PATH):
        shutil.rmtree(OUTPUT_PATH)
    os.makedirs(OUTPUT_PATH)

    for dir in os.listdir(INPUT_PATH):
        if not dir.startswith('.'):
            os.makedirs("{}/{}".format(OUTPUT_PATH, dir))
            for file in os.listdir('{}/'.format(INPUT_PATH) + dir):
                with open('{}/{}/{}'.format(INPUT_PATH, dir, file), 'rb') as f:
                    reader = csv.reader(f)
                    next(reader)
                    with open('{}/{}/{}'.format(OUTPUT_PATH, dir, file),
                              'wb') as r:
                        writer = csv.writer(r)
                        for app in reader:
                            name = app[0]
                            description = app[2]

                            # Prepare an app description string for NLTK and LDA processing
                            preparedDescription = prepare_description(
                                description)

                            # Extract 3 word featurlets from the description
                            featurelets = featurelet_extraction(
                                preparedDescription)

                            list = []
                            for feature in featurelets:
                                featurelet = '{} {} {}'.format(
                                    feature[0], feature[1], feature[2])
                                list.append(
                                    Document(featurelet, name=featurelet))

                            # Perform hierarchical clustering
                            m = Model(list)
                            cluster = m.cluster(method=HIERARCHICAL,
                                                k=3,
                                                iterations=1000,
                                                distance=COSINE)

                            # Organize clusters into features and alternative tokens
                            (features,
                             alterTokens) = group(cluster, [], [], [])

                            # Write results to file
                            writer.writerow(
                                [name, description, features, alterTokens])
                        r.close()
                    f.close()
Example #29
def get_model_from_documents(path='./*/*.txt'):
    '''return a model built from the given txt files'''
    import codecs
    import glob
    from pattern.vector import Document, Model, TFIDF

    documents = []
    files = glob.glob(path)
    for file in files:
        f = codecs.open(file, 'r')
        data = f.read()
        document = Document(data)
        documents.append(document)

    model = Model(documents=documents, weight=TFIDF)
    return documents, model
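Since the returned Model is TF-IDF weighted, pairwise cosine similarity between the loaded documents comes directly from Model.similarity; a brief usage sketch (the ./*/*.txt layout is whatever the caller supplies):

documents, model = get_model_from_documents()
if len(documents) >= 2:
    # cosine similarity of the first two documents under TF-IDF weighting
    print(model.similarity(documents[0], documents[1]))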
Example #30
def GetVectors():
    essay_path = 'training'
    files = fio.recGetTextFiles(path.abspath(essay_path))
    docs = []
    percepticon = PerceptronTagger()
    cat_dict = defaultdict(int)
    for f in files:
        extended_text = ExtendText(f, percepticon)
        name = ''
        cats = ['high', 'medium', 'low']
        for cat in cats:
            if cat in f:
                name = cat + str(cat_dict[cat])
                cat_dict[cat] += 1
        docs.append(Document(extended_text, name=name, top=None))
    m = Model(docs)
    #lsa = m.reduce(5)
    return m